/// <summary>
/// Splits the PDF at <c>_filePath</c> into one output PDF per "Sess" marker found in the page text.
/// Each output file is written to <c>outputDir</c>, is named after the session id ("NA" when no id
/// could be read; a "-NNNN" suffix is appended when a file with that name already exists), is produced
/// by setting a crop box on a copy of the source page(s), and carries the session id in its
/// "Keywords" metadata entry. Progress and a final summary are written to the console.
/// </summary>
/// <param name="recoveryEnabled">
/// When true, session ids that could not be read as digits (or that disagree with the expected
/// consecutive numbering) are rebuilt from neighbouring ids on the page and from the last id of the
/// previous page. When false, unreadable ids are only counted.
/// </param>
static void Extract(bool recoveryEnabled)
{
    const string marker = "Sess";
    int markerSkip = "Session".Length;

    // Built once instead of once per marker: a token made only of digits.
    Regex digitsOnly = new Regex(@"^[0-9]+$");

    int sessionsX = 0;   // sessions whose id is still "NA" after processing
    int global = 0;      // total number of output files written
    int lastPage;
    int lastValidIdFromPreviousPage = -1;

    var pdfReader = new PdfReader(_filePath);
    try
    {
        string format = "D" + pdfReader.NumberOfPages.ToString().Length;

        for (int p = 0; p < pdfReader.NumberOfPages; p++)
        {
            Console.Clear();
            Console.SetCursorPosition(0, 0);
            Console.WriteLine("Processing page " + (p + 1).ToString(format) + " of " + pdfReader.NumberOfPages);

            // Find the next page (0-based) that contains a marker; stops at the final page if none does.
            lastPage = p;
            while (lastPage + 1 < pdfReader.NumberOfPages)
            {
                lastPage++;
                if (PdfTextExtractor.GetTextFromPage(pdfReader, lastPage + 1).Contains(marker))
                {
                    break;
                }
            }

            // Cut the text of the current page into one string per session, each starting at its marker.
            string segments = PdfTextExtractor.GetTextFromPage(pdfReader, p + 1);
            List<string> sessionList = new List<string>();
            while (segments.Length > 0)
            {
                int start = segments.IndexOf(marker);
                if (start == -1)
                {
                    // No marker on this page: the text belongs to a session already collected earlier.
                    break;
                }

                // Clamp the search offset: IndexOf throws when the start index is past the end of the string.
                int next = segments.IndexOf(marker, Math.Min(start + markerSkip, segments.Length));
                if (next != -1)
                {
                    // Inspect up to 20 characters after the next marker. When its second space-separated
                    // token contains '#', that marker is not used as a boundary and the one after it is
                    // looked up instead (presumably a reference to a session rather than a header - confirm).
                    int probeLength = Math.Min(20, segments.Length - next);
                    string[] probe = segments.Substring(next, probeLength).Split(' ');
                    if (probe.Length > 1 && probe[1].Contains("#"))
                    {
                        next = segments.IndexOf(marker, Math.Min(next + markerSkip, segments.Length));
                    }
                }

                if (next != -1)
                {
                    sessionList.Add(segments.Substring(start, next - start));
                    segments = segments.Substring(next);
                }
                else
                {
                    sessionList.Add(segments.Substring(start));
                    segments = "";
                }
            }

            // Locate every marker on the page and pair it with the id parsed from the matching segment.
            List<Info> infoList = new List<Info>();
            int sessions = 0;
            var tes = new MyLocationTextExtractionStrategy();
            PdfTextExtractor.GetTextFromPage(pdfReader, p + 1, tes);
            for (int chunk = 0; chunk < tes.myPoints.Count; chunk++)
            {
                if (!tes.myPoints[chunk].Text.Contains(marker))
                {
                    continue;
                }
                if (chunk + 1 < tes.myPoints.Count && tes.myPoints[chunk + 1].Text.Contains("#"))
                {
                    continue;
                }

                Info info;
                info.sessionId = "NA";
                info.rect = tes.myPoints[chunk].Rect;

                // The text split and the chunk scan can disagree on the marker count; keep "NA" then.
                if (sessions < sessionList.Count)
                {
                    string[] tokens = sessionList[sessions].Split(' ');
                    for (int t = 0; t < tokens.Length; t++)
                    {
                        if (tokens[t].Contains(marker))
                        {
                            // The id is the all-digit token right after the marker token.
                            if (t + 1 < tokens.Length && digitsOnly.IsMatch(tokens[t + 1].Trim()))
                            {
                                info.sessionId = tokens[t + 1].Trim();
                            }
                            break;
                        }
                    }
                }

                infoList.Add(info);
                sessions++;
            }

            if (infoList.Count == 0)
            {
                Console.WriteLine("Fatal: PDF resolution did not allow to recover word rendering locations");
                return;
            }

            if (recoveryEnabled)
            {
                // Look for an anchor: a readable id that is consistent either with an earlier readable id
                // on this page (id difference equals index difference) or with the last id of the
                // previous page (distance equals position on the page + 1).
                int indexFound = -1;
                int s = 0;
                for (; s < infoList.Count; s++)
                {
                    if (indexFound != -1)
                    {
                        if (!infoList[s].sessionId.Contains("NA"))
                        {
                            if (int.Parse(infoList[indexFound].sessionId) - int.Parse(infoList[s].sessionId) == indexFound - s)
                            {
                                break;
                            }
                        }
                    }
                    if (!infoList[s].sessionId.Contains("NA"))
                    {
                        indexFound = s;
                        if (lastValidIdFromPreviousPage != -1)
                        {
                            if (Math.Abs(lastValidIdFromPreviousPage - int.Parse(infoList[s].sessionId)) == s + 1)
                            {
                                break;
                            }
                        }
                    }
                }

                // The scan ended without a confirmed anchor: keep the candidate only if it is the
                // last entry on the page, otherwise discard it.
                if (s == infoList.Count && indexFound != s - 1)
                {
                    indexFound = -1;
                }

                if (indexFound != -1)
                {
                    int val = int.Parse(infoList[indexFound].sessionId);
                    for (int tt = 0; tt < infoList.Count; tt++)
                    {
                        Info info = infoList[tt];
                        bool unreadable = info.sessionId.Contains("NA");
                        if (unreadable && tt < indexFound)
                        {
                            // Count backwards from the anchor.
                            info.sessionId = (val - Math.Abs(indexFound - tt)).ToString();
                            infoList[tt] = info;
                        }
                        else if (unreadable && tt > indexFound)
                        {
                            // Count forwards from the anchor.
                            info.sessionId = (val + Math.Abs(indexFound - tt)).ToString();
                            infoList[tt] = info;
                        }
                        else if (!unreadable)
                        {
                            // Readable id: overwrite it when it disagrees with consecutive numbering
                            // (treated by the original author as an OCR error).
                            int expected;
                            if (lastValidIdFromPreviousPage != -1)
                            {
                                expected = lastValidIdFromPreviousPage + tt + 1;
                            }
                            else
                            {
                                // The anchor entry itself is never altered on this path, so val is current.
                                expected = val - indexFound + tt;
                            }
                            if (int.Parse(infoList[tt].sessionId) != expected)
                            {
                                info.sessionId = expected.ToString();
                                infoList[tt] = info;
                            }
                        }
                    }
                    lastValidIdFromPreviousPage = int.Parse(infoList[infoList.Count - 1].sessionId);
                }
                else
                {
                    // No anchor on this page: fall back to continuing the previous page's numbering.
                    if (lastValidIdFromPreviousPage != -1)
                    {
                        int ss = lastValidIdFromPreviousPage + 1;
                        for (int tt = 0; tt < infoList.Count; tt++)
                        {
                            Info info = infoList[tt];
                            if (info.sessionId.Contains("NA"))
                            {
                                info.sessionId = (ss + tt).ToString();
                                infoList[tt] = info;
                            }
                        }
                        if (infoList.Count - 1 >= 0)
                        {
                            lastValidIdFromPreviousPage = int.Parse(infoList[infoList.Count - 1].sessionId);
                        }
                    }
                    else
                    {
                        if (!int.TryParse(infoList[infoList.Count - 1].sessionId, out lastValidIdFromPreviousPage))
                        {
                            Console.WriteLine("Fatal: Could not recover from this error on page " + (p + 1));
                            lastValidIdFromPreviousPage = -1;
                        }
                    }
                }

                // Report and count whatever is still unreadable after recovery.
                foreach (Info info in infoList)
                {
                    if (info.sessionId.Contains("NA"))
                    {
                        Console.WriteLine("Could not recover session Id on page " + (p + 1));
                        sessionsX++;
                    }
                }
            }
            else
            {
                // No recovery requested: only count the unreadable ids.
                foreach (Info info in infoList)
                {
                    if (info.sessionId.Contains("NA"))
                    {
                        sessionsX++;
                    }
                }
            }

            // Single-page template: a copy of the current page only.
            Document doc1 = new Document();
            PdfImportedPage importedPage1;
            using (FileStream fs1 = new FileStream(outputDir + "template1.pdf", FileMode.Create))
            {
                PdfCopy copy1 = new PdfCopy(doc1, fs1);
                doc1.Open();
                importedPage1 = copy1.GetImportedPage(pdfReader, p + 1);
                copy1.AddPage(importedPage1);
                doc1.Close();
            }

            // Multi-page template: the current page through the page holding the next marker.
            Document docN = new Document();
            PdfImportedPage importedPageN = null;
            using (FileStream fsN = new FileStream(outputDir + "templateN.pdf", FileMode.Create))
            {
                PdfCopy copyN = new PdfCopy(docN, fsN);
                docN.Open();
                int pTemp = p;
                while (pTemp <= lastPage && pTemp != pdfReader.NumberOfPages)
                {
                    importedPageN = copyN.GetImportedPage(pdfReader, pTemp + 1);
                    copyN.AddPage(importedPageN);
                    pTemp++;
                }
                docN.Close();
            }

            // Position of the first marker on lastPage, i.e. where a session that spills over ends.
            // The default rectangle is built with the page's Top and Bottom swapped (original author's
            // note: "swapping top"); it is used when lastPage holds no marker.
            Info infoEnd;
            infoEnd.sessionId = "Last";
            Rectangle r = pdfReader.GetPageSize(lastPage + 1);
            infoEnd.rect = new Rectangle(0, r.Top, 0, r.Bottom);
            tes = new MyLocationTextExtractionStrategy();
            PdfTextExtractor.GetTextFromPage(pdfReader, lastPage + 1, tes);
            for (int chunk = 0; chunk < tes.myPoints.Count; chunk++)
            {
                if (tes.myPoints[chunk].Text.Contains(marker))
                {
                    infoEnd.rect = tes.myPoints[chunk].Rect;
                    break;
                }
            }

            // Write one cropped PDF per session found on this page.
            for (int i = 0; i < infoList.Count; i++)
            {
                string sessionId = infoList[i].sessionId;
                string outputFilename = sessionId.Trim();
                if (File.Exists(outputDir + outputFilename + ".pdf"))
                {
                    outputFilename += "-" + (global + 1).ToString("D4");
                }
                global++;
                outputFilename += ".pdf";

                bool hasNextOnPage = i + 1 < infoList.Count;
                bool isFinalPage = p + 1 == pdfReader.NumberOfPages;
                // Only the last session of a non-final page can continue onto the following page(s).
                bool continuesOnLaterPages = !hasNextOnPage && !isFinalPage;
                string templatePath = outputDir + (continuesOnLaterPages ? "templateN.pdf" : "template1.pdf");

                PdfReader pdfReader2 = new PdfReader(templatePath);
                try
                {
                    using (FileStream fs = new FileStream(outputDir + outputFilename, FileMode.Create))
                    {
                        PdfStamper stamper = new PdfStamper(pdfReader2, fs);

                        // First page: crop from this marker to the next marker, or to the page
                        // origin when this is the last session on the page.
                        PdfDictionary page = pdfReader2.GetPageN(1);
                        if (hasNextOnPage)
                        {
                            page.Put(PdfName.CROPBOX, new PdfArray(new float[] { 0, infoList[i + 1].rect.Top + gapV, importedPage1.Width, infoList[i].rect.Top + gapV }));
                        }
                        else
                        {
                            page.Put(PdfName.CROPBOX, new PdfArray(new float[] { 0, 0, importedPageN.Width, infoList[i].rect.Top + gapV }));
                        }
                        stamper.MarkUsed(page);

                        if (continuesOnLaterPages)
                        {
                            if (lastPage - p > 1)
                            {
                                // Session spans more than two pages: intermediate pages are kept whole,
                                // the final one is cropped at the next session's marker.
                                int newP = 1;
                                int max = lastPage - p;
                                while (newP <= max)
                                {
                                    page = pdfReader2.GetPageN(newP + 1);
                                    if (newP != max)
                                    {
                                        page.Put(PdfName.CROPBOX, new PdfArray(new float[] { 0, 0, importedPageN.Width, importedPageN.Height }));
                                    }
                                    else
                                    {
                                        page.Put(PdfName.CROPBOX, new PdfArray(new float[] { 0, infoEnd.rect.Top + gapV, importedPageN.Width, importedPageN.Height }));
                                    }
                                    stamper.MarkUsed(page);
                                    newP++;
                                }
                                // Skip the fully consumed pages; the outer loop's p++ lands on lastPage.
                                p = lastPage - 1;
                            }
                            else
                            {
                                page = pdfReader2.GetPageN(2);
                                page.Put(PdfName.CROPBOX, new PdfArray(new float[] { 0, infoEnd.rect.Top + gapV, importedPageN.Width, importedPageN.Height }));
                                stamper.MarkUsed(page);
                            }
                        }

                        // Indexer instead of Add: Add throws if a "Keywords" entry already exists.
                        Dictionary<string, string> meta = pdfReader2.Info;
                        meta["Keywords"] = sessionId;
                        stamper.MoreInfo = meta;
                        stamper.Close();
                    }
                }
                finally
                {
                    pdfReader2.Close();
                }
            }
        }

        Console.WriteLine("Sessions extracted: " + global);
        // Guard the ratio so an empty run reports 0.00% instead of NaN.
        float undetected = global == 0 ? 0f : ((float)sessionsX / (float)global) * 100;
        if (recoveryEnabled)
        {
            Console.WriteLine("Session ID recovery was attempted. Percentage of undetected Session IDs after OCR reconstruction was " + undetected.ToString("0.00") + "%");
        }
        else
        {
            Console.WriteLine("No Session ID recovery was attempted. Percentage of undetected Session IDs with no In/Out Digits was " + undetected.ToString("0.00") + "%");
        }
        Console.WriteLine();
    }
    finally
    {
        // Release the source document on every exit path, including the early return above.
        pdfReader.Close();
    }
}