/// <summary>
/// Reads the PDF at <c>PdfPath</c>, classifies the text found in the page-number area of every
/// supported page, and writes the collected page information to <c>ExportFilePath</c> through
/// <c>WriteDocInfoToTextFile</c>.
/// </summary>
/// <returns>
/// An "Invalid export file path" message when the export path is rejected, the exception message
/// when processing throws, otherwise the document summary followed by the elapsed time and the
/// logger's error count.
/// </returns>
public String ExportData()
{
    DocInfo docInfo = new DocInfo();

    try
    {
        // This early return stays inside the try block so that AfterProcessing() still runs.
        if (!ExportFilePath.isFilePathOK(".txt"))
        {
            return ("Invalid export file path: " + ExportFilePath);
        }

        BeforeProcessing();

        using (var pdfReader = new PdfReader(PdfPath))
        {
            // Used for the image check on pages whose page-number area could not be classified.
            var parser = new PdfReaderContentParser(pdfReader);

            // Check whether page 1 carries "for official use only" at the bottom.
            // Invariant casing keeps the marker comparison independent of the current culture.
            string officialText = ExtractRegionText(pdfReader, 1, MakeRectangle(70, 1, 375, 120));
            bool hasOfficialUse = officialText.ToUpperInvariant().Contains("FOROFFICIALUSEONLY");

            for (Int32 currentPage = 1; currentPage <= pdfReader.NumberOfPages; currentPage++)
            {
                PageInfo currentPageInfo = new PageInfo() { PageNum = currentPage };

                var pageSize = pdfReader.GetPageSize(currentPage);
                float height = pageSize.Height;
                float width = pageSize.Width;

                // Only two page formats are supported; each has its own page-number rectangle.
                ITextExtractionStrategy rectangleStrategy;
                if (height > 785 && height < 802 && width > 1215 && width < 1230)
                {
                    rectangleStrategy = MakeRectangle(450, 1, 450, 70);
                }
                else if (height > 785 && height < 802 && width > 608 && width < 617)
                {
                    rectangleStrategy = MakeRectangle(190, 1, 255, 74);
                }
                else
                {
                    // Unsupported pages are logged and left out of docInfo entirely.
                    myLogger.Log("Page # " + currentPage.ToString() + " not 8.5 x 11 or 11 x 17");
                    continue;
                }

                string currentText = ExtractRegionText(pdfReader, currentPage, rectangleStrategy);
                if (hasOfficialUse)
                {
                    currentText = OfficialUseRegex.Replace(currentText, "").Trim();
                }

                string workPackageIndexText = ExtractRegionText(pdfReader, currentPage, MakeRectangle(60, 600, 160, 50));
                if (workPackageIndexText.ToUpperInvariant().Contains("WORKPACKAGEINDEX"))
                {
                    currentPageInfo.HasWpIndex = true;
                }

                ClassifyPageNumberText(pdfReader, parser, currentPage, currentPageInfo, currentText);

                // Checked after classification: the page in flight when cancellation is noticed
                // is not added to docInfo.
                if (Bw.CancellationPending)
                {
                    myLogger.Log("Processing cancelled at dwg #: " + currentPage.ToString());
                    break;
                }

                Bw.ReportProgress(Utils.GetPercentage(currentPage, pdfReader.NumberOfPages));
                docInfo.Pages.Add(currentPageInfo);
            }
        }

        WriteDocInfoToTextFile(docInfo);
    }
    catch (System.Exception se)
    {
        // Failures are reported to the caller as the returned message rather than rethrown.
        return (se.Message);
    }
    finally
    {
        AfterProcessing();
    }

    return (String.Concat(
        docInfo.ToString(),
        Environment.NewLine,
        "Processing completed in ",
        timer.Elapsed.TotalSeconds.PrintTimeFromSeconds(),
        Environment.NewLine,
        myLogger.ErrorCount.ToString(),
        " errors found."));
}

/// <summary>
/// Extracts the text selected by <paramref name="strategy"/> from one page and passes it through
/// the same Encoding.Default to UTF-8 round trip that every extraction in this class uses.
/// </summary>
private static string ExtractRegionText(PdfReader pdfReader, int pageNumber, ITextExtractionStrategy strategy)
{
    string text = PdfTextExtractor.GetTextFromPage(pdfReader, pageNumber, strategy);

    // NOTE(review): characters Encoding.Default cannot represent are presumably replaced by this
    // round trip; kept as-is because the page-number patterns were written against its output.
    byte[] defaultBytes = Encoding.Default.GetBytes(text);
    byte[] utf8Bytes = Encoding.Convert(Encoding.Default, Encoding.UTF8, defaultBytes);
    return Encoding.UTF8.GetString(utf8Bytes);
}

/// <summary>
/// Sets the page-number text and the classification flags on <paramref name="pageInfo"/> from the
/// text found in the page-number area. Patterns are tried in order: number-dash-number,
/// number-dash-number/blank, roman numeral, single letter, then the empty/blank/index/misc fallbacks.
/// </summary>
private void ClassifyPageNumberText(PdfReader pdfReader, PdfReaderContentParser parser, int pageNumber, PageInfo pageInfo, string currentText)
{
    // #-#
    var wpMatch = NumDashNumRegex.Match(currentText);
    if (wpMatch.Success)
    {
        pageInfo.PageNumText = wpMatch.Value.Trim();
        pageInfo.IsWP = true;
        return;
    }

    // #-#/blank
    var dashBlankMatch = NumDashNumBlankRegex.Match(currentText);
    if (dashBlankMatch.Success)
    {
        pageInfo.PageNumText = dashBlankMatch.Value.Trim();
        pageInfo.IsDashBlank = true;
        pageInfo.IsWP = true;
        return;
    }

    // Roman numerals are matched against the upper-cased text; invariant casing keeps "i" -> "I"
    // regardless of the current culture.
    var romanMatch = romanNumRegex.Match(currentText.ToUpperInvariant().Trim());
    if (romanMatch.Success)
    {
        string numeral = romanMatch.Value.Trim();
        if (numeral == "C" || numeral == "D")
        {
            // A lone C or D is treated as a letter page, not as the numeral 100 or 500.
            pageInfo.PageNumText = numeral.ToLowerInvariant();
            pageInfo.IsLetter = true;
        }
        else
        {
            pageInfo.PageNumText = numeral;
            pageInfo.IsRoman = true;
        }
        return;
    }

    // The letter and index patterns are tested and extracted on the same trimmed string, so a
    // successful test can never be followed by an empty extraction.
    string trimmedText = currentText.Trim();

    var letterMatch = LetterRegex.Match(trimmedText);
    if (letterMatch.Success)
    {
        pageInfo.PageNumText = letterMatch.Value.Trim();
        pageInfo.IsLetter = true;
        return;
    }

    // Check if the whole page is empty: no images and no text anywhere on the page.
    var listener = new ImageRenderListener();
    parser.ProcessContent(pageNumber, listener);
    string wholePageText = ExtractRegionText(pdfReader, pageNumber, MakeRectangle(1, 1, 1000000, 1000000));

    if ((listener.Images.Count <= 0) && String.IsNullOrWhiteSpace(wholePageText))
    {
        pageInfo.IsWholePageEmpty = true;
        pageInfo.IsPageNumAreaBlank = true;
        return;
    }

    if (String.IsNullOrWhiteSpace(currentText))
    {
        pageInfo.IsPageNumAreaBlank = true;
        return;
    }

    var indexMatch = indexRegex.Match(trimmedText);
    if (indexMatch.Success)
    {
        pageInfo.PageNumText = indexMatch.Value.Trim();
        pageInfo.IsIndex = true;
        return;
    }

    // Anything else is kept verbatim and flagged as miscellaneous.
    pageInfo.PageNumText = currentText;
    pageInfo.IsMisc = true;
}
/// <summary>
/// Writes one tab-separated row per page of <paramref name="docInfo"/> to the file at
/// <c>ExportFilePath</c>, inserting a page-count line whenever the work package number changes
/// and finishing with the total number of work packages.
/// </summary>
/// <param name="docInfo">Document whose <c>Pages</c> are written in order.</param>
private void WriteDocInfoToTextFile(DocInfo docInfo)
{
    Regex dashRegex = new Regex("–");
    Regex blankRegex = new Regex(@"/blank$");

    int wpCount = 0;
    int currentWpCount = 0;
    string currentWorkPackageNumber = String.Empty;

    using (StreamWriter sw = new StreamWriter(ExportFilePath))
    {
        sw.WriteLine("PDF Page #\tWork Package\t\tFull Page #");
        sw.WriteLine();

        foreach (PageInfo currentPageInfo in docInfo.Pages)
        {
            // The null test must come first; a missing worker is treated like a cancellation.
            if (Bw == null || Bw.CancellationPending)
            {
                myLogger.Log("Processing cancelled while exporting PDF information to text file");
                break;
            }

            string wpIndexSuffix = currentPageInfo.HasWpIndex ? " - WP INDEX" : "";

            // Work package pages, including "#-#/blank" pages.
            if (currentPageInfo.IsWP || currentPageInfo.IsDashBlank)
            {
                // The "/blank" suffix is stripped only for pages flagged IsDashBlank without IsWP;
                // pages flagged IsWP are split as they are.
                string numberText = currentPageInfo.IsWP
                    ? currentPageInfo.PageNumText
                    : blankRegex.Replace(currentPageInfo.PageNumText, " ");
                string workPackageNumber = dashRegex.Split(numberText)[0];

                if (workPackageNumber == currentWorkPackageNumber)
                {
                    currentWpCount++;
                }
                else
                {
                    // Close the previous work package before starting the next one.
                    if (currentWorkPackageNumber != String.Empty)
                    {
                        sw.WriteLine("WP page count for WP " + currentWorkPackageNumber + ": " + currentWpCount);
                        sw.WriteLine();
                    }

                    wpCount++;
                    currentWorkPackageNumber = workPackageNumber;
                    currentWpCount = 1;
                }

                sw.WriteLine(currentPageInfo.PageNum + "\t\t\t" + workPackageNumber + "\t\t" + currentPageInfo.PageNumText + wpIndexSuffix);
                continue;
            }

            // Every other page type shares one row layout and differs only in its label.
            string pageLabel = null;
            if (currentPageInfo.IsRoman)
            {
                int convertedRoman = RomanToNumber(currentPageInfo.PageNumText.Trim().ToUpperInvariant());
                pageLabel = currentPageInfo.PageNumText + " (" + convertedRoman + ")";
            }
            else if (currentPageInfo.IsLetter)
            {
                pageLabel = currentPageInfo.PageNumText;
            }
            else if (currentPageInfo.IsMisc)
            {
                // Unrecognised text is cut to 15 characters for the report.
                pageLabel = currentPageInfo.PageNumText.Length > 15
                    ? currentPageInfo.PageNumText.Substring(0, 15)
                    : currentPageInfo.PageNumText;
            }
            else if (currentPageInfo.IsWholePageEmpty)
            {
                // Tested before IsPageNumAreaBlank: empty pages carry both flags, and the more
                // specific label should win.
                pageLabel = "*Empty Page*";
            }
            else if (currentPageInfo.IsPageNumAreaBlank)
            {
                pageLabel = "*No Page #*";
            }
            else if (currentPageInfo.IsIndex)
            {
                pageLabel = currentPageInfo.PageNumText;
            }

            // Pages with no classification flag set produce no row.
            if (pageLabel != null)
            {
                sw.WriteLine(currentPageInfo.PageNum + "\t\t\t\t\t" + pageLabel + wpIndexSuffix);
            }
        }

        // Close the last work package, if any page belonged to one.
        if (currentWorkPackageNumber != String.Empty)
        {
            sw.WriteLine("WP page count for WP " + currentWorkPackageNumber + ": " + currentWpCount);
            sw.WriteLine();
        }

        sw.WriteLine("Number of total Work Packages: " + wpCount);
    }
}