/// <summary>
/// Walks every page of the PDF at <c>PdfPath</c>, classifies what is found in the
/// page-number area of each page, and passes the collected <c>DocInfo</c> to
/// <c>WriteDocInfoToTextFile</c>.
/// </summary>
/// <remarks>
/// Pages whose size falls outside the two recognised ranges are logged and skipped.
/// When cancellation is pending the loop stops before the current page is recorded;
/// the pages gathered so far are still written out.
/// Marker and numeral comparisons are ordinal/invariant so that the result does not
/// depend on the current culture's casing rules.
/// </remarks>
/// <returns>
/// The <c>DocInfo</c> text followed by the elapsed time and logger error count on
/// success; "Invalid export file path: ..." when the export path fails validation;
/// otherwise the message of the exception that stopped processing.
/// </returns>
public String ExportData()
{
    DocInfo docInfo = new DocInfo();

    // Same clean-up the method has always applied to extracted text: a round trip
    // through Encoding.Default. Characters that encoding cannot represent do not
    // survive the trip, so this is kept as-is to preserve what the regexes see.
    Func<String, String> roundTripDefaultEncoding = text =>
        Encoding.UTF8.GetString(
            Encoding.Convert(Encoding.Default, Encoding.UTF8, Encoding.Default.GetBytes(text)));

    try
    {
        if (!ExportFilePath.isFilePathOK(".txt"))
        {
            return "Invalid export file path: " + ExportFilePath;
        }

        BeforeProcessing();

        using (var pdfReader = new PdfReader(PdfPath))
        {
            // Used only for the "is the whole page empty" image check below.
            var parser = new PdfReaderContentParser(pdfReader);

            // Check to see if the doc has "for official use only" at the bottom of page 1.
            ITextExtractionStrategy officialTextRectangle = MakeRectangle(70, 1, 375, 120);
            String officialText = roundTripDefaultEncoding(
                PdfTextExtractor.GetTextFromPage(pdfReader, 1, officialTextRectangle));
            bool hasOfficialUse =
                officialText.IndexOf("FOROFFICIALUSEONLY", StringComparison.OrdinalIgnoreCase) >= 0;

            Int32 pageCount = pdfReader.NumberOfPages;
            for (Int32 currentPage = 1; currentPage <= pageCount; currentPage++)
            {
                PageInfo currentPageInfo = new PageInfo() { PageNum = currentPage };

                // Pick the page-number rectangle from the page size.
                var pageSize = pdfReader.GetPageSize(currentPage);
                float height = pageSize.Height;
                float width = pageSize.Width;
                ITextExtractionStrategy rectangleStrategy;
                if (height > 785 && height < 802 && width > 1215 && width < 1230)
                {
                    rectangleStrategy = MakeRectangle(450, 1, 450, 70);
                }
                else if (height > 785 && height < 802 && width > 608 && width < 617)
                {
                    rectangleStrategy = MakeRectangle(190, 1, 255, 74);
                }
                else
                {
                    myLogger.Log("Page # " + currentPage.ToString() + " not 8.5 x 11 or 11 x 17");
                    continue;
                }

                String currentText = roundTripDefaultEncoding(
                    PdfTextExtractor.GetTextFromPage(pdfReader, currentPage, rectangleStrategy));
                if (hasOfficialUse)
                {
                    currentText = OfficialUseRegex.Replace(currentText, "").Trim();
                }

                ITextExtractionStrategy workPackageIndexStrategy = MakeRectangle(60, 600, 160, 50);
                String wpiText = roundTripDefaultEncoding(
                    PdfTextExtractor.GetTextFromPage(pdfReader, currentPage, workPackageIndexStrategy));
                if (wpiText.IndexOf("WORKPACKAGEINDEX", StringComparison.OrdinalIgnoreCase) >= 0)
                {
                    currentPageInfo.HasWpIndex = true;
                }

                if (!TryClassifyPageNumberText(currentPageInfo, currentText))
                {
                    // No numbering pattern matched: check if the whole page is empty
                    // (no images and no text anywhere on it).
                    ImageRenderListener listener = new ImageRenderListener();
                    parser.ProcessContent(currentPage, listener);
                    ITextExtractionStrategy wholePageRectangle = MakeRectangle(1, 1, 1000000, 1000000);
                    String wholePageText = roundTripDefaultEncoding(
                        PdfTextExtractor.GetTextFromPage(pdfReader, currentPage, wholePageRectangle));

                    if (listener.Images.Count <= 0 && String.IsNullOrWhiteSpace(wholePageText))
                    {
                        currentPageInfo.IsWholePageEmpty = true;
                        currentPageInfo.IsPageNumAreaBlank = true;
                    }
                    else if (String.IsNullOrWhiteSpace(currentText))
                    {
                        currentPageInfo.IsPageNumAreaBlank = true;
                    }
                    else
                    {
                        // Test and extract against the same (trimmed) input so a match
                        // can never produce an empty value.
                        var indexMatch = indexRegex.Match(currentText.Trim());
                        if (indexMatch.Success)
                        {
                            currentPageInfo.PageNumText = indexMatch.Value.Trim();
                            currentPageInfo.IsIndex = true;
                        }
                        else
                        {
                            currentPageInfo.PageNumText = currentText;
                            currentPageInfo.IsMisc = true;
                        }
                    }
                }

                if (Bw.CancellationPending)
                {
                    myLogger.Log("Processing cancelled at dwg #: " + currentPage.ToString());
                    break;
                }

                Bw.ReportProgress(Utils.GetPercentage(currentPage, pageCount));
                docInfo.Pages.Add(currentPageInfo);
            }
        }

        WriteDocInfoToTextFile(docInfo);
    }
    catch (System.Exception se)
    {
        return se.Message;
    }
    finally
    {
        AfterProcessing();
    }

    return String.Concat(
        docInfo.ToString(),
        Environment.NewLine,
        "Processing completed in ",
        timer.Elapsed.TotalSeconds.PrintTimeFromSeconds(),
        Environment.NewLine,
        myLogger.ErrorCount.ToString(),
        " errors found.");
}

// Fills pageInfo from the page-number text when it matches one of the numbering
// patterns (#-#, #-#/blank, roman numeral, letter), tried in that order; returns
// false when none of them matches. Each regex runs once and its value is taken
// from the same input that was tested.
private Boolean TryClassifyPageNumberText(PageInfo pageInfo, String pageNumberText)
{
    // #-#
    var match = NumDashNumRegex.Match(pageNumberText);
    if (match.Success)
    {
        pageInfo.PageNumText = match.Value.Trim();
        pageInfo.IsWP = true;
        return true;
    }

    // #-#/blank
    match = NumDashNumBlankRegex.Match(pageNumberText);
    if (match.Success)
    {
        pageInfo.PageNumText = match.Value.Trim();
        pageInfo.IsDashBlank = true;
        pageInfo.IsWP = true;
        return true;
    }

    // Roman numerals are matched against upper-cased text; invariant casing keeps
    // 'i' -> 'I' regardless of the current culture.
    match = romanNumRegex.Match(pageNumberText.ToUpperInvariant().Trim());
    if (match.Success)
    {
        String numeral = match.Value.Trim();
        if (String.Equals(numeral, "C", StringComparison.OrdinalIgnoreCase)
            || String.Equals(numeral, "D", StringComparison.OrdinalIgnoreCase))
        {
            // A lone C or D is recorded as a (lower-case) letter page, not a numeral.
            pageInfo.PageNumText = numeral.ToLowerInvariant();
            pageInfo.IsLetter = true;
        }
        else
        {
            pageInfo.PageNumText = numeral;
            pageInfo.IsRoman = true;
        }
        return true;
    }

    match = LetterRegex.Match(pageNumberText.Trim());
    if (match.Success)
    {
        pageInfo.PageNumText = match.Value.Trim();
        pageInfo.IsLetter = true;
        return true;
    }

    return false;
}
/// <summary>
/// Splits the PDF at <c>PdfPath</c> into one file per work package, written to
/// <c>outputFolder</c> as "&lt;WP&gt;.pdf". The work package of a page is derived
/// from the text in its page-number area via <c>ExtractWP</c>.
/// </summary>
/// <remarks>
/// Consecutive pages with the same work package go into the same output file; a
/// change of work package closes the current file and starts a new one. Because
/// each file is opened with <c>FileMode.Create</c>, a work package that reappears
/// later in the PDF overwrites the file written for its earlier run.
/// Pages of unrecognised size, or without a recognisable page field, are skipped.
/// No scratch file is created in the output folder.
/// </remarks>
/// <returns>
/// "N pages processed in ... with M errors." on completion; "Invalid output folder"
/// when the folder fails validation; otherwise the message of the exception that
/// stopped processing.
/// </returns>
public String Split()
{
    Int32 totalPages = 0;
    String previousWP = String.Empty;
    FileStream outputStream = null;
    PdfWriter writer = null;   // null until the first work-package file is started
    Document doc = null;

    try
    {
        if (!outputFolder.isDirectoryPathOK())
        {
            return "Invalid output folder";
        }

        BeforeProcessing();

        using (PdfReader pdfReader = new PdfReader(PdfPath))
        {
            totalPages = pdfReader.NumberOfPages;

            for (Int32 currentPage = 1; currentPage <= totalPages; currentPage++)
            {
                // Progress reporting is best-effort: a worker that is missing or cannot
                // report progress must not stop the split.
                try { Bw.ReportProgress(Utils.GetPercentage(currentPage, totalPages)); }
                catch { }

                // Check if the background worker is trying to cancel.
                if (Bw == null || Bw.CancellationPending)
                {
                    myLogger.Log("Processing cancelled on PDF #" + currentPage);
                    break;
                }

                var pageSize = pdfReader.GetPageSize(currentPage);
                float height = pageSize.Height;
                float width = pageSize.Width;
                ITextExtractionStrategy rectangleStrategy;
                if (height > 785 && height < 802 && width > 1215 && width < 1230)
                {
                    // Page is 11 x 17
                    rectangleStrategy = MakeRectangle(450, 1, 450, 70);
                }
                else if (height > 785 && height < 802 && width > 608 && width < 617)
                {
                    // Page is 8.5 x 11
                    rectangleStrategy = MakeRectangle(190, 1, 255, 74);
                }
                else
                {
                    myLogger.Log("Page # " + currentPage.ToString() + " not 8.5 x 11 or 11 x 17");
                    continue;
                }

                String currentText = PdfTextExtractor.GetTextFromPage(pdfReader, currentPage, rectangleStrategy);
                currentText = Encoding.UTF8.GetString(
                    Encoding.Convert(Encoding.Default, Encoding.UTF8, Encoding.Default.GetBytes(currentText)));

                String currentWP;
                var fieldMatch = pageFieldRegex.Match(currentText);
                if (fieldMatch.Success)
                {
                    currentWP = ExtractWP(fieldMatch.Value.Trim(), false);
                }
                else
                {
                    fieldMatch = pageFieldBlankRegex.Match(currentText);
                    if (!fieldMatch.Success)
                    {
                        // This page has no page field / WP, so skip it.
                        continue;
                    }
                    currentWP = ExtractWP(fieldMatch.Value.Trim(), true);
                }

                if (writer == null || !String.Equals(previousWP, currentWP))
                {
                    // First recognised page, or the work package changed: finish the
                    // previous file (if any) and start a new one sized for this page.
                    CloseSplitOutput(doc, outputStream);
                    doc = null;
                    outputStream = null;
                    writer = null;

                    doc = new Document(pageSize);
                    outputStream = new FileStream(outputFolder + "\\" + currentWP + ".pdf", FileMode.Create);
                    writer = PdfWriter.GetInstance(doc, outputStream);
                    doc.Open();
                    previousWP = currentWP;
                }
                else
                {
                    // Same work package: add a page, sized for the page being copied so
                    // a page of a different size is not placed on the wrong page size.
                    doc.SetPageSize(pageSize);
                    doc.NewPage();
                }

                PdfContentByte cb = writer.DirectContent;
                PdfImportedPage pageImport = writer.GetImportedPage(pdfReader, currentPage);
                cb.AddTemplate(pageImport, 0, 0);
                writer.Flush();
            }

            // Finish the last file while the source reader is still open.
            CloseSplitOutput(doc, outputStream);
            doc = null;
            outputStream = null;
        }
    }
    catch (System.Exception se)
    {
        return se.Message;
    }
    finally
    {
        // Only does work when an exception interrupted the loop.
        CloseSplitOutput(doc, outputStream);
        AfterProcessing();
    }

    return String.Concat(
        totalPages.ToString(),
        " pages processed in ",
        timer.Elapsed.TotalSeconds.PrintTimeFromSeconds(),
        " with ",
        myLogger.ErrorCount,
        " errors.");
}

// Closes one split output document and then its file stream. Either argument may be
// null. Failures are logged rather than thrown so the remaining pages can still be
// processed and the caller's cleanup can continue.
private void CloseSplitOutput(Document doc, FileStream outputStream)
{
    try
    {
        if (doc != null)
        {
            doc.Close();
        }
    }
    catch (System.Exception closeError)
    {
        myLogger.Log("Could not finish split output document: " + closeError.Message);
    }

    try
    {
        if (outputStream != null)
        {
            outputStream.Dispose();
        }
    }
    catch (System.Exception disposeError)
    {
        myLogger.Log("Could not close split output file: " + disposeError.Message);
    }
}
/// <summary>
/// Copies every page of the PDF at <c>PdfPath</c> whose work package is in the set
/// returned by <c>GetWPsToExtract</c> into a single file, "ExtractedWPs.pdf", in
/// <c>outputFolder</c>. The work package of a page is derived from the text in its
/// page-number area via <c>ExtractWP</c>.
/// </summary>
/// <remarks>
/// Pages of unrecognised size, or without a recognisable page field, are skipped.
/// The output document is created inside the output stream's scope so that it is
/// always finalised before the stream is disposed, including on error paths.
/// </remarks>
/// <returns>
/// "N pages processed in ... with M errors." on completion; "Invalid WP file" or
/// "Invalid output folder" when validation fails; otherwise the message of the
/// exception that stopped processing.
/// </returns>
public String Extract()
{
    Int32 totalPages = 0;

    try
    {
        if (!WpFile.isFilePathOK())
        {
            return "Invalid WP file";
        }
        if (!outputFolder.isDirectoryPathOK())
        {
            return "Invalid output folder";
        }

        BeforeProcessing();
        HashSet<String> WPs = GetWPsToExtract();

        using (PdfReader pdfReader = new PdfReader(PdfPath))
        using (FileStream outputStream = new FileStream(outputFolder + "\\ExtractedWPs.pdf", FileMode.Create))
        using (Document doc = new Document(pdfReader.GetPageSize(1)))
        {
            PdfWriter writer = PdfWriter.GetInstance(doc, outputStream);
            totalPages = pdfReader.NumberOfPages;

            for (Int32 currentPage = 1; currentPage <= totalPages; currentPage++)
            {
                // Progress reporting is best-effort: a worker that is missing or cannot
                // report progress must not stop the extraction.
                try { Bw.ReportProgress(Utils.GetPercentage(currentPage, totalPages)); }
                catch { }

                // Null test first, so a missing worker is handled here instead of
                // throwing a NullReferenceException.
                if (Bw == null || Bw.CancellationPending)
                {
                    myLogger.Log("Processing cancelled on PDF #" + currentPage);
                    break;
                }

                var pageSize = pdfReader.GetPageSize(currentPage);
                float height = pageSize.Height;
                float width = pageSize.Width;
                ITextExtractionStrategy rectangleStrategy;
                if (height > 785 && height < 802 && width > 1215 && width < 1230)
                {
                    // Page is 11 x 17
                    rectangleStrategy = MakeRectangle(450, 1, 450, 70);
                }
                else if (height > 785 && height < 802 && width > 608 && width < 617)
                {
                    // Page is 8.5 x 11
                    rectangleStrategy = MakeRectangle(190, 1, 255, 74);
                }
                else
                {
                    myLogger.Log("Page # " + currentPage.ToString() + " not 8.5 x 11 or 11 x 17");
                    continue;
                }

                String currentText = PdfTextExtractor.GetTextFromPage(pdfReader, currentPage, rectangleStrategy);
                currentText = Encoding.UTF8.GetString(
                    Encoding.Convert(Encoding.Default, Encoding.UTF8, Encoding.Default.GetBytes(currentText)));

                String currentWP;
                var fieldMatch = pageFieldRegex.Match(currentText);
                if (fieldMatch.Success)
                {
                    currentWP = ExtractWP(fieldMatch.Value.Trim(), false);
                }
                else
                {
                    fieldMatch = pageFieldBlankRegex.Match(currentText);
                    if (!fieldMatch.Success)
                    {
                        // This page has no Page # field/WP, so skip it.
                        continue;
                    }
                    currentWP = ExtractWP(fieldMatch.Value.Trim(), true);
                }

                if (!WPs.Contains(currentWP))
                {
                    continue;
                }

                // Write this page to the document, on a page of the same size as the
                // source page rather than always the size of page 1.
                doc.SetPageSize(pageSize);
                if (doc.IsOpen())
                {
                    doc.NewPage();
                }
                else
                {
                    doc.Open();
                }

                PdfContentByte cb = writer.DirectContent;
                PdfImportedPage pageImport = writer.GetImportedPage(pdfReader, currentPage);
                cb.AddTemplate(pageImport, 0, 0);
                writer.Flush();
            }

            // Finalise the output while the stream and the source reader are still
            // open; a failure here reaches the catch below instead of being swallowed.
            if (doc.IsOpen())
            {
                doc.Close();
            }
        }
    }
    catch (System.Exception se)
    {
        return se.Message;
    }
    finally
    {
        AfterProcessing();
    }

    return String.Concat(
        totalPages.ToString(),
        " pages processed in ",
        timer.Elapsed.TotalSeconds.PrintTimeFromSeconds(),
        " with ",
        myLogger.ErrorCount,
        " errors.");
}