Example #1
0
        /// <summary>
        /// Walks every page of the PDF at <c>PdfPath</c>, classifies the text found in each page's
        /// page-number area, and hands the collected <c>DocInfo</c> to <c>WriteDocInfoToTextFile</c>.
        /// </summary>
        /// <returns>
        /// On success, the <c>DocInfo</c> text followed by the elapsed time and the logger's error count.
        /// If <c>ExportFilePath</c> fails validation, an "Invalid export file path" message.
        /// If anything throws, the exception's message.
        /// </returns>
        public String ExportData()
        {
            DocInfo docInfo = new DocInfo();

            try
            {
                if (!ExportFilePath.isFilePathOK(".txt"))
                {
                    return("Invalid export file path: " + ExportFilePath);
                }

                BeforeProcessing();

                using (var pdfReader = new PdfReader(PdfPath))
                {
                    // Used later to look for images when deciding whether a page is completely empty.
                    var parser = new PdfReaderContentParser(pdfReader);

                    // Check to see if doc has "for official use only" at the bottom of page 1.
                    // Invariant casing: a culture-sensitive ToUpper() maps 'i' to a dotted capital
                    // under Turkish-style cultures and the ASCII comparison would never match.
                    ITextExtractionStrategy officialTextRectangle = MakeRectangle(70, 1, 375, 120);
                    string officialText = RoundTripThroughDefaultEncoding(
                        PdfTextExtractor.GetTextFromPage(pdfReader, 1, officialTextRectangle));
                    bool hasOfficialUse = officialText.ToUpperInvariant().Contains("FOROFFICIALUSEONLY");

                    Int32 pageCount = pdfReader.NumberOfPages;

                    // Loop through each page of the PDF
                    for (Int32 currentPage = 1; currentPage <= pageCount; currentPage++)
                    {
                        // Checked first so that pages skipped below cannot starve the cancel request.
                        if (Bw.CancellationPending)
                        {
                            myLogger.Log("Processing cancelled at dwg #: " + currentPage.ToString());
                            break;
                        }

                        var   pageSize = pdfReader.GetPageSize(currentPage);
                        float height   = pageSize.Height;
                        float width    = pageSize.Width;

                        ITextExtractionStrategy rectangleStrategy;

                        if (height > 785 && height < 802 && width > 1215 && width < 1230)
                        {
                            // Page is 11 x 17
                            rectangleStrategy = MakeRectangle(450, 1, 450, 70);
                        }
                        else if (height > 785 && height < 802 && width > 608 && width < 617)
                        {
                            // Page is 8.5 x 11
                            rectangleStrategy = MakeRectangle(190, 1, 255, 74);
                        }
                        else
                        {
                            myLogger.Log("Page # " + currentPage.ToString() + " not 8.5 x 11 or 11 x 17");
                            // Skipped pages still count towards progress.
                            Bw.ReportProgress(Utils.GetPercentage(currentPage, pageCount));
                            continue;
                        }

                        PageInfo currentPageInfo = new PageInfo()
                        {
                            PageNum = currentPage
                        };

                        string currentText = RoundTripThroughDefaultEncoding(
                            PdfTextExtractor.GetTextFromPage(pdfReader, currentPage, rectangleStrategy));

                        if (hasOfficialUse)
                        {
                            currentText = OfficialUseRegex.Replace(currentText, "").Trim();
                        }

                        ITextExtractionStrategy workPackageIndexStrategy = MakeRectangle(60, 600, 160, 50);
                        string wpiText = RoundTripThroughDefaultEncoding(
                            PdfTextExtractor.GetTextFromPage(pdfReader, currentPage, workPackageIndexStrategy));

                        if (wpiText.ToUpperInvariant().Contains("WORKPACKAGEINDEX"))
                        {
                            currentPageInfo.HasWpIndex = true;
                        }

                        ClassifyPageNumberArea(currentPageInfo, currentText, pdfReader, parser, currentPage);

                        Bw.ReportProgress(Utils.GetPercentage(currentPage, pageCount));

                        docInfo.Pages.Add(currentPageInfo);
                    }
                }

                WriteDocInfoToTextFile(docInfo);
            }
            catch (System.Exception se)
            {
                // Failures are reported to the caller as the returned text, not as an exception.
                return(se.Message);
            }
            finally
            {
                AfterProcessing();
            }

            return(String.Concat(docInfo.ToString(),
                                 Environment.NewLine,
                                 "Processing completed in ",
                                 timer.Elapsed.TotalSeconds.PrintTimeFromSeconds(),
                                 Environment.NewLine,
                                 myLogger.ErrorCount.ToString(),
                                 " errors found."));
        }

        /// <summary>
        /// Re-encodes extracted text through <c>Encoding.Default</c> and back to a UTF-8 decoded string.
        /// </summary>
        /// <remarks>
        /// NOTE(review): where <c>Encoding.Default</c> is a single-byte code page this replaces
        /// characters the code page cannot represent; confirm that is still wanted.
        /// </remarks>
        private static string RoundTripThroughDefaultEncoding(string text)
        {
            byte[] defaultBytes = Encoding.Default.GetBytes(text);
            byte[] utf8Bytes    = Encoding.Convert(Encoding.Default, Encoding.UTF8, defaultBytes);
            return Encoding.UTF8.GetString(utf8Bytes);
        }

        /// <summary>
        /// Sets the page-number flags and <c>PageNumText</c> on <paramref name="pageInfo"/> from the
        /// text extracted out of the page-number area. The first matching category wins, in this
        /// order: #-#, #-#/blank, roman numeral (or the letters c/d), letter, empty page,
        /// blank page-number area, index, miscellaneous.
        /// </summary>
        /// <param name="pageInfo">Page record to fill in.</param>
        /// <param name="areaText">Text extracted from the page-number area.</param>
        /// <param name="pdfReader">Reader used to pull the full-page text for the empty-page check.</param>
        /// <param name="parser">Content parser used to look for images on the page.</param>
        /// <param name="pageNumber">1-based page number inside <paramref name="pdfReader"/>.</param>
        private void ClassifyPageNumberArea(PageInfo pageInfo, string areaText, PdfReader pdfReader, PdfReaderContentParser parser, Int32 pageNumber)
        {
            // #-#
            var dashMatch = NumDashNumRegex.Match(areaText);
            if (dashMatch.Success)
            {
                pageInfo.PageNumText = dashMatch.Value.Trim();
                pageInfo.IsWP        = true;
                return;
            }

            // #-#/blank
            var dashBlankMatch = NumDashNumBlankRegex.Match(areaText);
            if (dashBlankMatch.Success)
            {
                pageInfo.PageNumText = dashBlankMatch.Value.Trim();
                pageInfo.IsDashBlank = true;
                pageInfo.IsWP        = true;
                return;
            }

            // Every remaining pattern is tested against, and extracted from, the same trimmed
            // string, so an anchored pattern cannot pass the test and then yield an empty value.
            string trimmedText = areaText.Trim();

            // Roman numerals are matched against upper-cased text; invariant casing keeps
            // 'i' -> 'I' regardless of the current culture.
            var romanMatch = romanNumRegex.Match(areaText.ToUpperInvariant().Trim());
            if (romanMatch.Success)
            {
                string numeral = romanMatch.Value.Trim();

                // "C" and "D" also satisfy the roman pattern but are recorded as letter pages.
                if (String.Equals(numeral, "C", StringComparison.Ordinal) || String.Equals(numeral, "D", StringComparison.Ordinal))
                {
                    pageInfo.PageNumText = numeral.ToLowerInvariant();
                    pageInfo.IsLetter    = true;
                }
                else
                {
                    pageInfo.PageNumText = numeral;
                    pageInfo.IsRoman     = true;
                }
                return;
            }

            var letterMatch = LetterRegex.Match(trimmedText);
            if (letterMatch.Success)
            {
                pageInfo.PageNumText = letterMatch.Value.Trim();
                pageInfo.IsLetter    = true;
                return;
            }

            // Check if whole page is empty: no images and no text anywhere on the page.
            var imageListener = new ImageRenderListener();
            parser.ProcessContent(pageNumber, imageListener);

            ITextExtractionStrategy wholePageStrategy = MakeRectangle(1, 1, 1000000, 1000000);
            String wholePageText = RoundTripThroughDefaultEncoding(
                PdfTextExtractor.GetTextFromPage(pdfReader, pageNumber, wholePageStrategy));

            if ((imageListener.Images.Count <= 0) && String.IsNullOrWhiteSpace(wholePageText))
            {
                pageInfo.IsWholePageEmpty   = true;
                pageInfo.IsPageNumAreaBlank = true;
                return;
            }

            if (String.IsNullOrWhiteSpace(areaText))
            {
                pageInfo.IsPageNumAreaBlank = true;
                return;
            }

            var indexMatch = indexRegex.Match(trimmedText);
            if (indexMatch.Success)
            {
                pageInfo.PageNumText = indexMatch.Value.Trim();
                pageInfo.IsIndex     = true;
                return;
            }

            // Nothing recognised: keep the raw text so it can be reviewed.
            pageInfo.PageNumText = areaText;
            pageInfo.IsMisc      = true;
        }
Example #2
0
        /// <summary>
        /// Splits the PDF at <c>PdfPath</c> into one file per work package, written to
        /// <c>outputFolder</c> as "&lt;work package&gt;.pdf". Consecutive pages that resolve to the
        /// same work package go into the same file. Pages that are not 8.5 x 11 or 11 x 17, or whose
        /// page field matches neither page-field pattern, are left out.
        /// </summary>
        /// <returns>
        /// On success, "&lt;total pages&gt; pages processed in ... with ... errors.".
        /// If <c>outputFolder</c> fails validation, "Invalid output folder".
        /// If anything throws, the exception's message.
        /// </returns>
        public String Split()
        {
            Int32  totalPages = 0;
            String previousWP = String.Empty;

            // State of the work-package file currently being written; all null until the first
            // page with a recognised page field is reached.
            FileStream outputStream = null;
            PdfWriter  writer       = null;
            Document   doc          = null;

            try
            {
                if (!outputFolder.isDirectoryPathOK())
                {
                    return("Invalid output folder");
                }

                BeforeProcessing();

                using (PdfReader pdfReader = new PdfReader(PdfPath))
                {
                    totalPages = pdfReader.NumberOfPages;

                    for (Int32 currentPage = 1; currentPage <= totalPages; currentPage++)
                    {
                        // Update progress bar. Best effort: a missing worker or one that does not
                        // report progress must not abort the split.
                        try { Bw.ReportProgress(Utils.GetPercentage(currentPage, totalPages)); }
                        catch { }

                        // check if bw is trying to be cancelled
                        if (Bw == null || Bw.CancellationPending)
                        {
                            myLogger.Log("Processing cancelled on PDF #" + currentPage);
                            break;
                        }

                        var   pageSize = pdfReader.GetPageSize(currentPage);
                        float height   = pageSize.Height;
                        float width    = pageSize.Width;

                        ITextExtractionStrategy rectangleStrategy;

                        if (height > 785 && height < 802 && width > 1215 && width < 1230)
                        {
                            // Page is 11 x 17
                            rectangleStrategy = MakeRectangle(450, 1, 450, 70);
                        }
                        else if (height > 785 && height < 802 && width > 608 && width < 617)
                        {
                            // Page is 8.5 x 11
                            rectangleStrategy = MakeRectangle(190, 1, 255, 74);
                        }
                        else
                        {
                            myLogger.Log("Page # " + currentPage.ToString() + " not 8.5 x 11 or 11 x 17");
                            continue;
                        }

                        String currentText = PdfTextExtractor.GetTextFromPage(pdfReader, currentPage, rectangleStrategy);
                        currentText = Encoding.UTF8.GetString(Encoding.Convert(Encoding.Default, Encoding.UTF8, Encoding.Default.GetBytes(currentText)));

                        String currentWP;
                        var    fieldMatch = pageFieldRegex.Match(currentText);

                        if (fieldMatch.Success)
                        {
                            currentWP = ExtractWP(fieldMatch.Value.Trim(), false);
                        }
                        else
                        {
                            var blankFieldMatch = pageFieldBlankRegex.Match(currentText);

                            if (!blankFieldMatch.Success)
                            {
                                // This page has no Page # field/WP, so skip it
                                continue;
                            }

                            currentWP = ExtractWP(blankFieldMatch.Value.Trim(), true);
                        }

                        // "writer == null" covers the very first page even when its work package
                        // happens to equal the initial (empty) previousWP.
                        if (writer == null || !String.Equals(previousWP, currentWP))
                        {
                            // Work package changed: finish the previous file, then start a new one.
                            if (doc != null)
                            {
                                try
                                {
                                    doc.Close();
                                }
                                catch (System.Exception closeError)
                                {
                                    myLogger.Log("Could not finish " + previousWP + ".pdf: " + closeError.Message);
                                }
                            }

                            if (outputStream != null)
                            {
                                // Harmless if closing the document already closed the stream.
                                outputStream.Dispose();
                            }

                            doc          = new Document(pageSize);
                            outputStream = new FileStream(outputFolder + "\\" + currentWP + ".pdf", FileMode.Create);
                            writer       = PdfWriter.GetInstance(doc, outputStream);

                            doc.Open();
                            previousWP = currentWP;
                        }
                        else
                        {
                            // Same work package: start a new page sized like the source page so a
                            // larger sheet is not clipped to the size of the file's first page.
                            doc.SetPageSize(pageSize);
                            doc.NewPage();
                        }

                        PdfContentByte  cb         = writer.DirectContent;
                        PdfImportedPage pageImport = writer.GetImportedPage(pdfReader, currentPage);
                        cb.AddTemplate(pageImport, 0, 0);
                        writer.Flush();
                    }
                }// using PdfReader pdfReader
            }
            catch (System.Exception se)
            {
                // Failures are reported to the caller as the returned text, not as an exception.
                return(se.Message);
            }
            finally
            {
                // Best-effort cleanup: a failure here must not replace the value being returned.
                if (doc != null)
                {
                    try { doc.Close(); }
                    catch { }
                }

                if (outputStream != null)
                {
                    try { outputStream.Dispose(); }
                    catch { }
                }

                AfterProcessing();
            }

            return(String.Concat(totalPages.ToString(),
                                 " pages processed in ",
                                 timer.Elapsed.TotalSeconds.PrintTimeFromSeconds(),
                                 " with ",
                                 myLogger.ErrorCount,
                                 " errors."));
        }
Example #3
0
        /// <summary>
        /// Copies every page of the PDF at <c>PdfPath</c> whose work package is in the set returned
        /// by <c>GetWPsToExtract</c> into a single file, "ExtractedWPs.pdf", inside
        /// <c>outputFolder</c>. Pages that are not 8.5 x 11 or 11 x 17, or whose page field matches
        /// neither page-field pattern, are skipped.
        /// </summary>
        /// <returns>
        /// On success, "&lt;total pages&gt; pages processed in ... with ... errors.".
        /// If <c>WpFile</c> or <c>outputFolder</c> fails validation, "Invalid WP file" or
        /// "Invalid output folder". If anything throws, the exception's message.
        /// </returns>
        public String Extract()
        {
            Int32 totalPages = 0;

            try
            {
                if (!WpFile.isFilePathOK())
                {
                    return("Invalid WP file");
                }

                if (!outputFolder.isDirectoryPathOK())
                {
                    return("Invalid output folder");
                }

                BeforeProcessing();

                HashSet <String> WPs = GetWPsToExtract();

                using (PdfReader pdfReader = new PdfReader(PdfPath))
                {
                    using (Document doc = new Document(pdfReader.GetPageSize(1)))
                    {
                        using (FileStream outputStream = new FileStream(outputFolder + "\\ExtractedWPs.pdf", FileMode.Create))
                        {
                            PdfWriter writer = PdfWriter.GetInstance(doc, outputStream);

                            totalPages = pdfReader.NumberOfPages;

                            for (Int32 currentPage = 1; currentPage <= totalPages; currentPage++)
                            {
                                // Best effort: a missing worker or one that does not report
                                // progress must not abort the extraction.
                                try { Bw.ReportProgress(Utils.GetPercentage(currentPage, totalPages)); }
                                catch { }

                                // Null test comes first so a missing worker cannot throw here.
                                if (Bw == null || Bw.CancellationPending)
                                {
                                    myLogger.Log("Processing cancelled on PDF #" + currentPage);
                                    break;
                                }

                                var   pageSize = pdfReader.GetPageSize(currentPage);
                                float height   = pageSize.Height;
                                float width    = pageSize.Width;

                                ITextExtractionStrategy rectangleStrategy;

                                if (height > 785 && height < 802 && width > 1215 && width < 1230)
                                {
                                    // Page is 11 x 17
                                    rectangleStrategy = MakeRectangle(450, 1, 450, 70);
                                }
                                else if (height > 785 && height < 802 && width > 608 && width < 617)
                                {
                                    // Page is 8.5 x 11
                                    rectangleStrategy = MakeRectangle(190, 1, 255, 74);
                                }
                                else
                                {
                                    myLogger.Log("Page # " + currentPage.ToString() + " not 8.5 x 11 or 11 x 17");
                                    continue;
                                }

                                String currentText = PdfTextExtractor.GetTextFromPage(pdfReader, currentPage, rectangleStrategy);

                                currentText = Encoding.UTF8.GetString(Encoding.Convert(Encoding.Default, Encoding.UTF8, Encoding.Default.GetBytes(currentText)));

                                String currentWP;
                                var    fieldMatch = pageFieldRegex.Match(currentText);

                                if (fieldMatch.Success)
                                {
                                    currentWP = ExtractWP(fieldMatch.Value.Trim(), false);
                                }
                                else
                                {
                                    var blankFieldMatch = pageFieldBlankRegex.Match(currentText);

                                    if (!blankFieldMatch.Success)
                                    {
                                        // This page has no Page # field/WP, so skip it
                                        continue;
                                    }

                                    currentWP = ExtractWP(blankFieldMatch.Value.Trim(), true);
                                }

                                if (!WPs.Contains(currentWP))
                                {
                                    continue;
                                }

                                // Write this page to document. The page size is set before the
                                // page is started so each output page matches its source page
                                // instead of inheriting the size of page 1.
                                doc.SetPageSize(pageSize);

                                if (!doc.IsOpen())
                                {
                                    doc.Open();
                                }
                                doc.NewPage();

                                PdfContentByte  cb         = writer.DirectContent;
                                PdfImportedPage pageImport = writer.GetImportedPage(pdfReader, currentPage);
                                cb.AddTemplate(pageImport, 0, 0);
                                writer.Flush();
                            }

                            // Best effort: disposing the writer can throw when no page was ever
                            // written; that must not turn an empty extraction into a failure.
                            try { writer.Dispose(); } catch { }
                        } //FileStream
                    }     //Doc
                }         //PdfReader
            }
            catch (System.Exception se)
            {
                // Failures are reported to the caller as the returned text, not as an exception.
                return(se.Message);
            }
            finally
            {
                AfterProcessing();
            }

            return(String.Concat(totalPages.ToString(),
                                 " pages processed in ",
                                 timer.Elapsed.TotalSeconds.PrintTimeFromSeconds(),
                                 " with ",
                                 myLogger.ErrorCount,
                                 " errors."));
        }