Exemplo n.º 1
0
        /// <summary>
        /// Reads the PDF at <c>PdfPath</c> page by page, classifies the text found in each
        /// page's page-number area, and writes the collected information to
        /// <c>ExportFilePath</c> via <c>WriteDocInfoToTextFile</c>.
        /// </summary>
        /// <returns>
        /// On success, the document info followed by the elapsed time and the logger's error
        /// count. If the export path is rejected, an "Invalid export file path" message. If any
        /// exception is thrown while processing, that exception's message.
        /// </returns>
        /// <remarks>
        /// <c>AfterProcessing</c> runs in a <c>finally</c> block, so it is invoked on every exit
        /// path from the <c>try</c>, including the early invalid-path return.
        /// </remarks>
        public String ExportData()
        {
            DocInfo docInfo = new DocInfo();

            try
            {
                if (!ExportFilePath.isFilePathOK(".txt"))
                {
                    return("Invalid export file path: " + ExportFilePath);
                }

                BeforeProcessing();

                using (var pdfReader = new PdfReader(PdfPath))
                {
                    // Used only when a page has to be checked for images (empty-page detection).
                    var parser = new PdfReaderContentParser(pdfReader);

                    // Check whether the first page carries "for official use only" at the bottom.
                    // The comparison is ordinal/case-insensitive so it does not depend on the
                    // current culture's casing rules.
                    string officialText   = ExtractRegionText(pdfReader, 1, MakeRectangle(70, 1, 375, 120));
                    bool   hasOfficialUse = officialText.IndexOf("FOROFFICIALUSEONLY", StringComparison.OrdinalIgnoreCase) >= 0;

                    for (Int32 currentPage = 1; currentPage <= pdfReader.NumberOfPages; currentPage++)
                    {
                        // null means the page size is unsupported; the page is skipped but
                        // cancellation and progress are still handled below.
                        PageInfo currentPageInfo = ReadPageInfo(pdfReader, parser, currentPage, hasOfficialUse);

                        if (Bw.CancellationPending)
                        {
                            myLogger.Log("Processing cancelled at dwg #: " + currentPage.ToString());
                            break;
                        }

                        Bw.ReportProgress(Utils.GetPercentage(currentPage, pdfReader.NumberOfPages));

                        if (currentPageInfo != null)
                        {
                            docInfo.Pages.Add(currentPageInfo);
                        }
                    }
                }

                WriteDocInfoToTextFile(docInfo);
            }
            catch (System.Exception se)
            {
                return(se.Message);
            }
            finally
            {
                AfterProcessing();
            }

            return(String.Concat(docInfo.ToString(),
                                 Environment.NewLine,
                                 "Processing completed in ",
                                 timer.Elapsed.TotalSeconds.PrintTimeFromSeconds(),
                                 Environment.NewLine,
                                 myLogger.ErrorCount.ToString(),
                                 " errors found."));
        }

        /// <summary>
        /// Extracts the text selected by <paramref name="strategy"/> from one page and passes it
        /// through the Encoding.Default -> UTF-8 round-trip that every extraction in this class
        /// applies.
        /// </summary>
        /// <param name="pdfReader">Open reader for the document.</param>
        /// <param name="pageNumber">1-based page number.</param>
        /// <param name="strategy">Extraction strategy, as produced by <c>MakeRectangle</c>.</param>
        /// <returns>The extracted text after the encoding round-trip.</returns>
        private static string ExtractRegionText(PdfReader pdfReader, int pageNumber, ITextExtractionStrategy strategy)
        {
            string rawText = PdfTextExtractor.GetTextFromPage(pdfReader, pageNumber, strategy);

            // NOTE(review): this round-trip presumably exists to drop characters the default
            // code page cannot represent - confirm before removing. Behavior is kept as-is.
            byte[] defaultBytes = Encoding.Default.GetBytes(rawText);
            byte[] utf8Bytes    = Encoding.Convert(Encoding.Default, Encoding.UTF8, defaultBytes);

            return Encoding.UTF8.GetString(utf8Bytes);
        }

        /// <summary>
        /// Builds the <c>PageInfo</c> for a single page by classifying the text in its
        /// page-number area.
        /// </summary>
        /// <param name="pdfReader">Open reader for the document.</param>
        /// <param name="parser">Content parser used to look for images on the page.</param>
        /// <param name="pageNumber">1-based page number.</param>
        /// <param name="hasOfficialUse">
        /// When true, matches of <c>OfficialUseRegex</c> are removed from the page-number text.
        /// </param>
        /// <returns>
        /// The populated <c>PageInfo</c>, or <c>null</c> (after logging) when the page is neither
        /// of the two supported sizes.
        /// </returns>
        private PageInfo ReadPageInfo(PdfReader pdfReader, PdfReaderContentParser parser, int pageNumber, bool hasOfficialUse)
        {
            PageInfo pageInfo = new PageInfo()
            {
                PageNum = pageNumber
            };

            var   pageSize = pdfReader.GetPageSize(pageNumber);
            float height   = pageSize.Height;
            float width    = pageSize.Width;

            // Tolerance bands around the two supported sizes (per the log message below:
            // 11 x 17 and 8.5 x 11); each size uses its own page-number rectangle.
            bool heightInRange = height > 785 && height < 802;

            ITextExtractionStrategy rectangleStrategy;

            if (heightInRange && width > 1215 && width < 1230)
            {
                rectangleStrategy = MakeRectangle(450, 1, 450, 70);
            }
            else if (heightInRange && width > 608 && width < 617)
            {
                rectangleStrategy = MakeRectangle(190, 1, 255, 74);
            }
            else
            {
                myLogger.Log("Page # " + pageNumber.ToString() + " not 8.5 x 11 or 11 x 17");
                return null;
            }

            string currentText = ExtractRegionText(pdfReader, pageNumber, rectangleStrategy);

            if (hasOfficialUse)
            {
                currentText = OfficialUseRegex.Replace(currentText, "").Trim();
            }

            string wpiText = ExtractRegionText(pdfReader, pageNumber, MakeRectangle(60, 600, 160, 50));

            if (wpiText.IndexOf("WORKPACKAGEINDEX", StringComparison.OrdinalIgnoreCase) >= 0)
            {
                pageInfo.HasWpIndex = true;
            }

            string trimmedText = currentText.Trim();

            // #-#
            Match match = NumDashNumRegex.Match(currentText);
            if (match.Success)
            {
                pageInfo.PageNumText = match.Value.Trim();
                pageInfo.IsWP        = true;
                return pageInfo;
            }

            // #-#/blank
            match = NumDashNumBlankRegex.Match(currentText);
            if (match.Success)
            {
                pageInfo.PageNumText = match.Value.Trim();
                pageInfo.IsDashBlank = true;
                pageInfo.IsWP        = true;
                return pageInfo;
            }

            // Roman numerals are matched against the upper-cased text. Invariant casing keeps
            // "i" -> "I" regardless of the current culture.
            match = romanNumRegex.Match(trimmedText.ToUpperInvariant());
            if (match.Success)
            {
                string romanText = match.Value.Trim();

                // "C" and "D" satisfy the roman pattern but are recorded as letter pages.
                if (romanText == "C" || romanText == "D")
                {
                    pageInfo.PageNumText = romanText.ToLowerInvariant();
                    pageInfo.IsLetter    = true;
                }
                else
                {
                    pageInfo.PageNumText = romanText;
                    pageInfo.IsRoman     = true;
                }
                return pageInfo;
            }

            // Test and extract against the same (trimmed) string so a successful test can
            // never be followed by an empty extraction.
            match = LetterRegex.Match(trimmedText);
            if (match.Success)
            {
                pageInfo.PageNumText = match.Value.Trim();
                pageInfo.IsLetter    = true;
                return pageInfo;
            }

            // Nothing recognizable in the page-number area: check whether the whole page is
            // empty (no images and no text anywhere).
            ImageRenderListener listener = new ImageRenderListener();
            parser.ProcessContent(pageNumber, listener);

            string wholePageText = ExtractRegionText(pdfReader, pageNumber, MakeRectangle(1, 1, 1000000, 1000000));

            if ((listener.Images.Count <= 0) && String.IsNullOrWhiteSpace(wholePageText))
            {
                pageInfo.IsWholePageEmpty   = true;
                pageInfo.IsPageNumAreaBlank = true;
                return pageInfo;
            }

            if (String.IsNullOrWhiteSpace(currentText))
            {
                pageInfo.IsPageNumAreaBlank = true;
                return pageInfo;
            }

            match = indexRegex.Match(trimmedText);
            if (match.Success)
            {
                pageInfo.PageNumText = match.Value.Trim();
                pageInfo.IsIndex     = true;
            }
            else
            {
                // Unclassified text is kept verbatim.
                pageInfo.PageNumText = currentText;
                pageInfo.IsMisc      = true;
            }

            return pageInfo;
        }
Exemplo n.º 2
0
        /// <summary>
        /// Writes a tab-separated report of the pages in <paramref name="docInfo"/> to
        /// <c>ExportFilePath</c>: one row per page, a page-count subtotal each time the work
        /// package number changes, and a final total of work packages.
        /// </summary>
        /// <param name="docInfo">Document whose <c>Pages</c> are written in order.</param>
        /// <remarks>
        /// Writing stops early (the totals are still written) when <c>Bw</c> is null or has a
        /// cancellation pending. Pages with none of the classification flags set produce no row.
        /// </remarks>
        private void WriteDocInfoToTextFile(DocInfo docInfo)
        {
            Regex dashRegex  = new Regex("–");
            Regex blankRegex = new Regex(@"/blank$");

            int    currentWpCount           = 0;
            int    wpCount                  = 0;
            string currentWorkPackageNumber = String.Empty;

            using (StreamWriter sw = new StreamWriter(ExportFilePath))
            {
                sw.Write("PDF Page #\tWork Package\t\tFull Page #" + Environment.NewLine + Environment.NewLine);

                foreach (PageInfo currentPageInfo in docInfo.Pages)
                {
                    // The null test must come first; otherwise a null worker throws before
                    // it can be treated as a cancellation.
                    if (Bw == null || Bw.CancellationPending)
                    {
                        myLogger.Log("Processing cancelled while exporting PDF information to text file");
                        break;
                    }

                    string rowPrefix     = currentPageInfo.PageNum + "\t\t\t\t\t";
                    string wpIndexSuffix = currentPageInfo.HasWpIndex ? " - WP INDEX" : "";

                    // Work-package pages, including the "#-#/blank" variant. The two used to be
                    // separate, duplicated branches, the second of which was unreachable whenever
                    // IsWP was also set.
                    if (currentPageInfo.IsWP || currentPageInfo.IsDashBlank)
                    {
                        string numberSource = currentPageInfo.IsDashBlank
                            ? blankRegex.Replace(currentPageInfo.PageNumText, " ")
                            : currentPageInfo.PageNumText;

                        // The work package number is the part before the first dash.
                        string workPackageNumber = dashRegex.Split(numberSource)[0];

                        if (workPackageNumber == currentWorkPackageNumber)
                        {
                            currentWpCount++;
                        }
                        else
                        {
                            // Close out the previous work package before starting a new one.
                            if (currentWorkPackageNumber != String.Empty)
                            {
                                sw.Write("WP page count for WP " + currentWorkPackageNumber + ": " + currentWpCount + Environment.NewLine + Environment.NewLine);
                            }
                            wpCount++;
                            currentWorkPackageNumber = workPackageNumber;
                            currentWpCount           = 1;
                        }

                        sw.Write(currentPageInfo.PageNum + "\t\t\t" + workPackageNumber + "\t\t" + currentPageInfo.PageNumText + wpIndexSuffix + Environment.NewLine);
                    }
                    else if (currentPageInfo.IsRoman)
                    {
                        int convertedRoman = RomanToNumber(currentPageInfo.PageNumText.Trim().ToUpperInvariant());

                        sw.Write(rowPrefix + currentPageInfo.PageNumText + " (" + convertedRoman + ")" + wpIndexSuffix + Environment.NewLine);
                    }
                    else if (currentPageInfo.IsLetter)
                    {
                        sw.Write(rowPrefix + currentPageInfo.PageNumText + wpIndexSuffix + Environment.NewLine);
                    }
                    else if (currentPageInfo.IsMisc)
                    {
                        // Unclassified text is truncated to 15 characters to keep the row short.
                        string miscText = currentPageInfo.PageNumText.Length > 15
                            ? currentPageInfo.PageNumText.Substring(0, 15)
                            : currentPageInfo.PageNumText;

                        sw.Write(rowPrefix + miscText + wpIndexSuffix + Environment.NewLine);
                    }
                    else if (currentPageInfo.IsWholePageEmpty)
                    {
                        // Must be tested before IsPageNumAreaBlank: a wholly empty page has both
                        // flags set, and the more specific label should win.
                        sw.Write(rowPrefix + "*Empty Page*" + wpIndexSuffix + Environment.NewLine);
                    }
                    else if (currentPageInfo.IsPageNumAreaBlank)
                    {
                        sw.Write(rowPrefix + "*No Page #*" + wpIndexSuffix + Environment.NewLine);
                    }
                    else if (currentPageInfo.IsIndex)
                    {
                        sw.Write(rowPrefix + currentPageInfo.PageNumText + wpIndexSuffix + Environment.NewLine);
                    }
                }

                // Subtotal for the last work package; skipped when none was encountered.
                if (currentWorkPackageNumber != String.Empty)
                {
                    sw.Write("WP page count for WP " + currentWorkPackageNumber + ": " + currentWpCount + Environment.NewLine + Environment.NewLine);
                }
                sw.Write("Number of total Work Packages: " + wpCount + Environment.NewLine);
            }
        }