コード例 #1
0
ファイル: Program.cs プロジェクト: SofianeCh/Pdf_parser
        /// <summary>
        /// Extracts the text of one PDF page and appends it, line by line, to a text file.
        /// </summary>
        /// <remarks>
        /// A line is treated as a title when it is non-empty, shorter than 54 characters,
        /// does not end with a punctuation character and does not start with a digit.
        /// Titles are echoed to the console and written with a "Title - - - - - " prefix
        /// followed by an extra blank line; every other line is written unchanged.
        /// A dashed separator line is appended after the page.
        /// </remarks>
        /// <param name="reader">Open reader for the PDF file.</param>
        /// <param name="page">Number of the page whose text is extracted.</param>
        /// <param name="its">Extraction strategy handed to <c>PdfTextExtractor.GetTextFromPage</c>.</param>
        /// <param name="outPath">Path of the text file; it is opened in append mode.</param>
        private static void WriteInfile(PdfReader reader, int page, ITextExtractionStrategy its, string outPath)
        {
            string strText = PdfTextExtractor.GetTextFromPage(reader, page, its);

            // NOTE(review): this round trip through Encoding.Default is kept because the title
            // heuristic below sees its output, but it can only lose characters that the default
            // code page cannot represent - confirm whether it can be dropped.
            strText = Encoding.UTF8.GetString(Encoding.Convert(Encoding.Default, Encoding.UTF8, Encoding.Default.GetBytes(strText)));

            // Open the output once for the whole page instead of re-opening it for every line.
            using (System.IO.StreamWriter file = new System.IO.StreamWriter(outPath, true))
            {
                foreach (string line in strText.Split('\n'))
                {
                    // The emptiness check prevents indexing before the start of an empty line,
                    // which previously threw an IndexOutOfRangeException.
                    bool isTitle = line.Length > 0
                                   && line.Length < 54
                                   && !char.IsPunctuation(line[line.Length - 1])
                                   && !char.IsDigit(line[0]);

                    if (isTitle)
                    {
                        // The number printed is the line length plus one, as in the original output.
                        Console.WriteLine("TITLE = " + line + "  " + (line.Length + 1));
                        file.Write("Title - - - - - ");
                        file.WriteLine(line + "\n");
                    }
                    else
                    {
                        file.WriteLine(line);
                    }
                }

                file.WriteLine("- - - - - - - - - - - - - - - - - - - - - - - - - - - ");
            }
        }
コード例 #2
0
        /// <summary>
        /// Attaches two region-filtered location strategies to one <c>FilteredEventListener</c>,
        /// processes page 1 of "test.pdf" glyph by glyph and checks the text collected per region.
        /// </summary>
        public virtual void TestWithMultiFilteredRenderListener()
        {
            PdfDocument document = new PdfDocument(new PdfReader(sourceFolder + "test.pdf"));
            FilteredEventListener multiListener = new FilteredEventListener();

            // First region: expected to yield "Your".
            ITextExtractionStrategy firstRegionText = multiListener.AttachEventListener(
                new LocationTextExtractionStrategy(),
                new TextRegionEventFilter(new Rectangle(122f, 678.9f, 22f, 12f)));

            // Second region: expected to yield "dju".
            ITextExtractionStrategy secondRegionText = multiListener.AttachEventListener(
                new LocationTextExtractionStrategy(),
                new TextRegionEventFilter(new Rectangle(156f, 678.9f, 13f, 12f)));

            PdfCanvasProcessor processor = new PdfCanvasProcessor(new GlyphEventListener(multiListener));
            processor.ProcessPageContent(document.GetPage(1));

            NUnit.Framework.Assert.AreEqual("Your", firstRegionText.GetResultantText());
            NUnit.Framework.Assert.AreEqual("dju", secondRegionText.GetResultantText());
        }
コード例 #3
0
        /// <summary>
        /// Runs the content of <paramref name="page"/> through <paramref name="strategy"/>
        /// and returns the text the strategy collected.
        /// </summary>
        /// <param name="page">the page whose content is processed</param>
        /// <param name="strategy">the strategy that receives the page's events and produces the text</param>
        /// <returns>the strategy's resultant text after the page has been processed</returns>
        public static String GetTextFromPage(PdfPage page, ITextExtractionStrategy strategy)
        {
            new PdfCanvasProcessor(strategy).ProcessPageContent(page);

            String extracted = strategy.GetResultantText();
            return extracted;
        }
コード例 #4
0
            /// <summary>
            /// Forwards text render events to a per-MCID extraction strategy, creating the
            /// strategy on first use. Events of any other type, and text whose MCID is -1,
            /// are ignored.
            /// </summary>
            /// <param name="data">the event payload; cast to <c>TextRenderInfo</c> for text render events</param>
            /// <param name="type">the kind of event being reported</param>
            public virtual void EventOccurred(IEventData data, EventType type)
            {
                if (type != EventType.RENDER_TEXT)
                {
                    return;
                }

                TextRenderInfo renderInfo = (TextRenderInfo)data;
                int markedContentId = renderInfo.GetMcid();
                if (markedContentId == -1)
                {
                    return;
                }

                // Look up the strategy for this MCID, registering a new one if none exists yet.
                ITextExtractionStrategy strategyForMcid = this.contentByMcid.Get(markedContentId);
                if (strategyForMcid == null)
                {
                    strategyForMcid = new LocationTextExtractionStrategy();
                    this.contentByMcid.Put(markedContentId, strategyForMcid);
                }

                strategyForMcid.EventOccurred(data, type);
            }
コード例 #5
0
        /// <summary>
        /// Attaches two region-filtered location strategies to one <c>MultiFilteredRenderListener</c>,
        /// processes page 1 of "test.pdf" glyph by glyph and checks the text collected per region.
        /// </summary>
        virtual public void TestWithMultiFilteredRenderListener()
        {
            PdfReader reader = TestResourceUtils.GetResourceAsPdfReader(TEST_RESOURCES_PATH, "test.pdf");
            PdfReaderContentParser contentParser = new PdfReaderContentParser(reader);
            MultiFilteredRenderListener multiListener = new MultiFilteredRenderListener();

            // Both regions share the same pair of vertical coordinates.
            float firstY  = 841.9f - 151;
            float secondY = 841.9f - 163;

            // First region: expected to yield "Your".
            ITextExtractionStrategy firstRegionText = multiListener.AttachRenderListener(
                new LocationTextExtractionStrategy(),
                new RegionTextRenderFilter(new Rectangle(122f, firstY, 144f, secondY)));

            // Second region: expected to yield "dju".
            ITextExtractionStrategy secondRegionText = multiListener.AttachRenderListener(
                new LocationTextExtractionStrategy(),
                new RegionTextRenderFilter(new Rectangle(156f, firstY, 169f, secondY)));

            contentParser.ProcessContent(1, new GlyphRenderListener(multiListener));

            Assert.AreEqual("Your", firstRegionText.GetResultantText());
            Assert.AreEqual("dju", secondRegionText.GetResultantText());
        }
コード例 #6
0
        /// <summary>
        /// Extracts the text of every page of a PDF file into an in-memory, UTF-8 encoded stream.
        /// </summary>
        /// <remarks>
        /// Each page's text is followed by a line of asterisks. The returned stream is
        /// positioned at its beginning and is owned by the caller.
        /// </remarks>
        /// <param name="filename">Path of the PDF file to read.</param>
        /// <param name="textExtractionStrategy">
        /// Strategy handed to <c>PdfTextExtractor.GetTextFromPage</c> for every page.
        /// NOTE(review): the same instance is reused for all pages; whether it carries text
        /// over from one page to the next depends on the strategy - confirm with callers.
        /// </param>
        /// <returns>A readable <see cref="MemoryStream"/> containing the extracted text.</returns>
        /// <exception cref="FileNotFoundException">Thrown when <paramref name="filename"/> does not exist.</exception>
        public static MemoryStream ExtractPdfText(string filename, ITextExtractionStrategy textExtractionStrategy)
        {
            if (!File.Exists(filename))
            {
                throw new FileNotFoundException("File: [" + filename + "] does not exist.");
            }

            var textStream = new MemoryStream();

            // leaveOpen = true: disposing the writer flushes it but keeps textStream usable.
            using (var output = new StreamWriter(textStream, Encoding.UTF8, 1024, true))
            using (var pdfReader = new PdfReader(filename))
            {
                for (var page = 1; page <= pdfReader.NumberOfPages; page++)
                {
                    // The extracted string is written as-is. The former round trip through
                    // Encoding.Default could only replace characters missing from the default
                    // code page with '?'; the writer already encodes the output as UTF-8.
                    string text = PdfTextExtractor.GetTextFromPage(pdfReader, page, textExtractionStrategy);

                    output.WriteLine(text);
                    output.WriteLine("*********************************************************************************************");
                }
            }

            textStream.Seek(0, SeekOrigin.Begin);
            return textStream;
        }
コード例 #7
0
        /// <summary>
        /// Runs the content of <paramref name="page"/> through <paramref name="strategy"/>,
        /// using the supplied additional content operators, and returns the collected text.
        /// </summary>
        /// <param name="page">the page whose content is processed</param>
        /// <param name="strategy">the strategy that receives the page's events and produces the text</param>
        /// <param name="additionalContentOperators">
        /// map of custom <see cref="IContentOperator"/> instances that is passed on to the
        /// <c>PdfCanvasProcessor</c> constructor
        /// </param>
        /// <returns>the strategy's resultant text after the page has been processed</returns>
        public static String GetTextFromPage(PdfPage page, ITextExtractionStrategy strategy, IDictionary <String, IContentOperator
                                                                                                          > additionalContentOperators)
        {
            new PdfCanvasProcessor(strategy, additionalContentOperators).ProcessPageContent(page);

            String extracted = strategy.GetResultantText();
            return extracted;
        }
コード例 #8
0
 /// <summary>
 /// Opens the given PDF file and prepares a simple text extraction strategy for it.
 /// </summary>
 /// <param name="filename">Path of the PDF file to open.</param>
 /// <exception cref="FileNotFoundException">Thrown when <paramref name="filename"/> does not exist.</exception>
 public PdfSearcher(string filename)
 {
     // Reject a missing file up front so the happy path stays unindented.
     if (!File.Exists(filename))
     {
         throw new FileNotFoundException($"Did not find {filename}");
     }

     Pdf = new PdfReader(filename);
     PdfDocument = new PdfDocument(Pdf);
     textExtractionStrategy = new SimpleTextExtractionStrategy();
 }
コード例 #9
0
ファイル: Common.cs プロジェクト: BaccusQin/BoxUpload
        /// <summary>
        /// Stores the rectangle coordinates and source file name, opens the PDF and builds a
        /// location-based extraction strategy filtered to the given rectangle.
        /// </summary>
        /// <param name="x1">First rectangle argument, also stored on the instance.</param>
        /// <param name="y1">Second rectangle argument, also stored on the instance.</param>
        /// <param name="x2">Third rectangle argument, also stored on the instance.</param>
        /// <param name="y2">Fourth rectangle argument, also stored on the instance.</param>
        /// <param name="sourceFileName">Path of the PDF file to open.</param>
        public SetRectangle(int x1, int y1, int x2, int y2, string sourceFileName)
        {
            // Remember the raw inputs.
            this.x1 = x1;
            this.y1 = y1;
            this.x2 = x2;
            this.y2 = y2;
            sn = sourceFileName;

            // Open the document.
            pdfReader = new PdfReader(sourceFileName);
            pdfDoc = new PdfDocument(pdfReader);

            // Only text inside the rectangle reaches the location strategy.
            strategy = new FilteredTextEventListener(
                new LocationTextExtractionStrategy(),
                new TextRegionEventFilter(new Rectangle(x1, y1, x2, y2)));
        }
コード例 #10
0
ファイル: Routines.cs プロジェクト: fsol/Statement-Reader
        /// <summary>
        /// Extracts the text of every page of a PDF file into an in-memory, UTF-8 encoded stream.
        /// </summary>
        /// <remarks>
        /// Each page's text is followed by a line of asterisks. The returned stream is
        /// positioned at its beginning and is owned by the caller.
        /// </remarks>
        /// <param name="filename">Path of the PDF file to read.</param>
        /// <param name="textExtractionStrategy">
        /// Strategy handed to <c>PdfTextExtractor.GetTextFromPage</c> for every page.
        /// NOTE(review): the same instance is reused for all pages; whether it carries text
        /// over from one page to the next depends on the strategy - confirm with callers.
        /// </param>
        /// <returns>A readable <see cref="MemoryStream"/> containing the extracted text.</returns>
        /// <exception cref="FileNotFoundException">Thrown when <paramref name="filename"/> does not exist.</exception>
        public static MemoryStream ExtractPdfText(string filename, ITextExtractionStrategy textExtractionStrategy)
        {
            if (!File.Exists(filename))
            {
                throw new FileNotFoundException("File: [" + filename + "] does not exist.");
            }

            var textStream = new MemoryStream();

            // leaveOpen = true: disposing the writer flushes it but keeps textStream usable.
            using (var output = new StreamWriter(textStream, Encoding.UTF8, 1024, true))
            using (var pdfReader = new PdfReader(filename))
            {
                for (var page = 1; page <= pdfReader.NumberOfPages; page++)
                {
                    // The extracted string is written as-is. The former round trip through
                    // Encoding.Default could only replace characters missing from the default
                    // code page with '?'; the writer already encodes the output as UTF-8.
                    string text = PdfTextExtractor.GetTextFromPage(pdfReader, page, textExtractionStrategy);

                    output.WriteLine(text);
                    output.WriteLine("*********************************************************************************************");
                }
            }

            textStream.Seek(0, SeekOrigin.Begin);
            return textStream;
        }
コード例 #11
0
 /// <summary>
 /// Creates the listener around an extraction strategy; the strategy is passed to the
 /// base constructor and also kept in this class's own <c>deleg</c> field.
 /// </summary>
 /// <param name="deleg">the extraction strategy to wrap</param>
 public GlyphTextRenderListener(ITextExtractionStrategy deleg)
     : base(deleg)
 {
     this.deleg = deleg;
 }
コード例 #12
0
        /// <summary>
        /// Extracts text from one page of a reader using an extraction strategy and a set of
        /// additional content operators.
        /// </summary>
        /// <param name="reader">the reader to extract text from</param>
        /// <param name="pageNumber">the page to extract text from</param>
        /// <param name="strategy">the strategy to use for extracting text</param>
        /// <param name="additionalContentOperators">dictionary of custom IContentOperators passed on to the content parser</param>
        /// <returns>the resultant text of the strategy returned by the content parser</returns>
        public static String GetTextFromPage(PdfReader reader, int pageNumber, ITextExtractionStrategy strategy, IDictionary <string, IContentOperator> additionalContentOperators)
        {
            PdfReaderContentParser contentParser = new PdfReaderContentParser(reader);
            var processed = contentParser.ProcessContent(pageNumber, strategy, additionalContentOperators);

            return processed.GetResultantText();
        }
コード例 #13
0
 /// <summary>
 /// Creates the strategy and stores the given extraction strategy in the
 /// <c>textextractionstrategy</c> member.
 /// </summary>
 /// <param name="strategy">the extraction strategy to store</param>
 public LimitedTextStrategy2(ITextExtractionStrategy strategy)
 {
     textextractionstrategy = strategy;
 }
コード例 #14
0
 /// <summary>
 /// Constructs a <see cref="GlyphTextEventListener"/> around an
 /// <see cref="ITextExtractionStrategy"/> delegate, to which the expanded text events
 /// for each glyph will be passed on.
 /// </summary>
 /// <param name="delegate_">delegate to pass the expanded glyph render events to.</param>
 public GlyphTextEventListener(ITextExtractionStrategy delegate_) : base(delegate_)
 {
     // Nothing to do here: the base constructor receives the delegate.
 }
コード例 #15
0
 /// <summary>
 /// Creates the listener around an extraction strategy and a set of filters; both are
 /// passed to the base constructor and the strategy is also kept in this class's own
 /// <c>deleg</c> field.
 /// </summary>
 /// <param name="deleg">the extraction strategy that will receive filtered text operations</param>
 /// <param name="filters">the filter(s) to apply</param>
 public FilteredTextRenderListener(ITextExtractionStrategy deleg, RenderFilter[] filters) : base(deleg, filters)
 {
     this.deleg = deleg;
 }
コード例 #16
0
 /// <summary>
 /// Constructs a <see cref="FilteredTextEventListener"/> around an
 /// <see cref="ITextExtractionStrategy"/> delegate.
 /// </summary>
 /// <param name="delegate_">a delegate that will be called when all the corresponding filters for an event pass</param>
 /// <param name="filterSet">filters attached to the delegate that will be tested before passing an event on to the delegate</param>
 public FilteredTextEventListener(ITextExtractionStrategy delegate_, params IEventFilter[] filterSet) : base(delegate_, filterSet)
 {
     // Nothing to do here: the base constructor receives the delegate and its filters.
 }
コード例 #17
0
ファイル: City.cs プロジェクト: cykb518hu/Scraper
        /// <summary>
        /// Reads a PDF or Word document page by page into <c>doc.DocBodyDic</c>
        /// (page number mapped to page text).
        /// </summary>
        /// <remarks>
        /// The branch is chosen by whether the lower-cased path contains "pdf" or "doc".
        /// PDF pages are read with a <c>SimpleTextExtractionStrategy</c>, falling back to
        /// <c>PdfTextExtractor.GetTextFromPage</c> when that yields no text. Word documents
        /// are read through Word automation with up to two attempts. Failures are reported
        /// on the console and through the return value; no exception leaves this method.
        /// </remarks>
        /// <param name="pdfFile">Path of the file to read (PDF or Word).</param>
        /// <param name="doc">Document whose <c>DocBodyDic</c> receives the page texts.</param>
        /// <param name="pages">Set to the page count for PDF input; not modified for Word input.</param>
        /// <returns><c>true</c> when the file was read without error, otherwise <c>false</c>.</returns>
        public bool ReadPdf(string pdfFile, ref Documents doc, ref int pages)
        {
            bool success = false;

            try
            {
                if (pdfFile.ToLower().Contains("pdf"))
                {
                    char[] trimChars = { '\r', '\n', '\t', (char)32, (char)160 };
                    PdfReader r = new PdfReader(pdfFile);

                    try
                    {
                        pages = r.NumberOfPages;
                        PdfReaderContentParser parser = new PdfReaderContentParser(r);

                        for (int i = 1; i <= pages; i++)
                        {
                            ITextExtractionStrategy st = parser.ProcessContent <SimpleTextExtractionStrategy>(i, new SimpleTextExtractionStrategy());
                            string text = st.GetResultantText().Trim(trimChars);

                            if (string.IsNullOrEmpty(text))
                            {
                                // Fall back to the default extractor when the simple strategy finds nothing.
                                text = PdfTextExtractor.GetTextFromPage(r, i).Trim(trimChars);
                            }

                            if (!string.IsNullOrEmpty(text))
                            {
                                doc.DocBodyDic.Add(i, text);
                            }
                        }
                    }
                    finally
                    {
                        // Close the reader even when extraction throws, so the file handle is released.
                        r.Close();
                    }

                    success = true;
                }
                else if (pdfFile.ToLower().Contains("doc"))
                {
                    MsWord.Application newApp = null;
                    MsWord.Document    msdoc  = null;

                    try
                    {
                        int retry = 2;
                        while (retry > 0)
                        {
                            try
                            {
                                // Every attempt starts a fresh Word instance; the previous one
                                // (if any) was shut down in the finally block below.
                                newApp = new MsWord.Application();
                                System.Threading.Thread.Sleep(1000);
                                msdoc = newApp.Documents.Open(pdfFile);
                                System.Threading.Thread.Sleep(1000);
                                object             nothing = Missing.Value;
                                MsWord.WdStatistic stat    = MsWord.WdStatistic.wdStatisticPages;
                                int num = msdoc.ComputeStatistics(stat, ref nothing);

                                for (int i = 1; i <= num; i++)
                                {
                                    // Pages stored by an earlier (failed) attempt are not read again.
                                    if (doc.DocBodyDic.ContainsKey(i))
                                    {
                                        continue;
                                    }

                                    object objWhat  = MsWord.WdGoToItem.wdGoToPage;
                                    object objWhich = MsWord.WdGoToDirection.wdGoToAbsolute;

                                    object       objPage = (object)i;
                                    MsWord.Range range1  = msdoc.GoTo(ref objWhat, ref objWhich, ref objPage, ref nothing);
                                    MsWord.Range range2  = range1.GoToNext(MsWord.WdGoToItem.wdGoToPage);

                                    object objStart = range1.Start;
                                    object objEnd   = range2.Start;
                                    if (range1.Start == range2.Start)
                                    {
                                        // No following page: the page runs to the end of the document.
                                        objEnd = msdoc.Characters.Count;
                                    }

                                    Console.ForegroundColor = ConsoleColor.Red;
                                    Console.WriteLine("DEBUG: Path: {0}, {1}-{2}........", pdfFile, objStart, objEnd);
                                    Console.ResetColor();

                                    if ((int)objStart <= (int)objEnd)
                                    {
                                        string innerText = msdoc.Range(ref objStart, ref objEnd).Text;
                                        doc.DocBodyDic.Add(i, innerText);
                                    }
                                }

                                success = true;
                                break;
                            }
                            catch (Exception ex)
                            {
                                Console.ForegroundColor = ConsoleColor.Red;
                                Console.WriteLine("Retry to read word {0}, Exception: {1}..", pdfFile, ex.ToString());
                                Console.ResetColor();
                                System.Threading.Thread.Sleep(1000);
                                retry--;
                            }
                            finally
                            {
                                if (newApp != null)
                                {
                                    newApp.NormalTemplate.Saved = true;

                                    if (msdoc != null)
                                    {
                                        msdoc.Close(false);
                                    }

                                    newApp.Quit();
                                }

                                // Word has been shut down: drop the stale references so a retry
                                // neither reuses the quit instance nor closes the document twice.
                                msdoc  = null;
                                newApp = null;
                            }
                        }
                    }
                    catch (Exception e)
                    {
                        // Reached when the cleanup itself fails; report it instead of hiding it.
                        Console.ForegroundColor = ConsoleColor.Red;
                        Console.WriteLine("Failed to read word {0}, Exception: {1}..", pdfFile, e.ToString());
                        Console.ResetColor();
                    }
                }
            }
            catch (Exception ex)
            {
                // Still best effort (the caller only gets false), but the cause is no longer silent.
                Console.ForegroundColor = ConsoleColor.Red;
                Console.WriteLine("Failed to read {0}, Exception: {1}..", pdfFile, ex.ToString());
                Console.ResetColor();
            }

            return(success);
        }
コード例 #18
0
        /// <summary>
        /// Extracts text from one page of a reader using an extraction strategy.
        /// </summary>
        /// <param name="reader">the reader to extract text from</param>
        /// <param name="pageNumber">the page to extract text from</param>
        /// <param name="strategy">the strategy to use for extracting text</param>
        /// <returns>the resultant text of the strategy returned by the content parser</returns>
        public static String GetTextFromPage(PdfReader reader, int pageNumber, ITextExtractionStrategy strategy)
        {
            PdfReaderContentParser contentParser = new PdfReaderContentParser(reader);
            var processed = contentParser.ProcessContent(pageNumber, strategy);

            return processed.GetResultantText();
        }
コード例 #19
0
 /// <summary>
 /// Extracts text from one page of a reader using an extraction strategy.
 /// </summary>
 /// <param name="reader">the reader to extract text from</param>
 /// <param name="pageNumber">the page to extract text from</param>
 /// <param name="strategy">the strategy to use for extracting text</param>
 /// <returns>the resultant text of the strategy returned by the content parser</returns>
 public static String GetTextFromPage(PdfReader reader, int pageNumber, ITextExtractionStrategy strategy)
 {
     var processed = new PdfReaderContentParser(reader).ProcessContent(pageNumber, strategy);

     return processed.GetResultantText();
 }
コード例 #20
0
        /// <summary>
        /// Entry point: validates the command-line options and splits the input PDF into
        /// separate documents whenever the text captured by the key regex changes between pages.
        /// </summary>
        /// <param name="args">Command-line arguments, parsed into <c>Options</c>.</param>
        /// <returns>
        /// 0 on success, 1 when the input file does not exist, 2 when it is not a ".pdf" file,
        /// 3 when the output directory does not exist, 4 when no split type was selected.
        /// </returns>
        static int Main(string[] args)
        {
            #region var declaration
            bool   verbose          = false;
            bool   datedFileNames   = false;
            bool   showRegexMatches = false;
            string inputFilename    = "";
            bool   inputFileExists;
            string extension;
            bool   isPDF            = false;
            string outputDirname    = "";
            bool   outputDirExsists = false;

            uint splitType = 0x0;

            string keySplitRegex = "";
            #endregion

            #region arg intake
            // intake args and load values in to scope
            var result = Parser.Default.ParseArguments <Options>(args);

            result.WithParsed <Options>(o =>
            {
                inputFilename = o.inputFilename;

                outputDirname = o.outputDirname;

                if (o.verbose)
                {
                    verbose = true;
                }

                if (o.datedFileNames)
                {
                    datedFileNames = true;
                }

                if (o.showRegexMatches)
                {
                    showRegexMatches = true;
                }

                // A non-empty key regex selects the "key" split type (bit 0x1).
                if (!String.IsNullOrEmpty(o.keySplitRegex))
                {
                    splitType    |= 0x1;
                    keySplitRegex = o.keySplitRegex;
                }
            });

            #endregion

            #region verify input

            // input file location
            if (verbose)
            {
                Console.WriteLine("Input File:\t" + inputFilename);
            }
            inputFileExists = File.Exists(inputFilename);
            if (verbose)
            {
                Console.WriteLine(inputFileExists ? "File exists:\tTrue" : "File exists:\tFalse");
            }
            if (!inputFileExists)
            {
                if (verbose)
                {
                    Console.WriteLine("Input File does not exsist; Exiting with error code 1.");
                }
                #if DEBUG
                Console.ReadKey();
                #endif
                return(1);
            }

            // input file format
            extension = System.IO.Path.GetExtension(inputFilename).ToLower();
            if (verbose)
            {
                Console.WriteLine("File format:\t" + extension);
            }
            isPDF = string.Equals(extension, ".pdf");
            if (verbose)
            {
                Console.WriteLine(isPDF ? "Correct Format:\tTrue" : "Correct Format:\tFalse");
            }
            if (!isPDF)
            {
                if (verbose)
                {
                    Console.WriteLine("Input File is not a PDF; Exiting with error code 2.");
                }
                #if DEBUG
                Console.ReadKey();
                #endif
                return(2);
            }

            // output directory existence
            if (verbose)
            {
                Console.WriteLine("Output to:\t" + outputDirname);
            }
            outputDirExsists = Directory.Exists(outputDirname);
            if (verbose)
            {
                Console.WriteLine(outputDirExsists ? "Output valid:\tTrue" : "Output valid:\tFalse");
            }
            if (!outputDirExsists)
            {
                if (verbose)
                {
                    Console.WriteLine("Output dir does not exsist; Exiting with error code 3.");
                }
                #if DEBUG
                Console.ReadKey();
                #endif
                return(3);
            }


            #endregion

            // Split
            switch (splitType)
            {
            case 0x1:     // key match
                if (verbose)
                {
                    Console.WriteLine("split type:\tKey");
                }

                if (verbose)
                {
                    Console.WriteLine("Key regex:\t" + keySplitRegex);
                }

                Regex regex = new Regex(keySplitRegex, RegexOptions.Compiled | RegexOptions.Multiline);

                PdfReader reader = new PdfReader(inputFilename);

                try
                {
                    PdfReaderContentParser parser = new PdfReaderContentParser(reader);

                    string regexKeyMatch = "";   // key of the document currently being collected
                    int    docPageStart  = 1;    // first page of the document currently being collected
                    string newDocName    = "";   // output file name of that document

                    for (int page = 1; page <= reader.NumberOfPages; page++)
                    {
                        if (showRegexMatches)
                        {
                            Console.WriteLine("Page: " + page);
                        }

                        ITextExtractionStrategy strategy = parser.ProcessContent
                                                               (page, new SimpleTextExtractionStrategy());

                        int matchCount = 0;

                        // NOTE(review): only the first match on a page is inspected. The original
                        // code called NextMatch() without looping - confirm whether every match
                        // was meant to be processed.
                        Match match = regex.Match(strategy.GetResultantText());

                        if (showRegexMatches)
                        {
                            Console.WriteLine("Match: " + (++matchCount));
                        }
                        for (int x = 1; x <= 2; x++)
                        {
                            Group group = match.Groups[x];
                            if (showRegexMatches)
                            {
                                Console.WriteLine("Group " + x + " = '" + group + "'");
                            }
                            CaptureCollection cc = group.Captures;
                            for (int y = 0; y < cc.Count; y++)
                            {
                                Capture capture = cc[y];

                                string captureS = capture.ToString();

                                if (!string.Equals(captureS, regexKeyMatch))
                                {
                                    // if not first instance print last doc
                                    if (page > 1)
                                    {
                                        ExtractPages(inputFilename, outputDirname + newDocName, docPageStart, (page - 1));
                                    }

                                    // reset the count
                                    regexKeyMatch = captureS;
                                    if (datedFileNames)
                                    {
                                        newDocName = DateTime.Now.ToString("yyyyMMdd") + "_" + captureS + ".pdf";
                                    }
                                    else
                                    {
                                        newDocName = captureS + ".pdf";
                                    }

                                    docPageStart = page;

                                    if (verbose)
                                    {
                                        System.Console.WriteLine("New document at page:\t" + docPageStart);
                                    }
                                }

                                if (showRegexMatches)
                                {
                                    System.Console.WriteLine("Capture " + y + " = '" + capture + "', Position=" + capture.Index);
                                }
                            }
                        }
                    }

                    // A document is only written when the next key shows up, so the last one is
                    // still pending once all pages have been visited: write it now.
                    if (newDocName.Length > 0)
                    {
                        ExtractPages(inputFilename, outputDirname + newDocName, docPageStart, reader.NumberOfPages);
                    }
                }
                finally
                {
                    // Release the input file even when extraction or splitting throws.
                    reader.Close();
                }

                break;

            default:
                if (verbose)
                {
                    Console.WriteLine("No valid split type selected; Exiting with error code 4.");
                }
                    #if DEBUG
                Console.ReadKey();
                    #endif
                return(4);
            }


            #if DEBUG
            Console.ReadKey();
            #endif

            return(0);
        }
コード例 #21
0
 /// <summary>
 /// Creates the listener around an extraction strategy and any number of filters; both
 /// are passed to the base constructor and the strategy is also kept in this class's own
 /// <c>deleg</c> field.
 /// </summary>
 /// <param name="deleg">the extraction strategy that will receive filtered text operations</param>
 /// <param name="filters">the filter(s) to apply</param>
 public FilteredTextRenderListener(ITextExtractionStrategy deleg, params RenderFilter[] filters)
     : base(deleg, filters)
 {
     this.deleg = deleg;
 }
コード例 #22
0
        public String ExportData()
        {
            //Document variables

            DocInfo docInfo = new DocInfo();

            System.Boolean hasOfficialUse = false;
            string         officialText;

            try
            {
                if (!ExportFilePath.isFilePathOK(".txt"))
                {
                    return("Invalid export file path: " + ExportFilePath);
                }

                BeforeProcessing();

                using (var pdfReader = new PdfReader(PdfPath))
                {
                    // For image checking
                    var parser = new PdfReaderContentParser(pdfReader);
                    ImageRenderListener listener = null;

                    // Check to see if doc has "for official use only" at the bottom
                    ITextExtractionStrategy officialTextRectangle = MakeRectangle(70, 1, 375, 120);
                    officialText = PdfTextExtractor.GetTextFromPage(pdfReader, 1, officialTextRectangle);
                    officialText = Encoding.UTF8.GetString(Encoding.Convert(Encoding.Default, Encoding.UTF8, Encoding.Default.GetBytes(officialText)));

                    if (officialText.ToString().ToUpper().Contains("FOROFFICIALUSEONLY"))
                    {
                        hasOfficialUse = true;
                    }
                    else
                    {
                        hasOfficialUse = false;
                    }

                    // Loop through each page of the PDF
                    for (Int32 currentPage = 1; currentPage <= pdfReader.NumberOfPages; currentPage++)
                    {
                        PageInfo currentPageInfo = new PageInfo()
                        {
                            PageNum = currentPage
                        };

                        ITextExtractionStrategy rectangleStrategy;

                        float height = pdfReader.GetPageSize(currentPage).Height;
                        float width  = pdfReader.GetPageSize(currentPage).Width;

                        if (height > 785 && height < 802 && width > 1215 && width < 1230)
                        {
                            rectangleStrategy = MakeRectangle(450, 1, 450, 70);
                        }
                        else if (height > 785 && height < 802 && width > 608 && width < 617)
                        {
                            rectangleStrategy = MakeRectangle(190, 1, 255, 74);
                        }
                        else
                        {
                            myLogger.Log("Page # " + currentPage.ToString() + " not 8.5 x 11 or 11 x 17");
                            continue;
                        }

                        string currentText = PdfTextExtractor.GetTextFromPage(pdfReader, currentPage, rectangleStrategy);
                        currentText = Encoding.UTF8.GetString(Encoding.Convert(Encoding.Default, Encoding.UTF8, Encoding.Default.GetBytes(currentText)));

                        if (hasOfficialUse)
                        {
                            currentText = OfficialUseRegex.Replace(currentText, "").Trim();
                        }

                        ITextExtractionStrategy workPackageIndexStrategy = MakeRectangle(60, 600, 160, 50);
                        string WPI = PdfTextExtractor.GetTextFromPage(pdfReader, currentPage, workPackageIndexStrategy);
                        WPI = Encoding.UTF8.GetString(Encoding.Convert(Encoding.Default, Encoding.UTF8, Encoding.Default.GetBytes(WPI)));

                        if (WPI.ToUpper().Contains("WORKPACKAGEINDEX"))
                        {
                            currentPageInfo.HasWpIndex = true;
                        }

                        // #-#
                        if (NumDashNumRegex.IsMatch(currentText))
                        {
                            currentPageInfo.PageNumText = NumDashNumRegex.Match(currentText).Value.Trim();
                            currentPageInfo.IsWP        = true;
                        }
                        else
                        {
                            // #-#/blank
                            if (NumDashNumBlankRegex.IsMatch(currentText))
                            {
                                currentPageInfo.PageNumText = NumDashNumBlankRegex.Match(currentText).Value.Trim();
                                currentPageInfo.IsDashBlank = true;
                                currentPageInfo.IsWP        = true;
                            }
                            else
                            {
                                if (romanNumRegex.IsMatch(currentText.ToUpper().Trim()))
                                {
                                    currentPageInfo.PageNumText = romanNumRegex.Match(currentText.ToUpper().Trim()).Value.Trim();

                                    if (String.Equals(currentPageInfo.PageNumText.ToUpper(), "C") || String.Equals(currentPageInfo.PageNumText.ToUpper(), "D"))
                                    {
                                        currentPageInfo.PageNumText = currentPageInfo.PageNumText.ToLower();
                                        currentPageInfo.IsLetter    = true;
                                    }
                                    else
                                    {
                                        currentPageInfo.IsRoman = true;
                                    }
                                }
                                else
                                {
                                    if (LetterRegex.IsMatch(currentText.Trim()))
                                    {
                                        currentPageInfo.PageNumText = LetterRegex.Match(currentText).Value.Trim();
                                        currentPageInfo.IsLetter    = true;
                                    }
                                    else
                                    {
                                        // Check if whole page is empty
                                        parser.ProcessContent(currentPage, (listener = new ImageRenderListener()));

                                        ITextExtractionStrategy currentTextRectangle = MakeRectangle(1, 1, 1000000, 1000000);

                                        String checkText = PdfTextExtractor.GetTextFromPage(pdfReader, currentPage, currentTextRectangle);
                                        checkText = Encoding.UTF8.GetString(Encoding.Convert(Encoding.Default, Encoding.UTF8, Encoding.Default.GetBytes(checkText)));

                                        if ((listener.Images.Count <= 0) && String.IsNullOrWhiteSpace(checkText))
                                        {
                                            currentPageInfo.IsWholePageEmpty   = true;
                                            currentPageInfo.IsPageNumAreaBlank = true;
                                        }
                                        else
                                        {
                                            if (String.IsNullOrWhiteSpace(currentText))
                                            {
                                                currentPageInfo.IsPageNumAreaBlank = true;
                                            }
                                            else
                                            {
                                                if (indexRegex.IsMatch(currentText.Trim()))
                                                {
                                                    currentPageInfo.PageNumText = indexRegex.Match(currentText).Value.Trim();
                                                    currentPageInfo.IsIndex     = true;
                                                }
                                                else
                                                {
                                                    currentPageInfo.PageNumText = currentText;
                                                    currentPageInfo.IsMisc      = true;
                                                }
                                            }
                                        }
                                    }
                                }
                            }
                        }

                        if (Bw.CancellationPending)
                        {
                            myLogger.Log("Processing cancelled at dwg #: " + currentPage.ToString());
                            break;
                        }

                        Bw.ReportProgress(Utils.GetPercentage(currentPage, pdfReader.NumberOfPages));

                        docInfo.Pages.Add(currentPageInfo);
                    }
                }

                WriteDocInfoToTextFile(docInfo);
            }
            catch (System.Exception se)
            {
                return(se.Message);
            }
            finally
            {
                AfterProcessing();
            }

            return(String.Concat(docInfo.ToString(),
                                 Environment.NewLine,
                                 "Processing completed in ",
                                 timer.Elapsed.TotalSeconds.PrintTimeFromSeconds(),
                                 Environment.NewLine,
                                 myLogger.ErrorCount.ToString(),
                                 " errors found."));

            //return String.Concat(
            //        docInfo.NumSheets,
            //        "Processing completed in ",
            //        timer.Elapsed.TotalSeconds.PrintTimeFromSeconds(),
            //        " with ",
            //        myLogger.ErrorCount,
            //        " errors.");
        }
コード例 #23
0
 /// <summary>
 /// Extracts the text of a single page with the supplied extraction strategy.
 /// </summary>
 /// <remarks>
 /// Convenience overload: forwards to the three-argument
 /// <c>GetTextFromPage</c>, passing a newly created, empty
 /// <c>Dictionary&lt;String, IContentOperator&gt;</c> as the third argument.
 /// </remarks>
 /// <param name="page">the page whose text is extracted</param>
 /// <param name="strategy">the extraction strategy applied to the page</param>
 /// <returns>the text produced by the three-argument overload</returns>
 public static String GetTextFromPage(PdfPage page, ITextExtractionStrategy strategy)
 {
     // Keep the concrete Dictionary type so the same overload is selected.
     Dictionary<String, IContentOperator> noExtraOperators = new Dictionary<String, IContentOperator>();

     return GetTextFromPage(page, strategy, noExtraOperators);
 }
コード例 #24
0
 /// <summary>
 /// Creates a listener around the given extraction strategy.
 /// </summary>
 /// <remarks>
 /// The strategy is handed to the base constructor and is also stored in
 /// this class's <c>deleg</c> member.
 /// </remarks>
 /// <param name="deleg">the extraction strategy passed to the base class and kept by this instance</param>
 public GlyphTextRenderListener(ITextExtractionStrategy deleg)
     : base(deleg) => this.deleg = deleg;