public virtual void TestWithMultiFilteredRenderListener()
        {
            // Checks that two independently filtered strategies attached to one
            // FilteredEventListener each collect only the text of their own region of page 1.
            PdfDocument pdfDocument = new PdfDocument(new PdfReader(sourceFolder + "test.pdf"));

            try
            {
                FilteredEventListener listener = new FilteredEventListener();

                // Rectangle arguments are passed in the same order and with the same values as before.
                ITextExtractionStrategy region1Listener = listener.AttachEventListener(
                    new LocationTextExtractionStrategy(),
                    new TextRegionEventFilter(new Rectangle(122f, 678.9f, 22f, 12f)));

                ITextExtractionStrategy region2Listener = listener.AttachEventListener(
                    new LocationTextExtractionStrategy(),
                    new TextRegionEventFilter(new Rectangle(156f, 678.9f, 13f, 12f)));

                // The listener is wrapped in a GlyphEventListener before being handed to the processor.
                PdfCanvasProcessor parser = new PdfCanvasProcessor(new GlyphEventListener(listener));
                parser.ProcessPageContent(pdfDocument.GetPage(1));

                NUnit.Framework.Assert.AreEqual("Your", region1Listener.GetResultantText());
                NUnit.Framework.Assert.AreEqual("dju", region2Listener.GetResultantText());
            }
            finally
            {
                // Release the underlying file even when an assertion above fails.
                pdfDocument.Close();
            }
        }
        public virtual void TestWithMultiFilteredRenderListener()
        {
            // Checks that two independently filtered strategies attached to one
            // MultiFilteredRenderListener each collect only the text of their own region of page 1.
            PdfReader pdfReader = TestResourceUtils.GetResourceAsPdfReader(TEST_RESOURCES_PATH, "test.pdf");

            try
            {
                PdfReaderContentParser parser = new PdfReaderContentParser(pdfReader);
                MultiFilteredRenderListener listener = new MultiFilteredRenderListener();

                // Both regions share the same vertical band; only the horizontal bounds differ.
                float bandY1 = 841.9f - 151;
                float bandY2 = 841.9f - 163;

                ITextExtractionStrategy region1Listener = listener.AttachRenderListener(
                    new LocationTextExtractionStrategy(),
                    new RegionTextRenderFilter(new Rectangle(122f, bandY1, 144f, bandY2)));

                ITextExtractionStrategy region2Listener = listener.AttachRenderListener(
                    new LocationTextExtractionStrategy(),
                    new RegionTextRenderFilter(new Rectangle(156f, bandY1, 169f, bandY2)));

                // The listener is wrapped in a GlyphRenderListener before being handed to the parser.
                parser.ProcessContent(1, new GlyphRenderListener(listener));

                Assert.AreEqual("Your", region1Listener.GetResultantText());
                Assert.AreEqual("dju", region2Listener.GetResultantText());
            }
            finally
            {
                // Release the reader even when an assertion above fails.
                pdfReader.Close();
            }
        }
Ejemplo n.º 3
0
        /// <summary>
        /// Feeds the content of one page through a <c>PdfCanvasProcessor</c> bound to the given
        /// strategy and returns the text the strategy reports afterwards.
        /// </summary>
        /// <param name="page">the page whose content is processed</param>
        /// <param name="strategy">the extraction strategy that receives the events and produces the text</param>
        /// <returns>the value of <c>strategy.GetResultantText()</c> after processing</returns>
        public static String GetTextFromPage(PdfPage page, ITextExtractionStrategy strategy)
        {
            new PdfCanvasProcessor(strategy).ProcessPageContent(page);

            String extractedText = strategy.GetResultantText();
            return extractedText;
        }
Ejemplo n.º 4
0
        /// <summary>
        /// Feeds the content of one page through a <c>PdfCanvasProcessor</c> bound to the given
        /// strategy and returns the text the strategy reports afterwards.
        /// </summary>
        /// <remarks>
        /// The supplied map of custom <see cref="IContentOperator"/>s is handed to the processor's
        /// constructor, so callers can influence how (and whether) PDF instructions are handled.
        /// </remarks>
        /// <param name="page">the page whose content is processed</param>
        /// <param name="strategy">the extraction strategy that receives the events and produces the text</param>
        /// <param name="additionalContentOperators">
        /// an optional map of custom <see cref="IContentOperator"/>s for rendering instructions
        /// </param>
        /// <returns>the value of <c>strategy.GetResultantText()</c> after processing</returns>
        public static String GetTextFromPage(PdfPage page, ITextExtractionStrategy strategy,
                                             IDictionary<String, IContentOperator> additionalContentOperators)
        {
            new PdfCanvasProcessor(strategy, additionalContentOperators).ProcessPageContent(page);

            String extractedText = strategy.GetResultantText();
            return extractedText;
        }
Ejemplo n.º 5
0
        /// <summary>
        /// Counts the pages of <c>PdfDocument</c> on which the search term is found.
        /// </summary>
        /// <remarks>
        /// The comparison is done on lower-cased (invariant) text passed through
        /// <c>RemoveDiacritics</c>. Each page contributes at most one to the result, so the value is
        /// a number of pages, not a number of individual hits.
        /// </remarks>
        /// <param name="searchTerm">the text to look for; dereferenced directly, so it must not be null</param>
        /// <returns>the number of pages for which the containment check succeeded</returns>
        public int Ocurrences(string searchTerm)
        {
            var totalPages = PdfDocument.GetNumberOfPages();

            // The normalized needle is the same for every page, so it is computed once
            // (assumes RemoveDiacritics is deterministic - confirm against its definition).
            var normalizedTerm = RemoveDiacritics(searchTerm.ToLowerInvariant());

            var count = 0;
            for (int page = 1; page <= totalPages; page++)
            {
                // NOTE(review): every page is processed into the same textExtractionStrategy
                // instance; if that strategy accumulates text across calls, a hit on an early page
                // would also be reported for all later pages - verify against the strategy in use.
                FilteredTextEventListener listener = new FilteredTextEventListener(textExtractionStrategy);
                var currentPage = PdfDocument.GetPage(page);
                new PdfCanvasProcessor(listener).ProcessPageContent(currentPage);

                var normalizedText = RemoveDiacritics(textExtractionStrategy.GetResultantText().ToLowerInvariant());
                if (normalizedText.Contains(normalizedTerm))
                {
                    count++;
                }
            }

            return count;
        }
Ejemplo n.º 6
0
 /**
  * Forwards the request to the wrapped delegate and hands back its answer unchanged.
  * @see com.itextpdf.text.pdf.parser.TextExtractionStrategy#getResultantText()
  */
 public virtual String GetResultantText()
 {
     String delegatedText = deleg.GetResultantText();

     return delegatedText;
 }
Ejemplo n.º 7
0
 // Forwards the request to the wrapped delegate and hands back its answer unchanged.
 public string GetResultantText()
 {
     string delegatedText = deleg.GetResultantText();

     return delegatedText;
 }
Ejemplo n.º 8
0
        /// <summary>
        /// Command-line entry point: validates the input file and output directory, then splits the
        /// input PDF into separate documents each time the key captured by the split regex changes.
        /// </summary>
        /// <param name="args">raw command-line arguments, parsed into <c>Options</c></param>
        /// <returns>
        /// 0 on success; 1 if the input file does not exist; 2 if its extension is not ".pdf";
        /// 3 if the output directory does not exist; 4 if no split type was selected.
        /// </returns>
        static int Main(string[] args)
        {
            #region var declaration
            bool   verbose          = false;
            bool   datedFileNames   = false;
            bool   showRegexMatches = false;
            string inputFilename    = "";
            bool   inputFileExists;
            string extension;
            bool   isPDF            = false;
            string outputDirname    = "";
            bool   outputDirExsists = false;

            // Bit flags describing which split mode was requested; 0x1 = split on regex key.
            uint splitType = 0x0;

            string keySplitRegex = "";
            #endregion

            #region arg intake
            // intake args and load values in to scope
            var result = Parser.Default.ParseArguments <Options>(args);

            result.WithParsed <Options>(o =>
            {
                inputFilename = o.inputFilename;

                outputDirname = o.outputDirname;

                if (o.verbose)
                {
                    verbose = true;
                }

                if (o.datedFileNames)
                {
                    datedFileNames = true;
                }

                if (o.showRegexMatches)
                {
                    showRegexMatches = true;
                }

                // IsNullOrEmpty already covers the zero-length case.
                if (!String.IsNullOrEmpty(o.keySplitRegex))
                {
                    splitType     = splitType | 0x1;
                    keySplitRegex = o.keySplitRegex;
                }
            });

            #endregion

            #region verify input

            // input file location
            if (verbose)
            {
                Console.WriteLine("Input File:\t" + inputFilename);
            }
            inputFileExists = File.Exists(inputFilename);
            if (verbose)
            {
                Console.WriteLine(inputFileExists ? "File exists:\tTrue" : "File exists:\tFalse");
            }
            if (!inputFileExists)
            {
                if (verbose)
                {
                    Console.WriteLine("Input File does not exsist; Exiting with error code 1.");
                }
                #if DEBUG
                Console.ReadKey();
                #endif
                return(1);
            }

            // input file format
            extension = System.IO.Path.GetExtension(inputFilename).ToLower();
            if (verbose)
            {
                Console.WriteLine("File format:\t" + extension);
            }
            isPDF = string.Equals(extension, ".pdf");
            if (verbose)
            {
                Console.WriteLine(isPDF ? "Correct Format:\tTrue" : "Correct Format:\tFalse");
            }
            if (!isPDF)
            {
                if (verbose)
                {
                    Console.WriteLine("Input File is not a PDF; Exiting with error code 2.");
                }
                #if DEBUG
                Console.ReadKey();
                #endif
                return(2);
            }

            // output directory existence
            if (verbose)
            {
                Console.WriteLine("Output to:\t" + outputDirname);
            }
            outputDirExsists = Directory.Exists(outputDirname);
            if (verbose)
            {
                Console.WriteLine(outputDirExsists ? "Output valid:\tTrue" : "Output valid:\tFalse");
            }
            if (!outputDirExsists)
            {
                if (verbose)
                {
                    Console.WriteLine("Output dir does not exsist; Exiting with error code 3.");
                }
                #if DEBUG
                Console.ReadKey();
                #endif
                return(3);
            }


            #endregion

            // Split
            switch (splitType)
            {
            case 0x1:     // key match
                if (verbose)
                {
                    Console.WriteLine("split type:\tKey");
                }

                if (verbose)
                {
                    Console.WriteLine("Key regex:\t" + keySplitRegex);
                }

                Regex regex = new Regex(keySplitRegex, RegexOptions.Compiled | RegexOptions.Multiline);

                PdfReader reader = new PdfReader(inputFilename);

                try
                {
                    PdfReaderContentParser parser = new PdfReaderContentParser(reader);

                    string regexKeyMatch = "";   // key of the document currently being collected
                    int    docPageStart  = 1;    // first page of the document currently being collected
                    string newDocName    = "";   // file name of that document; empty until a key is seen
                    int    totalPages    = reader.NumberOfPages;

                    for (int page = 1; page <= totalPages; page++)
                    {
                        if (showRegexMatches)
                        {
                            Console.WriteLine("Page: " + page);
                        }

                        ITextExtractionStrategy strategy = parser.ProcessContent
                                                               (page, new SimpleTextExtractionStrategy());

                        int matchCount = 0;

                        // NOTE(review): only the first match on each page is examined. The original
                        // code advanced to the next match but never looped over it; confirm whether
                        // every match per page was meant to be processed.
                        Match match = regex.Match(strategy.GetResultantText());

                        if (showRegexMatches)
                        {
                            Console.WriteLine("Match: " + (++matchCount));
                        }
                        for (int x = 1; x <= 2; x++)
                        {
                            Group group = match.Groups[x];
                            if (showRegexMatches)
                            {
                                Console.WriteLine("Group " + x + " = '" + group + "'");
                            }
                            CaptureCollection cc = group.Captures;
                            for (int y = 0; y < cc.Count; y++)
                            {
                                Capture capture = cc[y];

                                string captureS = capture.ToString();

                                if (!string.Equals(captureS, regexKeyMatch))
                                {
                                    // Write out the previous document, but only when one has actually
                                    // been started and it spans at least one page. Without this guard
                                    // a first key found after page 1 would be written under an empty
                                    // file name, and a key change within one page would request an
                                    // empty page range.
                                    if (newDocName.Length > 0 && docPageStart < page)
                                    {
                                        ExtractPages(inputFilename, System.IO.Path.Combine(outputDirname, newDocName), docPageStart, (page - 1));
                                    }

                                    // reset the count
                                    regexKeyMatch = captureS;
                                    if (datedFileNames)
                                    {
                                        newDocName = DateTime.Now.ToString("yyyyMMdd") + "_" + captureS + ".pdf";
                                    }
                                    else
                                    {
                                        newDocName = captureS + ".pdf";
                                    }

                                    docPageStart = page;

                                    if (verbose)
                                    {
                                        System.Console.WriteLine("New document at page:\t" + docPageStart);
                                    }
                                }

                                if (showRegexMatches)
                                {
                                    System.Console.WriteLine("Capture " + y + " = '" + capture + "', Position=" + capture.Index);
                                }
                            }
                        }
                    }

                    // The loop above only writes a document when the NEXT key shows up, so the
                    // document that runs to the end of the file still has to be written here.
                    // (Assumes ExtractPages takes an inclusive end page, as the in-loop call with
                    // "page - 1" suggests - confirm against its definition.)
                    if (newDocName.Length > 0)
                    {
                        ExtractPages(inputFilename, System.IO.Path.Combine(outputDirname, newDocName), docPageStart, totalPages);
                    }
                }
                finally
                {
                    // Release the input file regardless of how the split ended.
                    reader.Close();
                }

                break;

            default:
                if (verbose)
                {
                    Console.WriteLine("No valid split type selected; Exiting with error code 4.");
                }
                    #if DEBUG
                Console.ReadKey();
                    #endif
                return(4);
            }


            #if DEBUG
            Console.ReadKey();
            #endif

            return(0);
        }
Ejemplo n.º 9
0
        /// <summary>
        /// Reads the per-page text of a PDF or Word document into <c>doc.DocBodyDic</c>, keyed by
        /// 1-based page number.
        /// </summary>
        /// <remarks>
        /// Paths containing "pdf" are read through <c>PdfReader</c>; otherwise paths containing "doc"
        /// are opened through Word automation, with up to two attempts. Any exception is caught,
        /// reported on the console and turned into a <c>false</c> return value.
        /// </remarks>
        /// <param name="pdfFile">path of the file to read (PDF or Word, despite the name)</param>
        /// <param name="doc">target whose <c>DocBodyDic</c> receives the non-empty page texts</param>
        /// <param name="pages">set to the PDF's page count; left untouched for Word documents</param>
        /// <returns><c>true</c> if the file was read to the end, otherwise <c>false</c></returns>
        public bool ReadPdf(string pdfFile, ref Documents doc, ref int pages)
        {
            bool success = false;

            try
            {
                // NOTE(review): the format is chosen by substring match on the whole path, so a
                // directory name containing "pdf" routes a Word file to the PDF branch - confirm
                // whether an extension check was intended.
                if (pdfFile.ToLower().Contains("pdf"))
                {
                    PdfReader r = new PdfReader(pdfFile);

                    try
                    {
                        pages = r.NumberOfPages;

                        // Neither the parser nor the trim set depends on the page being read.
                        PdfReaderContentParser parser    = new PdfReaderContentParser(r);
                        char[]                 trimChars = { '\r', '\n', '\t', (char)32, (char)160 };

                        for (int i = 1; i <= pages; i++)
                        {
                            ITextExtractionStrategy st = parser.ProcessContent <SimpleTextExtractionStrategy>(i, new SimpleTextExtractionStrategy());
                            string text = st.GetResultantText().Trim(trimChars);

                            if (string.IsNullOrEmpty(text))
                            {
                                // Second attempt through PdfTextExtractor when the simple strategy found nothing.
                                text = PdfTextExtractor.GetTextFromPage(r, i).Trim(trimChars);
                            }

                            if (!string.IsNullOrEmpty(text))
                            {
                                doc.DocBodyDic.Add(i, text);
                            }
                        }
                    }
                    finally
                    {
                        // Close the reader even when extraction of some page throws.
                        r.Close();
                    }

                    success = true;
                }
                else if (pdfFile.ToLower().Contains("doc"))
                {
                    MsWord.Application newApp = null;
                    MsWord.Document    msdoc  = null;

                    try
                    {
                        int retry = 2;
                        while (retry > 0)
                        {
                            try
                            {
                                //newApp = (MsWord.Application)Marshal.GetActiveObject("Word.Application");
                                newApp = newApp == null ? new MsWord.Application() : newApp;
                                System.Threading.Thread.Sleep(1000);
                                //msdoc = newApp.ActiveDocument;
                                msdoc = newApp.Documents.Open(pdfFile);
                                System.Threading.Thread.Sleep(1000);
                                object             nothing = Missing.Value;
                                MsWord.WdStatistic stat    = MsWord.WdStatistic.wdStatisticPages;
                                int num = msdoc.ComputeStatistics(stat, ref nothing);

                                for (int i = 1; i <= num; i++)
                                {
                                    // Pages stored by an earlier, partially successful attempt are kept.
                                    if (doc.DocBodyDic.ContainsKey(i))
                                    {
                                        continue;
                                    }

                                    object objWhat  = MsWord.WdGoToItem.wdGoToPage;
                                    object objWhich = MsWord.WdGoToDirection.wdGoToAbsolute;

                                    object       objPage = (object)i;
                                    MsWord.Range range1  = msdoc.GoTo(ref objWhat, ref objWhich, ref objPage, ref nothing);
                                    MsWord.Range range2  = range1.GoToNext(MsWord.WdGoToItem.wdGoToPage);

                                    object objStart = range1.Start;
                                    object objEnd   = range2.Start;
                                    // When both ranges start at the same position, read to the end of the document.
                                    if (range1.Start == range2.Start)
                                    {
                                        objEnd = msdoc.Characters.Count;
                                    }

                                    Console.ForegroundColor = ConsoleColor.Red;
                                    Console.WriteLine("DEBUG: Path: {0}, {1}-{2}........", pdfFile, objStart, objEnd);
                                    Console.ResetColor();

                                    if ((int)objStart <= (int)objEnd)
                                    {
                                        string innerText = msdoc.Range(ref objStart, ref objEnd).Text;
                                        doc.DocBodyDic.Add(i, innerText);
                                    }
                                }

                                success = true;
                                break;
                            }
                            catch (Exception ex)
                            {
                                Console.ForegroundColor = ConsoleColor.Red;
                                Console.WriteLine("Retry to read word {0}, Exception: {1}..", pdfFile, ex.ToString());
                                Console.ResetColor();
                                System.Threading.Thread.Sleep(1000);
                                retry--;
                            }
                            finally
                            {
                                if (newApp != null)
                                {
                                    newApp.NormalTemplate.Saved = true;

                                    if (msdoc != null)
                                    {
                                        msdoc.Close(false);
                                    }

                                    newApp.Quit();

                                    // Drop the references so that a retry starts a fresh Word
                                    // instance instead of reusing the one that was just quit.
                                    msdoc  = null;
                                    newApp = null;
                                }
                            }
                        }
                    }
                    catch (Exception e)
                    {
                        // Reached when the cleanup in the finally block above throws; report it
                        // instead of dropping it silently.
                        Console.ForegroundColor = ConsoleColor.Red;
                        Console.WriteLine("Failed to read word {0}, Exception: {1}..", pdfFile, e.ToString());
                        Console.ResetColor();
                    }
                }
            }
            catch (Exception ex)
            {
                // Keep the "return false on failure" contract, but leave a trace of what went wrong.
                Console.ForegroundColor = ConsoleColor.Red;
                Console.WriteLine("Failed to read {0}, Exception: {1}..", pdfFile, ex.ToString());
                Console.ResetColor();
            }

            return(success);
        }
 /**
  * Forwards the request, including the line position argument, to the wrapped delegate
  * and hands back its answer unchanged.
  * @see com.itextpdf.text.pdf.parser.TextExtractionStrategy#getResultantText()
  */
 public virtual String GetResultantText(string lineposition)
 {
     String delegatedText = deleg.GetResultantText(lineposition);

     return delegatedText;
 }