Beispiel #1
0
        /// <summary>
        /// Returns the text of only those chunks collected by <paramref name="strategy"/> whose
        /// segment from start location to end location intersects <paramref name="rect"/>.
        /// </summary>
        /// <param name="strategy">Strategy that has already been fed the page content.</param>
        /// <param name="rect">Region whose text is wanted.</param>
        /// <returns>The result of <c>strategy.GetResultantText()</c> computed over the intersecting chunks only.</returns>
        /// <remarks>
        /// The strategy's chunk list is obtained through the reflected <c>locationalResultField</c> and is
        /// mutated for the duration of the call; it is restored to its original content and order before returning,
        /// even when an exception is thrown.
        /// </remarks>
        public static String GetResultantText(this LocationTextExtractionStrategy strategy, Rectangle rect)
        {
            // Chunk list owned by the strategy, reached through reflection.
            IList <TextChunk> locationalResult = (IList <TextChunk>)locationalResultField.GetValue(strategy);

            // Snapshot of every chunk so the list can be restored exactly (same items, same order).
            List <TextChunk> allChunks = new List <TextChunk>(locationalResult);

            locationalResult.Clear();
            try
            {
                // Re-add only the chunks that intersect the requested rectangle.
                foreach (TextChunk chunk in allChunks)
                {
                    ITextChunkLocation location = chunk.GetLocation();
                    Vector start = location.GetStartLocation();
                    Vector end   = location.GetEndLocation();

                    if (rect.IntersectsLine(start.Get(Vector.I1), start.Get(Vector.I2), end.Get(Vector.I1), end.Get(Vector.I2)))
                    {
                        locationalResult.Add(chunk);
                    }
                }

                return(strategy.GetResultantText());
            }
            finally
            {
                // Put the full chunk list back so later calls (other rectangles) see all text again.
                locationalResult.Clear();
                foreach (TextChunk chunk in allChunks)
                {
                    locationalResult.Add(chunk);
                }
            }
        }
        /// <summary>
        /// Builds the text for <paramref name="rectangle"/> from the render events held in <c>textRenderList</c>.
        /// </summary>
        /// <param name="rectangle">Region used to construct the <c>TextRegionEventFilter</c>.</param>
        /// <returns>The resultant text of a fresh <c>LocationTextExtractionStrategy</c> fed with the selected events.</returns>
        private string GetTextFromRectangle(Rectangle rectangle)
        {
            var regionFilter = new TextRegionEventFilter(rectangle);
            var strategy     = new LocationTextExtractionStrategy();

            foreach (IEventData renderEvent in textRenderList)
            {
                // Events reported as inside the rectangle are forwarded as a whole.
                if (regionFilter.IsInsideRectangle(renderEvent, EventType.RENDER_TEXT))
                {
                    strategy.EventOccurred(renderEvent, EventType.RENDER_TEXT);
                    continue;
                }

                // Events the filter does not accept contribute nothing.
                if (!regionFilter.Accept(renderEvent, EventType.RENDER_TEXT))
                {
                    continue;
                }

                // Accepted but not inside: examine the event character by character
                // and forward only the characters reported as inside.
                IList <TextRenderInfo> characters = ((TextRenderInfo)renderEvent).GetCharacterRenderInfos();
                foreach (TextRenderInfo character in characters)
                {
                    if (regionFilter.IsInsideRectangle(character, EventType.RENDER_TEXT))
                    {
                        strategy.EventOccurred(character, EventType.RENDER_TEXT);
                    }
                }
            }

            return(strategy.GetResultantText());
        }
        /// <summary>
        /// Checks that a <c>MultiFilteredRenderListener</c> with several region filters attached to one
        /// strategy yields the same page-1 text as a <c>FilteredTextRenderListener</c> built from the same filters.
        /// </summary>
        virtual public void MultipleFiltersForOneRegionTest()
        {
            PdfReader reader = TestResourceUtils.GetResourceAsPdfReader(TEST_RESOURCES_PATH, "test.pdf");

            Rectangle[] regions =
            {
                new Rectangle(0, 0, 500, 650),
                new Rectangle(0, 0, 400, 400),
                new Rectangle(200, 200, 500, 600),
                new Rectangle(100, 100, 450, 400)
            };

            // One render filter per region, in the same order as the regions.
            RegionTextRenderFilter[] filters = new RegionTextRenderFilter[regions.Length];
            int slot = 0;
            foreach (Rectangle region in regions)
            {
                filters[slot++] = new RegionTextRenderFilter(region);
            }

            MultiFilteredRenderListener multiListener = new MultiFilteredRenderListener();
            LocationTextExtractionStrategy strategy =
                (LocationTextExtractionStrategy)multiListener.AttachRenderListener(new LocationTextExtractionStrategy(), filters);

            PdfReaderContentParser contentParser = new PdfReaderContentParser(reader);
            contentParser.ProcessContent(1, multiListener);
            String actualText = strategy.GetResultantText();

            // Reference result produced by the single filtered listener over the same filters.
            FilteredTextRenderListener referenceListener =
                new FilteredTextRenderListener(new LocationTextExtractionStrategy(), filters);
            String expectedText = PdfTextExtractor.GetTextFromPage(reader, 1, referenceListener);

            Assert.AreEqual(expectedText, actualText);
        }
Beispiel #4
0
        /// <summary>
        /// Exports the text of every page of a PDF to a sibling "&lt;name&gt;_text.txt" file,
        /// each page preceded by a page-number banner.
        /// </summary>
        /// <param name="file">Path of the PDF file to read.</param>
        public static void Test_GetPdfText_04(string file)
        {
            string outputFile = zpath.PathSetFileNameWithExtension(file, Path.GetFileNameWithoutExtension(file) + "_text.txt");

            _tr.WriteLine("export pdf file \"{0}\" to \"{1}\"", file, outputFile);

            // using blocks guarantee the output file is flushed and released even if extraction throws.
            using (FileStream fs = new FileStream(outputFile, FileMode.Create, FileAccess.Write, FileShare.Read))
            using (StreamWriter sw = new StreamWriter(fs, Encoding.Default))
            {
                sw.WriteLine("export pdf text of \"{0}\"", file);
                sw.WriteLine();

                iTextSharp.text.pdf.PdfReader reader = new iTextSharp.text.pdf.PdfReader(file);
                try
                {
                    for (int page = 1; page <= reader.NumberOfPages; page++)
                    {
                        sw.WriteLine("================ page {0} ================", page);

                        // A fresh strategy per page: the strategy keeps every chunk it has been fed, so
                        // reusing one instance would repeat the text of earlier pages under each later banner.
                        LocationTextExtractionStrategy strategy = new LocationTextExtractionStrategy();
                        Test_iTextSharp.PdfTools.ProcessContentPage(reader, page, strategy);

                        sw.Write(strategy.GetResultantText());
                        sw.WriteLine();
                    }
                }
                finally
                {
                    reader.Close();
                }
            }
        }
Beispiel #5
0
        /// <summary>
        /// Returns the text of only those chunks collected by <paramref name="strategy"/> whose
        /// segment from start location to end location intersects <paramref name="rect"/>.
        /// </summary>
        /// <param name="strategy">Strategy that has already been fed the page content.</param>
        /// <param name="rect">Region whose text is wanted.</param>
        /// <returns>The result of <c>strategy.GetResultantText()</c> computed over the intersecting chunks only.</returns>
        /// <remarks>
        /// The chunk list read from the reflected <c>locationalResultField</c> is narrowed while the text is
        /// produced and then restored to its original content and order, even when an exception is thrown.
        /// </remarks>
        public static String GetResultantText(this LocationTextExtractionStrategy strategy, Rectangle rect)
        {
            IList <TextChunk> locationalResult = (IList <TextChunk>)locationalResultField.GetValue(strategy);

            // Keep a copy of everything so the strategy can be put back exactly as it was.
            List <TextChunk> snapshot = new List <TextChunk>(locationalResult);

            locationalResult.Clear();
            try
            {
                foreach (TextChunk chunk in snapshot)
                {
                    ITextChunkLocation location = chunk.GetLocation();
                    Vector             start    = location.GetStartLocation();
                    Vector             end      = location.GetEndLocation();

                    // Only chunks crossing the rectangle take part in this extraction.
                    if (rect.IntersectsLine(start.Get(Vector.I1), start.Get(Vector.I2), end.Get(Vector.I1), end.Get(Vector.I2)))
                    {
                        locationalResult.Add(chunk);
                    }
                }

                return(strategy.GetResultantText());
            }
            finally
            {
                // Restore the complete list in its original order for subsequent calls.
                locationalResult.Clear();
                foreach (TextChunk chunk in snapshot)
                {
                    locationalResult.Add(chunk);
                }
            }
        }
Beispiel #6
0
        /// <summary>
        /// Extracts the text of the first page of <c>SRC</c> that passes a <c>CustomFontFilter</c>
        /// built for a fixed rectangle, prints it to the console and writes it to <paramref name="dest"/>.
        /// </summary>
        /// <param name="dest">Path of the text file to write.</param>
        public virtual void ManipulatePdf(String dest)
        {
            PdfDocument pdfDoc = new PdfDocument(new PdfReader(SRC));
            String      actualText;

            try
            {
                Rectangle             rect       = new Rectangle(36, 750, 523, 56);
                CustomFontFilter      fontFilter = new CustomFontFilter(rect);
                FilteredEventListener listener   = new FilteredEventListener();

                // Create a text extraction renderer that only receives events accepted by the filter
                LocationTextExtractionStrategy extractionStrategy = listener
                                                                    .AttachEventListener(new LocationTextExtractionStrategy(), fontFilter);

                // Note: If you want to re-use the PdfCanvasProcessor, you must call PdfCanvasProcessor.reset()
                new PdfCanvasProcessor(listener).ProcessPageContent(pdfDoc.GetFirstPage());

                // Get the resultant text after applying the custom filter
                actualText = extractionStrategy.GetResultantText();
            }
            finally
            {
                // Close the document even when extraction fails so the source file is released.
                pdfDoc.Close();
            }

            // See the resultant text in the console
            Console.Out.WriteLine(actualText);

            using (StreamWriter writer = new StreamWriter(dest))
            {
                writer.Write(actualText);
            }
        }
        /// <summary>
        /// Click handler: lets the user pick a file, shows its path in the label and runs
        /// text extraction over every page of the selected PDF.
        /// </summary>
        /// <param name="sender">Event source.</param>
        /// <param name="e">Event data (unused).</param>
        private void btnSelecionarArquivo_Click(object sender, EventArgs e)
        {
            // Do nothing when the dialog is dismissed; otherwise an empty or stale path reaches PdfReader.
            if (openFileDialog.ShowDialog() != DialogResult.OK)
            {
                return;
            }

            var caminhoArquivo = openFileDialog.FileName;

            lblArquivoSelecionado.Text = caminhoArquivo;

            _pdfDocument = new PdfDocument(new PdfReader(caminhoArquivo));
            try
            {
                LocationTextExtractionStrategy strategy = new LocationTextExtractionStrategy();

                var numberOfPages = _pdfDocument.GetNumberOfPages();

                for (var i = 1; i <= numberOfPages; i++)
                {
                    PdfCanvasProcessor parser = new PdfCanvasProcessor(strategy);
                    parser.ProcessPageContent(_pdfDocument.GetPage(i));

                    // NOTE(review): the extracted text is not consumed anywhere in this handler - confirm intent.
                    // The former Encoding.Default byte round trip was dropped: it could only lose characters.
                    var str = strategy.GetResultantText();
                }
            }
            finally
            {
                // Release the file even if a page fails to parse.
                _pdfDocument.Close();
            }
        }
Beispiel #8
0
        /// <summary>
        /// For every PDF in <c>c:\temp\ime</c>, extracts the text of a fixed region on page 2,
        /// prints it to the console and writes it to a ".txt" file next to the PDF.
        /// </summary>
        public static void ExtractPhysicalAddress()
        {
            var di = new DirectoryInfo(@"c:\temp\ime");

            foreach (var file in di.GetFiles("*.pdf"))
            {
                String      actualText;
                PdfDocument pdfDoc = new PdfDocument(new PdfReader(file.FullName));

                try
                {
                    Rectangle             rect         = new Rectangle(300, 470, 70, 150);
                    TextRegionEventFilter regionFilter = new TextRegionEventFilter(rect);

                    FilteredEventListener listener = new FilteredEventListener();

                    LocationTextExtractionStrategy extractionStrategy = listener
                                                                        .AttachEventListener(new LocationTextExtractionStrategy(), regionFilter);

                    new PdfCanvasProcessor(listener).ProcessPageContent(pdfDoc.GetPage(2));

                    actualText = extractionStrategy.GetResultantText();
                }
                finally
                {
                    // Always release the PDF, including when page 2 is missing or unreadable.
                    pdfDoc.Close();
                }

                Console.WriteLine(file.Name);
                Console.WriteLine(actualText);

                // Swap only the extension. A plain Replace(".pdf", ".txt") is case-sensitive, so "X.PDF" would
                // keep its name and the writer would overwrite the source PDF; it would also rewrite ".pdf"
                // occurring elsewhere in the path. Fully qualified to avoid clashing with other Path types.
                string textPath = System.IO.Path.ChangeExtension(file.FullName, ".txt");

                using (StreamWriter writer = new StreamWriter(textPath))
                {
                    writer.Write(actualText);
                }
            }
        }
Beispiel #9
0
 /// <summary>
 /// Extracts the text of the first page of the PDF at <paramref name="fileName"/>.
 /// </summary>
 /// <param name="fileName">Path of the PDF to open.</param>
 /// <returns>The resultant text of a <c>LocationTextExtractionStrategy</c> run over the first page.</returns>
 public static string GetResultantText(string fileName)
 {
     using (var document = new PdfDocument(new PdfReader(fileName)))
     {
         var listener = new LocationTextExtractionStrategy();

         // Feed the first page's content to the listener, then read back what it collected.
         new PdfCanvasProcessor(listener).ProcessPageContent(document.GetFirstPage());

         return(listener.GetResultantText());
     }
 }
Beispiel #10
0
        /// <summary>
        /// Extracts the text of <paramref name="page"/> once and returns, for each rectangle,
        /// the text obtained from the rectangle-taking <c>GetResultantText</c> overload.
        /// </summary>
        /// <param name="page">Page to read.</param>
        /// <param name="rects">Regions to query; the result has one entry per region, in the same order.</param>
        /// <returns>Array of texts, index-aligned with <paramref name="rects"/>.</returns>
        public static string[] ExtractText(this PdfPage page, params Rectangle[] rects)
        {
            var strategy = new LocationTextExtractionStrategy();

            // Run the page through the strategy; the returned full-page text is not needed here.
            PdfTextExtractor.GetTextFromPage(page, strategy);

            var texts = new string[rects.Length];
            int slot  = 0;
            foreach (Rectangle rect in rects)
            {
                texts[slot++] = strategy.GetResultantText(rect);
            }

            return(texts);
        }
        /// <summary>
        /// Extracts the first page of "noninvertibleMatrix.pdf" and compares the text with the expected lines.
        /// </summary>
        public virtual void TestNoninvertibleMatrix()
        {
            const String expectedText =
                "Hello World!\nHello World!\nHello World!\nHello World! Hello World! Hello World!";

            PdfDocument document = new PdfDocument(new PdfReader(sourceFolder + "noninvertibleMatrix.pdf"));
            LocationTextExtractionStrategy listener = new LocationTextExtractionStrategy();

            new PdfCanvasProcessor(listener).ProcessPageContent(document.GetFirstPage());
            String actualText = listener.GetResultantText();

            // Close before asserting, as a failed assertion would otherwise skip the close.
            document.Close();

            NUnit.Framework.Assert.AreEqual(expectedText, actualText);
        }
Beispiel #12
0
        /// <summary>
        /// Returns the text of the first page of the PDF at <paramref name="path"/>.
        /// </summary>
        /// <param name="path">Path of the PDF to open.</param>
        /// <returns>The resultant text collected from the first page.</returns>
        public static string GetPDFFromFile(string path)
        {
            LocationTextExtractionStrategy strategy = new LocationTextExtractionStrategy();

            PdfDocument pdfDoc = new PdfDocument(new PdfReader(path));
            try
            {
                PdfCanvasProcessor parser = new PdfCanvasProcessor(strategy);

                // Known limitation: read more than one page. Sample documents are all one page long.
                parser.ProcessPageContent(pdfDoc.GetFirstPage());
            }
            finally
            {
                // Release the file even when the page content cannot be processed.
                pdfDoc.Close();
            }

            // The strategy already holds the collected chunks, so the text can be built after closing.
            return(strategy.GetResultantText());
        }
Beispiel #13
0
        /// <summary>
        /// Reads the PDF at <paramref name="filename"/> and returns the text of each page.
        /// </summary>
        /// <param name="filename">Path of the PDF to open.</param>
        /// <returns>One string per page, in page order.</returns>
        private ICollection <string> GetPDF(string filename)
        {
            var texts = new List <string>();

            using (var document = new PdfDocument(new PdfReader(filename)))
            {
                int pageCount  = document.GetNumberOfPages();
                int pageNumber = 1;   // pages are addressed starting at 1

                while (pageNumber <= pageCount)
                {
                    // Each page gets its own strategy so its text is collected separately.
                    var listener = new LocationTextExtractionStrategy();
                    new PdfCanvasProcessor(listener).ProcessPageContent(document.GetPage(pageNumber));
                    texts.Add(listener.GetResultantText());

                    pageNumber++;
                }
            }

            return(texts);
        }
Beispiel #14
0
        /// <summary>
        /// Writes the text of every page of a PDF, in page order and UTF-8 encoded, to a single file.
        /// </summary>
        /// <param name="absoluteFilePath">Path of the PDF to read.</param>
        /// <param name="outputPath">Path of the text file to create or overwrite.</param>
        public void ToTxt(string absoluteFilePath, string outputPath)
        {
            using (var pdfDocument = new PdfDocument(new PdfReader(absoluteFilePath)))
            // Open the output once and truncate it. Reopening with File.OpenWrite per page starts at
            // offset 0 without truncating, so each page would overwrite the previous one and bytes of
            // longer earlier content would be left behind.
            using (var fos = System.IO.File.Create(outputPath))
            {
                var pageCount = pdfDocument.GetNumberOfPages();

                for (var pageIndex = 1; pageIndex <= pageCount; pageIndex++)
                {
                    // Fresh strategy per page so each page's text is written exactly once.
                    var strategy = new LocationTextExtractionStrategy();
                    var parser   = new PdfCanvasProcessor(strategy);
                    parser.ProcessPageContent(pdfDocument.GetPage(pageIndex));

                    var array = Encoding.UTF8.GetBytes(strategy.GetResultantText());
                    fos.Write(array, 0, array.Length);
                }

                fos.Flush();
            }
        }
Beispiel #15
0
        /// <summary>
        /// Extracts the text of the first page of <c>SRC</c> and writes it, UTF-8 encoded, to <paramref name="dest"/>.
        /// </summary>
        /// <param name="dest">Path of the file to create or overwrite.</param>
        protected virtual void ManipulatePdf(String dest)
        {
            PdfDocument pdfDoc = new PdfDocument(new PdfReader(SRC));

            try
            {
                // Create a text extraction renderer
                LocationTextExtractionStrategy strategy = new LocationTextExtractionStrategy();

                // Note: if you want to re-use the PdfCanvasProcessor, you must call PdfCanvasProcessor.Reset()
                PdfCanvasProcessor parser = new PdfCanvasProcessor(strategy);

                parser.ProcessPageContent(pdfDoc.GetFirstPage());

                byte[] array = Encoding.UTF8.GetBytes(strategy.GetResultantText());
                using (FileStream stream = new FileStream(dest, FileMode.Create))
                {
                    stream.Write(array, 0, array.Length);
                }
            }
            finally
            {
                // Close the source document whether or not extraction or writing succeeded.
                pdfDoc.Close();
            }
        }
Beispiel #16
0
        /// <summary>
        /// Extracts the text found in each of the given rectangles on a page.
        /// </summary>
        /// <param name="page">Page to extract from.</param>
        /// <param name="rects">Rectangles (fields) whose text is wanted.</param>
        /// <returns>One string per rectangle, in the order the rectangles were passed.</returns>
        public static string[] ExtractText(this PdfPage page, params Rectangle[] rects)
        {
            // Collect all text chunks of the page into a single location-aware listener.
            LocationTextExtractionStrategy pageListener = new LocationTextExtractionStrategy();
            PdfTextExtractor.GetTextFromPage(page, pageListener);

            // Query the collected chunks once per rectangle through the rectangle-taking overload.
            int      regionCount = rects.Length;
            string[] regionTexts = new string[regionCount];

            for (int region = 0; region < regionCount; region++)
            {
                Rectangle area = rects[region];
                regionTexts[region] = pageListener.GetResultantText(area);
            }

            return(regionTexts);
        }
Beispiel #17
0
        /// <summary>
        /// Returns the text of the first page of the PDF at <paramref name="filePath"/>,
        /// collected through a <c>FilteredEventListener</c> with no filters attached.
        /// </summary>
        /// <param name="filePath">Path of the PDF to open.</param>
        /// <returns>The resultant text of the first page.</returns>
        public static string ManipulatePdf(string filePath)
        {
            PdfDocument pdfDoc = new PdfDocument(new PdfReader(filePath));

            try
            {
                FilteredEventListener listener = new FilteredEventListener();

                // Create a text extraction renderer
                LocationTextExtractionStrategy extractionStrategy = listener
                                                                    .AttachEventListener(new LocationTextExtractionStrategy());

                // Note: If you want to re-use the PdfCanvasProcessor, you must call PdfCanvasProcessor.reset()
                new PdfCanvasProcessor(listener).ProcessPageContent(pdfDoc.GetFirstPage());

                // Get the resultant text collected by the strategy
                return(extractionStrategy.GetResultantText());
            }
            finally
            {
                // Runs on both the normal return and on failure, so the file is always released.
                pdfDoc.Close();
            }
        }
Beispiel #18
0
        /// <summary>
        /// Checks that a <c>FilteredEventListener</c> with several region filters attached to one strategy
        /// yields the same page-1 text as a <c>FilteredTextEventListener</c> built from the same filters.
        /// </summary>
        public virtual void MultipleFiltersForOneRegionTest()
        {
            PdfDocument document = new PdfDocument(new PdfReader(sourceFolder + "test.pdf"));

            Rectangle[] regions =
            {
                new Rectangle(0, 0, 500, 650),
                new Rectangle(0, 0, 400, 400),
                new Rectangle(200, 200, 300, 400),
                new Rectangle(100, 100, 350, 300)
            };

            // One region filter per rectangle, in the same order.
            TextRegionEventFilter[] filters = new TextRegionEventFilter[regions.Length];
            int slot = 0;
            foreach (Rectangle region in regions)
            {
                filters[slot++] = new TextRegionEventFilter(region);
            }

            FilteredEventListener          multiListener = new FilteredEventListener();
            LocationTextExtractionStrategy strategy      =
                multiListener.AttachEventListener(new LocationTextExtractionStrategy(), filters);

            PdfCanvasProcessor processor = new PdfCanvasProcessor(multiListener);
            processor.ProcessPageContent(document.GetPage(1));
            String actualText = strategy.GetResultantText();

            // Reference result from the single filtered listener over the same filters.
            FilteredTextEventListener referenceListener =
                new FilteredTextEventListener(new LocationTextExtractionStrategy(), filters);
            String expectedText = PdfTextExtractor.GetTextFromPage(document.GetPage(1), referenceListener);

            NUnit.Framework.Assert.AreEqual(expectedText, actualText);
        }
Beispiel #19
0
 /// <summary>
 /// Exports the text of every page of a PDF to a sibling "&lt;name&gt;_text.txt" file,
 /// each page preceded by a page-number banner.
 /// </summary>
 /// <param name="file">Path of the PDF file to read.</param>
 public static void Test_GetPdfText_04(string file)
 {
     string outputFile = zpath.PathSetFileNameWithExtension(file, Path.GetFileNameWithoutExtension(file) + "_text.txt");
     _tr.WriteLine("export pdf file \"{0}\" to \"{1}\"", file, outputFile);

     // using blocks make sure the output file is flushed and released even if extraction throws.
     using (FileStream fs = new FileStream(outputFile, FileMode.Create, FileAccess.Write, FileShare.Read))
     using (StreamWriter sw = new StreamWriter(fs, Encoding.Default))
     {
         sw.WriteLine("export pdf text of \"{0}\"", file);
         sw.WriteLine();

         iTextSharp.text.pdf.PdfReader reader = new iTextSharp.text.pdf.PdfReader(file);
         try
         {
             for (int page = 1; page <= reader.NumberOfPages; page++)
             {
                 sw.WriteLine("================ page {0} ================", page);

                 // New strategy for each page: a shared instance keeps the chunks of earlier pages,
                 // which would then be written again under every later page banner.
                 LocationTextExtractionStrategy strategy = new LocationTextExtractionStrategy();
                 Test_iTextSharp.PdfTools.ProcessContentPage(reader, page, strategy);

                 sw.Write(strategy.GetResultantText());
                 sw.WriteLine();
             }
         }
         finally
         {
             reader.Close();
         }
     }
 }
Beispiel #20
0
        /// <summary>
        /// Imports menus from an uploaded PDF: validates the request, extracts the text of all pages,
        /// parses it into menus and stores each menu through the repository.
        /// </summary>
        /// <param name="formData">Posted form carrying the API key and the PDF upload.</param>
        /// <param name="cancellationToken">Token forwarded to the repository calls.</param>
        /// <returns>
        /// 400 with an error for missing form data, a bad API key, a missing PDF or a PDF that cannot be read;
        /// 500 when parsing or storing fails; otherwise 200 with the extracted text and the imported menu count.
        /// </returns>
        public async Task <IActionResult> PostAsync([FromForm] FormData formData, CancellationToken cancellationToken)
        {
            var importResult = new ImportResult();

            if (formData == null)
            {
                importResult.Error = "Form data is missing.";

                return(BadRequest(importResult));
            }

            if (formData.ApiKey?.Equals(configuration["Import:ApiKey"]) != true)
            {
                importResult.Error = "Api Key is missing or invalid.";

                return(BadRequest(importResult));
            }

            // Reject a missing upload explicitly instead of relying on a NullReferenceException below.
            if (formData.Pdf == null)
            {
                importResult.Error = "Pdf file is missing.";

                return(BadRequest(importResult));
            }

            try
            {
                using (Stream stream = formData.Pdf.OpenReadStream())
                {
                    var document = new PdfDocument(new PdfReader(stream));
                    try
                    {
                        var text      = new StringBuilder();
                        var pageCount = document.GetNumberOfPages();

                        for (var i = 1; i <= pageCount; i++)
                        {
                            // Fresh strategy per page so each page's text is appended exactly once.
                            var strategy = new LocationTextExtractionStrategy();
                            var parser   = new PdfCanvasProcessor(strategy);
                            parser.ProcessPageContent(document.GetPage(i));
                            text.Append(strategy.GetResultantText());
                        }

                        importResult.Pdf2Text = text.ToString();
                    }
                    finally
                    {
                        // Close the document on every path so it is not left open per request.
                        document.Close();
                    }
                }
            }
            catch (Exception e)
            {
                // Constant template: an exception message containing braces must not be parsed as a log template.
                logger.LogError(e, "Failed to extract text from the uploaded PDF: {Message}", e.Message);
                importResult.Error = e.Message;

                return(BadRequest(importResult));
            }

            try
            {
                IEnumerable <Menu> menus = menuParser.ParseText(importResult.Pdf2Text).ToArray();

                foreach (Menu menu in menus)
                {
                    await menuRepository.ReplaceOrInsertAsync(menu, cancellationToken);
                }

                importResult.ImportedMenus = menus.Count();

                return(Ok(importResult));
            }
            catch (Exception e)
            {
                logger.LogError(e, "Failed to parse or store the imported menus: {Message}", e.Message);
                importResult.Error = e.Message;

                return(StatusCode(500, importResult));
            }
        }