/// <summary>
/// Overload of GetResultantText that limits the returned text to the chunks
/// whose start-to-end line intersects <paramref name="rect"/>.
/// </summary>
/// <param name="strategy">Strategy that has already collected text chunks.</param>
/// <param name="rect">Region used to select chunks.</param>
/// <returns>The strategy's resultant text computed from the intersecting chunks only.</returns>
public static String GetResultantText(this LocationTextExtractionStrategy strategy, Rectangle rect)
{
    // The strategy's chunk list is read through locationalResultField
    // (declared elsewhere; presumably a reflection handle to the strategy's internal list).
    IList<TextChunk> allChunks = (IList<TextChunk>)locationalResultField.GetValue(strategy);

    // Chunks that do not intersect the rectangle are set aside here.
    List<TextChunk> outsideChunks = new List<TextChunk>();

    foreach (TextChunk candidate in allChunks)
    {
        ITextChunkLocation position = candidate.GetLocation();
        Vector lineStart = position.GetStartLocation();
        Vector lineEnd = position.GetEndLocation();

        bool touchesRect = rect.IntersectsLine(
            lineStart.Get(Vector.I1), lineStart.Get(Vector.I2),
            lineEnd.Get(Vector.I1), lineEnd.Get(Vector.I2));

        if (!touchesRect)
        {
            outsideChunks.Add(candidate);
        }
    }

    // Temporarily take the non-intersecting chunks out of the strategy's list.
    foreach (TextChunk outsider in outsideChunks)
    {
        allChunks.Remove(outsider);
    }

    try
    {
        return strategy.GetResultantText();
    }
    finally
    {
        // Always runs (normal return or exception): hand the removed chunks back
        // to the strategy so later calls see the full set again.
        foreach (TextChunk outsider in outsideChunks)
        {
            allChunks.Add(outsider);
        }
    }
}
/// <summary>
/// Builds the text for <paramref name="rectangle"/> from the render events stored in textRenderList.
/// </summary>
/// <param name="rectangle">Region to extract text from.</param>
/// <returns>The resultant text of a fresh LocationTextExtractionStrategy fed with the selected events.</returns>
private string GetTextFromRectangle(Rectangle rectangle)
{
    TextRegionEventFilter regionFilter = new TextRegionEventFilter(rectangle);
    LocationTextExtractionStrategy collector = new LocationTextExtractionStrategy();

    foreach (IEventData renderEvent in textRenderList)
    {
        // Event passes the IsInsideRectangle check as a whole: forward it unchanged.
        if (regionFilter.IsInsideRectangle(renderEvent, EventType.RENDER_TEXT))
        {
            collector.EventOccurred(renderEvent, EventType.RENDER_TEXT);
            continue;
        }

        // Event rejected by the filter: nothing to take from it.
        if (!regionFilter.Accept(renderEvent, EventType.RENDER_TEXT))
        {
            continue;
        }

        // Accepted but not inside as a whole (presumably a partial overlap):
        // test each character's render info individually.
        TextRenderInfo wholeRun = (TextRenderInfo)renderEvent;
        foreach (TextRenderInfo glyphInfo in wholeRun.GetCharacterRenderInfos())
        {
            if (regionFilter.IsInsideRectangle(glyphInfo, EventType.RENDER_TEXT))
            {
                collector.EventOccurred(glyphInfo, EventType.RENDER_TEXT);
            }
        }
    }

    return collector.GetResultantText();
}
/// <summary>
/// Checks that a LocationTextExtractionStrategy attached to a MultiFilteredRenderListener
/// with several region filters produces the same text for page 1 of "test.pdf" as
/// PdfTextExtractor does with a FilteredTextRenderListener built from the same filters.
/// </summary>
public virtual void MultipleFiltersForOneRegionTest()
{
    PdfReader pdfReader = TestResourceUtils.GetResourceAsPdfReader(TEST_RESOURCES_PATH, "test.pdf");
    try
    {
        Rectangle[] regions = new Rectangle[]
        {
            new Rectangle(0, 0, 500, 650),
            new Rectangle(0, 0, 400, 400),
            new Rectangle(200, 200, 500, 600),
            new Rectangle(100, 100, 450, 400)
        };

        // One render filter per region.
        RegionTextRenderFilter[] regionFilters = new RegionTextRenderFilter[regions.Length];
        for (int i = 0; i < regions.Length; i++)
        {
            regionFilters[i] = new RegionTextRenderFilter(regions[i]);
        }

        MultiFilteredRenderListener listener = new MultiFilteredRenderListener();
        LocationTextExtractionStrategy extractionStrategy = (LocationTextExtractionStrategy)
            listener.AttachRenderListener(new LocationTextExtractionStrategy(), regionFilters);
        new PdfReaderContentParser(pdfReader).ProcessContent(1, listener);
        String actualText = extractionStrategy.GetResultantText();

        String expectedText = PdfTextExtractor.GetTextFromPage(
            pdfReader, 1, new FilteredTextRenderListener(new LocationTextExtractionStrategy(), regionFilters));

        Assert.AreEqual(expectedText, actualText);
    }
    finally
    {
        // The reader was previously never closed; release it even when the assertion fails.
        pdfReader.Close();
    }
}
/// <summary>
/// Exports the text of every page of a PDF to a "&lt;name&gt;_text.txt" file whose path is
/// derived from <paramref name="file"/> via zpath.PathSetFileNameWithExtension.
/// </summary>
/// <param name="file">Path of the PDF file to read.</param>
public static void Test_GetPdfText_04(string file)
{
    string outputFile = zpath.PathSetFileNameWithExtension(file, Path.GetFileNameWithoutExtension(file) + "_text.txt");
    _tr.WriteLine("export pdf file \"{0}\" to \"{1}\"", file, outputFile);

    // using/finally guarantee the output file and the reader are released even if extraction throws.
    using (FileStream fs = new FileStream(outputFile, FileMode.Create, FileAccess.Write, FileShare.Read))
    using (StreamWriter sw = new StreamWriter(fs, Encoding.Default))
    {
        sw.WriteLine("export pdf text of \"{0}\"", file);
        sw.WriteLine();

        iTextSharp.text.pdf.PdfReader reader = new iTextSharp.text.pdf.PdfReader(file);
        try
        {
            for (int page = 1; page <= reader.NumberOfPages; page++)
            {
                sw.WriteLine("================ page {0} ================", page);

                // A fresh strategy per page: a shared instance keeps the chunks of the
                // pages processed before, so each page section would repeat earlier text.
                LocationTextExtractionStrategy strategy = new LocationTextExtractionStrategy();
                Test_iTextSharp.PdfTools.ProcessContentPage(reader, page, strategy);

                sw.Write(strategy.GetResultantText());
                sw.WriteLine();
            }
        }
        finally
        {
            reader.Close();
        }
    }
}
/// <summary>
/// Returns the resultant text of <paramref name="strategy"/> computed only from the
/// chunks whose start/end line intersects <paramref name="rect"/>.
/// </summary>
/// <param name="strategy">Strategy holding the collected text chunks.</param>
/// <param name="rect">Rectangle the chunks are tested against.</param>
/// <returns>Text built from the intersecting chunks.</returns>
public static String GetResultantText(this LocationTextExtractionStrategy strategy, Rectangle rect)
{
    // Chunk list obtained through locationalResultField (declared outside this method).
    IList<TextChunk> chunkList = (IList<TextChunk>)locationalResultField.GetValue(strategy);
    List<TextChunk> rejected = new List<TextChunk>();

    // First pass: find the chunks that miss the rectangle.
    foreach (TextChunk item in chunkList)
    {
        ITextChunkLocation itemLocation = item.GetLocation();
        Vector first = itemLocation.GetStartLocation();
        Vector last = itemLocation.GetEndLocation();

        if (rect.IntersectsLine(first.Get(Vector.I1), first.Get(Vector.I2), last.Get(Vector.I1), last.Get(Vector.I2)))
        {
            continue;
        }

        rejected.Add(item);
    }

    // Second pass: hide the rejected chunks from the strategy while the text is built.
    for (int k = 0; k < rejected.Count; k++)
    {
        chunkList.Remove(rejected[k]);
    }

    try
    {
        String filteredText = strategy.GetResultantText();
        return filteredText;
    }
    finally
    {
        // Put the hidden chunks back regardless of how the try block ended.
        for (int k = 0; k < rejected.Count; k++)
        {
            chunkList.Add(rejected[k]);
        }
    }
}
/// <summary>
/// Extracts text from the first page of SRC through a CustomFontFilter built on a fixed
/// rectangle, prints it to the console and writes it to <paramref name="dest"/>.
/// </summary>
/// <param name="dest">Path of the text file to write.</param>
public virtual void ManipulatePdf(String dest)
{
    PdfDocument pdfDoc = new PdfDocument(new PdfReader(SRC));
    String actualText;
    try
    {
        Rectangle rect = new Rectangle(36, 750, 523, 56);
        CustomFontFilter fontFilter = new CustomFontFilter(rect);
        FilteredEventListener listener = new FilteredEventListener();

        // Create a text extraction renderer
        LocationTextExtractionStrategy extractionStrategy = listener
            .AttachEventListener(new LocationTextExtractionStrategy(), fontFilter);

        // Note: If you want to re-use the PdfCanvasProcessor, you must call PdfCanvasProcessor.reset()
        new PdfCanvasProcessor(listener).ProcessPageContent(pdfDoc.GetFirstPage());

        // Get the resultant text after applying the custom filter
        actualText = extractionStrategy.GetResultantText();
    }
    finally
    {
        // Close the document even when extraction throws, so the source file is released.
        pdfDoc.Close();
    }

    // See the resultant text in the console
    Console.Out.WriteLine(actualText);

    using (StreamWriter writer = new StreamWriter(dest))
    {
        writer.Write(actualText);
    }
}
/// <summary>
/// Click handler: shows the file dialog, displays the chosen path in the label and
/// runs text extraction over every page of the chosen PDF.
/// </summary>
/// <param name="sender">Event source (unused).</param>
/// <param name="e">Event arguments (unused).</param>
private void btnSelecionarArquivo_Click(object sender, EventArgs e)
{
    openFileDialog.ShowDialog();
    var caminhoArquivo = openFileDialog.FileName;
    lblArquivoSelecionado.Text = caminhoArquivo;

    // No path available (e.g. the dialog was dismissed without a selection):
    // there is nothing to open, so do not hand an empty path to PdfReader.
    if (string.IsNullOrEmpty(caminhoArquivo))
    {
        return;
    }

    _pdfDocument = new PdfDocument(new PdfReader(caminhoArquivo));
    try
    {
        LocationTextExtractionStrategy strategy = new LocationTextExtractionStrategy();
        var numberOfPages = _pdfDocument.GetNumberOfPages();
        for (var i = 1; i <= numberOfPages; i++)
        {
            PdfCanvasProcessor parser = new PdfCanvasProcessor(strategy);
            parser.ProcessPageContent(_pdfDocument.GetPage(i));

            // NOTE(review): the extracted text is round-tripped through Encoding.Default
            // and then discarded; confirm what this handler is meant to do with it.
            byte[] array = Encoding.Default.GetBytes(strategy.GetResultantText());
            var str = Encoding.Default.GetString(array);
        }
    }
    finally
    {
        // Release the file even if a page fails to parse.
        _pdfDocument.Close();
    }
}
/// <summary>
/// For every "*.pdf" file in c:\temp\ime, extracts the text inside a fixed rectangle on
/// page 2, prints the file name and text to the console, and writes the text to a file
/// with the same path and a ".txt" extension.
/// </summary>
public static void ExtractPhysicalAddress()
{
    var di = new DirectoryInfo(@"c:\temp\ime");
    foreach (var file in di.GetFiles("*.pdf"))
    {
        String actualText;
        PdfDocument pdfDoc = new PdfDocument(new PdfReader(file.FullName));
        try
        {
            Rectangle rect = new Rectangle(300, 470, 70, 150);
            TextRegionEventFilter regionFilter = new TextRegionEventFilter(rect);
            FilteredEventListener listener = new FilteredEventListener();
            LocationTextExtractionStrategy extractionStrategy = listener
                .AttachEventListener(new LocationTextExtractionStrategy(), regionFilter);

            // Only page 2 is read.
            new PdfCanvasProcessor(listener).ProcessPageContent(pdfDoc.GetPage(2));
            actualText = extractionStrategy.GetResultantText();
        }
        finally
        {
            // Close the document even when extraction throws.
            pdfDoc.Close();
        }

        Console.WriteLine(file.Name);
        Console.WriteLine(actualText);

        // Swap only the extension. A plain Replace(".pdf", ".txt") is case-sensitive and
        // touches every occurrence in the path, so a file named "X.PDF" would keep its
        // name and be overwritten by its own text output.
        // Fully qualified to avoid clashing with any other type named Path.
        string textPath = System.IO.Path.ChangeExtension(file.FullName, ".txt");
        using (StreamWriter writer = new StreamWriter(textPath))
        {
            writer.Write(actualText);
        }
    }
}
/// <summary>
/// Opens the PDF at <paramref name="fileName"/> and returns the text of its first page
/// as produced by a LocationTextExtractionStrategy.
/// </summary>
/// <param name="fileName">Path of the PDF file.</param>
/// <returns>Resultant text of the first page.</returns>
public static string GetResultantText(string fileName)
{
    using (PdfDocument document = new PdfDocument(new PdfReader(fileName)))
    {
        LocationTextExtractionStrategy listener = new LocationTextExtractionStrategy();

        // Feed the first page's content stream to the strategy.
        new PdfCanvasProcessor(listener).ProcessPageContent(document.GetFirstPage());

        return listener.GetResultantText();
    }
}
/// <summary>
/// Extracts the text of <paramref name="page"/> once and returns, for each rectangle,
/// the text selected by the rectangle-based GetResultantText overload.
/// </summary>
/// <param name="page">Page to read.</param>
/// <param name="rects">Regions to extract; one result entry is produced per rectangle, in order.</param>
/// <returns>Array with the same length as <paramref name="rects"/>.</returns>
public static string[] ExtractText(this PdfPage page, params Rectangle[] rects)
{
    LocationTextExtractionStrategy pageListener = new LocationTextExtractionStrategy();

    // Run the whole page through the listener; the returned full-page text is not needed here.
    PdfTextExtractor.GetTextFromPage(page, pageListener);

    string[] texts = new string[rects.Length];
    int slot = 0;
    foreach (Rectangle region in rects)
    {
        texts[slot] = pageListener.GetResultantText(region);
        slot++;
    }

    return texts;
}
/// <summary>
/// Extracts the first page of "noninvertibleMatrix.pdf" with a LocationTextExtractionStrategy
/// and compares the result with the expected "Hello World!" lines.
/// </summary>
public virtual void TestNoninvertibleMatrix()
{
    PdfDocument document = new PdfDocument(new PdfReader(sourceFolder + "noninvertibleMatrix.pdf"));
    LocationTextExtractionStrategy listener = new LocationTextExtractionStrategy();

    // Process the first page, then read the text before closing the document.
    new PdfCanvasProcessor(listener).ProcessPageContent(document.GetFirstPage());
    String extracted = listener.GetResultantText();
    document.Close();

    String expected = "Hello World!\nHello World!\nHello World!\nHello World! Hello World! Hello World!";
    NUnit.Framework.Assert.AreEqual(expected, extracted);
}
/// <summary>
/// Returns the text of the first page of the PDF at <paramref name="path"/>.
/// </summary>
/// <param name="path">Path of the PDF file.</param>
/// <returns>Resultant text of the first page.</returns>
public static string GetPDFFromFile(string path)
{
    // using releases the document even when page processing throws.
    using (PdfDocument pdfDoc = new PdfDocument(new PdfReader(path)))
    {
        LocationTextExtractionStrategy strategy = new LocationTextExtractionStrategy();
        PdfCanvasProcessor parser = new PdfCanvasProcessor(strategy);

        // Known limitation: only the first page is read. Sample documents are all one page long.
        parser.ProcessPageContent(pdfDoc.GetFirstPage());

        return strategy.GetResultantText();
    }
}
/// <summary>
/// Reads the PDF at <paramref name="filename"/> and returns the extracted text of each page.
/// </summary>
/// <param name="filename">Path of the PDF file.</param>
/// <returns>One string per page, in page order.</returns>
private ICollection<string> GetPDF(string filename)
{
    List<string> texts = new List<string>();

    using (PdfDocument document = new PdfDocument(new PdfReader(filename)))
    {
        int lastPage = document.GetNumberOfPages();
        int pageNo = 1;
        while (pageNo <= lastPage)
        {
            // Each page gets its own strategy and processor.
            LocationTextExtractionStrategy listener = new LocationTextExtractionStrategy();
            new PdfCanvasProcessor(listener).ProcessPageContent(document.GetPage(pageNo));
            texts.Add(listener.GetResultantText());
            pageNo++;
        }
    }

    return texts;
}
/// <summary>
/// Extracts the text of every page of a PDF and writes it, UTF-8 encoded and in page
/// order, to <paramref name="outputPath"/>.
/// </summary>
/// <param name="absoluteFilePath">Path of the PDF file to read.</param>
/// <param name="outputPath">Path of the text file to create (an existing file is replaced).</param>
public void ToTxt(string absoluteFilePath, string outputPath)
{
    using (var pdfDocument = new PdfDocument(new PdfReader(absoluteFilePath)))
    // Open the output once and truncate it. Re-opening it with OpenWrite for every page
    // starts writing at offset 0 without truncating, so each page overwrote the previous
    // one and stale bytes from longer content could remain at the end of the file.
    using (var fos = System.IO.File.Create(outputPath))
    {
        var pageCount = pdfDocument.GetNumberOfPages();
        for (var pageIndex = 1; pageIndex <= pageCount; pageIndex++)
        {
            var strategy = new LocationTextExtractionStrategy();
            var parser = new PdfCanvasProcessor(strategy);
            parser.ProcessPageContent(pdfDocument.GetPage(pageIndex));

            var array = Encoding.UTF8.GetBytes(strategy.GetResultantText());
            fos.Write(array, 0, array.Length);
        }

        fos.Flush();
    }
}
/// <summary>
/// Extracts the text of the first page of SRC and writes it, UTF-8 encoded, to
/// <paramref name="dest"/>.
/// </summary>
/// <param name="dest">Path of the file to create.</param>
protected virtual void ManipulatePdf(String dest)
{
    PdfDocument pdfDoc = new PdfDocument(new PdfReader(SRC));
    try
    {
        // Create a text extraction renderer
        LocationTextExtractionStrategy strategy = new LocationTextExtractionStrategy();

        // Note: if you want to re-use the PdfCanvasProcessor, you must call PdfCanvasProcessor.Reset()
        PdfCanvasProcessor parser = new PdfCanvasProcessor(strategy);
        parser.ProcessPageContent(pdfDoc.GetFirstPage());

        byte[] array = Encoding.UTF8.GetBytes(strategy.GetResultantText());
        using (FileStream stream = new FileStream(dest, FileMode.Create))
        {
            stream.Write(array, 0, array.Length);
        }
    }
    finally
    {
        // Close the document even when extraction or the file write throws.
        pdfDoc.Close();
    }
}
/// <summary>
/// Extracts text from the given page, restricted to each of the supplied rectangles.
/// </summary>
/// <param name="page">The page to extract text from.</param>
/// <param name="rects">Rectangles (fields) on the page whose text is wanted.</param>
/// <returns>One string per rectangle, in the same order as <paramref name="rects"/>.</returns>
public static string[] ExtractText(this PdfPage page, params Rectangle[] rects)
{
    // Strategy that receives all text of the page.
    LocationTextExtractionStrategy listener = new LocationTextExtractionStrategy();
    PdfTextExtractor.GetTextFromPage(page, listener);

    // One output slot per requested rectangle.
    string[] output = new string[rects.Length];

    int position = 0;
    while (position < output.Length)
    {
        // Rectangle-based GetResultantText overload: text of this rectangle only.
        Rectangle area = rects[position];
        output[position] = listener.GetResultantText(area);
        position++;
    }

    return output;
}
/// <summary>
/// Returns the text of the first page of the PDF at <paramref name="filePath"/>, collected
/// by a LocationTextExtractionStrategy attached to a FilteredEventListener without filters.
/// </summary>
/// <param name="filePath">Path of the PDF file.</param>
/// <returns>Resultant text of the first page.</returns>
public static string ManipulatePdf(string filePath)
{
    PdfDocument pdfDoc = new PdfDocument(new PdfReader(filePath));
    try
    {
        FilteredEventListener listener = new FilteredEventListener();

        // Create a text extraction renderer (no filter is attached).
        LocationTextExtractionStrategy extractionStrategy = listener
            .AttachEventListener(new LocationTextExtractionStrategy());

        // Note: If you want to re-use the PdfCanvasProcessor, you must call PdfCanvasProcessor.reset()
        new PdfCanvasProcessor(listener).ProcessPageContent(pdfDoc.GetFirstPage());

        return extractionStrategy.GetResultantText();
    }
    finally
    {
        // Close the document even when extraction throws.
        pdfDoc.Close();
    }
}
/// <summary>
/// Checks that a LocationTextExtractionStrategy attached to a FilteredEventListener with
/// several region filters produces the same text for page 1 of "test.pdf" as
/// PdfTextExtractor does with a FilteredTextEventListener built from the same filters.
/// </summary>
public virtual void MultipleFiltersForOneRegionTest()
{
    PdfDocument pdfDocument = new PdfDocument(new PdfReader(sourceFolder + "test.pdf"));
    try
    {
        Rectangle[] regions = new Rectangle[]
        {
            new Rectangle(0, 0, 500, 650),
            new Rectangle(0, 0, 400, 400),
            new Rectangle(200, 200, 300, 400),
            new Rectangle(100, 100, 350, 300)
        };

        // One event filter per region.
        TextRegionEventFilter[] regionFilters = new TextRegionEventFilter[regions.Length];
        for (int i = 0; i < regions.Length; i++)
        {
            regionFilters[i] = new TextRegionEventFilter(regions[i]);
        }

        FilteredEventListener listener = new FilteredEventListener();
        LocationTextExtractionStrategy extractionStrategy =
            listener.AttachEventListener(new LocationTextExtractionStrategy(), regionFilters);
        new PdfCanvasProcessor(listener).ProcessPageContent(pdfDocument.GetPage(1));
        String actualText = extractionStrategy.GetResultantText();

        String expectedText = PdfTextExtractor.GetTextFromPage(
            pdfDocument.GetPage(1),
            new FilteredTextEventListener(new LocationTextExtractionStrategy(), regionFilters));

        NUnit.Framework.Assert.AreEqual(expectedText, actualText);
    }
    finally
    {
        // The document was previously never closed; release it even when the assertion fails.
        pdfDocument.Close();
    }
}
/// <summary>
/// Exports the text of every page of a PDF to a "&lt;name&gt;_text.txt" file whose path is
/// derived from <paramref name="file"/> via zpath.PathSetFileNameWithExtension.
/// </summary>
/// <param name="file">Path of the PDF file to read.</param>
public static void Test_GetPdfText_04(string file)
{
    string outputFile = zpath.PathSetFileNameWithExtension(file, Path.GetFileNameWithoutExtension(file) + "_text.txt");
    _tr.WriteLine("export pdf file \"{0}\" to \"{1}\"", file, outputFile);

    // using/finally guarantee the output file and the reader are released even if extraction throws.
    using (FileStream fs = new FileStream(outputFile, FileMode.Create, FileAccess.Write, FileShare.Read))
    using (StreamWriter sw = new StreamWriter(fs, Encoding.Default))
    {
        sw.WriteLine("export pdf text of \"{0}\"", file);
        sw.WriteLine();

        iTextSharp.text.pdf.PdfReader reader = new iTextSharp.text.pdf.PdfReader(file);
        try
        {
            for (int page = 1; page <= reader.NumberOfPages; page++)
            {
                sw.WriteLine("================ page {0} ================", page);

                // A fresh strategy per page: a shared instance keeps the chunks of the
                // pages processed before, so each page section would repeat earlier text.
                LocationTextExtractionStrategy strategy = new LocationTextExtractionStrategy();
                Test_iTextSharp.PdfTools.ProcessContentPage(reader, page, strategy);

                sw.Write(strategy.GetResultantText());
                sw.WriteLine();
            }
        }
        finally
        {
            reader.Close();
        }
    }
}
/// <summary>
/// Imports menus from an uploaded PDF: validates the form and API key, extracts the text
/// of every page, parses it into menus and stores each menu through the repository.
/// </summary>
/// <param name="formData">Posted form carrying the API key and the PDF upload.</param>
/// <param name="cancellationToken">Token forwarded to the repository calls.</param>
/// <returns>
/// 400 with an error when the form, API key or PDF is missing/invalid or text extraction
/// fails; 500 when parsing or storing fails; 200 with the import result otherwise.
/// </returns>
public async Task<IActionResult> PostAsync([FromForm] FormData formData, CancellationToken cancellationToken)
{
    var importResult = new ImportResult();

    if (formData == null)
    {
        importResult.Error = "Form data is missing.";
        return BadRequest(importResult);
    }

    if (formData.ApiKey?.Equals(configuration["Import:ApiKey"]) != true)
    {
        importResult.Error = "Api Key is missing or invalid.";
        return BadRequest(importResult);
    }

    // Report a missing upload explicitly instead of surfacing a NullReferenceException message.
    if (formData.Pdf == null)
    {
        importResult.Error = "Pdf file is missing.";
        return BadRequest(importResult);
    }

    try
    {
        using (Stream stream = formData.Pdf.OpenReadStream())
        // Dispose the document as well; it was previously left open.
        using (var document = new PdfDocument(new PdfReader(stream)))
        {
            var text = new StringBuilder();
            var pageCount = document.GetNumberOfPages();
            for (var i = 1; i <= pageCount; i++)
            {
                // A separate strategy per page so each page contributes its own text once.
                var strategy = new LocationTextExtractionStrategy();
                var parser = new PdfCanvasProcessor(strategy);
                parser.ProcessPageContent(document.GetPage(i));
                text.Append(strategy.GetResultantText());
            }

            importResult.Pdf2Text = text.ToString();
        }
    }
    catch (Exception e)
    {
        logger.LogError(e, e.Message);
        importResult.Error = e.Message;
        return BadRequest(importResult);
    }

    try
    {
        IEnumerable<Menu> menus = menuParser.ParseText(importResult.Pdf2Text).ToArray();
        foreach (Menu menu in menus)
        {
            await menuRepository.ReplaceOrInsertAsync(menu, cancellationToken);
        }

        importResult.ImportedMenus = menus.Count();
        return Ok(importResult);
    }
    catch (Exception e)
    {
        logger.LogError(e, e.Message);
        importResult.Error = e.Message;
        return StatusCode(500, importResult);
    }
}