/// <summary>
/// Click handler for the file-selection button: shows the open-file dialog, displays the
/// chosen path in <c>lblArquivoSelecionado</c> and parses every page of the selected PDF.
/// </summary>
/// <param name="sender">The control that raised the event.</param>
/// <param name="e">Event data (not used).</param>
private void btnSelecionarArquivo_Click(object sender, EventArgs e)
{
    openFileDialog.ShowDialog();
    var caminhoArquivo = openFileDialog.FileName;

    // When the dialog is dismissed without a selection, FileName can be empty or not
    // refer to an existing file; passing that to PdfReader would throw.
    if (string.IsNullOrEmpty(caminhoArquivo) || !System.IO.File.Exists(caminhoArquivo))
    {
        return;
    }

    lblArquivoSelecionado.Text = caminhoArquivo;

    _pdfDocument = new PdfDocument(new PdfReader(caminhoArquivo));
    try
    {
        var numberOfPages = _pdfDocument.GetNumberOfPages();
        for (var i = 1; i <= numberOfPages; i++)
        {
            // One strategy per page so text chunks of earlier pages are not carried over.
            var strategy = new LocationTextExtractionStrategy();
            var parser = new PdfCanvasProcessor(strategy);
            parser.ProcessPageContent(_pdfDocument.GetPage(i));
            // The extracted text is not consumed by this handler yet;
            // strategy.GetResultantText() yields it once a consumer exists.
        }
    }
    finally
    {
        // Release the file even when a page fails to parse.
        _pdfDocument.Close();
    }
}
/// <summary>
/// Reads the PDF at <paramref name="path"/> on a thread-pool task and returns a
/// <c>File</c> whose <c>Content</c> is the text of all pages, in page order.
/// </summary>
/// <param name="path">Path of the PDF document to read.</param>
/// <returns>A task producing the populated <c>File</c> (Mime is "application/pdf").</returns>
public Task<File> Convert(string path)
{
    return Task.Run(() =>
    {
        var file = new File { Path = path, Mime = "application/pdf" };
        using (var document = new PdfDocument(new PdfReader(path)))
        {
            int numOfPages = document.GetNumberOfPages();
            var content = new StringBuilder();
            for (int i = 1; i <= numOfPages; i++)
            {
                // A new listener/strategy per page: a location strategy keeps every chunk
                // it has received, so reusing one instance would re-emit the text of all
                // previous pages each time GetResultantText() is appended.
                var listener = new FilteredEventListener();
                var extractionStrategy = listener
                    .AttachEventListener(new LocationTextExtractionStrategy());
                var processor = new PdfCanvasProcessor(listener);

                processor.ProcessPageContent(document.GetPage(i));
                content.Append(extractionStrategy.GetResultantText());
            }
            file.Content = content.ToString();
        }
        return file;
    });
}
/// <summary>
/// Creates a PDF from a multi-page TIFF with a magenta text layer named "Text1" and
/// checks that the fill color read back from page 1 equals the configured color.
/// </summary>
public virtual void TestFontColorInMultiPagePdf()
{
    String testName = "testFontColorInMultiPagePdf";
    String path = TEST_IMAGES_DIRECTORY + "multîpage.tiff";
    String pdfPath = GetTargetDirectory() + testName + ".pdf";
    FileInfo file = new FileInfo(path);

    tesseractReader.SetTesseract4OcrEngineProperties(
        tesseractReader.GetTesseract4OcrEngineProperties().SetPreprocessingImages(false));

    OcrPdfCreatorProperties ocrPdfCreatorProperties = new OcrPdfCreatorProperties();
    ocrPdfCreatorProperties.SetTextLayerName("Text1");
    Color color = DeviceCmyk.MAGENTA;
    ocrPdfCreatorProperties.SetTextColor(color);

    OcrPdfCreator ocrPdfCreator = new OcrPdfCreator(tesseractReader, ocrPdfCreatorProperties);
    PdfDocument doc = ocrPdfCreator.CreatePdf(JavaCollectionsUtil.SingletonList<FileInfo>(file),
        GetPdfWriter(pdfPath));
    NUnit.Framework.Assert.IsNotNull(doc);
    doc.Close();

    PdfDocument pdfDocument = new PdfDocument(new PdfReader(pdfPath));
    try
    {
        IntegrationTestHelper.ExtractionStrategy strategy = new IntegrationTestHelper.ExtractionStrategy("Text1");
        PdfCanvasProcessor processor = new PdfCanvasProcessor(strategy);
        processor.ProcessPageContent(pdfDocument.GetPage(1));
        Color fillColor = strategy.GetFillColor();
        // NUnit's signature is AreEqual(expected, actual); the configured color is the expectation.
        NUnit.Framework.Assert.AreEqual(color, fillColor);
    }
    finally
    {
        // Close the reader even when the assertion fails so the output file is not left locked.
        pdfDocument.Close();
    }
}
/// <summary>Feeds the content of one page to an extraction strategy and returns the strategy's text.</summary>
/// <param name="page">the page whose content is processed</param>
/// <param name="strategy">the strategy that receives the page events and produces the text</param>
/// <returns>the text reported by <paramref name="strategy"/> after processing</returns>
public static String GetTextFromPage(PdfPage page, ITextExtractionStrategy strategy)
{
    new PdfCanvasProcessor(strategy).ProcessPageContent(page);
    String extracted = strategy.GetResultantText();
    return extracted;
}
/// <summary>
/// Parses a PDF held in memory and returns one <c>Page</c> per PDF page, carrying the
/// rotated page size and the blocks collected by a <c>CustomEventListener</c>.
/// </summary>
/// <param name="contents">The raw bytes of the PDF document.</param>
/// <returns>The pages in document order.</returns>
public Page[] GetBlocks(byte[] contents)
{
    List<Page> lstPages = new List<Page>();

    using (var stm = new System.IO.MemoryStream(contents))
    using (var pdfReader = new iText.Kernel.Pdf.PdfReader(stm))
    using (iText.Kernel.Pdf.PdfDocument doc = new iText.Kernel.Pdf.PdfDocument(pdfReader))
    {
        int numOfPages = doc.GetNumberOfPages();
        for (int page = 1; page <= numOfPages; page++)
        {
            var pdfPage = doc.GetPage(page);
            var pg = new Page();

            // Size as displayed, i.e. with the page rotation applied.
            var rotation = pdfPage.GetPageSizeWithRotation();
            pg.Height = rotation.GetHeight();
            pg.Width = rotation.GetWidth();

            // A new listener per page keeps each page's blocks separate.
            var customListener = new CustomEventListener();
            var parser = new PdfCanvasProcessor(customListener);
            parser.ProcessPageContent(pdfPage);

            pg.Blocks = customListener.Blocks.ToArray();
            lstPages.Add(pg);
        }
    }

    return lstPages.ToArray();
}
/// <summary>
/// Processes a page whose content stream selects a one-element Pattern color space and
/// verifies that the resulting fill color uses <c>PdfSpecialCs.Pattern</c>.
/// </summary>
public virtual void PdfUncoloredPatternColorSize1Test()
{
    PdfDocument pdfDocument = new PdfDocument(new PdfWriter(new ByteArrayOutputStream()));

    // Content stream that only selects the color space named Cs1.
    String contentColorSpace = "/Cs1 cs\n";
    PdfDictionary pageDictionary = (PdfDictionary)new PdfDictionary().MakeIndirect(pdfDocument);
    PdfStream contentStream = new PdfStream(contentColorSpace.GetBytes());
    pageDictionary.Put(PdfName.Contents, contentStream);

    PdfPage page = pdfDocument.AddNewPage();
    page.GetPdfObject().Put(PdfName.Contents, contentStream);

    // Color space array holding just the /Pattern name.
    PdfArray patternArray = new PdfArray();
    patternArray.Add(PdfName.Pattern);
    PdfColorSpace patternSpace = PdfColorSpace.MakeColorSpace(patternArray);
    page.GetResources().AddColorSpace(patternSpace);

    page.SetMediaBox(new Rectangle(50, 50, 1000, 1000));

    PdfCanvasProcessor processor = new PdfCanvasProcessor(new PdfArrayTest.NoOpListener());
    processor.ProcessPageContent(page);

    // Reaching this point means processing did not fail; additionally verify the color space instance.
    bool isPattern = processor.GetGraphicsState().GetFillColor().GetColorSpace() is PdfSpecialCs.Pattern;
    NUnit.Framework.Assert.IsTrue(isPattern);
}
/// <summary>
/// Extracts the single image from page 1 of the source PDF, writes it to the destination
/// folder and compares its bytes with the reference image file.
/// </summary>
public virtual void ExtractByteAlignedG4TiffImageTest()
{
    String inFileName = sourceFolder + "extractByteAlignedG4TiffImage.pdf";
    String outImageFileName = destinationFolder + "extractedByteAlignedImage.png";
    String cmpImageFileName = sourceFolder + "cmp_extractByteAlignedG4TiffImage.png";

    PdfDocument pdfDocument = new PdfDocument(new PdfReader(inFileName));
    try
    {
        GetImageBytesTest.ImageExtractor listener = new GetImageBytesTest.ImageExtractor(this);
        PdfCanvasProcessor processor = new PdfCanvasProcessor(listener);
        processor.ProcessPageContent(pdfDocument.GetPage(1));

        IList<byte[]> images = listener.GetImages();
        NUnit.Framework.Assert.AreEqual(1, images.Count);

        using (FileStream fos = new FileStream(outImageFileName, FileMode.Create))
        {
            // Write the whole image: the count argument is the byte length of the image,
            // not the number of images in the list.
            fos.Write(images[0], 0, images[0].Length);
        }

        // expected and actual are swapped here for simplicity
        int expectedLen = images[0].Length;
        byte[] buf = new byte[expectedLen];
        using (FileStream @is = new FileStream(cmpImageFileName, FileMode.Open, FileAccess.Read))
        {
            int read = @is.JRead(buf, 0, buf.Length);
            NUnit.Framework.Assert.AreEqual(expectedLen, read);
            // A second read must report end of stream, i.e. the reference file is not longer.
            read = @is.JRead(buf, 0, buf.Length);
            NUnit.Framework.Assert.IsTrue(read <= 0);
        }
        NUnit.Framework.Assert.AreEqual(images[0], buf);
    }
    finally
    {
        // Release the input file whether or not the assertions pass.
        pdfDocument.Close();
    }
}
/// <summary>
/// Attaches two region-filtered location strategies to one <c>FilteredEventListener</c>,
/// processes page 1 glyph by glyph and checks the text captured in each region.
/// </summary>
public virtual void TestWithMultiFilteredRenderListener()
{
    PdfDocument pdfDocument = new PdfDocument(new PdfReader(sourceFolder + "test.pdf"));
    try
    {
        float x1;
        float y1;
        float x2;
        float y2;
        FilteredEventListener listener = new FilteredEventListener();

        x1 = 122;
        x2 = 22;
        y1 = 678.9f;
        y2 = 12;
        ITextExtractionStrategy region1Listener = listener.AttachEventListener(
            new LocationTextExtractionStrategy(),
            new TextRegionEventFilter(new Rectangle(x1, y1, x2, y2)));

        x1 = 156;
        x2 = 13;
        y1 = 678.9f;
        y2 = 12;
        ITextExtractionStrategy region2Listener = listener.AttachEventListener(
            new LocationTextExtractionStrategy(),
            new TextRegionEventFilter(new Rectangle(x1, y1, x2, y2)));

        PdfCanvasProcessor parser = new PdfCanvasProcessor(new GlyphEventListener(listener));
        parser.ProcessPageContent(pdfDocument.GetPage(1));

        NUnit.Framework.Assert.AreEqual("Your", region1Listener.GetResultantText());
        NUnit.Framework.Assert.AreEqual("dju", region2Listener.GetResultantText());
    }
    finally
    {
        // The reader holds the source file open; release it even on assertion failure.
        pdfDocument.Close();
    }
}
/// <summary>
/// Runs a listener of type <typeparamref name="T"/> over the current PDF page and wraps
/// its results in a new <c>PipelinePage</c>, which is also stored in <c>_page</c>.
/// </summary>
/// <typeparam name="T">Listener type that receives the page events and exposes the results.</typeparam>
/// <returns>The newly created pipeline page.</returns>
/// <exception cref="InvalidOperationException">
/// The listener returned no results, or results whose <c>AllBlocks</c> is null.
/// </exception>
public PipelinePage ParsePdf<T>()
    where T : class, IEventListener, IPipelineResults<BlockPage>, new()
{
    var listener = CreateInstance<T>();
    new PdfCanvasProcessor(listener).ProcessPageContent(_pdfPage);

    // The page size is fetched here but is not stored anywhere yet.
    var pageSize = _pdfPage.GetPageSize();

    var parsedPage = new PipelinePage(_pdf, _pageNumber)
    {
        LastResult = listener.GetResults()
    };

    if (parsedPage.LastResult == null || parsedPage.LastResult.AllBlocks == null)
    {
        throw new InvalidOperationException();
    }

    _page = parsedPage;
    return parsedPage;
}
/// <summary>Writes the XML-escaped content of one marked-content reference to the output.</summary>
/// <remarks>
/// For a reference with an MCID, the page's content is parsed once and cached in
/// <c>parsedTags</c>, and the text recorded for that MCID is written. Otherwise the
/// reference is treated as an object reference and the <c>/Subtype</c> name of the
/// referenced dictionary is written. An empty string is written when nothing is found.
/// </remarks>
/// <param name="kid">the marked-content reference to write</param>
/// <exception cref="System.IO.IOException"/>
protected internal virtual void ParseTag(PdfMcr kid)
{
    int mcid = kid.GetMcid();
    PdfDictionary pageDic = kid.GetPageObject();
    String tagContent = "";
    if (mcid != -1)
    {
        if (!parsedTags.ContainsKey(pageDic))
        {
            // First reference on this page: parse the page content once and cache the result.
            TaggedPdfReaderTool.MarkedContentEventListener listener =
                new TaggedPdfReaderTool.MarkedContentEventListener(this);
            PdfCanvasProcessor processor = new PdfCanvasProcessor(listener);
            PdfPage page = document.GetPage(pageDic);
            processor.ProcessContent(page.GetContentBytes(), page.GetResources());
            parsedTags[pageDic] = listener.GetMcidContent();
        }
        if (parsedTags.Get(pageDic).ContainsKey(mcid))
        {
            tagContent = parsedTags.Get(pageDic).Get(mcid);
        }
    }
    else
    {
        PdfObjRef objRef = (PdfObjRef)kid;
        PdfObject @object = objRef.GetReferencedObject();
        // Guard against a missing referenced object or a dictionary without /Subtype;
        // both would otherwise end in a NullReferenceException.
        if (@object != null && @object.IsDictionary())
        {
            PdfName subtype = ((PdfDictionary)@object).GetAsName(PdfName.Subtype);
            if (subtype != null)
            {
                tagContent = subtype.ToString();
            }
        }
    }
    @out.Write(EscapeXML(tagContent, true));
}
/// <summary>
/// Processes the first page of simple_text.pdf with a
/// <c>CharacterPositionEventListener</c>.
/// </summary>
public virtual void TestCharacterRenderInfos()
{
    PdfCanvasProcessor parser = new PdfCanvasProcessor(new TextRenderInfoTest.CharacterPositionEventListener());

    // Keep a reference to the document so it can be closed; creating it inline left
    // the underlying file open with no way to release it.
    PdfDocument pdfDocument = new PdfDocument(new PdfReader(sourceFolder + "simple_text.pdf"));
    try
    {
        parser.ProcessPageContent(pdfDocument.GetPage(FIRST_PAGE));
    }
    finally
    {
        pdfDocument.Close();
    }
}
/// <summary>
/// Feeds the content of one page to an extraction strategy, with extra content operators
/// registered on the processor, and returns the strategy's text.
/// </summary>
/// <param name="page">the page whose content is processed</param>
/// <param name="strategy">the strategy that receives the page events and produces the text</param>
/// <param name="additionalContentOperators">
/// custom
/// <see cref="IContentOperator"/>
/// instances, keyed by operator string, passed to the processor's constructor
/// </param>
/// <returns>the text reported by <paramref name="strategy"/> after processing</returns>
public static String GetTextFromPage(PdfPage page, ITextExtractionStrategy strategy,
    IDictionary<String, IContentOperator> additionalContentOperators)
{
    new PdfCanvasProcessor(strategy, additionalContentOperators).ProcessPageContent(page);
    String extracted = strategy.GetResultantText();
    return extracted;
}
/// <summary>
/// Parses the Oy-axis region of a page: returns its non-blank text lines and stores the
/// results of a <c>TextExtractionStrategy</c> for the same region in <c>PositionOyAxis</c>.
/// </summary>
/// <param name="page">Page to read.</param>
/// <returns>The non-blank lines found in the region, one per line.</returns>
internal StringBuilder ParsingOyAxis(PdfPage page)
{
    // Area of the page to read.
    Rectangle readBox = new Rectangle(Margin.Left, Margin.Bottom + 60, 20,
        page.GetPageSize().GetHeight() - Margin.Bottom - 160);
    TextRegionEventFilter readText = new TextRegionEventFilter(readBox);

    // Both strategies are attached before processing so the page content is parsed
    // once and each strategy receives every event exactly once.
    FilteredEventListener listener = new FilteredEventListener();
    LocationTextExtractionStrategy extractor =
        listener.AttachEventListener(new LocationTextExtractionStrategy(), readText);
    TextExtractionStrategy strategy =
        listener.AttachEventListener(new TextExtractionStrategy(), readText);

    lock (block)
    {
        PdfCanvasProcessor parser = new PdfCanvasProcessor(listener);
        parser.ProcessPageContent(page);
        parser.Reset();
    }

    // Keep every line (row) that is not blank.
    StringBuilder result = new StringBuilder();
    string[] lines = extractor.GetResultantText().Split('\n');
    foreach (string line in lines)
    {
        if (!string.IsNullOrWhiteSpace(line))
        {
            result.AppendLine(line);
        }
    }

    PositionOyAxis = strategy.TextResult.ToArray();
    return result;
}
/// <summary>
/// Processes one page of the document with the given listener and custom content operators.
/// </summary>
/// <remarks>
/// The operators are handed to the
/// <c>PdfCanvasProcessor</c>
/// constructor together with the listener.
/// </remarks>
/// <param name="pageNumber">number of the page to process</param>
/// <param name="renderListener">listener that receives the render callbacks</param>
/// <param name="additionalContentOperators">custom content operators passed to the processor</param>
/// <returns>the same <paramref name="renderListener"/> instance that was passed in</returns>
public virtual E ProcessContent<E>(int pageNumber, E renderListener,
    IDictionary<String, IContentOperator> additionalContentOperators)
    where E : IEventListener
{
    PdfPage targetPage = pdfDocument.GetPage(pageNumber);
    new PdfCanvasProcessor(renderListener, additionalContentOperators).ProcessPageContent(targetPage);
    return renderListener;
}
/// <summary>
/// Processes page 1 of closingEmptyPath.pdf with a no-op listener; the test passes when
/// no exception is thrown.
/// </summary>
public virtual void TestClosingEmptyPath()
{
    String fileName = "closingEmptyPath.pdf";
    PdfDocument document = new PdfDocument(new PdfReader(sourceFolder + fileName));
    try
    {
        PdfCanvasProcessor processor = new PdfCanvasProcessor(new PdfCanvasProcessorTest.NoOpEventListener());
        // Assert that no exception is thrown when an empty path is handled
        processor.ProcessPageContent(document.GetPage(1));
    }
    finally
    {
        // Release the source file regardless of the outcome.
        document.Close();
    }
}
/// <summary>
/// Creates the listener and, for bordered tables, immediately processes the page and
/// derives the tables from its borders.
/// </summary>
/// <param name="pdfPage">the PDF page to process</param>
/// <param name="withBorder">
/// true to store the page, process it and call <c>GetTablesFromborders</c>;
/// false leaves the instance untouched
/// </param>
public FilterTableEventListener(PdfPage pdfPage, bool withBorder)
{
    if (!withBorder)
    {
        // Nothing is initialised for tables without full borders.
        return;
    }

    this.pdfPage = pdfPage;
    new PdfCanvasProcessor(this).ProcessPageContent(pdfPage);
    GetTablesFromborders();
}
/// <summary>Returns the text of the first page of a PDF file.</summary>
/// <param name="fileName">Path of the PDF file to read.</param>
/// <returns>The text produced by a <c>LocationTextExtractionStrategy</c> for the first page.</returns>
public static string GetResultantText(string fileName)
{
    using (PdfDocument pdfDoc = new PdfDocument(new PdfReader(fileName)))
    {
        LocationTextExtractionStrategy strategy = new LocationTextExtractionStrategy();
        new PdfCanvasProcessor(strategy).ProcessPageContent(pdfDoc.GetFirstPage());
        return strategy.GetResultantText();
    }
}
/// <summary>Processes the first page of a PDF file with an <c>AnalyzeTextListener</c>.</summary>
/// <param name="filename">Path of the PDF file to read.</param>
static void AnalyzeTextFromListener(string filename)
{
    using (PdfDocument pdf = new PdfDocument(new PdfReader(filename)))
    {
        PdfPage firstPage = pdf.GetFirstPage();
        AnalyzeTextListener listener = new AnalyzeTextListener();
        new PdfCanvasProcessor(listener).ProcessPageContent(firstPage);
    }
}
/// <summary>Processes the first page of a PDF file with a <c>UserPathListener</c>.</summary>
/// <param name="filename">Path of the PDF file to read.</param>
static void ShowLinesFromListener(string filename)
{
    using (PdfDocument pdf = new PdfDocument(new PdfReader(filename)))
    {
        PdfPage firstPage = pdf.GetFirstPage();
        UserPathListener listener = new UserPathListener();
        new PdfCanvasProcessor(listener).ProcessPageContent(firstPage);
    }
}
/// <summary>
/// Processes every page of yaxiststar.pdf, each with a new <c>_IEventListener_40</c>.
/// </summary>
public virtual void ContentStreamProcessorTest()
{
    PdfReader reader = new PdfReader(sourceFolder + "yaxiststar.pdf");
    PdfWriter writer = new PdfWriter(new ByteArrayOutputStream());
    PdfDocument document = new PdfDocument(reader, writer);

    int pageIndex = 1;
    while (pageIndex <= document.GetNumberOfPages())
    {
        // Each page gets its own processor and listener.
        new PdfCanvasProcessor(new _IEventListener_40()).ProcessPageContent(document.GetPage(pageIndex));
        ++pageIndex;
    }
}
/// <summary>
/// Processes a page with the given location strategy and returns the rectangle of every
/// location the strategy reports.
/// </summary>
/// <param name="strategy">strategy that receives the page events</param>
/// <param name="page">page to process</param>
/// <returns>the rectangles, in the order the strategy returns its locations</returns>
private ICollection<Rectangle> ProcessPage(ILocationExtractionStrategy strategy, PdfPage page)
{
    new PdfCanvasProcessor(strategy).ProcessPageContent(page);

    IList<Rectangle> rectangles = new List<Rectangle>();
    foreach (IPdfTextLocation location in strategy.GetResultantLocations())
    {
        Rectangle area = location.GetRectangle();
        rectangles.Add(area);
    }
    return rectangles;
}
/// <summary>
/// Verifies that extracting the image from a PDF that claims byte-aligned lines but is
/// not byte-aligned fails with the expected <c>iText.IO.IOException</c> message.
/// </summary>
public virtual void ExpectedByteAlignedTiffImageExtractionTest()
{
    NUnit.Framework.Assert.That(() =>
    {
        //Byte-aligned image is expected in pdf file, but in fact it's not
        String inFileName = sourceFolder + "expectedByteAlignedTiffImageExtraction.pdf";
        PdfDocument pdfDocument = new PdfDocument(new PdfReader(inFileName));
        try
        {
            GetImageBytesTest.ImageExtractor listener = new GetImageBytesTest.ImageExtractor(this);
            PdfCanvasProcessor processor = new PdfCanvasProcessor(listener);
            processor.ProcessPageContent(pdfDocument.GetPage(1));
        }
        finally
        {
            // The expected exception leaves through here; close the reader so the
            // source file is not left open.
            pdfDocument.Close();
        }
    }
    , NUnit.Framework.Throws.InstanceOf<iText.IO.IOException>().With.Message.EqualTo(
        MessageFormatUtil.Format(iText.IO.IOException.ExpectedTrailingZeroBitsForByteAlignedLines)));
}
/// <summary>
/// Expects an <c>OutOfMemoryException</c> when the first page of
/// circularReferencesInResources.pdf is processed.
/// </summary>
public virtual void ParseCircularReferencesInResourcesTest()
{
    NUnit.Framework.Assert.That(() =>
    {
        String fileName = "circularReferencesInResources.pdf";
        PdfReader reader = new PdfReader(sourceFolder + fileName);
        PdfDocument pdfDocument = new PdfDocument(reader);
        PdfCanvasProcessor processor = new PdfCanvasProcessor(new PdfCanvasProcessorTest.NoOpEventListener());
        PdfPage firstPage = pdfDocument.GetFirstPage();
        processor.ProcessPageContent(firstPage);
        pdfDocument.Close();
    }
    , NUnit.Framework.Throws.InstanceOf<OutOfMemoryException>());
}
/// <summary>
/// Opens the PDF at the given path, processes its first page with a new
/// <c>ExtractionStrategy</c> for the given layer and returns that strategy.
/// </summary>
/// <param name="pdfPath">path of the PDF document to read</param>
/// <param name="layerName">layer name passed to the strategy's constructor</param>
/// <param name="useActualText">value passed to <c>SetUseActualText</c> on the strategy</param>
/// <returns>the strategy after it has processed the first page</returns>
public static ExtractionStrategy GetExtractionStrategy(String pdfPath, String layerName, bool useActualText)
{
    PdfDocument pdfDocument = new PdfDocument(new PdfReader(pdfPath));
    try
    {
        ExtractionStrategy strategy = new ExtractionStrategy(layerName);
        strategy.SetUseActualText(useActualText);

        PdfCanvasProcessor processor = new PdfCanvasProcessor(strategy);
        processor.ProcessPageContent(pdfDocument.GetFirstPage());
        return strategy;
    }
    finally
    {
        // Close the document even when processing throws, so the file is released.
        pdfDocument.Close();
    }
}
/// <summary>
/// Processes page 1 of the input PDF with a <c>CharacterPositionEventListener</c> and
/// checks the glyph width it recorded.
/// </summary>
public virtual void CheckBboxCalculationForType3FontsWithFontMatrix02()
{
    String inputPdf = sourceFolder + "checkBboxCalculationForType3FontsWithFontMatrix02.pdf";
    PdfDocument pdfDocument = new PdfDocument(new PdfReader(inputPdf));
    try
    {
        GlyphBboxCalculationTest.CharacterPositionEventListener listener =
            new GlyphBboxCalculationTest.CharacterPositionEventListener();
        PdfCanvasProcessor processor = new PdfCanvasProcessor(listener);
        processor.ProcessPageContent(pdfDocument.GetPage(1));
        // font size (36) * |fontMatrix| (1) * glyph width (0.6) = 21.6
        NUnit.Framework.Assert.AreEqual(21.6, listener.glyphWith, 1e-5);
    }
    finally
    {
        // Release the input file whether or not the assertion passes.
        pdfDocument.Close();
    }
}
/// <summary>Processes content from the specified page number using the specified listener.</summary>
/// <remarks>
/// Processes content from the specified page number using the specified listener.
/// Also allows registration of custom ContentOperators; each entry of the map is
/// registered on the processor before the page is processed.
/// </remarks>
/// <param name="pageNumber">the page number to process</param>
/// <param name="renderListener">the listener that will receive render callbacks</param>
/// <param name="additionalContentOperators">
/// an optional map of custom ContentOperators for rendering instructions;
/// <c>null</c> is treated as "no additional operators"
/// </param>
/// <returns>the provided renderListener</returns>
public virtual E ProcessContent<E>(int pageNumber, E renderListener,
    IDictionary<String, IContentOperator> additionalContentOperators)
    where E : IEventListener
{
    PdfCanvasProcessor processor = new PdfCanvasProcessor(renderListener);

    // The map is documented as optional, so a null map must not fail in the loop.
    if (additionalContentOperators != null)
    {
        foreach (KeyValuePair<String, IContentOperator> entry in additionalContentOperators)
        {
            processor.RegisterContentOperator(entry.Key, entry.Value);
        }
    }

    processor.ProcessPageContent(pdfDocument.GetPage(pageNumber));
    return renderListener;
}
/// <summary>
/// Processes page 1 of the input PDF with a <c>CharacterPositionEventListener</c> and
/// checks the average width of the font program of the first recorded text render info.
/// </summary>
public virtual void CheckAverageBboxCalculationForType3FontsWithFontMatrix01Test()
{
    String inputPdf = sourceFolder + "checkAverageBboxCalculationForType3FontsWithFontMatrix01.pdf";
    PdfDocument pdfDocument = new PdfDocument(new PdfReader(inputPdf));
    try
    {
        GlyphBboxCalculationTest.CharacterPositionEventListener listener =
            new GlyphBboxCalculationTest.CharacterPositionEventListener();
        PdfCanvasProcessor processor = new PdfCanvasProcessor(listener);
        processor.ProcessPageContent(pdfDocument.GetPage(1));
        NUnit.Framework.Assert.AreEqual(600,
            listener.firstTextRenderInfo.GetFont().GetFontProgram().GetAvgWidth(), 0.01f);
    }
    finally
    {
        // Release the input file whether or not the assertion passes.
        pdfDocument.Close();
    }
}
/// <summary>
/// Converts a page with a blue body background using the given executable and option,
/// then checks whether the single output page contains a rectangle.
/// </summary>
/// <param name="exeFileName">Executable passed to <c>HtmlToPdfRunner</c>.</param>
/// <param name="commandLineParameter">Optional switch placed before the file arguments.</param>
/// <param name="expectedBackground">True when exactly one bounding box is expected, false for none.</param>
public void BackgroundTest(string exeFileName, string commandLineParameter, bool expectedBackground)
{
    var runner = new HtmlToPdfRunner(exeFileName);
    string html = @"<!DOCTYPE html> <html> <head> </head> <body style=""background-color:blue;""> Test Page </body> </html>";

    using (var htmlFile = new TempHtmlFile(html))
    using (var pdfFile = new TempPdfFile(this.TestContext))
    {
        // The optional switch, when given, precedes the quoted input and output paths.
        string optionPrefix = string.IsNullOrEmpty(commandLineParameter)
            ? string.Empty
            : $"{commandLineParameter} ";
        string commandLine = optionPrefix + $"\"{htmlFile.FilePath}\" \"{pdfFile.FilePath}\"";

        HtmlToPdfRunResult result = runner.Run(commandLine);
        Assert.AreEqual(0, result.ExitCode, result.Output);

        using (var pdfReader = new PdfReader(pdfFile.FilePath))
        using (var pdfDocument = new PdfDocument(pdfReader))
        {
            Assert.AreEqual(1, pdfDocument.GetNumberOfPages());

            PdfPage firstPage = pdfDocument.GetPage(1);
            var rectangleFinder = new RectangleFinder();
            new PdfCanvasProcessor(rectangleFinder).ProcessPageContent(firstPage);

            ICollection<Rectangle> boxes = rectangleFinder.GetBoundingBoxes();
            int expectedBoxCount = expectedBackground ? 1 : 0;
            Assert.AreEqual(expectedBoxCount, boxes.Count());
        }
    }
}
/// <summary>
/// Extracts the text of the first page of noninvertibleMatrix.pdf with a location
/// strategy and compares it with the expected string.
/// </summary>
public virtual void TestNoninvertibleMatrix()
{
    String fileName = "noninvertibleMatrix.pdf";
    PdfReader reader = new PdfReader(sourceFolder + fileName);
    PdfDocument pdfDocument = new PdfDocument(reader);

    LocationTextExtractionStrategy strategy = new LocationTextExtractionStrategy();
    PdfCanvasProcessor processor = new PdfCanvasProcessor(strategy);
    PdfPage firstPage = pdfDocument.GetFirstPage();
    processor.ProcessPageContent(firstPage);

    // Read the text before closing the document; the comparison happens afterwards.
    String resultantText = strategy.GetResultantText();
    pdfDocument.Close();

    String expectedText = "Hello World!\nHello World!\nHello World!\nHello World! Hello World! Hello World!";
    NUnit.Framework.Assert.AreEqual(expectedText, resultantText);
}
/// <summary>Returns the text of the first page of the PDF at the given path.</summary>
/// <param name="path">Path of the PDF file to read.</param>
/// <returns>The text produced by a <c>LocationTextExtractionStrategy</c> for the first page.</returns>
public static string GetPDFFromFile(string path)
{
    PdfDocument pdfDoc = new PdfDocument(new PdfReader(path));
    try
    {
        LocationTextExtractionStrategy strategy = new LocationTextExtractionStrategy();
        PdfCanvasProcessor parser = new PdfCanvasProcessor(strategy);
        // Known limitation: only the first page is read. Sample documents are all one page long.
        parser.ProcessPageContent(pdfDoc.GetFirstPage());
        return strategy.GetResultantText();
    }
    finally
    {
        // Close the document even when parsing throws, so the file is released.
        pdfDoc.Close();
    }
}