/// <summary>
/// Extracts the text of every page of the PDF at <paramref name="path"/> on a
/// thread-pool thread and returns it wrapped in a <c>File</c> descriptor.
/// </summary>
/// <param name="path">Path of the PDF document to read.</param>
/// <returns>
/// A task producing a <c>File</c> whose <c>Path</c> is <paramref name="path"/>,
/// whose <c>Mime</c> is "application/pdf" and whose <c>Content</c> is the
/// concatenated text of all pages, in page order.
/// </returns>
public Task<File> Convert(string path)
{
    return Task.Run(() =>
    {
        var file = new File { Path = path, Mime = "application/pdf" };

        using (var document = new PdfDocument(new PdfReader(path)))
        {
            int numOfPages = document.GetNumberOfPages();
            var content = new StringBuilder();

            // PDF page numbers are 1-based.
            for (int i = 1; i <= numOfPages; i++)
            {
                // A LocationTextExtractionStrategy keeps every chunk it has ever
                // received, and PdfCanvasProcessor.Reset() does not clear it.
                // Reusing one strategy across pages therefore makes each
                // GetResultantText() call return the text of all pages processed
                // so far (merged by coordinates). Use a fresh listener, strategy
                // and processor for each page so only that page's text is appended.
                var listener = new FilteredEventListener();
                var extractionStrategy = listener
                    .AttachEventListener(new LocationTextExtractionStrategy());
                var processor = new PdfCanvasProcessor(listener);

                processor.ProcessPageContent(document.GetPage(i));
                content.Append(extractionStrategy.GetResultantText());
            }

            file.Content = content.ToString();
        }

        return file;
    });
}
/// <summary>
/// Parses the Oy-axis data from a page.
/// </summary>
/// <param name="page">Page to read.</param>
/// <returns>
/// The non-blank lines of text found inside the axis region, each terminated
/// by a line break.
/// </returns>
/// <remarks>
/// As a side effect, <c>PositionOyAxis</c> is assigned from the
/// <c>TextResult</c> collected by a <c>TextExtractionStrategy</c> restricted
/// to the same region.
/// </remarks>
internal StringBuilder ParsingOyAxis(PdfPage page)
{
    // Area limit for reading: starts at (Margin.Left, Margin.Bottom + 60),
    // is 20 units wide and (page height - Margin.Bottom - 160) units tall.
    Rectangle readBox = new Rectangle(
        Margin.Left,
        Margin.Bottom + 60,
        20,
        page.GetPageSize().GetHeight() - Margin.Bottom - 160);
    TextRegionEventFilter readText = new TextRegionEventFilter(readBox);
    FilteredEventListener listener = new FilteredEventListener();

    // Attach both strategies up front so the page content is parsed only once.
    // Parsing twice (once per strategy) doubles the work done under the lock
    // and needlessly re-feeds the first strategy during the second pass.
    LocationTextExtractionStrategy extractor =
        listener.AttachEventListener(new LocationTextExtractionStrategy(), readText);
    TextExtractionStrategy strategy =
        listener.AttachEventListener(new TextExtractionStrategy(), readText);

    lock (block)
    {
        PdfCanvasProcessor parser = new PdfCanvasProcessor(listener);
        parser.ProcessPageContent(page);
        parser.Reset();
    }

    // Read every line (row), skipping the blank ones.
    StringBuilder result = new StringBuilder();
    foreach (string line in extractor.GetResultantText().Split('\n'))
    {
        if (!string.IsNullOrWhiteSpace(line))
        {
            result.AppendLine(line);
        }
    }

    PositionOyAxis = strategy.TextResult.ToArray();

    return result;
}
/// <summary>
/// Reads the column names from a page.
/// </summary>
/// <param name="page">Page to read.</param>
/// <returns>
/// Every line of text extracted from the column-name region, each terminated
/// by a line break (blank lines included).
/// </returns>
internal StringBuilder ParsingColumns(PdfPage page)
{
    // Region to read: x = Margin.Left, y = page height - Margin.Top - 70,
    // width = (page width - Margin.Right) / 4, height = 10.
    var region = new Rectangle(
        Margin.Left,
        page.GetPageSize().GetHeight() - Margin.Top - 70,
        (page.GetPageSize().GetWidth() - Margin.Right) / 4,
        10);
    var regionFilter = new TextRegionEventFilter(region);
    var eventListener = new FilteredEventListener();

    // Location-based text extraction limited to the region above.
    var textStrategy = eventListener.AttachEventListener(
        new LocationTextExtractionStrategy(),
        regionFilter);

    lock (block)
    {
        var canvasProcessor = new PdfCanvasProcessor(eventListener);
        canvasProcessor.ProcessPageContent(page);
        canvasProcessor.Reset();
    }

    // Copy the extracted text row by row into the output.
    var output = new StringBuilder();
    string extracted = textStrategy.GetResultantText();
    foreach (string row in extracted.Split('\n'))
    {
        output.AppendLine(row);
    }

    return output;
}