/// <summary>
/// Parses a PDF by obtaining an image for each page (via <c>GetImage</c>) and
/// running the OCR image parser over it.
/// </summary>
/// <param name="request">Parsing request; supplies the file to load and the page limit.</param>
/// <returns>An already-completed task holding the result, marked as <c>ParsingType.OCR</c>.</returns>
/// <exception cref="ArgumentNullException"><paramref name="request"/> is null.</exception>
public Task<ParsingResult> Parse(ParsingRequest request)
{
    if (request == null)
    {
        throw new ArgumentNullException(nameof(request));
    }

    logger.LogDebug("Parsing [{0}]", request.File.FullName);
    var document = new RawDocument();
    using (var documentProcessor = new PdfDocumentProcessor())
    {
        documentProcessor.LoadDocument(request.File.FullName);

        // Never process more pages than the loaded document actually has.
        var pages = Math.Min(request.MaxPages, documentProcessor.Document.Pages.Count);
        document.Pages = new RawPage[pages];

        // i is the 1-based page number handed to GetImage; the result goes into slot i - 1.
        for (var i = 1; i <= pages; i++)
        {
            var data = GetImage(request, documentProcessor, i);
            document.Pages[i - 1] = new RawPage { Blocks = ocrImageParser.Parse(data).ToArray() };
        }
    }

    return Task.FromResult(new ParsingResult(document, request, ParsingType.OCR));
}
/// <summary>
/// Runs <c>ExtractBlocks</c> and then wraps at most <paramref name="maxPages"/>
/// of the collected pages in a new <c>RawDocument</c>.
/// </summary>
/// <param name="maxPages">Upper bound on the number of pages copied into the result.</param>
/// <returns>A new document whose <c>Pages</c> array holds the leading pages.</returns>
public RawDocument GenerateResult(int maxPages)
{
    ExtractBlocks();

    return new RawDocument
    {
        Pages = pages.Take(maxPages).ToArray()
    };
}
/// <summary>
/// Creates the window as a GTK top-level window, builds its widgets, clears
/// the document references and then adds the columns and buttons.
/// </summary>
public MarkerWindow ()
	: base (Gtk.WindowType.Toplevel)
{
	Build ();

	// No document is loaded yet.
	rdocument = null;
	ndocument = null;
	html_document = null;

	AddColumns ();
	AddButtons ();
}
// Checks that a RawDocument built from each PDFPoppler in test_docs is an
// instance of the type resolved from "Scielo.PDF2Text.RawDocument".
// Failure messages are "CI" followed by the zero-based document index.
public void ConstructorInterfase ()
{
	Type expectedType = Type.GetType ("Scielo.PDF2Text.RawDocument");
	int index = 0;

	foreach (PDFPoppler source in test_docs) {
		RawDocument created = new RawDocument (source);
		string label = "CI" + index;

		Assert.IsInstanceOfType (expectedType, created, label);
		index += 1;
	}
}
// Checks that the (text, format) constructor of RawDocument yields an instance
// of the type resolved from "Scielo.PDF2Text.RawDocument" for an empty string,
// a plain string and a string padded with spaces.
// Each assertion carries its own message so a failure identifies the input.
public void ConstructorString ()
{
	RawDocument rdoc0 = new RawDocument ("", "atm");
	RawDocument rdoc1 = new RawDocument ("Hola Mundo", "atm");
	RawDocument rdoc2 = new RawDocument (" ad ", "atm");
	Type etype = Type.GetType ("Scielo.PDF2Text.RawDocument");

	Assert.IsInstanceOfType (etype, rdoc0, "CI01");
	Assert.IsInstanceOfType (etype, rdoc1, "CI02");
	Assert.IsInstanceOfType (etype, rdoc2, "CI03");
}
/// <summary>
/// Builds a normalizer for <paramref name="document"/> using the style named
/// by <paramref name="format"/>: loads the style rules, breaks the document's
/// columns when the style declares more than one, and encodes the document text.
/// </summary>
/// <param name="document">Document whose text is encoded.</param>
/// <param name="format">Style name passed to <c>StyleReader</c>.</param>
public Normalizer (RawDocument document, string format)
{
	this.format = format;

	// Build a StyleReader to obtain the regular expressions.
	StyleReader reader = new StyleReader (format);
	rules = reader.GetRules ();

	// If the style has more than one column, the document is broken up and
	// converted to a single column.
	bool multiColumn = reader.GetNumColumns () > 1;
	if (multiColumn) {
		document.BreakColumns ();
	}

	EncodeText (document.GetText ());
}
/// <summary>
/// Builds one request per page of <paramref name="rawDocument"/> — the page's
/// block texts joined with a single space, identified by the page index — and
/// passes them to <c>GetSentiment</c>.
/// </summary>
/// <param name="domain">Domain forwarded unchanged to <c>GetSentiment</c>.</param>
/// <param name="rawDocument">Document whose pages are turned into requests.</param>
/// <returns>The task returned by <c>GetSentiment</c>.</returns>
public Task<Document[]> Extract(string domain, RawDocument rawDocument)
{
    logger.LogDebug("Parsing");

    var pages = rawDocument.Pages;
    var requests = new SingleRequestData[pages.Length];
    for (var pageIndex = 0; pageIndex < pages.Length; pageIndex++)
    {
        requests[pageIndex] = new SingleRequestData
        {
            Text = pages[pageIndex].Blocks.Select(block => block.Text).AccumulateItems(" "),
            Id = pageIndex.ToString()
        };
    }

    return GetSentiment(domain, requests);
}
/// <summary>
/// Parses an image file: re-encodes it as TIFF in memory and runs the OCR
/// image parser over the bytes, producing a single-page document.
/// </summary>
/// <param name="request">Parsing request; supplies the image file and <c>MaxPages</c>.</param>
/// <returns>An already-completed task holding the result, marked as <c>ParsingType.OCR</c>.</returns>
/// <exception cref="ArgumentNullException"><paramref name="request"/> is null.</exception>
public Task<ParsingResult> Parse(ParsingRequest request)
{
    if (request == null)
    {
        throw new ArgumentNullException(nameof(request));
    }

    logger.LogDebug("Parsing [{0}]", request.File.FullName);

    byte[] data;

    // Image is IDisposable: dispose it as soon as the TIFF bytes are captured
    // so its native handle and the source file are released.
    using (var sourceImage = Image.FromFile(request.File.FullName))
    using (var byteStream = new MemoryStream())
    {
        sourceImage.Save(byteStream, ImageFormat.Tiff);
        data = byteStream.ToArray();
    }

    var document = new RawDocument();
    document.Pages = new[] { new RawPage() };

    // NOTE(review): MaxPages limits the number of OCR blocks kept for the single page here — confirm this is intended.
    document.Pages[0].Blocks = ocrImageParser.Parse(data).Take(request.MaxPages).ToArray();
    return Task.FromResult(new ParsingResult(document, request, ParsingType.OCR));
}
/// <summary>
/// Parses a PDF by reading the text of each page, one text block per page.
/// Returns an error result when no page yields any non-whitespace text.
/// </summary>
/// <param name="request">Parsing request; supplies the file to load and the page limit.</param>
/// <returns>
/// An already-completed task holding either the result marked as
/// <c>ParsingType.Extract</c> or the error result built by <c>ParsingResult.ConstructError</c>.
/// </returns>
/// <exception cref="ArgumentNullException"><paramref name="request"/> is null.</exception>
public Task<ParsingResult> Parse(ParsingRequest request)
{
    if (request == null)
    {
        throw new ArgumentNullException(nameof(request));
    }

    logger.LogDebug("Parsing [{0}]", request.File.FullName);

    var document = new RawDocument();
    var hasText = false;
    using (var documentProcessor = new PdfDocumentProcessor())
    {
        documentProcessor.LoadDocument(request.File.FullName);

        // Process at most MaxPages pages, and never more than the document has.
        var pages = Math.Min(request.MaxPages, documentProcessor.Document.Pages.Count);
        document.Pages = new RawPage[pages];

        // The array is indexed from zero; GetPageText receives index + 1.
        for (var index = 0; index < pages; index++)
        {
            var page = new RawPage { Blocks = new[] { new TextBlockItem() } };
            page.Blocks[0].Text = documentProcessor.GetPageText(index + 1);
            hasText |= !string.IsNullOrWhiteSpace(page.Blocks[0].Text);
            document.Pages[index] = page;
        }
    }

    if (hasText)
    {
        return Task.FromResult(new ParsingResult(document, request, ParsingType.Extract));
    }

    logger.LogInformation("Failed to find text in: [{0}]", request.File.FullName);
    return Task.FromResult(ParsingResult.ConstructError(request));
}
// Checks that the text of a RawDocument built from each PDFPoppler in
// test_docs equals the entry of raw_docs at the same position.
// Failure messages are "GT" followed by the zero-based document index.
public void GetText ()
{
	int index = 0;

	foreach (PDFPoppler source in test_docs) {
		RawDocument raw = new RawDocument (source);
		string actual = raw.GetText ();

		Assert.AreEqual (raw_docs [index], actual, "GT" + index);
		index++;
	}
}
// For each PDFPoppler in test_docs: writes the RawDocument to "temp01.txt" in
// the temp directory, reads the file back and compares it with the entry of
// raw_docs at the same position.
// Failure messages are "WD" followed by the zero-based document index.
public void WriteDocument ()
{
	int index = 0;

	foreach (PDFPoppler source in test_docs) {
		RawDocument raw = new RawDocument (source);

		string directory = Path.GetTempPath ();
		raw.WriteDocument (directory, "temp01", "txt");

		string writtenPath = Path.Combine (directory, "temp01.txt");
		string written = Test.ReadFile (writtenPath);

		Assert.AreEqual (raw_docs [index], written, "WD" + index);
		index += 1;
	}
}
// For each PDFPoppler in test_docs: normalizes the RawDocument with the style
// at the same position in styles, then checks that the raw document and the
// normalized document are instances of the types resolved from
// "Scielo.PDF2Text.RawDocument" and "Scielo.PDF2Text.NormDocument".
// Failure messages are "NM" followed by the zero-based document index.
public void Normalize ()
{
	Type rawType = Type.GetType ("Scielo.PDF2Text.RawDocument");
	Type normType = Type.GetType ("Scielo.PDF2Text.NormDocument");
	int index = 0;

	foreach (PDFPoppler source in test_docs) {
		RawDocument raw = new RawDocument (source);
		NormDocument normalized = raw.Normalize (styles [index]);

		Assert.IsInstanceOfType (rawType, raw, "NM" + index);
		Assert.IsInstanceOfType (normType, normalized, "NM" + index);
		index += 1;
	}
}
// Stores the object returned by CreateRawDocument in `instance`.
public void SetUp()
{
    instance = CreateRawDocument();
}
/// <summary>
/// Creates a parsing result for <paramref name="request"/>.
/// </summary>
/// <param name="document">Parsed document; must not be null.</param>
/// <param name="request">Request that produced the document; must not be null.</param>
/// <param name="processedAs">How the document was processed; may be null.</param>
/// <exception cref="ArgumentNullException">
/// <paramref name="document"/> or <paramref name="request"/> is null.
/// </exception>
public ParsingResult(RawDocument document, ParsingRequest request, ParsingType? processedAs)
{
    if (document is null)
    {
        throw new ArgumentNullException(nameof(document));
    }

    if (request is null)
    {
        throw new ArgumentNullException(nameof(request));
    }

    Document = document;
    Request = request;
    ProcessedAs = processedAs;
}
/// <summary>
/// Handler for the "Open" action: shows the PDF chooser and, when the user
/// confirms, loads the chosen document, shows its raw text in the text view,
/// enables the Markup and Normalize actions and clears the store.
/// </summary>
/// <param name="sender">Event source (unused).</param>
/// <param name="e">Event arguments (unused).</param>
private void OnOpenActivated (object sender, System.EventArgs e)
{
	OpenPDFDialog dialog = new OpenPDFDialog ();

	try {
		if (dialog.Run () == (int) ResponseType.Ok) {
			Uri uri = new Uri (dialog.Document);
			PDFPoppler reader = new PDFPoppler (uri);

			// Extract images from the document.
			reader.GetNonText ();

			// Extract text from the document.
			rdocument = reader.CreateRawDocument ();
			textview.Buffer.Text = rdocument.GetText ();

			Markup.Sensitive = true;
			Normalize.Sensitive = true;
			store.Clear ();
			// Logger.ClearList ();
		}
	} finally {
		// Destroy the dialog even when URI parsing or PDF loading throws;
		// the exception still propagates to the caller.
		dialog.Destroy ();
	}
}
// For each PDFPoppler in test_docs: normalizes the RawDocument with the style
// at the same position in styles and checks that the normalized text equals
// the entry of norm_docs at that position.
// Failure messages are "GT" followed by the zero-based document index.
public void GetText ()
{
	int index = 0;

	foreach (PDFPoppler source in test_docs) {
		RawDocument raw = new RawDocument (source);
		NormDocument normalized = raw.Normalize (styles [index]);
		string actual = normalized.GetText ();

		Assert.AreEqual (norm_docs [index], actual, "GT" + index);
		index++;
	}
}