Example #1
0
        /// <summary>
        /// Loads the requested PDF, obtains an image for each page via <c>GetImage</c>
        /// and passes it to the OCR image parser to build the page blocks.
        /// </summary>
        /// <param name="request">Parsing request; <c>File</c> is the document to load and <c>MaxPages</c> caps how many pages are processed.</param>
        /// <returns>An already-completed task wrapping a result marked as <c>ParsingType.OCR</c>.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="request"/> is null.</exception>
        public Task <ParsingResult> Parse(ParsingRequest request)
        {
            if (request == null)
            {
                throw new ArgumentNullException(nameof(request));
            }

            logger.LogDebug("Parsing [{0}]", request.File.FullName);
            var document = new RawDocument();

            using (var documentProcessor = new PdfDocumentProcessor())
            {
                documentProcessor.LoadDocument(request.File.FullName);

                // Never process more pages than the document actually contains.
                var pages = Math.Min(request.MaxPages, documentProcessor.Document.Pages.Count);
                document.Pages = new RawPage[pages];

                // i runs from 1 (passed to GetImage as-is, presumably a 1-based page number);
                // the result array is 0-based, hence the i - 1 below.
                for (var i = 1; i <= pages; i++)
                {
                    var data = GetImage(request, documentProcessor, i);
                    var page = new RawPage
                    {
                        Blocks = ocrImageParser.Parse(data).ToArray()
                    };

                    document.Pages[i - 1] = page;
                }
            }

            return Task.FromResult(new ParsingResult(document, request, ParsingType.OCR));
        }
Example #2
0
        /// <summary>
        /// Runs <c>ExtractBlocks</c> and returns a document holding at most
        /// <paramref name="maxPages"/> of the collected pages.
        /// </summary>
        /// <param name="maxPages">Upper bound on the number of pages copied into the result.</param>
        /// <returns>A new <c>RawDocument</c> whose <c>Pages</c> array is taken from the front of <c>pages</c>.</returns>
        public RawDocument GenerateResult(int maxPages)
        {
            ExtractBlocks();

            return new RawDocument
            {
                Pages = pages.Take(maxPages).ToArray()
            };
        }
Example #3
0
 /// <summary>
 /// Creates the top-level window: calls <c>Build</c>, clears the document
 /// references, then calls <c>AddColumns</c> and <c>AddButtons</c>.
 /// </summary>
 public MarkerWindow() : base(Gtk.WindowType.Toplevel)
 {
     Build();

     // No document is loaded when the window is first created.
     html_document = null;
     ndocument = null;
     rdocument = null;

     AddColumns();
     AddButtons();
 }
Example #4
0
            // Builds a RawDocument from every PDFPoppler fixture in test_docs and
            // asserts that each result is of type Scielo.PDF2Text.RawDocument.
            public void ConstructorInterfase()
            {
                Type expectedType = Type.GetType ("Scielo.PDF2Text.RawDocument");
                int index = 0;

                foreach (PDFPoppler source in test_docs) {
                    RawDocument built = new RawDocument (source);

                    // The message carries the fixture index so a failure can be traced.
                    string label = "CI" + index;
                    Assert.IsInstanceOfType (expectedType, built, label);

                    index++;
                }
            }
Example #5
0
            // Builds RawDocument instances from an empty string, a short phrase and a
            // whitespace-padded string (all with the "atm" format) and asserts that each
            // one is of type Scielo.PDF2Text.RawDocument.
            public void ConstructorString()
            {
                // All three documents are constructed before any assertion runs.
                RawDocument[] built = {
                    new RawDocument ("", "atm"),
                    new RawDocument ("Hola Mundo", "atm"),
                    new RawDocument ("            ad        ", "atm")
                };

                Type expectedType = Type.GetType ("Scielo.PDF2Text.RawDocument");

                foreach (RawDocument candidate in built) {
                    Assert.IsInstanceOfType (expectedType, candidate, "CI01");
                }
            }
Example #6
0
        /// <summary>
        /// Reads the rules for <paramref name="format"/> and encodes the text of
        /// <paramref name="document"/>.
        /// </summary>
        /// <param name="document">Document whose text is passed to <c>EncodeText</c>; its columns are broken first when the style has more than one.</param>
        /// <param name="format">Style name handed to the <c>StyleReader</c> and stored on this instance.</param>
        public Normalizer(RawDocument document, string format)
        {
            // A StyleReader is built to obtain the regular expressions for this format.
            StyleReader reader = new StyleReader (format);
            this.format = format;
            rules = reader.GetRules ();

            // A style with more than one column is broken up and converted
            // to a single column.
            bool hasSeveralColumns = reader.GetNumColumns () > 1;
            if (hasSeveralColumns) {
                document.BreakColumns ();
            }

            EncodeText (document.GetText ());
        }
Example #7
0
        /// <summary>
        /// Builds one request per page of <paramref name="rawDocument"/> and passes
        /// them, together with <paramref name="domain"/>, to <c>GetSentiment</c>.
        /// </summary>
        /// <param name="domain">Forwarded unchanged to <c>GetSentiment</c>.</param>
        /// <param name="rawDocument">Document whose page blocks supply the request text.</param>
        /// <returns>The task returned by <c>GetSentiment</c>.</returns>
        public Task <Document[]> Extract(string domain, RawDocument rawDocument)
        {
            logger.LogDebug("Parsing");

            var pageCount = rawDocument.Pages.Length;
            var requests  = new SingleRequestData[pageCount];

            for (var index = 0; index < rawDocument.Pages.Length; index++)
            {
                // Text of all blocks on the page, joined through AccumulateItems(" ").
                var joinedText = rawDocument.Pages[index].Blocks.Select(block => block.Text).AccumulateItems(" ");

                // The page index doubles as the request id.
                requests[index] = new SingleRequestData
                {
                    Text = joinedText,
                    Id   = index.ToString()
                };
            }

            return GetSentiment(domain, requests);
        }
Example #8
0
        /// <summary>
        /// Loads the requested image file, re-encodes it as TIFF in memory and runs
        /// the OCR image parser over it, producing a single-page document.
        /// </summary>
        /// <param name="request">Parsing request; <c>File</c> is the image to load.</param>
        /// <returns>An already-completed task wrapping a result marked as <c>ParsingType.OCR</c>.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="request"/> is null.</exception>
        public Task <ParsingResult> Parse(ParsingRequest request)
        {
            if (request == null)
            {
                throw new ArgumentNullException(nameof(request));
            }

            logger.LogDebug("Parsing [{0}]", request.File.FullName);

            // Image is IDisposable; dispose it so its resources (and the underlying
            // file) are released as soon as the TIFF bytes have been produced.
            using (var sourceImage = Image.FromFile(request.File.FullName))
            using (var byteStream = new MemoryStream())
            {
                sourceImage.Save(byteStream, ImageFormat.Tiff);
                var data     = byteStream.ToArray();
                var document = new RawDocument();
                document.Pages = new[] { new RawPage() };

                // NOTE(review): MaxPages is applied here to the number of OCR blocks,
                // not to pages - kept as in the original; confirm this is intended.
                document.Pages[0].Blocks = ocrImageParser.Parse(data).Take(request.MaxPages).ToArray();
                return Task.FromResult(new ParsingResult(document, request, ParsingType.OCR));
            }
        }
        /// <summary>
        /// Loads the requested PDF and reads the text of each page through
        /// <c>GetPageText</c>, storing it as the single block of that page.
        /// </summary>
        /// <param name="request">Parsing request; <c>File</c> is the document to load and <c>MaxPages</c> caps how many pages are read.</param>
        /// <returns>
        /// An already-completed task: a result marked as <c>ParsingType.Extract</c> when at
        /// least one page has non-whitespace text, otherwise <c>ParsingResult.ConstructError</c>.
        /// </returns>
        /// <exception cref="ArgumentNullException"><paramref name="request"/> is null.</exception>
        public Task <ParsingResult> Parse(ParsingRequest request)
        {
            if (request == null)
            {
                throw new ArgumentNullException(nameof(request));
            }

            logger.LogDebug("Parsing [{0}]", request.File.FullName);
            var foundText = false;
            var document  = new RawDocument();

            using (var documentProcessor = new PdfDocumentProcessor())
            {
                documentProcessor.LoadDocument(request.File.FullName);

                // Never read more pages than the document actually contains.
                var pageCount = Math.Min(request.MaxPages, documentProcessor.Document.Pages.Count);
                document.Pages = new RawPage[pageCount];

                for (var index = 0; index < pageCount; index++)
                {
                    var page = new RawPage
                    {
                        Blocks = new[] { new TextBlockItem() }
                    };

                    // GetPageText is called with index + 1, i.e. numbering starts at 1.
                    page.Blocks[0].Text = documentProcessor.GetPageText(index + 1);
                    foundText |= !string.IsNullOrWhiteSpace(page.Blocks[0].Text);

                    document.Pages[index] = page;
                }
            }

            if (foundText)
            {
                return Task.FromResult(new ParsingResult(document, request, ParsingType.Extract));
            }

            // Every page was empty or whitespace-only: report it as an error result.
            logger.LogInformation("Failed to find text in: [{0}]", request.File.FullName);
            return Task.FromResult(ParsingResult.ConstructError(request));
        }
Example #10
0
            // For every PDFPoppler fixture in test_docs, builds a RawDocument and
            // asserts that its text equals the raw_docs entry at the same index.
            public void GetText()
            {
                int index = 0;

                foreach (PDFPoppler source in test_docs) {
                    string actual = new RawDocument (source).GetText ();

                    Assert.AreEqual (raw_docs[index], actual, "GT" + index);
                    index++;
                }
            }
Example #11
0
            // For every PDFPoppler fixture in test_docs, writes the RawDocument to
            // "temp01.txt" in the temp directory, reads the file back and asserts that
            // its content equals the raw_docs entry at the same index.
            public void WriteDocument()
            {
                int index = 0;

                foreach (PDFPoppler source in test_docs) {
                    RawDocument rawDocument = new RawDocument (source);
                    string outputDir = Path.GetTempPath ();

                    rawDocument.WriteDocument (outputDir, "temp01", "txt");

                    string writtenPath = Path.Combine (outputDir, "temp01.txt");
                    string written = Test.ReadFile (writtenPath);

                    Assert.AreEqual (raw_docs[index], written, "WD" + index);
                    index++;
                }
            }
Example #12
0
            // For every PDFPoppler fixture in test_docs, normalizes the RawDocument with
            // the style at the same index and asserts the runtime types of both the raw
            // and the normalized document.
            public void Normalize()
            {
                Type rawType = Type.GetType ("Scielo.PDF2Text.RawDocument");
                Type normType = Type.GetType ("Scielo.PDF2Text.NormDocument");
                int index = 0;

                foreach (PDFPoppler source in test_docs) {
                    RawDocument rawDocument = new RawDocument (source);
                    NormDocument normalized = rawDocument.Normalize (styles[index]);

                    string label = "NM" + index;
                    Assert.IsInstanceOfType (rawType, rawDocument, label);
                    Assert.IsInstanceOfType (normType, normalized, label);

                    index++;
                }
            }
 // Stores the result of CreateRawDocument() in `instance`.
 public void SetUp() => instance = CreateRawDocument();
 /// <summary>
 /// Creates a result from a parsed document and the request that produced it.
 /// </summary>
 /// <param name="document">Parsed document; must not be null.</param>
 /// <param name="request">Originating request; must not be null.</param>
 /// <param name="processedAs">How the document was processed; may be null.</param>
 /// <exception cref="ArgumentNullException"><paramref name="document"/> or <paramref name="request"/> is null.</exception>
 public ParsingResult(RawDocument document, ParsingRequest request, ParsingType? processedAs)
 {
     // document is validated before request, so it is reported first when both are null.
     if (document == null)
     {
         throw new ArgumentNullException(nameof(document));
     }

     if (request == null)
     {
         throw new ArgumentNullException(nameof(request));
     }

     Document    = document;
     Request     = request;
     ProcessedAs = processedAs;
 }
Example #15
0
        // Handler for the "open" action: shows the PDF dialog and, when it is
        // confirmed, loads the chosen document, shows its text in the text view and
        // enables the Markup and Normalize controls. The dialog is destroyed either way.
        private void OnOpenActivated(object sender, System.EventArgs e)
        {
            OpenPDFDialog picker = new OpenPDFDialog ();
            bool accepted = picker.Run () == (int) ResponseType.Ok;

            if (accepted) {
                PDFPoppler pdf = new PDFPoppler (new Uri (picker.Document));

                // Extract the images from the document.
                pdf.GetNonText ();

                // Extract the text from the document and display it.
                rdocument = pdf.CreateRawDocument ();
                textview.Buffer.Text = rdocument.GetText ();

                Markup.Sensitive = true;
                Normalize.Sensitive = true;
                store.Clear ();
                //			Logger.ClearList ();
            }

            picker.Destroy ();
        }
Example #16
0
            // For every PDFPoppler fixture in test_docs, normalizes the RawDocument with
            // the style at the same index and asserts that the normalized text equals
            // the norm_docs entry at that index.
            public void GetText()
            {
                int index = 0;

                foreach (PDFPoppler source in test_docs) {
                    RawDocument rawDocument = new RawDocument (source);
                    NormDocument normalized = rawDocument.Normalize (styles [index]);
                    string actual = normalized.GetText ();

                    Assert.AreEqual (norm_docs[index], actual, "GT" + index);
                    index++;
                }
            }