示例#1
0
        /// <summary>
        /// Renders page <paramref name="i"/> of the loaded PDF as a TIFF and returns the encoded bytes.
        /// When the request carries a black-and-white threshold, the page is converted with that
        /// threshold and re-encoded as TIFF before being returned.
        /// </summary>
        /// <param name="request">Parsing request; supplies the optional threshold and (in DEBUG builds) the dump file name.</param>
        /// <param name="documentProcessor">Processor with the PDF already loaded.</param>
        /// <param name="i">Page number passed to <c>CreateTiff</c>.</param>
        /// <returns>TIFF-encoded image bytes for the page.</returns>
        private byte[] GetImage(ParsingRequest request, PdfDocumentProcessor documentProcessor, int i)
        {
            using (var tiffStream = new MemoryStream())
            {
                var selectedPages = new[] { i };
                documentProcessor.CreateTiff(tiffStream, 1024 * 5, selectedPages);

                using (var pageImage = Image.FromStream(tiffStream))
                {
#if DEBUG
                    // Debug builds dump the rendered page next to the working directory for inspection.
                    pageImage.Save($"{request.File.Name}.jpeg", ImageFormat.Jpeg);
#endif
                    var threshold = request.BwThreshold;
                    if (threshold == null)
                    {
                        // No conversion requested: hand back the TIFF exactly as rendered.
                        return tiffStream.ToArray();
                    }

                    using (var monochrome = pageImage.GetBlackAndWhiteImage(threshold.Value))
                    {
#if DEBUG
                        monochrome.Save($"{request.File.Name}_BW.jpeg", ImageFormat.Jpeg);
#endif
                        using (var encoded = new MemoryStream())
                        {
                            monochrome.Save(encoded, ImageFormat.Tiff);
                            return encoded.ToArray();
                        }
                    }
                }
            }
        }
示例#2
0
        /// <summary>
        /// Runs OCR over the pages of the PDF referenced by <paramref name="request"/>.
        /// Each page is rendered to an image via <c>GetImage</c> and passed to the OCR image parser.
        /// </summary>
        /// <param name="request">Parsing request; <c>File</c> identifies the PDF and <c>MaxPages</c> caps the number of pages processed.</param>
        /// <returns>A completed task holding the OCR result.</returns>
        /// <exception cref="ArgumentNullException">
        /// Thrown synchronously (the method is not <c>async</c>) when <paramref name="request"/> is null.
        /// </exception>
        public Task <ParsingResult> Parse(ParsingRequest request)
        {
            if (request == null)
            {
                throw new ArgumentNullException(nameof(request));
            }

            logger.LogDebug("Parsing [{0}]", request.File.FullName);
            var document = new RawDocument();

            using (var documentProcessor = new PdfDocumentProcessor())
            {
                documentProcessor.LoadDocument(request.File.FullName);

                // Process at most MaxPages pages, never more than the document actually has.
                var available = documentProcessor.Document.Pages.Count;
                var pages     = request.MaxPages > available ? available : request.MaxPages;
                document.Pages = new RawPage[pages];

                // GetImage is called with i running from 1, while the result array is 0-based.
                for (var i = 1; i <= pages; i++)
                {
                    var data = GetImage(request, documentProcessor, i);
                    var page = new RawPage
                    {
                        Blocks = ocrImageParser.Parse(data).ToArray()
                    };

                    document.Pages[i - 1] = page;
                }
            }

            return(Task.FromResult(new ParsingResult(document, request, ParsingType.OCR)));
        }
示例#3
0
        /// <summary>
        /// Parses the same PDF twice via OCR, first without and then with a black-and-white
        /// threshold, and expects the thresholded run to produce the longer built text.
        /// </summary>
        public async Task ParseAmazon()
        {
            var pdfPath = Path.Combine(TestContext.CurrentContext.TestDirectory, "data", "AmazonWebServices.pdf");
            fileInfo = new FileInfo(pdfPath);
            var request = new ParsingRequest(fileInfo, ParsingType.OCR, 10);

            // First pass: no black-and-white conversion.
            var plainResult = await instance.Parse(request).ConfigureAwait(false);

            // Second pass: same request object, now with a threshold set.
            request.BwThreshold = 0.8f;
            var thresholdResult = await instance.Parse(request).ConfigureAwait(false);

            var plainLength     = plainResult.Document.Build().Length;
            var thresholdLength = thresholdResult.Document.Build().Length;

            Assert.Greater(thresholdLength, plainLength);
        }
示例#4
0
        /// <summary>
        /// Sends Research.pdf through the API-backed parser in OCR mode and checks
        /// the page count and a minimum length for the first page's built text.
        /// </summary>
        public async Task Parse()
        {
            var factory = new ApiClientFactory(wrapper.Client, wrapper.Client.BaseAddress);
            var parser  = new DocumentParser(factory);
            var pdfPath = Path.Combine(TestContext.CurrentContext.TestDirectory, "Data", "Research.pdf");
            var data    = File.ReadAllBytes(pdfPath);
            var request = new ParsingRequest
            {
                Name = "Test.pdf",
                Data = data,
                Type = RequestParsingType.OCR
            };

            var result = await parser.Parse(request, CancellationToken.None).ConfigureAwait(false);

            var parsedPages = result.Document.Pages;
            Assert.AreEqual(35, parsedPages.Length);
            Assert.GreaterOrEqual(parsedPages[0].Build().Length, 1718);
        }
示例#5
0
        /// <summary>
        /// Verifies that a parse request posted to the mocked endpoint yields the
        /// deserialized document from the canned JSON response.
        /// </summary>
        public async Task PostRequest()
        {
            // Serve the pre-built result as JSON from the mocked process-file endpoint.
            var payload = JsonConvert.SerializeObject(result);
            mockHttp.When("http://localhost/api/parser/processfile")
                    .Respond("application/json", payload);

            var request = new ParsingRequest
            {
                Name = "Test",
                Data = new byte[] { }
            };

            var actual = await instance.Parse(request, CancellationToken.None).ConfigureAwait(false);

            var firstBlock = actual.Document.Pages[0].Blocks[0];
            Assert.AreEqual("Text", firstBlock.Text);
        }
示例#6
0
        /// <summary>
        /// Posts <paramref name="request"/> to the <c>api/parser/processfile</c> endpoint and returns the parsed result.
        /// </summary>
        /// <param name="request">The parsing request to send.</param>
        /// <param name="token">Cancellation token forwarded to the HTTP call.</param>
        /// <returns>The parsing result carried in the response.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="request"/> is null.</exception>
        /// <exception cref="ApplicationException">The response does not report success.</exception>
        public async Task <ParsingResult> Parse(ParsingRequest request, CancellationToken token)
        {
            if (request == null)
            {
                throw new ArgumentNullException(nameof(request));
            }

            var response = await client
                .PostRequest <ParsingRequest, RawResponse <ParsingResult> >("api/parser/processfile", request, token)
                .ConfigureAwait(false);

            if (response.IsSuccess)
            {
                return response.Result.Value;
            }

            throw new ApplicationException("Failed to retrieve:" + response.HttpResponseMessage);
        }
示例#7
0
        /// <summary>
        /// Tries each inner parser whose type matches the request (or every parser when the
        /// request type is <c>ParsingType.Any</c>) in order, returning the first successful result.
        /// </summary>
        /// <param name="request">The parsing request to dispatch.</param>
        /// <returns>The first succeeded result, or an error result when no parser succeeds.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="request"/> is null.</exception>
        public async Task <ParsingResult> Parse(ParsingRequest request)
        {
            if (request == null)
            {
                throw new ArgumentNullException(nameof(request));
            }

            // Lazily filtered: the predicate is evaluated as the loop advances.
            var candidates = inner.Where(item => item.Type == request.Type || request.Type == ParsingType.Any);

            foreach (var candidate in candidates)
            {
                var attempt = await candidate.Parse(request).ConfigureAwait(false);
                if (!attempt.Succeeded)
                {
                    continue;
                }

                return attempt;
            }

            return ParsingResult.ConstructError(request);
        }
示例#8
0
        /// <summary>
        /// Runs OCR over a single image file: the image is re-encoded as TIFF in memory and the
        /// bytes are handed to the OCR image parser. The result is a one-page document.
        /// </summary>
        /// <param name="request">Parsing request; <c>File</c> identifies the image to read.</param>
        /// <returns>A completed task holding the OCR result.</returns>
        /// <exception cref="ArgumentNullException">
        /// Thrown synchronously (the method is not <c>async</c>) when <paramref name="request"/> is null.
        /// </exception>
        public Task <ParsingResult> Parse(ParsingRequest request)
        {
            if (request == null)
            {
                throw new ArgumentNullException(nameof(request));
            }

            logger.LogDebug("Parsing [{0}]", request.File.FullName);

            byte[] data;

            // Image is IDisposable; dispose it as soon as the TIFF bytes are captured
            // so the underlying handle is released deterministically.
            using (var sourceImage = Image.FromFile(request.File.FullName))
            using (var byteStream = new MemoryStream())
            {
                sourceImage.Save(byteStream, ImageFormat.Tiff);
                data = byteStream.ToArray();
            }

            var document = new RawDocument();
            document.Pages = new[] { new RawPage() };

            // NOTE(review): MaxPages is applied here as a cap on the number of OCR blocks,
            // since an image yields a single page - confirm this is the intended meaning.
            document.Pages[0].Blocks = ocrImageParser.Parse(data).Take(request.MaxPages).ToArray();
            return(Task.FromResult(new ParsingResult(document, request, ParsingType.OCR)));
        }
        /// <summary>
        /// Extracts the embedded text of each PDF page (up to <c>MaxPages</c>) into a one-block-per-page document.
        /// </summary>
        /// <param name="request">Parsing request; <c>File</c> identifies the PDF.</param>
        /// <returns>
        /// A completed task holding the extracted document, or an error result when none of the
        /// processed pages contains non-whitespace text.
        /// </returns>
        /// <exception cref="ArgumentNullException"><paramref name="request"/> is null.</exception>
        public Task <ParsingResult> Parse(ParsingRequest request)
        {
            if (request == null)
            {
                throw new ArgumentNullException(nameof(request));
            }

            logger.LogDebug("Parsing [{0}]", request.File.FullName);
            var extracted = new RawDocument();
            var foundText = false;

            using (var pdf = new PdfDocumentProcessor())
            {
                pdf.LoadDocument(request.File.FullName);

                // Cap the page count at whichever is smaller: the document size or MaxPages.
                var pageCount = request.MaxPages > pdf.Document.Pages.Count
                    ? pdf.Document.Pages.Count
                    : request.MaxPages;
                extracted.Pages = new RawPage[pageCount];

                for (var index = 0; index < pageCount; index++)
                {
                    var block = new TextBlockItem();
                    var page  = new RawPage();
                    page.Blocks = new[] { block };

                    // GetPageText is called with index + 1; the result array is 0-based.
                    block.Text = pdf.GetPageText(index + 1);
                    if (!string.IsNullOrWhiteSpace(block.Text))
                    {
                        foundText = true;
                    }

                    extracted.Pages[index] = page;
                }
            }

            if (foundText)
            {
                return Task.FromResult(new ParsingResult(extracted, request, ParsingType.Extract));
            }

            logger.LogInformation("Failed to find text in: [{0}]", request.File.FullName);
            return Task.FromResult(ParsingResult.ConstructError(request));
        }
        /// <summary>
        /// Loads the file referenced by <paramref name="request"/> into a <c>RichEditDocumentServer</c>,
        /// waits for the <c>DocumentFormatted</c> event, then walks the layout pages and the document
        /// content with visitors to build an extraction result limited to <c>MaxPages</c>.
        /// </summary>
        /// <param name="request">Parsing request; <c>File</c> identifies the document to load.</param>
        /// <returns>A result of type <c>ParsingType.Extract</c> built from the visitor output.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="request"/> is null.</exception>
        public async Task <ParsingResult> Parse(ParsingRequest request)
        {
            if (request == null)
            {
                throw new ArgumentNullException(nameof(request));
            }

            logger.LogDebug("Parsing [{0}]", request.File.FullName);
            using (var documentProcessor = new RichEditDocumentServer())
            {
                documentProcessor.LayoutCalculationMode = CalculationModeType.Automatic;
                documentProcessor.LayoutUnit            = DocumentLayoutUnit.Document;

                // The awaitable for the first DocumentFormatted event is created BEFORE
                // LoadDocument is called - presumably so the event cannot fire before anyone
                // is listening. Keep this ordering. (NOTE(review): confirm GetAwaiter subscribes eagerly.)
                var loaded = Observable.FromEventPattern <EventHandler, EventArgs>(
                    h => documentProcessor.DocumentLayout.DocumentFormatted += h,
                    h => documentProcessor.DocumentLayout.DocumentFormatted -= h)
                             .FirstOrDefaultAsync()
                             .GetAwaiter();

                documentProcessor.LoadDocument(request.File.FullName);
                await loaded;

                var iterator   = new DocumentIterator(documentProcessor.Document);
                var pageLayout = new CurrentLayoutVisitor();
                // The content visitor is constructed around the layout visitor, which is
                // populated by the layout pass below before the content pass runs.
                var visitor    = new DocumentVisitor(pageLayout);

                // Pass 1: visit every page-level layout element.
                var layoutIterator = new LayoutIterator(documentProcessor.DocumentLayout);
                while (layoutIterator.MoveNext(LayoutLevel.Page))
                {
                    layoutIterator.Current.Accept(pageLayout);
                }

                // Pass 2: visit the document content.
                while (iterator.MoveNext())
                {
                    iterator.Current.Accept(visitor);
                }

                return(new ParsingResult(visitor.GenerateResult(request.MaxPages), request, ParsingType.Extract));
            }
        }
示例#11
0
 /// <summary>
 /// Always reports failure: returns a completed task holding the error result for <paramref name="request"/>.
 /// </summary>
 public Task <ParsingResult> Parse(ParsingRequest request)
 {
     var failure = ParsingResult.ConstructError(request);
     return Task.FromResult(failure);
 }