/// <summary>
/// Renders one page of the loaded PDF to TIFF bytes, optionally converting it to black and white.
/// </summary>
/// <param name="request">Parsing request; its <c>BwThreshold</c> decides whether a black-and-white conversion is applied.</param>
/// <param name="documentProcessor">PDF processor that already holds the loaded document.</param>
/// <param name="i">Page number handed to <c>CreateTiff</c>.</param>
/// <returns>
/// The TIFF bytes produced by <c>CreateTiff</c> when no threshold is set; otherwise the TIFF bytes
/// of the black-and-white version of that image.
/// </returns>
private byte[] GetImage(ParsingRequest request, PdfDocumentProcessor documentProcessor, int i)
{
    // Second CreateTiff argument — presumably the largest edge length in pixels; confirm against the DevExpress docs.
    const int renderSize = 1024 * 5;

    using (var pageStream = new MemoryStream())
    {
        documentProcessor.CreateTiff(pageStream, renderSize, new[] { i });

        using (var pageImage = Image.FromStream(pageStream))
        {
#if DEBUG
            // Debug builds dump the rendered page next to the working directory for visual inspection.
            pageImage.Save($"{request.File.Name}.jpeg", ImageFormat.Jpeg);
#endif
            if (request.BwThreshold == null)
            {
                // No conversion requested: hand back the rendered TIFF untouched.
                return pageStream.ToArray();
            }

            using (var monochrome = pageImage.GetBlackAndWhiteImage(request.BwThreshold.Value))
            {
#if DEBUG
                monochrome.Save($"{request.File.Name}_BW.jpeg", ImageFormat.Jpeg);
#endif
                using (var monochromeStream = new MemoryStream())
                {
                    monochrome.Save(monochromeStream, ImageFormat.Tiff);
                    return monochromeStream.ToArray();
                }
            }
        }
    }
}
/// <summary>
/// Runs OCR over the pages of a PDF file: each page is rendered to an image via <c>GetImage</c>
/// and the image bytes are handed to <c>ocrImageParser</c>.
/// </summary>
/// <param name="request">Parsing request; <c>File</c> identifies the PDF and <c>MaxPages</c> caps how many pages are processed.</param>
/// <returns>A completed task holding a <see cref="ParsingResult"/> tagged <c>ParsingType.OCR</c>.</returns>
/// <exception cref="ArgumentNullException"><paramref name="request"/> is <c>null</c>.</exception>
public Task<ParsingResult> Parse(ParsingRequest request)
{
    if (request == null)
    {
        throw new ArgumentNullException(nameof(request));
    }

    logger.LogDebug("Parsing [{0}]", request.File.FullName);
    var document = new RawDocument();
    using (var documentProcessor = new PdfDocumentProcessor())
    {
        documentProcessor.LoadDocument(request.File.FullName);

        // Never process more pages than the document actually has.
        var pages = Math.Min(request.MaxPages, documentProcessor.Document.Pages.Count);
        document.Pages = new RawPage[pages];

        // The loop counter is passed to GetImage as a 1-based page number, while the result array is 0-based.
        for (var i = 1; i <= pages; i++)
        {
            var data = GetImage(request, documentProcessor, i);
            document.Pages[i - 1] = new RawPage
            {
                Blocks = ocrImageParser.Parse(data).ToArray()
            };
        }
    }

    return Task.FromResult(new ParsingResult(document, request, ParsingType.OCR));
}
/// <summary>
/// Parses the same PDF twice through OCR — first without, then with a black-and-white
/// threshold of 0.8 — and expects the thresholded run to yield the longer built text.
/// </summary>
public async Task ParseAmazon()
{
    var pdfPath = Path.Combine(TestContext.CurrentContext.TestDirectory, "data", "AmazonWebServices.pdf");
    fileInfo = new FileInfo(pdfPath);
    var request = new ParsingRequest(fileInfo, ParsingType.OCR, 10);

    // First pass: no threshold set on the request.
    var plainResult = await instance.Parse(request).ConfigureAwait(false);

    // Second pass: the same request object, now with the threshold applied.
    request.BwThreshold = 0.8f;
    var thresholdResult = await instance.Parse(request).ConfigureAwait(false);

    var plainLength = plainResult.Document.Build().Length;
    var thresholdLength = thresholdResult.Document.Build().Length;
    Assert.Greater(thresholdLength, plainLength);
}
/// <summary>
/// Sends Research.pdf through <c>DocumentParser</c> as an OCR request and checks the page count
/// and a minimum text length for the first page.
/// </summary>
public async Task Parse()
{
    var clientFactory = new ApiClientFactory(wrapper.Client, wrapper.Client.BaseAddress);
    var parser = new DocumentParser(clientFactory);

    var pdfPath = Path.Combine(TestContext.CurrentContext.TestDirectory, "Data", "Research.pdf");
    var pdfBytes = File.ReadAllBytes(pdfPath);
    var request = new ParsingRequest
    {
        Name = "Test.pdf",
        Data = pdfBytes,
        Type = RequestParsingType.OCR
    };

    var result = await parser.Parse(request, CancellationToken.None).ConfigureAwait(false);

    var parsedPages = result.Document.Pages;
    Assert.AreEqual(35, parsedPages.Length);
    Assert.GreaterOrEqual(parsedPages[0].Build().Length, 1718);
}
/// <summary>
/// Verifies that <c>Parse</c> posts to the process-file endpoint and deserializes the mocked
/// JSON response into a result whose first block carries the expected text.
/// </summary>
public async Task PostRequest()
{
    // Stub the process-file endpoint so it answers with the serialized fixture result.
    var responseJson = JsonConvert.SerializeObject(result);
    mockHttp
        .When("http://localhost/api/parser/processfile")
        .Respond("application/json", responseJson);

    var request = new ParsingRequest
    {
        Name = "Test",
        Data = new byte[] { }
    };

    var actual = await instance.Parse(request, CancellationToken.None).ConfigureAwait(false);

    var firstBlock = actual.Document.Pages[0].Blocks[0];
    Assert.AreEqual("Text", firstBlock.Text);
}
/// <summary>
/// Posts the request to the <c>api/parser/processfile</c> endpoint and returns the parsed result.
/// </summary>
/// <param name="request">Request to send.</param>
/// <param name="token">Cancellation token forwarded to the HTTP call.</param>
/// <returns>The <see cref="ParsingResult"/> carried by a successful response.</returns>
/// <exception cref="ArgumentNullException"><paramref name="request"/> is <c>null</c>.</exception>
/// <exception cref="ApplicationException">The response reports that the call did not succeed.</exception>
public async Task<ParsingResult> Parse(ParsingRequest request, CancellationToken token)
{
    if (request == null)
    {
        throw new ArgumentNullException(nameof(request));
    }

    var response = await client
        .PostRequest<ParsingRequest, RawResponse<ParsingResult>>("api/parser/processfile", request, token)
        .ConfigureAwait(false);

    if (response.IsSuccess)
    {
        return response.Result.Value;
    }

    throw new ApplicationException("Failed to retrieve:" + response.HttpResponseMessage);
}
/// <summary>
/// Tries each inner parser whose type matches the request (or every parser when the request type
/// is <c>ParsingType.Any</c>), in order, and returns the first result that succeeded.
/// </summary>
/// <param name="request">Request to parse.</param>
/// <returns>The first successful result, or an error result when no applicable parser succeeds.</returns>
/// <exception cref="ArgumentNullException"><paramref name="request"/> is <c>null</c>.</exception>
public async Task<ParsingResult> Parse(ParsingRequest request)
{
    if (request == null)
    {
        throw new ArgumentNullException(nameof(request));
    }

    // Lazily filtered, so the request type is re-read as each candidate is reached.
    var applicable =
        from candidate in inner
        where candidate.Type == request.Type || request.Type == ParsingType.Any
        select candidate;

    foreach (var candidate in applicable)
    {
        var attempt = await candidate.Parse(request).ConfigureAwait(false);
        if (attempt.Succeeded)
        {
            return attempt;
        }
    }

    return ParsingResult.ConstructError(request);
}
/// <summary>
/// Runs OCR over a single image file: the image is re-encoded as TIFF in memory and the bytes
/// are handed to <c>ocrImageParser</c>. The result is a one-page document.
/// </summary>
/// <param name="request">Parsing request; <c>File</c> identifies the image to load.</param>
/// <returns>A completed task holding a <see cref="ParsingResult"/> tagged <c>ParsingType.OCR</c>.</returns>
/// <exception cref="ArgumentNullException"><paramref name="request"/> is <c>null</c>.</exception>
public Task<ParsingResult> Parse(ParsingRequest request)
{
    if (request == null)
    {
        throw new ArgumentNullException(nameof(request));
    }

    logger.LogDebug("Parsing [{0}]", request.File.FullName);

    // Dispose the source image as soon as it has been re-encoded; an undisposed Image created by
    // Image.FromFile keeps its GDI+ handle and the lock on the source file until it is finalized.
    byte[] data;
    using (var sourceImage = Image.FromFile(request.File.FullName))
    using (var byteStream = new MemoryStream())
    {
        sourceImage.Save(byteStream, ImageFormat.Tiff);
        data = byteStream.ToArray();
    }

    var document = new RawDocument();
    document.Pages = new[] { new RawPage() };

    // NOTE(review): MaxPages limits the number of OCR blocks kept here, not pages — confirm this is intended.
    document.Pages[0].Blocks = ocrImageParser.Parse(data).Take(request.MaxPages).ToArray();

    return Task.FromResult(new ParsingResult(document, request, ParsingType.OCR));
}
/// <summary>
/// Extracts the embedded text of a PDF page by page, producing one text block per page.
/// </summary>
/// <param name="request">Parsing request; <c>File</c> identifies the PDF and <c>MaxPages</c> caps how many pages are read.</param>
/// <returns>
/// A completed task holding a result tagged <c>ParsingType.Extract</c>, or an error result when
/// every processed page yielded only null, empty or whitespace text.
/// </returns>
/// <exception cref="ArgumentNullException"><paramref name="request"/> is <c>null</c>.</exception>
public Task<ParsingResult> Parse(ParsingRequest request)
{
    if (request == null)
    {
        throw new ArgumentNullException(nameof(request));
    }

    logger.LogDebug("Parsing [{0}]", request.File.FullName);
    var document = new RawDocument();
    var foundText = false;

    using (var processor = new PdfDocumentProcessor())
    {
        processor.LoadDocument(request.File.FullName);

        // Process at most MaxPages pages, and never more than the document contains.
        var pageCount = Math.Min(request.MaxPages, processor.Document.Pages.Count);
        document.Pages = new RawPage[pageCount];

        // The counter is passed to GetPageText as a 1-based page number; the result array is 0-based.
        for (var pageNumber = 1; pageNumber <= pageCount; pageNumber++)
        {
            var block = new TextBlockItem { Text = processor.GetPageText(pageNumber) };
            foundText = foundText || !string.IsNullOrWhiteSpace(block.Text);
            document.Pages[pageNumber - 1] = new RawPage { Blocks = new[] { block } };
        }
    }

    if (foundText)
    {
        return Task.FromResult(new ParsingResult(document, request, ParsingType.Extract));
    }

    logger.LogInformation("Failed to find text in: [{0}]", request.File.FullName);
    return Task.FromResult(ParsingResult.ConstructError(request));
}
/// <summary>
/// Loads the requested file into a <c>RichEditDocumentServer</c>, waits for its layout to report
/// <c>DocumentFormatted</c>, then walks the page layout and the document content with visitors
/// to build the result.
/// </summary>
/// <param name="request">Parsing request; <c>File</c> identifies the document and <c>MaxPages</c> is passed to <c>GenerateResult</c>.</param>
/// <returns>A <see cref="ParsingResult"/> tagged <c>ParsingType.Extract</c>.</returns>
/// <exception cref="ArgumentNullException"><paramref name="request"/> is <c>null</c>.</exception>
public async Task <ParsingResult> Parse(ParsingRequest request)
{
    if (request == null)
    {
        throw new ArgumentNullException(nameof(request));
    }

    logger.LogDebug("Parsing [{0}]", request.File.FullName);
    using (var documentProcessor = new RichEditDocumentServer())
    {
        documentProcessor.LayoutCalculationMode = CalculationModeType.Automatic;
        documentProcessor.LayoutUnit = DocumentLayoutUnit.Document;

        // Wrap the DocumentFormatted event in an observable and take its awaiter BEFORE the
        // document is loaded.
        // NOTE(review): GetAwaiter() presumably subscribes immediately, so a notification raised
        // during/after LoadDocument is not missed — confirm against the Rx docs before reordering.
        var loaded = Observable.FromEventPattern <EventHandler, EventArgs>(
            h => documentProcessor.DocumentLayout.DocumentFormatted += h,
            h => documentProcessor.DocumentLayout.DocumentFormatted -= h)
            .FirstOrDefaultAsync()
            .GetAwaiter();
        documentProcessor.LoadDocument(request.File.FullName);

        // Continue only after the first DocumentFormatted notification has been observed.
        // NOTE(review): looks like this would wait indefinitely if the event is never raised — confirm.
        await loaded;

        var iterator = new DocumentIterator(documentProcessor.Document);
        var pageLayout = new CurrentLayoutVisitor();

        // The content visitor is constructed around the layout visitor.
        var visitor = new DocumentVisitor(pageLayout);
        var layoutIterator = new LayoutIterator(documentProcessor.DocumentLayout);

        // Pass 1: feed every page-level layout element to the layout visitor.
        // NOTE(review): presumably the content pass below relies on what this pass collected — TODO confirm.
        while (layoutIterator.MoveNext(LayoutLevel.Page))
        {
            layoutIterator.Current.Accept(pageLayout);
        }

        // Pass 2: feed every document element to the content visitor.
        while (iterator.MoveNext())
        {
            iterator.Current.Accept(visitor);
        }

        return(new ParsingResult(visitor.GenerateResult(request.MaxPages), request, ParsingType.Extract));
    }
}
/// <summary>
/// Parser that never attempts any work: it always answers with the error result built for the request.
/// </summary>
/// <param name="request">Request forwarded unchanged to <c>ParsingResult.ConstructError</c>.</param>
/// <returns>A completed task holding the error result.</returns>
public Task<ParsingResult> Parse(ParsingRequest request) =>
    Task.FromResult(ParsingResult.ConstructError(request));