/// <summary>
/// Selects the first entry of <c>_extractors</c> whose <c>CanHandle</c> check succeeds.
/// </summary>
/// <param name="documentFile">
/// Document stream an extractor is needed for. Only its <c>Id</c> is read here,
/// and only to build the failure message.
/// </param>
/// <returns>The first matching extractor.</returns>
/// <exception cref="TextExtractionException">
/// Thrown when no extractor in <c>_extractors</c> reports that it can handle the request.
/// </exception>
private ITextExtractor GetTextExtractor(BinaryDocumentStream documentFile)
{
    // NOTE(review): CanHandle is probed with an empty string rather than anything
    // derived from documentFile - confirm this is intentional.
    var match = _extractors.FirstOrDefault(candidate => candidate.CanHandle(""));
    if (match != null)
    {
        return match;
    }

    throw new TextExtractionException(
        documentFile.Id,
        "No extractor was found for document stream " + documentFile.Id);
}
/// <summary>
/// Extracts text from a document stream and wraps the outcome in an <c>ExtractedDocument</c>.
/// </summary>
/// <param name="documentStream">The stream to run through the extractor chosen by <c>GetTextExtractor</c>.</param>
/// <returns>
/// An <c>ExtractedDocument</c> built from the extractor's result, or one built from the
/// <c>TextExtractionException</c> when extractor lookup or extraction throws it.
/// </returns>
public ExtractedDocument ExtractText(BinaryDocumentStream documentStream)
{
    try
    {
        var extractor = GetTextExtractor(documentStream);
        var extracted = extractor.ExtractText(documentStream);
        return new ExtractedDocument(extracted);
    }
    catch (TextExtractionException failure)
    {
        // Extraction failures are captured in the result rather than propagated.
        return new ExtractedDocument(failure);
    }
}
/// <summary>
/// Extracts the embedded "Styx architecture.jpg" resource and checks that more than
/// ten fields are produced and that the document keeps the "test" domain.
/// </summary>
public void CanExtractJpgFromStream()
{
    const string fileName = "Styx architecture.jpg";

    using (var resource = ResourceUtils.GetManifestResourceStream("Testdata.Binary.Styx architecture.jpg"))
    {
        long elapsedMs;
        var input = new BinaryDocumentStream(fileName, "test", resource);

        var doc = ExtractStream(input, out elapsedMs);
        Console.WriteLine("{0} extraction time (ms):{1}", fileName, elapsedMs);

        doc.Fields.Count.Should().BeGreaterThan(10);
        doc.Domain.Should().Be("test");
    }
}
/// <summary>
/// Extracts the embedded "Hades.docx" resource and checks the domain, the field
/// count (more than ten) and that the content field contains a known phrase.
/// </summary>
public void CanExtractDocxFromStream()
{
    const string fileName = "Hades.docx";

    using (var resource = ResourceUtils.GetManifestResourceStream("Testdata.Binary.Hades.docx"))
    {
        long elapsedMs;
        var input = new BinaryDocumentStream(fileName, "test", resource);

        var doc = ExtractStream(input, out elapsedMs);
        Console.WriteLine("{0} extraction time (ms):{1}", fileName, elapsedMs);

        doc.Domain.Should().Be("test");
        doc.Fields.Should().NotBeNull();
        doc.Fields.Count.Should().BeGreaterThan(10);
        doc.GetFieldValue(ContentFieldName).Should().Contain("Hades uses Quartz for");
    }
}
/// <summary>
/// Extracts the embedded "Presentation with image.ppt" resource and checks the domain,
/// the field count (more than ten) and that the content field contains "Creuna".
/// </summary>
public void CanExtractImagedPowerpointFromStream()
{
    const string fileName = "Presentation with image.ppt";

    using (var resource = ResourceUtils.GetManifestResourceStream("Testdata.Binary.Presentation with image.ppt"))
    {
        long elapsedMs;
        var input = new BinaryDocumentStream(fileName, "test", resource);

        var doc = ExtractStream(input, out elapsedMs);
        Console.WriteLine("{0} extraction time (ms):{1}", fileName, elapsedMs);

        doc.Domain.Should().Be("test");
        doc.Fields.Should().NotBeNull();
        doc.Fields.Count.Should().BeGreaterThan(10);
        doc.GetFieldValue(ContentFieldName).Should().Contain("Creuna");
    }
}
/// <summary>
/// Extracts the embedded "Findwise presentation - Hades.pptx" resource and checks the
/// domain, the field count (more than ten) and a known phrase in the content field.
/// </summary>
public void CanExtractPowerpointFromStream()
{
    const string fileName = "Findwise presentation - Hades.pptx";

    using (var resource = ResourceUtils.GetManifestResourceStream("Testdata.Binary.Findwise presentation - Hades.pptx"))
    {
        long elapsedMs;
        var input = new BinaryDocumentStream(fileName, "test", resource);

        var doc = ExtractStream(input, out elapsedMs);
        Console.WriteLine("{0} extraction time (ms):{1}", fileName, elapsedMs);

        doc.Domain.Should().Be("test");
        doc.Fields.Should().NotBeNull();
        doc.Fields.Count.Should().BeGreaterThan(10);
        doc.GetFieldValue(ContentFieldName).Should().Contain("need to be able to reindex");
    }
}
/// <summary>
/// Extracts the embedded "tux.png" resource and checks the domain, the field count
/// (more than ten) and that a field named "width" with value "400" is present.
/// </summary>
public void CanExtractPngFromStream()
{
    const string fileName = "tux.png";

    using (var resource = ResourceUtils.GetManifestResourceStream("Testdata.Binary.tux.png"))
    {
        long elapsedMs;
        var input = new BinaryDocumentStream(fileName, "test", resource);

        var doc = ExtractStream(input, out elapsedMs);
        Console.WriteLine("{0} extraction time (ms):{1}", fileName, elapsedMs);

        doc.Domain.Should().Be("test");
        doc.Fields.Should().NotBeNull();
        doc.Fields.Count.Should().BeGreaterThan(10);
        doc.Fields.FirstOrDefault(x => x.Name == "width" && x.Value == "400").Should().NotBeNull();
    }
}
/// <summary>
/// Extracts the embedded "test.pdf" resource and checks the domain, the field count
/// (more than ten) and a known phrase in the content field.
/// </summary>
public void CanExtractPdfFromStream()
{
    const string fileName = "test.pdf";

    using (var resource = ResourceUtils.GetManifestResourceStream("Testdata.Binary.test.pdf"))
    {
        long elapsedMs;
        var input = new BinaryDocumentStream(fileName, "test", resource);

        var doc = ExtractStream(input, out elapsedMs);
        Console.WriteLine("{0} extraction time (ms):{1}", fileName, elapsedMs);

        doc.Domain.Should().Be("test");
        doc.Fields.Should().NotBeNull();
        doc.Fields.Count.Should().BeGreaterThan(10);
        doc.GetFieldValue(ContentFieldName).Should().Contain("Ombyggnation till Mötesplats");
    }
}
/// <summary>
/// Sends a single stream through <c>_service.ExtractText</c>, measures how long the call
/// takes, and unwraps the first result.
/// </summary>
/// <param name="stream">The document stream to extract.</param>
/// <param name="processTime">Receives the elapsed time of the extraction call in milliseconds.</param>
/// <returns>
/// The result's <c>ResultingDocument</c> as an <c>AddDocument</c>; <c>null</c> when the
/// resulting document is not an <c>AddDocument</c>.
/// </returns>
/// <exception cref="InvalidOperationException">Thrown when the service yields no result for the stream.</exception>
/// <exception cref="TextExtractionException">Rethrown when the result carries an extraction failure.</exception>
private AddDocument ExtractStream(BinaryDocumentStream stream, out long processTime)
{
    var timer = Stopwatch.StartNew();
    var doc = _service.ExtractText(new[] { stream }).FirstOrDefault();
    timer.Stop();
    processTime = timer.ElapsedMilliseconds;

    // Fail with a descriptive error instead of a NullReferenceException when
    // the service produced nothing for the stream.
    if (doc == null)
    {
        throw new InvalidOperationException("Text extraction returned no result for stream " + stream.Id);
    }

    if (doc.TextExtractionException != null)
    {
        throw doc.TextExtractionException;
    }

    return doc.ResultingDocument as AddDocument;
}
/// <summary>
/// Best-effort dump of a failed document's stream to a file, followed by an error log entry.
/// </summary>
/// <param name="stream">
/// The document whose stream is copied. Nothing is written or logged when its stream is
/// not readable. Its <c>Id</c> supplies the file name; a new GUID is used when the id is
/// null or empty.
/// </param>
/// <param name="exception">The extraction failure that is logged together with the file path.</param>
/// <remarks>
/// Any exception raised while persisting is caught and logged; it is not propagated.
/// </remarks>
public void PersistErrorInformation(BinaryDocumentStream stream, TextExtractionException exception)
{
    try
    {
        if (stream.Stream.CanRead)
        {
            string fileName;
            if (string.IsNullOrEmpty(stream.Id))
            {
                fileName = Guid.NewGuid().ToString();
            }
            else
            {
                // Only the file-name part of the id is used.
                fileName = Path.GetFileName(stream.Id);
            }

            var filePath = Path.Combine(GetCurrentAddDirectoryPath(), fileName);

            using (var target = File.Create(filePath))
            {
                // Rewind so the whole document is copied, not just the unread remainder.
                stream.Stream.Seek(0, SeekOrigin.Begin);
                stream.Stream.CopyTo(target);
            }

            Log.Error(exception, String.Format("Persisting error document at {0}", filePath));
        }
    }
    catch (Exception persistFailure)
    {
        Log.Error(persistFailure, String.Format("Error when persisting error information for stream {0}", stream.Id));
    }
}
/// <summary>
/// Sends the document stream to the Tika server's <c>/rmeta/text</c> endpoint with an HTTP PUT
/// and parses the response into a document.
/// </summary>
/// <param name="documentStream">
/// The document to extract; its <c>Stream</c> is the request body and its <c>Id</c> and
/// <c>Domain</c> are passed on to <c>ParseTikaResponse</c>.
/// </param>
/// <returns>The document produced by <c>ParseTikaResponse</c> from the response content.</returns>
/// <exception cref="TextExtractionException">
/// Thrown when the server answers with a non-success status code, when the request is
/// cancelled because <c>_timeout</c> elapsed, or when the request fails for another reason
/// surfaced as an <see cref="AggregateException"/>.
/// </exception>
public IDocument ExtractText(BinaryDocumentStream documentStream)
{
    // NOTE(review): a new HttpClient is created per call; consider a shared instance
    // if this is invoked frequently. Kept as-is because the timeout is set per client.
    using (HttpClient client = new HttpClient())
    {
        client.Timeout = _timeout;
        var requestUrl = $"{_settings.TikaServerUrl.ConnectionString}/rmeta/text";
        try
        {
            // The StreamContent is intentionally not disposed here: disposing it would
            // also close documentStream.Stream, which the caller still owns.
            var result = client.PutAsync(requestUrl, new StreamContent(documentStream.Stream))
                         .Result;
            if (!result.IsSuccessStatusCode)
            {
                throw new TextExtractionException(documentStream.Id, result.ReasonPhrase);
            }

            return ParseTikaResponse(result.Content, documentStream.Id, documentStream.Domain);
        }
        catch (AggregateException e) when (e.GetBaseException() is OperationCanceledException)
        {
            // HttpClient signals an elapsed Timeout by cancelling the task.
            throw new TextExtractionException(documentStream.Id, $"Request Timeout of {_timeout.TotalMilliseconds}ms has been exceeded", e);
        }
        catch (AggregateException e)
        {
            // Connection failures and other faults are not timeouts; report the real cause.
            throw new TextExtractionException(documentStream.Id, $"Request to Tika server failed: {e.GetBaseException().Message}", e);
        }
    }
}