/// <summary>
/// Uploads the file at <c>binaryDocumentFile.FilePath</c> to the Tika server's
/// <c>/rmeta/text</c> endpoint with an HTTP PUT and turns the response into a document
/// via <c>ParseTikaResponse</c>.
/// </summary>
/// <param name="binaryDocumentFile">
/// File to extract text from; its <c>FilePath</c>, <c>Id</c> and <c>Domain</c> are read.
/// </param>
/// <returns>The document built by <c>ParseTikaResponse</c> from the response content.</returns>
/// <exception cref="TextExtractionException">
/// Thrown when the server answers with a non-success status code (carrying the reason
/// phrase), or when any other failure occurs, in which case the original exception is
/// attached as the inner exception.
/// </exception>
public IDocument ExtractText(BinaryDocumentFile binaryDocumentFile)
{
    // NOTE(review): a new HttpClient is created per call; sharing one instance would be
    // preferable but needs a change outside this method.
    using (HttpClient client = new HttpClient())
    {
        try
        {
            // Apply the configured timeout up front. The previous code only assigned it
            // when the file did NOT exist, which could never be observed after a successful
            // File.OpenRead, so the setting was effectively ignored.
            // HttpClient.Timeout rejects zero/negative values other than the infinite
            // sentinel, so an unset (zero) value keeps the client's default.
            if (_timeout > TimeSpan.Zero || _timeout == System.Threading.Timeout.InfiniteTimeSpan)
            {
                client.Timeout = _timeout;
            }

            using (var fileStream = File.OpenRead(binaryDocumentFile.FilePath))
            {
                var requestUrl = $"{_settings.TikaServerUrl.ConnectionString}/rmeta/text";

                // GetAwaiter().GetResult() surfaces the real failure instead of an
                // AggregateException, so the wrapped message below is meaningful.
                var result = client.PutAsync(requestUrl, new StreamContent(fileStream))
                    .GetAwaiter()
                    .GetResult();

                if (!result.IsSuccessStatusCode)
                {
                    throw new TextExtractionException(binaryDocumentFile.Id, result.ReasonPhrase);
                }

                return ParseTikaResponse(result.Content, binaryDocumentFile.Id, binaryDocumentFile.Domain);
            }
        }
        // Let an already well-formed TextExtractionException propagate untouched instead
        // of wrapping it in a second TextExtractionException.
        catch (Exception e) when (!(e is TextExtractionException))
        {
            throw new TextExtractionException(binaryDocumentFile.Id, e.Message, e);
        }
    }
}
/// <summary>
/// Picks the first registered extractor whose <c>CanHandle</c> check succeeds.
/// </summary>
/// <param name="documentFile">Document whose <c>Id</c> is used in the failure message.</param>
/// <returns>The first matching extractor from <c>_extractors</c>.</returns>
/// <exception cref="TextExtractionException">Thrown when no extractor matches.</exception>
private ITextExtractor GetTextExtractor(BinaryDocumentFile documentFile)
{
    // NOTE(review): CanHandle is always asked about an empty string, so the choice does
    // not depend on the document at all — confirm whether a file name or content type
    // was meant to be passed here.
    var match = _extractors.FirstOrDefault(candidate => candidate.CanHandle(""));
    if (match != null)
    {
        return match;
    }

    var message = "No extractor was found for document stream " + documentFile.Id;
    throw new TextExtractionException(documentFile.Id, message);
}
/// <summary>
/// Extracts text from a single document and wraps the outcome in an
/// <c>ExtractedDocument</c>, capturing extraction failures instead of throwing them.
/// </summary>
/// <param name="documentFile">The document to run through the selected extractor.</param>
/// <returns>
/// An <c>ExtractedDocument</c> built from the extractor's result, or from the
/// <c>TextExtractionException</c> when extraction (or extractor lookup) fails with one.
/// </returns>
public ExtractedDocument ExtractText(BinaryDocumentFile documentFile)
{
    try
    {
        var extractor = GetTextExtractor(documentFile);
        var extracted = extractor.ExtractText(documentFile);
        return new ExtractedDocument(extracted);
    }
    catch (TextExtractionException failure)
    {
        // Only extraction failures are converted to a result; anything else propagates.
        return new ExtractedDocument(failure);
    }
}
/// <summary>
/// Extracts the JPG test file and checks that the result is in the "test" domain
/// and carries more than one field.
/// </summary>
public void CanExtractJpgFromFile()
{
    var workingDirectory = Directory.GetCurrentDirectory();
    var relativePath = @"Testdata\Binary\Styx architecture.jpg";
    var fullPath = Path.Combine(workingDirectory, relativePath);

    // The absolute path is passed as both the first and third constructor argument.
    var input = new BinaryDocumentFile(fullPath, "test", fullPath);

    long elapsedMs;
    var extracted = ExtractFile(input, out elapsedMs);
    Console.WriteLine("{0} extraction time (ms):{1}", relativePath, elapsedMs);

    extracted.Domain.Should().Be("test");
    extracted.Fields.Count.Should().BeGreaterThan(1);
}
/// <summary>
/// Extracts the DOCX test file and checks the domain, that more than ten fields are
/// present, and that the content field contains a known phrase.
/// </summary>
public void CanExtractDocxFromFile()
{
    // Unlike the sibling tests, this one hands the relative path straight to the file object.
    var relativePath = @"Testdata\Binary\Hades.docx";
    var input = new BinaryDocumentFile(relativePath, "test", relativePath);

    long elapsedMs;
    var extracted = ExtractFile(input, out elapsedMs);
    Console.WriteLine("{0} extraction time (ms):{1}", relativePath, elapsedMs);

    extracted.Domain.Should().Be("test");
    extracted.Fields.Should().NotBeNull();
    extracted.Fields.Count.Should().BeGreaterThan(10);

    var content = extracted.GetFieldValue(ContentFieldName);
    content.Should().Contain("Hades uses Quartz for");
}
/// <summary>
/// Copies a document that failed extraction into the directory returned by
/// <c>GetCurrentAddDirectoryPath</c> and logs the extraction error.
/// </summary>
/// <param name="file">The failed document; its <c>FilePath</c> is the copy source and its <c>Id</c> the target file name.</param>
/// <param name="exception">The extraction failure to log once the copy has succeeded.</param>
/// <remarks>
/// Any exception raised while copying or logging is caught and logged; nothing is rethrown.
/// </remarks>
public void PersistErrorInformation(BinaryDocumentFile file, TextExtractionException exception)
{
    try
    {
        var targetDirectory = GetCurrentAddDirectoryPath();
        var destination = Path.Combine(targetDirectory, file.Id);

        // No overwrite flag is passed, so an existing destination ends up in the catch below.
        File.Copy(file.FilePath, destination);

        var message = String.Format("Persisting error document at {0}", destination);
        Log.Error(exception, message);
    }
    catch (Exception persistFailure)
    {
        var message = String.Format("Error when persisting error information for file {0}", file.Id);
        Log.Error(persistFailure, message);
    }
}
/// <summary>
/// Runs a single file through <c>_service.ExtractText</c>, measuring how long the call takes.
/// </summary>
/// <param name="file">The file to extract.</param>
/// <param name="processTime">Elapsed time of the extraction call in milliseconds.</param>
/// <returns>
/// The resulting document as an <c>AddDocument</c>, or <c>null</c> when the resulting
/// document is not an <c>AddDocument</c>.
/// </returns>
/// <exception cref="InvalidOperationException">Thrown when the service returns no result for the file.</exception>
/// <exception cref="TextExtractionException">Rethrown when the extraction result carries one.</exception>
private AddDocument ExtractFile(BinaryDocumentFile file, out long processTime)
{
    var timer = Stopwatch.StartNew();
    var doc = _service.ExtractText(new[] { file }).FirstOrDefault();
    timer.Stop();
    processTime = timer.ElapsedMilliseconds;

    // FirstOrDefault yields null for an empty result; fail with a clear message instead
    // of a NullReferenceException on the property access below.
    if (doc == null)
    {
        throw new InvalidOperationException("Text extraction returned no result for the supplied file.");
    }

    if (doc.TextExtractionException != null)
    {
        throw doc.TextExtractionException;
    }

    return doc.ResultingDocument as AddDocument;
}
/// <summary>
/// Extracts the PPTX test file and checks the domain, that more than ten fields are
/// present, and that the content field contains a known phrase.
/// </summary>
public void CanExtractPowerpointFromFile()
{
    var workingDirectory = Directory.GetCurrentDirectory();
    var relativePath = @"Testdata\Binary\Findwise presentation - Hades.pptx";
    var fullPath = Path.Combine(workingDirectory, relativePath);

    // The absolute path is passed as both the first and third constructor argument.
    var input = new BinaryDocumentFile(fullPath, "test", fullPath);

    long elapsedMs;
    var extracted = ExtractFile(input, out elapsedMs);
    Console.WriteLine("{0} extraction time (ms):{1}", relativePath, elapsedMs);

    extracted.Domain.Should().Be("test");
    extracted.Fields.Should().NotBeNull();
    extracted.Fields.Count.Should().BeGreaterThan(10);

    var content = extracted.GetFieldValue(ContentFieldName);
    content.Should().Contain("need to be able to reindex");
}
/// <summary>
/// Extracts the VSD test file and checks the domain, that more than ten fields are
/// present, and that the content field contains a known phrase.
/// </summary>
public void CanExtractVisioFromFile()
{
    var workingDirectory = Directory.GetCurrentDirectory();
    var relativePath = @"Testdata\Binary\.Net search architecture.vsd";
    var fullPath = Path.Combine(workingDirectory, relativePath);

    // The absolute path is passed as both the first and third constructor argument.
    var input = new BinaryDocumentFile(fullPath, "test", fullPath);

    long elapsedMs;
    var extracted = ExtractFile(input, out elapsedMs);
    Console.WriteLine("{0} extraction time (ms):{1}", relativePath, elapsedMs);

    extracted.Domain.Should().Be("test");
    extracted.Fields.Should().NotBeNull();
    extracted.Fields.Count.Should().BeGreaterThan(10);

    var content = extracted.GetFieldValue(ContentFieldName);
    content.Should().Contain("Hades.AdminUI");
}
/// <summary>
/// Extracts the PNG test file and checks the domain, that more than ten fields are
/// present, and that a "width" field with value "400" exists.
/// </summary>
public void CanExtractPngFromFile()
{
    var workingDirectory = Directory.GetCurrentDirectory();
    var relativePath = @"Testdata\Binary\tux.png";
    var fullPath = Path.Combine(workingDirectory, relativePath);

    // The absolute path is passed as both the first and third constructor argument.
    var input = new BinaryDocumentFile(fullPath, "test", fullPath);

    long elapsedMs;
    var extracted = ExtractFile(input, out elapsedMs);
    Console.WriteLine("{0} extraction time (ms):{1}", relativePath, elapsedMs);

    extracted.Domain.Should().Be("test");
    extracted.Fields.Should().NotBeNull();
    extracted.Fields.Count.Should().BeGreaterThan(10);

    var widthField = extracted.Fields.FirstOrDefault(field => field.Name == "width" && field.Value == "400");
    widthField.Should().NotBeNull();
}