/// <summary>
/// Returns the first entry of <c>_extractors</c> whose <c>CanHandle("")</c> check succeeds.
/// </summary>
/// <param name="documentFile">Document stream; only its <c>Id</c> is read, for the error raised when nothing matches.</param>
/// <returns>The first matching extractor.</returns>
/// <exception cref="TextExtractionException">No registered extractor accepted the request.</exception>
private ITextExtractor GetTextExtractor(BinaryDocumentStream documentFile)
        {
            // NOTE(review): CanHandle is always probed with an empty string and never with
            // anything taken from documentFile — confirm this is intended.
            foreach (var candidate in _extractors)
            {
                if (candidate.CanHandle(""))
                {
                    return candidate;
                }
            }

            throw new TextExtractionException(documentFile.Id, "No extractor was found for document stream " + documentFile.Id);
        }
 /// <summary>
 /// Runs text extraction for a single document stream and wraps the outcome.
 /// </summary>
 /// <param name="documentStream">The document stream to extract text from.</param>
 /// <returns>
 /// An <c>ExtractedDocument</c> built from the extractor's result, or from the
 /// <c>TextExtractionException</c> when one is thrown while selecting or running the extractor.
 /// </returns>
 public ExtractedDocument ExtractText(BinaryDocumentStream documentStream)
 {
     try
     {
         var extractor = GetTextExtractor(documentStream);
         var extracted = extractor.ExtractText(documentStream);
         return new ExtractedDocument(extracted);
     }
     catch (TextExtractionException failure)
     {
         // Extraction failures are handed back inside the result instead of propagating.
         return new ExtractedDocument(failure);
     }
 }
Esempio n. 3
0
 // Extracts the embedded JPG test resource and checks that more than ten fields
 // are returned and that the domain given to the stream is kept on the document.
 public void CanExtractJpgFromStream()
 {
     const string fileName = "Styx architecture.jpg";

     using (var resource = ResourceUtils.GetManifestResourceStream("Testdata.Binary." + fileName))
     {
         long elapsedMs;
         var document = ExtractStream(new BinaryDocumentStream(fileName, "test", resource), out elapsedMs);

         Console.WriteLine("{0} extraction time (ms):{1}", fileName, elapsedMs);

         document.Fields.Count.Should().BeGreaterThan(10);
         document.Domain.Should().Be("test");
     }
 }
Esempio n. 4
0
 // Extracts the embedded DOCX test resource and checks the domain, the field
 // count and that the content field contains a known phrase from the document.
 public void CanExtractDocxFromStream()
 {
     const string fileName = "Hades.docx";

     using (var resource = ResourceUtils.GetManifestResourceStream("Testdata.Binary." + fileName))
     {
         long elapsedMs;
         var document = ExtractStream(new BinaryDocumentStream(fileName, "test", resource), out elapsedMs);

         Console.WriteLine("{0} extraction time (ms):{1}", fileName, elapsedMs);

         document.Domain.Should().Be("test");
         document.Fields.Should().NotBeNull();
         document.Fields.Count.Should().BeGreaterThan(10);

         var content = document.GetFieldValue(ContentFieldName);
         content.Should().Contain("Hades uses Quartz for");
     }
 }
Esempio n. 5
0
 // Extracts the embedded PPT test resource (a presentation containing an image) and
 // checks the domain, the field count and a known word in the content field.
 public void CanExtractImagedPowerpointFromStream()
 {
     const string fileName = "Presentation with image.ppt";

     using (var resource = ResourceUtils.GetManifestResourceStream("Testdata.Binary." + fileName))
     {
         long elapsedMs;
         var document = ExtractStream(new BinaryDocumentStream(fileName, "test", resource), out elapsedMs);

         Console.WriteLine("{0} extraction time (ms):{1}", fileName, elapsedMs);

         document.Domain.Should().Be("test");
         document.Fields.Should().NotBeNull();
         document.Fields.Count.Should().BeGreaterThan(10);

         var content = document.GetFieldValue(ContentFieldName);
         content.Should().Contain("Creuna");
     }
 }
Esempio n. 6
0
 // Extracts the embedded PPTX test resource and checks the domain, the field
 // count and that the content field contains a known phrase from the slides.
 public void CanExtractPowerpointFromStream()
 {
     const string fileName = "Findwise presentation - Hades.pptx";

     using (var resource = ResourceUtils.GetManifestResourceStream("Testdata.Binary." + fileName))
     {
         long elapsedMs;
         var document = ExtractStream(new BinaryDocumentStream(fileName, "test", resource), out elapsedMs);

         Console.WriteLine("{0} extraction time (ms):{1}", fileName, elapsedMs);

         document.Domain.Should().Be("test");
         document.Fields.Should().NotBeNull();
         document.Fields.Count.Should().BeGreaterThan(10);

         var content = document.GetFieldValue(ContentFieldName);
         content.Should().Contain("need to be able to reindex");
     }
 }
Esempio n. 7
0
 // Extracts the embedded PNG test resource and checks the domain, the field
 // count and that a field named "width" with the value "400" is present.
 public void CanExtractPngFromStream()
 {
     const string fileName = "tux.png";

     using (var resource = ResourceUtils.GetManifestResourceStream("Testdata.Binary." + fileName))
     {
         long elapsedMs;
         var document = ExtractStream(new BinaryDocumentStream(fileName, "test", resource), out elapsedMs);

         Console.WriteLine("{0} extraction time (ms):{1}", fileName, elapsedMs);

         document.Domain.Should().Be("test");
         document.Fields.Should().NotBeNull();
         document.Fields.Count.Should().BeGreaterThan(10);

         var widthField = document.Fields.FirstOrDefault(field => field.Name == "width" && field.Value == "400");
         widthField.Should().NotBeNull();
     }
 }
Esempio n. 8
0
        // Extracts the embedded PDF test resource and checks the domain, the field
        // count and that the content field contains a known phrase from the document.
        public void CanExtractPdfFromStream()
        {
            const string fileName = "test.pdf";

            using (var resource = ResourceUtils.GetManifestResourceStream("Testdata.Binary." + fileName))
            {
                long elapsedMs;
                var document = ExtractStream(new BinaryDocumentStream(fileName, "test", resource), out elapsedMs);

                Console.WriteLine("{0} extraction time (ms):{1}", fileName, elapsedMs);

                document.Domain.Should().Be("test");
                document.Fields.Should().NotBeNull();
                document.Fields.Count.Should().BeGreaterThan(10);

                var content = document.GetFieldValue(ContentFieldName);
                content.Should().Contain("Ombyggnation till Mötesplats");
            }
        }
Esempio n. 9
0
        /// <summary>
        /// Runs <c>_service.ExtractText</c> on a single stream, measures how long the call took
        /// and returns the resulting document.
        /// </summary>
        /// <param name="stream">The document stream to extract.</param>
        /// <param name="processTime">Receives the elapsed time of the extraction call in milliseconds.</param>
        /// <returns>
        /// The result's <c>ResultingDocument</c> as an <c>AddDocument</c>, or <c>null</c> when it is of another type.
        /// </returns>
        /// <exception cref="TextExtractionException">Rethrown when the extraction result carries one.</exception>
        /// <exception cref="InvalidOperationException">The service returned no result for the stream.</exception>
        private AddDocument ExtractStream(BinaryDocumentStream stream, out long processTime)
        {
            var timer = Stopwatch.StartNew();

            // First() rather than FirstOrDefault(): an empty result now fails here with a clear
            // "sequence contains no elements" error instead of a NullReferenceException below.
            var extraction = _service.ExtractText(new[] { stream }).First();

            timer.Stop();
            processTime = timer.ElapsedMilliseconds;

            if (extraction.TextExtractionException != null)
            {
                throw extraction.TextExtractionException;
            }

            return extraction.ResultingDocument as AddDocument;
        }
Esempio n. 10
0
 /// <summary>
 /// Best-effort dump of a document that failed extraction: copies the stream's content to a file
 /// under <c>GetCurrentAddDirectoryPath()</c> and logs the extraction exception together with the file path.
 /// </summary>
 /// <param name="stream">The document stream whose content is written out; nothing is done when it cannot be read.</param>
 /// <param name="exception">The extraction failure that is logged alongside the persisted file path.</param>
 /// <remarks>
 /// Any exception raised while persisting is caught and logged rather than propagated.
 /// </remarks>
 public void PersistErrorInformation(BinaryDocumentStream stream, TextExtractionException exception)
 {
     try
     {
         if (!stream.Stream.CanRead)
         {
             return;
         }

         // Path.GetFileName returns an empty string when the id ends in a directory separator;
         // combining that with the directory would yield the directory path itself and make
         // File.Create fail, so such ids get the same GUID fallback as a missing id.
         var fileName = string.IsNullOrEmpty(stream.Id) ? null : Path.GetFileName(stream.Id);
         if (string.IsNullOrWhiteSpace(fileName))
         {
             fileName = Guid.NewGuid().ToString();
         }

         var filePath = Path.Combine(GetCurrentAddDirectoryPath(), fileName);
         using (var fileStream = File.Create(filePath))
         {
             // Rewind so the whole document is copied, not just what is left after extraction read it.
             stream.Stream.Seek(0, SeekOrigin.Begin);
             stream.Stream.CopyTo(fileStream);
         }

         Log.Error(exception, String.Format("Persisting error document at {0}", filePath));
     }
     catch (Exception e)
     {
         Log.Error(e, String.Format("Error when persisting error information for stream {0}", stream.Id));
     }
 }
        /// <summary>
        /// Uploads the document's stream to the Tika server's <c>/rmeta/text</c> endpoint with an HTTP PUT
        /// and converts the response through <c>ParseTikaResponse</c>.
        /// </summary>
        /// <param name="documentStream">
        /// Document whose <c>Stream</c> is sent as the request body; its <c>Id</c> and <c>Domain</c> are passed to the parser.
        /// </param>
        /// <returns>The document produced by <c>ParseTikaResponse</c>.</returns>
        /// <exception cref="TextExtractionException">
        /// The server answered with a non-success status code, the request exceeded <c>_timeout</c>,
        /// or the request failed for another reason (the original error is kept as the inner exception).
        /// </exception>
        public IDocument ExtractText(BinaryDocumentStream documentStream)
        {
            // NOTE(review): a new HttpClient is created for every call; sharing one instance would
            // avoid repeated connection setup, but that needs a change outside this method.
            using (HttpClient client = new HttpClient())
            {
                client.Timeout = _timeout;
                var requestUrl = $"{_settings.TikaServerUrl.ConnectionString}/rmeta/text";
                try
                {
                    // The method is synchronous, so the task is blocked on; failures therefore
                    // arrive wrapped in an AggregateException.
                    var result = client.PutAsync(requestUrl, new StreamContent(documentStream.Stream))
                                 .Result;

                    if (!result.IsSuccessStatusCode)
                    {
                        throw new TextExtractionException(documentStream.Id, result.ReasonPhrase);
                    }

                    return ParseTikaResponse(result.Content, documentStream.Id, documentStream.Domain);
                }
                catch (AggregateException e)
                {
                    // Only a cancelled request is a timeout; connection and other failures
                    // are reported with their own message instead of being labelled as one.
                    var cause = e.GetBaseException();
                    var message = cause is OperationCanceledException
                        ? $"Request Timeout of {_timeout.TotalMilliseconds}ms has been exceeded"
                        : $"Request to Tika server failed: {cause.Message}";

                    throw new TextExtractionException(documentStream.Id, message, e);
                }
            }
        }