/// <summary>
/// Uploads the file at <c>binaryDocumentFile.FilePath</c> to the Tika server's
/// <c>/rmeta/text</c> endpoint with an HTTP PUT and hands the response content to
/// <c>ParseTikaResponse</c>.
/// </summary>
/// <param name="binaryDocumentFile">File to extract; its path, id and domain are used.</param>
/// <returns>The document returned by <c>ParseTikaResponse</c>.</returns>
/// <exception cref="TextExtractionException">
/// Thrown when the server answers with a non-success status code, or when any other
/// error occurs (the original error is attached as the inner exception).
/// </exception>
public IDocument ExtractText(BinaryDocumentFile binaryDocumentFile)
        {
            // NOTE(review): a new HttpClient per call can exhaust sockets under load;
            // consider sharing one instance at class level.
            using (HttpClient client = new HttpClient())
            {
                try
                {
                    // Apply the configured timeout unconditionally. It used to be guarded by
                    // a "file does not exist" check that could never be true after the file
                    // had been opened, so the configured value was silently ignored.
                    client.Timeout = _timeout;

                    using (var fileStream = File.OpenRead(binaryDocumentFile.FilePath))
                    using (var content = new StreamContent(fileStream))
                    {
                        var requestUrl = $"{_settings.TikaServerUrl.ConnectionString}/rmeta/text";

                        // GetAwaiter().GetResult() rethrows the real failure instead of an
                        // AggregateException, so the wrapped message below is meaningful.
                        var result = client.PutAsync(requestUrl, content)
                                     .GetAwaiter()
                                     .GetResult();

                        if (!result.IsSuccessStatusCode)
                        {
                            throw new TextExtractionException(binaryDocumentFile.Id, result.ReasonPhrase);
                        }

                        return(ParseTikaResponse(result.Content, binaryDocumentFile.Id, binaryDocumentFile.Domain));
                    }
                }
                catch (TextExtractionException)
                {
                    // Already the exception type callers expect; do not wrap it a second time.
                    throw;
                }
                catch (Exception e)
                {
                    throw new TextExtractionException(binaryDocumentFile.Id, e.Message, e);
                }
            }
        }
        /// <summary>
        /// Returns the first entry of <c>_extractors</c> whose <c>CanHandle</c> check succeeds.
        /// </summary>
        /// <param name="documentFile">Document the extractor is requested for; only its id is used, in the error.</param>
        /// <returns>The first matching extractor.</returns>
        /// <exception cref="TextExtractionException">Thrown when no extractor matches.</exception>
        private ITextExtractor GetTextExtractor(BinaryDocumentFile documentFile)
        {
            // NOTE(review): CanHandle is probed with a constant empty string, so the selection
            // does not depend on documentFile at all - confirm this is intended.
            var match = _extractors.FirstOrDefault(candidate => candidate.CanHandle(""));

            if (match != null)
            {
                return match;
            }

            throw new TextExtractionException(documentFile.Id, "No extractor was found for document stream " + documentFile.Id);
        }
 /// <summary>
 /// Extracts text from <paramref name="documentFile"/> with the extractor chosen by
 /// <c>GetTextExtractor</c> and wraps the outcome in an <see cref="ExtractedDocument"/>.
 /// </summary>
 /// <param name="documentFile">The file to extract text from.</param>
 /// <returns>
 /// An <see cref="ExtractedDocument"/> built from the extraction result, or from the
 /// <see cref="TextExtractionException"/> when extraction fails with that exception type.
 /// </returns>
 public ExtractedDocument ExtractText(BinaryDocumentFile documentFile)
 {
     try
     {
         var extractor = GetTextExtractor(documentFile);
         var extracted = extractor.ExtractText(documentFile);

         return new ExtractedDocument(extracted);
     }
     catch (TextExtractionException failure)
     {
         // Extraction failures are returned as data rather than propagated.
         return new ExtractedDocument(failure);
     }
 }
Exemple #4
0
        /// <summary>
        /// Extracts a JPG test file and checks the domain and that more than one field is produced.
        /// </summary>
        public void CanExtractJpgFromFile()
        {
            // Arrange
            const string relativePath = @"Testdata\Binary\Styx architecture.jpg";
            string fullPath = Path.Combine(Directory.GetCurrentDirectory(), relativePath);
            BinaryDocumentFile input = new BinaryDocumentFile(fullPath, "test", fullPath);

            // Act
            long elapsedMs;
            AddDocument extracted = ExtractFile(input, out elapsedMs);

            // Assert
            Console.WriteLine("{0} extraction time (ms):{1}", relativePath, elapsedMs);
            extracted.Domain.Should().Be("test");
            extracted.Fields.Count.Should().BeGreaterThan(1);
        }
Exemple #5
0
        /// <summary>
        /// Extracts a DOCX test file (addressed by relative path) and checks the domain,
        /// the field count and a known phrase in the content field.
        /// </summary>
        public void CanExtractDocxFromFile()
        {
            // Arrange
            const string relativePath = @"Testdata\Binary\Hades.docx";
            BinaryDocumentFile input = new BinaryDocumentFile(relativePath, "test", relativePath);

            // Act
            long elapsedMs;
            AddDocument extracted = ExtractFile(input, out elapsedMs);

            // Assert
            Console.WriteLine("{0} extraction time (ms):{1}", relativePath, elapsedMs);
            extracted.Domain.Should().Be("test");
            extracted.Fields.Should().NotBeNull();
            extracted.Fields.Count.Should().BeGreaterThan(10);

            var content = extracted.GetFieldValue(ContentFieldName);
            content.Should().Contain("Hades uses Quartz for");
        }
Exemple #6
0
 /// <summary>
 /// Logs a failed extraction and copies the offending file into the directory returned by
 /// <c>GetCurrentAddDirectoryPath</c>, using the file's id as the target name.
 /// Best effort: any failure while doing so is logged and not propagated.
 /// </summary>
 /// <param name="file">The file whose extraction failed; its id and path are used.</param>
 /// <param name="exception">The extraction failure to log.</param>
 public void PersistErrorInformation(BinaryDocumentFile file, TextExtractionException exception)
 {
     try
     {
         // NOTE(review): Path.Combine discards the directory when file.Id is a rooted path - verify ids are plain names.
         var filePath = Path.Combine(GetCurrentAddDirectoryPath(), file.Id);

         // Log the extraction failure before copying, so it is recorded even when the copy fails.
         Log.Error(exception, String.Format("Persisting error document at {0}", filePath));

         // Overwrite an earlier error copy with the same id instead of failing on it.
         File.Copy(file.FilePath, filePath, true);
     }
     catch (Exception e)
     {
         Log.Error(e, String.Format("Error when persisting error information for file {0}", file.Id));
     }
 }
Exemple #7
0
        /// <summary>
        /// Runs <c>_service.ExtractText</c> on a single file, measures how long it takes and
        /// returns the resulting document.
        /// </summary>
        /// <param name="file">The file to extract.</param>
        /// <param name="processTime">Elapsed time of the extraction call in milliseconds.</param>
        /// <returns>The resulting document cast to <see cref="AddDocument"/>.</returns>
        /// <exception cref="TextExtractionException">Rethrown when the extraction result carries one.</exception>
        /// <exception cref="InvalidOperationException">Thrown when the service returns no result.</exception>
        /// <exception cref="InvalidCastException">Thrown when the resulting document is not an <see cref="AddDocument"/>.</exception>
        private AddDocument ExtractFile(BinaryDocumentFile file, out long processTime)
        {
            var timer = Stopwatch.StartNew();

            // First() fails with a clear "sequence contains no elements" error on an empty
            // result instead of a NullReferenceException further down.
            var doc = _service.ExtractText(new[] { file }).First();

            timer.Stop();
            processTime = timer.ElapsedMilliseconds;
            if (doc.TextExtractionException != null)
            {
                throw doc.TextExtractionException;
            }

            // A direct cast reports an unexpected document type at this point; "as" would
            // return null and fail later in the calling test with a less helpful error.
            return((AddDocument)doc.ResultingDocument);
        }
Exemple #8
0
        /// <summary>
        /// Extracts a PPTX test file and checks the domain, the field count and a known
        /// phrase in the content field.
        /// </summary>
        public void CanExtractPowerpointFromFile()
        {
            // Arrange
            const string relativePath = @"Testdata\Binary\Findwise presentation - Hades.pptx";
            string fullPath = Path.Combine(Directory.GetCurrentDirectory(), relativePath);
            BinaryDocumentFile input = new BinaryDocumentFile(fullPath, "test", fullPath);

            // Act
            long elapsedMs;
            AddDocument extracted = ExtractFile(input, out elapsedMs);

            // Assert
            Console.WriteLine("{0} extraction time (ms):{1}", relativePath, elapsedMs);
            extracted.Domain.Should().Be("test");
            extracted.Fields.Should().NotBeNull();
            extracted.Fields.Count.Should().BeGreaterThan(10);

            var content = extracted.GetFieldValue(ContentFieldName);
            content.Should().Contain("need to be able to reindex");
        }
Exemple #9
0
        /// <summary>
        /// Extracts a VSD test file and checks the domain, the field count and a known
        /// phrase in the content field.
        /// </summary>
        public void CanExtractVisioFromFile()
        {
            // Arrange
            const string relativePath = @"Testdata\Binary\.Net search architecture.vsd";
            string fullPath = Path.Combine(Directory.GetCurrentDirectory(), relativePath);
            BinaryDocumentFile input = new BinaryDocumentFile(fullPath, "test", fullPath);

            // Act
            long elapsedMs;
            AddDocument extracted = ExtractFile(input, out elapsedMs);

            // Assert
            Console.WriteLine("{0} extraction time (ms):{1}", relativePath, elapsedMs);
            extracted.Domain.Should().Be("test");
            extracted.Fields.Should().NotBeNull();
            extracted.Fields.Count.Should().BeGreaterThan(10);

            var content = extracted.GetFieldValue(ContentFieldName);
            content.Should().Contain("Hades.AdminUI");
        }
Exemple #10
0
        /// <summary>
        /// Extracts a PNG test file and checks the domain, the field count and that a
        /// "width" field with value "400" is present.
        /// </summary>
        public void CanExtractPngFromFile()
        {
            // Arrange
            const string relativePath = @"Testdata\Binary\tux.png";
            string fullPath = Path.Combine(Directory.GetCurrentDirectory(), relativePath);
            BinaryDocumentFile input = new BinaryDocumentFile(fullPath, "test", fullPath);

            // Act
            long elapsedMs;
            AddDocument extracted = ExtractFile(input, out elapsedMs);

            // Assert
            Console.WriteLine("{0} extraction time (ms):{1}", relativePath, elapsedMs);

            extracted.Domain.Should().Be("test");
            extracted.Fields.Should().NotBeNull();
            extracted.Fields.Count.Should().BeGreaterThan(10);

            var widthField = extracted.Fields.FirstOrDefault(field => field.Name == "width" && field.Value == "400");
            widthField.Should().NotBeNull();
        }