コード例 #1
0
        /// <summary>
        /// Runs text extraction on a single file and reports how long the extraction call took.
        /// </summary>
        /// <param name="file">The binary file handed to the extraction service.</param>
        /// <param name="processTime">Receives the elapsed time of the extraction call, in milliseconds.</param>
        /// <returns>
        /// The extraction result's <c>ResultingDocument</c> as an <c>AddDocument</c>,
        /// or <c>null</c> when the resulting document is missing or of another type.
        /// </returns>
        /// <exception cref="System.InvalidOperationException">
        /// The extraction service returned no result for the file.
        /// </exception>
        /// <remarks>
        /// If the result carries a <c>TextExtractionException</c>, that exception is rethrown.
        /// </remarks>
        private AddDocument ExtractFile(BinaryDocumentFile file, out long processTime)
        {
            var timer = Stopwatch.StartNew();
            var doc = _service.ExtractText(new[] { file }).FirstOrDefault();

            timer.Stop();
            processTime = timer.ElapsedMilliseconds;

            // FirstOrDefault yields null for an empty result sequence; fail with a
            // descriptive error instead of a NullReferenceException on the next access.
            if (doc == null)
            {
                throw new System.InvalidOperationException(
                    "Text extraction returned no result for the supplied file.");
            }

            if (doc.TextExtractionException != null)
            {
                // Rethrow the stored exception without overwriting its original stack trace.
                System.Runtime.ExceptionServices.ExceptionDispatchInfo
                    .Capture(doc.TextExtractionException)
                    .Throw();
            }

            return(doc.ResultingDocument as AddDocument);
        }
コード例 #2
0
        /// <summary>
        /// Builds an <c>AddDocument</c> for a crawled binary page: adds the page's
        /// Url, Domain, Depth and Host as fields, then appends the fields produced by
        /// running text extraction over the page's binary data.
        /// </summary>
        /// <param name="page">The page to convert; must be a <c>BinaryPage</c>.</param>
        /// <param name="domain">The domain stored on the document and passed to the extractor.</param>
        /// <returns>
        /// The populated document. When extraction reports an exception, its message and
        /// stack trace are added as "ErrorMessage" and "ErrorStackTrace" fields. When
        /// extraction yields no usable <c>AddDocument</c>, only the page metadata (and any
        /// error fields) are returned.
        /// </returns>
        /// <exception cref="System.ArgumentNullException"><paramref name="page"/> is null.</exception>
        /// <exception cref="System.ArgumentException"><paramref name="page"/> is not a <c>BinaryPage</c>.</exception>
        public IDocument ConvertToDocument(Page page, string domain)
        {
            if (page == null)
            {
                throw new System.ArgumentNullException(nameof(page));
            }

            // Validate the cast up front instead of dereferencing a possibly-null "as" result.
            var binaryPage = page as BinaryPage;
            if (binaryPage == null)
            {
                throw new System.ArgumentException(
                    "Only binary pages can be converted by text extraction.", nameof(page));
            }

            var doc = new AddDocument()
            {
                Id = page.Id, Domain = domain
            };

            doc.Fields.Add(new Field("Url", page.Url.AbsoluteUri));
            doc.Fields.Add(new Field("Domain", domain));
            doc.Fields.Add(new Field("Depth", page.Depth.ToString()));
            doc.Fields.Add(new Field("Host", page.Url.Host));
            using (var memoryStream = new MemoryStream(binaryPage.Data))
            {
                var extractedDoc =
                    _textExtractorService.ExtractText(new BinaryDocumentStream(page.Url.ToString(), domain, memoryStream));

                if (extractedDoc.TextExtractionException != null)
                {
                    doc.Fields.Add(new Field("ErrorMessage", extractedDoc.TextExtractionException.Message));
                    doc.Fields.Add(new Field("ErrorStackTrace", extractedDoc.TextExtractionException.StackTrace));
                }

                // A missing result and a result of an unexpected type are handled alike:
                // there are no extracted fields to merge, so return the metadata-only document.
                var extractedFields = extractedDoc.ResultingDocument as AddDocument;
                if (extractedFields == null)
                {
                    return(doc);
                }
                foreach (var field in extractedFields.Fields)
                {
                    doc.Fields.Add(field);
                }
            }
            return(doc);
        }
コード例 #3
0
        /// <summary>
        /// Runs text extraction over a binary field's data and returns the extracted
        /// fields, each renamed to "{binary field name}_extracted_{sanitized field name}".
        /// </summary>
        /// <param name="documentId">Identifier of the owning document; passed to the extractor and used in messages.</param>
        /// <param name="binaryField">The binary field whose data is extracted.</param>
        /// <param name="domain">The domain passed to the extractor.</param>
        /// <returns>
        /// The renamed extracted fields on success; a single "ExtractionError" field when the
        /// extractor reports an exception (also logged as a warning); or a single "Error"
        /// field when any exception is thrown while extracting or renaming.
        /// </returns>
        private IEnumerable <Field> ExtractFields(string documentId, BinaryField binaryField, string domain)
        {
            try
            {
                using (var dataStream = new MemoryStream(binaryField.Data))
                {
                    var source = new BinaryDocumentStream(documentId, domain, dataStream);
                    var extraction = _textExtractorService.ExtractText(source);

                    if (extraction.TextExtractionException != null)
                    {
                        // The same text is both logged and returned to the caller.
                        var failureText =
                            $"Error when extracting binary field {binaryField.Name} from {documentId}: {extraction.TextExtractionException}";
                        Log.Warning(failureText);

                        var failureFields = new Field[1];
                        failureFields[0] = new Field("ExtractionError", failureText);
                        return(failureFields);
                    }

                    var extracted = extraction.ResultingDocument as AddDocument;

                    // Rename in place: strip parentheses and turn ':' and ' ' into '-'.
                    foreach (var extractedField in extracted.Fields)
                    {
                        var sanitizedName = extractedField.Name
                                            .Replace(":", "-")
                                            .Replace(" ", "-")
                                            .Replace("(", "")
                                            .Replace(")", "");

                        extractedField.Name = string.Format("{0}_extracted_{1}", binaryField.Name, sanitizedName);
                    }
                    return(extracted.Fields);
                }
            }
            catch (Exception e)
            {
                string innerMessage;
                if (e.InnerException != null)
                {
                    innerMessage = e.InnerException.Message;
                }
                else
                {
                    innerMessage = "";
                }

                var errorText = string.Format(
                    "Error when extracting text from binary field {0} in document {3} :{1} ({2})",
                    binaryField.Name,
                    e.Message,
                    innerMessage,
                    documentId);

                return(new[] { new Field("Error", errorText) });
            }
        }