/// <summary>
/// Sends a single binary file to the extraction service and reports how long the call took.
/// </summary>
/// <param name="file">The file handed to the extraction service.</param>
/// <param name="processTime">Receives the elapsed time of the extraction call, in milliseconds.</param>
/// <returns>
/// The extraction result's document cast to <c>AddDocument</c>; <c>null</c> when the result
/// carries no document or a document of another type.
/// </returns>
/// <exception cref="InvalidOperationException">The service produced no result for the file.</exception>
/// <remarks>If the result carries a text-extraction exception, that exception is thrown to the caller.</remarks>
private AddDocument ExtractFile(BinaryDocumentFile file, out long processTime)
{
    var timer = Stopwatch.StartNew();
    var result = _service.ExtractText(new[] { file }).FirstOrDefault();
    timer.Stop();
    processTime = timer.ElapsedMilliseconds;

    // FirstOrDefault yields null when the service returns an empty sequence;
    // fail with a descriptive error instead of a NullReferenceException below.
    if (result == null)
    {
        throw new InvalidOperationException("Text extraction returned no result for the supplied file.");
    }

    if (result.TextExtractionException != null)
    {
        throw result.TextExtractionException;
    }

    return result.ResultingDocument as AddDocument;
}
/// <summary>
/// Builds an <c>AddDocument</c> for a crawled binary page: adds the Url, Domain, Depth and Host
/// fields, runs text extraction over the page's binary data, and appends the extracted fields.
/// </summary>
/// <param name="page">The page to convert; must be a <c>BinaryPage</c>.</param>
/// <param name="domain">Domain value stored on the document and passed to the extractor.</param>
/// <returns>
/// The populated document. When extraction reports an exception, its message and stack trace are
/// added as the ErrorMessage and ErrorStackTrace fields. When extraction yields no
/// <c>AddDocument</c>, the document is returned without extracted fields.
/// </returns>
/// <exception cref="ArgumentNullException"><paramref name="page"/> is null.</exception>
/// <exception cref="ArgumentException"><paramref name="page"/> is not a <c>BinaryPage</c>.</exception>
public IDocument ConvertToDocument(Page page, string domain)
{
    if (page == null)
    {
        throw new ArgumentNullException(nameof(page));
    }

    // Checked cast: a non-binary page previously surfaced as a NullReferenceException.
    var binaryPage = page as BinaryPage;
    if (binaryPage == null)
    {
        throw new ArgumentException("Only binary pages can be converted by this method.", nameof(page));
    }

    var doc = new AddDocument() { Id = page.Id, Domain = domain };
    doc.Fields.Add(new Field("Url", page.Url.AbsoluteUri));
    doc.Fields.Add(new Field("Domain", domain));
    doc.Fields.Add(new Field("Depth", page.Depth.ToString()));
    doc.Fields.Add(new Field("Host", page.Url.Host));

    using (var memoryStream = new MemoryStream(binaryPage.Data))
    {
        var extractedDoc = _textExtractorService.ExtractText(
            new BinaryDocumentStream(page.Url.ToString(), domain, memoryStream));

        if (extractedDoc.TextExtractionException != null)
        {
            doc.Fields.Add(new Field("ErrorMessage", extractedDoc.TextExtractionException.Message));
            doc.Fields.Add(new Field("ErrorStackTrace", extractedDoc.TextExtractionException.StackTrace));
        }

        // Covers both a missing result and a result of an unexpected type;
        // either way there are no extracted fields to copy.
        var extractedFields = extractedDoc.ResultingDocument as AddDocument;
        if (extractedFields == null)
        {
            return doc;
        }

        foreach (var field in extractedFields.Fields)
        {
            doc.Fields.Add(field);
        }
    }

    return doc;
}
/// <summary>
/// Runs text extraction over a binary field's data and returns the extracted fields, each renamed
/// to "{binary field name}_extracted_{sanitized original name}".
/// </summary>
/// <param name="documentId">Id of the owning document; passed to the extractor and used in messages.</param>
/// <param name="binaryField">The binary field whose data is extracted.</param>
/// <param name="domain">Domain value passed to the extractor.</param>
/// <returns>
/// The renamed extracted fields; a single "ExtractionError" field when the extractor reports an
/// exception (a warning is logged as well); or a single "Error" field when anything in the
/// method throws.
/// </returns>
private IEnumerable<Field> ExtractFields(string documentId, BinaryField binaryField, string domain)
{
    try
    {
        using (var dataStream = new MemoryStream(binaryField.Data))
        {
            var source = new BinaryDocumentStream(documentId, domain, dataStream);
            var extraction = _textExtractorService.ExtractText(source);
            var failure = extraction.TextExtractionException;

            if (failure == null)
            {
                var extracted = extraction.ResultingDocument as AddDocument;
                foreach (var extractedField in extracted.Fields)
                {
                    // Strip characters from the extracted name: ':' and ' ' become '-', parentheses are dropped.
                    var sanitizedName = extractedField.Name
                        .Replace(":", "-")
                        .Replace(" ", "-")
                        .Replace("(", "")
                        .Replace(")", "");
                    extractedField.Name = string.Format("{0}_extracted_{1}", binaryField.Name, sanitizedName);
                }

                return extracted.Fields;
            }

            // The same text is logged and returned as the error field's value.
            var failureMessage =
                $"Error when extracting binary field {binaryField.Name} from {documentId}: {failure}";
            Log.Warning(failureMessage);
            return new Field[] { new Field("ExtractionError", failureMessage) };
        }
    }
    catch (Exception e)
    {
        var innerMessage = e.InnerException != null ? e.InnerException.Message : "";
        var errorText = string.Format(
            "Error when extracting text from binary field {0} in document {3} :{1} ({2})",
            binaryField.Name,
            e.Message,
            innerMessage,
            documentId);
        return new[] { new Field("Error", errorText) };
    }
}