/// <summary>
/// Collects the text of every ".xml" entry found inside a zipped OpenXml package.
/// </summary>
/// <param name="stream">Stream that contains the zip package.</param>
/// <param name="context">Extraction context handed on to the inner XML extractor.</param>
/// <returns>
/// The concatenated text of all XML entries. Entries for which no extractor is
/// resolved, or whose extracted text is null or empty, contribute nothing.
/// </returns>
protected string GetOpenXmlText(Stream stream, TextExtractorContext context)
{
    var result = new StringBuilder();

    using (var zip = ZipFile.Read(stream))
    {
        foreach (var entry in zip)
        {
            // Ordinal, case-insensitive match: the current culture must not
            // influence file type detection (and a null name simply does not match).
            if (!string.Equals(Path.GetExtension(entry.FileName), ".xml", StringComparison.OrdinalIgnoreCase))
            {
                continue;
            }

            // 'using' releases the buffer even when unzipping or extraction throws.
            using (var zipStream = new MemoryStream())
            {
                entry.Extract(zipStream);
                zipStream.Seek(0, SeekOrigin.Begin);

                // use the XML extractor for inner entries in OpenXml files
                var extractor = ResolveExtractor("xml");
                var extractedText = extractor == null ? null : extractor.Extract(zipStream, context);

                if (!String.IsNullOrEmpty(extractedText))
                {
                    result.Append(extractedText);
                }
            }
        }
    }

    return result.ToString();
}
/// <summary>
/// Extracts the text of a pdf document. The IFilter based <c>FilterReader</c> is tried
/// first; when it fails with anything other than an out of memory error, the pages are
/// processed one by one through <c>PdfReader</c> instead.
/// </summary>
/// <param name="stream">Stream that contains the pdf binary.</param>
/// <param name="context">Extraction context (not used by this implementation).</param>
/// <returns>
/// The extracted text. An empty string when the IFilter attempt runs out of memory;
/// the text collected so far when the fallback runs out of memory.
/// </returns>
public override string Extract(Stream stream, TextExtractorContext context)
{
    // Remember where the caller positioned the stream: the IFilter attempt hands the
    // stream to GetBytesFromStream, and the fallback must start reading from the same place.
    var startPosition = stream != null && stream.CanSeek ? stream.Position : -1L;

    try
    {
        //extract text using IFilter
        var target = new FilterReader(GetBytesFromStream(stream), ".pdf");
        target.Init();
        return target.ReadToEnd();
    }
    catch (OutOfMemoryException ex)
    {
        Logger.WriteWarning(EventId.Indexing.BinaryIsTooLarge,
            "Pdf text extract failed with out of memory exception. " + ex,
            properties: new Dictionary<string, object> { { "Stream size", stream.Length } });

        // The fallback would need at least as much memory, so give up here.
        return string.Empty;
    }
    catch (Exception ex)
    {
        Logger.WriteWarning(EventId.Indexing.IFilterError, "Pdf IFilter error: " + ex.Message);
    }

    // Restore the original position (only if the stream is still open and seekable).
    if (startPosition >= 0 && stream.CanSeek)
    {
        stream.Seek(startPosition, SeekOrigin.Begin);
    }

    //fallback to the other mechanism in case the pdf IFilter is missing
    var text = new StringBuilder();

    try
    {
        var pdfReader = new PdfReader(stream);
        for (var page = 1; page <= pdfReader.NumberOfPages; page++)
        {
            // extract text using the old version (4.1.6) of iTextSharp
            var pageText = ExtractTextFromPdfBytes(pdfReader.GetPageContent(page));
            if (string.IsNullOrEmpty(pageText))
            {
                continue;
            }

            text.Append(pageText);
        }
    }
    catch (OutOfMemoryException ex)
    {
        Logger.WriteWarning(EventId.Indexing.BinaryIsTooLarge,
            "Pdf text extract failed with out of memory exception. " + ex,
            properties: new Dictionary<string, object> { { "Stream size", stream.Length } });
    }

    return text.ToString();
}
/// <summary>
/// Extracts the text of a msg file through the IFilter based <c>FilterReader</c>.
/// </summary>
/// <param name="stream">Stream that contains the msg binary.</param>
/// <param name="context">Extraction context (not used by this implementation).</param>
/// <returns>The extracted text, or an empty string when the IFilter call fails.</returns>
public override string Extract(Stream stream, TextExtractorContext context)
{
    string extracted;

    try
    {
        //IFilter
        var filterReader = new FilterReader(GetBytesFromStream(stream), ".msg");
        filterReader.Init();
        extracted = filterReader.ReadToEnd();
    }
    catch (Exception error)
    {
        // Any failure is logged as a warning and results in an empty extract.
        Logger.WriteWarning(EventId.Indexing.IFilterError, "Msg IFilter error: " + error.Message);
        extracted = string.Empty;
    }

    return extracted;
}
/// <summary>
/// Extracts the content of every text node of an XML document, separated by spaces.
/// </summary>
/// <param name="stream">Stream that contains the XML document.</param>
/// <param name="context">Extraction context (not used by this implementation).</param>
/// <returns>The values of all text nodes, each one followed by a single space.</returns>
public override string Extract(Stream stream, TextExtractorContext context)
{
    // IMPORTANT: as this extractor is used for extracting text from inner
    // entries of OpenXml files, please do not make this method asynchronous,
    // because we cannot assume that the file is a real content in the
    // Content Repository.

    // initial length: chars = bytes / 2, relevant text rate: ~25%
    // (Length is not available on non-seekable streams: use the minimum there)
    var estimatedLength = stream.CanSeek ? Convert.ToInt32(stream.Length / 8) : 0;
    var sb = new StringBuilder(Math.Max(20, estimatedLength));

    // The document is untrusted input: without a resolver the reader does not
    // fetch external DTDs or entities (XXE protection).
    // NOTE(review): DTD processing itself stays enabled, so internal entity
    // expansion is still possible - consider DtdProcessing.Prohibit if DOCTYPE
    // declarations never need to be supported.
    // The reader is intentionally not closed: closing it would also close the
    // stream, which is owned by the caller.
    var reader = new XmlTextReader(stream) { XmlResolver = null };

    while (reader.Read())
    {
        if (reader.NodeType == XmlNodeType.Text && reader.HasValue)
        {
            sb.Append(reader.Value).Append(' ');
        }
    }

    return sb.ToString();
}
/// <summary>
/// Produces the text extract of the binary content available through
/// <paramref name="stream"/>. Implemented by the format specific extractors.
/// </summary>
/// <param name="stream">Stream to read the binary content from.</param>
/// <param name="context">Context object of the current extraction.</param>
/// <returns>The extracted text.</returns>
public abstract string Extract(Stream stream, TextExtractorContext context);
/// <summary>
/// Reads the whole stream as a string and strips the rich text formatting from it.
/// </summary>
/// <param name="stream">Stream that contains the rich text document.</param>
/// <param name="context">Extraction context (not used by this implementation).</param>
/// <returns>The result of <c>RichTextStripper.StripRichTextFormat</c> on the stream content.</returns>
public override string Extract(Stream stream, TextExtractorContext context)
{
    var rawRichText = Tools.GetStreamString(stream);
    return RichTextStripper.StripRichTextFormat(rawRichText);
}
/// <summary>
/// Returns the content of the stream as a string, without any further processing.
/// </summary>
/// <param name="stream">Stream to read the text from.</param>
/// <param name="context">Extraction context (not used by this implementation).</param>
/// <returns>The string produced by <c>Tools.GetStreamString</c>.</returns>
public override string Extract(Stream stream, TextExtractorContext context)
{
    var content = Tools.GetStreamString(stream);
    return content;
}
/// <summary>
/// Extracts the text by delegating to the <c>GetOpenXmlText</c> method of the base class.
/// </summary>
/// <param name="stream">Stream that contains the document.</param>
/// <param name="context">Extraction context passed on to the base implementation.</param>
/// <returns>The text returned by <c>GetOpenXmlText</c>.</returns>
public override string Extract(Stream stream, TextExtractorContext context)
{
    var openXmlText = base.GetOpenXmlText(stream, context);
    return openXmlText;
}
/// <summary>
/// Extracts the text of a binary with the extractor resolved for it. The extraction
/// runs in a <c>TimeboxedActivity</c> limited to <c>Repository.TextExtractTimeout</c> * 1000.
/// </summary>
/// <param name="binaryData">The binary to extract the text from.</param>
/// <param name="node">The node whose version id, version and path are used for the context and the log messages.</param>
/// <returns>
/// The extracted text with every '\0' character replaced by '.'. An empty string when
/// no extractor is resolved, the stream is missing or empty, the extraction times out
/// or fails. Null when the extractor itself returned null.
/// </returns>
public static string GetExtract(BinaryData binaryData, Node node)
{
    var textExtractor = ResolveExtractor(binaryData);
    if (textExtractor == null)
    {
        return string.Empty;
    }

    var binaryStream = binaryData.GetStream();
    if (binaryStream == null || binaryStream.Length == 0)
    {
        return String.Empty;
    }

    var extractedText = string.Empty;

    try
    {
        var extractorContext = new TextExtractorContext(node.VersionId);

        //-- async
        Action<TimeboxedActivity> extractAction = activity =>
        {
            activity.OutArgument = textExtractor.Extract((Stream)activity.InArgument, extractorContext);
        };

        var timeboxed = new TimeboxedActivity
        {
            InArgument = binaryStream,
            Activity = extractAction,
            Context = HttpContext.Current
        };

        if (!timeboxed.ExecuteAndWait(Repository.TextExtractTimeout * 1000))
        {
            // Did not finish in time: abort the activity and give up with an empty extract.
            timeboxed.Abort();
            Logger.WriteWarning(Logger.EventId.NotDefined,
                String.Format("Text extracting timeout. Version: {0}, path: {1}", node.Version, node.Path));
            return String.Empty;
        }

        if (timeboxed.ExecutionException == null)
        {
            extractedText = (string)timeboxed.OutArgument;
        }
        else
        {
            WriteError(timeboxed.ExecutionException, node);
        }
    }
    catch (Exception e)
    {
        WriteError(e, node);
    }

    if (extractedText != null)
    {
        return extractedText.Replace('\0', '.');
    }

    // The extractor returned null: log it and pass the null on to the caller.
    Logger.WriteWarning(Logger.EventId.NotDefined,
        String.Format(CultureInfo.InvariantCulture, @"Couldn't extract text. VersionId: {0}, path: '{1}' ", node.VersionId, node.Path));
    return extractedText;
}