/// <summary>
/// Collects the text of every ".xml" entry found in a zip based (OpenXml) package.
/// </summary>
/// <param name="stream">Stream that contains the zip package.</param>
/// <param name="context">Context object that is handed over to the inner xml extractor.</param>
/// <returns>
/// The concatenated text of all xml entries. The result is an empty string when no entry
/// yielded any text or when no extractor is registered for "xml".
/// </returns>
protected string GetOpenXmlText(Stream stream, TextExtractorContext context)
{
    var result = new StringBuilder();

    using (var zip = ZipFile.Read(stream))
    {
        foreach (var entry in zip)
        {
            // Ordinal, case-insensitive comparison: file extensions are identifiers, not
            // linguistic text, so the current culture must not influence the match.
            if (!string.Equals(Path.GetExtension(entry.FileName), ".xml", StringComparison.OrdinalIgnoreCase))
            {
                continue;
            }

            // The using block releases the buffer even if unpacking or extracting throws.
            using (var zipStream = new MemoryStream())
            {
                entry.Extract(zipStream);
                zipStream.Seek(0, SeekOrigin.Begin);

                // use the XML extractor for inner entries in OpenXml files
                var extractor = ResolveExtractor("xml");
                var extractedText = extractor == null ? null : extractor.Extract(zipStream, context);

                if (!String.IsNullOrEmpty(extractedText))
                {
                    result.Append(extractedText);
                }
            }
        }
    }

    return result.ToString();
}
/// <summary>
/// Extracts the text of a pdf document. The IFilter based extraction is tried first;
/// if it fails with anything but an out of memory error, the page contents are
/// processed one by one as a fallback.
/// </summary>
/// <param name="stream">Stream of the pdf document.</param>
/// <param name="context">Extraction context (not used by this method body).</param>
/// <returns>
/// The extracted text. An empty string is returned when the IFilter runs out of memory;
/// when the fallback runs out of memory, the text collected so far is returned.
/// </returns>
public override string Extract(Stream stream, TextExtractorContext context)
{
    // first attempt: the pdf IFilter
    try
    {
        return SnIFilter.GetText(stream, ".pdf");
    }
    catch (OutOfMemoryException ex)
    {
        SnLog.WriteWarning(
            "Pdf text extract failed with out of memory exception. " + ex,
            EventId.Indexing,
            properties: new Dictionary<string, object> { { "Stream size", stream.Length } });

        return string.Empty;
    }
    catch (Exception ex)
    {
        // the IFilter problem is reported only the first time it occurs
        if (!_iFilterErrorLogged)
        {
            SnLog.WriteWarning("Pdf IFilter error: " + ex.Message, EventId.Indexing);
            _iFilterErrorLogged = true;
        }
    }

    // second attempt: used when the pdf IFilter is missing or failed
    var collected = new StringBuilder();

    try
    {
        var document = new PdfReader(stream);
        var pageNumber = 1;

        while (pageNumber <= document.NumberOfPages)
        {
            // page level extraction with the old version (4.1.6) of iTextSharp
            var content = ExtractTextFromPdfBytes(document.GetPageContent(pageNumber));

            if (!string.IsNullOrEmpty(content))
            {
                collected.Append(content);
            }

            pageNumber++;
        }
    }
    catch (OutOfMemoryException ex)
    {
        SnLog.WriteWarning(
            "Pdf text extract failed with out of memory exception. " + ex,
            EventId.Indexing,
            properties: new Dictionary<string, object> { { "Stream size", stream.Length } });
    }

    return collected.ToString();
}
/// <summary>
/// Extracts the text of a msg file through the IFilter infrastructure.
/// </summary>
/// <param name="stream">Stream of the msg file.</param>
/// <param name="context">Extraction context (not used by this method body).</param>
/// <returns>
/// The text provided by the IFilter, or an empty string if the IFilter call threw
/// an exception (the error is logged as a warning).
/// </returns>
public override string Extract(Stream stream, TextExtractorContext context)
{
    var text = string.Empty;

    try
    {
        // IFilter
        text = SnIFilter.GetText(stream, ".msg");
    }
    catch (Exception error)
    {
        SnLog.WriteWarning("Msg IFilter error: " + error.Message, EventId.Indexing);
    }

    return text;
}
/// <summary>
/// Extracts the content of every text node of an xml document, separated by spaces.
/// </summary>
/// <param name="stream">Stream of the xml document.</param>
/// <param name="context">Extraction context (not used by this method body).</param>
/// <returns>The value of each text node, each one followed by a single space.</returns>
public override string Extract(Stream stream, TextExtractorContext context)
{
    // IMPORTANT: as this extractor is used for extracting text from inner
    // entries of OpenXml files, please do not make this method asynchronous,
    // because we cannot assume that the file is a real content in the
    // Content Repository.

    // initial length: chars = bytes / 2, relevant text rate: ~25%
    var sb = new StringBuilder(Math.Max(20, Convert.ToInt32(stream.Length / 8)));

    // The reader is intentionally not disposed: closing it would also close
    // the stream, which is owned by the caller.
    var reader = new XmlTextReader(stream);

    // The document is untrusted input. Without a resolver the reader does not
    // load external DTDs or entities, so a crafted DOCTYPE cannot make the
    // server read local files or open network connections.
    reader.XmlResolver = null;

    while (reader.Read())
    {
        if (reader.NodeType == XmlNodeType.Text && reader.HasValue)
        {
            sb.Append(reader.Value).Append(' ');
        }
    }

    return sb.ToString();
}
/// <summary>
/// Extractor implementation that produces no text at all.
/// </summary>
/// <param name="stream">Ignored.</param>
/// <param name="context">Ignored.</param>
/// <returns>Always an empty string.</returns>
public override string Extract(Stream stream, TextExtractorContext context)
{
    // neither the stream nor the context is touched
    return String.Empty;
}
/// <summary>
/// Extracts the text of a rich text document: the stream is read as a string and
/// passed through <c>RichTextStripper.StripRichTextFormat</c>.
/// </summary>
/// <param name="stream">Stream of the rich text document.</param>
/// <param name="context">Extraction context (not used by this method body).</param>
/// <returns>The result of the rich text stripper.</returns>
public override string Extract(Stream stream, TextExtractorContext context)
{
    var richText = RepositoryTools.GetStreamString(stream);
    var strippedText = RichTextStripper.StripRichTextFormat(richText);

    return strippedText;
}
/// <summary>
/// Extracts text from the given stream. Every concrete extractor provides its own
/// implementation for the file format it handles.
/// </summary>
/// <param name="stream">Stream of the binary whose text is requested.</param>
/// <param name="context">Context object that accompanies the extraction.</param>
/// <returns>The extracted text.</returns>
public abstract string Extract(Stream stream, TextExtractorContext context);
/// <summary>
/// Returns the content of the stream as a string, as produced by
/// <c>RepositoryTools.GetStreamString</c>; no further processing is done.
/// </summary>
/// <param name="stream">Stream to read.</param>
/// <param name="context">Extraction context (not used by this method body).</param>
/// <returns>The string read from the stream.</returns>
public override string Extract(Stream stream, TextExtractorContext context)
{
    var content = RepositoryTools.GetStreamString(stream);

    return content;
}
/// <summary>
/// Extracts the text of an OpenXml document by delegating to the
/// <c>GetOpenXmlText</c> method of the base class.
/// </summary>
/// <param name="stream">Stream of the OpenXml document.</param>
/// <param name="context">Extraction context, passed on unchanged.</param>
/// <returns>The text returned by <c>GetOpenXmlText</c>.</returns>
public override string Extract(Stream stream, TextExtractorContext context)
{
    var openXmlText = base.GetOpenXmlText(stream, context);

    return openXmlText;
}
/// <summary>
/// Resolves the extractor that belongs to the given binary and runs it in a timeboxed
/// activity. Every exit path marks the trace operation as successful.
/// </summary>
/// <param name="binaryData">Binary whose text is requested.</param>
/// <param name="node">Node that supplies the version id, version and path for tracing and logging.</param>
/// <returns>
/// The extracted text with every '\0' character replaced by '.'. An empty string is
/// returned when there is no extractor, the stream is missing or empty, the extraction
/// timed out, or an error was written. The result is null only if the extractor
/// itself produced null.
/// </returns>
public static string GetExtract(BinaryData binaryData, Node node)
{
    using (var op = SnTrace.Index.StartOperation("Getting text extract, VId:{0}, Path:{1}", node.VersionId, node.Path))
    {
        var extractor = ResolveExtractor(binaryData);
        if (extractor == null)
        {
            op.Successful = true;
            return string.Empty;
        }

        var extractedText = string.Empty;

        using (var stream = binaryData.GetStream())
        {
            // nothing to do without content
            if (stream == null || stream.Length == 0)
            {
                op.Successful = true;
                return String.Empty;
            }

            try
            {
                var ctx = new TextExtractorContext(node.VersionId);

                // async
                Action<TimeboxedActivity> extractCall = activity =>
                {
                    activity.OutArgument = extractor.Extract((Stream)activity.InArgument, ctx);
                };

                var timeboxed = new TimeboxedActivity();
                timeboxed.InArgument = stream;
                timeboxed.Activity = extractCall;
                timeboxed.Context = HttpContext.Current;

                // the configured timeout is multiplied by 1000 before it is passed on
                var completedInTime = timeboxed.ExecuteAndWait(Configuration.Indexing.TextExtractTimeout * 1000);
                if (!completedInTime)
                {
                    timeboxed.Abort();

                    var msg = String.Format("Text extracting timeout. Version: {0}, path: {1}", node.Version, node.Path);
                    SnTrace.Index.Write(msg);
                    SnLog.WriteWarning(msg);

                    op.Successful = true;
                    return String.Empty;
                }

                if (timeboxed.ExecutionException == null)
                {
                    extractedText = (string)timeboxed.OutArgument;
                }
                else
                {
                    WriteError(timeboxed.ExecutionException, node);
                }
            }
            catch (Exception e)
            {
                WriteError(e, node);
            }
        }

        if (extractedText == null)
        {
            // the extractor finished but did not provide any text
            SnLog.WriteWarning(string.Format(CultureInfo.InvariantCulture, @"Couldn't extract text. VersionId: {0}, path: '{1}' ", node.VersionId, node.Path));
            SnTrace.Index.Write("Couldn't extract text");
        }
        else
        {
            // null characters are replaced with dots
            extractedText = extractedText.Replace('\0', '.');
            SnTrace.Index.Write("Extracted length length: {0}.", extractedText.Length);
        }

        op.Successful = true;
        return extractedText;
    }
}