Пример #1
0
        /// <summary>
        /// Collects the text of every XML entry found in a zipped OpenXml package.
        /// </summary>
        /// <param name="stream">Stream containing the zip archive.</param>
        /// <param name="context">Extraction context passed through to the inner XML extractor.</param>
        /// <returns>The concatenated text of all XML entries; an empty string when nothing was extracted.</returns>
        protected string GetOpenXmlText(Stream stream, TextExtractorContext context)
        {
            var result = new StringBuilder();

            using (var zip = ZipFile.Read(stream))
            {
                foreach (var entry in zip)
                {
                    // ordinal, case-insensitive match: no culture-dependent lowercasing of the file name
                    if (!string.Equals(Path.GetExtension(entry.FileName), ".xml", StringComparison.OrdinalIgnoreCase))
                    {
                        continue;
                    }

                    // the using block releases the buffer on every path, including when extraction throws
                    using (var zipStream = new MemoryStream())
                    {
                        entry.Extract(zipStream);
                        zipStream.Seek(0, SeekOrigin.Begin);

                        // use the XML extractor for inner entries in OpenXml files
                        var extractor     = ResolveExtractor("xml");
                        var extractedText = extractor == null ? null : extractor.Extract(zipStream, context);

                        if (!String.IsNullOrEmpty(extractedText))
                        {
                            result.Append(extractedText);
                        }
                    }
                }
            }

            return result.ToString();
        }
Пример #2
0
        /// <summary>
        /// Extracts the text of a PDF document: first through the PDF IFilter and,
        /// if the IFilter fails, through the iTextSharp based reader.
        /// </summary>
        /// <param name="stream">Stream containing the PDF document.</param>
        /// <param name="context">Extraction context; not used by this method.</param>
        /// <returns>
        /// The extracted text. An empty string is returned when the IFilter attempt runs out of memory;
        /// the text collected so far is returned when the fallback runs out of memory.
        /// </returns>
        public override string Extract(Stream stream, TextExtractorContext context)
        {
            // Remember where the document starts: the IFilter attempt may consume the stream,
            // and the fallback below must read the document from the same position.
            var canRewind     = stream != null && stream.CanSeek;
            var startPosition = canRewind ? stream.Position : 0L;

            try
            {
                //extract text using IFilter
                var target = new FilterReader(GetBytesFromStream(stream), ".pdf");
                target.Init();
                return target.ReadToEnd();
            }
            catch (OutOfMemoryException ex)
            {
                Logger.WriteWarning(EventId.Indexing.BinaryIsTooLarge,
                                    "Pdf text extract failed with out of memory exception. " + ex,
                                    properties: new Dictionary <string, object> {
                    { "Stream size", stream.Length }
                });

                return string.Empty;
            }
            catch (Exception ex)
            {
                Logger.WriteWarning(EventId.Indexing.IFilterError, "Pdf IFilter error: " + ex.Message);
            }

            //fallback to the other mechanism in case the pdf IFilter is missing
            var text = new StringBuilder();

            try
            {
                if (canRewind)
                {
                    // restore the position the first attempt started from
                    stream.Position = startPosition;
                }

                var pdfReader = new PdfReader(stream);
                for (var page = 1; page <= pdfReader.NumberOfPages; page++)
                {
                    // extract text using the old version (4.1.6) of iTextSharp
                    var pageText = ExtractTextFromPdfBytes(pdfReader.GetPageContent(page));
                    if (string.IsNullOrEmpty(pageText))
                    {
                        continue;
                    }

                    text.Append(pageText);
                }
            }
            catch (OutOfMemoryException ex)
            {
                Logger.WriteWarning(EventId.Indexing.BinaryIsTooLarge,
                                    "Pdf text extract failed with out of memory exception. " + ex,
                                    properties: new Dictionary <string, object> {
                    { "Stream size", stream.Length }
                });
            }

            return text.ToString();
        }
Пример #3
0
        /// <summary>
        /// Extracts the text of a .msg document through the IFilter reader.
        /// </summary>
        /// <param name="stream">Stream containing the document.</param>
        /// <param name="context">Extraction context; not used by this method.</param>
        /// <returns>The text read by the filter, or an empty string when the filter fails.</returns>
        public override string Extract(Stream stream, TextExtractorContext context)
        {
            string extracted;

            try
            {
                // IFilter
                var msgFilter = new FilterReader(GetBytesFromStream(stream), ".msg");
                msgFilter.Init();
                extracted = msgFilter.ReadToEnd();
            }
            catch (Exception ex)
            {
                // any failure is logged as a warning and results in empty text
                Logger.WriteWarning(EventId.Indexing.IFilterError, "Msg IFilter error: " + ex.Message);
                extracted = string.Empty;
            }

            return extracted;
        }
Пример #4
0
        /// <summary>
        /// Extracts the content of every text node of an XML document, separated by spaces.
        /// </summary>
        /// <param name="stream">Stream containing the XML document.</param>
        /// <param name="context">Extraction context; not used by this method.</param>
        /// <returns>The values of all text nodes, each followed by a single space.</returns>
        public override string Extract(Stream stream, TextExtractorContext context)
        {
            // IMPORTANT: as this extractor is used for extracting text from inner
            // entries of OpenXml files, please do not make this method asynchronous,
            // because we cannot assume that the file is a real content in the
            // Content Repository.

            // initial length: chars = bytes / 2, relevant text rate: ~25%
            var sb = new StringBuilder(Math.Max(20, Convert.ToInt32(stream.Length / 8)));

            // XmlResolver = null: never fetch external DTDs or entities referenced by the document.
            // The reader is deliberately not closed: closing an XmlTextReader also closes the
            // underlying stream, which is managed by the caller.
            var reader = new XmlTextReader(stream) { XmlResolver = null };

            while (reader.Read())
            {
                if (reader.NodeType == XmlNodeType.Text && reader.HasValue)
                {
                    sb.Append(reader.Value).Append(' ');
                }
            }

            return sb.ToString();
        }
Пример #5
0
 /// <summary>
 /// Extracts text from the given stream.
 /// </summary>
 /// <param name="stream">Stream to read the content from.</param>
 /// <param name="context">Extraction context; how it is used is up to the implementation.</param>
 /// <returns>The extracted text.</returns>
 public abstract string Extract(Stream stream, TextExtractorContext context);
Пример #6
0
 /// <summary>
 /// Reads the stream as a string and strips the rich text formatting from it.
 /// </summary>
 /// <param name="stream">Stream containing the rich text document.</param>
 /// <param name="context">Extraction context; not used by this method.</param>
 /// <returns>The result of <c>RichTextStripper.StripRichTextFormat</c> for the stream content.</returns>
 public override string Extract(Stream stream, TextExtractorContext context)
 {
     var richText = Tools.GetStreamString(stream);

     return RichTextStripper.StripRichTextFormat(richText);
 }
Пример #7
0
 /// <summary>
 /// Returns the stream content as a string, without further processing.
 /// </summary>
 /// <param name="stream">Stream containing the text.</param>
 /// <param name="context">Extraction context; not used by this method.</param>
 /// <returns>The string produced by <c>Tools.GetStreamString</c>.</returns>
 public override string Extract(Stream stream, TextExtractorContext context)
 {
     var streamText = Tools.GetStreamString(stream);

     return streamText;
 }
Пример #8
0
 /// <summary>
 /// Delegates the extraction to the base class' OpenXml text extraction.
 /// </summary>
 /// <param name="stream">Stream containing the OpenXml document.</param>
 /// <param name="context">Extraction context passed through to the base implementation.</param>
 /// <returns>The text returned by <c>GetOpenXmlText</c>.</returns>
 public override string Extract(Stream stream, TextExtractorContext context)
 {
     var openXmlText = base.GetOpenXmlText(stream, context);

     return openXmlText;
 }
Пример #9
0
        /// <summary>
        /// Runs the extractor resolved for the binary inside a timeboxed activity and returns the extracted text.
        /// </summary>
        /// <param name="binaryData">Binary whose stream is handed to the extractor.</param>
        /// <param name="node">Node supplying the version id, version and path used for the context and the log messages.</param>
        /// <returns>
        /// The extracted text with '\0' characters replaced by '.'. An empty string when no extractor is resolved,
        /// the stream is missing or empty, the extraction times out or fails. Null when the extractor returned null.
        /// </returns>
        public static string GetExtract(BinaryData binaryData, Node node)
        {
            var textExtractor = ResolveExtractor(binaryData);
            if (textExtractor == null)
            {
                return string.Empty;
            }

            var binaryStream = binaryData.GetStream();
            if (binaryStream == null || binaryStream.Length == 0)
            {
                return String.Empty;
            }

            // stays empty when the extraction fails with an exception
            var extractedText = string.Empty;

            try
            {
                var extractorContext = new TextExtractorContext(node.VersionId);

                //-- async
                Action <TimeboxedActivity> extractCallback = activity =>
                {
                    activity.OutArgument = textExtractor.Extract((Stream)activity.InArgument, extractorContext);
                };

                var timeboxed = new TimeboxedActivity
                {
                    InArgument = binaryStream,
                    Activity   = extractCallback,
                    Context    = HttpContext.Current
                };

                // the timeout setting is multiplied by 1000 before being passed on
                var completedInTime = timeboxed.ExecuteAndWait(Repository.TextExtractTimeout * 1000);
                if (!completedInTime)
                {
                    timeboxed.Abort();
                    Logger.WriteWarning(Logger.EventId.NotDefined,
                                        String.Format("Text extracting timeout. Version: {0}, path: {1}", node.Version, node.Path));
                    return String.Empty;
                }

                if (timeboxed.ExecutionException != null)
                {
                    WriteError(timeboxed.ExecutionException, node);
                }
                else
                {
                    extractedText = (string)timeboxed.OutArgument;
                }
            }
            catch (Exception e)
            {
                WriteError(e, node);
            }

            if (extractedText == null)
            {
                // the extractor produced no value: log it and hand the null back to the caller
                Logger.WriteWarning(Logger.EventId.NotDefined, String.Format(CultureInfo.InvariantCulture, @"Couldn't extract text. VersionId: {0}, path: '{1}' ", node.VersionId, node.Path));
                return null;
            }

            return extractedText.Replace('\0', '.');
        }