Пример #1
0
        /// <summary>
        /// Collects the text of every XML entry found inside an OpenXml (zip) package.
        /// </summary>
        /// <param name="stream">Stream the zip package is read from.</param>
        /// <param name="context">Extraction context handed on to the XML extractor.</param>
        /// <returns>
        /// The concatenated text of all XML entries, or an empty string when no entry yields text.
        /// </returns>
        protected string GetOpenXmlText(Stream stream, TextExtractorContext context)
        {
            var result = new StringBuilder();

            using (var zip = ZipFile.Read(stream))
            {
                foreach (var entry in zip)
                {
                    // Ordinal, case-insensitive match on the extension only; no lowered
                    // copy of the whole entry name is allocated and no culture rules apply.
                    if (!string.Equals(Path.GetExtension(entry.FileName), ".xml", StringComparison.OrdinalIgnoreCase))
                    {
                        continue;
                    }

                    // The using block releases the buffer on every path, including
                    // when unpacking or the inner extractor throws.
                    using (var zipStream = new MemoryStream())
                    {
                        entry.Extract(zipStream);
                        zipStream.Seek(0, SeekOrigin.Begin);

                        // use the XML extractor for inner entries in OpenXml files
                        var extractor     = ResolveExtractor("xml");
                        var extractedText = extractor == null ? null : extractor.Extract(zipStream, context);

                        if (!String.IsNullOrEmpty(extractedText))
                        {
                            result.Append(extractedText);
                        }
                    }
                }
            }

            return result.ToString();
        }
Пример #2
0
        /// <summary>
        /// Extracts text from a PDF stream. The IFilter is tried first; if it fails with
        /// anything other than an out-of-memory error, the iTextSharp based reader is used.
        /// </summary>
        /// <param name="stream">Stream holding the PDF content.</param>
        /// <param name="context">Extraction context; not used by this extractor.</param>
        /// <returns>
        /// The extracted text. An empty string is returned when the IFilter runs out of memory;
        /// when the fallback runs out of memory, the text gathered up to that point is returned.
        /// </returns>
        public override string Extract(Stream stream, TextExtractorContext context)
        {
            // Remember where the caller positioned the stream so that the fallback reader
            // can start from the same place. -1 marks "cannot rewind".
            var startPosition = stream != null && stream.CanSeek ? stream.Position : -1L;

            try
            {
                // extract text using IFilter
                return SnIFilter.GetText(stream, ".pdf");
            }
            catch (OutOfMemoryException ex)
            {
                SnLog.WriteWarning("Pdf text extract failed with out of memory exception. " + ex,
                                   EventId.Indexing,
                                   properties: new Dictionary <string, object> {
                    { "Stream size", stream.Length }
                });

                return string.Empty;
            }
            catch (Exception ex)
            {
                // log iFilter error only once
                if (!_iFilterErrorLogged)
                {
                    SnLog.WriteWarning("Pdf IFilter error: " + ex.Message, EventId.Indexing);
                    _iFilterErrorLogged = true;
                }
            }

            // NOTE(review): the failed IFilter attempt may have consumed part of the stream
            // (not visible from here). Restoring the original position is a no-op if it did not.
            // CanSeek is re-checked because it reports false once a stream has been closed.
            if (startPosition >= 0 && stream.CanSeek)
            {
                stream.Position = startPosition;
            }

            // fallback to the other mechanism in case the pdf IFilter is missing
            var text = new StringBuilder();

            try
            {
                var pdfReader = new PdfReader(stream);
                for (var page = 1; page <= pdfReader.NumberOfPages; page++)
                {
                    // extract text using the old version (4.1.6) of iTextSharp
                    var pageText = ExtractTextFromPdfBytes(pdfReader.GetPageContent(page));
                    if (!string.IsNullOrEmpty(pageText))
                    {
                        text.Append(pageText);
                    }
                }
            }
            catch (OutOfMemoryException ex)
            {
                SnLog.WriteWarning("Pdf text extract failed with out of memory exception. " + ex,
                                   EventId.Indexing,
                                   properties: new Dictionary <string, object> {
                    { "Stream size", stream.Length }
                });
            }

            return text.ToString();
        }
Пример #3
0
        /// <summary>
        /// Extracts text from a .msg stream through the IFilter.
        /// </summary>
        /// <param name="stream">Stream holding the message content.</param>
        /// <param name="context">Extraction context; not used by this extractor.</param>
        /// <returns>The text produced by the IFilter, or an empty string if the IFilter throws.</returns>
        public override string Extract(Stream stream, TextExtractorContext context)
        {
            string messageText;

            try
            {
                // IFilter
                messageText = SnIFilter.GetText(stream, ".msg");
            }
            catch (Exception filterError)
            {
                // A failing IFilter is only logged; the caller gets an empty extract.
                SnLog.WriteWarning("Msg IFilter error: " + filterError.Message, EventId.Indexing);
                messageText = string.Empty;
            }

            return messageText;
        }
Пример #4
0
        /// <summary>
        /// Extracts the content of every text node of an XML stream, each followed by a space.
        /// </summary>
        /// <param name="stream">Stream holding the XML document; its Length is read to size the buffer.</param>
        /// <param name="context">Extraction context; not used by this extractor.</param>
        /// <returns>The text node values of the document, separated by single spaces.</returns>
        public override string Extract(Stream stream, TextExtractorContext context)
        {
            // IMPORTANT: as this extractor is used for extracting text from inner
            // entries of OpenXml files, please do not make this method asynchronous,
            // because we cannot assume that the file is a real content in the
            // Content Repository.

            // initial length: chars = bytes / 2, relevant text rate: ~25%
            var sb = new StringBuilder(Math.Max(20, Convert.ToInt32(stream.Length / 8)));

            // The document comes from stored content, so it must not be able to make the
            // reader fetch external DTDs or entities (XXE): drop the default resolver.
            // The reader is not closed here because closing an XmlTextReader also closes
            // the underlying stream, and the callers close that stream themselves.
            var reader = new XmlTextReader(stream) { XmlResolver = null };

            while (reader.Read())
            {
                if (reader.NodeType == XmlNodeType.Text && reader.HasValue)
                {
                    sb.Append(reader.Value).Append(' ');
                }
            }

            return sb.ToString();
        }
Пример #5
0
 /// <summary>
 /// Produces no text: the stream is never read and the result is always empty.
 /// </summary>
 /// <param name="stream">Ignored.</param>
 /// <param name="context">Ignored.</param>
 /// <returns>Always an empty string.</returns>
 public override string Extract(Stream stream, TextExtractorContext context)
 {
     return String.Empty;
 }
Пример #6
0
 /// <summary>
 /// Reads the stream as a string and returns it with the rich text formatting stripped.
 /// </summary>
 /// <param name="stream">Stream holding the rich text content.</param>
 /// <param name="context">Extraction context; not used by this extractor.</param>
 /// <returns>The result of RichTextStripper.StripRichTextFormat on the stream's string content.</returns>
 public override string Extract(Stream stream, TextExtractorContext context)
 {
     var richText = RepositoryTools.GetStreamString(stream);

     return RichTextStripper.StripRichTextFormat(richText);
 }
Пример #7
0
 /// <summary>
 /// Extracts text from the given stream. Each concrete extractor supplies its own implementation.
 /// </summary>
 /// <param name="stream">Stream holding the content to extract text from.</param>
 /// <param name="context">Extraction context supplied by the caller.</param>
 /// <returns>The extracted text.</returns>
 public abstract string Extract(Stream stream, TextExtractorContext context);
Пример #8
0
 /// <summary>
 /// Returns the stream's content as a string, without any further processing.
 /// </summary>
 /// <param name="stream">Stream holding the text content.</param>
 /// <param name="context">Extraction context; not used by this extractor.</param>
 /// <returns>The string produced by RepositoryTools.GetStreamString for the stream.</returns>
 public override string Extract(Stream stream, TextExtractorContext context)
 {
     var plainText = RepositoryTools.GetStreamString(stream);

     return plainText;
 }
Пример #9
0
 /// <summary>
 /// Extracts text by delegating to the base class's GetOpenXmlText.
 /// </summary>
 /// <param name="stream">Stream holding the OpenXml package.</param>
 /// <param name="context">Extraction context passed through unchanged.</param>
 /// <returns>The text returned by GetOpenXmlText.</returns>
 public override string Extract(Stream stream, TextExtractorContext context)
 {
     var packageText = base.GetOpenXmlText(stream, context);

     return packageText;
 }
Пример #10
0
        /// <summary>
        /// Extracts the text of a binary value within a time limit and traces the outcome.
        /// </summary>
        /// <param name="binaryData">Binary value whose stream is handed to the resolved extractor.</param>
        /// <param name="node">Node the binary belongs to; its version and path appear in log messages.</param>
        /// <returns>
        /// The extracted text with NUL characters replaced by dots. An empty string is returned when
        /// no extractor is resolved, the stream is missing or empty, the extraction times out, or an
        /// error is reported. Null is returned when the extractor itself produced null.
        /// </returns>
        public static string GetExtract(BinaryData binaryData, Node node)
        {
            using (var op = SnTrace.Index.StartOperation("Getting text extract, VId:{0}, Path:{1}", node.VersionId, node.Path))
            {
                var textExtractor = ResolveExtractor(binaryData);
                if (textExtractor == null)
                {
                    op.Successful = true;
                    return string.Empty;
                }

                var extractedText = string.Empty;

                using (var binaryStream = binaryData.GetStream())
                {
                    if (binaryStream == null || binaryStream.Length == 0)
                    {
                        op.Successful = true;
                        return String.Empty;
                    }

                    try
                    {
                        var extractorContext = new TextExtractorContext(node.VersionId);

                        // The extraction runs inside a timeboxed activity so that it can be
                        // abandoned when it exceeds the configured timeout.
                        Action <TimeboxedActivity> extractCall = activity =>
                        {
                            var input = (Stream)activity.InArgument;
                            activity.OutArgument = textExtractor.Extract(input, extractorContext);
                        };

                        var timeboxed = new TimeboxedActivity
                        {
                            InArgument = binaryStream,
                            Activity   = extractCall,
                            Context    = HttpContext.Current
                        };

                        // The configured timeout is multiplied by 1000 before it is passed on.
                        var completedInTime = timeboxed.ExecuteAndWait(Configuration.Indexing.TextExtractTimeout * 1000);
                        if (!completedInTime)
                        {
                            timeboxed.Abort();
                            var timeoutMessage = String.Format("Text extracting timeout. Version: {0}, path: {1}", node.Version, node.Path);
                            SnTrace.Index.Write(timeoutMessage);
                            SnLog.WriteWarning(timeoutMessage);
                            op.Successful = true;
                            return String.Empty;
                        }

                        if (timeboxed.ExecutionException == null)
                        {
                            extractedText = (string)timeboxed.OutArgument;
                        }
                        else
                        {
                            WriteError(timeboxed.ExecutionException, node);
                        }
                    }
                    catch (Exception error)
                    {
                        WriteError(error, node);
                    }
                }

                if (extractedText != null)
                {
                    // NUL characters are swapped for dots before the text is returned.
                    extractedText = extractedText.Replace('\0', '.');
                    SnTrace.Index.Write("Extracted length length: {0}.", extractedText.Length);
                }
                else
                {
                    SnLog.WriteWarning(string.Format(CultureInfo.InvariantCulture, @"Couldn't extract text. VersionId: {0}, path: '{1}' ", node.VersionId, node.Path));
                    SnTrace.Index.Write("Couldn't extract text");
                }

                op.Successful = true;
                return extractedText;
            }
        }