예제 #1
0
        /// <summary>
        /// Extracts text from the given stream that contains the content of the open xml file.
        /// </summary>
        /// <param name="stream">Stream that is opened as a zip archive.</param>
        /// <param name="context">Extraction context that is passed on to the inner XML extractor.</param>
        /// <returns>
        /// The concatenated text of every entry that has an "xml" extension, or an empty string
        /// when no XML extractor can be resolved or no entry yields any text.
        /// </returns>
        protected string GetOpenXmlText(Stream stream, TextExtractorContext context)
        {
            // use the XML extractor for inner entries in OpenXml files;
            // it does not depend on the entry, so it is resolved only once
            var extractor = ResolveExtractor("xml");
            if (extractor == null)
            {
                // without an extractor no entry can produce text
                return string.Empty;
            }

            var result = new StringBuilder();

            using (var zip = ZipFile.Read(stream))
            {
                foreach (var entry in zip)
                {
                    // ordinal, case-insensitive comparison: independent of the current culture
                    if (!string.Equals(Path.GetExtension(entry.FileName), ".xml", System.StringComparison.OrdinalIgnoreCase))
                    {
                        continue;
                    }

                    // the using block releases the buffer even if the extraction throws
                    using (var zipStream = new MemoryStream())
                    {
                        entry.Extract(zipStream);
                        zipStream.Seek(0, SeekOrigin.Begin);

                        var extractedText = extractor.Extract(zipStream, context);
                        if (!string.IsNullOrEmpty(extractedText))
                        {
                            result.Append(extractedText);
                        }
                    }
                }
            }

            return result.ToString();
        }
예제 #2
0
        /// <summary>
        /// Extracts the text of a pdf document. The IFilter based extraction is tried first;
        /// when it fails with anything other than an out of memory error, the text is collected
        /// page by page with the PdfReader based fallback.
        /// </summary>
        /// <param name="stream">Stream that contains the pdf document.</param>
        /// <param name="context">Extraction context (not used by this implementation).</param>
        /// <returns>
        /// The extracted text. An empty string is returned when the IFilter runs out of memory;
        /// when the fallback runs out of memory, the text collected so far is returned.
        /// </returns>
        public override string Extract(Stream stream, TextExtractorContext context)
        {
            try
            {
                // extract text using IFilter
                return SnIFilter.GetText(stream, ".pdf");
            }
            catch (OutOfMemoryException ex)
            {
                LogPdfOutOfMemory(ex, stream);

                return string.Empty;
            }
            catch (Exception ex)
            {
                // log iFilter error only once
                if (!_iFilterErrorLogged)
                {
                    SnLog.WriteWarning("Pdf IFilter error: " + ex.Message, EventId.Indexing);
                    _iFilterErrorLogged = true;
                }
            }

            // The failed IFilter attempt may have advanced the stream; rewind it
            // (when possible) so that the fallback parses the document from its start.
            if (stream.CanSeek)
            {
                stream.Seek(0, SeekOrigin.Begin);
            }

            // fallback to the other mechanism in case the pdf IFilter is missing
            var text = new StringBuilder();

            try
            {
                var pdfReader = new PdfReader(stream);
                for (var page = 1; page <= pdfReader.NumberOfPages; page++)
                {
                    // extract text using the old version (4.1.6) of iTextSharp
                    var pageText = ExtractTextFromPdfBytes(pdfReader.GetPageContent(page));
                    if (string.IsNullOrEmpty(pageText))
                    {
                        continue;
                    }

                    text.Append(pageText);
                }
            }
            catch (OutOfMemoryException ex)
            {
                LogPdfOutOfMemory(ex, stream);
            }

            return text.ToString();
        }

        /// <summary>
        /// Writes the warning that is shared by both extraction attempts when they run out of memory.
        /// </summary>
        private static void LogPdfOutOfMemory(OutOfMemoryException ex, Stream stream)
        {
            SnLog.WriteWarning("Pdf text extract failed with out of memory exception. " + ex,
                               EventId.Indexing,
                               properties: new Dictionary<string, object> {
                { "Stream size", stream.Length }
            });
        }
예제 #3
0
        /// <summary>
        /// Extracts the text of a msg file through the IFilter. A failure is logged
        /// as a warning and results in an empty string.
        /// </summary>
        public override string Extract(Stream stream, TextExtractorContext context)
        {
            string extracted;

            try
            {
                // IFilter
                extracted = SnIFilter.GetText(stream, ".msg");
            }
            catch (Exception error)
            {
                SnLog.WriteWarning("Msg IFilter error: " + error.Message, EventId.Indexing);
                extracted = string.Empty;
            }

            return extracted;
        }
예제 #4
0
        /// <summary>
        /// Collects the text of the xml entries of an open xml file that is read from the given stream.
        /// </summary>
        /// <param name="stream">Stream that is opened as a zip archive.</param>
        /// <param name="context">Extraction context that is passed on to the inner XML extractor.</param>
        /// <returns>
        /// The concatenated text of the matching entries, or an empty string when
        /// no XML extractor can be resolved.
        /// </returns>
        protected string GetOpenXmlText(Stream stream, TextExtractorContext context)
        {
            // inner entries of OpenXml files are processed by the XML extractor
            var xmlExtractor = ResolveExtractor("xml");
            if (xmlExtractor == null)
            {
                return string.Empty;
            }

            var collected = new StringBuilder();

            using (var package = new ZipArchive(stream))
            {
                foreach (var item in package.Entries)
                {
                    var isXmlEntry = item.FullName.Trim('.').EndsWith(".xml", StringComparison.OrdinalIgnoreCase);
                    if (!isXmlEntry)
                    {
                        continue;
                    }

                    // The entry is copied into an in-memory buffer first, because the
                    // built-in archive api does not let us Seek the stream during extraction.
                    using (var buffer = new MemoryStream())
                    using (var entryStream = item.Open())
                    {
                        entryStream.CopyTo(buffer);
                        buffer.Position = 0;

                        // the extractor works on the buffer: it would throw on the original entry stream
                        var entryText = xmlExtractor.Extract(buffer, context);
                        if (!string.IsNullOrEmpty(entryText))
                        {
                            collected.Append(entryText);
                        }
                    }
                }
            }

            return collected.ToString();
        }
예제 #5
0
        /// <summary>
        /// Extracts the content of every text node of the xml document read from the given stream.
        /// </summary>
        /// <param name="stream">Stream that contains the xml document; its Length is read for presizing.</param>
        /// <param name="context">Extraction context (not used by this implementation).</param>
        /// <returns>The values of the text nodes in document order, each followed by a space.</returns>
        public override string Extract(Stream stream, TextExtractorContext context)
        {
            // IMPORTANT: as this extractor is used for extracting text from inner
            // entries of OpenXml files, please do not make this method asynchronous,
            // because we cannot assume that the file is a real content in the
            // Content Repository.

            // initial length: chars = bytes / 2, relevant text rate: ~25%
            var sb = new StringBuilder(Math.Max(20, Convert.ToInt32(stream.Length / 8)));

            // The document may come from an untrusted source: without a resolver the
            // reader cannot load external DTDs or entities that the document refers to.
            var reader = new XmlTextReader(stream) { XmlResolver = null };

            while (reader.Read())
            {
                if (reader.NodeType == XmlNodeType.Text && reader.HasValue)
                {
                    sb.Append(reader.Value).Append(' ');
                }
            }

            return sb.ToString();
        }
예제 #6
0
        /// <summary>
        /// Extracts the content of every text node of the xml document read from the given stream.
        /// When the xml parsing fails for any reason, the whole stream is read as plain text
        /// and split into words instead.
        /// </summary>
        /// <param name="stream">Stream that contains the document; the fallback seeks it back to the start.</param>
        /// <param name="context">Extraction context (not used by this implementation).</param>
        /// <returns>
        /// The values of the text nodes, each followed by a space; or, after a parsing failure,
        /// the words of the raw content joined by single spaces.
        /// </returns>
        public override string Extract(Stream stream, TextExtractorContext context)
        {
            // IMPORTANT: as this extractor is used for extracting text from inner
            // entries of OpenXml files, please do not make this method asynchronous,
            // because we cannot assume that the file is a real content in the
            // Content Repository.

            try
            {
                // initial length: chars = bytes / 2, relevant text rate: ~25%
                var sb = new StringBuilder(Math.Max(20, Convert.ToInt32(stream.Length / 8)));

                // The document may come from an untrusted source: without a resolver the
                // reader cannot load external DTDs or entities that the document refers to.
                var reader = new XmlTextReader(stream) { XmlResolver = null };
                while (reader.Read())
                {
                    if (reader.NodeType == XmlNodeType.Text && reader.HasValue)
                    {
                        sb.Append(reader.Value).Append(' ');
                    }
                }

                return sb.ToString();
            }
            catch (Exception)
            {
                // Deliberately best-effort: whatever went wrong during the xml parsing,
                // the fallback algorithm below is executed on the raw content.
            }

            // Split to words by predefined string delimiters
            string text;

            stream.Seek(0L, SeekOrigin.Begin);
            using (var reader = new StreamReader(stream))
            {
                text = reader.ReadToEnd();
            }

            var words = text.Split(WordDelimiters, StringSplitOptions.RemoveEmptyEntries);

            return string.Join(" ", words);
        }
예제 #7
0
 /// <inheritdoc />
 /// <remarks>
 /// The member is abstract: this type provides no extraction logic of its own,
 /// so every concrete derived extractor has to supply the implementation.
 /// </remarks>
 public abstract string Extract(Stream stream, TextExtractorContext context);
예제 #8
0
 /// <summary>
 /// Produces no text: the result is always an empty string, the arguments are not read.
 /// </summary>
 public override string Extract(Stream stream, TextExtractorContext context) => string.Empty;
예제 #9
0
 /// <summary>
 /// Reads the stream as a string and returns it after the rich text formatting is stripped.
 /// </summary>
 public override string Extract(Stream stream, TextExtractorContext context)
 {
     var richText = RepositoryTools.GetStreamString(stream);

     return RichTextStripper.StripRichTextFormat(richText);
 }
예제 #10
0
 /// <summary>
 /// Returns the string that RepositoryTools.GetStreamString reads from the stream, without further processing.
 /// </summary>
 public override string Extract(Stream stream, TextExtractorContext context) =>
     RepositoryTools.GetStreamString(stream);
예제 #11
0
 /// <summary>
 /// Delegates the extraction to GetOpenXmlText, passing the stream and the context through unchanged.
 /// </summary>
 public override string Extract(Stream stream, TextExtractorContext context) =>
     GetOpenXmlText(stream, context);
예제 #12
0
        /// <summary>
        /// Produces the text extract of a binary value that belongs to the given node.
        /// The extractor runs as a timeboxed activity that is aborted when it exceeds
        /// the configured text extract timeout.
        /// </summary>
        /// <param name="binaryData"><see cref="BinaryData"/> whose stream is processed.</param>
        /// <param name="node">Owner <see cref="Node"/>; its version and path appear in the trace and log messages.</param>
        /// <returns>
        /// The extracted text with the '\0' characters replaced by dots. An empty string when no
        /// extractor is resolved, the stream is missing or empty, the extraction times out or an
        /// error is reported through WriteError. Null when the extractor itself produced null.
        /// </returns>
        public static string GetExtract(BinaryData binaryData, Node node)
        {
            using (var operation = SnTrace.Index.StartOperation("Getting text extract, VId:{0}, Path:{1}", node.VersionId, node.Path))
            {
                var textExtractor = ResolveExtractor(binaryData);
                if (textExtractor == null)
                {
                    operation.Successful = true;
                    return string.Empty;
                }

                var extractedText = string.Empty;

                using (var content = binaryData.GetStream())
                {
                    if (content == null || content.Length == 0)
                    {
                        operation.Successful = true;
                        return string.Empty;
                    }

                    try
                    {
                        var extractorContext = new TextExtractorContext(node.VersionId);

                        // body of the timeboxed activity: runs the extractor on the
                        // input argument and publishes the text as the output argument
                        void RunExtractor(TimeboxedActivity activity)
                        {
                            var input = (Stream)activity.InArgument;
                            activity.OutArgument = textExtractor.Extract(input, extractorContext);
                        }

                        var timeboxed = new TimeboxedActivity
                        {
                            InArgument = content,
                            Activity   = RunExtractor,
                            Context    = HttpContext.Current
                        };

                        // the configured timeout is multiplied by 1000 before it is passed on
                        var timeout = Configuration.Indexing.TextExtractTimeout * 1000;
                        if (!timeboxed.ExecuteAndWait(timeout))
                        {
                            timeboxed.Abort();

                            var message = $"Text extracting timeout. Version: {node.Version}, path: {node.Path}";
                            SnTrace.Index.Write(message);
                            SnLog.WriteWarning(message);

                            operation.Successful = true;
                            return string.Empty;
                        }

                        if (timeboxed.ExecutionException != null)
                        {
                            WriteError(timeboxed.ExecutionException, node);
                        }
                        else
                        {
                            extractedText = (string)timeboxed.OutArgument;
                        }
                    }
                    catch (Exception error)
                    {
                        WriteError(error, node);
                    }
                }

                if (extractedText == null)
                {
                    SnLog.WriteWarning(string.Format(CultureInfo.InvariantCulture, @"Couldn't extract text. VersionId: {0}, path: '{1}' ", node.VersionId, node.Path));
                    SnTrace.Index.Write("Couldn't extract text");
                }
                else
                {
                    extractedText = extractedText.Replace('\0', '.');
                    SnTrace.Index.Write("Extracted length length: {0}.", extractedText.Length);
                }

                operation.Successful = true;
                return extractedText;
            }
        }
예제 #13
0
 /// <summary>
 /// Returns the text produced by ExtractiFilter; the value of its out argument is discarded.
 /// </summary>
 public override string Extract(Stream stream, TextExtractorContext context)
 {
     var text = ExtractiFilter(stream, out _);

     return text;
 }