Inheritance: TextExtractor
Ejemplo n.º 1
0
        /// <summary>
        /// Extracts plain text from a binary, choosing the text extractor by the
        /// extension of the binary's file name.
        /// </summary>
        /// <param name="binaryData">The binary to process. When null, an empty string is returned.</param>
        /// <param name="node">
        /// The node the binary belongs to. Its Version, VersionId and Path are written
        /// into the log messages, and it is passed to WriteError on failure.
        /// </param>
        /// <returns>
        /// The extracted text with every NUL character replaced by '.', or an empty
        /// string when the extension is unknown, the stream is missing or empty, the
        /// extraction timed out or failed, or the extractor produced no text. Never null.
        /// </returns>
        public static string GetExtract(BinaryData binaryData, Node node)
        {
            if (binaryData == null)
            {
                return string.Empty;
            }

            var fname = binaryData.FileName;
            if (fname == null)
            {
                return string.Empty;
            }

            var ext = fname.Extension;
            if (String.IsNullOrEmpty(ext))
            {
                return string.Empty;
            }

            ITextExtractor extractor = null;
            var            result    = string.Empty;

            // Extensions are identifiers, not linguistic text: lower-case them
            // culture-independently so the match does not depend on the current culture.
            switch (ext.ToLowerInvariant())
            {
            case "contenttype":
            case "xml": extractor = new XmlTextExtractor(); break;

            case "doc": extractor = new DocTextExtractor(); break;

            case "xls": extractor = new XlsTextExtractor(); break;

            case "pdf": extractor = new PdfTextExtractor(); break;

            case "docx": extractor = new DocxTextExtractor(); break;

            case "xlsx": extractor = new XlsxTextExtractor(); break;

            case "pptx": extractor = new PptxTextExtractor(); break;

            case "txt": extractor = new PlainTextExtractor(); break;

            default:
                // No extractor is registered for this extension.
                return String.Empty;
            }

            var stream = binaryData.GetStream();
            if (stream == null || stream.Length == 0)
            {
                return String.Empty;
            }

            try
            {
                // The extraction runs inside a TimeboxedActivity so that it can be
                // abandoned when it does not finish within the configured timeout.
                Action <TimeboxedActivity> timeboxedFunctionCall = activity =>
                {
                    var input = (Stream)activity.InArgument;
                    activity.OutArgument = extractor.Extract(input);
                };

                var act = new TimeboxedActivity();
                act.InArgument = stream;
                act.Activity   = timeboxedFunctionCall;
                act.Context    = HttpContext.Current;

                // NOTE(review): the "* 1000" suggests TextExtractTimeout is in seconds and
                // ExecuteAndWait expects milliseconds - confirm against their definitions.
                var finishedWithinTime = act.ExecuteAndWait(Repository.TextExtractTimeout * 1000);
                if (!finishedWithinTime)
                {
                    act.Abort();
                    var msg = String.Format("Text extracting timeout. Version: {0}, path: {1}", node.Version, node.Path);
                    Logger.WriteWarning(msg);
                    return String.Empty;
                }
                else if (act.ExecutionException != null)
                {
                    // result stays empty, so the "Couldn't extract text" warning below is logged too.
                    WriteError(act.ExecutionException, node);
                }
                else
                {
                    result = (string)act.OutArgument;
                }
            }
            catch (Exception e)
            {
                WriteError(e, node);
            }

            if (String.IsNullOrEmpty(result))
            {
                var format = @"Couldn't extract text. VersionId: {0}, path: '{1}' ";
                var inf    = String.Format(CultureInfo.InvariantCulture, format, node.VersionId, node.Path);
                Logger.WriteWarning(inf);

                // result may be null here (an extractor that returned null); returning
                // directly avoids calling Replace on a null reference.
                return String.Empty;
            }

            return result.Replace('\0', '.');
        }
Ejemplo n.º 2
0
        /// <summary>
        /// Extracts plain text from a stream, choosing the text extractor by the
        /// extension of <paramref name="fileName"/>.
        /// </summary>
        /// <param name="stream">The content to process. A null or zero-length stream yields an empty result without an error.</param>
        /// <param name="fileName">The file name whose extension selects the extractor.</param>
        /// <param name="errorMessage">
        /// Null when the text was extracted or when the stream was null or empty; otherwise a
        /// description of why no text could be produced (missing or unknown extension,
        /// an exception thrown by the extractor, or an empty extraction result).
        /// </param>
        /// <returns>
        /// For "txt" files the value of Tools.GetStreamString, returned as is. For the other
        /// supported extensions the extracted text with every NUL character replaced by '.'.
        /// An empty string in every other case.
        /// </returns>
        public static string GetExtract(Stream stream, string fileName, out string errorMessage)
        {
            errorMessage = null;

            if (stream == null || stream.Length == 0)
            {
                return String.Empty;
            }
            if (String.IsNullOrEmpty(fileName))
            {
                errorMessage = "Cannot resolve a TextExtractor if FileName is null or empty";
                return String.Empty;
            }

            var extension = Path.GetExtension(fileName);
            if (String.IsNullOrEmpty(extension))
            {
                errorMessage = "Cannot resolve a TextExtractor if FileName's extension is null or empty";
                return string.Empty;
            }

            extension = extension.TrimStart('.');
            if (extension.Length == 0)
            {
                errorMessage = "Cannot resolve a TextExtractor if FileName's extension is empty";
                return string.Empty;
            }

            // Extensions are identifiers, not linguistic text: lower-case them
            // culture-independently so the match does not depend on the current culture.
            extension = extension.ToLowerInvariant();

            // Plain text is read directly from the stream, without an extractor.
            if (extension == "txt")
            {
                return SenseNet.ContentRepository.Tools.GetStreamString(stream);
            }

            ITextExtractor extractor;
            switch (extension)
            {
            case "contenttype":
            case "xml": extractor = new XmlTextExtractor(); break;

            case "doc": extractor = new DocTextExtractor(); break;

            case "xls": extractor = new XlsTextExtractor(); break;

            case "pdf": extractor = new PdfTextExtractor(); break;

            case "docx": extractor = new DocxTextExtractor(); break;

            case "xlsx": extractor = new XlsxTextExtractor(); break;

            case "pptx": extractor = new PptxTextExtractor(); break;

            // "txt" needs no case here: it has already been handled above.
            default:
                errorMessage = String.Format("Cannot resolve a TextExtractor for this extension: '{0}'", extension);
                return String.Empty;
            }

            string result;
            try
            {
                result = extractor.Extract(stream);
            }
            catch (Exception e)
            {
                // Return immediately so the specific failure reason reaches the caller
                // instead of being replaced by the generic "Couldn't extract text" message.
                errorMessage = String.Format("An error occured during extracting text. Path: {0}. Message: {1}", fileName, e.Message);
                return String.Empty;
            }

            if (String.IsNullOrEmpty(result))
            {
                // Also covers an extractor that returned null, which must not reach Replace.
                var format = @"Couldn't extract text. FileName: '{0}' ";
                errorMessage = String.Format(CultureInfo.InvariantCulture, format, fileName);
                return String.Empty;
            }

            return result.Replace('\0', '.');
        }