Exemplo n.º 1
0
        /// <summary>
        /// Converts <c>FileToProcess</c> with the dtSearch <c>FileConverter</c>, writing the
        /// result (XML content format) next to the input as "&lt;FileToProcess&gt;.dts" and
        /// registering that path in <c>OutputFiles</c>.
        /// </summary>
        /// <returns>
        /// A <c>WFState</c> whose <c>Value</c> is <c>WFState.WFStateSuccess</c> once
        /// <c>Execute()</c> has returned, or <c>WFState.WFStateFail</c> if an exception was
        /// thrown before that point. Exceptions are logged, not propagated.
        /// </returns>
        public override WFState Run()
        {
            WFState retval = new WFState();

            try
            {
                // Pessimistic default: only flipped to success after the conversion ran.
                retval.Value = WFState.WFStateFail;

                Options dtOptions = new Options
                {
                    FieldFlags  = FieldFlags.dtsoFfOfficeSkipHiddenContent,
                    BinaryFiles = BinaryFilesSettings.dtsoIndexSkipBinary
                };
                dtOptions.Save();

                // Single source of truth for the output path (used by the converter and OutputFiles).
                string outputFile = this.FileToProcess + ".dts";

                FileConverter fileConverter = new FileConverter
                {
                    InputFile    = this.FileToProcess,
                    OutputFile   = outputFile,
                    OutputFormat = OutputFormats.it_ContentAsXml,
                    Flags        = ConvertFlags.dtsConvertInlineContainer
                };
                fileConverter.Execute();
                this.OutputFiles.Add(outputFile);
                retval.Value = WFState.WFStateSuccess;

                // Errors reported by the converter are logged but do not change the state.
                JobErrorInfo errorInfo = fileConverter.Errors;
                if (errorInfo != null)
                {
                    for (int i = 0; i < errorInfo.Count; i++)
                    {
                        // Code goes with the ErrorCode placeholder, Message with ErrorMessage
                        // (the arguments were previously passed in the opposite order).
                        SFWorkflow.WFLogger.NLogger.Error(string.Format("DTSearch Error: ErrorCode={0}  ErrorMessage={1}", errorInfo.Code(i), errorInfo.Message(i)));
                    }
                }
            }
            catch (Exception ex)
            {
                SFWorkflow.WFLogger.NLogger.ErrorException("ERROR: TextExtractor.Run", ex);
            }

            return(retval);
        }
Exemplo n.º 2
0
        /// <summary>
        /// Converts <c>FileToProcess</c> with the dtSearch <c>FileConverter</c> into
        /// "&lt;FileToProcess&gt;.xml" (XML content format) and wraps the outcome in a
        /// <c>PSTProcessingResult</c>.
        /// </summary>
        /// <returns>
        /// A result whose <c>Filename</c> is the ".xml" output path and whose
        /// <c>IsSuccessful</c> is true only when <c>Execute()</c> returned without throwing.
        /// Exceptions and converter errors are written to the console, not propagated.
        /// </returns>
        public PSTProcessingResult ProcessQueueObject()
        {
            string outputFile = this.FileToProcess + ".xml";
            bool   converted  = false;

            try
            {
                Options dtOptions = new Options();
                dtOptions.FieldFlags  = FieldFlags.dtsoFfOfficeSkipHiddenContent;
                dtOptions.BinaryFiles = BinaryFilesSettings.dtsoIndexSkipBinary;
                dtOptions.Save();

                FileConverter fileConverter = new FileConverter();
                fileConverter.InputFile    = this.FileToProcess;
                fileConverter.OutputFile   = outputFile;
                fileConverter.OutputFormat = OutputFormats.it_ContentAsXml;
                fileConverter.Flags        = ConvertFlags.dtsConvertInlineContainer;

                fileConverter.Execute();
                converted = true;

                // Converter-reported errors are only echoed; they do not affect the result flag.
                JobErrorInfo errorInfo = fileConverter.Errors;
                if (errorInfo != null)
                {
                    for (int i = 0; i < errorInfo.Count; i++)
                    {
                        Console.WriteLine("DTSearch Error: " + errorInfo.Code(i));
                    }
                }
            }
            catch (Exception ex)
            {
                // Swallowed on purpose so the queue keeps running; the failure is
                // surfaced through IsSuccessful instead of being reported as success.
                Console.WriteLine(ex.Message);
            }

            PSTProcessingResult result = new PSTProcessingResult()
            {
                IsSuccessful = converted, Filename = outputFile
            };

            result.SetProcessingObject <TextExtractor>(this);
            return(result);
        }
Exemplo n.º 3
0
        /// <summary>
        /// Polls a running job until its thread is done, mirroring the current indexing
        /// step into <paramref name="textBoxStatus"/> and, when the job's data source is a
        /// <c>DTTableSource</c>, showing its processed-record count and forwarding a
        /// pending stop request. After the job ends, any job errors are appended to the
        /// status text.
        /// </summary>
        /// <param name="job">The job to monitor.</param>
        /// <param name="textBoxStatus">Text box that receives the status text.</param>
        private void DoExecution(IndexJob job, TextBox textBoxStatus)
        {
            // Monitor the job execution thread as it progresses
            IndexProgressInfo status = new IndexProgressInfo();

            while (job.IsThreadDone(500, status) == false)
            {
                // Set the status text based on the current indexing step
                switch (status.Step)
                {
                case IndexingStep.ixStepBegin:
                    textBoxStatus.Text = "Opening index";
                    break;

                case IndexingStep.ixStepCheckingFiles:
                    textBoxStatus.Text = "Checking files";
                    break;

                case IndexingStep.ixStepCompressing:
                    textBoxStatus.Text = "Compressing index";
                    break;

                case IndexingStep.ixStepCreatingIndex:
                    textBoxStatus.Text = "Creating index";
                    break;

                case IndexingStep.ixStepDone:
                    textBoxStatus.Text = "Indexing Complete";
                    break;

                case IndexingStep.ixStepMerging:
                    textBoxStatus.Text = "Merging words into index";
                    break;

                case IndexingStep.ixStepReadingFiles:
                    textBoxStatus.Text = status.File.Name;
                    break;

                case IndexingStep.ixStepStoringWords:
                    textBoxStatus.Text = status.File.Name + " (storing words)";
                    break;

                // ixStepNone and any step not listed above show no status text.
                case IndexingStep.ixStepNone:
                default:
                    textBoxStatus.Text = string.Empty;
                    break;
                }

                // Let other form events be handled while we're looping
                Application.DoEvents();

                // Only applies when the job is indexing a DTTableSource. "as" yields null
                // for any other (or missing) data source instead of throwing
                // InvalidCastException as a hard cast would.
                DTTableSource ds = job.DataSourceToIndex as DTTableSource;
                if (ds != null)
                {
                    _textBoxProcessed.Text = ds.RecordProcessed.ToString();
                    if (_stopRequested)
                    {
                        ds.StopRequested = true;
                    }
                }
            }

            // If there were errors, display the errors as additions to the
            // status text
            JobErrorInfo err = job.Errors;

            if (err != null && err.Count > 0)
            {
                // Build the text once and assign it once rather than
                // re-assigning the control's Text for every error.
                string statusText = textBoxStatus.Text;
                for (int i = 0; i < err.Count; i++)
                {
                    statusText = statusText + " " + err.Message(i);
                }
                textBoxStatus.Text = statusText;
            }
        }
Exemplo n.º 4
0
        // called to extract text from a file: runs the dtSearch FileConverter to produce an
        // XML file, then walks that XML (via outputNode) to build the two output strings
        // passed -
        // docID - written as the "DocID: " line at the start of indexText
        // sourceFileName - full path of file to extract text from
        // title - when non-empty, written as a "Filename: " line in indexText
        // errObjs - list that receives an ErrorDataObject for every warning/error raised here
        // returns -
        // indexText - text that will be used for Lucene indexing, includes metadata
        // analysisText - text that will be used for clustering and LSA
        // errorFlag - false if an unrecoverable error occurred, else true
        // Error text is not returned separately; it is carried by the objects added to errObjs.
        // indexText and analysisText are reset to "" on entry and stay empty on every early return.
        public void extractText(string docID, string sourceFileName, string title, ref string indexText, ref string analysisText,
                                List <ErrorDataObject> errObjs, out bool errorFlag)
        {
            Options       dtOptions;
            FileConverter fileConverter;
            StringBuilder outStringIndex    = new StringBuilder();
            StringBuilder outStringAnalysis = new StringBuilder();

            // errorFlag == true means "no unrecoverable error" (see header)
            errorFlag    = true;
            indexText    = "";
            analysisText = "";

            try
            {
                // construct temporary file name for xml file output by dtSearch
                // NOTE(review): the path is fixed, so concurrent calls would presumably
                // overwrite each other's output - confirm this is only called serially
                string targetFileNameDTSearch = @"C:\temp\_DTSearch.txt";
                File.Delete(targetFileNameDTSearch);

                dtOptions             = new Options();
                dtOptions.FieldFlags  = FieldFlags.dtsoFfOfficeSkipHiddenContent;
                dtOptions.BinaryFiles = BinaryFilesSettings.dtsoIndexSkipBinary;
                dtOptions.Save();

                fileConverter              = new FileConverter();
                fileConverter.InputFile    = sourceFileName;
                fileConverter.OutputFile   = targetFileNameDTSearch;
                fileConverter.OutputFormat = OutputFormats.it_ContentAsXml;
                fileConverter.Flags        = ConvertFlags.dtsConvertInlineContainer;

                fileConverter.Execute();

                //check for image file type (recorded as a warning only; processing continues)
                TypeId deType = fileConverter.DetectedTypeId;
                if (imageTypes.Contains(deType))
                {
                    errObjs.Add(new ErrorDataObject("1002", "Image File Type: " + deType.ToString(), "Warning"));
                }

                // classify each dtSearch error: codes 9, 10, 16, 17, 21 and 207 are reported as
                // warnings; any other code is reported as an error and marks the run as fatal
                JobErrorInfo errorInfo           = fileConverter.Errors;
                bool         fatalError          = false;
                bool         fileMissingOrNoText = false;
                int          dtErCode            = 0;
                if (errorInfo != null && errorInfo.Count > 0)
                {
                    for (int i = 0; i < errorInfo.Count; i++)
                    {
                        dtErCode = errorInfo.Code(i);
                        string errorCode = "";
                        if (dtErCode != 9 && dtErCode != 10 && dtErCode != 17 && dtErCode != 207 && dtErCode != 16 && dtErCode != 21)
                        {
                            errObjs.Add(new ErrorDataObject("1005", "Text extraction Error occurred during processing of the document. " + errorInfo.Message(i), "Error"));
                            fatalError = true;
                        }
                        else
                        {
                            // map the dtSearch code to this component's own code and message
                            string errText = "";
                            if (dtErCode == 10)                                                 // dtsErFileCorrupt
                            {
                                errorCode = "1013";
                                errText   = "Document is corrupted.";
                            }
                            if (dtErCode == 17)                                                 // dtsErFileEncrypted
                            {
                                errorCode = "1007";
                                errText   = "A component of the document is encrypted.";
                            }
                            if (dtErCode == 207)                                        // dtsErContainerItemEncrypted, internal error code
                            {
                                errorCode = "1014";
                                errText   = "The document is encrypted.";
                                // if the dtSearch message contains "->", append the message from
                                // that point on and report a component as encrypted instead
                                string text = errorInfo.Message(i);
                                if (text != null)
                                {
                                    int index = text.IndexOf("->");
                                    if (index >= 0)
                                    {
                                        errText = "A component of the document is encrypted. " + text.Substring(index);
                                    }
                                }
                            }
                            if (dtErCode == 9)                                                  // dtsErAccFile
                            {
                                errorCode = "1010";
                                errText   = "The system cannot access the file specified.";
                            }
                            if (dtErCode == 16)                                                 // dtsErFileNotFound
                            {
                                errorCode = "1011";
                                errText   = "Document file does not exist.";
                            }
                            if (dtErCode == 21)                                                 // dtsErFileEmpty
                            {
                                errorCode = "1012";
                                errText   = "Document file is empty";
                            }

                            // every non-fatal code except 17 means there is no output to parse
                            if (dtErCode == 9 || dtErCode == 10 || dtErCode == 207 || dtErCode == 16 || dtErCode == 21)
                            {
                                fileMissingOrNoText = true;                                     // file missing, no text, corrupt or encrypted
                            }
                            // fallback to the dtSearch message; every code handled above
                            // already sets errText, so this only matters if a code is added
                            if (errText == "")
                            {
                                errText = errorInfo.Message(i);
                            }
                            errObjs.Add(new ErrorDataObject(errorCode, "Text extraction error: " + errText, "Warning"));
                        }
                    }
                }

                if (fatalError)
                {
                    errorFlag = false;
                    return;
                }
                else
                {
                    // warnings already recorded; outputs stay empty and errorFlag stays true
                    if (fileMissingOrNoText)
                    {
                        return;
                    }

                    // reaching here with errors means every reported code was 17, so
                    // dtErCode (the last code seen) is 17 exactly when any error was reported
                    if (dtErCode == 17)
                    {
                        // an empty output file means nothing could be extracted at all
                        FileInfo fi = new FileInfo(targetFileNameDTSearch);
                        if (fi.Length == 0)
                        {
                            // NOTE(review): Clear() empties the whole list, not just the
                            // "1007" entries - it also drops the "1002" image warning and
                            // anything the caller had put in errObjs; confirm that is intended
                            errObjs.Clear();                                    // remove error "1007"
                            errObjs.Add(new ErrorDataObject("1014", "Text extraction error: document is encrypted.", "Warning"));
                            return;
                        }
                    }

                    //load the dtSearch XML output file into an XML document
                    XmlDocument xmlDoc = new XmlDocument();
                    try
                    {
                        xmlDoc.Load(targetFileNameDTSearch);
                    }
                    catch
                    {
                        //try cleaning up the metadata tags and loading again
                        //(a second failure propagates to the outer catch and is reported as "1001")
                        cleanMetadataTags(targetFileNameDTSearch);
                        xmlDoc.Load(targetFileNameDTSearch);
                    }

                    //start with the document node
                    XmlNode docNode = xmlDoc.DocumentElement;

                    //initialize the output strings
                    outStringIndex.Length = 0;
                    outStringIndex.AppendLine("DocID: " + docID);
                    if (title != null && title.Length > 0)
                    {
                        outStringIndex.AppendLine("Filename: " + title);
                    }
                    outStringAnalysis.Length = 0;

                    //start outputting with the document node
                    outputNode(docNode, outStringIndex, outStringAnalysis, errObjs);

                    indexText    = outStringIndex.ToString();
                    analysisText = outStringAnalysis.ToString();

                    //signal error if no analysis text (a warning; errorFlag stays true)
                    if (analysisText.Length == 0)
                    {
                        errObjs.Add(new ErrorDataObject("1003", "No text to analyze", "Warning"));
                    }

                    return;
                }
            }
            catch (Exception ex)
            {
                // any exception (file I/O, dtSearch, XML parsing) is reported as an unrecoverable error
                errObjs.Add(new ErrorDataObject("1001", "Text extraction Error occurred during processing of the document. " + ex.Message, "Error"));
                errorFlag = false;
                return;
            }
        }