// Polls a running index job until its worker thread reports completion,
// mirroring the current indexing step into textBoxStatus on every pass, and
// finally appends any job error messages to the status text.
//   job           - the job being executed; its progress and errors are read
//   textBoxStatus - text box that receives the step description and errors
// Side effects: pumps the message loop via Application.DoEvents(), updates
// _textBoxProcessed and forwards _stopRequested to the table data source.
private void DoExecution(IndexJob job, TextBox textBoxStatus)
{
    // Monitor the job execution thread as it progresses
    IndexProgressInfo status = new IndexProgressInfo();

    // NOTE(review): 500 is presumably a wait in milliseconds - confirm
    // against the dtSearch IsThreadDone documentation.
    while (job.IsThreadDone(500, status) == false)
    {
        // Set the status text based on the current indexing step
        switch (status.Step)
        {
            case IndexingStep.ixStepBegin:
                textBoxStatus.Text = "Opening index";
                break;
            case IndexingStep.ixStepCheckingFiles:
                textBoxStatus.Text = "Checking files";
                break;
            case IndexingStep.ixStepCompressing:
                textBoxStatus.Text = "Compressing index";
                break;
            case IndexingStep.ixStepCreatingIndex:
                textBoxStatus.Text = "Creating index";
                break;
            case IndexingStep.ixStepDone:
                textBoxStatus.Text = "Indexing Complete";
                break;
            case IndexingStep.ixStepMerging:
                textBoxStatus.Text = "Merging words into index";
                break;
            case IndexingStep.ixStepNone:
                textBoxStatus.Text = string.Empty;
                break;
            case IndexingStep.ixStepReadingFiles:
                textBoxStatus.Text = status.File.Name;
                break;
            case IndexingStep.ixStepStoringWords:
                textBoxStatus.Text = status.File.Name + " (storing words)";
                break;
            default:
                textBoxStatus.Text = string.Empty;
                break;
        }

        // Let other form events be handled while we're looping
        Application.DoEvents();

        // Only applies to jobs that index a DTTableSource. "as" yields null
        // for any other data source type instead of throwing
        // InvalidCastException, so the guard below actually protects us.
        DTTableSource ds = job.DataSourceToIndex as DTTableSource;
        if (ds != null)
        {
            _textBoxProcessed.Text = ds.RecordProcessed.ToString();
            if (_stopRequested)
            {
                ds.StopRequested = true;
            }
        }
    }

    // If there were errors, display the errors as additions to the
    // status text
    JobErrorInfo err = job.Errors;
    if (err != null)
    {
        for (int i = 0; i < err.Count; i++)
        {
            textBoxStatus.Text = textBoxStatus.Text + " " + err.Message(i);
        }
    }
}
// Converts data.DocumentToProcess to dtSearch "content as XML" output written
// next to the input as "<input>.dts", registers that file in
// data.OutputDocuments and records the outcome in data.WFState.
//   data - work item; DocumentToProcess is read, OutputDocuments and WFState
//          are updated
// Exceptions are caught and logged; in that case WFState stays at
// WFStateFail. Conversion errors reported by dtSearch are logged only.
public override void Process(TextExtractorData data)
{
    // Assume failure until the conversion has run without throwing.
    data.WFState.Value = WFState.WFStateFail;
    try
    {
        Options dtOptions = new Options
        {
            FieldFlags = FieldFlags.dtsoFfOfficeSkipHiddenContent,
            BinaryFiles = BinaryFilesSettings.dtsoIndexSkipBinary
        };
        dtOptions.Save();

        // Build the output path once so the converter and the output list
        // cannot drift apart.
        string outputFile = data.DocumentToProcess + ".dts";

        FileConverter fileConverter = new FileConverter
        {
            InputFile = data.DocumentToProcess,
            OutputFile = outputFile,
            OutputFormat = OutputFormats.it_ContentAsXml,
            Flags = ConvertFlags.dtsConvertInlineContainer
        };
        fileConverter.Execute();

        data.OutputDocuments.Add(outputFile);

        // NOTE(review): the state is marked successful before the converter's
        // error list is inspected, so reported errors never fail the item -
        // confirm this is intended.
        data.WFState.Value = WFState.WFStateSuccess;

        JobErrorInfo errorInfo = fileConverter.Errors;
        if (errorInfo != null && errorInfo.Count > 0)
        {
            for (int i = 0; i < errorInfo.Count; i++)
            {
                // Code goes into {0} and Message into {1}; the arguments were
                // previously swapped, mislabelling both values in the log.
                KRSrcWorkflow.WFLogger.NLogger.Error(
                    string.Format(
                        "DTSearch Error: ErrorCode={0} ErrorMessage={1}",
                        errorInfo.Code(i),
                        errorInfo.Message(i)));
            }
        }
    }
    catch (Exception ex)
    {
        KRSrcWorkflow.WFLogger.NLogger.ErrorException("ERROR: TextExtractor.Run", ex);
    }
}
// called to extract text from a file; the file is converted to XML by
// dtSearch and the XML tree is then walked to build the output text
// passed -
//   docID          - written as the "DocID: " line at the top of indexText
//   sourceFileName - full path of file to extract text from
//   title          - when non-empty, written as a "Filename: " line in indexText
//   errObjs        - list that receives the warnings and errors raised here
// returns -
//   indexText    - text that will be used for Lucene indexing, includes metadata
//   analysisText - text that will be used for clustering and LSA
//   errorFlag    - false if an unrecoverable error occurred, else true
// indexText and analysisText are left empty on every early return.
public void extractText(string docID, string sourceFileName, string title, ref string indexText, ref string analysisText, List<ErrorDataObject> errObjs, out bool errorFlag)
{
    StringBuilder indexBuilder = new StringBuilder();
    StringBuilder analysisBuilder = new StringBuilder();

    errorFlag = true;
    indexText = "";
    analysisText = "";

    try
    {
        // fixed scratch file that receives the xml output by dtSearch
        string xmlScratchPath = @"C:\temp\_DTSearch.txt";
        File.Delete(xmlScratchPath);

        Options engineOptions = new Options();
        engineOptions.FieldFlags = FieldFlags.dtsoFfOfficeSkipHiddenContent;
        engineOptions.BinaryFiles = BinaryFilesSettings.dtsoIndexSkipBinary;
        engineOptions.Save();

        FileConverter converter = new FileConverter();
        converter.InputFile = sourceFileName;
        converter.OutputFile = xmlScratchPath;
        converter.OutputFormat = OutputFormats.it_ContentAsXml;
        converter.Flags = ConvertFlags.dtsConvertInlineContainer;
        converter.Execute();

        // image file types only produce a warning; processing continues
        TypeId detectedType = converter.DetectedTypeId;
        if (imageTypes.Contains(detectedType))
        {
            errObjs.Add(new ErrorDataObject("1002", "Image File Type: " + detectedType.ToString(), "Warning"));
        }

        // classify every dtSearch error: the codes listed in the switch are
        // reported as warnings, anything else is fatal
        JobErrorInfo conversionErrors = converter.Errors;
        bool sawFatalError = false;
        bool nothingToParse = false;
        int lastCode = 0;

        if (conversionErrors != null)
        {
            for (int n = 0; n < conversionErrors.Count; n++)
            {
                lastCode = conversionErrors.Code(n);

                string warningCode;
                string warningText;

                switch (lastCode)
                {
                    case 9: // dtsErAccFile
                        warningCode = "1010";
                        warningText = "The system cannot access the file specified.";
                        break;

                    case 10: // dtsErFileCorrupt
                        warningCode = "1013";
                        warningText = "Document is corrupted.";
                        break;

                    case 16: // dtsErFileNotFound
                        warningCode = "1011";
                        warningText = "Document file does not exist.";
                        break;

                    case 17: // dtsErFileEncrypted
                        warningCode = "1007";
                        warningText = "A component of the document is encrypted.";
                        break;

                    case 21: // dtsErFileEmpty
                        warningCode = "1012";
                        warningText = "Document file is empty";
                        break;

                    case 207: // dtsErContainerItemEncrypted, internal error code
                        warningCode = "1014";
                        warningText = "The document is encrypted.";
                        string containerMessage = conversionErrors.Message(n);
                        if (containerMessage != null)
                        {
                            // keep the part of the message from "->" onwards
                            int arrowPos = containerMessage.IndexOf("->");
                            if (arrowPos >= 0)
                            {
                                warningText = "A component of the document is encrypted. " + containerMessage.Substring(arrowPos);
                            }
                        }
                        break;

                    default:
                        errObjs.Add(new ErrorDataObject("1005", "Text extraction Error occurred during processing of the document. " + conversionErrors.Message(n), "Error"));
                        sawFatalError = true;
                        continue;
                }

                // every non-fatal code except 17 means there is no output
                // worth parsing (file missing, no text, corrupt or encrypted)
                if (lastCode != 17)
                {
                    nothingToParse = true;
                }

                errObjs.Add(new ErrorDataObject(warningCode, "Text extraction error: " + warningText, "Warning"));
            }
        }

        if (sawFatalError)
        {
            errorFlag = false;
            return;
        }

        if (nothingToParse)
        {
            return;
        }

        // an encrypted component with empty converter output is reported as
        // a fully encrypted document instead
        if (lastCode == 17)
        {
            FileInfo scratchInfo = new FileInfo(xmlScratchPath);
            if (scratchInfo.Length == 0)
            {
                errObjs.Clear(); // remove error "1007"
                errObjs.Add(new ErrorDataObject("1014", "Text extraction error: document is encrypted.", "Warning"));
                return;
            }
        }

        // load the dtSearch XML output file into an XML document
        XmlDocument parsedOutput = new XmlDocument();
        try
        {
            parsedOutput.Load(xmlScratchPath);
        }
        catch
        {
            // try cleaning up the metadata tags and loading again
            cleanMetadataTags(xmlScratchPath);
            parsedOutput.Load(xmlScratchPath);
        }

        // header lines of the index text
        indexBuilder.AppendLine("DocID: " + docID);
        if (!string.IsNullOrEmpty(title))
        {
            indexBuilder.AppendLine("Filename: " + title);
        }

        // start outputting with the document node
        XmlNode rootNode = parsedOutput.DocumentElement;
        outputNode(rootNode, indexBuilder, analysisBuilder, errObjs);

        indexText = indexBuilder.ToString();
        analysisText = analysisBuilder.ToString();

        // signal error if no analysis text
        if (analysisText.Length == 0)
        {
            errObjs.Add(new ErrorDataObject("1003", "No text to analyze", "Warning"));
        }
    }
    catch (Exception ex)
    {
        errObjs.Add(new ErrorDataObject("1001", "Text extraction Error occurred during processing of the document. " + ex.Message, "Error"));
        errorFlag = false;
    }
}