/// <summary>
/// Extracts all identifiers from a single file, splits each one, and returns the
/// filtered, non-empty split fragments.
/// </summary>
/// <param name="file">The file whose identifiers should be extracted and split.</param>
/// <returns>Filtered split fragments from every identifier found in the file.</returns>
/// <exception cref="NotSupportedException">
/// Thrown when no text extractor is registered for the file's extension.
/// </exception>
public IEnumerable<string> GetResult(IndexerFile file)
{
    IndexerResult indexerResult = new IndexerResult();
    _configuration.Splitter.SetResultPhase(true);

    // TryGetValue avoids the ContainsKey + indexer double lookup.
    if (!_textExtractors.TryGetValue(file.Extension, out ITextExtractor textExtractor))
    {
        // More specific than the bare Exception originally thrown; callers
        // catching Exception still catch this.
        throw new NotSupportedException(
            "No extractor is defined for file extension: " + file.Extension + ".");
    }

    string fileText = File.ReadAllText(file.Path);
    foreach (string identifier in textExtractor.Extract(fileText, _configuration.ExtractType))
    {
        try
        {
            IdentifierSplitResult identifierSplitResult = new IdentifierSplitResult(identifier, file);
            identifierSplitResult.Add(_configuration.Splitter.Split(identifier));
            indexerResult.AddSplitResult(identifierSplitResult);
        }
        catch (Exception)
        {
            // Best-effort: one identifier failing to split skips that identifier
            // only, rather than aborting the whole file.
            continue;
        }
    }

    return indexerResult.GetSplitResultList()
                        .SelectMany(x => x.Splits)
                        .Select(Filter)
                        .Where(x => !string.IsNullOrEmpty(x));
}
/// <summary>
/// Builds an <see cref="IndexedDocument"/> (the Lucene representation) from an
/// entity <see cref="Document"/>. By default no text extraction is performed.
/// </summary>
/// <param name="document">The entity document object.</param>
/// <param name="extractText">When true, the file contents are extracted (via the Tika extractor) into the Lucene document.</param>
/// <returns><see cref="IndexedDocument"/> object extracted from the entity document object.</returns>
private IndexedDocument GetLuceneDocumentFromDocument(Document document, bool extractText = false)
{
    var luceneDocument = new IndexedDocument
    {
        Title = document.Title,
        // Reuse the uploaded document's id so Lucene results map straight back
        // to documents in our system.
        Id = document.Id,
        // Contents stay null unless the caller explicitly requested extraction.
        Contents = extractText ? _textExtractor.Extract(document.Path).Text : null,
        // Indexing timestamp; creation date mirrors when the document was
        // uploaded into the system.
        DateIndexed = DateTime.Now,
        DateCreated = document.DateUploaded,
    };

    return luceneDocument;
}
/// <summary>
/// First pass over every file in the scan set: extracts all identifiers and feeds
/// them into the splitter's token dictionary (result phase off). Files with no
/// registered extractor, or that fail to read, can be skipped interactively via
/// the notification handler; declining to skip a read error rethrows.
/// </summary>
private void UpdateTokenDictionary()
{
    _configuration.Splitter.SetResultPhase(false);
    // first create a dictionary for tokens
    // extracts all text from source code
    int totalFileCount = _configuration.FilesToScan.Count;
    int currentFileCount = 0;
    foreach (IndexerFile file in _configuration.FilesToScan)
    {
        try
        {
            _configuration.NotificationHandler.UpdateStatus(NotificationType.AnalyzingFile, currentFileCount, totalFileCount, "Extracting file: " + file.Name);
            if (!_textExtractors.ContainsKey(file.Extension))
            {
                string message = "No extractor is defined for file extension: " + file.Extension + ". Do you want to skip this file?";
                // Answering "yes" skips this file entirely.
                if (_configuration.NotificationHandler.GetYesNoAnswer(QuestionType.NoTextExtratorDefined, message))
                {
                    continue;
                }
                // NOTE(review): answering "no" falls through to the dictionary
                // lookup below, which throws for the missing key and lands in the
                // catch block (a second prompt) — confirm this is intentional.
            }
            ITextExtractor textExtractor = _textExtractors[file.Extension];
            string fileText = File.ReadAllText(file.Path);
            foreach (string identifier in textExtractor.Extract(fileText))
            {
                _configuration.NotificationHandler.UpdateStatus(NotificationType.IdentifyingToken, currentFileCount, totalFileCount, "Analyzing token: " + identifier + " in file: " + file.Name);
                _configuration.Splitter.UpdateTokenDictionary(identifier);
            }
        }
        catch (Exception e)
        {
            string additionalMessage = "Error reading file. " + Environment.NewLine + "File: " + file.Name + Environment.NewLine + "Message: " + e.Message + Environment.NewLine + "Do you want to skip this file?";
            // "No" aborts the whole pass by rethrowing; "yes" moves to the next file.
            if (!_configuration.NotificationHandler.GetYesNoAnswer(QuestionType.ErrorReadingFile, additionalMessage))
            {
                throw;
            }
        }
        finally
        {
            // Progress counter advances whether the file was processed, skipped, or failed.
            currentFileCount++;
        }
    }
}
/// <summary>
/// Builds (or incrementally updates) the Lucene index from every PDF under the
/// documents path: each PDF's text is extracted and stored together with its path.
/// </summary>
public void BuildIndex()
{
    // Decide the open mode BEFORE opening the FSDirectory, so any directory
    // creation performed while opening cannot influence the decision.
    var openMode = System.IO.Directory.Exists(_indexPath)
        ? OpenMode.CREATE_OR_APPEND
        : OpenMode.CREATE;

    // StandardAnalyzer is IDisposable in Lucene.NET 4.8 — the original leaked it.
    using (var analyzer = new StandardAnalyzer(LuceneVersion.LUCENE_48))
    using (var indexDirectory = FSDirectory.Open(_indexPath))
    {
        var iwc = new IndexWriterConfig(LuceneVersion.LUCENE_48, analyzer)
        {
            OpenMode = openMode
        };

        using (var writer = new IndexWriter(indexDirectory, iwc))
        {
            var pdfFiles = System.IO.Directory.GetFiles(_documentsPath, "*.pdf");
            foreach (var pdfFile in pdfFiles)
            {
                var resultText = _pdfTextExtractor.Extract(pdfFile);

                var document = new Document
                {
                    new StringField("Path", pdfFile, Field.Store.YES),
                    new TextField("Content", resultText, Field.Store.YES)
                };

                if (writer.Config.OpenMode == OpenMode.CREATE)
                {
                    // Fresh index: plain add, nothing to replace.
                    writer.AddDocument(document);
                }
                else
                {
                    // Existing index: replace any prior document for this path.
                    writer.UpdateDocument(new Term("Path", pdfFile), document);
                }
            }

            writer.Commit();
        }
    }
}
/// <summary>
/// Second pass (result phase): extracts identifiers from every scanned file,
/// splits them into an <see cref="IndexerResult"/>, merges token/misspelled/stemmed
/// information from the token dictionary, stems identified words back to dictionary
/// words where possible, and filters the final result.
/// </summary>
/// <returns>The fully populated and filtered Indexer Result.</returns>
private IndexerResult GetResult()
{
    IndexerResult indexerResult = new IndexerResult();
    _configuration.Splitter.SetResultPhase(true);

    // extract
    int totalFileCount = _configuration.FilesToScan.Count;
    int currentFileCount = 0;
    foreach (IndexerFile file in _configuration.FilesToScan)
    {
        try
        {
            _configuration.NotificationHandler.UpdateStatus(NotificationType.ReadingFileForIdentifiers, currentFileCount, totalFileCount, "Extracting file identifier: " + file.Name);
            if (!_textExtractors.ContainsKey(file.Extension))
            {
                string message = "No extractor is defined for file extension: " + file.Extension + ". Do you want to skip this file?";
                if (_configuration.NotificationHandler.GetYesNoAnswer(QuestionType.NoTextExtratorDefined, message))
                {
                    continue;
                }
            }
            ITextExtractor textExtractor = _textExtractors[file.Extension];
            string fileText = File.ReadAllText(file.Path);
            foreach (string identifier in textExtractor.Extract(fileText, _configuration.ExtractType))
            {
                _configuration.NotificationHandler.UpdateStatus(NotificationType.Splitting, currentFileCount, totalFileCount, "Splitting token: " + identifier + " in file: " + file.Name);
                IdentifierSplitResult identifierSplitResult = new IdentifierSplitResult(identifier, file);
                identifierSplitResult.Add(_configuration.Splitter.Split(identifier));
                indexerResult.AddSplitResult(identifierSplitResult);
            }
        }
        catch (Exception e)
        {
            // Fix: the original source had a raw line break inside this string
            // literal (invalid in a non-verbatim C# literal); the message now
            // matches the one used in UpdateTokenDictionary.
            string additionalMessage = "Error reading file. " + Environment.NewLine + "File: " + file.Name + Environment.NewLine + "Message: " + e.Message + Environment.NewLine + "Do you want to skip this file?";
            if (!_configuration.NotificationHandler.GetYesNoAnswer(QuestionType.ErrorReadingFile, additionalMessage))
            {
                throw;
            }
        }
        finally
        {
            // Progress counter advances whether the file was processed, skipped, or failed.
            currentFileCount++;
        }
    }

    // Since while adding result we do not have merged token, misspelled and
    // stemmed word info, filter them and add to respective list.
    indexerResult.UpdateFromMergeToken(_tokenDictionary);
    indexerResult.UpdateFromMisspelled(_tokenDictionary);
    indexerResult.UpdateFromStemmed(_tokenDictionary);

    // Filter 3: Stem every identified word. If the stemmed form is itself a
    // dictionary word, replace the identified word with the stemmed word.
    if (_configuration.Stemmer != null)
    {
        List<string> dictionaryWordList = indexerResult.GetDictionaryWordList().Keys.ToList();
        int totalIdentifiedCount = dictionaryWordList.Count;
        int currentIdentifiedCount = 0;
        foreach (string identified in dictionaryWordList)
        {
            currentIdentifiedCount++;
            _configuration.NotificationHandler.UpdateStatus(NotificationType.Stemming, currentIdentifiedCount, totalIdentifiedCount, "Stemming: " + identified);
            string stemmedText = _configuration.Stemmer.GetStemmedText(identified);
            if (stemmedText != null && stemmedText != identified && _configuration.Dictionary.IsWord(stemmedText))
            {
                indexerResult.AddStemmedWordAndReplaceIdentified(identified, stemmedText);
            }
        }
    }

    // Filter result
    indexerResult.RemoveFilterWordAndTokenResult(_configuration.Dictionary);
    _configuration.NotificationHandler.UpdateStatus(NotificationType.IndexingCompleted, 1, 1, "Indexing Completed");
    return indexerResult;
}