ITextExtractor.Extract C# (CSharp) Code-Beispiele

Beispiel #1

0

Datei anzeigen

        /// <summary>
        /// Extracts the identifiers of a single file, splits each one, and returns the filtered splits.
        /// </summary>
        /// <param name="file">File to process; its extension selects the text extractor and its path is read from disk.</param>
        /// <returns>
        /// The splits of every identifier, passed through <c>Filter</c>, with null or empty entries removed.
        /// The sequence is a deferred LINQ query.
        /// </returns>
        /// <exception cref="NotSupportedException">No text extractor is registered for the file's extension.</exception>
        public IEnumerable <string> GetResult(IndexerFile file)
        {
            IndexerResult indexerResult = new IndexerResult();

            _configuration.Splitter.SetResultPhase(true);

            // One dictionary lookup instead of ContainsKey followed by the indexer.
            ITextExtractor textExtractor;
            if (!_textExtractors.TryGetValue(file.Extension, out textExtractor))
            {
                string message = "No extractor is defined for file extension: " + file.Extension + ".";
                throw new NotSupportedException(message);
            }

            string fileText = File.ReadAllText(file.Path);

            foreach (string identifier in textExtractor.Extract(fileText, _configuration.ExtractType))
            {
                try
                {
                    IdentifierSplitResult identifierSplitResult = new IdentifierSplitResult(identifier, file);
                    identifierSplitResult.Add(_configuration.Splitter.Split(identifier));
                    indexerResult.AddSplitResult(identifierSplitResult);
                }
                catch (Exception)
                {
                    // Best effort: an identifier whose split fails is left out of the result so the
                    // remaining identifiers of the file are still processed.
                }
            }

            return indexerResult.GetSplitResultList()
                                .SelectMany(x => x.Splits)
                                .Select(Filter)
                                .Where(x => !string.IsNullOrEmpty(x));
        }

Beispiel #2

0

Datei anzeigen

        /// <summary>
        /// Builds the <see cref="IndexedDocument"/> for a document entity. Text extraction is optional
        /// and is skipped by default, in which case the contents are null.
        /// </summary>
        /// <param name="document">The document entity to convert</param>
        /// <param name="extractText">True to extract the text found at the document's path; false to leave the contents null</param>
        /// <returns>The <see cref="IndexedDocument"/> populated from the document entity</returns>
        private IndexedDocument GetLuceneDocumentFromDocument(Document document, bool extractText = false)
        {
            // The id is copied from the uploaded document so that search results can be
            // mapped back to documents in our system.
            var indexed = new IndexedDocument
            {
                Title = document.Title,
                Id    = document.Id
            };

            // Only run the text extractor when the caller asked for it.
            indexed.Contents = extractText ? _textExtractor.Extract(document.Path).Text : null;

            // DateIndexed is the moment of indexing; DateCreated is the upload date of the document.
            indexed.DateIndexed = DateTime.Now;
            indexed.DateCreated = document.DateUploaded;

            return indexed;
        }

Beispiel #3

0

Datei anzeigen

        /// <summary>
        /// Feeds every identifier extracted from the configured files into the splitter's token dictionary.
        /// </summary>
        /// <remarks>
        /// For each file the user is asked whether to skip it when no extractor is registered for its
        /// extension, or when processing fails. Answering "no" aborts the run: a missing extractor raises
        /// <see cref="NotSupportedException"/>, any other failure is rethrown unchanged.
        /// </remarks>
        private void UpdateTokenDictionary()
        {
            _configuration.Splitter.SetResultPhase(false);

            // The counters are only used for progress notifications.
            int totalFileCount   = _configuration.FilesToScan.Count;
            int currentFileCount = 0;

            foreach (IndexerFile file in _configuration.FilesToScan)
            {
                // Set when the user refused to skip a file that has no extractor, so the catch block
                // rethrows immediately instead of asking a second, misleading "Error reading file" question.
                bool skipDeclined = false;

                try
                {
                    _configuration.NotificationHandler.UpdateStatus(NotificationType.AnalyzingFile, currentFileCount, totalFileCount, "Extracting file: " + file.Name);

                    // One dictionary lookup instead of ContainsKey followed by the indexer.
                    ITextExtractor textExtractor;
                    if (!_textExtractors.TryGetValue(file.Extension, out textExtractor))
                    {
                        string message = "No extractor is defined for file extension: " + file.Extension + ".  Do you want to skip this file?";
                        if (_configuration.NotificationHandler.GetYesNoAnswer(QuestionType.NoTextExtratorDefined, message))
                        {
                            continue;
                        }

                        // Previously execution fell through to the dictionary indexer and failed with
                        // a KeyNotFoundException; fail with an explicit, descriptive exception instead.
                        skipDeclined = true;
                        throw new NotSupportedException("No extractor is defined for file extension: " + file.Extension + ".");
                    }

                    string fileText = File.ReadAllText(file.Path);
                    foreach (string identifier in textExtractor.Extract(fileText))
                    {
                        _configuration.NotificationHandler.UpdateStatus(NotificationType.IdentifyingToken, currentFileCount, totalFileCount, "Analyzing token: " + identifier + " in file: " + file.Name);
                        _configuration.Splitter.UpdateTokenDictionary(identifier);
                    }
                }
                catch (Exception e)
                {
                    if (skipDeclined)
                    {
                        throw;
                    }

                    string additionalMessage = "Error reading file. " + Environment.NewLine + "File: " + file.Name + Environment.NewLine + "Message: " + e.Message + Environment.NewLine + "Do you want to skip this file?";
                    if (!_configuration.NotificationHandler.GetYesNoAnswer(QuestionType.ErrorReadingFile, additionalMessage))
                    {
                        throw;
                    }
                }
                finally
                {
                    // Runs for processed, skipped and failed files alike.
                    currentFileCount++;
                }
            }
        }

Beispiel #4

0

Datei anzeigen

        /// <summary>
        /// Indexes every "*.pdf" file located directly in the documents folder. Each file becomes one
        /// Lucene document with a "Path" field and a "Content" field holding the extracted text.
        /// </summary>
        /// <remarks>
        /// When the writer is opened in CREATE mode documents are added; otherwise an existing document
        /// with the same "Path" term is replaced. Changes are committed once, after all files are processed.
        /// </remarks>
        public void BuildIndex()
        {
            // The analyzer is disposable in Lucene.NET; the using block releases it after the writer
            // and the directory have been closed.
            using (var analyzer = new StandardAnalyzer(LuceneVersion.LUCENE_48))
            using (var indexDirectory = FSDirectory.Open(_indexPath))
            {
                var iwc = new IndexWriterConfig(LuceneVersion.LUCENE_48, analyzer)
                {
                    OpenMode = System.IO.Directory.Exists(_indexPath) ? OpenMode.CREATE_OR_APPEND : OpenMode.CREATE
                };

                using (var writer = new IndexWriter(indexDirectory, iwc))
                {
                    var pdfFiles = System.IO.Directory.GetFiles(_documentsPath, "*.pdf");
                    foreach (var pdfFile in pdfFiles)
                    {
                        var resultText = _pdfTextExtractor.Extract(pdfFile);

                        var document = new Document
                        {
                            new StringField("Path", pdfFile, Field.Store.YES),
                            new TextField("Content", resultText, Field.Store.YES)
                        };

                        if (writer.Config.OpenMode == OpenMode.CREATE)
                        {
                            writer.AddDocument(document);
                        }
                        else
                        {
                            // Replace any document already indexed under the same path.
                            writer.UpdateDocument(new Term("Path", pdfFile), document);
                        }
                    }

                    writer.Commit();
                }
            }
        }

Beispiel #5

0

Datei anzeigen

        /// <summary>
        /// Splits the identifiers of every configured file and builds the final indexer result.
        /// </summary>
        /// <remarks>
        /// For each file the user is asked whether to skip it when no extractor is registered for its
        /// extension, or when processing fails. Answering "no" aborts the run: a missing extractor raises
        /// <see cref="NotSupportedException"/>, any other failure is rethrown unchanged. After all files
        /// are processed the result is updated from the token dictionary, optionally stemmed, and filtered
        /// against the configured dictionary.
        /// </remarks>
        /// <returns>Indexer Result</returns>
        private IndexerResult GetResult()
        {
            IndexerResult indexerResult = new IndexerResult();

            _configuration.Splitter.SetResultPhase(true);

            // The counters are only used for progress notifications.
            int totalFileCount   = _configuration.FilesToScan.Count;
            int currentFileCount = 0;

            foreach (IndexerFile file in _configuration.FilesToScan)
            {
                // Set when the user refused to skip a file that has no extractor, so the catch block
                // rethrows immediately instead of asking a second, misleading "Error reading file" question.
                bool skipDeclined = false;

                try
                {
                    _configuration.NotificationHandler.UpdateStatus(NotificationType.ReadingFileForIdentifiers, currentFileCount, totalFileCount, "Extracting file identifier: " + file.Name);

                    // One dictionary lookup instead of ContainsKey followed by the indexer.
                    ITextExtractor textExtractor;
                    if (!_textExtractors.TryGetValue(file.Extension, out textExtractor))
                    {
                        string message = "No extractor is defined for file extension: " + file.Extension + ".  Do you want to skip this file?";
                        if (_configuration.NotificationHandler.GetYesNoAnswer(QuestionType.NoTextExtratorDefined, message))
                        {
                            continue;
                        }

                        // Previously execution fell through to the dictionary indexer and failed with
                        // a KeyNotFoundException; fail with an explicit, descriptive exception instead.
                        skipDeclined = true;
                        throw new NotSupportedException("No extractor is defined for file extension: " + file.Extension + ".");
                    }

                    string fileText = File.ReadAllText(file.Path);
                    foreach (string identifier in textExtractor.Extract(fileText, _configuration.ExtractType))
                    {
                        _configuration.NotificationHandler.UpdateStatus(NotificationType.Splitting, currentFileCount, totalFileCount, "Splitting token: " + identifier + " in file: " + file.Name);
                        IdentifierSplitResult identifierSplitResult = new IdentifierSplitResult(identifier, file);
                        identifierSplitResult.Add(_configuration.Splitter.Split(identifier));
                        indexerResult.AddSplitResult(identifierSplitResult);
                    }
                }
                catch (Exception e)
                {
                    if (skipDeclined)
                    {
                        throw;
                    }

                    string additionalMessage = "Error reading file. " + Environment.NewLine + "File: " + file.Name +
                                               Environment.NewLine + "Message: " + e.Message + Environment.NewLine +
                                               "Do you want to skip this file?";
                    if (!_configuration.NotificationHandler.GetYesNoAnswer(QuestionType.ErrorReadingFile, additionalMessage))
                    {
                        throw;
                    }
                }
                finally
                {
                    // Runs for processed, skipped and failed files alike.
                    currentFileCount++;
                }
            }

            // Merged-token, misspelled and stemmed word information is not available while the split
            // results are being added, so it is applied afterwards from the token dictionary.
            indexerResult.UpdateFromMergeToken(_tokenDictionary);
            indexerResult.UpdateFromMisspelled(_tokenDictionary);
            indexerResult.UpdateFromStemmed(_tokenDictionary);

            // Filter 3: stem every identified word. When the stemmed form differs and is itself a
            // dictionary word, the identified word is replaced by the stemmed word.
            if (_configuration.Stemmer != null)
            {
                // Snapshot of the keys taken before the loop, which calls a method that updates the result.
                List <string> dictionaryWordList     = indexerResult.GetDictionaryWordList().Keys.ToList();
                int           totalIdentifiedCount   = dictionaryWordList.Count;
                int           currentIdentifiedCount = 0;
                foreach (string identified in dictionaryWordList)
                {
                    currentIdentifiedCount++;
                    _configuration.NotificationHandler.UpdateStatus(NotificationType.Stemming, currentIdentifiedCount, totalIdentifiedCount, "Stemming: " + identified);
                    string stemmedText = _configuration.Stemmer.GetStemmedText(identified);
                    if (stemmedText != null && stemmedText != identified && _configuration.Dictionary.IsWord(stemmedText))
                    {
                        indexerResult.AddStemmedWordAndReplaceIdentified(identified, stemmedText);
                    }
                }
            }

            // Filter result
            indexerResult.RemoveFilterWordAndTokenResult(_configuration.Dictionary);

            _configuration.NotificationHandler.UpdateStatus(NotificationType.IndexingCompleted, 1, 1, "Indexing Completed");
            return indexerResult;
        }

C# (CSharp) ITextExtractor.Extract Beispiele