/// <summary>
/// Extracts all identifiers from a single file, splits each one, and returns the
/// filtered, non-empty split fragments.
/// </summary>
/// <param name="file">The file whose identifiers should be extracted and split.</param>
/// <returns>Filtered split fragments from every identifier found in the file.</returns>
/// <exception cref="NotSupportedException">
/// Thrown when no text extractor is registered for the file's extension.
/// </exception>
public IEnumerable<string> GetResult(IndexerFile file)
{
    IndexerResult indexerResult = new IndexerResult();
    _configuration.Splitter.SetResultPhase(true);

    // TryGetValue avoids the ContainsKey + indexer double lookup.
    if (!_textExtractors.TryGetValue(file.Extension, out ITextExtractor textExtractor))
    {
        // More specific than the bare Exception originally thrown; callers
        // catching Exception still catch this.
        throw new NotSupportedException(
            "No extractor is defined for file extension: " + file.Extension + ".");
    }

    string fileText = File.ReadAllText(file.Path);
    foreach (string identifier in textExtractor.Extract(fileText, _configuration.ExtractType))
    {
        try
        {
            IdentifierSplitResult identifierSplitResult = new IdentifierSplitResult(identifier, file);
            identifierSplitResult.Add(_configuration.Splitter.Split(identifier));
            indexerResult.AddSplitResult(identifierSplitResult);
        }
        catch (Exception)
        {
            // Best-effort: one identifier failing to split skips that identifier
            // only, rather than aborting the whole file.
            continue;
        }
    }

    return indexerResult.GetSplitResultList()
                        .SelectMany(x => x.Splits)
                        .Select(Filter)
                        .Where(x => !string.IsNullOrEmpty(x));
}
/// <summary>
/// Builds an <see cref="IndexedDocument"/> (the Lucene representation) from an
/// entity <see cref="Document"/>. By default no text extraction is performed.
/// </summary>
/// <param name="document">The entity document object.</param>
/// <param name="extractText">When true, the file contents are extracted (via the Tika extractor) into the Lucene document.</param>
/// <returns><see cref="IndexedDocument"/> object extracted from the entity document object.</returns>
private IndexedDocument GetLuceneDocumentFromDocument(Document document, bool extractText = false)
{
    var luceneDocument = new IndexedDocument
    {
        Title = document.Title,
        // Reuse the uploaded document's id so Lucene results map straight back
        // to documents in our system.
        Id = document.Id,
        // Contents stay null unless the caller explicitly requested extraction.
        Contents = extractText ? _textExtractor.Extract(document.Path).Text : null,
        // Indexing timestamp; creation date mirrors when the document was
        // uploaded into the system.
        DateIndexed = DateTime.Now,
        DateCreated = document.DateUploaded,
    };

    return luceneDocument;
}
/// <summary>
/// First pass over every file in the scan set: extracts all identifiers and feeds
/// them into the splitter's token dictionary (result phase off). Files with no
/// registered extractor, or that fail to read, can be skipped interactively via
/// the notification handler; declining to skip a read error rethrows.
/// </summary>
private void UpdateTokenDictionary()
{
    _configuration.Splitter.SetResultPhase(false);
    // first create a dictionary for tokens
    // extracts all text from source code
    int totalFileCount = _configuration.FilesToScan.Count;
    int currentFileCount = 0;
    foreach (IndexerFile file in _configuration.FilesToScan)
    {
        try
        {
            _configuration.NotificationHandler.UpdateStatus(NotificationType.AnalyzingFile, currentFileCount, totalFileCount, "Extracting file: " + file.Name);
            if (!_textExtractors.ContainsKey(file.Extension))
            {
                string message = "No extractor is defined for file extension: " + file.Extension + ". Do you want to skip this file?";
                // Answering "yes" skips this file entirely.
                if (_configuration.NotificationHandler.GetYesNoAnswer(QuestionType.NoTextExtratorDefined, message))
                {
                    continue;
                }
                // NOTE(review): answering "no" falls through to the dictionary
                // lookup below, which throws for the missing key and lands in the
                // catch block (a second prompt) — confirm this is intentional.
            }
            ITextExtractor textExtractor = _textExtractors[file.Extension];
            string fileText = File.ReadAllText(file.Path);
            foreach (string identifier in textExtractor.Extract(fileText))
            {
                _configuration.NotificationHandler.UpdateStatus(NotificationType.IdentifyingToken, currentFileCount, totalFileCount, "Analyzing token: " + identifier + " in file: " + file.Name);
                _configuration.Splitter.UpdateTokenDictionary(identifier);
            }
        }
        catch (Exception e)
        {
            string additionalMessage = "Error reading file. " + Environment.NewLine + "File: " + file.Name + Environment.NewLine + "Message: " + e.Message + Environment.NewLine + "Do you want to skip this file?";
            // "No" aborts the whole pass by rethrowing; "yes" moves to the next file.
            if (!_configuration.NotificationHandler.GetYesNoAnswer(QuestionType.ErrorReadingFile, additionalMessage))
            {
                throw;
            }
        }
        finally
        {
            // Progress counter advances whether the file was processed, skipped, or failed.
            currentFileCount++;
        }
    }
}
/// <summary>
/// Builds (or incrementally updates) the Lucene index from every PDF under the
/// documents path: each PDF's text is extracted and stored together with its path.
/// </summary>
public void BuildIndex()
{
    // Decide the open mode BEFORE opening the FSDirectory, so any directory
    // creation performed while opening cannot influence the decision.
    var openMode = System.IO.Directory.Exists(_indexPath)
        ? OpenMode.CREATE_OR_APPEND
        : OpenMode.CREATE;

    // StandardAnalyzer is IDisposable in Lucene.NET 4.8 — the original leaked it.
    using (var analyzer = new StandardAnalyzer(LuceneVersion.LUCENE_48))
    using (var indexDirectory = FSDirectory.Open(_indexPath))
    {
        var iwc = new IndexWriterConfig(LuceneVersion.LUCENE_48, analyzer)
        {
            OpenMode = openMode
        };

        using (var writer = new IndexWriter(indexDirectory, iwc))
        {
            var pdfFiles = System.IO.Directory.GetFiles(_documentsPath, "*.pdf");
            foreach (var pdfFile in pdfFiles)
            {
                var resultText = _pdfTextExtractor.Extract(pdfFile);

                var document = new Document
                {
                    new StringField("Path", pdfFile, Field.Store.YES),
                    new TextField("Content", resultText, Field.Store.YES)
                };

                if (writer.Config.OpenMode == OpenMode.CREATE)
                {
                    // Fresh index: plain add, nothing to replace.
                    writer.AddDocument(document);
                }
                else
                {
                    // Existing index: replace any prior document for this path.
                    writer.UpdateDocument(new Term("Path", pdfFile), document);
                }
            }

            writer.Commit();
        }
    }
}
/// <summary>
/// Second pass (result phase): extracts identifiers from every scanned file,
/// splits them into an <see cref="IndexerResult"/>, merges token/misspelled/stemmed
/// information from the token dictionary, stems identified words back to dictionary
/// words where possible, and filters the final result.
/// </summary>
/// <returns>The fully populated and filtered Indexer Result.</returns>
private IndexerResult GetResult()
{
    IndexerResult indexerResult = new IndexerResult();
    _configuration.Splitter.SetResultPhase(true);

    // extract
    int totalFileCount = _configuration.FilesToScan.Count;
    int currentFileCount = 0;
    foreach (IndexerFile file in _configuration.FilesToScan)
    {
        try
        {
            _configuration.NotificationHandler.UpdateStatus(NotificationType.ReadingFileForIdentifiers, currentFileCount, totalFileCount, "Extracting file identifier: " + file.Name);
            if (!_textExtractors.ContainsKey(file.Extension))
            {
                string message = "No extractor is defined for file extension: " + file.Extension + ". Do you want to skip this file?";
                if (_configuration.NotificationHandler.GetYesNoAnswer(QuestionType.NoTextExtratorDefined, message))
                {
                    continue;
                }
            }
            ITextExtractor textExtractor = _textExtractors[file.Extension];
            string fileText = File.ReadAllText(file.Path);
            foreach (string identifier in textExtractor.Extract(fileText, _configuration.ExtractType))
            {
                _configuration.NotificationHandler.UpdateStatus(NotificationType.Splitting, currentFileCount, totalFileCount, "Splitting token: " + identifier + " in file: " + file.Name);
                IdentifierSplitResult identifierSplitResult = new IdentifierSplitResult(identifier, file);
                identifierSplitResult.Add(_configuration.Splitter.Split(identifier));
                indexerResult.AddSplitResult(identifierSplitResult);
            }
        }
        catch (Exception e)
        {
            // Fix: the original source had a raw line break inside this string
            // literal (invalid in a non-verbatim C# literal); the message now
            // matches the one used in UpdateTokenDictionary.
            string additionalMessage = "Error reading file. " + Environment.NewLine + "File: " + file.Name + Environment.NewLine + "Message: " + e.Message + Environment.NewLine + "Do you want to skip this file?";
            if (!_configuration.NotificationHandler.GetYesNoAnswer(QuestionType.ErrorReadingFile, additionalMessage))
            {
                throw;
            }
        }
        finally
        {
            // Progress counter advances whether the file was processed, skipped, or failed.
            currentFileCount++;
        }
    }

    // Since while adding result we do not have merged token, misspelled and
    // stemmed word info, filter them and add to respective list.
    indexerResult.UpdateFromMergeToken(_tokenDictionary);
    indexerResult.UpdateFromMisspelled(_tokenDictionary);
    indexerResult.UpdateFromStemmed(_tokenDictionary);

    // Filter 3: Stem every identified word. If the stemmed form is itself a
    // dictionary word, replace the identified word with the stemmed word.
    if (_configuration.Stemmer != null)
    {
        List<string> dictionaryWordList = indexerResult.GetDictionaryWordList().Keys.ToList();
        int totalIdentifiedCount = dictionaryWordList.Count;
        int currentIdentifiedCount = 0;
        foreach (string identified in dictionaryWordList)
        {
            currentIdentifiedCount++;
            _configuration.NotificationHandler.UpdateStatus(NotificationType.Stemming, currentIdentifiedCount, totalIdentifiedCount, "Stemming: " + identified);
            string stemmedText = _configuration.Stemmer.GetStemmedText(identified);
            if (stemmedText != null && stemmedText != identified && _configuration.Dictionary.IsWord(stemmedText))
            {
                indexerResult.AddStemmedWordAndReplaceIdentified(identified, stemmedText);
            }
        }
    }

    // Filter result
    indexerResult.RemoveFilterWordAndTokenResult(_configuration.Dictionary);
    _configuration.NotificationHandler.UpdateStatus(NotificationType.IndexingCompleted, 1, 1, "Indexing Completed");
    return indexerResult;
}