Exemplo n.º 1
0
        /// <summary>
        /// Tokenizes each input file, feeds the resulting sequence to the given NGramIndexer,
        /// and finally asks the indexer to save its model.
        /// </summary>
        /// <param name="indexer">The NGramIndexer that receives every file's token sequence.</param>
        /// <param name="inputFiles">The files to read. When null or empty, the method does nothing.</param>
        /// <param name="knownWords">The set of "known" words, handed to the indexer together with each sequence.</param>
        /// <param name="ignoreCase">Forwarded to the tokenizer to indicate whether letter casing should be ignored.</param>
        public void CountNGrams(NGramIndexer indexer, IList<FileInfo> inputFiles, HashSet<string> knownWords, bool ignoreCase)
        {
            // Nothing to index: leave the indexer untouched and skip saving the model.
            if (inputFiles == null || inputFiles.Count == 0)
            {
                return;
            }

            // Each file is tokenized immediately before it is handed to the indexer.
            foreach (var file in inputFiles)
            {
                var tokens = GetFileTokens(file.FullName, ignoreCase);
                indexer.ReadSequence(tokens, knownWords);
            }

            indexer.SaveModel();
        }
Exemplo n.º 2
0
        /// <summary>
        /// Releases the n-gram resources held by this instance.
        /// </summary>
        /// <param name="disposing">
        /// True to dispose the Lucene n-gram object and drop the indexer reference;
        /// false to do nothing.
        /// </param>
        protected virtual void Dispose(bool disposing)
        {
            if (!disposing)
            {
                return;
            }

            if (_ngramLucene != null)
            {
                _ngramLucene.Dispose();

                // Clear the reference so a repeated Dispose call does not dispose the same
                // object a second time and no stale, already-disposed instance is kept around.
                _ngramLucene = null;
            }

            // Assigning null is harmless when the field is already null, so no check is needed.
            _ngramIndexer = null;
        }
Exemplo n.º 3
0
        /// <summary>
        /// Creates an n-gram indexer for the current language and runs a WordCounter over
        /// the text files found in that language's documents folder.
        /// </summary>
        private void CountNGrams()
        {
            try
            {
                string languageId = _languageId;

                var ngramIndexer = new NGramIndexer(
                    CommonFiles.Words(languageId),
                    CommonFiles.WordCounts(languageId),
                    CommonFiles.NGrams(languageId),
                    CommonFiles.NGramCounts(languageId),
                    CommonFiles.NGramsByWord(languageId),
                    CommonFiles.WordsByNGram(languageId))
                {
                    Order = NGramOrder
                };

                var wordCounter = new WordCounter();

                // Only *.txt files directly inside the documents folder, capped at NGramMaxFiles.
                var docsDirectory = new DirectoryInfo(CommonFiles.DocsPath(languageId));
                var textFiles = docsDirectory
                    .GetFiles("*.txt", SearchOption.TopDirectoryOnly)
                    .Take(NGramMaxFiles)
                    .ToList();

                var vocabulary = new HashSet<string>(_translator.GetKnownWords());

                wordCounter.CountNGrams(ngramIndexer, textFiles, vocabulary, _ignoreCase);
            }
            catch (IOException ioError)
            {
                // I/O failures are only written to the debug output; the method then returns normally.
                Debug.WriteLine(ioError.Message + ioError.StackTrace);
            }
        }
Exemplo n.º 4
0
        /// <summary>
        /// (Re)creates the Lucene n-gram object and the n-gram indexer for the current language.
        /// </summary>
        /// <remarks>
        /// Any existing Lucene n-gram object is disposed first. If creating the replacement fails,
        /// the exception is written to the debug output and the field is left null; the indexer
        /// is created regardless.
        /// </remarks>
        public void Initialize()
        {
            string lang = LanguageId;

            if (_ngramLucene != null)
            {
                _ngramLucene.Dispose();

                // Drop the disposed instance right away: if the constructor below throws, the
                // field must not keep pointing at an object that has already been disposed.
                _ngramLucene = null;
            }

            try
            {
                _ngramLucene = new NGramLucene(CommonFiles.IndexBasePath(lang), false, IgnoreCase);
            }
            catch (Exception ex)
            {
                // Deliberate best effort: the failure is logged and initialization continues.
                Debug.WriteLine(ex.Message + ex.StackTrace);
            }

            _ngramIndexer = new NGramIndexer(
                CommonFiles.Words(lang),
                CommonFiles.WordCounts(lang),
                CommonFiles.NGrams(lang),
                CommonFiles.NGramCounts(lang),
                CommonFiles.NGramsByWord(lang),
                CommonFiles.WordsByNGram(lang));
        }