/// <summary>
/// Tokenizes every input file and feeds each token sequence to the supplied
/// NGramIndexer, then asks the indexer to save its model.
/// </summary>
/// <param name="indexer">An initialized NGramIndexer that receives each sequence.</param>
/// <param name="inputFiles">The files to read. When null or empty, nothing is read and nothing is saved.</param>
/// <param name="knownWords">The set of "known" words, handed to the indexer together with every sequence (all other words are meant to be ignored).</param>
/// <param name="ignoreCase">Indicates whether letter casing should be ignored; forwarded to the tokenizer.</param>
public void CountNGrams(NGramIndexer indexer, IList<FileInfo> inputFiles, HashSet<string> knownWords, bool ignoreCase)
{
    // No input: leave the indexer untouched and skip saving.
    if (inputFiles == null || inputFiles.Count < 1)
    {
        return;
    }

    // Files are processed one at a time, in list order.
    foreach (FileInfo file in inputFiles)
    {
        var tokens = GetFileTokens(file.FullName, ignoreCase);
        indexer.ReadSequence(tokens, knownWords);
    }

    // Save once, after every file has been read.
    indexer.SaveModel();
}
/// <summary>
/// Releases the n-gram resources held by this instance.
/// </summary>
/// <param name="disposing">
/// When true, disposes <c>_ngramLucene</c> and clears both the <c>_ngramLucene</c> and
/// <c>_ngramIndexer</c> references. When false, nothing is done.
/// </param>
protected virtual void Dispose(bool disposing)
{
    if (!disposing)
    {
        return;
    }

    if (_ngramLucene != null)
    {
        _ngramLucene.Dispose();
        // Clear the reference so a repeated Dispose (or a later re-initialization)
        // never touches the already-disposed instance again.
        _ngramLucene = null;
    }

    _ngramIndexer = null;
}
/// <summary>
/// Builds an NGramIndexer for the currently selected language and counts n-grams
/// over that language's "*.txt" documents.
/// </summary>
/// <remarks>
/// At most NGramMaxFiles files from the top level of the documents directory are used.
/// An IOException raised anywhere in the process is written to the debug output and
/// otherwise ignored.
/// </remarks>
private void CountNGrams()
{
    string lang = _languageId;

    try
    {
        var ngramIndexer = new NGramIndexer(
            CommonFiles.Words(lang),
            CommonFiles.WordCounts(lang),
            CommonFiles.NGrams(lang),
            CommonFiles.NGramCounts(lang),
            CommonFiles.NGramsByWord(lang),
            CommonFiles.WordsByNGram(lang));
        ngramIndexer.Order = NGramOrder;

        var wordCounter = new WordCounter();

        // Only text files directly inside the documents directory, capped at NGramMaxFiles.
        var docsDirectory = new DirectoryInfo(CommonFiles.DocsPath(lang));
        var textFiles = docsDirectory.GetFiles("*.txt", SearchOption.TopDirectoryOnly);
        var selectedFiles = textFiles.Take(NGramMaxFiles).ToList();

        var known = new HashSet<string>(_translator.GetKnownWords());

        wordCounter.CountNGrams(ngramIndexer, selectedFiles, known, _ignoreCase);
    }
    catch (IOException ex)
    {
        // Best effort: log the failure and carry on without counts.
        Debug.WriteLine(ex.Message + ex.StackTrace);
    }
}
/// <summary>
/// (Re)creates the NGramLucene instance and the NGramIndexer for the current language.
/// </summary>
/// <remarks>
/// Any existing NGramLucene instance is disposed first. If creating the new instance
/// fails, the exception is written to the debug output and <c>_ngramLucene</c> is left
/// null; the NGramIndexer is still created afterwards.
/// </remarks>
public void Initialize()
{
    string lang = LanguageId;

    if (_ngramLucene != null)
    {
        _ngramLucene.Dispose();
        // Clear the field immediately: if the constructor below throws, the field
        // must not keep referencing an instance that has already been disposed.
        _ngramLucene = null;
    }

    try
    {
        _ngramLucene = new NGramLucene(CommonFiles.IndexBasePath(lang), false, IgnoreCase);
    }
    catch (Exception ex)
    {
        // Deliberate best effort: initialization continues without a Lucene index.
        Debug.WriteLine(ex.Message + ex.StackTrace);
    }

    // NOTE(review): Order is not assigned on this indexer here — confirm that the
    // indexer's default order is what is intended.
    _ngramIndexer = new NGramIndexer(
        CommonFiles.Words(lang),
        CommonFiles.WordCounts(lang),
        CommonFiles.NGrams(lang),
        CommonFiles.NGramCounts(lang),
        CommonFiles.NGramsByWord(lang),
        CommonFiles.WordsByNGram(lang));
}