/// <summary>
/// Builds the inverted index (<c>index</c>) that maps dictionary terms to the ids of the
/// texts containing them, resolving words unknown to the trigram index through the fuzzy
/// dictionary.
/// </summary>
/// <remarks>
/// Side effects: writes two diagnostic dumps, "TrigramIndex.txt" and
/// "__Misspellings_To_Words.txt", using relative paths (i.e. into the current working directory).
/// </remarks>
/// <param name="idTextList">Pairs of (id, text). Each text is split with <c>SplitInWordsAndStripHTML</c>.</param>
/// <param name="words">Words handed to the <c>TrigramIndex</c> constructor.</param>
/// <exception cref="ArgumentNullException"><paramref name="idTextList"/> is null.</exception>
public FuzzyIndex(IEnumerable<Tuple<long, string>> idTextList, IEnumerable<string> words)
{
    // Fail before any diagnostic file is written. Enumerable.Select would throw the same
    // exception type later on, but only after "TrigramIndex.txt" had already been produced.
    if (idTextList == null)
    {
        throw new ArgumentNullException("idTextList");
    }

    // The most frequent words are treated as stop words and excluded from the index.
    const int stopWordCount = 130;
    // Words shorter than this are not indexed.
    const int minWordLength = 3;

    trigramIndex = new TrigramIndex(words);

    // Diagnostic dump: one line per trigram, largest posting list first.
    File.WriteAllLines(
        "TrigramIndex.txt",
        trigramIndex.Trigrams
            .OrderByDescending(t => t.Value.Count)
            .Select(t => t.Key + "\t" + String.Join(", ", t.Value.Select(id => trigramIndex.IdToWord[id]))));

    var idWordsList = idTextList
        .Select(idText => Tuple.Create(idText.Item1, idText.Item2.SplitInWordsAndStripHTML()))
        .ToList();

    // NOTE(review): wordFrequencies is presumably populated by FillWordFrequencies - confirm.
    FillWordFrequencies(idWordsList);
    stopWords = new HashSet<string>(
        wordFrequencies
            .OrderByDescending(kv => kv.Value)
            .Take(stopWordCount)
            .Select(kv => kv.Key));

    // Materialize the filtered, de-duplicated words once per text. They are enumerated twice
    // below (unknown-word collection and index building); a deferred query would re-run the
    // filter and re-enumerate the split text each time.
    var filteredIdWords = idWordsList
        .Select(idWords => Tuple.Create(
            idWords.Item1,
            idWords.Item2
                .Where(w => !stopWords.Contains(w) && w.Length >= minWordLength)
                .Distinct()
                .ToList()))
        .ToList();

    var unknownWords = filteredIdWords
        .SelectMany(idWords => idWords.Item2)
        .Distinct()
        .Where(w => !trigramIndex.ContainsWord(w))
        .ToList();

    var levensteinInfos = RetrieveLevensteinInfos(unknownWords);
    misspellingsIndex = GetMisspellingsIndex(levensteinInfos).ToDictionary(kv => kv.Key, kv => kv.Value);
    fuzzyDictionary = GetFuzzyDictionary(levensteinInfos);

    foreach (var idWords in filteredIdWords)
    {
        foreach (var word in idWords.Item2)
        {
            // A word known to the trigram index is its own term; otherwise use the fuzzy
            // dictionary entry. Words with no entry (or a null entry) are skipped.
            string dictionaryWord;
            if (trigramIndex.ContainsWord(word))
            {
                dictionaryWord = word;
            }
            else if (!fuzzyDictionary.TryGetValue(word, out dictionaryWord) || dictionaryWord == null)
            {
                continue;
            }

            if (!termToIds.ContainsKey(dictionaryWord))
            {
                termToIds[dictionaryWord] = new HashSet<long>();
            }

            termToIds[dictionaryWord].Add(idWords.Item1);
        }
    }

    index = termToIds.Select(termId => new InvertedIndexUnit(termId.Key, termId.Value)).ToList();

    // Diagnostic dump of misspellingsToWords, largest entries first.
    // NOTE(review): misspellingsToWords is not assigned in this constructor; presumably it is
    // filled by one of the helpers called above - confirm.
    File.WriteAllLines(
        "__Misspellings_To_Words.txt",
        misspellingsToWords
            .OrderByDescending(mw => mw.Value.Count)
            .Select(mw => mw.Key.Item1 + "\t" + mw.Key.Item2 + "\t" + mw.Value.Count + "\t" + String.Join(", ", mw.Value)));
}
/// <summary>
/// Creates a spell checker and stores the supplied trigram index in the
/// <c>trigramIndex</c> field.
/// </summary>
/// <param name="trigramIndex">Index to store; the reference is kept as given, with no null check or copy.</param>
public SpellChecker(TrigramIndex trigramIndex)
{
    this.trigramIndex = trigramIndex;
}