Beispiel #1
0
        /// <summary>
        /// Builds the index from id/text pairs: creates the trigram index over
        /// <paramref name="words"/>, derives the stop-word set from word frequencies,
        /// resolves words unknown to the trigram index through the fuzzy dictionary,
        /// and fills the term-to-ids map and the inverted index.
        /// </summary>
        /// <remarks>
        /// Side effect: writes two text files ("TrigramIndex.txt" and
        /// "__Misspellings_To_Words.txt") using relative paths.
        /// </remarks>
        /// <param name="idTextList">Pairs of an id and the text belonging to that id.</param>
        /// <param name="words">Words passed to the <c>TrigramIndex</c> constructor.</param>
        public FuzzyIndex(IEnumerable<Tuple<long, string>> idTextList, IEnumerable<string> words)
        {
            // Number of most frequent words that are treated as stop words.
            const int stopWordCount = 130;
            // Words shorter than this are dropped before indexing.
            const int minWordLength = 3;

            trigramIndex = new TrigramIndex(words);

            // Dump of the trigram table, most populated trigrams first.
            File.WriteAllLines("TrigramIndex.txt", trigramIndex.Trigrams.OrderByDescending(t => t.Value.Count).Select(t => t.Key + "\t" + String.Join(", ", t.Value.Select(id => trigramIndex.IdToWord[id]))));

            var idWordsList = idTextList.Select(idText => Tuple.Create(idText.Item1,
                                                                       idText.Item2.SplitInWordsAndStripHTML())).ToList();
            // NOTE(review): presumably populates wordFrequencies, which is read on the next line - confirm.
            FillWordFrequencies(idWordsList);
            stopWords = new HashSet<string>(wordFrequencies.OrderByDescending(kv => kv.Value).Take(stopWordCount).Select(kv => kv.Key));

            // Materialize the filtered words once: the sequences are enumerated both when
            // collecting unknown words and when filling termToIds, and a deferred query
            // would re-run the filter and Distinct on every pass.
            idWordsList = idWordsList.Select(idWords =>
                    Tuple.Create<long, IEnumerable<string>>(
                        idWords.Item1,
                        idWords.Item2.Where(w => !stopWords.Contains(w) && w.Length >= minWordLength).Distinct().ToList())
                    ).ToList();

            var unknownWords =
                idWordsList.SelectMany(idWords => idWords.Item2).Distinct().Where(w => !trigramIndex.ContainsWord(w)).ToList();

            var levensteinInfos = RetrieveLevensteinInfos(unknownWords);
            misspellingsIndex = GetMisspellingsIndex(levensteinInfos).ToDictionary(kv => kv.Key, kv => kv.Value);
            fuzzyDictionary = GetFuzzyDictionary(levensteinInfos);

            foreach (var idWords in idWordsList)
            {
                foreach (var word in idWords.Item2)
                {
                    // A word known to the trigram index is used as is; otherwise it is
                    // replaced by its fuzzy-dictionary entry, and skipped when there is none.
                    string dictionaryWord;
                    if (trigramIndex.ContainsWord(word))
                    {
                        dictionaryWord = word;
                    }
                    else if (!fuzzyDictionary.TryGetValue(word, out dictionaryWord) || dictionaryWord == null)
                    {
                        continue;
                    }

                    HashSet<long> ids;
                    if (!termToIds.TryGetValue(dictionaryWord, out ids))
                    {
                        ids = new HashSet<long>();
                        termToIds[dictionaryWord] = ids;
                    }
                    ids.Add(idWords.Item1);
                }
            }
            index = termToIds.Select(termId => new InvertedIndexUnit(termId.Key, termId.Value)).ToList();

            // Dump of misspellingsToWords, entries with the most values first.
            File.WriteAllLines("__Misspellings_To_Words.txt", misspellingsToWords.OrderByDescending(mw => mw.Value.Count).Select(mw => mw.Key.Item1 + "\t" + mw.Key.Item2 + "\t" + mw.Value.Count + "\t" + String.Join(", ", mw.Value)));
        }
Beispiel #2
0
 /// <summary>
 /// Creates a spell checker and stores the supplied trigram index in the
 /// <c>trigramIndex</c> instance field.
 /// </summary>
 /// <param name="trigramIndex">The trigram index assigned to this instance.</param>
 public SpellChecker(TrigramIndex trigramIndex)
 {
     this.trigramIndex = trigramIndex;
 }