コード例 #1
0
ファイル: FuzzyIndex.cs プロジェクト: bellal89/FuzzyIndexer
        public FuzzyIndex(TextCollection collection, List <List <string> > words)
        {
            var wordToId = words.SelectMany((ws, i) => ws.Select(w => Tuple.Create(w, i))).ToDictionary(it => it.Item1,
                                                                                                        it => it.Item2);

            var stopWords =
                new HashSet <string>(
                    collection.GetFrequencyDistribution().OrderByDescending(kv => kv.Value).Take(130).Select(kv => kv.Key));
            var idWordsList = collection.GetCollectionByFilter(tokens => tokens.Distinct(),
                                                               w => !stopWords.Contains(w) && w.Length >= 3);

            var mispellings = new Mispellings(idWordsList.SelectMany(idWords => idWords.Item2), wordToId.Keys);

            foreach (var idWords in idWordsList)
            {
                foreach (var word in idWords.Item2)
                {
                    var dictionaryWord = mispellings.GetFixedOrNull(word);
                    if (dictionaryWord == null)
                    {
                        continue;
                    }
                    if (!termToIds.ContainsKey(dictionaryWord))
                    {
                        termToIds[dictionaryWord] = new HashSet <long>();
                    }
                    termToIds[dictionaryWord].Add(idWords.Item1);
                }
            }
            var synJoinedTermToId = termToIds.GroupBy(termId => words[wordToId[termId.Key]].First(),
                                                      (key, vals) => Tuple.Create(key, vals.Aggregate(new HashSet <long>(),
                                                                                                      (src, kv) => new HashSet <long>(src.Union(kv.Value)))));

            index = synJoinedTermToId.Select(termId => new InvertedIndexUnit(termId.Item1, termId.Item2)).ToList();
        }
コード例 #2
0
ファイル: FuzzyIndex.cs プロジェクト: bellal89/FuzzyIndexer
 public FuzzyIndex(TextCollection collection, IEnumerable <string> words)
     : this(collection, words.Select(w => new List <string> { w }).ToList())
 {
 }