public static SearchStats Build(Dictionary<string, string> pages)
{
    var pageTerms = pages
        .ToDictionary(x => x.Key, x => TextUtility.BuildTerms(x.Value));

    var allTerms = pageTerms.Values
        .SelectMany(x => x)
        .Distinct()
        .ToArray();

    // For every term: how many times it occurs on each page.
    var termsAtPagesPagesCount = allTerms.ToDictionary(x => x, x => new Dictionary<string, int>());
    // For every term: on how many pages it occurs at least once (document frequency).
    var termsAtPagesUsed = allTerms.ToDictionary(x => x, x => 0);
    var pageWordCount = new Dictionary<string, int>();

    foreach (var (name, terms) in pageTerms)
    {
        foreach (var group in terms.GroupBy(x => x))
        {
            termsAtPagesPagesCount[group.Key][name] = group.Count();
            termsAtPagesUsed[group.Key] += 1;
        }

        pageWordCount[name] = terms.Length;
    }

    return new SearchStats
    {
        TermsCount = pageTerms.Count,
        AveragePageWordCount = (double)pageWordCount.Values.Sum() / pageTerms.Count,
        PagesWordCount = pageWordCount,
        TermOnPageCount = termsAtPagesPagesCount,
        TermAtPagesCount = termsAtPagesUsed,
        DocumentsCount = pages.Count,
    };
}
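A short usage sketch (the page names and texts below are made up for illustration):

// Hypothetical content: page name -> raw page text.
var pages = new Dictionary<string, string>
{
    ["getting-started"] = "how to start a static blog",
    ["search"] = "full text search with tf-idf and bm25",
};

var stats = SearchStats.Build(pages);
// stats now holds per-page word counts, per-term occurrence counts,
// document frequencies and the average page length used below.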
public KeyValuePair<string, double>[] Search(string query)
{
    var terms = TextUtility.BuildTerms(query);
    var result = new Dictionary<string, double>();

    foreach (var term in terms.Distinct())
    {
        if (!_stats.TermAtPagesCount.ContainsKey(term))
        {
            continue;
        }

        foreach (var (page, _) in _stats.TermOnPageCount[term])
        {
            if (!result.ContainsKey(page))
            {
                result[page] = 0;
            }

            result[page] += Tf(page, term) * Idf(term);
        }
    }

    return result
        .OrderByDescending(x => x.Value)
        .Take(10)
        .ToArray();
}
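The Tf and Idf helpers are not part of this snippet; a minimal sketch of the standard definitions over the SearchStats fields built above (an assumption, the project's actual formulas may differ):

// Assumed, textbook definitions; the real Tf/Idf in the project may differ.
private double Tf(string page, string term)
    => (double)_stats.TermOnPageCount[term][page] / _stats.PagesWordCount[page];

private double Idf(string term)
    => Math.Log((double)_stats.DocumentsCount / _stats.TermAtPagesCount[term]);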
public KeyValuePair<string, double>[] Search(string query)
{
    var terms = TextUtility.BuildTerms(query)
        .GroupBy(x => x)
        .ToDictionary(x => x.Key, x => x.Count());
    var result = new Dictionary<string, double>();

    foreach (var (term, termInQueryCount) in terms)
    {
        if (!_stats.TermAtPagesCount.ContainsKey(term))
        {
            continue;
        }

        foreach (var (page, termInPageCount) in _stats.TermOnPageCount[term])
        {
            if (!result.ContainsKey(page))
            {
                result[page] = 0;
            }

            // Document-length normalization: k grows for pages longer than average.
            var k = _k1 * (1 - _b + _b * (_stats.PagesWordCount[page] / _stats.AveragePageWordCount));

            result[page] += Idf(term)
                * ((_k1 + 1) * termInPageCount) / (k + termInPageCount)
                * ((_k2 + 1) * termInQueryCount) / (_k2 + termInQueryCount);
        }
    }

    return result
        .OrderByDescending(x => x.Value)
        .Take(10)
        .ToArray();
}
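Written out, the score this loop accumulates for a page $d$ and query $q$ is the BM25 weight extended with a query-frequency factor:

$$\operatorname{score}(d, q) = \sum_{t \in q} \operatorname{idf}(t)\cdot\frac{(k_1 + 1)\,f(t, d)}{K + f(t, d)}\cdot\frac{(k_2 + 1)\,f(t, q)}{k_2 + f(t, q)},\qquad K = k_1\left(1 - b + b\,\frac{|d|}{\mathit{avgdl}}\right)$$

where $f(t, d)$ is termInPageCount, $f(t, q)$ is termInQueryCount, $|d|$ is the page's word count and $\mathit{avgdl}$ the average page word count from SearchStats.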
public KeyValuePair<string, double>[] Search(string query)
{
    var terms = TextUtility.BuildTerms(query)
        .GroupBy(x => x)
        .ToDictionary(x => x.Key, x => x.Count());
    // Current read position in each term's posting list.
    var position = terms.ToDictionary(x => x.Key, x => 0);
    var heap = new PriorityQueue<(int Page, int Term, int Count)>();
    var resultHeap = new PriorityQueue<(double Score, int Page)>();

    // Seed the merge heap with the first posting of every query term.
    foreach (var (term, _) in terms)
    {
        if (!_index.TermIndexes.ContainsKey(term))
        {
            continue;
        }

        var termIndex = _index.TermIndexes[term];
        var (page, count) = _index.TermIdAndPageIdToCount[termIndex].First();
        heap.Enqueue((page, termIndex, count));
    }

    while (heap.Count() > 0)
    {
        var pageId = heap.Peek().Page;
        var pageName = _index.Pages[pageId];
        var k = _k1 * (1 - _b + _b * (_stats.PagesWordCount[pageName] / _stats.AveragePageWordCount));
        var score = 0.0;

        // Consume every posting that refers to the current page and accumulate its BM25 score.
        while (heap.Count() > 0 && heap.Peek().Page == pageId)
        {
            var (_, term, count) = heap.Dequeue();
            var termName = _index.Terms[term];
            var termInPageCount = count;
            var termInQueryCount = terms[termName];

            score += Idf(termName)
                * ((_k1 + 1) * termInPageCount) / (k + termInPageCount)
                * ((_k2 + 1) * termInQueryCount) / (_k2 + termInQueryCount);

            // Advance to the term's next posting, if any, and put it back on the heap.
            position[termName] += 1;
            var list = _index.TermIdAndPageIdToCount[term];
            if (position[termName] < list.Length)
            {
                var (newPage, newCount) = list[position[termName]];
                heap.Enqueue((newPage, term, newCount));
            }
        }

        // Keep only the best _documentCount pages: the min-heap drops the lowest score.
        resultHeap.Enqueue((score, pageId));
        while (resultHeap.Count() > _documentCount)
        {
            resultHeap.Dequeue();
        }
    }

    var result = new List<KeyValuePair<string, double>>();
    while (resultHeap.Count() > 0)
    {
        var (score, pageId) = resultHeap.Dequeue();
        result.Add(new KeyValuePair<string, double>(_index.Pages[pageId], score));
    }

    return result
        .OrderByDescending(x => x.Value)
        .ToArray();
}
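The PriorityQueue<T> type used above is not shown in this snippet. The code only needs a min-heap ordered by the tuple's default comparison, so the posting heap comes out sorted by Page first and the result heap by Score first. A minimal sketch on .NET 6+, assuming that is all the real type does:

// Assumption: a thin min-heap wrapper; the project's actual implementation may differ.
public class PriorityQueue<T>
{
    // The element doubles as its own priority, so tuples are ordered by their
    // default field-by-field comparison (Page, then Term, then Count).
    private readonly System.Collections.Generic.PriorityQueue<T, T> _inner = new();

    public int Count() => _inner.Count;
    public void Enqueue(T item) => _inner.Enqueue(item, item);
    public T Peek() => _inner.Peek();
    public T Dequeue() => _inner.Dequeue();
}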