Beispiel #1
0
        /// <summary>
        /// Builds the term statistics for a collection of pages.
        /// </summary>
        /// <param name="pages">Maps each page name to the page's raw text.</param>
        /// <returns>
        /// A <see cref="SearchStats"/> containing, for every distinct term, its occurrence count on
        /// each page and the number of pages it appears on, together with the per-page term counts,
        /// their average, and the number of documents.
        /// </returns>
        public static SearchStats Build(Dictionary<string, string> pages)
        {
            // Page name -> terms produced by TextUtility.BuildTerms for that page's text.
            var pageTerms = pages
                .ToDictionary(x => x.Key, x => TextUtility.BuildTerms(x.Value));

            var allTerms = pageTerms.Values
                .SelectMany(x => x)
                .Distinct()
                .ToArray();

            // term -> (page name -> occurrences of the term on that page)
            var termsAtPagesPagesCount = allTerms.ToDictionary(x => x, x => new Dictionary<string, int>());
            // term -> number of pages that contain the term at least once
            var termsAtPagesUsed = allTerms.ToDictionary(x => x, x => 0);
            // page name -> total number of terms on the page (duplicates included)
            var pageWordCount = new Dictionary<string, int>();

            foreach (var (name, terms) in pageTerms)
            {
                foreach (var group in terms.GroupBy(x => x))
                {
                    termsAtPagesPagesCount[group.Key][name] = group.Count();
                    termsAtPagesUsed[group.Key] += 1;
                }

                pageWordCount[name] = terms.Length;
            }

            // With no pages the division would be 0.0 / 0 and yield NaN, which would poison every
            // later computation that reads the average; report 0 for an empty corpus instead.
            var averagePageWordCount = pageTerms.Count == 0
                ? 0.0
                : (double)pageWordCount.Values.Sum() / pageTerms.Count;

            return new SearchStats
            {
                // NOTE(review): this stores the number of pages, not the number of distinct terms
                // (allTerms.Length). Kept unchanged because its consumers are not visible here -
                // confirm which value is intended.
                TermsCount = pageTerms.Count,
                AveragePageWordCount = averagePageWordCount,
                PagesWordCount = pageWordCount,
                TermOnPageCount = termsAtPagesPagesCount,
                TermAtPagesCount = termsAtPagesUsed,
                DocumentsCount = pages.Count,
            };
        }
Beispiel #2
0
        /// <summary>
        /// Ranks pages for <paramref name="query"/> by summing <c>Tf(page, term) * Idf(term)</c>
        /// over every distinct query term that is present in the statistics.
        /// </summary>
        /// <param name="query">Raw query text; it is split into terms by <c>TextUtility.BuildTerms</c>.</param>
        /// <returns>At most 10 page name / score pairs, ordered by descending score.</returns>
        public KeyValuePair<string, double>[] Search(string query)
        {
            var scores = new Dictionary<string, double>();

            // Each query term counts once, and terms missing from the statistics are skipped.
            var knownTerms = TextUtility.BuildTerms(query)
                .Distinct()
                .Where(candidate => _stats.TermAtPagesCount.ContainsKey(candidate));

            foreach (var term in knownTerms)
            {
                foreach (var (page, _) in _stats.TermOnPageCount[term])
                {
                    // A page seen for the first time starts from 0 (the default of double).
                    scores.TryGetValue(page, out var scoreSoFar);
                    scores[page] = scoreSoFar + Tf(page, term) * Idf(term);
                }
            }

            var ranked = scores.OrderByDescending(entry => entry.Value);

            return ranked.Take(10).ToArray();
        }
Beispiel #3
0
        /// <summary>
        /// Ranks pages for <paramref name="query"/> with a BM25-style score that combines
        /// <c>Idf(term)</c>, the term's count on the page (normalised by the page length relative
        /// to the average page length) and the term's count in the query.
        /// </summary>
        /// <param name="query">Raw query text; it is split into terms by <c>TextUtility.BuildTerms</c>.</param>
        /// <returns>At most 10 page name / score pairs, ordered by descending score.</returns>
        public KeyValuePair<string, double>[] Search(string query)
        {
            // term -> number of times it occurs in the query
            var queryTermCounts = TextUtility.BuildTerms(query)
                .GroupBy(t => t)
                .ToDictionary(g => g.Key, g => g.Count());

            var scores = new Dictionary<string, double>();

            foreach (var queryEntry in queryTermCounts)
            {
                var term = queryEntry.Key;
                var countInQuery = queryEntry.Value;

                // Only terms known to the statistics can contribute to a score.
                if (_stats.TermAtPagesCount.ContainsKey(term))
                {
                    foreach (var (page, countOnPage) in _stats.TermOnPageCount[term])
                    {
                        // Length normalisation: pages longer than average get a larger k,
                        // which dampens the effect of the raw term count.
                        var lengthRatio = _stats.PagesWordCount[page] / _stats.AveragePageWordCount;
                        var k = _k1 * (1 - _b + _b * lengthRatio);

                        // A page seen for the first time starts from 0 (the default of double).
                        scores.TryGetValue(page, out var scoreSoFar);

                        scores[page] = scoreSoFar
                                       + Idf(term)
                                       * ((_k1 + 1) * countOnPage) / (k + countOnPage)
                                       * ((_k2 + 1) * countInQuery) / (_k2 + countInQuery);
                    }
                }
            }

            var ranked = scores.OrderByDescending(entry => entry.Value);

            return ranked.Take(10).ToArray();
        }
Beispiel #4
0
        /// <summary>
        /// Scores pages for <paramref name="query"/> with the same BM25-style formula as the
        /// dictionary-based search, but walks the index's per-term posting arrays through a
        /// priority queue so that all entries of one page are consumed together.
        /// </summary>
        /// <remarks>
        /// NOTE(review): the merge relies on <c>PriorityQueue&lt;T&gt;</c> yielding the smallest
        /// tuple first and on each posting array being ordered by page id; neither is visible in
        /// this method - confirm against the queue and index implementations.
        /// </remarks>
        /// <param name="query">Raw query text; it is split into terms by <c>TextUtility.BuildTerms</c>.</param>
        /// <returns>
        /// Page name / score pairs ordered by descending score; the result queue is trimmed to
        /// <c>_documentCount</c> entries while scoring.
        /// </returns>
        public KeyValuePair<string, double>[] Search(string query)
        {
            // term -> number of times it occurs in the query
            var queryTermCounts = TextUtility.BuildTerms(query)
                .GroupBy(t => t)
                .ToDictionary(g => g.Key, g => g.Count());

            // term -> index of the posting most recently taken from that term's posting array
            var cursor = queryTermCounts.ToDictionary(x => x.Key, x => 0);
            var pending = new PriorityQueue<(int Page, int Term, int Count)>();
            var best = new PriorityQueue<(double Score, int Page)>();

            // Seed the queue with the first posting of every query term the index knows.
            foreach (var queryTerm in queryTermCounts.Keys)
            {
                if (_index.TermIndexes.ContainsKey(queryTerm))
                {
                    var seedTermId = _index.TermIndexes[queryTerm];
                    var (seedPage, seedCount) = _index.TermIdAndPageIdToCount[seedTermId].First();

                    pending.Enqueue((seedPage, seedTermId, seedCount));
                }
            }

            while (pending.Count() > 0)
            {
                var currentPage = pending.Peek().Page;
                var currentPageName = _index.Pages[currentPage];

                var lengthRatio = _stats.PagesWordCount[currentPageName] / _stats.AveragePageWordCount;
                var k = _k1 * (1 - _b + _b * lengthRatio);
                var pageScore = 0.0;

                // Consume every queued posting that belongs to the current page.
                while (pending.Count() > 0 && pending.Peek().Page == currentPage)
                {
                    var posting = pending.Dequeue();
                    var termId = posting.Term;
                    var termName = _index.Terms[termId];
                    var countOnPage = posting.Count;
                    var countInQuery = queryTermCounts[termName];

                    pageScore += Idf(termName)
                                 * ((_k1 + 1) * countOnPage) / (k + countOnPage)
                                 * ((_k2 + 1) * countInQuery) / (_k2 + countInQuery);

                    // Advance this term's cursor and queue its next posting, if there is one.
                    var nextPosition = cursor[termName] + 1;
                    cursor[termName] = nextPosition;

                    var postings = _index.TermIdAndPageIdToCount[termId];

                    if (nextPosition < postings.Length)
                    {
                        var (nextPage, nextCount) = postings[nextPosition];

                        pending.Enqueue((nextPage, termId, nextCount));
                    }
                }

                best.Enqueue((pageScore, currentPage));

                // Keep no more than _documentCount candidates in the result queue.
                while (best.Count() > _documentCount)
                {
                    best.Dequeue();
                }
            }

            var ranked = new List<KeyValuePair<string, double>>();

            while (best.Count() > 0)
            {
                var candidate = best.Dequeue();

                ranked.Add(new KeyValuePair<string, double>(_index.Pages[candidate.Page], candidate.Score));
            }

            return ranked
                .OrderByDescending(entry => entry.Value)
                .ToArray();
        }