Пример #1
0
        /// <summary>
        ///     Search strategy that favors the lines containing more than one token from the query.
        ///     Only non-deleted lines that collected more than one token hit are returned.
        /// </summary>
        /// <param name="query">the query text; it is handed to <c>OrderByFrequency</c> to get the tokens to look up</param>
        /// <returns>
        ///     score for each found pointer; pointers scoring at or below 1% of the best score are dropped
        /// </returns>
        private Dictionary <LinePointer, double> SameLineFind(string query)
        {
            // a result is kept only if its score is strictly above (best score / ScoreCutoffDivisor)
            const int ScoreCutoffDivisor = 100;

            var result = new Dictionary <LinePointer, double>();

            var orderedTokens = OrderByFrequency(query);

            if (orderedTokens.Count == 0)
            {
                // none of the tokens in the query was found
                return result;
            }

            Trace?.Trace("Same line strategy");

            var scores = new ScoreByPointer();

            var tokensByLine = new Dictionary <LinePointer, List <string> >();

            foreach (var tk in orderedTokens)
            {
                var positions = PositionsByToken[tk];

                // remember which tokens hit each non-deleted line
                foreach (var position in positions.Where(p => p.Deleted == false))
                {
                    if (!tokensByLine.TryGetValue(position, out var tokens))
                    {
                        tokens = new List <string>();
                        tokensByLine[position] = tokens;
                    }

                    tokens.Add(tk);
                }

                if (positions.Count == 0)
                {
                    // nothing to score (also avoids dividing by zero below)
                    continue;
                }

                // the fewer positions a token has, the more it weighs: log10(Entries / number of positions)
                var score = Math.Log10((double)Entries / positions.Count);
                foreach (var pointer in positions)
                {
                    scores[pointer] += score;
                }
            }

            // only lines with more than one token hit make it into the result, with their score boosted
            // NOTE(review): the boost is the same whatever the number of tokens found on the line - confirm this is intended
            foreach (var pair in tokensByLine.Where(p => p.Value.Count > 1))
            {
                result[pair.Key] = scores[pair.Key] * SameLineMultiplier;
            }

            if (result.Count == 0)
            {
                return result;
            }

            var maxScore = result.Max(p => p.Value);

            // NOTE(review): when the best score is 0 the strict comparison below removes every entry
            var cutoff = maxScore / ScoreCutoffDivisor;

            return result.Where(p => p.Value > cutoff).ToDictionary(p => p.Key, p => p.Value);
        }
Пример #2
0
        /// <summary>
        ///     Document-oriented search strategy: every non-deleted line pointer is scored per query token and the
        ///     score is boosted when several different query tokens occur in the same document (not necessarily on
        ///     the same line). A document matching a single token is still returned, without the boost.
        /// </summary>
        /// <param name="query">the query text; it is handed to <c>OrderByFrequency</c> to get the tokens to look up</param>
        /// <returns>score for each retained line pointer; pointers scoring at or below 1% of the best score are dropped</returns>
        private Dictionary <LinePointer, double> SameDocumentFind(string query)
        {
            var queryTokens = OrderByFrequency(query);

            Trace?.Trace("Same document strategy");

            var scoreOf = new ScoreByPointer();
            var matches = new HashSet <LinePointer>();
            var tokensPerDocument = new Dictionary <KeyValue, HashSet <string> >();

            foreach (var token in queryTokens)
            {
                var occurrences = PositionsByToken[token];

                // weight of the token: log10(Entries / number of positions recorded for it)
                var weight = Math.Log10((double)Entries / occurrences.Count);

                var liveOccurrences = occurrences.Where(p => !p.Deleted).ToList();

                foreach (var occurrence in liveOccurrences)
                {
                    var document = occurrence.PrimaryKey;

                    // collect the distinct tokens seen in each document (documents are keyed by primary key)
                    if (tokensPerDocument.TryGetValue(document, out var seenTokens))
                    {
                        seenTokens.Add(token);
                    }
                    else
                    {
                        tokensPerDocument[document] = new HashSet <string> { token };
                    }

                    scoreOf[occurrence] += weight;
                    matches.Add(occurrence);
                }
            }

            if (matches.Count == 0)
            {
                return new Dictionary <LinePointer, double>();
            }

            // boost the pointers that belong to a document containing several different tokens
            foreach (var match in matches)
            {
                var distinctTokens = tokensPerDocument[match.PrimaryKey].Count;
                if (distinctTokens <= 1)
                {
                    continue;
                }

                scoreOf[match] *= distinctTokens * SameDocumentMultiplier;
            }

            // keep only the pointers scoring strictly above one hundredth of the best score
            var cutoff = matches.Max(p => scoreOf[p]) / 100;

            return matches
                .Where(p => scoreOf[p] > cutoff)
                .ToDictionary(p => p, p => scoreOf[p]);
        }
Пример #3
0
        /// <summary>
        ///     Line-oriented search strategy: every non-deleted line holding a query token is scored, and lines on
        ///     which more than one token hit was recorded get an extra boost
        /// </summary>
        /// <param name="query">the query text; it is handed to <c>OrderByFrequency</c> to get the tokens to look up</param>
        /// <returns>for each found pointer, its score and the list of query tokens found on that line</returns>
        private Dictionary <LinePointer, LineResult> SameLineFind(string query)
        {
            var matches = new Dictionary <LinePointer, LineResult>();

            var queryTokens = OrderByFrequency(query);

            if (queryTokens.Count == 0)
            {
                // nothing from the query was found
                return matches;
            }

            Trace?.Trace("Same line strategy");

            var scoreOf = new ScoreByPointer();
            var tokensOnLine = new Dictionary <LinePointer, List <string> >();

            foreach (var token in queryTokens)
            {
                var occurrences = PositionsByToken[token];

                // record the token on every non-deleted line where it occurs
                foreach (var liveLine in occurrences.Where(p => !p.Deleted))
                {
                    if (tokensOnLine.TryGetValue(liveLine, out var alreadyFound))
                    {
                        alreadyFound.Add(token);
                    }
                    else
                    {
                        tokensOnLine[liveLine] = new List <string> { token };
                    }
                }

                if (occurrences.Count == 0)
                {
                    continue;
                }

                // a measure of the information content: log10(Entries / number of positions of the token)
                var weight = Math.Log10((double)Entries / occurrences.Count);

                foreach (var occurrence in occurrences)
                {
                    scoreOf[occurrence] += weight;
                }
            }

            foreach (var entry in tokensOnLine)
            {
                var pointer = entry.Key;
                var foundTokens = entry.Value;

                var lineScore = scoreOf[pointer];

                // the multiplier applies only if more than one token hit was recorded on the line
                if (foundTokens.Count > 1)
                {
                    lineScore *= SameLineMultiplier * foundTokens.Count;
                }

                scoreOf[pointer] = lineScore;

                matches[pointer] = new LineResult {
                    Score = lineScore, TokensFound = foundTokens
                };
            }

            return matches;
        }