Example #1
0
        /// <summary>
        ///     Register every token of a tokenized line in the full-text index, then drop the most frequent token
        ///     from the index when <c>NeedsCleanup()</c> reports that a cleanup is required
        /// </summary>
        /// <param name="line">tokenized line whose tokens are added to the index</param>
        /// <param name="lineIndex">index of the line; stored in the pointer registered for each token</param>
        /// <param name="primaryKey">primary key stored in the same pointer, next to the line index</param>
        private void IndexLine(TokenizedLine line, int lineIndex, KeyValue primaryKey)
        {
            var linePointer = new LinePointer(lineIndex, primaryKey);

            foreach (var token in line.Tokens)
            {
                if (PositionsByToken.TryGetValue(token, out var pointers))
                {
                    // a token that is already registered but has no positions is considered too frequent:
                    // nothing is indexed for it
                    if (pointers.Count == 0)
                    {
                        continue;
                    }
                }
                else
                {
                    // first time this token is seen: start a new set of positions for it
                    pointers = new HashSet<LinePointer>();
                    PositionsByToken[token] = pointers;
                }

                // Add returns false when this pointer was already registered for the token
                if (!pointers.Add(linePointer))
                {
                    continue;
                }

                Entries++;

                AddToSecondaryIndex(linePointer);
            }

            // Limit the size of the index: when a cleanup is needed, ignore the most frequent
            // (least discriminant) token. At most one token is ignored per call.
            if (!NeedsCleanup())
            {
                return;
            }

            string tokenToIgnore = null;
            var highestCount = 0;

            // strict comparison: among equally frequent tokens the first one enumerated is kept
            foreach (var entry in PositionsByToken)
            {
                var count = entry.Value.Count;

                if (count > highestCount)
                {
                    highestCount = count;
                    tokenToIgnore = entry.Key;
                }
            }

            Debug.Assert(tokenToIgnore != null);

            IgnoreToken(tokenToIgnore);

            // the positions of the ignored token are no longer counted as index entries
            Entries -= highestCount;

            IgnoredTokens++;
        }
Example #2
0
        /// <summary>
        ///     Compute a score bonus (a multiplier to be applied on the previously computed score) if the order of
        ///     tokens is preserved between the query and the found line. Exact sequences give a bigger bonus
        /// </summary>
        /// <remarks>
        ///     Each query token is matched to its first occurrence in the line; query tokens that do not appear in the
        ///     line are skipped. Every pair of consecutive matched tokens contributes one factor, chosen by comparing
        ///     the gap between the two tokens in the query with the gap between them in the line:
        ///     same gap: x100, one extra position in the line: x50, two extra positions: x30,
        ///     any other forward move in the line: x2, otherwise no bonus.
        /// </remarks>
        /// <param name="query">tokenized query</param>
        /// <param name="line">tokenized line that was found for the query</param>
        /// <returns>
        ///     the multiplier; 1 (no bonus) when fewer than two query tokens are found in the line, which includes
        ///     the case of a query without tokens
        /// </returns>
        public static double ComputeBonusIfOrderIsPreserved(TokenizedLine query, TokenizedLine line)
        {
            var queryTokens = query.Tokens;

            var lineTokens = line.Tokens;

            double scoreMultiplier = 1;

            // position (in the query and in the line) of the last query token that was found in the line
            var hasPreviousMatch = false;
            var previousQueryIndex = 0;
            var previousLineIndex = 0;

            var queryIndex = 0;

            foreach (var token in queryTokens)
            {
                // first occurrence of the query token in the line, -1 if it is absent
                var lineIndex = -1;

                for (var i = 0; i < lineTokens.Count; i++)
                {
                    if (lineTokens[i] == token)
                    {
                        lineIndex = i;
                        break;
                    }
                }

                if (lineIndex >= 0)
                {
                    if (hasPreviousMatch)
                    {
                        // only differences of positions matter, so no normalization of the line indexes is needed
                        var queryDistance = queryIndex - previousQueryIndex;
                        var lineDistance = lineIndex - previousLineIndex;
                        var extraGap = lineDistance - queryDistance;

                        if (extraGap == 0)
                        {
                            scoreMultiplier *= 100;
                        }
                        else if (extraGap == 1)
                        {
                            scoreMultiplier *= 50;
                        }
                        else if (extraGap == 2)
                        {
                            scoreMultiplier *= 30;
                        }
                        else if (lineDistance > 0) // still apply a bonus because order is preserved
                        {
                            scoreMultiplier *= 2;
                        }
                    }

                    hasPreviousMatch = true;
                    previousQueryIndex = queryIndex;
                    previousLineIndex = lineIndex;
                }

                queryIndex++;
            }

            return scoreMultiplier;
        }