/// <summary>
///     Register every token of a tokenized line in the full-text index, then, if the index has grown
///     too large, drop the single most frequent (least discriminant) token from it
/// </summary>
/// <param name="line">the tokenized line whose tokens are indexed</param>
/// <param name="lineIndex">index of the line, stored in the pointer created for it</param>
/// <param name="primaryKey">primary key stored in the pointer alongside the line index</param>
private void IndexLine(TokenizedLine line, int lineIndex, KeyValue primaryKey)
{
    var linePointer = new LinePointer(lineIndex, primaryKey);

    foreach (var token in line.Tokens)
    {
        if (PositionsByToken.TryGetValue(token, out var pointers))
        {
            // A token that is already known but has an empty position set is treated as
            // "too frequent": nothing is indexed for it
            if (pointers.Count == 0)
            {
                continue;
            }
        }
        else
        {
            // First time this token is seen: start a new position set for it
            pointers = new HashSet<LinePointer>();
            PositionsByToken[token] = pointers;
        }

        // Only count the entry (and feed the secondary index) if the pointer was not already present
        if (!pointers.Add(linePointer))
        {
            continue;
        }

        Entries += 1;
        AddToSecondaryIndex(linePointer);
    }

    // Remove the most frequent (less discriminant) token from the index if the index is too big.
    // Limit the entries in the index: try to limit to MaxCapacity but without removing more than
    // MaxTokensToIgnore tokens
    if (!NeedsCleanup())
    {
        return;
    }

    string tokenToIgnore = null;
    var highestCount = 0;

    foreach (var entry in PositionsByToken)
    {
        var count = entry.Value.Count;

        // Strict comparison: on equal counts the first token encountered is kept
        if (count > highestCount)
        {
            tokenToIgnore = entry.Key;
            highestCount = count;
        }
    }

    Debug.Assert(tokenToIgnore != null);

    IgnoreToken(tokenToIgnore);

    // The positions of the ignored token no longer count as index entries
    Entries -= highestCount;
    IgnoredTokens++;
}
/// <summary>
///     Compute a score bonus (a multiplier to be applied on the previously computed score) if the order of
///     tokens is preserved between the query and the found line. Exact sequences give a bigger bonus
/// </summary>
/// <param name="query">the tokenized query</param>
/// <param name="line">the tokenized line that matched the query</param>
/// <returns>
///     a multiplier greater than or equal to 1; exactly 1 when fewer than two query tokens are found in
///     the line (including the case of an empty query)
/// </returns>
public static double ComputeBonusIfOrderIsPreserved(TokenizedLine query, TokenizedLine line)
{
    var queryTokens = query.Tokens;
    var lineTokens = line.Tokens;

    // index in query --> index of the first occurrence of the same token in line.
    // Query tokens that are absent from the line are simply not recorded
    var matches = new List<KeyValuePair<int, int>>();

    var queryIndex = 0;
    foreach (var token in queryTokens)
    {
        for (var i = 0; i < lineTokens.Count; i++)
        {
            if (lineTokens[i] == token)
            {
                matches.Add(new KeyValuePair<int, int>(queryIndex, i));
                break;
            }
        }

        queryIndex++;
    }

    // Only the differences between consecutive positions matter below, so the positions do not
    // need to be rebased to zero (and an empty query no longer fails on an empty-sequence Min)
    double scoreMultiplier = 1;

    for (var i = 1; i < matches.Count; i++)
    {
        // distance between two consecutive matched tokens, in the query and in the line
        var queryDistance = matches[i].Key - matches[i - 1].Key;
        var lineDistance = matches[i].Value - matches[i - 1].Value;

        var gap = lineDistance - queryDistance;

        if (gap == 0)
        {
            // same spacing in both sequences: exact sub-sequence
            scoreMultiplier *= 100;
        }
        else if (gap == 1)
        {
            // one extra token inserted in the line
            scoreMultiplier *= 50;
        }
        else if (gap == 2)
        {
            // two extra tokens inserted in the line
            scoreMultiplier *= 30;
        }
        else if (lineDistance > 0)
        {
            // still apply a bonus because order is preserved
            scoreMultiplier *= 2;
        }
    }

    return scoreMultiplier;
}