/// <summary>
/// Tokenize a query and order its tokens by ascending frequency in the index, so the
/// most discriminant (least frequent) tokens come first. Symbol tokens and tokens
/// absent from the index are skipped.
/// </summary>
/// <param name="query">Raw query text to tokenize.</param>
/// <returns>
/// Normalized tokens ordered from least to most frequent; empty list when none of the
/// query's tokens are present in the index.
/// </returns>
private IList<string> OrderByFrequency(string query)
{
    var tokenizer = new Tokenizer();
    var tokens = tokenizer.TokenizeOneLine(query)
        .Where(t => t.TokenType != CharClass.Symbol);

    var frequencyByToken = new Dictionary<string, int>();

    foreach (var token in tokens)
    {
        // Tokens that are not indexed, or whose posting set is empty (ignored as
        // too frequent), cannot contribute to a match and are skipped.
        if (PositionsByToken.TryGetValue(token.NormalizedText, out var list) && list.Count > 0)
        {
            frequencyByToken[token.NormalizedText] = list.Count;
        }
    }

    if (frequencyByToken.Count == 0)
    {
        return new List<string>();
    }

    // Most significant (less frequent) tokens first. Dictionary keys are already
    // distinct, so the former Distinct() pass was redundant and has been removed.
    return frequencyByToken.OrderBy(p => p.Value).Select(p => p.Key).ToList();
}
/// <summary>
/// Add a tokenized line to the full-text index
/// </summary>
/// <param name="line">Line whose tokens are to be indexed.</param>
/// <param name="lineIndex">Position of the line; stored in the <see cref="LinePointer"/>.</param>
/// <param name="primaryKey">Key of the document containing the line.</param>
private void IndexLine(TokenizedLine line, int lineIndex, KeyValue primaryKey)
{
    var pointer = new LinePointer(lineIndex, primaryKey);

    foreach (var token in line.Tokens)
    {
        var tooFrequentToken = false;

        if (!PositionsByToken.TryGetValue(token, out var positions))
        {
            // First occurrence of this token: create its posting set.
            positions = new HashSet<LinePointer>();
            PositionsByToken[token] = positions;
        }
        else
        {
            // An existing but empty posting set is a sentinel (presumably left by
            // IgnoreToken — confirm): the token was judged too frequent to be
            // discriminant and must not be re-indexed.
            if (positions.Count == 0)
            {
                tooFrequentToken = true;
            }
        }

        if (!tooFrequentToken)
        {
            // HashSet.Add returns false for duplicates, so Entries stays an
            // exact count of distinct (token, pointer) pairs.
            if (positions.Add(pointer))
            {
                Entries = Entries + 1;
                AddToSecondaryIndex(pointer);
            }
        }
    }

    // Remove the most frequent (less discriminant) tokens in the index if the index is too big
    // Limit the entries in the index: try to limit to MaxCapacity but without removing more than MaxTokensToIgnore tokens
    if (NeedsCleanup())
    {
        // Linear scan for the single most frequent token.
        string mostFrequentToken = null;
        var maxFrequency = 0;

        foreach (var p in PositionsByToken)
        {
            if (p.Value.Count > maxFrequency)
            {
                mostFrequentToken = p.Key;
                maxFrequency = p.Value.Count;
            }
        }

        // NeedsCleanup() returned true, so the index is non-empty and a
        // most frequent token must have been found.
        Debug.Assert(mostFrequentToken != null);

        IgnoreToken(mostFrequentToken);

        // Keep the entry count in sync with the postings just discarded.
        Entries = Entries - maxFrequency;
        IgnoredTokens++;
    }
}
/// <summary>
/// Reset the index to its initial empty state: drop every posting and
/// zero the bookkeeping counters.
/// </summary>
public void Clear()
{
    PositionsByToken.Clear();
    PositionsByDocument.Clear();

    Entries = 0;
    IgnoredTokens = 0;
}