/// <summary>
/// Search strategy that favors the lines containing more than one token from the query.
/// Only lines on which at least two query tokens were found are returned, and only
/// those whose score is greater than 1% of the best score.
/// </summary>
/// <param name="query">the query text; its tokens are obtained through <c>OrderByFrequency</c></param>
/// <returns>score for each found pointer (empty if no line qualifies)</returns>
private Dictionary<LinePointer, double> SameLineFind(string query)
{
    var orderedTokens = OrderByFrequency(query);

    if (orderedTokens.Count == 0)
    {
        // none of the tokens in the query was found
        return new Dictionary<LinePointer, double>();
    }

    Trace?.Trace("Same line strategy");

    var scores = new ScoreByPointer();
    var tokensByLine = new Dictionary<LinePointer, List<string>>();

    foreach (var tk in orderedTokens)
    {
        var positions = PositionsByToken[tk];

        // remember which query tokens were found on each non-deleted line
        foreach (var position in positions.Where(p => !p.Deleted))
        {
            if (!tokensByLine.TryGetValue(position, out var tokens))
            {
                tokens = new List<string>();
                tokensByLine[position] = tokens;
            }

            tokens.Add(tk);
        }

        if (positions.Count != 0)
        {
            // log10(Entries / number of positions): the fewer positions a token has,
            // the more it contributes to the score of each of them
            var score = Math.Log10((double)Entries / positions.Count);

            foreach (var pointer in positions)
            {
                scores[pointer] += score;
            }
        }
    }

    var result = new Dictionary<LinePointer, double>();

    foreach (var pair in tokensByLine)
    {
        // lines with a single matching token are not reported by this strategy
        if (pair.Value.Count <= 1)
        {
            continue;
        }

        // flat multiplier: it does not depend on how many tokens matched on the line
        result[pair.Key] = scores[pair.Key] * SameLineMultiplier;
    }

    if (result.Count == 0)
    {
        return result;
    }

    // drop the results that are negligible compared to the best one
    var threshold = result.Max(p => p.Value) / 100;

    return result
        .Where(p => p.Value > threshold)
        .ToDictionary(p => p.Key, p => p.Value);
}
/// <summary>
/// Search strategy that rewards documents in which several different query tokens occur,
/// whether or not they are on the same line. A document matching a single token is still returned.
/// Only pointers whose score is greater than 1% of the best score are kept.
/// </summary>
/// <param name="query">the query text; its tokens are obtained through <c>OrderByFrequency</c></param>
/// <returns>score for each retained pointer (empty if nothing was found)</returns>
private Dictionary<LinePointer, double> SameDocumentFind(string query)
{
    var queryTokens = OrderByFrequency(query);

    var found = new HashSet<LinePointer>();

    Trace?.Trace("Same document strategy");

    var tokensPerDocument = new Dictionary<KeyValue, HashSet<string>>();
    var scores = new ScoreByPointer();

    foreach (var token in queryTokens)
    {
        var positions = PositionsByToken[token];

        // log10(Entries / number of positions), deleted positions included in the count
        var tokenScore = Math.Log10((double)Entries / positions.Count);

        var livePointers = positions.Where(p => !p.Deleted).ToList();

        foreach (var pointer in livePointers)
        {
            if (!tokensPerDocument.TryGetValue(pointer.PrimaryKey, out var seenTokens))
            {
                seenTokens = new HashSet<string>();
                tokensPerDocument[pointer.PrimaryKey] = seenTokens;
            }

            seenTokens.Add(token);
            scores[pointer] += tokenScore;
            found.Add(pointer);
        }
    }

    // better score if different tokens found in the same document
    foreach (var pointer in found)
    {
        var distinctTokens = tokensPerDocument[pointer.PrimaryKey].Count;

        if (distinctTokens > 1)
        {
            scores[pointer] *= distinctTokens * SameDocumentMultiplier;
        }
    }

    var retained = new Dictionary<LinePointer, double>();

    if (found.Count == 0)
    {
        return retained;
    }

    // drop the results that are negligible compared to the best one
    var best = found.Max(p => scores[p]);

    foreach (var pointer in found)
    {
        var pointerScore = scores[pointer];

        if (pointerScore > best / 100)
        {
            retained.Add(pointer, pointerScore);
        }
    }

    return retained;
}
/// <summary>
/// Search strategy that favors the lines containing more than one token from the query.
/// Every non-deleted line containing at least one query token is returned; lines with
/// several tokens get their score multiplied by <c>SameLineMultiplier</c> times the token count.
/// </summary>
/// <param name="query">the query text; its tokens are obtained through <c>OrderByFrequency</c></param>
/// <returns>score and list of found tokens for each found pointer</returns>
private Dictionary<LinePointer, LineResult> SameLineFind(string query)
{
    var queryTokens = OrderByFrequency(query);

    var lineResults = new Dictionary<LinePointer, LineResult>();

    if (queryTokens.Count == 0)
    {
        // none of the tokens in the query was found
        return lineResults;
    }

    Trace?.Trace("Same line strategy");

    var scores = new ScoreByPointer();
    var tokensPerLine = new Dictionary<LinePointer, List<string>>();

    foreach (var token in queryTokens)
    {
        var positions = PositionsByToken[token];

        // remember which query tokens were found on each non-deleted line
        foreach (var line in positions.Where(p => p.Deleted == false))
        {
            if (!tokensPerLine.TryGetValue(line, out var lineTokens))
            {
                lineTokens = new List<string>();
                tokensPerLine[line] = lineTokens;
            }

            lineTokens.Add(token);
        }

        if (positions.Count == 0)
        {
            continue;
        }

        // a measure of the information content
        var tokenScore = Math.Log10((double)Entries / positions.Count);

        foreach (var pointer in positions)
        {
            scores[pointer] += tokenScore;
        }
    }

    foreach (var entry in tokensPerLine)
    {
        var line = entry.Key;
        var lineTokens = entry.Value;

        var lineScore = scores[line];

        // the multiplier applies only if more than one token in the query was found on the same line
        if (lineTokens.Count > 1)
        {
            lineScore *= SameLineMultiplier * lineTokens.Count;
        }

        // keep the accumulator in sync with the reported score
        scores[line] = lineScore;

        lineResults[line] = new LineResult
        {
            Score = lineScore,
            TokensFound = lineTokens
        };
    }

    return lineResults;
}