/// <summary>
/// Constructs a new index.
/// </summary>
/// <param name="invertedIndex">An index of term/field to document reference.</param>
/// <param name="fieldVectors">The field vectors.</param>
/// <param name="tokenSet">A set of all corpus tokens.</param>
/// <param name="fields">The names of indexed document fields.</param>
/// <param name="pipeline">The pipeline to use for search terms.</param>
internal Index(
    InvertedIndex invertedIndex,
    IDictionary<string, Vector> fieldVectors,
    TokenSet tokenSet,
    IEnumerable<string> fields,
    Pipeline pipeline)
{
    InvertedIndex = invertedIndex;
    FieldVectors = fieldVectors;
    TokenSet = tokenSet;
    Fields = fields;
    Pipeline = pipeline;
}
/// <summary>
/// Returns a new TokenSet that is the intersection of
/// this TokenSet and the passed TokenSet.
///
/// This intersection will take into account any wildcards
/// contained within the TokenSet.
/// </summary>
/// <param name="other">Another TokenSet to intersect with.</param>
/// <returns>The intersection of the sets.</returns>
public TokenSet Intersect(TokenSet other)
{
    var output = new TokenSet(_idProvider);
    var stack = new Stack<(TokenSet qNode, TokenSet output, TokenSet node)>();
    stack.Push((other, output, this));

    while (stack.Any())
    {
        (TokenSet frameQNode, TokenSet frameOutput, TokenSet frameNode) = stack.Pop();

        foreach ((char qEdge, TokenSet qNode) in frameQNode.Edges)
        {
            foreach ((char nEdge, TokenSet node) in frameNode.Edges)
            {
                if (nEdge == qEdge || qEdge == '*')
                {
                    bool isFinal = node.IsFinal && qNode.IsFinal;

                    if (frameOutput.Edges.TryGetValue(nEdge, out TokenSet next))
                    {
                        // An edge already exists for this character.
                        // No need to create a new node; just set the finality
                        // bit unless this node is already final.
                        next.IsFinal = next.IsFinal || isFinal;
                    }
                    else
                    {
                        // No edge exists yet, so create one, set the
                        // finality bit, and insert it into the output.
                        next = new TokenSet(_idProvider) { IsFinal = isFinal };
                        frameOutput.Edges.Add(nEdge, next);
                    }

                    stack.Push((qNode, next, node));
                }
            }
        }
    }

    return output;
}
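// A usage sketch, not part of the original source: intersecting a set built
// from a plain string with one built from a wildcard pattern. FromString and
// ToEnumeration are the TokenSet members used elsewhere in this library.
//
//     TokenSet corpus = TokenSet.FromString("cat");
//     TokenSet query = TokenSet.FromString("c*");
//     TokenSet matched = corpus.Intersect(query);
//     // matched.ToEnumeration() yields "cat": the '*' self-edge in the
//     // query set consumes the 'a' and 't' edges of the corpus set.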
/// <summary>
/// Creates a TokenSet from a string.
///
/// The string may contain one or more wildcard characters (*)
/// that will allow wildcard matching when intersecting with
/// another TokenSet.
/// </summary>
/// <param name="str">The string to create a TokenSet from.</param>
/// <param name="idProvider">An optional token set id provider.</param>
/// <returns>The token set.</returns>
public static TokenSet FromString(
    string str,
    TokenSetIdProvider? idProvider = null)
{
    idProvider ??= TokenSetIdProvider.Instance;
    var root = new TokenSet(idProvider);
    TokenSet node = root;

    // Iterate through all characters within the passed string,
    // appending a node for each character.
    //
    // When a wildcard character is found, a self-referencing
    // edge is introduced to continually match any number of
    // any characters.
    for (int i = 0; i < str.Length; i++)
    {
        char ch = str[i];
        bool isFinal = i == str.Length - 1;

        if (ch == Query.Wildcard)
        {
            // Assign rather than Add so that consecutive wildcards
            // don't throw on a duplicate key.
            node.Edges[ch] = node;
            node.IsFinal = isFinal;
        }
        else
        {
            var next = new TokenSet(idProvider) { IsFinal = isFinal };

            node.Edges.Add(ch, next);
            node = next;
        }
    }

    return root;
}
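// Sketch (not from the original source): the pattern below yields
// root --b--> n --r--> final, with a '*' self-edge on n, so intersecting it
// with a corpus set matches "br", "bar", "beer", and so on.
//
//     TokenSet pattern = TokenSet.FromString("b*r");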
public void Insert(string word)
{
    int commonPrefix = 0;
    if (StringComparer.Ordinal.Compare(word, _previousWord) < 0)
    {
        throw new InvalidOperationException("Out of order word insertion.");
    }

    // Find the length of the prefix shared with the previously inserted word.
    for (int i = 0; i < word.Length && i < _previousWord.Length; i++)
    {
        if (word[i] != _previousWord[i])
        {
            break;
        }
        commonPrefix++;
    }

    // Nodes beyond the common prefix can no longer be affected by later
    // (lexicographically greater) insertions, so minimize them now.
    Minimize(commonPrefix);

    TokenSet node = _uncheckedNodes.LastOrDefault().child ?? Root;

    // Append a chain of new nodes for the remainder of the word.
    for (int i = commonPrefix; i < word.Length; i++)
    {
        var nextNode = new TokenSet(_idProvider);
        char ch = word[i];

        node.Edges.Add(ch, nextNode);
        _uncheckedNodes.Add((node, ch, nextNode));

        node = nextNode;
    }

    node.IsFinal = true;
    _previousWord = word;
}
/// <summary>
/// Builds a new token set builder.
/// </summary>
/// <param name="idProvider">An optional token set id provider.</param>
public Builder(TokenSetIdProvider? idProvider = null)
{
    _idProvider = idProvider ?? TokenSetIdProvider.Instance;
    Root = new TokenSet(_idProvider);
}
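// Usage sketch (hedged): words must be inserted in ordinal sort order so the
// builder can share prefixes and minimize suffixes as it goes. The Finish()
// call below, which minimizes any remaining unchecked nodes as in the lunr.js
// original, is an assumption about this builder's API.
//
//     var builder = new TokenSet.Builder();
//     builder.Insert("cat");
//     builder.Insert("cats");  // out-of-order input throws InvalidOperationException
//     builder.Finish();
//     TokenSet corpus = builder.Root;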
/// <summary>
/// Creates a token set representing a single string with a specified
/// edit distance.
///
/// Insertions, deletions, substitutions and transpositions are each
/// treated as an edit distance of 1.
///
/// Increasing the allowed edit distance will have a dramatic impact
/// on the performance of both creating and intersecting these TokenSets.
/// It is advised to keep the edit distance less than 3.
/// </summary>
/// <param name="str">The string to create the token set from.</param>
/// <param name="editDistance">The allowed edit distance to match.</param>
/// <param name="idProvider">An optional token set id provider.</param>
public static TokenSet FromFuzzyString(
    string str,
    int editDistance,
    TokenSetIdProvider? idProvider = null)
{
    idProvider ??= TokenSetIdProvider.Instance;
    var root = new TokenSet(idProvider);
    var stack = new Stack<(TokenSet node, int editsRemaining, string str)>();
    stack.Push((node: root, editsRemaining: editDistance, str));

    while (stack.Any())
    {
        (TokenSet frameNode, int frameEditsRemaining, string frameStr) = stack.Pop();

        // No edit: consume the next character as-is.
        if (frameStr.Length > 0)
        {
            char ch = frameStr[0];
            TokenSet noEditNode;

            if (frameNode.Edges.TryGetValue(ch, out TokenSet existingChNode))
            {
                noEditNode = existingChNode;
            }
            else
            {
                noEditNode = new TokenSet(idProvider);
                frameNode.Edges.Add(ch, noEditNode);
            }

            if (frameStr.Length == 1)
            {
                noEditNode.IsFinal = true;
            }

            stack.Push((
                node: noEditNode,
                editsRemaining: frameEditsRemaining,
                str: frameStr.Substring(1)));
        }

        if (frameEditsRemaining == 0)
        {
            continue;
        }

        // Insertion: a wildcard edge matches any single extra character.
        TokenSet insertionNode;
        if (frameNode.Edges.TryGetValue(Query.Wildcard, out TokenSet wildcardNode))
        {
            insertionNode = wildcardNode;
        }
        else
        {
            insertionNode = new TokenSet(idProvider);
            frameNode.Edges.Add(Query.Wildcard, insertionNode);
        }

        if (frameStr.Length == 0)
        {
            insertionNode.IsFinal = true;
        }

        stack.Push((
            node: insertionNode,
            editsRemaining: frameEditsRemaining - 1,
            str: frameStr));

        // Deletion: skip a character, provided there are characters
        // left to delete in the string.
        if (frameStr.Length > 1)
        {
            stack.Push((
                node: frameNode,
                editsRemaining: frameEditsRemaining - 1,
                str: frameStr.Substring(1)));
        }

        // Deletion of the last remaining character: the current node
        // becomes final.
        if (frameStr.Length == 1)
        {
            frameNode.IsFinal = true;
        }

        // Substitution: a wildcard edge replaces the next character,
        // provided there are characters left to substitute.
        if (frameStr.Length >= 1)
        {
            TokenSet substitutionNode;
            if (frameNode.Edges.TryGetValue(Query.Wildcard, out TokenSet substitutionWildcardNode))
            {
                substitutionNode = substitutionWildcardNode;
            }
            else
            {
                substitutionNode = new TokenSet(idProvider);
                frameNode.Edges.Add(Query.Wildcard, substitutionNode);
            }

            if (frameStr.Length == 1)
            {
                substitutionNode.IsFinal = true;
            }

            stack.Push((
                node: substitutionNode,
                editsRemaining: frameEditsRemaining - 1,
                str: frameStr.Substring(1)));
        }

        // Transposition: swap the next two characters, provided there
        // are enough characters left to transpose.
        if (frameStr.Length > 1)
        {
            char chA = frameStr[0];
            char chB = frameStr[1];
            TokenSet transposeNode;

            if (frameNode.Edges.TryGetValue(chB, out TokenSet chBNode))
            {
                transposeNode = chBNode;
            }
            else
            {
                transposeNode = new TokenSet(idProvider);
                frameNode.Edges.Add(chB, transposeNode);
            }

            // Unreachable (frameStr.Length > 1 here); kept to mirror the
            // original lunr.js implementation.
            if (frameStr.Length == 1)
            {
                transposeNode.IsFinal = true;
            }

            stack.Push((
                node: transposeNode,
                editsRemaining: frameEditsRemaining - 1,
                str: chA + frameStr.Substring(2)));
        }
    }

    return root;
}
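// Sketch (not from the original source): one allowed edit on "cat" matches
// insertions ("cart"), deletions ("at"), substitutions ("cot") and
// transpositions ("act") when intersected with a corpus set built elsewhere.
// Wildcards are only honored on the argument side of Intersect, so the fuzzy
// set is passed in rather than intersected from.
//
//     TokenSet fuzzy = TokenSet.FromFuzzyString("cat", editDistance: 1);
//     TokenSet hits = corpusSet.Intersect(fuzzy);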
/// <summary>
/// Performs a query against the index using the `Query` object built
/// by the provided factory.
///
/// If performing programmatic queries against the index, this method is preferred
/// over `Index.Search` so as to avoid the additional query parsing overhead.
///
/// A query object is yielded to the supplied function which should be used to
/// express the query to be run against the index.
/// </summary>
/// <param name="queryFactory">A function that builds the query object that gets passed to it.</param>
/// <param name="cancellationToken">A cancellation token.</param>
/// <returns>The results of the query.</returns>
public async IAsyncEnumerable<Result> Query(
    Action<Query> queryFactory,
    [EnumeratorCancellation] CancellationToken cancellationToken)
{
    var results = new List<Result>();
    var query = new Query(Fields);
    var matchingFields = new Dictionary<FieldReference, MatchData>();
    var termFieldCache = new HashSet<string>();
    var requiredMatches = new Dictionary<string, ISet<string>>();
    var prohibitedMatches = new Dictionary<string, ISet<string>>();

    // To support field-level boosts a query vector is created per
    // field. An empty vector is eagerly created to support negated
    // queries.
    var queryVectors = new Dictionary<string, Vector>();
    foreach (string field in Fields)
    {
        queryVectors[field] = new Vector();
    }

    queryFactory(query);

    for (int i = 0; i < query.Clauses.Count; i++)
    {
        Clause clause = query.Clauses[i];
        ISet<string> clauseMatches = Set<string>.Empty;

        // Unless the pipeline has been disabled for this term, which is
        // the case for terms with wildcards, we need to pass the clause
        // term through the search pipeline. A pipeline returns an array
        // of processed terms. Pipeline functions may expand the passed
        // term, which means we may end up performing multiple index lookups
        // for a single query term.
        await foreach (string term in clause.UsePipeline
            ? Pipeline.RunString(
                clause.Term,
                new TokenMetadata { { "fields", clause.Fields } },
                cancellationToken)
            : new[] { clause.Term }.ToAsyncEnumerable(cancellationToken))
        {
            // Each term returned from the pipeline needs to use the same query
            // clause object, e.g. the same boost and/or edit distance. The
            // simplest way to do this is to re-use the clause object but mutate
            // its term property.
            clause = clause.WithTerm(term);

            // From the term in the clause we create a token set which will then
            // be used to intersect the index's token set to get a list of terms
            // to look up in the inverted index.
            var termTokenSet = TokenSet.FromClause(clause);
            IEnumerable<string> expandedTerms = TokenSet.Intersect(termTokenSet).ToEnumeration();

            // If a term marked as required does not exist in the token set, it is
            // impossible for the search to return any matches. We set all the
            // field-scoped required match sets to empty and stop examining any
            // further clauses.
            if (!expandedTerms.Any() && clause.Presence == QueryPresence.Required)
            {
                foreach (string field in clause.Fields)
                {
                    // Assign rather than Add so a field already marked as
                    // required by an earlier clause doesn't throw.
                    requiredMatches[field] = Set<string>.Empty;
                }
                break;
            }

            foreach (string expandedTerm in expandedTerms)
            {
                // For each term get the posting and termIndex; these are
                // required for building the query vector.
                InvertedIndexEntry posting = InvertedIndex[expandedTerm];
                int termIndex = posting.Index;

                foreach (string field in clause.Fields)
                {
                    // For each field that this query term is scoped by (by default
                    // all fields are in scope) we need to get all the document refs
                    // that have this term in that field.
                    //
                    // The posting is the entry in the invertedIndex for the matching
                    // term from above.
                    FieldMatches fieldPosting = posting[field];
                    ICollection<string> matchingDocumentRefs = fieldPosting.Keys;
                    string termField = expandedTerm + '/' + field;
                    var matchingDocumentSet = new Set<string>(matchingDocumentRefs);

                    // If the presence of this term is required, ensure that the matching
                    // documents are added to the set of required matches for this clause.
                    if (clause.Presence == QueryPresence.Required)
                    {
                        clauseMatches = clauseMatches.Union(matchingDocumentSet);

                        if (!requiredMatches.ContainsKey(field))
                        {
                            requiredMatches.Add(field, Set<string>.Complete);
                        }
                    }

                    // If the presence of this term is prohibited, ensure that the matching
                    // documents are added to the set of prohibited matches for this field,
                    // creating that set if it does not yet exist.
                    if (clause.Presence == QueryPresence.Prohibited)
                    {
                        if (!prohibitedMatches.ContainsKey(field))
                        {
                            prohibitedMatches.Add(field, Set<string>.Empty);
                        }

                        prohibitedMatches[field] = prohibitedMatches[field].Union(matchingDocumentSet);

                        // Prohibited matches should not be part of the query vector used for
                        // similarity scoring and no metadata should be extracted, so we continue
                        // to the next field.
                        continue;
                    }

                    // The query field vector is populated using the termIndex found for
                    // the term and a unit value with the appropriate boost applied.
                    // Using upsert because there could already be an entry in the vector
                    // for the term we are working with. In that case we just add the
                    // scores together.
                    queryVectors[field].Upsert(
                        termIndex,
                        clause.Boost,
                        (a, b) => a + b);

                    // If we've already seen this term/field combo then we've already
                    // collected the matching documents and metadata; no need to go
                    // through all that again.
                    if (termFieldCache.Contains(termField))
                    {
                        continue;
                    }

                    foreach (string matchingDocumentRef in matchingDocumentRefs)
                    {
                        // All metadata for this term/field/document triple
                        // is then extracted and collected into an instance
                        // of lunr.MatchData, ready to be returned in the
                        // query results.
                        var matchingFieldRef = new FieldReference(matchingDocumentRef, field);
                        FieldMatchMetadata metadata = fieldPosting[matchingDocumentRef];

                        if (!matchingFields.TryGetValue(matchingFieldRef, out MatchData? fieldMatch))
                        {
                            matchingFields.Add(
                                matchingFieldRef,
                                new MatchData(expandedTerm, field, metadata));
                        }
                        else
                        {
                            fieldMatch.Add(expandedTerm, field, metadata);
                        }
                    }

                    termFieldCache.Add(termField);
                }
            }
        }

        // If the presence was required we need to update the requiredMatches field sets.
        // We do this after all fields for the term have collected their matches because
        // the clause term's presence is required in _any_ of the fields, not _all_ of
        // the fields.
        if (clause.Presence == QueryPresence.Required)
        {
            foreach (string field in clause.Fields)
            {
                requiredMatches[field] = requiredMatches[field].Intersect(clauseMatches);
            }
        }
    }

    // The field-scoped required and prohibited matching documents must now
    // be combined into a global set of required and prohibited matches.
    ISet<string> allRequiredMatches = Set<string>.Complete;
    ISet<string> allProhibitedMatches = Set<string>.Empty;

    foreach (string field in Fields)
    {
        if (requiredMatches.ContainsKey(field))
        {
            allRequiredMatches = allRequiredMatches.Intersect(requiredMatches[field]);
        }

        if (prohibitedMatches.ContainsKey(field))
        {
            allProhibitedMatches = allProhibitedMatches.Union(prohibitedMatches[field]);
        }
    }

    IEnumerable<string> matchingFieldRefs = matchingFields.Keys.Select(k => k.ToString());
    var matches = new Dictionary<string, Result>();

    // If the query is negated (contains only prohibited terms)
    // we need to get _all_ fieldRefs currently existing in the
    // index. This is only done when we know that the query is
    // entirely prohibited terms, to avoid the cost of getting all
    // fieldRefs unnecessarily.
    //
    // Additionally, blank MatchData must be created to correctly
    // populate the results.
    if (query.IsNegated)
    {
        matchingFieldRefs = FieldVectors.Keys;

        foreach (string matchingFieldRef in matchingFieldRefs)
        {
            var fieldRef = FieldReference.FromString(matchingFieldRef);
            matchingFields.Add(fieldRef, MatchData.Empty);
        }
    }

    foreach (string fieldRefString in matchingFieldRefs)
    {
        // At this point we have document fields that match the query, but we
        // need to return documents. The matchData and scores are combined
        // from multiple fields belonging to the same document.
        //
        // Scores are calculated by field, using the query vectors created
        // above, and combined into a final document score using addition.
        var fieldRef = FieldReference.FromString(fieldRefString);
        string docRef = fieldRef.DocumentReference;

        if (!allRequiredMatches.Contains(docRef))
        {
            continue;
        }

        if (allProhibitedMatches.Contains(docRef))
        {
            continue;
        }

        Vector fieldVector = FieldVectors[fieldRefString];
        double score = queryVectors[fieldRef.FieldName].Similarity(fieldVector);

        if (matches.TryGetValue(docRef, out Result? docMatch))
        {
            docMatch.Score += score;
            docMatch.MatchData.Combine(matchingFields[fieldRef]);
        }
        else
        {
            var match = new Result(
                documentReference: docRef,
                score,
                matchData: matchingFields[fieldRef]);

            matches.Add(docRef, match);

            if (cancellationToken.IsCancellationRequested)
            {
                yield break;
            }

            results.Add(match);
        }
    }

    foreach (Result match in results.OrderByDescending(r => r.Score))
    {
        yield return match;
    }
}
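// A hedged usage sketch: programmatic querying skips the query-string parsing
// that Index.Search performs. The AddTerm call is an assumption about the
// Query builder API; substitute whatever clause-building members Query exposes.
//
//     await foreach (Result result in index.Query(
//         q => q.AddTerm("plant"),
//         cancellationToken))
//     {
//         Console.WriteLine($"{result.DocumentReference}: {result.Score}");
//     }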