/// <summary>
/// Merges the metadata held by another `MatchData` into this one.
/// A `MatchData` is created for every term that matches a document, but an
/// index result only carries a single instance, so the per-term instances
/// are folded together with this method.
/// </summary>
/// <param name="otherMatchData">Another instance of match data to merge with this one.</param>
public void Combine(MatchData otherMatchData)
{
    foreach (string term in otherMatchData.Posting.Keys)
    {
        FieldMatches incomingFields = otherMatchData.Posting[term];

        // Make sure there is a slot for this term before merging its fields.
        if (!Posting.ContainsKey(term))
        {
            Posting.Add(term, new FieldMatches());
        }

        Dictionary<string, FieldMatchMetadata> ownFields = Posting[term];

        foreach (string field in incomingFields.Keys)
        {
            FieldMatchMetadata incomingMetadata = incomingFields[field];

            if (!ownFields.TryGetValue(field, out FieldMatchMetadata? ownMetadata))
            {
                ownMetadata = new FieldMatchMetadata(capacity: incomingMetadata.Keys.Count);
                ownFields.Add(field, ownMetadata);
            }

            foreach (string key in incomingMetadata.Keys)
            {
                IList<object?> incomingValues = incomingMetadata[key];

                // Existing values are extended with the incoming ones; otherwise
                // a fresh copy of the incoming list is stored.
                ownMetadata[key] = ownMetadata.ContainsKey(key)
                    ? ownMetadata[key].Concat(incomingValues).ToList()
                    : new List<object?>(incomingValues);
            }
        }
    }
}
/// <summary>
/// Add metadata for a term/field pair to this instance of match data.
/// </summary>
/// <remarks>
/// The supplied metadata is copied before it is stored, mirroring what the
/// constructor does, so that later calls to `Add` or `Combine` never write
/// into the caller's metadata object.
/// </remarks>
/// <param name="term">The term this match data is associated with.</param>
/// <param name="field">The field in which the term was found.</param>
/// <param name="metadata">The metadata recorded about this term in this field.</param>
public void Add(string term, string field, FieldMatchMetadata metadata)
{
    // First time this term is seen: store a private copy of the metadata.
    if (!Posting.ContainsKey(term))
    {
        Posting.Add(term, new FieldMatches { { field, CloneMetadata(metadata) } });
        return;
    }

    FieldMatches termMetadata = Posting[term];

    // Term is known but this field is new: store a private copy as well.
    if (!termMetadata.TryGetValue(field, out FieldMatchMetadata? fieldMetadata))
    {
        termMetadata.Add(field, CloneMetadata(metadata));
        return;
    }

    // Term and field already present: merge key by key. Both branches
    // produce a new list, so the lists owned by the caller are never shared.
    foreach (string key in metadata.Keys)
    {
        fieldMetadata[key] = fieldMetadata.ContainsKey(key)
            ? fieldMetadata[key].Concat(metadata[key]).ToList()
            : new List<object?>(metadata[key]);
    }

    // Copies the metadata container and each of its value lists.
    FieldMatchMetadata CloneMetadata(FieldMatchMetadata source)
    {
        var copy = new FieldMatchMetadata(capacity: source.Count);

        foreach ((string name, IEnumerable<object?> values) in source)
        {
            copy.Add(name, new List<object?>(values));
        }

        return copy;
    }
}
/// <summary>Constructs a `MatchData` holding the metadata of a single term/field pair.</summary>
/// <param name="term">The term this match data is associated with.</param>
/// <param name="field">The field in which the term was found.</param>
/// <param name="metadata">The metadata recorded about this term in this field.</param>
public MatchData(
    string term,
    string field,
    FieldMatchMetadata metadata)
{
    Term = term;
    Field = field;

    // Work on a copy of the metadata: combining match data later on mutates
    // the stored values, and the original must stay untouched.
    var metadataCopy = new FieldMatchMetadata(capacity: metadata.Count);
    foreach ((string name, IEnumerable<object?> values) in metadata)
    {
        metadataCopy.Add(name, new List<object?>(values));
    }

    var fieldMatches = new FieldMatches();
    fieldMatches.Add(field, metadataCopy);

    var posting = new InvertedIndexEntry();
    posting.Add(term, fieldMatches);

    Posting = posting;
}
/// <summary>
/// Performs a query against the index using the `Query` object built
/// by the provided factory.
///
/// If performing programmatic queries against the index, this method is preferred
/// over `Index.Search` so as to avoid the additional query parsing overhead.
///
/// A query object is yielded to the supplied function which should be used to
/// express the query to be run against the index.
/// </summary>
/// <param name="queryFactory">A function that builds the query object that gets passed to it.</param>
/// <param name="cancellationToken">A cancellation token.</param>
/// <returns>The results of the query, ordered by descending score.</returns>
public async IAsyncEnumerable<Result> Query(
    Action<Query> queryFactory,
    [EnumeratorCancellation] CancellationToken cancellationToken)
{
    var results = new List<Result>();
    var query = new Query(Fields);
    var matchingFields = new Dictionary<FieldReference, MatchData>();
    var termFieldCache = new HashSet<string>();
    var requiredMatches = new Dictionary<string, ISet<string>>();
    var prohibitedMatches = new Dictionary<string, ISet<string>>();

    // To support field level boosts a query vector is created per
    // field. An empty vector is eagerly created to support negated
    // queries.
    var queryVectors = new Dictionary<string, Vector>();
    foreach (string field in Fields)
    {
        queryVectors[field] = new Vector();
    }

    queryFactory(query);

    for (int i = 0; i < query.Clauses.Count; i++)
    {
        Clause clause = query.Clauses[i];
        ISet<string> clauseMatches = Set<string>.Empty;

        // Unless the pipeline has been disabled for this term, which is
        // the case for terms with wildcards, we need to pass the clause
        // term through the search pipeline. A pipeline returns an array
        // of processed terms. Pipeline functions may expand the passed
        // term, which means we may end up performing multiple index lookups
        // for a single query term.
        await foreach (string term in clause.UsePipeline
            ? Pipeline.RunString(
                clause.Term,
                new TokenMetadata { { "fields", clause.Fields } },
                cancellationToken)
            : new[] { clause.Term }.ToAsyncEnumerable(cancellationToken))
        {
            // Each term returned from the pipeline needs to use the same query
            // clause object, e.g. the same boost and or edit distance. The
            // simplest way to do this is to re-use the clause object but mutate
            // its term property.
            clause = clause.WithTerm(term);

            // From the term in the clause we create a token set which will then
            // be used to intersect the indexes token set to get a list of terms
            // to lookup in the inverted index. The result is materialized once
            // because it is both tested for emptiness and iterated below.
            var termTokenSet = TokenSet.FromClause(clause);
            List<string> expandedTerms = TokenSet.Intersect(termTokenSet).ToEnumeration().ToList();

            // If a term marked as required does not exist in the tokenSet it is
            // impossible for the search to return any matches. We set all the field
            // scoped required matches set to empty and stop examining any further
            // terms of this clause.
            if (expandedTerms.Count == 0 && clause.Presence == QueryPresence.Required)
            {
                foreach (string field in clause.Fields)
                {
                    // Assign through the indexer rather than Add: an earlier required
                    // clause (or an earlier term of this clause) may already have
                    // registered this field, and Add would throw on the duplicate key.
                    requiredMatches[field] = Set<string>.Empty;
                }

                break;
            }

            foreach (string expandedTerm in expandedTerms)
            {
                // For each term get the posting and termIndex, this is required for building the query vector.
                InvertedIndexEntry posting = InvertedIndex[expandedTerm];
                int termIndex = posting.Index;

                foreach (string field in clause.Fields)
                {
                    // For each field that this query term is scoped by (by default
                    // all fields are in scope) we need to get all the document refs
                    // that have this term in that field.
                    //
                    // The posting is the entry in the invertedIndex for the matching
                    // term from above.
                    FieldMatches fieldPosting = posting[field];
                    ICollection<string> matchingDocumentRefs = fieldPosting.Keys;
                    string termField = expandedTerm + '/' + field;
                    var matchingDocumentSet = new Set<string>(matchingDocumentRefs);

                    // If the presence of this term is required ensure that the matching
                    // documents are added to the set of required matches for this clause.
                    if (clause.Presence == QueryPresence.Required)
                    {
                        clauseMatches = clauseMatches.Union(matchingDocumentSet);

                        if (!requiredMatches.ContainsKey(field))
                        {
                            requiredMatches.Add(field, Set<string>.Complete);
                        }
                    }

                    // If the presence of this term is prohibited ensure that the matching
                    // documents are added to the set of prohibited matches for this field,
                    // creating that set if it does not yet exist.
                    if (clause.Presence == QueryPresence.Prohibited)
                    {
                        if (!prohibitedMatches.ContainsKey(field))
                        {
                            prohibitedMatches.Add(field, Set<string>.Empty);
                        }

                        prohibitedMatches[field] = prohibitedMatches[field].Union(matchingDocumentSet);

                        // Prohibited matches should not be part of the query vector used for
                        // similarity scoring and no metadata should be extracted so we continue
                        // to the next field.
                        continue;
                    }

                    // The query field vector is populated using the termIndex found for
                    // the term and a unit value with the appropriate boost applied.
                    // Using upsert because there could already be an entry in the vector
                    // for the term we are working with. In that case we just add the scores
                    // together.
                    queryVectors[field].Upsert(
                        termIndex,
                        clause.Boost,
                        (a, b) => a + b);

                    // If we've already seen this term, field combo then we've already collected
                    // the matching documents and metadata, no need to go through all that again.
                    if (termFieldCache.Contains(termField))
                    {
                        continue;
                    }

                    foreach (string matchingDocumentRef in matchingDocumentRefs)
                    {
                        // All metadata for this term/field/document triple
                        // are then extracted and collected into an instance
                        // of lunr.MatchData ready to be returned in the query
                        // results.
                        var matchingFieldRef = new FieldReference(matchingDocumentRef, field);
                        FieldMatchMetadata metadata = fieldPosting[matchingDocumentRef];

                        if (!matchingFields.TryGetValue(matchingFieldRef, out MatchData? fieldMatch))
                        {
                            matchingFields.Add(
                                matchingFieldRef,
                                new MatchData(expandedTerm, field, metadata));
                        }
                        else
                        {
                            fieldMatch.Add(expandedTerm, field, metadata);
                        }
                    }

                    termFieldCache.Add(termField);
                }
            }
        }

        // If the presence was required we need to update the requiredMatches field sets.
        // We do this after all fields for the term have collected their matches because
        // the clause terms presence is required in _any_ of the fields not _all_ of the
        // fields.
        //
        // NOTE(review): if the pipeline yields no terms at all for a required clause,
        // a field may have no entry in requiredMatches here and the indexer would throw
        // KeyNotFoundException. The intended behaviour for that case should be confirmed.
        if (clause.Presence == QueryPresence.Required)
        {
            foreach (string field in clause.Fields)
            {
                requiredMatches[field] = requiredMatches[field].Intersect(clauseMatches);
            }
        }
    }

    // Need to combine the field scoped required and prohibited
    // matching documents into a global set of required and prohibited
    // matches.
    ISet<string> allRequiredMatches = Set<string>.Complete;
    ISet<string> allProhibitedMatches = Set<string>.Empty;

    foreach (string field in Fields)
    {
        if (requiredMatches.ContainsKey(field))
        {
            allRequiredMatches = allRequiredMatches.Intersect(requiredMatches[field]);
        }

        if (prohibitedMatches.ContainsKey(field))
        {
            allProhibitedMatches = allProhibitedMatches.Union(prohibitedMatches[field]);
        }
    }

    IEnumerable<string> matchingFieldRefs = matchingFields.Keys.Select(k => k.ToString());
    var matches = new Dictionary<string, Result>();

    // If the query is negated (contains only prohibited terms)
    // we need to get _all_ fieldRefs currently existing in the
    // index. This is only done when we know that the query is
    // entirely prohibited terms to avoid any cost of getting all
    // fieldRefs unnecessarily.
    //
    // Additionally, blank MatchData must be created to correctly
    // populate the results.
    if (query.IsNegated)
    {
        matchingFieldRefs = FieldVectors.Keys;

        foreach (string matchingFieldRef in matchingFieldRefs)
        {
            var fieldRef = FieldReference.FromString(matchingFieldRef);
            matchingFields.Add(fieldRef, MatchData.Empty);
        }
    }

    foreach (string fieldRefString in matchingFieldRefs)
    {
        // Currently we have document fields that match the query, but we
        // need to return documents. The matchData and scores are combined
        // from multiple fields belonging to the same document.
        //
        // Scores are calculated by field, using the query vectors created
        // above, and combined into a final document score using addition.
        var fieldRef = FieldReference.FromString(fieldRefString);
        string docRef = fieldRef.DocumentReference;

        if (!allRequiredMatches.Contains(docRef))
        {
            continue;
        }

        if (allProhibitedMatches.Contains(docRef))
        {
            continue;
        }

        Vector fieldVector = FieldVectors[fieldRefString];
        double score = queryVectors[fieldRef.FieldName].Similarity(fieldVector);

        if (matches.TryGetValue(docRef, out Result? docMatch))
        {
            docMatch.Score += score;
            docMatch.MatchData.Combine(matchingFields[fieldRef]);
        }
        else
        {
            var match = new Result(
                documentReference: docRef,
                score,
                matchData: matchingFields[fieldRef]
            );
            matches.Add(docRef, match);

            // Stop producing results as soon as cancellation has been requested.
            if (cancellationToken.IsCancellationRequested)
            {
                yield break;
            }

            results.Add(match);
        }
    }

    foreach (Result match in results.OrderByDescending(r => r.Score))
    {
        yield return match;
    }
}