示例#1
0
        /// <summary>
        /// An instance of `MatchData` will be created for every term that matches a
        /// document. However only one instance is required in an index result. This
        /// method combines metadata from another instance of `MatchData` with this
        /// object's metadata.
        /// </summary>
        /// <param name="otherMatchData">Another instance of match data to merge with this one.</param>
        public void Combine(MatchData otherMatchData)
        {
            IEnumerable <string> terms = otherMatchData.Posting.Keys;

            foreach (string term in terms)
            {
                IEnumerable <string> fields = otherMatchData.Posting[term].Keys;
                if (!Posting.ContainsKey(term))
                {
                    Posting.Add(term, new FieldMatches());
                }
                Dictionary <string, FieldMatchMetadata> thisTermEntry = Posting[term];
                foreach (string field in fields)
                {
                    IEnumerable <string> keys = otherMatchData.Posting[term][field].Keys;
                    if (!thisTermEntry.ContainsKey(field))
                    {
                        thisTermEntry.Add(field, new FieldMatchMetadata(capacity: otherMatchData.Posting[term][field].Keys.Count));
                    }
                    FieldMatchMetadata thisFieldEntry = thisTermEntry[field];
                    foreach (string key in keys)
                    {
                        IList <object?> otherData = otherMatchData.Posting[term][field][key];
                        if (!thisFieldEntry.ContainsKey(key))
                        {
                            thisFieldEntry.Add(key, new List <object?>(otherData));
                        }
                        else
                        {
                            thisFieldEntry[key] = thisFieldEntry[key].Concat(otherData).ToList();
                        }
                    }
                }
            }
        }
示例#2
0
        /// <summary>
        /// Add metadata for a term/field pair to this instance of match data.
        /// </summary>
        /// <param name="term">The term this match data is associated with.</param>
        /// <param name="field">The field in which the term was found.</param>
        /// <param name="metadata">The metadata recorded about this term in this field.</param>
        public void Add(string term, string field, FieldMatchMetadata metadata)
        {
            if (!Posting.ContainsKey(term))
            {
                Posting.Add(term, new FieldMatches
                {
                    {
                        field,
                        metadata
                    }
                });
                return;
            }

            FieldMatches termMetadata = Posting[term];

            if (!termMetadata.ContainsKey(field))
            {
                termMetadata.Add(field, metadata);
                return;
            }

            foreach (string key in metadata.Keys)
            {
                FieldMatchMetadata fieldMetadata = termMetadata[field];
                if (fieldMetadata.ContainsKey(key))
                {
                    fieldMetadata[key] = fieldMetadata[key].Concat(metadata[key]).ToList();
                }
                else
                {
                    fieldMetadata[key] = metadata[key];
                }
            }
        }
示例#3
0
        /// <summary>Constructs a `MatchData`.</summary>
        /// <param name="term">The term this match data is associated with.</param>
        /// <param name="field">The field in which the term was found.</param>
        /// <param name="metadata">The metadata recorded about this term in this field.</param>
        public MatchData(
            string term,
            string field,
            FieldMatchMetadata metadata)
        {
            Term  = term;
            Field = field;

            // Cloning the metadata to prevent the original being mutated during match data combination.
            // Metadata is kept in an array within the inverted index.
            var clonedMetadata = new FieldMatchMetadata(capacity: metadata.Count);

            foreach ((string key, IEnumerable <object?> value) in metadata)
            {
                clonedMetadata.Add(key, new List <object?>(value));
            }

            Posting = new InvertedIndexEntry
            {
                {
                    term,
                    new FieldMatches
                    {
                        { field, clonedMetadata }
                    }
                }
            };
        }
示例#4
0
文件: Index.cs 项目: carbon/lunr-core
        /// <summary>
        /// Performs a query against the index using the `Query` object built
        /// by the provided factory.
        ///
        /// If performing programmatic queries against the index, this method is preferred
        /// over `Index.Search` so as to avoid the additional query parsing overhead.
        ///
        /// A query object is yielded to the supplied function which should be used to
        /// express the query to be run against the index.
        /// </summary>
        /// <param name="queryFactory">A function that builds the query object that gets passed to it.</param>
        /// <param name="cancellationToken">A cancellation token.</param>
        /// <returns>The results of the query.</returns>
        public async IAsyncEnumerable <Result> Query(
            Action <Query> queryFactory,
            [EnumeratorCancellation] CancellationToken cancellationToken)
        {
            var results           = new List <Result>();
            var query             = new Query(Fields);
            var matchingFields    = new Dictionary <FieldReference, MatchData>();
            var termFieldCache    = new HashSet <string>();
            var requiredMatches   = new Dictionary <string, ISet <string> >();
            var prohibitedMatches = new Dictionary <string, ISet <string> >();

            // To support field level boosts a query vector is created per
            // field. An empty vector is eagerly created to support negated
            // queries.
            var queryVectors = new Dictionary <string, Vector>();

            foreach (string field in Fields)
            {
                queryVectors[field] = new Vector();
            }

            queryFactory(query);

            for (int i = 0; i < query.Clauses.Count; i++)
            {
                Clause        clause        = query.Clauses[i];
                ISet <string> clauseMatches = Set <string> .Empty;

                // Unless the pipeline has been disabled for this term, which is
                // the case for terms with wildcards, we need to pass the clause
                // term through the search pipeline. A pipeline returns an array
                // of processed terms. Pipeline functions may expand the passed
                // term, which means we may end up performing multiple index lookups
                // for a single query term.
                await foreach (string term in clause.UsePipeline
                    ? Pipeline.RunString(
                                   clause.Term,
                                   new TokenMetadata
                {
                    { "fields", clause.Fields }
                },
                                   cancellationToken)
                    : new[] { clause.Term }.ToAsyncEnumerable(cancellationToken))
                {
                    // Each term returned from the pipeline needs to use the same query
                    // clause object, e.g. the same boost and or edit distance. The
                    // simplest way to do this is to re-use the clause object but mutate
                    // its term property.
                    clause = clause.WithTerm(term);

                    // From the term in the clause we create a token set which will then
                    // be used to intersect the indexes token set to get a list of terms
                    // to lookup in the inverted index.
                    var termTokenSet = TokenSet.FromClause(clause);
                    IEnumerable <string> expandedTerms = TokenSet.Intersect(termTokenSet).ToEnumeration();

                    // If a term marked as required does not exist in the tokenSet it is
                    // impossible for the search to return any matches.We set all the field
                    // scoped required matches set to empty and stop examining any further
                    // clauses.
                    if (!expandedTerms.Any() && clause.Presence == QueryPresence.Required)
                    {
                        foreach (string field in clause.Fields)
                        {
                            requiredMatches.Add(field, Set <string> .Empty);
                        }

                        break;
                    }

                    foreach (string expandedTerm in expandedTerms)
                    {
                        // For each term get the posting and termIndex, this is required for building the query vector.
                        InvertedIndexEntry posting = InvertedIndex[expandedTerm];
                        int termIndex = posting.Index;

                        foreach (string field in clause.Fields)
                        {
                            // For each field that this query term is scoped by (by default
                            // all fields are in scope) we need to get all the document refs
                            // that have this term in that field.
                            //
                            // The posting is the entry in the invertedIndex for the matching
                            // term from above.
                            // For each field that this query term is scoped by (by default
                            // all fields are in scope) we need to get all the document refs
                            // that have this term in that field.
                            //
                            // The posting is the entry in the invertedIndex for the matching
                            // term from above.
                            FieldMatches         fieldPosting         = posting[field];
                            ICollection <string> matchingDocumentRefs = fieldPosting.Keys;
                            string termField           = expandedTerm + '/' + field;
                            var    matchingDocumentSet = new Set <string>(matchingDocumentRefs);

                            // if the presence of this term is required ensure that the matching
                            // documents are added to the set of required matches for this clause.
                            if (clause.Presence == QueryPresence.Required)
                            {
                                clauseMatches = clauseMatches.Union(matchingDocumentSet);

                                if (!requiredMatches.ContainsKey(field))
                                {
                                    requiredMatches.Add(field, Set <string> .Complete);
                                }
                            }

                            // if the presence of this term is prohibited ensure that the matching
                            // documents are added to the set of prohibited matches for this field,
                            // creating that set if it does not yet exist.
                            if (clause.Presence == QueryPresence.Prohibited)
                            {
                                if (!prohibitedMatches.ContainsKey(field))
                                {
                                    prohibitedMatches.Add(field, Set <string> .Empty);
                                }

                                prohibitedMatches[field] = prohibitedMatches[field].Union(matchingDocumentSet);

                                // Prohibited matches should not be part of the query vector used for
                                // similarity scoring and no metadata should be extracted so we continue
                                // to the next field.
                                continue;
                            }

                            // The query field vector is populated using the termIndex found for
                            // the term and a unit value with the appropriate boost applied.
                            // Using upsert because there could already be an entry in the vector
                            // for the term we are working with.In that case we just add the scores
                            // together.
                            queryVectors[field].Upsert(
                                termIndex,
                                clause.Boost,
                                (a, b) => a + b);

                            // If we've already seen this term, field combo then we've already collected
                            // the matching documents and metadata, no need to go through all that again.
                            if (termFieldCache.Contains(termField))
                            {
                                continue;
                            }

                            foreach (string matchingDocumentRef in matchingDocumentRefs)
                            {
                                // All metadata for this term/field/document triple
                                // are then extracted and collected into an instance
                                // of lunr.MatchData ready to be returned in the query
                                // results.
                                var matchingFieldRef        = new FieldReference(matchingDocumentRef, field);
                                FieldMatchMetadata metadata = fieldPosting[matchingDocumentRef];

                                if (!matchingFields.TryGetValue(matchingFieldRef, out MatchData? fieldMatch))
                                {
                                    matchingFields.Add(
                                        matchingFieldRef,
                                        new MatchData(expandedTerm, field, metadata));
                                }
                                else
                                {
                                    fieldMatch.Add(expandedTerm, field, metadata);
                                }
                            }

                            termFieldCache.Add(termField);
                        }
                    }
                }

                // If the presence was required we need to update the requiredMatches field sets.
                // We do this after all fields for the term have collected their matches because
                // the clause terms presence is required in _any_ of the fields not _all_ of the
                // fields.
                if (clause.Presence == QueryPresence.Required)
                {
                    foreach (string field in clause.Fields)
                    {
                        requiredMatches[field] = requiredMatches[field].Intersect(clauseMatches);
                    }
                }
            }

            // Need to combine the field scoped required and prohibited
            // matching documents into a global set of required and prohibited
            // matches.
            ISet <string> allRequiredMatches   = Set <string> .Complete;
            ISet <string> allProhibitedMatches = Set <string> .Empty;

            foreach (string field in Fields)
            {
                if (requiredMatches.ContainsKey(field))
                {
                    allRequiredMatches = allRequiredMatches.Intersect(requiredMatches[field]);
                }

                if (prohibitedMatches.ContainsKey(field))
                {
                    allProhibitedMatches = allProhibitedMatches.Union(prohibitedMatches[field]);
                }
            }

            IEnumerable <string> matchingFieldRefs
                = matchingFields.Keys.Select(k => k.ToString());
            var matches = new Dictionary <string, Result>();

            // If the query is negated (contains only prohibited terms)
            // we need to get _all_ fieldRefs currently existing in the
            // index. This is only done when we know that the query is
            // entirely prohibited terms to avoid any cost of getting all
            // fieldRefs unnecessarily.
            //
            // Additionally, blank MatchData must be created to correctly
            // populate the results.
            if (query.IsNegated)
            {
                matchingFieldRefs = FieldVectors.Keys;

                foreach (string matchingFieldRef in matchingFieldRefs)
                {
                    var fieldRef = FieldReference.FromString(matchingFieldRef);
                    matchingFields.Add(fieldRef, MatchData.Empty);
                }
            }

            foreach (string fieldRefString in matchingFieldRefs)
            {
                // Currently we have document fields that match the query, but we
                // need to return documents.The matchData and scores are combined
                // from multiple fields belonging to the same document.
                //
                // Scores are calculated by field, using the query vectors created
                // above, and combined into a final document score using addition.
                var    fieldRef = FieldReference.FromString(fieldRefString);
                string docRef   = fieldRef.DocumentReference;

                if (!allRequiredMatches.Contains(docRef))
                {
                    continue;
                }
                if (allProhibitedMatches.Contains(docRef))
                {
                    continue;
                }

                Vector fieldVector = FieldVectors[fieldRefString];
                double score       = queryVectors[fieldRef.FieldName].Similarity(fieldVector);

                if (matches.TryGetValue(docRef, out Result? docMatch))
                {
                    docMatch.Score += score;
                    docMatch.MatchData.Combine(matchingFields[fieldRef]);
                }
                else
                {
                    var match = new Result(
                        documentReference: docRef,
                        score,
                        matchData: matchingFields[fieldRef]
                        );
                    matches.Add(docRef, match);
                    if (cancellationToken.IsCancellationRequested)
                    {
                        yield break;
                    }
                    results.Add(match);
                }
            }

            foreach (Result match in results.OrderByDescending(r => r.Score))
            {
                yield return(match);
            }
        }