Exemplo n.º 1
0
 /// <summary>
 /// Initializes an index from its already-built parts. Each argument is
 /// stored as-is in the property of the same name.
 /// </summary>
 /// <param name="invertedIndex">An index of term/field to document reference.</param>
 /// <param name="fieldVectors">The field vectors, stored in <c>FieldVectors</c>.</param>
 /// <param name="tokenSet">A set of all corpus tokens.</param>
 /// <param name="fields">The names of indexed document fields.</param>
 /// <param name="pipeline">The pipeline to use for search terms.</param>
 internal Index(
     InvertedIndex invertedIndex,
     IDictionary<string, Vector> fieldVectors,
     TokenSet tokenSet,
     IEnumerable<string> fields,
     Pipeline pipeline)
 {
     // Explicit `this.` keeps member and parameter visually distinct, since
     // they differ only by the case of their first letter.
     this.InvertedIndex = invertedIndex;
     this.FieldVectors = fieldVectors;
     this.TokenSet = tokenSet;
     this.Fields = fields;
     this.Pipeline = pipeline;
 }
Exemplo n.º 2
0
        /// <summary>
        /// Builds a new TokenSet containing only what is matched by both
        /// this TokenSet and <paramref name="other"/>.
        ///
        /// A '*' edge in <paramref name="other"/> is treated as a wildcard:
        /// it matches every edge of the node it is compared against.
        /// </summary>
        /// <param name="other">Another TokenSet to intersect with.</param>
        /// <returns>The intersection of the sets.</returns>
        public TokenSet Intersect(TokenSet other)
        {
            var result = new TokenSet(_idProvider);

            // Explicit work list instead of recursion; each frame pairs a node
            // of the query set, the output node being built, and a node of this set.
            var pending = new Stack<(TokenSet qNode, TokenSet output, TokenSet node)>();
            pending.Push((other, result, this));

            while (pending.Count > 0)
            {
                (TokenSet queryFrame, TokenSet outputFrame, TokenSet indexFrame) = pending.Pop();

                foreach ((char queryChar, TokenSet queryChild) in queryFrame.Edges)
                {
                    foreach ((char indexChar, TokenSet indexChild) in indexFrame.Edges)
                    {
                        // Skip pairs of edges that neither agree on the
                        // character nor involve a query wildcard.
                        if (indexChar != queryChar && queryChar != '*')
                        {
                            continue;
                        }

                        bool bothFinal = indexChild.IsFinal && queryChild.IsFinal;

                        if (!outputFrame.Edges.TryGetValue(indexChar, out TokenSet outputChild))
                        {
                            // First time this character is seen below the
                            // current output node: create the child, carrying
                            // the finality of the matched pair.
                            outputChild = new TokenSet(_idProvider)
                            {
                                IsFinal = bothFinal
                            };
                            outputFrame.Edges.Add(indexChar, outputChild);
                        }
                        else
                        {
                            // The child already exists; finality can only be
                            // switched on, never off.
                            outputChild.IsFinal = outputChild.IsFinal || bothFinal;
                        }

                        pending.Push((queryChild, outputChild, indexChild));
                    }
                }
            }

            return result;
        }
Exemplo n.º 3
0
        /// <summary>
        /// Creates a TokenSet from a string.
        ///
        /// The string may contain one or more wildcard characters (*)
        /// that will allow wildcard matching when intersecting with
        /// another TokenSet. Consecutive wildcards are accepted and
        /// behave like a single wildcard.
        /// </summary>
        /// <param name="str">The string to create a TokenSet from.</param>
        /// <param name="idProvider">
        /// The provider passed to every created node; when null,
        /// <c>TokenSetIdProvider.Instance</c> is used.
        /// </param>
        /// <returns>The root of the created token set.</returns>
        public static TokenSet FromString(
            string str,
            TokenSetIdProvider? idProvider = null!)
        {
            idProvider ??= TokenSetIdProvider.Instance;
            var      root = new TokenSet(idProvider);
            TokenSet node = root;

            // Iterates through all characters within the passed string
            // appending a node for each character.
            //
            // When a wildcard character is found then a self
            // referencing edge is introduced to continually match
            // any number of any characters.
            for (int i = 0; i < str.Length; i++)
            {
                char ch      = str[i];
                bool isFinal = i == str.Length - 1;

                if (ch == Query.Wildcard)
                {
                    // A wildcard does not advance `node`, so a second
                    // consecutive wildcard lands on a node that already has
                    // its self-referencing edge. Only add the edge when it is
                    // missing to avoid inserting a duplicate key.
                    if (!node.Edges.TryGetValue(ch, out _))
                    {
                        node.Edges.Add(ch, node);
                    }

                    node.IsFinal = isFinal;
                }
                else
                {
                    var next = new TokenSet(idProvider)
                    {
                        IsFinal = isFinal
                    };

                    node.Edges.Add(ch, next);
                    node = next;
                }
            }

            return root;
        }
Exemplo n.º 4
0
            /// <summary>
            /// Adds a word to the token set being built. Words have to be
            /// supplied in ascending ordinal order.
            /// </summary>
            /// <param name="word">The word to add.</param>
            /// <exception cref="InvalidOperationException">
            /// <paramref name="word"/> sorts before the previously inserted word.
            /// </exception>
            public void Insert(string word)
            {
                if (StringComparer.Ordinal.Compare(word, _previousWord) < 0)
                {
                    throw new InvalidOperationException("Out of order word insertion.");
                }

                // Length of the prefix shared with the previously inserted word.
                int commonPrefix = 0;
                while (commonPrefix < word.Length
                       && commonPrefix < _previousWord.Length
                       && word[commonPrefix] == _previousWord[commonPrefix])
                {
                    commonPrefix++;
                }

                Minimize(commonPrefix);

                // Continue from the most recently tracked child, or from the
                // root when nothing is being tracked.
                TokenSet current = _uncheckedNodes.LastOrDefault().child ?? Root;

                // Append one new node per character of the non-shared suffix,
                // recording each (parent, char, child) triple as unchecked.
                foreach (char suffixChar in word.Substring(commonPrefix))
                {
                    var child = new TokenSet(_idProvider);

                    current.Edges.Add(suffixChar, child);
                    _uncheckedNodes.Add((current, suffixChar, child));

                    current = child;
                }

                current.IsFinal = true;
                _previousWord   = word;
            }
Exemplo n.º 5
0
 /// <summary>
 /// Initializes the builder with a fresh root token set.
 /// </summary>
 /// <param name="idProvider">
 /// The provider stored by the builder and passed to the root node; when
 /// null, <c>TokenSetIdProvider.Instance</c> is used.
 /// </param>
 public Builder(TokenSetIdProvider? idProvider = null!)
 {
     // Resolve the fallback once so the field and the root share one provider.
     TokenSetIdProvider provider = idProvider ?? TokenSetIdProvider.Instance;

     _idProvider = provider;
     Root = new TokenSet(provider);
 }
Exemplo n.º 6
0
        /// <summary>
        /// Creates a token set representing a single string with a specified
        /// edit distance.
        ///
        /// Insertions, deletions, substitutions and transpositions are each
        /// treated as an edit distance of 1.
        ///
        /// Increasing the allowed edit distance will have a dramatic impact
        /// on the performance of both creating and intersecting these TokenSets.
        /// It is advised to keep the edit distance less than 3.
        /// </summary>
        /// <param name="str">The string to create the token set from.</param>
        /// <param name="editDistance">
        /// The allowed edit distance to match. Zero or a negative value allows
        /// no edits, so only the exact string is represented.
        /// </param>
        /// <param name="idProvider">
        /// The provider passed to every created node; when null,
        /// <c>TokenSetIdProvider.Instance</c> is used.
        /// </param>
        /// <returns>The root of the created token set.</returns>
        public static TokenSet FromFuzzyString(
            string str,
            int editDistance,
            TokenSetIdProvider? idProvider = null!)
        {
            TokenSetIdProvider provider = idProvider ?? TokenSetIdProvider.Instance;
            var root = new TokenSet(provider);

            // Returns the node reached from `parent` through `edge`, creating
            // and attaching a new node when no such edge exists yet.
            TokenSet GetOrAddChild(TokenSet parent, char edge)
            {
                if (!parent.Edges.TryGetValue(edge, out TokenSet child))
                {
                    child = new TokenSet(provider);
                    parent.Edges.Add(edge, child);
                }

                return child;
            }

            var stack = new Stack<(TokenSet node, int editsRemaining, string str)>();

            stack.Push((
                           node: root,
                           editsRemaining: editDistance,
                           str));

            while (stack.Count > 0)
            {
                (TokenSet frameNode, int frameEditsRemaining, string frameStr) = stack.Pop();

                // no edit
                if (frameStr.Length > 0)
                {
                    TokenSet noEditNode = GetOrAddChild(frameNode, frameStr[0]);

                    if (frameStr.Length == 1)
                    {
                        noEditNode.IsFinal = true;
                    }

                    stack.Push((
                                   node: noEditNode,
                                   editsRemaining: frameEditsRemaining,
                                   str: frameStr.Substring(1)));
                }

                // Out of edits. `<=` rather than `==` so that a negative edit
                // distance cannot skip past the stop condition: the insertion
                // step below pushes a frame unconditionally and would
                // otherwise never terminate.
                if (frameEditsRemaining <= 0)
                {
                    continue;
                }

                // insertion
                TokenSet insertionNode = GetOrAddChild(frameNode, Query.Wildcard);

                if (frameStr.Length == 0)
                {
                    insertionNode.IsFinal = true;
                }

                stack.Push((
                               node: insertionNode,
                               editsRemaining: frameEditsRemaining - 1,
                               str: frameStr));

                // deletion
                // Can only do a deletion if we have enough edits remaining
                // and if there are characters left to delete in the string.
                if (frameStr.Length > 1)
                {
                    stack.Push((
                                   node: frameNode,
                                   editsRemaining: frameEditsRemaining - 1,
                                   str: frameStr.Substring(1)));
                }

                // deletion
                // Just removing the last character from the string.
                if (frameStr.Length == 1)
                {
                    frameNode.IsFinal = true;
                }

                // substitution
                // Can only do a substitution if we have enough edits remaining
                // and if there are characters left to substitute.
                if (frameStr.Length >= 1)
                {
                    // The insertion step above has already ensured the
                    // wildcard edge exists, so this resolves to that node.
                    TokenSet substitutionNode = GetOrAddChild(frameNode, Query.Wildcard);

                    if (frameStr.Length == 1)
                    {
                        substitutionNode.IsFinal = true;
                    }

                    stack.Push((
                                   node: substitutionNode,
                                   editsRemaining: frameEditsRemaining - 1,
                                   str: frameStr.Substring(1)));
                }

                // transposition
                // Can only do a transposition if there are edits remaining
                // and there are enough characters to transpose.
                if (frameStr.Length > 1)
                {
                    char chA = frameStr[0];
                    char chB = frameStr[1];

                    // Follow the second character first; the first character
                    // is put back at the front of the remaining string. The
                    // remaining string is never empty here, so the node is
                    // not marked final at this point.
                    TokenSet transposeNode = GetOrAddChild(frameNode, chB);

                    stack.Push((
                                   node: transposeNode,
                                   editsRemaining: frameEditsRemaining - 1,
                                   str: chA + frameStr.Substring(2)));
                }
            }

            return root;
        }
Exemplo n.º 7
0
        /// <summary>
        /// Performs a query against the index using the `Query` object built
        /// by the provided factory.
        ///
        /// If performing programmatic queries against the index, this method is preferred
        /// over `Index.Search` so as to avoid the additional query parsing overhead.
        ///
        /// A query object is yielded to the supplied function which should be used to
        /// express the query to be run against the index.
        /// </summary>
        /// <param name="queryFactory">A function that builds the query object that gets passed to it.</param>
        /// <param name="cancellationToken">
        /// A cancellation token. It is forwarded to the search pipeline and checked while
        /// results are being assembled; if cancellation is observed there, the enumeration
        /// ends without yielding any result.
        /// </param>
        /// <returns>The results of the query, ordered by descending score.</returns>
        public async IAsyncEnumerable<Result> Query(
            Action<Query> queryFactory,
            [EnumeratorCancellation] CancellationToken cancellationToken)
        {
            var results           = new List<Result>();
            var query             = new Query(Fields);
            var matchingFields    = new Dictionary<FieldReference, MatchData>();
            var termFieldCache    = new HashSet<string>();
            var requiredMatches   = new Dictionary<string, ISet<string>>();
            var prohibitedMatches = new Dictionary<string, ISet<string>>();

            // To support field level boosts a query vector is created per
            // field. An empty vector is eagerly created to support negated
            // queries.
            var queryVectors = new Dictionary<string, Vector>();

            foreach (string field in Fields)
            {
                queryVectors[field] = new Vector();
            }

            queryFactory(query);

            for (int i = 0; i < query.Clauses.Count; i++)
            {
                Clause        clause        = query.Clauses[i];
                ISet<string>  clauseMatches = Set<string>.Empty;

                // Unless the pipeline has been disabled for this term, which is
                // the case for terms with wildcards, we need to pass the clause
                // term through the search pipeline. A pipeline returns an array
                // of processed terms. Pipeline functions may expand the passed
                // term, which means we may end up performing multiple index lookups
                // for a single query term.
                await foreach (string term in clause.UsePipeline
                    ? Pipeline.RunString(
                        clause.Term,
                        new TokenMetadata
                        {
                            { "fields", clause.Fields }
                        },
                        cancellationToken)
                    : new[] { clause.Term }.ToAsyncEnumerable(cancellationToken))
                {
                    // Each term returned from the pipeline needs to use the same query
                    // clause settings, e.g. the same boost and/or edit distance. The
                    // simplest way to do this is to derive a clause that differs only
                    // in its term.
                    clause = clause.WithTerm(term);

                    // From the term in the clause we create a token set which will then
                    // be used to intersect the index's token set to get a list of terms
                    // to look up in the inverted index.
                    //
                    // The expansion is materialised once because it is consulted twice
                    // below (emptiness check, then iteration).
                    var termTokenSet = TokenSet.FromClause(clause);
                    List<string> expandedTerms = TokenSet.Intersect(termTokenSet).ToEnumeration().ToList();

                    // If a term marked as required does not exist in the tokenSet it is
                    // impossible for the search to return any matches. We set all the field
                    // scoped required matches sets to empty and stop examining the remaining
                    // terms of this clause.
                    if (expandedTerms.Count == 0 && clause.Presence == QueryPresence.Required)
                    {
                        foreach (string field in clause.Fields)
                        {
                            // Assign rather than Add: an earlier required clause may
                            // already have registered this field, and Add would throw
                            // on the duplicate key.
                            requiredMatches[field] = Set<string>.Empty;
                        }

                        break;
                    }

                    foreach (string expandedTerm in expandedTerms)
                    {
                        // For each term get the posting and termIndex, this is required for building the query vector.
                        InvertedIndexEntry posting = InvertedIndex[expandedTerm];
                        int termIndex = posting.Index;

                        foreach (string field in clause.Fields)
                        {
                            // For each field that this query term is scoped by (by default
                            // all fields are in scope) we need to get all the document refs
                            // that have this term in that field.
                            //
                            // The posting is the entry in the invertedIndex for the matching
                            // term from above.
                            FieldMatches        fieldPosting         = posting[field];
                            ICollection<string> matchingDocumentRefs = fieldPosting.Keys;
                            string termField           = expandedTerm + '/' + field;
                            var    matchingDocumentSet = new Set<string>(matchingDocumentRefs);

                            // If the presence of this term is required ensure that the matching
                            // documents are added to the set of required matches for this clause.
                            if (clause.Presence == QueryPresence.Required)
                            {
                                clauseMatches = clauseMatches.Union(matchingDocumentSet);

                                if (!requiredMatches.ContainsKey(field))
                                {
                                    requiredMatches.Add(field, Set<string>.Complete);
                                }
                            }

                            // If the presence of this term is prohibited ensure that the matching
                            // documents are added to the set of prohibited matches for this field,
                            // creating that set if it does not yet exist.
                            if (clause.Presence == QueryPresence.Prohibited)
                            {
                                if (!prohibitedMatches.ContainsKey(field))
                                {
                                    prohibitedMatches.Add(field, Set<string>.Empty);
                                }

                                prohibitedMatches[field] = prohibitedMatches[field].Union(matchingDocumentSet);

                                // Prohibited matches should not be part of the query vector used for
                                // similarity scoring and no metadata should be extracted so we continue
                                // to the next field.
                                continue;
                            }

                            // The query field vector is populated using the termIndex found for
                            // the term and a unit value with the appropriate boost applied.
                            // Using upsert because there could already be an entry in the vector
                            // for the term we are working with. In that case we just add the scores
                            // together.
                            queryVectors[field].Upsert(
                                termIndex,
                                clause.Boost,
                                (a, b) => a + b);

                            // If we've already seen this term, field combo then we've already collected
                            // the matching documents and metadata, no need to go through all that again.
                            if (termFieldCache.Contains(termField))
                            {
                                continue;
                            }

                            foreach (string matchingDocumentRef in matchingDocumentRefs)
                            {
                                // All metadata for this term/field/document triple
                                // are then extracted and collected into an instance
                                // of MatchData ready to be returned in the query
                                // results.
                                var matchingFieldRef        = new FieldReference(matchingDocumentRef, field);
                                FieldMatchMetadata metadata = fieldPosting[matchingDocumentRef];

                                if (!matchingFields.TryGetValue(matchingFieldRef, out MatchData? fieldMatch))
                                {
                                    matchingFields.Add(
                                        matchingFieldRef,
                                        new MatchData(expandedTerm, field, metadata));
                                }
                                else
                                {
                                    fieldMatch.Add(expandedTerm, field, metadata);
                                }
                            }

                            termFieldCache.Add(termField);
                        }
                    }
                }

                // If the presence was required we need to update the requiredMatches field sets.
                // We do this after all fields for the term have collected their matches because
                // the clause term's presence is required in _any_ of the fields not _all_ of the
                // fields.
                //
                // NOTE(review): entries in requiredMatches are only created inside the term
                // loop above. If a required clause produces no terms at all from the pipeline,
                // the lookup below has no entry for its fields - confirm whether that can occur.
                if (clause.Presence == QueryPresence.Required)
                {
                    foreach (string field in clause.Fields)
                    {
                        requiredMatches[field] = requiredMatches[field].Intersect(clauseMatches);
                    }
                }
            }

            // Need to combine the field scoped required and prohibited
            // matching documents into a global set of required and prohibited
            // matches.
            ISet<string> allRequiredMatches   = Set<string>.Complete;
            ISet<string> allProhibitedMatches = Set<string>.Empty;

            foreach (string field in Fields)
            {
                if (requiredMatches.TryGetValue(field, out ISet<string>? fieldRequired))
                {
                    allRequiredMatches = allRequiredMatches.Intersect(fieldRequired);
                }

                if (prohibitedMatches.TryGetValue(field, out ISet<string>? fieldProhibited))
                {
                    allProhibitedMatches = allProhibitedMatches.Union(fieldProhibited);
                }
            }

            IEnumerable<string> matchingFieldRefs
                = matchingFields.Keys.Select(k => k.ToString());
            var matches = new Dictionary<string, Result>();

            // If the query is negated (contains only prohibited terms)
            // we need to get _all_ fieldRefs currently existing in the
            // index. This is only done when we know that the query is
            // entirely prohibited terms to avoid any cost of getting all
            // fieldRefs unnecessarily.
            //
            // Additionally, blank MatchData must be created to correctly
            // populate the results.
            if (query.IsNegated)
            {
                matchingFieldRefs = FieldVectors.Keys;

                foreach (string matchingFieldRef in matchingFieldRefs)
                {
                    var fieldRef = FieldReference.FromString(matchingFieldRef);
                    matchingFields.Add(fieldRef, MatchData.Empty);
                }
            }

            foreach (string fieldRefString in matchingFieldRefs)
            {
                // Currently we have document fields that match the query, but we
                // need to return documents. The matchData and scores are combined
                // from multiple fields belonging to the same document.
                //
                // Scores are calculated by field, using the query vectors created
                // above, and combined into a final document score using addition.
                var    fieldRef = FieldReference.FromString(fieldRefString);
                string docRef   = fieldRef.DocumentReference;

                if (!allRequiredMatches.Contains(docRef))
                {
                    continue;
                }
                if (allProhibitedMatches.Contains(docRef))
                {
                    continue;
                }

                Vector fieldVector = FieldVectors[fieldRefString];
                double score       = queryVectors[fieldRef.FieldName].Similarity(fieldVector);

                if (matches.TryGetValue(docRef, out Result? docMatch))
                {
                    docMatch.Score += score;
                    docMatch.MatchData.Combine(matchingFields[fieldRef]);
                }
                else
                {
                    var match = new Result(
                        documentReference: docRef,
                        score,
                        matchData: matchingFields[fieldRef]
                        );
                    matches.Add(docRef, match);

                    // Nothing has been yielded yet at this point, so ending here
                    // returns an empty sequence to the caller.
                    if (cancellationToken.IsCancellationRequested)
                    {
                        yield break;
                    }
                    results.Add(match);
                }
            }

            foreach (Result match in results.OrderByDescending(r => r.Score))
            {
                yield return match;
            }
        }