Exemple #1
0
        /// <summary>
        /// Decides whether <paramref name="candidateText"/> is similar enough to the source text to be
        /// reported as a match, and produces a weight for ranking it.
        /// </summary>
        /// <param name="candidateText">The text to compare against the source text.</param>
        /// <param name="similarityWeight">
        /// On a <c>true</c> return: the cost of the match (at most the threshold) plus the value returned by
        /// <c>Penalty</c>.  On a <c>false</c> return: whatever cost was computed before the candidate was
        /// rejected (<see cref="double.MaxValue"/> if the edit distance was never computed).
        /// </param>
        /// <returns><c>true</c> if the candidate is accepted; otherwise <c>false</c>.</returns>
        private bool AreSimilarWorker(string candidateText, out double similarityWeight)
        {
            // When the lengths differ by more than the threshold, the edit distance must also
            // exceed the threshold (it takes at least that many insertions/deletions), so the
            // computation is skipped and the weight starts out as "infinitely expensive".
            var lengthDifference = Math.Abs(_source.Length - candidateText.Length);
            similarityWeight = lengthDifference <= _threshold
                ? _editDistance.GetEditDistance(candidateText, _threshold)
                : double.MaxValue;

            if (similarityWeight > _threshold)
            {
                // Too expensive as an edit.  The only remaining way to qualify is for the
                // candidate to contain the source text (ignoring case).
                var containsSource = candidateText.IndexOf(_source, StringComparison.OrdinalIgnoreCase) >= 0;
                if (!containsSource)
                {
                    return false;
                }

                // A containment match is pinned exactly at the threshold so that it ranks
                // below every genuine edit-distance match.
                similarityWeight = _threshold;
            }

            Debug.Assert(similarityWeight <= _threshold);

            similarityWeight += Penalty(candidateText, _source);
            return true;
        }
Exemple #2
0
 /// <summary>
 /// Convenience wrapper: builds a temporary <c>EditDistance</c> for <paramref name="source"/>, asks it
 /// for the distance to <paramref name="target"/>, and disposes it before returning.
 /// </summary>
 /// <param name="source">The text the temporary <c>EditDistance</c> instance is constructed from.</param>
 /// <param name="target">The text to measure against <paramref name="source"/>.</param>
 /// <param name="threshold">
 /// Forwarded unchanged to the instance method; defaults to <see cref="int.MaxValue"/>.
 /// </param>
 /// <returns>The value returned by the instance-level <c>GetEditDistance</c> call.</returns>
 public static int GetEditDistance(string source, string target, int threshold = int.MaxValue)
 {
     using (var calculator = new EditDistance(source))
     {
         var distance = calculator.GetEditDistance(target, threshold);
         return distance;
     }
 }
Exemple #3
0
        /// <summary>
        /// Searches the tree rooted at <paramref name="currentNode"/> and appends to
        /// <paramref name="result"/> every stored word whose edit distance to the query is at most
        /// <paramref name="threshold"/>.
        /// </summary>
        /// <param name="currentNode">The node to start searching from.</param>
        /// <param name="queryCharacters">Buffer holding the query; only the first <paramref name="queryLength"/> characters are used.</param>
        /// <param name="queryLength">Number of characters of <paramref name="queryCharacters"/> that make up the query.</param>
        /// <param name="threshold">Maximum edit distance for a word to be reported.</param>
        /// <param name="result">List that receives the matching words.</param>
        private void Lookup(Node currentNode, char[] queryCharacters, int queryLength, int threshold, List<string> result)
        {
            LookupWorker(currentNode, queryCharacters, queryLength, threshold, result, recursionCount: 0);
        }

        /// <summary>
        /// Recursive implementation of <c>Lookup</c> that tracks how deep the walk has gone so that it
        /// can stop before exhausting the call stack.
        /// </summary>
        private void LookupWorker(
            Node currentNode,
            char[] queryCharacters,
            int queryLength,
            int threshold,
            List<string> result,
            int recursionCount)
        {
            // A degenerate tree (e.g. one long chain of single-child nodes) would otherwise make
            // this method recurse once per node and could overflow the stack, which cannot be
            // caught and takes the process down.  Give up on such a path once it gets too deep.
            const int MaxRecursionDepth = 256;
            if (recursionCount > MaxRecursionDepth)
            {
                return;
            }

            // We always want to compute the real edit distance (ignoring any thresholds).  This is
            // because we need that edit distance to appropriately determine which edges to walk
            // in the tree.
            var characterSpan = currentNode.WordSpan;
            var editDistance = EditDistance.GetEditDistance(
                new ArraySlice<char>(_concatenatedLowerCaseWords, characterSpan),
                new ArraySlice<char>(queryCharacters, 0, queryLength));

            if (editDistance <= threshold)
            {
                // Found a match.
                result.Add(new string(_concatenatedLowerCaseWords, characterSpan.Start, characterSpan.Length));
            }

            // Only edges whose recorded distance lies within 'threshold' of this node's distance
            // to the query are followed; all other children are skipped.
            var min = editDistance - threshold;
            var max = editDistance + threshold;

            var startInclusive = currentNode.FirstEdgeIndex;
            var endExclusive = startInclusive + currentNode.EdgeCount;

            for (var i = startInclusive; i < endExclusive; i++)
            {
                var childEditDistance = _edges[i].EditDistance;
                if (min <= childEditDistance && childEditDistance <= max)
                {
                    LookupWorker(_nodes[_edges[i].ChildNodeIndex],
                                 queryCharacters, queryLength, threshold, result,
                                 recursionCount + 1);
                }
            }
        }
Exemple #4
0
        /// <summary>
        /// Recursively searches the tree below <paramref name="currentNode"/> and appends to
        /// <paramref name="result"/> every stored word whose edit distance to the query is at most
        /// <paramref name="threshold"/>.
        /// </summary>
        /// <param name="currentNode">The node to examine on this call.</param>
        /// <param name="queryCharacters">Buffer holding the query; only the first <paramref name="queryLength"/> characters are used.</param>
        /// <param name="queryLength">Number of characters of <paramref name="queryCharacters"/> that make up the query.</param>
        /// <param name="threshold">Maximum edit distance for a word to be reported.</param>
        /// <param name="result">List that receives the matching words.</param>
        /// <param name="recursionCount">How many levels deep this call is; the walk stops once it exceeds 256.</param>
        private void Lookup(
            Node currentNode,
            char[] queryCharacters,
            int queryLength,
            int threshold,
            List<string> result,
            int recursionCount)
        {
            // Bail out of pathologically deep trees.  Those only show up with strange input
            // (for example 10,000 symbols that are each a single letter long), and continuing
            // down such a chain is unlikely to find anything useful.
            //
            // Good searches over good data are unaffected even when such a chain exists: the
            // good items still cluster near the root rather than at the far end of the chain.
            if (recursionCount > 256)
            {
                return;
            }

            // The true edit distance is needed here (no threshold cut-off), because it decides
            // which edges are worth following below.
            var wordSpan = currentNode.WordSpan;
            var distanceToQuery = EditDistance.GetEditDistance(
                _concatenatedLowerCaseWords.AsSpan(wordSpan.Start, wordSpan.Length),
                queryCharacters.AsSpan(0, queryLength));

            if (distanceToQuery <= threshold)
            {
                // This node's word is close enough to report.
                result.Add(new string(_concatenatedLowerCaseWords, wordSpan.Start, wordSpan.Length));
            }

            var lowestEdgeDistance = distanceToQuery - threshold;
            var highestEdgeDistance = distanceToQuery + threshold;

            var firstEdge = currentNode.FirstEdgeIndex;
            var edgeLimit = firstEdge + currentNode.EdgeCount;

            for (var edgeIndex = firstEdge; edgeIndex < edgeLimit; edgeIndex++)
            {
                var edgeDistance = _edges[edgeIndex].EditDistance;

                // Skip children whose edge distance falls outside the window around this
                // node's distance to the query.
                if (edgeDistance < lowestEdgeDistance || edgeDistance > highestEdgeDistance)
                {
                    continue;
                }

                Lookup(
                    _nodes[_edges[edgeIndex].ChildNodeIndex],
                    queryCharacters,
                    queryLength,
                    threshold,
                    result,
                    recursionCount + 1);
            }
        }
Exemple #5
0
            /// <summary>
            /// Inserts the word identified by <paramref name="characterSpan"/> into the tree under
            /// construction, storing it at <paramref name="insertionIndex"/>.
            /// </summary>
            /// <param name="characterSpan">Span of the word inside the concatenated lower-case word buffer.</param>
            /// <param name="insertionIndex">Slot for the new node; index 0 is stored directly without walking the tree.</param>
            /// <exception cref="InvalidOperationException">
            /// The word has an edit distance of zero to a word already in the tree.
            /// </exception>
            private void Add(TextSpan characterSpan, int insertionIndex)
            {
                // Index 0 is written straight into its slot; nothing is walked or linked.
                if (insertionIndex == 0)
                {
                    _builderNodes[0] = new BuilderNode(characterSpan);
                    return;
                }

                // Start at index 0 and descend until a node is found that has no child at the
                // new word's edit distance.
                var parentIndex = 0;

                for (;;)
                {
                    var parent = _builderNodes[parentIndex];

                    // No threshold is applied: the exact distance is required to pick the edge
                    // to follow or to create.
                    var distance = EditDistance.GetEditDistance(
                        new ArraySlice<char>(_concatenatedLowerCaseWords, parent.CharacterSpan),
                        new ArraySlice<char>(_concatenatedLowerCaseWords, characterSpan));

                    if (distance == 0)
                    {
                        // Items are de-duplicated before reaching 'Add', so a zero distance
                        // means that invariant was broken.
                        throw new InvalidOperationException();
                    }

                    int existingChildIndex;
                    if (!TryGetChildIndex(parent, parentIndex, distance, out existingChildIndex))
                    {
                        // No child at this distance yet: attach the new word here.
                        AddChildNode(characterSpan, insertionIndex, parent.EdgeCount, parentIndex, distance);
                        return;
                    }

                    // A child already sits at this distance; continue the search from it.
                    parentIndex = existingChildIndex;
                }
            }
Exemple #6
0
        /// <summary>
        /// Determines whether <paramref name="candidateText"/> is likely a misspelling of
        /// <paramref name="originalText"/>.
        /// </summary>
        /// <param name="originalText">The text being looked for.</param>
        /// <param name="candidateText">The text being tested as a possible match.</param>
        /// <param name="costThreshold">The highest edit distance that still counts as a match.</param>
        /// <param name="matchCost">
        /// When the method returns <c>true</c>, a cost for ranking the match (lower is better).  When it
        /// returns <c>false</c>, the raw edit distance between the two texts.
        /// </param>
        /// <returns><c>true</c> if the candidate is a close match; otherwise <c>false</c>.</returns>
        public static bool IsCloseMatch(string originalText, string candidateText, int costThreshold, out double matchCost)
        {
            matchCost = EditDistance.GetEditDistance(originalText, candidateText);

            if (matchCost > costThreshold)
            {
                // Too many edits.  The candidate can still qualify if it contains the original
                // text (ignoring case); anything else is rejected here.
                var containsOriginal = candidateText.IndexOf(originalText, StringComparison.OrdinalIgnoreCase) >= 0;
                if (!containsOriginal)
                {
                    return false;
                }

                // Containment matches are placed exactly at the threshold so that they rank
                // behind every true edit-distance match.
                matchCost = costThreshold;
            }

            matchCost += Penalty(candidateText, originalText);
            return true;
        }