예제 #1
0
 /// <summary>
 /// Sets up this instance for <paramref name="text"/>: stores the text, derives the
 /// threshold from it via <c>GetThreshold</c>, creates a new <c>EditDistance</c> over it,
 /// and records the substring flag.
 /// </summary>
 /// <param name="text">The source text; must not be null.</param>
 /// <param name="substringsAreSimilar">Stored unchanged in <c>_substringsAreSimilar</c>.</param>
 /// <exception cref="ArgumentNullException"><paramref name="text"/> is null.</exception>
 private void Initialize(string text, bool substringsAreSimilar)
 {
     // Reject null before any field is touched.
     if (text == null)
     {
         throw new ArgumentNullException(nameof(text));
     }

     _source = text;
     _threshold = GetThreshold(_source);
     _editDistance = new EditDistance(text);
     _substringsAreSimilar = substringsAreSimilar;
 }
예제 #2
0
 /// <summary>
 /// Computes the edit distance between <paramref name="source"/> and
 /// <paramref name="target"/> using a temporary <c>EditDistance</c> instance that is
 /// disposed before this method returns.
 /// </summary>
 /// <param name="source">Text the temporary <c>EditDistance</c> is constructed over.</param>
 /// <param name="target">Text passed to the instance-level <c>GetEditDistance</c> call.</param>
 /// <param name="threshold">Forwarded unchanged to the instance-level call; defaults to <c>int.MaxValue</c>.</param>
 /// <returns>The value produced by the instance-level <c>GetEditDistance</c> call.</returns>
 public static int GetEditDistance(string source, string target, int threshold = int.MaxValue)
 {
     using (var calculator = new EditDistance(source))
     {
         var distance = calculator.GetEditDistance(target, threshold);
         return distance;
     }
 }
예제 #3
0
 /// <summary>
 /// Creates a checker over <paramref name="text"/>, computing its threshold via
 /// <c>GetThreshold</c> and creating the <c>EditDistance</c> instance it holds.
 /// </summary>
 /// <param name="text">The source text; must not be null.</param>
 /// <param name="substringsAreSimilar">Stored unchanged in <c>_substringsAreSimilar</c>.</param>
 /// <exception cref="ArgumentNullException"><paramref name="text"/> is null.</exception>
 public WordSimilarityChecker(string text, bool substringsAreSimilar)
 {
     if (text == null)
     {
         throw new ArgumentNullException(nameof(text));
     }

     _source = text;
     _threshold = GetThreshold(_source);
     _editDistance = new EditDistance(text);
     _substringsAreSimilar = substringsAreSimilar;
 }
예제 #4
0
            /// <summary>
            /// Inserts the word described by <paramref name="characterSpan"/> into the set of
            /// builder nodes, walking from node 0 along existing edges until a node is reached
            /// that has no edge for the computed edit distance.
            /// </summary>
            /// <param name="characterSpan">Span of the word inside <c>_concatenatedLowerCaseWords</c>.</param>
            /// <param name="insertionIndex">
            /// Slot for the new node. Index 0 is stored directly in <c>_builderNodes</c>; any other
            /// index is handed to <c>AddChildNode</c>.
            /// </param>
            /// <exception cref="InvalidOperationException">
            /// The word has edit distance 0 to a node visited during the walk.
            /// </exception>
            private void Add(TextSpan characterSpan, int insertionIndex)
            {
                // The first word has nothing to be compared against; it simply occupies slot 0,
                // which is where every later walk begins.
                if (insertionIndex == 0)
                {
                    _builderNodes[insertionIndex] = new BuilderNode(characterSpan);
                    return;
                }

                var nodeIndex = 0;

                while (true)
                {
                    var node = _builderNodes[nodeIndex];

                    // No threshold is supplied here: the exact distance is required to decide
                    // which edge to follow or create.
                    var existingWord = _concatenatedLowerCaseWords.AsSpan(
                        node.CharacterSpan.Start,
                        node.CharacterSpan.Length);
                    var newWord = _concatenatedLowerCaseWords.AsSpan(
                        characterSpan.Start,
                        characterSpan.Length);
                    var distance = EditDistance.GetEditDistance(existingWord, newWord);

                    if (distance == 0)
                    {
                        // Items are deduplicated before the 'Add' step, so two distinct entries
                        // can never be at distance zero from each other.
                        throw new InvalidOperationException();
                    }

                    if (!TryGetChildIndex(node, nodeIndex, distance, out var nextNodeIndex))
                    {
                        // No edge with this distance yet: this node becomes the parent.
                        AddChildNode(
                            characterSpan,
                            insertionIndex,
                            node.EdgeCount,
                            nodeIndex,
                            distance);
                        return;
                    }

                    // An edge with this distance already exists; descend into that child and retry.
                    nodeIndex = nextNodeIndex;
                }
            }
예제 #5
0
        /// <summary>
        /// Recursively collects into <paramref name="result"/> every word reachable from
        /// <paramref name="currentNode"/> whose edit distance to the query is at most
        /// <paramref name="threshold"/>.
        /// </summary>
        /// <param name="currentNode">Node whose word is tested and whose edges are walked.</param>
        /// <param name="queryCharacters">Buffer holding the query characters.</param>
        /// <param name="queryLength">Number of characters of <paramref name="queryCharacters"/> that form the query.</param>
        /// <param name="threshold">Maximum edit distance for a word to be reported.</param>
        /// <param name="result">List that matching words are appended to.</param>
        private void Lookup(Node currentNode, char[] queryCharacters, int queryLength, int threshold, List <string> result)
        {
            // The exact distance (no threshold cut-off) is needed, because it determines
            // which outgoing edges can still lead to matches.
            var wordSpan = currentNode.WordSpan;
            var word = new ArraySlice <char>(_concatenatedLowerCaseWords, wordSpan);
            var query = new ArraySlice <char>(queryCharacters, 0, queryLength);
            var distance = EditDistance.GetEditDistance(word, query);

            if (distance <= threshold)
            {
                // This node's word is itself a match.
                result.Add(new string(_concatenatedLowerCaseWords, wordSpan.Start, wordSpan.Length));
            }

            // Only edges labelled within [distance - threshold, distance + threshold] are followed.
            var lowerBound = distance - threshold;
            var upperBound = distance + threshold;

            var firstEdge = currentNode.FirstEdgeIndex;
            var edgeLimit = firstEdge + currentNode.EdgeCount;

            for (var edgeIndex = firstEdge; edgeIndex < edgeLimit; edgeIndex++)
            {
                var edgeDistance = _edges[edgeIndex].EditDistance;
                if (edgeDistance < lowerBound || edgeDistance > upperBound)
                {
                    continue;
                }

                Lookup(_nodes[_edges[edgeIndex].ChildNodeIndex],
                       queryCharacters, queryLength, threshold, result);
            }
        }
예제 #6
0
 /// <summary>
 /// Disposes the held <c>EditDistance</c> (if any), clears the source text and the cached
 /// last result, and pushes this instance onto <c>s_pool</c>.
 /// </summary>
 public void Free()
 {
     var editDistance = _editDistance;
     if (editDistance != null)
     {
         editDistance.Dispose();
     }

     _editDistance = null;
     _source = null;
     _lastAreSimilarResult = default(CacheResult);

     // Hand the cleared instance back to the pool.
     s_pool.Push(this);
 }
예제 #7
0
 /// <summary>
 /// Disposes the held <c>EditDistance</c> (if any), clears the source text and the cached
 /// last result, and pushes this instance onto <c>s_pool</c> while holding <c>s_poolGate</c>.
 /// </summary>
 public void Free()
 {
     var editDistance = _editDistance;
     if (editDistance != null)
     {
         editDistance.Dispose();
     }

     _editDistance = null;
     _source = null;
     _lastAreSimilarResult = default;

     // The push is performed under the pool's lock.
     lock (s_poolGate)
     {
         s_pool.Push(this);
     }
 }
예제 #8
0
        /// <summary>
        /// Creates a checker over <paramref name="text"/>, computing its threshold via
        /// <c>GetThreshold</c> and creating the <c>EditDistance</c> instance it holds.
        /// </summary>
        /// <param name="text">The source text; must not be null.</param>
        /// <exception cref="ArgumentNullException"><paramref name="text"/> is null.</exception>
        public WordSimilarityChecker(string text)
        {
            // The throw happens before any field is assigned.
            _source = text ?? throw new ArgumentNullException(nameof(text));
            _threshold = GetThreshold(_source);
            _editDistance = new EditDistance(text);
        }
예제 #9
0
        /// <summary>
        /// Initializes the checker for <paramref name="text"/>: stores it, derives the
        /// threshold from it via <c>GetThreshold</c>, and creates a new <c>EditDistance</c> over it.
        /// </summary>
        /// <param name="text">The source text; must not be null.</param>
        /// <exception cref="ArgumentNullException"><paramref name="text"/> is null.</exception>
        public WordSimilarityChecker(string text)
        {
            // Null is rejected as part of the first assignment, so no field is set on failure.
            _source = text ?? throw new ArgumentNullException(nameof(text));

            _threshold = GetThreshold(_source);
            _editDistance = new EditDistance(text);
        }
예제 #10
0
        /// <summary>
        /// Recursively collects into <paramref name="result"/> every word reachable from
        /// <paramref name="currentNode"/> whose edit distance to the query is at most
        /// <paramref name="threshold"/>, giving up on any path deeper than 256 levels.
        /// </summary>
        /// <param name="currentNode">Node whose word is tested and whose edges are walked.</param>
        /// <param name="queryCharacters">Buffer holding the query characters.</param>
        /// <param name="queryLength">Number of characters of <paramref name="queryCharacters"/> that form the query.</param>
        /// <param name="threshold">Maximum edit distance for a word to be reported.</param>
        /// <param name="result">List that matching words are appended to.</param>
        /// <param name="recursionCount">Current recursion depth; each nested call passes this value plus one.</param>
        private void Lookup(
            Node currentNode,
            char[] queryCharacters,
            int queryLength,
            int threshold,
            List <string> result,
            int recursionCount)
        {
            // Pathological trees (for example 10,000 symbols that are each a single letter
            // long) can form very long chains. Searching far down such a chain is fairly
            // fruitless, so the walk is cut off once it gets this deep.
            //
            // This does not hurt good searches against good data even when such a chain
            // exists: the good items still cluster near the root and are not at the far end
            // of the chain.
            const int MaxRecursionDepth = 256;
            if (recursionCount > MaxRecursionDepth)
            {
                return;
            }

            // The exact distance (no threshold cut-off) is needed, because it determines
            // which outgoing edges can still lead to matches.
            var wordSpan = currentNode.WordSpan;
            var word = _concatenatedLowerCaseWords.AsSpan(wordSpan.Start, wordSpan.Length);
            var query = queryCharacters.AsSpan(0, queryLength);
            var distance = EditDistance.GetEditDistance(word, query);

            if (distance <= threshold)
            {
                // This node's word is itself a match.
                result.Add(new string(_concatenatedLowerCaseWords, wordSpan.Start, wordSpan.Length));
            }

            // Only edges labelled within [distance - threshold, distance + threshold] are followed.
            var lowerBound = distance - threshold;
            var upperBound = distance + threshold;

            var firstEdge = currentNode.FirstEdgeIndex;
            var edgeLimit = firstEdge + currentNode.EdgeCount;

            for (var edgeIndex = firstEdge; edgeIndex < edgeLimit; edgeIndex++)
            {
                var edgeDistance = _edges[edgeIndex].EditDistance;
                if (edgeDistance < lowerBound || edgeDistance > upperBound)
                {
                    continue;
                }

                Lookup(_nodes[_edges[edgeIndex].ChildNodeIndex],
                       queryCharacters, queryLength, threshold, result,
                       recursionCount + 1);
            }
        }
예제 #11
0
        /// <summary>
        /// Creates a checker over <paramref name="text"/>, computing its threshold via
        /// <c>GetThreshold</c> and creating the <c>EditDistance</c> instance it holds.
        /// </summary>
        /// <param name="text">The source text; must not be null.</param>
        /// <param name="substringsAreSimilar">Stored unchanged in <c>_substringsAreSimilar</c>.</param>
        /// <exception cref="ArgumentNullException"><paramref name="text"/> is null.</exception>
        public WordSimilarityChecker(string text, bool substringsAreSimilar)
        {
            // Guard clause: fail fast on null. nameof keeps the reported parameter name in
            // sync with the signature if the parameter is ever renamed.
            if (text == null)
            {
                throw new ArgumentNullException(nameof(text));
            }

            _source               = text;
            _threshold            = GetThreshold(_source);
            _editDistance         = new EditDistance(text);
            _substringsAreSimilar = substringsAreSimilar;
        }
예제 #12
0
        /// <summary>
        /// Returns true if <paramref name="candidateText"/> is likely a misspelling of
        /// <paramref name="originalText"/>, and false otherwise.
        /// </summary>
        /// <param name="originalText">The text being matched against.</param>
        /// <param name="candidateText">The text being evaluated as a possible match.</param>
        /// <param name="costThreshold">Highest edit distance that still counts as a match.</param>
        /// <param name="matchCost">
        /// When the method returns true: the edit distance (or <paramref name="costThreshold"/>
        /// when the match was accepted only because <paramref name="candidateText"/> contains
        /// <paramref name="originalText"/>) plus the result of <c>Penalty</c>. Lower costs mean a
        /// better match. When the method returns false: the raw edit distance.
        /// </param>
        /// <returns>True when the candidate is accepted as a close match.</returns>
        public static bool IsCloseMatch(string originalText, string candidateText, int costThreshold, out double matchCost)
        {
            matchCost = EditDistance.GetEditDistance(originalText, candidateText);

            if (matchCost > costThreshold)
            {
                // The cost is too high on its own. A candidate that contains the original
                // text (ordinal, ignoring case) is still accepted, but only at the threshold
                // itself, which ranks it below every other match.
                var containsOriginal =
                    candidateText.IndexOf(originalText, StringComparison.OrdinalIgnoreCase) >= 0;
                if (!containsOriginal)
                {
                    return false;
                }

                matchCost = costThreshold;
            }

            matchCost += Penalty(candidateText, originalText);
            return true;
        }
예제 #13
0
 /// <summary>
 /// Disposes the held <c>EditDistance</c> instance, if there is one, and clears the field.
 /// Calling this method more than once is safe.
 /// </summary>
 public void Dispose()
 {
     // The field is nulled below, so a second call would otherwise dereference null and
     // throw NullReferenceException; the null-conditional call turns it into a no-op.
     _editDistance?.Dispose();
     _editDistance = null;
 }
예제 #14
0
 /// <summary>
 /// Disposes the held <c>EditDistance</c> instance, if there is one, and clears the field.
 /// </summary>
 public void Dispose()
 {
     var editDistance = _editDistance;
     if (editDistance != null)
     {
         editDistance.Dispose();
     }

     _editDistance = null;
 }
예제 #15
0
 /// <summary>
 /// Initializes the checker for <paramref name="text"/>: stores it, derives the threshold
 /// from it via <c>GetThreshold</c>, creates a new <c>EditDistance</c> over it, and records
 /// the substring flag.
 /// </summary>
 /// <param name="text">The source text; must not be null.</param>
 /// <param name="substringsAreSimilar">Stored unchanged in <c>_substringsAreSimilar</c>.</param>
 /// <exception cref="ArgumentNullException"><paramref name="text"/> is null.</exception>
 public WordSimilarityChecker(string text, bool substringsAreSimilar)
 {
     // Validate first so that no field is assigned when the argument is rejected.
     if (text == null)
     {
         throw new ArgumentNullException(nameof(text));
     }

     _source = text;
     _threshold = GetThreshold(_source);
     _editDistance = new EditDistance(text);
     _substringsAreSimilar = substringsAreSimilar;
 }