/// <summary>
/// Sets up this instance for <paramref name="text"/>: records it as the source, derives the
/// threshold from it via <c>GetThreshold</c>, and creates a new <see cref="EditDistance"/> for it.
/// </summary>
/// <param name="text">The source text. Must not be null.</param>
/// <param name="substringsAreSimilar">Stored unchanged in <c>_substringsAreSimilar</c>.</param>
/// <exception cref="ArgumentNullException"><paramref name="text"/> is null.</exception>
private void Initialize(string text, bool substringsAreSimilar)
{
    // Reject null before touching any field so a failed call leaves the instance untouched.
    if (text == null)
    {
        throw new ArgumentNullException(nameof(text));
    }

    _source = text;
    _threshold = GetThreshold(text);
    _editDistance = new EditDistance(text);
    _substringsAreSimilar = substringsAreSimilar;
}
/// <summary>
/// Convenience wrapper: builds a temporary <see cref="EditDistance"/> for <paramref name="source"/>,
/// asks it for the distance to <paramref name="target"/>, and disposes it before returning.
/// </summary>
/// <param name="source">Text the temporary <see cref="EditDistance"/> is constructed from.</param>
/// <param name="target">Text passed to the instance's <c>GetEditDistance</c>.</param>
/// <param name="threshold">Forwarded unchanged to the instance call; defaults to <see cref="int.MaxValue"/>.</param>
/// <returns>The value returned by the instance-level <c>GetEditDistance</c> call.</returns>
public static int GetEditDistance(string source, string target, int threshold = int.MaxValue)
{
    var calculator = new EditDistance(source);
    try
    {
        return calculator.GetEditDistance(target, threshold);
    }
    finally
    {
        // Always release the helper, even if the computation throws.
        calculator.Dispose();
    }
}
/// <summary>
/// Creates a checker for <paramref name="text"/>: records it as the source, derives the threshold
/// from it via <c>GetThreshold</c>, and creates the <see cref="EditDistance"/> used for comparisons.
/// </summary>
/// <param name="text">The source text. Must not be null.</param>
/// <param name="substringsAreSimilar">Stored unchanged in <c>_substringsAreSimilar</c>.</param>
/// <exception cref="ArgumentNullException"><paramref name="text"/> is null.</exception>
public WordSimilarityChecker(string text, bool substringsAreSimilar)
{
    if (text == null)
    {
        throw new ArgumentNullException(nameof(text));
    }

    _source = text;
    _threshold = GetThreshold(text);
    _editDistance = new EditDistance(text);
    _substringsAreSimilar = substringsAreSimilar;
}
/// <summary>
/// Inserts the word identified by <paramref name="characterSpan"/> (a span into
/// <c>_concatenatedLowerCaseWords</c>) into the tree being built, storing it at
/// <paramref name="insertionIndex"/>.
/// </summary>
/// <param name="characterSpan">Start and length of the word within <c>_concatenatedLowerCaseWords</c>.</param>
/// <param name="insertionIndex">Slot for the new node; index 0 is written directly as the first node.</param>
/// <exception cref="InvalidOperationException">
/// The word has an edit distance of zero to a node already in the tree.
/// </exception>
private void Add(TextSpan characterSpan, int insertionIndex)
{
    // The first word simply becomes node 0; there is nothing to compare it against.
    if (insertionIndex == 0)
    {
        _builderNodes[insertionIndex] = new BuilderNode(characterSpan);
        return;
    }

    // Walk down from node 0 until we reach a node that has no edge for our edit distance.
    var parentIndex = 0;
    while (true)
    {
        var parent = _builderNodes[parentIndex];

        // The real (un-thresholded) edit distance is required here: its exact value decides
        // which edge we follow or create.
        var parentWord = _concatenatedLowerCaseWords.AsSpan(
            parent.CharacterSpan.Start,
            parent.CharacterSpan.Length);
        var newWord = _concatenatedLowerCaseWords.AsSpan(
            characterSpan.Start,
            characterSpan.Length);
        var distance = EditDistance.GetEditDistance(parentWord, newWord);

        if (distance == 0)
        {
            // Should be impossible: all items are deduped before the 'Add' step, so two
            // words in the tree can never be identical.
            throw new InvalidOperationException();
        }

        if (!TryGetChildIndex(parent, parentIndex, distance, out var childIndex))
        {
            // No existing edge with this distance: this is the node to attach the new child to.
            AddChildNode(characterSpan, insertionIndex, parent.EdgeCount, parentIndex, distance);
            return;
        }

        // An edge with this distance already exists. Descend into that child and try again.
        parentIndex = childIndex;
    }
}
/// <summary>
/// Collects into <paramref name="result"/> every word in the subtree rooted at
/// <paramref name="currentNode"/> whose edit distance to the query is at most
/// <paramref name="threshold"/>.
/// </summary>
/// <param name="currentNode">Node at which the search starts.</param>
/// <param name="queryCharacters">Buffer holding the query characters.</param>
/// <param name="queryLength">Number of characters of <paramref name="queryCharacters"/> that form the query.</param>
/// <param name="threshold">Maximum edit distance for a word to count as a match.</param>
/// <param name="result">List that matching words are appended to.</param>
/// <remarks>
/// Recursion is capped at a fixed depth. A degenerate tree (for example a very long chain of
/// nodes) would otherwise recurse once per level and could overflow the stack, which cannot be
/// caught. Nodes deeper than the cap are not visited.
/// </remarks>
private void Lookup(Node currentNode, char[] queryCharacters, int queryLength, int threshold, List<string> result)
{
    const int MaxRecursionDepth = 256;

    Visit(currentNode, 0);

    void Visit(Node node, int depth)
    {
        // Give up on pathologically deep chains instead of risking a stack overflow.
        if (depth > MaxRecursionDepth)
        {
            return;
        }

        // We always want to compute the real edit distance (ignoring any thresholds). This is
        // because we need that edit distance to appropriately determine which edges to walk
        // in the tree.
        var characterSpan = node.WordSpan;
        var editDistance = EditDistance.GetEditDistance(
            new ArraySlice<char>(_concatenatedLowerCaseWords, characterSpan),
            new ArraySlice<char>(queryCharacters, 0, queryLength));

        if (editDistance <= threshold)
        {
            // Found a match.
            result.Add(new string(_concatenatedLowerCaseWords, characterSpan.Start, characterSpan.Length));
        }

        // Only edges whose recorded distance lies within 'threshold' of our own distance to
        // this node are followed; all other subtrees are skipped.
        var min = editDistance - threshold;
        var max = editDistance + threshold;

        var startInclusive = node.FirstEdgeIndex;
        var endExclusive = startInclusive + node.EdgeCount;
        for (var i = startInclusive; i < endExclusive; i++)
        {
            var childEditDistance = _edges[i].EditDistance;
            if (min <= childEditDistance && childEditDistance <= max)
            {
                Visit(_nodes[_edges[i].ChildNodeIndex], depth + 1);
            }
        }
    }
}
/// <summary>
/// Releases this instance's per-use state and pushes it onto <c>s_pool</c>.
/// Disposes the current <c>_editDistance</c> (if any), then clears the source text,
/// the edit-distance reference and the cached last result.
/// </summary>
public void Free()
{
    // Dispose first: if it throws, the remaining state is left untouched.
    if (_editDistance != null)
    {
        _editDistance.Dispose();
    }

    _source = null;
    _editDistance = null;
    _lastAreSimilarResult = default(CacheResult);

    s_pool.Push(this);
}
/// <summary>
/// Releases this instance's per-use state and pushes it onto <c>s_pool</c> while holding
/// <c>s_poolGate</c>. Disposes the current <c>_editDistance</c> (if any), then clears the
/// source text, the edit-distance reference and the cached last result.
/// </summary>
public void Free()
{
    // Dispose first: if it throws, the remaining state is left untouched.
    if (_editDistance != null)
    {
        _editDistance.Dispose();
    }

    _source = null;
    _editDistance = null;
    _lastAreSimilarResult = default;

    // The push onto the pool is done under the pool's gate.
    lock (s_poolGate)
    {
        s_pool.Push(this);
    }
}
/// <summary>
/// Creates a checker for <paramref name="text"/>: records it as the source, derives the threshold
/// from it via <c>GetThreshold</c>, and creates the <see cref="EditDistance"/> used for comparisons.
/// </summary>
/// <param name="text">The source text. Must not be null.</param>
/// <exception cref="ArgumentNullException"><paramref name="text"/> is null.</exception>
public WordSimilarityChecker(string text)
{
    _source = text ?? throw new ArgumentNullException(nameof(text));
    _threshold = GetThreshold(_source);
    _editDistance = new EditDistance(_source);
}
/// <summary>
/// Collects into <paramref name="result"/> every word in the subtree rooted at
/// <paramref name="currentNode"/> whose edit distance to the query is at most
/// <paramref name="threshold"/>, visiting at most a fixed number of levels.
/// </summary>
/// <param name="currentNode">Node being examined.</param>
/// <param name="queryCharacters">Buffer holding the query characters.</param>
/// <param name="queryLength">Number of characters of <paramref name="queryCharacters"/> that form the query.</param>
/// <param name="threshold">Maximum edit distance for a word to count as a match.</param>
/// <param name="result">List that matching words are appended to.</param>
/// <param name="recursionCount">Depth of this call; each recursive call passes the value plus one.</param>
private void Lookup(
    Node currentNode,
    char[] queryCharacters,
    int queryLength,
    int threshold,
    List<string> result,
    int recursionCount)
{
    // Stop descending in pathological trees. Those only arise from strange input (for
    // instance 10,000 symbols that are each a single letter long), and in that case
    // searching further down the chain would be fairly fruitless anyway.
    //
    // Good searches against good data are unaffected even when such a chain exists:
    // the good items still cluster near the root rather than at the far end of the chain.
    if (recursionCount > 256)
    {
        return;
    }

    // The real edit distance (no threshold applied) is needed, because its exact value
    // determines which edges are worth walking.
    var wordSpan = currentNode.WordSpan;
    var distance = EditDistance.GetEditDistance(
        _concatenatedLowerCaseWords.AsSpan(wordSpan.Start, wordSpan.Length),
        queryCharacters.AsSpan(0, queryLength));

    if (distance <= threshold)
    {
        // This node's word is close enough to the query: report it.
        result.Add(new string(_concatenatedLowerCaseWords, wordSpan.Start, wordSpan.Length));
    }

    var lowerBound = distance - threshold;
    var upperBound = distance + threshold;

    var firstEdge = currentNode.FirstEdgeIndex;
    var pastLastEdge = firstEdge + currentNode.EdgeCount;
    for (var edgeIndex = firstEdge; edgeIndex < pastLastEdge; edgeIndex++)
    {
        var edgeDistance = _edges[edgeIndex].EditDistance;

        // Skip children whose edge distance falls outside [lowerBound, upperBound].
        if (edgeDistance < lowerBound || edgeDistance > upperBound)
        {
            continue;
        }

        Lookup(
            _nodes[_edges[edgeIndex].ChildNodeIndex],
            queryCharacters,
            queryLength,
            threshold,
            result,
            recursionCount + 1);
    }
}
/// <summary>
/// Creates a checker for <paramref name="text"/>: records it as the source, derives the threshold
/// from it via <c>GetThreshold</c>, and creates the <see cref="EditDistance"/> used for comparisons.
/// </summary>
/// <param name="text">The source text. Must not be null.</param>
/// <param name="substringsAreSimilar">Stored unchanged in <c>_substringsAreSimilar</c>.</param>
/// <exception cref="ArgumentNullException"><paramref name="text"/> is null.</exception>
public WordSimilarityChecker(string text, bool substringsAreSimilar)
{
    // nameof keeps the reported parameter name in sync with the signature under renames.
    _source = text ?? throw new ArgumentNullException(nameof(text));
    _threshold = GetThreshold(_source);
    _editDistance = new EditDistance(text);
    _substringsAreSimilar = substringsAreSimilar;
}
/// <summary>
/// Determines whether <paramref name="candidateText"/> is likely a misspelling of
/// <paramref name="originalText"/>. When it is, <paramref name="matchCost"/> can be used
/// to rank the match: lower costs mean a better match.
/// </summary>
/// <param name="originalText">The text to match against.</param>
/// <param name="candidateText">The text being evaluated as a possible match.</param>
/// <param name="costThreshold">The highest cost (before the penalty is added) that still counts as a match.</param>
/// <param name="matchCost">
/// On a true return, the cost plus the value of <c>Penalty</c>. On a false return, the
/// raw edit distance between the two texts.
/// </param>
/// <returns>True if the candidate is a close match; otherwise false.</returns>
public static bool IsCloseMatch(string originalText, string candidateText, int costThreshold, out double matchCost)
{
    matchCost = EditDistance.GetEditDistance(originalText, candidateText);

    if (matchCost > costThreshold)
    {
        // Too costly on edit distance alone. The only way it can still qualify is if the
        // original text appears somewhere inside the candidate (ignoring case).
        var containsOriginal =
            candidateText.IndexOf(originalText, StringComparison.OrdinalIgnoreCase) >= 0;
        if (!containsOriginal)
        {
            return false;
        }

        // Accept it, but pin it at the threshold so it ranks worse than all other matches.
        matchCost = costThreshold;
    }

    matchCost += Penalty(candidateText, originalText);
    return true;
}
/// <summary>
/// Disposes the owned <c>_editDistance</c> instance, if there is one, and clears the reference.
/// Safe to call more than once: later calls find the field already null and do nothing.
/// </summary>
public void Dispose()
{
    // Null-conditional so a repeated Dispose() does not throw NullReferenceException.
    _editDistance?.Dispose();
    _editDistance = null;
}
/// <summary>
/// Disposes the owned <c>_editDistance</c> instance, if there is one, and clears the reference.
/// </summary>
public void Dispose()
{
    if (_editDistance != null)
    {
        _editDistance.Dispose();
    }

    _editDistance = null;
}