private static int GetEditDistanceWorker(ArraySlice <char> source, ArraySlice <char> target, int threshold) { // Note: sourceLength will always be smaller or equal to targetLength. // // Also Note: sourceLength and targetLength values will mutate and represent the lengths // of the portions of the arrays we want to compare. However, even after mutation, hte // invariant htat sourceLength is <= targetLength will remain. Debug.Assert(source.Length <= target.Length); // First: // Determine the common prefix/suffix portions of the strings. We don't even need to // consider them as they won't add anything to the edit cost. while (source.Length > 0 && source[source.Length - 1] == target[target.Length - 1]) { source.SetLength(source.Length - 1); target.SetLength(target.Length - 1); } while (source.Length > 0 && source[0] == target[0]) { source.MoveStartForward(amount: 1); target.MoveStartForward(amount: 1); } // 'sourceLength' and 'targetLength' are now the lengths of the substrings of our strings that we // want to compare. 'startIndex' is the starting point of the substrings in both array. // // If we've matched all of the 'source' string in the prefix and suffix of 'target'. then the edit // distance is just whatever operations we have to create the remaining target substring. // // Note: we don't have to check if targetLength is 0. That's because targetLength being zero would // necessarily mean that sourceLength is 0. var sourceLength = source.Length; var targetLength = target.Length; if (sourceLength == 0) { return(targetLength <= threshold ? targetLength : BeyondThreshold); } // The is the minimum number of edits we'd have to make. i.e. if 'source' and // 'target' are the same length, then we might not need to make any edits. However, // if target has length 10 and source has length 7, then we're going to have to // make at least 3 edits no matter what. var minimumEditCount = targetLength - sourceLength; Debug.Assert(minimumEditCount >= 0); // If the number of edits we'd have to perform is greater than our threshold, then // there's no point in even continuing. if (minimumEditCount > threshold) { return(BeyondThreshold); } // Say we want to find the edit distance between "sunday" and "saturday". Our initial // matrix will be: // // (Note: for purposes of this explanation we will not be trimming off the common // prefix/suffix of the strings. That optimization does not affect any of the // remainder of the explanation). // // s u n d a y // ---------------- // |∞ ∞ ∞ ∞ ∞ ∞ ∞ ∞ // |∞ 0 1 2 3 4 5 6 // s |∞ 1 // a |∞ 2 // t |∞ 3 // u |∞ 4 // r |∞ 5 // d |∞ 6 // a |∞ 7 // y |∞ 8 // // Note that the matrix will always be square, or a rectangle that is taller htan it is // longer. Our 'source' is at the top, and our 'target' is on the left. The edit distance // between any prefix of 'source' and any prefix of 'target' can then be found in // the unfilled area of the matrix. Specifically, if we have source.substring(0, m) and // target.substring(0, n), then the edit distance for them can be found at matrix position // (m+1, n+1). This is why the 1'th row and 1'th column can be prefilled. They represent // the cost to go from the empty target to the full source or the empty source to the full // target (respectively). So, if we wanted to know the edit distance between "sun" and // "sat", we'd look at (3+1, 3+1). It then follows that our final edit distance between // the full source and target is in the lower right corner of this matrix. // // If we fill out the matrix fully we'll get: // // s u n d a y <-- source // ---------------- // |∞ ∞ ∞ ∞ ∞ ∞ ∞ ∞ // |∞ 0 1 2 3 4 5 6 // s |∞ 1 0 1 2 3 4 5 // a |∞ 2 1 1 2 3 3 4 // t |∞ 3 2 2 2 3 4 4 // u |∞ 4 3 2 3 3 4 5 // r |∞ 5 4 3 3 4 4 5 // d |∞ 6 5 4 4 3 4 5 // a |∞ 7 6 5 5 4 3 4 // y |∞ 8 7 6 6 5 4 3 <-- // ^ // | // // So in this case, the edit distance is 3. Or, specifically, the edits: // // Sunday -> Replace("n", "r") -> // Surday -> Insert("a") -> // Saurday -> Insert("t") -> // Saturday // // // Now: in the case where we want to know what the edit distance actually is (for example // when making a BKTree), we must fill out this entire array to get the true edit distance. // // However, in some cases we can do a bit better. For example, if a client only wants to // the edit distance *when the edit distance will be less than some threshold* then we do // not need to examine the entire matrix. We only want to examine until the point where // we realize that, no matter what, our final edit distance will be more than that threshold // (at which point we can return early). // // Some things are trivially easy to check. First, the edit distance between two strings is at // *best* the difference of their lengths. i.e. if i have "aa" and "aaaaa" then the edit // distance is 3 (the difference of 5 and 2). If our threshold is less then 3 then there // is no way these two strings could match. So we can leave early if we can tell it would // simply be impossible to get an edit distance within the specified threshold. // // Second, let's look at our matrix again: // // s u n d a y // ---------------- // |∞ ∞ ∞ ∞ ∞ ∞ ∞ ∞ // |∞ 0 1 2 3 4 5 6 // s |∞ 1 // a |∞ 2 // t |∞ 3 // u |∞ 4 // r |∞ 5 // d |∞ 6 // a |∞ 7 // y |∞ 8 * // // We want to know what the value is at *, and we want to stop as early as possible if it // is greater than our threshold. // // Given the edit distance rules we observe edit distance at any point (i,j) in the matrix will // always be greater than or equal to the value in (i-1, j-1). i.e. the edit distance of // any two strings is going to be *at best* equal to the edit distance of those two strings // without their final characters. If their final characters are the same, they'll ahve the // same edit distance. If they are different, the edit distance will be greater. Given // that we know the final edit distance is in the lower right, we can discover something // useful in the matrix. // // s u n d a y // ---------------- // |∞ ∞ ∞ ∞ ∞ ∞ ∞ ∞ // |∞ 0 1 2 3 4 5 6 // s |∞ 1 // a |∞ 2 // t |∞ 3 ` // u |∞ 4 ` // r |∞ 5 ` // d |∞ 6 ` // a |∞ 7 ` // y |∞ 8 * // // The slashes are the "bottom" diagonal leading to the lower right. The value in the // lower right will be strictly equal to or greater than any value on this diagonal. // Thus, if that value exceeds the threshold, we know we can stop immediately as the // total edit distance must be greater than the threshold. // // We can use similar logic to avoid even having to examine more of the matrix when we // have a threshold. First, consider the same diagonal. // // s u n d a y // ---------------- // |∞ ∞ ∞ ∞ ∞ ∞ ∞ ∞ // |∞ 0 1 2 3 4 5 6 // s |∞ 1 // a |∞ 2 // t |∞ 3 ` // u |∞ 4 ` x // r |∞ 5 ` | // d |∞ 6 ` | // a |∞ 7 ` | // y |∞ 8 * // // And then consider a point above that diagonal (indicated by x). In the example // above, the edit distance to * from 'x' will be (x+4). If, for example, threshold // was '2', then it would be impossible for the path from 'x' to provide a good // enough edit distance *ever*. Similarly: // // s u n d a y // ---------------- // |∞ ∞ ∞ ∞ ∞ ∞ ∞ ∞ // |∞ 0 1 2 3 4 5 6 // s |∞ 1 // a |∞ 2 // t |∞ 3 ` // u |∞ 4 ` // r |∞ 5 ` // d |∞ 6 ` // a |∞ 7 ` // y |∞ 8 y - - * // // Here we see that the final edit distance will be "y+3". Again, if the edit // distance threshold is less than 3, then no path from y will provide a good // enough edit distance. // // So, if we had an edit distance threshold of 3, then the range around that // bottom diagonal that we should consider checking is: // // s u n d a y // ---------------- // |∞ ∞ ∞ ∞ ∞ ∞ ∞ ∞ // |∞ 0 1 2 3 4 5 6 // s |∞ 1 | | // a |∞ 2 | | | // t |∞ 3 ` | | | // u |∞ 4 - ` | | | // r |∞ 5 - - ` | | | // d |∞ 6 - - - ` | | // a |∞ 7 - - - ` | // y |∞ 8 - - - * // // Now, also consider that it will take a minimum of targetLength-sourceLength edits // just to move to the lower diagonal from the upper diagonal. That leaves // 'threshold - (targetLength - sourceLength)' edits remaining. In this example, that // means '3 - (8 - 6)' = 1. Because of this our lower diagonal offset is capped at: // // s u n d a y // ---------------- // |∞ ∞ ∞ ∞ ∞ ∞ ∞ ∞ // |∞ 0 1 2 3 4 5 6 // s |∞ 1 | | // a |∞ 2 | | | // t |∞ 3 ` | | | // u |∞ 4 - ` | | | // r |∞ 5 - ` | | | // d |∞ 6 - ` | | // a |∞ 7 - ` | // y |∞ 8 - * // // If we mark the upper diagonal appropriately we see the matrix as: // // s u n d a y // ---------------- // |∞ ∞ ∞ ∞ ∞ ∞ ∞ ∞ // |∞ 0 1 2 3 4 5 6 // s |∞ 1 ` | // a |∞ 2 ` | // t |∞ 3 ` ` | // u |∞ 4 - ` ` | // r |∞ 5 - ` ` | // d |∞ 6 - ` ` // a |∞ 7 - ` // y |∞ 8 - * // // Or, effectively, we only need to examine 'threshold - (targetLength - sourceLength)' // above and below the diagonals. // // In practice, when a threshold is provided it is normally capped at '2'. Given that, // the most around the diagonal we'll ever have to check is +/- 2 elements. i.e. with // strings of length 10 we'd only check: // // a b c d e f g h i j // ------------------------ // |∞ ∞ ∞ ∞ ∞ ∞ ∞ ∞ ∞ ∞ ∞ ∞ // |∞ 0 1 2 3 4 5 6 7 8 9 10 // m |∞ 1 * * * // n |∞ 2 * * * * // o |∞ 3 * * * * * // p |∞ 4 * * * * * // q |∞ 5 * * * * * // r |∞ 6 * * * * * // s |∞ 7 * * * * * // t |∞ 8 * * * * * // u |∞ 9 * * * * // v |∞10 * * * // // or 10+18+16=44. Or only 44%. if our threshold is two and our strings differ by length // 2 then we have: // // a b c d e f g h // -------------------- // |∞ ∞ ∞ ∞ ∞ ∞ ∞ ∞ ∞ ∞ // |∞ 0 1 2 3 4 5 6 7 8 // m |∞ 1 * // n |∞ 2 * * // o |∞ 3 * * * // p |∞ 4 * * * // q |∞ 5 * * * // r |∞ 6 * * * // s |∞ 7 * * * // t |∞ 8 * * * // u |∞ 9 * * // v |∞10 * // // Then we examine 8+8+8=24 out of 80, or only 30% of the matrix. As the strings // get larger, the savings increase as well. // -------------------------------------------------------------------------------- // The highest cost it can be to convert a source to target is targetLength. i.e. // changing all the characters in source to target (which would be be 'sourceLength' // changes), and then adding all the missing characters in 'target' (which is // 'targetLength' - 'sourceLength' changes). Combined that's 'targetLength'. // // So we can just cap our threshold here. This makes some of the walking code // below simpler. threshold = Math.Min(threshold, targetLength); var offset = threshold - minimumEditCount; Debug.Assert(offset >= 0); var matrix = GetMatrix(sourceLength + 2, targetLength + 2); var characterToLastSeenIndex_inSource = t_dictionaryPool.Value; characterToLastSeenIndex_inSource.Clear(); for (int i = 1; i <= sourceLength; i++) { var lastMatchIndex_inTarget = 0; var sourceChar = source[i - 1]; // Determinethe portion of the column we actually want to examine. var jStart = Math.Max(1, i - offset); var jEnd = Math.Min(targetLength, i + minimumEditCount + offset); // If we're examining only a subportion of the column, then we need to make sure // that the values outside that range are set to Infinity. That way we don't // consider them when we look through edit paths from above (for this column) or // from the left (for the next column). if (jStart > 1) { matrix[i + 1, jStart] = Infinity; } if (jEnd < targetLength) { matrix[i + 1, jEnd + 2] = Infinity; } for (int j = jStart; j <= jEnd; j++) { var targetChar = target[j - 1]; var i1 = GetValue(characterToLastSeenIndex_inSource, targetChar); var j1 = lastMatchIndex_inTarget; var matched = sourceChar == targetChar; if (matched) { lastMatchIndex_inTarget = j; } matrix[i + 1, j + 1] = Min( matrix[i, j] + (matched ? 0 : 1), matrix[i + 1, j] + 1, matrix[i, j + 1] + 1, matrix[i1, j1] + (i - i1 - 1) + 1 + (j - j1 - 1)); } characterToLastSeenIndex_inSource[sourceChar] = i; // Recall that minimumEditCount is simply the difference in length of our two // strings. So matrix[i+1,i+1] is the cost for the upper-left diagonal of the // matrix. matrix[i+1,i+1+minimumEditCount] is the cost for the lower right diagonal. // Here we are simply getting the lowest cost edit of hese two substrings so far. // If this lowest cost edit is greater than our threshold, then there is no need // to proceed. if (matrix[i + 1, i + minimumEditCount + 1] > threshold) { return(BeyondThreshold); } } return(matrix[sourceLength + 1, targetLength + 1]); }
public static int GetEditDistance(ArraySlice <char> source, ArraySlice <char> target, int threshold = int.MaxValue) { return(source.Length <= target.Length ? GetEditDistanceWorker(source, target, threshold) : GetEditDistanceWorker(target, source, threshold)); }