Example #1
0
        private static int GetEditDistanceWorker(ArraySlice <char> source, ArraySlice <char> target, int threshold)
        {
            // Note: sourceLength will always be smaller or equal to targetLength.
            //
            // Also Note: sourceLength and targetLength values will mutate and represent the lengths
            // of the portions of the arrays we want to compare.  However, even after mutation, hte
            // invariant htat sourceLength is <= targetLength will remain.
            Debug.Assert(source.Length <= target.Length);

            // First:
            // Determine the common prefix/suffix portions of the strings.  We don't even need to
            // consider them as they won't add anything to the edit cost.
            while (source.Length > 0 && source[source.Length - 1] == target[target.Length - 1])
            {
                source.SetLength(source.Length - 1);
                target.SetLength(target.Length - 1);
            }

            while (source.Length > 0 && source[0] == target[0])
            {
                source.MoveStartForward(amount: 1);
                target.MoveStartForward(amount: 1);
            }

            // 'sourceLength' and 'targetLength' are now the lengths of the substrings of our strings that we
            // want to compare. 'startIndex' is the starting point of the substrings in both array.
            //
            // If we've matched all of the 'source' string in the prefix and suffix of 'target'. then the edit
            // distance is just whatever operations we have to create the remaining target substring.
            //
            // Note: we don't have to check if targetLength is 0.  That's because targetLength being zero would
            // necessarily mean that sourceLength is 0.
            var sourceLength = source.Length;
            var targetLength = target.Length;

            if (sourceLength == 0)
            {
                return(targetLength <= threshold ? targetLength : BeyondThreshold);
            }

            // The is the minimum number of edits we'd have to make.  i.e. if  'source' and
            // 'target' are the same length, then we might not need to make any edits.  However,
            // if target has length 10 and source has length 7, then we're going to have to
            // make at least 3 edits no matter what.
            var minimumEditCount = targetLength - sourceLength;

            Debug.Assert(minimumEditCount >= 0);

            // If the number of edits we'd have to perform is greater than our threshold, then
            // there's no point in even continuing.
            if (minimumEditCount > threshold)
            {
                return(BeyondThreshold);
            }

            // Say we want to find the edit distance between "sunday" and "saturday".  Our initial
            // matrix will be:
            //
            // (Note: for purposes of this explanation we will not be trimming off the common
            // prefix/suffix of the strings.  That optimization does not affect any of the
            // remainder of the explanation).
            //
            //           s u n d a y
            //      ----------------
            //      |∞ ∞ ∞ ∞ ∞ ∞ ∞ ∞
            //      |∞ 0 1 2 3 4 5 6
            //    s |∞ 1
            //    a |∞ 2
            //    t |∞ 3
            //    u |∞ 4
            //    r |∞ 5
            //    d |∞ 6
            //    a |∞ 7
            //    y |∞ 8
            //
            // Note that the matrix will always be square, or a rectangle that is taller htan it is
            // longer.  Our 'source' is at the top, and our 'target' is on the left.  The edit distance
            // between any prefix of 'source' and any prefix of 'target' can then be found in
            // the unfilled area of the matrix.  Specifically, if we have source.substring(0, m) and
            // target.substring(0, n), then the edit distance for them can be found at matrix position
            // (m+1, n+1).  This is why the 1'th row and 1'th column can be prefilled.  They represent
            // the cost to go from the empty target to the full source or the empty source to the full
            // target (respectively).  So, if we wanted to know the edit distance between "sun" and
            // "sat", we'd look at (3+1, 3+1).  It then follows that our final edit distance between
            // the full source and target is in the lower right corner of this matrix.
            //
            // If we fill out the matrix fully we'll get:
            //
            //           s u n d a y <-- source
            //      ----------------
            //      |∞ ∞ ∞ ∞ ∞ ∞ ∞ ∞
            //      |∞ 0 1 2 3 4 5 6
            //    s |∞ 1 0 1 2 3 4 5
            //    a |∞ 2 1 1 2 3 3 4
            //    t |∞ 3 2 2 2 3 4 4
            //    u |∞ 4 3 2 3 3 4 5
            //    r |∞ 5 4 3 3 4 4 5
            //    d |∞ 6 5 4 4 3 4 5
            //    a |∞ 7 6 5 5 4 3 4
            //    y |∞ 8 7 6 6 5 4 3 <--
            //                     ^
            //                     |
            //
            // So in this case, the edit distance is 3.  Or, specifically, the edits:
            //
            //      Sunday -> Replace("n", "r") ->
            //      Surday -> Insert("a") ->
            //      Saurday -> Insert("t") ->
            //      Saturday
            //
            //
            // Now: in the case where we want to know what the edit distance actually is (for example
            // when making a BKTree), we must fill out this entire array to get the true edit distance.
            //
            // However, in some cases we can do a bit better.  For example, if a client only wants to
            // the edit distance *when the edit distance will be less than some threshold* then we do
            // not need to examine the entire matrix.  We only want to examine until the point where
            // we realize that, no matter what, our final edit distance will be more than that threshold
            // (at which point we can return early).
            //
            // Some things are trivially easy to check.  First, the edit distance between two strings is at
            // *best* the difference of their lengths.  i.e. if i have "aa" and "aaaaa" then the edit
            // distance is 3 (the difference of 5 and 2).  If our threshold is less then 3 then there
            // is no way these two strings could match.  So we can leave early if we can tell it would
            // simply be impossible to get an edit distance within the specified threshold.
            //
            // Second, let's look at our matrix again:
            //
            //           s u n d a y
            //      ----------------
            //      |∞ ∞ ∞ ∞ ∞ ∞ ∞ ∞
            //      |∞ 0 1 2 3 4 5 6
            //    s |∞ 1
            //    a |∞ 2
            //    t |∞ 3
            //    u |∞ 4
            //    r |∞ 5
            //    d |∞ 6
            //    a |∞ 7
            //    y |∞ 8           *
            //
            // We want to know what the value is at *, and we want to stop as early as possible if it
            // is greater than our threshold.
            //
            // Given the edit distance rules we observe edit distance at any point (i,j) in the matrix will
            // always be greater than or equal to the value in (i-1, j-1).  i.e. the edit distance of
            // any two strings is going to be *at best* equal to the edit distance of those two strings
            // without their final characters.  If their final characters are the same, they'll ahve the
            // same edit distance.  If they are different, the edit distance will be greater.  Given
            // that we know the final edit distance is in the lower right, we can discover something
            // useful in the matrix.
            //
            //           s u n d a y
            //      ----------------
            //      |∞ ∞ ∞ ∞ ∞ ∞ ∞ ∞
            //      |∞ 0 1 2 3 4 5 6
            //    s |∞ 1
            //    a |∞ 2
            //    t |∞ 3 `
            //    u |∞ 4   `
            //    r |∞ 5     `
            //    d |∞ 6       `
            //    a |∞ 7         `
            //    y |∞ 8           *
            //
            // The slashes are the "bottom" diagonal leading to the lower right.  The value in the
            // lower right will be strictly equal to or greater than any value on this diagonal.
            // Thus, if that value exceeds the threshold, we know we can stop immediately as the
            // total edit distance must be greater than the threshold.
            //
            // We can use similar logic to avoid even having to examine more of the matrix when we
            // have a threshold. First, consider the same diagonal.
            //
            //           s u n d a y
            //      ----------------
            //      |∞ ∞ ∞ ∞ ∞ ∞ ∞ ∞
            //      |∞ 0 1 2 3 4 5 6
            //    s |∞ 1
            //    a |∞ 2
            //    t |∞ 3 `
            //    u |∞ 4   `       x
            //    r |∞ 5     `     |
            //    d |∞ 6       `   |
            //    a |∞ 7         ` |
            //    y |∞ 8           *
            //
            // And then consider a point above that diagonal (indicated by x).  In the example
            // above, the edit distance to * from 'x' will be (x+4).  If, for example, threshold
            // was '2', then it would be impossible for the path from 'x' to provide a good
            // enough edit distance *ever*.   Similarly:
            //
            //           s u n d a y
            //      ----------------
            //      |∞ ∞ ∞ ∞ ∞ ∞ ∞ ∞
            //      |∞ 0 1 2 3 4 5 6
            //    s |∞ 1
            //    a |∞ 2
            //    t |∞ 3 `
            //    u |∞ 4   `
            //    r |∞ 5     `
            //    d |∞ 6       `
            //    a |∞ 7         `
            //    y |∞ 8     y - - *
            //
            // Here we see that the final edit distance will be "y+3".  Again, if the edit
            // distance threshold is less than 3, then no path from y will provide a good
            // enough edit distance.
            //
            // So, if we had an edit distance threshold of 3, then the range around that
            // bottom diagonal that we should consider checking is:
            //
            //           s u n d a y
            //      ----------------
            //      |∞ ∞ ∞ ∞ ∞ ∞ ∞ ∞
            //      |∞ 0 1 2 3 4 5 6
            //    s |∞ 1 | |
            //    a |∞ 2 | | |
            //    t |∞ 3 ` | | |
            //    u |∞ 4 - ` | | |
            //    r |∞ 5 - - ` | | |
            //    d |∞ 6 - - - ` | |
            //    a |∞ 7   - - - ` |
            //    y |∞ 8     - - - *
            //
            // Now, also consider that it will take a minimum of targetLength-sourceLength edits
            // just to move to the lower diagonal from the upper diagonal.  That leaves
            // 'threshold - (targetLength - sourceLength)' edits remaining.  In this example, that
            // means '3 - (8 - 6)' = 1.  Because of this our lower diagonal offset is capped at:
            //
            //           s u n d a y
            //      ----------------
            //      |∞ ∞ ∞ ∞ ∞ ∞ ∞ ∞
            //      |∞ 0 1 2 3 4 5 6
            //    s |∞ 1 | |
            //    a |∞ 2 | | |
            //    t |∞ 3 ` | | |
            //    u |∞ 4 - ` | | |
            //    r |∞ 5   - ` | | |
            //    d |∞ 6     - ` | |
            //    a |∞ 7       - ` |
            //    y |∞ 8         - *
            //
            // If we mark the upper diagonal appropriately we see the matrix as:
            //
            //           s u n d a y
            //      ----------------
            //      |∞ ∞ ∞ ∞ ∞ ∞ ∞ ∞
            //      |∞ 0 1 2 3 4 5 6
            //    s |∞ 1 ` |
            //    a |∞ 2   ` |
            //    t |∞ 3 `   ` |
            //    u |∞ 4 - `   ` |
            //    r |∞ 5   - `   ` |
            //    d |∞ 6     - `   `
            //    a |∞ 7       - `
            //    y |∞ 8         - *
            //
            // Or, effectively, we only need to examine 'threshold - (targetLength - sourceLength)'
            // above and below the diagonals.
            //
            // In practice, when a threshold is provided it is normally capped at '2'.  Given that,
            // the most around the diagonal we'll ever have to check is +/- 2 elements.  i.e. with
            // strings of length 10 we'd only check:
            //
            //           a b c d e f g h i j
            //      ------------------------
            //      |∞ ∞ ∞ ∞ ∞ ∞ ∞ ∞ ∞ ∞ ∞ ∞
            //      |∞ 0 1 2 3 4 5 6 7 8 9 10
            //    m |∞ 1 * * *
            //    n |∞ 2 * * * *
            //    o |∞ 3 * * * * *
            //    p |∞ 4   * * * * *
            //    q |∞ 5     * * * * *
            //    r |∞ 6       * * * * *
            //    s |∞ 7         * * * * *
            //    t |∞ 8           * * * * *
            //    u |∞ 9             * * * *
            //    v |∞10               * * *
            //
            // or 10+18+16=44.  Or only 44%. if our threshold is two and our strings differ by length
            // 2 then we have:
            //
            //           a b c d e f g h
            //      --------------------
            //      |∞ ∞ ∞ ∞ ∞ ∞ ∞ ∞ ∞ ∞
            //      |∞ 0 1 2 3 4 5 6 7 8
            //    m |∞ 1 *
            //    n |∞ 2 * *
            //    o |∞ 3 * * *
            //    p |∞ 4   * * *
            //    q |∞ 5     * * *
            //    r |∞ 6       * * *
            //    s |∞ 7         * * *
            //    t |∞ 8           * * *
            //    u |∞ 9             * *
            //    v |∞10               *
            //
            // Then we examine 8+8+8=24 out of 80, or only 30% of the matrix.  As the strings
            // get larger, the savings increase as well.

            // --------------------------------------------------------------------------------

            // The highest cost it can be to convert a source to target is targetLength.  i.e.
            // changing all the characters in source to target (which would be be 'sourceLength'
            // changes), and then adding all the missing characters in 'target' (which is
            // 'targetLength' - 'sourceLength' changes).  Combined that's 'targetLength'.
            //
            // So we can just cap our threshold here.  This makes some of the walking code
            // below simpler.
            threshold = Math.Min(threshold, targetLength);

            var offset = threshold - minimumEditCount;

            Debug.Assert(offset >= 0);

            var matrix = GetMatrix(sourceLength + 2, targetLength + 2);

            var characterToLastSeenIndex_inSource = t_dictionaryPool.Value;

            characterToLastSeenIndex_inSource.Clear();

            for (int i = 1; i <= sourceLength; i++)
            {
                var lastMatchIndex_inTarget = 0;
                var sourceChar = source[i - 1];

                // Determinethe portion of the column we actually want to examine.
                var jStart = Math.Max(1, i - offset);
                var jEnd   = Math.Min(targetLength, i + minimumEditCount + offset);

                // If we're examining only a subportion of the column, then we need to make sure
                // that the values outside that range are set to Infinity.  That way we don't
                // consider them when we look through edit paths from above (for this column) or
                // from the left (for the next column).
                if (jStart > 1)
                {
                    matrix[i + 1, jStart] = Infinity;
                }

                if (jEnd < targetLength)
                {
                    matrix[i + 1, jEnd + 2] = Infinity;
                }

                for (int j = jStart; j <= jEnd; j++)
                {
                    var targetChar = target[j - 1];

                    var i1 = GetValue(characterToLastSeenIndex_inSource, targetChar);
                    var j1 = lastMatchIndex_inTarget;

                    var matched = sourceChar == targetChar;
                    if (matched)
                    {
                        lastMatchIndex_inTarget = j;
                    }

                    matrix[i + 1, j + 1] = Min(
                        matrix[i, j] + (matched ? 0 : 1),
                        matrix[i + 1, j] + 1,
                        matrix[i, j + 1] + 1,
                        matrix[i1, j1] + (i - i1 - 1) + 1 + (j - j1 - 1));
                }

                characterToLastSeenIndex_inSource[sourceChar] = i;

                // Recall that minimumEditCount is simply the difference in length of our two
                // strings.  So matrix[i+1,i+1] is the cost for the upper-left diagonal of the
                // matrix.  matrix[i+1,i+1+minimumEditCount] is the cost for the lower right diagonal.
                // Here we are simply getting the lowest cost edit of hese two substrings so far.
                // If this lowest cost edit is greater than our threshold, then there is no need
                // to proceed.
                if (matrix[i + 1, i + minimumEditCount + 1] > threshold)
                {
                    return(BeyondThreshold);
                }
            }

            return(matrix[sourceLength + 1, targetLength + 1]);
        }
Example #2
0
 public static int GetEditDistance(ArraySlice <char> source, ArraySlice <char> target, int threshold = int.MaxValue)
 {
     return(source.Length <= target.Length
         ? GetEditDistanceWorker(source, target, threshold)
         : GetEditDistanceWorker(target, source, threshold));
 }