// Recursively works out the longest common subsequence of the two sequences, feeding each
// common run it discovers to EmitDiffsFromCommonSequence as it goes.  Every step asks
// FindMiddleSnake for a middle "snake" (a run common to both sequences) and then either
// finishes directly (edit script of length 0 or 1) or recurses on the parts before and
// after the snake.
//
// left, right: the sequences to compare.  The snake offsets reported by FindMiddleSnake are
// relative to these substrings, so Range.StartIndex is added before emitting.
private void ComputeLCS(Substring left, Substring right)
{
    int leftLength = left.Length;
    int rightLength = right.Length;

    // With an empty side there is no common sequence to report.
    if (leftLength == 0 || rightLength == 0)
    {
        return;
    }

    int snakeLeftStart, snakeRightStart, snakeLength;
    int editScriptLength = FindMiddleSnake(left, right,
        out snakeLeftStart, out snakeRightStart, out snakeLength);

    if (editScriptLength <= 1)
    {
        // Edit script length 1: exactly one symbol is added to or deleted from one sequence.
        // Edit script length 0: the two sequences are equal.
        if (editScriptLength != 0)
        {
            // The snake is the common run that follows the single change, so the common
            // run that precedes the change still has to be emitted.
            EmitDiffsFromCommonSequence(
                left.Range.StartIndex,
                right.Range.StartIndex,
                Math.Min(snakeLeftStart, snakeRightStart));
        }

        EmitDiffsFromCommonSequence(
            left.Range.StartIndex + snakeLeftStart,
            right.Range.StartIndex + snakeRightStart,
            snakeLength);
        return;
    }

    // Two or more differences remain: split the problem around the snake.
    ComputeLCS(
        left.Extract(0, snakeLeftStart),
        right.Extract(0, snakeRightStart));

    EmitDiffsFromCommonSequence(
        left.Range.StartIndex + snakeLeftStart,
        right.Range.StartIndex + snakeRightStart,
        snakeLength);

    ComputeLCS(
        left.Extract(snakeLeftStart + snakeLength),
        right.Extract(snakeRightStart + snakeLength));
}
// Runs the diff algorithm over the two sequences, handing it the diffs list to fill in.
//
// diffs: the list passed to the DiffAlgorithm instance.
// left, right: the sequences to compare.
// boundRuntime: when true and the product of the two lengths exceeds TooLong, the search
// limit is replaced by the limit raised to the power (PowLimit - 1.0).
public static void PopulateDiffs(IList<Diff> diffs, Substring left, Substring right, bool boundRuntime)
{
    int leftLength = left.Length;
    int rightLength = right.Length;

    int searchLimit = CeilNPlusMOverTwo(leftLength, rightLength);
    if (boundRuntime)
    {
        // Multiply as 64-bit values so the product of two int lengths cannot overflow.
        long problemSize = ((long)leftLength) * ((long)rightLength);
        if (problemSize > TooLong)
        {
            searchLimit = (int)Math.Pow(searchLimit, PowLimit - 1.0);
        }
    }

    DiffAlgorithm runner = new DiffAlgorithm(
        diffs,
        left.Range.StartIndex,
        right.Range.StartIndex,
        searchLimit);

    runner.ComputeLCS(left, right);
    runner.FlushDiffs(left.Range.EndIndex, right.Range.EndIndex);
}
// Writes a run of context text to the writer, eliding the middle when it is too long.
//
// writer: receives the text.
// context: the text to write.
// maxContextLength: a context shorter than this is written in full.  Otherwise only the
// leading and trailing (maxContextLength / 2) elements are written with an ellipsis
// between them; when that half-length is not positive, nothing is written at all.
//
// NOTE(review): a context whose length equals maxContextLength takes the elided path
// because the comparison is strict -- confirm that this is intended.
private static void WriteContext(MarkupStreamWriter writer, Substring context, int maxContextLength)
{
    if (context.Length < maxContextLength)
    {
        writer.Write(context.ToString());
        return;
    }

    int split = maxContextLength / 2;
    if (split <= 0)
    {
        return;
    }

    writer.Write(context.Extract(0, split).ToString());
    writer.WriteEllipsis();

    // Convert explicitly, as the other writes do, so the trailing fragment goes through the
    // same string overload instead of relying on an object overload of Write.
    writer.Write(context.Extract(context.Length - split).ToString());
}
// Diffs two documents after trimming whatever prefix and suffix they share, so that
// SlowDiff only has to process the differing middle portion.  The trimmed prefix and
// suffix are recorded as NoChange diffs before and after the middle diffs respectively.
private static void FastDiff(IList<Diff> diffs, Substring left, Substring right, bool boundRuntime)
{
    // An empty document on either side means the change spans both documents entirely.
    bool eitherIsEmpty = left.Length == 0 || right.Length == 0;
    if (eitherIsEmpty)
    {
        diffs.Add(new Diff(DiffKind.Change, left.Range, right.Range));
        return;
    }

    // Shrink the problem: first peel off the shared prefix, if there is one.
    int prefixLength = left.FindCommonPrefixLength(right);
    if (prefixLength != 0)
    {
        bool prefixCoversBoth = prefixLength == left.Length && prefixLength == right.Length;
        if (prefixCoversBoth)
        {
            diffs.Add(new Diff(DiffKind.NoChange, left.Range, right.Range));
            return;
        }

        diffs.Add(new Diff(
            DiffKind.NoChange,
            new Range(left.Range.StartIndex, prefixLength),
            new Range(right.Range.StartIndex, prefixLength)));
    }

    // Then measure the shared suffix of what remains after the prefix.
    Substring leftAfterPrefix = left.Extract(prefixLength);
    Substring rightAfterPrefix = right.Extract(prefixLength);
    int suffixLength = leftAfterPrefix.FindCommonSuffixLength(rightAfterPrefix);

    // Diff the part that is neither shared prefix nor shared suffix.
    int leftMiddleLength = left.Length - prefixLength - suffixLength;
    Substring leftMiddle = left.Extract(prefixLength, leftMiddleLength);
    int rightMiddleLength = right.Length - prefixLength - suffixLength;
    Substring rightMiddle = right.Extract(prefixLength, rightMiddleLength);
    SlowDiff(diffs, leftMiddle, rightMiddle, boundRuntime);

    // Finally append the shared suffix, which begins where each middle part ends.
    if (suffixLength == 0)
    {
        return;
    }

    diffs.Add(new Diff(
        DiffKind.NoChange,
        new Range(leftMiddle.Range.EndIndex, suffixLength),
        new Range(rightMiddle.Range.EndIndex, suffixLength)));
}
// Finds a middle "snake", which is a (possibly empty) sequence of diagonal edges in the edit
// graph.  Thus it directly represents a common sequence.
//
// In essence, this function searches D-paths forward and backward in the sequence until it
// finds the middle snake.  The middle snake informs us about a common sequence sandwiched
// between two other sequences that may contain changes.  By definition, the left and right
// middle snakes must be of equal length.
//
// left, right: the sequences to compare; they are indexed from 0 up to their Length.
// middleSnakeLeftStartIndex: receives the snake's start offset within left (relative to
//   the substring, not to the underlying document).
// middleSnakeRightStartIndex: receives the snake's start offset within right.
// middleSnakeLength: receives the snake's length; set to 0 when the search gives up.
//
// Returns d * 2 - 1 when the overlap is detected during the forward search (only checked
// when n - m is odd), d * 2 when it is detected during the reverse search (only checked
// when n - m is even), or int.MaxValue when no overlap is found within the allowed number
// of iterations and an approximate split point is returned instead.
//
// The search state lives in the leftVector and rightVector fields, both indexed with an
// offset of max so that negative diagonals map to valid positions.
private int FindMiddleSnake(Substring left, Substring right, out int middleSnakeLeftStartIndex, out int middleSnakeRightStartIndex, out int middleSnakeLength)
{
    int n = left.Length;
    int m = right.Length;
    int delta = n - m;
    bool isDeltaOdd = (delta & 1) != 0;

    // Seed the slots that the d == 0 iteration reads: the forward search starts from x = 0
    // and the reverse search starts from u = n.
    leftVector[max + 1] = 0;
    rightVector[max - 1] = n;

    // Never iterate past the configured limit, nor past ceil((n + m) / 2).
    int end = Math.Min(CeilNPlusMOverTwo(n, m), max);
    for (int d = 0; d <= end; d++)
    {
        // Search forward D-paths.
        for (int k = -d; k <= d; k += 2)
        {
            // Find the end of the furthest reaching forward D-path in diagonal k.
            // Either continue from diagonal k + 1 (keeping x) or from diagonal k - 1 (advancing x by one),
            // whichever reaches further; the boundary diagonals only have one neighbour.
            int x = k == -d || k != d && leftVector[max + k - 1] < leftVector[max + k + 1]
                ? leftVector[max + k + 1]
                : leftVector[max + k - 1] + 1;
            int origX = x;

            // Follow the diagonal for as long as the two sequences agree.
            for (int y = x - k; x < n && y < m && left[x] == right[y];)
            {
                x += 1;
                y += 1;
            }

            leftVector[max + k] = x;

            // If the D-path is feasible and overlaps the furthest reaching reverse (D-1)-Path in diagonal k
            // then we have found the middle snake.
            if (isDeltaOdd && k >= delta - d + 1 && k <= delta + d - 1)
            {
                int u = rightVector[max + k - delta];
                if (x >= u)
                {
                    // The snake is the diagonal run just followed: it starts at origX and ends at x.
                    middleSnakeLeftStartIndex = origX;
                    middleSnakeRightStartIndex = origX - k;
                    middleSnakeLength = x - origX;
                    return(d * 2 - 1);
                }
            }
        }

        // Search reverse D-paths.
        for (int k = -d; k <= d; k += 2)
        {
            // Find the end of the furthest reaching reverse D-path in diagonal k + delta.
            int u = k == d || k != -d && rightVector[max + k - 1] < rightVector[max + k + 1]
                ? rightVector[max + k - 1]
                : rightVector[max + k + 1] - 1;
            int kPlusDelta = k + delta;
            int origU = u;
            int v;

            // Follow the diagonal backwards for as long as the two sequences agree.
            for (v = u - kPlusDelta; u > 0 && v > 0 && left[u - 1] == right[v - 1];)
            {
                u -= 1;
                v -= 1;
            }

            rightVector[max + k] = u;

            // If the D-path is feasible and overlaps the furthest reaching forward D-Path in diagonal k
            // then we have found the middle snake.
            if (!isDeltaOdd && kPlusDelta >= -d && kPlusDelta <= d)
            {
                int x = leftVector[max + kPlusDelta];
                if (u <= x)
                {
                    // The snake is the diagonal run just followed backwards: it starts at u and ends at origU.
                    middleSnakeLeftStartIndex = u;
                    middleSnakeRightStartIndex = v;
                    middleSnakeLength = origU - u;
                    return(d * 2);
                }
            }
        }
    }

    // We have exceeded the maximum effort we are willing to expend finding a diff.
    //
    // So we artificially divide the problem by finding the snakes in the forward / reverse
    // direction that have the most progress toward (N, M) / (0, 0).  These are the
    // ones that maximize x + y / minimize u + v.  The snake we return will not actually
    // be the middle snake (since we haven't found it yet) but it will be good enough
    // to reduce the problem.
    //
    // These snakes all begin on the same diagonal as the others of equal
    // progress in the same direction.  As there may be several of them, we need a way
    // to decide which one to pursue.
    //
    // The Eclipse LCS implementation chooses the median of these snakes with respect to k.
    // Intuitively this is the one that is nearer the direct line between (0, 0) and (N, M)
    // so it has a good chance of forming a path with more balanced changes between A and B
    // than the snakes that consist of significantly more changes to A than B or vice-versa.
    // Consequently the median of these snakes should yield a pretty good approximation.  -- Jeff.
    int bestProgress = 0;
    Dictionary <int, bool> bestKs = new Dictionary <int, bool>(); // with the forward direction indicated by value true
    for (int k = -end; k <= end; k += 2)
    {
        // Forward direction.
        int x = leftVector[max + k];
        int y = x - k;
        if (x < n && y < m)
        {
            int progress = x + y;
            if (progress >= bestProgress)
            {
                // A strictly better candidate invalidates everything collected so far.
                if (progress > bestProgress)
                {
                    bestProgress = progress;
                    bestKs.Clear();
                }

                bestKs[k] = true;
            }
        }

        // Reverse direction.
        int u = rightVector[max + k];
        int v = u - k - delta;
        if (u >= 0 && v >= 0)
        {
            int progress = n + m - u - v;
            if (progress >= bestProgress)
            {
                if (progress > bestProgress)
                {
                    bestProgress = progress;
                    bestKs.Clear();
                }

                // Note that this replaces a forward entry recorded for the same k just above.
                bestKs[k] = false;
            }
        }
    }

    // Pick the median of the best diagonals.
    int[] sortedKs = new int[bestKs.Count];
    bestKs.Keys.CopyTo(sortedKs, 0);
    Array.Sort(sortedKs);
    int medianK = sortedKs[sortedKs.Length / 2];

    if (bestKs[medianK])
    {
        // Forward candidate: split at the end point of the forward path on diagonal medianK.
        int x = leftVector[max + medianK];
        int y = x - medianK;
        middleSnakeLeftStartIndex = x;
        middleSnakeRightStartIndex = y;
    }
    else
    {
        // Reverse candidate: split at the end point of the reverse path on diagonal medianK + delta.
        int u = rightVector[max + medianK];
        int v = u - medianK - delta;
        middleSnakeLeftStartIndex = u;
        middleSnakeRightStartIndex = v;
    }

    // The split point is reported as an empty snake.
    middleSnakeLength = 0;

    // We need to return the length of the shortest edit script but we don't actually know
    // what it is.  Fortunately the caller does not care as long as it's greater than 2, which
    // it must be since d > end >= max > 2.
    return(int.MaxValue);
}
// Diffs the two documents without any prefix / suffix trimming by forwarding every argument,
// unchanged, to DiffAlgorithm.PopulateDiffs.
private static void SlowDiff(IList <Diff> diffs, Substring left, Substring right, bool boundRuntime)
{
    DiffAlgorithm.PopulateDiffs(diffs, left, right, boundRuntime);
}