public SubArray(SubArray arData, int iOffset, int iLength) { m_arData = arData.m_arData; //Subtract 1 here because iOffset will be 1-based m_iOffset = arData.m_iOffset + iOffset - 1; m_iLength = iLength; }
private bool SetupFictitiousPoints(SubArray A, SubArray B) { if (A.Length > 0 && B.Length > 0) { //Setup some "fictious" endpoints for initial forward //and reverse path navigation. m_vecForward[1] = 0; int N = A.Length; int M = B.Length; int iDelta = N - M; m_vecReverse[iDelta + 1] = N + 1; return true; } else return false; }
/// <summary> /// Returns an EditScript instance that gives all the Edits /// necessary to transform A into B. /// </summary> public EditScript Execute() { ArrayList MatchPoints = new ArrayList(); SubArray A = new SubArray(m_arA); SubArray B = new SubArray(m_arB); GetMatchPoints(A, B, MatchPoints); Debug.Assert(MatchPoints.Count == GetLCSLength(), "The number of match points must equal the LCS length."); EditScript Script = ConvertMatchPointsToEditScript(A.Length, B.Length, MatchPoints); Debug.Assert(Script.TotalEditLength == GetSESLength()); return Script; }
private void GetMatchPoints(SubArray A, SubArray B, ArrayList MatchPoints) { if (A.Length > 0 && B.Length > 0) { //Find the middle snake from (x,y) to (u,v) int x, u, k; int D = FindMiddleSnake(A, B, out x, out u, out k); int y = x - k; int v = u - k; if (D > 1) { GetMatchPoints(new SubArray(A, 1, x), new SubArray(B, 1, y), MatchPoints); for (int i = x + 1; i <= u; i++) { //Output absolute X and Y (not relative to the current subarray) MatchPoints.Add(new Point(i + A.Offset, i - k + B.Offset)); } GetMatchPoints(new SubArray(A, u + 1, A.Length - u), new SubArray(B, v + 1, B.Length - v), MatchPoints); } else { //If there are no differences, we have to output all of the points. //If there's only one difference, we have to output all of the //match points, skipping the single point that is different. Debug.Assert(D == 0 || Math.Abs(A.Length - B.Length) == 1, "A and B's lengths must differ by 1 if D == 1"); //Only go to the minimum of the two lengths since that's the //most that can possibly match between the two subsequences. int N = A.Length; int M = B.Length; if (M > N) { //Output A[1..N] as match points int iCurrY = 1; for (int i = 1; i <= N; i++) { //We must skip the one difference when we hit it if (A[i] != B[iCurrY]) { iCurrY++; } MatchPoints.Add(new Point(i + A.Offset, iCurrY + B.Offset)); iCurrY++; } } else { //Output B[1..M] as match points int iCurrX = 1; for (int i = 1; i <= M; i++) { //We must skip the one difference when we hit it if (A[iCurrX] != B[i]) { iCurrX++; } MatchPoints.Add(new Point(iCurrX + A.Offset, i + B.Offset)); iCurrX++; } } } } }
private int GetReverseDPaths(SubArray A, SubArray B, int D, int k, int iDelta) { DiagonalVector V = m_vecReverse; int p = k + iDelta; int x; if ((k == -D) || (k != D && V[p + 1] <= V[p - 1])) { x = V[p + 1] - 1; } else { x = V[p - 1]; } int y = x - p; while (x > 0 && y > 0 && A[x] == B[y]) { x--; y--; } V[p] = x; return x; }
private void GetLCS(SubArray A, SubArray B, ArrayList Output) { if (A.Length > 0 && B.Length > 0) { //Find the length D and the middle snake from (x,y) to (u,v) int x, u, k; int D = FindMiddleSnake(A, B, out x, out u, out k); int y = x - k; int v = u - k; if (D > 1) { GetLCS(new SubArray(A, 1, x), new SubArray(B, 1, y), Output); for (int i = x + 1; i <= u; i++) { Output.Add(A[i]); } GetLCS(new SubArray(A, u + 1, A.Length - u), new SubArray(B, v + 1, B.Length - v), Output); } else if (B.Length > A.Length) { for (int i = 1; i <= A.Length; i++) { Output.Add(A[i]); } } else { for (int i = 1; i <= B.Length; i++) { Output.Add(B[i]); } } } }
private int GetForwardDPaths(SubArray A, SubArray B, int D, int k) { DiagonalVector V = m_vecForward; int x; if ((k == -D) || (k != D && V[k - 1] < V[k + 1])) { x = V[k + 1]; } else { x = V[k - 1] + 1; } int y = x - k; while (x < A.Length && y < B.Length && A[x + 1] == B[y + 1]) { x++; y++; } V[k] = x; return x; }
private int FindMiddleSnake(SubArray A, SubArray B, out int iPathStartX, out int iPathEndX, out int iPathK) { //We don't have to check the result of this because the calling procedure //has already check the length preconditions. SetupFictitiousPoints(A, B); iPathStartX = -1; iPathEndX = -1; iPathK = 0; int iDelta = A.Length - B.Length; int iCeiling = (int)Math.Ceiling((A.Length + B.Length) / 2.0); for (int D = 0; D <= iCeiling; D++) { for (int k = -D; k <= D; k += 2) { //Find the end of the furthest reaching forward D-path in diagonal k. GetForwardDPaths(A, B, D, k); //If iDelta is odd (i.e. remainder == 1 or -1) and ... if ((iDelta % 2 != 0) && (k >= (iDelta - (D - 1)) && k <= (iDelta + (D - 1)))) { //If the path overlaps the furthest reaching reverse (D-1)-path in diagonal k. if (m_vecForward[k] >= m_vecReverse[k]) { //The last snake of the forward path is the middle snake. iPathK = k; iPathEndX = m_vecForward[k]; iPathStartX = iPathEndX; int iPathStartY = iPathStartX - iPathK; while (iPathStartX > 0 && iPathStartY > 0 && A[iPathStartX] == B[iPathStartY]) { iPathStartX--; iPathStartY--; } //Length of an SES is 2D-1. return 2 * D - 1; } } } for (int k = -D; k <= D; k += 2) { //Find the end of the furthest reaching reverse D=path in diagonal k+iDelta GetReverseDPaths(A, B, D, k, iDelta); //If iDelta is even and ... if ((iDelta % 2 == 0) && ((k + iDelta) >= -D && (k + iDelta) <= D)) { //If the path overlaps the furthest reaching forward D-path in diagonal k+iDelta. if (m_vecReverse[k + iDelta] <= m_vecForward[k + iDelta]) { //The last snake of the reverse path is the middle snake. iPathK = k + iDelta; iPathStartX = m_vecReverse[iPathK]; iPathEndX = iPathStartX; int iPathEndY = iPathEndX - iPathK; while (iPathEndX < A.Length && iPathEndY < B.Length && A[iPathEndX + 1] == B[iPathEndY + 1]) { iPathEndX++; iPathEndY++; } //Length of an SES is 2D. return 2 * D; } } } } //We should never get here if the algorithm is coded correctly. Debug.Assert(false); return -1; }
/// <summary> /// Calculates the length of the "shortest edit script" /// as defined in Myers's paper. /// /// Note: This may not be the same as the Count property /// of an EditScript instance returned by Execute(). If /// an EditScript instance has any Edits with Length > 1, /// then those groupings will make EditScript.Count less /// than GetSESLength(). Similarly, an Edit with EditType /// Change should be thought of as a combined Delete and /// Insert for the specified Length. /// </summary> public int GetSESLength() { SubArray A = new SubArray(m_arA); SubArray B = new SubArray(m_arB); if (SetupFictitiousPoints(A, B)) { int N = m_arA.Length; int M = m_arB.Length; for (int D = 0; D <= N + M; D++) { for (int k = -D; k <= D; k += 2) { int x = GetForwardDPaths(A, B, D, k); int y = x - k; if (x >= N && y >= M) { return D; } } } //We should never get here if the algorithm is coded correctly. Debug.Assert(false); return -1; } else if (m_arA.Length == 0) { return m_arB.Length; } else { return m_arA.Length; } }
/// <summary> /// Gets the length of the "shortest edit script" /// by running the algorithm in reverse. We should /// always have GetSESLength() == GetReverseSESLength(). /// </summary> public int GetReverseSESLength() { SubArray A = new SubArray(m_arA); SubArray B = new SubArray(m_arB); if (SetupFictitiousPoints(A, B)) { int N = m_arA.Length; int M = m_arB.Length; int iDelta = N - M; for (int D = 0; D <= N + M; D++) { for (int k = -D; k <= D; k += 2) { int x = GetReverseDPaths(A, B, D, k, iDelta); int y = x - (k + iDelta); if (x <= 0 && y <= 0) { return D; } } } //We should never get here if the algorithm is coded correctly. Debug.Assert(false); return -1; } else if (m_arA.Length == 0) { return m_arB.Length; } else { return m_arA.Length; } }