/// <summary>
/// Computes and returns the edit distance between two sequences of type <typeparamref name="T"/>, using
/// the similarity computer and cost values specified in the constructor. If <see cref="ComputeMoveOperations"/>
/// is <c>true</c>, simple moves will be detected. Otherwise, moves will (typically) result in two independent
/// insert/delete operations.
/// </summary>
/// <param name="sourceObjects">The first input sequence ("source"). Must not be <c>null</c>.</param>
/// <param name="targetObjects">The second input sequence ("target"). Must not be <c>null</c>.</param>
/// <param name="precomputedAssociations">A list of precomputed item index associations, or <c>null</c>.
/// If valid, item pairs in this list will be associated with each other, which will result in either
/// an identity or a change operation. The list is handed to <c>SortAndValidate</c>, which presumably
/// reorders it in place (TODO confirm); if validation fails, a debug assertion fires and the
/// associations are ignored.</param>
/// <returns>The edit distance between the two sequences</returns>
/// <exception cref="ArgumentNullException"><paramref name="sourceObjects"/> or
/// <paramref name="targetObjects"/> is <c>null</c>.</exception>
/// <exception cref="InvalidOperationException">The cost matrix contains an operation which cannot be
/// followed while reading out the cheapest path (internal error).</exception>
public Core.EditDistance.EditDistance ComputeEditDistance(IList<T> sourceObjects,
    IList<T> targetObjects,
    List<Core.Pair<int>> precomputedAssociations)
{
    // cost assigned to a change/identity between items which must not be paired
    // because the source item is pre-associated with a different target item
    const double invalidAssignmentCosts = 100000.0d;

    if (sourceObjects == null)
    {
        throw new ArgumentNullException("sourceObjects");
    }
    if (targetObjects == null)
    {
        throw new ArgumentNullException("targetObjects");
    }

    if (precomputedAssociations != null
        && !SortAndValidate(precomputedAssociations, sourceObjects.Count, targetObjects.Count))
    {
        System.Diagnostics.Debug.Assert(false, "Invalid preassignments");
        precomputedAssociations = null;
    }

    // TODO handle special cases (one/both of the arrays being empty/having no elements)
    // TODO use diagonal algorithm

    Core.EditDistance.EditDistance result
        = new Core.EditDistance.EditDistance(sourceObjects.Count, targetObjects.Count, 0.0d);

    MatrixItem[,] matrix = new MatrixItem[sourceObjects.Count + 1, targetObjects.Count + 1];
    int i, j;

    bool usePreassignments = precomputedAssociations != null;

    // initialize matrix: row/column 0 represent the empty prefix, so reaching cell [i, 0]
    // takes i deletions and reaching cell [0, j] takes j insertions
    matrix[0, 0] = new MatrixItem(0.0d, EditOperation.Identity, 0.0d);

    for (i = 1; i <= sourceObjects.Count; ++i)
    {
        matrix[i, 0] = new MatrixItem((double)i * _InsertDeleteCosts, EditOperation.Delete, 0.0d);
    }

    for (j = 1; j <= targetObjects.Count; ++j)
    {
        matrix[0, j] = new MatrixItem((double)j * _InsertDeleteCosts, EditOperation.Insert, 0.0d);
    }

    for (i = 1; i <= sourceObjects.Count; ++i)
    {
        for (j = 1; j <= targetObjects.Count; ++j)
        {
            matrix[i, j] = new MatrixItem(0.0d, EditOperation.Identity, 0.0d);
        }
    }

    // populate matrix
    for (i = 1; i <= sourceObjects.Count; ++i)
    {
        T s = sourceObjects[i - 1];

        int associatedTarget = usePreassignments
            ? GetSourcePreassignment(i - 1, precomputedAssociations)
            : -1;

        for (j = 1; j <= targetObjects.Count; ++j)
        {
            T t = targetObjects[j - 1];

            double similarity;
            if (associatedTarget < 0 || associatedTarget == j - 1)
            {
                // no preassignment or items are correlated - use std sim
                similarity = _SimilarityComputer(s, t);
            }
            else
            {
                // there is a correlation with another item - don't allow change/identity
                similarity = -1.0d;
            }

            System.Diagnostics.Debug.Assert((similarity >= 0.0d && similarity <= 1.0d)
                || similarity == -1.0d);

            // low similarity means high "change costs" and vice versa:
            double changeCosts = (similarity < 0)
                ? invalidAssignmentCosts
                : matrix[i - 1, j - 1].Score + (1.0d - similarity);

            double insertCosts = matrix[i, j - 1].Score + _InsertDeleteCosts;
            double deleteCosts = matrix[i - 1, j].Score + _InsertDeleteCosts;

            double min = Math.Min(Math.Min(changeCosts, deleteCosts), insertCosts);

            matrix[i, j].Score = min;
            matrix[i, j].Similarity = similarity;

            // 'min' is a copy of one of the three candidates, so the exact floating-point
            // comparison is safe. Ties are resolved as delete, then insert, then change/identity.
            if (min == deleteCosts)
            {
                matrix[i, j].Operation = EditOperation.Delete;
            }
            else if (min == insertCosts)
            {
                matrix[i, j].Operation = EditOperation.Insert;
            }
            else if (min == changeCosts)
            {
                matrix[i, j].Operation = (similarity == 1.0d)
                    ? EditOperation.Identity
                    : EditOperation.Change;
            }
        }
    }

    // readout the cheapest path, walking back from the bottom-right cell to [0, 0]
    i = sourceObjects.Count;
    j = targetObjects.Count;
    result.Distance = matrix[i, j].Score;

    while (i > 0 || j > 0)
    {
        EditDistanceItem item = new EditDistanceItem();
        item.Resolution = EditDistanceResolution.None;
        item.Operation = matrix[i, j].Operation;

        switch (item.Operation)
        {
        case EditOperation.Identity:
            item.Costs = 0.0d;
            --i;
            --j;
            break;
        case EditOperation.Change:
            // read the similarity before stepping to the predecessor cell
            item.Costs = 1.0d - matrix[i, j].Similarity;
            --i;
            --j;
            break;
        case EditOperation.Insert:
            item.Costs = _InsertDeleteCosts;
            --j;
            break;
        case EditOperation.Delete:
            item.Costs = _InsertDeleteCosts;
            --i;
            break;
        default:
            // without this guard neither index would change and the loop would never terminate
            throw new InvalidOperationException("Internal ED computation error");
        }

        // after stepping back, i/j are the zero-based indices of the items the operation refers to
        item.Source = i;
        item.Target = j;
        result.AddAtStart(item);
    }

    // identify move operations which are pairs of insert/delete operations in the shortest path.
    // The item similarity is already cached in the matrix; a pair qualifies as a move if that
    // similarity is at least _SimThreshold.
    int moves = 0;

    // NOTE: moves are detected whenever move computation is enabled; the move penalty is not
    // compared against the sum of the insert/delete penalties here.
    if (_ComputeMoveOperations)
    {
        // matrix[i, j].Similarity is the cached token similarity between source[i-1] and target[j-1]
        // TODO may need to restrict moves to undisputed corresponding items, i.e. those which
        //  have only one row/column similarity maximum

        for (int index = 0; index < result.Items.Count; ++index)
        {
            EditOperation op = result[index].Operation;
            if (op == EditOperation.Delete || op == EditOperation.Insert)
            {
                int moveSource = 0;
                int moveTarget = 0;
                int moveSourceTarget = 0;
                int moveTargetSource = 0;

                // search in the remainder of the result list for a "compensating" operation
                int comp;
                for (comp = index + 1; comp < result.Items.Count; ++comp)
                {
                    if (result[comp].Operation == EditOperation.Insert
                        && op == EditOperation.Delete
                        && matrix[result[index].Source + 1, result[comp].Target + 1].Similarity >= _SimThreshold)
                    {
                        // source[result[index].Source] was deleted
                        // target[result[comp].Target] was inserted
                        moveSource = result[index].Source;
                        moveSourceTarget = result[index].Target;

                        moveTarget = result[comp].Target;
                        moveTargetSource = result[comp].Source;
                        break;
                    }
                    else if (result[comp].Operation == EditOperation.Delete
                        && op == EditOperation.Insert
                        && matrix[result[comp].Source + 1, result[index].Target + 1].Similarity >= _SimThreshold)
                    {
                        // source[result[comp].Source] was deleted
                        // target[result[index].Target] was inserted
                        moveSource = result[comp].Source;
                        moveSourceTarget = result[comp].Target;

                        moveTarget = result[index].Target;
                        moveTargetSource = result[index].Source;
                        break;
                    }
                }

                // TODO take moveDistance into account, i.e. penalty depends on distance? Avoids
                //  long-distance moves.

                if (comp < result.Items.Count)
                {
                    // compensating operation found: turn the current item into the move and
                    // drop its counterpart. 'comp' is always behind 'index', so removing it
                    // does not disturb the outer iteration.
                    // TODO backtrack to find other compensating items?
                    EditDistanceItem item = result[index];
                    item.Operation = EditOperation.Move;
                    item.Source = moveSource;
                    item.Target = moveTarget;
                    item.MoveSourceTarget = moveSourceTarget;
                    item.MoveTargetSource = moveTargetSource;
                    // TODO update item similarity
                    result.Items[index] = item;
                    result.Items.RemoveAt(comp);
                    ++moves;
                }
            }
        }
    }

    if (moves > 0)
    {
        // adjust score: substract moves * (deletionCosts + insertionCosts), add moves * moveCosts
        // TODO take moveDistance into account, i.e. penalty depends on distance?
        result.Distance -= (double)moves * (2.0d * _InsertDeleteCosts);
        result.Distance += (double)moves * _MoveCosts;
    }

#if DEBUG && !SILVERLIGHT
    _DumpMatrix = false; // typeof(T) != typeof(char);
    if (_DumpMatrix)
    {
        // in debug mode, write matrix to a temp file in HTML format
        string dumpPath = System.IO.Path.Combine(System.IO.Path.GetTempPath(), "SimMatrix.html");

        // the using block releases the file handle even if writing the dump fails
        using (System.IO.StreamWriter wtr
            = new System.IO.StreamWriter(dumpPath, false, System.Text.Encoding.UTF8))
        {
            System.Web.UI.Html32TextWriter htmlWriter = new System.Web.UI.Html32TextWriter(wtr);

            htmlWriter.WriteFullBeginTag("html");
            htmlWriter.WriteFullBeginTag("body");
            htmlWriter.WriteBeginTag("table");
            htmlWriter.WriteAttribute("border", "1");

            for (j = -1; j <= targetObjects.Count; ++j)
            {
                htmlWriter.WriteFullBeginTag("tr");

                for (i = -1; i <= sourceObjects.Count; ++i)
                {
                    htmlWriter.WriteFullBeginTag("td");

                    if (i < 0)
                    {
                        // caption row
                        if (j >= 0)
                        {
                            htmlWriter.Write("j={0}", j);
                            if (j > 0)
                            {
                                htmlWriter.WriteFullBeginTag("br");
                                htmlWriter.WriteFullBeginTag("b");
                                htmlWriter.Write(targetObjects[j - 1].ToString());
                                htmlWriter.WriteEndTag("b");
                            }
                        }
                    }
                    else if (j < 0)
                    {
                        // j < 0 but i >= 0 -->
                        htmlWriter.Write("i={0}", i);
                        if (i > 0)
                        {
                            htmlWriter.WriteFullBeginTag("br");
                            htmlWriter.WriteFullBeginTag("b");
                            htmlWriter.Write(sourceObjects[i - 1].ToString());
                            htmlWriter.WriteEndTag("b");
                        }
                    }
                    else
                    {
                        // content cell
                        htmlWriter.Write("d={0}", matrix[i, j].Score);
                        htmlWriter.WriteFullBeginTag("br");
                        htmlWriter.Write("s={0}", matrix[i, j].Similarity);
                        htmlWriter.WriteFullBeginTag("br");
                        htmlWriter.Write("o={0}", matrix[i, j].Operation.ToString());
                    }

                    htmlWriter.WriteEndTag("td");
                }

                htmlWriter.WriteEndTag("tr");
            }

            htmlWriter.WriteEndTag("table");

            htmlWriter.WriteFullBeginTag("h2");
            htmlWriter.Write("Result");
            htmlWriter.WriteEndTag("h2");

            htmlWriter.Write("Score = {0}", result.Distance);

            htmlWriter.WriteFullBeginTag("ol");

            for (i = 0; i < result.Items.Count; ++i)
            {
                htmlWriter.WriteFullBeginTag("li");
                htmlWriter.Write("{0}: s={1} t={2}",
                    result[i].Operation.ToString(), result[i].Source, result[i].Target);
            }

            htmlWriter.WriteEndTag("ol");

            htmlWriter.WriteEndTag("body");
            htmlWriter.WriteEndTag("html");

            htmlWriter.Close();
        }
    }
#endif

    return result;
}
/// <summary>
/// Computes the edit distance between two token sequences. Paired tags are aligned first; if any
/// alignments are found, the similarity matrix is patched so that non-aligned tags can't be assigned
/// to each other and the full cost matrix is computed, otherwise the matrix is computed lazily.
/// If move computation is enabled, insert/delete pairs detected as moves are reflected in the
/// resulting distance.
/// </summary>
/// <remarks>
/// The "classic" ED approach has the problem that it doesn't detect moves reliably, particularly
/// block moves. Patching up insert/delete pairs as moves also won't catch moves which appear as
/// changes in the ED.
/// </remarks>
/// <param name="sourceTokens">The source token sequence. Must not be <c>null</c>.</param>
/// <param name="targetTokens">The target token sequence. Must not be <c>null</c>.</param>
/// <param name="computeDiagonalOnly">Forwarded to the similarity matrix computation. That computation
/// is only triggered when the local <c>enforceFullMatrixComputation</c> switch is on, which is
/// currently hard-wired to <c>false</c>, so this flag presently has no effect in this method.</param>
/// <param name="disabledAutoSubstitutions">Passed to the similarity matrix and to the token
/// similarity computation used for the costs of change operations.</param>
/// <param name="alignedTags">Receives the result of the paired-tag alignment; may be <c>null</c>.</param>
/// <returns>The edit distance between the two token sequences</returns>
/// <exception cref="ArgumentNullException"><paramref name="sourceTokens"/> or
/// <paramref name="targetTokens"/> is <c>null</c>.</exception>
/// <exception cref="InvalidOperationException">The cost matrix contains an undefined or otherwise
/// unexpected operation on the cheapest path (internal error).</exception>
private Core.EditDistance.EditDistance ComputeEditDistanceImpl_Original(
    IList<Core.Tokenization.Token> sourceTokens,
    IList<Core.Tokenization.Token> targetTokens,
    bool computeDiagonalOnly,
    Core.Tokenization.BuiltinRecognizers disabledAutoSubstitutions,
    out TagAssociations alignedTags)
{
    if (sourceTokens == null)
    {
        throw new ArgumentNullException("sourceTokens");
    }
    if (targetTokens == null)
    {
        throw new ArgumentNullException("targetTokens");
    }

    alignedTags = null;

    int i, j;

    // TODO handle special cases (one/both of the arrays being empty/having no elements)
    // TODO use diagonal algorithm

    // manual switch: set to true to always compute the complete similarity and cost matrices
    bool enforceFullMatrixComputation = false;

    Core.EditDistance.EditDistance result
        = new Core.EditDistance.EditDistance(sourceTokens.Count, targetTokens.Count, 0.0d);

    // matrix which captures the similarity between two tokens as well as preassignments
    SimilarityMatrix sim = new SimilarityMatrix(sourceTokens, targetTokens,
        _UseStringEditDistance, disabledAutoSubstitutions);

    if (enforceFullMatrixComputation)
    {
        // this will be fully computed by the tag aligner in most cases, but we may save a bit
        // on plain text segments
        sim.Compute(computeDiagonalOnly);
    }

    MatrixItem[,] matrix = CreateEditDistanceMatrix(sourceTokens, targetTokens);

    alignedTags = TagAligner.AlignPairedTags(sourceTokens, targetTokens, sim);
    bool hasAlignedTags = alignedTags != null && alignedTags.Count > 0;

    if (hasAlignedTags)
    {
        // Patch the sim matrix so that non-aligned tags can't be assigned to each other
        PatchSimilarityMatrix(sim, sourceTokens, targetTokens, alignedTags);
        ComputeEditDistanceMatrix_Full(matrix, sim, alignedTags);
    }
    else if (enforceFullMatrixComputation)
    {
        ComputeEditDistanceMatrix_Full(matrix, sim, alignedTags);
    }
    else
    {
        ComputeEditDistanceMatrix_Lazy(matrix, sim);
    }

    // readout the cheapest path, walking back from the bottom-right cell to [0, 0]
    i = sourceTokens.Count;
    j = targetTokens.Count;
    result.Distance = matrix[i, j].Score;

    while (i > 0 || j > 0)
    {
        EditDistanceItem item = new EditDistanceItem();
        item.Resolution = EditDistanceResolution.None;

        MatrixItem m = matrix[i, j];

        item.Operation = m.Operation;
        switch (item.Operation)
        {
        case EditOperation.Identity:
            item.Costs = 0.0d;
            --i;
            --j;
            break;
        case EditOperation.Change:
            // with string edit distance the cached matrix similarity is used; otherwise the
            // token similarity is recomputed for the pair instead of being read from the matrix
            item.Costs = _UseStringEditDistance
                ? (1.0d - m.Similarity)
                : (1.0d - SimilarityComputers.GetTokenSimilarity(sourceTokens[i - 1], targetTokens[j - 1],
                    true, disabledAutoSubstitutions));
            --i;
            --j;
            break;
        case EditOperation.Insert:
            item.Costs = _InsertDeleteCosts;
            --j;
            break;
        case EditOperation.Delete:
            item.Costs = _InsertDeleteCosts;
            --i;
            break;
        case EditOperation.Undefined:
        default:
            // any operation which doesn't step back in the matrix would make this loop
            // spin forever, so treat it as an internal error
            throw new InvalidOperationException("Internal ED computation error");
        }

        // after stepping back, i/j are the zero-based indices of the tokens the operation refers to
        item.Source = i;
        item.Target = j;
        result.AddAtStart(item);
    }

    if (hasAlignedTags)
    {
        // should happen before move detection
        FixTagActions(sourceTokens, targetTokens, result, alignedTags);
    }

    // identify move operations which are pairs of insert/delete operations in the shortest path.
    // TODO we may rather use a configurable threshold than identity (1.0) to catch move operations
    //  of sufficiently similar items (e.g. case-insensitive)
    if (_ComputeMoves)
    {
        int moves = DetectMoves(result, matrix);
        if (moves > 0)
        {
            // adjust score: substract moves * (deletionCosts + insertionCosts), add moves * moveCosts
            // TODO take moveDistance into account, i.e. penalty depends on distance?
            result.Distance -= (double)moves * (2.0d * _InsertDeleteCosts);
            result.Distance += (double)moves * _MoveCosts;
        }
    }

#if DEBUG
    // manual switch: set to true to write the inputs, the tag alignment and the final ED
    // to a log file in the temp directory
    bool log = false;
    if (log)
    {
        string logPath = System.IO.Path.Combine(System.IO.Path.GetTempPath(), "ed.log");

        // the using block closes the log even if dumping fails half-way
        using (System.IO.TextWriter logStream
            = new System.IO.StreamWriter(logPath, false, System.Text.Encoding.UTF8))
        {
            logStream.WriteLine("Source objects:");
            for (int p = 0; p < sourceTokens.Count; ++p)
            {
                logStream.WriteLine("\t{0}:\t{1}", p, sourceTokens[p].ToString());
            }
            logStream.WriteLine();
            logStream.WriteLine("Target objects:");
            for (int p = 0; p < targetTokens.Count; ++p)
            {
                logStream.WriteLine("\t{0}:\t{1}", p, targetTokens[p].ToString());
            }
            logStream.WriteLine();
            logStream.WriteLine();

            if (alignedTags != null)
            {
                logStream.WriteLine("Tag Alignment:");
                foreach (TagAssociation ta in alignedTags)
                {
                    logStream.WriteLine("\t{0}", ta.ToString());
                }
                logStream.WriteLine();
                logStream.WriteLine();
            }

            result.Dump(logStream, "Final ED");
        }
    }
#endif

#if DEBUG
    // write matrix to a temp file in HTML format
    _DumpMatrix = false; // typeof(T) != typeof(char);
    if (_DumpMatrix)
    {
        string dumpPath = System.IO.Path.Combine(System.IO.Path.GetTempPath(), "SimMatrix.html");

        // the using block releases the file handle even if writing the dump fails
        using (System.IO.StreamWriter wtr
            = new System.IO.StreamWriter(dumpPath, false, System.Text.Encoding.UTF8))
        {
            System.Web.UI.Html32TextWriter htmlWriter = new System.Web.UI.Html32TextWriter(wtr);

            htmlWriter.WriteFullBeginTag("html");
            htmlWriter.WriteFullBeginTag("body");
            htmlWriter.WriteBeginTag("table");
            htmlWriter.WriteAttribute("border", "1");

            for (j = -1; j <= targetTokens.Count; ++j)
            {
                htmlWriter.WriteFullBeginTag("tr");

                for (i = -1; i <= sourceTokens.Count; ++i)
                {
                    htmlWriter.WriteFullBeginTag("td");

                    if (i < 0)
                    {
                        // caption row
                        if (j >= 0)
                        {
                            htmlWriter.Write("j={0}", j);
                            if (j > 0)
                            {
                                htmlWriter.WriteFullBeginTag("br");
                                htmlWriter.WriteFullBeginTag("b");
                                htmlWriter.Write(targetTokens[j - 1].ToString());
                                htmlWriter.WriteEndTag("b");
                            }
                        }
                    }
                    else if (j < 0)
                    {
                        // j < 0 but i >= 0 -->
                        htmlWriter.Write("i={0}", i);
                        if (i > 0)
                        {
                            htmlWriter.WriteFullBeginTag("br");
                            htmlWriter.WriteFullBeginTag("b");
                            htmlWriter.Write(sourceTokens[i - 1].ToString());
                            htmlWriter.WriteEndTag("b");
                        }
                    }
                    else
                    {
                        // content cell
                        htmlWriter.Write("d={0}", matrix[i, j].Score);
                        htmlWriter.WriteFullBeginTag("br");
                        htmlWriter.Write("s={0}", matrix[i, j].Similarity);
                        htmlWriter.WriteFullBeginTag("br");
                        htmlWriter.Write("o={0}", matrix[i, j].Operation.ToString());
                    }

                    htmlWriter.WriteEndTag("td");
                }

                htmlWriter.WriteEndTag("tr");
            }

            htmlWriter.WriteEndTag("table");

            htmlWriter.WriteFullBeginTag("h2");
            htmlWriter.Write("Result");
            htmlWriter.WriteEndTag("h2");

            htmlWriter.Write("Score = {0}", result.Distance);

            htmlWriter.WriteFullBeginTag("ol");

            for (i = 0; i < result.Items.Count; ++i)
            {
                htmlWriter.WriteFullBeginTag("li");
                htmlWriter.Write("{0}: s={1} t={2}",
                    result[i].Operation.ToString(), result[i].Source, result[i].Target);
            }

            htmlWriter.WriteEndTag("ol");

            htmlWriter.WriteEndTag("body");
            htmlWriter.WriteEndTag("html");

            htmlWriter.Close();
        }
    }
#endif

    return result;
}