/// <summary>
/// Invalidates (sets to -1) the similarity of every pairing of paired tags
/// (start/end tags) which the tag alignment does not associate, so that the
/// edit distance computation cannot assign such tags to each other.
/// </summary>
/// <param name="sim">The similarity matrix, patched in place.</param>
/// <param name="srcTokens">The source tokens (first index into <paramref name="sim"/>).</param>
/// <param name="trgTokens">The target tokens (second index into <paramref name="sim"/>).</param>
/// <param name="tagAlignment">The tag alignment. If null or empty, nothing is patched.</param>
private void PatchSimilarityMatrix(SimilarityMatrix sim,
    IList<Core.Tokenization.Token> srcTokens,
    IList<Core.Tokenization.Token> trgTokens,
    TagAssociations tagAlignment)
{
    bool nothingToPatch = tagAlignment == null || tagAlignment.Count == 0;
    if (nothingToPatch)
    {
        return;
    }

    for (int srcPos = 0; srcPos < srcTokens.Count; ++srcPos)
    {
        // only paired source tags (start or end) are of interest
        Core.Tokenization.TagToken srcTagToken = srcTokens[srcPos] as Core.Tokenization.TagToken;
        if (srcTagToken == null)
        {
            continue;
        }

        Core.Tag srcTag = srcTagToken.Tag;
        if (srcTag.Type != TagType.Start && srcTag.Type != TagType.End)
        {
            continue;
        }

        for (int trgPos = 0; trgPos < trgTokens.Count; ++trgPos)
        {
            // already marked as an invalid assignment - nothing left to do for this cell
            if (sim.IsAssigned(srcPos, trgPos) && sim[srcPos, trgPos] < 0.0d)
            {
                continue;
            }

            // The target is expected to be a paired tag as well; otherwise the
            // similarity is expected to be negative already.
            bool targetIsPairedTag = false;
            Core.Tokenization.TagToken trgTagToken = trgTokens[trgPos] as Core.Tokenization.TagToken;
            if (trgTagToken != null)
            {
                Core.Tag trgTag = trgTagToken.Tag;
                targetIsPairedTag = trgTag.Type == TagType.Start || trgTag.Type == TagType.End;
            }

            if (!targetIsPairedTag)
            {
                System.Diagnostics.Debug.Assert(false, "Shouldn't be");
                continue;
            }

            if (!tagAlignment.AreAssociated(srcPos, trgPos))
            {
                sim[srcPos, trgPos] = -1.0d;
            }
        }
    }
}
/// <summary>
/// Computes the edit distance (ED) between the source and the target tokens by
/// delegating to <c>ComputeEditDistanceImpl_Original</c>. In DEBUG builds the
/// resulting items are additionally checked with <c>VerifyEditDistance</c>.
/// </summary>
/// <param name="sourceTokens">The source tokens.</param>
/// <param name="targetTokens">The target tokens.</param>
/// <param name="computeDiagonalOnly">If number of tokens is equivalent, only the diagonal's similarities are computed.</param>
/// <param name="disabledAutoSubstitutions">Passed through unchanged to the implementation.</param>
/// <param name="alignedTags">Receives the tag alignment produced by the implementation.</param>
/// <returns>The edit distance returned by the implementation.</returns>
public Core.EditDistance.EditDistance ComputeEditDistance(
    IList<Core.Tokenization.Token> sourceTokens,
    IList<Core.Tokenization.Token> targetTokens,
    bool computeDiagonalOnly,
    Core.Tokenization.BuiltinRecognizers disabledAutoSubstitutions,
    out TagAssociations alignedTags)
{
    Core.EditDistance.EditDistance distance = ComputeEditDistanceImpl_Original(
        sourceTokens,
        targetTokens,
        computeDiagonalOnly,
        disabledAutoSubstitutions,
        out alignedTags);

#if DEBUG
    // Debug.Assert only fires when the verification failed
    bool verified = VerifyEditDistance(distance.Items, sourceTokens.Count, targetTokens.Count);
    System.Diagnostics.Debug.Assert(verified, "ED error - let Oli know and provide test data");
#endif

    return distance;
}
/// <summary>
/// Greedily associates the paired tags found in the source tokens with the paired
/// tags found in the target tokens: the unprocessed source/target pair with the
/// highest association score is associated (as a "Change"), then the search is
/// repeated until one side is exhausted. Remaining tags are added as unassociated.
/// </summary>
/// <param name="sourceTokens">The source tokens.</param>
/// <param name="targetTokens">The target tokens.</param>
/// <param name="similarityMatrix">The similarity matrix handed to the score computation.</param>
/// <returns>
/// The tag associations, or null if either side has no paired tags or if no
/// association scores could be computed.
/// </returns>
public static TagAssociations AlignPairedTags(IList<Core.Tokenization.Token> sourceTokens,
    IList<Core.Tokenization.Token> targetTokens,
    SimilarityMatrix similarityMatrix)
{
    TagPairs srcPairedTags = FindPairedTags(sourceTokens);
    if (srcPairedTags == null || srcPairedTags.Count == 0)
    {
        return null;
    }

    TagPairs trgPairedTags = FindPairedTags(targetTokens);
    if (trgPairedTags == null || trgPairedTags.Count == 0)
    {
        return null;
    }

    int srcCount = srcPairedTags.Count;
    int trgCount = trgPairedTags.Count;

    TagAssociations associations = new TagAssociations();
    System.Collections.BitArray srcDone = new System.Collections.BitArray(srcCount);
    System.Collections.BitArray trgDone = new System.Collections.BitArray(trgCount);

    const bool useEndPositions = true;

    // both sides are known to be non-empty at this point
    int[,] scores = ComputeTagAssociationScores(similarityMatrix, srcPairedTags, trgPairedTags, useEndPositions);
    if (scores == null)
    {
        return null;
    }

    for (;;)
    {
        // locate the global maximum among all not yet processed pairs
        int bestScore = int.MinValue;
        int bestSrc = -1;
        int bestTrg = -1;

        for (int s = 0; s < srcCount; ++s)
        {
            if (srcDone[s])
            {
                continue;
            }

            for (int t = 0; t < trgCount; ++t)
            {
                if (trgDone[t])
                {
                    continue;
                }

                // Strict comparison: on ties the pair encountered first wins.
                // Ties are currently not disambiguated any further.
                int score = scores[s, t];
                if (score > bestScore)
                {
                    bestScore = score;
                    bestSrc = s;
                    bestTrg = t;
                }
            }
        }

        if (bestSrc < 0)
        {
            // no candidate pair left
            break;
        }

        associations.Add(srcPairedTags[bestSrc], trgPairedTags[bestTrg], Core.EditDistance.EditOperation.Change);
        srcDone[bestSrc] = true;
        trgDone[bestTrg] = true;
    }

    // source tags without a counterpart
    for (int s = 0; s < srcCount; ++s)
    {
        if (!srcDone[s])
        {
            associations.Add(srcPairedTags[s], null);
        }
    }

    // target tags without a counterpart
    for (int t = 0; t < trgCount; ++t)
    {
        if (!trgDone[t])
        {
            associations.Add(null, trgPairedTags[t]);
        }
    }

    return associations;
}
/// <summary>
/// If the tag alignment suggests action "Change", but the ED can't find this,
/// we need to patch the corresponding ED item for the corresponding start or end tag as well.
/// </summary>
/// <remarks>
/// This method is currently a no-op: it returns unconditionally on its first
/// statement ("not yet working"), so everything after that return is unreachable
/// work-in-progress code. The unreachable code is laid out in two phases:
/// phase 1 records, per edit distance item, the operation on the start/end tag of
/// the association found at the item's source/target position; phase 2 derives a
/// start and an end operation per association and asserts that they do not conflict.
/// </remarks>
/// <param name="sourceTokens">The source tokens (not referenced by the current body).</param>
/// <param name="targetTokens">The target tokens (not referenced by the current body).</param>
/// <param name="result">The edit distance whose items are inspected.</param>
/// <param name="tagAlignment">The tag alignment whose tag operations are recorded and checked.</param>
private void FixTagActions(IList <Core.Tokenization.Token> sourceTokens,
    IList <Core.Tokenization.Token> targetTokens,
    Core.EditDistance.EditDistance result,
    TagAssociations tagAlignment)
{
    // not yet working - the method is deliberately disabled by this early return
    return;

    // NOTE a single ED item may suggest "D" or "I" for a tag. This does not necessarily conflict
    // with a "C" suggested by the alignment, as a following ED item may suggest
    // a compensating "I" or "D" which would result in a "M", which is still
    // compatible with the alignment's "C".

    // phase 1: record the operation of each ED item on the affected tag(s)
    TagAssociation srcAssoc = null;
    TagAssociation trgAssoc = null;
    foreach (EditDistanceItem edi in result.Items)
    {
        switch (edi.Operation)
        {
        case EditOperation.Identity:
        case EditOperation.Change:
            srcAssoc = tagAlignment.GetBySourcePosition(edi.Source);
            trgAssoc = tagAlignment.GetByTargetPosition(edi.Target);
            if (srcAssoc != null || trgAssoc != null)
            {
                // assignment is only valid between tags
                System.Diagnostics.Debug.Assert(srcAssoc != null && trgAssoc != null);
                // should also be the same association, otherwise incompatible tags are
                // associated
                System.Diagnostics.Debug.Assert(object.ReferenceEquals(srcAssoc, trgAssoc));
                System.Diagnostics.Debug.Assert(srcAssoc.SourceTag != null && srcAssoc.TargetTag != null);
                if (edi.Source == srcAssoc.SourceTag.Start)
                {
                    System.Diagnostics.Debug.Assert(srcAssoc.SourceTag.StartTagOperation == EditOperation.Undefined);
                    System.Diagnostics.Debug.Assert(srcAssoc.TargetTag.StartTagOperation == EditOperation.Undefined);
                    // source tag start position
                    srcAssoc.SourceTag.StartTagOperation = edi.Operation;
                    srcAssoc.TargetTag.StartTagOperation = edi.Operation;
                }
                else
                {
                    // otherwise the item must refer to the source tag's end position
                    System.Diagnostics.Debug.Assert(edi.Source == srcAssoc.SourceTag.End);
                    System.Diagnostics.Debug.Assert(srcAssoc.SourceTag.EndTagOperation == EditOperation.Undefined);
                    System.Diagnostics.Debug.Assert(srcAssoc.TargetTag.EndTagOperation == EditOperation.Undefined);
                    srcAssoc.SourceTag.EndTagOperation = edi.Operation;
                    srcAssoc.TargetTag.EndTagOperation = edi.Operation;
                }
            }
            break;
        case EditOperation.Insert:
            // an insertion only affects the target side
            trgAssoc = tagAlignment.GetByTargetPosition(edi.Target);
            if (trgAssoc != null)
            {
                if (edi.Target == trgAssoc.TargetTag.Start)
                {
                    System.Diagnostics.Debug.Assert(trgAssoc.TargetTag.StartTagOperation == EditOperation.Undefined);
                    trgAssoc.TargetTag.StartTagOperation = edi.Operation;
                }
                else
                {
                    System.Diagnostics.Debug.Assert(edi.Target == trgAssoc.TargetTag.End);
                    System.Diagnostics.Debug.Assert(trgAssoc.TargetTag.EndTagOperation == EditOperation.Undefined);
                    trgAssoc.TargetTag.EndTagOperation = edi.Operation;
                }
            }
            break;
        case EditOperation.Delete:
            // a deletion only affects the source side
            srcAssoc = tagAlignment.GetBySourcePosition(edi.Source);
            if (srcAssoc != null)
            {
                if (edi.Source == srcAssoc.SourceTag.Start)
                {
                    System.Diagnostics.Debug.Assert(srcAssoc.SourceTag.StartTagOperation == EditOperation.Undefined);
                    srcAssoc.SourceTag.StartTagOperation = edi.Operation;
                }
                else
                {
                    System.Diagnostics.Debug.Assert(edi.Source == srcAssoc.SourceTag.End);
                    System.Diagnostics.Debug.Assert(srcAssoc.SourceTag.EndTagOperation == EditOperation.Undefined);
                    srcAssoc.SourceTag.EndTagOperation = edi.Operation;
                }
            }
            break;
        case EditOperation.Move:
        case EditOperation.Undefined:
        default:
            throw new Exception("Unexpected case");
        }
    }

    // phase 2: detect conflicts
    // NOTE(review): SourceTag and TargetTag are dereferenced without null checks below,
    // although phase 1 asserts that they are non-null before using both - confirm that
    // associations with a missing side cannot reach this loop before enabling the method.
    foreach (TagAssociation ta in tagAlignment)
    {
        EditOperation startOp = EditOperation.Undefined;
        EditOperation endOp = EditOperation.Undefined;

        // an insert on one side combined with a delete on the other side is a move;
        // equal operations on both sides are taken over; anything else is a conflict
        if ((ta.SourceTag.StartTagOperation == EditOperation.Insert)
            && (ta.TargetTag.StartTagOperation == EditOperation.Delete))
        {
            startOp = EditOperation.Move;
        }
        else if ((ta.SourceTag.StartTagOperation == EditOperation.Delete)
            && (ta.TargetTag.StartTagOperation == EditOperation.Insert))
        {
            startOp = EditOperation.Move;
        }
        else if (ta.SourceTag.StartTagOperation == ta.TargetTag.StartTagOperation)
        {
            startOp = ta.SourceTag.StartTagOperation;
        }
        else
        {
            System.Diagnostics.Debug.Assert(false, "Conflicting start tag operations");
            startOp = EditOperation.Undefined;
        }

        // same classification for the end tag
        if ((ta.SourceTag.EndTagOperation == EditOperation.Insert)
            && (ta.TargetTag.EndTagOperation == EditOperation.Delete))
        {
            endOp = EditOperation.Move;
        }
        else if ((ta.SourceTag.EndTagOperation == EditOperation.Delete)
            && (ta.TargetTag.EndTagOperation == EditOperation.Insert))
        {
            endOp = EditOperation.Move;
        }
        else if (ta.SourceTag.EndTagOperation == ta.TargetTag.EndTagOperation)
        {
            endOp = ta.SourceTag.EndTagOperation;
        }
        else
        {
            System.Diagnostics.Debug.Assert(false, "Conflicting end tag operations");
            endOp = EditOperation.Undefined;
        }

        // start and end tag of a pair are expected to share the same operation
        if (startOp != endOp)
        {
            System.Diagnostics.Debug.Assert(false, "Conflicting tag actions");
        }
    }
}
/// <summary>
/// Computes the edit distance between the source and the target tokens:
/// builds the similarity matrix, aligns paired tags, fills the edit distance
/// matrix (fully if a non-empty tag alignment exists, lazily otherwise), reads
/// the cheapest path back from the last matrix cell and, if move detection is
/// enabled, adjusts the distance for the detected moves.
/// </summary>
/// <param name="sourceTokens">The source tokens. Must not be null.</param>
/// <param name="targetTokens">The target tokens. Must not be null.</param>
/// <param name="computeDiagonalOnly">
/// Only handed to the similarity matrix computation, which is guarded by a flag
/// that is hard-coded to false in this method.
/// </param>
/// <param name="disabledAutoSubstitutions">
/// Passed to the similarity matrix and to the token similarity computation.
/// </param>
/// <param name="alignedTags">
/// Receives the result of the paired tag alignment (may be null).
/// </param>
/// <returns>The edit distance with its items in source/target order.</returns>
/// <exception cref="ArgumentNullException">
/// If <paramref name="sourceTokens"/> or <paramref name="targetTokens"/> is null.
/// </exception>
/// <exception cref="Exception">
/// If an undefined operation is encountered while reading out the cheapest path.
/// </exception>
private Core.EditDistance.EditDistance ComputeEditDistanceImpl_Original(
    IList <Core.Tokenization.Token> sourceTokens,
    IList <Core.Tokenization.Token> targetTokens,
    bool computeDiagonalOnly,
    Core.Tokenization.BuiltinRecognizers disabledAutoSubstitutions,
    out TagAssociations alignedTags)
{
    /*
     * The "classic" ED approach has the problem that it doesn't detect moves
     * reliably, particularly block moves. Patching up insert/delete pairs as
     * moves also won't catch moves which appear as changes in the ED.
     */

    if (sourceTokens == null)
    {
        throw new ArgumentNullException("sourceTokens");
    }
    if (targetTokens == null)
    {
        throw new ArgumentNullException("targetTokens");
    }

    alignedTags = null;

    // i indexes the source (first matrix dimension), j the target (second dimension);
    // both are reused by the DEBUG-only matrix dump at the end of the method
    int i, j;

    // TODO handle special cases (one/both of the arrays being empty/having no elements)

    // TODO use diagonal algorithm

    // hard-coded switch: the branches depending on it are currently never taken
    bool enforceFullMatrixComputation = false;

    Core.EditDistance.EditDistance result =
        new Core.EditDistance.EditDistance(sourceTokens.Count, targetTokens.Count, 0.0d);

    // matrix which captures the similarity between two tokens as well as preassignments
    SimilarityMatrix sim = new SimilarityMatrix(sourceTokens, targetTokens, _UseStringEditDistance, disabledAutoSubstitutions);

    if (enforceFullMatrixComputation)
    {
        // this will be fully computed by the tag aligner in most cases, but we may save a bit
        // on plain text segments
        sim.Compute(computeDiagonalOnly);
    }

    MatrixItem[,] matrix = CreateEditDistanceMatrix(sourceTokens, targetTokens);

    alignedTags = TagAligner.AlignPairedTags(sourceTokens, targetTokens, sim);
    if (alignedTags != null && alignedTags.Count > 0)
    {
        // Patch the sim matrix so that non-aligned tags can't be assigned to each other
        PatchSimilarityMatrix(sim, sourceTokens, targetTokens, alignedTags);
        ComputeEditDistanceMatrix_Full(matrix, sim, alignedTags);
    }
    else if (enforceFullMatrixComputation)
    {
        ComputeEditDistanceMatrix_Full(matrix, sim, alignedTags);
    }
    else
    {
        ComputeEditDistanceMatrix_Lazy(matrix, sim);
    }

    // read out the cheapest path, walking backwards from the last cell; items are
    // added at the start of the result so that they end up in forward order
    i = sourceTokens.Count;
    j = targetTokens.Count;
    result.Distance = matrix[i, j].Score;

    while (i > 0 || j > 0)
    {
        EditDistanceItem item = new EditDistanceItem();
        item.Resolution = EditDistanceResolution.None;

        MatrixItem m = matrix[i, j];

        item.Operation = m.Operation;
        // NOTE(review): there is no default case - an operation not listed below would
        // leave i and j unchanged, so this loop would not terminate. Confirm that the
        // matrix can only contain the five operations handled here.
        switch (item.Operation)
        {
        case EditOperation.Identity:
            item.Costs = 0.0d;
            --i;
            --j;
            break;
        case EditOperation.Change:
            item.Costs = _UseStringEditDistance
                ? (1.0d - m.Similarity)
                : (1.0d - SimilarityComputers.GetTokenSimilarity(sourceTokens[i - 1], targetTokens[j - 1], true, disabledAutoSubstitutions));
            // previous variant: item.Costs = (1.0d - m.Similarity);
            --i;
            --j;
            break;
        case EditOperation.Insert:
            item.Costs = _InsertDeleteCosts;
            --j;
            break;
        case EditOperation.Delete:
            item.Costs = _InsertDeleteCosts;
            --i;
            break;
        case EditOperation.Undefined:
            throw new Exception("Internal ED computation error");
        }

        // positions are taken after the decrement above
        item.Source = i;
        item.Target = j;
        result.AddAtStart(item);
    }

    if (alignedTags != null && alignedTags.Count > 0)
    {
        // should happen before move detection
        FixTagActions(sourceTokens, targetTokens, result, alignedTags);
    }

    // identify move operations which are pairs of insert/delete operations in the shortest path.
    // Note that the comparison result is already in the matrix and we only care about identity.
    // TODO we may rather use a configurable threshold than identity (1.0) to catch move operations
    // of sufficiently similar items (e.g. case-insensitive)

    if (_ComputeMoves)
    {
        int moves = DetectMoves(result, matrix);
        if (moves > 0)
        {
            // adjust score: subtract moves * (deletionCosts + insertionCosts), add moves * moveCosts
            // TODO take moveDistance into account, i.e. penalty depends on distance?
            result.Distance -= (double)moves * (2.0d * _InsertDeleteCosts);
            result.Distance += (double)moves * _MoveCosts;
        }
    }

#if DEBUG
    // a stream for logging; only declared in DEBUG builds, and only opened when the
    // hard-coded 'log' flag below is switched on
    System.IO.TextWriter logStream = null;
    bool log = false;
    if (log)
    {
        logStream = new System.IO.StreamWriter(System.IO.Path.GetTempPath() + "/ed.log",
            false, System.Text.Encoding.UTF8);

        logStream.WriteLine("Source objects:");
        for (int p = 0; p < sourceTokens.Count; ++p)
        {
            logStream.WriteLine("\t{0}:\t{1}", p, sourceTokens[p].ToString());
        }
        logStream.WriteLine();
        logStream.WriteLine("Target objects:");
        for (int p = 0; p < targetTokens.Count; ++p)
        {
            logStream.WriteLine("\t{0}:\t{1}", p, targetTokens[p].ToString());
        }
        logStream.WriteLine();
        logStream.WriteLine();

        if (alignedTags != null)
        {
            logStream.WriteLine("Tag Alignment:");
            foreach (TagAssociation ta in alignedTags)
            {
                logStream.WriteLine("\t{0}", ta.ToString());
            }
            logStream.WriteLine();
            logStream.WriteLine();
        }

        result.Dump(logStream, "Final ED");

        logStream.Close();
        logStream.Dispose();
        logStream = null;
    }
#endif

#if DEBUG
    // write matrix to a temp file in HTML format (disabled: the flag is reset to
    // false right before it is tested)
    _DumpMatrix = false; // typeof(T) != typeof(char);
    if (_DumpMatrix)
    {
        System.IO.StreamWriter wtr = new System.IO.StreamWriter(System.IO.Path.GetTempPath() + "/SimMatrix.html",
            false, System.Text.Encoding.UTF8);
        System.Web.UI.Html32TextWriter htmlWriter = new System.Web.UI.Html32TextWriter(wtr);

        htmlWriter.WriteFullBeginTag("html");
        htmlWriter.WriteFullBeginTag("body");
        htmlWriter.WriteBeginTag("table");
        htmlWriter.WriteAttribute("border", "1");

        // one table row per j (target), one cell per i (source); index -1 is the caption
        for (j = -1; j <= targetTokens.Count; ++j)
        {
            htmlWriter.WriteFullBeginTag("tr");

            for (i = -1; i <= sourceTokens.Count; ++i)
            {
                htmlWriter.WriteFullBeginTag("td");

                if (i < 0)
                {
                    // caption cell for the current j (left empty for the corner cell)
                    if (j >= 0)
                    {
                        htmlWriter.Write("j={0}", j);
                        if (j > 0)
                        {
                            htmlWriter.WriteFullBeginTag("br");
                            htmlWriter.WriteFullBeginTag("b");
                            htmlWriter.Write(targetTokens[j - 1].ToString());
                            htmlWriter.WriteEndTag("b");
                        }
                    }
                }
                else if (j < 0)
                {
                    // j < 0 but i >= 0 --> caption cell for the current i
                    htmlWriter.Write("i={0}", i);
                    if (i > 0)
                    {
                        htmlWriter.WriteFullBeginTag("br");
                        htmlWriter.WriteFullBeginTag("b");
                        htmlWriter.Write(sourceTokens[i - 1].ToString());
                        htmlWriter.WriteEndTag("b");
                    }
                }
                else
                {
                    // content cell: score (d), similarity (s) and operation (o)
                    htmlWriter.Write("d={0}", matrix[i, j].Score);
                    htmlWriter.WriteFullBeginTag("br");
                    htmlWriter.Write("s={0}", matrix[i, j].Similarity);
                    htmlWriter.WriteFullBeginTag("br");
                    htmlWriter.Write("o={0}", matrix[i, j].Operation.ToString());
                }

                htmlWriter.WriteEndTag("td");
            }

            htmlWriter.WriteEndTag("tr");
        }

        htmlWriter.WriteEndTag("table");

        htmlWriter.WriteFullBeginTag("h2");
        htmlWriter.Write("Result");
        htmlWriter.WriteEndTag("h2");

        htmlWriter.Write("Score = {0}", result.Distance);

        // list of the resulting ED items
        htmlWriter.WriteFullBeginTag("ol");

        for (i = 0; i < result.Items.Count; ++i)
        {
            htmlWriter.WriteFullBeginTag("li");
            htmlWriter.Write("{0}: s={1} t={2}",
                result[i].Operation.ToString(), result[i].Source, result[i].Target);
        }

        htmlWriter.WriteEndTag("ol");

        htmlWriter.WriteEndTag("body");
        htmlWriter.WriteEndTag("html");

        htmlWriter.Close();
    }
#endif

    return(result);
}
/// <summary>
/// Fills all cells [1..source count, 1..target count] of the edit distance matrix.
/// For each cell the cheapest of change, delete and insert is chosen (on equal
/// costs delete is preferred over insert, and insert over change). If a non-empty
/// tag alignment is given, an insert/delete operation it reports for the source
/// position (or else for the target position) replaces the chosen operation.
/// </summary>
/// <param name="matrix">The edit distance matrix, updated in place. Row 0 and column 0 must already be initialized.</param>
/// <param name="sim">The similarity matrix supplying the token similarities.</param>
/// <param name="alignedTags">The tag alignment; may be null or empty.</param>
private void ComputeEditDistanceMatrix_Full(MatrixItem[,] matrix, SimilarityMatrix sim, TagAssociations alignedTags)
{
    for (int row = 1; row <= sim.SourceTokens.Count; ++row)
    {
        for (int col = 1; col <= sim.TargetTokens.Count; ++col)
        {
            // the cell itself must still be open ...
            System.Diagnostics.Debug.Assert(matrix[row, col].Operation == EditOperation.Undefined);
            // ... while its three predecessors must already be computed
            System.Diagnostics.Debug.Assert(matrix[row - 1, col - 1].Operation != EditOperation.Undefined);
            System.Diagnostics.Debug.Assert(matrix[row, col - 1].Operation != EditOperation.Undefined);
            System.Diagnostics.Debug.Assert(matrix[row - 1, col].Operation != EditOperation.Undefined);

            double similarity = sim[row - 1, col - 1];
            System.Diagnostics.Debug.Assert((similarity >= 0.0d && similarity <= 1.0d) || similarity == -1.0d);

            // the lower the similarity, the more expensive the change;
            // a negative similarity marks an invalid assignment
            double changeCosts;
            if (similarity < 0)
            {
                changeCosts = _InvalidAssignmentCosts;
            }
            else
            {
                changeCosts = matrix[row - 1, col - 1].Score + (1.0d - similarity);
            }
            double insertCosts = matrix[row, col - 1].Score + _InsertDeleteCosts;
            double deleteCosts = matrix[row - 1, col].Score + _InsertDeleteCosts;

            double cheapest = Math.Min(Math.Min(changeCosts, deleteCosts), insertCosts);

            // shortcut condition: identical tokens must yield the cheapest path via "change"
            System.Diagnostics.Debug.Assert(similarity < 1.0d || cheapest == changeCosts);

            EditOperation op;
            if (cheapest == deleteCosts)
            {
                op = EditOperation.Delete;
            }
            else if (cheapest == insertCosts)
            {
                op = EditOperation.Insert;
            }
            else if (cheapest == changeCosts)
            {
                op = (similarity == 1.0d) ? EditOperation.Identity : EditOperation.Change;
            }
            else
            {
                op = EditOperation.Undefined;
            }

            if (alignedTags != null && alignedTags.Count > 0)
            {
                // check whether tag alignment overrides ED result:
                // TODO do this during population or during readout?
                EditOperation srcTagOp = alignedTags.GetOperationBySourcePosition(row - 1);
                EditOperation trgTagOp = alignedTags.GetOperationByTargetPosition(col - 1);

                // changes/identity of tags are decided by the ED, whereas the tag
                // alignment dictates deletions and insertions
                bool srcDictates = srcTagOp == EditOperation.Insert || srcTagOp == EditOperation.Delete;
                bool trgDictates = trgTagOp == EditOperation.Insert || trgTagOp == EditOperation.Delete;

                if (srcDictates && op != srcTagOp)
                {
                    // the pre-alignment of tags supersedes the ED result
                    op = srcTagOp;
                }
                else if (trgDictates && op != trgTagOp)
                {
                    op = trgTagOp;
                }
            }

            // the score follows the finally selected operation
            double score;
            switch (op)
            {
            case EditOperation.Delete:
                score = deleteCosts;
                break;
            case EditOperation.Insert:
                score = insertCosts;
                break;
            default:
                score = changeCosts;
                break;
            }

            matrix[row, col].Similarity = similarity;
            matrix[row, col].Operation = op;
            matrix[row, col].Score = score;
        }
    }
}