/// <summary>
/// Initializes the score provider with the similarity matrix, the threshold and the
/// skip mode it will work with. The constructor only stores the three values.
/// </summary>
/// <param name="simMatrix">The token similarity matrix to keep for later scoring.</param>
/// <param name="threshold">The threshold to keep for later scoring.
/// NOTE(review): presumably the minimum similarity at which two tokens count as a
/// match - confirm against the scoring code, which is not part of this excerpt.</param>
/// <param name="maySkip">If true, computes the longest common subsequence. Otherwise,
/// computes the longest common substring.
/// </param>
public TokenIndexLCSScoreProvider(SimilarityMatrix simMatrix, double threshold, bool maySkip)
{
    _MaySkip = maySkip;
    _Threshold = threshold;
    _SimMatrix = simMatrix;
}
/// <summary>
/// Patches the similarity matrix so that paired tags which the tag alignment does not
/// associate with each other cannot be matched by the edit distance computation: every
/// such source/target combination gets the similarity -1.
/// </summary>
/// <param name="sim">The similarity matrix, which is modified in place.</param>
/// <param name="srcTokens">The source tokens; their indexes are the first matrix index.</param>
/// <param name="trgTokens">The target tokens; their indexes are the second matrix index.</param>
/// <param name="tagAlignment">The tag alignment. If it is null or empty, nothing is patched.</param>
private void PatchSimilarityMatrix(SimilarityMatrix sim,
    IList<Core.Tokenization.Token> srcTokens,
    IList<Core.Tokenization.Token> trgTokens,
    TagAssociations tagAlignment)
{
    bool nothingToPatch = tagAlignment == null || tagAlignment.Count == 0;
    if (nothingToPatch)
    {
        return;
    }

    for (int srcIndex = 0; srcIndex < srcTokens.Count; ++srcIndex)
    {
        Core.Tokenization.TagToken srcTagToken = srcTokens[srcIndex] as Core.Tokenization.TagToken;
        if (srcTagToken == null)
        {
            // not a tag
            continue;
        }

        Core.Tag srcTag = srcTagToken.Tag;
        bool srcIsPaired = srcTag.Type == TagType.Start || srcTag.Type == TagType.End;
        if (!srcIsPaired)
        {
            // only start and end tags take part in the alignment
            continue;
        }

        for (int trgIndex = 0; trgIndex < trgTokens.Count; ++trgIndex)
        {
            if (sim.IsAssigned(srcIndex, trgIndex) && sim[srcIndex, trgIndex] < 0.0d)
            {
                // already an invalid assignment, nothing left to forbid
                continue;
            }

            Core.Tokenization.TagToken trgTagToken = trgTokens[trgIndex] as Core.Tokenization.TagToken;
            if (trgTagToken == null)
            {
                // not expected: a tag/non-tag combination should already have a negative similarity
                System.Diagnostics.Debug.Assert(false, "Shouldn't be");
                continue;
            }

            Core.Tag trgTag = trgTagToken.Tag;
            bool trgIsPaired = trgTag.Type == TagType.Start || trgTag.Type == TagType.End;
            if (!trgIsPaired)
            {
                // not expected either, for the same reason
                System.Diagnostics.Debug.Assert(false, "Shouldn't be");
                continue;
            }

            if (tagAlignment.AreAssociated(srcIndex, trgIndex))
            {
                // aligned tags stay assignable
                continue;
            }

            sim[srcIndex, trgIndex] = -1.0d;
        }
    }
}
/// <summary>
/// Computes a score for every combination of a source paired tag and a target paired tag.
/// The score is based on an alignment (longest common subsequence, similarity threshold
/// 0.75) of the tokens that precede the compared tag positions.
/// </summary>
/// <param name="similarityMatrix">The token similarity matrix; also provides the token counts.</param>
/// <param name="srcPairedTags">The paired tags of the source; first dimension of the result.</param>
/// <param name="trgPairedTags">The paired tags of the target; second dimension of the result.</param>
/// <param name="useEndPositions">If true, the alignment runs up to the end positions of the
/// tags and no span penalty is applied. Otherwise it runs up to the start positions and half
/// of the difference between the two tag spans is subtracted from the score.</param>
/// <returns>The score table. Combinations for which the aligner returns no result keep the
/// default value 0.</returns>
private static int[,] ComputeTagAssociationScores(SimilarityMatrix similarityMatrix,
    TagPairs srcPairedTags,
    TagPairs trgPairedTags,
    bool useEndPositions)
{
    // This should pretty much result in a first-come first-serve alignment, but we
    // hopefully get better associations for nested tags.
    int[,] scores = new int[srcPairedTags.Count, trgPairedTags.Count];

    // the aligner works on token positions, so both sequences are simply 0..Count-1
    List<int> srcPositions = new List<int>();
    List<int> trgPositions = new List<int>();
    TokenIndexLCSScoreProvider scoreProvider = new TokenIndexLCSScoreProvider(similarityMatrix, 0.75, true);

    for (int pos = 0; pos < similarityMatrix.SourceTokens.Count; ++pos)
    {
        srcPositions.Add(pos);
    }
    for (int pos = 0; pos < similarityMatrix.TargetTokens.Count; ++pos)
    {
        trgPositions.Add(pos);
    }

    SequenceAlignmentComputer<int> aligner
        = new SequenceAlignmentComputer<int>(srcPositions, trgPositions, scoreProvider, null, 1, 1);

    // both tag lists are walked from the last pair to the first
    int srcIndex = srcPairedTags.Count;
    while (--srcIndex >= 0)
    {
        PairedTag srcPair = srcPairedTags[srcIndex];
        int srcLimit = useEndPositions ? srcPair.End : srcPair.Start;

        int trgIndex = trgPairedTags.Count;
        while (--trgIndex >= 0)
        {
            PairedTag trgPair = trgPairedTags[trgIndex];
            int trgLimit = useEndPositions ? trgPair.End : trgPair.Start;

            List<AlignedSubstring> alignment = aligner.Compute(srcLimit, trgLimit);
            if (alignment == null || alignment.Count == 0)
            {
                continue;
            }

            System.Diagnostics.Debug.Assert(alignment.Count == 1);

            // common subsequence length minus the items which were deleted or inserted
            int common = alignment[0].Score;
            int score = common - (srcLimit - common) - (trgLimit - common);

            // penalize large differences in the spanned width, but not if the end
            // positions are already included in the LCS
            int malus = useEndPositions
                ? 0
                : Math.Abs(GetTagSpan(srcPair) - GetTagSpan(trgPair)) / 2;

            scores[srcIndex, trgIndex] = score - malus;
        }
    }

    return scores;
}
/// <summary>
/// Associates the paired tags of the source with the paired tags of the target. The pair
/// combination with the highest association score is picked repeatedly until one side has
/// no unprocessed pairs left; the remaining pairs are recorded as unassociated.
/// </summary>
/// <param name="sourceTokens">The source tokens.</param>
/// <param name="targetTokens">The target tokens.</param>
/// <param name="similarityMatrix">The token similarity matrix used for scoring.</param>
/// <returns>The tag associations, or null if the source or the target has no paired tags
/// (or if no score table could be computed).</returns>
public static TagAssociations AlignPairedTags(IList<Core.Tokenization.Token> sourceTokens,
    IList<Core.Tokenization.Token> targetTokens,
    SimilarityMatrix similarityMatrix)
{
    TagPairs srcPairedTags = FindPairedTags(sourceTokens);
    if (srcPairedTags == null || srcPairedTags.Count == 0)
    {
        return null;
    }

    TagPairs trgPairedTags = FindPairedTags(targetTokens);
    if (trgPairedTags == null || trgPairedTags.Count == 0)
    {
        return null;
    }

    // both sides have at least one paired tag from here on
    const bool useEndPositions = true;

    TagAssociations associations = new TagAssociations();
    System.Collections.BitArray srcDone = new System.Collections.BitArray(srcPairedTags.Count);
    System.Collections.BitArray trgDone = new System.Collections.BitArray(trgPairedTags.Count);

    int[,] lcsScores = ComputeTagAssociationScores(similarityMatrix, srcPairedTags, trgPairedTags, useEndPositions);
    if (lcsScores == null)
    {
        return null;
    }

    bool pairFound;
    do
    {
        // Find the best score among all pairs which are still unprocessed. The strict
        // comparison keeps the first candidate on ties (lowest source index, then lowest
        // target index); no further disambiguation is attempted.
        int bestScore = Int32.MinValue;
        int bestSrc = -1;
        int bestTrg = -1;

        for (int s = 0; s < srcPairedTags.Count; ++s)
        {
            if (srcDone[s])
            {
                continue;
            }

            for (int t = 0; t < trgPairedTags.Count; ++t)
            {
                if (!trgDone[t] && lcsScores[s, t] > bestScore)
                {
                    bestScore = lcsScores[s, t];
                    bestSrc = s;
                    bestTrg = t;
                }
            }
        }

        pairFound = bestSrc >= 0;
        if (pairFound)
        {
            // associate the two tag pairs and take them out of the candidate set
            associations.Add(srcPairedTags[bestSrc], trgPairedTags[bestTrg], Core.EditDistance.EditOperation.Change);
            srcDone[bestSrc] = true;
            trgDone[bestTrg] = true;
        }
    }
    while (pairFound);

    // source pairs without a partner
    for (int p = 0; p < srcPairedTags.Count; ++p)
    {
        if (!srcDone[p])
        {
            associations.Add(srcPairedTags[p], null);
        }
    }

    // target pairs without a partner
    for (int p = 0; p < trgPairedTags.Count; ++p)
    {
        if (!trgDone[p])
        {
            associations.Add(null, trgPairedTags[p]);
        }
    }

    return associations;
}
/// <summary>
/// Computes the edit distance between two token sequences with the "classic" matrix
/// approach: the paired tags are aligned first, the cost matrix is populated (fully if
/// there is a tag alignment, lazily otherwise), the cheapest path is read out from the
/// bottom-right cell and, if enabled, insert/delete pairs are turned into moves.
/// </summary>
/// <param name="sourceTokens">The source tokens. Must not be null.</param>
/// <param name="targetTokens">The target tokens. Must not be null.</param>
/// <param name="computeDiagonalOnly">Only forwarded to the similarity matrix computation,
/// which runs only if the full matrix computation is enforced (currently hard-wired off).</param>
/// <param name="disabledAutoSubstitutions">Forwarded to the similarity matrix and to the
/// token similarity computation.</param>
/// <param name="alignedTags">Receives the result of the paired tag alignment, which may be null.</param>
/// <returns>The edit distance with the operations of the cheapest path.</returns>
/// <exception cref="ArgumentNullException">If <paramref name="sourceTokens"/> or
/// <paramref name="targetTokens"/> is null.</exception>
/// <exception cref="InvalidOperationException">If the path readout hits a matrix cell whose
/// operation it cannot follow (undefined or unexpected).</exception>
private Core.EditDistance.EditDistance ComputeEditDistanceImpl_Original(
    IList<Core.Tokenization.Token> sourceTokens,
    IList<Core.Tokenization.Token> targetTokens,
    bool computeDiagonalOnly,
    Core.Tokenization.BuiltinRecognizers disabledAutoSubstitutions,
    out TagAssociations alignedTags)
{
    /*
     * The "classic" ED approach has the problem that it doesn't detect moves
     * reliably, particularly block moves. Patching up insert/delete pairs as
     * moves also won't catch moves which appear as changes in the ED.
     */
    if (sourceTokens == null)
    {
        throw new ArgumentNullException("sourceTokens");
    }
    if (targetTokens == null)
    {
        throw new ArgumentNullException("targetTokens");
    }

    alignedTags = null;

    int i, j;

    // TODO handle special cases (one/both of the arrays being empty/having no elements)
    // TODO use diagonal algorithm

    // developer switch: forces the full similarity/ED matrix computation when set to true
    bool enforceFullMatrixComputation = false;

    Core.EditDistance.EditDistance result
        = new Core.EditDistance.EditDistance(sourceTokens.Count, targetTokens.Count, 0.0d);

    // matrix which captures the similarity between two tokens as well as preassignments
    SimilarityMatrix sim = new SimilarityMatrix(sourceTokens, targetTokens,
        _UseStringEditDistance, disabledAutoSubstitutions);

    if (enforceFullMatrixComputation)
    {
        // this will be fully computed by the tag aligner in most cases, but we may save a bit
        // on plain text segments
        sim.Compute(computeDiagonalOnly);
    }

    MatrixItem[,] matrix = CreateEditDistanceMatrix(sourceTokens, targetTokens);

    alignedTags = TagAligner.AlignPairedTags(sourceTokens, targetTokens, sim);
    if (alignedTags != null && alignedTags.Count > 0)
    {
        // Patch the sim matrix so that non-aligned tags can't be assigned to each other
        PatchSimilarityMatrix(sim, sourceTokens, targetTokens, alignedTags);
        ComputeEditDistanceMatrix_Full(matrix, sim, alignedTags);
    }
    else if (enforceFullMatrixComputation)
    {
        ComputeEditDistanceMatrix_Full(matrix, sim, alignedTags);
    }
    else
    {
        ComputeEditDistanceMatrix_Lazy(matrix, sim);
    }

    // read out the cheapest path, walking from the bottom-right cell back to [0, 0]
    i = sourceTokens.Count;
    j = targetTokens.Count;
    result.Distance = matrix[i, j].Score;

    while (i > 0 || j > 0)
    {
        EditDistanceItem item = new EditDistanceItem();
        item.Resolution = EditDistanceResolution.None;

        MatrixItem m = matrix[i, j];
        item.Operation = m.Operation;

        switch (item.Operation)
        {
        case EditOperation.Identity:
            item.Costs = 0.0d;
            --i;
            --j;
            break;
        case EditOperation.Change:
            // the token similarity is looked up before the indexes are decremented
            item.Costs = _UseStringEditDistance
                ? (1.0d - m.Similarity)
                : (1.0d - SimilarityComputers.GetTokenSimilarity(sourceTokens[i - 1], targetTokens[j - 1],
                    true, disabledAutoSubstitutions));
            --i;
            --j;
            break;
        case EditOperation.Insert:
            item.Costs = _InsertDeleteCosts;
            --j;
            break;
        case EditOperation.Delete:
            item.Costs = _InsertDeleteCosts;
            --i;
            break;
        default:
            // Undefined, or any operation the matrix population is not expected to write.
            // Neither index would advance for such a cell, so bail out instead of looping
            // forever.
            throw new InvalidOperationException("Internal ED computation error");
        }

        // the item refers to the positions after the step
        item.Source = i;
        item.Target = j;
        result.AddAtStart(item);
    }

    if (alignedTags != null && alignedTags.Count > 0)
    {
        // should happen before move detection
        FixTagActions(sourceTokens, targetTokens, result, alignedTags);
    }

    // identify move operations which are pairs of insert/delete operations in the shortest path.
    // Note that the comparison result is already in the matrix and we only care about identity.
    // TODO we may rather use a configurable threshold than identity (1.0) to catch move operations
    //  of sufficiently similar items (e.g. case-insensitive)
    if (_ComputeMoves)
    {
        int moves = DetectMoves(result, matrix);
        if (moves > 0)
        {
            // adjust score: subtract moves * (deletionCosts + insertionCosts), add moves * moveCosts
            // TODO take moveDistance into account, i.e. penalty depends on distance?
            result.Distance -= (double)moves * (2.0d * _InsertDeleteCosts);
            result.Distance += (double)moves * _MoveCosts;
        }
    }

#if DEBUG
    // developer switch: dumps the tokens, the tag alignment and the final ED to a log file
    bool log = false;
    if (log)
    {
        string logPath = System.IO.Path.Combine(System.IO.Path.GetTempPath(), "ed.log");
        using (System.IO.TextWriter logStream
            = new System.IO.StreamWriter(logPath, false, System.Text.Encoding.UTF8))
        {
            logStream.WriteLine("Source objects:");
            for (int p = 0; p < sourceTokens.Count; ++p)
            {
                logStream.WriteLine("\t{0}:\t{1}", p, sourceTokens[p].ToString());
            }
            logStream.WriteLine();

            logStream.WriteLine("Target objects:");
            for (int p = 0; p < targetTokens.Count; ++p)
            {
                logStream.WriteLine("\t{0}:\t{1}", p, targetTokens[p].ToString());
            }
            logStream.WriteLine();
            logStream.WriteLine();

            if (alignedTags != null)
            {
                logStream.WriteLine("Tag Alignment:");
                foreach (TagAssociation ta in alignedTags)
                {
                    logStream.WriteLine("\t{0}", ta.ToString());
                }
                logStream.WriteLine();
                logStream.WriteLine();
            }

            result.Dump(logStream, "Final ED");
        }
    }

    // write matrix to a temp file in HTML format
    _DumpMatrix = false; // typeof(T) != typeof(char);
    if (_DumpMatrix)
    {
        string htmlPath = System.IO.Path.Combine(System.IO.Path.GetTempPath(), "SimMatrix.html");
        using (System.IO.StreamWriter wtr
            = new System.IO.StreamWriter(htmlPath, false, System.Text.Encoding.UTF8))
        using (System.Web.UI.Html32TextWriter htmlWriter = new System.Web.UI.Html32TextWriter(wtr))
        {
            htmlWriter.WriteFullBeginTag("html");
            htmlWriter.WriteFullBeginTag("body");
            htmlWriter.WriteBeginTag("table");
            htmlWriter.WriteAttribute("border", "1");

            // index -1 is the caption row/column
            for (j = -1; j <= targetTokens.Count; ++j)
            {
                htmlWriter.WriteFullBeginTag("tr");

                for (i = -1; i <= sourceTokens.Count; ++i)
                {
                    htmlWriter.WriteFullBeginTag("td");

                    if (i < 0)
                    {
                        // caption column: target index and token
                        if (j >= 0)
                        {
                            htmlWriter.Write("j={0}", j);
                            if (j > 0)
                            {
                                htmlWriter.WriteFullBeginTag("br");
                                htmlWriter.WriteFullBeginTag("b");
                                htmlWriter.Write(targetTokens[j - 1].ToString());
                                htmlWriter.WriteEndTag("b");
                            }
                        }
                    }
                    else if (j < 0)
                    {
                        // caption row (j < 0 but i >= 0): source index and token
                        htmlWriter.Write("i={0}", i);
                        if (i > 0)
                        {
                            htmlWriter.WriteFullBeginTag("br");
                            htmlWriter.WriteFullBeginTag("b");
                            htmlWriter.Write(sourceTokens[i - 1].ToString());
                            htmlWriter.WriteEndTag("b");
                        }
                    }
                    else
                    {
                        // content cell
                        htmlWriter.Write("d={0}", matrix[i, j].Score);
                        htmlWriter.WriteFullBeginTag("br");
                        htmlWriter.Write("s={0}", matrix[i, j].Similarity);
                        htmlWriter.WriteFullBeginTag("br");
                        htmlWriter.Write("o={0}", matrix[i, j].Operation.ToString());
                    }

                    htmlWriter.WriteEndTag("td");
                }

                htmlWriter.WriteEndTag("tr");
            }

            htmlWriter.WriteEndTag("table");

            htmlWriter.WriteFullBeginTag("h2");
            htmlWriter.Write("Result");
            htmlWriter.WriteEndTag("h2");

            htmlWriter.Write("Score = {0}", result.Distance);

            htmlWriter.WriteFullBeginTag("ol");
            for (i = 0; i < result.Items.Count; ++i)
            {
                htmlWriter.WriteFullBeginTag("li");
                htmlWriter.Write("{0}: s={1} t={2}",
                    result[i].Operation.ToString(), result[i].Source, result[i].Target);
            }
            htmlWriter.WriteEndTag("ol");

            htmlWriter.WriteEndTag("body");
            htmlWriter.WriteEndTag("html");
        }
    }
#endif

    return result;
}
/// <summary>
/// Populates the edit distance matrix on demand: only the bottom-right cell is requested,
/// and the cell computation pulls in the predecessor cells it needs.
/// </summary>
/// <param name="matrix">The edit distance matrix to populate.</param>
/// <param name="sim">The token similarity matrix; its token counts address the bottom-right cell.</param>
private void ComputeEditDistanceMatrix_Lazy(MatrixItem[,] matrix, SimilarityMatrix sim)
{
    int lastRow = sim.SourceTokens.Count;
    int lastColumn = sim.TargetTokens.Count;

    ComputeCell(matrix, sim, lastRow, lastColumn);
}
/// <summary>
/// Computes the cell [i, j] of the edit distance matrix unless it already carries an
/// operation. Predecessor cells are computed recursively, and only as far as needed:
/// the diagonal predecessor always, then the insert predecessor (below the main diagonal)
/// or the delete predecessor (on or above it), and the remaining one only if the first
/// neighbor is not cheaper than a change or the change is an invalid assignment.
/// </summary>
/// <param name="matrix">The edit distance matrix, which is updated in place.</param>
/// <param name="sim">The token similarity matrix.</param>
/// <param name="i">The first matrix index; the source token is at i - 1.</param>
/// <param name="j">The second matrix index; the target token is at j - 1.</param>
private void ComputeCell(MatrixItem[,] matrix, SimilarityMatrix sim, int i, int j)
{
    if (matrix[i, j].Operation != EditOperation.Undefined)
    {
        // cell already computed - no further processing required
        return;
    }

    // the diagonal predecessor is always needed
    ComputeCell(matrix, sim, i - 1, j - 1);
    System.Diagnostics.Debug.Assert(matrix[i - 1, j - 1].Operation != EditOperation.Undefined);

    double similarity = sim[i - 1, j - 1];

    // low similarity means high "change costs" and vice versa; a negative similarity
    // marks an invalid assignment
    double changeCosts;
    if (similarity < 0.0d)
    {
        changeCosts = _InvalidAssignmentCosts;
    }
    else
    {
        changeCosts = matrix[i - 1, j - 1].Score + (1.0d - similarity);
    }
    bool changeIsInvalid = changeCosts == _InvalidAssignmentCosts;

    EditOperation op;
    double insertCosts = 0.0d;
    double deleteCosts = 0.0d;

    // i == j: main diagonal. i < j: below, i > j: above.
    if (i >= j)
    {
        // on or above the main diagonal: look at the deletion first
        ComputeCell(matrix, sim, i - 1, j);
        deleteCosts = matrix[i - 1, j].Score + _InsertDeleteCosts;

        if (changeIsInvalid || deleteCosts >= changeCosts)
        {
            // need to get the insert costs as well
            ComputeCell(matrix, sim, i, j - 1);
            insertCosts = matrix[i, j - 1].Score + _InsertDeleteCosts;
            op = GetOperation(changeCosts, insertCosts, deleteCosts, similarity);
        }
        else
        {
            op = (deleteCosts < changeCosts) ? EditOperation.Delete : EditOperation.Change;
        }
    }
    else
    {
        // below the main diagonal: look at the insertion first
        ComputeCell(matrix, sim, i, j - 1);
        insertCosts = matrix[i, j - 1].Score + _InsertDeleteCosts;

        if (changeIsInvalid || insertCosts >= changeCosts)
        {
            // need to get the deletion costs as well
            ComputeCell(matrix, sim, i - 1, j);
            deleteCosts = matrix[i - 1, j].Score + _InsertDeleteCosts;
            op = GetOperation(changeCosts, insertCosts, deleteCosts, similarity);
        }
        else
        {
            op = (insertCosts < changeCosts) ? EditOperation.Insert : EditOperation.Change;
        }
    }

    matrix[i, j].Similarity = similarity;
    matrix[i, j].Operation = op;

    System.Diagnostics.Debug.Assert(op != EditOperation.Undefined);

    // the cell score is the cost of the chosen operation; everything which is neither
    // a deletion nor an insertion is scored with the change costs
    switch (op)
    {
    case EditOperation.Delete:
        matrix[i, j].Score = deleteCosts;
        break;
    case EditOperation.Insert:
        matrix[i, j].Score = insertCosts;
        break;
    default:
        matrix[i, j].Score = changeCosts;
        break;
    }
}
/// <summary>
/// Populates every cell [1..source count, 1..target count] of the edit distance matrix.
/// Each cell gets the cheapest of delete, insert and change (ties are resolved in that
/// order), unless the tag alignment demands an insertion or deletion for the source or
/// target position, in which case that operation replaces the computed one.
/// </summary>
/// <param name="matrix">The edit distance matrix, which is updated in place. The cells of
/// row 0 and column 0 are read as predecessors and are expected to be defined already.</param>
/// <param name="sim">The token similarity matrix.</param>
/// <param name="alignedTags">The tag alignment; may be null or empty, in which case no
/// operation is overridden.</param>
private void ComputeEditDistanceMatrix_Full(MatrixItem[,] matrix, SimilarityMatrix sim, TagAssociations alignedTags)
{
    bool hasTagAlignment = alignedTags != null && alignedTags.Count > 0;

    for (int row = 1; row <= sim.SourceTokens.Count; ++row)
    {
        for (int col = 1; col <= sim.TargetTokens.Count; ++col)
        {
            // current cell must not yet be computed:
            System.Diagnostics.Debug.Assert(matrix[row, col].Operation == EditOperation.Undefined);

            // predecessors must be valid:
            System.Diagnostics.Debug.Assert(matrix[row - 1, col - 1].Operation != EditOperation.Undefined);
            System.Diagnostics.Debug.Assert(matrix[row, col - 1].Operation != EditOperation.Undefined);
            System.Diagnostics.Debug.Assert(matrix[row - 1, col].Operation != EditOperation.Undefined);

            double similarity = sim[row - 1, col - 1];
            System.Diagnostics.Debug.Assert((similarity >= 0.0d && similarity <= 1.0d) || similarity == -1.0d);

            // low similarity means high "change costs" and vice versa; a negative
            // similarity marks an invalid assignment
            double changeCosts;
            if (similarity < 0)
            {
                changeCosts = _InvalidAssignmentCosts;
            }
            else
            {
                changeCosts = matrix[row - 1, col - 1].Score + (1.0d - similarity);
            }
            double insertCosts = matrix[row, col - 1].Score + _InsertDeleteCosts;
            double deleteCosts = matrix[row - 1, col].Score + _InsertDeleteCosts;

            double cheapest = Math.Min(Math.Min(changeCosts, deleteCosts), insertCosts);

            // verify the shortcut condition:
            System.Diagnostics.Debug.Assert(similarity < 1.0d || cheapest == changeCosts);

            // on ties a deletion wins over an insertion, and both win over a change
            EditOperation op;
            if (cheapest == deleteCosts)
            {
                op = EditOperation.Delete;
            }
            else if (cheapest == insertCosts)
            {
                op = EditOperation.Insert;
            }
            else if (cheapest == changeCosts)
            {
                op = (similarity == 1.0d) ? EditOperation.Identity : EditOperation.Change;
            }
            else
            {
                op = EditOperation.Undefined;
            }

            if (hasTagAlignment)
            {
                // check whether tag alignment overrides ED result:
                // TODO do this during population or during readout?
                EditOperation srcTagOp = alignedTags.GetOperationBySourcePosition(row - 1);
                EditOperation trgTagOp = alignedTags.GetOperationByTargetPosition(col - 1);

                // changes/identity of tags are through ED, while the tag alignment
                // defines deletions, insertions
                bool srcOverrides = (srcTagOp == EditOperation.Insert || srcTagOp == EditOperation.Delete)
                    && op != srcTagOp;
                if (srcOverrides)
                {
                    // this is where the pre-alignment of tags supersedes the ED result
                    op = srcTagOp;
                }
                else
                {
                    // also reached when the source side already agrees with the ED result
                    bool trgOverrides = (trgTagOp == EditOperation.Insert || trgTagOp == EditOperation.Delete)
                        && op != trgTagOp;
                    if (trgOverrides)
                    {
                        op = trgTagOp;
                    }
                }
            }

            matrix[row, col].Similarity = similarity;
            matrix[row, col].Operation = op;

            // the cell score is the cost of the final operation; everything which is
            // neither a deletion nor an insertion is scored with the change costs
            switch (op)
            {
            case EditOperation.Delete:
                matrix[row, col].Score = deleteCosts;
                break;
            case EditOperation.Insert:
                matrix[row, col].Score = insertCosts;
                break;
            default:
                matrix[row, col].Score = changeCosts;
                break;
            }
        }
    }
}