/// <summary>
/// Creates a similarity matrix with one cell per (source token, target token) pair.
/// Every cell is initialised to <c>_Uncomputed</c>; no similarity is calculated here.
/// </summary>
/// <param name="sourceTokens">Source tokens; their count is the first dimension of the matrix. Must not be null.</param>
/// <param name="targetTokens">Target tokens; their count is the second dimension of the matrix. Must not be null.</param>
/// <param name="useStringEditDistance">Stored on the instance; not used by the constructor itself.</param>
/// <param name="disabledAutoSubstitutions">Stored on the instance; not used by the constructor itself.</param>
/// <exception cref="System.ArgumentNullException">
/// <paramref name="sourceTokens"/> or <paramref name="targetTokens"/> is null.
/// </exception>
public SimilarityMatrix(IList<Core.Tokenization.Token> sourceTokens,
    IList<Core.Tokenization.Token> targetTokens,
    bool useStringEditDistance,
    Core.Tokenization.BuiltinRecognizers disabledAutoSubstitutions)
{
    // Fail fast with a descriptive exception instead of a NullReferenceException
    // on the .Count accesses below (which would also leave the fields half-assigned).
    if (sourceTokens == null)
    {
        throw new System.ArgumentNullException("sourceTokens");
    }
    if (targetTokens == null)
    {
        throw new System.ArgumentNullException("targetTokens");
    }

    _SourceTokens = sourceTokens;
    _TargetTokens = targetTokens;
    _UseStringEditDistance = useStringEditDistance;
    _DisabledAutoSubstitutions = disabledAutoSubstitutions;

    _Sim = new double[_SourceTokens.Count, _TargetTokens.Count];

    // Mark every cell as "not computed yet".
    for (int s = 0; s < _SourceTokens.Count; ++s)
    {
        for (int t = 0; t < _TargetTokens.Count; ++t)
        {
            _Sim[s, t] = _Uncomputed;
        }
    }
}
/// <summary>
/// Computes the edit distance between two token sequences by delegating to
/// <c>ComputeEditDistanceImpl_Original</c>. In DEBUG builds the resulting items are
/// additionally checked with <c>VerifyEditDistance</c> and an assertion fires when
/// that check fails.
/// </summary>
/// <param name="sourceTokens">The source token sequence.</param>
/// <param name="targetTokens">The target token sequence.</param>
/// <param name="computeDiagonalOnly">If number of tokens is equivalent, only the diagonal's similarities are computed.</param>
/// <param name="disabledAutoSubstitutions">Recognizer flags forwarded unchanged to the implementation.</param>
/// <param name="alignedTags">Receives the tag associations produced by the implementation.</param>
/// <returns>The edit distance returned by the implementation.</returns>
public Core.EditDistance.EditDistance ComputeEditDistance(
    IList<Core.Tokenization.Token> sourceTokens,
    IList<Core.Tokenization.Token> targetTokens,
    bool computeDiagonalOnly,
    Core.Tokenization.BuiltinRecognizers disabledAutoSubstitutions,
    out TagAssociations alignedTags)
{
    Core.EditDistance.EditDistance distance = ComputeEditDistanceImpl_Original(
        sourceTokens,
        targetTokens,
        computeDiagonalOnly,
        disabledAutoSubstitutions,
        out alignedTags);

#if DEBUG
    // Debug.Assert is a no-op when the condition holds, so no extra guard is needed.
    bool verified = VerifyEditDistance(distance.Items, sourceTokens.Count, targetTokens.Count);
    System.Diagnostics.Debug.Assert(verified, "ED error - let Oli know and provide test data");
#endif

    return distance;
}
/// <summary>
/// Scores how well two placeable tokens match.
/// </summary>
/// <remarks>
/// Scores returned:
/// 1.0  - identical type and value;
/// 0.85 - identical type (as reported by <c>GetSimilarity</c>);
/// 0.7  - same token type but otherwise not similar, or not equal while the
///        recognizer for that token type is listed in <paramref name="disabledAutoSubstitutions"/>;
/// 0.0  - either token is not placeable, or token type / runtime type differ;
/// -1.0 - tags that cannot be aligned with each other.
/// </remarks>
/// <param name="a">First placeable token.</param>
/// <param name="b">Second placeable token.</param>
/// <param name="disabledAutoSubstitutions">
/// Recognizer flags; for a token type whose flag is set, only equal tokens score 1.0
/// and all others score 0.7.
/// </param>
/// <returns>The similarity score as listed above.</returns>
public static double GetPlaceableSimilarity(Core.Tokenization.Token a,
    Core.Tokenization.Token b,
    Core.Tokenization.BuiltinRecognizers disabledAutoSubstitutions)
{
    if (!a.IsPlaceable || !b.IsPlaceable)
    {
        System.Diagnostics.Debug.Assert(false, "Expect placeable types");
        return 0.0d;
    }

    // Tokens of different kinds cannot be compared.
    if (a.Type != b.Type || a.GetType() != b.GetType())
    {
        return 0.0d;
    }

    Core.Tokenization.TagToken tagA = a as Core.Tokenization.TagToken;
    Core.Tokenization.TagToken tagB = b as Core.Tokenization.TagToken;
    bool anyTag = tagA != null || tagB != null;

    // A tag can only be aligned with another tag of the same tag type.
    if (anyTag && (tagA == null || tagB == null || tagA.Tag.Type != tagB.Tag.Type))
    {
        return -1.0d;
    }

    Core.Tokenization.BuiltinRecognizers none =
        Sdl.LanguagePlatform.Core.Tokenization.BuiltinRecognizers.RecognizeNone;

    if (disabledAutoSubstitutions != none)
    {
        // Map the token type to the recognizer flag that governs it, if any.
        bool hasRecognizer = true;
        Core.Tokenization.BuiltinRecognizers recognizer = none;

        switch (a.Type)
        {
        case Sdl.LanguagePlatform.Core.Tokenization.TokenType.Abbreviation:
            recognizer = Sdl.LanguagePlatform.Core.Tokenization.BuiltinRecognizers.RecognizeAcronyms;
            break;
        case Sdl.LanguagePlatform.Core.Tokenization.TokenType.Date:
            recognizer = Sdl.LanguagePlatform.Core.Tokenization.BuiltinRecognizers.RecognizeDates;
            break;
        case Sdl.LanguagePlatform.Core.Tokenization.TokenType.Time:
            recognizer = Sdl.LanguagePlatform.Core.Tokenization.BuiltinRecognizers.RecognizeTimes;
            break;
        case Sdl.LanguagePlatform.Core.Tokenization.TokenType.Variable:
            recognizer = Sdl.LanguagePlatform.Core.Tokenization.BuiltinRecognizers.RecognizeVariables;
            break;
        case Sdl.LanguagePlatform.Core.Tokenization.TokenType.Number:
            recognizer = Sdl.LanguagePlatform.Core.Tokenization.BuiltinRecognizers.RecognizeNumbers;
            break;
        case Sdl.LanguagePlatform.Core.Tokenization.TokenType.Measurement:
            recognizer = Sdl.LanguagePlatform.Core.Tokenization.BuiltinRecognizers.RecognizeMeasurements;
            break;
        default:
            hasRecognizer = false;
            break;
        }

        if (hasRecognizer && (disabledAutoSubstitutions & recognizer) != none)
        {
            // Auto-substitution is disabled for this type: only exact equality is a full match.
            return a.Equals(b) ? 1.0d : 0.7d;
        }
    }

    Core.SegmentElement.Similarity similarity = a.GetSimilarity(b);

    if (similarity == Core.SegmentElement.Similarity.IdenticalValueAndType)
    {
        return 1.0d;
    }
    if (similarity == Core.SegmentElement.Similarity.IdenticalType)
    {
        return 0.85d;
    }
    if (similarity == Core.SegmentElement.Similarity.None)
    {
        // The tokens are not similar, but their token types are identical (checked above).
        return 0.7d;
    }

    return 0.0d;
}
/// <summary>
/// Scores the similarity of two tokens.
/// </summary>
/// <remarks>
/// Scores returned:
/// -1.0 - tag vs. non-tag, whitespace vs. non-whitespace, punctuation vs.
///        non-punctuation, or two tags whose types are not assignable;
/// 0.95 - two tags of the same tag type;
/// 0.85 - one standalone tag and one text placeholder tag;
/// two placeables - the result of <c>GetPlaceableSimilarity</c>;
/// 0.0  - either token text is null.
/// Otherwise a text score is computed and reduced by 0.1 when exactly one token is a
/// word (never below 0.0):
/// 1.0  - identical text (ordinal);
/// 0.94 - differing whitespace or punctuation;
/// 0.95 - identical text ignoring case;
/// 0.85 - two simple tokens with identical stems ignoring case;
/// else 0.95 * GetThreshold(GetStringSimilarity(...)) when
/// <paramref name="useStringEditDistance"/> is set, and 0.0 when it is not.
/// </remarks>
/// <param name="a">First token.</param>
/// <param name="b">Second token.</param>
/// <param name="useStringEditDistance">Whether differing texts are scored by string similarity.</param>
/// <param name="disabledAutoSubstitutions">Forwarded to <c>GetPlaceableSimilarity</c>.</param>
/// <returns>The similarity score as listed above.</returns>
public static double GetTokenSimilarity(Core.Tokenization.Token a,
    Core.Tokenization.Token b,
    bool useStringEditDistance,
    Core.Tokenization.BuiltinRecognizers disabledAutoSubstitutions)
{
    // TODO consider normalization of token texts, or let the tokenizer store the text in normalized form.

    Core.Tokenization.TagToken tagA = a as Core.Tokenization.TagToken;
    Core.Tokenization.TagToken tagB = b as Core.Tokenization.TagToken;
    bool sameTagness = (tagA != null) == (tagB != null);

    // Comparing a tag with a non-tag results in no-change-allowed similarity (<0);
    // the same holds for whitespace and punctuation.
    if (!sameTagness
        || a.IsWhitespace != b.IsWhitespace
        || a.IsPunctuation != b.IsPunctuation)
    {
        return -1.0d;
    }

    if (tagA != null)
    {
        // Both tokens are tags here (tagness was found equal above).
        System.Diagnostics.Debug.Assert(tagA.Tag != null && tagB.Tag != null);

        Sdl.LanguagePlatform.Core.TagType typeA = tagA.Tag.Type;
        Sdl.LanguagePlatform.Core.TagType typeB = tagB.Tag.Type;

        if (typeA == typeB)
        {
            // assignable
            return 0.95d;
        }

        bool standaloneWithTextPlaceholder =
            (typeA == Sdl.LanguagePlatform.Core.TagType.Standalone
                && typeB == Sdl.LanguagePlatform.Core.TagType.TextPlaceholder)
            || (typeA == Sdl.LanguagePlatform.Core.TagType.TextPlaceholder
                && typeB == Sdl.LanguagePlatform.Core.TagType.Standalone);

        // one placeholder and one text placeholder are weakly assignable; anything else is not
        return standaloneWithTextPlaceholder ? 0.85d : -1.0d;
    }

    if (a.IsPlaceable && b.IsPlaceable)
    {
        return GetPlaceableSimilarity(a, b, disabledAutoSubstitutions);
    }

    if (a.Text == null || b.Text == null)
    {
        System.Diagnostics.Debug.Assert(false, "Expected non-null token text. Let Oli know if this assertion fires and provide test data.");
        return 0.0d;
    }

    // Tokens of different types reduce the similarity.
    // NOTE only checks whether both are words or non-words.
    double penalty = (a.IsWord != b.IsWord) ? 0.1d : 0.0d;

    double score;
    if (a.Text.Equals(b.Text, StringComparison.Ordinal))
    {
        score = 1.0d;
    }
    else if (a.IsWhitespace || a.IsPunctuation)
    {
        // Slightly less than the SegmentEditDistanceComputer's move threshold, as
        // we don't want to move them around.
        score = 0.94d;
    }
    else if (a.Text.Equals(b.Text, StringComparison.OrdinalIgnoreCase))
    {
        // We want to detect moves for such tokens, so use the
        // SegmentEditDistanceComputer's move threshold.
        score = 0.95d;
    }
    else
    {
        Core.Tokenization.SimpleToken simpleA = a as Core.Tokenization.SimpleToken;
        Core.Tokenization.SimpleToken simpleB = b as Core.Tokenization.SimpleToken;

        bool stemsMatch = simpleA != null
            && simpleB != null
            && simpleA.Stem != null
            && simpleB.Stem != null
            && simpleA.Stem.Equals(simpleB.Stem, StringComparison.OrdinalIgnoreCase);

        if (stemsMatch)
        {
            score = 0.85d;
        }
        else if (useStringEditDistance)
        {
            // Texts are neither identical nor identical ignoring case.
            score = 0.95d * GetThreshold(GetStringSimilarity(a.Text, b.Text));
        }
        else
        {
            score = 0.0d;
        }
    }

    return Math.Max(0.0d, score - penalty);
}
/// <summary>
/// Computes the edit distance between two token sequences: builds the similarity and
/// edit distance matrices, aligns paired tags, reads the cheapest path back from the
/// matrix and, when <c>_ComputeMoves</c> is set, adjusts the distance for detected moves.
/// </summary>
/// <param name="sourceTokens">The source token sequence. Must not be null.</param>
/// <param name="targetTokens">The target token sequence. Must not be null.</param>
/// <param name="computeDiagonalOnly">
/// Only passed to <c>SimilarityMatrix.Compute</c> when full matrix computation is enforced,
/// which is currently hard-wired off in this method.
/// </param>
/// <param name="disabledAutoSubstitutions">Recognizer flags passed on to the similarity computation.</param>
/// <param name="alignedTags">Receives the result of <c>TagAligner.AlignPairedTags</c>; may be null.</param>
/// <returns>The edit distance with its items ordered from the start of the path.</returns>
/// <exception cref="ArgumentNullException">
/// <paramref name="sourceTokens"/> or <paramref name="targetTokens"/> is null.
/// </exception>
/// <exception cref="InvalidOperationException">
/// The matrix contains an undefined or unexpected operation on the cheapest path.
/// </exception>
private Core.EditDistance.EditDistance ComputeEditDistanceImpl_Original(
    IList<Core.Tokenization.Token> sourceTokens,
    IList<Core.Tokenization.Token> targetTokens,
    bool computeDiagonalOnly,
    Core.Tokenization.BuiltinRecognizers disabledAutoSubstitutions,
    out TagAssociations alignedTags)
{
    /*
     * The "classic" ED approach has the problem that it doesn't detect moves
     * reliably, particularly block moves. Patching up insert/delete pairs as
     * moves also won't catch moves which appear as changes in the ED.
     */

    if (sourceTokens == null)
    {
        throw new ArgumentNullException("sourceTokens");
    }
    if (targetTokens == null)
    {
        throw new ArgumentNullException("targetTokens");
    }

    alignedTags = null;

    int i, j;

    // TODO handle special cases (one/both of the arrays being empty/having no elements)
    // TODO use diagonal algorithm

    bool enforceFullMatrixComputation = false;

    Core.EditDistance.EditDistance result =
        new Core.EditDistance.EditDistance(sourceTokens.Count, targetTokens.Count, 0.0d);

    // matrix which captures the similarity between two tokens as well as preassignments
    SimilarityMatrix sim = new SimilarityMatrix(sourceTokens, targetTokens,
        _UseStringEditDistance, disabledAutoSubstitutions);

    if (enforceFullMatrixComputation)
    {
        // this will be fully computed by the tag aligner in most cases, but we may save a bit
        // on plain text segments
        sim.Compute(computeDiagonalOnly);
    }

    MatrixItem[,] matrix = CreateEditDistanceMatrix(sourceTokens, targetTokens);

    alignedTags = TagAligner.AlignPairedTags(sourceTokens, targetTokens, sim);
    if (alignedTags != null && alignedTags.Count > 0)
    {
        // Patch the sim matrix so that non-aligned tags can't be assigned to each other
        PatchSimilarityMatrix(sim, sourceTokens, targetTokens, alignedTags);
        ComputeEditDistanceMatrix_Full(matrix, sim, alignedTags);
    }
    else if (enforceFullMatrixComputation)
    {
        ComputeEditDistanceMatrix_Full(matrix, sim, alignedTags);
    }
    else
    {
        ComputeEditDistanceMatrix_Lazy(matrix, sim);
    }

    // read out the cheapest path, walking from the bottom-right cell back to the origin
    i = sourceTokens.Count;
    j = targetTokens.Count;
    result.Distance = matrix[i, j].Score;

    while (i > 0 || j > 0)
    {
        EditDistanceItem item = new EditDistanceItem();
        item.Resolution = EditDistanceResolution.None;

        MatrixItem m = matrix[i, j];

        item.Operation = m.Operation;
        switch (item.Operation)
        {
        case EditOperation.Identity:
            item.Costs = 0.0d;
            --i;
            --j;
            break;
        case EditOperation.Change:
            // Without string edit distance the matrix similarity is not used for the
            // costs; the token similarity is recomputed with string edit distance on.
            item.Costs = _UseStringEditDistance
                ? (1.0d - m.Similarity)
                : (1.0d - SimilarityComputers.GetTokenSimilarity(sourceTokens[i - 1], targetTokens[j - 1],
                    true, disabledAutoSubstitutions));
            --i;
            --j;
            break;
        case EditOperation.Insert:
            item.Costs = _InsertDeleteCosts;
            --j;
            break;
        case EditOperation.Delete:
            item.Costs = _InsertDeleteCosts;
            --i;
            break;
        case EditOperation.Undefined:
            throw new InvalidOperationException("Internal ED computation error");
        default:
            // Any other operation would leave i and j untouched and loop forever.
            throw new InvalidOperationException(
                "Internal ED computation error: unexpected operation " + item.Operation.ToString());
        }

        item.Source = i;
        item.Target = j;
        result.AddAtStart(item);
    }

    if (alignedTags != null && alignedTags.Count > 0)
    {
        // should happen before move detection
        FixTagActions(sourceTokens, targetTokens, result, alignedTags);
    }

    // identify move operations which are pairs of insert/delete operations in the shortest path.
    // Note that the comparison result is already in the matrix and we only care about identity.
    // TODO we may rather use a configurable threshold than identity (1.0) to catch move operations
    //  of sufficiently similar items (e.g. case-insensitive)

    if (_ComputeMoves)
    {
        int moves = DetectMoves(result, matrix);
        if (moves > 0)
        {
            // adjust score: subtract moves * (deletionCosts + insertionCosts), add moves * moveCosts
            // TODO take moveDistance into account, i.e. penalty depends on distance?
            result.Distance -= (double)moves * (2.0d * _InsertDeleteCosts);
            result.Distance += (double)moves * _MoveCosts;
        }
    }

#if DEBUG
    // optional plain-text log of the inputs, the tag alignment and the final result
    bool log = false;
    if (log)
    {
        // 'using' guarantees the file is closed even if one of the writes throws
        using (System.IO.TextWriter logStream = new System.IO.StreamWriter(
            System.IO.Path.GetTempPath() + "/ed.log", false, System.Text.Encoding.UTF8))
        {
            logStream.WriteLine("Source objects:");
            for (int p = 0; p < sourceTokens.Count; ++p)
            {
                logStream.WriteLine("\t{0}:\t{1}", p, sourceTokens[p].ToString());
            }
            logStream.WriteLine();
            logStream.WriteLine("Target objects:");
            for (int p = 0; p < targetTokens.Count; ++p)
            {
                logStream.WriteLine("\t{0}:\t{1}", p, targetTokens[p].ToString());
            }
            logStream.WriteLine();
            logStream.WriteLine();

            if (alignedTags != null)
            {
                logStream.WriteLine("Tag Alignment:");
                foreach (TagAssociation ta in alignedTags)
                {
                    logStream.WriteLine("\t{0}", ta.ToString());
                }
                logStream.WriteLine();
                logStream.WriteLine();
            }

            result.Dump(logStream, "Final ED");
        }
    }
#endif

#if DEBUG
    // write matrix to a temp file in HTML format
    _DumpMatrix = false; // typeof(T) != typeof(char);
    if (_DumpMatrix)
    {
        // 'using' on the underlying writer releases the file even if a write throws;
        // the explicit htmlWriter.Close() at the end is kept for the normal path.
        using (System.IO.StreamWriter wtr = new System.IO.StreamWriter(
            System.IO.Path.GetTempPath() + "/SimMatrix.html", false, System.Text.Encoding.UTF8))
        {
            System.Web.UI.Html32TextWriter htmlWriter = new System.Web.UI.Html32TextWriter(wtr);

            htmlWriter.WriteFullBeginTag("html");
            htmlWriter.WriteFullBeginTag("body");
            htmlWriter.WriteBeginTag("table");
            htmlWriter.WriteAttribute("border", "1");

            for (j = -1; j <= targetTokens.Count; ++j)
            {
                htmlWriter.WriteFullBeginTag("tr");

                for (i = -1; i <= sourceTokens.Count; ++i)
                {
                    htmlWriter.WriteFullBeginTag("td");

                    if (i < 0)
                    {
                        // caption row
                        if (j >= 0)
                        {
                            htmlWriter.Write("j={0}", j);
                            if (j > 0)
                            {
                                htmlWriter.WriteFullBeginTag("br");
                                htmlWriter.WriteFullBeginTag("b");
                                htmlWriter.Write(targetTokens[j - 1].ToString());
                                htmlWriter.WriteEndTag("b");
                            }
                        }
                    }
                    else if (j < 0)
                    {
                        // j < 0 but i >= 0 -->
                        htmlWriter.Write("i={0}", i);
                        if (i > 0)
                        {
                            htmlWriter.WriteFullBeginTag("br");
                            htmlWriter.WriteFullBeginTag("b");
                            htmlWriter.Write(sourceTokens[i - 1].ToString());
                            htmlWriter.WriteEndTag("b");
                        }
                    }
                    else
                    {
                        // content cell
                        htmlWriter.Write("d={0}", matrix[i, j].Score);
                        htmlWriter.WriteFullBeginTag("br");
                        htmlWriter.Write("s={0}", matrix[i, j].Similarity);
                        htmlWriter.WriteFullBeginTag("br");
                        htmlWriter.Write("o={0}", matrix[i, j].Operation.ToString());
                    }

                    htmlWriter.WriteEndTag("td");
                }

                htmlWriter.WriteEndTag("tr");
            }

            htmlWriter.WriteEndTag("table");

            htmlWriter.WriteFullBeginTag("h2");
            htmlWriter.Write("Result");
            htmlWriter.WriteEndTag("h2");

            htmlWriter.Write("Score = {0}", result.Distance);

            htmlWriter.WriteFullBeginTag("ol");

            for (i = 0; i < result.Items.Count; ++i)
            {
                htmlWriter.WriteFullBeginTag("li");
                htmlWriter.Write("{0}: s={1} t={2}",
                    result[i].Operation.ToString(), result[i].Source, result[i].Target);
            }

            htmlWriter.WriteEndTag("ol");

            htmlWriter.WriteEndTag("body");
            htmlWriter.WriteEndTag("html");

            htmlWriter.Close();
        }
    }
#endif

    return result;
}