/// <summary>
/// Fills in every cell of the similarity matrix that is still marked as uncomputed.
/// Cells that already hold a value (computed or preassigned) are left untouched.
/// </summary>
/// <param name="computeDiagonalOnly">
/// When <c>true</c>, only uncomputed cells on the main diagonal (source index equals
/// target index) receive a real token similarity; every other uncomputed cell is set
/// to <c>-1.0</c>. When <c>false</c>, all uncomputed cells are computed.
/// </param>
public void Compute(bool computeDiagonalOnly)
{
    for (int row = 0; row < _SourceTokens.Count; ++row)
    {
        for (int col = 0; col < _TargetTokens.Count; ++col)
        {
            if (_Sim[row, col] == _Uncomputed)
            {
                // Off-diagonal cells are skipped (marked -1.0) in diagonal-only mode.
                bool skipCell = computeDiagonalOnly && row != col;

                _Sim[row, col] = skipCell
                    ? -1.0d
                    : SimilarityComputers.GetTokenSimilarity(
                        _SourceTokens[row],
                        _TargetTokens[col],
                        _UseStringEditDistance,
                        _DisabledAutoSubstitutions);
            }
        }
    }
}
/// <summary>
/// Gets or sets the similarity stored for the source token at index
/// <paramref name="s"/> and the target token at index <paramref name="t"/>.
/// </summary>
/// <remarks>
/// The getter is lazy: if the cell is still marked as uncomputed, the token similarity
/// is computed, written back into the matrix and then returned. The setter overwrites
/// the stored value unconditionally.
/// </remarks>
/// <param name="s">Index into the source tokens.</param>
/// <param name="t">Index into the target tokens.</param>
public double this[int s, int t]
{
    get
    {
        double cached = _Sim[s, t];
        if (cached != _Uncomputed)
        {
            return cached;
        }

        // First access to this cell: compute once and remember the value.
        double computed = SimilarityComputers.GetTokenSimilarity(
            _SourceTokens[s],
            _TargetTokens[t],
            _UseStringEditDistance,
            _DisabledAutoSubstitutions);
        _Sim[s, t] = computed;
        return computed;
    }

    set
    {
        _Sim[s, t] = value;
    }
}
/// <summary>
/// Computes the edit distance between <paramref name="sourceTokens"/> and
/// <paramref name="targetTokens"/> by filling an edit distance matrix and then reading
/// the cheapest path back out of it as a sequence of edit operations.
/// </summary>
/// <remarks>
/// The "classic" ED approach has the problem that it doesn't detect moves reliably,
/// particularly block moves. Patching up insert/delete pairs as moves also won't catch
/// moves which appear as changes in the ED.
/// </remarks>
/// <param name="sourceTokens">The source token sequence. Must not be <c>null</c>.</param>
/// <param name="targetTokens">The target token sequence. Must not be <c>null</c>.</param>
/// <param name="computeDiagonalOnly">
/// Forwarded to <c>SimilarityMatrix.Compute</c>. That call is guarded by a local flag
/// which is currently hard-wired to <c>false</c>, so this value has no effect at present.
/// </param>
/// <param name="disabledAutoSubstitutions">
/// Passed through to the similarity matrix and to the token similarity computation.
/// </param>
/// <param name="alignedTags">
/// Receives the result of <c>TagAligner.AlignPairedTags</c> for the two sequences;
/// may be <c>null</c>.
/// </param>
/// <returns>
/// The edit distance result. Its <c>Distance</c> is the score of the final matrix cell,
/// adjusted for detected moves when move computation is enabled.
/// </returns>
/// <exception cref="ArgumentNullException">
/// <paramref name="sourceTokens"/> or <paramref name="targetTokens"/> is <c>null</c>.
/// </exception>
/// <exception cref="InvalidOperationException">
/// A matrix cell on the cheapest path carries an undefined or unexpected operation.
/// </exception>
private Core.EditDistance.EditDistance ComputeEditDistanceImpl_Original(
    IList<Core.Tokenization.Token> sourceTokens,
    IList<Core.Tokenization.Token> targetTokens,
    bool computeDiagonalOnly,
    Core.Tokenization.BuiltinRecognizers disabledAutoSubstitutions,
    out TagAssociations alignedTags)
{
    if (sourceTokens == null)
    {
        throw new ArgumentNullException("sourceTokens");
    }

    if (targetTokens == null)
    {
        throw new ArgumentNullException("targetTokens");
    }

    alignedTags = null;

    int i, j;

    // TODO handle special cases (one/both of the arrays being empty/having no elements)
    // TODO use diagonal algorithm

    bool enforceFullMatrixComputation = false;

    Core.EditDistance.EditDistance result =
        new Core.EditDistance.EditDistance(sourceTokens.Count, targetTokens.Count, 0.0d);

    // matrix which captures the similarity between two tokens as well as preassignments
    SimilarityMatrix sim = new SimilarityMatrix(
        sourceTokens, targetTokens, _UseStringEditDistance, disabledAutoSubstitutions);

    if (enforceFullMatrixComputation)
    {
        // this will be fully computed by the tag aligner in most cases, but we may save a bit
        // on plain text segments
        sim.Compute(computeDiagonalOnly);
    }

    MatrixItem[,] matrix = CreateEditDistanceMatrix(sourceTokens, targetTokens);

    alignedTags = TagAligner.AlignPairedTags(sourceTokens, targetTokens, sim);
    if (alignedTags != null && alignedTags.Count > 0)
    {
        // Patch the sim matrix so that non-aligned tags can't be assigned to each other
        PatchSimilarityMatrix(sim, sourceTokens, targetTokens, alignedTags);
        ComputeEditDistanceMatrix_Full(matrix, sim, alignedTags);
    }
    else if (enforceFullMatrixComputation)
    {
        ComputeEditDistanceMatrix_Full(matrix, sim, alignedTags);
    }
    else
    {
        ComputeEditDistanceMatrix_Lazy(matrix, sim);
    }

    // read out the cheapest path, walking from the last cell back to the origin
    i = sourceTokens.Count;
    j = targetTokens.Count;
    result.Distance = matrix[i, j].Score;

    while (i > 0 || j > 0)
    {
        EditDistanceItem item = new EditDistanceItem();
        item.Resolution = EditDistanceResolution.None;

        MatrixItem m = matrix[i, j];
        item.Operation = m.Operation;

        switch (item.Operation)
        {
            case EditOperation.Identity:
                item.Costs = 0.0d;
                --i;
                --j;
                break;

            case EditOperation.Change:
                // When string edit distance is not in use, the similarity stored in the
                // matrix is not taken as the cost basis; it is recomputed with string
                // edit distance forced on.
                item.Costs = _UseStringEditDistance
                    ? (1.0d - m.Similarity)
                    : (1.0d - SimilarityComputers.GetTokenSimilarity(
                        sourceTokens[i - 1], targetTokens[j - 1], true, disabledAutoSubstitutions));
                --i;
                --j;
                break;

            case EditOperation.Insert:
                item.Costs = _InsertDeleteCosts;
                --j;
                break;

            case EditOperation.Delete:
                item.Costs = _InsertDeleteCosts;
                --i;
                break;

            case EditOperation.Undefined:
            default:
                // Any operation that does not advance i or j would make this loop spin
                // forever, so unexpected values are rejected together with Undefined.
                throw new InvalidOperationException("Internal ED computation error");
        }

        item.Source = i;
        item.Target = j;
        result.AddAtStart(item);
    }

    if (alignedTags != null && alignedTags.Count > 0)
    {
        // should happen before move detection
        FixTagActions(sourceTokens, targetTokens, result, alignedTags);
    }

    // identify move operations which are pairs of insert/delete operations in the shortest path.
    // Note that the comparison result is already in the matrix and we only care about identity.
    // TODO we may rather use a configurable threshold than identity (1.0) to catch move operations
    //  of sufficiently similar items (e.g. case-insensitive)
    if (_ComputeMoves)
    {
        int moves = DetectMoves(result, matrix);
        if (moves > 0)
        {
            // adjust score: subtract moves * (deletionCosts + insertionCosts), add moves * moveCosts
            // TODO take moveDistance into account, i.e. penalty depends on distance?
            result.Distance -= (double)moves * (2.0d * _InsertDeleteCosts);
            result.Distance += (double)moves * _MoveCosts;
        }
    }

#if DEBUG
    // Optional text log of the inputs, the tag alignment and the final result.
    // Disabled unless the local flag below is flipped by hand.
    bool log = false;
    if (log)
    {
        // 'using' guarantees the file handle is released even if a write throws.
        using (System.IO.TextWriter logStream = new System.IO.StreamWriter(
            System.IO.Path.GetTempPath() + "/ed.log", false, System.Text.Encoding.UTF8))
        {
            logStream.WriteLine("Source objects:");
            for (int p = 0; p < sourceTokens.Count; ++p)
            {
                logStream.WriteLine("\t{0}:\t{1}", p, sourceTokens[p].ToString());
            }
            logStream.WriteLine();

            logStream.WriteLine("Target objects:");
            for (int p = 0; p < targetTokens.Count; ++p)
            {
                logStream.WriteLine("\t{0}:\t{1}", p, targetTokens[p].ToString());
            }
            logStream.WriteLine();
            logStream.WriteLine();

            if (alignedTags != null)
            {
                logStream.WriteLine("Tag Alignment:");
                foreach (TagAssociation ta in alignedTags)
                {
                    logStream.WriteLine("\t{0}", ta.ToString());
                }
                logStream.WriteLine();
                logStream.WriteLine();
            }

            result.Dump(logStream, "Final ED");
        }
    }
#endif

#if DEBUG
    // write matrix to a temp file in HTML format
    _DumpMatrix = false; // typeof(T) != typeof(char);
    if (_DumpMatrix)
    {
        // The outer 'using' releases the file even if writing the HTML fails part-way.
        using (System.IO.StreamWriter wtr = new System.IO.StreamWriter(
            System.IO.Path.GetTempPath() + "/SimMatrix.html", false, System.Text.Encoding.UTF8))
        {
            System.Web.UI.Html32TextWriter htmlWriter = new System.Web.UI.Html32TextWriter(wtr);

            htmlWriter.WriteFullBeginTag("html");
            htmlWriter.WriteFullBeginTag("body");
            htmlWriter.WriteBeginTag("table");
            htmlWriter.WriteAttribute("border", "1");

            // Index -1 is the caption row/column; 0..Count are the matrix cells.
            for (j = -1; j <= targetTokens.Count; ++j)
            {
                htmlWriter.WriteFullBeginTag("tr");

                for (i = -1; i <= sourceTokens.Count; ++i)
                {
                    htmlWriter.WriteFullBeginTag("td");

                    if (i < 0)
                    {
                        // caption column: target index and token
                        if (j >= 0)
                        {
                            htmlWriter.Write("j={0}", j);
                            if (j > 0)
                            {
                                htmlWriter.WriteFullBeginTag("br");
                                htmlWriter.WriteFullBeginTag("b");
                                htmlWriter.Write(targetTokens[j - 1].ToString());
                                htmlWriter.WriteEndTag("b");
                            }
                        }
                    }
                    else if (j < 0)
                    {
                        // caption row (j < 0 but i >= 0): source index and token
                        htmlWriter.Write("i={0}", i);
                        if (i > 0)
                        {
                            htmlWriter.WriteFullBeginTag("br");
                            htmlWriter.WriteFullBeginTag("b");
                            htmlWriter.Write(sourceTokens[i - 1].ToString());
                            htmlWriter.WriteEndTag("b");
                        }
                    }
                    else
                    {
                        // content cell
                        htmlWriter.Write("d={0}", matrix[i, j].Score);
                        htmlWriter.WriteFullBeginTag("br");
                        htmlWriter.Write("s={0}", matrix[i, j].Similarity);
                        htmlWriter.WriteFullBeginTag("br");
                        htmlWriter.Write("o={0}", matrix[i, j].Operation.ToString());
                    }

                    htmlWriter.WriteEndTag("td");
                }

                htmlWriter.WriteEndTag("tr");
            }

            htmlWriter.WriteEndTag("table");

            htmlWriter.WriteFullBeginTag("h2");
            htmlWriter.Write("Result");
            htmlWriter.WriteEndTag("h2");

            htmlWriter.Write("Score = {0}", result.Distance);

            htmlWriter.WriteFullBeginTag("ol");
            for (i = 0; i < result.Items.Count; ++i)
            {
                htmlWriter.WriteFullBeginTag("li");
                htmlWriter.Write("{0}: s={1} t={2}",
                    result[i].Operation.ToString(),
                    result[i].Source,
                    result[i].Target);
            }
            htmlWriter.WriteEndTag("ol");

            htmlWriter.WriteEndTag("body");
            htmlWriter.WriteEndTag("html");

            htmlWriter.Close();
        }
    }
#endif

    return (result);
}