Example #1
0
        /// <summary>
        /// Computes and returns the edit distance between two sequences of type <typeparamref name="T"/>, using
        /// the similarity computer and cost values specified in the constructor. If <see cref="ComputeMoveOperations"/>
        /// is <c>true</c>, simple moves will be detected. Otherwise, moves will (typically) result in two independent
        /// insert/delete operations.
        /// </summary>
        /// <param name="sourceObjects">The first input sequence ("source")</param>
        /// <param name="targetObjects">The second input sequence ("target")</param>
        /// <param name="precomputedAssociations">A list of precomputed item index associations. If valid, item pairs
        /// in this list will be associated with each other, which will result in either an identity
        /// or a change operation. May be <c>null</c>. The list is handed to <c>SortAndValidate</c>, which
        /// presumably reorders it in place (TODO confirm); if validation fails, the associations are ignored.</param>
        /// <returns>The edit distance between the two sequences</returns>
        /// <exception cref="ArgumentNullException"><paramref name="sourceObjects"/> or
        /// <paramref name="targetObjects"/> is <c>null</c>.</exception>
        /// <exception cref="InvalidOperationException">The cost matrix contains an operation which cannot be
        /// followed while reading out the cheapest path (internal error).</exception>
        public Core.EditDistance.EditDistance ComputeEditDistance(IList <T> sourceObjects,
                                                                  IList <T> targetObjects, List <Core.Pair <int> > precomputedAssociations)
        {
            // "Change" cost used for item pairs which must not be associated because the source item
            // is pre-associated with a different target item. A large constant, so that the
            // insert/delete alternatives normally win the minimum.
            const double invalidAssignmentCosts = 100000.0d;

            if (sourceObjects == null)
            {
                throw new ArgumentNullException("sourceObjects");
            }
            if (targetObjects == null)
            {
                throw new ArgumentNullException("targetObjects");
            }

            int sourceCount = sourceObjects.Count;
            int targetCount = targetObjects.Count;

            if (precomputedAssociations != null &&
                !SortAndValidate(precomputedAssociations, sourceCount, targetCount))
            {
                // invalid associations are reported in debug builds and otherwise silently ignored
                System.Diagnostics.Debug.Assert(false, "Invalid preassignments");
                precomputedAssociations = null;
            }

            // TODO handle special cases (one/both of the arrays being empty/having no elements)
            // TODO use diagonal algorithm

            Core.EditDistance.EditDistance result = new Core.EditDistance.EditDistance(sourceCount, targetCount, 0.0d);

            MatrixItem[,] matrix = new MatrixItem[sourceCount + 1, targetCount + 1];
            int i, j;

            bool usePreassignments = precomputedAssociations != null;

            // initialize the matrix borders: column 0 is reached by deletions only,
            // row 0 by insertions only
            matrix[0, 0] = new MatrixItem(0.0d, EditOperation.Identity, 0.0d);

            for (i = 1; i <= sourceCount; ++i)
            {
                matrix[i, 0] = new MatrixItem((double)i * _InsertDeleteCosts, EditOperation.Delete, 0.0d);
            }

            for (j = 1; j <= targetCount; ++j)
            {
                matrix[0, j] = new MatrixItem((double)j * _InsertDeleteCosts, EditOperation.Insert, 0.0d);
            }

            // populate matrix

            for (i = 1; i <= sourceCount; ++i)
            {
                T s = sourceObjects[i - 1];

                // index of the target item this source item is pre-associated with, or a negative value
                int associatedTarget = usePreassignments
                                        ? GetSourcePreassignment(i - 1, precomputedAssociations)
                                        : -1;

                for (j = 1; j <= targetCount; ++j)
                {
                    T t = targetObjects[j - 1];

                    // Each interior cell is created right before it is filled. Only cells with smaller
                    // indices are read below, so no separate initialization pass is required.
                    matrix[i, j] = new MatrixItem(0.0d, EditOperation.Identity, 0.0d);

                    double similarity;

                    if (associatedTarget < 0 || associatedTarget == j - 1)
                    {
                        // no preassignment or items are correlated - use std sim
                        similarity = _SimilarityComputer(s, t);
                    }
                    else
                    {
                        // there is a correlation with another item - don't allow change/identity
                        similarity = -1.0d;
                    }

                    System.Diagnostics.Debug.Assert((similarity >= 0.0d && similarity <= 1.0d) ||
                                                    similarity == -1.0d);

                    // low similarity means high "change costs" and vice versa:
                    double changeCosts = (similarity < 0)
                                                ? invalidAssignmentCosts
                                                : matrix[i - 1, j - 1].Score + (1.0d - similarity);

                    double insertCosts = matrix[i, j - 1].Score + _InsertDeleteCosts;
                    double deleteCosts = matrix[i - 1, j].Score + _InsertDeleteCosts;

                    double min = Math.Min(Math.Min(changeCosts, deleteCosts), insertCosts);

                    matrix[i, j].Score      = min;
                    matrix[i, j].Similarity = similarity;

                    // on ties, delete is preferred over insert, and insert over change/identity
                    if (min == deleteCosts)
                    {
                        matrix[i, j].Operation = EditOperation.Delete;
                    }
                    else if (min == insertCosts)
                    {
                        matrix[i, j].Operation = EditOperation.Insert;
                    }
                    else if (min == changeCosts)
                    {
                        matrix[i, j].Operation = (similarity == 1.0d)
                                                        ? EditOperation.Identity
                                                        : EditOperation.Change;
                    }
                }
            }

            // readout the cheapest path, walking back from the bottom-right cell to [0, 0]

            i = sourceCount;
            j = targetCount;
            result.Distance = matrix[i, j].Score;

            while (i > 0 || j > 0)
            {
                EditDistanceItem item = new EditDistanceItem();
                item.Resolution = EditDistanceResolution.None;

                item.Operation = matrix[i, j].Operation;
                switch (item.Operation)
                {
                case EditOperation.Identity:
                    --i;
                    --j;
                    item.Costs = 0.0d;
                    break;

                case EditOperation.Change:
                    // read the similarity before moving to the predecessor cell
                    item.Costs = 1.0d - matrix[i, j].Similarity;
                    --i;
                    --j;
                    break;

                case EditOperation.Insert:
                    --j;
                    item.Costs = _InsertDeleteCosts;
                    break;

                case EditOperation.Delete:
                    item.Costs = _InsertDeleteCosts;
                    --i;
                    break;

                default:
                    // any other operation would leave i and j unchanged and loop forever
                    throw new InvalidOperationException("Internal ED computation error");
                }

                // after the decrement, i/j are the 0-based indices of the items the operation refers to
                item.Source = i;
                item.Target = j;
                result.AddAtStart(item);
            }

            // identify move operations which are pairs of insert/delete operations in the shortest path.
            // Note that the comparison result is already in the matrix.
            // TODO take the distance of a move into account when deciding whether to report it

            int moves = 0;

            if (_ComputeMoveOperations)
            {
                // matrix[i, j].Similarity is the cached token similarity between source[i-1] and target[j-1]

                // TODO may need to restrict moves to undisputed corresponding items, i.e. those which
                //  have only one row/column similarity maximum

                for (int index = 0; index < result.Items.Count; ++index)
                {
                    EditOperation op = result[index].Operation;
                    if (op == EditOperation.Delete || op == EditOperation.Insert)
                    {
                        int moveSource       = 0;
                        int moveTarget       = 0;
                        int moveSourceTarget = 0;
                        int moveTargetSource = 0;

                        // search in the remainder of the result list for a "compensating" operation
                        int comp = 0;
                        for (comp = index + 1; comp < result.Items.Count; ++comp)
                        {
                            if (result[comp].Operation == EditOperation.Insert &&
                                op == EditOperation.Delete &&
                                matrix[result[index].Source + 1, result[comp].Target + 1].Similarity >= _SimThreshold)
                            {
                                // source[result[index].Source] was deleted
                                // target[result[comp].Target] was inserted
                                moveSource       = result[index].Source;
                                moveSourceTarget = result[index].Target;

                                moveTarget       = result[comp].Target;
                                moveTargetSource = result[comp].Source;
                                break;
                            }
                            else if (result[comp].Operation == EditOperation.Delete &&
                                     op == EditOperation.Insert &&
                                     matrix[result[comp].Source + 1, result[index].Target + 1].Similarity >= _SimThreshold)
                            {
                                // source[result[comp].Source] was deleted
                                // target[result[index].Target] was inserted
                                moveSource       = result[comp].Source;
                                moveSourceTarget = result[comp].Target;

                                moveTarget       = result[index].Target;
                                moveTargetSource = result[index].Source;
                                break;
                            }
                        }

                        // TODO take moveDistance into account, i.e. penalty depends on distance? Avoids
                        //  long-distance moves.

                        if (comp < result.Items.Count)
                        {
                            // compensating operation found: the first operation of the pair becomes the
                            // move, the second one is dropped from the result
                            // TODO backtrack to find other compensating items?
                            EditDistanceItem item = result[index];
                            item.Operation        = EditOperation.Move;
                            item.Source           = moveSource;
                            item.Target           = moveTarget;
                            item.MoveSourceTarget = moveSourceTarget;
                            item.MoveTargetSource = moveTargetSource;
                            // TODO update item similarity
                            result.Items[index] = item;
                            result.Items.RemoveAt(comp);
                            ++moves;
                        }
                    }
                }
            }

            if (moves > 0)
            {
                // adjust score: subtract moves * (deletionCosts + insertionCosts), add moves * moveCosts
                // TODO take moveDistance into account, i.e. penalty depends on distance?
                result.Distance -= (double)moves * (2.0d * _InsertDeleteCosts);
                result.Distance += (double)moves * _MoveCosts;
            }

#if DEBUG && !SILVERLIGHT
            _DumpMatrix = false;             //  typeof(T) != typeof(char);
            if (_DumpMatrix)
            {
                // in debug mode, write matrix to a temp file in HTML format
                string dumpPath = System.IO.Path.Combine(System.IO.Path.GetTempPath(), "SimMatrix.html");

                // the using block releases the file handle even if one of the writes below throws
                using (System.IO.StreamWriter wtr = new System.IO.StreamWriter(dumpPath, false, System.Text.Encoding.UTF8))
                {
                    System.Web.UI.Html32TextWriter htmlWriter = new System.Web.UI.Html32TextWriter(wtr);

                    htmlWriter.WriteFullBeginTag("html");
                    htmlWriter.WriteFullBeginTag("body");
                    htmlWriter.WriteBeginTag("table");
                    htmlWriter.WriteAttribute("border", "1");

                    // row/column -1 hold the captions; source items run horizontally, target items vertically
                    for (j = -1; j <= targetCount; ++j)
                    {
                        htmlWriter.WriteFullBeginTag("tr");

                        for (i = -1; i <= sourceCount; ++i)
                        {
                            htmlWriter.WriteFullBeginTag("td");

                            if (i < 0)
                            {
                                // caption column
                                if (j >= 0)
                                {
                                    htmlWriter.Write("j={0}", j);
                                    if (j > 0)
                                    {
                                        htmlWriter.WriteFullBeginTag("br");
                                        htmlWriter.WriteFullBeginTag("b");
                                        htmlWriter.Write(targetObjects[j - 1].ToString());
                                        htmlWriter.WriteEndTag("b");
                                    }
                                }
                            }
                            else if (j < 0)
                            {
                                // caption row: j < 0 but i >= 0
                                htmlWriter.Write("i={0}", i);
                                if (i > 0)
                                {
                                    htmlWriter.WriteFullBeginTag("br");
                                    htmlWriter.WriteFullBeginTag("b");
                                    htmlWriter.Write(sourceObjects[i - 1].ToString());
                                    htmlWriter.WriteEndTag("b");
                                }
                            }
                            else
                            {
                                // content cell
                                htmlWriter.Write("d={0}", matrix[i, j].Score);
                                htmlWriter.WriteFullBeginTag("br");
                                htmlWriter.Write("s={0}", matrix[i, j].Similarity);
                                htmlWriter.WriteFullBeginTag("br");
                                htmlWriter.Write("o={0}", matrix[i, j].Operation.ToString());
                            }

                            htmlWriter.WriteEndTag("td");
                        }

                        htmlWriter.WriteEndTag("tr");
                    }

                    htmlWriter.WriteEndTag("table");

                    htmlWriter.WriteFullBeginTag("h2");
                    htmlWriter.Write("Result");
                    htmlWriter.WriteEndTag("h2");

                    htmlWriter.Write("Score = {0}", result.Distance);

                    htmlWriter.WriteFullBeginTag("ol");

                    for (i = 0; i < result.Items.Count; ++i)
                    {
                        htmlWriter.WriteFullBeginTag("li");
                        htmlWriter.Write("{0}: s={1} t={2}",
                                         result[i].Operation.ToString(), result[i].Source, result[i].Target);
                    }

                    htmlWriter.WriteEndTag("ol");

                    htmlWriter.WriteEndTag("body");
                    htmlWriter.WriteEndTag("html");

                    htmlWriter.Close();
                }
            }
#endif

            return(result);
        }
        /// <summary>
        /// Computes the edit distance between two token sequences with the "classic" matrix approach:
        /// paired tags are aligned first, the cost matrix is computed (fully if tags were aligned, lazily
        /// otherwise), the cheapest path is read out, tag actions are fixed and, if move computation is
        /// enabled, insert/delete pairs are collapsed into moves.
        /// </summary>
        /// <param name="sourceTokens">The source token sequence. Must not be <c>null</c>.</param>
        /// <param name="targetTokens">The target token sequence. Must not be <c>null</c>.</param>
        /// <param name="computeDiagonalOnly">Passed to the similarity matrix computation when the full
        /// matrix computation is enforced. That switch is currently hard-wired to <c>false</c>, so the
        /// value has no effect at present.</param>
        /// <param name="disabledAutoSubstitutions">Recognizer flags which are forwarded to the similarity
        /// matrix and to the token similarity computation.</param>
        /// <param name="alignedTags">Receives the result of the paired tag alignment. May be <c>null</c>
        /// or empty; both cases are treated as "no aligned tags" by this method.</param>
        /// <returns>The edit distance between the two token sequences</returns>
        /// <exception cref="ArgumentNullException"><paramref name="sourceTokens"/> or
        /// <paramref name="targetTokens"/> is <c>null</c>.</exception>
        /// <exception cref="InvalidOperationException">The cost matrix contains a cell whose operation
        /// cannot be followed while reading out the cheapest path (internal error).</exception>
        private Core.EditDistance.EditDistance ComputeEditDistanceImpl_Original(
            IList <Core.Tokenization.Token> sourceTokens,
            IList <Core.Tokenization.Token> targetTokens,
            bool computeDiagonalOnly,
            Core.Tokenization.BuiltinRecognizers disabledAutoSubstitutions,
            out TagAssociations alignedTags)
        {
            /*
             * The "classic" ED approach has the problem that it doesn't detect moves
             * reliably, particularly block moves. Patching up insert/delete pairs as
             * moves also won't catch moves which appear as changes in the ED.
             */

            if (sourceTokens == null)
            {
                throw new ArgumentNullException("sourceTokens");
            }
            if (targetTokens == null)
            {
                throw new ArgumentNullException("targetTokens");
            }

            alignedTags = null;

            int i, j;

            // TODO handle special cases (one/both of the arrays being empty/having no elements)
            // TODO use diagonal algorithm

            // debugging switch: set to true to always compute the complete similarity and cost matrices
            bool enforceFullMatrixComputation = false;

            Core.EditDistance.EditDistance result =
                new Core.EditDistance.EditDistance(sourceTokens.Count, targetTokens.Count, 0.0d);

            // matrix which captures the similarity between two tokens as well as preassignments
            SimilarityMatrix sim = new SimilarityMatrix(sourceTokens, targetTokens,
                                                        _UseStringEditDistance, disabledAutoSubstitutions);

            if (enforceFullMatrixComputation)
            {
                // this will be fully computed by the tag aligner in most cases, but we may save a bit
                // on plain text segments
                sim.Compute(computeDiagonalOnly);
            }

            MatrixItem[,] matrix = CreateEditDistanceMatrix(sourceTokens, targetTokens);

            alignedTags = TagAligner.AlignPairedTags(sourceTokens, targetTokens, sim);
            if (alignedTags != null && alignedTags.Count > 0)
            {
                // Patch the sim matrix so that non-aligned tags can't be assigned to each other
                PatchSimilarityMatrix(sim, sourceTokens, targetTokens, alignedTags);
                ComputeEditDistanceMatrix_Full(matrix, sim, alignedTags);
            }
            else if (enforceFullMatrixComputation)
            {
                ComputeEditDistanceMatrix_Full(matrix, sim, alignedTags);
            }
            else
            {
                ComputeEditDistanceMatrix_Lazy(matrix, sim);
            }

            // readout the cheapest path, walking back from the bottom-right cell to [0, 0]

            i = sourceTokens.Count;
            j = targetTokens.Count;
            result.Distance = matrix[i, j].Score;

            while (i > 0 || j > 0)
            {
                EditDistanceItem item = new EditDistanceItem();
                item.Resolution = EditDistanceResolution.None;

                MatrixItem m = matrix[i, j];

                item.Operation = m.Operation;

                switch (item.Operation)
                {
                case EditOperation.Identity:
                    item.Costs = 0.0d;
                    --i;
                    --j;
                    break;

                case EditOperation.Change:
                    // without string edit distance the token similarity is computed again
                    // instead of using the value cached in the matrix cell
                    item.Costs = _UseStringEditDistance
                                                ? (1.0d - m.Similarity)
                                                : (1.0d - SimilarityComputers.GetTokenSimilarity(sourceTokens[i - 1], targetTokens[j - 1],
                                                                                                 true, disabledAutoSubstitutions));
                    --i;
                    --j;
                    break;

                case EditOperation.Insert:
                    item.Costs = _InsertDeleteCosts;
                    --j;
                    break;

                case EditOperation.Delete:
                    item.Costs = _InsertDeleteCosts;
                    --i;
                    break;

                default:
                    // EditOperation.Undefined or any other operation: the path cannot be followed.
                    // Without this branch neither index would change and the loop would never end.
                    throw new InvalidOperationException("Internal ED computation error");
                }

                // after the decrement, i/j are the 0-based indices of the tokens the operation refers to
                item.Source = i;
                item.Target = j;
                result.AddAtStart(item);
            }

            if (alignedTags != null && alignedTags.Count > 0)
            {
                // should happen before move detection
                FixTagActions(sourceTokens, targetTokens, result, alignedTags);
            }

            // identify move operations which are pairs of insert/delete operations in the shortest path.
            // Note that the comparison result is already in the matrix.

            if (_ComputeMoves)
            {
                int moves = DetectMoves(result, matrix);
                if (moves > 0)
                {
                    // adjust score: subtract moves * (deletionCosts + insertionCosts), add moves * moveCosts
                    // TODO take moveDistance into account, i.e. penalty depends on distance?
                    result.Distance -= (double)moves * (2.0d * _InsertDeleteCosts);
                    result.Distance += (double)moves * _MoveCosts;
                }
            }

#if DEBUG
            // debugging switch: set to true to write the input tokens, the tag alignment and the
            // final result to a log file in the temp folder
            bool log = false;
            if (log)
            {
                string logPath = System.IO.Path.Combine(System.IO.Path.GetTempPath(), "ed.log");

                // the using block releases the file handle even if one of the writes below throws
                using (System.IO.TextWriter logStream = new System.IO.StreamWriter(logPath, false, System.Text.Encoding.UTF8))
                {
                    logStream.WriteLine("Source objects:");
                    for (int p = 0; p < sourceTokens.Count; ++p)
                    {
                        logStream.WriteLine("\t{0}:\t{1}", p, sourceTokens[p].ToString());
                    }
                    logStream.WriteLine();
                    logStream.WriteLine("Target objects:");
                    for (int p = 0; p < targetTokens.Count; ++p)
                    {
                        logStream.WriteLine("\t{0}:\t{1}", p, targetTokens[p].ToString());
                    }
                    logStream.WriteLine();
                    logStream.WriteLine();

                    if (alignedTags != null)
                    {
                        logStream.WriteLine("Tag Alignment:");
                        foreach (TagAssociation ta in alignedTags)
                        {
                            logStream.WriteLine("\t{0}", ta.ToString());
                        }
                        logStream.WriteLine();
                        logStream.WriteLine();
                    }

                    result.Dump(logStream, "Final ED");
                }
            }
#endif

#if DEBUG
            // write matrix to a temp file in HTML format
            _DumpMatrix = false;             //  typeof(T) != typeof(char);
            if (_DumpMatrix)
            {
                string dumpPath = System.IO.Path.Combine(System.IO.Path.GetTempPath(), "SimMatrix.html");

                // the using block releases the file handle even if one of the writes below throws
                using (System.IO.StreamWriter wtr = new System.IO.StreamWriter(dumpPath, false, System.Text.Encoding.UTF8))
                {
                    System.Web.UI.Html32TextWriter htmlWriter = new System.Web.UI.Html32TextWriter(wtr);

                    htmlWriter.WriteFullBeginTag("html");
                    htmlWriter.WriteFullBeginTag("body");
                    htmlWriter.WriteBeginTag("table");
                    htmlWriter.WriteAttribute("border", "1");

                    // row/column -1 hold the captions; source tokens run horizontally, target tokens vertically
                    for (j = -1; j <= targetTokens.Count; ++j)
                    {
                        htmlWriter.WriteFullBeginTag("tr");

                        for (i = -1; i <= sourceTokens.Count; ++i)
                        {
                            htmlWriter.WriteFullBeginTag("td");

                            if (i < 0)
                            {
                                // caption column
                                if (j >= 0)
                                {
                                    htmlWriter.Write("j={0}", j);
                                    if (j > 0)
                                    {
                                        htmlWriter.WriteFullBeginTag("br");
                                        htmlWriter.WriteFullBeginTag("b");
                                        htmlWriter.Write(targetTokens[j - 1].ToString());
                                        htmlWriter.WriteEndTag("b");
                                    }
                                }
                            }
                            else if (j < 0)
                            {
                                // caption row: j < 0 but i >= 0
                                htmlWriter.Write("i={0}", i);
                                if (i > 0)
                                {
                                    htmlWriter.WriteFullBeginTag("br");
                                    htmlWriter.WriteFullBeginTag("b");
                                    htmlWriter.Write(sourceTokens[i - 1].ToString());
                                    htmlWriter.WriteEndTag("b");
                                }
                            }
                            else
                            {
                                // content cell
                                htmlWriter.Write("d={0}", matrix[i, j].Score);
                                htmlWriter.WriteFullBeginTag("br");
                                htmlWriter.Write("s={0}", matrix[i, j].Similarity);
                                htmlWriter.WriteFullBeginTag("br");
                                htmlWriter.Write("o={0}", matrix[i, j].Operation.ToString());
                            }

                            htmlWriter.WriteEndTag("td");
                        }

                        htmlWriter.WriteEndTag("tr");
                    }

                    htmlWriter.WriteEndTag("table");

                    htmlWriter.WriteFullBeginTag("h2");
                    htmlWriter.Write("Result");
                    htmlWriter.WriteEndTag("h2");

                    htmlWriter.Write("Score = {0}", result.Distance);

                    htmlWriter.WriteFullBeginTag("ol");

                    for (i = 0; i < result.Items.Count; ++i)
                    {
                        htmlWriter.WriteFullBeginTag("li");
                        htmlWriter.Write("{0}: s={1} t={2}",
                                         result[i].Operation.ToString(), result[i].Source, result[i].Target);
                    }

                    htmlWriter.WriteEndTag("ol");

                    htmlWriter.WriteEndTag("body");
                    htmlWriter.WriteEndTag("html");

                    htmlWriter.Close();
                }
            }
#endif

            return(result);
        }