コード例 #1
0
        /// <summary>
        /// Computes a similarity score for two strings.
        /// </summary>
        /// <param name="a">The first string. Must not be <c>null</c>.</param>
        /// <param name="b">The second string. Must not be <c>null</c>.</param>
        /// <returns>
        /// <c>1.0</c> if both strings are identical under an ordinal comparison,
        /// <c>0.95</c> if they are equal under an ordinal, case-insensitive comparison,
        /// and otherwise the <c>Score</c> of the character-level edit distance computed
        /// with <c>GetCharSimilarity</c> as the per-character similarity function.
        /// </returns>
        /// <exception cref="ArgumentNullException">
        /// <paramref name="a"/> or <paramref name="b"/> is <c>null</c>.
        /// </exception>
        public static double GetStringSimilarity(string a, string b)
        {
            // Fail fast with a descriptive exception instead of a NullReferenceException
            // further down (a.Equals / b.ToCharArray); this matches the argument checks
            // used by the edit distance computers.
            if (a == null)
            {
                throw new ArgumentNullException("a");
            }
            if (b == null)
            {
                throw new ArgumentNullException("b");
            }

            // exact match - no need to run the edit distance
            if (a.Equals(b, StringComparison.Ordinal))
            {
                return(1.0d);
            }

            // strings differ only by case - fixed, slightly reduced score
            if (a.Equals(b, StringComparison.OrdinalIgnoreCase))
            {
                return(0.95d);
            }

            // general case: character-level edit distance
            Lingua.EditDistance.EditDistanceComputer <char> edc
                = new Lingua.EditDistance.EditDistanceComputer <char>(GetCharSimilarity);

            Core.EditDistance.EditDistance result
                = edc.ComputeEditDistance(a.ToCharArray(), b.ToCharArray());

            return(result.Score);
        }
コード例 #2
0
        /// <summary>
        /// Computes the edit distance between two token sequences by delegating to
        /// <c>ComputeEditDistanceImpl_Original</c>. In DEBUG builds the resulting item
        /// list is additionally checked with <c>VerifyEditDistance</c>.
        /// </summary>
        /// <param name="sourceTokens">The source token sequence.</param>
        /// <param name="targetTokens">The target token sequence.</param>
        /// <param name="computeDiagonalOnly">If number of tokens is equivalent, only the diagonal's similarities are computed.</param>
        /// <param name="disabledAutoSubstitutions">Passed through unchanged to the underlying implementation.</param>
        /// <param name="alignedTags">Receives the tag associations produced by the underlying implementation.</param>
        /// <returns>The edit distance returned by the underlying implementation.</returns>
        public Core.EditDistance.EditDistance ComputeEditDistance(
            IList <Core.Tokenization.Token> sourceTokens,
            IList <Core.Tokenization.Token> targetTokens,
            bool computeDiagonalOnly,
            Core.Tokenization.BuiltinRecognizers disabledAutoSubstitutions,
            out TagAssociations alignedTags)
        {
            Core.EditDistance.EditDistance distance = ComputeEditDistanceImpl_Original(
                sourceTokens,
                targetTokens,
                computeDiagonalOnly,
                disabledAutoSubstitutions,
                out alignedTags);

#if DEBUG
            // Debug-only sanity check of the computed edit script; the assertion
            // fires only when the verification fails.
            bool verified = VerifyEditDistance(distance.Items, sourceTokens.Count, targetTokens.Count);
            System.Diagnostics.Debug.Assert(verified, "ED error - let Oli know and provide test data");
#endif
            return distance;
        }
コード例 #3
0
        /// <summary>
        /// Computes and returns the edit distance between two sequences of type <typeparamref name="T"/>, using
        /// the similarity computer and cost values specified in the constructor. If <see cref="ComputeMoveOperations"/>
        /// is <c>true</c>, simple moves will be detected. Otherwise, moves will (typically) result in two independent
        /// insert/delete operations.
        /// </summary>
        /// <remarks>
        /// The method works in three phases:
        /// 1. It fills a (source count + 1) x (target count + 1) cost matrix. When several operations
        ///    have the same minimal cost, delete is preferred over insert, and insert over change/identity.
        /// 2. It walks back from the bottom-right cell to the origin and prepends one edit item per step.
        /// 3. If move detection is enabled, a delete and a later insert (or vice versa) whose cached
        ///    similarity reaches <c>_SimThreshold</c> are merged into a single move item and the
        ///    distance is adjusted accordingly.
        /// </remarks>
        /// <param name="sourceObjects">The first input sequence ("source")</param>
        /// <param name="targetObjects">The second input sequence ("target")</param>
        /// <param name="precomputedAssociations">A list of precomputed item index associations. If valid, item pairs
        /// in this list will be associated with each other, which will result in either an identity
        /// or a change operation. May be <c>null</c>. If the list does not pass validation it is ignored
        /// (after a debug assertion).</param>
        /// <returns>The edit distance between the two sequences</returns>
        /// <exception cref="ArgumentNullException"><paramref name="sourceObjects"/> or
        /// <paramref name="targetObjects"/> is <c>null</c>.</exception>
        public Core.EditDistance.EditDistance ComputeEditDistance(IList <T> sourceObjects,
                                                                  IList <T> targetObjects, List <Core.Pair <int> > precomputedAssociations)
        {
            // cost used for a change between items that must not be paired because one of
            // them is preassigned to a different item; large enough to never be the minimum
            // for realistic input sizes
            const double invalidAssignmentCosts = 100000.0d;

#if DEBUG
            // NOTE(review): intentionally empty placeholder - has no effect
            if (typeof(T) != typeof(char))
            {
            }
#endif
            if (sourceObjects == null)
            {
                throw new ArgumentNullException("sourceObjects");
            }
            if (targetObjects == null)
            {
                throw new ArgumentNullException("targetObjects");
            }

            if (precomputedAssociations != null)
            {
                // NOTE: judging by its name, SortAndValidate may reorder the caller's list in place - TODO confirm
                if (!SortAndValidate(precomputedAssociations, sourceObjects.Count, targetObjects.Count))
                {
                    // invalid preassignments are reported in debug builds and otherwise silently dropped
                    System.Diagnostics.Debug.Assert(false, "Invalid preassignments");
                    precomputedAssociations = null;
                }
            }

            // TODO handle special cases (one/both of the arrays being empty/having no elements)
            // TODO use diagonal algorithm

            Core.EditDistance.EditDistance result = new Core.EditDistance.EditDistance(sourceObjects.Count, targetObjects.Count, 0.0d);

            // matrix[i, j] describes the cheapest way to transform the first i source items
            // into the first j target items; row/column 0 represent the empty prefix
            MatrixItem[,] matrix = new MatrixItem[sourceObjects.Count + 1, targetObjects.Count + 1];
            int i, j;

            bool usePreassignments = precomputedAssociations != null;

            // initialize matrix
            matrix[0, 0] = new MatrixItem(0.0d, Core.EditDistance.EditOperation.Identity, 0.0d);

            // first column: delete all i source items
            for (i = 1; i <= sourceObjects.Count; ++i)
            {
                matrix[i, 0] = new MatrixItem((double)i * _InsertDeleteCosts, Core.EditDistance.EditOperation.Delete, 0.0d);
            }

            // first row: insert all j target items
            for (j = 1; j <= targetObjects.Count; ++j)
            {
                matrix[0, j] = new MatrixItem((double)j * _InsertDeleteCosts, Core.EditDistance.EditOperation.Insert, 0.0d);
            }

            // inner cells get a neutral default; they are overwritten in the population loop below
            for (i = 1; i <= sourceObjects.Count; ++i)
            {
                for (j = 1; j <= targetObjects.Count; ++j)
                {
                    matrix[i, j] = new MatrixItem(0.0d, Core.EditDistance.EditOperation.Identity, 0.0d);
                }
            }

            // populate matrix

            for (i = 1; i <= sourceObjects.Count; ++i)
            {
                T s = sourceObjects[i - 1];

                // target index preassigned to source item i-1; a negative value is treated
                // below as "no preassignment for this source item"
                int associatedTarget = usePreassignments
                                        ? GetSourcePreassignment(i - 1, precomputedAssociations)
                                        : -1;

                for (j = 1; j <= targetObjects.Count; ++j)
                {
                    T t = targetObjects[j - 1];

                    double similarity = 0.0d;

                    if (associatedTarget < 0 || associatedTarget == j - 1)
                    {
                        // no preassignment or items are correlated - use std sim
                        similarity = _SimilarityComputer(s, t);
                    }
                    else
                    {
                        // there is a correlation with another item - don't allow change/identity
                        similarity = -1.0d;
                    }

                    // the similarity computer is expected to return a value in [0, 1];
                    // -1 is the internal "pairing not allowed" marker set above
                    System.Diagnostics.Debug.Assert((similarity >= 0.0d && similarity <= 1.0d) ||
                                                    similarity == -1.0d);

                    // low similarity means high "change costs" and vice versa:
                    double changeCosts = (similarity < 0)
                                                ? invalidAssignmentCosts
                                                : matrix[i - 1, j - 1].Score + (1.0d - similarity);

                    double insertCosts = matrix[i, j - 1].Score + _InsertDeleteCosts;
                    double deleteCosts = matrix[i - 1, j].Score + _InsertDeleteCosts;

                    double min = Math.Min(Math.Min(changeCosts, deleteCosts), insertCosts);

                    matrix[i, j].Score      = min;
                    matrix[i, j].Similarity = similarity;

                    // min is exactly one of the three candidates, so the equality tests are safe.
                    // The order of the tests defines the tie-break: delete, then insert, then change.
                    if (min == deleteCosts)
                    {
                        matrix[i, j].Operation = Core.EditDistance.EditOperation.Delete;
                    }
                    else if (min == insertCosts)
                    {
                        matrix[i, j].Operation = Core.EditDistance.EditOperation.Insert;
                    }
                    else if (min == changeCosts)
                    {
                        // a "change" between fully similar items is reported as identity
                        if (similarity == 1.0d)
                        {
                            matrix[i, j].Operation = Core.EditDistance.EditOperation.Identity;
                        }
                        else
                        {
                            matrix[i, j].Operation = Core.EditDistance.EditOperation.Change;
                        }
                    }
                }
            }

            // readout the cheapest path

            // start at the bottom-right cell (full source vs. full target) and follow the
            // stored operations back to the origin
            i = sourceObjects.Count;
            j = targetObjects.Count;
            result.Distance = matrix[i, j].Score;

            while (i > 0 || j > 0)
            {
                EditDistanceItem item = new EditDistanceItem();
                item.Resolution = EditDistanceResolution.None;

                item.Operation = matrix[i, j].Operation;
                switch (item.Operation)
                {
                case EditOperation.Identity:
                    --i;
                    --j;
                    item.Costs = 0.0d;
                    break;

                case EditOperation.Change:
                    // read the cell's similarity before stepping back to the diagonal predecessor
                    item.Costs = 1.0d - matrix[i, j].Similarity;
                    --i;
                    --j;
                    break;

                case EditOperation.Insert:
                    --j;
                    item.Costs = _InsertDeleteCosts;
                    break;

                case EditOperation.Delete:
                    item.Costs = _InsertDeleteCosts;
                    --i;
                    break;
                }

                // i and j have already been decremented, so these are the 0-based positions
                // reached after taking the step
                item.Source = i;
                item.Target = j;
                // presumably prepends the item so that the final list is in forward order - TODO confirm
                result.AddAtStart(item);
            }

            // identify move operations which are pairs of insert/delete operations in the shortest path.
            // Note that the comparison result is already cached in the matrix; a pair qualifies when
            // its similarity is at least _SimThreshold.

            int moves = 0;

            // try to detect moves in case the move penalty is smaller than the sum of insert/delete penalties
            if (_ComputeMoveOperations)
            {
                // matrix[i, j].Similarity is the cached token similarity between source[i-1] and target[j-1]

                // TODO may need to restrict moves to undisputed corresponding items, i.e. those which
                //  have only one row/column similarity maximum

                for (int index = 0; index < result.Items.Count; ++index)
                {
                    EditOperation op = result[index].Operation;
                    if (op == EditOperation.Delete || op == EditOperation.Insert)
                    {
                        int moveSource       = 0;
                        int moveTarget       = 0;
                        int moveSourceTarget = 0;
                        int moveTargetSource = 0;

                        // search in the remainder of the result list for a "compensating" operation
                        // (the first matching one wins; +1 converts item positions to matrix indices)
                        int comp = 0;
                        for (comp = index + 1; comp < result.Items.Count; ++comp)
                        {
                            if (result[comp].Operation == EditOperation.Insert &&
                                op == EditOperation.Delete &&
                                matrix[result[index].Source + 1, result[comp].Target + 1].Similarity >= _SimThreshold)
                            {
                                // source[result[index].Source] was deleted
                                // target[result[comp].Target] was inserted
                                moveSource       = result[index].Source;
                                moveSourceTarget = result[index].Target;

                                moveTarget       = result[comp].Target;
                                moveTargetSource = result[comp].Source;
                                break;
                            }
                            else if (result[comp].Operation == EditOperation.Delete &&
                                     op == EditOperation.Insert &&
                                     matrix[result[comp].Source + 1, result[index].Target + 1].Similarity >= _SimThreshold)
                            {
                                // source[result[comp].Source] was deleted
                                // target[result[index].Target] was inserted
                                moveSource       = result[comp].Source;
                                moveSourceTarget = result[comp].Target;

                                moveTarget       = result[index].Target;
                                moveTargetSource = result[index].Source;
                                break;
                            }
                        }

                        // TODO take moveDistance into account, i.e. penalty depends on distance? Avoids
                        //  long-distance moves.

                        // comp only stays below Count if the inner loop was left via break
                        if (comp < result.Items.Count)
                        {
                            // compensating operation found
                            // TODO backtrack to find other compensating items?
                            EditDistanceItem item = result[index];
                            item.Operation        = EditOperation.Move;
                            item.Source           = moveSource;
                            item.Target           = moveTarget;
                            item.MoveSourceTarget = moveSourceTarget;
                            item.MoveTargetSource = moveTargetSource;
                            // TODO update item similarity
                            // the item at 'index' becomes the move; the compensating item is dropped.
                            // comp > index, so removing it does not disturb the outer loop position.
                            result.Items[index] = item;
                            result.Items.RemoveAt(comp);
                            ++moves;
                        }
                    }
                }
            }

            if (moves > 0)
            {
                // adjust score: subtract moves * (deletionCosts + insertionCosts), add moves * moveCosts
                // TODO take moveDistance into account, i.e. penalty depends on distance?
                result.Distance -= (double)moves * (2.0d * _InsertDeleteCosts);
                result.Distance += (double)moves * _MoveCosts;
            }

#if DEBUG && !SILVERLIGHT
            // The dump is hard-wired off: the flag is reset to false on every call in these builds.
            _DumpMatrix = false;             //  typeof(T) != typeof(char);
            if (_DumpMatrix)
            {
                // in debug mode, write matrix to a temp file in HTML format
                // NOTE(review): the result of the following call is discarded; the variable is
                //  read again when the file path is built on the next statement
                System.Environment.GetEnvironmentVariable("TEMP");
                // NOTE(review): the writers are not guarded by using/try-finally; they are only
                //  released by the Close() call at the end of this block
                System.IO.StreamWriter wtr = new System.IO.StreamWriter(System.Environment.GetEnvironmentVariable("TEMP") + "/SimMatrix.html",
                                                                        false, System.Text.Encoding.UTF8);
                System.Web.UI.Html32TextWriter htmlWriter = new System.Web.UI.Html32TextWriter(wtr);

                htmlWriter.WriteFullBeginTag("html");
                htmlWriter.WriteFullBeginTag("body");
                htmlWriter.WriteBeginTag("table");
                htmlWriter.WriteAttribute("border", "1");

                // one table row per target position (j) and one column per source position (i);
                // index -1 is the caption row/column
                for (j = -1; j <= targetObjects.Count; ++j)
                {
                    htmlWriter.WriteFullBeginTag("tr");

                    for (i = -1; i <= sourceObjects.Count; ++i)
                    {
                        htmlWriter.WriteFullBeginTag("td");

                        if (i < 0)
                        {
                            // caption column: target index and target item (top-left corner stays empty)
                            if (j >= 0)
                            {
                                htmlWriter.Write("j={0}", j);
                                if (j > 0)
                                {
                                    htmlWriter.WriteFullBeginTag("br");
                                    htmlWriter.WriteFullBeginTag("b");
                                    htmlWriter.Write(targetObjects[j - 1].ToString());
                                    htmlWriter.WriteEndTag("b");
                                }
                            }
                        }
                        else if (j < 0)
                        {
                            // j < 0 but i >= 0 --> caption row: source index and source item
                            htmlWriter.Write("i={0}", i);
                            if (i > 0)
                            {
                                htmlWriter.WriteFullBeginTag("br");
                                htmlWriter.WriteFullBeginTag("b");
                                htmlWriter.Write(sourceObjects[i - 1].ToString());
                                htmlWriter.WriteEndTag("b");
                            }
                        }
                        else
                        {
                            // content cell: score (d), similarity (s) and operation (o)
                            htmlWriter.Write("d={0}", matrix[i, j].Score);
                            htmlWriter.WriteFullBeginTag("br");
                            htmlWriter.Write("s={0}", matrix[i, j].Similarity);
                            htmlWriter.WriteFullBeginTag("br");
                            htmlWriter.Write("o={0}", matrix[i, j].Operation.ToString());
                        }

                        htmlWriter.WriteEndTag("td");
                    }

                    htmlWriter.WriteEndTag("tr");
                }

                htmlWriter.WriteEndTag("table");

                htmlWriter.WriteFullBeginTag("h2");
                htmlWriter.Write("Result");
                htmlWriter.WriteEndTag("h2");

                htmlWriter.Write("Score = {0}", result.Distance);

                htmlWriter.WriteFullBeginTag("ol");

                // list the final edit items (the <li> elements are not explicitly closed)
                for (i = 0; i < result.Items.Count; ++i)
                {
                    htmlWriter.WriteFullBeginTag("li");
                    htmlWriter.Write("{0}: s={1} t={2}",
                                     result[i].Operation.ToString(), result[i].Source, result[i].Target);
                }

                htmlWriter.WriteEndTag("ol");

                htmlWriter.WriteEndTag("body");
                htmlWriter.WriteEndTag("html");

                htmlWriter.Close();
            }
#endif

            return(result);
        }
コード例 #4
0
        /// <summary>
        /// If the tag alignment suggests action "Change", but the ED can't find this,
        /// we need to patch the corresponding ED item for the corresponding start or end tag as well.
        /// </summary>
        /// <remarks>
        /// This method is currently disabled: it returns immediately and has no effect.
        /// Everything after the initial <c>return</c> is an unreachable draft, kept for reference.
        /// The draft records, per tag association, the edit operation found for the start and end
        /// tag on the source and target side (phase 1) and then asserts that these operations are
        /// compatible with each other (phase 2).
        /// </remarks>
        /// <param name="sourceTokens">The source tokens; not referenced in the method body.</param>
        /// <param name="targetTokens">The target tokens; not referenced in the method body.</param>
        /// <param name="result">The edit distance whose items the draft code inspects.</param>
        /// <param name="tagAlignment">The tag associations the draft code looks up by source/target position.</param>
        private void FixTagActions(IList <Core.Tokenization.Token> sourceTokens,
                                   IList <Core.Tokenization.Token> targetTokens,
                                   Core.EditDistance.EditDistance result, TagAssociations tagAlignment)
        {
            // not yet working
            // NOTE(review): everything below this return is unreachable
            return;

            // NOTE a single ED item may suggest "D" or "I" for a tag. This does not necessarily conflict
            //  with a "C" suggested by the alignment, as a following ED item may suggest
            //  a compensating "I" or "D" which would result in a "M", which is still
            //  compatible with the alignment's "C".

            TagAssociation srcAssoc = null;
            TagAssociation trgAssoc = null;

            // phase 1: store each ED item's operation on the start/end tag of the association
            // found at the item's source and/or target position

            foreach (EditDistanceItem edi in result.Items)
            {
                switch (edi.Operation)
                {
                case EditOperation.Identity:
                case EditOperation.Change:
                    // both sides take part in the operation, so both are looked up
                    srcAssoc = tagAlignment.GetBySourcePosition(edi.Source);
                    trgAssoc = tagAlignment.GetByTargetPosition(edi.Target);
                    if (srcAssoc != null || trgAssoc != null)
                    {
                        // assignment is only valid between tags
                        System.Diagnostics.Debug.Assert(srcAssoc != null && trgAssoc != null);
                        // should also be the same association, otherwise incompatible tags are
                        //  associated
                        System.Diagnostics.Debug.Assert(object.ReferenceEquals(srcAssoc, trgAssoc));
                        System.Diagnostics.Debug.Assert(srcAssoc.SourceTag != null && srcAssoc.TargetTag != null);

                        if (edi.Source == srcAssoc.SourceTag.Start)
                        {
                            // each tag operation is expected to be assigned at most once
                            System.Diagnostics.Debug.Assert(srcAssoc.SourceTag.StartTagOperation == EditOperation.Undefined);
                            System.Diagnostics.Debug.Assert(srcAssoc.TargetTag.StartTagOperation == EditOperation.Undefined);
                            // source tag start position
                            srcAssoc.SourceTag.StartTagOperation = edi.Operation;
                            srcAssoc.TargetTag.StartTagOperation = edi.Operation;
                        }
                        else
                        {
                            // not the start position, so it is expected to be the end position
                            System.Diagnostics.Debug.Assert(edi.Source == srcAssoc.SourceTag.End);
                            System.Diagnostics.Debug.Assert(srcAssoc.SourceTag.EndTagOperation == EditOperation.Undefined);
                            System.Diagnostics.Debug.Assert(srcAssoc.TargetTag.EndTagOperation == EditOperation.Undefined);

                            srcAssoc.SourceTag.EndTagOperation = edi.Operation;
                            srcAssoc.TargetTag.EndTagOperation = edi.Operation;
                        }
                    }
                    break;

                case EditOperation.Insert:
                    // an insert only affects the target side
                    trgAssoc = tagAlignment.GetByTargetPosition(edi.Target);
                    if (trgAssoc != null)
                    {
                        if (edi.Target == trgAssoc.TargetTag.Start)
                        {
                            System.Diagnostics.Debug.Assert(trgAssoc.TargetTag.StartTagOperation == EditOperation.Undefined);
                            trgAssoc.TargetTag.StartTagOperation = edi.Operation;
                        }
                        else
                        {
                            System.Diagnostics.Debug.Assert(edi.Target == trgAssoc.TargetTag.End);
                            System.Diagnostics.Debug.Assert(trgAssoc.TargetTag.EndTagOperation == EditOperation.Undefined);
                            trgAssoc.TargetTag.EndTagOperation = edi.Operation;
                        }
                    }
                    break;

                case EditOperation.Delete:
                    // a delete only affects the source side
                    srcAssoc = tagAlignment.GetBySourcePosition(edi.Source);
                    if (srcAssoc != null)
                    {
                        if (edi.Source == srcAssoc.SourceTag.Start)
                        {
                            System.Diagnostics.Debug.Assert(srcAssoc.SourceTag.StartTagOperation == EditOperation.Undefined);
                            srcAssoc.SourceTag.StartTagOperation = edi.Operation;
                        }
                        else
                        {
                            System.Diagnostics.Debug.Assert(edi.Source == srcAssoc.SourceTag.End);
                            System.Diagnostics.Debug.Assert(srcAssoc.SourceTag.EndTagOperation == EditOperation.Undefined);
                            srcAssoc.SourceTag.EndTagOperation = edi.Operation;
                        }
                    }
                    break;

                // moves are not expected here: the caller runs this method before move detection
                // - TODO confirm against the call site
                case EditOperation.Move:
                case EditOperation.Undefined:
                default:
                    throw new Exception("Unexpected case");
                }
            }

            // phase 2: detect conflicts

            foreach (TagAssociation ta in tagAlignment)
            {
                EditOperation startOp = EditOperation.Undefined;
                EditOperation endOp   = EditOperation.Undefined;

                // start tag: an insert on one side paired with a delete on the other counts as
                // a move; identical operations on both sides are taken as-is; anything else is
                // flagged as a conflict
                if ((ta.SourceTag.StartTagOperation == EditOperation.Insert) &&
                    (ta.TargetTag.StartTagOperation == EditOperation.Delete))
                {
                    startOp = EditOperation.Move;
                }
                else if ((ta.SourceTag.StartTagOperation == EditOperation.Delete) &&
                         (ta.TargetTag.StartTagOperation == EditOperation.Insert))
                {
                    startOp = EditOperation.Move;
                }
                else if (ta.SourceTag.StartTagOperation == ta.TargetTag.StartTagOperation)
                {
                    startOp = ta.SourceTag.StartTagOperation;
                }
                else
                {
                    System.Diagnostics.Debug.Assert(false, "Conflicting start tag operations");
                    startOp = EditOperation.Undefined;
                }

                // end tag: same rules as for the start tag
                if ((ta.SourceTag.EndTagOperation == EditOperation.Insert) &&
                    (ta.TargetTag.EndTagOperation == EditOperation.Delete))
                {
                    endOp = EditOperation.Move;
                }
                else if ((ta.SourceTag.EndTagOperation == EditOperation.Delete) &&
                         (ta.TargetTag.EndTagOperation == EditOperation.Insert))
                {
                    endOp = EditOperation.Move;
                }
                else if (ta.SourceTag.EndTagOperation == ta.TargetTag.EndTagOperation)
                {
                    endOp = ta.SourceTag.EndTagOperation;
                }
                else
                {
                    System.Diagnostics.Debug.Assert(false, "Conflicting end tag operations");
                    endOp = EditOperation.Undefined;
                }

                // start and end tag of a pair are expected to resolve to the same operation;
                // a mismatch is only reported via a debug assertion, nothing is patched
                if (startOp != endOp)
                {
                    System.Diagnostics.Debug.Assert(false, "Conflicting tag actions");
                }
            }
        }
コード例 #5
0
        private Core.EditDistance.EditDistance ComputeEditDistanceImpl_Original(
            IList <Core.Tokenization.Token> sourceTokens,
            IList <Core.Tokenization.Token> targetTokens,
            bool computeDiagonalOnly,
            Core.Tokenization.BuiltinRecognizers disabledAutoSubstitutions,
            out TagAssociations alignedTags)
        {
            /*
             * The "classic" ED approach has the problem that it doesn't detect moves
             * reliably, particularly block moves. Patching up insert/delete pairs as
             * moves also won't catch moves which appear as changes in the ED.
             */

            if (sourceTokens == null)
            {
                throw new ArgumentNullException("sourceTokens");
            }
            if (targetTokens == null)
            {
                throw new ArgumentNullException("targetTokens");
            }

            alignedTags = null;

            int i, j;

            // TODO handle special cases (one/both of the arrays being empty/having no elements)
            // TODO use diagonal algorithm

            bool enforceFullMatrixComputation = false;

            Core.EditDistance.EditDistance result =
                new Core.EditDistance.EditDistance(sourceTokens.Count, targetTokens.Count, 0.0d);

            // matrix which captures the similarity between two tokens as well as preassignments
            SimilarityMatrix sim = new SimilarityMatrix(sourceTokens, targetTokens,
                                                        _UseStringEditDistance, disabledAutoSubstitutions);

            if (enforceFullMatrixComputation)
            {
                // this will be fully computed by the tag aligner in most cases, but we may save a bit
                // on plain text segments
                sim.Compute(computeDiagonalOnly);
            }

            MatrixItem[,] matrix = CreateEditDistanceMatrix(sourceTokens, targetTokens);

            alignedTags = TagAligner.AlignPairedTags(sourceTokens, targetTokens, sim);
            if (alignedTags != null && alignedTags.Count > 0)
            {
                // Patch the sim matrix so that non-aligned tags can't be assigned to each other
                PatchSimilarityMatrix(sim, sourceTokens, targetTokens, alignedTags);
                ComputeEditDistanceMatrix_Full(matrix, sim, alignedTags);
            }
            else if (enforceFullMatrixComputation)
            {
                ComputeEditDistanceMatrix_Full(matrix, sim, alignedTags);
            }
            else
            {
                ComputeEditDistanceMatrix_Lazy(matrix, sim);
            }

            // readout the cheapest path

            i = sourceTokens.Count;
            j = targetTokens.Count;
            result.Distance = matrix[i, j].Score;

            while (i > 0 || j > 0)
            {
                EditDistanceItem item = new EditDistanceItem();
                item.Resolution = EditDistanceResolution.None;

                MatrixItem m = matrix[i, j];

                item.Operation = m.Operation;

                switch (item.Operation)
                {
                case EditOperation.Identity:
                    item.Costs = 0.0d;
                    --i;
                    --j;
                    break;

                case EditOperation.Change:
                    item.Costs = _UseStringEditDistance
                                                ? (1.0d - m.Similarity)
                                                : (1.0d - SimilarityComputers.GetTokenSimilarity(sourceTokens[i - 1], targetTokens[j - 1],
                                                                                                 true, disabledAutoSubstitutions));
                    // item.Costs = (1.0d - m.Similarity);
                    --i;
                    --j;
                    break;

                case EditOperation.Insert:
                    item.Costs = _InsertDeleteCosts;
                    --j;
                    break;

                case EditOperation.Delete:
                    item.Costs = _InsertDeleteCosts;
                    --i;
                    break;

                case EditOperation.Undefined:
                    throw new Exception("Internal ED computation error");
                }

                item.Source = i;
                item.Target = j;
                result.AddAtStart(item);
            }

            if (alignedTags != null && alignedTags.Count > 0)
            {
                // should happen before move detection
                FixTagActions(sourceTokens, targetTokens, result, alignedTags);
            }

            // identify move operations which are pairs of insert/delete operations in the shortest path.
            // Note that the comparision result is already in the matrix and we only care about identity.
            // TODO we may rather use a configurable threshold than identity (1.0) to catch move operations
            //  of sufficiently similar items (e.g. case-insensitive)

            if (_ComputeMoves)
            {
                int moves = DetectMoves(result, matrix);
                if (moves > 0)
                {
                    // adjust score: substract moves * (deletionCosts + insertionCosts), add moves * moveCosts
                    // TODO take moveDistance into account, i.e. penalty depends on distance?
                    result.Distance -= (double)moves * (2.0d * _InsertDeleteCosts);
                    result.Distance += (double)moves * _MoveCosts;
                }
            }

#if DEBUG
            // a stream for logging. Will always be null in non-Debug builds
            System.IO.TextWriter logStream = null;
            bool log = false;
            if (log)
            {
                logStream = new System.IO.StreamWriter(System.IO.Path.GetTempPath() + "/ed.log",
                                                       false, System.Text.Encoding.UTF8);

                logStream.WriteLine("Source objects:");
                for (int p = 0; p < sourceTokens.Count; ++p)
                {
                    logStream.WriteLine("\t{0}:\t{1}", p, sourceTokens[p].ToString());
                }
                logStream.WriteLine();
                logStream.WriteLine("Target objects:");
                for (int p = 0; p < targetTokens.Count; ++p)
                {
                    logStream.WriteLine("\t{0}:\t{1}", p, targetTokens[p].ToString());
                }
                logStream.WriteLine();
                logStream.WriteLine();

                if (alignedTags != null)
                {
                    logStream.WriteLine("Tag Alignment:");
                    foreach (TagAssociation ta in alignedTags)
                    {
                        logStream.WriteLine("\t{0}", ta.ToString());
                    }
                    logStream.WriteLine();
                    logStream.WriteLine();
                }

                result.Dump(logStream, "Final ED");

                logStream.Close();
                logStream.Dispose();
                logStream = null;
            }
#endif

#if DEBUG
            // write matrix to a temp file in HTML format
            _DumpMatrix = false;             //  typeof(T) != typeof(char);
            if (_DumpMatrix)
            {
                System.IO.StreamWriter wtr = new System.IO.StreamWriter(System.IO.Path.GetTempPath() + "/SimMatrix.html",
                                                                        false, System.Text.Encoding.UTF8);
                System.Web.UI.Html32TextWriter htmlWriter = new System.Web.UI.Html32TextWriter(wtr);

                htmlWriter.WriteFullBeginTag("html");
                htmlWriter.WriteFullBeginTag("body");
                htmlWriter.WriteBeginTag("table");
                htmlWriter.WriteAttribute("border", "1");

                for (j = -1; j <= targetTokens.Count; ++j)
                {
                    htmlWriter.WriteFullBeginTag("tr");

                    for (i = -1; i <= sourceTokens.Count; ++i)
                    {
                        htmlWriter.WriteFullBeginTag("td");

                        if (i < 0)
                        {
                            // caption row
                            if (j >= 0)
                            {
                                htmlWriter.Write("j={0}", j);
                                if (j > 0)
                                {
                                    htmlWriter.WriteFullBeginTag("br");
                                    htmlWriter.WriteFullBeginTag("b");
                                    htmlWriter.Write(targetTokens[j - 1].ToString());
                                    htmlWriter.WriteEndTag("b");
                                }
                            }
                        }
                        else if (j < 0)
                        {
                            // j < 0 but i >= 0 -->
                            htmlWriter.Write("i={0}", i);
                            if (i > 0)
                            {
                                htmlWriter.WriteFullBeginTag("br");
                                htmlWriter.WriteFullBeginTag("b");
                                htmlWriter.Write(sourceTokens[i - 1].ToString());
                                htmlWriter.WriteEndTag("b");
                            }
                        }
                        else
                        {
                            // content cell
                            htmlWriter.Write("d={0}", matrix[i, j].Score);
                            htmlWriter.WriteFullBeginTag("br");
                            htmlWriter.Write("s={0}", matrix[i, j].Similarity);
                            htmlWriter.WriteFullBeginTag("br");
                            htmlWriter.Write("o={0}", matrix[i, j].Operation.ToString());
                        }

                        htmlWriter.WriteEndTag("td");
                    }

                    htmlWriter.WriteEndTag("tr");
                }

                htmlWriter.WriteEndTag("table");

                htmlWriter.WriteFullBeginTag("h2");
                htmlWriter.Write("Result");
                htmlWriter.WriteEndTag("h2");

                htmlWriter.Write("Score = {0}", result.Distance);

                htmlWriter.WriteFullBeginTag("ol");

                for (i = 0; i < result.Items.Count; ++i)
                {
                    htmlWriter.WriteFullBeginTag("li");
                    htmlWriter.Write("{0}: s={1} t={2}",
                                     result[i].Operation.ToString(), result[i].Source, result[i].Target);
                }

                htmlWriter.WriteEndTag("ol");

                htmlWriter.WriteEndTag("body");
                htmlWriter.WriteEndTag("html");

                htmlWriter.Close();
            }
#endif

            return(result);
        }
コード例 #6
0
        /// <summary>
        /// Post-processes an edit script by collapsing matching Delete/Insert pairs into
        /// single Move operations.
        /// </summary>
        /// <remarks>
        /// For each Delete (or Insert) item, the items that follow it are searched for the first
        /// operation of the opposite kind whose deleted-source/inserted-target pair has a cached
        /// similarity of at least <c>_MoveSimThreshold</c>. When one is found, the earlier item is
        /// rewritten as a Move and the later item is removed from the list.
        /// The score is not adjusted here; only the item list of <paramref name="result"/> is modified.
        /// </remarks>
        /// <param name="result">The edit distance whose item list is rewritten in place.</param>
        /// <param name="matrix">
        /// The computation matrix; <c>matrix[i, j].Similarity</c> is the cached similarity
        /// between source[i-1] and target[j-1].
        /// </param>
        /// <returns>The number of Delete/Insert pairs that were converted into moves.</returns>
        private int DetectMoves(Core.EditDistance.EditDistance result, MatrixItem[,] matrix)
        {
            int detected = 0;

            // TODO may need to restrict moves to undisputed corresponding items, i.e. those which
            //  have only one row/column similarity maximum

            for (int current = 0; current < result.Items.Count; ++current)
            {
                EditOperation currentOp = result[current].Operation;
                bool isDelete = currentOp == EditOperation.Delete;
                if (!isDelete && currentOp != EditOperation.Insert)
                {
                    // only deletions and insertions can be one half of a move
                    continue;
                }

                // a deletion is compensated by a later insertion and vice versa
                EditOperation wanted = isDelete ? EditOperation.Insert : EditOperation.Delete;

                // look ahead for the first sufficiently similar counterpart
                int partner = -1;
                for (int candidate = current + 1; candidate < result.Items.Count; ++candidate)
                {
                    if (result[candidate].Operation != wanted)
                    {
                        continue;
                    }

                    int deletedAt  = isDelete ? current : candidate;
                    int insertedAt = isDelete ? candidate : current;

                    // matrix indices are offset by one relative to the token positions
                    if (matrix[result[deletedAt].Source + 1, result[insertedAt].Target + 1].Similarity >= _MoveSimThreshold)
                    {
                        partner = candidate;
                        break;
                    }
                }

                // TODO take moveDistance into account, i.e. penalty depends on distance? Avoids
                //  long-distance moves.

                if (partner < 0)
                {
                    // no compensating operation in the remainder of the list
                    continue;
                }

                int deletedIndex  = isDelete ? current : partner;
                int insertedIndex = isDelete ? partner : current;

                // read all positions before touching the item that gets rewritten
                int movedSource       = result[deletedIndex].Source;
                int movedSourceTarget = result[deletedIndex].Target;
                int movedTarget       = result[insertedIndex].Target;
                int movedTargetSource = result[insertedIndex].Source;

                // TODO backtrack to find other compensating items?
                EditDistanceItem moveItem = result[current];
                moveItem.Operation        = EditOperation.Move;
                moveItem.Source           = movedSource;
                moveItem.Target           = movedTarget;
                moveItem.MoveSourceTarget = movedSourceTarget;
                moveItem.MoveTargetSource = movedTargetSource;
                // TODO update item similarity
                result.Items[current] = moveItem;

                // the partner always lies behind 'current', so removing it keeps the loop index valid
                result.Items.RemoveAt(partner);
                ++detected;
            }

            return(detected);
        }