/// <summary>
        /// Marks every (source tag, target tag) cell of the similarity matrix as invalid
        /// unless the tag alignment associates the two positions, so that the edit distance
        /// cannot pair up tags which were not aligned.
        /// </summary>
        /// <param name="sim">Similarity matrix, indexed [source position, target position]; patched in place.</param>
        /// <param name="srcTokens">Source tokens; only start/end tag tokens are considered.</param>
        /// <param name="trgTokens">Target tokens.</param>
        /// <param name="tagAlignment">Tag associations; if null or empty the matrix is left untouched.</param>
        private void PatchSimilarityMatrix(SimilarityMatrix sim,
                                           IList <Core.Tokenization.Token> srcTokens,
                                           IList <Core.Tokenization.Token> trgTokens,
                                           TagAssociations tagAlignment)
        {
            bool nothingAligned = (tagAlignment == null) || (tagAlignment.Count == 0);
            if (nothingAligned)
            {
                return;
            }

            for (int srcPos = 0; srcPos < srcTokens.Count; ++srcPos)
            {
                // only paired (start/end) tags on the source side are of interest
                Core.Tokenization.TagToken srcTagToken = srcTokens[srcPos] as Core.Tokenization.TagToken;
                if (srcTagToken == null)
                {
                    continue;
                }

                Core.Tag srcTag = srcTagToken.Tag;
                if (srcTag.Type != TagType.Start && srcTag.Type != TagType.End)
                {
                    continue;
                }

                for (int trgPos = 0; trgPos < trgTokens.Count; ++trgPos)
                {
                    // a cell which is already negative is an invalid assignment anyway
                    bool alreadyInvalid = sim.IsAssigned(srcPos, trgPos) && sim[srcPos, trgPos] < 0.0d;
                    if (alreadyInvalid)
                    {
                        continue;
                    }

                    // The two checks below are not expected to trigger: a paired source tag
                    // against anything but a paired target tag should already be negative.
                    Core.Tokenization.TagToken trgTagToken = trgTokens[trgPos] as Core.Tokenization.TagToken;
                    if (trgTagToken == null)
                    {
                        System.Diagnostics.Debug.Assert(false, "Shouldn't be");
                        continue;
                    }

                    Core.Tag trgTag = trgTagToken.Tag;
                    if (trgTag.Type != TagType.Start && trgTag.Type != TagType.End)
                    {
                        System.Diagnostics.Debug.Assert(false, "Shouldn't be");
                        continue;
                    }

                    if (tagAlignment.AreAssociated(srcPos, trgPos))
                    {
                        // aligned pair: keep whatever similarity the matrix holds
                        continue;
                    }

                    sim[srcPos, trgPos] = -1.0d;
                }
            }
        }
        /// <summary>
        /// Computes the edit distance (ED) between two token sequences. The work is delegated
        /// to <c>ComputeEditDistanceImpl_Original</c>; DEBUG builds additionally run
        /// <c>VerifyEditDistance</c> on the result and assert if it fails.
        /// </summary>
        /// <param name="sourceTokens">The source token sequence.</param>
        /// <param name="targetTokens">The target token sequence.</param>
        /// <param name="computeDiagonalOnly">If number of tokens is equivalent, only the diagonal's similarities are computed.</param>
        /// <param name="disabledAutoSubstitutions">Passed through unchanged to the implementation.</param>
        /// <param name="alignedTags">Receives the paired-tag alignment produced by the implementation; may be null.</param>
        /// <returns>The edit distance computed by the implementation.</returns>
        public Core.EditDistance.EditDistance ComputeEditDistance(
            IList <Core.Tokenization.Token> sourceTokens,
            IList <Core.Tokenization.Token> targetTokens,
            bool computeDiagonalOnly,
            Core.Tokenization.BuiltinRecognizers disabledAutoSubstitutions,
            out TagAssociations alignedTags)
        {
            Core.EditDistance.EditDistance distance = ComputeEditDistanceImpl_Original(
                sourceTokens, targetTokens, computeDiagonalOnly, disabledAutoSubstitutions, out alignedTags);

#if DEBUG
            // sanity check of the produced item list, debug builds only
            bool verified = VerifyEditDistance(distance.Items, sourceTokens.Count, targetTokens.Count);
            System.Diagnostics.Debug.Assert(verified, "ED error - let Oli know and provide test data");
#endif
            return distance;
        }
        // ---- Example #3 ----
        // 0
        /// <summary>
        /// Associates the paired tags found in the source tokens with those found in the
        /// target tokens. Pairs are chosen greedily: the best-scoring combination of a still
        /// unassigned source pair and a still unassigned target pair is associated, until no
        /// combination is left.
        /// </summary>
        /// <param name="sourceTokens">Tokens searched for paired tags on the source side.</param>
        /// <param name="targetTokens">Tokens searched for paired tags on the target side.</param>
        /// <param name="similarityMatrix">Forwarded to <c>ComputeTagAssociationScores</c>.</param>
        /// <returns>
        /// The associations, including one-sided entries (null partner) for tag pairs that
        /// were not matched; null if either side has no paired tags or if no scores are available.
        /// </returns>
        public static TagAssociations AlignPairedTags(IList <Core.Tokenization.Token> sourceTokens,
                                                      IList <Core.Tokenization.Token> targetTokens,
                                                      SimilarityMatrix similarityMatrix)
        {
            TagPairs sourcePairs = FindPairedTags(sourceTokens);
            if (sourcePairs == null || sourcePairs.Count == 0)
            {
                return null;
            }

            TagPairs targetPairs = FindPairedTags(targetTokens);
            if (targetPairs == null || targetPairs.Count == 0)
            {
                return null;
            }

            // from here on both sides hold at least one tag pair

            TagAssociations associations = new TagAssociations();

            // one flag per tag pair, set once the pair has been associated
            System.Collections.BitArray sourceDone = new System.Collections.BitArray(sourcePairs.Count);
            System.Collections.BitArray targetDone = new System.Collections.BitArray(targetPairs.Count);

            const bool useEndPositions = true;

            int[,] scores = ComputeTagAssociationScores(similarityMatrix,
                                                        sourcePairs, targetPairs,
                                                        useEndPositions);
            if (scores == null)
            {
                return null;
            }

            bool pairFound;
            do
            {
                // Locate the maximum among all unprocessed rows/columns. On ties the first
                // candidate in row-major order wins; ties are not disambiguated any further.
                int bestScore  = Int32.MinValue;
                int bestSource = -1;
                int bestTarget = -1;

                for (int s = 0; s < sourcePairs.Count; ++s)
                {
                    if (sourceDone[s])
                    {
                        continue;
                    }

                    for (int t = 0; t < targetPairs.Count; ++t)
                    {
                        if (!targetDone[t] && scores[s, t] > bestScore)
                        {
                            bestScore  = scores[s, t];
                            bestSource = s;
                            bestTarget = t;
                        }
                    }
                }

                pairFound = bestSource >= 0;
                if (pairFound)
                {
                    associations.Add(sourcePairs[bestSource], targetPairs[bestTarget],
                                     Core.EditDistance.EditOperation.Change);
                    sourceDone[bestSource] = true;
                    targetDone[bestTarget] = true;
                }
            }
            while (pairFound);

            // source tag pairs without a partner
            for (int s = 0; s < sourcePairs.Count; ++s)
            {
                if (!sourceDone[s])
                {
                    associations.Add(sourcePairs[s], null);
                }
            }

            // target tag pairs without a partner
            for (int t = 0; t < targetPairs.Count; ++t)
            {
                if (!targetDone[t])
                {
                    associations.Add(null, targetPairs[t]);
                }
            }

            return associations;
        }
        /// <summary>
        /// If the tag alignment suggests action "Change", but the ED can't find this,
        /// we need to patch the corresponding ED item for the corresponding start or end tag as well.
        /// </summary>
        /// <remarks>
        /// CURRENTLY DISABLED: the method returns immediately, so it has no effect and all
        /// code after the first statement is unreachable work in progress.
        /// The disabled code works in two phases: phase 1 walks the ED items and records each
        /// item's operation on the start/end tag of the association found at that position;
        /// phase 2 compares the recorded source and target operations per association and
        /// asserts on conflicts. Nothing in it modifies <paramref name="result"/>.
        /// </remarks>
        /// <param name="sourceTokens">Not used by the current body.</param>
        /// <param name="targetTokens">Not used by the current body.</param>
        /// <param name="result">The edit distance whose items are inspected.</param>
        /// <param name="tagAlignment">The tag associations to look up by source/target position.</param>
        private void FixTagActions(IList <Core.Tokenization.Token> sourceTokens,
                                   IList <Core.Tokenization.Token> targetTokens,
                                   Core.EditDistance.EditDistance result, TagAssociations tagAlignment)
        {
            // not yet working
            return;

            // ---- everything below is unreachable until the early return above is removed ----

            // NOTE a single ED item may suggest "D" or "I" for a tag. This does not necessarily conflict
            //  with a "C" suggested by the alignment, as a following ED item may suggest
            //  a compensating "I" or "D" which would result in a "M", which is still
            //  compatible with the alignment's "C".

            TagAssociation srcAssoc = null;
            TagAssociation trgAssoc = null;

            // phase 1: record the ED operation on the start/end tag of each association

            foreach (EditDistanceItem edi in result.Items)
            {
                switch (edi.Operation)
                {
                case EditOperation.Identity:
                case EditOperation.Change:
                    // both a source and a target position take part in the item
                    srcAssoc = tagAlignment.GetBySourcePosition(edi.Source);
                    trgAssoc = tagAlignment.GetByTargetPosition(edi.Target);
                    if (srcAssoc != null || trgAssoc != null)
                    {
                        // assignment is only valid between tags
                        System.Diagnostics.Debug.Assert(srcAssoc != null && trgAssoc != null);
                        // should also be the same association, otherwise incompatible tags are
                        //  associated
                        System.Diagnostics.Debug.Assert(object.ReferenceEquals(srcAssoc, trgAssoc));
                        System.Diagnostics.Debug.Assert(srcAssoc.SourceTag != null && srcAssoc.TargetTag != null);

                        // NOTE(review): the asserts above are debug-only; in a release build a null
                        //  srcAssoc would be dereferenced below - confirm before enabling this code.
                        if (edi.Source == srcAssoc.SourceTag.Start)
                        {
                            System.Diagnostics.Debug.Assert(srcAssoc.SourceTag.StartTagOperation == EditOperation.Undefined);
                            System.Diagnostics.Debug.Assert(srcAssoc.TargetTag.StartTagOperation == EditOperation.Undefined);
                            // source tag start position
                            srcAssoc.SourceTag.StartTagOperation = edi.Operation;
                            srcAssoc.TargetTag.StartTagOperation = edi.Operation;
                        }
                        else
                        {
                            // otherwise the item must refer to the end tag position
                            System.Diagnostics.Debug.Assert(edi.Source == srcAssoc.SourceTag.End);
                            System.Diagnostics.Debug.Assert(srcAssoc.SourceTag.EndTagOperation == EditOperation.Undefined);
                            System.Diagnostics.Debug.Assert(srcAssoc.TargetTag.EndTagOperation == EditOperation.Undefined);

                            srcAssoc.SourceTag.EndTagOperation = edi.Operation;
                            srcAssoc.TargetTag.EndTagOperation = edi.Operation;
                        }
                    }
                    break;

                case EditOperation.Insert:
                    // an insertion only involves the target side
                    trgAssoc = tagAlignment.GetByTargetPosition(edi.Target);
                    if (trgAssoc != null)
                    {
                        if (edi.Target == trgAssoc.TargetTag.Start)
                        {
                            System.Diagnostics.Debug.Assert(trgAssoc.TargetTag.StartTagOperation == EditOperation.Undefined);
                            trgAssoc.TargetTag.StartTagOperation = edi.Operation;
                        }
                        else
                        {
                            System.Diagnostics.Debug.Assert(edi.Target == trgAssoc.TargetTag.End);
                            System.Diagnostics.Debug.Assert(trgAssoc.TargetTag.EndTagOperation == EditOperation.Undefined);
                            trgAssoc.TargetTag.EndTagOperation = edi.Operation;
                        }
                    }
                    break;

                case EditOperation.Delete:
                    // a deletion only involves the source side
                    srcAssoc = tagAlignment.GetBySourcePosition(edi.Source);
                    if (srcAssoc != null)
                    {
                        if (edi.Source == srcAssoc.SourceTag.Start)
                        {
                            System.Diagnostics.Debug.Assert(srcAssoc.SourceTag.StartTagOperation == EditOperation.Undefined);
                            srcAssoc.SourceTag.StartTagOperation = edi.Operation;
                        }
                        else
                        {
                            System.Diagnostics.Debug.Assert(edi.Source == srcAssoc.SourceTag.End);
                            System.Diagnostics.Debug.Assert(srcAssoc.SourceTag.EndTagOperation == EditOperation.Undefined);
                            srcAssoc.SourceTag.EndTagOperation = edi.Operation;
                        }
                    }
                    break;

                case EditOperation.Move:
                case EditOperation.Undefined:
                default:
                    // this method is meant to run before move detection, so moves are not expected here
                    throw new Exception("Unexpected case");
                }
            }

            // phase 2: detect conflicts

            // NOTE(review): ta.SourceTag / ta.TargetTag are dereferenced without null checks below.
            //  If the alignment contains one-sided associations (a null source or target tag),
            //  this loop would throw - confirm before enabling this code.
            foreach (TagAssociation ta in tagAlignment)
            {
                EditOperation startOp = EditOperation.Undefined;
                EditOperation endOp   = EditOperation.Undefined;

                // start tag: an insert/delete combination across both sides counts as a move,
                //  equal operations are taken over as they are, anything else is a conflict
                if ((ta.SourceTag.StartTagOperation == EditOperation.Insert) &&
                    (ta.TargetTag.StartTagOperation == EditOperation.Delete))
                {
                    startOp = EditOperation.Move;
                }
                else if ((ta.SourceTag.StartTagOperation == EditOperation.Delete) &&
                         (ta.TargetTag.StartTagOperation == EditOperation.Insert))
                {
                    startOp = EditOperation.Move;
                }
                else if (ta.SourceTag.StartTagOperation == ta.TargetTag.StartTagOperation)
                {
                    startOp = ta.SourceTag.StartTagOperation;
                }
                else
                {
                    System.Diagnostics.Debug.Assert(false, "Conflicting start tag operations");
                    startOp = EditOperation.Undefined;
                }

                // end tag: same rules as for the start tag
                if ((ta.SourceTag.EndTagOperation == EditOperation.Insert) &&
                    (ta.TargetTag.EndTagOperation == EditOperation.Delete))
                {
                    endOp = EditOperation.Move;
                }
                else if ((ta.SourceTag.EndTagOperation == EditOperation.Delete) &&
                         (ta.TargetTag.EndTagOperation == EditOperation.Insert))
                {
                    endOp = EditOperation.Move;
                }
                else if (ta.SourceTag.EndTagOperation == ta.TargetTag.EndTagOperation)
                {
                    endOp = ta.SourceTag.EndTagOperation;
                }
                else
                {
                    System.Diagnostics.Debug.Assert(false, "Conflicting end tag operations");
                    endOp = EditOperation.Undefined;
                }

                // start and end tag of one pair are expected to end up with the same operation;
                //  a mismatch is only reported (debug assert), nothing is repaired here
                if (startOp != endOp)
                {
                    System.Diagnostics.Debug.Assert(false, "Conflicting tag actions");
                }
            }
        }
        /// <summary>
        /// Computes the edit distance between two token sequences with the classic matrix
        /// approach, constrained by a pre-alignment of paired tags.
        /// </summary>
        /// <remarks>
        /// Steps: align the paired tags; fill the ED matrix (fully if tags were aligned, lazily
        /// otherwise); read the cheapest path back from the bottom-right cell to the origin;
        /// optionally detect moves and adjust the distance accordingly. DEBUG builds contain
        /// two dump facilities (text log and HTML matrix) which are switched off by default.
        /// </remarks>
        /// <param name="sourceTokens">The source token sequence; must not be null.</param>
        /// <param name="targetTokens">The target token sequence; must not be null.</param>
        /// <param name="computeDiagonalOnly">Only used if the similarity matrix is computed up front, which is currently switched off.</param>
        /// <param name="disabledAutoSubstitutions">Passed to the similarity matrix and to the token similarity computation.</param>
        /// <param name="alignedTags">Receives the result of the paired tag alignment; may be null.</param>
        /// <returns>The edit distance including its items in source/target order.</returns>
        /// <exception cref="ArgumentNullException">A token list is null.</exception>
        /// <exception cref="InvalidOperationException">
        /// The matrix holds an operation which cannot be followed during path readout.
        /// </exception>
        private Core.EditDistance.EditDistance ComputeEditDistanceImpl_Original(
            IList <Core.Tokenization.Token> sourceTokens,
            IList <Core.Tokenization.Token> targetTokens,
            bool computeDiagonalOnly,
            Core.Tokenization.BuiltinRecognizers disabledAutoSubstitutions,
            out TagAssociations alignedTags)
        {
            /*
             * The "classic" ED approach has the problem that it doesn't detect moves
             * reliably, particularly block moves. Patching up insert/delete pairs as
             * moves also won't catch moves which appear as changes in the ED.
             */

            if (sourceTokens == null)
            {
                throw new ArgumentNullException("sourceTokens");
            }
            if (targetTokens == null)
            {
                throw new ArgumentNullException("targetTokens");
            }

            alignedTags = null;

            int i, j;

            // TODO handle special cases (one/both of the arrays being empty/having no elements)
            // TODO use diagonal algorithm

            bool enforceFullMatrixComputation = false;

            Core.EditDistance.EditDistance result =
                new Core.EditDistance.EditDistance(sourceTokens.Count, targetTokens.Count, 0.0d);

            // matrix which captures the similarity between two tokens as well as preassignments
            SimilarityMatrix sim = new SimilarityMatrix(sourceTokens, targetTokens,
                                                        _UseStringEditDistance, disabledAutoSubstitutions);

            if (enforceFullMatrixComputation)
            {
                // this will be fully computed by the tag aligner in most cases, but we may save a bit
                // on plain text segments
                sim.Compute(computeDiagonalOnly);
            }

            MatrixItem[,] matrix = CreateEditDistanceMatrix(sourceTokens, targetTokens);

            alignedTags = TagAligner.AlignPairedTags(sourceTokens, targetTokens, sim);
            if (alignedTags != null && alignedTags.Count > 0)
            {
                // Patch the sim matrix so that non-aligned tags can't be assigned to each other
                PatchSimilarityMatrix(sim, sourceTokens, targetTokens, alignedTags);
                ComputeEditDistanceMatrix_Full(matrix, sim, alignedTags);
            }
            else if (enforceFullMatrixComputation)
            {
                ComputeEditDistanceMatrix_Full(matrix, sim, alignedTags);
            }
            else
            {
                ComputeEditDistanceMatrix_Lazy(matrix, sim);
            }

            // readout the cheapest path, walking from the bottom-right cell back to the origin

            i = sourceTokens.Count;
            j = targetTokens.Count;
            result.Distance = matrix[i, j].Score;

            while (i > 0 || j > 0)
            {
                EditDistanceItem item = new EditDistanceItem();
                item.Resolution = EditDistanceResolution.None;

                MatrixItem m = matrix[i, j];

                item.Operation = m.Operation;

                switch (item.Operation)
                {
                case EditOperation.Identity:
                    item.Costs = 0.0d;
                    --i;
                    --j;
                    break;

                case EditOperation.Change:
                    item.Costs = _UseStringEditDistance
                                                ? (1.0d - m.Similarity)
                                                : (1.0d - SimilarityComputers.GetTokenSimilarity(sourceTokens[i - 1], targetTokens[j - 1],
                                                                                                 true, disabledAutoSubstitutions));
                    // item.Costs = (1.0d - m.Similarity);
                    --i;
                    --j;
                    break;

                case EditOperation.Insert:
                    item.Costs = _InsertDeleteCosts;
                    --j;
                    break;

                case EditOperation.Delete:
                    item.Costs = _InsertDeleteCosts;
                    --i;
                    break;

                case EditOperation.Undefined:
                default:
                    // Any other operation (e.g. Move) would neither advance i nor j, i.e. the
                    // loop would never terminate and keep adding items - fail fast instead.
                    throw new InvalidOperationException("Internal ED computation error");
                }

                // positions are stored after decrementing, i.e. as 0-based token indices
                item.Source = i;
                item.Target = j;
                result.AddAtStart(item);
            }

            if (alignedTags != null && alignedTags.Count > 0)
            {
                // should happen before move detection
                FixTagActions(sourceTokens, targetTokens, result, alignedTags);
            }

            // identify move operations which are pairs of insert/delete operations in the shortest path.
            // Note that the comparison result is already in the matrix and we only care about identity.
            // TODO we may rather use a configurable threshold than identity (1.0) to catch move operations
            //  of sufficiently similar items (e.g. case-insensitive)

            if (_ComputeMoves)
            {
                int moves = DetectMoves(result, matrix);
                if (moves > 0)
                {
                    // adjust score: subtract moves * (deletionCosts + insertionCosts), add moves * moveCosts
                    // TODO take moveDistance into account, i.e. penalty depends on distance?
                    result.Distance -= (double)moves * (2.0d * _InsertDeleteCosts);
                    result.Distance += (double)moves * _MoveCosts;
                }
            }

#if DEBUG
            // optional text log of tokens, tag alignment and final ED; switched off by default
            bool log = false;
            if (log)
            {
                // "using" releases the file even if dumping throws
                using (System.IO.TextWriter logStream =
                           new System.IO.StreamWriter(System.IO.Path.GetTempPath() + "/ed.log",
                                                      false, System.Text.Encoding.UTF8))
                {
                    logStream.WriteLine("Source objects:");
                    for (int p = 0; p < sourceTokens.Count; ++p)
                    {
                        logStream.WriteLine("\t{0}:\t{1}", p, sourceTokens[p].ToString());
                    }
                    logStream.WriteLine();
                    logStream.WriteLine("Target objects:");
                    for (int p = 0; p < targetTokens.Count; ++p)
                    {
                        logStream.WriteLine("\t{0}:\t{1}", p, targetTokens[p].ToString());
                    }
                    logStream.WriteLine();
                    logStream.WriteLine();

                    if (alignedTags != null)
                    {
                        logStream.WriteLine("Tag Alignment:");
                        foreach (TagAssociation ta in alignedTags)
                        {
                            logStream.WriteLine("\t{0}", ta.ToString());
                        }
                        logStream.WriteLine();
                        logStream.WriteLine();
                    }

                    result.Dump(logStream, "Final ED");
                }
            }
#endif

#if DEBUG
            // write matrix to a temp file in HTML format
            _DumpMatrix = false;             //  typeof(T) != typeof(char);
            if (_DumpMatrix)
            {
                // "using" releases the file even if dumping throws
                using (System.IO.StreamWriter wtr = new System.IO.StreamWriter(System.IO.Path.GetTempPath() + "/SimMatrix.html",
                                                                               false, System.Text.Encoding.UTF8))
                {
                    System.Web.UI.Html32TextWriter htmlWriter = new System.Web.UI.Html32TextWriter(wtr);

                    htmlWriter.WriteFullBeginTag("html");
                    htmlWriter.WriteFullBeginTag("body");
                    htmlWriter.WriteBeginTag("table");
                    htmlWriter.WriteAttribute("border", "1");
                    // WriteBeginTag leaves the tag open for attributes; close it explicitly
                    htmlWriter.Write(System.Web.UI.HtmlTextWriter.TagRightChar);

                    // one table row per target position, one column per source position;
                    // index -1 is the caption row/column
                    for (j = -1; j <= targetTokens.Count; ++j)
                    {
                        htmlWriter.WriteFullBeginTag("tr");

                        for (i = -1; i <= sourceTokens.Count; ++i)
                        {
                            htmlWriter.WriteFullBeginTag("td");

                            if (i < 0)
                            {
                                // caption column: target index and token
                                if (j >= 0)
                                {
                                    htmlWriter.Write("j={0}", j);
                                    if (j > 0)
                                    {
                                        htmlWriter.WriteFullBeginTag("br");
                                        htmlWriter.WriteFullBeginTag("b");
                                        htmlWriter.Write(targetTokens[j - 1].ToString());
                                        htmlWriter.WriteEndTag("b");
                                    }
                                }
                            }
                            else if (j < 0)
                            {
                                // caption row (j < 0 but i >= 0): source index and token
                                htmlWriter.Write("i={0}", i);
                                if (i > 0)
                                {
                                    htmlWriter.WriteFullBeginTag("br");
                                    htmlWriter.WriteFullBeginTag("b");
                                    htmlWriter.Write(sourceTokens[i - 1].ToString());
                                    htmlWriter.WriteEndTag("b");
                                }
                            }
                            else
                            {
                                // content cell: score, similarity and operation
                                htmlWriter.Write("d={0}", matrix[i, j].Score);
                                htmlWriter.WriteFullBeginTag("br");
                                htmlWriter.Write("s={0}", matrix[i, j].Similarity);
                                htmlWriter.WriteFullBeginTag("br");
                                htmlWriter.Write("o={0}", matrix[i, j].Operation.ToString());
                            }

                            htmlWriter.WriteEndTag("td");
                        }

                        htmlWriter.WriteEndTag("tr");
                    }

                    htmlWriter.WriteEndTag("table");

                    htmlWriter.WriteFullBeginTag("h2");
                    htmlWriter.Write("Result");
                    htmlWriter.WriteEndTag("h2");

                    htmlWriter.Write("Score = {0}", result.Distance);

                    htmlWriter.WriteFullBeginTag("ol");

                    for (i = 0; i < result.Items.Count; ++i)
                    {
                        htmlWriter.WriteFullBeginTag("li");
                        htmlWriter.Write("{0}: s={1} t={2}",
                                         result[i].Operation.ToString(), result[i].Source, result[i].Target);
                        htmlWriter.WriteEndTag("li");
                    }

                    htmlWriter.WriteEndTag("ol");

                    htmlWriter.WriteEndTag("body");
                    htmlWriter.WriteEndTag("html");

                    htmlWriter.Close();
                }
            }
#endif

            return(result);
        }
        /// <summary>
        /// Fills all cells [1..source count, 1..target count] of the edit distance matrix,
        /// row by row. For each cell the cheapest of change, delete and insert is chosen;
        /// if a tag alignment is given, an insert/delete it reports for the cell's source or
        /// target position overrides that choice.
        /// </summary>
        /// <param name="matrix">The ED matrix; row 0 and column 0 must already be initialized.</param>
        /// <param name="sim">Provides the token lists and the token similarities (negative = invalid assignment).</param>
        /// <param name="alignedTags">Optional tag alignment; null or empty means no overrides.</param>
        private void ComputeEditDistanceMatrix_Full(MatrixItem[,] matrix,
                                                    SimilarityMatrix sim,
                                                    TagAssociations alignedTags)
        {
            for (int i = 1; i <= sim.SourceTokens.Count; ++i)
            {
                for (int j = 1; j <= sim.TargetTokens.Count; ++j)
                {
                    // the cell itself must be untouched, its three predecessors computed
                    System.Diagnostics.Debug.Assert(matrix[i, j].Operation == EditOperation.Undefined);
                    System.Diagnostics.Debug.Assert(matrix[i - 1, j - 1].Operation != EditOperation.Undefined);
                    System.Diagnostics.Debug.Assert(matrix[i, j - 1].Operation != EditOperation.Undefined);
                    System.Diagnostics.Debug.Assert(matrix[i - 1, j].Operation != EditOperation.Undefined);

                    double similarity = sim[i - 1, j - 1];

                    System.Diagnostics.Debug.Assert((similarity >= 0.0d && similarity <= 1.0d) ||
                                                    similarity == -1.0d);

                    // the more similar two tokens are, the cheaper it is to change one into the other;
                    // a negative similarity marks an assignment which must not be made
                    double viaChange;
                    if (similarity < 0)
                    {
                        viaChange = _InvalidAssignmentCosts;
                    }
                    else
                    {
                        viaChange = matrix[i - 1, j - 1].Score + (1.0d - similarity);
                    }

                    double viaInsert = matrix[i, j - 1].Score + _InsertDeleteCosts;
                    double viaDelete = matrix[i - 1, j].Score + _InsertDeleteCosts;

                    double cheapest = Math.Min(Math.Min(viaChange, viaDelete), viaInsert);

                    // verify the shortcut condition:
                    System.Diagnostics.Debug.Assert(similarity < 1.0d || cheapest == viaChange);

                    // on equal costs delete wins over insert, and insert over change
                    EditOperation op;
                    if (cheapest == viaDelete)
                    {
                        op = EditOperation.Delete;
                    }
                    else if (cheapest == viaInsert)
                    {
                        op = EditOperation.Insert;
                    }
                    else if (cheapest == viaChange)
                    {
                        op = (similarity == 1.0d) ? EditOperation.Identity : EditOperation.Change;
                    }
                    else
                    {
                        op = EditOperation.Undefined;
                    }

                    if (alignedTags != null && alignedTags.Count > 0)
                    {
                        // TODO do this during population or during readout?

                        // Changes/identities of tags come from the ED, whereas insertions and
                        // deletions of tags are dictated by the tag alignment. The source
                        // position is consulted first, then the target position.
                        EditOperation srcTagOp = alignedTags.GetOperationBySourcePosition(i - 1);
                        EditOperation trgTagOp = alignedTags.GetOperationByTargetPosition(j - 1);

                        bool sourceOverrides =
                            (srcTagOp == EditOperation.Insert || srcTagOp == EditOperation.Delete) &&
                            op != srcTagOp;
                        bool targetOverrides =
                            (trgTagOp == EditOperation.Insert || trgTagOp == EditOperation.Delete) &&
                            op != trgTagOp;

                        if (sourceOverrides)
                        {
                            op = srcTagOp;
                        }
                        else if (targetOverrides)
                        {
                            op = trgTagOp;
                        }
                    }

                    // the score follows the final (possibly overridden) operation
                    double score;
                    switch (op)
                    {
                    case EditOperation.Delete:
                        score = viaDelete;
                        break;

                    case EditOperation.Insert:
                        score = viaInsert;
                        break;

                    default:
                        score = viaChange;
                        break;
                    }

                    // written through the array element on purpose (no local copy of the cell)
                    matrix[i, j].Similarity = similarity;
                    matrix[i, j].Operation  = op;
                    matrix[i, j].Score      = score;
                }
            }
        }