示例#1
0
 /// <summary>
 /// Initializes a score provider from a similarity matrix, a threshold and the
 /// subsequence/substring mode flag. The constructor only stores the three values.
 /// </summary>
 /// <param name="simMatrix">The similarity matrix kept for later use by this provider.</param>
 /// <param name="threshold">The threshold value kept for later use. Presumably the minimum
 /// similarity at which two tokens count as matching - confirm against the scoring code,
 /// which is not part of this constructor.</param>
 /// <param name="maySkip">If true, computes the longest common subsequence. Otherwise,
 /// computes the longest common substring.
 /// </param>
 public TokenIndexLCSScoreProvider(SimilarityMatrix simMatrix,
                                   double threshold,
                                   bool maySkip)
 {
     _MaySkip   = maySkip;
     _Threshold = threshold;
     _SimMatrix = simMatrix;
 }
        /// <summary>
        /// Marks cells of the similarity matrix as invalid (-1) for every pair of paired
        /// (start/end) tags which the tag alignment does not associate, so that the edit
        /// distance computation cannot assign non-aligned tags to each other.
        /// </summary>
        /// <param name="sim">The similarity matrix, patched in place.</param>
        /// <param name="srcTokens">The source tokens (first index of the matrix).</param>
        /// <param name="trgTokens">The target tokens (second index of the matrix).</param>
        /// <param name="tagAlignment">The tag associations to enforce. If null or empty,
        /// the matrix is left untouched.</param>
        private void PatchSimilarityMatrix(SimilarityMatrix sim,
                                           IList <Core.Tokenization.Token> srcTokens,
                                           IList <Core.Tokenization.Token> trgTokens,
                                           TagAssociations tagAlignment)
        {
            bool nothingToEnforce = (tagAlignment == null || tagAlignment.Count == 0);
            if (nothingToEnforce)
            {
                return;
            }

            for (int srcIndex = 0; srcIndex < srcTokens.Count; ++srcIndex)
            {
                // only tag tokens are of interest on the source side
                Core.Tokenization.TagToken srcTagToken
                    = srcTokens[srcIndex] as Core.Tokenization.TagToken;
                if (srcTagToken == null)
                {
                    continue;
                }

                // ... and among those only paired tags (start or end)
                Core.Tag srcTag = srcTagToken.Tag;
                bool srcIsPaired = (srcTag.Type == TagType.Start || srcTag.Type == TagType.End);
                if (!srcIsPaired)
                {
                    continue;
                }

                for (int trgIndex = 0; trgIndex < trgTokens.Count; ++trgIndex)
                {
                    // a cell which is already marked invalid needs no further inspection
                    if (sim.IsAssigned(srcIndex, trgIndex) && sim[srcIndex, trgIndex] < 0.0d)
                    {
                        continue;
                    }

                    // a paired source tag is not expected to have a valid similarity
                    // to a non-tag target token
                    Core.Tokenization.TagToken trgTagToken
                        = trgTokens[trgIndex] as Core.Tokenization.TagToken;
                    if (trgTagToken == null)
                    {
                        System.Diagnostics.Debug.Assert(false, "Shouldn't be");
                        continue;
                    }

                    // ... nor to a target tag which is not a paired tag
                    Core.Tag trgTag = trgTagToken.Tag;
                    bool trgIsPaired = (trgTag.Type == TagType.Start || trgTag.Type == TagType.End);
                    if (!trgIsPaired)
                    {
                        System.Diagnostics.Debug.Assert(false, "Shouldn't be");
                        continue;
                    }

                    // both are paired tags: forbid the assignment unless the alignment links them
                    bool aligned = tagAlignment.AreAssociated(srcIndex, trgIndex);
                    if (!aligned)
                    {
                        sim[srcIndex, trgIndex] = -1.0d;
                    }
                }
            }
        }
示例#3
0
        /// <summary>
        /// Computes an association score for every combination of a source paired tag and a
        /// target paired tag. The score is derived from the alignment of the token positions
        /// up to the respective tag position (start or end, see
        /// <paramref name="useEndPositions"/>).
        /// </summary>
        /// <param name="similarityMatrix">The token similarity matrix; also supplies the
        /// source and target token lists.</param>
        /// <param name="srcPairedTags">The paired tags found in the source tokens.</param>
        /// <param name="trgPairedTags">The paired tags found in the target tokens.</param>
        /// <param name="useEndPositions">If true, the end position of each paired tag is used
        /// and no span penalty is applied. Otherwise the start position is used and half of the
        /// difference between the two tag spans is subtracted from the score.</param>
        /// <returns>A matrix indexed by [source tag, target tag]. Cells for which the aligner
        /// returned no result keep the default value 0.</returns>
        private static int[,] ComputeTagAssociationScores(SimilarityMatrix similarityMatrix,
                                                          TagPairs srcPairedTags,
                                                          TagPairs trgPairedTags,
                                                          bool useEndPositions)
        {
            // Intended effect: roughly first-come first-serve alignment, with hopefully
            // better associations for nested tags. For each source tag the alignment to
            // each target tag is computed.

            int[,] scores = new int[srcPairedTags.Count, trgPairedTags.Count];

            // the aligner works on token positions, i.e. the sequences 0..n-1 and 0..m-1
            List <int> srcPositions = new List <int>();
            int srcTokenCount = similarityMatrix.SourceTokens.Count;
            for (int pos = 0; pos < srcTokenCount; ++pos)
            {
                srcPositions.Add(pos);
            }

            List <int> trgPositions = new List <int>();
            int trgTokenCount = similarityMatrix.TargetTokens.Count;
            for (int pos = 0; pos < trgTokenCount; ++pos)
            {
                trgPositions.Add(pos);
            }

            // threshold 0.75; maySkip = true (subsequence rather than substring mode)
            TokenIndexLCSScoreProvider scoreProvider
                = new TokenIndexLCSScoreProvider(similarityMatrix, 0.75, true);

            SequenceAlignmentComputer <int> aligner
                = new SequenceAlignmentComputer <int>(srcPositions, trgPositions,
                                                      scoreProvider, null, 1, 1);

            for (int srcTag = srcPairedTags.Count - 1; srcTag >= 0; --srcTag)
            {
                PairedTag srcPair = srcPairedTags[srcTag];
                int uptoSource = useEndPositions ? srcPair.End : srcPair.Start;

                for (int trgTag = trgPairedTags.Count - 1; trgTag >= 0; --trgTag)
                {
                    PairedTag trgPair = trgPairedTags[trgTag];
                    int uptoTarget = useEndPositions ? trgPair.End : trgPair.Start;

                    List <AlignedSubstring> alignment = aligner.Compute(uptoSource, uptoTarget);
                    if (alignment == null || alignment.Count == 0)
                    {
                        // no alignment: the cell keeps its default score
                        continue;
                    }

                    System.Diagnostics.Debug.Assert(alignment.Count == 1);

                    // common subsequence length minus the items which were deleted or inserted
                    int commonLength = alignment[0].Score;
                    int score = commonLength
                                - (uptoSource - commonLength)
                                - (uptoTarget - commonLength);

                    // large differences in the spanned width are penalized, but not when
                    // the end positions are included in the LCS
                    int malus = useEndPositions
                                ? 0
                                : Math.Abs(GetTagSpan(srcPair) - GetTagSpan(trgPair)) / 2;

                    scores[srcTag, trgTag] = score - malus;
                }
            }

            return scores;
        }
示例#4
0
        /// <summary>
        /// Associates the paired tags of the source tokens with the paired tags of the target
        /// tokens. Pairs are picked greedily: the highest remaining association score wins,
        /// then the two tags are taken out of the pool, until no pair is left.
        /// </summary>
        /// <param name="sourceTokens">The source tokens.</param>
        /// <param name="targetTokens">The target tokens.</param>
        /// <param name="similarityMatrix">The token similarity matrix used for scoring.</param>
        /// <returns>The tag associations, including entries for source or target tags which
        /// remained unassociated; null if either side has no paired tags or if no scores
        /// could be computed.</returns>
        public static TagAssociations AlignPairedTags(IList <Core.Tokenization.Token> sourceTokens,
                                                      IList <Core.Tokenization.Token> targetTokens,
                                                      SimilarityMatrix similarityMatrix)
        {
            TagPairs srcPairs = FindPairedTags(sourceTokens);
            if (srcPairs == null || srcPairs.Count == 0)
            {
                return null;
            }

            TagPairs trgPairs = FindPairedTags(targetTokens);
            if (trgPairs == null || trgPairs.Count == 0)
            {
                return null;
            }

            // from here on both sides are known to contain at least one paired tag

            TagAssociations associations = new TagAssociations();

            System.Collections.BitArray srcDone = new System.Collections.BitArray(srcPairs.Count);
            System.Collections.BitArray trgDone = new System.Collections.BitArray(trgPairs.Count);

            const bool useEndPositions = true;

            int[,] scores = ComputeTagAssociationScores(similarityMatrix, srcPairs, trgPairs,
                                                        useEndPositions);
            if (scores == null)
            {
                return null;
            }

            bool pairFound;
            do
            {
                // Find the maximum among all cells whose row and column are still free.
                // Ties are not disambiguated: the strict comparison keeps the first
                // maximum encountered in row-major order.
                int bestScore = Int32.MinValue;
                int bestSrc   = -1;
                int bestTrg   = -1;

                for (int s = 0; s < srcPairs.Count; ++s)
                {
                    if (srcDone[s])
                    {
                        continue;
                    }

                    for (int t = 0; t < trgPairs.Count; ++t)
                    {
                        if (!trgDone[t] && scores[s, t] > bestScore)
                        {
                            bestScore = scores[s, t];
                            bestSrc   = s;
                            bestTrg   = t;
                        }
                    }
                }

                pairFound = (bestSrc >= 0);
                if (pairFound)
                {
                    // associate the two tags and remove them from the pool
                    associations.Add(srcPairs[bestSrc], trgPairs[bestTrg],
                                     Core.EditDistance.EditOperation.Change);
                    srcDone[bestSrc] = true;
                    trgDone[bestTrg] = true;
                }
            }
            while (pairFound);

            // source tags which did not find a partner
            for (int s = 0; s < srcPairs.Count; ++s)
            {
                if (!srcDone[s])
                {
                    associations.Add(srcPairs[s], null);
                }
            }

            // target tags which did not find a partner
            for (int t = 0; t < trgPairs.Count; ++t)
            {
                if (!trgDone[t])
                {
                    associations.Add(null, trgPairs[t]);
                }
            }

            return associations;
        }
        /// <summary>
        /// Computes the edit distance between two token sequences: builds the similarity
        /// matrix, aligns paired tags, fills the edit distance matrix (fully if tags were
        /// aligned, lazily otherwise), reads out the cheapest path and optionally detects
        /// move operations.
        /// </summary>
        /// <param name="sourceTokens">The source tokens. Must not be null.</param>
        /// <param name="targetTokens">The target tokens. Must not be null.</param>
        /// <param name="computeDiagonalOnly">Only passed to the up-front similarity matrix
        /// computation, which is currently switched off (see enforceFullMatrixComputation),
        /// so the value has no effect at present.</param>
        /// <param name="disabledAutoSubstitutions">Passed on to the similarity matrix and to
        /// the token similarity computation.</param>
        /// <param name="alignedTags">Receives the result of the paired tag alignment; may be
        /// null.</param>
        /// <returns>The edit distance including the list of edit operations.</returns>
        /// <exception cref="ArgumentNullException">If <paramref name="sourceTokens"/> or
        /// <paramref name="targetTokens"/> is null.</exception>
        /// <exception cref="InvalidOperationException">If the path readout reaches a matrix cell
        /// with an undefined or unexpected operation.</exception>
        private Core.EditDistance.EditDistance ComputeEditDistanceImpl_Original(
            IList <Core.Tokenization.Token> sourceTokens,
            IList <Core.Tokenization.Token> targetTokens,
            bool computeDiagonalOnly,
            Core.Tokenization.BuiltinRecognizers disabledAutoSubstitutions,
            out TagAssociations alignedTags)
        {
            /*
             * The "classic" ED approach has the problem that it doesn't detect moves
             * reliably, particularly block moves. Patching up insert/delete pairs as
             * moves also won't catch moves which appear as changes in the ED.
             */

            if (sourceTokens == null)
            {
                throw new ArgumentNullException("sourceTokens");
            }
            if (targetTokens == null)
            {
                throw new ArgumentNullException("targetTokens");
            }

            alignedTags = null;

            int i, j;

            // TODO handle special cases (one/both of the arrays being empty/having no elements)
            // TODO use diagonal algorithm

            // debugging switch: set to true to force the full similarity/ED matrix computation
            bool enforceFullMatrixComputation = false;

            Core.EditDistance.EditDistance result =
                new Core.EditDistance.EditDistance(sourceTokens.Count, targetTokens.Count, 0.0d);

            // matrix which captures the similarity between two tokens as well as preassignments
            SimilarityMatrix sim = new SimilarityMatrix(sourceTokens, targetTokens,
                                                        _UseStringEditDistance, disabledAutoSubstitutions);

            if (enforceFullMatrixComputation)
            {
                // this will be fully computed by the tag aligner in most cases, but we may save a bit
                // on plain text segments
                sim.Compute(computeDiagonalOnly);
            }

            MatrixItem[,] matrix = CreateEditDistanceMatrix(sourceTokens, targetTokens);

            alignedTags = TagAligner.AlignPairedTags(sourceTokens, targetTokens, sim);
            if (alignedTags != null && alignedTags.Count > 0)
            {
                // Patch the sim matrix so that non-aligned tags can't be assigned to each other
                PatchSimilarityMatrix(sim, sourceTokens, targetTokens, alignedTags);
                ComputeEditDistanceMatrix_Full(matrix, sim, alignedTags);
            }
            else if (enforceFullMatrixComputation)
            {
                ComputeEditDistanceMatrix_Full(matrix, sim, alignedTags);
            }
            else
            {
                ComputeEditDistanceMatrix_Lazy(matrix, sim);
            }

            // read out the cheapest path, walking back from the bottom-right cell

            i = sourceTokens.Count;
            j = targetTokens.Count;
            result.Distance = matrix[i, j].Score;

            while (i > 0 || j > 0)
            {
                EditDistanceItem item = new EditDistanceItem();
                item.Resolution = EditDistanceResolution.None;

                MatrixItem m = matrix[i, j];

                item.Operation = m.Operation;

                switch (item.Operation)
                {
                case EditOperation.Identity:
                    item.Costs = 0.0d;
                    --i;
                    --j;
                    break;

                case EditOperation.Change:
                    item.Costs = _UseStringEditDistance
                                                ? (1.0d - m.Similarity)
                                                : (1.0d - SimilarityComputers.GetTokenSimilarity(sourceTokens[i - 1], targetTokens[j - 1],
                                                                                                 true, disabledAutoSubstitutions));
                    --i;
                    --j;
                    break;

                case EditOperation.Insert:
                    item.Costs = _InsertDeleteCosts;
                    --j;
                    break;

                case EditOperation.Delete:
                    item.Costs = _InsertDeleteCosts;
                    --i;
                    break;

                case EditOperation.Undefined:
                default:
                    // An undefined cell means the matrix was not computed along the path.
                    // Any other operation would leave i and j unchanged, so without this
                    // default the loop would never terminate.
                    throw new InvalidOperationException("Internal ED computation error");
                }

                item.Source = i;
                item.Target = j;
                result.AddAtStart(item);
            }

            if (alignedTags != null && alignedTags.Count > 0)
            {
                // should happen before move detection
                FixTagActions(sourceTokens, targetTokens, result, alignedTags);
            }

            // identify move operations which are pairs of insert/delete operations in the shortest path.
            // Note that the comparison result is already in the matrix and we only care about identity.
            // TODO we may rather use a configurable threshold than identity (1.0) to catch move operations
            //  of sufficiently similar items (e.g. case-insensitive)

            if (_ComputeMoves)
            {
                int moves = DetectMoves(result, matrix);
                if (moves > 0)
                {
                    // adjust score: subtract moves * (deletionCosts + insertionCosts), add moves * moveCosts
                    // TODO take moveDistance into account, i.e. penalty depends on distance?
                    result.Distance -= (double)moves * (2.0d * _InsertDeleteCosts);
                    result.Distance += (double)moves * _MoveCosts;
                }
            }

#if DEBUG
            // Optional plain-text log of the inputs, the tag alignment and the final ED.
            // Switched off by default; flip the flag locally when debugging.
            bool log = false;
            if (log)
            {
                string logPath = System.IO.Path.Combine(System.IO.Path.GetTempPath(), "ed.log");

                // using: the file handle is released even if writing the log throws
                using (System.IO.TextWriter logStream
                           = new System.IO.StreamWriter(logPath, false, System.Text.Encoding.UTF8))
                {
                    logStream.WriteLine("Source objects:");
                    for (int p = 0; p < sourceTokens.Count; ++p)
                    {
                        logStream.WriteLine("\t{0}:\t{1}", p, sourceTokens[p].ToString());
                    }
                    logStream.WriteLine();
                    logStream.WriteLine("Target objects:");
                    for (int p = 0; p < targetTokens.Count; ++p)
                    {
                        logStream.WriteLine("\t{0}:\t{1}", p, targetTokens[p].ToString());
                    }
                    logStream.WriteLine();
                    logStream.WriteLine();

                    if (alignedTags != null)
                    {
                        logStream.WriteLine("Tag Alignment:");
                        foreach (TagAssociation ta in alignedTags)
                        {
                            logStream.WriteLine("\t{0}", ta.ToString());
                        }
                        logStream.WriteLine();
                        logStream.WriteLine();
                    }

                    result.Dump(logStream, "Final ED");
                }
            }
#endif

#if DEBUG
            // write matrix to a temp file in HTML format (switched off by default)
            _DumpMatrix = false;             //  typeof(T) != typeof(char);
            if (_DumpMatrix)
            {
                string dumpPath = System.IO.Path.Combine(System.IO.Path.GetTempPath(), "SimMatrix.html");

                // using: the file handle is released even if producing the dump throws
                using (System.IO.StreamWriter wtr
                           = new System.IO.StreamWriter(dumpPath, false, System.Text.Encoding.UTF8))
                {
                    System.Web.UI.Html32TextWriter htmlWriter = new System.Web.UI.Html32TextWriter(wtr);

                    htmlWriter.WriteFullBeginTag("html");
                    htmlWriter.WriteFullBeginTag("body");
                    htmlWriter.WriteBeginTag("table");
                    htmlWriter.WriteAttribute("border", "1");

                    for (j = -1; j <= targetTokens.Count; ++j)
                    {
                        htmlWriter.WriteFullBeginTag("tr");

                        for (i = -1; i <= sourceTokens.Count; ++i)
                        {
                            htmlWriter.WriteFullBeginTag("td");

                            if (i < 0)
                            {
                                // caption row
                                if (j >= 0)
                                {
                                    htmlWriter.Write("j={0}", j);
                                    if (j > 0)
                                    {
                                        htmlWriter.WriteFullBeginTag("br");
                                        htmlWriter.WriteFullBeginTag("b");
                                        htmlWriter.Write(targetTokens[j - 1].ToString());
                                        htmlWriter.WriteEndTag("b");
                                    }
                                }
                            }
                            else if (j < 0)
                            {
                                // j < 0 but i >= 0 --> caption column
                                htmlWriter.Write("i={0}", i);
                                if (i > 0)
                                {
                                    htmlWriter.WriteFullBeginTag("br");
                                    htmlWriter.WriteFullBeginTag("b");
                                    htmlWriter.Write(sourceTokens[i - 1].ToString());
                                    htmlWriter.WriteEndTag("b");
                                }
                            }
                            else
                            {
                                // content cell
                                htmlWriter.Write("d={0}", matrix[i, j].Score);
                                htmlWriter.WriteFullBeginTag("br");
                                htmlWriter.Write("s={0}", matrix[i, j].Similarity);
                                htmlWriter.WriteFullBeginTag("br");
                                htmlWriter.Write("o={0}", matrix[i, j].Operation.ToString());
                            }

                            htmlWriter.WriteEndTag("td");
                        }

                        htmlWriter.WriteEndTag("tr");
                    }

                    htmlWriter.WriteEndTag("table");

                    htmlWriter.WriteFullBeginTag("h2");
                    htmlWriter.Write("Result");
                    htmlWriter.WriteEndTag("h2");

                    htmlWriter.Write("Score = {0}", result.Distance);

                    htmlWriter.WriteFullBeginTag("ol");

                    for (i = 0; i < result.Items.Count; ++i)
                    {
                        htmlWriter.WriteFullBeginTag("li");
                        htmlWriter.Write("{0}: s={1} t={2}",
                                         result[i].Operation.ToString(), result[i].Source, result[i].Target);
                    }

                    htmlWriter.WriteEndTag("ol");

                    htmlWriter.WriteEndTag("body");
                    htmlWriter.WriteEndTag("html");

                    htmlWriter.Close();
                }
            }
#endif

            return(result);
        }
 /// <summary>
 /// Fills the edit distance matrix on demand: only the cell at
 /// [source token count, target token count] is requested from ComputeCell,
 /// which in turn computes the predecessor cells it needs.
 /// </summary>
 /// <param name="matrix">The edit distance matrix to fill.</param>
 /// <param name="sim">The similarity matrix; also supplies the token counts.</param>
 private void ComputeEditDistanceMatrix_Lazy(MatrixItem[,] matrix,
                                             SimilarityMatrix sim)
 {
     int lastRow    = sim.SourceTokens.Count;
     int lastColumn = sim.TargetTokens.Count;

     ComputeCell(matrix, sim, lastRow, lastColumn);
 }
        /// <summary>
        /// Computes a single cell of the edit distance matrix on demand, recursively computing
        /// the predecessor cells it depends on. A cell whose operation is already defined is
        /// left untouched.
        /// </summary>
        /// <remarks>
        /// The only termination condition is the "already computed" check, so the cells of
        /// row 0 and column 0 must be initialized before the first call (presumably done when
        /// the matrix is created - to be confirmed against the matrix creation code).
        /// </remarks>
        /// <param name="matrix">The edit distance matrix, updated in place.</param>
        /// <param name="sim">The similarity matrix; cell [i, j] uses sim[i - 1, j - 1].</param>
        /// <param name="i">The row (source) index of the cell.</param>
        /// <param name="j">The column (target) index of the cell.</param>
        private void ComputeCell(MatrixItem[,] matrix,
                                 SimilarityMatrix sim, int i, int j)
        {
            bool alreadyComputed = (matrix[i, j].Operation != EditOperation.Undefined);
            if (alreadyComputed)
            {
                return;
            }

            // the diagonal predecessor is needed in any case
            ComputeCell(matrix, sim, i - 1, j - 1);
            System.Diagnostics.Debug.Assert(matrix[i - 1, j - 1].Operation != EditOperation.Undefined);

            double similarity = sim[i - 1, j - 1];

            // a negative similarity flags an invalid assignment; otherwise a low similarity
            // means high change costs and vice versa
            double changeCosts;
            if (similarity < 0.0d)
            {
                changeCosts = _InvalidAssignmentCosts;
            }
            else
            {
                changeCosts = matrix[i - 1, j - 1].Score + (1.0d - similarity);
            }

            double insertCosts = 0.0d;
            double deleteCosts = 0.0d;

            // For i < j the insert costs are computed first, otherwise the delete costs.
            // The respective other costs are only computed if the first ones do not
            // already beat the change costs.
            bool insertFirst = (i < j);
            double firstCosts;

            if (insertFirst)
            {
                ComputeCell(matrix, sim, i, j - 1);
                insertCosts = matrix[i, j - 1].Score + _InsertDeleteCosts;
                firstCosts  = insertCosts;
            }
            else
            {
                ComputeCell(matrix, sim, i - 1, j);
                deleteCosts = matrix[i - 1, j].Score + _InsertDeleteCosts;
                firstCosts  = deleteCosts;
            }

            EditOperation op;

            if (firstCosts >= changeCosts || changeCosts == _InvalidAssignmentCosts)
            {
                // no shortcut possible - the remaining costs are needed as well
                if (insertFirst)
                {
                    ComputeCell(matrix, sim, i - 1, j);
                    deleteCosts = matrix[i - 1, j].Score + _InsertDeleteCosts;
                }
                else
                {
                    ComputeCell(matrix, sim, i, j - 1);
                    insertCosts = matrix[i, j - 1].Score + _InsertDeleteCosts;
                }

                op = GetOperation(changeCosts, insertCosts, deleteCosts, similarity);
            }
            else if (firstCosts < changeCosts)
            {
                op = insertFirst ? EditOperation.Insert : EditOperation.Delete;
            }
            else
            {
                // only reachable if neither comparison above holds (e.g. NaN costs)
                op = EditOperation.Change;
            }

            matrix[i, j].Similarity = similarity;
            matrix[i, j].Operation  = op;

            System.Diagnostics.Debug.Assert(op != EditOperation.Undefined);

            switch (op)
            {
            case EditOperation.Delete:
                matrix[i, j].Score = deleteCosts;
                break;

            case EditOperation.Insert:
                matrix[i, j].Score = insertCosts;
                break;

            default:
                matrix[i, j].Score = changeCosts;
                break;
            }
        }
        /// <summary>
        /// Fills all cells of the edit distance matrix (rows and columns from 1 upwards)
        /// row by row. If tag associations are given, an insert/delete operation which the
        /// tag alignment prescribes for a position overrides the operation found by the
        /// cost comparison.
        /// </summary>
        /// <param name="matrix">The edit distance matrix, updated in place. Row 0 and column 0
        /// are read as predecessors and must already be defined.</param>
        /// <param name="sim">The similarity matrix; cell [i, j] uses sim[i - 1, j - 1].</param>
        /// <param name="alignedTags">The tag associations; may be null or empty, in which case
        /// no override takes place.</param>
        private void ComputeEditDistanceMatrix_Full(MatrixItem[,] matrix,
                                                    SimilarityMatrix sim,
                                                    TagAssociations alignedTags)
        {
            for (int row = 1; row <= sim.SourceTokens.Count; ++row)
            {
                for (int col = 1; col <= sim.TargetTokens.Count; ++col)
                {
                    // the cell itself must still be open ...
                    System.Diagnostics.Debug.Assert(matrix[row, col].Operation == EditOperation.Undefined);
                    // ... while all three predecessors must be defined
                    System.Diagnostics.Debug.Assert(matrix[row - 1, col - 1].Operation != EditOperation.Undefined);
                    System.Diagnostics.Debug.Assert(matrix[row, col - 1].Operation != EditOperation.Undefined);
                    System.Diagnostics.Debug.Assert(matrix[row - 1, col].Operation != EditOperation.Undefined);

                    double similarity = sim[row - 1, col - 1];

                    System.Diagnostics.Debug.Assert((similarity >= 0.0d && similarity <= 1.0d) ||
                                                    similarity == -1.0d);

                    // a negative similarity flags an invalid assignment; otherwise a low
                    // similarity means high change costs and vice versa
                    double changeCosts;
                    if (similarity < 0)
                    {
                        changeCosts = _InvalidAssignmentCosts;
                    }
                    else
                    {
                        changeCosts = matrix[row - 1, col - 1].Score + (1.0d - similarity);
                    }

                    double insertCosts = matrix[row, col - 1].Score + _InsertDeleteCosts;
                    double deleteCosts = matrix[row - 1, col].Score + _InsertDeleteCosts;

                    double cheapest = Math.Min(Math.Min(changeCosts, deleteCosts), insertCosts);

                    // shortcut condition: identical tokens imply that the change is cheapest
                    System.Diagnostics.Debug.Assert(similarity < 1.0d || cheapest == changeCosts);

                    // on equal costs, delete wins over insert, and insert over change
                    EditOperation op;
                    if (cheapest == deleteCosts)
                    {
                        op = EditOperation.Delete;
                    }
                    else if (cheapest == insertCosts)
                    {
                        op = EditOperation.Insert;
                    }
                    else if (cheapest == changeCosts)
                    {
                        op = (similarity == 1.0d) ? EditOperation.Identity : EditOperation.Change;
                    }
                    else
                    {
                        op = EditOperation.Undefined;
                    }

                    if (alignedTags != null && alignedTags.Count > 0)
                    {
                        // Changes/identity of tags are determined through the ED, while the
                        // tag alignment defines deletions and insertions.
                        // TODO do this during population or during readout?

                        EditOperation srcTagOp = alignedTags.GetOperationBySourcePosition(row - 1);
                        EditOperation trgTagOp = alignedTags.GetOperationByTargetPosition(col - 1);

                        bool srcPrescribes = (srcTagOp == EditOperation.Insert ||
                                              srcTagOp == EditOperation.Delete);
                        bool trgPrescribes = (trgTagOp == EditOperation.Insert ||
                                              trgTagOp == EditOperation.Delete);

                        if (srcPrescribes && op != srcTagOp)
                        {
                            // the pre-alignment of tags supersedes the ED result
                            op = srcTagOp;
                        }
                        else if (trgPrescribes && op != trgTagOp)
                        {
                            op = trgTagOp;
                        }
                    }

                    matrix[row, col].Similarity = similarity;
                    matrix[row, col].Operation  = op;

                    switch (op)
                    {
                    case EditOperation.Delete:
                        matrix[row, col].Score = deleteCosts;
                        break;

                    case EditOperation.Insert:
                        matrix[row, col].Score = insertCosts;
                        break;

                    default:
                        matrix[row, col].Score = changeCosts;
                        break;
                    }
                }
            }
        }