示例#1
0
        /// <summary>
        /// Creates a similarity matrix with one row per source token and one column per
        /// target token. No similarity is calculated here: every cell is initialised to
        /// <c>_Uncomputed</c>.
        /// </summary>
        /// <param name="sourceTokens">Source-side tokens (matrix rows). Must not be null.</param>
        /// <param name="targetTokens">Target-side tokens (matrix columns). Must not be null.</param>
        /// <param name="useStringEditDistance">Stored in <c>_UseStringEditDistance</c>.</param>
        /// <param name="disabledAutoSubstitutions">Stored in <c>_DisabledAutoSubstitutions</c>.</param>
        /// <exception cref="System.ArgumentNullException">
        /// <paramref name="sourceTokens"/> or <paramref name="targetTokens"/> is null.
        /// </exception>
        public SimilarityMatrix(IList <Core.Tokenization.Token> sourceTokens,
                                IList <Core.Tokenization.Token> targetTokens,
                                bool useStringEditDistance,
                                Core.Tokenization.BuiltinRecognizers disabledAutoSubstitutions)
        {
            // Fail with a named argument instead of a NullReferenceException when the
            // token counts are read below.
            if (sourceTokens == null)
            {
                throw new System.ArgumentNullException("sourceTokens");
            }
            if (targetTokens == null)
            {
                throw new System.ArgumentNullException("targetTokens");
            }

            _SourceTokens              = sourceTokens;
            _TargetTokens              = targetTokens;
            _UseStringEditDistance     = useStringEditDistance;
            _DisabledAutoSubstitutions = disabledAutoSubstitutions;

            int rowCount    = _SourceTokens.Count;
            int columnCount = _TargetTokens.Count;

            _Sim = new double[rowCount, columnCount];

            // Mark every cell as not yet computed.
            for (int s = 0; s < rowCount; ++s)
            {
                for (int t = 0; t < columnCount; ++t)
                {
                    _Sim[s, t] = _Uncomputed;
                }
            }
        }
        /// <summary>
        /// Computes the edit distance between two token lists by delegating to
        /// <c>ComputeEditDistanceImpl_Original</c>. In DEBUG builds the returned item
        /// list is additionally checked with <c>VerifyEditDistance</c>.
        /// </summary>
        /// <param name="sourceTokens">Source-side tokens.</param>
        /// <param name="targetTokens">Target-side tokens.</param>
        /// <param name="computeDiagonalOnly">Forwarded unchanged to the implementation.
        /// Original note: if number of tokens is equivalent, only the diagonal's similarities are computed.</param>
        /// <param name="disabledAutoSubstitutions">Forwarded unchanged to the implementation.</param>
        /// <param name="alignedTags">Assigned by the implementation.</param>
        /// <returns>The edit distance object produced by the implementation.</returns>
        public Core.EditDistance.EditDistance ComputeEditDistance(
            IList <Core.Tokenization.Token> sourceTokens,
            IList <Core.Tokenization.Token> targetTokens,
            bool computeDiagonalOnly,
            Core.Tokenization.BuiltinRecognizers disabledAutoSubstitutions,
            out TagAssociations alignedTags)
        {
            Core.EditDistance.EditDistance distance = ComputeEditDistanceImpl_Original(
                sourceTokens, targetTokens, computeDiagonalOnly, disabledAutoSubstitutions, out alignedTags);

#if DEBUG
            // Debug-only sanity check of the computed item list against both token counts.
            bool verified = VerifyEditDistance(distance.Items, sourceTokens.Count, targetTokens.Count);
            System.Diagnostics.Debug.Assert(verified, "ED error - let Oli know and provide test data");
#endif
            return distance;
        }
        /// <summary>
        /// Rates how similar two placeable tokens are.
        /// </summary>
        /// <remarks>
        /// Values returned:
        ///  1.0  - tokens are equal (when equality is required) or GetSimilarity reports IdenticalValueAndType
        ///  0.85 - GetSimilarity reports IdenticalType
        ///  0.7  - same token type, but not equal / GetSimilarity reports None
        ///  0.0  - not both placeable, different token type or runtime type, or any other GetSimilarity value
        /// -1.0  - tag tokens that cannot be aligned with each other
        /// </remarks>
        /// <param name="a">First token; expected to be placeable.</param>
        /// <param name="b">Second token; expected to be placeable.</param>
        /// <param name="disabledAutoSubstitutions">Recognizer flags; when the flag that belongs to the
        /// token type of <paramref name="a"/> is set, only 1.0 (equal) or 0.7 (not equal) is returned.</param>
        /// <returns>One of the values listed in the remarks.</returns>
        public static double GetPlaceableSimilarity(Core.Tokenization.Token a, Core.Tokenization.Token b,
                                                    Core.Tokenization.BuiltinRecognizers disabledAutoSubstitutions)
        {
            if (!a.IsPlaceable || !b.IsPlaceable)
            {
                System.Diagnostics.Debug.Assert(false, "Expect placeable types");
                return 0.0d;
            }

            // Tokens of different token type or different runtime type can't be compared.
            bool sameKind = a.Type == b.Type && a.GetType() == b.GetType();
            if (!sameKind)
            {
                return 0.0d;
            }

            Core.Tokenization.TagToken tagA = a as Core.Tokenization.TagToken;
            Core.Tokenization.TagToken tagB = b as Core.Tokenization.TagToken;

            if (tagA != null || tagB != null)
            {
                // A tag can only be aligned with another tag of the same tag type.
                bool bothAreTags = tagA != null && tagB != null;
                if (!bothAreTags || tagA.Tag.Type != tagB.Tag.Type)
                {
                    return -1.0d;
                }
            }

            Core.Tokenization.BuiltinRecognizers none =
                Sdl.LanguagePlatform.Core.Tokenization.BuiltinRecognizers.RecognizeNone;

            if (disabledAutoSubstitutions != none)
            {
                // Pick the recognizer flag that governs this token type, if there is one.
                bool typeHasRecognizer = true;
                Core.Tokenization.BuiltinRecognizers governingRecognizer;

                switch (a.Type)
                {
                case Sdl.LanguagePlatform.Core.Tokenization.TokenType.Abbreviation:
                    governingRecognizer = Sdl.LanguagePlatform.Core.Tokenization.BuiltinRecognizers.RecognizeAcronyms;
                    break;

                case Sdl.LanguagePlatform.Core.Tokenization.TokenType.Date:
                    governingRecognizer = Sdl.LanguagePlatform.Core.Tokenization.BuiltinRecognizers.RecognizeDates;
                    break;

                case Sdl.LanguagePlatform.Core.Tokenization.TokenType.Time:
                    governingRecognizer = Sdl.LanguagePlatform.Core.Tokenization.BuiltinRecognizers.RecognizeTimes;
                    break;

                case Sdl.LanguagePlatform.Core.Tokenization.TokenType.Variable:
                    governingRecognizer = Sdl.LanguagePlatform.Core.Tokenization.BuiltinRecognizers.RecognizeVariables;
                    break;

                case Sdl.LanguagePlatform.Core.Tokenization.TokenType.Number:
                    governingRecognizer = Sdl.LanguagePlatform.Core.Tokenization.BuiltinRecognizers.RecognizeNumbers;
                    break;

                case Sdl.LanguagePlatform.Core.Tokenization.TokenType.Measurement:
                    governingRecognizer = Sdl.LanguagePlatform.Core.Tokenization.BuiltinRecognizers.RecognizeMeasurements;
                    break;

                default:
                    governingRecognizer = none;
                    typeHasRecognizer   = false;
                    break;
                }

                // When the governing flag is set, the tokens either match exactly or
                // only share their type.
                if (typeHasRecognizer && (disabledAutoSubstitutions & governingRecognizer) != none)
                {
                    return a.Equals(b) ? 1.0d : 0.7d;
                }
            }

            switch (a.GetSimilarity(b))
            {
            case Core.SegmentElement.Similarity.IdenticalValueAndType:
                return 1.0d;

            case Core.SegmentElement.Similarity.IdenticalType:
                return 0.85d;

            case Core.SegmentElement.Similarity.None:
                // the tokens are not similar, but their token types are identical (checked above)
                return 0.7d;

            default:
                return 0.0d;
            }
        }
        /// <summary>
        /// Rates how similar two tokens are.
        /// </summary>
        /// <remarks>
        /// Values returned:
        /// -1.0  - tag vs. non-tag, whitespace vs. non-whitespace, punctuation vs. non-punctuation,
        ///         or two tags whose types are not assignable to each other
        ///  0.95 - two tags of the same tag type
        ///  0.85 - one Standalone tag and one TextPlaceholder tag
        ///  two placeables - whatever GetPlaceableSimilarity returns
        ///  0.0  - either token text is null
        /// Otherwise a base score is chosen and reduced by 0.1 if exactly one token is a word
        /// (never going below 0.0):
        ///  1.0  - identical text (ordinal)
        ///  0.94 - whitespace or punctuation with differing text
        ///  0.95 - identical text ignoring case
        ///  0.85 - both are SimpleTokens with the same non-null stem (ignoring case)
        ///  else - 0.95 * GetThreshold(GetStringSimilarity(...)) if useStringEditDistance, otherwise 0.0
        /// </remarks>
        /// <param name="a">First token.</param>
        /// <param name="b">Second token.</param>
        /// <param name="useStringEditDistance">Whether differing texts are scored via string similarity.</param>
        /// <param name="disabledAutoSubstitutions">Forwarded to GetPlaceableSimilarity.</param>
        /// <returns>One of the values listed in the remarks.</returns>
        public static double GetTokenSimilarity(Core.Tokenization.Token a, Core.Tokenization.Token b,
                                                bool useStringEditDistance,
                                                Core.Tokenization.BuiltinRecognizers disabledAutoSubstitutions)
        {
            // TODO consider normalization of token texts, or let the tokenizer store the text in normalized form.

            Core.Tokenization.TagToken tagA = a as Core.Tokenization.TagToken;
            Core.Tokenization.TagToken tagB = b as Core.Tokenization.TagToken;

            // A tag may only face a tag, whitespace only whitespace, punctuation only
            // punctuation; anything else yields the no-change-allowed similarity (<0).
            bool categoriesMatch = (tagA != null) == (tagB != null)
                                   && a.IsWhitespace == b.IsWhitespace
                                   && a.IsPunctuation == b.IsPunctuation;
            if (!categoriesMatch)
            {
                return -1.0d;
            }

            if (tagA != null)
            {
                // tagB is non-null as well, see categoriesMatch
                System.Diagnostics.Debug.Assert(tagA.Tag != null && tagB.Tag != null);

                if (tagA.Tag.Type == tagB.Tag.Type)
                {
                    // assignable
                    return 0.95d;
                }

                bool standaloneWithTextPlaceholder =
                    (tagA.Tag.Type == Sdl.LanguagePlatform.Core.TagType.Standalone &&
                     tagB.Tag.Type == Sdl.LanguagePlatform.Core.TagType.TextPlaceholder)
                    ||
                    (tagA.Tag.Type == Sdl.LanguagePlatform.Core.TagType.TextPlaceholder &&
                     tagB.Tag.Type == Sdl.LanguagePlatform.Core.TagType.Standalone);

                // one placeholder and one text placeholder are still assignable; anything else is not
                return standaloneWithTextPlaceholder ? 0.85d : -1.0d;
            }

            if (a.IsPlaceable && b.IsPlaceable)
            {
                return GetPlaceableSimilarity(a, b, disabledAutoSubstitutions);
            }

            if (a.Text == null || b.Text == null)
            {
                System.Diagnostics.Debug.Assert(false, "Expected non-null token text. Let Oli know if this assertion fires and provide test data.");
                return 0.0d;
            }

            // Tokens of different kinds get a reduced similarity.
            // NOTE only checks whether both are words or non-words
            double penalty = (a.IsWord != b.IsWord) ? 0.1d : 0.0d;

            double score;

            if (a.Text.Equals(b.Text, StringComparison.Ordinal))
            {
                score = 1.0d;
            }
            else if (a.IsWhitespace || a.IsPunctuation)
            {
                // slightly less than the SegmentEditDistanceComputer's move threshold, as
                //  we don't want to move them around
                score = 0.94d;
            }
            else if (a.Text.Equals(b.Text, StringComparison.OrdinalIgnoreCase))
            {
                // we want to detect moves for such tokens, so use
                // the SegmentEditDistanceComputer's move threshold
                score = 0.95d;
            }
            else
            {
                Core.Tokenization.SimpleToken simpleA = a as Core.Tokenization.SimpleToken;
                Core.Tokenization.SimpleToken simpleB = b as Core.Tokenization.SimpleToken;

                bool stemsMatch = simpleA != null && simpleB != null &&
                                  simpleA.Stem != null && simpleB.Stem != null &&
                                  simpleA.Stem.Equals(simpleB.Stem, StringComparison.OrdinalIgnoreCase);

                if (stemsMatch)
                {
                    score = 0.85d;
                }
                else if (useStringEditDistance)
                {
                    // texts differ even when ignoring case: fall back to string similarity
                    score = 0.95d * GetThreshold(GetStringSimilarity(a.Text, b.Text));
                }
                else
                {
                    score = 0.0d;
                }
            }

            return Math.Max(0.0d, score - penalty);
        }
        /// <summary>
        /// Computes the edit distance between <paramref name="sourceTokens"/> and
        /// <paramref name="targetTokens"/>. Steps, in order: build the similarity matrix and the
        /// ED matrix, align paired tags, fill the ED matrix (via <c>ComputeEditDistanceMatrix_Full</c>
        /// when tags were aligned, otherwise via <c>ComputeEditDistanceMatrix_Lazy</c>), read the
        /// path back from cell [source count, target count] to [0, 0], call <c>FixTagActions</c> if
        /// tags were aligned, and finally run move detection if <c>_ComputeMoves</c> is set.
        /// </summary>
        /// <param name="sourceTokens">Source-side tokens. Must not be null.</param>
        /// <param name="targetTokens">Target-side tokens. Must not be null.</param>
        /// <param name="computeDiagonalOnly">Only passed to <c>sim.Compute</c>, which is reached solely
        /// when the local <c>enforceFullMatrixComputation</c> is true. That local is hard-coded to
        /// false, so this argument currently has no effect inside this method.</param>
        /// <param name="disabledAutoSubstitutions">Passed to the similarity matrix and to
        /// <c>SimilarityComputers.GetTokenSimilarity</c> when costing Change operations.</param>
        /// <param name="alignedTags">Receives the result of <c>TagAligner.AlignPairedTags</c>; the code
        /// below checks it for null, so callers should presumably do the same.</param>
        /// <returns>The edit distance object holding the total distance and the list of edit items.</returns>
        /// <exception cref="ArgumentNullException">A token list is null.</exception>
        /// <exception cref="Exception">The path read-out reached a cell whose operation is
        /// <c>EditOperation.Undefined</c>.</exception>
        private Core.EditDistance.EditDistance ComputeEditDistanceImpl_Original(
            IList <Core.Tokenization.Token> sourceTokens,
            IList <Core.Tokenization.Token> targetTokens,
            bool computeDiagonalOnly,
            Core.Tokenization.BuiltinRecognizers disabledAutoSubstitutions,
            out TagAssociations alignedTags)
        {
            /*
             * The "classic" ED approach has the problem that it doesn't detect moves
             * reliably, particularly block moves. Patching up insert/delete pairs as
             * moves also won't catch moves which appear as changes in the ED.
             */

            if (sourceTokens == null)
            {
                throw new ArgumentNullException("sourceTokens");
            }
            if (targetTokens == null)
            {
                throw new ArgumentNullException("targetTokens");
            }

            // out parameter must be assigned on every path; the real value is set further below
            alignedTags = null;

            // i indexes the source dimension, j the target dimension of the ED matrix
            int i, j;

            // TODO handle special cases (one/both of the arrays being empty/having no elements)
            // TODO use diagonal algorithm

            // Hard-coded switch: while false, the two branches guarded by it below never run.
            bool enforceFullMatrixComputation = false;

            Core.EditDistance.EditDistance result =
                new Core.EditDistance.EditDistance(sourceTokens.Count, targetTokens.Count, 0.0d);

            // matrix which captures the similarity between two tokens as well as preassignments
            SimilarityMatrix sim = new SimilarityMatrix(sourceTokens, targetTokens,
                                                        _UseStringEditDistance, disabledAutoSubstitutions);

            if (enforceFullMatrixComputation)
            {
                // this will be fully computed by the tag aligner in most cases, but we may save a bit
                // on plain text segments
                sim.Compute(computeDiagonalOnly);
            }

            MatrixItem[,] matrix = CreateEditDistanceMatrix(sourceTokens, targetTokens);

            alignedTags = TagAligner.AlignPairedTags(sourceTokens, targetTokens, sim);
            if (alignedTags != null && alignedTags.Count > 0)
            {
                // Patch the sim matrix so that non-aligned tags can't be assigned to each other
                PatchSimilarityMatrix(sim, sourceTokens, targetTokens, alignedTags);
                ComputeEditDistanceMatrix_Full(matrix, sim, alignedTags);
            }
            else if (enforceFullMatrixComputation)
            {
                ComputeEditDistanceMatrix_Full(matrix, sim, alignedTags);
            }
            else
            {
                // no aligned tags: use the lazy variant
                ComputeEditDistanceMatrix_Lazy(matrix, sim);
            }

            // read out the cheapest path, starting at the cell for (all source tokens, all target tokens)

            i = sourceTokens.Count;
            j = targetTokens.Count;
            result.Distance = matrix[i, j].Score;

            // walk back until cell [0, 0] is reached; each step emits one edit item
            while (i > 0 || j > 0)
            {
                EditDistanceItem item = new EditDistanceItem();
                item.Resolution = EditDistanceResolution.None;

                MatrixItem m = matrix[i, j];

                item.Operation = m.Operation;

                switch (item.Operation)
                {
                case EditOperation.Identity:
                    // consumes one source and one target token at no cost
                    item.Costs = 0.0d;
                    --i;
                    --j;
                    break;

                case EditOperation.Change:
                    // With string edit distance enabled the cell's stored similarity is used;
                    // otherwise the token similarity is recomputed with useStringEditDistance = true.
                    item.Costs = _UseStringEditDistance
                                                ? (1.0d - m.Similarity)
                                                : (1.0d - SimilarityComputers.GetTokenSimilarity(sourceTokens[i - 1], targetTokens[j - 1],
                                                                                                 true, disabledAutoSubstitutions));
                    // item.Costs = (1.0d - m.Similarity);
                    --i;
                    --j;
                    break;

                case EditOperation.Insert:
                    // consumes a target token only
                    item.Costs = _InsertDeleteCosts;
                    --j;
                    break;

                case EditOperation.Delete:
                    // consumes a source token only
                    item.Costs = _InsertDeleteCosts;
                    --i;
                    break;

                case EditOperation.Undefined:
                    throw new Exception("Internal ED computation error");
                }

                // i and j were decremented above, so these are the post-step indices
                item.Source = i;
                item.Target = j;
                // the path is walked backwards; AddAtStart presumably prepends so the items end up in forward order
                result.AddAtStart(item);
            }

            if (alignedTags != null && alignedTags.Count > 0)
            {
                // should happen before move detection
                FixTagActions(sourceTokens, targetTokens, result, alignedTags);
            }

            // identify move operations which are pairs of insert/delete operations in the shortest path.
            // Note that the comparison result is already in the matrix and we only care about identity.
            // TODO we may rather use a configurable threshold than identity (1.0) to catch move operations
            //  of sufficiently similar items (e.g. case-insensitive)

            if (_ComputeMoves)
            {
                int moves = DetectMoves(result, matrix);
                if (moves > 0)
                {
                    // adjust score: subtract moves * (deletionCosts + insertionCosts), add moves * moveCosts
                    // TODO take moveDistance into account, i.e. penalty depends on distance?
                    result.Distance -= (double)moves * (2.0d * _InsertDeleteCosts);
                    result.Distance += (double)moves * _MoveCosts;
                }
            }

#if DEBUG
            // a stream for logging. Will always be null in non-Debug builds
            System.IO.TextWriter logStream = null;
            // Logging is switched off in code; set to true locally to write ed.log into the temp folder.
            bool log = false;
            if (log)
            {
                logStream = new System.IO.StreamWriter(System.IO.Path.GetTempPath() + "/ed.log",
                                                       false, System.Text.Encoding.UTF8);

                logStream.WriteLine("Source objects:");
                for (int p = 0; p < sourceTokens.Count; ++p)
                {
                    logStream.WriteLine("\t{0}:\t{1}", p, sourceTokens[p].ToString());
                }
                logStream.WriteLine();
                logStream.WriteLine("Target objects:");
                for (int p = 0; p < targetTokens.Count; ++p)
                {
                    logStream.WriteLine("\t{0}:\t{1}", p, targetTokens[p].ToString());
                }
                logStream.WriteLine();
                logStream.WriteLine();

                if (alignedTags != null)
                {
                    logStream.WriteLine("Tag Alignment:");
                    foreach (TagAssociation ta in alignedTags)
                    {
                        logStream.WriteLine("\t{0}", ta.ToString());
                    }
                    logStream.WriteLine();
                    logStream.WriteLine();
                }

                result.Dump(logStream, "Final ED");

                logStream.Close();
                logStream.Dispose();
                logStream = null;
            }
#endif

#if DEBUG
            // write matrix to a temp file in HTML format
            // (the flag is forced to false right here, so the dump below is disabled unless this line is edited)
            _DumpMatrix = false;             //  typeof(T) != typeof(char);
            if (_DumpMatrix)
            {
                System.IO.StreamWriter wtr = new System.IO.StreamWriter(System.IO.Path.GetTempPath() + "/SimMatrix.html",
                                                                        false, System.Text.Encoding.UTF8);
                System.Web.UI.Html32TextWriter htmlWriter = new System.Web.UI.Html32TextWriter(wtr);

                htmlWriter.WriteFullBeginTag("html");
                htmlWriter.WriteFullBeginTag("body");
                htmlWriter.WriteBeginTag("table");
                htmlWriter.WriteAttribute("border", "1");

                // One table row per j (target) and one cell per i (source); index -1 is the caption row/column.
                for (j = -1; j <= targetTokens.Count; ++j)
                {
                    htmlWriter.WriteFullBeginTag("tr");

                    for (i = -1; i <= sourceTokens.Count; ++i)
                    {
                        htmlWriter.WriteFullBeginTag("td");

                        if (i < 0)
                        {
                            // caption cell for the target index (i is -1 here)
                            if (j >= 0)
                            {
                                htmlWriter.Write("j={0}", j);
                                if (j > 0)
                                {
                                    htmlWriter.WriteFullBeginTag("br");
                                    htmlWriter.WriteFullBeginTag("b");
                                    htmlWriter.Write(targetTokens[j - 1].ToString());
                                    htmlWriter.WriteEndTag("b");
                                }
                            }
                        }
                        else if (j < 0)
                        {
                            // j < 0 but i >= 0 --> caption cell for the source index
                            htmlWriter.Write("i={0}", i);
                            if (i > 0)
                            {
                                htmlWriter.WriteFullBeginTag("br");
                                htmlWriter.WriteFullBeginTag("b");
                                htmlWriter.Write(sourceTokens[i - 1].ToString());
                                htmlWriter.WriteEndTag("b");
                            }
                        }
                        else
                        {
                            // content cell: d = score, s = similarity, o = operation
                            htmlWriter.Write("d={0}", matrix[i, j].Score);
                            htmlWriter.WriteFullBeginTag("br");
                            htmlWriter.Write("s={0}", matrix[i, j].Similarity);
                            htmlWriter.WriteFullBeginTag("br");
                            htmlWriter.Write("o={0}", matrix[i, j].Operation.ToString());
                        }

                        htmlWriter.WriteEndTag("td");
                    }

                    htmlWriter.WriteEndTag("tr");
                }

                htmlWriter.WriteEndTag("table");

                htmlWriter.WriteFullBeginTag("h2");
                htmlWriter.Write("Result");
                htmlWriter.WriteEndTag("h2");

                htmlWriter.Write("Score = {0}", result.Distance);

                htmlWriter.WriteFullBeginTag("ol");

                // one list entry per edit item of the final result
                for (i = 0; i < result.Items.Count; ++i)
                {
                    htmlWriter.WriteFullBeginTag("li");
                    htmlWriter.Write("{0}: s={1} t={2}",
                                     result[i].Operation.ToString(), result[i].Source, result[i].Target);
                }

                htmlWriter.WriteEndTag("ol");

                htmlWriter.WriteEndTag("body");
                htmlWriter.WriteEndTag("html");

                htmlWriter.Close();
            }
#endif

            return(result);
        }