Exemplo n.º 1
0
        /// <summary>
        /// Returns a dictionary of paired tag indices, or null if none exist.
        /// </summary>
        public static TagPairs FindPairedTags(IList <Core.Tokenization.Token> tokens)
        {
            // TODO this is equivalent to Core.Segment.GetTagPairings() - clean up

            TagPairs result = null;

            for (int stp = 0; stp < tokens.Count; ++stp)
            {
                Core.Tokenization.TagToken st = tokens[stp] as Core.Tokenization.TagToken;
                if (st != null && st.Tag.Type == Core.TagType.Start)
                {
                    // find the end tag, which is supposed to follow
                    int etp = 0;
                    for (etp = stp + 1; etp < tokens.Count; ++etp)
                    {
                        Core.Tokenization.TagToken et = tokens[etp] as Core.Tokenization.TagToken;
                        if (et != null && et.Tag.Type == Core.TagType.End && et.Tag.Anchor == st.Tag.Anchor)
                        {
                            if (result == null)
                            {
                                result = new TagPairs();
                            }
                            result.Add(stp, etp, st.Tag.Anchor);
                            break;
                        }
                    }
                    System.Diagnostics.Debug.Assert(etp < tokens.Count, "End tag not found");
                }
            }

            return(result);
        }
        public static double GetPlaceableSimilarity(Core.Tokenization.Token a, Core.Tokenization.Token b,
                                                    Core.Tokenization.BuiltinRecognizers disabledAutoSubstitutions)
        {
            /*
             * identical type and value - 1.0
             * similar type and value - 0.85
             * identical type - 0.7
             * otherwise - 0.0
             * */

            if (!(a.IsPlaceable && b.IsPlaceable))
            {
                System.Diagnostics.Debug.Assert(false, "Expect placeable types");
                return(0.0d);
            }

            // Can't compare different types
            if (a.Type != b.Type || a.GetType() != b.GetType())
            {
                return(0.0d);
            }

            Core.Tokenization.TagToken ta = a as Core.Tokenization.TagToken;
            Core.Tokenization.TagToken tb = b as Core.Tokenization.TagToken;

            if (ta != null || tb != null)
            {
                if (ta == null || tb == null)
                {
                    return(-1.0d);                    // can't align two placeables of which one isn't a tag
                }
                if (ta.Tag.Type != tb.Tag.Type)
                {
                    return(-1.0d);                    // can't align tags of different types
                }
            }

            if (disabledAutoSubstitutions != Sdl.LanguagePlatform.Core.Tokenization.BuiltinRecognizers.RecognizeNone)
            {
                bool requireEquality = false;

                switch (a.Type)
                {
                case Sdl.LanguagePlatform.Core.Tokenization.TokenType.Abbreviation:
                    requireEquality = (disabledAutoSubstitutions & Sdl.LanguagePlatform.Core.Tokenization.BuiltinRecognizers.RecognizeAcronyms)
                                      != Sdl.LanguagePlatform.Core.Tokenization.BuiltinRecognizers.RecognizeNone;
                    break;

                case Sdl.LanguagePlatform.Core.Tokenization.TokenType.Date:
                    requireEquality = (disabledAutoSubstitutions & Sdl.LanguagePlatform.Core.Tokenization.BuiltinRecognizers.RecognizeDates)
                                      != Sdl.LanguagePlatform.Core.Tokenization.BuiltinRecognizers.RecognizeNone;
                    break;

                case Sdl.LanguagePlatform.Core.Tokenization.TokenType.Time:
                    requireEquality = (disabledAutoSubstitutions & Sdl.LanguagePlatform.Core.Tokenization.BuiltinRecognizers.RecognizeTimes)
                                      != Sdl.LanguagePlatform.Core.Tokenization.BuiltinRecognizers.RecognizeNone;
                    break;

                case Sdl.LanguagePlatform.Core.Tokenization.TokenType.Variable:
                    requireEquality = (disabledAutoSubstitutions & Sdl.LanguagePlatform.Core.Tokenization.BuiltinRecognizers.RecognizeVariables)
                                      != Sdl.LanguagePlatform.Core.Tokenization.BuiltinRecognizers.RecognizeNone;
                    break;

                case Sdl.LanguagePlatform.Core.Tokenization.TokenType.Number:
                    requireEquality = (disabledAutoSubstitutions & Sdl.LanguagePlatform.Core.Tokenization.BuiltinRecognizers.RecognizeNumbers)
                                      != Sdl.LanguagePlatform.Core.Tokenization.BuiltinRecognizers.RecognizeNone;
                    break;

                case Sdl.LanguagePlatform.Core.Tokenization.TokenType.Measurement:
                    requireEquality = (disabledAutoSubstitutions & Sdl.LanguagePlatform.Core.Tokenization.BuiltinRecognizers.RecognizeMeasurements)
                                      != Sdl.LanguagePlatform.Core.Tokenization.BuiltinRecognizers.RecognizeNone;
                    break;

                default:
                    requireEquality = false;
                    break;
                }

                if (requireEquality)
                {
                    if (a.Equals(b))
                    {
                        return(1.0d);
                    }
                    else
                    {
                        return(0.7d);
                    }
                }
            }

            switch (a.GetSimilarity(b))
            {
            case Core.SegmentElement.Similarity.None:
                // in this case the tokens are not similar, but the token types are identical (checked above)
                return(0.7d);

            case Core.SegmentElement.Similarity.IdenticalType:
                return(0.85d);

            case Core.SegmentElement.Similarity.IdenticalValueAndType:
                return(1.0d);

            default:
                return(0.0d);
            }
        }
        public static double GetTokenSimilarity(Core.Tokenization.Token a, Core.Tokenization.Token b,
                                                bool useStringEditDistance,
                                                Core.Tokenization.BuiltinRecognizers disabledAutoSubstitutions)
        {
            /*
             * identical form - 1.0
             * identical case-insensitive - 0.9
             * stem-identical or same placeable type - 0.85
             * identical type - 0.4
             * */

            // TODO consider normalization of token texts, or let the tokenizer store the text in normalized form.

            Core.Tokenization.TagToken ta = a as Core.Tokenization.TagToken;
            Core.Tokenization.TagToken tb = b as Core.Tokenization.TagToken;

            bool aIsTag = (ta != null);
            bool bIsTag = (tb != null);

            if (aIsTag != bIsTag ||
                a.IsWhitespace != b.IsWhitespace ||
                a.IsPunctuation != b.IsPunctuation)
            {
                // comparing a tag with a non-tag results in no-change-allowed similarity (<0)
                // same for whitespace, punctuation
                return(-1.0d);
            }

            if (aIsTag && bIsTag)
            {
                System.Diagnostics.Debug.Assert(ta.Tag != null && tb.Tag != null);
                if (ta.Tag.Type == tb.Tag.Type)
                {
                    // assignable
                    return(0.95d);
                }
                else if ((ta.Tag.Type == Sdl.LanguagePlatform.Core.TagType.Standalone &&
                          tb.Tag.Type == Sdl.LanguagePlatform.Core.TagType.TextPlaceholder)
                         ||
                         (ta.Tag.Type == Sdl.LanguagePlatform.Core.TagType.TextPlaceholder &&
                          tb.Tag.Type == Sdl.LanguagePlatform.Core.TagType.Standalone))
                {
                    // one placeholder, one text placeholder
                    return(0.85d);
                }
                else
                {
                    // not assignable
                    return(-1.0d);
                }
            }

            double malus = 0.0d;
            double sim   = 0.0d;

            if (a.IsPlaceable && b.IsPlaceable)
            {
                return(GetPlaceableSimilarity(a, b, disabledAutoSubstitutions));
            }

            if (a.Text == null || b.Text == null)
            {
                System.Diagnostics.Debug.Assert(false, "Expected non-null token text. Let Oli know if this assertion fires and provide test data.");
                return(0.0d);
            }

            if (a.IsWord != b.IsWord)
            {
                // tokens of different types - reduce similarity accordingly
                // NOTE only checks whether both are words or non-words
                malus = 0.1d;
            }

            if (a.Text.Equals(b.Text, StringComparison.Ordinal))
            {
                sim = 1.0d;
            }
            else if (a.IsWhitespace || a.IsPunctuation)
            {
                // slightly less than the SegmentEditDistanceComputer's move threshold, as
                //  we don't want to move them around
                sim = 0.94d;
            }
            else if (a.Text.Equals(b.Text, StringComparison.OrdinalIgnoreCase))
            {
                // we want to detect moves for such tokens, so:
                sim = 0.95d;                 // the SegmentEditDistanceComputer's move threshold
            }
            else if (a is Core.Tokenization.SimpleToken && b is Core.Tokenization.SimpleToken)
            {
                Core.Tokenization.SimpleToken ast = a as Core.Tokenization.SimpleToken;
                Core.Tokenization.SimpleToken bst = b as Core.Tokenization.SimpleToken;

                if (ast != null && bst != null &&
                    ast.Stem != null && bst.Stem != null &&
                    ast.Stem.Equals(bst.Stem, StringComparison.OrdinalIgnoreCase))
                {
                    sim = 0.85d;
                }
                else
                {
                    sim = useStringEditDistance
                                                ? 0.95d * GetThreshold(GetStringSimilarity(a.Text, b.Text))
                                                : 0.0d;
                }
            }
            else
            {
                // strings are not identical or identical w/ ignore case
                sim = useStringEditDistance
                                        ? 0.95d * GetThreshold(GetStringSimilarity(a.Text, b.Text))
                                        : 0.0d;
            }

            return(Math.Max(0.0d, sim - malus));
        }