コード例 #1
0
        /// <summary>
        /// Builds one <see cref="TokenHashes"/> entry per token, in the same order as
        /// <paramref name="sourceObjects"/>.
        /// </summary>
        /// <remarks>
        /// For each token the entry holds the hash of its text, the hash of its
        /// invariant-lowercased text, and (only for a SimpleToken with a non-null stem)
        /// the hash of the invariant-lowercased stem. Any value that is not available
        /// is stored as 0.
        /// </remarks>
        /// <param name="sourceObjects">Tokens to hash.</param>
        /// <returns>An array with exactly <c>sourceObjects.Count</c> entries.</returns>
        private TokenHashes[] ComputeTokenHashes(IList <Core.Tokenization.Token> sourceObjects)
        {
            TokenHashes[] hashes = new TokenHashes[sourceObjects.Count];

            int index = 0;
            while (index < sourceObjects.Count)
            {
                Core.Tokenization.Token token = sourceObjects[index];
                Core.Tokenization.SimpleToken simple = token as Core.Tokenization.SimpleToken;

                string text = token.Text;
                // Only simple tokens carry a stem; everything else counts as "no stem".
                string stem = (simple != null) ? simple.Stem : null;

                TokenHashes entry = new TokenHashes();

                if (text != null)
                {
                    entry.TextHash = text.GetHashCode();
                    entry.CaseInsensitiveTextHash = text.ToLowerInvariant().GetHashCode();
                }
                else
                {
                    entry.TextHash = 0;
                    entry.CaseInsensitiveTextHash = 0;
                }

                if (stem != null)
                {
                    // Stems are always hashed case-insensitively.
                    entry.StemHash = stem.ToLowerInvariant().GetHashCode();
                }
                else
                {
                    entry.StemHash = 0;
                }

                hashes[index] = entry;
                ++index;
            }

            return hashes;
        }
コード例 #2
0
        /// <summary>
        /// Computes a similarity score for two tokens.
        /// </summary>
        /// <remarks>
        /// Results, in the order they are decided:
        /// <list type="bullet">
        /// <item>-1.0: exactly one token is a tag, or the tokens differ in their
        /// whitespace or punctuation flag, or both are tags whose types cannot be paired.</item>
        /// <item>0.95: both are tags of the same tag type.</item>
        /// <item>0.85: both are tags, one Standalone and one TextPlaceholder.</item>
        /// <item>Both placeable: whatever GetPlaceableSimilarity returns.</item>
        /// <item>0.0: either token has a null text (a debug assertion fires first).</item>
        /// <item>Otherwise a text score reduced by 0.1 if only one token is a word,
        /// never below 0.0. The text score is 1.0 for ordinally identical text; 0.94 for
        /// differing whitespace/punctuation; 0.95 for text identical ignoring case; 0.85 for
        /// SimpleTokens with stems identical ignoring case; else
        /// 0.95 * GetThreshold(GetStringSimilarity(...)) if
        /// <paramref name="useStringEditDistance"/> is set, and 0.0 if not.</item>
        /// </list>
        /// </remarks>
        /// <param name="a">First token; must not be null.</param>
        /// <param name="b">Second token; must not be null.</param>
        /// <param name="useStringEditDistance">Whether texts that match neither literally,
        /// by case, nor by stem are scored via string similarity instead of 0.0.</param>
        /// <param name="disabledAutoSubstitutions">Passed through unchanged to
        /// GetPlaceableSimilarity when both tokens are placeable.</param>
        /// <returns>A negative value if the tokens must not be matched at all, otherwise a
        /// non-negative score; see remarks for the placeable case.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="a"/> or
        /// <paramref name="b"/> is null.</exception>
        public static double GetTokenSimilarity(Core.Tokenization.Token a, Core.Tokenization.Token b,
                                                bool useStringEditDistance,
                                                Core.Tokenization.BuiltinRecognizers disabledAutoSubstitutions)
        {
            if (a == null)
            {
                throw new ArgumentNullException("a");
            }

            if (b == null)
            {
                throw new ArgumentNullException("b");
            }

            // TODO consider normalization of token texts, or let the tokenizer store the text in normalized form.

            Core.Tokenization.TagToken ta = a as Core.Tokenization.TagToken;
            Core.Tokenization.TagToken tb = b as Core.Tokenization.TagToken;

            if ((ta != null) != (tb != null) ||
                a.IsWhitespace != b.IsWhitespace ||
                a.IsPunctuation != b.IsPunctuation)
            {
                // comparing a tag with a non-tag results in no-change-allowed similarity (negative);
                // same for whitespace and punctuation
                return -1.0d;
            }

            if (ta != null && tb != null)
            {
                System.Diagnostics.Debug.Assert(ta.Tag != null && tb.Tag != null);

                Sdl.LanguagePlatform.Core.TagType typeA = ta.Tag.Type;
                Sdl.LanguagePlatform.Core.TagType typeB = tb.Tag.Type;

                if (typeA == typeB)
                {
                    // assignable
                    return 0.95d;
                }

                // one standalone tag paired with one text placeholder, in either order
                bool standaloneWithTextPlaceholder =
                    (typeA == Sdl.LanguagePlatform.Core.TagType.Standalone &&
                     typeB == Sdl.LanguagePlatform.Core.TagType.TextPlaceholder)
                    ||
                    (typeA == Sdl.LanguagePlatform.Core.TagType.TextPlaceholder &&
                     typeB == Sdl.LanguagePlatform.Core.TagType.Standalone);

                // any other combination of tag types is not assignable
                return standaloneWithTextPlaceholder ? 0.85d : -1.0d;
            }

            if (a.IsPlaceable && b.IsPlaceable)
            {
                return GetPlaceableSimilarity(a, b, disabledAutoSubstitutions);
            }

            if (a.Text == null || b.Text == null)
            {
                System.Diagnostics.Debug.Assert(false, "Expected non-null token text. Let Oli know if this assertion fires and provide test data.");
                return 0.0d;
            }

            // tokens of different types - reduce similarity accordingly
            // NOTE only checks whether both are words or non-words
            double malus = (a.IsWord != b.IsWord) ? 0.1d : 0.0d;

            // Null for anything that is not a SimpleToken, which disables the stem comparison.
            Core.Tokenization.SimpleToken ast = a as Core.Tokenization.SimpleToken;
            Core.Tokenization.SimpleToken bst = b as Core.Tokenization.SimpleToken;

            double sim;

            if (a.Text.Equals(b.Text, StringComparison.Ordinal))
            {
                sim = 1.0d;
            }
            else if (a.IsWhitespace || a.IsPunctuation)
            {
                // Checking a alone is enough: b has the same whitespace/punctuation flags here,
                // otherwise we would already have returned above.
                // slightly less than the SegmentEditDistanceComputer's move threshold, as
                //  we don't want to move them around
                sim = 0.94d;
            }
            else if (a.Text.Equals(b.Text, StringComparison.OrdinalIgnoreCase))
            {
                // we want to detect moves for such tokens, so use
                // the SegmentEditDistanceComputer's move threshold
                sim = 0.95d;
            }
            else if (ast != null && bst != null &&
                     ast.Stem != null && bst.Stem != null &&
                     ast.Stem.Equals(bst.Stem, StringComparison.OrdinalIgnoreCase))
            {
                // different surface forms sharing the same stem
                sim = 0.85d;
            }
            else
            {
                // texts differ even when ignoring case and no common stem is available
                sim = useStringEditDistance
                    ? 0.95d * GetThreshold(GetStringSimilarity(a.Text, b.Text))
                    : 0.0d;
            }

            // the malus must not push the result into the negative "not assignable" range
            return Math.Max(0.0d, sim - malus);
        }