/// <summary>
/// Scores a candidate alignment between the source token at index
/// <paramref name="a"/> and the target token at index <paramref name="b"/>.
/// </summary>
/// <param name="a">Index of the source token in the similarity matrix.</param>
/// <param name="b">Index of the target token in the similarity matrix.</param>
/// <returns>
/// 1 when the similarity matrix entry for (a, b) is greater than or equal to the
/// threshold; otherwise -100000.
/// </returns>
public int GetAlignScore(int a, int b)
{
    // The tokens themselves do not take part in the score. The lookups are kept so
    // the token indexers are still invoked with the same indices as before.
    Core.Tokenization.Token sourceToken = _SimMatrix.SourceTokens[a];
    Core.Tokenization.Token targetToken = _SimMatrix.TargetTokens[b];

    double similarity = _SimMatrix[a, b];
    if (similarity >= _Threshold)
    {
        return 1;
    }

    // Large negative score for pairs below the threshold.
    return -100000;
}
/// <summary>
/// Builds one <c>TokenHashes</c> entry per token in <paramref name="sourceObjects"/>,
/// in the same order.
/// </summary>
/// <param name="sourceObjects">The tokens to hash.</param>
/// <returns>
/// An array with the same length as <paramref name="sourceObjects"/>. For each token:
/// <c>TextHash</c> is the hash code of its text, <c>CaseInsensitiveTextHash</c> is the
/// hash code of its invariant-lowercased text (both 0 when the text is null), and
/// <c>StemHash</c> is the hash code of the invariant-lowercased stem (0 when the token
/// is not a <c>SimpleToken</c> or has no stem).
/// </returns>
private TokenHashes[] ComputeTokenHashes(IList<Core.Tokenization.Token> sourceObjects)
{
    int count = sourceObjects.Count;
    TokenHashes[] hashes = new TokenHashes[count];

    for (int index = 0; index < count; ++index)
    {
        Core.Tokenization.Token token = sourceObjects[index];
        Core.Tokenization.SimpleToken simple = token as Core.Tokenization.SimpleToken;

        TokenHashes entry = new TokenHashes();

        string text = token.Text;
        if (text == null)
        {
            entry.TextHash = 0;
            entry.CaseInsensitiveTextHash = 0;
        }
        else
        {
            entry.TextHash = text.GetHashCode();
            entry.CaseInsensitiveTextHash = text.ToLowerInvariant().GetHashCode();
        }

        // Only simple tokens carry a stem; everything else hashes to 0.
        if (simple != null && simple.Stem != null)
        {
            entry.StemHash = simple.Stem.ToLowerInvariant().GetHashCode();
        }
        else
        {
            entry.StemHash = 0;
        }

        hashes[index] = entry;
    }

    return hashes;
}
/// <summary>
/// Computes a similarity score for two placeable tokens.
/// </summary>
/// <remarks>
/// Possible results:
/// <list type="bullet">
/// <item><description>1.0 - identical type and value</description></item>
/// <item><description>0.85 - <c>GetSimilarity</c> reports <c>IdenticalType</c></description></item>
/// <item><description>0.7 - same token type but no similarity, or values differ while
/// the matching auto-substitution is disabled</description></item>
/// <item><description>0.0 - a token is not placeable, the token types or runtime types
/// differ, or the similarity value is not one of the handled cases</description></item>
/// <item><description>-1.0 - tag tokens that cannot be aligned with each other</description></item>
/// </list>
/// </remarks>
/// <param name="a">The first token; expected to be placeable.</param>
/// <param name="b">The second token; expected to be placeable.</param>
/// <param name="disabledAutoSubstitutions">
/// Recognizer flags whose auto-substitution is disabled. When the flag matching the
/// token type is set, the tokens must be equal to score 1.0 and otherwise score 0.7.
/// </param>
/// <returns>The similarity score as listed in the remarks.</returns>
public static double GetPlaceableSimilarity(Core.Tokenization.Token a, Core.Tokenization.Token b, Core.Tokenization.BuiltinRecognizers disabledAutoSubstitutions)
{
    if (!a.IsPlaceable || !b.IsPlaceable)
    {
        System.Diagnostics.Debug.Assert(false, "Expect placeable types");
        return 0.0d;
    }

    // Tokens of different token types or different runtime types can't be compared.
    if (a.Type != b.Type || a.GetType() != b.GetType())
    {
        return 0.0d;
    }

    Core.Tokenization.TagToken tagA = a as Core.Tokenization.TagToken;
    Core.Tokenization.TagToken tagB = b as Core.Tokenization.TagToken;
    if (tagA != null || tagB != null)
    {
        // Can't align two placeables of which one isn't a tag,
        // and can't align tags of different tag types.
        if (tagA == null || tagB == null || tagA.Tag.Type != tagB.Tag.Type)
        {
            return -1.0d;
        }
    }

    if (disabledAutoSubstitutions != Core.Tokenization.BuiltinRecognizers.RecognizeNone)
    {
        // Pick the recognizer flag that governs this token type, if there is one.
        Core.Tokenization.BuiltinRecognizers governing = Core.Tokenization.BuiltinRecognizers.RecognizeNone;
        bool hasGoverningRecognizer = true;

        switch (a.Type)
        {
            case Core.Tokenization.TokenType.Abbreviation:
                governing = Core.Tokenization.BuiltinRecognizers.RecognizeAcronyms;
                break;
            case Core.Tokenization.TokenType.Date:
                governing = Core.Tokenization.BuiltinRecognizers.RecognizeDates;
                break;
            case Core.Tokenization.TokenType.Time:
                governing = Core.Tokenization.BuiltinRecognizers.RecognizeTimes;
                break;
            case Core.Tokenization.TokenType.Variable:
                governing = Core.Tokenization.BuiltinRecognizers.RecognizeVariables;
                break;
            case Core.Tokenization.TokenType.Number:
                governing = Core.Tokenization.BuiltinRecognizers.RecognizeNumbers;
                break;
            case Core.Tokenization.TokenType.Measurement:
                governing = Core.Tokenization.BuiltinRecognizers.RecognizeMeasurements;
                break;
            default:
                hasGoverningRecognizer = false;
                break;
        }

        bool requireEquality = hasGoverningRecognizer
            && (disabledAutoSubstitutions & governing) != Core.Tokenization.BuiltinRecognizers.RecognizeNone;

        if (requireEquality)
        {
            // Substitution is disabled for this type: only exact equality is a full match.
            return a.Equals(b) ? 1.0d : 0.7d;
        }
    }

    switch (a.GetSimilarity(b))
    {
        case Core.SegmentElement.Similarity.IdenticalValueAndType:
            return 1.0d;
        case Core.SegmentElement.Similarity.IdenticalType:
            return 0.85d;
        case Core.SegmentElement.Similarity.None:
            // The tokens are not similar, but their token types are identical (checked above).
            return 0.7d;
        default:
            return 0.0d;
    }
}
/// <summary>
/// Computes a similarity score for two tokens.
/// </summary>
/// <remarks>
/// Results, in the order they are decided:
/// <list type="bullet">
/// <item><description>-1.0 - exactly one token is a tag, or the whitespace or
/// punctuation flags differ (negative means no change is allowed)</description></item>
/// <item><description>two tags: 0.95 for the same tag type, 0.85 for a
/// Standalone/TextPlaceholder pair, -1.0 otherwise</description></item>
/// <item><description>two placeables: the result of <c>GetPlaceableSimilarity</c></description></item>
/// <item><description>0.0 - either token text is null</description></item>
/// <item><description>otherwise a text-based score (1.0 identical text, 0.94 differing
/// whitespace/punctuation, 0.95 identical ignoring case, 0.85 identical stem, or an
/// optional edit-distance based value), reduced by 0.1 when only one of the tokens is
/// a word, and never below 0.0</description></item>
/// </list>
/// </remarks>
/// <param name="a">The first token.</param>
/// <param name="b">The second token.</param>
/// <param name="useStringEditDistance">
/// When true, texts that match neither exactly, case-insensitively nor by stem are
/// scored as 0.95 * GetThreshold(GetStringSimilarity(...)); when false they score 0.0.
/// </param>
/// <param name="disabledAutoSubstitutions">Passed through to <c>GetPlaceableSimilarity</c>.</param>
/// <returns>The similarity score as listed in the remarks.</returns>
public static double GetTokenSimilarity(Core.Tokenization.Token a, Core.Tokenization.Token b, bool useStringEditDistance, Core.Tokenization.BuiltinRecognizers disabledAutoSubstitutions)
{
    // TODO consider normalization of token texts, or let the tokenizer store the text in normalized form.

    Core.Tokenization.TagToken tagA = a as Core.Tokenization.TagToken;
    Core.Tokenization.TagToken tagB = b as Core.Tokenization.TagToken;

    // Comparing a tag with a non-tag results in no-change-allowed similarity (below zero);
    // the same goes for whitespace and punctuation mismatches.
    if ((tagA != null) != (tagB != null)
        || a.IsWhitespace != b.IsWhitespace
        || a.IsPunctuation != b.IsPunctuation)
    {
        return -1.0d;
    }

    // After the check above either both tokens are tags or neither is.
    if (tagA != null && tagB != null)
    {
        System.Diagnostics.Debug.Assert(tagA.Tag != null && tagB.Tag != null);

        if (tagA.Tag.Type == tagB.Tag.Type)
        {
            // assignable
            return 0.95d;
        }

        bool standaloneWithTextPlaceholder =
            (tagA.Tag.Type == Core.TagType.Standalone && tagB.Tag.Type == Core.TagType.TextPlaceholder)
            || (tagA.Tag.Type == Core.TagType.TextPlaceholder && tagB.Tag.Type == Core.TagType.Standalone);

        // One placeholder and one text placeholder are still a reasonable pair;
        // any other combination is not assignable.
        return standaloneWithTextPlaceholder ? 0.85d : -1.0d;
    }

    if (a.IsPlaceable && b.IsPlaceable)
    {
        return GetPlaceableSimilarity(a, b, disabledAutoSubstitutions);
    }

    if (a.Text == null || b.Text == null)
    {
        System.Diagnostics.Debug.Assert(false, "Expected non-null token text. Let Oli know if this assertion fires and provide test data.");
        return 0.0d;
    }

    // Tokens of different kinds get a reduced similarity.
    // NOTE only checks whether both are words or both are non-words.
    double penalty = 0.0d;
    if (a.IsWord != b.IsWord)
    {
        penalty = 0.1d;
    }

    double score;
    if (a.Text.Equals(b.Text, StringComparison.Ordinal))
    {
        score = 1.0d;
    }
    else if (a.IsWhitespace || a.IsPunctuation)
    {
        // Slightly less than the SegmentEditDistanceComputer's move threshold, as
        // we don't want to move them around.
        score = 0.94d;
    }
    else if (a.Text.Equals(b.Text, StringComparison.OrdinalIgnoreCase))
    {
        // We want to detect moves for such tokens, so use the
        // SegmentEditDistanceComputer's move threshold.
        score = 0.95d;
    }
    else
    {
        Core.Tokenization.SimpleToken simpleA = a as Core.Tokenization.SimpleToken;
        Core.Tokenization.SimpleToken simpleB = b as Core.Tokenization.SimpleToken;

        bool sameStem = simpleA != null
            && simpleB != null
            && simpleA.Stem != null
            && simpleB.Stem != null
            && simpleA.Stem.Equals(simpleB.Stem, StringComparison.OrdinalIgnoreCase);

        if (sameStem)
        {
            score = 0.85d;
        }
        else if (useStringEditDistance)
        {
            // Texts are neither identical nor identical when ignoring case.
            score = 0.95d * GetThreshold(GetStringSimilarity(a.Text, b.Text));
        }
        else
        {
            score = 0.0d;
        }
    }

    return Math.Max(0.0d, score - penalty);
}