/// <summary>
/// Builds one <see cref="TokenHashes"/> entry per token, in the same order as
/// <paramref name="sourceObjects"/>.
/// </summary>
/// <remarks>
/// For every token three hash values are recorded:
/// the hash of the token text as-is, the hash of the invariant-lower-cased text,
/// and the hash of the invariant-lower-cased stem (only available when the token
/// is a <c>SimpleToken</c> with a non-null stem). A missing text or stem is
/// recorded as <c>0</c>.
/// </remarks>
/// <param name="sourceObjects">The tokens to hash.</param>
/// <returns>An array with exactly <c>sourceObjects.Count</c> entries.</returns>
private TokenHashes[] ComputeTokenHashes(IList<Core.Tokenization.Token> sourceObjects)
{
    int tokenCount = sourceObjects.Count;
    TokenHashes[] hashes = new TokenHashes[tokenCount];

    for (int index = 0; index < tokenCount; ++index)
    {
        Core.Tokenization.Token token = sourceObjects[index];
        string text = token.Text;

        // Only simple tokens carry a stem; every other token kind gets a stem hash of 0.
        Core.Tokenization.SimpleToken simpleToken = token as Core.Tokenization.SimpleToken;
        string stem = (simpleToken == null) ? null : simpleToken.Stem;

        int textHash = 0;
        int caseInsensitiveTextHash = 0;
        if (text != null)
        {
            textHash = text.GetHashCode();
            caseInsensitiveTextHash = text.ToLowerInvariant().GetHashCode();
        }

        int stemHash = 0;
        if (stem != null)
        {
            stemHash = stem.ToLowerInvariant().GetHashCode();
        }

        hashes[index] = new TokenHashes
        {
            TextHash = textHash,
            CaseInsensitiveTextHash = caseInsensitiveTextHash,
            StemHash = stemHash
        };
    }

    return hashes;
}
/// <summary>
/// Computes a similarity score for two tokens.
/// </summary>
/// <remarks>
/// Scores, in the order the rules are applied:
/// <list type="bullet">
/// <item><description><c>-1.0</c> (no change allowed): exactly one token is a tag, or the
/// tokens disagree on <c>IsWhitespace</c> or <c>IsPunctuation</c>.</description></item>
/// <item><description>Two tags: <c>0.95</c> for equal tag types, <c>0.85</c> when one is
/// <c>Standalone</c> and the other <c>TextPlaceholder</c>, otherwise <c>-1.0</c>.</description></item>
/// <item><description>Two placeables: whatever <c>GetPlaceableSimilarity</c> returns.</description></item>
/// <item><description>A token with null text: <c>0.0</c> (and a debug assertion fires).</description></item>
/// <item><description>Otherwise a base score is chosen: <c>1.0</c> for ordinal-equal text;
/// <c>0.94</c> for differing whitespace/punctuation; <c>0.95</c> for text equal ignoring case;
/// <c>0.85</c> for two simple tokens whose stems are equal ignoring case; else
/// <c>0.95 * GetThreshold(GetStringSimilarity(...))</c> when
/// <paramref name="useStringEditDistance"/> is set, or <c>0.0</c> when it is not.
/// If the tokens disagree on <c>IsWord</c>, <c>0.1</c> is subtracted; the result is never
/// below <c>0.0</c>.</description></item>
/// </list>
/// </remarks>
/// <param name="a">First token.</param>
/// <param name="b">Second token.</param>
/// <param name="useStringEditDistance">Whether non-matching texts are scored by string
/// similarity instead of being scored <c>0.0</c>.</param>
/// <param name="disabledAutoSubstitutions">Passed through unchanged to
/// <c>GetPlaceableSimilarity</c>.</param>
/// <returns>The similarity score; negative means the tokens must not be matched.</returns>
public static double GetTokenSimilarity(Core.Tokenization.Token a, Core.Tokenization.Token b, bool useStringEditDistance, Core.Tokenization.BuiltinRecognizers disabledAutoSubstitutions)
{
    // TODO consider normalization of token texts, or let the tokenizer store the text in normalized form.

    Core.Tokenization.TagToken tagA = a as Core.Tokenization.TagToken;
    Core.Tokenization.TagToken tagB = b as Core.Tokenization.TagToken;

    // A tag never pairs with a non-tag; likewise whitespace and punctuation only
    // pair with their own kind. The tag test comes first and short-circuits.
    bool sameCategory = (tagA != null) == (tagB != null)
        && a.IsWhitespace == b.IsWhitespace
        && a.IsPunctuation == b.IsPunctuation;
    if (!sameCategory)
    {
        return -1.0d;
    }

    // After the guard above, "a is a tag" implies "b is a tag" as well.
    if (tagA != null)
    {
        System.Diagnostics.Debug.Assert(tagA.Tag != null && tagB.Tag != null);

        var typeA = tagA.Tag.Type;
        var typeB = tagB.Tag.Type;

        if (typeA == typeB)
        {
            // assignable
            return 0.95d;
        }

        // one standalone placeholder, one text placeholder (in either order)
        bool standaloneWithTextPlaceholder =
            (typeA == Sdl.LanguagePlatform.Core.TagType.Standalone && typeB == Sdl.LanguagePlatform.Core.TagType.TextPlaceholder)
            || (typeA == Sdl.LanguagePlatform.Core.TagType.TextPlaceholder && typeB == Sdl.LanguagePlatform.Core.TagType.Standalone);

        // anything else is not assignable
        return standaloneWithTextPlaceholder ? 0.85d : -1.0d;
    }

    if (a.IsPlaceable && b.IsPlaceable)
    {
        return GetPlaceableSimilarity(a, b, disabledAutoSubstitutions);
    }

    if (a.Text == null || b.Text == null)
    {
        System.Diagnostics.Debug.Assert(false, "Expected non-null token text. Let Oli know if this assertion fires and provide test data.");
        return 0.0d;
    }

    // Tokens of different types get a reduced score.
    // NOTE only checks whether both are words or non-words.
    double penalty = (a.IsWord != b.IsWord) ? 0.1d : 0.0d;

    string textA = a.Text;
    string textB = b.Text;
    double score;

    if (textA.Equals(textB, StringComparison.Ordinal))
    {
        score = 1.0d;
    }
    else if (a.IsWhitespace || a.IsPunctuation)
    {
        // slightly less than the SegmentEditDistanceComputer's move threshold, as
        // we don't want to move them around
        score = 0.94d;
    }
    else if (textA.Equals(textB, StringComparison.OrdinalIgnoreCase))
    {
        // we want to detect moves for such tokens, so use
        // the SegmentEditDistanceComputer's move threshold
        score = 0.95d;
    }
    else
    {
        Core.Tokenization.SimpleToken simpleA = a as Core.Tokenization.SimpleToken;
        Core.Tokenization.SimpleToken simpleB = b as Core.Tokenization.SimpleToken;

        bool stemsMatch = simpleA != null
            && simpleB != null
            && simpleA.Stem != null
            && simpleB.Stem != null
            && simpleA.Stem.Equals(simpleB.Stem, StringComparison.OrdinalIgnoreCase);

        if (stemsMatch)
        {
            score = 0.85d;
        }
        else if (useStringEditDistance)
        {
            // texts differ even when ignoring case and stems: fall back to string similarity
            score = 0.95d * GetThreshold(GetStringSimilarity(textA, textB));
        }
        else
        {
            score = 0.0d;
        }
    }

    return Math.Max(0.0d, score - penalty);
}