/// <summary>
/// Computes a token-by-token association matrix between the search segment and the
/// text segment. Cell [s, t] holds the Dice similarity (as a percentage, 0-100) of
/// the character-level LCS between search token s and text token t; it is 0 when
/// either token is whitespace or a tag, or when the score falls below
/// <c>_DICE_THRESHOLD</c>.
/// </summary>
/// <param name="searchSegment">Segment whose tokens index the matrix rows.</param>
/// <param name="textSegment">Segment whose tokens index the matrix columns.</param>
/// <returns>A [search token count, text token count] matrix of percentage scores.</returns>
private static int[,] ComputeTokenAssociationScores(Core.Segment searchSegment, Core.Segment textSegment)
{
    // NOTE: new int[,] is zero-initialized, so any cell skipped below stays 0.
    int[,] overlaps = new int[searchSegment.Tokens.Count, textSegment.Tokens.Count];

    CaseAwareCharSubsequenceScoreProvider scorer = new CaseAwareCharSubsequenceScoreProvider();

    for (int s = 0; s < searchSegment.Tokens.Count; ++s)
    {
        Core.Tokenization.Token srcToken = searchSegment.Tokens[s];
        // Whitespace and tag tokens never contribute to term matching.
        if (srcToken.IsWhitespace || srcToken is Core.Tokenization.TagToken)
        {
            continue;
        }

        for (int t = 0; t < textSegment.Tokens.Count; ++t)
        {
            Core.Tokenization.Token txtToken = textSegment.Tokens[t];
            if (txtToken.IsWhitespace || txtToken is Core.Tokenization.TagToken)
            {
                continue;
            }

            // TODO relax equality criteria on placeables of the same type
            // TODO does the score include information whether tolower/tobase has
            // been applied?
            List<AlignedSubstring> alignment = SequenceAlignmentComputer<char>.ComputeLongestCommonSubsequence(
                srcToken.Text.ToCharArray(), txtToken.Text.ToCharArray(), 0, scorer, null);
            if (alignment == null || alignment.Count == 0)
            {
                continue;
            }

            // total length of the common character subsequence
            int common = alignment.Sum(x => x.Length);
            if (common == 0)
            {
                continue;
            }

            // Dice coefficient: 2 * |LCS| / (|src| + |txt|), in [0..1]
            // TODO experiment with other scoring methods? Scoring only relative to query?
            float score = 2.0f * common / (float)(srcToken.Text.Length + txtToken.Text.Length);
            if (score >= _DICE_THRESHOLD)
            {
                // store as a percentage of score
                overlaps[s, t] = (int)(score * 100.0f);
                System.Diagnostics.Debug.Assert(overlaps[s, t] >= 0 && overlaps[s, t] <= 100);
            }
        }
    }

    return overlaps;
}
/// <summary>
/// Character-based term finder; per the original note, this should only be used for
/// far-east languages. Aligns the plain-text character sequences of the two segments
/// (via <c>SequenceAlignmentComputer.ComputeCoverage</c>) and reports the covered
/// ranges in the text segment together with a coverage score.
/// </summary>
/// <param name="searchSegment">The segment containing the term(s) to search for.</param>
/// <param name="textSegment">The segment to search in.</param>
/// <param name="expectContinuousMatch">Unused in the character-based implementation.</param>
/// <returns>The matching ranges and a 0-100 score, or null if nothing aligned.</returns>
private static TermFinderResult FindTermsCharBased(Core.Segment searchSegment, Core.Segment textSegment, bool expectContinuousMatch)
{
    // These position lists capture the mapping from a character position in the
    // plain text arrays to a segment position (run/position pairs).
    List<Core.SegmentPosition> searchSegmentPositions;
    List<Core.SegmentPosition> textSegmentPositions;

    string searchPlain = searchSegment.ToPlain(true, true, out searchSegmentPositions);
    string textPlain = textSegment.ToPlain(true, true, out textSegmentPositions);

    if (searchPlain.Length == 0)
    {
        // TODO may need to look into what may cause such an issue:
        System.Diagnostics.Debug.Assert(false, "Let Oli know and provide test data");
        return null;
    }

    char[] searchPlainArray = searchPlain.ToCharArray();
    char[] textPlainArray = textPlain.ToCharArray();

    int searchPlainLength = searchPlain.Length;
    int textPlainLength = textPlain.Length;

    System.Diagnostics.Debug.Assert(searchPlainLength == searchPlainArray.Length);
    System.Diagnostics.Debug.Assert(textPlainLength == textPlainArray.Length);

    SubstringAlignmentDisambiguator picker = new SubstringAlignmentDisambiguator();
    List<AlignedSubstring> lcs = SequenceAlignmentComputer<char>.ComputeCoverage(
        searchPlainArray, textPlainArray, new CharSubstringScoreProvider(), picker);

    if (lcs == null || lcs.Count == 0)
    {
        return null;
    }

    TermFinderResult result = new TermFinderResult();

    // Collect, for every aligned character, the segment position it maps to in the
    // text segment.
    List<Core.SegmentPosition> textPositions = new List<Core.SegmentPosition>();

    for (int subIdx = 0; subIdx < lcs.Count; ++subIdx)
    {
        AlignedSubstring sub = lcs[subIdx];
        if (sub.Source.Length != sub.Target.Length)
        {
            // NOTE LCSubseq instead of Substring? Check scorer if this fires
            System.Diagnostics.Debug.Assert(false, "Not supported - let Oli know and provide test data");
            return null;
        }
        for (int p = 0; p < sub.Source.Length; ++p)
        {
            textPositions.Add(textSegmentPositions[sub.Target.Start + p]);
        }
    }

    if (textPositions.Count == 0)
    {
        return null;
    }

    // covered ranges in the text segment:
    result.MatchingRanges = SortAndMelt(textPositions);

    // fraction of the search characters that were covered
    // TODO this does not capture adjacency
    float baseScore = (float)textPositions.Count / (float)searchPlainLength;

#if DEBUG
    bool ok = VerifyRanges(result.MatchingRanges, textSegment);
    if (!ok)
    {
        System.Diagnostics.Debug.Assert(false, "Range verification failed");
    }
#endif

    // clamp the percentage score to [0..100]
    result.Score = Math.Max(0, Math.Min(100, (int)(100.0f * baseScore)));

    return result;
}
/// <summary>
/// Word-based term finder: scores how well the (non-whitespace) tokens of the search
/// segment are matched by tokens of the text segment, combining per-token Dice
/// overlap scores with an LCS score computed over the concatenated token texts.
/// </summary>
/// <param name="searchSegment">The segment containing the term(s) to search for.</param>
/// <param name="textSegment">The segment to search in.</param>
/// <param name="expectContinuousMatch">If true, the LCS-based adjustment is computed
/// even when the token overlap score is already 100.</param>
/// <returns>The matching ranges and a score, or null if the search segment has no
/// non-whitespace tokens.</returns>
private static TermFinderResult FindTermsWordBased(Core.Segment searchSegment, Core.Segment textSegment, bool expectContinuousMatch)
{
    // compute the maximum overlap scores for each token in the source and each token
    // in the text, using LCS or ED
    const bool useLcsScoreAdjustment = true;

    int[,] overlaps = ComputeTokenAssociationScores(searchSegment, textSegment);
    // maxScores[s] = best overlap score of search token s against any text token
    int[] maxScores = new int[searchSegment.Tokens.Count];

    TermFinderResult result = new TermFinderResult();
    result.MatchingRanges = new List<Sdl.LanguagePlatform.Core.SegmentRange>();

    // marks text tokens that overlap at least one search token
    System.Collections.BitArray coveredTargetTokens = new System.Collections.BitArray(textSegment.Tokens.Count);

    IList<string> srcConcatenatedTokens = new List<string>();
    IList<string> trgConcatenatedTokens = new List<string>();

    int nonwhiteSearchTokens = 0;

    for (int s = 0; s < searchSegment.Tokens.Count; ++s)
    {
        if (!searchSegment.Tokens[s].IsWhitespace)
        {
            ++nonwhiteSearchTokens;
            if (useLcsScoreAdjustment)
            {
                srcConcatenatedTokens.Add(searchSegment.Tokens[s].Text.ToLowerInvariant());
            }
        }
        for (int t = 0; t < textSegment.Tokens.Count; ++t)
        {
            if (overlaps[s, t] > 0)
            {
                // record each covered text token's range exactly once
                if (!coveredTargetTokens[t])
                {
                    result.MatchingRanges.Add(textSegment.Tokens[t].Span);
                    coveredTargetTokens[t] = true;
                }
                if (overlaps[s, t] > maxScores[s])
                {
                    System.Diagnostics.Debug.Assert(overlaps[s, t] >= 0 && overlaps[s, t] <= 100);
                    maxScores[s] = overlaps[s, t];
                }
            }
        }
    }

    if (nonwhiteSearchTokens == 0)
    {
        return(null);
    }

    // average of the per-token best scores over all non-whitespace search tokens
    int tokenOverlapScore = (int)((float)maxScores.Sum() / (float)nonwhiteSearchTokens);

    if (useLcsScoreAdjustment)
    {
        int relevantTextTokens = 0;
        // TODO this won't work really well if the same search token appears
        // multiple times int the match - the concatenation will include all
        // occurrences (which will reduce the LCS score) and also increase the
        // text token count (further reducing the LCS score)
        for (int tokenIndex = 0; tokenIndex < textSegment.Tokens.Count; ++tokenIndex)
        {
            if (coveredTargetTokens[tokenIndex])
            {
                if (trgConcatenatedTokens.Count > 0)
                {
                    // insert a placeholder when the preceding word token was not
                    // covered, so gaps in the match reduce the LCS score below
                    int previousWordTokenIndex = GetPreviousWordTokenIndex(textSegment.Tokens, tokenIndex);
                    if ((previousWordTokenIndex > -1) && (!coveredTargetTokens[previousWordTokenIndex]))
                    {
                        const string UnmatchedToken = "#";
                        trgConcatenatedTokens.Add(UnmatchedToken);
                    }
                }
                ++relevantTextTokens;
                trgConcatenatedTokens.Add(textSegment.Tokens[tokenIndex].Text.ToLowerInvariant());
            }
        }

        string srcConcat = string.Join("~", srcConcatenatedTokens.ToArray());
        string trgConcat = string.Join("~", trgConcatenatedTokens.ToArray());

        int lcsOverlapScore = 0;

        if ((expectContinuousMatch || tokenOverlapScore < 100) && srcConcat.Length > 0 && trgConcat.Length > 0)
        {
            List<AlignedSubstring> lcs = SequenceAlignmentComputer<char>.ComputeLongestCommonSubsequence(srcConcat.ToCharArray(), trgConcat.ToCharArray(), 1, new SimpleCharLSAScoreProvider(), null);
            int lcsOverlap = lcs.Sum(x => x.Length);

            // dice again, this time on the concatenated strings of the tokens, with a
            // penalty if the number of tokens differs [0..1]
            float tokenCountDeltaPenalty = 2.0f * (float)Math.Min(nonwhiteSearchTokens, relevantTextTokens) / (float)(nonwhiteSearchTokens + relevantTextTokens);

            // another dice
            // 2009-08-10, OC: reduce token count delta penalty
            lcsOverlapScore = (int)(((75.0f + 25.0f * tokenCountDeltaPenalty) * 2.0f * (float)lcsOverlap) / (float)(srcConcat.Length + trgConcat.Length));

            if (lcsOverlapScore < 0)
            {
                lcsOverlapScore = 0;
            }
            if (lcsOverlapScore > 100)
            {
                lcsOverlapScore = 100;
            }
            System.Diagnostics.Debug.Assert(lcsOverlapScore >= 0 && lcsOverlapScore <= 100);
        }

        if (tokenOverlapScore == 100 && lcsOverlapScore > 0)
        {
            // discontinuous/swapped match - not sure how to penalize
            // TODO work out exact scoring
            result.Score = (200 + lcsOverlapScore) / 3;
        }
        else
        {
            result.Score = Math.Max(tokenOverlapScore, lcsOverlapScore);
        }
    }
    else
    {
        result.Score = tokenOverlapScore;
    }

    return(result);
}
/// <summary>
/// Computes association scores between each paired source tag and each paired target
/// tag. This should pretty much result in first-come first-serve alignment, but we
/// hopefully get better associations for nested tags: for each source tag, an
/// LCS-based score is computed against each target tag over the token positions up
/// to the tag's start (or end) position.
/// </summary>
/// <param name="similarityMatrix">Token similarity data used by the LCS score provider.</param>
/// <param name="srcPairedTags">Paired tags of the source segment.</param>
/// <param name="trgPairedTags">Paired tags of the target segment.</param>
/// <param name="useEndPositions">If true, align up to the tags' end positions and
/// skip the span-difference penalty; otherwise align up to their start positions.</param>
/// <returns>A [source tag count, target tag count] matrix of scores.</returns>
private static int[,] ComputeTagAssociationScores(SimilarityMatrix similarityMatrix, TagPairs srcPairedTags, TagPairs trgPairedTags, bool useEndPositions)
{
    int[,] scores = new int[srcPairedTags.Count, trgPairedTags.Count];

    TokenIndexLCSScoreProvider scoreProvider = new TokenIndexLCSScoreProvider(similarityMatrix, 0.75, true);

    // The sequences to align are simply the token index ranges of both sides.
    List<int> srcPositions = new List<int>();
    for (int i = 0; i < similarityMatrix.SourceTokens.Count; ++i)
    {
        srcPositions.Add(i);
    }

    List<int> trgPositions = new List<int>();
    for (int i = 0; i < similarityMatrix.TargetTokens.Count; ++i)
    {
        trgPositions.Add(i);
    }

    SequenceAlignmentComputer<int> aligner = new SequenceAlignmentComputer<int>(srcPositions, trgPositions, scoreProvider, null, 1, 1);

    for (int si = srcPairedTags.Count - 1; si >= 0; --si)
    {
        PairedTag sourceTag = srcPairedTags[si];
        int sourceLimit = useEndPositions ? sourceTag.End : sourceTag.Start;

        for (int ti = trgPairedTags.Count - 1; ti >= 0; --ti)
        {
            PairedTag targetTag = trgPairedTags[ti];
            int targetLimit = useEndPositions ? targetTag.End : targetTag.Start;

            List<AlignedSubstring> alignment = aligner.Compute(sourceLimit, targetLimit);
            if (alignment == null || alignment.Count == 0)
            {
                continue;
            }

            System.Diagnostics.Debug.Assert(alignment.Count == 1);

            // The result is the common subsequence length minus items which were
            // deleted or inserted.
            int score = alignment[0].Score - (sourceLimit - alignment[0].Score) - (targetLimit - alignment[0].Score);

            // Penalize large differences in the spanned width, but not if we
            // include the end positions in the LCS.
            int penalty = 0;
            if (!useEndPositions)
            {
                penalty = Math.Abs(GetTagSpan(sourceTag) - GetTagSpan(targetTag)) / 2;
            }

            scores[si, ti] = score - penalty;
        }
    }

    return scores;
}