/// <summary>
/// Token-level term search: rates how well the tokens of <paramref name="searchSegment"/>
/// are represented in <paramref name="textSegment"/> and records the spans of all text
/// tokens that have a positive association score with at least one search token.
/// </summary>
/// <param name="searchSegment">Segment holding the term that is looked for.</param>
/// <param name="textSegment">Segment that is searched.</param>
/// <param name="expectContinuousMatch">
/// When true, the LCS-based score is computed even if the token overlap score is already 100,
/// which lowers the final score of such matches to (200 + LCS score) / 3.
/// </param>
/// <returns>
/// The matching ranges and the score, or null if the search segment does not contain
/// any non-whitespace token.
/// </returns>
private static TermFinderResult FindTermsWordBased(Core.Segment searchSegment, Core.Segment textSegment, bool expectContinuousMatch)
{
    // Compile-time switch: refine the plain token overlap score with an LCS score
    // computed on the concatenated token texts.
    const bool useLcsScoreAdjustment = true;

    // Placeholder inserted between two matched text tokens that are separated by
    // an unmatched word token.
    const string UnmatchedToken = "#";

    // Association score for every (search token, text token) pair, using LCS or ED.
    int[,] associationScores = ComputeTokenAssociationScores(searchSegment, textSegment);

    int searchTokenCount = searchSegment.Tokens.Count;
    int textTokenCount = textSegment.Tokens.Count;

    int[] bestScorePerSearchToken = new int[searchTokenCount];
    TermFinderResult result = new TermFinderResult
    {
        MatchingRanges = new List<Sdl.LanguagePlatform.Core.SegmentRange>()
    };
    System.Collections.BitArray textTokenCovered = new System.Collections.BitArray(textTokenCount);
    List<string> searchWords = new List<string>();
    List<string> textWords = new List<string>();
    int nonwhiteSearchTokens = 0;

    for (int s = 0; s < searchTokenCount; ++s)
    {
        var searchToken = searchSegment.Tokens[s];
        if (!searchToken.IsWhitespace)
        {
            ++nonwhiteSearchTokens;
            if (useLcsScoreAdjustment)
            {
                searchWords.Add(searchToken.Text.ToLowerInvariant());
            }
        }

        for (int t = 0; t < textTokenCount; ++t)
        {
            int pairScore = associationScores[s, t];
            if (pairScore <= 0)
            {
                continue;
            }

            // Report each text token at most once, in order of first discovery.
            if (!textTokenCovered[t])
            {
                result.MatchingRanges.Add(textSegment.Tokens[t].Span);
                textTokenCovered[t] = true;
            }

            // Remember the best partner score for this search token.
            if (pairScore > bestScorePerSearchToken[s])
            {
                System.Diagnostics.Debug.Assert(pairScore >= 0 && pairScore <= 100);
                bestScorePerSearchToken[s] = pairScore;
            }
        }
    }

    if (nonwhiteSearchTokens == 0)
    {
        return null;
    }

    // Average of the best per-token scores over the non-whitespace search tokens.
    int tokenOverlapScore = (int)((float)bestScorePerSearchToken.Sum() / (float)nonwhiteSearchTokens);

    if (!useLcsScoreAdjustment)
    {
        result.Score = tokenOverlapScore;
        return result;
    }

    // TODO: repeated occurrences of one search token in the text hurt the score twice:
    // every occurrence ends up in the concatenation (lowering the LCS score) and also
    // raises the text token count (lowering it further).
    int relevantTextTokens = 0;
    for (int t = 0; t < textTokenCount; ++t)
    {
        if (!textTokenCovered[t])
        {
            continue;
        }

        if (textWords.Count > 0)
        {
            int previousWordTokenIndex = GetPreviousWordTokenIndex(textSegment.Tokens, t);
            bool gapBeforeToken = previousWordTokenIndex > -1 && !textTokenCovered[previousWordTokenIndex];
            if (gapBeforeToken)
            {
                textWords.Add(UnmatchedToken);
            }
        }

        ++relevantTextTokens;
        textWords.Add(textSegment.Tokens[t].Text.ToLowerInvariant());
    }

    string srcConcat = string.Join("~", searchWords.ToArray());
    string trgConcat = string.Join("~", textWords.ToArray());

    int lcsOverlapScore = 0;
    bool lcsWanted = expectContinuousMatch || tokenOverlapScore < 100;
    if (lcsWanted && srcConcat.Length > 0 && trgConcat.Length > 0)
    {
        List<AlignedSubstring> commonParts = SequenceAlignmentComputer<char>.ComputeLongestCommonSubsequence(
            srcConcat.ToCharArray(),
            trgConcat.ToCharArray(),
            1,
            new SimpleCharLSAScoreProvider(),
            null);
        int lcsOverlap = commonParts.Sum(part => part.Length);

        // Dice coefficient on the token counts, in [0..1]; drops below 1 when the
        // number of search tokens differs from the number of covered text tokens.
        float tokenCountDeltaPenalty = 2.0f * (float)Math.Min(nonwhiteSearchTokens, relevantTextTokens)
            / (float)(nonwhiteSearchTokens + relevantTextTokens);

        // Dice coefficient on the concatenated strings. The token count term only
        // influences 25 of the 100 points (2009-08-10, OC: reduce token count delta penalty).
        int rawLcsScore = (int)(((75.0f + 25.0f * tokenCountDeltaPenalty) * 2.0f * (float)lcsOverlap)
            / (float)(srcConcat.Length + trgConcat.Length));

        lcsOverlapScore = Math.Max(0, Math.Min(100, rawLcsScore));
        System.Diagnostics.Debug.Assert(lcsOverlapScore >= 0 && lcsOverlapScore <= 100);
    }

    // A token overlap of 100 combined with a computed LCS score indicates a
    // discontinuous/swapped match; how to penalize that is still open.
    // TODO: work out exact scoring.
    result.Score = (tokenOverlapScore == 100 && lcsOverlapScore > 0)
        ? (200 + lcsOverlapScore) / 3
        : Math.Max(tokenOverlapScore, lcsOverlapScore);

    return result;
}
/// <summary>
/// Character-level term search: aligns the plain text of <paramref name="searchSegment"/>
/// with the plain text of <paramref name="textSegment"/> and reports the covered text
/// positions as merged ranges. Original author's note: this should only be used for
/// far-east languages.
/// </summary>
/// <param name="searchSegment">Segment holding the term that is looked for.</param>
/// <param name="textSegment">Segment that is searched.</param>
/// <param name="expectContinuousMatch">Not referenced by this method.</param>
/// <returns>
/// The matching ranges together with a score of 100 * (covered characters / search text length),
/// limited to 0..100; null if the search text is empty, no alignment is found, an aligned
/// pair has different source and target lengths, or no text position was collected.
/// </returns>
private static TermFinderResult FindTermsCharBased(Core.Segment searchSegment, Core.Segment textSegment, bool expectContinuousMatch)
{
    // The position lists map each character index of the plain strings back to a
    // position inside the segment (run/position pairs).
    List<Core.SegmentPosition> searchSegmentPositions;
    List<Core.SegmentPosition> textSegmentPositions;

    string searchPlain = searchSegment.ToPlain(true, true, out searchSegmentPositions);
    string textPlain = textSegment.ToPlain(true, true, out textSegmentPositions);

    if (searchPlain.Length == 0)
    {
        // TODO: find out what can lead to an empty search text.
        System.Diagnostics.Debug.Assert(false, "Let Oli know and provide test data");
        return null;
    }

    char[] searchChars = searchPlain.ToCharArray();
    char[] textChars = textPlain.ToCharArray();

    int searchPlainLength = searchPlain.Length;
    int textPlainLength = textPlain.Length;

    System.Diagnostics.Debug.Assert(searchPlainLength == searchChars.Length);
    System.Diagnostics.Debug.Assert(textPlainLength == textChars.Length);

    SubstringAlignmentDisambiguator picker = new SubstringAlignmentDisambiguator();
    List<AlignedSubstring> alignedParts = SequenceAlignmentComputer<char>.ComputeCoverage(
        searchChars,
        textChars,
        new CharSubstringScoreProvider(),
        picker);

    if (alignedParts == null || alignedParts.Count == 0)
    {
        return null;
    }

    // The empty list is only a placeholder; it is replaced by the merged ranges below.
    TermFinderResult result = new TermFinderResult
    {
        MatchingRanges = new List<Sdl.LanguagePlatform.Core.SegmentRange>()
    };

    List<Core.SegmentPosition> coveredTextPositions = new List<Core.SegmentPosition>();

    foreach (AlignedSubstring part in alignedParts)
    {
        if (part.Source.Length != part.Target.Length)
        {
            // NOTE: LCSubseq instead of Substring? Check the scorer if this fires.
            System.Diagnostics.Debug.Assert(false, "Not supported - let Oli know and provide test data");
            return null;
        }

        // Translate every aligned target character into its segment position.
        for (int offset = 0; offset < part.Source.Length; ++offset)
        {
            coveredTextPositions.Add(textSegmentPositions[part.Target.Start + offset]);
        }
    }

    if (coveredTextPositions.Count == 0)
    {
        return null;
    }

    // Covered ranges in the text segment.
    result.MatchingRanges = SortAndMelt(coveredTextPositions);

    // TODO: this does not capture adjacency.
    float baseScore = (float)coveredTextPositions.Count / (float)searchPlainLength;

#if DEBUG
    System.Diagnostics.Debug.Assert(VerifyRanges(result.MatchingRanges, textSegment), "Range verification failed");
#endif

    // Keep the percentage inside 0..100.
    int score = (int)(100.0f * baseScore);
    if (score < 0)
    {
        score = 0;
    }
    else if (score > 100)
    {
        score = 100;
    }

    result.Score = score;
    return result;
}