예제 #1
0
        /// <summary>
        /// Token-based term lookup: scores how well the tokens of <paramref name="searchSegment"/>
        /// are represented among the tokens of <paramref name="textSegment"/> and collects the
        /// spans of the text tokens that contributed.
        /// </summary>
        /// <param name="searchSegment">Segment holding the term to look for.</param>
        /// <param name="textSegment">Segment that is searched.</param>
        /// <param name="expectContinuousMatch">
        /// When <c>true</c> the character-level LCS score is always computed; when <c>false</c>
        /// it is only computed if the token overlap score is below 100.
        /// </param>
        /// <returns>
        /// A result whose <c>MatchingRanges</c> holds the span of every text token that has a
        /// positive association score with at least one search token and whose <c>Score</c>
        /// combines the token overlap score with the LCS score; <c>null</c> if the search
        /// segment contains no non-whitespace token.
        /// </returns>
        private static TermFinderResult FindTermsWordBased(Core.Segment searchSegment,
                                                           Core.Segment textSegment, bool expectContinuousMatch)
        {
            // compile-time switch for the character-level LCS adjustment of the token score
            const bool useLcsScoreAdjustment = true;

            // association score for each (search token, text token) pair - the maximum
            //  overlap scores, computed using LCS or ED
            int[,] pairScores = ComputeTokenAssociationScores(searchSegment, textSegment);

            int searchTokenCount = searchSegment.Tokens.Count;
            int textTokenCount   = textSegment.Tokens.Count;

            // best association score found for each search token
            int[] bestScores = new int[searchTokenCount];

            TermFinderResult result = new TermFinderResult
            {
                MatchingRanges = new List <Sdl.LanguagePlatform.Core.SegmentRange>()
            };

            // marks the text tokens which are associated with at least one search token
            System.Collections.BitArray matchedTextTokens
                = new System.Collections.BitArray(textTokenCount);

            IList <string> searchParts = new List <string>();
            IList <string> textParts   = new List <string>();

            int nonwhiteSearchTokens = 0;

            for (int searchIdx = 0; searchIdx < searchTokenCount; ++searchIdx)
            {
                if (!searchSegment.Tokens[searchIdx].IsWhitespace)
                {
                    ++nonwhiteSearchTokens;
                    if (useLcsScoreAdjustment)
                    {
                        searchParts.Add(searchSegment.Tokens[searchIdx].Text.ToLowerInvariant());
                    }
                }

                for (int textIdx = 0; textIdx < textTokenCount; ++textIdx)
                {
                    int pairScore = pairScores[searchIdx, textIdx];
                    if (pairScore <= 0)
                    {
                        continue;
                    }

                    // report each text token only once, no matter how many search tokens hit it
                    if (!matchedTextTokens[textIdx])
                    {
                        result.MatchingRanges.Add(textSegment.Tokens[textIdx].Span);
                        matchedTextTokens[textIdx] = true;
                    }

                    if (pairScore > bestScores[searchIdx])
                    {
                        System.Diagnostics.Debug.Assert(pairScore >= 0 && pairScore <= 100);
                        bestScores[searchIdx] = pairScore;
                    }
                }
            }

            if (nonwhiteSearchTokens == 0)
            {
                return(null);
            }

            // sum of the best per-token scores, divided by the number of non-whitespace search tokens
            int tokenOverlapScore = (int)((float)bestScores.Sum() / (float)nonwhiteSearchTokens);

            if (!useLcsScoreAdjustment)
            {
                result.Score = tokenOverlapScore;
                return(result);
            }

            // TODO this won't work really well if the same search token appears
            //  multiple times in the match - the concatenation will include all
            //  occurrences (which will reduce the LCS score) and also increase the
            //  text token count (further reducing the LCS score)

            int relevantTextTokens = 0;

            for (int textIdx = 0; textIdx < textTokenCount; ++textIdx)
            {
                if (!matchedTextTokens[textIdx])
                {
                    continue;
                }

                if (textParts.Count > 0)
                {
                    // an unmatched previous word token between two matched tokens is
                    //  represented by a placeholder in the concatenation
                    int previousWordIdx = GetPreviousWordTokenIndex(textSegment.Tokens, textIdx);
                    if ((previousWordIdx > -1) && (!matchedTextTokens[previousWordIdx]))
                    {
                        const string UnmatchedToken = "#";
                        textParts.Add(UnmatchedToken);
                    }
                }

                ++relevantTextTokens;
                textParts.Add(textSegment.Tokens[textIdx].Text.ToLowerInvariant());
            }

            string srcConcat = string.Join("~", searchParts.ToArray());
            string trgConcat = string.Join("~", textParts.ToArray());

            int lcsOverlapScore = 0;

            bool lcsWanted   = expectContinuousMatch || tokenOverlapScore < 100;
            bool lcsPossible = srcConcat.Length > 0 && trgConcat.Length > 0;

            if (lcsWanted && lcsPossible)
            {
                List <AlignedSubstring> alignment
                    = SequenceAlignmentComputer <char> .ComputeLongestCommonSubsequence(srcConcat.ToCharArray(),
                                                                                        trgConcat.ToCharArray(), 1,
                                                                                        new SimpleCharLSAScoreProvider(), null);

                int lcsOverlap = alignment.Sum(x => x.Length);

                // dice on the token counts: a factor in [0..1] which shrinks as the number of
                //  search tokens and the number of matched text tokens drift apart
                float tokenCountDeltaPenalty = 2.0f * (float)Math.Min(nonwhiteSearchTokens, relevantTextTokens)
                                               / (float)(nonwhiteSearchTokens + relevantTextTokens);

                // another dice, this time on the concatenated strings
                // 2009-08-10, OC: reduce token count delta penalty
                int rawLcsScore = (int)(((75.0f + 25.0f * tokenCountDeltaPenalty)
                                         * 2.0f * (float)lcsOverlap) / (float)(srcConcat.Length + trgConcat.Length));

                // keep the score within [0..100]
                lcsOverlapScore = Math.Max(0, Math.Min(100, rawLcsScore));

                System.Diagnostics.Debug.Assert(lcsOverlapScore >= 0 && lcsOverlapScore <= 100);
            }

            // A full token overlap together with a computed LCS score is treated as a
            //  discontinuous/swapped match - not sure how to penalize
            // TODO work out exact scoring
            result.Score = (tokenOverlapScore == 100 && lcsOverlapScore > 0)
                ? (200 + lcsOverlapScore) / 3
                : Math.Max(tokenOverlapScore, lcsOverlapScore);

            return(result);
        }
예제 #2
0
        /// <summary>
        /// Character-based term lookup: aligns the plain-text form of
        /// <paramref name="searchSegment"/> against the plain-text form of
        /// <paramref name="textSegment"/> and reports the covered text ranges.
        /// This should only be used for far-east languages.
        /// </summary>
        /// <param name="searchSegment">Segment holding the term to look for.</param>
        /// <param name="textSegment">Segment that is searched.</param>
        /// <param name="expectContinuousMatch">Not referenced by this method.</param>
        /// <returns>
        /// A result whose <c>MatchingRanges</c> are the sorted and melted text positions covered
        /// by the alignment and whose <c>Score</c> is the percentage of search characters that
        /// were covered, limited to [0..100]; <c>null</c> if either plain text is empty, if the
        /// alignment yields nothing, or if an aligned substring has differing source and target
        /// lengths.
        /// </returns>
        private static TermFinderResult FindTermsCharBased(Core.Segment searchSegment,
                                                           Core.Segment textSegment, bool expectContinuousMatch)
        {
            // these ranges capture the mapping from a character position in the plain text arrays
            //  to a segment position (run/position pairs)
            List <Core.SegmentPosition> searchSegmentPositions;
            List <Core.SegmentPosition> textSegmentPositions;

            string searchPlain = searchSegment.ToPlain(true, true, out searchSegmentPositions);
            string textPlain   = textSegment.ToPlain(true, true, out textSegmentPositions);

            if (string.IsNullOrEmpty(searchPlain))
            {
                // TODO may need to look into what may cause such an issue:
                System.Diagnostics.Debug.Assert(false, "Let Oli know and provide test data");
                return(null);
            }

            if (string.IsNullOrEmpty(textPlain))
            {
                // nothing to align against, so there cannot be a match
                return(null);
            }

            SubstringAlignmentDisambiguator picker
                = new SubstringAlignmentDisambiguator();

            List <AlignedSubstring> lcs
                = SequenceAlignmentComputer <char> .ComputeCoverage(searchPlain.ToCharArray(),
                                                                    textPlain.ToCharArray(), new CharSubstringScoreProvider(), picker);

            if (lcs == null || lcs.Count == 0)
            {
                return(null);
            }

            // segment positions of all text characters covered by the alignment
            List <Core.SegmentPosition> textPositions = new List <Core.SegmentPosition>();

            for (int subIdx = 0; subIdx < lcs.Count; ++subIdx)
            {
                AlignedSubstring sub = lcs[subIdx];

                if (sub.Source.Length != sub.Target.Length)
                {
                    // NOTE LCSubseq instead of Substring? Check scorer if this fires
                    System.Diagnostics.Debug.Assert(false, "Not supported - let Oli know and provide test data");
                    return(null);
                }

                // map each covered plain-text character back to its position in the text segment
                for (int p = 0; p < sub.Source.Length; ++p)
                {
                    textPositions.Add(textSegmentPositions[sub.Target.Start + p]);
                }
            }

            if (textPositions.Count == 0)
            {
                return(null);
            }

            // the result is only created once the ranges are known - no placeholder list needed
            TermFinderResult result = new TermFinderResult();

            // covered ranges in the text segment:
            result.MatchingRanges = SortAndMelt(textPositions);

            // fraction of the search characters which were found in the text
            // TODO this does not capture adjacency
            float baseScore = (float)textPositions.Count / (float)searchPlain.Length;

#if DEBUG
            bool ok = VerifyRanges(result.MatchingRanges,
                                   textSegment);
            if (!ok)
            {
                System.Diagnostics.Debug.Assert(false, "Range verification failed");
            }
#endif

            // percentage, limited to [0..100]
            result.Score = Math.Max(0, Math.Min(100, (int)(100.0f * baseScore)));

            return(result);
        }