Пример #1
0
        /// <summary>
        /// Builds a matrix of pairwise association scores between the tokens of the search
        /// segment (first dimension) and the tokens of the text segment (second dimension).
        /// </summary>
        /// <remarks>
        /// Whitespace tokens and tag tokens on either side are skipped, so their cells keep the
        /// array default of 0. For every remaining pair, the token texts are aligned with a
        /// longest-common-subsequence computation and the summed length of the aligned
        /// substrings is turned into a Dice coefficient. Pairs whose coefficient is below
        /// <c>_DICE_THRESHOLD</c> stay at 0; all others are stored as a percentage (0..100).
        /// </remarks>
        /// <param name="searchSegment">Segment whose tokens index the first dimension of the result.</param>
        /// <param name="textSegment">Segment whose tokens index the second dimension of the result.</param>
        /// <returns>The score matrix, sized [search token count, text token count].</returns>
        private static int[,] ComputeTokenAssociationScores(Core.Segment searchSegment, Core.Segment textSegment)
        {
            // a freshly allocated int array is zero-filled, so skipped cells are already 0
            int[,] scores = new int[searchSegment.Tokens.Count, textSegment.Tokens.Count];

            CaseAwareCharSubsequenceScoreProvider charScorer = new CaseAwareCharSubsequenceScoreProvider();

            for (int row = 0; row < searchSegment.Tokens.Count; ++row)
            {
                Core.Tokenization.Token queryToken = searchSegment.Tokens[row];

                bool queryIsScorable = !queryToken.IsWhitespace
                                       && !(queryToken is Core.Tokenization.TagToken);
                if (!queryIsScorable)
                {
                    continue;
                }

                for (int col = 0; col < textSegment.Tokens.Count; ++col)
                {
                    Core.Tokenization.Token candidateToken = textSegment.Tokens[col];

                    bool candidateIsScorable = !candidateToken.IsWhitespace
                                               && !(candidateToken is Core.Tokenization.TagToken);
                    if (!candidateIsScorable)
                    {
                        continue;
                    }

                    // TODO relax equality criteria on placeables of the same type

                    // TODO does the score include information whether tolower/tobase has
                    //  been applied?

                    List<AlignedSubstring> aligned = SequenceAlignmentComputer<char>.ComputeLongestCommonSubsequence(
                        queryToken.Text.ToCharArray(),
                        candidateToken.Text.ToCharArray(),
                        0,
                        charScorer,
                        null);

                    // total number of aligned characters; nothing aligned means score 0
                    int sharedLength = (aligned == null || aligned.Count == 0)
                        ? 0
                        : aligned.Sum(piece => piece.Length);
                    if (sharedLength == 0)
                    {
                        continue;
                    }

                    // Dice coefficient: 2 * |common| / (|a| + |b|)
                    // TODO experiment with other scoring methods? Scoring only relative to query?
                    float dice = 2.0f * sharedLength / (float)(queryToken.Text.Length + candidateToken.Text.Length);
                    if (dice >= _DICE_THRESHOLD)
                    {
                        // store the coefficient as a percentage
                        scores[row, col] = (int)(dice * 100.0f);
                        System.Diagnostics.Debug.Assert(scores[row, col] >= 0 && scores[row, col] <= 100);
                    }
                }
            }

            return scores;
        }
Пример #2
0
        /// <summary>
        /// Character-based term search: aligns the plain text of the search segment against
        /// the plain text of the text segment and reports which positions of the text segment
        /// are covered by the alignment.
        /// </summary>
        /// <param name="searchSegment">Segment containing the term to look for.</param>
        /// <param name="textSegment">Segment in which the term is searched.</param>
        /// <param name="expectContinuousMatch">Not referenced by this method.</param>
        /// <returns>
        /// A result holding the covered ranges of the text segment and a score in 0..100
        /// (covered positions relative to the search plain-text length), or <c>null</c> when
        /// the search plain text is empty, the alignment yields nothing, an aligned substring
        /// has differing source and target lengths, or no text position was collected.
        /// </returns>
        private static TermFinderResult FindTermsCharBased(Core.Segment searchSegment,
                                                           Core.Segment textSegment, bool expectContinuousMatch)
        {
            // This should only be used for far-east languages

            // these lists capture the mapping from a character position in the plain text arrays
            //  to a segment position (run/position pairs)
            List<Core.SegmentPosition> searchPositionMap;
            List<Core.SegmentPosition> textPositionMap;

            string queryPlain = searchSegment.ToPlain(true, true, out searchPositionMap);
            string bodyPlain = textSegment.ToPlain(true, true, out textPositionMap);

            if (queryPlain.Length == 0)
            {
                // TODO may need to look into what may cause such an issue:
                System.Diagnostics.Debug.Assert(false, "Let Oli know and provide test data");
                return null;
            }

            char[] queryChars = queryPlain.ToCharArray();
            char[] bodyChars = bodyPlain.ToCharArray();

            System.Diagnostics.Debug.Assert(queryPlain.Length == queryChars.Length);
            System.Diagnostics.Debug.Assert(bodyPlain.Length == bodyChars.Length);

            SubstringAlignmentDisambiguator disambiguator = new SubstringAlignmentDisambiguator();
            CharSubstringScoreProvider substringScorer = new CharSubstringScoreProvider();

            List<AlignedSubstring> coverage = SequenceAlignmentComputer<char>.ComputeCoverage(
                queryChars, bodyChars, substringScorer, disambiguator);

            if (coverage == null || coverage.Count == 0)
            {
                return null;
            }

            TermFinderResult result = new TermFinderResult();
            result.MatchingRanges = new List<Sdl.LanguagePlatform.Core.SegmentRange>();

            // segment positions in the text segment touched by any aligned substring
            List<Core.SegmentPosition> coveredPositions = new List<Core.SegmentPosition>();

            foreach (AlignedSubstring piece in coverage)
            {
                if (piece.Source.Length != piece.Target.Length)
                {
                    // NOTE LCSubseq instead of Substring? Check scorer if this fires
                    System.Diagnostics.Debug.Assert(false, "Not supported - let Oli know and provide test data");
                    return null;
                }

                int pieceLength = piece.Source.Length;
                int targetStart = piece.Target.Start;
                for (int offset = 0; offset < pieceLength; ++offset)
                {
                    coveredPositions.Add(textPositionMap[targetStart + offset]);
                }
            }

            if (coveredPositions.Count == 0)
            {
                return null;
            }

            // covered ranges in the text segment:
            result.MatchingRanges = SortAndMelt(coveredPositions);

            // TODO this does not capture adjacency
            float coverageRatio = (float)coveredPositions.Count / (float)queryPlain.Length;

#if DEBUG
            System.Diagnostics.Debug.Assert(VerifyRanges(result.MatchingRanges, textSegment),
                                            "Range verification failed");
#endif

            // convert to a percentage and clamp it to 0..100
            int percent = (int)(100.0f * coverageRatio);
            if (percent < 0)
            {
                percent = 0;
            }
            else if (percent > 100)
            {
                percent = 100;
            }
            result.Score = percent;

            return result;
        }
Пример #3
0
        /// <summary>
        /// Word-based term search: scores how well the tokens of the search segment are
        /// matched by tokens of the text segment and reports the spans of the matching
        /// text tokens.
        /// </summary>
        /// <remarks>
        /// The token overlap score is the sum of the best per-token association scores
        /// (see <c>ComputeTokenAssociationScores</c>) divided by the number of non-whitespace
        /// search tokens. When the LCS adjustment is enabled, a second score is derived from a
        /// character-level LCS over the "~"-joined, lower-cased search tokens and the
        /// "~"-joined, lower-cased covered text tokens (with "#" inserted where uncovered word
        /// tokens separate covered ones), and both scores are combined into the final score.
        /// </remarks>
        /// <param name="searchSegment">Segment containing the term to look for.</param>
        /// <param name="textSegment">Segment in which the term is searched.</param>
        /// <param name="expectContinuousMatch">
        /// When true, the LCS-based score is computed even if the token overlap score is
        /// already 100.
        /// </param>
        /// <returns>
        /// The result with matching ranges and score, or <c>null</c> when the search segment
        /// contains no non-whitespace token.
        /// </returns>
        private static TermFinderResult FindTermsWordBased(Core.Segment searchSegment,
                                                           Core.Segment textSegment, bool expectContinuousMatch)
        {
            // compute the maximum overlap scores for each token in the source and each token in the text,
            //  using LCS or ED

            // compile-time switch for the LCS-based score adjustment below
            const bool useLcsScoreAdjustment = true;

            int[,] overlaps = ComputeTokenAssociationScores(searchSegment, textSegment);

            // best association score found for each search token
            int[] maxScores = new int[searchSegment.Tokens.Count];

            TermFinderResult result = new TermFinderResult();

            result.MatchingRanges = new List<Sdl.LanguagePlatform.Core.SegmentRange>();

            // marks the text tokens which are associated with at least one search token
            System.Collections.BitArray coveredTargetTokens
                = new System.Collections.BitArray(textSegment.Tokens.Count);

            IList<string> srcConcatenatedTokens = new List<string>();
            IList<string> trgConcatenatedTokens = new List<string>();

            int nonwhiteSearchTokens = 0;

            for (int s = 0; s < searchSegment.Tokens.Count; ++s)
            {
                if (!searchSegment.Tokens[s].IsWhitespace)
                {
                    ++nonwhiteSearchTokens;
                    if (useLcsScoreAdjustment)
                    {
                        srcConcatenatedTokens.Add(searchSegment.Tokens[s].Text.ToLowerInvariant());
                    }
                }

                for (int t = 0; t < textSegment.Tokens.Count; ++t)
                {
                    if (overlaps[s, t] > 0)
                    {
                        // report each matching text token only once
                        if (!coveredTargetTokens[t])
                        {
                            result.MatchingRanges.Add(textSegment.Tokens[t].Span);
                            coveredTargetTokens[t] = true;
                        }

                        if (overlaps[s, t] > maxScores[s])
                        {
                            System.Diagnostics.Debug.Assert(overlaps[s, t] >= 0 && overlaps[s, t] <= 100);
                            maxScores[s] = overlaps[s, t];
                        }
                    }
                }
            }

            if (nonwhiteSearchTokens == 0)
            {
                return null;
            }

            // average of the best per-token scores over all non-whitespace search tokens
            int tokenOverlapScore = (int)((float)maxScores.Sum() / (float)nonwhiteSearchTokens);

            if (useLcsScoreAdjustment)
            {
                int relevantTextTokens = 0;

                // TODO this won't work really well if the same search token appears
                //  multiple times in the match - the concatenation will include all
                //  occurrences (which will reduce the LCS score) and also increase the
                //  text token count (further reducing the LCS score)

                for (int tokenIndex = 0; tokenIndex < textSegment.Tokens.Count; ++tokenIndex)
                {
                    if (coveredTargetTokens[tokenIndex])
                    {
                        if (trgConcatenatedTokens.Count > 0)
                        {
                            // insert a placeholder when the preceding word token was not matched,
                            //  so that gaps in the match show up in the concatenated string
                            int previousWordTokenIndex = GetPreviousWordTokenIndex(textSegment.Tokens, tokenIndex);
                            if ((previousWordTokenIndex > -1) && (!coveredTargetTokens[previousWordTokenIndex]))
                            {
                                const string UnmatchedToken = "#";
                                trgConcatenatedTokens.Add(UnmatchedToken);
                            }
                        }

                        ++relevantTextTokens;
                        trgConcatenatedTokens.Add(textSegment.Tokens[tokenIndex].Text.ToLowerInvariant());
                    }
                }

                string srcConcat = string.Join("~", srcConcatenatedTokens.ToArray());
                string trgConcat = string.Join("~", trgConcatenatedTokens.ToArray());

                int lcsOverlapScore = 0;

                if ((expectContinuousMatch || tokenOverlapScore < 100) &&
                    srcConcat.Length > 0 && trgConcat.Length > 0)
                {
                    List<AlignedSubstring> lcs = SequenceAlignmentComputer<char>.ComputeLongestCommonSubsequence(
                        srcConcat.ToCharArray(),
                        trgConcat.ToCharArray(),
                        1,
                        new SimpleCharLSAScoreProvider(),
                        null);

                    // The aligner's result is null-checked in ComputeTokenAssociationScores as well;
                    //  Enumerable.Sum throws on a null source, so treat "no alignment" as zero overlap.
                    int lcsOverlap = (lcs == null) ? 0 : lcs.Sum(x => x.Length);

                    // dice again, this time on the concatenated strings of the tokens, with a
                    //  penalty if the number of tokens differs [0..1]
                    float tokenCountDeltaPenalty = 2.0f * (float)Math.Min(nonwhiteSearchTokens, relevantTextTokens)
                                                   / (float)(nonwhiteSearchTokens + relevantTextTokens);

                    // another dice
                    // 2009-08-10, OC: reduce token count delta penalty
                    lcsOverlapScore = (int)(((75.0f + 25.0f * tokenCountDeltaPenalty)
                                             * 2.0f * (float)lcsOverlap) / (float)(srcConcat.Length + trgConcat.Length));
                    if (lcsOverlapScore < 0)
                    {
                        lcsOverlapScore = 0;
                    }
                    if (lcsOverlapScore > 100)
                    {
                        lcsOverlapScore = 100;
                    }

                    System.Diagnostics.Debug.Assert(lcsOverlapScore >= 0 && lcsOverlapScore <= 100);
                }

                if (tokenOverlapScore == 100 && lcsOverlapScore > 0)
                {
                    // discontinuous/swapped match - not sure how to penalize
                    // TODO work out exact scoring
                    result.Score = (200 + lcsOverlapScore) / 3;
                }
                else
                {
                    result.Score = Math.Max(tokenOverlapScore, lcsOverlapScore);
                }
            }
            else
            {
                result.Score = tokenOverlapScore;
            }

            return result;
        }
Пример #4
0
        /// <summary>
        /// Computes an association score for every combination of a source paired tag and a
        /// target paired tag, based on an LCS alignment of the token index sequences up to the
        /// respective tag position.
        /// </summary>
        /// <param name="similarityMatrix">
        /// Supplies the source and target token lists and is handed to the LCS score provider.
        /// </param>
        /// <param name="srcPairedTags">Paired tags of the source side (first dimension of the result).</param>
        /// <param name="trgPairedTags">Paired tags of the target side (second dimension of the result).</param>
        /// <param name="useEndPositions">
        /// When true the alignment runs up to each tag's end position and no span penalty is
        /// applied; otherwise it runs up to the start position and differing tag spans are penalized.
        /// </param>
        /// <returns>
        /// The score matrix, sized [source tag count, target tag count]; cells for which the
        /// aligner returned no result remain 0.
        /// </returns>
        private static int[,] ComputeTagAssociationScores(SimilarityMatrix similarityMatrix,
                                                          TagPairs srcPairedTags,
                                                          TagPairs trgPairedTags,
                                                          bool useEndPositions)
        {
            // this should pretty much result in first-come first-serve alignment, but we hopefully
            //  get better associations for nested tags

            int[,] tagScores = new int[srcPairedTags.Count, trgPairedTags.Count];

            TokenIndexLCSScoreProvider indexScorer
                = new TokenIndexLCSScoreProvider(similarityMatrix, 0.75, true);

            // the aligned sequences are simply the token indices 0..n-1 of each side
            List<int> sourceIndices = new List<int>();
            for (int i = 0; i < similarityMatrix.SourceTokens.Count; ++i)
            {
                sourceIndices.Add(i);
            }

            List<int> targetIndices = new List<int>();
            for (int i = 0; i < similarityMatrix.TargetTokens.Count; ++i)
            {
                targetIndices.Add(i);
            }

            SequenceAlignmentComputer<int> aligner
                = new SequenceAlignmentComputer<int>(sourceIndices, targetIndices, indexScorer, null, 1, 1);

            // foreach src tag, compute LCS to each target tag (both iterated from last to first)
            for (int srcIdx = srcPairedTags.Count - 1; srcIdx >= 0; --srcIdx)
            {
                PairedTag sourceTag = srcPairedTags[srcIdx];
                int sourceLimit = useEndPositions ? sourceTag.End : sourceTag.Start;

                for (int trgIdx = trgPairedTags.Count - 1; trgIdx >= 0; --trgIdx)
                {
                    PairedTag targetTag = trgPairedTags[trgIdx];
                    int targetLimit = useEndPositions ? targetTag.End : targetTag.Start;

                    List<AlignedSubstring> alignment = aligner.Compute(sourceLimit, targetLimit);
                    if (alignment == null || alignment.Count == 0)
                    {
                        continue;
                    }

                    System.Diagnostics.Debug.Assert(alignment.Count == 1);

                    // the result is the common subsequence length minus items which were deleted or inserted
                    int common = alignment[0].Score;
                    int adjusted = common
                                   - (sourceLimit - common)
                                   - (targetLimit - common);

                    // penalize large differences in the spanned width, but not if
                    //  we include the end positions in the LCS
                    int spanPenalty = useEndPositions
                        ? 0
                        : Math.Abs(GetTagSpan(sourceTag) - GetTagSpan(targetTag)) / 2;

                    tagScores[srcIdx, trgIdx] = adjusted - spanPenalty;
                }
            }

            return tagScores;
        }