Example #1
        protected override IList <TranslationMemory.SearchResults> SearchMultipleSegmentsInternal(TranslationMemory.SearchSettings settings, IList <Core.Segment> searchSegments)
        {
            lock (_serviceLockObject)
            {
                List <Core.Segment> translatedSegments = new List <Core.Segment>();

                foreach (Core.Segment segment in searchSegments)
                {
                    Core.Segment translation = _service.Translate(segment);

                    // If we got nothing back, create a blank segment to prevent errors further along.
                    if (translation == null)
                    {
                        translation = new Core.Segment(this.LanguageDirection.TargetCulture);
                    }

                    translatedSegments.Add(translation);
                }

                if (translatedSegments.Count != searchSegments.Count)
                {
                    throw new Exception(PluginResources.EMSG_InvalidTranslationCount);
                }

                return(CreateSearchResultsFromTranslations(searchSegments, translatedSegments));
            }
        }
Example #2
        /// <summary>
        /// Attempts to identify the locations of (substrings of) the search segment in the result segment.
        /// Typically, the search segment is a single search string (such as a concordance search string),
        /// and the result segment is a translation unit segment.
        /// </summary>
        /// <param name="searchSegment">The search segment or search word. The segment must be tokenized.</param>
        /// <param name="textSegment">The text segment, usually a sentence or larger block of text.
        /// The segment must be tokenized.</param>
        /// <returns>The locations and other information about the coverage of the search segment
        /// in the result segment.</returns>
        public static TermFinderResult FindTerms(Core.Segment searchSegment,
                                                 Core.Segment textSegment,
                                                 bool expectContinuousMatch)
        {
            /*
             * Scoring parameters:
             *
             * - coverage of query string - the more the better
             * - coverage of query string by the largest fragment?
             * - number of fragments - the fewer the better
             * - distance of fragments - the smaller the better
             * - avg size of fragments? variance? - few large better than many small, even with larger distance
             * - coverage of fragment relative to the token(s) it spans
             * - consistent word order, inserted words
             *
             * Delete very small fragments (maybe not in FE languages?)
             * Number of tokens in query?
             * */


            if (searchSegment == null)
            {
                throw new ArgumentNullException("searchSegment");
            }
            if (textSegment == null)
            {
                throw new ArgumentNullException("textSegment");
            }

            if (searchSegment.Tokens == null || textSegment.Tokens == null)
            {
                throw new ArgumentException("Segments are expected to be tokenized");
            }

            if (searchSegment.Culture == null || textSegment.Culture == null)
            {
                throw new ArgumentException("At least one segment culture is null");
            }

            if (!Core.CultureInfoExtensions.AreCompatible(searchSegment.Culture, textSegment.Culture))
            {
                throw new ArgumentException("Segment cultures are incompatible");
            }

            if (searchSegment.Tokens.Count == 0 || textSegment.Tokens.Count == 0)
            {
                return(null);
            }

            bool usesBlanksAsSeparators = Core.CultureInfoExtensions.UseBlankAsWordSeparator(searchSegment.Culture);

            if (usesBlanksAsSeparators)
            {
                return(FindTermsWordBased(searchSegment, textSegment, expectContinuousMatch));
            }
            else
            {
                return(FindTermsCharBased(searchSegment, textSegment, expectContinuousMatch));
            }
        }
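
A minimal caller-side sketch for FindTerms follows. This is hypothetical usage, not part of the original source: the enclosing static class is assumed to be named TermFinder, Segment.Add(string) and an assignable Segment.Tokens property are assumptions, and tokenizer is an instance of the tokenizer shown in Example #3.

        // Hypothetical usage sketch (assumed: TermFinder class name, Segment.Add,
        // settable Segment.Tokens; using System.Globalization for CultureInfo).
        CultureInfo culture = new CultureInfo("en-US");

        Core.Segment query = new Core.Segment(culture);
        query.Add("translation memory");
        query.Tokens = tokenizer.Tokenize(query, false);

        Core.Segment tu = new Core.Segment(culture);
        tu.Add("The translation memory stores bilingual segment pairs.");
        tu.Tokens = tokenizer.Tokenize(tu, false);

        TermFinderResult hit = TermFinder.FindTerms(query, tu, false);
        if (hit != null)
        {
            // Score is in 0..100; MatchingRanges locates the covered spans
            // inside the translation unit segment.
            Console.WriteLine("score={0}, ranges={1}", hit.Score, hit.MatchingRanges.Count);
        }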
Example #3
        public List <Core.Tokenization.Token> Tokenize(Core.Segment s, bool allowTokenBundles)
        {
            // TODO check whether segment culture is compatible with tokenizer parameter's culture? Or accept junk-in-junk-out?

            List <Token> result = new List <Token>();

            int run = -1;

            foreach (SegmentElement se in s.Elements)
            {
                ++run;

                if (se == null)
                {
                    System.Diagnostics.Debug.Assert(false, "empty segment run!");
                    continue;
                }

                Text  txtR = se as Text;
                Token tokR = se as Token;
                Tag   tagR = se as Tag;

                if (tagR != null)
                {
                    // TODO rather have a "markup token" type?
                    Token t = new TagToken(tagR);
                    t.Span = new SegmentRange(run, 0, 0);
                    result.Add(t);
                }
                else if (tokR != null)
                {
                    // partially pretokenized input
                    // TODO duplicate token/deep copy?
                    tokR.Span = new SegmentRange(run, 0, 0);
                    result.Add(tokR);
                }
                else if (txtR != null)
                {
                    List <Token> tokenized = TokenizeInternal(txtR.Value, run, _Parameters.CreateWhitespaceTokens, allowTokenBundles);
                    if (tokenized != null && tokenized.Count > 0)
                    {
                        result.AddRange(tokenized);
                    }
                }
                else
                {
                    System.Diagnostics.Debug.Assert(false, "Unknown segment run type");
                }
            }

            ReclassifyAcronyms(result);

            return(result);
        }
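
A short usage sketch for Tokenize (hedged: the concrete tokenizer type and Segment.Add(string) are assumptions; only the Tokenize method itself appears above):

        // Hypothetical usage sketch.
        Core.Segment segment = new Core.Segment(new CultureInfo("en-US"));
        segment.Add("Version 2.0 was released in 2009.");   // assumed Add(string)

        // Mixed input is handled per run: Text runs are tokenized, Tag runs
        // become TagTokens, and pre-tokenized Token runs are passed through.
        List<Core.Tokenization.Token> tokens = tokenizer.Tokenize(segment, false);
        foreach (Core.Tokenization.Token t in tokens)
        {
            Console.WriteLine("{0} [{1}]", t.Text, t.GetType().Name);
        }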
Example #4
        protected override TranslationMemory.SearchResults SearchSingleSegmentInternal(TranslationMemory.SearchSettings settings, Core.Segment segment)
        {
            if (segment == null)
            {
                return(null);
            }

            lock (_serviceLockObject)
            {
                Core.Segment translation = _service.Translate(segment);
                return(CreateSearchResultFromTranslation(segment, translation));
            }
        }
Example #5
        /// <summary>
        /// Verifies that all ranges are "inside" the segment's (text) elements.
        /// </summary>
        private static bool VerifyRanges(IEnumerable <Core.SegmentRange> ranges,
                                         Core.Segment segment)
        {
            if (ranges == null)
            {
                return(true);
            }

            foreach (Core.SegmentRange sr in ranges)
            {
                if (sr == null || sr.From == null || sr.Into == null)
                {
                    return(false);
                }
                if (sr.From.Index != sr.Into.Index)
                {
                    return(false);
                }
                if (sr.From.Index < 0 || sr.From.Index >= segment.Elements.Count)
                {
                    return(false);
                }
                if (sr.From.Position > sr.Into.Position)
                {
                    return(false);
                }
                Core.Text txt = segment.Elements[sr.From.Index] as Core.Text;
                if (txt == null)
                {
                    return(false);
                }
                if (sr.From.Position >= txt.Value.Length || sr.Into.Position >= txt.Value.Length)
                {
                    return(false);
                }
            }
            return(true);
        }
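
Restating the invariants checked above: a range must stay inside a single element (From.Index == Into.Index), that index must refer to an existing Text element, and both positions must lie inside its value with From.Position <= Into.Position. A hedged illustration, assuming the SegmentRange(index, fromPosition, intoPosition) constructor seen in Example #3:

        // Hypothetical illustration for a segment whose element 0 is the
        // Text run "hello world" (11 characters).
        Core.SegmentRange ok  = new Core.SegmentRange(0, 0, 4);    // "hello" - passes VerifyRanges
        Core.SegmentRange bad = new Core.SegmentRange(0, 7, 42);   // Into.Position >= 11 - fails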
Example #6
        private static int[,] ComputeTokenAssociationScores(Core.Segment searchSegment, Core.Segment textSegment)
        {
            int[,] overlaps = new int[searchSegment.Tokens.Count, textSegment.Tokens.Count];

            CaseAwareCharSubsequenceScoreProvider scorer = new CaseAwareCharSubsequenceScoreProvider();

            Core.Tokenization.Token srcToken;
            Core.Tokenization.Token txtToken;

            for (int s = 0; s < searchSegment.Tokens.Count; ++s)
            {
                srcToken = searchSegment.Tokens[s];
                if (srcToken.IsWhitespace || srcToken is Core.Tokenization.TagToken)
                {
                    continue;
                }

                for (int t = 0; t < textSegment.Tokens.Count; ++t)
                {
                    txtToken = textSegment.Tokens[t];
                    if (txtToken.IsWhitespace || txtToken is Core.Tokenization.TagToken)
                    {
                        continue;
                    }

                    overlaps[s, t] = 0;

                    // TODO relax equality criteria on placeables of the same type

                    // TODO does the score include information whether tolower/tobase has
                    //  been applied?

                    List <AlignedSubstring> alignment
                        = SequenceAlignmentComputer <char> .ComputeLongestCommonSubsequence(srcToken.Text.ToCharArray(),
                                                                                            txtToken.Text.ToCharArray(), 0, scorer, null);

                    if (alignment == null || alignment.Count == 0)
                    {
                        continue;
                    }

                    int common = alignment.Sum(x => x.Length);
                    if (common == 0)
                    {
                        continue;
                    }

                    // dice
                    // TODO experiment with other scoring methods? Scoring only relative to query?
                    float score = 2.0f * common / (float)(srcToken.Text.Length + txtToken.Text.Length);
                    if (score >= _DICE_THRESHOLD)
                    {
                        // percentage of score
                        overlaps[s, t] = (int)(score * 100.0f);
                        System.Diagnostics.Debug.Assert(overlaps[s, t] >= 0 && overlaps[s, t] <= 100);
                    }
                }
            }

            return(overlaps);
        }
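
The Dice score above is easy to check by hand: for the token pair "colour"/"color" the longest common subsequence is "color" (5 characters), so the score is 2*5/(6+5) ≈ 0.91 and the overlap entry becomes 90 (or 0 if that fell below _DICE_THRESHOLD, whose value is not shown here). A self-contained sketch of just the scoring arithmetic, with a plain LCS standing in for SequenceAlignmentComputer<char> and its case-aware score provider:

        // Sketch only; the real code scores via SequenceAlignmentComputer<char>.
        static int DiceOverlap(string a, string b, float threshold)
        {
            // Classic O(n*m) longest-common-subsequence length.
            int[,] d = new int[a.Length + 1, b.Length + 1];
            for (int i = 1; i <= a.Length; ++i)
            {
                for (int j = 1; j <= b.Length; ++j)
                {
                    d[i, j] = a[i - 1] == b[j - 1]
                        ? d[i - 1, j - 1] + 1
                        : Math.Max(d[i - 1, j], d[i, j - 1]);
                }
            }

            int common = d[a.Length, b.Length];
            float score = 2.0f * common / (a.Length + b.Length);
            return score >= threshold ? (int)(score * 100.0f) : 0;   // percentage, as above
        }

        // DiceOverlap("colour", "color", 0.75f) == 90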
Example #7
        private static TermFinderResult FindTermsWordBased(Core.Segment searchSegment,
                                                           Core.Segment textSegment, bool expectContinuousMatch)
        {
            // compute the maximum overlap scores for each token in the source and each token in the text,
            //  using LCS or ED

            const bool useLcsScoreAdjustment = true;

            int[,] overlaps = ComputeTokenAssociationScores(searchSegment, textSegment);

            int[] maxScores = new int[searchSegment.Tokens.Count];

            TermFinderResult result = new TermFinderResult();

            result.MatchingRanges = new List <Sdl.LanguagePlatform.Core.SegmentRange>();

            System.Collections.BitArray coveredTargetTokens
                = new System.Collections.BitArray(textSegment.Tokens.Count);

            IList <string> srcConcatenatedTokens = new List <string>();
            IList <string> trgConcatenatedTokens = new List <string>();

            int nonwhiteSearchTokens = 0;

            for (int s = 0; s < searchSegment.Tokens.Count; ++s)
            {
                if (!searchSegment.Tokens[s].IsWhitespace)
                {
                    ++nonwhiteSearchTokens;
                    if (useLcsScoreAdjustment)
                    {
                        srcConcatenatedTokens.Add(searchSegment.Tokens[s].Text.ToLowerInvariant());
                    }
                }

                for (int t = 0; t < textSegment.Tokens.Count; ++t)
                {
                    if (overlaps[s, t] > 0)
                    {
                        if (!coveredTargetTokens[t])
                        {
                            result.MatchingRanges.Add(textSegment.Tokens[t].Span);
                            coveredTargetTokens[t] = true;
                        }

                        if (overlaps[s, t] > maxScores[s])
                        {
                            System.Diagnostics.Debug.Assert(overlaps[s, t] >= 0 && overlaps[s, t] <= 100);
                            maxScores[s] = overlaps[s, t];
                        }
                    }
                }
            }

            if (nonwhiteSearchTokens == 0)
            {
                return(null);
            }

            int tokenOverlapScore = (int)((float)maxScores.Sum() / (float)nonwhiteSearchTokens);

            if (useLcsScoreAdjustment)
            {
                int relevantTextTokens = 0;

                // TODO this won't work really well if the same search token appears
            //  multiple times in the match - the concatenation will include all
                //  occurrences (which will reduce the LCS score) and also increase the
                //  text token count (further reducing the LCS score)

                for (int tokenIndex = 0; tokenIndex < textSegment.Tokens.Count; ++tokenIndex)
                {
                    if (coveredTargetTokens[tokenIndex])
                    {
                        if (trgConcatenatedTokens.Count > 0)
                        {
                            int previousWordTokenIndex = GetPreviousWordTokenIndex(textSegment.Tokens, tokenIndex);
                            if ((previousWordTokenIndex > -1) && (!coveredTargetTokens[previousWordTokenIndex]))
                            {
                                const string UnmatchedToken = "#";
                                trgConcatenatedTokens.Add(UnmatchedToken);
                            }
                        }

                        ++relevantTextTokens;
                        trgConcatenatedTokens.Add(textSegment.Tokens[tokenIndex].Text.ToLowerInvariant());
                    }
                }

                string srcConcat = string.Join("~", srcConcatenatedTokens.ToArray());
                string trgConcat = string.Join("~", trgConcatenatedTokens.ToArray());

                int lcsOverlapScore = 0;

                if ((expectContinuousMatch || tokenOverlapScore < 100) &&
                    srcConcat.Length > 0 && trgConcat.Length > 0)
                {
                    List <AlignedSubstring> lcs = SequenceAlignmentComputer <char> .ComputeLongestCommonSubsequence(srcConcat.ToCharArray(),
                                                                                                                    trgConcat.ToCharArray(), 1,
                                                                                                                    new SimpleCharLSAScoreProvider(), null);

                    int lcsOverlap = lcs.Sum(x => x.Length);

                    // dice again, this time on the concatenated strings of the tokens, with a
                    //  penalty if the number of tokens differs [0..1]
                    float tokenCountDeltaPenalty = 2.0f * (float)Math.Min(nonwhiteSearchTokens, relevantTextTokens)
                                                   / (float)(nonwhiteSearchTokens + relevantTextTokens);

                    // another dice
                    // 2009-08-10, OC: reduce token count delta penalty
                    lcsOverlapScore = (int)(((75.0f + 25.0f * tokenCountDeltaPenalty)
                                             * 2.0f * (float)lcsOverlap) / (float)(srcConcat.Length + trgConcat.Length));
                    if (lcsOverlapScore < 0)
                    {
                        lcsOverlapScore = 0;
                    }
                    if (lcsOverlapScore > 100)
                    {
                        lcsOverlapScore = 100;
                    }

                    System.Diagnostics.Debug.Assert(lcsOverlapScore >= 0 && lcsOverlapScore <= 100);
                }

                if (tokenOverlapScore == 100 && lcsOverlapScore > 0)
                {
                    // discontinuous/swapped match - not sure how to penalize
                    // TODO work out exact scoring
                    result.Score = (200 + lcsOverlapScore) / 3;
                }
                else
                {
                    result.Score = Math.Max(tokenOverlapScore, lcsOverlapScore);
                }
            }
            else
            {
                result.Score = tokenOverlapScore;
            }

            return(result);
        }
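
A quick numeric check of the discontinuous-match branch at the end: with a perfect per-token score (tokenOverlapScore == 100) but word-order differences giving, say, lcsOverlapScore == 70, the blended result is (200 + 70) / 3 = 90, so a full token match is discounted toward the order-sensitive LCS score. Sketch of the blend only (values hypothetical):

        int tokenOverlapScore = 100;   // every search token found somewhere in the text
        int lcsOverlapScore   = 70;    // but in a different order or with gaps
        int score = (tokenOverlapScore == 100 && lcsOverlapScore > 0)
            ? (200 + lcsOverlapScore) / 3                     // == 90 here
            : Math.Max(tokenOverlapScore, lcsOverlapScore);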
Example #8
        private static TermFinderResult FindTermsCharBased(Core.Segment searchSegment,
                                                           Core.Segment textSegment, bool expectContinuousMatch)
        {
            // This should only be used for far-east languages

            // these ranges capture the mapping from a character position in the plain text arrays
            //  to a segment position (run/position pairs)
            List <Core.SegmentPosition> searchSegmentPositions;
            List <Core.SegmentPosition> textSegmentPositions;

            string searchPlain = searchSegment.ToPlain(true, true, out searchSegmentPositions);
            string textPlain   = textSegment.ToPlain(true, true, out textSegmentPositions);

            if (searchPlain.Length == 0)
            {
                // TODO may need to look into what may cause such an issue:
                System.Diagnostics.Debug.Assert(false, "Let Oli know and provide test data");
                return(null);
            }

            char[] searchPlainArray = searchPlain.ToCharArray();
            char[] textPlainArray   = textPlain.ToCharArray();

            int searchPlainLength = searchPlain.Length;
            int textPlainLength   = textPlain.Length;

            System.Diagnostics.Debug.Assert(searchPlainLength == searchPlainArray.Length);
            System.Diagnostics.Debug.Assert(textPlainLength == textPlainArray.Length);

            SubstringAlignmentDisambiguator picker
                = new SubstringAlignmentDisambiguator();

            List <AlignedSubstring> lcs = SequenceAlignmentComputer <char> .ComputeCoverage(searchPlainArray,
                                                                                            textPlainArray, new CharSubstringScoreProvider(), picker);

            if (lcs == null || lcs.Count == 0)
            {
                return(null);
            }

            TermFinderResult result = new TermFinderResult();

            result.MatchingRanges = new List <Sdl.LanguagePlatform.Core.SegmentRange>();

            List <Core.SegmentPosition> textPositions = new List <Core.SegmentPosition>();

            for (int subIdx = 0; subIdx < lcs.Count; ++subIdx)
            {
                AlignedSubstring sub = lcs[subIdx];

                if (sub.Source.Length != sub.Target.Length)
                {
                    // NOTE LCSubseq instead of Substring? Check scorer if this fires
                    System.Diagnostics.Debug.Assert(false, "Not supported - let Oli know and provide test data");
                    return(null);
                }

                for (int p = 0; p < sub.Source.Length; ++p)
                {
                    textPositions.Add(textSegmentPositions[sub.Target.Start + p]);
                }
            }

            if (textPositions.Count == 0)
            {
                return(null);
            }

            // covered ranges in the text segment:
            result.MatchingRanges = SortAndMelt(textPositions);

            // TODO this does not capture adjacency
            float baseScore = (float)textPositions.Count / (float)searchPlainLength;

#if DEBUG
            bool ok = VerifyRanges(result.MatchingRanges,
                                   textSegment);
            if (!ok)
            {
                System.Diagnostics.Debug.Assert(false, "Range verification failed");
            }
#endif

            result.Score = (int)(100.0f * baseScore);
            if (result.Score < 0)
            {
                result.Score = 0;
            }
            else if (result.Score > 100)
            {
                result.Score = 100;
            }

            return(result);
        }
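
The character-based score is plain coverage of the query: if 6 of the 8 plain-text characters of the search segment are matched somewhere in the text, baseScore is 6/8 and the reported score is 75. A one-line check of the arithmetic and clamping (values hypothetical):

        int coveredChars = 6, searchPlainLength = 8;        // textPositions.Count, searchPlain.Length
        int score = (int)(100.0f * coveredChars / (float)searchPlainLength);   // 75
        score = Math.Max(0, Math.Min(100, score));          // clamped to 0..100, as above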
Example #9
        public List <Core.Tokenization.Token> Tokenize(Core.Segment s)
        {
            return(Tokenize(s, false));
        }
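
This overload just defaults allowTokenBundles to false, so the two calls below are equivalent (sketch; "tokenizer" and "segment" as in the earlier examples):

        List<Core.Tokenization.Token> a = tokenizer.Tokenize(segment);
        List<Core.Tokenization.Token> b = tokenizer.Tokenize(segment, false);   // same behavior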