Example #1
0
        /// <summary>
        /// Rates a candidate by its distance to the closest substring already on the path.
        /// </summary>
        /// <param name="path">The substrings selected so far.</param>
        /// <param name="candidate">The substring being considered as the next path element.</param>
        /// <returns>
        /// The smallest distance found, where the distance to a single path element is the larger
        /// of the source-side gap and the target-side gap. If <paramref name="path"/> holds no
        /// elements the initial marker value -1 is returned (a debug assertion flags that case).
        /// </returns>
        /// <exception cref="ArgumentNullException"><paramref name="candidate"/> is null.</exception>
        private int ComputeCosts(List <AlignedSubstring> path, AlignedSubstring candidate)
        {
            if (candidate == null)
            {
                throw new ArgumentNullException("candidate");
            }

            // Cost model for now: plain distance to the nearest path element. Per-token
            //  "contributions" (how well a token range is covered) may be added later.

            // A negative value doubles as the "nothing recorded yet" marker.
            int best = -1;

            foreach (AlignedSubstring element in path)
            {
                // On each side the gap runs from the end of whichever substring starts first
                //  to the start of the other one.
                int sourceGap = element.Source.Start < candidate.Source.Start
                    ? candidate.Source.Start - element.Source.Start - element.Source.Length
                    : element.Source.Start - candidate.Source.Start - candidate.Source.Length;

                int targetGap = element.Target.Start < candidate.Target.Start
                    ? candidate.Target.Start - element.Target.Start - element.Target.Length
                    : element.Target.Start - candidate.Target.Start - candidate.Target.Length;

                // A negative gap would mean the two substrings overlap on that side.
                System.Diagnostics.Debug.Assert(sourceGap >= 0);
                System.Diagnostics.Debug.Assert(targetGap >= 0);

                int gap = Math.Max(sourceGap, targetGap);
                if (best < 0 || gap < best)
                {
                    best = gap;
                }
            }

            System.Diagnostics.Debug.Assert(best >= 0);
            return best;
        }
Example #2
0
        /// <summary>
        /// Chooses which of several candidate substrings should extend the current path.
        /// </summary>
        /// <param name="path">The substrings selected so far; may be empty.</param>
        /// <param name="candidates">The substrings to choose from; may be empty.</param>
        /// <returns>
        /// Null if there are no candidates; the first candidate if it is the only one or if the
        /// path is still empty; otherwise the candidate with the lowest cost according to
        /// ComputeCosts (the earliest one wins a tie).
        /// </returns>
        /// <exception cref="ArgumentNullException">
        /// <paramref name="path"/> or <paramref name="candidates"/> is null.
        /// </exception>
        public AlignedSubstring PickExtension(List <AlignedSubstring> path,
                                              List <AlignedSubstring> candidates)
        {
            if (path == null)
            {
                throw new ArgumentNullException("path");
            }
            if (candidates == null)
            {
                throw new ArgumentNullException("candidates");
            }

            if (candidates.Count == 0)
            {
                return null;
            }

            // A single candidate leaves nothing to decide, and an empty path offers nothing to
            //  attach to - in both situations the first candidate is taken as is.
            if (candidates.Count == 1 || path.Count == 0)
            {
                return candidates[0];
            }

            // Seed with the first candidate, then keep whichever later one is strictly cheaper.
            AlignedSubstring cheapest = candidates[0];
            int cheapestCost          = ComputeCosts(path, cheapest);

            for (int idx = 1; idx < candidates.Count; ++idx)
            {
                AlignedSubstring current = candidates[idx];
                int currentCost          = ComputeCosts(path, current);

                if (currentCost < cheapestCost)
                {
                    cheapestCost = currentCost;
                    cheapest     = current;
                }
            }

            return cheapest;
        }
Example #3
0
        /// <summary>
        /// Searches for the characters of <paramref name="searchSegment"/> inside
        /// <paramref name="textSegment"/> by aligning the plain-text forms of both segments
        /// character by character.
        /// </summary>
        /// <param name="searchSegment">The segment holding the term to look for.</param>
        /// <param name="textSegment">The segment to search in.</param>
        /// <param name="expectContinuousMatch">Not read by this method.</param>
        /// <returns>
        /// A result carrying the covered ranges of the text segment and a score between 0 and 100,
        /// or null if the search text is empty, nothing could be aligned, or an aligned pair has
        /// different source and target lengths.
        /// </returns>
        private static TermFinderResult FindTermsCharBased(Core.Segment searchSegment,
                                                           Core.Segment textSegment, bool expectContinuousMatch)
        {
            // Character-level matching; this should only be used for far-east languages.

            // Each list maps a character index of the corresponding plain-text string back to a
            //  position inside the segment (run/position pair).
            List <Core.SegmentPosition> searchSegmentPositions;
            List <Core.SegmentPosition> textSegmentPositions;

            string searchPlain = searchSegment.ToPlain(true, true, out searchSegmentPositions);
            string textPlain   = textSegment.ToPlain(true, true, out textSegmentPositions);

            if (searchPlain.Length == 0)
            {
                // TODO may need to look into what may cause such an issue:
                System.Diagnostics.Debug.Assert(false, "Let Oli know and provide test data");
                return null;
            }

            char[] searchChars = searchPlain.ToCharArray();
            char[] textChars   = textPlain.ToCharArray();

            System.Diagnostics.Debug.Assert(searchPlain.Length == searchChars.Length);
            System.Diagnostics.Debug.Assert(textPlain.Length == textChars.Length);

            SubstringAlignmentDisambiguator disambiguator = new SubstringAlignmentDisambiguator();

            List <AlignedSubstring> alignment = SequenceAlignmentComputer <char> .ComputeCoverage(
                searchChars, textChars, new CharSubstringScoreProvider(), disambiguator);

            if (alignment == null || alignment.Count == 0)
            {
                return null;
            }

            TermFinderResult result = new TermFinderResult
            {
                MatchingRanges = new List <Sdl.LanguagePlatform.Core.SegmentRange>()
            };

            // Segment positions of every text character that takes part in the alignment.
            List <Core.SegmentPosition> coveredPositions = new List <Core.SegmentPosition>();

            foreach (AlignedSubstring piece in alignment)
            {
                if (piece.Source.Length != piece.Target.Length)
                {
                    // NOTE LCSubseq instead of Substring? Check scorer if this fires
                    System.Diagnostics.Debug.Assert(false, "Not supported - let Oli know and provide test data");
                    return null;
                }

                int targetEnd = piece.Target.Start + piece.Source.Length;
                for (int textIdx = piece.Target.Start; textIdx < targetEnd; ++textIdx)
                {
                    coveredPositions.Add(textSegmentPositions[textIdx]);
                }
            }

            if (coveredPositions.Count == 0)
            {
                return null;
            }

            // covered ranges in the text segment:
            result.MatchingRanges = SortAndMelt(coveredPositions);

            // Share of the search text that was found, as a fraction.
            // TODO this does not capture adjacency
            float coverage = (float)coveredPositions.Count / (float)searchPlain.Length;

#if DEBUG
            if (!VerifyRanges(result.MatchingRanges, textSegment))
            {
                System.Diagnostics.Debug.Assert(false, "Range verification failed");
            }
#endif

            // Convert to a percentage and clamp it to [0, 100].
            int score = (int)(100.0f * coverage);
            if (score < 0)
            {
                score = 0;
            }
            else if (score > 100)
            {
                score = 100;
            }
            result.Score = score;

            return result;
        }
Example #4
0
        /// <summary>
        /// Computes the longest local alignment coverage of the two sequences used to
        /// initialize this instance. Unlike <see cref="M:Compute()"/>, you can specify
        /// positions in the sequences up to which to compute the alignment.
        /// </summary>
        /// <remarks>
        /// Substrings are selected one per loop iteration. Selection stops when no maxima are
        /// left, when all candidates of an iteration are shorter than the minimum length, when the
        /// picker returns null, or when the item limit is reached. An item limit of 0 means
        /// "no limit"; an item limit of exactly 1 selects the non-coverage (LCS) mode.
        /// Score caches and the table are created on first use and kept in instance fields.
        /// </remarks>
        /// <param name="uptoSource">The maximum index to cover in the source sequence (exclusive)</param>
        /// <param name="uptoTarget">The maximum index to cover in the target sequence (exclusive)</param>
        /// <returns>The selected substrings in the order they were picked. May be empty, never null.</returns>
        /// <exception cref="ArgumentOutOfRangeException">
        /// <paramref name="uptoSource"/> or <paramref name="uptoTarget"/> is not positive or
        /// exceeds the number of elements in the respective sequence.
        /// </exception>
        public List <AlignedSubstring> Compute(int uptoSource, int uptoTarget)
        {
            // Both limits must lie in [1, Count] of their sequence.
            if (uptoSource <= 0 || uptoSource > _Source.Count)
            {
                throw new ArgumentOutOfRangeException("uptoSource");
            }
            if (uptoTarget <= 0 || uptoTarget > _Target.Count)
            {
                throw new ArgumentOutOfRangeException("uptoTarget");
            }

            List <AlignedSubstring> result = new List <AlignedSubstring>();

            // Filled by the ComputeMaximaFor* helpers on every iteration: the positions of the
            //  current maxima (Left = source index, Right = target index, as used in the
            //  read-out below) and - presumably - the score they share.
            int globalMax             = 0;
            List <Pair <int> > maxima = new List <Pair <int> >();

            // The pairwise scores are computed once and cached in the instance.
            if (_AlignmentScores == null)
            {
                _AlignmentScores = ComputeScores(_Source, _Target, _Scorer);
            }

            bool maySkip = _Scorer.MaySkip;

            // Skip score caches are only needed if the scorer allows skipping.
            if (maySkip)
            {
                if (_SourceSkipScores == null || _TargetSkipScores == null)
                {
                    ComputeSkipScoreCaches();
                }
            }

            // Any item limit other than exactly 1 selects coverage mode.
            bool computeCoverage = (_MaxItems != 1);

            if (_Table == null)
            {
                // One extra row and column compared to the sequence lengths.
                _Table = new Cell[_Source.Count + 1, _Target.Count + 1];
                if (!computeCoverage)
                {
                    // In non-coverage mode the table is filled once, right after allocation.
                    // NOTE(review): in coverage mode the table is presumably (re)filled by
                    //  ComputeMaximaForCoverage - confirm.
                    ComputeFullTable(maySkip);
                }
            }

            // Cells that may no longer be used because an already selected substring covers
            //  their row or column. Only allocated in coverage mode.
            bool[,] blocked = null;
            if (computeCoverage)
            {
                blocked = new bool[_Source.Count + 1, _Target.Count + 1];
            }

            do
            {
                if (computeCoverage)
                {
                    ComputeMaximaForCoverage(maxima, ref globalMax, uptoSource, uptoTarget, maySkip, blocked);
                }
                else
                {
                    ComputeMaximaForLCS(maxima, ref globalMax, uptoSource, uptoTarget);
                }

                if (maxima.Count > 0)
                {
                    List <AlignedSubstring> extensionCandiates = new List <AlignedSubstring>();

                    foreach (Pair <int> max in maxima)
                    {
                        // read out the transition
                        int iStart = max.Left;
                        int jStart = max.Right;

                        // number of Align operations on the way back (skips are not counted)
                        int len = 0;

                        // Follow the back pointers from the maximum until a cell with a
                        //  non-positive score is reached; that cell marks the start.
                        while (_Table[iStart, jStart].Score > 0)
                        {
                            Cell c = _Table[iStart, jStart];
                            if (c.Op == Operation.Align)
                            {
                                // not a skip
                                len++;
                            }
                            iStart = c.BackI;
                            jStart = c.BackJ;
                        }

                        // aligned sequence is source[iStart, max.Left[, target[jStart, max.Right[
                        //  (arguments are presumably start/length pairs followed by score and
                        //  aligned length - confirm against the AlignedSubstring constructor)

                        AlignedSubstring lsa
                            = new AlignedSubstring(iStart, max.Left - iStart, jStart, max.Right - jStart, globalMax, len);

                        // Candidates with too few aligned elements are dropped.
                        if (len >= _MinLength)
                        {
                            extensionCandiates.Add(lsa);
                        }
                    }

                    if (extensionCandiates.Count == 0)
                    {
                        // if we have maxima, but no extension candidates, they got dropped by the minLength
                        //  requirement. To avoid infinite loops, we must stop further iteration and set the
                        //  break criterion.
                        maxima.Clear();
                    }
                    else
                    {
                        AlignedSubstring winner = null;
                        if (_Picker == null)
                        {
                            // if we have no disambiguator, pick the first candidate
                            // TODO other defaults may be better (longest, etc.)
                            winner = extensionCandiates[0];
                        }
                        else
                        {
                            // The picker sees the substrings selected so far plus the candidates.
                            winner = _Picker.PickExtension(result, extensionCandiates);
                        }

                        if (winner == null)
                        {
                            // The picker declined all candidates: stop and return what we have.
                            break;
                        }
                        else
                        {
                            if (blocked != null)
                            {
                                // mark the covered ranges as "taken" so that we don't get overlaps
                                // A cell is blocked if its row lies in the winner's source range OR
                                //  its column lies in the winner's target range, i.e. whole rows and
                                //  columns are taken. Table indices used here are the sequence
                                //  positions shifted by one (Start + 1 .. Start + Length).
                                for (int iStart = 1; iStart <= uptoSource; ++iStart)
                                {
                                    for (int jStart = 1; jStart <= uptoTarget; ++jStart)
                                    {
                                        if ((iStart > winner.Source.Start && iStart <= winner.Source.Start + winner.Source.Length) ||
                                            (jStart > winner.Target.Start && jStart <= winner.Target.Start + winner.Target.Length))
                                        {
                                            blocked[iStart, jStart] = true;
                                        }
                                    }
                                }
                            }
                            result.Add(winner);
                        }
                    }
                }
                // Continue while there were maxima and the item limit (0 = unlimited) is not reached.
            } while (maxima.Count > 0 && (_MaxItems == 0 || result.Count < _MaxItems));

            return(result);
        }