/// <summary>
/// Rates a candidate by its distance to the closest element already on the path.
/// For every path element the gap on the source side and the gap on the target side
/// are computed; the larger of the two is that element's distance. The smallest
/// distance over all path elements is the cost.
/// </summary>
/// <param name="path">The substrings picked so far.</param>
/// <param name="candidate">The substring to rate; must not be null.</param>
/// <returns>The smallest distance found, or -1 if <paramref name="path"/> has no elements.</returns>
/// <exception cref="ArgumentNullException"><paramref name="candidate"/> is null.</exception>
private int ComputeCosts(List<AlignedSubstring> path, AlignedSubstring candidate)
{
    if (candidate == null)
    {
        throw new ArgumentNullException("candidate");
    }

    // Only the distance to the nearest path element is considered for now. A later
    // refinement could rate "contributions" to existing token ranges instead
    // (i.e. how well a token is covered).
    int nearest = -1;

    foreach (AlignedSubstring existing in path)
    {
        int existingSrcStart = existing.Source.Start;
        int candidateSrcStart = candidate.Source.Start;

        // The gap is measured from the end of whichever range starts first
        // to the start of the other one.
        int srcGap = existingSrcStart < candidateSrcStart
            ? candidateSrcStart - existingSrcStart - existing.Source.Length
            : existingSrcStart - candidateSrcStart - candidate.Source.Length;

        int existingTrgStart = existing.Target.Start;
        int candidateTrgStart = candidate.Target.Start;

        int trgGap = existingTrgStart < candidateTrgStart
            ? candidateTrgStart - existingTrgStart - existing.Target.Length
            : existingTrgStart - candidateTrgStart - candidate.Target.Length;

        // A negative gap would mean the two ranges overlap.
        System.Diagnostics.Debug.Assert(srcGap >= 0);
        System.Diagnostics.Debug.Assert(trgGap >= 0);

        int gap = Math.Max(srcGap, trgGap);
        if (nearest < 0 || gap < nearest)
        {
            nearest = gap;
        }
    }

    System.Diagnostics.Debug.Assert(nearest >= 0);
    return nearest;
}
/// <summary>
/// Chooses the candidate that should extend the given path: the one with the
/// lowest cost as reported by <c>ComputeCosts</c>. On equal cost the candidate
/// that comes first in the list wins.
/// </summary>
/// <param name="path">The substrings picked so far; must not be null.</param>
/// <param name="candidates">The possible extensions; must not be null.</param>
/// <returns>
/// Null if there are no candidates; the first candidate if it is the only one or
/// if the path is still empty; otherwise the cheapest candidate.
/// </returns>
/// <exception cref="ArgumentNullException">
/// <paramref name="path"/> or <paramref name="candidates"/> is null.
/// </exception>
public AlignedSubstring PickExtension(List<AlignedSubstring> path, List<AlignedSubstring> candidates)
{
    if (path == null)
    {
        throw new ArgumentNullException("path");
    }
    if (candidates == null)
    {
        throw new ArgumentNullException("candidates");
    }

    if (candidates.Count == 0)
    {
        return null;
    }

    // Nothing to decide with a single candidate, and with an empty path there is
    // nothing to attach to, so any candidate is as good as another.
    if (candidates.Count == 1 || path.Count == 0)
    {
        return candidates[0];
    }

    // Seed with the first candidate, then look for something strictly cheaper.
    AlignedSubstring best = candidates[0];
    int bestCost = ComputeCosts(path, best);

    for (int idx = 1; idx < candidates.Count; ++idx)
    {
        AlignedSubstring contender = candidates[idx];
        int contenderCost = ComputeCosts(path, contender);
        if (contenderCost < bestCost)
        {
            bestCost = contenderCost;
            best = contender;
        }
    }

    return best;
}
/// <summary>
/// Character-based term search: aligns the plain text of the search segment with
/// the plain text of the text segment and reports the covered ranges of the text
/// segment together with a score. Intended for far-east languages only.
/// </summary>
/// <param name="searchSegment">The segment holding the term to look for.</param>
/// <param name="textSegment">The segment to search in.</param>
/// <param name="expectContinuousMatch">Not evaluated by this method.</param>
/// <returns>
/// The matching ranges and a score between 0 and 100 (share of the search text
/// characters that were covered), or null if the search text is empty, nothing
/// could be aligned, or an aligned pair has differing source and target lengths.
/// </returns>
private static TermFinderResult FindTermsCharBased(Core.Segment searchSegment, Core.Segment textSegment, bool expectContinuousMatch)
{
    // The position lists map each character index of the plain strings back to
    // a segment position (run/position pair).
    List<Core.SegmentPosition> searchSegmentPositions;
    List<Core.SegmentPosition> textSegmentPositions;

    string searchPlain = searchSegment.ToPlain(true, true, out searchSegmentPositions);
    string textPlain = textSegment.ToPlain(true, true, out textSegmentPositions);

    if (searchPlain.Length == 0)
    {
        // TODO find out what can lead to an empty search text
        System.Diagnostics.Debug.Assert(false, "Let Oli know and provide test data");
        return null;
    }

    char[] searchChars = searchPlain.ToCharArray();
    char[] textChars = textPlain.ToCharArray();
    int searchCharCount = searchPlain.Length;

    System.Diagnostics.Debug.Assert(searchCharCount == searchChars.Length);
    System.Diagnostics.Debug.Assert(textPlain.Length == textChars.Length);

    SubstringAlignmentDisambiguator disambiguator = new SubstringAlignmentDisambiguator();
    List<AlignedSubstring> alignment = SequenceAlignmentComputer<char>.ComputeCoverage(
        searchChars, textChars, new CharSubstringScoreProvider(), disambiguator);

    if (alignment == null || alignment.Count == 0)
    {
        return null;
    }

    TermFinderResult result = new TermFinderResult();
    result.MatchingRanges = new List<Sdl.LanguagePlatform.Core.SegmentRange>();

    // Collect the text segment position of every aligned character.
    List<Core.SegmentPosition> coveredPositions = new List<Core.SegmentPosition>();
    foreach (AlignedSubstring piece in alignment)
    {
        if (piece.Source.Length != piece.Target.Length)
        {
            // NOTE LCSubseq instead of Substring? Check the scorer if this fires.
            System.Diagnostics.Debug.Assert(false, "Not supported - let Oli know and provide test data");
            return null;
        }

        int first = piece.Target.Start;
        int pastLast = first + piece.Source.Length;
        for (int pos = first; pos < pastLast; ++pos)
        {
            coveredPositions.Add(textSegmentPositions[pos]);
        }
    }

    if (coveredPositions.Count == 0)
    {
        return null;
    }

    // Covered ranges in the text segment.
    result.MatchingRanges = SortAndMelt(coveredPositions);

#if DEBUG
    System.Diagnostics.Debug.Assert(VerifyRanges(result.MatchingRanges, textSegment), "Range verification failed");
#endif

    // TODO this does not capture adjacency
    float coverage = (float)coveredPositions.Count / (float)searchCharCount;
    int percent = (int)(100.0f * coverage);

    // Keep the score inside 0..100.
    result.Score = Math.Max(0, Math.Min(100, percent));
    return result;
}
/// <summary>
/// Computes the longest local alignment coverage of the two sequences this
/// instance was initialized with. In contrast to <see cref="M:Compute()"/>, the
/// caller states up to which position of each sequence the alignment is computed.
/// </summary>
/// <param name="uptoSource">The maximum index to cover in the source sequence (exclusive)</param>
/// <param name="uptoTarget">The maximum index to cover in the target sequence (exclusive)</param>
/// <returns>The aligned substrings that were picked, in the order they were picked.</returns>
/// <exception cref="ArgumentOutOfRangeException">
/// <paramref name="uptoSource"/> or <paramref name="uptoTarget"/> is not positive or
/// exceeds the number of items in the respective sequence.
/// </exception>
public List<AlignedSubstring> Compute(int uptoSource, int uptoTarget)
{
    if (uptoSource <= 0 || uptoSource > _Source.Count)
    {
        throw new ArgumentOutOfRangeException("uptoSource");
    }
    if (uptoTarget <= 0 || uptoTarget > _Target.Count)
    {
        throw new ArgumentOutOfRangeException("uptoTarget");
    }

    List<AlignedSubstring> picked = new List<AlignedSubstring>();
    int globalMax = 0;
    List<Pair<int>> maxima = new List<Pair<int>>();

    // Lazily set up the caches shared between calls.
    if (_AlignmentScores == null)
    {
        _AlignmentScores = ComputeScores(_Source, _Target, _Scorer);
    }

    bool maySkip = _Scorer.MaySkip;
    if (maySkip && (_SourceSkipScores == null || _TargetSkipScores == null))
    {
        ComputeSkipScoreCaches();
    }

    // Any item limit other than one selects the coverage mode.
    bool computeCoverage = (_MaxItems != 1);

    if (_Table == null)
    {
        _Table = new Cell[_Source.Count + 1, _Target.Count + 1];
        if (!computeCoverage)
        {
            ComputeFullTable(maySkip);
        }
    }

    // Only the coverage mode tracks which table cells are already taken.
    bool[,] blocked = computeCoverage
        ? new bool[_Source.Count + 1, _Target.Count + 1]
        : null;

    while (true)
    {
        if (computeCoverage)
        {
            ComputeMaximaForCoverage(maxima, ref globalMax, uptoSource, uptoTarget, maySkip, blocked);
        }
        else
        {
            ComputeMaximaForLCS(maxima, ref globalMax, uptoSource, uptoTarget);
        }

        if (maxima.Count == 0)
        {
            break;
        }

        List<AlignedSubstring> candidates = new List<AlignedSubstring>();
        foreach (Pair<int> peak in maxima)
        {
            // Follow the back pointers from the maximum until the score drops to zero.
            int i = peak.Left;
            int j = peak.Right;
            int alignedCount = 0;

            Cell cell = _Table[i, j];
            while (cell.Score > 0)
            {
                if (cell.Op == Operation.Align)
                {
                    // skips do not count towards the length
                    alignedCount++;
                }
                i = cell.BackI;
                j = cell.BackJ;
                cell = _Table[i, j];
            }

            // The aligned stretch is source [i, peak.Left[ and target [j, peak.Right[.
            AlignedSubstring stretch = new AlignedSubstring(i, peak.Left - i, j, peak.Right - j, globalMax, alignedCount);
            if (alignedCount >= _MinLength)
            {
                candidates.Add(stretch);
            }
        }

        if (candidates.Count == 0)
        {
            // There were maxima, but the minimum length requirement removed all of
            // them. Stop here, otherwise the loop would never terminate.
            maxima.Clear();
            break;
        }

        // Without a disambiguator the first candidate is taken.
        // TODO other defaults may be better (longest, etc.)
        AlignedSubstring winner = _Picker == null
            ? candidates[0]
            : _Picker.PickExtension(picked, candidates);

        if (winner == null)
        {
            break;
        }

        if (blocked != null)
        {
            // Mark every cell in a row or column covered by the winner as taken
            // so that later picks cannot overlap it.
            int srcLow = winner.Source.Start;
            int srcHigh = srcLow + winner.Source.Length;
            int trgLow = winner.Target.Start;
            int trgHigh = trgLow + winner.Target.Length;

            for (int row = 1; row <= uptoSource; ++row)
            {
                bool rowTaken = row > srcLow && row <= srcHigh;
                for (int col = 1; col <= uptoTarget; ++col)
                {
                    if (rowTaken || (col > trgLow && col <= trgHigh))
                    {
                        blocked[row, col] = true;
                    }
                }
            }
        }

        picked.Add(winner);

        // A limit of zero means "no limit".
        if (_MaxItems != 0 && picked.Count >= _MaxItems)
        {
            break;
        }
    }

    return picked;
}