/// <summary>
/// Builds clusters from the supplied maximal unique matches (MUMs).
/// Each match is wrapped in a MaxUniqueMatchExtension, one union-find slot
/// (initialised to 0) is reserved per match, and the wrapped matches are
/// handed to GetClusters. The clustering steps are documented as:
/// 1. Sort MUMs based on query sequence start.
/// 2. Remove overlapping MUMs (in both sequences) and MUMs with the same
///    diagonal offset (usually adjacent).
/// 3. Check the separation between two MUMs.
/// 4. Check the diagonal separation.
/// 5. If the MUMs pass the above conditions, merge them into one cluster.
/// 6. Sort MUMs using cluster id.
/// 7. Process clusters (joining clusters).
/// </summary>
/// <param name="matches">List of maximum unique matches</param>
/// <returns>List of Cluster, or null when <paramref name="matches"/> is null or empty</returns>
public IList <Cluster> BuildClusters(IList <MaxUniqueMatch> matches)
{
    // Nothing to cluster: null and empty inputs both yield null.
    if (matches == null || matches.Count == 0)
    {
        return null;
    }

    IList <MaxUniqueMatchExtension> extensions = new List <MaxUniqueMatchExtension>();
    _unionFind = new List <int>();

    // Wrap every match and reserve its union-find entry alongside it.
    foreach (MaxUniqueMatch mum in matches)
    {
        MaxUniqueMatchExtension extended = new MaxUniqueMatchExtension(mum);
        _unionFind.Add(0);
        extensions.Add(extended);
    }

    // Delegate the actual clustering.
    IList <Cluster> result = GetClusters(extensions);
    return result;
}
/// <summary>
/// Creates a delta alignment that covers exactly one match.
/// The start coordinates are copied from the match, the end coordinates are
/// start + length - 1 (inclusive), and the query direction is taken from the cluster.
/// </summary>
/// <param name="referenceSequence">Reference sequence</param>
/// <param name="querySequence">Query sequence</param>
/// <param name="cluster">Cluster object supplying the query direction</param>
/// <param name="match">Match object supplying the coordinates and length</param>
/// <returns>Newly created DeltaAlignment object</returns>
internal static DeltaAlignment NewAlignment(
    ISequence referenceSequence,
    ISequence querySequence,
    Cluster cluster,
    MaxUniqueMatchExtension match)
{
    // Members are assigned in the order listed in the initializer.
    return new DeltaAlignment(referenceSequence, querySequence)
    {
        FirstSequenceStart = match.FirstSequenceStart,
        SecondSequenceStart = match.SecondSequenceStart,
        FirstSequenceEnd = match.FirstSequenceStart + match.Length - 1,
        SecondSequenceEnd = match.SecondSequenceStart + match.Length - 1,
        QueryDirection = cluster.QueryDirection
    };
}
/// <summary>
/// Extend the clusters in a synteny.
/// Walks the synteny's clusters (after SortCluster) and, for every match of a
/// cluster that is not skipped, either starts a new delta alignment or grows the
/// current one, then tries to extend it forward to the next match / next cluster.
/// Every visited cluster is flagged as fused.
/// </summary>
/// <param name="synteny">Synteny in which clusters need to be extended.</param>
/// <returns>List of delta alignments created while walking the clusters</returns>
private IList <DeltaAlignment> ExtendClusters(Synteny synteny)
{
    // True when the last forward extension reported success; carried across
    // matches and across clusters.
    bool isClusterExtended = false;
    IList <DeltaAlignment> deltaAlignments = new List <DeltaAlignment>();
    // Alignment currently being built / extended.
    DeltaAlignment deltaAlignment = null;
    DeltaAlignment targetAlignment = null;
    Cluster currentCluster = null;
    // NOTE(review): if Last() is LINQ's Enumerable.Last it throws on an empty
    // cluster list - confirm callers never pass an empty synteny.
    Cluster targetCluster = synteny.Clusters.Last();
    int targetReference;
    int targetQuery;
    // Despite its name this is a set of alignment flags passed to ExtendToNextSequence.
    int methodName = NUCmerAligner.ForwardAlignFlag;
    IList <Cluster> clusters = synteny.Clusters;

    // Sort the cluster by first sequence start
    clusters = SortCluster(clusters, FirstSequenceStart);
    // Two enumerators run over the same sorted list. Only "cluster" drives the
    // loop: every value taken from "previousCluster" is stored in currentCluster
    // and then overwritten by cluster.Current at the top of the next iteration.
    // NOTE(review): neither enumerator is disposed in this method.
    IEnumerator <Cluster> previousCluster = clusters.GetEnumerator();
    previousCluster.MoveNext();
    IEnumerator <Cluster> cluster = clusters.GetEnumerator();
    while (cluster.MoveNext())
    {
        currentCluster = cluster.Current;
        // Skip clusters that are already fused or reported as shadowed, but only
        // when the previous extension did not succeed.
        // NOTE(review): deltaAlignment is still null on the first iteration -
        // presumably IsClusterShadowed tolerates that; verify.
        if (!isClusterExtended && (currentCluster.IsFused || IsClusterShadowed(deltaAlignments, currentCluster, deltaAlignment)))
        {
            currentCluster.IsFused = true;
            previousCluster.MoveNext();
            // Overwritten by cluster.Current on the next iteration.
            currentCluster = previousCluster.Current;
            continue;
        }

        // Extend the match
        foreach (MaxUniqueMatchExtension match in currentCluster.Matches)
        {
            if (isClusterExtended)
            {
                // The previous extension succeeded: only a match that starts exactly
                // where the current alignment ends (in both sequences) is merged;
                // any other match is skipped.
                if (deltaAlignment.FirstSequenceEnd != match.FirstSequenceStart || deltaAlignment.SecondSequenceEnd != match.SecondSequenceStart)
                {
                    continue;
                }

                // The alignment end already sits on the match start, so the end
                // advances by Length - 1 to land on the match's last position.
                deltaAlignment.FirstSequenceEnd += match.Length - 1;
                deltaAlignment.SecondSequenceEnd += match.Length - 1;
            }
            else
            {
                // Start a fresh alignment covering just this match.
                deltaAlignment = DeltaAlignment.NewAlignment(
                    synteny.ReferenceSequence,
                    synteny.QuerySequence,
                    currentCluster,
                    match);
                deltaAlignments.Add(deltaAlignment);

                // Find the MUM which is a good candidate for extension in reverse direction
                targetAlignment = GetPreviousAlignment(deltaAlignments, deltaAlignment);
                // When the backward extension succeeds, work continues on the
                // earlier (target) alignment instead of the new one.
                if (ExtendToPreviousSequence(
                    synteny.ReferenceSequence,
                    synteny.QuerySequence,
                    deltaAlignments,
                    deltaAlignment,
                    targetAlignment))
                {
                    deltaAlignment = targetAlignment;
                }
            }

            // Flags are reset for every match.
            methodName = NUCmerAligner.ForwardAlignFlag;
            // NOTE(review): IndexOf is evaluated up to twice per match; this relies on
            // IndexOf returning the position of this very match - confirm there are
            // no equal/duplicate matches in a cluster.
            if (currentCluster.Matches.IndexOf(match) < currentCluster.Matches.Count - 1)
            {
                // extend till the match in the current cluster
                MaxUniqueMatchExtension nextMatch = currentCluster.Matches[currentCluster.Matches.IndexOf(match) + 1];
                targetReference = nextMatch.FirstSequenceStart;
                targetQuery = nextMatch.SecondSequenceStart;
                isClusterExtended = ExtendToNextSequence(
                    synteny.ReferenceSequence,
                    synteny.QuerySequence,
                    deltaAlignment,
                    targetReference,
                    targetQuery,
                    methodName);
            }
            else
            {
                // extend till next cluster
                // Targets default to the last position of each sequence; GetNextCluster
                // receives them by ref and may replace them.
                targetReference = synteny.ReferenceSequence.Count - 1;
                targetQuery = synteny.QuerySequence.Count - 1;
                targetCluster = GetNextCluster(
                    clusters,
                    currentCluster,
                    ref targetReference,
                    ref targetQuery);
                // No target cluster inside this synteny: add the optimal flag.
                if (!synteny.Clusters.Contains(targetCluster))
                {
                    methodName |= NUCmerAligner.OptimalFlag;
                }

                isClusterExtended = ExtendToNextSequence(
                    synteny.ReferenceSequence,
                    synteny.QuerySequence,
                    deltaAlignment,
                    targetReference,
                    targetQuery,
                    methodName);
            }
        }

        // An extension that did not aim at a cluster of this synteny is not
        // treated as "extended" for the next cluster.
        if (!synteny.Clusters.Contains(targetCluster))
        {
            isClusterExtended = false;
        }

        currentCluster.IsFused = true;
        // Both assignments to currentCluster below are overwritten by
        // cluster.Current at the top of the next iteration; only the
        // previousCluster.MoveNext() call has a lasting effect (on that enumerator).
        if (!isClusterExtended)
        {
            previousCluster.MoveNext();
            currentCluster = previousCluster.Current;
        }
        else
        {
            currentCluster = targetCluster;
        }
    }

    return(deltaAlignments);
}
/// <summary>
/// Find the longest increasing sub sequence from the given set of MUMs.
/// Each MUM is scored as the best chain ending at it (chain score = summed
/// lengths minus overlaps with the predecessor); the chain with the highest
/// score is kept, overlaps are trimmed from the kept MUMs, and the input
/// list is overwritten with the result.
/// </summary>
/// <param name="sortedMums">List of sorted MUMs; cleared and refilled in place</param>
/// <returns>
/// The same list instance, now holding the longest increasing subsequence.
/// An input that converts to no MUMs yields the (cleared) empty list.
/// </returns>
public IList <MaxUniqueMatch> GetLongestSequence(IList <MaxUniqueMatch> sortedMums)
{
    MaxUniqueMatchExtension[] matches = ConvertToMUMExtension(sortedMums);

    // Without this guard the best-chain search below indexes element 0 of an
    // empty array. Clear the list so the result matches the normal path.
    if (matches.Length == 0)
    {
        sortedMums.Clear();
        return sortedMums;
    }

    for (var current = 0; current < matches.Length; current++)
    {
        var candidate = matches[current];

        // Every MUM starts as a chain consisting only of itself.
        candidate.Score = candidate.Length;
        candidate.WrapScore = candidate.Length;
        candidate.Adjacent = 0;
        candidate.From = -1;

        for (var earlier = 0; earlier < current; earlier++)
        {
            MaxUniqueMatchExtension predecessor = matches[earlier];

            // Overlap with the predecessor in the query sequence, clamped at zero.
            var queryOverlap = predecessor.SecondSequenceStart + predecessor.Length;
            queryOverlap -= candidate.SecondSequenceStart;
            var overlap = queryOverlap > 0 ? queryOverlap : 0;

            // Wrap score using the query overlap only.
            var score = predecessor.Score + candidate.Length - overlap;
            if (score > candidate.WrapScore)
            {
                candidate.WrapScore = score;
            }

            // From here on use the larger of the query and reference overlaps.
            var referenceOverlap = predecessor.FirstSequenceStart + predecessor.Length - candidate.FirstSequenceStart;
            if (referenceOverlap > overlap)
            {
                overlap = referenceOverlap;
            }

            score = predecessor.Score + candidate.Length - overlap;
            if (score > candidate.Score)
            {
                // Best chain so far: link back to the predecessor and remember
                // how much of this MUM overlaps it.
                candidate.From = earlier;
                candidate.Score = score;
                candidate.Adjacent = overlap;
            }

            // Wrap score continuing from the predecessor's wrap score.
            score = predecessor.WrapScore + candidate.Length - overlap;
            if (score >= candidate.WrapScore)
            {
                candidate.WrapScore = score;
            }
        }
    }

    // The chain ending at the highest-scoring MUM is the result; on ties the
    // earliest MUM wins because the comparison is strict.
    var best = 0;
    var bestScore = matches[best].Score;
    for (var index = 1; index < matches.Length; index++)
    {
        if (matches[index].Score > bestScore)
        {
            best = index;
            bestScore = matches[best].Score;
        }
    }

    // Follow the From links back from the best MUM (From == -1 ends the chain)
    // and mark every MUM on the way as "Good".
    for (var index = best; index >= 0; index = matches[index].From)
    {
        matches[index].IsGood = true;
    }

    // Rebuild the caller's list from the marked MUMs, trimming the recorded
    // overlap from the front of each and dropping MUMs trimmed to nothing.
    sortedMums.Clear();
    foreach (var mum in matches)
    {
        if (!mum.IsGood)
        {
            continue;
        }

        var adjacent = mum.Adjacent;
        if (0 != adjacent)
        {
            mum.FirstSequenceStart += adjacent;
            mum.SecondSequenceStart += adjacent;
            mum.Length -= adjacent;
        }

        if (0 < mum.Length)
        {
            sortedMums.Add((MaxUniqueMatch)mum);
        }
    }

    // Return the list of MUMs that represent the longest increasing subsequence
    return sortedMums;
}