/// <summary>
/// Prepares this aligner before any alignment work is done: points the
/// consensus resolver at the ambiguous alphabet corresponding to the input
/// sequence's alphabet, creating the resolver on first use.
/// </summary>
/// <param name="inputSequence">Sequence whose alphabet drives the resolver setup.</param>
private void InitializeAlign(ISequence inputSequence)
{
    // Resolve the ambiguous counterpart of the sequence's alphabet once.
    var ambiguousAlphabet = Alphabets.AmbiguousAlphabetMap[inputSequence.Alphabet];

    if (ConsensusResolver == null)
    {
        // First call: create a resolver bound to the ambiguous alphabet.
        ConsensusResolver = new SimpleConsensusResolver(ambiguousAlphabet);
    }
    else
    {
        // Resolver already exists: just retarget its alphabet.
        ConsensusResolver.SequenceAlphabet = ambiguousAlphabet;
    }
}
/// <summary>
/// Prepares this aligner before any alignment work is done: binds the
/// consensus resolver to the input sequence's alphabet, creating the
/// resolver on first use.
/// </summary>
/// <param name="inputSequence">Sequence whose alphabet drives the resolver setup.</param>
private void InitializeAlign(ISequence inputSequence)
{
    var alphabet = inputSequence.Alphabet;

    if (ConsensusResolver == null)
    {
        // First call: create a resolver for this alphabet.
        ConsensusResolver = new SimpleConsensusResolver(alphabet);
    }
    else
    {
        // Resolver already exists: just retarget its alphabet.
        ConsensusResolver.SequenceAlphabet = alphabet;
    }
}
/// <summary>
/// Analyzes the passed contig and stores a consensus into its Consensus
/// property. Exposed publicly so that test automation can exercise the
/// consensus-generation step in isolation.
/// </summary>
/// <param name="alphabet">Alphabet used when resolving consensus symbols.</param>
/// <param name="contig">Contig for which the consensus is to be constructed.</param>
public void MakeConsensus(IAlphabet alphabet, Contig contig)
{
    // Remember the alphabet for later use by other members.
    _sequenceAlphabet = alphabet;

    // Ensure a resolver exists and is bound to the requested alphabet.
    if (ConsensusResolver == null)
    {
        ConsensusResolver = new SimpleConsensusResolver(alphabet);
    }
    else
    {
        ConsensusResolver.SequenceAlphabet = alphabet;
    }

    // Delegate the actual consensus computation to the single-argument overload.
    MakeConsensus(contig);
}
/// <summary>
/// Main driver of the step-by-step alignment algorithm: configures the
/// consensus resolver from the reference sequence's alphabet and then
/// dispatches to the MUM-accumulating or non-accumulating pipeline
/// depending on the StoreMUMs flag.
/// </summary>
/// <param name="referenceSequence">Reference sequence.</param>
/// <param name="querySequenceList">List of query sequences to align.</param>
/// <returns>A list of pairwise sequence alignments.</returns>
private IList<IPairwiseSequenceAlignment> Alignment(
    ISequence referenceSequence,
    IList<ISequence> querySequenceList)
{
    // Bind the consensus resolver to the reference alphabet, creating it on first use.
    if (ConsensusResolver == null)
    {
        ConsensusResolver = new SimpleConsensusResolver(referenceSequence.Alphabet);
    }
    else
    {
        ConsensusResolver.SequenceAlphabet = referenceSequence.Alphabet;
    }

    // Select the pipeline based on whether MUMs should be retained.
    return StoreMUMs
        ? AlignmentWithAccumulatedMUMs(referenceSequence, querySequenceList)
        : AlignmentWithoutAccumulatedMUMs(referenceSequence, querySequenceList);
}
/// <summary>
/// Main driver of the step-by-step alignment algorithm: runs NUCmer against
/// each reference sequence to collect delta alignments for all queries, then
/// converts those deltas into scored pairwise alignments with consensus.
/// </summary>
/// <param name="referenceSequenceList">Reference sequences. NOTE(review): this
/// IEnumerable is enumerated several times (ElementAt, foreach, Count(),
/// ConcatSequence) — callers should pass a materialized collection.</param>
/// <param name="querySequenceList">Query sequences to align; also enumerated
/// once per reference plus once for result building.</param>
/// <returns>A list of pairwise sequence alignments, one per query (empty if no
/// deltas were found for any query).</returns>
private IEnumerable<IPairwiseSequenceAlignment> Alignment(IEnumerable<ISequence> referenceSequenceList, IEnumerable<ISequence> querySequenceList)
{
    // Resolver is always rebuilt here from the first reference's alphabet.
    ConsensusResolver = new SimpleConsensusResolver(referenceSequenceList.ElementAt(0).Alphabet);

    IList<IPairwiseSequenceAlignment> results = new List<IPairwiseSequenceAlignment>();
    IPairwiseSequenceAlignment sequenceAlignment;
    IList<PairwiseAlignedSequence> alignments;

    // Accumulates delta alignments across ALL references for ALL queries.
    List<DeltaAlignment> deltas = new List<DeltaAlignment>();

    foreach (ISequence refSequence in referenceSequenceList)
    {
        // A fresh NUCmer instance per reference sequence.
        this.nucmerAlgo = new NUCmer((Sequence)refSequence);

        // Forward only the settings the caller changed from their defaults;
        // otherwise NUCmer's own defaults are left in place.
        if (GapOpenCost != DefaultGapOpenCost)
        {
            this.nucmerAlgo.GapOpenCost = GapOpenCost;
        }

        if (GapExtensionCost != DefaultGapExtensionCost)
        {
            this.nucmerAlgo.GapExtensionCost = GapExtensionCost;
        }

        if (LengthOfMUM != DefaultLengthOfMUM)
        {
            this.nucmerAlgo.LengthOfMUM = LengthOfMUM;
        }

        // Same pattern for the ClusterBuilder-related properties: only
        // propagate non-default values set by the caller.
        if (FixedSeparation != ClusterBuilder.DefaultFixedSeparation)
        {
            this.nucmerAlgo.FixedSeparation = FixedSeparation;
        }

        if (MaximumSeparation != ClusterBuilder.DefaultMaximumSeparation)
        {
            this.nucmerAlgo.MaximumSeparation = MaximumSeparation;
        }

        if (MinimumScore != ClusterBuilder.DefaultMinimumScore)
        {
            this.nucmerAlgo.MinimumScore = MinimumScore;
        }

        if (SeparationFactor != ClusterBuilder.DefaultSeparationFactor)
        {
            this.nucmerAlgo.SeparationFactor = SeparationFactor;
        }

        if (BreakLength != ModifiedSmithWaterman.DefaultBreakLength)
        {
            this.nucmerAlgo.BreakLength = BreakLength;
        }

        this.nucmerAlgo.ConsensusResolver = ConsensusResolver;
        if (SimilarityMatrix != null)
        {
            this.nucmerAlgo.SimilarityMatrix = SimilarityMatrix;
        }

        // Collect delta alignments of every query against this reference.
        foreach (ISequence querySequence in querySequenceList)
        {
            IEnumerable<DeltaAlignment> deltaAlignment = this.nucmerAlgo.GetDeltaAlignments(querySequence);
            deltas.AddRange(deltaAlignment);
        }
    }

    if (deltas.Count > 0)
    {
        // When there are multiple references, concatenate them into a single
        // sequence so each result alignment has one reference to point at.
        ISequence concatReference = referenceSequenceList.ElementAt(0);
        if (referenceSequenceList.Count() > 1)
        {
            concatReference = ConcatSequence(referenceSequenceList);
        }

        foreach (ISequence querySequence in querySequenceList)
        {
            // Pick out the deltas belonging to this query (matched by
            // reference-equality/Equals on the query sequence object).
            List<DeltaAlignment> qDelta = deltas.Where(d => d.QuerySequence.Equals(querySequence)).ToList();
            sequenceAlignment = new PairwiseSequenceAlignment(concatReference, querySequence);

            // Convert delta alignments to sequence alignments.
            alignments = ConvertDeltaToAlignment(qDelta);
            if (alignments.Count > 0)
            {
                foreach (PairwiseAlignedSequence align in alignments)
                {
                    // Score the aligned pair.
                    align.Score = CalculateScore(
                        align.FirstSequence,
                        align.SecondSequence);

                    // Build the consensus for the aligned pair.
                    align.Consensus = MakeConsensus(
                        align.FirstSequence,
                        align.SecondSequence);

                    sequenceAlignment.PairwiseAlignedSequences.Add(align);
                }
            }

            results.Add(sequenceAlignment);
        }
    }

    return (results);
}
/// <summary>
/// Main driver of the step-by-step MUMmer alignment algorithm: builds a
/// suffix tree over the (possibly concatenated) reference, then for each
/// query streams MUMs, clusters them, derives delta alignments, and converts
/// those into scored pairwise alignments with consensus. Derived classes
/// follow the flow defined by this method.
/// </summary>
/// <param name="referenceSequenceList">Reference sequences (concatenated into
/// one when there is more than one).</param>
/// <param name="querySequenceList">Query sequences to align.</param>
/// <returns>A list of pairwise sequence alignments, one per query (queries
/// equal to the reference are skipped).</returns>
private IList<IPairwiseSequenceAlignment> Alignment(
    IList<ISequence> referenceSequenceList,
    IList<ISequence> querySequenceList)
{
    // Bind the consensus resolver to the first reference's alphabet,
    // creating the resolver on first use.
    if (referenceSequenceList.Count > 0)
    {
        if (ConsensusResolver == null)
        {
            ConsensusResolver = new SimpleConsensusResolver(referenceSequenceList[0].Alphabet);
        }
        else
        {
            ConsensusResolver.SequenceAlphabet = referenceSequenceList[0].Alphabet;
        }
    }

    IList<IPairwiseSequenceAlignment> results = new List<IPairwiseSequenceAlignment>();
    IPairwiseSequenceAlignment sequenceAlignment = null;
    IList<DeltaAlignment> deltaAlignments = null;
    IList<PairwiseAlignedSequence> alignments = null;
    ISequence referenceSequence = null;

    // Validate the input; Validate is expected to throw on bad input.
    Validate(referenceSequenceList, querySequenceList);

    // Step 1: concatenate all reference sequences into one sequence
    // (or use the single reference directly).
    if (referenceSequenceList.Count > 1)
    {
        referenceSequence = ConcatSequence(referenceSequenceList);
    }
    else
    {
        referenceSequence = referenceSequenceList[0];
    }

    // Remember the effective reference sequence for later steps.
    _referenceSequence = referenceSequence;

    // Step 2: build the suffix tree over the reference sequence.
    _suffixTree = BuildSuffixTree(_referenceSequence);

    // Align each query sequence against the reference.
    foreach (ISequence sequence in querySequenceList)
    {
        // Skip a query that is the reference itself.
        if (sequence.Equals(referenceSequence))
        {
            continue;
        }

        sequenceAlignment = new PairwiseSequenceAlignment(referenceSequence, sequence);

        // Step 3: stream the query through the suffix tree to find MUMs
        // of at least LengthOfMUM.
        _mumList = Streaming(_suffixTree, sequence, LengthOfMUM);
        if (_mumList.Count > 0)
        {
            // Step 5: group the MUMs into clusters.
            _clusterList = GetClusters(_mumList);

            // Step 7: process clusters into delta alignments.
            deltaAlignments = ProcessCluster(
                referenceSequenceList,
                _clusterList);

            // Step 8: convert delta alignments to sequence alignments.
            alignments = ConvertDeltaToAlignment(deltaAlignments);
            if (alignments.Count > 0)
            {
                foreach (PairwiseAlignedSequence align in alignments)
                {
                    // Score the aligned pair.
                    align.Score = CalculateScore(
                        align.FirstSequence,
                        align.SecondSequence);

                    // Build the consensus for the aligned pair.
                    align.Consensus = MakeConsensus(
                        align.FirstSequence,
                        align.SecondSequence);

                    sequenceAlignment.PairwiseAlignedSequences.Add(align);
                }
            }
        }

        // An alignment object is added even when no MUMs were found
        // (it will simply contain no aligned sequence pairs).
        results.Add(sequenceAlignment);
    }

    return (results);
}
/// <summary>
/// Assemble the input sequences into the largest possible contigs.
/// </summary>
/// <remarks>
/// The algorithm is:
/// 1. initialize list of contigs to empty list. List of seqs is passed as argument.
/// 2. compute pairwise overlap scores for each pair of input seqs (with reversal and
///    complementation as appropriate).
/// 3. choose best overlap score. the "merge items" (can be seqs or contigs) are the
///    items with that score. If best score is less than threshold, assembly is finished.
/// 4. merge the merge items into a single contig and remove them from their list(s)
/// 5. compute the overlap between new item and all existing items
/// 6. go to step 3
/// </remarks>
/// <param name="inputSequences">The sequences to assemble.</param>
/// <returns>Returns the OverlapDeNovoAssembly instance which contains list of
/// contigs and list of unmerged sequences which are result of this assembly.</returns>
public IDeNovoAssembly Assemble(IEnumerable<ISequence> inputSequences)
{
    if (null == inputSequences)
    {
        throw new ArgumentNullException(Properties.Resource.ParameterNameInputSequences);
    }

    // Numbering convention: every pool item (whether sequence or contig)
    // gets a fixed number:
    //   sequence index = index into inputs (which we won't modify)
    //   contig index   = nSequences + index into contigs
    List<PoolItem> pool = inputSequences.Select(seq => new PoolItem(seq)).ToList();

    // Bind the consensus resolver to the first sequence's alphabet,
    // creating the resolver on first use.
    int sequenceCount = pool.Count;
    if (sequenceCount > 0)
    {
        _sequenceAlphabet = pool[0].Sequence.Alphabet;

        if (ConsensusResolver == null)
        {
            ConsensusResolver = new SimpleConsensusResolver(_sequenceAlphabet);
        }
        else
        {
            ConsensusResolver.SequenceAlphabet = _sequenceAlphabet;
        }
    }

    // All initial sequences are now in the pool (no contigs yet). Generate the
    // pair scores; to save an iteration, also track the best global score as we go.
    ItemScore globalBest = new ItemScore(-1, -1, false, false, 0, 0);
    int globalBestLargerIndex = -1;
    int unconsumedCount = sequenceCount;

    // Compute alignment scores for all combinations between input sequences.
    // Scores are stored on the higher-indexed item of each pair, so
    // pool[n].Scores[m] (m < n) is the score of pair (m, n).
    for (int newSeq = 0; newSeq < pool.Count; ++newSeq)
    {
        PoolItem newItem = pool[newSeq];
        for (int oldSeq = 0; oldSeq < newSeq; ++oldSeq)
        {
            PoolItem oldItem = pool[oldSeq];
            ItemScore score = AlignSequence(oldItem.SequenceOrConsensus, newItem.SequenceOrConsensus, oldSeq, newSeq);
            newItem.Scores.Add(score);
            if (score.OverlapScore > globalBest.OverlapScore)
            {
                globalBest = new ItemScore(score);
                globalBestLargerIndex = newSeq;
            }
        }
    }

    // Merge the best-scoring pair if its score reaches the threshold,
    // adding the resulting contig to the pool.
    if (globalBest.OverlapScore >= MergeThreshold)
    {
        if (Trace.Want(Trace.AssemblyDetails))
        {
            ApplicationLog.WriteLine("Merging (overlap score {0}):", globalBest.OverlapScore);
        }

        PoolItem mergeItem1 = pool[globalBest.OtherItem];
        PoolItem mergeItem2 = pool[globalBestLargerIndex];
        Contig newContig = new Contig();
        if (Trace.Want(Trace.AssemblyDetails))
        {
            ApplicationLog.WriteLine(
                "new pool item {0} will merge old items {1} and {2}",
                pool.Count,
                globalBest.OtherItem,
                globalBestLargerIndex);
        }

        // First merge is always sequence+sequence (no contigs exist yet).
        MergeLowerIndexedSequence(newContig, globalBest, mergeItem1.Sequence);
        MergeHigherIndexedSequence(newContig, globalBest, mergeItem2.Sequence);
        MakeConsensus(newContig);

        // Mark both items as consumed by the new contig and free their
        // sequence memory, since they are no longer used directly.
        mergeItem1.ConsumedBy = pool.Count;
        mergeItem2.ConsumedBy = pool.Count;
        mergeItem1.FreeSequences();
        mergeItem2.FreeSequences();
        pool.Add(new PoolItem(newContig));
        unconsumedCount--;

        // Greedy loop: keep merging the best-scoring unconsumed pair until no
        // pair reaches the threshold or only one item remains.
        while (unconsumedCount > 1)
        {
            // Compute scores for each unconsumed item against the new contig
            // (the last item in the pool).
            int newSeq = pool.Count - 1;
            PoolItem newItem = pool[newSeq];
            for (int oldSeq = 0; oldSeq < pool.Count - 1; ++oldSeq)
            {
                PoolItem oldItem = pool[oldSeq];
                if (oldItem.ConsumedBy >= 0)
                {
                    // Already consumed — add a dummy score to keep indices aligned.
                    newItem.Scores.Add(new ItemScore());
                }
                else
                {
                    ItemScore score = AlignSequence(oldItem.SequenceOrConsensus, newItem.SequenceOrConsensus, oldSeq, newSeq);
                    newItem.Scores.Add(score);
                }
            }

            // Find the best global score among unconsumed items in the modified pool.
            globalBest = new ItemScore(-1, -1, false, false, 0, 0);
            globalBestLargerIndex = -1;
            for (int current = 0; current < pool.Count; ++current)
            {
                PoolItem curItem = pool[current];
                if (curItem.ConsumedBy < 0)
                {
                    for (int other = 0; other < current; ++other)
                    {
                        if (pool[other].ConsumedBy < 0)
                        {
                            ItemScore itemScore = curItem.Scores[other];
                            if (itemScore.OverlapScore > globalBest.OverlapScore)
                            {
                                globalBest = new ItemScore(itemScore); // copy the winner so far
                                globalBestLargerIndex = current;
                            }
                        }
                    }
                }
            }

            if (globalBest.OverlapScore >= MergeThreshold)
            {
                // Merge the winning pair; each side may be a sequence or a contig.
                mergeItem1 = pool[globalBest.OtherItem];
                mergeItem2 = pool[globalBestLargerIndex];
                newContig = new Contig();
                if (mergeItem1.IsContig)
                {
                    if (Trace.Want(Trace.AssemblyDetails))
                    {
                        ApplicationLog.WriteLine(
                            "item {0} is a contig (reversed = {1}, complemented = {2}, offset = {3}",
                            globalBest.OtherItem,
                            globalBest.Reversed,
                            globalBest.Complemented,
                            globalBest.FirstOffset);
                    }

                    MergeLowerIndexedContig(newContig, globalBest, mergeItem1.Contig);
                }
                else
                {
                    if (Trace.Want(Trace.AssemblyDetails))
                    {
                        ApplicationLog.WriteLine(
                            "item {0} is a sequence (reversed = {1}, complemented = {2}, offset = {3}",
                            globalBest.OtherItem,
                            globalBest.Reversed,
                            globalBest.Complemented,
                            globalBest.FirstOffset);
                    }

                    MergeLowerIndexedSequence(newContig, globalBest, mergeItem1.Sequence);
                }

                if (mergeItem2.IsContig)
                {
                    if (Trace.Want(Trace.AssemblyDetails))
                    {
                        ApplicationLog.WriteLine(
                            "item {0} is a contig (offset = {1}",
                            globalBestLargerIndex,
                            globalBest.SecondOffset);
                    }

                    MergeHigherIndexedContig(newContig, globalBest, mergeItem2.Contig);
                }
                else
                {
                    if (Trace.Want(Trace.AssemblyDetails))
                    {
                        ApplicationLog.WriteLine(
                            "item {0} is a sequence (offset = {1}",
                            globalBestLargerIndex,
                            globalBest.SecondOffset);
                    }

                    MergeHigherIndexedSequence(newContig, globalBest, mergeItem2.Sequence);
                }

                MakeConsensus(newContig);
                if (Trace.Want(Trace.AssemblyDetails))
                {
                    Dump(newContig);
                }

                // Mark both items as consumed by the new contig and free their
                // sequence memory, since they are no longer used directly.
                mergeItem1.ConsumedBy = pool.Count;
                mergeItem2.ConsumedBy = pool.Count;
                mergeItem1.FreeSequences();
                mergeItem2.FreeSequences();
                pool.Add(new PoolItem(newContig));
                unconsumedCount--;
            }
            else
            {
                // No remaining pair reaches the threshold; stop merging.
                break;
            }
        }
    }

    // No further qualifying merges — collect unconsumed items into the result:
    // contigs and unmerged sequences.
    OverlapDeNovoAssembly sequenceAssembly = new OverlapDeNovoAssembly();
    foreach (PoolItem curItem in pool)
    {
        if (curItem.ConsumedBy < 0)
        {
            if (curItem.IsContig)
            {
                sequenceAssembly.Contigs.Add(curItem.Contig);
            }
            else
            {
                sequenceAssembly.UnmergedSequences.Add(curItem.Sequence);
            }
        }
    }

    return sequenceAssembly;
}
/// <summary>
/// Assemble the input sequences into the largest possible contigs.
/// </summary>
/// <remarks>
/// The algorithm is:
/// 1. initialize list of contigs to empty list. List of seqs is passed as argument.
/// 2. compute pairwise overlap scores for each pair of input seqs (with reversal and
///    complementation as appropriate).
/// 3. choose best overlap score. the "merge items" (can be seqs or contigs) are the
///    items with that score. If best score is less than threshold, assembly is finished.
/// 4. merge the merge items into a single contig and remove them from their list(s)
/// 5. compute the overlap between new item and all existing items
/// 6. go to step 3
/// </remarks>
/// <param name="inputSequences">The sequences to assemble. NOTE(review): this
/// IEnumerable is enumerated more than once (Count(), First(), foreach) —
/// callers should pass a materialized collection.</param>
/// <returns>Returns the OverlapDeNovoAssembly instance which contains list of
/// contigs and list of unmerged sequences which are result of this assembly.</returns>
public IDeNovoAssembly Assemble(IEnumerable<ISequence> inputSequences)
{
    if (null == inputSequences)
    {
        throw new ArgumentNullException(Properties.Resource.ParameterNameInputSequences);
    }

    // Bind the consensus resolver to the first sequence's alphabet,
    // creating the resolver on first use.
    if (inputSequences.Count() > 0)
    {
        _sequenceAlphabet = inputSequences.First().Alphabet;

        if (ConsensusResolver == null)
        {
            ConsensusResolver = new SimpleConsensusResolver(_sequenceAlphabet);
        }
        else
        {
            ConsensusResolver.SequenceAlphabet = _sequenceAlphabet;
        }
    }

    OverlapDeNovoAssembly sequenceAssembly = null;

    // Numbering convention: every pool item (whether sequence or contig)
    // gets a fixed number:
    //   sequence index = index into inputs (which we won't modify)
    //   contig index   = nSequences + index into contigs
    List<PoolItem> pool = new List<PoolItem>();
    foreach (ISequence seq in inputSequences)
    {
        pool.Add(new PoolItem(seq));
    }

    // All initial sequences are now in the pool (no contigs yet). Generate the
    // pair scores; to save an iteration, also track the best global score as we go.
    ItemScore globalBest = new ItemScore(-1, -1, false, false, 0, 0);
    int globalBestLargerIndex = -1;
    int unconsumedCount = inputSequences.Count();

    // Compute alignment scores for all combinations between input sequences.
    // Scores are stored on the higher-indexed item of each pair, so
    // pool[n].Scores[m] (m < n) is the score of pair (m, n).
    for (int newSeq = 0; newSeq < pool.Count; ++newSeq)
    {
        PoolItem newItem = pool[newSeq];
        for (int oldSeq = 0; oldSeq < newSeq; ++oldSeq)
        {
            PoolItem oldItem = pool[oldSeq];
            ItemScore score = AlignSequence(oldItem.SequenceOrConsensus, newItem.SequenceOrConsensus, oldSeq, newSeq);
            newItem.Scores.Add(score);
            if (score.OverlapScore > globalBest.OverlapScore)
            {
                globalBest = new ItemScore(score);
                globalBestLargerIndex = newSeq;
            }
        }
    }

    // Merge the best-scoring pair if its score reaches the threshold,
    // adding the resulting contig to the pool.
    if (globalBest.OverlapScore >= MergeThreshold)
    {
        if (Trace.Want(Trace.AssemblyDetails))
        {
            ApplicationLog.WriteLine("Merging (overlap score {0}):", globalBest.OverlapScore);
        }

        PoolItem mergeItem1 = pool[globalBest.OtherItem];
        PoolItem mergeItem2 = pool[globalBestLargerIndex];
        Contig newContig = new Contig();
        if (Trace.Want(Trace.AssemblyDetails))
        {
            ApplicationLog.WriteLine(
                "new pool item {0} will merge old items {1} and {2}",
                pool.Count,
                globalBest.OtherItem,
                globalBestLargerIndex);
        }

        // First merge is always sequence+sequence (no contigs exist yet).
        MergeLowerIndexedSequence(newContig, globalBest, mergeItem1.Sequence);
        MergeHigherIndexedSequence(newContig, globalBest, mergeItem2.Sequence);
        MakeConsensus(newContig);

        // Mark both items as consumed by the new contig and free their
        // sequence memory, since they are no longer used directly.
        mergeItem1.ConsumedBy = pool.Count;
        mergeItem2.ConsumedBy = pool.Count;
        mergeItem1.FreeSequences();
        mergeItem2.FreeSequences();
        pool.Add(new PoolItem(newContig));
        unconsumedCount--;

        // Greedy loop: keep merging the best-scoring unconsumed pair until no
        // pair reaches the threshold or only one item remains.
        while (unconsumedCount > 1)
        {
            // NOTE(review): this reset is redundant — globalBest and
            // globalBestLargerIndex are reset again below, before the search.
            globalBest = new ItemScore(-1, -1, false, false, 0, 0);
            globalBestLargerIndex = -1;

            // Compute scores for each unconsumed item against the new contig
            // (the last item in the pool).
            int newSeq = pool.Count - 1;
            PoolItem newItem = pool[newSeq];
            for (int oldSeq = 0; oldSeq < pool.Count - 1; ++oldSeq)
            {
                PoolItem oldItem = pool[oldSeq];
                if (oldItem.ConsumedBy >= 0)
                {
                    // Already consumed — add a dummy score to keep indices aligned.
                    newItem.Scores.Add(new ItemScore());
                }
                else
                {
                    ItemScore score = AlignSequence(oldItem.SequenceOrConsensus, newItem.SequenceOrConsensus, oldSeq, newSeq);
                    newItem.Scores.Add(score);
                }
            }

            // Find the best global score among unconsumed items in the modified pool.
            globalBest = new ItemScore(-1, -1, false, false, 0, 0);
            globalBestLargerIndex = -1;
            for (int current = 0; current < pool.Count; ++current)
            {
                PoolItem curItem = pool[current];
                if (curItem.ConsumedBy < 0)
                {
                    for (int other = 0; other < current; ++other)
                    {
                        if (pool[other].ConsumedBy < 0)
                        {
                            ItemScore itemScore = curItem.Scores[other];
                            if (itemScore.OverlapScore > globalBest.OverlapScore)
                            {
                                globalBest = new ItemScore(itemScore); // copy the winner so far
                                globalBestLargerIndex = current;
                            }
                        }
                    }
                }
            }

            if (globalBest.OverlapScore >= MergeThreshold)
            {
                // Merge the winning pair; each side may be a sequence or a contig.
                mergeItem1 = pool[globalBest.OtherItem];
                mergeItem2 = pool[globalBestLargerIndex];
                newContig = new Contig();
                if (mergeItem1.IsContig)
                {
                    if (Trace.Want(Trace.AssemblyDetails))
                    {
                        ApplicationLog.WriteLine(
                            "item {0} is a contig (reversed = {1}, complemented = {2}, offset = {3}",
                            globalBest.OtherItem,
                            globalBest.Reversed,
                            globalBest.Complemented,
                            globalBest.FirstOffset);
                    }

                    MergeLowerIndexedContig(newContig, globalBest, mergeItem1.Contig);
                }
                else
                {
                    if (Trace.Want(Trace.AssemblyDetails))
                    {
                        ApplicationLog.WriteLine(
                            "item {0} is a sequence (reversed = {1}, complemented = {2}, offset = {3}",
                            globalBest.OtherItem,
                            globalBest.Reversed,
                            globalBest.Complemented,
                            globalBest.FirstOffset);
                    }

                    MergeLowerIndexedSequence(newContig, globalBest, mergeItem1.Sequence);
                }

                if (mergeItem2.IsContig)
                {
                    if (Trace.Want(Trace.AssemblyDetails))
                    {
                        ApplicationLog.WriteLine(
                            "item {0} is a contig (offset = {1}",
                            globalBestLargerIndex,
                            globalBest.SecondOffset);
                    }

                    MergeHigherIndexedContig(newContig, globalBest, mergeItem2.Contig);
                }
                else
                {
                    if (Trace.Want(Trace.AssemblyDetails))
                    {
                        ApplicationLog.WriteLine(
                            "item {0} is a sequence (offset = {1}",
                            globalBestLargerIndex,
                            globalBest.SecondOffset);
                    }

                    MergeHigherIndexedSequence(newContig, globalBest, mergeItem2.Sequence);
                }

                MakeConsensus(newContig);
                if (Trace.Want(Trace.AssemblyDetails))
                {
                    Dump(newContig);
                }

                // Mark both items as consumed by the new contig and free their
                // sequence memory, since they are no longer used directly.
                mergeItem1.ConsumedBy = pool.Count;
                mergeItem2.ConsumedBy = pool.Count;
                mergeItem1.FreeSequences();
                mergeItem2.FreeSequences();
                pool.Add(new PoolItem(newContig));
                unconsumedCount--;
            }
            else
            {
                // No remaining pair reaches the threshold; stop merging.
                break;
            }
        }
    }

    // No further qualifying merges — collect unconsumed items into the result:
    // contigs and unmerged sequences.
    sequenceAssembly = new OverlapDeNovoAssembly();
    foreach (PoolItem curItem in pool)
    {
        if (curItem.ConsumedBy < 0)
        {
            if (curItem.IsContig)
            {
                sequenceAssembly.Contigs.Add(curItem.Contig);
            }
            else
            {
                sequenceAssembly.UnmergedSequences.Add(curItem.Sequence);
            }
        }
    }

    return (sequenceAssembly);
}
/// <summary>
/// Performs Stage 1 (k-mer distance + progressive alignment), Stage 2
/// (Kimura-distance re-clustering + re-alignment) and Stage 3 (tree-cut
/// refinement) as described in the class description, keeping the best-scoring
/// alignment in AlignedSequences/AlignmentScore.
/// </summary>
/// <param name="sequences">Input sequences; must be non-empty and this.alphabet
/// must already be set.</param>
private void DoAlignment(IList<ISequence> sequences)
{
    Debug.Assert(this.alphabet != null);
    Debug.Assert(sequences.Count > 0);

    // Bind the consensus resolver to the configured alphabet,
    // creating the resolver on first use.
    if (ConsensusResolver == null)
        ConsensusResolver = new SimpleConsensusResolver(this.alphabet);
    else
        ConsensusResolver.SequenceAlphabet = this.alphabet;

    // Select the profile aligner implementation: serial vs. parallel variant
    // of the requested algorithm, based on the configured degree of parallelism.
    IProfileAligner profileAligner = null;
    switch (ProfileAlignerName)
    {
        case (ProfileAlignerNames.NeedlemanWunschProfileAligner):
            if (this.degreeOfParallelism == 1)
            {
                profileAligner = new NeedlemanWunschProfileAlignerSerial(
                    SimilarityMatrix, ProfileProfileFunctionName, GapOpenCost, GapExtensionCost, this.numberOfPartitions);
            }
            else
            {
                profileAligner = new NeedlemanWunschProfileAlignerParallel(
                    SimilarityMatrix, ProfileProfileFunctionName, GapOpenCost, GapExtensionCost, this.numberOfPartitions);
            }
            break;
        case (ProfileAlignerNames.SmithWatermanProfileAligner):
            if (this.degreeOfParallelism == 1)
            {
                profileAligner = new SmithWatermanProfileAlignerSerial(
                    SimilarityMatrix, ProfileProfileFunctionName, GapOpenCost, GapExtensionCost, this.numberOfPartitions);
            }
            else
            {
                profileAligner = new SmithWatermanProfileAlignerParallel(
                    SimilarityMatrix, ProfileProfileFunctionName, GapOpenCost, GapExtensionCost, this.numberOfPartitions);
            }
            break;
        default:
            throw new ArgumentException("Invalid profile aligner name");
    }

    this.AlignedSequences = new List<ISequence>(sequences.Count);
    float currentScore = 0;

    // STAGE 1
    ReportLog("Stage 1");

    // Generate the distance matrix from k-mer counts.
    var kmerDistanceMatrixGenerator =
        new KmerDistanceMatrixGenerator(sequences, KmerLength, this.alphabet, DistanceFunctionName);

    // Hierarchical clustering over the distance matrix.
    IHierarchicalClustering hierarcicalClustering =
        new HierarchicalClusteringParallel(kmerDistanceMatrixGenerator.DistanceMatrix, HierarchicalClusteringMethodName);

    // Build the guide tree and run the progressive alignment along it.
    var binaryGuideTree = new BinaryGuideTree(hierarcicalClustering);
    IProgressiveAligner progressiveAlignerA = new ProgressiveAligner(profileAligner);
    progressiveAlignerA.Align(sequences, binaryGuideTree);

    currentScore = MsaUtils.MultipleAlignmentScoreFunction(progressiveAlignerA.AlignedSequences, SimilarityMatrix, GapOpenCost, GapExtensionCost);
    if (currentScore > this.AlignmentScoreA)
    {
        this.AlignmentScoreA = currentScore;
        this.AlignedSequencesA = progressiveAlignerA.AlignedSequences;
    }
    if (this.AlignmentScoreA > this.AlignmentScore)
    {
        this.AlignmentScore = this.AlignmentScoreA;
        this.AlignedSequences = this.AlignedSequencesA;
    }

    if (PAMSAMMultipleSequenceAligner.FasterVersion)
    {
        // Fast mode: skip stages 2 and 3 and reuse the stage-1 result everywhere.
        this.AlignedSequencesB = this.AlignedSequencesA;
        this.AlignedSequencesC = this.AlignedSequencesA;
        this.AlignmentScoreB = this.AlignmentScoreA;
        this.AlignmentScoreC = this.AlignmentScoreA;
    }
    else
    {
        BinaryGuideTree binaryGuideTreeB = null;
        IHierarchicalClustering hierarcicalClusteringB = null;
        KimuraDistanceMatrixGenerator kimuraDistanceMatrixGenerator = new KimuraDistanceMatrixGenerator();

        if (UseStageB)
        {
            // STAGE 2
            ReportLog("Stage 2");

            // Re-estimate distances from the current alignment (Kimura),
            // rebuild the tree and re-align. The loop body always breaks,
            // so this executes exactly once.
            while (true)
            {
                kimuraDistanceMatrixGenerator.GenerateDistanceMatrix(this.AlignedSequences);

                hierarcicalClusteringB =
                    new HierarchicalClusteringParallel(kimuraDistanceMatrixGenerator.DistanceMatrix, HierarchicalClusteringMethodName);

                binaryGuideTreeB = new BinaryGuideTree(hierarcicalClusteringB);
                BinaryGuideTree.CompareTwoTrees(binaryGuideTreeB, binaryGuideTree);
                binaryGuideTree = binaryGuideTreeB;

                IProgressiveAligner progressiveAlignerB = new ProgressiveAligner(profileAligner);
                progressiveAlignerB.Align(sequences, binaryGuideTreeB);

                currentScore = MsaUtils.MultipleAlignmentScoreFunction(progressiveAlignerB.AlignedSequences, SimilarityMatrix, GapOpenCost, GapExtensionCost);
                if (currentScore > this.AlignmentScoreB)
                {
                    this.AlignmentScoreB = currentScore;
                    this.AlignedSequencesB = progressiveAlignerB.AlignedSequences;
                }
                break;
            }
            if (this.AlignmentScoreB > this.AlignmentScore)
            {
                this.AlignmentScore = this.AlignmentScoreB;
                this.AlignedSequences = this.AlignedSequencesB;
            }
        }
        else
        {
            binaryGuideTreeB = binaryGuideTree;
        }

        // STAGE 3: iterative refinement by cutting guide-tree edges and
        // re-aligning the two resulting profiles.
        ReportLog("Stage 3");

        // Refinement is skipped entirely for exactly two sequences.
        int maxRefineMentTime = 1;
        if (sequences.Count == 2)
        {
            maxRefineMentTime = 0;
        }

        int refinementTime = 0;

        // Work on a copy of the current best alignment, lifted to the
        // ambiguous alphabet.
        this.AlignedSequencesC = new List<ISequence>(this.AlignedSequences.Count);
        foreach (ISequence t in this.AlignedSequences)
        {
            this.AlignedSequencesC.Add(new Sequence(Alphabets.GetAmbiguousAlphabet(this.alphabet), t.ToArray())
            {
                ID = t.ID,
                // Do not shallow copy dictionary
                //Metadata = t.Metadata
            });
        }

        while (refinementTime < maxRefineMentTime)
        {
            ++refinementTime;
            ReportLog("Refinement iter " + refinementTime);
            bool needRefinement = false;
            for (int edgeIndex = 0; edgeIndex < binaryGuideTreeB.NumberOfEdges; ++edgeIndex)
            {
                // Cut the tree at this edge; partition sequences into two sets.
                List<int>[] leafNodeIndices = binaryGuideTreeB.SeparateSequencesByCuttingTree(edgeIndex);
                List<int>[] allIndelPositions = new List<int>[2];
                IProfileAlignment[] separatedProfileAlignments = ProfileAlignment.ProfileExtraction(this.AlignedSequencesC, leafNodeIndices[0], leafNodeIndices[1], out allIndelPositions);
                List<int>[] eStrings = new List<int>[2];

                // Align the smaller profile onto the larger one; eStrings[k]
                // is the edit string to re-expand set k's sequences.
                if (separatedProfileAlignments[0].NumberOfSequences < separatedProfileAlignments[1].NumberOfSequences)
                {
                    profileAligner.Align(separatedProfileAlignments[0], separatedProfileAlignments[1]);
                    eStrings[0] = profileAligner.GenerateEString(profileAligner.AlignedA);
                    eStrings[1] = profileAligner.GenerateEString(profileAligner.AlignedB);
                }
                else
                {
                    profileAligner.Align(separatedProfileAlignments[1], separatedProfileAlignments[0]);
                    eStrings[0] = profileAligner.GenerateEString(profileAligner.AlignedB);
                    eStrings[1] = profileAligner.GenerateEString(profileAligner.AlignedA);
                }

                for (int set = 0; set < 2; ++set)
                {
                    // Rebuild each sequence in this set: strip the set's
                    // all-indel columns, then re-apply the new edit string.
                    Parallel.ForEach(leafNodeIndices[set], ParallelOption, i =>
                    {
                        List<byte> seqBytes = new List<byte>();
                        int indexAllIndel = 0;
                        for (int j = 0; j < this.AlignedSequencesC[i].Count; ++j)
                        {
                            if (indexAllIndel < allIndelPositions[set].Count && j == allIndelPositions[set][indexAllIndel])
                            {
                                // Skip a column that is an indel across the whole set.
                                ++indexAllIndel;
                            }
                            else
                            {
                                seqBytes.Add(this.AlignedSequencesC[i][j]);
                            }
                        }

                        this.AlignedSequencesC[i] = profileAligner.GenerateSequenceFromEString(eStrings[set], new Sequence(Alphabets.GetAmbiguousAlphabet(this.alphabet), seqBytes.ToArray()));
                        // NOTE(review): self-assignment below is a no-op; it
                        // presumably intended to carry the ID over from the
                        // pre-refinement sequence — confirm and fix upstream.
                        this.AlignedSequencesC[i].ID = this.AlignedSequencesC[i].ID;
                        // Do not shallow copy dictionary
                        //(_alignedSequencesC[i] as Sequence).Metadata = _alignedSequencesC[i].Metadata;
                    });
                }

                currentScore = MsaUtils.MultipleAlignmentScoreFunction(this.AlignedSequencesC, SimilarityMatrix, GapOpenCost, GapExtensionCost);
                if (currentScore > this.AlignmentScoreC)
                {
                    this.AlignmentScoreC = currentScore;
                    needRefinement = true;

                    // Score improved: rebuild the tree from the refined
                    // alignment and restart the edge scan.
                    kimuraDistanceMatrixGenerator.GenerateDistanceMatrix(this.AlignedSequencesC);
                    hierarcicalClusteringB =
                        new HierarchicalClusteringParallel(kimuraDistanceMatrixGenerator.DistanceMatrix, HierarchicalClusteringMethodName);
                    binaryGuideTreeB = new BinaryGuideTree(hierarcicalClusteringB);
                    break;
                }
            }

            if (!needRefinement)
            {
                // No edge cut improved the score: refinement has converged.
                refinementTime = maxRefineMentTime;
                break;
            }
        }

        if (this.AlignmentScoreC > this.AlignmentScore)
        {
            this.AlignmentScore = this.AlignmentScoreC;
            this.AlignedSequences = this.AlignedSequencesC;
        }
        ReportLog("Stop Stage 3");
    }
}
/// <summary>
/// Performs Stage 1, 2, and 3 as described in class description:
/// Stage 1 - k-mer distance matrix, hierarchical clustering, guide tree and
///           progressive alignment;
/// Stage 2 - (optional) Kimura-distance re-estimation and re-alignment;
/// Stage 3 - iterative refinement: cut a guide-tree edge, re-align the two
///           resulting profiles, keep the result if the score improves.
/// The best alignment seen across all stages ends up in AlignedSequences /
/// AlignmentScore.
/// </summary>
/// <param name="sequences">Input sequences; must be non-empty, and this.alphabet must already be set.</param>
private void DoAlignment(IList<ISequence> sequences)
{
    Debug.Assert(this.alphabet != null);
    Debug.Assert(sequences.Count > 0);

    // Initializations: make sure the consensus resolver uses our alphabet.
    if (ConsensusResolver == null)
    {
        ConsensusResolver = new SimpleConsensusResolver(this.alphabet);
    }
    else
    {
        ConsensusResolver.SequenceAlphabet = this.alphabet;
    }

    // Get ProfileAligner ready (serial vs parallel by degree of parallelism).
    IProfileAligner profileAligner = null;
    switch (ProfileAlignerName)
    {
        case (ProfileAlignerNames.NeedlemanWunschProfileAligner):
            if (this.degreeOfParallelism == 1)
            {
                profileAligner = new NeedlemanWunschProfileAlignerSerial(
                    SimilarityMatrix, ProfileProfileFunctionName, GapOpenCost, GapExtensionCost, this.numberOfPartitions);
            }
            else
            {
                profileAligner = new NeedlemanWunschProfileAlignerParallel(
                    SimilarityMatrix, ProfileProfileFunctionName, GapOpenCost, GapExtensionCost, this.numberOfPartitions);
            }
            break;
        case (ProfileAlignerNames.SmithWatermanProfileAligner):
            if (this.degreeOfParallelism == 1)
            {
                profileAligner = new SmithWatermanProfileAlignerSerial(
                    SimilarityMatrix, ProfileProfileFunctionName, GapOpenCost, GapExtensionCost, this.numberOfPartitions);
            }
            else
            {
                profileAligner = new SmithWatermanProfileAlignerParallel(
                    SimilarityMatrix, ProfileProfileFunctionName, GapOpenCost, GapExtensionCost, this.numberOfPartitions);
            }
            break;
        default:
            throw new ArgumentException("Invalid profile aligner name");
    }

    this.AlignedSequences = new List<ISequence>(sequences.Count);
    float currentScore = 0;

    // STAGE 1
    ReportLog("Stage 1");

    // Generate DistanceMatrix
    var kmerDistanceMatrixGenerator = new KmerDistanceMatrixGenerator(sequences, KmerLength, this.alphabet, DistanceFunctionName);

    // Hierarchical clustering
    IHierarchicalClustering hierarcicalClustering =
        new HierarchicalClusteringParallel(kmerDistanceMatrixGenerator.DistanceMatrix, HierarchicalClusteringMethodName);

    // Generate Guide Tree
    var binaryGuideTree = new BinaryGuideTree(hierarcicalClustering);

    // Progressive Alignment
    IProgressiveAligner progressiveAlignerA = new ProgressiveAligner(profileAligner);
    progressiveAlignerA.Align(sequences, binaryGuideTree);

    currentScore = MsaUtils.MultipleAlignmentScoreFunction(progressiveAlignerA.AlignedSequences, SimilarityMatrix, GapOpenCost, GapExtensionCost);
    if (currentScore > this.AlignmentScoreA)
    {
        this.AlignmentScoreA = currentScore;
        this.AlignedSequencesA = progressiveAlignerA.AlignedSequences;
    }

    if (this.AlignmentScoreA > this.AlignmentScore)
    {
        this.AlignmentScore = this.AlignmentScoreA;
        this.AlignedSequences = this.AlignedSequencesA;
    }

    if (PAMSAMMultipleSequenceAligner.FasterVersion)
    {
        // Faster version: skip stages 2 and 3, reuse the stage 1 result.
        this.AlignedSequencesB = this.AlignedSequencesA;
        this.AlignedSequencesC = this.AlignedSequencesA;
        this.AlignmentScoreB = this.AlignmentScoreA;
        this.AlignmentScoreC = this.AlignmentScoreA;
    }
    else
    {
        BinaryGuideTree binaryGuideTreeB = null;
        IHierarchicalClustering hierarcicalClusteringB = null;
        KimuraDistanceMatrixGenerator kimuraDistanceMatrixGenerator = new KimuraDistanceMatrixGenerator();

        if (UseStageB)
        {
            // STAGE 2
            ReportLog("Stage 2");

            // Generate DistanceMatrix from Multiple Sequence Alignment.
            // Note: the loop body ends with an unconditional break, so stage 2
            // currently performs exactly one iteration.
            while (true)
            {
                kimuraDistanceMatrixGenerator.GenerateDistanceMatrix(this.AlignedSequences);

                // Hierarchical clustering
                hierarcicalClusteringB = new HierarchicalClusteringParallel(kimuraDistanceMatrixGenerator.DistanceMatrix, HierarchicalClusteringMethodName);

                // Generate Guide Tree
                binaryGuideTreeB = new BinaryGuideTree(hierarcicalClusteringB);
                BinaryGuideTree.CompareTwoTrees(binaryGuideTreeB, binaryGuideTree);
                binaryGuideTree = binaryGuideTreeB;

                // Progressive Alignment
                IProgressiveAligner progressiveAlignerB = new ProgressiveAligner(profileAligner);
                progressiveAlignerB.Align(sequences, binaryGuideTreeB);

                currentScore = MsaUtils.MultipleAlignmentScoreFunction(progressiveAlignerB.AlignedSequences, SimilarityMatrix, GapOpenCost, GapExtensionCost);
                if (currentScore > this.AlignmentScoreB)
                {
                    this.AlignmentScoreB = currentScore;
                    this.AlignedSequencesB = progressiveAlignerB.AlignedSequences;
                }
                break;
            }

            if (this.AlignmentScoreB > this.AlignmentScore)
            {
                this.AlignmentScore = this.AlignmentScoreB;
                this.AlignedSequences = this.AlignedSequencesB;
            }
        }
        else
        {
            binaryGuideTreeB = binaryGuideTree;
        }

        // STAGE 3
        ReportLog("Stage 3");

        // refinement: with only two sequences there is nothing to refine.
        int maxRefineMentTime = 1;
        if (sequences.Count == 2)
        {
            maxRefineMentTime = 0;
        }

        int refinementTime = 0;
        this.AlignedSequencesC = new List<ISequence>(this.AlignedSequences.Count);
        foreach (ISequence t in this.AlignedSequences)
        {
            this.AlignedSequencesC.Add(new Sequence(Alphabets.GetAmbiguousAlphabet(this.alphabet), t.ToArray())
            {
                ID = t.ID,
                // Do not shallow copy dictionary
                //Metadata = t.Metadata
            });
        }

        while (refinementTime < maxRefineMentTime)
        {
            ++refinementTime;
            ReportLog("Refinement iter " + refinementTime);
            bool needRefinement = false;
            for (int edgeIndex = 0; edgeIndex < binaryGuideTreeB.NumberOfEdges; ++edgeIndex)
            {
                // Cut one guide-tree edge: splits the sequences into two groups.
                List<int>[] leafNodeIndices = binaryGuideTreeB.SeparateSequencesByCuttingTree(edgeIndex);

                List<int>[] allIndelPositions = new List<int>[2];
                IProfileAlignment[] separatedProfileAlignments = ProfileAlignment.ProfileExtraction(this.AlignedSequencesC, leafNodeIndices[0], leafNodeIndices[1], out allIndelPositions);
                List<int>[] eStrings = new List<int>[2];

                // Align the smaller profile onto the larger one.
                if (separatedProfileAlignments[0].NumberOfSequences < separatedProfileAlignments[1].NumberOfSequences)
                {
                    profileAligner.Align(separatedProfileAlignments[0], separatedProfileAlignments[1]);
                    eStrings[0] = profileAligner.GenerateEString(profileAligner.AlignedA);
                    eStrings[1] = profileAligner.GenerateEString(profileAligner.AlignedB);
                }
                else
                {
                    profileAligner.Align(separatedProfileAlignments[1], separatedProfileAlignments[0]);
                    eStrings[0] = profileAligner.GenerateEString(profileAligner.AlignedB);
                    eStrings[1] = profileAligner.GenerateEString(profileAligner.AlignedA);
                }

                for (int set = 0; set < 2; ++set)
                {
                    Parallel.ForEach(leafNodeIndices[set], ParallelOption, i =>
                    {
                        // Strip this group's indel columns, then re-insert gaps
                        // according to the new profile-profile alignment.
                        List<byte> seqBytes = new List<byte>();
                        int indexAllIndel = 0;
                        for (int j = 0; j < this.AlignedSequencesC[i].Count; ++j)
                        {
                            if (indexAllIndel < allIndelPositions[set].Count && j == allIndelPositions[set][indexAllIndel])
                            {
                                ++indexAllIndel;
                            }
                            else
                            {
                                seqBytes.Add(this.AlignedSequencesC[i][j]);
                            }
                        }

                        // BUGFIX: the original code self-assigned the ID AFTER
                        // replacing the sequence (a no-op), silently dropping
                        // the original ID. Capture it first and restore it.
                        string originalId = this.AlignedSequencesC[i].ID;
                        this.AlignedSequencesC[i] = profileAligner.GenerateSequenceFromEString(eStrings[set], new Sequence(Alphabets.GetAmbiguousAlphabet(this.alphabet), seqBytes.ToArray()));
                        this.AlignedSequencesC[i].ID = originalId;
                        // Do not shallow copy dictionary
                        //(_alignedSequencesC[i] as Sequence).Metadata = _alignedSequencesC[i].Metadata;
                    });
                }

                currentScore = MsaUtils.MultipleAlignmentScoreFunction(this.AlignedSequencesC, SimilarityMatrix, GapOpenCost, GapExtensionCost);
                if (currentScore > this.AlignmentScoreC)
                {
                    this.AlignmentScoreC = currentScore;
                    needRefinement = true;

                    // recreate the tree
                    kimuraDistanceMatrixGenerator.GenerateDistanceMatrix(this.AlignedSequencesC);
                    hierarcicalClusteringB = new HierarchicalClusteringParallel(kimuraDistanceMatrixGenerator.DistanceMatrix, HierarchicalClusteringMethodName);
                    binaryGuideTreeB = new BinaryGuideTree(hierarcicalClusteringB);
                    break;
                }
            }

            if (!needRefinement)
            {
                refinementTime = maxRefineMentTime;
                break;
            }
        }

        if (this.AlignmentScoreC > this.AlignmentScore)
        {
            this.AlignmentScore = this.AlignmentScoreC;
            this.AlignedSequences = this.AlignedSequencesC;
        }

        ReportLog("Stop Stage 3");
    }
}
/// <summary>
/// Generates consensus sequences from alignment layout.
/// Walks the deltas in order and, for every run of overlapping or adjacent
/// deltas on the reference, resolves one consensus symbol per reference
/// position, producing one contig per run.
/// Assumes the deltas are ordered by FirstSequenceStart — TODO confirm against callers.
/// </summary>
/// <param name="alignmentBetweenReferenceAndReads">Input list of reads.</param>
/// <returns>List of contigs.</returns>
public static IEnumerable<ISequence> GenerateConsensus(IEnumerable<DeltaAlignment> alignmentBetweenReferenceAndReads)
{
    if (alignmentBetweenReferenceAndReads == null)
    {
        throw new ArgumentNullException(nameof(alignmentBetweenReferenceAndReads));
    }

    SimpleConsensusResolver resolver = new SimpleConsensusResolver(AmbiguousDnaAlphabet.Instance);
    Dictionary<long, Sequence> outputSequences = new Dictionary<long, Sequence>();
    Dictionary<DeltaAlignment, ISequence> deltasInCurrentContig = new Dictionary<DeltaAlignment, ISequence>();

    long currentAlignmentStartOffset = 0;
    long currentIndex = 0;
    long inDeltaIndex = 0;
    DeltaAlignment lastDelta;
    List<byte> currentContig = new List<byte>();
    List<DeltaAlignment> deltasToRemove = new List<DeltaAlignment>();

    // BUGFIX: IEnumerator<T> is IDisposable; the original never disposed it.
    using (IEnumerator<DeltaAlignment> deltaEnumerator = alignmentBetweenReferenceAndReads.GetEnumerator())
    {
        // no deltas
        if (!deltaEnumerator.MoveNext())
        {
            return outputSequences.Values;
        }

        lastDelta = deltaEnumerator.Current;
        do
        {
            // Starting a new contig
            if (deltasInCurrentContig.Count == 0)
            {
                currentAlignmentStartOffset = lastDelta.FirstSequenceStart;
                currentIndex = 0;
                currentContig.Clear();
            }

            // loop through all deltas at current index and find consensus
            do
            {
                // Collect every delta that starts aligning at the current reference position.
                while (lastDelta != null && lastDelta.FirstSequenceStart == currentAlignmentStartOffset + currentIndex)
                {
                    deltasInCurrentContig.Add(
                        lastDelta,
                        lastDelta.QuerySequence.GetSubSequence(lastDelta.SecondSequenceStart, (lastDelta.SecondSequenceEnd - lastDelta.SecondSequenceStart) + 1));

                    // Get next delta
                    if (deltaEnumerator.MoveNext())
                    {
                        lastDelta = deltaEnumerator.Current;
                        continue; // see if new delta starts from the same offset
                    }
                    else
                    {
                        lastDelta = null;
                    }
                }

                // Gather the symbol each active delta contributes at this column.
                byte[] symbolsAtCurrentIndex = new byte[deltasInCurrentContig.Count];
                int symbolCounter = 0;
                foreach (var delta in deltasInCurrentContig)
                {
                    inDeltaIndex = currentIndex - (delta.Key.FirstSequenceStart - currentAlignmentStartOffset);
                    symbolsAtCurrentIndex[symbolCounter++] = delta.Value[inDeltaIndex];

                    // Delta fully consumed at this column; retire it below.
                    if (inDeltaIndex == delta.Value.Count - 1)
                    {
                        deltasToRemove.Add(delta.Key);
                    }
                }

                if (deltasToRemove.Count > 0)
                {
                    foreach (var deltaToRemove in deltasToRemove)
                    {
                        deltasInCurrentContig.Remove(deltaToRemove);
                    }

                    deltasToRemove.Clear();
                }

                byte consensusSymbol = resolver.GetConsensus(symbolsAtCurrentIndex);
                currentContig.Add(consensusSymbol);
                currentIndex++;

                // See if another delta is adjacent
                if (deltasInCurrentContig.Count == 0 && lastDelta != null && lastDelta.FirstSequenceStart == currentAlignmentStartOffset + currentIndex)
                {
                    deltasInCurrentContig.Add(
                        lastDelta,
                        lastDelta.QuerySequence.GetSubSequence(lastDelta.SecondSequenceStart, (lastDelta.SecondSequenceEnd - lastDelta.SecondSequenceStart) + 1));

                    // check next delta
                    if (deltaEnumerator.MoveNext())
                    {
                        lastDelta = deltaEnumerator.Current;
                        continue; // read next delta to see if it starts from current reference sequence offset
                    }
                    else
                    {
                        lastDelta = null;
                    }
                }
            }
            while (deltasInCurrentContig.Count > 0);

            // Contig complete; keyed by its start offset on the reference.
            outputSequences.Add(currentAlignmentStartOffset, new Sequence(AmbiguousDnaAlphabet.Instance, currentContig.ToArray(), false));
        }
        while (lastDelta != null);
    }

    return outputSequences.Values;
}
/// <summary>
/// This method is considered as main execute method which defines the
/// step by step algorithm. Derived class flows the defined flow by this
/// method: configure NUCmer per reference, gather delta alignments for every
/// query, then convert the deltas into scored pairwise alignments.
/// </summary>
/// <param name="referenceSequenceList">Reference sequence.</param>
/// <param name="originalQuerySequences">List of input sequences.</param>
/// <returns>A list of sequence alignment.</returns>
private IEnumerable<IPairwiseSequenceAlignment> Alignment(IEnumerable<ISequence> referenceSequenceList, IEnumerable<ISequence> originalQuerySequences)
{
    // BUGFIX: materialize both inputs once. They are enumerated several times
    // below; if a lazy enumerable is supplied (or the reverse-complement
    // helpers are lazy iterators), re-enumeration could yield fresh sequence
    // objects and break the QuerySequence identity match used when grouping
    // deltas — TODO confirm laziness of the helpers against their source.
    List<ISequence> references = referenceSequenceList.ToList();
    List<ISequence> querySequenceList =
        (ForwardOnly
             ? originalQuerySequences
             : (ReverseOnly
                    ? ReverseComplementSequenceList(originalQuerySequences)
                    : AddReverseComplementsToSequenceList(originalQuerySequences))).ToList();

    ConsensusResolver = new SimpleConsensusResolver(references[0].Alphabet);

    IList<IPairwiseSequenceAlignment> results = new List<IPairwiseSequenceAlignment>();
    var deltas = new List<DeltaAlignment>();

    foreach (ISequence refSequence in references)
    {
        // Configure NUCmer for this reference; only push settings that differ
        // from the documented defaults.
        this.nucmerAlgo = new NUCmer(refSequence);

        if (GapOpenCost != DefaultGapOpenCost)
        {
            this.nucmerAlgo.GapOpenCost = GapOpenCost;
        }

        if (GapExtensionCost != DefaultGapExtensionCost)
        {
            this.nucmerAlgo.GapExtensionCost = GapExtensionCost;
        }

        if (LengthOfMUM != DefaultLengthOfMUM)
        {
            this.nucmerAlgo.LengthOfMUM = LengthOfMUM;
        }

        // Set the ClusterBuilder properties to defaults
        if (FixedSeparation != ClusterBuilder.DefaultFixedSeparation)
        {
            this.nucmerAlgo.FixedSeparation = FixedSeparation;
        }

        if (MaximumSeparation != ClusterBuilder.DefaultMaximumSeparation)
        {
            this.nucmerAlgo.MaximumSeparation = MaximumSeparation;
        }

        if (MinimumScore != ClusterBuilder.DefaultMinimumScore)
        {
            this.nucmerAlgo.MinimumScore = MinimumScore;
        }

        if (SeparationFactor != ClusterBuilder.DefaultSeparationFactor)
        {
            this.nucmerAlgo.SeparationFactor = SeparationFactor;
        }

        if (BreakLength != ModifiedSmithWaterman.DefaultBreakLength)
        {
            this.nucmerAlgo.BreakLength = BreakLength;
        }

        this.nucmerAlgo.ConsensusResolver = ConsensusResolver;
        if (SimilarityMatrix != null)
        {
            this.nucmerAlgo.SimilarityMatrix = SimilarityMatrix;
        }

        foreach (ISequence querySequence in querySequenceList)
        {
            // Check for parameters that would prevent an alignment from being returned.
            if (Math.Min(querySequence.Count, refSequence.Count) < MinimumScore)
            {
                var msg = "Bad parameter settings for NucmerPairwiseAligner. " +
                          "Tried to align a reference of length " + refSequence.Count.ToString() +
                          " to a sequence of length " + querySequence.Count.ToString() +
                          " while requiring a minimum score of MinimumScore = " + MinimumScore +
                          ". This will prevent any alignments from being returned.";
                throw new ArgumentException(msg);
            }

            IEnumerable<DeltaAlignment> deltaAlignment = this.nucmerAlgo.GetDeltaAlignments(querySequence, !MaxMatch, querySequence.IsMarkedAsReverseComplement());
            deltas.AddRange(deltaAlignment);
        }
    }

    if (deltas.Count > 0)
    {
        ISequence concatReference = references[0];

        //// concat all the sequences into one sequence
        if (references.Count > 1)
        {
            concatReference = ConcatSequence(references);
        }

        foreach (ISequence querySequence in querySequenceList)
        {
            // Deltas produced from this exact query instance.
            List<DeltaAlignment> qDelta = deltas.Where(d => d.QuerySequence.Equals(querySequence)).ToList();
            IPairwiseSequenceAlignment sequenceAlignment = new PairwiseSequenceAlignment(concatReference, querySequence);

            // Convert delta alignments to sequence alignments
            IList<PairwiseAlignedSequence> alignments = ConvertDeltaToAlignment(qDelta);
            if (alignments.Count > 0)
            {
                foreach (PairwiseAlignedSequence align in alignments)
                {
                    // Calculate the score of alignment
                    align.Score = CalculateScore(align.FirstSequence, align.SecondSequence);

                    // Make Consensus
                    align.Consensus = MakeConsensus(align.FirstSequence, align.SecondSequence);

                    sequenceAlignment.PairwiseAlignedSequences.Add(align);
                }
            }

            results.Add(sequenceAlignment);
        }
    }

    return results;
}
/// <summary>
/// Generates consensus sequences from alignment layout.
/// BUGFIX: argument validation is now done eagerly in this wrapper; the
/// original performed it inside the iterator body, so a null argument only
/// threw when the result was first enumerated, not at call time.
/// </summary>
/// <param name="alignmentBetweenReferenceAndReads">Input list of reads.</param>
/// <returns>List of contigs.</returns>
public static IEnumerable<ISequence> GenerateConsensus(DeltaAlignmentCollection alignmentBetweenReferenceAndReads)
{
    if (alignmentBetweenReferenceAndReads == null)
    {
        throw new ArgumentNullException(nameof(alignmentBetweenReferenceAndReads));
    }

    return GenerateConsensusIterator(alignmentBetweenReferenceAndReads);
}

/// <summary>
/// Iterator doing the actual consensus construction; see GenerateConsensus.
/// Walks the deltas in collection order and emits one contig per run of
/// overlapping/adjacent deltas on the reference.
/// </summary>
/// <param name="alignmentBetweenReferenceAndReads">Input list of reads (already validated non-null).</param>
/// <returns>Contigs as they are completed.</returns>
private static IEnumerable<ISequence> GenerateConsensusIterator(DeltaAlignmentCollection alignmentBetweenReferenceAndReads)
{
    // NOTE(review): second ctor argument (49) — presumably the consensus
    // threshold used by the resolver; confirm against SimpleConsensusResolver.
    SimpleConsensusResolver resolver = new SimpleConsensusResolver(AmbiguousDnaAlphabet.Instance, 49);

    // this dictionary will not grow more than a few hundred in worst scenario,
    // as this stores delta and its corresponding sequences
    Dictionary<DeltaAlignment, ISequence> deltasInCurrentContig = new Dictionary<DeltaAlignment, ISequence>();
    long currentAlignmentStartOffset = 0;
    long currentIndex = 0;
    List<byte> currentContig = new List<byte>();
    List<DeltaAlignment> deltasToRemove = new List<DeltaAlignment>();

    // no deltas
    if (alignmentBetweenReferenceAndReads.Count == 0)
    {
        yield break;
    }

    long index = 0;
    DeltaAlignment lastDelta = alignmentBetweenReferenceAndReads[index];
    do
    {
        // Starting a new contig
        if (deltasInCurrentContig.Count == 0)
        {
            currentAlignmentStartOffset = lastDelta.FirstSequenceStart;
            currentIndex = 0;
            currentContig.Clear();
        }

        // loop through all deltas at current index and find consensus
        do
        {
            // Proceed creating consensus till we find another delta starts aligning
            while (lastDelta != null && lastDelta.FirstSequenceStart == currentAlignmentStartOffset + currentIndex)
            {
                deltasInCurrentContig.Add(lastDelta, GetSequenceFromDelta(lastDelta));

                // Get next delta
                index++;
                if (alignmentBetweenReferenceAndReads.Count > index)
                {
                    lastDelta = alignmentBetweenReferenceAndReads[index];
                    continue; // see if new delta starts from the same offset
                }
                else
                {
                    lastDelta = null;
                }
            }

            // Gather the symbol each active delta contributes at this column.
            byte[] symbolsAtCurrentIndex = new byte[deltasInCurrentContig.Count];
            int symbolCounter = 0;
            foreach (var delta in deltasInCurrentContig)
            {
                long inDeltaIndex = currentIndex - (delta.Key.FirstSequenceStart - currentAlignmentStartOffset);
                symbolsAtCurrentIndex[symbolCounter++] = delta.Value[inDeltaIndex];

                // Delta fully consumed at this column; retire it below.
                if (inDeltaIndex == delta.Value.Count - 1)
                {
                    deltasToRemove.Add(delta.Key);
                }
            }

            if (deltasToRemove.Count > 0)
            {
                for (int i = 0; i < deltasToRemove.Count; i++)
                {
                    deltasInCurrentContig.Remove(deltasToRemove[i]);
                }

                deltasToRemove.Clear();
            }

            byte consensusSymbol = resolver.GetConsensus(symbolsAtCurrentIndex);
            currentContig.Add(consensusSymbol);
            currentIndex++;

            // See if another delta is adjacent
            if (deltasInCurrentContig.Count == 0 && lastDelta != null && lastDelta.FirstSequenceStart == currentAlignmentStartOffset + currentIndex)
            {
                deltasInCurrentContig.Add(lastDelta, GetSequenceFromDelta(lastDelta));

                // check next delta
                index++;
                if (alignmentBetweenReferenceAndReads.Count > index)
                {
                    lastDelta = alignmentBetweenReferenceAndReads[index];
                    continue; // read next delta to see if it starts from current reference sequence offset
                }
                else
                {
                    lastDelta = null;
                }
            }
        }
        while (deltasInCurrentContig.Count > 0);

        // Contig complete — emit it.
        yield return new Sequence(AmbiguousDnaAlphabet.Instance, currentContig.ToArray(), false);
    }
    while (lastDelta != null);
}
/// <summary>
/// Performs Stage 1, 2, and 3 as described in class description:
/// Stage 1 - k-mer distance matrix, hierarchical clustering, guide tree and
///           progressive alignment;
/// Stage 2 - (optional) Kimura-distance re-estimation and re-alignment;
/// Stage 3 - iterative refinement by cutting guide-tree edges and re-aligning
///           the two resulting profiles.
/// Results are stored in the instance fields (_alignedSequences / _alignmentScore
/// and the per-stage A/B/C fields).
/// </summary>
/// <param name="inputSequences">Sequences to align.</param>
/// <returns>
/// NOTE(review): currently always an empty list (see the comment before the
/// return); callers appear expected to read results from instance state — confirm.
/// </returns>
public IList<Bio.Algorithms.Alignment.ISequenceAlignment> Align(IEnumerable<ISequence> inputSequences)
{
    List<ISequence> sequences = inputSequences.ToList();

    // Initializations: point the consensus resolver at our alphabet.
    if (sequences.Count > 0)
    {
        if (ConsensusResolver == null)
        {
            ConsensusResolver = new SimpleConsensusResolver(_alphabet);
        }
        else
        {
            ConsensusResolver.SequenceAlphabet = _alphabet;
        }
    }

    // Get ProfileAligner ready (serial vs parallel by degree of parallelism).
    IProfileAligner profileAligner = null;
    switch (_profileAlignerName)
    {
        case (ProfileAlignerNames.NeedlemanWunschProfileAligner):
            if (_degreeOfParallelism == 1)
            {
                profileAligner = new NeedlemanWunschProfileAlignerSerial(
                    SimilarityMatrix, _profileProfileFunctionName, GapOpenCost, GapExtensionCost, _numberOfPartitions);
            }
            else
            {
                profileAligner = new NeedlemanWunschProfileAlignerParallel(
                    SimilarityMatrix, _profileProfileFunctionName, GapOpenCost, GapExtensionCost, _numberOfPartitions);
            }
            break;
        case (ProfileAlignerNames.SmithWatermanProfileAligner):
            if (_degreeOfParallelism == 1)
            {
                profileAligner = new SmithWatermanProfileAlignerSerial(
                    SimilarityMatrix, _profileProfileFunctionName, GapOpenCost, GapExtensionCost, _numberOfPartitions);
            }
            else
            {
                profileAligner = new SmithWatermanProfileAlignerParallel(
                    SimilarityMatrix, _profileProfileFunctionName, GapOpenCost, GapExtensionCost, _numberOfPartitions);
            }
            break;
        default:
            throw new ArgumentException("Invalid profile aligner name");
    }

    _alignedSequences = new List<ISequence>(sequences.Count);
    float currentScore = 0;

    // STAGE 1
    Performance.Snapshot("Stage 1");

    // Generate DistanceMatrix
    KmerDistanceMatrixGenerator kmerDistanceMatrixGenerator =
        new KmerDistanceMatrixGenerator(sequences, _kmerLength, _alphabet, _distanceFunctionName);

    // Hierarchical clustering
    IHierarchicalClustering hierarcicalClustering =
        new HierarchicalClusteringParallel(kmerDistanceMatrixGenerator.DistanceMatrix, _hierarchicalClusteringMethodName);

    // Generate Guide Tree
    BinaryGuideTree binaryGuideTree = new BinaryGuideTree(hierarcicalClustering);

    // Progressive Alignment
    IProgressiveAligner progressiveAlignerA = new ProgressiveAligner(profileAligner);
    progressiveAlignerA.Align(sequences, binaryGuideTree);

    currentScore = MsaUtils.MultipleAlignmentScoreFunction(progressiveAlignerA.AlignedSequences, SimilarityMatrix, GapOpenCost, GapExtensionCost);
    // Keep the best stage-A result, then fold it into the overall best.
    if (currentScore > _alignmentScoreA)
    {
        _alignmentScoreA = currentScore;
        _alignedSequencesA = progressiveAlignerA.AlignedSequences;
    }

    if (_alignmentScoreA > _alignmentScore)
    {
        _alignmentScore = _alignmentScoreA;
        _alignedSequences = _alignedSequencesA;
    }

    if (PAMSAMMultipleSequenceAligner.FasterVersion)
    {
        // Faster version: skip stages 2 and 3, reuse the stage 1 result.
        _alignedSequencesB = _alignedSequencesA;
        _alignedSequencesC = _alignedSequencesA;
        _alignmentScoreB = _alignmentScoreA;
        _alignmentScoreC = _alignmentScoreA;
    }
    else
    {
        BinaryGuideTree binaryGuideTreeB = null;
        IHierarchicalClustering hierarcicalClusteringB = null;
        KimuraDistanceMatrixGenerator kimuraDistanceMatrixGenerator = new KimuraDistanceMatrixGenerator();

        if (PAMSAMMultipleSequenceAligner.UseStageB)
        {
            // STAGE 2
            Performance.Snapshot("Stage 2");

            // Generate DistanceMatrix from Multiple Sequence Alignment.
            // NOTE(review): both branches of the score test below break, so
            // this while(true) runs exactly one iteration — confirm intent.
            int iterateTime = 0;
            while (true)
            {
                ++iterateTime;
                kimuraDistanceMatrixGenerator.GenerateDistanceMatrix(_alignedSequences);

                // Hierarchical clustering
                hierarcicalClusteringB = new HierarchicalClusteringParallel(kimuraDistanceMatrixGenerator.DistanceMatrix, _hierarchicalClusteringMethodName);

                // Generate Guide Tree
                binaryGuideTreeB = new BinaryGuideTree(hierarcicalClusteringB);
                BinaryGuideTree.CompareTwoTrees(binaryGuideTreeB, binaryGuideTree);
                binaryGuideTree = binaryGuideTreeB;

                // Progressive Alignment
                IProgressiveAligner progressiveAlignerB = new ProgressiveAligner(profileAligner);
                progressiveAlignerB.Align(sequences, binaryGuideTreeB);

                currentScore = MsaUtils.MultipleAlignmentScoreFunction(progressiveAlignerB.AlignedSequences, SimilarityMatrix, GapOpenCost, GapExtensionCost);
                if (currentScore > _alignmentScoreB)
                {
                    _alignmentScoreB = currentScore;
                    _alignedSequencesB = progressiveAlignerB.AlignedSequences;
                    break;
                }
                else
                {
                    break;
                }
            }

            if (_alignmentScoreB > _alignmentScore)
            {
                _alignmentScore = _alignmentScoreB;
                _alignedSequences = _alignedSequencesB;
            }
        }
        else
        {
            binaryGuideTreeB = binaryGuideTree;
        }

        // STAGE 3
        Performance.Snapshot("Stage 3");

        // refinement
        //int maxRefineMentTime = sequences.Count * 2 - 2;
        int maxRefineMentTime = 1;
        if (sequences.Count == 2)
        {
            maxRefineMentTime = 0;
        }

        int refinementTime = 0;
        // Work on copies over the ambiguous alphabet so refinement can rewrite them.
        _alignedSequencesC = new List<ISequence>(sequences.Count);
        for (int i = 0; i < sequences.Count; ++i)
        {
            _alignedSequencesC.Add(
                new Sequence(Alphabets.GetAmbiguousAlphabet(_alphabet), _alignedSequences[i].ToArray())
                {
                    ID = _alignedSequences[i].ID,
                    Metadata = _alignedSequences[i].Metadata
                });
        }

        List<int>[] leafNodeIndices = null;
        List<int>[] allIndelPositions = null;
        IProfileAlignment[] separatedProfileAlignments = null;
        List<int>[] eStrings = null;

        while (refinementTime < maxRefineMentTime)
        {
            ++refinementTime;
            Performance.Snapshot("Refinement iter " + refinementTime.ToString());
            bool needRefinement = false;
            for (int edgeIndex = 0; edgeIndex < binaryGuideTreeB.NumberOfEdges; ++edgeIndex)
            {
                // Cut one guide-tree edge: splits the sequences into two groups.
                leafNodeIndices = binaryGuideTreeB.SeparateSequencesByCuttingTree(edgeIndex);

                allIndelPositions = new List<int>[2];
                separatedProfileAlignments = ProfileAlignment.ProfileExtraction(_alignedSequencesC, leafNodeIndices[0], leafNodeIndices[1], out allIndelPositions);
                eStrings = new List<int>[2];

                // Align the smaller profile onto the larger one.
                if (separatedProfileAlignments[0].NumberOfSequences < separatedProfileAlignments[1].NumberOfSequences)
                {
                    profileAligner.Align(separatedProfileAlignments[0], separatedProfileAlignments[1]);
                    eStrings[0] = profileAligner.GenerateEString(profileAligner.AlignedA);
                    eStrings[1] = profileAligner.GenerateEString(profileAligner.AlignedB);
                }
                else
                {
                    profileAligner.Align(separatedProfileAlignments[1], separatedProfileAlignments[0]);
                    eStrings[0] = profileAligner.GenerateEString(profileAligner.AlignedB);
                    eStrings[1] = profileAligner.GenerateEString(profileAligner.AlignedA);
                }

                for (int set = 0; set < 2; ++set)
                {
                    Parallel.ForEach(leafNodeIndices[set], PAMSAMMultipleSequenceAligner.parallelOption, i =>
                    {
                        //Sequence seq = new Sequence(_alphabet, "");
                        // Strip this group's indel columns, then re-insert gaps
                        // according to the new profile-profile alignment (eString).
                        List<byte> seqBytes = new List<byte>();
                        int indexAllIndel = 0;
                        for (int j = 0; j < _alignedSequencesC[i].Count; ++j)
                        {
                            if (indexAllIndel < allIndelPositions[set].Count && j == allIndelPositions[set][indexAllIndel])
                            {
                                ++indexAllIndel;
                            }
                            else
                            {
                                seqBytes.Add(_alignedSequencesC[i][j]);
                            }
                        }

                        _alignedSequencesC[i] = profileAligner.GenerateSequenceFromEString(eStrings[set], new Sequence(Alphabets.GetAmbiguousAlphabet(_alphabet), seqBytes.ToArray()));
                        // NOTE(review): the next two statements assign properties
                        // of the just-created sequence back to itself (self-
                        // assignment after the element was replaced). They look
                        // like no-ops; presumably the intent was to carry over
                        // the ID/Metadata captured BEFORE the reassignment — confirm.
                        _alignedSequencesC[i].ID = _alignedSequencesC[i].ID;
                        (_alignedSequencesC[i] as Sequence).Metadata = _alignedSequencesC[i].Metadata;
                    });
                }

                currentScore = MsaUtils.MultipleAlignmentScoreFunction(_alignedSequencesC, SimilarityMatrix, GapOpenCost, GapExtensionCost);
                if (currentScore > _alignmentScoreC)
                {
                    _alignmentScoreC = currentScore;
                    needRefinement = true;

                    // recreate the tree
                    kimuraDistanceMatrixGenerator.GenerateDistanceMatrix(_alignedSequencesC);
                    hierarcicalClusteringB = new HierarchicalClusteringParallel(kimuraDistanceMatrixGenerator.DistanceMatrix, _hierarchicalClusteringMethodName);
                    binaryGuideTreeB = new BinaryGuideTree(hierarcicalClusteringB);
                    break;
                }
            }

            if (!needRefinement)
            {
                refinementTime = maxRefineMentTime;
                break;
            }
        }

        if (_alignmentScoreC > _alignmentScore)
        {
            _alignmentScore = _alignmentScoreC;
            _alignedSequences = _alignedSequencesC;
        }

        Performance.Snapshot("Stop Stage 3");
    }

    //just for the purpose of integrating PW and MSA with the same output
    IList<Bio.Algorithms.Alignment.ISequenceAlignment> results = new List<Bio.Algorithms.Alignment.ISequenceAlignment>();
    return (results);
}
/// <summary>
/// Generates consensus sequences from alignment layout.
/// Walks the deltas in collection order and emits one contig per run of
/// overlapping/adjacent deltas on the reference.
/// NOTE(review): this is an iterator (yield), so the null check below does
/// not execute until the result is first enumerated — argument validation is
/// deferred; confirm whether eager validation is wanted.
/// </summary>
/// <param name="alignmentBetweenReferenceAndReads">Input list of reads.</param>
/// <returns>List of contigs.</returns>
public static IEnumerable<ISequence> GenerateConsensus(DeltaAlignmentCollection alignmentBetweenReferenceAndReads)
{
    if (alignmentBetweenReferenceAndReads == null)
    {
        throw new ArgumentNullException("alignmentBetweenReferenceAndReads");
    }

    // NOTE(review): second ctor argument (49) — presumably the consensus
    // threshold used by the resolver; confirm against SimpleConsensusResolver.
    SimpleConsensusResolver resolver = new SimpleConsensusResolver(AmbiguousDnaAlphabet.Instance, 49);

    // this dictionary will not grow more than a few hundread in worst scenario,
    // as this stores delta and its corresponding sequences
    Dictionary<DeltaAlignment, ISequence> deltasInCurrentContig = new Dictionary<DeltaAlignment, ISequence>();
    long currentAlignmentStartOffset = 0;
    long currentIndex = 0;
    long inDeltaIndex = 0;
    DeltaAlignment lastDelta;
    List<byte> currentContig = new List<byte>();
    List<DeltaAlignment> deltasToRemove = new List<DeltaAlignment>();

    // no deltas
    if (alignmentBetweenReferenceAndReads.Count == 0)
    {
        yield break;
    }

    long index = 0;
    lastDelta = alignmentBetweenReferenceAndReads[index];
    do
    {
        // Starting a new contig
        if (deltasInCurrentContig.Count == 0)
        {
            currentAlignmentStartOffset = lastDelta.FirstSequenceStart;
            currentIndex = 0;
            currentContig.Clear();
        }

        // loop through all deltas at current index and find consensus
        do
        {
            // Proceed creating consensus till we find another delta stats aligning
            while (lastDelta != null && lastDelta.FirstSequenceStart == currentAlignmentStartOffset + currentIndex)
            {
                deltasInCurrentContig.Add(lastDelta, GetSequenceFromDelta(lastDelta));

                // Get next delta
                index++;
                if (alignmentBetweenReferenceAndReads.Count > index)
                {
                    lastDelta = alignmentBetweenReferenceAndReads[index];
                    continue; // see if new delta starts from the same offset
                }
                else
                {
                    lastDelta = null;
                }
            }

            // Gather the symbol each active delta contributes at this column.
            byte[] symbolsAtCurrentIndex = new byte[deltasInCurrentContig.Count];
            int symbolCounter = 0;
            foreach (var delta in deltasInCurrentContig)
            {
                inDeltaIndex = currentIndex - (delta.Key.FirstSequenceStart - currentAlignmentStartOffset);
                symbolsAtCurrentIndex[symbolCounter++] = delta.Value[inDeltaIndex];

                // Delta fully consumed at this column; retire it below.
                if (inDeltaIndex == delta.Value.Count - 1)
                {
                    deltasToRemove.Add(delta.Key);
                }
            }

            if (deltasToRemove.Count > 0)
            {
                for (int i = 0; i < deltasToRemove.Count; i++)
                {
                    deltasInCurrentContig.Remove(deltasToRemove[i]);
                }

                deltasToRemove.Clear();
            }

            byte consensusSymbol = resolver.GetConsensus(symbolsAtCurrentIndex);
            currentContig.Add(consensusSymbol);
            currentIndex++;

            // See if another delta is adjacent
            if (deltasInCurrentContig.Count == 0 && lastDelta != null && lastDelta.FirstSequenceStart == currentAlignmentStartOffset + currentIndex)
            {
                deltasInCurrentContig.Add(lastDelta, GetSequenceFromDelta(lastDelta));

                // check next delta
                index++;
                if (alignmentBetweenReferenceAndReads.Count > index)
                {
                    lastDelta = alignmentBetweenReferenceAndReads[index];
                    continue; // read next delta to see if it starts from current reference sequence offset
                }
                else
                {
                    lastDelta = null;
                }
            }
        }
        while (deltasInCurrentContig.Count > 0);

        // Contig complete — emit it.
        yield return (new Sequence(AmbiguousDnaAlphabet.Instance, currentContig.ToArray(), false));
    }
    while (lastDelta != null);
}
/// <summary>
/// This method is considered as main execute method which defines the
/// step by step algorithm. Derived class flows the defined flow by this
/// method: configure NUCmer per reference, gather delta alignments for every
/// query, then convert the deltas into scored pairwise alignments.
/// </summary>
/// <param name="referenceSequenceList">Reference sequence.</param>
/// <param name="originalQuerySequences">List of input sequences.</param>
/// <returns>A list of sequence alignment.</returns>
private IEnumerable<IPairwiseSequenceAlignment> Alignment(IEnumerable<ISequence> referenceSequenceList, IEnumerable<ISequence> originalQuerySequences)
{
    // BUGFIX: materialize both inputs once. They are enumerated several times
    // below; if a lazy enumerable is supplied (or the reverse-complement
    // helpers are lazy iterators), re-enumeration could yield fresh sequence
    // objects and break the QuerySequence identity match used when grouping
    // deltas — TODO confirm laziness of the helpers against their source.
    List<ISequence> references = referenceSequenceList.ToList();
    List<ISequence> querySequenceList =
        (ForwardOnly
             ? originalQuerySequences
             : (ReverseOnly
                    ? ReverseComplementSequenceList(originalQuerySequences)
                    : AddReverseComplementsToSequenceList(originalQuerySequences))).ToList();

    ConsensusResolver = new SimpleConsensusResolver(references[0].Alphabet);

    IList<IPairwiseSequenceAlignment> results = new List<IPairwiseSequenceAlignment>();
    var deltas = new List<DeltaAlignment>();

    foreach (ISequence refSequence in references)
    {
        // Configure NUCmer for this reference; only push settings that differ
        // from the documented defaults. (Braces added throughout — the
        // original used braceless single-statement ifs.)
        this.nucmerAlgo = new NUCmer(refSequence);

        if (GapOpenCost != DefaultGapOpenCost)
        {
            this.nucmerAlgo.GapOpenCost = GapOpenCost;
        }

        if (GapExtensionCost != DefaultGapExtensionCost)
        {
            this.nucmerAlgo.GapExtensionCost = GapExtensionCost;
        }

        if (LengthOfMUM != DefaultLengthOfMUM)
        {
            this.nucmerAlgo.LengthOfMUM = LengthOfMUM;
        }

        // Set the ClusterBuilder properties to defaults
        if (FixedSeparation != ClusterBuilder.DefaultFixedSeparation)
        {
            this.nucmerAlgo.FixedSeparation = FixedSeparation;
        }

        if (MaximumSeparation != ClusterBuilder.DefaultMaximumSeparation)
        {
            this.nucmerAlgo.MaximumSeparation = MaximumSeparation;
        }

        if (MinimumScore != ClusterBuilder.DefaultMinimumScore)
        {
            this.nucmerAlgo.MinimumScore = MinimumScore;
        }

        if (SeparationFactor != ClusterBuilder.DefaultSeparationFactor)
        {
            this.nucmerAlgo.SeparationFactor = SeparationFactor;
        }

        if (BreakLength != ModifiedSmithWaterman.DefaultBreakLength)
        {
            this.nucmerAlgo.BreakLength = BreakLength;
        }

        this.nucmerAlgo.ConsensusResolver = ConsensusResolver;
        if (SimilarityMatrix != null)
        {
            this.nucmerAlgo.SimilarityMatrix = SimilarityMatrix;
        }

        foreach (ISequence querySequence in querySequenceList)
        {
            // Check for parameters that would prevent an alignment from being returned.
            if (Math.Min(querySequence.Count, refSequence.Count) < MinimumScore)
            {
                var msg = "Bad parameter settings for NucmerPairwiseAligner. " +
                          "Tried to align a reference of length " + refSequence.Count.ToString() +
                          " to a sequence of length " + querySequence.Count.ToString() +
                          " while requiring a minimum score of MinimumScore = " + MinimumScore +
                          ". This will prevent any alignments from being returned.";
                throw new ArgumentException(msg);
            }

            IEnumerable<DeltaAlignment> deltaAlignment = this.nucmerAlgo.GetDeltaAlignments(querySequence, !MaxMatch, querySequence.IsMarkedAsReverseComplement());
            deltas.AddRange(deltaAlignment);
        }
    }

    if (deltas.Count > 0)
    {
        ISequence concatReference = references[0];

        //// concat all the sequences into one sequence
        if (references.Count > 1)
        {
            concatReference = ConcatSequence(references);
        }

        foreach (ISequence querySequence in querySequenceList)
        {
            // Deltas produced from this exact query instance.
            List<DeltaAlignment> qDelta = deltas.Where(d => d.QuerySequence.Equals(querySequence)).ToList();
            IPairwiseSequenceAlignment sequenceAlignment = new PairwiseSequenceAlignment(concatReference, querySequence);

            // Convert delta alignments to sequence alignments
            IList<PairwiseAlignedSequence> alignments = ConvertDeltaToAlignment(qDelta);
            if (alignments.Count > 0)
            {
                foreach (PairwiseAlignedSequence align in alignments)
                {
                    // Calculate the score of alignment
                    align.Score = CalculateScore(align.FirstSequence, align.SecondSequence);

                    // Make Consensus
                    align.Consensus = MakeConsensus(align.FirstSequence, align.SecondSequence);

                    sequenceAlignment.PairwiseAlignedSequences.Add(align);
                }
            }

            results.Add(sequenceAlignment);
        }
    }

    return results;
}