/// <summary> /// Assemble the input sequences into the largest possible contigs. /// </summary> /// <remarks> /// The algorithm is: /// 1. initialize list of contigs to empty list. List of seqs is passed as argument. /// 2. compute pairwise overlap scores for each pair of input seqs (with reversal and /// complementation as appropriate). /// 3. choose best overlap score. the “merge items” (can be seqs or contigs) are the /// items with that score. If best score is less than threshold, assembly is finished. /// 4. merge the merge items into a single contig and remove them from their list(s) /// 5. compute the overlap between new item and all existing items /// 6. go to step 3 /// </remarks> /// <param name="inputSequences">The sequences to assemble.</param> /// <returns>Returns the OverlapDeNovoAssembly instance which contains list of /// contigs and list of unmerged sequences which are result of this assembly.</returns> public IDeNovoAssembly Assemble(IEnumerable <ISequence> inputSequences) { if (null == inputSequences) { throw new ArgumentNullException(Properties.Resource.ParameterNameInputSequences); } // Initializations if (inputSequences.Count() > 0) { _sequenceAlphabet = inputSequences.First().Alphabet; if (ConsensusResolver == null) { ConsensusResolver = new SimpleConsensusResolver(_sequenceAlphabet); } else { ConsensusResolver.SequenceAlphabet = _sequenceAlphabet; } } OverlapDeNovoAssembly sequenceAssembly = null; // numbering convention: every pool item (whether sequence or contig) // gets a fixed number. // sequence index = index into inputs (which we won't modify) // contig index = nSequences + index into contigs List <PoolItem> pool = new List <PoolItem>(); foreach (ISequence seq in inputSequences) { pool.Add(new PoolItem(seq)); } // put all the initial sequences into the pool, and generate the pair scores. // there are no contigs in the pool yet. // to save an iteration, we'll also find the best global score as we go. ItemScore globalBest = new ItemScore(-1, -1, false, false, 0, 0); int globalBestLargerIndex = -1; int unconsumedCount = inputSequences.Count(); // Compute alignment scores for all combinations between input sequences // Store these scores in the poolItem correspodning to each sequence for (int newSeq = 0; newSeq < pool.Count; ++newSeq) { PoolItem newItem = pool[newSeq]; for (int oldSeq = 0; oldSeq < newSeq; ++oldSeq) { PoolItem oldItem = pool[oldSeq]; ItemScore score = AlignSequence(oldItem.SequenceOrConsensus, newItem.SequenceOrConsensus, oldSeq, newSeq); newItem.Scores.Add(score); if (score.OverlapScore > globalBest.OverlapScore) { globalBest = new ItemScore(score); globalBestLargerIndex = newSeq; } } } // Merge sequence if best score is above threshold // and add new contig to pool if (globalBest.OverlapScore >= MergeThreshold) { if (Trace.Want(Trace.AssemblyDetails)) { ApplicationLog.WriteLine("Merging (overlap score {0}):", globalBest.OverlapScore); } PoolItem mergeItem1 = pool[globalBest.OtherItem]; PoolItem mergeItem2 = pool[globalBestLargerIndex]; Contig newContig = new Contig(); if (Trace.Want(Trace.AssemblyDetails)) { ApplicationLog.WriteLine( "new pool item {0} will merge old items {1} and {2}", pool.Count, globalBest.OtherItem, globalBestLargerIndex); } MergeLowerIndexedSequence(newContig, globalBest, mergeItem1.Sequence); MergeHigherIndexedSequence(newContig, globalBest, mergeItem2.Sequence); MakeConsensus(newContig); // Set ConsumedBy value and // free memory as these sequences are no longer used mergeItem1.ConsumedBy = pool.Count; mergeItem2.ConsumedBy = pool.Count; mergeItem1.FreeSequences(); mergeItem2.FreeSequences(); pool.Add(new PoolItem(newContig)); unconsumedCount--; while (unconsumedCount > 1) { // Compute scores for each unconsumed sequence with new contig globalBest = new ItemScore(-1, -1, false, false, 0, 0); globalBestLargerIndex = -1; int newSeq = pool.Count - 1; PoolItem newItem = pool[newSeq]; for (int oldSeq = 0; oldSeq < pool.Count - 1; ++oldSeq) { PoolItem oldItem = pool[oldSeq]; if (oldItem.ConsumedBy >= 0) { // already consumed - just add dummy score to maintain correct indices newItem.Scores.Add(new ItemScore()); } else { ItemScore score = AlignSequence(oldItem.SequenceOrConsensus, newItem.SequenceOrConsensus, oldSeq, newSeq); newItem.Scores.Add(score); } } // find best global score in the modified pool. globalBest = new ItemScore(-1, -1, false, false, 0, 0); globalBestLargerIndex = -1; for (int current = 0; current < pool.Count; ++current) { PoolItem curItem = pool[current]; if (curItem.ConsumedBy < 0) { for (int other = 0; other < current; ++other) { if (pool[other].ConsumedBy < 0) { ItemScore itemScore = curItem.Scores[other]; if (itemScore.OverlapScore > globalBest.OverlapScore) { globalBest = new ItemScore(itemScore); // copy the winner so far globalBestLargerIndex = current; } } } } } if (globalBest.OverlapScore >= MergeThreshold) { // Merge sequences / contigs if above threshold mergeItem1 = pool[globalBest.OtherItem]; mergeItem2 = pool[globalBestLargerIndex]; newContig = new Contig(); if (mergeItem1.IsContig) { if (Trace.Want(Trace.AssemblyDetails)) { ApplicationLog.WriteLine( "item {0} is a contig (reversed = {1}, complemented = {2}, offset = {3}", globalBest.OtherItem, globalBest.Reversed, globalBest.Complemented, globalBest.FirstOffset); } MergeLowerIndexedContig(newContig, globalBest, mergeItem1.Contig); } else { if (Trace.Want(Trace.AssemblyDetails)) { ApplicationLog.WriteLine( "item {0} is a sequence (reversed = {1}, complemented = {2}, offset = {3}", globalBest.OtherItem, globalBest.Reversed, globalBest.Complemented, globalBest.FirstOffset); } MergeLowerIndexedSequence(newContig, globalBest, mergeItem1.Sequence); } if (mergeItem2.IsContig) { if (Trace.Want(Trace.AssemblyDetails)) { ApplicationLog.WriteLine( "item {0} is a contig (offset = {1}", globalBestLargerIndex, globalBest.SecondOffset); } MergeHigherIndexedContig(newContig, globalBest, mergeItem2.Contig); } else { if (Trace.Want(Trace.AssemblyDetails)) { ApplicationLog.WriteLine( "item {0} is a sequence (offset = {1}", globalBestLargerIndex, globalBest.SecondOffset); } MergeHigherIndexedSequence(newContig, globalBest, mergeItem2.Sequence); } MakeConsensus(newContig); if (Trace.Want(Trace.AssemblyDetails)) { Dump(newContig); } // Set ConsumedBy value for these poolItems and // free memory as these sequences are no longer used mergeItem1.ConsumedBy = pool.Count; mergeItem2.ConsumedBy = pool.Count; mergeItem1.FreeSequences(); mergeItem2.FreeSequences(); pool.Add(new PoolItem(newContig)); unconsumedCount--; } else { // None of the alignment scores cross threshold // No more merges possible. So end iteration. break; } } } // no further qualifying merges, so we're done. // populate contigs and unmergedSequences sequenceAssembly = new OverlapDeNovoAssembly(); foreach (PoolItem curItem in pool) { if (curItem.ConsumedBy < 0) { if (curItem.IsContig) { sequenceAssembly.Contigs.Add(curItem.Contig); } else { sequenceAssembly.UnmergedSequences.Add(curItem.Sequence); } } } return(sequenceAssembly); }