/// <summary>
/// Folds every sequence of the higher-indexed contig into the contig under construction.
/// Each member keeps its own reversed / complemented flags, its position is shifted by
/// the second offset of the best score, and its sequence is passed through
/// SequenceWithoutTerminalGaps before being added.
/// </summary>
/// <param name="newContig">Contig that receives the re-positioned sequences</param>
/// <param name="globalBest">Best score; its SecondOffset is added to every position</param>
/// <param name="consumedContig">Contig whose sequences are being absorbed</param>
private static void MergeHigherIndexedContig(Contig newContig, ItemScore globalBest, Contig consumedContig)
{
    foreach (Contig.AssembledSequence member in consumedContig.Sequences)
    {
        Contig.AssembledSequence placed = new Contig.AssembledSequence();

        // The higher-indexed item is merged in its existing orientation,
        // so the flags are carried over unchanged.
        placed.IsComplemented = member.IsComplemented;
        placed.IsReversed = member.IsReversed;

        // Shift into the coordinate system of the new contig.
        placed.Position = globalBest.SecondOffset + member.Position;
        placed.Sequence = SequenceWithoutTerminalGaps(member.Sequence);
        newContig.Sequences.Add(placed);

        if (!Trace.Want(Trace.AssemblyDetails))
        {
            continue;
        }

        ApplicationLog.WriteLine(
            "\tseq (rev = {0} comp = {1} pos = {2}) {3}",
            placed.IsReversed,
            placed.IsComplemented,
            placed.Position,
            placed.Sequence);
    }
}
/// <summary>
/// Builds the consensus of a contig column by column and stores it in the
/// contig's Consensus property.
/// </summary>
/// <remarks>
/// Columns are visited from position 0 upwards. For each column the items of all
/// member sequences covering that position are collected (honouring each member's
/// reversed / complemented flags) and handed to the ConsensusResolver. The walk ends
/// at the first column that no member sequence covers.
/// </remarks>
/// <param name="contig">Contig for which consensus is to be constructed</param>
private void MakeConsensus(Contig contig)
{
    Sequence consensus = new Sequence(_sequenceAlphabet);
    List<ISequenceItem> column = new List<ISequenceItem>();

    // The contig length is not known up front, so walk until an empty column is hit.
    for (int position = 0; ; position++)
    {
        column.Clear();

        foreach (Contig.AssembledSequence member in contig.Sequences)
        {
            // Skip members that do not cover this column.
            if (position < member.Position || position >= member.Position + member.Sequence.Count)
            {
                continue;
            }

            // Reversed members are read from their far end.
            int seqPos = member.IsReversed
                ? (member.Sequence.Count - 1) - (position - member.Position)
                : position - member.Position;

            if (member.IsComplemented)
            {
                column.Add(member.Sequence.Complement[seqPos]);
            }
            else
            {
                column.Add(member.Sequence[seqPos]);
            }
        }

        if (column.Count == 0)
        {
            // No sequence reaches this position: the consensus is complete.
            break;
        }

        consensus.Add(ConsensusResolver.GetConsensus(column));
    }

    contig.Consensus = consensus;
}
/// <summary>
/// Analyze the passed contig and store a consensus into its Consensus property.
/// Public method to allow testing of consensus generation part.
/// Used by test automation.
/// </summary>
/// <remarks>
/// Sets the assembler's alphabet to <paramref name="alphabet"/>, makes sure a
/// ConsensusResolver exists (creating a SimpleConsensusResolver when none is set,
/// otherwise updating the existing resolver's SequenceAlphabet), and then builds
/// the consensus for <paramref name="contig"/>.
/// </remarks>
/// <param name="alphabet">Sequence alphabet</param>
/// <param name="contig">Contig for which consensus is to be constructed</param>
/// <exception cref="System.ArgumentNullException">
/// Thrown when <paramref name="contig"/> is null.
/// </exception>
public void MakeConsensus(IAlphabet alphabet, Contig contig)
{
    // Fail fast, before any assembler state is modified. Previously a null contig
    // surfaced as a NullReferenceException only after the alphabet and resolver
    // had already been changed.
    if (contig == null)
    {
        throw new System.ArgumentNullException("contig");
    }

    // NOTE(review): alphabet is not validated here because it is unclear from this
    // method whether a null alphabet is tolerated downstream - confirm.
    _sequenceAlphabet = alphabet;

    if (ConsensusResolver == null)
    {
        ConsensusResolver = new SimpleConsensusResolver(_sequenceAlphabet);
    }
    else
    {
        ConsensusResolver.SequenceAlphabet = _sequenceAlphabet;
    }

    MakeConsensus(contig);
}
/// <summary>
/// Logs a human-readable description of a contig: a summary line with the number of
/// member sequences and the contig length, the consensus, one line per member
/// sequence, and a trailing empty line.
/// </summary>
/// <param name="contig">contig to be dumped</param>
private static void Dump(Contig contig)
{
    const string summaryFormat = "contig has {0} seqs, length {1}";
    const string consensusFormat = "consensus: {0}";
    const string memberFormat = "seq (rev = {0} comp = {1} pos = {2}) {3}";

    ApplicationLog.WriteLine(summaryFormat, contig.Sequences.Count, contig.Length);
    ApplicationLog.WriteLine(consensusFormat, contig.Consensus);

    foreach (Contig.AssembledSequence member in contig.Sequences)
    {
        ApplicationLog.WriteLine(
            memberFormat,
            member.IsReversed,
            member.IsComplemented,
            member.Position,
            member.Sequence);
    }

    // Blank line separates consecutive dumps in the log.
    ApplicationLog.WriteLine(string.Empty);
}
/// <summary>
/// Places the higher-indexed sequence of a merge into the contig being built.
/// The sequence is added neither reversed nor complemented, at the second offset
/// of the best score, after being passed through SequenceWithoutTerminalGaps.
/// </summary>
/// <param name="newContig">Contig that receives the sequence</param>
/// <param name="globalBest">Best score; its SecondOffset becomes the position</param>
/// <param name="consumedSequence">Sequence being absorbed into the contig</param>
private static void MergeHigherIndexedSequence(Contig newContig, ItemScore globalBest, ISequence consumedSequence)
{
    Contig.AssembledSequence placed = new Contig.AssembledSequence();

    // The higher-indexed item is always taken in its original orientation.
    placed.IsComplemented = false;
    placed.IsReversed = false;

    placed.Position = globalBest.SecondOffset;
    placed.Sequence = SequenceWithoutTerminalGaps(consumedSequence);
    newContig.Sequences.Add(placed);

    if (!Trace.Want(Trace.AssemblyDetails))
    {
        return;
    }

    ApplicationLog.WriteLine(
        "seq (rev = {0} comp = {1} pos = {2}) {3}",
        placed.IsReversed,
        placed.IsComplemented,
        placed.Position,
        placed.Sequence);
}
/// <summary>
/// Places the lower-indexed sequence of a merge into the contig being built.
/// Orientation flags and position are taken from the best score (Reversed,
/// Complemented, FirstOffset); the sequence is passed through
/// SequenceWithoutTerminalGaps before being added.
/// </summary>
/// <param name="newContig">Contig that receives the sequence</param>
/// <param name="globalBest">Best score supplying orientation flags and FirstOffset</param>
/// <param name="consumedSequence">Sequence being absorbed into the contig</param>
private static void MergeLowerIndexedSequence(Contig newContig, ItemScore globalBest, ISequence consumedSequence)
{
    Contig.AssembledSequence placed = new Contig.AssembledSequence();

    // Unlike the higher-indexed item, the lower-indexed one may have been aligned
    // reversed and/or complemented; the best score records which.
    placed.IsComplemented = globalBest.Complemented;
    placed.IsReversed = globalBest.Reversed;

    placed.Position = globalBest.FirstOffset;
    placed.Sequence = SequenceWithoutTerminalGaps(consumedSequence);
    newContig.Sequences.Add(placed);

    if (!Trace.Want(Trace.AssemblyDetails))
    {
        return;
    }

    ApplicationLog.WriteLine(
        "seq (rev = {0} comp = {1} pos = {2}) {3}",
        placed.IsReversed,
        placed.IsComplemented,
        placed.Position,
        placed.Sequence);
}
/// <summary>
/// Folds every sequence of the lower-indexed contig into the contig under construction.
/// Each member's orientation flags are combined with those of the best score, and its
/// position is re-expressed relative to the new contig.
/// </summary>
/// <param name="newContig">Contig that receives the re-positioned sequences</param>
/// <param name="globalBest">Best score supplying orientation flags and FirstOffset</param>
/// <param name="consumedContig">Contig to be merged</param>
private static void MergeLowerIndexedContig(Contig newContig, ItemScore globalBest, Contig consumedContig)
{
    foreach (Contig.AssembledSequence member in consumedContig.Sequences)
    {
        Contig.AssembledSequence placed = new Contig.AssembledSequence();

        // Reversing twice (or complementing twice) cancels out, hence exclusive-or
        // between the member's own flag and the flag of the alignment.
        placed.IsReversed = member.IsReversed ^ globalBest.Reversed;
        placed.IsComplemented = member.IsComplemented ^ globalBest.Complemented;

        // When the contig was aligned reversed, a member's offset is measured from
        // the contig's right end; otherwise its existing position is used as is.
        int shift = globalBest.Reversed
            ? consumedContig.Length - (member.Sequence.Count + member.Position)
            : member.Position;
        placed.Position = globalBest.FirstOffset + shift;

        placed.Sequence = SequenceWithoutTerminalGaps(member.Sequence);
        newContig.Sequences.Add(placed);

        if (!Trace.Want(Trace.AssemblyDetails))
        {
            continue;
        }

        ApplicationLog.WriteLine(
            "\tseq (rev = {0} comp = {1} pos = {2}) {3}",
            placed.IsReversed,
            placed.IsComplemented,
            placed.Position,
            placed.Sequence);
    }
}
/// <summary>
/// Initializes a new instance of the PoolItem class.
/// Constructor for a contig: delegates to the two-argument constructor,
/// passing the contig together with <c>true</c> as the second argument.
/// </summary>
/// <remarks>
/// NOTE(review): the second argument presumably marks the item as a contig
/// (as opposed to a plain sequence) - confirm against the two-argument constructor.
/// </remarks>
/// <param name="item">Pool object</param>
internal PoolItem(Contig item) : this(item, true) { }
/// <summary>
/// Assemble the input sequences into the largest possible contigs.
/// </summary>
/// <remarks>
/// The algorithm is:
/// 1. initialize list of contigs to empty list. List of seqs is passed as argument.
/// 2. compute pairwise overlap scores for each pair of input seqs (with reversal and
///    complementation as appropriate).
/// 3. choose best overlap score. the "merge items" (can be seqs or contigs) are the
///    items with that score. If best score is less than threshold, assembly is finished.
/// 4. merge the merge items into a single contig and remove them from their list(s)
/// 5. compute the overlap between new item and all existing items
/// 6. go to step 3
/// A merge is only attempted when a candidate pair was actually recorded, so inputs
/// with fewer than two sequences (or with no pair scoring above the initial
/// placeholder score) are returned unmerged regardless of the MergeThreshold value.
/// </remarks>
/// <param name="inputSequences">The sequences to assemble.</param>
/// <returns>Returns the OverlapDeNovoAssembly instance which contains list of
/// contigs and list of unmerged sequences which are result of this assembly.</returns>
/// <exception cref="System.ArgumentNullException">
/// Thrown when <paramref name="inputSequences"/> is null.
/// </exception>
public IDeNovoAssembly Assemble(IList<ISequence> inputSequences)
{
    if (inputSequences == null)
    {
        throw new System.ArgumentNullException("inputSequences");
    }

    // Initializations: the alphabet of the first sequence drives consensus resolution.
    if (inputSequences.Count > 0)
    {
        _sequenceAlphabet = inputSequences[0].Alphabet;
        if (ConsensusResolver == null)
        {
            ConsensusResolver = new SimpleConsensusResolver(_sequenceAlphabet);
        }
        else
        {
            ConsensusResolver.SequenceAlphabet = _sequenceAlphabet;
        }
    }

    // numbering convention: every pool item (whether sequence or contig)
    // gets a fixed number.
    // sequence index = index into inputs (which we won't modify)
    // contig index = nSequences + index into contigs
    List<PoolItem> pool = new List<PoolItem>();
    foreach (ISequence seq in inputSequences)
    {
        pool.Add(new PoolItem(seq));
    }

    // put all the initial sequences into the pool, and generate the pair scores.
    // there are no contigs in the pool yet.
    // to save an iteration, we'll also find the best global score as we go.
    ItemScore globalBest = new ItemScore(-1, -1, false, false, 0, 0);

    // Index of the higher-indexed member of the best pair; -1 means "no candidate yet".
    int globalBestLargerIndex = -1;
    int unconsumedCount = inputSequences.Count;

    // Compute alignment scores for all combinations between input sequences.
    // Store these scores in the pool item corresponding to each sequence.
    for (int newSeq = 0; newSeq < pool.Count; ++newSeq)
    {
        PoolItem newItem = pool[newSeq];
        for (int oldSeq = 0; oldSeq < newSeq; ++oldSeq)
        {
            PoolItem oldItem = pool[oldSeq];
            ItemScore score = AlignSequence(oldItem.SequenceOrConsensus, newItem.SequenceOrConsensus, oldSeq, newSeq);
            newItem.Scores.Add(score);
            if (score.OverlapScore > globalBest.OverlapScore)
            {
                globalBest = new ItemScore(score);
                globalBestLargerIndex = newSeq;
            }
        }
    }

    // Merge sequences if a candidate pair exists and its score is above threshold,
    // and add the new contig to the pool. The index check prevents indexing the pool
    // with -1 when no pair was recorded.
    if (globalBestLargerIndex >= 0 && globalBest.OverlapScore >= MergeThreshold)
    {
        if (Trace.Want(Trace.AssemblyDetails))
        {
            ApplicationLog.WriteLine("Merging (overlap score {0}):", globalBest.OverlapScore);
        }

        PoolItem mergeItem1 = pool[globalBest.OtherItem];
        PoolItem mergeItem2 = pool[globalBestLargerIndex];
        Contig newContig = new Contig();
        if (Trace.Want(Trace.AssemblyDetails))
        {
            ApplicationLog.WriteLine(
                "new pool item {0} will merge old items {1} and {2}",
                pool.Count, globalBest.OtherItem, globalBestLargerIndex);
        }

        // At this point the pool holds only plain sequences, never contigs.
        MergeLowerIndexedSequence(newContig, globalBest, mergeItem1.Sequence);
        MergeHigherIndexedSequence(newContig, globalBest, mergeItem2.Sequence);

        MakeConsensus(newContig);

        // Set ConsumedBy value and
        // free memory as these sequences are no longer used
        mergeItem1.ConsumedBy = pool.Count;
        mergeItem2.ConsumedBy = pool.Count;
        mergeItem1.FreeSequences();
        mergeItem2.FreeSequences();

        pool.Add(new PoolItem(newContig));

        // Two items were consumed and one was added.
        unconsumedCount--;

        while (unconsumedCount > 1)
        {
            // Compute scores for each unconsumed item against the newest contig,
            // which is always the last pool entry.
            int newSeq = pool.Count - 1;
            PoolItem newItem = pool[newSeq];
            for (int oldSeq = 0; oldSeq < pool.Count - 1; ++oldSeq)
            {
                PoolItem oldItem = pool[oldSeq];
                if (oldItem.ConsumedBy >= 0)
                {
                    // already consumed - just add dummy score to maintain correct indices
                    newItem.Scores.Add(new ItemScore());
                }
                else
                {
                    ItemScore score = AlignSequence(oldItem.SequenceOrConsensus, newItem.SequenceOrConsensus, oldSeq, newSeq);
                    newItem.Scores.Add(score);
                }
            }

            // find best global score in the modified pool.
            globalBest = new ItemScore(-1, -1, false, false, 0, 0);
            globalBestLargerIndex = -1;
            for (int current = 0; current < pool.Count; ++current)
            {
                PoolItem curItem = pool[current];
                if (curItem.ConsumedBy < 0)
                {
                    for (int other = 0; other < current; ++other)
                    {
                        if (pool[other].ConsumedBy < 0)
                        {
                            ItemScore itemScore = curItem.Scores[other];
                            if (itemScore.OverlapScore > globalBest.OverlapScore)
                            {
                                globalBest = new ItemScore(itemScore);  // copy the winner so far
                                globalBestLargerIndex = current;
                            }
                        }
                    }
                }
            }

            if (globalBestLargerIndex >= 0 && globalBest.OverlapScore >= MergeThreshold)
            {
                // Merge sequences / contigs if above threshold
                mergeItem1 = pool[globalBest.OtherItem];
                mergeItem2 = pool[globalBestLargerIndex];
                newContig = new Contig();

                if (mergeItem1.IsContig)
                {
                    if (Trace.Want(Trace.AssemblyDetails))
                    {
                        ApplicationLog.WriteLine(
                            "item {0} is a contig (reversed = {1}, complemented = {2}, offset = {3}",
                            globalBest.OtherItem, globalBest.Reversed, globalBest.Complemented, globalBest.FirstOffset);
                    }

                    MergeLowerIndexedContig(newContig, globalBest, mergeItem1.Contig);
                }
                else
                {
                    if (Trace.Want(Trace.AssemblyDetails))
                    {
                        ApplicationLog.WriteLine(
                            "item {0} is a sequence (reversed = {1}, complemented = {2}, offset = {3}",
                            globalBest.OtherItem, globalBest.Reversed, globalBest.Complemented, globalBest.FirstOffset);
                    }

                    MergeLowerIndexedSequence(newContig, globalBest, mergeItem1.Sequence);
                }

                if (mergeItem2.IsContig)
                {
                    if (Trace.Want(Trace.AssemblyDetails))
                    {
                        ApplicationLog.WriteLine(
                            "item {0} is a contig (offset = {1}",
                            globalBestLargerIndex, globalBest.SecondOffset);
                    }

                    MergeHigherIndexedContig(newContig, globalBest, mergeItem2.Contig);
                }
                else
                {
                    if (Trace.Want(Trace.AssemblyDetails))
                    {
                        ApplicationLog.WriteLine(
                            "item {0} is a sequence (offset = {1}",
                            globalBestLargerIndex, globalBest.SecondOffset);
                    }

                    MergeHigherIndexedSequence(newContig, globalBest, mergeItem2.Sequence);
                }

                MakeConsensus(newContig);
                if (Trace.Want(Trace.AssemblyDetails))
                {
                    Dump(newContig);
                }

                // Set ConsumedBy value for these poolItems and
                // free memory as these sequences are no longer used
                mergeItem1.ConsumedBy = pool.Count;
                mergeItem2.ConsumedBy = pool.Count;
                mergeItem1.FreeSequences();
                mergeItem2.FreeSequences();

                pool.Add(new PoolItem(newContig));
                unconsumedCount--;
            }
            else
            {
                // None of the alignment scores cross threshold
                // No more merges possible. So end iteration.
                break;
            }
        }
    }

    // no further qualifying merges, so we're done.
    // populate contigs and unmergedSequences with every item that was never consumed
    OverlapDeNovoAssembly sequenceAssembly = new OverlapDeNovoAssembly();
    foreach (PoolItem curItem in pool)
    {
        if (curItem.ConsumedBy < 0)
        {
            if (curItem.IsContig)
            {
                sequenceAssembly.Contigs.Add(curItem.Contig);
            }
            else
            {
                sequenceAssembly.UnmergedSequences.Add(curItem.Sequence);
            }
        }
    }

    return sequenceAssembly;
}