Exemple #1
0
        /// <summary>
        /// Assemble the input sequences into the largest possible contigs.
        /// </summary>
        /// <remarks>
        /// The algorithm is:
        /// 1.  copy the input sequences into a pool; every pool item (sequence or contig)
        ///     keeps a fixed index for the rest of the run.
        /// 2.  compute pairwise overlap scores for each pair of input seqs (with reversal and
        ///     complementation as appropriate).
        /// 3.  choose best overlap score. The "merge items" (can be seqs or contigs) are the
        ///     items with that score. If best score is less than MergeThreshold, or no
        ///     candidate pair exists, assembly is finished.
        /// 4.  merge the merge items into a single contig, mark them as consumed and append
        ///     the new contig to the pool.
        /// 5.  compute the overlap between the new contig and all unconsumed items.
        /// 6.  go to step 3.
        /// The input enumerable is enumerated exactly once.
        /// </remarks>
        /// <param name="inputSequences">The sequences to assemble.</param>
        /// <returns>Returns the OverlapDeNovoAssembly instance which contains list of
        /// contigs and list of unmerged sequences which are result of this assembly.</returns>
        /// <exception cref="ArgumentNullException">
        /// Thrown when <paramref name="inputSequences"/> is null.
        /// </exception>
        public IDeNovoAssembly Assemble(IEnumerable<ISequence> inputSequences)
        {
            if (null == inputSequences)
            {
                throw new ArgumentNullException(Properties.Resource.ParameterNameInputSequences);
            }

            // Snapshot the input once. Enumerating the caller's IEnumerable repeatedly
            // (Count / First / foreach / Count) re-evaluates lazy sources and could leave
            // the unconsumed counter out of step with the pool contents.
            List<ISequence> sequences = new List<ISequence>(inputSequences);

            // Initializations: the alphabet of the first sequence drives consensus resolution.
            if (sequences.Count > 0)
            {
                _sequenceAlphabet = sequences[0].Alphabet;

                if (ConsensusResolver == null)
                {
                    ConsensusResolver = new SimpleConsensusResolver(_sequenceAlphabet);
                }
                else
                {
                    ConsensusResolver.SequenceAlphabet = _sequenceAlphabet;
                }
            }

            // numbering convention: every pool item (whether sequence or contig)
            // gets a fixed number.
            // sequence index = index into inputs (which we won't modify)
            // contig index = nSequences + index into contigs
            List<PoolItem> pool = new List<PoolItem>(sequences.Count);
            foreach (ISequence seq in sequences)
            {
                pool.Add(new PoolItem(seq));
            }

            // All the initial sequences are now in the pool; there are no contigs yet.
            // To save an iteration, the best global score is tracked while the pair
            // scores are generated. globalBestLargerIndex stays -1 until some pair wins.
            ItemScore globalBest            = new ItemScore(-1, -1, false, false, 0, 0);
            int       globalBestLargerIndex = -1;
            int       unconsumedCount       = pool.Count;

            // Compute alignment scores for all combinations between input sequences.
            // Store these scores in the pool item corresponding to the higher-indexed
            // sequence, so that Scores[i] is the score against pool item i.
            for (int newSeq = 0; newSeq < pool.Count; ++newSeq)
            {
                PoolItem newItem = pool[newSeq];
                for (int oldSeq = 0; oldSeq < newSeq; ++oldSeq)
                {
                    PoolItem  oldItem = pool[oldSeq];
                    ItemScore score   = AlignSequence(oldItem.SequenceOrConsensus, newItem.SequenceOrConsensus, oldSeq, newSeq);
                    newItem.Scores.Add(score);
                    if (score.OverlapScore > globalBest.OverlapScore)
                    {
                        globalBest            = new ItemScore(score);
                        globalBestLargerIndex = newSeq;
                    }
                }
            }

            // Merge sequences if the best score is above threshold and add the new contig
            // to the pool. The index check covers the case where no pair was selected
            // (e.g. fewer than two sequences): without it, a sufficiently low
            // MergeThreshold would lead to indexing the pool with -1.
            if (globalBestLargerIndex >= 0 && globalBest.OverlapScore >= MergeThreshold)
            {
                if (Trace.Want(Trace.AssemblyDetails))
                {
                    ApplicationLog.WriteLine("Merging (overlap score {0}):", globalBest.OverlapScore);
                }

                PoolItem mergeItem1 = pool[globalBest.OtherItem];
                PoolItem mergeItem2 = pool[globalBestLargerIndex];
                Contig   newContig  = new Contig();
                if (Trace.Want(Trace.AssemblyDetails))
                {
                    ApplicationLog.WriteLine(
                        "new pool item {0} will merge old items {1} and {2}",
                        pool.Count,
                        globalBest.OtherItem,
                        globalBestLargerIndex);
                }

                // The pool holds only plain sequences at this point, so no contig handling
                // is needed for the first merge.
                MergeLowerIndexedSequence(newContig, globalBest, mergeItem1.Sequence);
                MergeHigherIndexedSequence(newContig, globalBest, mergeItem2.Sequence);

                MakeConsensus(newContig);

                // Set ConsumedBy to the index the new contig is about to receive and
                // free memory as these sequences are no longer used.
                mergeItem1.ConsumedBy = pool.Count;
                mergeItem2.ConsumedBy = pool.Count;
                mergeItem1.FreeSequences();
                mergeItem2.FreeSequences();
                pool.Add(new PoolItem(newContig));

                // Two items consumed, one contig added: net change is minus one.
                unconsumedCount--;

                while (unconsumedCount > 1)
                {
                    // Compute scores for each unconsumed item against the newest contig,
                    // which is always the last pool entry.
                    int      newSeq  = pool.Count - 1;
                    PoolItem newItem = pool[newSeq];
                    for (int oldSeq = 0; oldSeq < newSeq; ++oldSeq)
                    {
                        PoolItem oldItem = pool[oldSeq];
                        if (oldItem.ConsumedBy >= 0)
                        {
                            // already consumed - just add dummy score to maintain correct indices
                            newItem.Scores.Add(new ItemScore());
                        }
                        else
                        {
                            ItemScore score = AlignSequence(oldItem.SequenceOrConsensus, newItem.SequenceOrConsensus, oldSeq, newSeq);
                            newItem.Scores.Add(score);
                        }
                    }

                    // Find the best global score among all pairs of unconsumed items
                    // in the modified pool.
                    globalBest            = new ItemScore(-1, -1, false, false, 0, 0);
                    globalBestLargerIndex = -1;
                    for (int current = 0; current < pool.Count; ++current)
                    {
                        PoolItem curItem = pool[current];
                        if (curItem.ConsumedBy < 0)
                        {
                            for (int other = 0; other < current; ++other)
                            {
                                if (pool[other].ConsumedBy < 0)
                                {
                                    ItemScore itemScore = curItem.Scores[other];
                                    if (itemScore.OverlapScore > globalBest.OverlapScore)
                                    {
                                        globalBest            = new ItemScore(itemScore); // copy the winner so far
                                        globalBestLargerIndex = current;
                                    }
                                }
                            }
                        }
                    }

                    // Same guard as for the first merge: never index the pool unless a
                    // winning pair was actually recorded.
                    if (globalBestLargerIndex >= 0 && globalBest.OverlapScore >= MergeThreshold)
                    {
                        // Merge sequences / contigs if above threshold
                        mergeItem1 = pool[globalBest.OtherItem];
                        mergeItem2 = pool[globalBestLargerIndex];
                        newContig  = new Contig();

                        if (mergeItem1.IsContig)
                        {
                            if (Trace.Want(Trace.AssemblyDetails))
                            {
                                ApplicationLog.WriteLine(
                                    "item {0} is a contig (reversed = {1}, complemented = {2}, offset = {3}",
                                    globalBest.OtherItem,
                                    globalBest.Reversed,
                                    globalBest.Complemented,
                                    globalBest.FirstOffset);
                            }

                            MergeLowerIndexedContig(newContig, globalBest, mergeItem1.Contig);
                        }
                        else
                        {
                            if (Trace.Want(Trace.AssemblyDetails))
                            {
                                ApplicationLog.WriteLine(
                                    "item {0} is a sequence (reversed = {1}, complemented = {2}, offset = {3}",
                                    globalBest.OtherItem,
                                    globalBest.Reversed,
                                    globalBest.Complemented,
                                    globalBest.FirstOffset);
                            }

                            MergeLowerIndexedSequence(newContig, globalBest, mergeItem1.Sequence);
                        }

                        if (mergeItem2.IsContig)
                        {
                            if (Trace.Want(Trace.AssemblyDetails))
                            {
                                ApplicationLog.WriteLine(
                                    "item {0} is a contig (offset = {1}",
                                    globalBestLargerIndex,
                                    globalBest.SecondOffset);
                            }

                            MergeHigherIndexedContig(newContig, globalBest, mergeItem2.Contig);
                        }
                        else
                        {
                            if (Trace.Want(Trace.AssemblyDetails))
                            {
                                ApplicationLog.WriteLine(
                                    "item {0} is a sequence (offset = {1}",
                                    globalBestLargerIndex,
                                    globalBest.SecondOffset);
                            }

                            MergeHigherIndexedSequence(newContig, globalBest, mergeItem2.Sequence);
                        }

                        MakeConsensus(newContig);
                        if (Trace.Want(Trace.AssemblyDetails))
                        {
                            Dump(newContig);
                        }

                        // Set ConsumedBy value for these pool items and
                        // free memory as these sequences are no longer used
                        mergeItem1.ConsumedBy = pool.Count;
                        mergeItem2.ConsumedBy = pool.Count;
                        mergeItem1.FreeSequences();
                        mergeItem2.FreeSequences();

                        pool.Add(new PoolItem(newContig));
                        unconsumedCount--;
                    }
                    else
                    {
                        // None of the alignment scores cross threshold
                        // No more merges possible. So end iteration.
                        break;
                    }
                }
            }

            // No further qualifying merges, so we're done.
            // Every pool item that was never consumed is part of the result: contigs go
            // to Contigs, untouched input sequences go to UnmergedSequences.
            OverlapDeNovoAssembly sequenceAssembly = new OverlapDeNovoAssembly();
            foreach (PoolItem curItem in pool)
            {
                if (curItem.ConsumedBy < 0)
                {
                    if (curItem.IsContig)
                    {
                        sequenceAssembly.Contigs.Add(curItem.Contig);
                    }
                    else
                    {
                        sequenceAssembly.UnmergedSequences.Add(curItem.Sequence);
                    }
                }
            }

            return sequenceAssembly;
        }