示例#1
0
        /// <summary>
        /// Builds the graph from the given sequences. A producer task converts each
        /// accepted sequence into k-mers and queues them in blocks; the calling thread
        /// consumes the blocks in parallel, registering every k-mer in a
        /// <c>KmerDictionary</c>. Afterwards the nodes are copied into <c>_nodes</c>,
        /// links are generated via <c>GenerateLinks</c>, and each node's Left/Right
        /// references are cleared.
        /// Sequences whose alphabet is not <c>Alphabets.DNA</c>, or that contain a gap
        /// symbol, are counted as skipped and contribute no k-mers.
        /// </summary>
        /// <param name="sequences">Input sequences; enumerated by the producer task.</param>
        /// <exception cref="ArgumentNullException"><paramref name="sequences"/> is null.</exception>
        /// <exception cref="ArgumentException">KmerLength exceeds <c>KmerData32.MAX_KMER_LENGTH</c>.</exception>
        public void Build(IEnumerable<ISequence> sequences)
        {
            // Size of Kmer List to grab, somewhat arbitrary but want to keep list size below large object threshold, which is ~85 kb
            const int blockSize = 4096;

            // When to add list to blocking collection, most short reads are <=151 bp so this should avoid needing to grow the list
            const int addThreshold = blockSize - 151;

            // Number of queued lists above which the producer pauses (roughly 2 million k-mers)
            const int stopAddThreshold = 2000000 / blockSize;

            if (sequences == null)
                throw new ArgumentNullException("sequences");

            if (KmerLength > KmerData32.MAX_KMER_LENGTH)
                throw new ArgumentException(Properties.Resource.KmerLengthGreaterThan31);

            // A dictionary of k-mers to de Bruijn nodes
            KmerDictionary kmerManager = new KmerDictionary();

            // Create the producer task.
            var kmerDataCollection = new BlockingCollection<List<KmerData32>>();
            Task producer = Task.Factory.StartNew(() =>
            {
                try
                {
                    List<KmerData32> kmerList = new List<KmerData32>(blockSize);

                    IAlphabet alphabet = Alphabets.DNA;
                    HashSet<byte> gapSymbols;
                    alphabet.TryGetGapSymbols(out gapSymbols);

                    // Generate the kmers from the sequences
                    foreach (ISequence sequence in sequences)
                    {
                        // if the sequence alphabet is not of type DNA then ignore it.
                        bool skipSequence = sequence.Alphabet != Alphabets.DNA;

                        // if the sequence contains any gap symbols then ignore the sequence.
                        // The result of TryGetGapSymbols is not checked above, so guard
                        // against a null set instead of failing inside the task.
                        if (!skipSequence && gapSymbols != null)
                        {
                            foreach (byte symbol in gapSymbols)
                            {
                                for (long index = 0; index < sequence.Count; ++index)
                                {
                                    if (sequence[index] == symbol)
                                    {
                                        skipSequence = true;
                                        break;
                                    }
                                }

                                if (skipSequence)
                                    break;
                            }
                        }

                        if (skipSequence)
                        {
                            // A skipped sequence still counts as processed.
                            Interlocked.Increment(ref _skippedSequencesCount);
                            Interlocked.Increment(ref _processedSequencesCount);
                            continue;
                        }

                        // if the blocking collection count is exceeding 2 million kmers wait for 2 sec 
                        // so that the task can remove some kmers and create the nodes. 
                        // This will avoid OutofMemoryException
                        while (kmerDataCollection.Count > stopAddThreshold)
                        {
                            Task.Delay(TimeSpan.FromSeconds(2)).Wait();
                        }

                        // Convert sequences to k-mers
                        kmerList.AddRange(KmerData32.GetKmers(sequence, KmerLength));

                        // Most reads are <=150 basepairs, so this should avoid having to grow the list
                        // by keeping it below blockSize
                        if (kmerList.Count > addThreshold)
                        {
                            kmerDataCollection.Add(kmerList);

                            // Same capacity as the first list (was a stray literal 4092).
                            kmerList = new List<KmerData32>(blockSize);
                        }
                        Interlocked.Increment(ref _processedSequencesCount);
                    }

                    // Flush whatever is left. A full list has already been queued and
                    // replaced inside the loop, so only non-empty remainders need adding.
                    if (kmerList.Count > 0)
                        kmerDataCollection.Add(kmerList);
                }
                finally
                {
                    // Always release the consumers, even if the producer failed.
                    kmerDataCollection.CompleteAdding();
                }
            });

            // Consume k-mers by adding them to binary tree structure as nodes
            Parallel.ForEach(kmerDataCollection.GetConsumingEnumerable(), newKmerList =>
            {
                foreach (KmerData32 newKmer in newKmerList)
                {
                    // Create Vertex
                    DeBruijnNode node = kmerManager.SetNewOrGetOld(newKmer);

                    // The unlocked check is only a fast path for saturated nodes; the
                    // cap is re-checked under the lock so that two consumers racing on
                    // the same node cannot both increment past it.
                    if (node.KmerCount <= 255)
                    {
                        lock (node)
                        {
                            if (node.KmerCount <= 255)
                                node.KmerCount++;
                        }
                    }
                }
            });

            // Ensure producer exceptions are handled (Wait rethrows them).
            producer.Wait();

            // Done filling binary tree
            kmerDataCollection.Dispose();

            //NOTE: To speed enumeration make the nodes into an array and dispose of the collection
            _nodeCount = kmerManager.NodeCount;
            _nodes = kmerManager.GenerateNodeArray();

            // Generate the links
            GenerateLinks(kmerManager);

            // Since we no longer need to search for values set left and right nodes of child array to null
            // so that they are available for GC if no longer needed
            foreach (DeBruijnNode node in _nodes)
            {
                node.Left = node.Right = null;
            }

            GraphBuildCompleted = true;
        }
        /// <summary>
        /// Build graph nodes and edges from list of k-mers.
        /// Creates a node for every unique k-mer (and reverse-complement)
        /// in the read. Then, generates adjacency information between nodes
        /// by computing pairs of nodes that have overlapping regions
        /// between node sequences.
        /// Sequences whose alphabet is not <c>Alphabets.NoGapDNA</c>, or that are
        /// shorter than the k-mer length, are counted as skipped.
        /// </summary>
        /// <param name="sequences">List of input sequences.</param>
        /// <param name="destroyKmerManagerAfterwards">MT Assembler specific flag.
        /// When true (the default) the local k-mer dictionary reference is dropped and
        /// each node's Left/Right references are cleared after the links are generated;
        /// when false the dictionary is stored in <c>KmerManager</c> instead.</param>
        /// <exception cref="ArgumentNullException"><paramref name="sequences"/> is null.</exception>
        public void Build(IEnumerable <ISequence> sequences, bool destroyKmerManagerAfterwards = true)
        {
            if (sequences == null)
            {
                throw new ArgumentNullException("sequences");
            }

            // Build the dictionary of k-mers to de Bruijn nodes
            var kmerManager        = new KmerDictionary();
            var kmerDataCollection = new BlockingCollection <List <KmerData32> >();

            // Create the producer task
            Task theProducer = Task.Factory.StartNew(() =>
            {
                // Paired with exactly one EndCriticalRegion in the finally block below.
                Thread.BeginCriticalRegion();
                try
                {
#if DEBUG
                    // Progress counter, only needed for the debug trace below.
                    int i = 0;
#endif
                    var kmerList = new List <KmerData32>(BlockSize);

                    // Generate the kmers from the sequences
                    foreach (ISequence sequence in sequences)
                    {
#if DEBUG
                        i++;
                        if (i % 50000 == 0)
                        {
                            //TODO: This is reported each 5 minutes anyway.
                            Console.WriteLine("Parsed: " + i.ToString() + " reads");
                        }
#endif
                        // Ignore sequences that are not gap-free DNA or are too short to yield a k-mer.
                        if (sequence.Alphabet != Alphabets.NoGapDNA || sequence.Count < _kmerLength)
                        {
                            // A skipped sequence still counts as processed.
                            Interlocked.Increment(ref this._skippedSequencesCount);
                            Interlocked.Increment(ref this._processedSequencesCount);
                            continue;
                        }

                        // If the blocking collection holds more than StopAddThreshold lists,
                        // poll every 2 ms until the consumers have removed some of them.
                        // This will avoid OutofMemoryException
                        while (kmerDataCollection.Count > StopAddThreshold)
                        {
                            Thread.Sleep(2);
                        }

                        // Convert sequences to k-mers
                        var kmers = KmerData32.GetKmers(sequence, this.KmerLength);
                        kmerList.AddRange(kmers);

                        // Most reads are <=150 basepairs, so this should avoid having to grow the list
                        // by keeping it below blockSize
                        if (kmerList.Count > AddThreshold)
                        {
                            kmerDataCollection.Add(kmerList);
                            kmerList = new List <KmerData32>(BlockSize);
                        }

                        Interlocked.Increment(ref this._processedSequencesCount);
                    }

                    // Flush whatever is left. A full list has already been queued and
                    // replaced inside the loop, so only non-empty remainders need adding.
                    if (kmerList.Count > 0)
                    {
                        kmerDataCollection.Add(kmerList);
                    }
                }
                finally
                {
                    // Always release the consumers, even if the producer failed.
                    kmerDataCollection.CompleteAdding();

                    // Close the critical region exactly once, on every exit path.
                    Thread.EndCriticalRegion();
                }
            });

            // Consume k-mers by adding them to binary tree structure as nodes.
            // NOTE: a sequential fallback gated on Bio.CrossPlatform.Environment.RunningInMono
            // was already disabled (the condition was hard-wired to true), so the
            // parallel path is the only one that ever ran.
            Parallel.ForEach(kmerDataCollection.GetConsumingEnumerable(),
                             new ParallelOptions()
            {
                MaxDegreeOfParallelism = Environment.ProcessorCount
            }, newKmerList =>
            {
                foreach (KmerData32 newKmer in newKmerList)
                {
                    // Create Vertex
                    DeBruijnNode node = kmerManager.SetNewOrGetOld(newKmer);
                    Debug.Assert(newKmer.KmerData == node.NodeValue.KmerData);
                }
            });

            // Done filling binary tree
            theProducer.Wait(); // Make sure task is finished - also rethrows any exception here.
            kmerDataCollection.Dispose();

            // NOTE: To speed enumeration make the nodes into an array and dispose of the collection
            this._nodeCount = kmerManager.NodeCount;
            this._nodes     = kmerManager.GenerateNodeArray();

            // Generate the links
            this.GenerateLinks(kmerManager);

            if (destroyKmerManagerAfterwards)
            {
                // Since we no longer need to search for values delete tree structure, also set left and right nodes of child array to null
                // So that they are available for GC if no longer needed
                kmerManager = null;
                foreach (DeBruijnNode node in _nodes)
                {
                    node.Left  = null;
                    node.Right = null;
                }
            }
            else
            {
                // Keep the dictionary reachable for later lookups.
                KmerManager = kmerManager;
            }
            this.GraphBuildCompleted = true;
        }
示例#3
0
        /// <summary>
        /// Builds the graph from the given sequences. A producer task turns each
        /// accepted sequence into k-mers and hands them over in blocks through a
        /// blocking collection; the blocks are consumed in parallel and every k-mer is
        /// registered in a <c>KmerDictionary</c>. The nodes are then copied into
        /// <c>_nodes</c>, links are generated via <c>GenerateLinks</c>, and the
        /// Left/Right references of every node are cleared.
        /// A sequence is skipped (and counted as such) when its alphabet is not
        /// <c>Alphabets.DNA</c> or when it contains a gap symbol.
        /// </summary>
        /// <param name="sequences">Input sequences; enumerated by the producer task.</param>
        /// <exception cref="ArgumentNullException"><paramref name="sequences"/> is null.</exception>
        /// <exception cref="ArgumentException">KmerLength exceeds <c>KmerData32.MAX_KMER_LENGTH</c>.</exception>
        public void Build(IEnumerable <ISequence> sequences)
        {
            // Size of Kmer List to grab, somewhat arbitrary but want to keep list size below large object threshold, which is ~85 kb
            const int blockSize = 4096;

            // When to add list to blocking collection, most short reads are <=151 bp so this should avoid needing to grow the list
            const int addThreshold = blockSize - 151;

            // Number of queued lists above which the producer pauses (roughly 2 million k-mers)
            const int stopAddThreshold = 2000000 / blockSize;

            if (sequences == null)
            {
                throw new ArgumentNullException("sequences");
            }

            if (KmerLength > KmerData32.MAX_KMER_LENGTH)
            {
                throw new ArgumentException(Properties.Resource.KmerLengthGreaterThan31);
            }

            // A dictionary of k-mers to de Bruijn nodes
            KmerDictionary kmerManager = new KmerDictionary();

            // Create the producer task.
            var  kmerDataCollection = new BlockingCollection <List <KmerData32> >();
            Task producer           = Task.Factory.StartNew(() =>
            {
                try
                {
                    List <KmerData32> kmerList = new List <KmerData32>(blockSize);

                    IAlphabet alphabet = Alphabets.DNA;
                    HashSet <byte> gapSymbols;
                    alphabet.TryGetGapSymbols(out gapSymbols);

                    // Generate the kmers from the sequences
                    foreach (ISequence sequence in sequences)
                    {
                        // if the sequence alphabet is not of type DNA then ignore it.
                        bool skipSequence = sequence.Alphabet != Alphabets.DNA;

                        // if the sequence contains any gap symbols then ignore the sequence.
                        // TryGetGapSymbols' result is not inspected, so tolerate a null set
                        // rather than throwing from inside the producer.
                        if (!skipSequence && gapSymbols != null)
                        {
                            foreach (byte symbol in gapSymbols)
                            {
                                for (long index = 0; index < sequence.Count; ++index)
                                {
                                    if (sequence[index] == symbol)
                                    {
                                        skipSequence = true;
                                        break;
                                    }
                                }

                                if (skipSequence)
                                {
                                    break;
                                }
                            }
                        }

                        if (skipSequence)
                        {
                            // A skipped sequence still counts as processed.
                            Interlocked.Increment(ref _skippedSequencesCount);
                            Interlocked.Increment(ref _processedSequencesCount);
                            continue;
                        }

                        // if the blocking collection count is exceeding 2 million kmers wait for 2 sec
                        // so that the task can remove some kmers and create the nodes.
                        // This will avoid OutofMemoryException
                        while (kmerDataCollection.Count > stopAddThreshold)
                        {
                            Task.Delay(TimeSpan.FromSeconds(2)).Wait();
                        }

                        // Convert sequences to k-mers
                        kmerList.AddRange(KmerData32.GetKmers(sequence, KmerLength));

                        // Most reads are <=150 basepairs, so this should avoid having to grow the list
                        // by keeping it below blockSize
                        if (kmerList.Count > addThreshold)
                        {
                            kmerDataCollection.Add(kmerList);

                            // Use the shared block size (the literal here used to be 4092).
                            kmerList = new List <KmerData32>(blockSize);
                        }
                        Interlocked.Increment(ref _processedSequencesCount);
                    }

                    // Queue the remainder. Lists above the threshold were already queued
                    // and replaced in the loop, so the only case to exclude is an empty list.
                    if (kmerList.Count > 0)
                    {
                        kmerDataCollection.Add(kmerList);
                    }
                }
                finally
                {
                    // Always release the consumers, even if the producer failed.
                    kmerDataCollection.CompleteAdding();
                }
            });

            // Consume k-mers by adding them to binary tree structure as nodes
            Parallel.ForEach(kmerDataCollection.GetConsumingEnumerable(), newKmerList =>
            {
                foreach (KmerData32 newKmer in newKmerList)
                {
                    // Create Vertex
                    DeBruijnNode node = kmerManager.SetNewOrGetOld(newKmer);

                    // Cheap unlocked test first so saturated nodes never take the lock,
                    // then re-test under the lock: without the second test two threads
                    // could both pass the first one and both increment.
                    if (node.KmerCount <= 255)
                    {
                        lock (node)
                        {
                            if (node.KmerCount <= 255)
                            {
                                node.KmerCount++;
                            }
                        }
                    }
                }
            });

            // Ensure producer exceptions are handled (Wait rethrows them).
            producer.Wait();

            // Done filling binary tree
            kmerDataCollection.Dispose();

            //NOTE: To speed enumeration make the nodes into an array and dispose of the collection
            _nodeCount = kmerManager.NodeCount;
            _nodes     = kmerManager.GenerateNodeArray();

            // Generate the links
            GenerateLinks(kmerManager);

            // Since we no longer need to search for values set left and right nodes of child array to null
            // so that they are available for GC if no longer needed
            foreach (DeBruijnNode node in _nodes)
            {
                node.Left = node.Right = null;
            }

            GraphBuildCompleted = true;
        }