Example #1
0
        /// <summary>
        /// Connects each node of the graph to the nodes holding its neighbouring k-mers.
        /// For every node the four possible one-base extensions to the right and the four
        /// to the left are looked up, and each one that is found is recorded on the node.
        /// </summary>
        /// <param name="kmerManager">Dictionary queried for the node matching each candidate k-mer.</param>
        private void GenerateLinks(KmerDictionary kmerManager)
        {
            // Every nucleotide takes two bits, so the left-most one sits this many bits up.
            int leadingBaseOffset = (KmerLength - 1) * 2;

            // All ones except the two bits that encode the left-most nucleotide.
            ulong withoutLeadingBase = ~(3UL << leadingBaseOffset);

            Parallel.ForEach(_nodes, node =>
            {
                // One scratch k-mer per node, overwritten for each candidate neighbour.
                var probe = new KmerData32();
                DeBruijnNode neighbour = null;

                // Right extensions: clear the left-most base, then shift up by one base so
                // the two lowest bits are free to receive the appended nucleotide.
                ulong rightStem = (node.NodeValue.KmerData & withoutLeadingBase) << 2;
                ulong baseCode  = 0;
                while (baseCode < 4)
                {
                    // Equivalent to "ACGTA" + "N", where N is the 0-3 encoding for A,C,G,T.
                    // The returned flag says whether the reverse complement of the k-mer
                    // was used instead of the k-mer value itself.
                    bool usedReverseComplement = probe.SetKmerData(rightStem | baseCode, KmerLength);
                    neighbour = kmerManager.TryGetOld(probe);
                    if (neighbour != null)
                    {
                        node.SetExtensionNode(true, usedReverseComplement, neighbour);
                    }

                    baseCode++;
                }

                // Left extensions: chop off the right-most base, which leaves the two
                // top bits of the k-mer free for the prepended nucleotide.
                ulong leftStem = node.NodeValue.KmerData >> 2;
                for (baseCode = 0; baseCode < 4; baseCode++)
                {
                    // Equivalent to "N" + "ACGAT", with N placed in the left-most two bits.
                    ulong candidate            = leftStem | (baseCode << leadingBaseOffset);
                    bool usedReverseComplement = probe.SetKmerData(candidate, KmerLength);
                    neighbour = kmerManager.TryGetOld(probe);
                    if (neighbour != null)
                    {
                        node.SetExtensionNode(false, usedReverseComplement, neighbour);
                    }
                }
            });

            LinkGenerationCompleted = true;
        }
 /// <summary>
 /// Releases the kmer manager once the graph has been built.
 /// Called after additional sequences are searched for by MT Assembler.
 /// </summary>
 public void DestroyKmerManager()
 {
     // Only act when a manager is still held and the graph build has finished.
     if (KmerManager == null || !GraphBuildCompleted)
     {
         return;
     }

     // Lookups are no longer needed: drop the tree structure and clear each node's
     // child references so they are available for GC if no longer needed.
     KmerManager = null;
     foreach (DeBruijnNode graphNode in _nodes)
     {
         graphNode.Left = null;
         graphNode.Right = null;
     }
 }
        /// <summary>
        /// Build graph nodes and edges from list of k-mers.
        /// Creates a node for every unique k-mer (and reverse-complement)
        /// in the read. Then, generates adjacency information between nodes
        /// by computing pairs of nodes that have overlapping regions
        /// between node sequences.
        /// </summary>
        /// <param name="sequences">List of input sequences. Sequences whose alphabet is not
        /// <c>Alphabets.NoGapDNA</c>, or that are shorter than the k-mer length, are counted as
        /// skipped and contribute no k-mers.</param>
        /// <param name="destroyKmerManagerAfterwards">MT Assembler specific flag. When true (the default)
        /// the k-mer dictionary is discarded after the links are generated and every node's Left/Right
        /// references are cleared; when false the dictionary is kept in <c>KmerManager</c>.</param>
        /// <exception cref="ArgumentNullException">Thrown when <paramref name="sequences"/> is null.</exception>
        public void Build(IEnumerable <ISequence> sequences, bool destroyKmerManagerAfterwards = true)
        {
            if (sequences == null)
            {
                throw new ArgumentNullException("sequences");
            }

            // Build the dictionary of kmers to debruijin nodes
            var kmerManager        = new KmerDictionary();
            var kmerDataCollection = new BlockingCollection<List<KmerData32>>();

            // Create the producer task
            Task theProducer = Task.Factory.StartNew(() =>
            {
                // Begin/End must be balanced: the matching EndCriticalRegion is in the finally
                // block so it runs exactly once, whatever path leaves the try.
                Thread.BeginCriticalRegion();
                try
                {
#if DEBUG
                    int parsedCount = 0;
#endif
                    var kmerList = new List<KmerData32>(BlockSize);

                    // Generate the kmers from the sequences
                    foreach (ISequence sequence in sequences)
                    {
#if DEBUG
                        parsedCount++;
                        if (parsedCount % 50000 == 0)
                        {
                            //TODO: This is reported each 5 minutes anyway.
                            Console.WriteLine("Parsed: " + parsedCount.ToString() + " reads");
                        }
#endif
                        // Ignore sequences that are not gap-free DNA or are too short to hold a k-mer.
                        if (sequence.Alphabet != Alphabets.NoGapDNA || sequence.Count < _kmerLength)
                        {
                            Interlocked.Increment(ref this._skippedSequencesCount);
                            Interlocked.Increment(ref this._processedSequencesCount);
                            continue;
                        }

                        // Back-pressure: while more than StopAddThreshold lists are queued, poll every
                        // 2 ms so the consumers can drain the queue. This avoids OutOfMemoryException.
                        while (kmerDataCollection.Count > StopAddThreshold)
                        {
                            Thread.Sleep(2);
                        }

                        // Convert sequences to k-mers
                        var kmers = KmerData32.GetKmers(sequence, this.KmerLength);
                        kmerList.AddRange(kmers);

                        // Most reads are <=150 basepairs, so this should avoid having to grow the list
                        // by keeping it below blockSize
                        if (kmerList.Count > AddThreshold)
                        {
                            kmerDataCollection.Add(kmerList);
                            kmerList = new List<KmerData32>(BlockSize);
                        }

                        Interlocked.Increment(ref this._processedSequencesCount);
                    }

                    // Flush whatever is left. A full list is always handed over inside the loop,
                    // so the only thing to guard against here is enqueuing an empty list.
                    if (kmerList.Count > 0)
                    {
                        kmerDataCollection.Add(kmerList);
                    }
                }
                finally
                {
                    // Always signal completion so the consuming enumerable below can terminate.
                    kmerDataCollection.CompleteAdding();
                    Thread.EndCriticalRegion();
                }
            });

            // Consume k-mers by adding them to binary tree structure as nodes
            Parallel.ForEach(kmerDataCollection.GetConsumingEnumerable(),
                             new ParallelOptions()
            {
                MaxDegreeOfParallelism = Environment.ProcessorCount
            }, newKmerList =>
            {
                foreach (KmerData32 newKmer in newKmerList)
                {
                    // Create Vertex
                    DeBruijnNode node = kmerManager.SetNewOrGetOld(newKmer);
                    Debug.Assert(newKmer.KmerData == node.NodeValue.KmerData);
                }
            });

            // Done filling binary tree
            theProducer.Wait(); // Make sure task is finished - also rethrows any exception here.
            kmerDataCollection.Dispose();

            // NOTE: To speed enumeration make the nodes into an array and dispose of the collection
            this._nodeCount = kmerManager.NodeCount;
            this._nodes     = kmerManager.GenerateNodeArray();

            // Generate the links
            this.GenerateLinks(kmerManager);

            if (destroyKmerManagerAfterwards)
            {
                // Since we no longer need to search for values, let the tree structure go out of scope
                // and set left and right nodes of child array to null so that they are available for GC
                // if no longer needed
                foreach (DeBruijnNode node in _nodes)
                {
                    node.Left  = null;
                    node.Right = null;
                }
            }
            else
            {
                KmerManager = kmerManager;
            }

            this.GraphBuildCompleted = true;
        }
Example #4
0
        /// <summary>
        /// Adds the links between the nodes of the graph: for each node, every k-mer that
        /// overlaps it by all but one base on either side is looked up in the dictionary,
        /// and each match is stored on the node as a right or left extension.
        /// </summary>
        /// <param name="kmerManager">Dictionary used to find the node for a candidate k-mer.</param>
        private void GenerateLinks(KmerDictionary kmerManager)
        {
            // Number of bits between the start of the encoded k-mer and its first (left-most) nucleotide.
            int firstBaseShift = 2 * (KmerLength - 1);

            // Mask that removes the two bits representing the first nucleotide.
            ulong firstBasePair   = ((ulong)3) << firstBaseShift;
            ulong clearFirstBase  = ~firstBasePair;

            Parallel.ForEach(_nodes, node =>
            {
                KmerData32 lookupKey = new KmerData32();
                DeBruijnNode found   = null;

                // Right extensions - remove the first position from the value, then move it
                // over two bits to make room for the pair of bits of a new nucleotide.
                ulong shifted = node.NodeValue.KmerData & clearFirstBase;
                shifted <<= 2;
                for (ulong nucleotide = 0; nucleotide < 4; ++nucleotide)
                {
                    // Equivalent to "ACGTA"+"N" where N is the 0-3 encoding for A,C,G,T
                    ulong extended = shifted | nucleotide;

                    // SetKmerData reports whether the reverse complement of the kmer
                    // is used instead of the kmer value itself.
                    bool isReverseComplement = lookupKey.SetKmerData(extended, KmerLength);
                    found = kmerManager.TryGetOld(lookupKey);
                    if (found == null)
                    {
                        continue;
                    }

                    node.SetExtensionNode(true, isReverseComplement, found);
                }

                // Left extensions - chop off the right most basepair.
                shifted = node.NodeValue.KmerData;
                shifted = shifted >> 2;
                for (ulong nucleotide = 0; nucleotide < 4; ++nucleotide) // Cycle through A,C,G,T
                {
                    // Add the character on to the left side of the kmer.
                    // Equivalent to "N" + "ACGAT" where the basepair is added on as the 2 bits
                    ulong extended = (nucleotide << firstBaseShift) | shifted;

                    bool isReverseComplement = lookupKey.SetKmerData(extended, KmerLength);
                    found = kmerManager.TryGetOld(lookupKey);
                    if (found == null)
                    {
                        continue;
                    }

                    node.SetExtensionNode(false, isReverseComplement, found);
                }
            });

            LinkGenerationCompleted = true;
        }
Example #5
0
        /// <summary>
        /// Builds the graph from the given sequences. A producer task converts the sequences
        /// into k-mers while this thread's parallel loop turns them into nodes; afterwards the
        /// nodes are materialised, linked to their neighbours, and their Left/Right references cleared.
        /// </summary>
        /// <param name="sequences">Input sequences. A sequence whose alphabet is not
        /// <c>Alphabets.DNA</c>, or that contains a gap symbol, is counted as skipped and contributes no k-mers.</param>
        /// <exception cref="ArgumentNullException">Thrown when <paramref name="sequences"/> is null.</exception>
        /// <exception cref="ArgumentException">Thrown when KmerLength exceeds <c>KmerData32.MAX_KMER_LENGTH</c>.</exception>
        public void Build(IEnumerable<ISequence> sequences)
        {
            // Size of Kmer List to grab, somewhat arbitrary but want to keep list size below large object threshold, which is ~85 kb 
            const int blockSize = 4096;

            // When to add list to blocking collection, most short reads are <=151 bp so this should avoid needing to grow the list
            const int addThreshold = blockSize - 151;

            // When to pause adding
            const int stopAddThreshold = 2000000 / blockSize;

            if (sequences == null)
                throw new ArgumentNullException("sequences");

            if (KmerLength > KmerData32.MAX_KMER_LENGTH)
                throw new ArgumentException(Properties.Resource.KmerLengthGreaterThan31);

            // A dictionary kmers to debruijin nodes
            KmerDictionary kmerManager = new KmerDictionary();

            // Create the producer thread.
            var kmerDataCollection = new BlockingCollection<List<KmerData32>>();
            Task producer = Task.Factory.StartNew(() =>
            {
                try
                {
                    List<KmerData32> kmerList = new List<KmerData32>(blockSize);

                    // Fall back to an empty set when the alphabet reports no gap symbols,
                    // so the gap check below can never dereference null.
                    IAlphabet alphabet = Alphabets.DNA;
                    HashSet<byte> gapSymbols;
                    if (!alphabet.TryGetGapSymbols(out gapSymbols) || gapSymbols == null)
                        gapSymbols = new HashSet<byte>();

                    // Generate the kmers from the sequences
                    foreach (ISequence sequence in sequences)
                    {
                        // if the sequence alphabet is not of type DNA then ignore it.
                        bool skipSequence = sequence.Alphabet != Alphabets.DNA;

                        // if the sequence contains any gap symbols then ignore the sequence.
                        // Single pass over the sequence; the set lookup covers every gap symbol at once.
                        if (!skipSequence && gapSymbols.Count > 0)
                        {
                            for (long index = 0; index < sequence.Count; ++index)
                            {
                                if (gapSymbols.Contains(sequence[index]))
                                {
                                    skipSequence = true;
                                    break;
                                }
                            }
                        }

                        if (skipSequence)
                        {
                            Interlocked.Increment(ref _skippedSequencesCount);
                            Interlocked.Increment(ref _processedSequencesCount);
                            continue;
                        }

                        // if the blocking collection holds more than stopAddThreshold lists
                        // (about 2 million kmers) wait for 2 sec so that the consumer can remove
                        // some kmers and create the nodes. This will avoid OutofMemoryException
                        while (kmerDataCollection.Count > stopAddThreshold)
                        {
                            Task.Delay(TimeSpan.FromSeconds(2)).Wait();
                        }

                        // Convert sequences to k-mers
                        kmerList.AddRange(KmerData32.GetKmers(sequence, KmerLength));

                        // Most reads are <=150 basepairs, so this should avoid having to grow the list
                        // by keeping it below blockSize
                        if (kmerList.Count > addThreshold)
                        {
                            kmerDataCollection.Add(kmerList);
                            kmerList = new List<KmerData32>(blockSize);
                        }
                        Interlocked.Increment(ref _processedSequencesCount);
                    }

                    // Flush the remainder. Full lists are always handed over inside the loop,
                    // so only an empty list needs to be kept out of the queue here.
                    if (kmerList.Count > 0)
                        kmerDataCollection.Add(kmerList);
                }
                finally
                {
                    // Always signal completion so the consuming enumerable below can terminate.
                    kmerDataCollection.CompleteAdding();
                }
            });

            // Consume k-mers by adding them to binary tree structure as nodes
            Parallel.ForEach(kmerDataCollection.GetConsumingEnumerable(), newKmerList =>
            {
                foreach (KmerData32 newKmer in newKmerList)
                {
                    // Create Vertex
                    DeBruijnNode node = kmerManager.SetNewOrGetOld(newKmer);

                    // The unlocked test only lets saturated nodes skip the lock; the test is
                    // repeated under the lock so concurrent threads cannot push the count past the cap.
                    // NOTE(review): "<= 255" lets the count reach 256 (or wrap if KmerCount is a byte) -
                    // confirm against the declared type whether "< 255" is intended.
                    if (node.KmerCount <= 255)
                    {
                        lock (node)
                        {
                            if (node.KmerCount <= 255)
                            {
                                node.KmerCount++;
                            }
                        }
                    }
                }
            });

            // Ensure producer exceptions are handled.
            producer.Wait();

            // Done filling binary tree
            kmerDataCollection.Dispose();

            //NOTE: To speed enumeration make the nodes into an array and dispose of the collection
            _nodeCount = kmerManager.NodeCount;
            _nodes = kmerManager.GenerateNodeArray();
            
            // Generate the links
            GenerateLinks(kmerManager);
            
            // Since we no longer need to search for values set left and right nodes of child array to null
            // so that they are available for GC if no longer needed
            foreach (DeBruijnNode node in _nodes)
            {
                node.Left = node.Right = null;
            }

            GraphBuildCompleted = true;
        }
Example #6
0
        /// <summary>
        /// Builds the graph from the given sequences. A producer task converts the sequences
        /// into k-mers while this thread's parallel loop turns them into nodes; afterwards the
        /// nodes are materialised, linked to their neighbours, and their Left/Right references cleared.
        /// </summary>
        /// <param name="sequences">Input sequences. A sequence whose alphabet is not
        /// <c>Alphabets.DNA</c>, or that contains a gap symbol, is counted as skipped and contributes no k-mers.</param>
        /// <exception cref="ArgumentNullException">Thrown when <paramref name="sequences"/> is null.</exception>
        /// <exception cref="ArgumentException">Thrown when KmerLength exceeds <c>KmerData32.MAX_KMER_LENGTH</c>.</exception>
        public void Build(IEnumerable <ISequence> sequences)
        {
            // Size of Kmer List to grab, somewhat arbitrary but want to keep list size below large object threshold, which is ~85 kb
            const int blockSize = 4096;

            // When to add list to blocking collection, most short reads are <=151 bp so this should avoid needing to grow the list
            const int addThreshold = blockSize - 151;

            // When to pause adding
            const int stopAddThreshold = 2000000 / blockSize;

            if (sequences == null)
            {
                throw new ArgumentNullException("sequences");
            }

            if (KmerLength > KmerData32.MAX_KMER_LENGTH)
            {
                throw new ArgumentException(Properties.Resource.KmerLengthGreaterThan31);
            }

            // A dictionary kmers to debruijin nodes
            KmerDictionary kmerManager = new KmerDictionary();

            // Create the producer thread.
            var  kmerDataCollection = new BlockingCollection <List <KmerData32> >();
            Task producer           = Task.Factory.StartNew(() =>
            {
                try
                {
                    List <KmerData32> kmerList = new List <KmerData32>(blockSize);

                    // Fall back to an empty set when the alphabet reports no gap symbols,
                    // so the gap check below can never dereference null.
                    IAlphabet alphabet = Alphabets.DNA;
                    HashSet <byte> gapSymbols;
                    if (!alphabet.TryGetGapSymbols(out gapSymbols) || gapSymbols == null)
                    {
                        gapSymbols = new HashSet <byte>();
                    }

                    // Generate the kmers from the sequences
                    foreach (ISequence sequence in sequences)
                    {
                        // if the sequence alphabet is not of type DNA then ignore it.
                        bool skipSequence = sequence.Alphabet != Alphabets.DNA;

                        // if the sequence contains any gap symbols then ignore the sequence.
                        // Single pass over the sequence; the set lookup covers every gap symbol at once.
                        if (!skipSequence && gapSymbols.Count > 0)
                        {
                            for (long index = 0; index < sequence.Count; ++index)
                            {
                                if (gapSymbols.Contains(sequence[index]))
                                {
                                    skipSequence = true;
                                    break;
                                }
                            }
                        }

                        if (skipSequence)
                        {
                            Interlocked.Increment(ref _skippedSequencesCount);
                            Interlocked.Increment(ref _processedSequencesCount);
                            continue;
                        }

                        // if the blocking collection holds more than stopAddThreshold lists
                        // (about 2 million kmers) wait for 2 sec so that the consumer can remove
                        // some kmers and create the nodes. This will avoid OutofMemoryException
                        while (kmerDataCollection.Count > stopAddThreshold)
                        {
                            Task.Delay(TimeSpan.FromSeconds(2)).Wait();
                        }

                        // Convert sequences to k-mers
                        kmerList.AddRange(KmerData32.GetKmers(sequence, KmerLength));

                        // Most reads are <=150 basepairs, so this should avoid having to grow the list
                        // by keeping it below blockSize
                        if (kmerList.Count > addThreshold)
                        {
                            kmerDataCollection.Add(kmerList);
                            kmerList = new List <KmerData32>(blockSize);
                        }
                        Interlocked.Increment(ref _processedSequencesCount);
                    }

                    // Flush the remainder. Full lists are always handed over inside the loop,
                    // so only an empty list needs to be kept out of the queue here.
                    if (kmerList.Count > 0)
                    {
                        kmerDataCollection.Add(kmerList);
                    }
                }
                finally
                {
                    // Always signal completion so the consuming enumerable below can terminate.
                    kmerDataCollection.CompleteAdding();
                }
            });

            // Consume k-mers by adding them to binary tree structure as nodes
            Parallel.ForEach(kmerDataCollection.GetConsumingEnumerable(), newKmerList =>
            {
                foreach (KmerData32 newKmer in newKmerList)
                {
                    // Create Vertex
                    DeBruijnNode node = kmerManager.SetNewOrGetOld(newKmer);

                    // The unlocked test only lets saturated nodes skip the lock; the test is
                    // repeated under the lock so concurrent threads cannot push the count past the cap.
                    // NOTE(review): "<= 255" lets the count reach 256 (or wrap if KmerCount is a byte) -
                    // confirm against the declared type whether "< 255" is intended.
                    if (node.KmerCount <= 255)
                    {
                        lock (node)
                        {
                            if (node.KmerCount <= 255)
                            {
                                node.KmerCount++;
                            }
                        }
                    }
                }
            });

            // Ensure producer exceptions are handled.
            producer.Wait();

            // Done filling binary tree
            kmerDataCollection.Dispose();

            //NOTE: To speed enumeration make the nodes into an array and dispose of the collection
            _nodeCount = kmerManager.NodeCount;
            _nodes     = kmerManager.GenerateNodeArray();

            // Generate the links
            GenerateLinks(kmerManager);

            // Since we no longer need to search for values set left and right nodes of child array to null
            // so that they are available for GC if no longer needed
            foreach (DeBruijnNode node in _nodes)
            {
                node.Left = node.Right = null;
            }

            GraphBuildCompleted = true;
        }