/// <summary>
/// Builds the graph from the supplied sequences.
/// A producer task converts every usable sequence into k-mers
/// (<c>KmerData32.GetKmers</c>) and queues them in blocks; the calling thread
/// consumes the blocks in parallel, registering each k-mer in a
/// <c>KmerDictionary</c> and bumping the node's <c>KmerCount</c>. Afterwards the
/// node array and the links between nodes are generated.
/// Sequences whose alphabet is not <c>Alphabets.DNA</c>, or that contain a DNA
/// gap symbol, are skipped; they are counted as both skipped and processed.
/// </summary>
/// <param name="sequences">Input sequences to build the graph from.</param>
/// <exception cref="ArgumentNullException"><paramref name="sequences"/> is null.</exception>
/// <exception cref="ArgumentException">KmerLength is greater than KmerData32.MAX_KMER_LENGTH.</exception>
public void Build(IEnumerable<ISequence> sequences)
{
    // Size of the k-mer list to hand over at once. Somewhat arbitrary, but the list
    // should stay below the large object threshold, which is ~85 kb.
    const int blockSize = 4096;

    // When to add the list to the blocking collection. Most short reads are <= 151 bp,
    // so flushing at this point should avoid having to grow the list.
    const int addThreshold = blockSize - 151;

    // Number of queued lists (roughly 2 million k-mers) above which the producer pauses.
    const int stopAddThreshold = 2000000 / blockSize;

    if (sequences == null)
    {
        throw new ArgumentNullException("sequences");
    }

    if (KmerLength > KmerData32.MAX_KMER_LENGTH)
    {
        throw new ArgumentException(Properties.Resource.KmerLengthGreaterThan31);
    }

    // A dictionary of k-mers to de Bruijn nodes.
    KmerDictionary kmerManager = new KmerDictionary();

    // Queue between the producer task and the parallel consumers.
    var kmerDataCollection = new BlockingCollection<List<KmerData32>>();

    // Create the producer task.
    Task producer = Task.Factory.StartNew(() =>
    {
        try
        {
            List<KmerData32> kmerList = new List<KmerData32>(blockSize);

            IAlphabet alphabet = Alphabets.DNA;
            HashSet<byte> gapSymbols;
            alphabet.TryGetGapSymbols(out gapSymbols);

            // Generate the k-mers from the sequences.
            foreach (ISequence sequence in sequences)
            {
                // Ignore sequences whose alphabet is not DNA.
                bool skipSequence = sequence.Alphabet != Alphabets.DNA;

                // Ignore sequences that contain any gap symbol. One pass over the
                // sequence with a set lookup replaces one full pass per gap symbol.
                // The null guard covers TryGetGapSymbols leaving the set unassigned.
                if (!skipSequence && gapSymbols != null)
                {
                    for (long index = 0; index < sequence.Count; ++index)
                    {
                        if (gapSymbols.Contains(sequence[index]))
                        {
                            skipSequence = true;
                            break;
                        }
                    }
                }

                if (skipSequence)
                {
                    Interlocked.Increment(ref _skippedSequencesCount);
                    Interlocked.Increment(ref _processedSequencesCount);
                    continue;
                }

                // If the blocking collection holds more than ~2 million k-mers, wait in
                // 2 second steps so the consumers can drain it and create the nodes.
                // This avoids an OutOfMemoryException.
                // NOTE(review): if the consumer loop fails, nothing drains the queue and
                // this wait never ends; consider wiring in a CancellationToken.
                while (kmerDataCollection.Count > stopAddThreshold)
                {
                    Task.Delay(TimeSpan.FromSeconds(2)).Wait();
                }

                // Convert the sequence to k-mers.
                kmerList.AddRange(KmerData32.GetKmers(sequence, KmerLength));

                // Most reads are <= 150 base pairs, so flushing here should keep the
                // list from growing beyond blockSize.
                if (kmerList.Count > addThreshold)
                {
                    kmerDataCollection.Add(kmerList);

                    // Size the replacement with blockSize (was a stray literal 4092).
                    kmerList = new List<KmerData32>(blockSize);
                }

                Interlocked.Increment(ref _processedSequencesCount);
            }

            // Flush the remainder. The list is always at or below addThreshold here
            // (larger lists were flushed above), so only skip it when it is empty.
            if (kmerList.Count > 0)
            {
                kmerDataCollection.Add(kmerList);
            }
        }
        finally
        {
            // Always unblock the consumers, even when the producer fails.
            kmerDataCollection.CompleteAdding();
        }
    });

    // Consume k-mers by adding them to the binary tree structure as nodes.
    Parallel.ForEach(kmerDataCollection.GetConsumingEnumerable(), newKmerList =>
    {
        foreach (KmerData32 newKmer in newKmerList)
        {
            // Create vertex.
            DeBruijnNode node = kmerManager.SetNewOrGetOld(newKmer);

            // The unlocked check is only a fast path; the decision to increment is
            // repeated under the lock so two threads cannot both pass the limit check
            // and increment past it.
            // NOTE(review): if KmerCount is a byte, "<= 255" is always true and the
            // increment wraps to 0 - confirm whether "< 255" was intended.
            if (node.KmerCount <= 255)
            {
                lock (node)
                {
                    if (node.KmerCount <= 255)
                    {
                        node.KmerCount++;
                    }
                }
            }
        }
    });

    // Ensure producer exceptions are observed (Wait rethrows them).
    producer.Wait();

    // Done filling the binary tree.
    kmerDataCollection.Dispose();

    // NOTE: To speed enumeration make the nodes into an array and dispose of the collection.
    _nodeCount = kmerManager.NodeCount;
    _nodes = kmerManager.GenerateNodeArray();

    // Generate the links.
    GenerateLinks(kmerManager);

    // Since we no longer need to search for values, set the left and right nodes of
    // the child array to null so that they are available for GC if no longer needed.
    foreach (DeBruijnNode node in _nodes)
    {
        node.Left = node.Right = null;
    }

    GraphBuildCompleted = true;
}
/// <summary>
/// Build graph nodes and edges from list of k-mers.
/// Creates a node for every unique k-mer (and reverse-complement)
/// in the read. Then, generates adjacency information between nodes
/// by computing pairs of nodes that have overlapping regions
/// between node sequences.
/// Sequences whose alphabet is not <c>Alphabets.NoGapDNA</c>, or that are shorter
/// than the k-mer length, are skipped; they are counted as both skipped and processed.
/// </summary>
/// <param name="sequences">List of input sequences.</param>
/// <param name="destroyKmerManagerAfterwards">
/// MT Assembler specific flag. When true (the default) the k-mer dictionary is dropped
/// and the Left/Right references of every node are cleared; when false the dictionary
/// is stored in <c>KmerManager</c> instead.
/// </param>
/// <exception cref="ArgumentNullException"><paramref name="sequences"/> is null.</exception>
public void Build(IEnumerable<ISequence> sequences, bool destroyKmerManagerAfterwards = true)
{
    if (sequences == null)
    {
        throw new ArgumentNullException("sequences");
    }

    // Build the dictionary of k-mers to de Bruijn nodes.
    var kmerManager = new KmerDictionary();
    var kmerDataCollection = new BlockingCollection<List<KmerData32>>();

    // Create the producer task.
    Task theProducer = Task.Factory.StartNew(() =>
    {
        // The critical region is entered once and left once (in the outer finally).
        // Previously EndCriticalRegion ran inside the loop for every processed
        // sequence, leaving the Begin/End calls unbalanced.
        Thread.BeginCriticalRegion();
        try
        {
            try
            {
#if DEBUG
                int parsedReads = 0;
#endif
                var kmerList = new List<KmerData32>(BlockSize);

                // Generate the k-mers from the sequences.
                foreach (ISequence sequence in sequences)
                {
#if DEBUG
                    parsedReads++;
                    if (parsedReads % 50000 == 0)
                    {
                        //TODO: This is reported each 5 minutes anyway.
                        Console.WriteLine("Parsed: " + parsedReads.ToString() + " reads");
                    }
#endif

                    // Ignore sequences that are not gap-free DNA or are too short to
                    // yield a single k-mer.
                    if (sequence.Alphabet != Alphabets.NoGapDNA || sequence.Count < _kmerLength)
                    {
                        Interlocked.Increment(ref this._skippedSequencesCount);
                        Interlocked.Increment(ref this._processedSequencesCount);
                        continue;
                    }

                    // While the blocking collection holds more than StopAddThreshold
                    // lists, poll in 2 millisecond steps so the consumers can drain it
                    // and create the nodes. This avoids an OutOfMemoryException.
                    // NOTE(review): if the consumer loop fails, nothing drains the queue
                    // and this wait never ends; consider wiring in a CancellationToken.
                    while (kmerDataCollection.Count > StopAddThreshold)
                    {
                        Thread.Sleep(2);
                    }

                    // Convert the sequence to k-mers.
                    var kmers = KmerData32.GetKmers(sequence, this.KmerLength);
                    kmerList.AddRange(kmers);

                    // Most reads are <= 150 base pairs, so flushing here should keep
                    // the list from growing beyond BlockSize.
                    if (kmerList.Count > AddThreshold)
                    {
                        kmerDataCollection.Add(kmerList);
                        kmerList = new List<KmerData32>(BlockSize);
                    }

                    Interlocked.Increment(ref this._processedSequencesCount);
                }

                // Flush the remainder. The list is always at or below AddThreshold here
                // (larger lists were flushed above), so only skip it when it is empty.
                if (kmerList.Count > 0)
                {
                    kmerDataCollection.Add(kmerList);
                }
            }
            finally
            {
                // Always unblock the consumers, even when the producer fails.
                kmerDataCollection.CompleteAdding();
            }
        }
        finally
        {
            Thread.EndCriticalRegion();
        }
    });

    // Consume k-mers by adding them to the binary tree structure as nodes.
    // (A sequential fallback for Mono used to live in an unreachable else-branch of
    // an "if (true)" and has been removed.)
    Parallel.ForEach(
        kmerDataCollection.GetConsumingEnumerable(),
        new ParallelOptions() { MaxDegreeOfParallelism = Environment.ProcessorCount },
        newKmerList =>
        {
            foreach (KmerData32 newKmer in newKmerList)
            {
                // Create vertex.
                DeBruijnNode node = kmerManager.SetNewOrGetOld(newKmer);
                Debug.Assert(newKmer.KmerData == node.NodeValue.KmerData);
            }
        });

    // Done filling the binary tree.
    theProducer.Wait(); // Make sure the task is finished - also rethrows any exception here.
    kmerDataCollection.Dispose();

    // NOTE: To speed enumeration make the nodes into an array and dispose of the collection.
    this._nodeCount = kmerManager.NodeCount;
    this._nodes = kmerManager.GenerateNodeArray();

    // Generate the links.
    this.GenerateLinks(kmerManager);

    if (destroyKmerManagerAfterwards)
    {
        // Since we no longer need to search for values, delete the tree structure and
        // also set the left and right nodes of the child array to null so that they
        // are available for GC if no longer needed.
        kmerManager = null;
        foreach (DeBruijnNode node in _nodes)
        {
            node.Left = null;
            node.Right = null;
        }
    }
    else
    {
        KmerManager = kmerManager;
    }

    this.GraphBuildCompleted = true;
}
/// <summary>
/// Builds the graph from the supplied sequences.
/// A producer task turns each accepted sequence into k-mers
/// (<c>KmerData32.GetKmers</c>) and hands them over in blocks through a blocking
/// collection; the blocks are consumed in parallel, each k-mer being looked up or
/// created in a <c>KmerDictionary</c> and its node's <c>KmerCount</c> incremented.
/// Finally the node array is materialised and the links are generated.
/// A sequence is skipped (and counted as skipped and processed) when its alphabet
/// is not <c>Alphabets.DNA</c> or when it contains a DNA gap symbol.
/// </summary>
/// <param name="sequences">Input sequences to build the graph from.</param>
/// <exception cref="ArgumentNullException"><paramref name="sequences"/> is null.</exception>
/// <exception cref="ArgumentException">KmerLength is greater than KmerData32.MAX_KMER_LENGTH.</exception>
public void Build(IEnumerable<ISequence> sequences)
{
    // Size of the k-mer list to hand over at once. Somewhat arbitrary, but the list
    // should stay below the large object threshold, which is ~85 kb.
    const int blockSize = 4096;

    // When to add the list to the blocking collection. Most short reads are <= 151 bp,
    // so flushing at this point should avoid having to grow the list.
    const int addThreshold = blockSize - 151;

    // Number of queued lists (roughly 2 million k-mers) above which the producer pauses.
    const int stopAddThreshold = 2000000 / blockSize;

    if (sequences == null)
    {
        throw new ArgumentNullException("sequences");
    }

    if (KmerLength > KmerData32.MAX_KMER_LENGTH)
    {
        throw new ArgumentException(Properties.Resource.KmerLengthGreaterThan31);
    }

    // A dictionary of k-mers to de Bruijn nodes.
    KmerDictionary kmerManager = new KmerDictionary();

    // Queue between the producer task and the parallel consumers.
    var kmerDataCollection = new BlockingCollection<List<KmerData32>>();

    // Create the producer task.
    Task producer = Task.Factory.StartNew(() =>
    {
        try
        {
            List<KmerData32> pendingKmers = new List<KmerData32>(blockSize);

            IAlphabet alphabet = Alphabets.DNA;
            HashSet<byte> gapSymbols;
            alphabet.TryGetGapSymbols(out gapSymbols);

            // Generate the k-mers from the sequences.
            foreach (ISequence sequence in sequences)
            {
                // Ignore sequences whose alphabet is not DNA.
                bool skipSequence = sequence.Alphabet != Alphabets.DNA;

                // Ignore sequences containing any gap symbol. A single scan with a set
                // lookup replaces one full scan per gap symbol. The null guard covers
                // TryGetGapSymbols leaving the set unassigned.
                if (!skipSequence && gapSymbols != null)
                {
                    for (long position = 0; position < sequence.Count; ++position)
                    {
                        if (gapSymbols.Contains(sequence[position]))
                        {
                            skipSequence = true;
                            break;
                        }
                    }
                }

                if (skipSequence)
                {
                    Interlocked.Increment(ref _skippedSequencesCount);
                    Interlocked.Increment(ref _processedSequencesCount);
                    continue;
                }

                // If the blocking collection holds more than ~2 million k-mers, wait in
                // 2 second steps so the consumers can drain it and create the nodes.
                // This avoids an OutOfMemoryException.
                // NOTE(review): if the consumer loop fails, nothing drains the queue and
                // this wait never ends; consider wiring in a CancellationToken.
                while (kmerDataCollection.Count > stopAddThreshold)
                {
                    Task.Delay(TimeSpan.FromSeconds(2)).Wait();
                }

                // Convert the sequence to k-mers.
                pendingKmers.AddRange(KmerData32.GetKmers(sequence, KmerLength));

                // Most reads are <= 150 base pairs, so flushing here should keep the
                // list from growing beyond blockSize.
                if (pendingKmers.Count > addThreshold)
                {
                    kmerDataCollection.Add(pendingKmers);

                    // Size the replacement with blockSize (was a stray literal 4092).
                    pendingKmers = new List<KmerData32>(blockSize);
                }

                Interlocked.Increment(ref _processedSequencesCount);
            }

            // Flush the remainder. The list is always at or below addThreshold here
            // (larger lists were flushed above), so only skip it when it is empty.
            if (pendingKmers.Count > 0)
            {
                kmerDataCollection.Add(pendingKmers);
            }
        }
        finally
        {
            // Always unblock the consumers, even when the producer fails.
            kmerDataCollection.CompleteAdding();
        }
    });

    // Consume k-mers by adding them to the binary tree structure as nodes.
    Parallel.ForEach(kmerDataCollection.GetConsumingEnumerable(), newKmerList =>
    {
        foreach (KmerData32 newKmer in newKmerList)
        {
            // Create vertex.
            DeBruijnNode node = kmerManager.SetNewOrGetOld(newKmer);

            // The unlocked check is only a fast path; the limit is re-tested under the
            // lock so that two threads cannot both pass the check and increment past it.
            // NOTE(review): if KmerCount is a byte, "<= 255" is always true and the
            // increment wraps to 0 - confirm whether "< 255" was intended.
            if (node.KmerCount <= 255)
            {
                lock (node)
                {
                    if (node.KmerCount <= 255)
                    {
                        node.KmerCount++;
                    }
                }
            }
        }
    });

    // Ensure producer exceptions are observed (Wait rethrows them).
    producer.Wait();

    // Done filling the binary tree.
    kmerDataCollection.Dispose();

    // NOTE: To speed enumeration make the nodes into an array and dispose of the collection.
    _nodeCount = kmerManager.NodeCount;
    _nodes = kmerManager.GenerateNodeArray();

    // Generate the links.
    GenerateLinks(kmerManager);

    // Since we no longer need to search for values, set the left and right nodes of
    // the child array to null so that they are available for GC if no longer needed.
    foreach (DeBruijnNode node in _nodes)
    {
        node.Left = node.Right = null;
    }

    GraphBuildCompleted = true;
}