/// <summary>
/// Links the nodes of the graph together. For every node the four candidate k-mers that
/// extend it to the right and the four that extend it to the left are looked up in
/// <paramref name="kmerManager"/>; each candidate that is found is registered on the node
/// through <c>SetExtensionNode</c>. Sets <c>LinkGenerationCompleted</c> when finished.
/// </summary>
/// <param name="kmerManager">Dictionary used to look up the node of a candidate k-mer.</param>
private void GenerateLinks(KmerDictionary kmerManager)
{
    // Bit offset of the left-most nucleotide (two bits per nucleotide).
    int leftmostOffset = (KmerLength - 1) * 2;

    // Every bit set except the two that encode the left-most nucleotide.
    ulong clearLeftmostMask = ~(3UL << leftmostOffset);

    Parallel.ForEach(_nodes, node =>
    {
        // Scratch k-mer reused for all eight lookups of this node.
        KmerData32 probe = new KmerData32();
        ulong encoded = node.NodeValue.KmerData;

        // Right extensions: drop the left-most nucleotide (bitwise AND with the mask),
        // then shift left by two bits to open a slot for the appended nucleotide.
        ulong rightStem = (encoded & clearLeftmostMask) << 2;
        for (ulong code = 0; code < 4; code++)
        {
            // Equivalent to "ACGTA" + "N", N being the 0-3 code of the appended nucleotide.
            // Per the original notes, the returned flag says whether the reverse complement
            // was stored instead of the k-mer itself.
            bool storedAsReverseComplement = probe.SetKmerData(rightStem | code, KmerLength);
            DeBruijnNode neighbour = kmerManager.TryGetOld(probe);
            if (neighbour != null)
            {
                node.SetExtensionNode(true, storedAsReverseComplement, neighbour);
            }
        }

        // Left extensions: chop off the right-most nucleotide, then place the new
        // nucleotide into the two left-most bits.
        ulong leftStem = encoded >> 2;
        for (ulong code = 0; code < 4; code++)
        {
            // Equivalent to "N" + "ACGAT".
            bool storedAsReverseComplement = probe.SetKmerData((code << leftmostOffset) | leftStem, KmerLength);
            DeBruijnNode neighbour = kmerManager.TryGetOld(probe);
            if (neighbour != null)
            {
                node.SetExtensionNode(false, storedAsReverseComplement, neighbour);
            }
        }
    });

    LinkGenerationCompleted = true;
}
/// <summary>
/// Releases the kmer manager once it is no longer needed for lookups.
/// Called after additional sequences are searched for by MT Assembler.
/// Does nothing when there is no kmer manager or the graph build has not completed.
/// </summary>
public void DestroyKmerManager()
{
    if (KmerManager == null || !GraphBuildCompleted)
    {
        return;
    }

    // Lookups are finished: drop the search structure and clear each node's
    // Left/Right references so they become eligible for garbage collection.
    KmerManager = null;
    foreach (DeBruijnNode graphNode in _nodes)
    {
        graphNode.Left = null;
        graphNode.Right = null;
    }
}
/// <summary>
/// Build graph nodes and edges from list of k-mers.
/// Creates a node for every unique k-mer (and reverse-complement)
/// in the read. Then, generates adjacency information between nodes
/// by computing pairs of nodes that have overlapping regions
/// between node sequences.
/// </summary>
/// <param name="sequences">List of input sequences.</param>
/// <param name="destroyKmerManagerAfterwards">
/// MT Assembler specific flag. When true the k-mer dictionary is discarded after the links
/// are generated and the Left/Right references of every node are cleared; when false the
/// dictionary is kept in <c>KmerManager</c>.
/// </param>
/// <exception cref="ArgumentNullException">Thrown when <paramref name="sequences"/> is null.</exception>
public void Build(IEnumerable<ISequence> sequences, bool destroyKmerManagerAfterwards = true)
{
    if (sequences == null)
    {
        throw new ArgumentNullException("sequences");
    }

    // Dictionary of k-mers to de Bruijn nodes.
    var kmerManager = new KmerDictionary();
    var kmerDataCollection = new BlockingCollection<List<KmerData32>>();

    // Producer: converts sequences to k-mers and hands them over in blocks.
    Task theProducer = Task.Factory.StartNew(() =>
    {
        // The critical region is entered exactly once here and left exactly once in the
        // finally block below, so Begin/End stay balanced on the worker thread.
        Thread.BeginCriticalRegion();
        try
        {
#if DEBUG
            int parsedReads = 0;
#endif
            var kmerList = new List<KmerData32>(BlockSize);

            foreach (ISequence sequence in sequences)
            {
#if DEBUG
                parsedReads++;
                if (parsedReads % 50000 == 0)
                {
                    //TODO: This is reported each 5 minutes anyway.
                    Console.WriteLine("Parsed: " + parsedReads.ToString() + " reads");
                }
#endif
                // Ignore sequences that are not gap-free DNA or are shorter than one k-mer.
                if (sequence.Alphabet != Alphabets.NoGapDNA || sequence.Count < _kmerLength)
                {
                    Interlocked.Increment(ref this._skippedSequencesCount);
                    Interlocked.Increment(ref this._processedSequencesCount);
                    continue;
                }

                // Back-pressure: while too many blocks are queued, sleep 2 ms at a time so the
                // consumers can drain the collection. This avoids OutOfMemoryException.
                while (kmerDataCollection.Count > StopAddThreshold)
                {
                    Thread.Sleep(2);
                }

                // Convert the sequence to k-mers.
                kmerList.AddRange(KmerData32.GetKmers(sequence, this.KmerLength));

                // Most reads are <=150 basepairs, so handing the list over at AddThreshold
                // should avoid having to grow it beyond BlockSize.
                if (kmerList.Count > AddThreshold)
                {
                    kmerDataCollection.Add(kmerList);
                    kmerList = new List<KmerData32>(BlockSize);
                }

                Interlocked.Increment(ref this._processedSequencesCount);
            }

            // Flush the remainder; an empty list carries no work, so it is not enqueued.
            if (kmerList.Count > 0)
            {
                kmerDataCollection.Add(kmerList);
            }
        }
        finally
        {
            // Always signal completion so the consumers below can terminate.
            kmerDataCollection.CompleteAdding();
            Thread.EndCriticalRegion();
        }
    });

    // Consume k-mers by adding them to the dictionary as nodes.
    // The sequential fallback formerly kept for Mono was unreachable (it was guarded by
    // `if (true)`) and has been removed.
    Parallel.ForEach(
        kmerDataCollection.GetConsumingEnumerable(),
        new ParallelOptions() { MaxDegreeOfParallelism = Environment.ProcessorCount },
        newKmerList =>
        {
            foreach (KmerData32 newKmer in newKmerList)
            {
                // Create Vertex
                DeBruijnNode node = kmerManager.SetNewOrGetOld(newKmer);
                Debug.Assert(newKmer.KmerData == node.NodeValue.KmerData);
            }
        });

    // Make sure the producer is finished - this also rethrows any exception it raised.
    theProducer.Wait();
    kmerDataCollection.Dispose();

    // NOTE: To speed enumeration make the nodes into an array.
    this._nodeCount = kmerManager.NodeCount;
    this._nodes = kmerManager.GenerateNodeArray();

    // Generate the links
    this.GenerateLinks(kmerManager);

    if (destroyKmerManagerAfterwards)
    {
        // Lookups are no longer needed: drop the dictionary and clear the Left/Right
        // references of every node so they are available for GC.
        kmerManager = null;
        foreach (DeBruijnNode node in _nodes)
        {
            node.Left = null;
            node.Right = null;
        }
    }
    else
    {
        KmerManager = kmerManager;
    }

    this.GraphBuildCompleted = true;
}
/// <summary>
/// Adds the links between the nodes of the graph: each node is probed for its four possible
/// right neighbours and its four possible left neighbours in <paramref name="kmerManager"/>,
/// and every neighbour found is attached with <c>SetExtensionNode</c>.
/// <c>LinkGenerationCompleted</c> is set once all nodes have been processed.
/// </summary>
/// <param name="kmerManager">Dictionary in which candidate neighbour k-mers are searched.</param>
private void GenerateLinks(KmerDictionary kmerManager)
{
    // Number of bits between the right-most and the left-most nucleotide of the encoded k-mer.
    int topShift = 2 * (KmerLength - 1);

    // Mask that keeps everything except the two bits of the left-most nucleotide.
    ulong withoutTopMask = ~(3UL << topShift);

    Parallel.ForEach(_nodes, current =>
    {
        KmerData32 candidate = new KmerData32();
        ulong kmerBits = current.NodeValue.KmerData;

        // Right extensions - strip the first position (AND with the mask) and move the
        // rest up by two bits so the new nucleotide fits into the lowest two bits.
        ulong shiftedLeft = (kmerBits & withoutTopMask) << 2;
        for (ulong nucleotide = 0; nucleotide < 4; nucleotide++) // Cycle through A,C,G,T
        {
            // Equivalent to "ACGTA" + "N". According to the original notes the flag returned
            // by SetKmerData indicates that the reverse complement was used for the k-mer.
            ulong rightCandidate = shiftedLeft | nucleotide;
            bool usesReverseComplement = candidate.SetKmerData(rightCandidate, KmerLength);
            DeBruijnNode hit = kmerManager.TryGetOld(candidate);
            if (hit != null)
            {
                current.SetExtensionNode(true, usesReverseComplement, hit);
            }
        }

        // Left extensions - chop off the right-most basepair and put the new nucleotide
        // into the two highest bits of the k-mer.
        ulong shiftedRight = kmerBits >> 2;
        for (ulong nucleotide = 0; nucleotide < 4; nucleotide++) // Cycle through A,C,G,T
        {
            // Equivalent to "N" + "ACGAT".
            ulong leftCandidate = (nucleotide << topShift) | shiftedRight;
            bool usesReverseComplement = candidate.SetKmerData(leftCandidate, KmerLength);
            DeBruijnNode hit = kmerManager.TryGetOld(candidate);
            if (hit != null)
            {
                current.SetExtensionNode(false, usesReverseComplement, hit);
            }
        }
    });

    LinkGenerationCompleted = true;
}
/// <summary>
/// Builds the graph from the given sequences: a producer task converts every accepted
/// sequence into k-mers, parallel consumers add the k-mers to a dictionary of nodes and
/// count their occurrences, and finally the links between the nodes are generated.
/// Sequences whose alphabet is not DNA or that contain a gap symbol are skipped.
/// </summary>
/// <param name="sequences">List of input sequences.</param>
/// <exception cref="ArgumentNullException">Thrown when <paramref name="sequences"/> is null.</exception>
/// <exception cref="ArgumentException">Thrown when the k-mer length exceeds <c>KmerData32.MAX_KMER_LENGTH</c>.</exception>
public void Build(IEnumerable<ISequence> sequences)
{
    // Size of Kmer List to grab, somewhat arbitrary but want to keep list size below large object threshold, which is ~85 kb
    const int blockSize = 4096;

    // When to add list to blocking collection, most short reads are <=151 bp so this should avoid needing to grow the list
    const int addThreshold = blockSize - 151;

    // When to pause adding
    const int stopAddThreshold = 2000000 / blockSize;

    if (sequences == null)
    {
        throw new ArgumentNullException("sequences");
    }

    if (KmerLength > KmerData32.MAX_KMER_LENGTH)
    {
        throw new ArgumentException(Properties.Resource.KmerLengthGreaterThan31);
    }

    // A dictionary of k-mers to de Bruijn nodes
    KmerDictionary kmerManager = new KmerDictionary();

    // Create the producer task.
    var kmerDataCollection = new BlockingCollection<List<KmerData32>>();
    Task producer = Task.Factory.StartNew(() =>
    {
        try
        {
            List<KmerData32> kmerList = new List<KmerData32>(blockSize);

            IAlphabet alphabet = Alphabets.DNA;
            HashSet<byte> gapSymbols;
            alphabet.TryGetGapSymbols(out gapSymbols);

            // When no gap symbols are reported there is nothing to scan for.
            bool hasGapSymbols = gapSymbols != null && gapSymbols.Count > 0;

            // Generate the kmers from the sequences
            foreach (ISequence sequence in sequences)
            {
                // If the sequence alphabet is not of type DNA then ignore it.
                bool skipSequence = sequence.Alphabet != Alphabets.DNA;

                if (!skipSequence && hasGapSymbols)
                {
                    // If the sequence contains any gap symbol then ignore it. One pass over
                    // the sequence with a set lookup instead of one pass per gap symbol.
                    for (long index = 0; index < sequence.Count; ++index)
                    {
                        if (gapSymbols.Contains(sequence[index]))
                        {
                            skipSequence = true;
                            break;
                        }
                    }
                }

                if (skipSequence)
                {
                    Interlocked.Increment(ref _skippedSequencesCount);
                    Interlocked.Increment(ref _processedSequencesCount);
                    continue;
                }

                // If the blocking collection holds more than ~2 million kmers, wait 2 sec at
                // a time so the consumers can remove some kmers and create the nodes.
                // This will avoid OutOfMemoryException.
                while (kmerDataCollection.Count > stopAddThreshold)
                {
                    Task.Delay(TimeSpan.FromSeconds(2)).Wait();
                }

                // Convert sequences to k-mers
                kmerList.AddRange(KmerData32.GetKmers(sequence, KmerLength));

                // Most reads are <=150 basepairs, so this should avoid having to grow the list
                // by keeping it below blockSize
                if (kmerList.Count > addThreshold)
                {
                    kmerDataCollection.Add(kmerList);
                    kmerList = new List<KmerData32>(blockSize);
                }

                Interlocked.Increment(ref _processedSequencesCount);
            }

            // Flush the remainder; an empty list carries no work, so it is not enqueued.
            if (kmerList.Count > 0)
            {
                kmerDataCollection.Add(kmerList);
            }
        }
        finally
        {
            // Always signal completion so the consumers below can terminate.
            kmerDataCollection.CompleteAdding();
        }
    });

    // Consume k-mers by adding them to the dictionary as nodes
    Parallel.ForEach(kmerDataCollection.GetConsumingEnumerable(), newKmerList =>
    {
        foreach (KmerData32 newKmer in newKmerList)
        {
            // Create Vertex
            DeBruijnNode node = kmerManager.SetNewOrGetOld(newKmer);

            // Cheap unlocked pre-check, then re-check under the lock: without the second
            // check two workers could both pass the test and both increment past the cap.
            // NOTE(review): the cap value is kept from the original; confirm it matches the
            // range of the KmerCount type.
            if (node.KmerCount <= 255)
            {
                lock (node)
                {
                    if (node.KmerCount <= 255)
                    {
                        node.KmerCount++;
                    }
                }
            }
        }
    });

    // Ensure producer exceptions are handled.
    producer.Wait();

    // Done filling the dictionary
    kmerDataCollection.Dispose();

    // NOTE: To speed enumeration make the nodes into an array
    _nodeCount = kmerManager.NodeCount;
    _nodes = kmerManager.GenerateNodeArray();

    // Generate the links
    GenerateLinks(kmerManager);

    // Since we no longer need to search for values set left and right nodes of child array to null
    // so that they are available for GC if no longer needed
    foreach (DeBruijnNode node in _nodes)
    {
        node.Left = node.Right = null;
    }

    GraphBuildCompleted = true;
}
/// <summary>
/// Builds the graph from the given sequences. A producer task turns each accepted sequence
/// into k-mers and queues them in blocks; parallel consumers insert the k-mers into a
/// dictionary of nodes and count occurrences; afterwards the node links are generated.
/// Sequences with a non-DNA alphabet or containing a gap symbol are skipped.
/// </summary>
/// <param name="sequences">List of input sequences.</param>
/// <exception cref="ArgumentNullException">Thrown when <paramref name="sequences"/> is null.</exception>
/// <exception cref="ArgumentException">Thrown when the k-mer length exceeds <c>KmerData32.MAX_KMER_LENGTH</c>.</exception>
public void Build(IEnumerable<ISequence> sequences)
{
    // Size of Kmer List to grab, somewhat arbitrary but want to keep list size below large object threshold, which is ~85 kb
    const int blockSize = 4096;

    // When to add list to blocking collection, most short reads are <=151 bp so this should avoid needing to grow the list
    const int addThreshold = blockSize - 151;

    // When to pause adding
    const int stopAddThreshold = 2000000 / blockSize;

    if (sequences == null)
    {
        throw new ArgumentNullException("sequences");
    }

    if (KmerLength > KmerData32.MAX_KMER_LENGTH)
    {
        throw new ArgumentException(Properties.Resource.KmerLengthGreaterThan31);
    }

    // A dictionary of k-mers to de Bruijn nodes
    KmerDictionary kmerManager = new KmerDictionary();

    // Create the producer task.
    var kmerDataCollection = new BlockingCollection<List<KmerData32>>();
    Task producer = Task.Factory.StartNew(() =>
    {
        try
        {
            List<KmerData32> pendingKmers = new List<KmerData32>(blockSize);

            IAlphabet alphabet = Alphabets.DNA;
            HashSet<byte> gapSymbols;
            alphabet.TryGetGapSymbols(out gapSymbols);

            // Guard against an alphabet that reports no gap symbols at all.
            bool scanForGaps = gapSymbols != null && gapSymbols.Count > 0;

            // Generate the kmers from the sequences
            foreach (ISequence sequence in sequences)
            {
                // Sequences whose alphabet is not DNA are ignored.
                bool skipSequence = sequence.Alphabet != Alphabets.DNA;

                if (!skipSequence && scanForGaps)
                {
                    // Sequences containing any gap symbol are ignored. The sequence is
                    // walked once and each symbol is tested against the gap set.
                    for (long index = 0; index < sequence.Count; ++index)
                    {
                        if (gapSymbols.Contains(sequence[index]))
                        {
                            skipSequence = true;
                            break;
                        }
                    }
                }

                if (skipSequence)
                {
                    Interlocked.Increment(ref _skippedSequencesCount);
                    Interlocked.Increment(ref _processedSequencesCount);
                    continue;
                }

                // While more than ~2 million kmers are queued, wait 2 sec at a time so the
                // consumers can take some kmers and create the nodes.
                // This will avoid OutOfMemoryException.
                while (kmerDataCollection.Count > stopAddThreshold)
                {
                    Task.Delay(TimeSpan.FromSeconds(2)).Wait();
                }

                // Convert sequences to k-mers
                pendingKmers.AddRange(KmerData32.GetKmers(sequence, KmerLength));

                // Most reads are <=150 basepairs, so handing the list over at addThreshold
                // should avoid having to grow it beyond blockSize
                if (pendingKmers.Count > addThreshold)
                {
                    kmerDataCollection.Add(pendingKmers);
                    pendingKmers = new List<KmerData32>(blockSize);
                }

                Interlocked.Increment(ref _processedSequencesCount);
            }

            // Hand over whatever is left; nothing is queued when the list is empty.
            if (pendingKmers.Count > 0)
            {
                kmerDataCollection.Add(pendingKmers);
            }
        }
        finally
        {
            // Completion must always be signalled, otherwise the consumers never finish.
            kmerDataCollection.CompleteAdding();
        }
    });

    // Consume k-mers by adding them to the dictionary as nodes
    Parallel.ForEach(kmerDataCollection.GetConsumingEnumerable(), newKmerList =>
    {
        foreach (KmerData32 newKmer in newKmerList)
        {
            // Create Vertex
            DeBruijnNode node = kmerManager.SetNewOrGetOld(newKmer);

            // The cap is tested once without the lock (fast path for saturated nodes) and
            // again while holding it, so concurrent workers cannot both step over the cap.
            // NOTE(review): cap value carried over unchanged; verify it against the
            // KmerCount type.
            if (node.KmerCount <= 255)
            {
                lock (node)
                {
                    if (node.KmerCount <= 255)
                    {
                        node.KmerCount++;
                    }
                }
            }
        }
    });

    // Ensure producer exceptions are handled.
    producer.Wait();

    // Done filling the dictionary
    kmerDataCollection.Dispose();

    // NOTE: To speed enumeration make the nodes into an array
    _nodeCount = kmerManager.NodeCount;
    _nodes = kmerManager.GenerateNodeArray();

    // Generate the links
    GenerateLinks(kmerManager);

    // Since we no longer need to search for values set left and right nodes of child array to null
    // so that they are available for GC if no longer needed
    foreach (DeBruijnNode node in _nodes)
    {
        node.Left = node.Right = null;
    }

    GraphBuildCompleted = true;
}