/// <summary>
/// Validates TryGetDefaultGapSymbol and TryGetGapSymbols for the alphabet selected by
/// <paramref name="option"/>: the default gap symbol must be '-' and the complete
/// set of gap symbols must consist of exactly "-".
/// </summary>
/// <param name="option">Alphabet to validate (Protein, Rna or Dna).</param>
void ValidateTryGetDefaultGapSymbol(AlphabetsTypes option)
{
    IAlphabet alphabetInstance = null;
    switch (option)
    {
        case AlphabetsTypes.Protein:
            alphabetInstance = ProteinAlphabet.Instance;
            break;
        case AlphabetsTypes.Rna:
            alphabetInstance = RnaAlphabet.Instance;
            break;
        case AlphabetsTypes.Dna:
            alphabetInstance = DnaAlphabet.Instance;
            break;
        default:
            // Fail with a readable message instead of a NullReferenceException below.
            Assert.Fail(string.Concat("Unsupported alphabet option: ", option));
            break;
    }

    // Default gap symbol.
    byte outputByte;
    alphabetInstance.TryGetDefaultGapSymbol(out outputByte);
    Assert.AreEqual('-', (char)outputByte);
    ApplicationLog.WriteLine(string.Concat(@"Alphabets BVT: Validation of Try Default gap symbol for ", option, " completed successfully."));

    // Full gap symbol set, rendered as a string so it can be compared in one assertion.
    HashSet<byte> outputGapSymbol;
    alphabetInstance.TryGetGapSymbols(out outputGapSymbol);
    string outputGapString = new string(outputGapSymbol.Select(a => (char)a).ToArray());
    Assert.AreEqual("-", outputGapString);
    ApplicationLog.WriteLine(string.Concat(@"Alphabets BVT: Validation of Try Get gap symbol for ", option, " completed successfully."));
}
/// <summary>
/// Filters the input reads, keeping only those that contain neither ambiguous
/// symbols nor gap symbols. The alphabet used for the check is the ambiguous
/// alphabet derived from the first read. Evaluation is lazy.
/// </summary>
/// <param name="reads">The reads to validate.</param>
/// <returns>
/// The valid reads. Nothing is yielded when <paramref name="reads"/> is empty
/// (or its first element is null).
/// </returns>
private IEnumerable<ISequence> ValidateReads(IEnumerable<ISequence> reads)
{
    // An empty input simply has no valid reads; do not throw from First().
    ISequence firstRead = reads.FirstOrDefault();
    if (firstRead == null)
    {
        yield break;
    }

    IAlphabet readAlphabet = Alphabets.GetAmbiguousAlphabet(firstRead.Alphabet);
    HashSet<byte> ambiguousSymbols = readAlphabet.GetAmbiguousSymbols();

    HashSet<byte> gapSymbols;
    readAlphabet.TryGetGapSymbols(out gapSymbols);

    foreach (ISequence read in reads)
    {
        // gapSymbols is guarded because the Try* call's result is not checked here.
        bool isValid = read.All(
            c => !ambiguousSymbols.Contains(c) && (gapSymbols == null || !gapSymbols.Contains(c)));

        if (isValid)
        {
            yield return read;
        }
    }
}
/// <summary>
/// Filters the input reads, keeping only those that have a valid paired sequence ID,
/// whose alphabet reports no ambiguity, and that contain no gap symbol. Gap symbols
/// are taken from the ambiguous alphabet derived from the first read.
/// Each returned read has its ID replaced by the ID without the "other info" part.
/// Evaluation is lazy.
/// </summary>
/// <param name="reads">The reads to validate.</param>
/// <returns>
/// The valid reads. Nothing is yielded when <paramref name="reads"/> is empty
/// (or its first element is null).
/// </returns>
private IEnumerable<ISequence> ValidateReads(IEnumerable<ISequence> reads)
{
    // An empty input simply has no valid reads; do not throw from First().
    ISequence firstRead = reads.FirstOrDefault();
    if (firstRead == null)
    {
        yield break;
    }

    IAlphabet readAlphabet = Alphabets.GetAmbiguousAlphabet(firstRead.Alphabet);
    HashSet<byte> gapSymbols;
    readAlphabet.TryGetGapSymbols(out gapSymbols);

    foreach (ISequence read in reads)
    {
        // Only the validity of the ID matters here; the parsed parts are not used.
        string originalSequenceId;
        string pairedReadType;
        bool forward;
        string libraryName;
        if (!Bio.Util.Helper.ValidatePairedSequenceId(read.ID, out originalSequenceId, out forward, out pairedReadType, out libraryName))
        {
            continue;
        }

        if (read.Alphabet.HasAmbiguity)
        {
            continue;
        }

        bool gapSymbolFound = false;
        for (long index = 0; index < read.Count; index++)
        {
            if (gapSymbols.Contains(read[index]))
            {
                gapSymbolFound = true;
                break; // one gap is enough to reject the read
            }
        }

        if (gapSymbolFound)
        {
            continue;
        }

        // Exclude the otherinfo if any.
        read.ID = Bio.Util.Helper.GetReadIdExcludingOtherInfo(read.ID);
        yield return read;
    }
}
/// <summary>
/// Build graph nodes and edges from list of k-mers.
/// Creates a node for every unique k-mer (and reverse-complement)
/// in the read. Then, generates adjacency information between nodes
/// by computing pairs of nodes that have overlapping regions
/// between node sequences.
/// </summary>
/// <remarks>
/// The calling thread produces k-mer nodes into a blocking collection while a
/// background task consumes them and inserts them into the binary tree rooted at
/// <c>root</c>. Sequences containing ambiguous or gap symbols are skipped.
/// </remarks>
/// <param name="sequences">List of input sequences.</param>
/// <exception cref="ArgumentNullException">If <paramref name="sequences"/> is null.</exception>
/// <exception cref="ArgumentException">If the k-mer length is not positive.</exception>
public void Build(IEnumerable<ISequence> sequences)
{
    if (sequences == null)
    {
        throw new ArgumentNullException("sequences");
    }

    if (this.kmerLength <= 0)
    {
        throw new ArgumentException(Properties.Resource.KmerLengthShouldBePositive);
    }

    // Resolve the alphabet data before any task is started, so that a failure here
    // (e.g. an empty input making First() throw) cannot leave a consumer task waiting.
    IAlphabet alphabet = sequences.First().Alphabet;
    byte[] symbolMap = alphabet.GetSymbolValueMap();
    HashSet<byte> ambiguousSymbols = alphabet.GetAmbiguousSymbols();
    HashSet<byte> gapSymbols;
    alphabet.TryGetGapSymbols(out gapSymbols);

    BlockingCollection<DeBruijnNode> kmerDataCollection = new BlockingCollection<DeBruijnNode>();

    // Consumer: inserts every produced node into the binary search tree.
    Task buildKmers = Task.Factory.StartNew(() =>
    {
        while (!kmerDataCollection.IsCompleted)
        {
            DeBruijnNode newNode = null;
            if (kmerDataCollection.TryTake(out newNode, -1))
            {
                // First element being added becomes the root of the tree.
                if (this.root == null)
                {
                    this.root = newNode;
                    this.NodeCount++;
                    continue;
                }

                int result = 0;
                DeBruijnNode temp = this.root;
                DeBruijnNode parent = this.root;

                // Search the tree where the new node should be inserted.
                while (temp != null)
                {
                    result = newNode.NodeValue.CompareTo(temp.NodeValue);
                    if (result == 0)
                    {
                        // Duplicate k-mer: bump the count (only while it is <= 255) and
                        // always stop searching. Breaking unconditionally guarantees the
                        // loop terminates even when the count is no longer incremented.
                        if (temp.KmerCount <= 255)
                        {
                            temp.KmerCount++;
                        }

                        break;
                    }
                    else if (result > 0)
                    {
                        // Move to right sub-tree.
                        parent = temp;
                        temp = temp.Right;
                    }
                    else
                    {
                        // Move to left sub-tree.
                        parent = temp;
                        temp = temp.Left;
                    }
                }

                // Position found (result == 0 means duplicate, nothing to attach).
                if (result > 0)
                {
                    parent.Right = newNode;
                    NodeCount++;
                }
                else if (result < 0)
                {
                    parent.Left = newNode;
                    NodeCount++;
                }
            }
        }
    });

    try
    {
        // Producer: generate the kmers from the sequences.
        foreach (ISequence sequence in sequences)
        {
            // If the blocking collection holds more than 2 million nodes, wait in
            // 5 millisecond steps so that the consumer can drain some of them.
            // This is meant to avoid an OutOfMemoryException.
            while (kmerDataCollection.Count > 2000000)
            {
                System.Threading.Thread.Sleep(5);
            }

            long count = sequence.Count;
            byte[] convertedSymbols = new byte[count];
            bool skipSequence = false;

            // Map every symbol through the alphabet's symbol map; skip the whole
            // sequence as soon as an ambiguous or gap symbol is seen.
            for (long index = 0; index < count; index++)
            {
                convertedSymbols[index] = symbolMap[sequence[index]];
                if (ambiguousSymbols.Contains(convertedSymbols[index])
                    || gapSymbols.Contains(convertedSymbols[index]))
                {
                    skipSequence = true;
                    break;
                }
            }

            if (skipSequence)
            {
                continue;
            }

            Sequence convertedSequence = new Sequence(sequence.Alphabet, convertedSymbols, false);

            // Generate the kmers from each sequence.
            for (long i = 0; i <= count - this.kmerLength; ++i)
            {
                IKmerData kmerData = this.GetNewKmerData();
                bool orientation = kmerData.SetKmerData(convertedSequence, i, this.kmerLength);
                kmerDataCollection.Add(new DeBruijnNode(kmerData, orientation, 1));
            }
        }
    }
    finally
    {
        // Always signal completion, even when producing failed; otherwise the
        // consumer would block forever in TryTake.
        kmerDataCollection.CompleteAdding();
        try
        {
            Task.WaitAll(buildKmers);
        }
        finally
        {
            kmerDataCollection.Dispose();
        }
    }

    // Generate the links
    this.GenerateLinks();
}
/// <summary>
/// Build graph nodes and edges from list of k-mers.
/// Creates a node for every unique k-mer (and reverse-complement)
/// in the read. Then, generates adjacency information between nodes
/// by computing pairs of nodes that have overlapping regions
/// between node sequences.
/// </summary>
/// <remarks>
/// One task produces k-mer nodes from the DNA sequences (non-DNA sequences and
/// sequences containing gap symbols are counted as skipped), a second task consumes
/// them and inserts them into the binary tree rooted at <c>root</c>.
/// </remarks>
/// <param name="sequences">List of input sequences.</param>
/// <exception cref="ArgumentNullException">If <paramref name="sequences"/> is null.</exception>
/// <exception cref="ArgumentException">If the k-mer length is not positive or exceeds 32.</exception>
public void Build(IEnumerable<ISequence> sequences)
{
    if (sequences == null)
    {
        throw new ArgumentNullException("sequences");
    }

    if (this.kmerLength <= 0)
    {
        throw new ArgumentException(Properties.Resource.KmerLengthShouldBePositive);
    }

    if (this.kmerLength > 32)
    {
        throw new ArgumentException(Properties.Resource.KmerLengthGreaterThan32);
    }

    BlockingCollection<DeBruijnNode> kmerDataCollection = new BlockingCollection<DeBruijnNode>();
    try
    {
        // Producer: turns every accepted sequence into k-mer nodes.
        Task createKmers = Task.Factory.StartNew(() =>
        {
            try
            {
                IAlphabet alphabet = Alphabets.DNA;
                HashSet<byte> gapSymbols;
                alphabet.TryGetGapSymbols(out gapSymbols);

                // Generate the kmers from the sequences
                foreach (ISequence sequence in sequences)
                {
                    // If the sequence alphabet is not of type DNA then ignore it.
                    if (sequence.Alphabet != Alphabets.DNA)
                    {
                        Interlocked.Increment(ref this.skippedSequencesCount);
                        Interlocked.Increment(ref this.processedSequencesCount);
                        continue;
                    }

                    // If the sequence contains any gap symbol then ignore the sequence.
                    // A single pass with a set lookup replaces scanning once per gap symbol.
                    bool skipSequence = false;
                    for (long index = 0; index < sequence.Count; ++index)
                    {
                        if (gapSymbols.Contains(sequence[index]))
                        {
                            skipSequence = true;
                            break;
                        }
                    }

                    if (skipSequence)
                    {
                        Interlocked.Increment(ref this.skippedSequencesCount);
                        Interlocked.Increment(ref this.processedSequencesCount);
                        continue;
                    }

                    // If the blocking collection holds more than 2 million nodes, wait in
                    // 5 millisecond steps so that the consumer can drain some of them.
                    // This is meant to avoid an OutOfMemoryException.
                    while (kmerDataCollection.Count > 2000000)
                    {
                        System.Threading.Thread.Sleep(5);
                    }

                    long count = sequence.Count;

                    // Generate the kmers from each sequence.
                    for (long i = 0; i <= count - this.kmerLength; ++i)
                    {
                        IKmerData kmerData = this.GetNewKmerData();
                        bool orientation = kmerData.SetKmerData(sequence, i, this.kmerLength);
                        kmerDataCollection.Add(new DeBruijnNode(kmerData, orientation, 1));
                    }

                    Interlocked.Increment(ref this.processedSequencesCount);
                }
            }
            finally
            {
                // Always signal completion, even on failure; otherwise the consumer
                // task (and therefore Task.WaitAll below) would block forever.
                kmerDataCollection.CompleteAdding();
            }
        });

        // Consumer: inserts every produced node into the binary search tree.
        Task buildKmers = Task.Factory.StartNew(() =>
        {
            while (!kmerDataCollection.IsCompleted)
            {
                DeBruijnNode newNode = null;
                if (kmerDataCollection.TryTake(out newNode, -1))
                {
                    // First element being added becomes the root of the tree.
                    if (this.root == null)
                    {
                        this.root = newNode;
                        this.NodeCount++;
                        continue;
                    }

                    int result = 0;
                    DeBruijnNode temp = this.root;
                    DeBruijnNode parent = this.root;

                    // Search the tree where the new node should be inserted.
                    while (temp != null)
                    {
                        result = newNode.NodeValue.CompareTo(temp.NodeValue);
                        if (result == 0)
                        {
                            // Duplicate k-mer: bump the count (only while it is <= 255) and
                            // always stop searching. Breaking unconditionally guarantees the
                            // loop terminates even when the count is no longer incremented.
                            if (temp.KmerCount <= 255)
                            {
                                temp.KmerCount++;
                            }

                            break;
                        }
                        else if (result > 0)
                        {
                            // Move to right sub-tree.
                            parent = temp;
                            temp = temp.Right;
                        }
                        else
                        {
                            // Move to left sub-tree.
                            parent = temp;
                            temp = temp.Left;
                        }
                    }

                    // Position found (result == 0 means duplicate, nothing to attach).
                    if (result > 0)
                    {
                        parent.Right = newNode;
                        NodeCount++;
                    }
                    else if (result < 0)
                    {
                        parent.Left = newNode;
                        NodeCount++;
                    }
                }
            }
        });

        // Waits for both tasks to finish and rethrows their failures, if any.
        Task.WaitAll(createKmers, buildKmers);
    }
    finally
    {
        kmerDataCollection.Dispose();
    }

    this.GraphBuildCompleted = true;

    // Generate the links
    this.GenerateLinks();
}
/// <summary>
/// Builds the graph from the given sequences: a producer task converts the DNA
/// sequences into blocks of k-mers, the blocks are consumed in parallel and added
/// to a k-mer dictionary as nodes, and finally the links between nodes are generated.
/// Non-DNA sequences and sequences containing gap symbols are counted as skipped.
/// </summary>
/// <param name="sequences">List of input sequences.</param>
/// <exception cref="ArgumentNullException">If <paramref name="sequences"/> is null.</exception>
/// <exception cref="ArgumentException">If the k-mer length exceeds KmerData32.MAX_KMER_LENGTH.</exception>
public void Build(IEnumerable<ISequence> sequences)
{
    // Size of Kmer List to grab, somewhat arbitrary but want to keep list size below large object threshold, which is ~85 kb
    const int blockSize = 4096;

    // When to add list to blocking collection, most short reads are <=151 bp so this should avoid needing to grow the list
    const int addThreshold = blockSize - 151;

    // When to pause adding
    const int stopAddThreshold = 2000000 / blockSize;

    if (sequences == null)
    {
        throw new ArgumentNullException("sequences");
    }

    if (KmerLength > KmerData32.MAX_KMER_LENGTH)
    {
        throw new ArgumentException(Properties.Resource.KmerLengthGreaterThan31);
    }

    // A dictionary of kmers to de Bruijn nodes
    KmerDictionary kmerManager = new KmerDictionary();

    // Create the producer thread.
    var kmerDataCollection = new BlockingCollection<List<KmerData32>>();
    Task producer = Task.Factory.StartNew(() =>
    {
        try
        {
            List<KmerData32> kmerList = new List<KmerData32>(blockSize);

            IAlphabet alphabet = Alphabets.DNA;
            HashSet<byte> gapSymbols;
            alphabet.TryGetGapSymbols(out gapSymbols);

            // Generate the kmers from the sequences
            foreach (ISequence sequence in sequences)
            {
                // Ignore the sequence if its alphabet is not DNA or if it contains
                // any gap symbol (single pass with a set lookup).
                bool skipSequence = sequence.Alphabet != Alphabets.DNA;
                if (!skipSequence)
                {
                    for (long index = 0; index < sequence.Count; ++index)
                    {
                        if (gapSymbols.Contains(sequence[index]))
                        {
                            skipSequence = true;
                            break;
                        }
                    }
                }

                if (skipSequence)
                {
                    Interlocked.Increment(ref _skippedSequencesCount);
                    Interlocked.Increment(ref _processedSequencesCount);
                    continue;
                }

                // If the blocking collection holds more than ~2 million kmers, wait in
                // 2 second steps so that the consumers can drain some of them.
                // This is meant to avoid an OutOfMemoryException.
                while (kmerDataCollection.Count > stopAddThreshold)
                {
                    Task.Delay(TimeSpan.FromSeconds(2)).Wait();
                }

                // Convert sequences to k-mers
                kmerList.AddRange(KmerData32.GetKmers(sequence, KmerLength));

                // Most reads are <=150 basepairs, so this should avoid having to grow the list
                // by keeping it below blockSize
                if (kmerList.Count > addThreshold)
                {
                    kmerDataCollection.Add(kmerList);
                    kmerList = new List<KmerData32>(blockSize);
                }

                Interlocked.Increment(ref _processedSequencesCount);
            }

            // Flush the last, partially filled block; an empty one carries no work.
            if (kmerList.Count > 0)
            {
                kmerDataCollection.Add(kmerList);
            }
        }
        finally
        {
            // Always signal completion so the consumers below can finish.
            kmerDataCollection.CompleteAdding();
        }
    });

    // Consume k-mers by adding them to the k-mer dictionary as nodes
    Parallel.ForEach(kmerDataCollection.GetConsumingEnumerable(), newKmerList =>
    {
        foreach (KmerData32 newKmer in newKmerList)
        {
            // Create Vertex
            DeBruijnNode node = kmerManager.SetNewOrGetOld(newKmer);

            // The check and the increment must happen under the same lock; checking
            // outside lets several threads pass the test before any of them increments.
            lock (node)
            {
                if (node.KmerCount <= 255)
                {
                    node.KmerCount++;
                }
            }
        }
    });

    // Ensure producer exceptions are handled.
    producer.Wait();

    // Done filling the dictionary
    kmerDataCollection.Dispose();

    //NOTE: To speed enumeration make the nodes into an array and dispose of the collection
    _nodeCount = kmerManager.NodeCount;
    _nodes = kmerManager.GenerateNodeArray();

    // Generate the links
    GenerateLinks(kmerManager);

    // Since we no longer need to search for values set left and right nodes of child array to null
    // so that they are available for GC if no longer needed
    foreach (DeBruijnNode node in _nodes)
    {
        node.Left = node.Right = null;
    }

    GraphBuildCompleted = true;
}
/// <summary>
/// Build graph nodes and edges from list of k-mers.
/// Creates a node for every unique k-mer (and reverse-complement)
/// in the read. Then, generates adjacency information between nodes
/// by computing pairs of nodes that have overlapping regions
/// between node sequences.
/// </summary>
/// <remarks>
/// A background task produces k-mer nodes from the DNA sequences (non-DNA sequences
/// and sequences containing gap symbols are counted as skipped); the calling thread
/// consumes them and inserts them into the binary tree rooted at <c>Root</c>.
/// A failure in the background task is rethrown from this method once consumption ends.
/// </remarks>
/// <param name="sequences">List of input sequences.</param>
/// <exception cref="ArgumentNullException">If <paramref name="sequences"/> is null.</exception>
/// <exception cref="ArgumentException">If the k-mer length is not positive or exceeds MaxKmerLength.</exception>
public void Build(IEnumerable<ISequence> sequences)
{
    if (sequences == null)
    {
        throw new ArgumentNullException("sequences");
    }

    if (KmerLength <= 0)
    {
        throw new ArgumentException("KmerLengthShouldBePositive");
    }

    if (KmerLength > MaxKmerLength)
    {
        throw new ArgumentException("KmerLengthGreaterThan32");
    }

    var kmerDataCollection = new BlockingCollection<DeBruijnNode>();

    // Keep a reference to the producer so that its failures can be observed below.
    Task producer = Task.Factory.StartNew(() =>
    {
        try
        {
            IAlphabet alphabet = Alphabets.DNA;
            HashSet<byte> gapSymbols;
            alphabet.TryGetGapSymbols(out gapSymbols);

            // Generate the kmers from the sequences
            foreach (ISequence sequence in sequences)
            {
                // If the sequence alphabet is not of type DNA then ignore it.
                if (sequence.Alphabet != Alphabets.DNA)
                {
                    Interlocked.Increment(ref _skippedSequencesCount);
                    Interlocked.Increment(ref _processedSequencesCount);
                    continue;
                }

                // If the sequence contains any gap symbol then ignore the sequence.
                // A single pass with a set lookup replaces scanning once per gap symbol.
                bool skipSequence = false;
                for (long index = 0; index < sequence.Count; ++index)
                {
                    if (gapSymbols.Contains(sequence[index]))
                    {
                        skipSequence = true;
                        break;
                    }
                }

                if (skipSequence)
                {
                    Interlocked.Increment(ref _skippedSequencesCount);
                    Interlocked.Increment(ref _processedSequencesCount);
                    continue;
                }

                // If the blocking collection holds more than StopAddThreshold nodes, wait
                // in 5 millisecond steps so that the consumer can drain some of them.
                // This is meant to avoid an OutOfMemoryException.
                while (kmerDataCollection.Count > StopAddThreshold)
                {
                    Thread.Sleep(5);
                }

                // Generate the kmers from each sequence
                long count = sequence.Count;
                for (long i = 0; i <= count - KmerLength; ++i)
                {
                    var kmerData = new KmerData32();
                    bool orientation = kmerData.SetKmerData(sequence, i, KmerLength);
                    kmerDataCollection.Add(new DeBruijnNode(kmerData, orientation, 1));
                }

                Interlocked.Increment(ref _processedSequencesCount);
            }
        }
        finally
        {
            // Always signal completion so the consuming loop below can end.
            kmerDataCollection.CompleteAdding();
        }
    });

    // The main thread will then process all the data - this will loop until the above
    // task completes adding the kmers.
    foreach (var newNode in kmerDataCollection.GetConsumingEnumerable())
    {
        // First element being added becomes the root of the tree.
        if (Root == null)
        {
            Root = newNode;
            NodeCount++;
            continue;
        }

        int result = 0;
        DeBruijnNode temp = Root;
        DeBruijnNode parent = Root;

        // Search the tree where the new node should be inserted.
        while (temp != null)
        {
            result = newNode.NodeValue.CompareTo(temp.NodeValue);
            if (result == 0)
            {
                // Duplicate k-mer: bump the count (only while it is <= 255) and
                // always stop searching. Breaking unconditionally guarantees the
                // loop terminates even when the count is no longer incremented.
                if (temp.KmerCount <= 255)
                {
                    temp.KmerCount++;
                }

                break;
            }
            else if (result > 0)
            {
                // Move to right sub-tree.
                parent = temp;
                temp = temp.Right;
            }
            else
            {
                // Move to left sub-tree.
                parent = temp;
                temp = temp.Left;
            }
        }

        // Position found (result == 0 means duplicate, nothing to attach).
        if (result > 0)
        {
            parent.Right = newNode;
            NodeCount++;
        }
        else if (result < 0)
        {
            parent.Left = newNode;
            NodeCount++;
        }
    }

    // Surface any producer failure instead of silently accepting a partial graph.
    producer.Wait();

    // Done adding - we can throw away the kmer collection as we now have the graph
    kmerDataCollection.Dispose();
    this.GraphBuildCompleted = true;

    // Generate the links
    this.GenerateLinks();
}