/// <summary>
/// Connects every node of the graph to its neighbours.
/// For each node, every one-symbol right and left extension of its k-mer is
/// looked up in the tree, first as written and, if that lookup fails, as the
/// reverse complement. Each hit is registered on the node through
/// SetExtensionNodes together with the orientation in which it was found.
/// Nodes are processed in parallel.
/// </summary>
private void GenerateLinks()
{
    Parallel.ForEach(
        this.GetNodes(),
        currentNode =>
        {
            // Scratch k-mer reused for every lookup made on behalf of this node.
            IKmerData probe = GetNewKmerData();

            // The node stores one of the two strands; NodeDataOrientation tells
            // which of the two decoded strings is the forward k-mer.
            bool storedIsForward = currentNode.NodeDataOrientation;
            string storedText = Encoding.Default.GetString(currentNode.NodeValue.GetKmerData(this.kmerLength));
            string complementText = Encoding.Default.GetString(currentNode.NodeValue.GetReverseComplementOfKmerData(this.KmerLength));
            string forwardKmer = storedIsForward ? storedText : complementText;
            string reverseKmer = storedIsForward ? complementText : storedText;

            // Right extensions: drop the first symbol and append a candidate symbol.
            string forwardStem = forwardKmer.Substring(1);
            string reverseStem = reverseKmer.Substring(0, kmerLength - 1);
            for (int symbolIndex = 0; symbolIndex < DnaSymbols.Length; symbolIndex++)
            {
                probe.SetKmerData(Encoding.Default.GetBytes(forwardStem + DnaSymbols[symbolIndex]), this.kmerLength);
                DeBruijnNode neighbour = this.SearchTree(probe);

                // Fall back to the reverse complement of the candidate k-mer.
                bool viaReverseComplement = neighbour == null;
                if (viaReverseComplement)
                {
                    probe.SetKmerData(Encoding.Default.GetBytes(DnaSymbolsComplement[symbolIndex] + reverseStem), this.kmerLength);
                    neighbour = this.SearchTree(probe);
                }

                if (neighbour != null)
                {
                    // The neighbour's orientation flag is inverted when it was
                    // located through the reverse complement.
                    currentNode.SetExtensionNodes(true, neighbour.NodeDataOrientation != viaReverseComplement, neighbour);
                }
            }

            // Left extensions: drop the last symbol and prepend a candidate symbol.
            forwardStem = forwardKmer.Substring(0, kmerLength - 1);
            reverseStem = reverseKmer.Substring(1);
            for (int symbolIndex = 0; symbolIndex < DnaSymbols.Length; symbolIndex++)
            {
                probe.SetKmerData(Encoding.Default.GetBytes(DnaSymbols[symbolIndex] + forwardStem), this.kmerLength);
                DeBruijnNode neighbour = this.SearchTree(probe);

                bool viaReverseComplement = neighbour == null;
                if (viaReverseComplement)
                {
                    probe.SetKmerData(Encoding.Default.GetBytes(reverseStem + DnaSymbolsComplement[symbolIndex]), this.kmerLength);
                    neighbour = this.SearchTree(probe);
                }

                if (neighbour != null)
                {
                    currentNode.SetExtensionNodes(false, neighbour.NodeDataOrientation != viaReverseComplement, neighbour);
                }
            }
        });
}
/// <summary>
/// Build graph nodes and edges from list of k-mers.
/// Creates a node for every unique k-mer (and reverse-complement)
/// in the read. Then, generates adjacency information between nodes
/// by computing pairs of nodes that have overlapping regions
/// between node sequences.
/// </summary>
/// <remarks>
/// K-mers are produced on the calling thread and inserted into the binary
/// search tree by a single background task; the two sides communicate through
/// a blocking collection. The symbol map, ambiguous symbols and gap symbols
/// are taken from the alphabet of the first sequence. Sequences containing an
/// ambiguous or gap symbol are skipped. An empty input produces no nodes and
/// the link-generation step is still run.
/// </remarks>
/// <param name="sequences">List of input sequences.</param>
/// <exception cref="ArgumentNullException">sequences is null.</exception>
/// <exception cref="ArgumentException">The configured k-mer length is not positive.</exception>
public void Build(IEnumerable<ISequence> sequences)
{
    if (sequences == null)
    {
        throw new ArgumentNullException("sequences");
    }

    if (this.kmerLength <= 0)
    {
        throw new ArgumentException(Properties.Resource.KmerLengthShouldBePositive);
    }

    BlockingCollection<DeBruijnNode> kmerDataCollection = new BlockingCollection<DeBruijnNode>();

    // Consumer: inserts every produced k-mer node into the tree. The loop ends
    // once CompleteAdding has been called and the collection has been drained.
    Task buildKmers = Task.Factory.StartNew(() =>
    {
        foreach (DeBruijnNode newNode in kmerDataCollection.GetConsumingEnumerable())
        {
            // First element being added becomes the root of the tree.
            if (this.root == null)
            {
                this.root = newNode;
                this.NodeCount++;
                continue;
            }

            int result = 0;
            DeBruijnNode temp = this.root;
            DeBruijnNode parent = this.root;

            // Search the tree for the position where the new node belongs.
            while (temp != null)
            {
                result = newNode.NodeValue.CompareTo(temp.NodeValue);
                if (result == 0)
                {
                    // Duplicate k-mer: bump the counter, saturating at 255.
                    // The search must stop here regardless of the counter value;
                    // otherwise a saturated node would never leave this loop.
                    // NOTE(review): 255 assumes a byte-sized counter - confirm
                    // against the declaration of KmerCount.
                    if (temp.KmerCount < 255)
                    {
                        temp.KmerCount++;
                    }

                    break;
                }

                parent = temp;
                temp = result > 0 ? temp.Right : temp.Left;
            }

            // Position found: attach as a child unless it was a duplicate.
            if (result > 0)
            {
                parent.Right = newNode;
                this.NodeCount++;
            }
            else if (result < 0)
            {
                parent.Left = newNode;
                this.NodeCount++;
            }
        }
    });

    try
    {
        // Alphabet data is resolved from the first sequence while iterating, so
        // the input is enumerated only once and an empty input is harmless.
        bool alphabetResolved = false;
        byte[] symbolMap = null;
        HashSet<byte> ambiguousSymbols = null;
        HashSet<byte> gapSymbols = null;

        // Generate the kmers from the sequences
        foreach (ISequence sequence in sequences)
        {
            if (!alphabetResolved)
            {
                IAlphabet alphabet = sequence.Alphabet;
                symbolMap = alphabet.GetSymbolValueMap();
                ambiguousSymbols = alphabet.GetAmbiguousSymbols();

                // gapSymbols may be left null when the alphabet reports no gap symbols.
                alphabet.TryGetGapSymbols(out gapSymbols);
                alphabetResolved = true;
            }

            // If the blocking collection holds more than 2 million pending
            // k-mers, poll every 5 milliseconds until the consumer has caught up.
            // This keeps memory usage bounded.
            while (kmerDataCollection.Count > 2000000)
            {
                System.Threading.Thread.Sleep(5);
            }

            long count = sequence.Count;
            byte[] convertedSymbols = new byte[count];
            bool skipSequence = false;

            for (long index = 0; index < count; index++)
            {
                byte converted = symbolMap[sequence[index]];
                convertedSymbols[index] = converted;

                if ((ambiguousSymbols != null && ambiguousSymbols.Contains(converted))
                    || (gapSymbols != null && gapSymbols.Contains(converted)))
                {
                    skipSequence = true;
                    break;
                }
            }

            if (skipSequence)
            {
                continue;
            }

            Sequence convertedSequence = new Sequence(sequence.Alphabet, convertedSymbols, false);

            // Generate the kmers from each sequence
            for (long i = 0; i <= count - this.kmerLength; ++i)
            {
                IKmerData kmerData = this.GetNewKmerData();
                bool orientation = kmerData.SetKmerData(convertedSequence, i, this.kmerLength);
                kmerDataCollection.Add(new DeBruijnNode(kmerData, orientation, 1));
            }
        }
    }
    finally
    {
        // Always release the consumer, even if producing k-mers failed;
        // otherwise the background task would block forever. The collection is
        // deliberately not disposed on this failure path because the consumer
        // may still be draining it.
        kmerDataCollection.CompleteAdding();
    }

    try
    {
        Task.WaitAll(buildKmers);
    }
    finally
    {
        // The consumer has finished (successfully or not) once WaitAll returns or throws.
        kmerDataCollection.Dispose();
    }

    // Generate the links
    this.GenerateLinks();
}
/// <summary>
/// Build graph nodes and edges from list of k-mers.
/// Creates a node for every unique k-mer (and reverse-complement)
/// in the read. Then, generates adjacency information between nodes
/// by computing pairs of nodes that have overlapping regions
/// between node sequences.
/// </summary>
/// <remarks>
/// One background task produces k-mer nodes from the sequences and a second
/// one inserts them into the binary search tree; they communicate through a
/// blocking collection. Sequences whose alphabet is not DNA, or that contain
/// a gap symbol, are skipped and counted in both the skipped and processed
/// counters.
/// </remarks>
/// <param name="sequences">List of input sequences.</param>
/// <exception cref="ArgumentNullException">sequences is null.</exception>
/// <exception cref="ArgumentException">The configured k-mer length is not positive or is greater than 32.</exception>
public void Build(IEnumerable<ISequence> sequences)
{
    if (sequences == null)
    {
        throw new ArgumentNullException("sequences");
    }

    if (this.kmerLength <= 0)
    {
        throw new ArgumentException(Properties.Resource.KmerLengthShouldBePositive);
    }

    if (this.kmerLength > 32)
    {
        throw new ArgumentException(Properties.Resource.KmerLengthGreaterThan32);
    }

    BlockingCollection<DeBruijnNode> kmerDataCollection = new BlockingCollection<DeBruijnNode>();

    // Producer: turns every accepted sequence into k-mer nodes.
    Task createKmers = Task.Factory.StartNew(() =>
    {
        try
        {
            IAlphabet alphabet = Alphabets.DNA;
            HashSet<byte> gapSymbols;

            // gapSymbols may be left null when the alphabet reports no gap symbols.
            alphabet.TryGetGapSymbols(out gapSymbols);

            // Generate the kmers from the sequences
            foreach (ISequence sequence in sequences)
            {
                // A sequence is ignored when its alphabet is not DNA ...
                bool skipSequence = sequence.Alphabet != Alphabets.DNA;

                // ... or when it contains any gap symbol (single pass over the sequence).
                if (!skipSequence && gapSymbols != null)
                {
                    for (long index = 0; index < sequence.Count; ++index)
                    {
                        if (gapSymbols.Contains(sequence[index]))
                        {
                            skipSequence = true;
                            break;
                        }
                    }
                }

                if (skipSequence)
                {
                    Interlocked.Increment(ref this.skippedSequencesCount);
                    Interlocked.Increment(ref this.processedSequencesCount);
                    continue;
                }

                // If the blocking collection holds more than 2 million pending
                // k-mers, poll every 5 milliseconds until the consumer has caught
                // up. This keeps memory usage bounded.
                while (kmerDataCollection.Count > 2000000)
                {
                    System.Threading.Thread.Sleep(5);
                }

                long count = sequence.Count;

                // Generate the kmers from each sequence
                for (long i = 0; i <= count - this.kmerLength; ++i)
                {
                    IKmerData kmerData = this.GetNewKmerData();
                    bool orientation = kmerData.SetKmerData(sequence, i, this.kmerLength);
                    kmerDataCollection.Add(new DeBruijnNode(kmerData, orientation, 1));
                }

                Interlocked.Increment(ref this.processedSequencesCount);
            }
        }
        finally
        {
            // Always release the consumer, even if producing k-mers failed;
            // otherwise the consumer task (and the WaitAll below) would block forever.
            kmerDataCollection.CompleteAdding();
        }
    });

    // Consumer: inserts every produced k-mer node into the tree. The loop ends
    // once CompleteAdding has been called and the collection has been drained.
    Task buildKmers = Task.Factory.StartNew(() =>
    {
        foreach (DeBruijnNode newNode in kmerDataCollection.GetConsumingEnumerable())
        {
            // First element being added becomes the root of the tree.
            if (this.root == null)
            {
                this.root = newNode;
                this.NodeCount++;
                continue;
            }

            int result = 0;
            DeBruijnNode temp = this.root;
            DeBruijnNode parent = this.root;

            // Search the tree for the position where the new node belongs.
            while (temp != null)
            {
                result = newNode.NodeValue.CompareTo(temp.NodeValue);
                if (result == 0)
                {
                    // Duplicate k-mer: bump the counter, saturating at 255.
                    // The search must stop here regardless of the counter value;
                    // otherwise a saturated node would never leave this loop.
                    // NOTE(review): 255 assumes a byte-sized counter - confirm
                    // against the declaration of KmerCount.
                    if (temp.KmerCount < 255)
                    {
                        temp.KmerCount++;
                    }

                    break;
                }

                parent = temp;
                temp = result > 0 ? temp.Right : temp.Left;
            }

            // Position found: attach as a child unless it was a duplicate.
            if (result > 0)
            {
                parent.Right = newNode;
                this.NodeCount++;
            }
            else if (result < 0)
            {
                parent.Left = newNode;
                this.NodeCount++;
            }
        }
    });

    try
    {
        // WaitAll returns (or throws an AggregateException) only after both
        // tasks have finished, so disposing afterwards is safe on every path.
        Task.WaitAll(createKmers, buildKmers);
    }
    finally
    {
        kmerDataCollection.Dispose();
    }

    this.GraphBuildCompleted = true;

    // Generate the links
    this.GenerateLinks();
}