/// <summary>
/// Validates that the ambiguous alphabet selected by <paramref name="option"/>
/// reports every expected ambiguous character from GetAmbiguousSymbols().
/// </summary>
/// <param name="option">Alphabet type (Protein, Rna or Dna) to validate.</param>
void ValidateGetAmbiguousCharacters(AlphabetsTypes option)
{
    string referenceCharacters;
    IAlphabet alphabetInstance;

    switch (option)
    {
        case AlphabetsTypes.Protein:
            referenceCharacters = "BZJX";
            alphabetInstance = AmbiguousProteinAlphabet.Instance;
            break;
        case AlphabetsTypes.Rna:
            referenceCharacters = "MRSWYKVHDBN";
            alphabetInstance = AmbiguousRnaAlphabet.Instance;
            break;
        case AlphabetsTypes.Dna:
            referenceCharacters = "MRSWYKVHDBN";
            alphabetInstance = AmbiguousDnaAlphabet.Instance;
            break;
        default:
            // Fail with a clear message instead of dereferencing a null alphabet below.
            Assert.Fail("Unsupported alphabet type: " + option);
            return;
    }

    HashSet<byte> ambiguousSymbols = alphabetInstance.GetAmbiguousSymbols();

    // Drive the loop from the expected characters: this cannot index past the
    // reference list, and every expected character is checked even when the
    // alphabet returns fewer symbols than expected.
    foreach (char expected in referenceCharacters)
    {
        Assert.IsTrue(ambiguousSymbols.Contains((byte)expected));
    }
}
/// <summary>
/// Validate input sequences.
/// Lazily yields only those reads that contain neither ambiguous symbols nor
/// gap symbols. Both symbol sets are taken from the ambiguous alphabet that
/// corresponds to the alphabet of the first read.
/// </summary>
/// <param name="reads">The Reads</param>
/// <returns>Valid reads; nothing when there is no first read to take the alphabet from.</returns>
private IEnumerable<ISequence> ValidateReads(IEnumerable<ISequence> reads)
{
    // Without a first read there is no alphabet to validate against, so
    // yield nothing instead of throwing from First().
    ISequence firstRead = reads.FirstOrDefault();
    if (firstRead == null)
    {
        yield break;
    }

    IAlphabet readAlphabet = Alphabets.GetAmbiguousAlphabet(firstRead.Alphabet);
    HashSet<byte> ambiguousSymbols = readAlphabet.GetAmbiguousSymbols();

    // Honor the Try-pattern result: when no gap symbols are available, use an
    // empty set rather than dereferencing a possibly-null out value.
    HashSet<byte> gapSymbols;
    if (!readAlphabet.TryGetGapSymbols(out gapSymbols) || gapSymbols == null)
    {
        gapSymbols = new HashSet<byte>();
    }

    foreach (ISequence read in reads)
    {
        if (read.All(c => !ambiguousSymbols.Contains(c) && !gapSymbols.Contains(c)))
        {
            yield return read;
        }
    }
}
/// <summary>
/// Validate input sequences.
/// Lazily yields the reads that satisfy all of the following:
/// the ID passes Helper.ValidatePairedSequenceId, the read's alphabet does not
/// report HasAmbiguity, and the read contains no gap symbol. Gap symbols are
/// taken from the ambiguous alphabet corresponding to the first read's alphabet.
/// The ID of each returned read is replaced with the result of
/// Helper.GetReadIdExcludingOtherInfo.
/// </summary>
/// <param name="reads">The Reads</param>
/// <returns>Valid reads; nothing when there is no first read to take the alphabet from.</returns>
private IEnumerable<ISequence> ValidateReads(IEnumerable<ISequence> reads)
{
    // Without a first read there is no alphabet to validate against, so
    // yield nothing instead of throwing from First().
    ISequence firstRead = reads.FirstOrDefault();
    if (firstRead == null)
    {
        yield break;
    }

    IAlphabet readAlphabet = Alphabets.GetAmbiguousAlphabet(firstRead.Alphabet);

    // Honor the Try-pattern result: when no gap symbols are available, use an
    // empty set rather than dereferencing a possibly-null out value.
    HashSet<byte> gapSymbols;
    if (!readAlphabet.TryGetGapSymbols(out gapSymbols) || gapSymbols == null)
    {
        gapSymbols = new HashSet<byte>();
    }

    foreach (ISequence read in reads)
    {
        string originalSequenceId;
        string pairedReadType;
        bool forward;
        string libraryName;

        // Only the boolean result is used here; the parsed parts are discarded.
        if (!Bio.Util.Helper.ValidatePairedSequenceId(
                read.ID,
                out originalSequenceId,
                out forward,
                out pairedReadType,
                out libraryName))
        {
            continue;
        }

        if (read.Alphabet.HasAmbiguity)
        {
            continue;
        }

        bool gapSymbolFound = false;
        long symbolCount = read.Count;
        for (long index = 0; index < symbolCount; index++)
        {
            if (gapSymbols.Contains(read[index]))
            {
                // One gap is enough to reject the read; stop scanning.
                gapSymbolFound = true;
                break;
            }
        }

        if (gapSymbolFound)
        {
            continue;
        }

        // Exclude the other info, if any.
        read.ID = Bio.Util.Helper.GetReadIdExcludingOtherInfo(read.ID);
        yield return read;
    }
}
/// <summary>
/// Build graph nodes and edges from list of k-mers.
/// Creates a node for every unique k-mer (and reverse-complement)
/// in the read. Then, generates adjacency information between nodes
/// by computing pairs of nodes that have overlapping regions
/// between node sequences.
/// </summary>
/// <remarks>
/// K-mers are produced on the calling thread and handed through a blocking
/// collection to a single background task, which inserts them into a binary
/// search tree rooted at <c>this.root</c>. Sequences containing an ambiguous
/// or gap symbol are skipped entirely.
/// </remarks>
/// <param name="sequences">List of input sequences.</param>
/// <exception cref="ArgumentNullException">When <paramref name="sequences"/> is null.</exception>
/// <exception cref="ArgumentException">When the k-mer length is not positive.</exception>
public void Build(IEnumerable<ISequence> sequences)
{
    if (sequences == null)
    {
        throw new ArgumentNullException("sequences");
    }

    if (this.kmerLength <= 0)
    {
        throw new ArgumentException(Properties.Resource.KmerLengthShouldBePositive);
    }

    BlockingCollection<DeBruijnNode> kmerDataCollection = new BlockingCollection<DeBruijnNode>();

    // Consumer: drains the collection and inserts each k-mer node into the tree.
    Task buildKmers = Task.Factory.StartNew(() =>
    {
        while (!kmerDataCollection.IsCompleted)
        {
            DeBruijnNode newNode = null;

            // Blocks until an item arrives; returns false once adding is
            // completed and the collection is empty.
            if (kmerDataCollection.TryTake(out newNode, -1))
            {
                // First element being added becomes the root of the tree.
                if (this.root == null)
                {
                    this.root = newNode;
                    this.NodeCount++;
                    continue;
                }

                int result = 0;
                DeBruijnNode temp = this.root;
                DeBruijnNode parent = this.root;

                // Search the tree for the position where the new node belongs.
                while (temp != null)
                {
                    result = newNode.NodeValue.CompareTo(temp.NodeValue);
                    if (result == 0)
                    {
                        // Duplicate k-mer: bump the count, saturating at 255 so a
                        // byte-sized counter cannot wrap around.
                        // NOTE(review): assumes KmerCount is meant to cap at 255 - confirm its type.
                        if (temp.KmerCount < 255)
                        {
                            temp.KmerCount++;
                        }

                        // Always leave the search on a match; otherwise the loop
                        // would never advance and would spin forever.
                        break;
                    }
                    else if (result > 0)
                    {
                        // Move to right sub-tree.
                        parent = temp;
                        temp = temp.Right;
                    }
                    else
                    {
                        // Move to left sub-tree.
                        parent = temp;
                        temp = temp.Left;
                    }
                }

                // Position found: attach as a child (nothing to do for a duplicate).
                if (result > 0)
                {
                    parent.Right = newNode;
                    NodeCount++;
                }
                else if (result < 0)
                {
                    parent.Left = newNode;
                    NodeCount++;
                }
            }
        }
    });

    try
    {
        IAlphabet alphabet = sequences.First().Alphabet;
        byte[] symbolMap = alphabet.GetSymbolValueMap();
        HashSet<byte> ambiguousSymbols = alphabet.GetAmbiguousSymbols();

        // Honor the Try-pattern result: when no gap symbols are available, use
        // an empty set rather than dereferencing a possibly-null out value.
        HashSet<byte> gapSymbols;
        if (!alphabet.TryGetGapSymbols(out gapSymbols) || gapSymbols == null)
        {
            gapSymbols = new HashSet<byte>();
        }

        // Generate the kmers from the sequences.
        foreach (ISequence sequence in sequences)
        {
            // If more than 2 million nodes are pending, pause in 5 ms steps so
            // the consumer task can remove some k-mers and create the nodes.
            // This avoids an OutOfMemoryException.
            while (kmerDataCollection.Count > 2000000)
            {
                System.Threading.Thread.Sleep(5);
            }

            long count = sequence.Count;
            byte[] convertedSymbols = new byte[count];
            bool skipSequence = false;

            for (long index = 0; index < count; index++)
            {
                convertedSymbols[index] = symbolMap[sequence[index]];
                if (ambiguousSymbols.Contains(convertedSymbols[index])
                    || gapSymbols.Contains(convertedSymbols[index]))
                {
                    skipSequence = true;
                    break;
                }
            }

            if (skipSequence)
            {
                continue;
            }

            Sequence convertedSequence = new Sequence(sequence.Alphabet, convertedSymbols, false);

            // Generate the kmers from each sequence.
            for (long i = 0; i <= count - this.kmerLength; ++i)
            {
                IKmerData kmerData = this.GetNewKmerData();
                bool orientation = kmerData.SetKmerData(convertedSequence, i, this.kmerLength);
                kmerDataCollection.Add(new DeBruijnNode(kmerData, orientation, 1));
            }
        }
    }
    finally
    {
        // Always signal completion, even when producing k-mers throws (for
        // example on an empty input), so the consumer task is never left
        // blocked forever in TryTake.
        kmerDataCollection.CompleteAdding();
    }

    Task.WaitAll(buildKmers);
    kmerDataCollection.Dispose();

    // Generate the links.
    this.GenerateLinks();
}