Пример #1
0
        /// <summary>
        /// Validates the gap-symbol lookups of the alphabet selected by
        /// <paramref name="option"/>: both TryGetDefaultGapSymbol and
        /// TryGetGapSymbols are expected to succeed and yield '-'.
        /// </summary>
        /// <param name="option">Alphabet to validate: Protein, Rna or Dna.</param>
        void ValidateTryGetDefaultGapSymbol(AlphabetsTypes option)
        {
            IAlphabet alphabetInstance = null;

            switch (option)
            {
                case AlphabetsTypes.Protein:
                    alphabetInstance = ProteinAlphabet.Instance;
                    break;

                case AlphabetsTypes.Rna:
                    alphabetInstance = RnaAlphabet.Instance;
                    break;

                case AlphabetsTypes.Dna:
                    alphabetInstance = DnaAlphabet.Instance;
                    break;

                default:
                    // Report an unhandled option as a test failure instead of
                    // dereferencing a null alphabet below.
                    Assert.Fail(string.Concat("Unsupported alphabet type: ", option));
                    break;
            }

            byte outputByte;

            // The lookup must report success, not merely leave a plausible value behind.
            Assert.IsTrue(alphabetInstance.TryGetDefaultGapSymbol(out outputByte));
            Assert.AreEqual('-', (char)outputByte);
            ApplicationLog.WriteLine(string.Concat(@"Alphabets BVT: Validation of 
                                Try Default gap symbol for ", option, " completed successfully."));

            // The out parameter supplies the set; no pre-allocation is needed.
            HashSet<byte> outputGapSymbol;
            Assert.IsTrue(alphabetInstance.TryGetGapSymbols(out outputGapSymbol));

            string outputGapString = new string(outputGapSymbol.Select(a => (char)a).ToArray());
            Assert.AreEqual("-", outputGapString);
            ApplicationLog.WriteLine(string.Concat(@"Alphabets BVT: Validation of 
                                Try  Get gap symbol for ", option, " completed successfully."));
        }
Пример #2
0
        /// <summary>
        /// Filters the input reads, keeping only those that contain neither
        /// ambiguous symbols nor gap symbols.
        /// </summary>
        /// <param name="reads">
        /// The reads to validate. The ambiguous alphabet derived from the first
        /// read's alphabet decides which symbols are rejected for every read.
        /// </param>
        /// <returns>
        /// Lazily evaluated valid reads; an empty sequence when
        /// <paramref name="reads"/> has no elements.
        /// </returns>
        private IEnumerable<ISequence> ValidateReads(IEnumerable<ISequence> reads)
        {
            IAlphabet readAlphabet = null;
            HashSet<byte> ambiguousSymbols = null;
            HashSet<byte> gapSymbols = null;

            foreach (ISequence read in reads)
            {
                // Resolve the symbol sets from the first read while iterating, so the
                // source is enumerated only once and an empty input yields no reads
                // instead of throwing from First().
                if (readAlphabet == null)
                {
                    readAlphabet = Alphabets.GetAmbiguousAlphabet(read.Alphabet);
                    ambiguousSymbols = readAlphabet.GetAmbiguousSymbols();

                    // Fall back to "no gap symbols" if the lookup fails, rather than
                    // dereferencing a null set in the filter below.
                    if (!readAlphabet.TryGetGapSymbols(out gapSymbols) || gapSymbols == null)
                    {
                        gapSymbols = new HashSet<byte>();
                    }
                }

                if (read.All(c => !ambiguousSymbols.Contains(c) && !gapSymbols.Contains(c)))
                {
                    yield return read;
                }
            }
        }
Пример #3
0
        /// <summary>
        /// Filters the input reads, keeping only those that have a valid paired
        /// sequence ID, whose alphabet has no ambiguity, and that contain no gap
        /// symbols. The ID of each returned read is rewritten to exclude any
        /// "other info" part.
        /// </summary>
        /// <param name="reads">
        /// The reads to validate. The ambiguous alphabet derived from the first
        /// read's alphabet supplies the gap symbols used for every read.
        /// </param>
        /// <returns>
        /// Lazily evaluated valid reads; an empty sequence when
        /// <paramref name="reads"/> has no elements.
        /// </returns>
        private IEnumerable<ISequence> ValidateReads(IEnumerable<ISequence> reads)
        {
            HashSet<byte> gapSymbols = null;

            foreach (ISequence read in reads)
            {
                // Resolve the gap symbols from the first read while iterating, so the
                // source is enumerated only once and an empty input yields no reads
                // instead of throwing from First().
                if (gapSymbols == null)
                {
                    IAlphabet readAlphabet = Alphabets.GetAmbiguousAlphabet(read.Alphabet);
                    if (!readAlphabet.TryGetGapSymbols(out gapSymbols) || gapSymbols == null)
                    {
                        gapSymbols = new HashSet<byte>();
                    }
                }

                string originalSequenceId;
                string pairedReadType;
                bool forward;
                string libraryName;

                if (!Bio.Util.Helper.ValidatePairedSequenceId(read.ID, out originalSequenceId, out forward, out pairedReadType, out libraryName))
                {
                    continue;
                }

                if (read.Alphabet.HasAmbiguity)
                {
                    continue;
                }

                bool gapSymbolFound = false;
                for (long index = 0; index < read.Count; index++)
                {
                    if (gapSymbols.Contains(read[index]))
                    {
                        // One gap is enough to reject the read; stop scanning.
                        gapSymbolFound = true;
                        break;
                    }
                }

                if (gapSymbolFound)
                {
                    continue;
                }

                // Exclude the other info, if any, from the ID.
                read.ID = Bio.Util.Helper.GetReadIdExcludingOtherInfo(read.ID);
                yield return read;
            }
        }
Пример #4
0
        /// <summary>
        /// Build graph nodes and edges from list of k-mers.
        /// Creates a node for every unique k-mer (and reverse-complement)
        /// in the read. Then, generates adjacency information between nodes
        /// by computing pairs of nodes that have overlapping regions
        /// between node sequences.
        /// </summary>
        /// <remarks>
        /// The calling thread converts sequences to k-mers and queues them; a
        /// background task inserts the queued k-mers into a binary search tree
        /// rooted at <c>root</c>. Sequences containing ambiguous or gap symbols
        /// are skipped.
        /// </remarks>
        /// <param name="sequences">List of input sequences.</param>
        /// <exception cref="ArgumentNullException">
        /// <paramref name="sequences"/> is null.
        /// </exception>
        /// <exception cref="ArgumentException">The k-mer length is not positive.</exception>
        public void Build(IEnumerable<ISequence> sequences)
        {
            if (sequences == null)
            {
                throw new ArgumentNullException("sequences");
            }

            if (this.kmerLength <= 0)
            {
                throw new ArgumentException(Properties.Resource.KmerLengthShouldBePositive);
            }

            BlockingCollection<DeBruijnNode> kmerDataCollection = new BlockingCollection<DeBruijnNode>();

            // Consumer: inserts queued k-mer nodes into the binary search tree.
            Task buildKmers = Task.Factory.StartNew(() =>
            {
                while (!kmerDataCollection.IsCompleted)
                {
                    DeBruijnNode newNode = null;
                    if (kmerDataCollection.TryTake(out newNode, -1))
                    {
                        // First element being added becomes the root of the tree.
                        if (this.root == null)
                        {
                            this.root = newNode;
                            this.NodeCount++;
                            continue;
                        }

                        int result = 0;
                        DeBruijnNode temp = this.root;
                        DeBruijnNode parent = this.root;

                        // Search the tree where the new node should be inserted.
                        while (temp != null)
                        {
                            result = newNode.NodeValue.CompareTo(temp.NodeValue);
                            if (result == 0)
                            {
                                // Duplicate k-mer: bump the capped occurrence count.
                                if (temp.KmerCount <= 255)
                                {
                                    temp.KmerCount++;
                                }

                                // Always stop here. Breaking only when the count was
                                // incremented left the loop spinning forever once the
                                // cap was exceeded, because temp never advanced.
                                break;
                            }

                            parent = temp;
                            temp = result > 0 ? temp.Right : temp.Left;
                        }

                        // Position found: attach the new node unless it was a duplicate.
                        if (result > 0)
                        {
                            parent.Right = newNode;
                            NodeCount++;
                        }
                        else if (result < 0)
                        {
                            parent.Left = newNode;
                            NodeCount++;
                        }
                    }
                }
            });

            try
            {
                IAlphabet alphabet = sequences.First().Alphabet;

                byte[] symbolMap = alphabet.GetSymbolValueMap();
                HashSet<byte> ambiguousSymbols = alphabet.GetAmbiguousSymbols();
                HashSet<byte> gapSymbols;

                alphabet.TryGetGapSymbols(out gapSymbols);

                // Generate the kmers from the sequences.
                foreach (ISequence sequence in sequences)
                {
                    // If more than 2 million k-mers are queued, poll in 5 ms steps
                    // until the consumer has removed some of them. This avoids
                    // an OutOfMemoryException.
                    while (kmerDataCollection.Count > 2000000)
                    {
                        System.Threading.Thread.Sleep(5);
                    }

                    long count = sequence.Count;
                    byte[] convertedSymbols = new byte[count];
                    bool skipSequence = false;

                    for (long index = 0; index < count; index++)
                    {
                        byte converted = symbolMap[sequence[index]];
                        convertedSymbols[index] = converted;
                        if (ambiguousSymbols.Contains(converted) || gapSymbols.Contains(converted))
                        {
                            skipSequence = true;
                            break;
                        }
                    }

                    if (skipSequence)
                    {
                        continue;
                    }

                    Sequence convertedSequence = new Sequence(sequence.Alphabet, convertedSymbols, false);

                    // Generate the kmers from each sequence.
                    for (long i = 0; i <= count - this.kmerLength; ++i)
                    {
                        IKmerData kmerData = this.GetNewKmerData();
                        bool orientation = kmerData.SetKmerData(convertedSequence, i, this.kmerLength);
                        kmerDataCollection.Add(new DeBruijnNode(kmerData, orientation, 1));
                    }
                }
            }
            finally
            {
                // Always signal completion, even when producing throws (e.g. First()
                // on an empty input); otherwise the consumer task would block forever
                // in TryTake.
                kmerDataCollection.CompleteAdding();
            }

            Task.WaitAll(buildKmers);

            kmerDataCollection.Dispose();

            // Generate the links
            this.GenerateLinks();
        }
Пример #5
0
        /// <summary>
        /// Build graph nodes and edges from list of k-mers.
        /// Creates a node for every unique k-mer (and reverse-complement)
        /// in the read. Then, generates adjacency information between nodes
        /// by computing pairs of nodes that have overlapping regions
        /// between node sequences.
        /// </summary>
        /// <remarks>
        /// One task converts sequences to k-mers and queues them; a second task
        /// inserts the queued k-mers into a binary search tree rooted at
        /// <c>root</c>. Sequences that are not DNA or that contain gap symbols
        /// are counted as skipped.
        /// </remarks>
        /// <param name="sequences">List of input sequences.</param>
        /// <exception cref="ArgumentNullException">
        /// <paramref name="sequences"/> is null.
        /// </exception>
        /// <exception cref="ArgumentException">
        /// The k-mer length is not positive or is greater than 32.
        /// </exception>
        public void Build(IEnumerable<ISequence> sequences)
        {
            if (sequences == null)
            {
                throw new ArgumentNullException("sequences");
            }

            if (this.kmerLength <= 0)
            {
                throw new ArgumentException(Properties.Resource.KmerLengthShouldBePositive);
            }

            if (this.kmerLength > 32)
            {
                throw new ArgumentException(Properties.Resource.KmerLengthGreaterThan32);
            }

            BlockingCollection<DeBruijnNode> kmerDataCollection = new BlockingCollection<DeBruijnNode>();

            // Producer: converts each accepted sequence into k-mer nodes.
            Task createKmers = Task.Factory.StartNew(() =>
            {
                try
                {
                    IAlphabet alphabet = Alphabets.DNA;

                    HashSet<byte> gapSymbols;
                    alphabet.TryGetGapSymbols(out gapSymbols);

                    // Generate the kmers from the sequences.
                    foreach (ISequence sequence in sequences)
                    {
                        // Ignore sequences that are not DNA.
                        bool skipSequence = sequence.Alphabet != Alphabets.DNA;

                        // Ignore sequences that contain any gap symbol.
                        if (!skipSequence)
                        {
                            for (long index = 0; index < sequence.Count; ++index)
                            {
                                if (gapSymbols.Contains(sequence[index]))
                                {
                                    skipSequence = true;
                                    break;
                                }
                            }
                        }

                        if (skipSequence)
                        {
                            Interlocked.Increment(ref this.skippedSequencesCount);
                            Interlocked.Increment(ref this.processedSequencesCount);
                            continue;
                        }

                        // If more than 2 million k-mers are queued, poll in 5 ms steps
                        // until the consumer has removed some of them. This avoids
                        // an OutOfMemoryException.
                        while (kmerDataCollection.Count > 2000000)
                        {
                            System.Threading.Thread.Sleep(5);
                        }

                        long count = sequence.Count;

                        // Generate the kmers from each sequence.
                        for (long i = 0; i <= count - this.kmerLength; ++i)
                        {
                            IKmerData kmerData = this.GetNewKmerData();
                            bool orientation = kmerData.SetKmerData(sequence, i, this.kmerLength);
                            kmerDataCollection.Add(new DeBruijnNode(kmerData, orientation, 1));
                        }

                        Interlocked.Increment(ref this.processedSequencesCount);
                    }
                }
                finally
                {
                    // Always signal completion, even when producing throws; otherwise
                    // the consumer task would block forever in TryTake and
                    // Task.WaitAll below would never return.
                    kmerDataCollection.CompleteAdding();
                }
            });

            // Consumer: inserts queued k-mer nodes into the binary search tree.
            Task buildKmers = Task.Factory.StartNew(() =>
            {
                while (!kmerDataCollection.IsCompleted)
                {
                    DeBruijnNode newNode = null;
                    if (kmerDataCollection.TryTake(out newNode, -1))
                    {
                        // First element being added becomes the root of the tree.
                        if (this.root == null)
                        {
                            this.root = newNode;
                            this.NodeCount++;
                            continue;
                        }

                        int result = 0;
                        DeBruijnNode temp = this.root;
                        DeBruijnNode parent = this.root;

                        // Search the tree where the new node should be inserted.
                        while (temp != null)
                        {
                            result = newNode.NodeValue.CompareTo(temp.NodeValue);
                            if (result == 0)
                            {
                                // Duplicate k-mer: bump the capped occurrence count.
                                if (temp.KmerCount <= 255)
                                {
                                    temp.KmerCount++;
                                }

                                // Always stop here. Breaking only when the count was
                                // incremented left the loop spinning forever once the
                                // cap was exceeded, because temp never advanced.
                                break;
                            }

                            parent = temp;
                            temp = result > 0 ? temp.Right : temp.Left;
                        }

                        // Position found: attach the new node unless it was a duplicate.
                        if (result > 0)
                        {
                            parent.Right = newNode;
                            NodeCount++;
                        }
                        else if (result < 0)
                        {
                            parent.Left = newNode;
                            NodeCount++;
                        }
                    }
                }
            });

            Task.WaitAll(createKmers, buildKmers);

            kmerDataCollection.Dispose();
            this.GraphBuildCompleted = true;

            // Generate the links
            this.GenerateLinks();
        }
Пример #6
0
        /// <summary>
        /// Builds the graph nodes and links from the k-mers of the input sequences.
        /// </summary>
        /// <remarks>
        /// A producer task converts sequences to k-mers and queues them in batches;
        /// the batches are consumed in parallel and registered in a
        /// <c>KmerDictionary</c>. Sequences that are not DNA or that contain gap
        /// symbols are counted as skipped.
        /// </remarks>
        /// <param name="sequences">List of input sequences.</param>
        /// <exception cref="ArgumentNullException">
        /// <paramref name="sequences"/> is null.
        /// </exception>
        /// <exception cref="ArgumentException">
        /// The k-mer length exceeds <c>KmerData32.MAX_KMER_LENGTH</c>.
        /// </exception>
        public void Build(IEnumerable<ISequence> sequences)
        {
            // Size of Kmer List to grab, somewhat arbitrary but want to keep list size below large object threshold, which is ~85 kb
            const int blockSize = 4096;

            // When to add list to blocking collection, most short reads are <=151 bp so this should avoid needing to grow the list
            const int addThreshold = blockSize - 151;

            // When to pause adding
            const int stopAddThreshold = 2000000 / blockSize;

            if (sequences == null)
            {
                throw new ArgumentNullException("sequences");
            }

            if (KmerLength > KmerData32.MAX_KMER_LENGTH)
            {
                throw new ArgumentException(Properties.Resource.KmerLengthGreaterThan31);
            }

            // A dictionary of k-mers to de Bruijn nodes.
            KmerDictionary kmerManager = new KmerDictionary();

            // Create the producer thread.
            var kmerDataCollection = new BlockingCollection<List<KmerData32>>();
            Task producer = Task.Factory.StartNew(() =>
            {
                try
                {
                    List<KmerData32> kmerList = new List<KmerData32>(blockSize);

                    IAlphabet alphabet = Alphabets.DNA;
                    HashSet<byte> gapSymbols;
                    alphabet.TryGetGapSymbols(out gapSymbols);

                    // Generate the kmers from the sequences.
                    foreach (ISequence sequence in sequences)
                    {
                        // Ignore sequences that are not DNA.
                        bool skipSequence = sequence.Alphabet != Alphabets.DNA;

                        // Ignore sequences that contain any gap symbol.
                        if (!skipSequence)
                        {
                            for (long index = 0; index < sequence.Count; ++index)
                            {
                                if (gapSymbols.Contains(sequence[index]))
                                {
                                    skipSequence = true;
                                    break;
                                }
                            }
                        }

                        if (skipSequence)
                        {
                            Interlocked.Increment(ref _skippedSequencesCount);
                            Interlocked.Increment(ref _processedSequencesCount);
                            continue;
                        }

                        // If the queued batches exceed roughly 2 million k-mers, wait in
                        // 2 second steps so the consumers can drain some of them.
                        // This avoids an OutOfMemoryException.
                        while (kmerDataCollection.Count > stopAddThreshold)
                        {
                            Task.Delay(TimeSpan.FromSeconds(2)).Wait();
                        }

                        // Convert sequences to k-mers.
                        kmerList.AddRange(KmerData32.GetKmers(sequence, KmerLength));

                        // Most reads are <=150 basepairs, so this should avoid having to grow the list
                        // by keeping it below blockSize.
                        if (kmerList.Count > addThreshold)
                        {
                            kmerDataCollection.Add(kmerList);

                            // Use the same capacity as the first batch (was a stray 4092).
                            kmerList = new List<KmerData32>(blockSize);
                        }

                        Interlocked.Increment(ref _processedSequencesCount);
                    }

                    // Flush the remaining partial batch. The list is replaced right
                    // after every in-loop flush, so only emptiness needs checking;
                    // an empty batch is not worth queueing.
                    if (kmerList.Count > 0)
                    {
                        kmerDataCollection.Add(kmerList);
                    }
                }
                finally
                {
                    kmerDataCollection.CompleteAdding();
                }
            });

            // Consume k-mers by adding them to the k-mer dictionary as nodes.
            Parallel.ForEach(kmerDataCollection.GetConsumingEnumerable(), newKmerList =>
            {
                foreach (KmerData32 newKmer in newKmerList)
                {
                    // Create Vertex
                    DeBruijnNode node = kmerManager.SetNewOrGetOld(newKmer);

                    // Cheap unlocked pre-check skips the lock once the count is saturated.
                    if (node.KmerCount <= 255)
                    {
                        lock (node)
                        {
                            // Re-check under the lock so concurrent increments on the
                            // same node cannot push the count past the cap.
                            if (node.KmerCount <= 255)
                            {
                                node.KmerCount++;
                            }
                        }
                    }
                }
            });

            // Ensure producer exceptions are handled.
            producer.Wait();

            // Done filling the k-mer dictionary.
            kmerDataCollection.Dispose();

            // NOTE: To speed enumeration make the nodes into an array and dispose of the collection
            _nodeCount = kmerManager.NodeCount;
            _nodes = kmerManager.GenerateNodeArray();

            // Generate the links
            GenerateLinks(kmerManager);

            // Since we no longer need to search for values set left and right nodes of child array to null
            // so that they are available for GC if no longer needed
            foreach (DeBruijnNode node in _nodes)
            {
                node.Left = node.Right = null;
            }

            GraphBuildCompleted = true;
        }
Пример #7
0
        /// <summary>
        /// Build graph nodes and edges from list of k-mers.
        /// Creates a node for every unique k-mer (and reverse-complement)
        /// in the read. Then, generates adjacency information between nodes
        /// by computing pairs of nodes that have overlapping regions
        /// between node sequences.
        /// </summary>
        /// <remarks>
        /// A background task converts sequences to k-mers and queues them; the
        /// calling thread inserts the queued k-mers into a binary search tree
        /// rooted at <c>Root</c>. Sequences that are not DNA or that contain gap
        /// symbols are counted as skipped. A failure in the background task is
        /// rethrown to the caller (wrapped in an <see cref="AggregateException"/>)
        /// once the queue has been drained.
        /// </remarks>
        /// <param name="sequences">List of input sequences.</param>
        /// <exception cref="ArgumentNullException">
        /// <paramref name="sequences"/> is null.
        /// </exception>
        /// <exception cref="ArgumentException">
        /// The k-mer length is not positive or is greater than <c>MaxKmerLength</c>.
        /// </exception>
        public void Build(IEnumerable<ISequence> sequences)
        {
            if (sequences == null)
            {
                throw new ArgumentNullException("sequences");
            }

            if (KmerLength <= 0)
            {
                throw new ArgumentException("KmerLengthShouldBePositive");
            }

            if (KmerLength > MaxKmerLength)
            {
                throw new ArgumentException("KmerLengthGreaterThan32");
            }

            var kmerDataCollection = new BlockingCollection<DeBruijnNode>();

            // Producer: keep the task so its outcome can be observed below. Previously
            // it was fire-and-forget, so a failure silently produced a partial graph.
            Task producer = Task.Factory.StartNew(() =>
            {
                try
                {
                    IAlphabet alphabet = Alphabets.DNA;

                    HashSet<byte> gapSymbols;
                    alphabet.TryGetGapSymbols(out gapSymbols);

                    // Generate the kmers from the sequences.
                    foreach (ISequence sequence in sequences)
                    {
                        // Ignore sequences that are not DNA.
                        bool skipSequence = sequence.Alphabet != Alphabets.DNA;

                        // Ignore sequences that contain any gap symbol.
                        if (!skipSequence)
                        {
                            for (long index = 0; index < sequence.Count; ++index)
                            {
                                if (gapSymbols.Contains(sequence[index]))
                                {
                                    skipSequence = true;
                                    break;
                                }
                            }
                        }

                        if (skipSequence)
                        {
                            Interlocked.Increment(ref _skippedSequencesCount);
                            Interlocked.Increment(ref _processedSequencesCount);
                            continue;
                        }

                        // If more queued k-mers than StopAddThreshold, poll in 5 ms steps
                        // until the consumer has removed some of them. This avoids
                        // an OutOfMemoryException.
                        while (kmerDataCollection.Count > StopAddThreshold)
                        {
                            Thread.Sleep(5);
                        }

                        // Generate the kmers from each sequence.
                        long count = sequence.Count;
                        for (long i = 0; i <= count - KmerLength; ++i)
                        {
                            var kmerData = new KmerData32();
                            bool orientation = kmerData.SetKmerData(sequence, i, KmerLength);
                            kmerDataCollection.Add(new DeBruijnNode(kmerData, orientation, 1));
                        }

                        Interlocked.Increment(ref _processedSequencesCount);
                    }
                }
                finally
                {
                    kmerDataCollection.CompleteAdding();
                }
            });

            try
            {
                // The main thread processes all the data - this loops until the above
                // task completes adding the kmers.
                foreach (var newNode in kmerDataCollection.GetConsumingEnumerable())
                {
                    // First element being added becomes the root of the tree.
                    if (Root == null)
                    {
                        Root = newNode;
                        NodeCount++;
                        continue;
                    }

                    int result = 0;
                    DeBruijnNode temp = Root;
                    DeBruijnNode parent = Root;

                    // Search the tree where the new node should be inserted.
                    while (temp != null)
                    {
                        result = newNode.NodeValue.CompareTo(temp.NodeValue);
                        if (result == 0)
                        {
                            // Duplicate k-mer: bump the capped occurrence count.
                            if (temp.KmerCount <= 255)
                            {
                                temp.KmerCount++;
                            }

                            // Always stop here. Breaking only when the count was
                            // incremented left the loop spinning forever once the
                            // cap was exceeded, because temp never advanced.
                            break;
                        }

                        parent = temp;
                        temp = result > 0 ? temp.Right : temp.Left;
                    }

                    // Position found: attach the new node unless it was a duplicate.
                    if (result > 0)
                    {
                        parent.Right = newNode;
                        NodeCount++;
                    }
                    else if (result < 0)
                    {
                        parent.Left = newNode;
                        NodeCount++;
                    }
                }

                // Surface any producer failure instead of continuing with a partial graph.
                producer.Wait();
            }
            finally
            {
                // Done adding - we can throw away the kmer collection.
                kmerDataCollection.Dispose();
            }

            this.GraphBuildCompleted = true;

            // Generate the links
            this.GenerateLinks();
        }