Beispiel #1
0
        /// <summary>
        /// Looks up the node stored for a given k-mer.
        /// </summary>
        /// <param name="kmer">The k-mer to look up.</param>
        /// <returns>
        /// Whatever <c>SearchTree</c> yields for <paramref name="kmer"/> in the bucket the k-mer is assigned to.
        /// </returns>
        public DeBruijnNode TryGetOld(KmerData32 kmer)
        {
            // The k-mer selects a bucket; only that bucket's tree is searched.
            var bucketTree = buckets[assignBucket(kmer)];
            return bucketTree.SearchTree(kmer);
        }
        /// <summary>
        /// Looks up the node stored for a given k-mer.
        /// </summary>
        /// <param name="kmer">The k-mer to look up.</param>
        /// <returns>
        /// The result of <c>SearchTree</c> on the bucket that <paramref name="kmer"/> is assigned to.
        /// </returns>
        public DeBruijnNode TryGetOld(KmerData32 kmer)
        {
            // Pick the bucket for this k-mer and delegate the search to its tree.
            return _buckets[AssignBucket(kmer)].SearchTree(kmer);
        }
Beispiel #3
0
        /// <summary>
        /// Records a reference-genome position on every de Bruijn node whose k-mer
        /// occurs at exactly one position in the reference genome.
        /// </summary>
        /// <remarks>
        /// K-mers that occur at several reference positions are skipped and counted.
        /// Unique reference k-mers for which the lookup returns no node are collected as
        /// "missing". Summary statistics are reported through <c>RaiseMessage</c> and
        /// <c>PercentNodesPainted</c> is updated.
        /// </remarks>
        /// <exception cref="InvalidOperationException">
        /// A unique reference position is negative or too large for the 16-bit narrowing
        /// applied before it is stored on the node.
        /// </exception>
        /// <exception cref="InvalidProgramException">
        /// More than 95% of the counted k-mers were skipped for occurring at multiple positions.
        /// </exception>
        protected void PaintKmersWithReference()
        {
            List<int> missingLocs = new List<int>();
            var refKmerPositions = SequenceToKmerBuilder.BuildKmerDictionary(ReferenceGenome.ReferenceSequence, this.KmerLength);
            int kmersPainted = 0;
            int kmersSkipped = 0;
            DeBruijnGraph graph = this.Graph;
            long totalNodes = graph.NodeCount;

            foreach (var entry in refKmerPositions)
            {
                ISequence seq = entry.Key;
                IList<long> locations = entry.Value;
                if (locations.Count != 1)
                {
                    // Ambiguous k-mer: it does not identify a single reference position.
                    kmersSkipped += locations.Count;
                    continue;
                }

                long position = locations[0];
                var kmerData = new KmerData32();
                kmerData.SetKmerData(seq, 0, this.KmerLength);

                // Lookup with makeNewIfNotFound = false.
                DeBruijnNode matchingNode = graph.KmerManager.SetNewOrGetOld(kmerData, false);
                if (matchingNode == null)
                {
                    missingLocs.Add((int)position);
                    continue;
                }

                // Validate BEFORE narrowing: an unchecked (short) cast wraps silently, and a
                // wrapped value can come out positive, so testing the result for "< 0"
                // afterwards does not catch every overflow.
                if (position < 0 || position > short.MaxValue)
                {
                    throw new InvalidOperationException(
                        "Reference position " + position.ToString() + " cannot be stored as a 16-bit reference genome position.");
                }

                matchingNode.ReferenceGenomePosition = (short)position;
                kmersPainted++;
            }

            // Diagnostic dump of the missing positions. Deliberately disabled by the constant
            // 'false'; the writer is scoped with 'using' so re-enabling it cannot leak the file handle.
            if (false && OutputDiagnosticInformation)
            {
                using (StreamWriter sw = new StreamWriter("OutMissing.csv"))
                {
                    foreach (int missing in missingLocs)
                    {
                        sw.WriteLine(missing.ToString());
                    }
                }
            }

            double percentKmersSkipped = 100.0 * kmersSkipped / (double)(kmersPainted + kmersSkipped);

            if (percentKmersSkipped > 95.0)
            {
                throw new InvalidProgramException("Reference Genome Skipped over 95% of Kmers");
            }

            // Fraction (0..1) of distinct reference k-mers that were painted; scaled to percent for the message.
            double fractionHit = kmersPainted / (double)refKmerPositions.Count;

            RaiseMessage("A total of " + (100.0 * fractionHit).ToString() + "% nodes in the reference were painted");
            PercentNodesPainted = 100.0 * kmersPainted / (double)totalNodes;
            RaiseMessage(PercentNodesPainted.ToString("n2") + " % of nodes painted, for a total of " + kmersPainted.ToString() + " painted.");
            RaiseMessage(percentKmersSkipped.ToString("n2") + " % of Kmers were skipped for being in multiple locations");
        }
Beispiel #4
0
        /// <summary>
        /// Links each node of the graph to its neighbours. For every node the four
        /// candidate k-mers that extend it to the right and the four that extend it to
        /// the left are looked up, and each one found is registered as an extension.
        /// </summary>
        /// <param name="kmerManager">Dictionary used to look up the candidate neighbour k-mers.</param>
        private void GenerateLinks(KmerDictionary kmerManager)
        {
            // Each nucleotide takes two bits, so this is the bit offset of the left-most one.
            int leadingBaseShift = 2 * (KmerLength - 1);

            // All ones except the two bits of the left-most nucleotide.
            ulong dropLeadingBase = ~(3UL << leadingBaseShift);

            Parallel.ForEach(_nodes, node =>
            {
                // Scratch k-mer, overwritten for every candidate.
                KmerData32 probe = new KmerData32();
                DeBruijnNode neighbour = null;

                // Right extensions: clear the first nucleotide, then shift left by one
                // nucleotide so the low two bits are free for the appended base.
                ulong rightStem = (node.NodeValue.KmerData & dropLeadingBase) << 2;
                for (ulong baseCode = 0; baseCode < 4; baseCode++)
                {
                    // Stem followed by the base encoded as 0-3.
                    ulong candidate = rightStem | baseCode;

                    // The flag returned by SetKmerData is forwarded unchanged to SetExtensionNode.
                    bool storedAsReverseComplement = probe.SetKmerData(candidate, KmerLength);
                    neighbour = kmerManager.TryGetOld(probe);
                    if (neighbour != null)
                    {
                        node.SetExtensionNode(true, storedAsReverseComplement, neighbour);
                    }
                }

                // Left extensions: drop the right-most nucleotide and place each base in
                // the left-most position instead.
                ulong leftStem = node.NodeValue.KmerData >> 2;
                for (ulong baseCode = 0; baseCode < 4; baseCode++)
                {
                    ulong candidate = (baseCode << leadingBaseShift) | leftStem;
                    bool storedAsReverseComplement = probe.SetKmerData(candidate, KmerLength);
                    neighbour = kmerManager.TryGetOld(probe);
                    if (neighbour != null)
                    {
                        node.SetExtensionNode(false, storedAsReverseComplement, neighbour);
                    }
                }
            });

            LinkGenerationCompleted = true;
        }
        /// <summary>
        /// Returns the de Bruijn node for a k-mer by calling <c>AddOrReturnCurrent</c>
        /// on the tree of the bucket the k-mer is assigned to.
        /// Parallel note: the bucket's tree is locked for the duration of that call.
        /// </summary>
        /// <param name="value">The k-mer to find or add.</param>
        /// <returns>The node returned by the bucket's tree for this value.</returns>
        public DeBruijnNode SetNewOrGetOld(KmerData32 value)
        {
            BinaryTreeOfDebrujinNodes bucketTree = _buckets[AssignBucket(value)];

            // One lock per bucket: additions to the same tree are serialised.
            lock (bucketTree)
            {
                return bucketTree.AddOrReturnCurrent(value);
            }
        }
Beispiel #6
0
        /// <summary>
        /// Returns the de Bruijn node for a k-mer by calling <c>AddOrReturnCurrent</c>
        /// on the tree of the bucket the k-mer is assigned to.
        ///
        /// Parallel note: the bucket's tree is locked for the duration of that call.
        /// </summary>
        /// <param name="value">The k-mer to find or add.</param>
        /// <param name="makeNewIfNotFound">Forwarded unchanged to <c>AddOrReturnCurrent</c>.</param>
        /// <returns>The node returned by the bucket's tree for this value.</returns>
        public DeBruijnNode SetNewOrGetOld(KmerData32 value, bool makeNewIfNotFound = true)
        {
            BinaryTreeOfDebruijnNodes bucketTree = buckets[assignBucket(value)];

            // One lock per bucket: calls that touch the same tree are serialised.
            lock (bucketTree)
            {
                return bucketTree.AddOrReturnCurrent(value, makeNewIfNotFound);
            }
        }
Beispiel #7
0
        /// <summary>
        /// Finds the node whose k-mer data equals that of <paramref name="value"/>, or
        /// inserts a new node for it at the position where the search ended.
        /// Useful when two values that are equal by comparison are not equal by reference.
        /// </summary>
        /// <param name="value">K-mer to find or add.</param>
        /// <returns>The node already in the tree for this k-mer, or the newly created node.</returns>
        public DeBruijnNode AddOrReturnCurrent(KmerData32 value)
        {
            // Empty tree: the new node becomes the root.
            if (_root == null)
            {
                DeBruijnNode first = MakeNewNode(value);
                _root = first;
                return first;
            }

            ulong wantedKey = value.KmerData;
            DeBruijnNode cursor = _root;
            for (;;)
            {
                ulong cursorKey = cursor.NodeValue.KmerData;
                if (cursorKey == wantedKey)
                {
                    // Key already present.
                    return cursor;
                }

                // Smaller keys live in the left subtree, larger ones in the right.
                bool descendLeft = wantedKey < cursorKey;
                DeBruijnNode child = descendLeft ? cursor.Left : cursor.Right;
                if (child != null)
                {
                    cursor = child;
                    continue;
                }

                // Reached a free slot: attach the new node here.
                DeBruijnNode created = MakeNewNode(value);
                if (descendLeft)
                {
                    cursor.Left = created;
                }
                else
                {
                    cursor.Right = created;
                }
                return created;
            }
        }
Beispiel #8
0
        /// <summary>
        /// Searches the tree for the node whose value compares equal to a k-mer.
        /// </summary>
        /// <param name="kmerValue">The k-mer to search for.</param>
        /// <returns>The matching node, or null when the search runs off the tree.</returns>
        public DeBruijnNode SearchTree(KmerData32 kmerValue)
        {
            DeBruijnNode current = Root;

            while (current != null)
            {
                int ordering = kmerValue.CompareTo(current.NodeValue);
                if (ordering == 0)
                {
                    // Found.
                    return current;
                }

                // Smaller values are on the left, larger ones on the right.
                if (ordering < 0)
                {
                    current = current.Left;
                }
                else
                {
                    current = current.Right;
                }
            }

            return null;
        }
Beispiel #9
0
        /// <summary>
        /// Searches the tree for the node whose k-mer data equals that of a given k-mer.
        /// </summary>
        /// <param name="kmerValue">The k-mer to search for.</param>
        /// <returns>The matching node, or null when the search runs off the tree.</returns>
        public DeBruijnNode SearchTree(KmerData32 kmerValue)
        {
            DeBruijnNode current = _root;

            while (current != null)
            {
                ulong nodeKey = current.NodeValue.KmerData;
                ulong wantedKey = kmerValue.KmerData;

                if (nodeKey == wantedKey)
                {
                    // Found.
                    return current;
                }

                // Smaller keys are on the left, larger ones on the right.
                if (wantedKey < nodeKey)
                {
                    current = current.Left;
                }
                else
                {
                    current = current.Right;
                }
            }

            return null;
        }
Beispiel #10
0
        /// <summary>
        /// Searches the tree for the node whose k-mer data equals that of a given k-mer.
        /// </summary>
        /// <param name="kmerValue">The k-mer to search for.</param>
        /// <returns>The matching node, or null when the search runs off the tree.</returns>
        public DeBruijnNode SearchTree(KmerData32 kmerValue)
        {
            DeBruijnNode cursor = this.root;

            // Descend until the keys match or there is no child left to visit.
            while (cursor != null && cursor.NodeValue.KmerData != kmerValue.KmerData)
            {
                // Smaller keys are on the left, larger ones on the right.
                cursor = kmerValue.KmerData < cursor.NodeValue.KmerData
                    ? cursor.Left
                    : cursor.Right;
            }

            return cursor;
        }
Beispiel #11
0
        /// <summary>
        /// Marks and removes "pathological" nodes: nodes whose k-mer data equals the
        /// first k-mer generated from a run of 'A' or from a run of 'G', and nodes that
        /// report a self reference. Extension tables of all nodes are updated before
        /// the marked nodes are removed from the graph.
        /// </summary>
        /// <param name="graph">De Bruijn Graph.</param>
        /// <returns>The number of nodes that were marked for deletion.</returns>
        public static int RemovePathologicalNodes(DeBruijnGraph graph)
        {
            DeBruijnGraph.ValidateGraph(graph);

            var kmerLength = graph.KmerLength;

            // Encoded k-mer obtained from a sequence consisting only of 'A'.
            byte[] runOfA = Enumerable.Repeat((byte)'A', kmerLength).ToArray();
            var runOfASequence = new Bio.Sequence(Bio.Alphabets.DNA, runOfA, false);
            var runOfAKey = KmerData32.GetKmers(runOfASequence, kmerLength).First().KmerData;

            // Encoded k-mer obtained from a sequence consisting only of 'G'.
            byte[] runOfG = Enumerable.Repeat((byte)'G', kmerLength).ToArray();
            var runOfGSequence = new Bio.Sequence(Bio.Alphabets.DNA, runOfG, false);
            var runOfGKey = KmerData32.GetKmers(runOfGSequence, kmerLength).First().KmerData;

            int markedCount = 0;
            foreach (var candidate in graph.GetNodes())
            {
                bool pathological = candidate.NodeValue.KmerData == runOfAKey
                                    || candidate.NodeValue.KmerData == runOfGKey
                                    || candidate.ContainsSelfReference;
                if (!pathological)
                {
                    continue;
                }

                candidate.MarkNodeForDelete();
                markedCount++;
            }

            // Let every node drop its links to the nodes that were just marked.
            foreach (var survivor in graph.GetNodes())
            {
                survivor.RemoveMarkedExtensions();
            }

            // Nothing that is kept links to the marked nodes any more, so they can be
            // removed without further changes to the graph structure.
            graph.RemoveMarkedNodes();
            return markedCount;
        }
Beispiel #12
0
 /// <summary>
 /// Creates a new DeBruijnNode for a k-mer and counts it in <c>Count</c>.
 /// </summary>
 /// <param name="value">Kmer to make node with</param>
 /// <returns>The newly created node (constructed with 0 as its second argument).</returns>
 private DeBruijnNode makeNewNode(KmerData32 value)
 {
     // Count first, then construct, so the order of effects matches the contract callers rely on.
     Count += 1;
     DeBruijnNode created = new DeBruijnNode(value, 0);
     return created;
 }
 /// <summary>
 /// Maps a k-mer to a bucket by masking its encoded value with the hashing mask.
 /// </summary>
 /// <param name="value">kmer value</param>
 /// <returns>bucket index</returns>
 private int AssignBucket(KmerData32 value)
 {
     // Only the bits selected by the mask take part in choosing the bucket.
     var maskedBits = value.KmerData & _hashingMask;
     return (int)maskedBits;
 }
        /// <summary>
        /// Build graph nodes and edges from a list of sequences.
        /// A producer task converts the sequences to k-mers and queues them in blocks;
        /// the calling thread consumes the blocks in parallel and creates a node for
        /// every unique k-mer. Afterwards the links between the nodes are generated.
        /// </summary>
        /// <param name="sequences">
        /// List of input sequences. A sequence whose alphabet is not <c>Alphabets.NoGapDNA</c>,
        /// or that is shorter than the k-mer length, is skipped and counted as skipped.
        /// </param>
        /// <param name="destroyKmerManagerAfterwards">
        /// MT Assembler specific flag. When true the k-mer dictionary is released after
        /// link generation and the Left/Right tree links of all nodes are cleared; when
        /// false the dictionary is kept in <c>KmerManager</c>.
        /// </param>
        /// <exception cref="ArgumentNullException"><paramref name="sequences"/> is null.</exception>
        public void Build(IEnumerable<ISequence> sequences, bool destroyKmerManagerAfterwards = true)
        {
            if (sequences == null)
            {
                throw new ArgumentNullException("sequences");
            }

            // Build the dictionary of kmers to debruijin nodes
            var kmerManager        = new KmerDictionary();
            var kmerDataCollection = new BlockingCollection<List<KmerData32>>();

            // Create the producer task
            Task theProducer = Task.Factory.StartNew(() =>
            {
                // Paired with the single EndCriticalRegion call in the finally block below.
                Thread.BeginCriticalRegion();
                try
                {
#if DEBUG
                    int parsedReads = 0;
#endif
                    var kmerList = new List<KmerData32>(BlockSize);

                    // Generate the kmers from the sequences
                    foreach (ISequence sequence in sequences)
                    {
#if DEBUG
                        parsedReads++;
                        if (parsedReads % 50000 == 0)
                        {
                            //TODO: This is reported each 5 minutes anyway.
                            Console.WriteLine("Parsed: " + parsedReads.ToString() + " reads");
                        }
#endif
                        // Ignore sequences that are not NoGapDNA or are too short to contain a k-mer.
                        if (sequence.Alphabet != Alphabets.NoGapDNA || sequence.Count < _kmerLength)
                        {
                            Interlocked.Increment(ref this._skippedSequencesCount);
                            Interlocked.Increment(ref this._processedSequencesCount);
                            continue;
                        }

                        // Back-pressure: while more than StopAddThreshold blocks are queued,
                        // poll in 2 ms steps so the consumers can catch up. This keeps the
                        // queue from growing without bound.
                        while (kmerDataCollection.Count > StopAddThreshold)
                        {
                            Thread.Sleep(2);
                        }

                        // Convert sequences to k-mers
                        var kmers = KmerData32.GetKmers(sequence, this.KmerLength);
                        kmerList.AddRange(kmers);

                        // Hand the block over once it passes AddThreshold and start a fresh
                        // one, which should avoid having to grow the list beyond BlockSize.
                        if (kmerList.Count > AddThreshold)
                        {
                            kmerDataCollection.Add(kmerList);
                            kmerList = new List<KmerData32>(BlockSize);
                        }

                        Interlocked.Increment(ref this._processedSequencesCount);
                    }

                    // Flush the last, partially filled block (nothing to flush if it is empty).
                    if (kmerList.Count > 0)
                    {
                        kmerDataCollection.Add(kmerList);
                    }
                }
                finally
                {
                    // Always release the consumers, even if producing failed.
                    kmerDataCollection.CompleteAdding();
                    Thread.EndCriticalRegion();
                }
            });

            // Consume k-mers by adding them to binary tree structure as nodes
            Parallel.ForEach(
                kmerDataCollection.GetConsumingEnumerable(),
                new ParallelOptions()
                {
                    MaxDegreeOfParallelism = Environment.ProcessorCount
                },
                newKmerList =>
                {
                    foreach (KmerData32 newKmer in newKmerList)
                    {
                        // Create Vertex
                        DeBruijnNode node = kmerManager.SetNewOrGetOld(newKmer);
                        Debug.Assert(newKmer.KmerData == node.NodeValue.KmerData);
                    }
                });

            // Done filling binary tree
            theProducer.Wait(); // Make sure task is finished - also rethrows any exception here.
            kmerDataCollection.Dispose();

            // NOTE: To speed enumeration make the nodes into an array and dispose of the collection
            this._nodeCount = kmerManager.NodeCount;
            this._nodes     = kmerManager.GenerateNodeArray();

            // Generate the links
            this.GenerateLinks(kmerManager);

            if (destroyKmerManagerAfterwards)
            {
                // Since we no longer need to search for values delete tree structure, also set left and right nodes of child array to null
                // So that they are available for GC if no longer needed
                kmerManager = null;
                foreach (DeBruijnNode node in _nodes)
                {
                    node.Left  = null;
                    node.Right = null;
                }
            }
            else
            {
                KmerManager = kmerManager;
            }

            this.GraphBuildCompleted = true;
        }
Beispiel #15
0
 /// <summary>
 /// Maps a k-mer to a bucket by masking its encoded value with the hashing mask.
 /// </summary>
 /// <param name="value">kmer value</param>
 /// <returns>bucket index</returns>
 private int assignBucket(KmerData32 value)
 {
     // Kept as a separate method for readability; the expectation is that the JIT inlines it.
     var maskedBits = value.KmerData & hashingMask;
     return (int)maskedBits;
 }
Beispiel #16
0
        /// <summary>
        /// Adds the links between the nodes of the graph. For every node, each
        /// candidate neighbouring k-mer is built as a string and searched for in the
        /// tree; if it is absent, the corresponding reverse-complement string is
        /// searched instead. Every hit is registered as an extension of the node.
        /// </summary>
        private void GenerateLinks()
        {
            Parallel.ForEach(GetNodes(), node =>
            {
                // Scratch k-mer, overwritten for every candidate.
                KmerData32 probe = new KmerData32();

                // Which decoded string plays the role of the k-mer and which the role of
                // its reverse complement depends on the orientation flag of the node.
                bool storedForward    = node.NodeDataOrientation;
                string decoded        = Encoding.Default.GetString(node.NodeValue.GetKmerData(KmerLength));
                string decodedReverse = Encoding.Default.GetString(node.NodeValue.GetReverseComplementOfKmerData(KmerLength));
                string kmerString     = storedForward ? decoded : decodedReverse;
                string kmerStringRc   = storedForward ? decodedReverse : decoded;

                // Right extensions: k-mer without its first symbol, plus one appended symbol.
                string rightStem   = kmerString.Substring(1);
                string rightStemRc = kmerStringRc.Substring(0, KmerLength - 1);
                for (int symbol = 0; symbol < _dnaSymbols.Length; symbol++)
                {
                    probe.SetKmerData(Encoding.Default.GetBytes(rightStem + _dnaSymbols[symbol]), KmerLength);
                    DeBruijnNode neighbour   = SearchTree(probe);
                    bool viaReverseComplement = false;

                    if (neighbour == null)
                    {
                        // Not present as written; try the reverse-complement form.
                        probe.SetKmerData(Encoding.Default.GetBytes(_dnaSymbolsComplement[symbol] + rightStemRc), KmerLength);
                        neighbour            = SearchTree(probe);
                        viaReverseComplement = true;
                    }

                    if (neighbour != null)
                    {
                        // The orientation flag is inverted when the hit came from the reverse complement.
                        node.SetExtensionNode(true, neighbour.NodeDataOrientation != viaReverseComplement, neighbour);
                    }
                }

                // Left extensions: k-mer without its last symbol, plus one prepended symbol.
                string leftStem   = kmerString.Substring(0, KmerLength - 1);
                string leftStemRc = kmerStringRc.Substring(1);
                for (int symbol = 0; symbol < _dnaSymbols.Length; symbol++)
                {
                    probe.SetKmerData(Encoding.Default.GetBytes(_dnaSymbols[symbol] + leftStem), KmerLength);
                    DeBruijnNode neighbour   = SearchTree(probe);
                    bool viaReverseComplement = false;

                    if (neighbour == null)
                    {
                        // Not present as written; try the reverse-complement form.
                        probe.SetKmerData(Encoding.Default.GetBytes(leftStemRc + _dnaSymbolsComplement[symbol]), KmerLength);
                        neighbour            = SearchTree(probe);
                        viaReverseComplement = true;
                    }

                    if (neighbour != null)
                    {
                        node.SetExtensionNode(false, neighbour.NodeDataOrientation != viaReverseComplement, neighbour);
                    }
                }
            });

            LinkGenerationCompleted = true;
        }
Beispiel #17
0
        /// <summary>
        /// Finds the node whose k-mer data equals that of <paramref name="value"/> and,
        /// when it is absent and <paramref name="makeNewIfNotFound"/> is true, inserts a
        /// new node for it. Useful when two values that are equal by comparison are not
        /// equal by reference.
        /// </summary>
        /// <remarks>
        /// Every call that returns a node also increments that node's <c>KmerCount</c>,
        /// unless the count has already reached <c>UInt32.MaxValue</c>. This applies to
        /// nodes that were found as well as to nodes that were just created.
        /// </remarks>
        /// <param name="value">Value to find or add.</param>
        /// <param name="makeNewIfNotFound">
        /// When false the tree is never modified structurally: a missing k-mer yields
        /// null instead of a new node, including when the tree is still empty.
        /// </param>
        /// <returns>The node added or found, or null when nothing was found and no node was to be created.</returns>
        public DeBruijnNode AddOrReturnCurrent(KmerData32 value, bool makeNewIfNotFound = true)
        {
            DeBruijnNode toReturn = null;

            if (this.root == null)
            {
                // Empty tree. Honour makeNewIfNotFound here as well, so a pure lookup
                // cannot create a node as a side effect.
                if (makeNewIfNotFound)
                {
                    toReturn  = makeNewNode(value);
                    this.root = toReturn;
                }
            }
            else
            {
                ulong        newKey = value.KmerData;
                DeBruijnNode node   = this.root;
                while (true)
                {
                    ulong currentKey = node.NodeValue.KmerData;
                    if (currentKey == newKey)
                    {
                        // key already exists.
                        toReturn = node;
                        break;
                    }
                    else if (newKey < currentKey)
                    {
                        // go to left.
                        if (node.Left == null)
                        {
                            if (makeNewIfNotFound)
                            {
                                toReturn  = makeNewNode(value);
                                node.Left = toReturn;
                            }
                            break;
                        }
                        else
                        {
                            node = node.Left;
                        }
                    }
                    else
                    {
                        // go to right.
                        if (node.Right == null)
                        {
                            if (makeNewIfNotFound)
                            {
                                toReturn   = makeNewNode(value);
                                node.Right = toReturn;
                            }
                            break;
                        }
                        else
                        {
                            node = node.Right;
                        }
                    }
                }
            }

            // Saturating increment: stop counting at UInt32.MaxValue instead of wrapping.
            if (toReturn != null && toReturn.KmerCount < UInt32.MaxValue)
            {
                toReturn.KmerCount++;
            }
            return(toReturn);
        }
Beispiel #18
0
        public void Build(IEnumerable <ISequence> sequences)
        {
            // Size of Kmer List to grab, somewhat arbitrary but want to keep list size below large object threshold, which is ~85 kb
            const int blockSize = 4096;

            // When to add list to blocking collection, most short reads are <=151 bp so this should avoid needing to grow the list
            const int addThreshold = blockSize - 151;

            // When to pause adding
            const int stopAddThreshold = 2000000 / blockSize;

            if (sequences == null)
            {
                throw new ArgumentNullException("sequences");
            }

            if (KmerLength > KmerData32.MAX_KMER_LENGTH)
            {
                throw new ArgumentException(Properties.Resource.KmerLengthGreaterThan31);
            }

            // A dictionary kmers to debruijin nodes
            KmerDictionary kmerManager = new KmerDictionary();

            // Create the producer thread.
            var  kmerDataCollection = new BlockingCollection <List <KmerData32> >();
            Task producer           = Task.Factory.StartNew(() =>
            {
                try
                {
                    List <KmerData32> kmerList = new List <KmerData32>(blockSize);

                    IAlphabet alphabet = Alphabets.DNA;
                    HashSet <byte> gapSymbols;
                    alphabet.TryGetGapSymbols(out gapSymbols);

                    // Generate the kmers from the sequences
                    foreach (ISequence sequence in sequences)
                    {
                        // if the sequence alphabet is not of type DNA then ignore it.
                        bool skipSequence = false;
                        if (sequence.Alphabet != Alphabets.DNA)
                        {
                            skipSequence = true;
                        }
                        else
                        {
                            // if the sequence contains any gap symbols then ignore the sequence.
                            foreach (byte symbol in gapSymbols)
                            {
                                for (long index = 0; index < sequence.Count; ++index)
                                {
                                    if (sequence[index] == symbol)
                                    {
                                        skipSequence = true;
                                        break;
                                    }
                                }

                                if (skipSequence)
                                {
                                    break;
                                }
                            }
                        }

                        if (skipSequence)
                        {
                            Interlocked.Increment(ref _skippedSequencesCount);
                            Interlocked.Increment(ref _processedSequencesCount);
                            continue;
                        }

                        // if the blocking collection count is exceeding 2 million kmers wait for 2 sec
                        // so that the task can remove some kmers and create the nodes.
                        // This will avoid OutofMemoryException
                        while (kmerDataCollection.Count > stopAddThreshold)
                        {
                            Task.Delay(TimeSpan.FromSeconds(2)).Wait();
                        }

                        // Convert sequences to k-mers
                        kmerList.AddRange(KmerData32.GetKmers(sequence, KmerLength));

                        // Most reads are <=150 basepairs, so this should avoid having to grow the list
                        // by keeping it below blockSize
                        if (kmerList.Count > addThreshold)
                        {
                            kmerDataCollection.Add(kmerList);
                            kmerList = new List <KmerData32>(4092);
                        }
                        Interlocked.Increment(ref _processedSequencesCount);
                    }

                    if (kmerList.Count <= addThreshold)
                    {
                        kmerDataCollection.Add(kmerList);
                    }
                }
                finally
                {
                    kmerDataCollection.CompleteAdding();
                }
            });

            // Consume k-mers by adding them to binary tree structure as nodes
            Parallel.ForEach(kmerDataCollection.GetConsumingEnumerable(), newKmerList =>
            {
                foreach (KmerData32 newKmer in newKmerList)
                {
                    // Create Vertex
                    DeBruijnNode node = kmerManager.SetNewOrGetOld(newKmer);

                    // Need to lock node if doing this in parallel
                    if (node.KmerCount <= 255)
                    {
                        lock (node)
                        {
                            node.KmerCount++;
                        }
                    }
                }
            });

            // Ensure producer exceptions are handled.
            producer.Wait();

            // Done filling binary tree
            kmerDataCollection.Dispose();

            //NOTE: To speed enumeration make the nodes into an array and dispose of the collection
            _nodeCount = kmerManager.NodeCount;
            _nodes     = kmerManager.GenerateNodeArray();

            // Generate the links
            GenerateLinks(kmerManager);

            // Since we no longer need to search for values set left and right nodes of child array to null
            // so that they are available for GC if no longer needed
            foreach (DeBruijnNode node in _nodes)
            {
                node.Left = node.Right = null;
            }

            GraphBuildCompleted = true;
        }
Beispiel #19
0
 /// <summary>
 /// Creates a DeBruijnNode from k-mer data and a starting count.
 /// </summary>
 /// <param name="value">K-mer data assigned to <c>NodeValue</c>.</param>
 /// <param name="count">Starting value assigned to <c>KmerCount</c>.</param>
 public DeBruijnNode(KmerData32 value, byte count)
 {
     // Store the k-mer first, then its occurrence count.
     NodeValue = value;
     KmerCount = count;
 }
Beispiel #20
0
        /// <summary>
        /// Build graph nodes and edges from list of k-mers.
        /// A background task walks the input sequences, skips any sequence that is
        /// not DNA or that contains a gap symbol, and emits one node per k-mer
        /// position. The calling thread inserts those nodes into the binary search
        /// tree rooted at <c>Root</c>, incrementing <c>KmerCount</c> on the existing
        /// node when the same k-mer value is seen again. Once all nodes are placed,
        /// <c>GenerateLinks</c> is called to compute the adjacency information.
        /// </summary>
        /// <param name="sequences">List of input sequences.</param>
        /// <exception cref="ArgumentNullException">
        /// <paramref name="sequences"/> is null.
        /// </exception>
        /// <exception cref="ArgumentException">
        /// <c>KmerLength</c> is not positive or is greater than <c>MaxKmerLength</c>.
        /// </exception>
        /// <exception cref="AggregateException">
        /// The background k-mer generation task failed; the inner exception holds the cause.
        /// </exception>
        public void Build(IEnumerable <ISequence> sequences)
        {
            if (sequences == null)
            {
                throw new ArgumentNullException("sequences");
            }

            if (KmerLength <= 0)
            {
                throw new ArgumentException("KmerLengthShouldBePositive");
            }

            if (KmerLength > MaxKmerLength)
            {
                throw new ArgumentException("KmerLengthGreaterThan32");
            }

            var kmerDataCollection = new BlockingCollection <DeBruijnNode>();

            // Keep a handle on the producer so that any exception it throws is
            // observed below instead of being silently dropped.
            Task producer = Task.Factory.StartNew(() =>
            {
                try
                {
                    IAlphabet alphabet = Alphabets.DNA;

                    HashSet <byte> gapSymbols;
                    alphabet.TryGetGapSymbols(out gapSymbols);

                    // Generate the kmers from the sequences
                    foreach (ISequence sequence in sequences)
                    {
                        // if the sequence alphabet is not of type DNA then ignore it.
                        if (sequence.Alphabet != Alphabets.DNA)
                        {
                            Interlocked.Increment(ref _skippedSequencesCount);
                            Interlocked.Increment(ref _processedSequencesCount);
                            continue;
                        }

                        // if the sequence contains any gap symbols then ignore the sequence.
                        bool skipSequence = false;
                        foreach (byte symbol in gapSymbols)
                        {
                            for (long index = 0; index < sequence.Count; ++index)
                            {
                                if (sequence[index] == symbol)
                                {
                                    skipSequence = true;
                                    break;
                                }
                            }

                            if (skipSequence)
                            {
                                break;
                            }
                        }

                        if (skipSequence)
                        {
                            Interlocked.Increment(ref _skippedSequencesCount);
                            Interlocked.Increment(ref _processedSequencesCount);
                            continue;
                        }

                        // Back-pressure: while the number of queued nodes exceeds
                        // StopAddThreshold, sleep in 5 millisecond steps so that the
                        // consumer can drain the collection. This bounds the memory
                        // held by pending nodes.
                        while (kmerDataCollection.Count > StopAddThreshold)
                        {
                            Thread.Sleep(5);
                        }

                        // Generate the kmers from each sequence
                        long count = sequence.Count;
                        for (long i = 0; i <= count - KmerLength; ++i)
                        {
                            var kmerData     = new KmerData32();
                            bool orientation = kmerData.SetKmerData(sequence, i, KmerLength);
                            kmerDataCollection.Add(new DeBruijnNode(kmerData, orientation, 1));
                        }

                        Interlocked.Increment(ref _processedSequencesCount);
                    }
                }
                finally
                {
                    // Always signal completion, even on failure, so the consuming
                    // loop below terminates.
                    kmerDataCollection.CompleteAdding();
                }
            });

            // The main thread will then process all the data - this will loop until the above
            // task completes adding the kmers.
            foreach (var newNode in kmerDataCollection.GetConsumingEnumerable())
            {
                // Create a new node
                if (Root == null)   // first element being added
                {
                    Root = newNode; // set node as root of the tree
                    NodeCount++;
                    continue;
                }

                int          result = 0;
                DeBruijnNode temp   = Root;
                DeBruijnNode parent = Root;

                // Search the tree where the new node should be inserted
                while (temp != null)
                {
                    result = newNode.NodeValue.CompareTo(temp.NodeValue);
                    if (result == 0)
                    {
                        // The k-mer already exists: bump its count while it is
                        // within the cap.
                        if (temp.KmerCount <= 255)
                        {
                            temp.KmerCount++;
                        }

                        // Stop searching whether or not the count was incremented;
                        // otherwise a node above the cap would never advance temp
                        // and the loop would not terminate.
                        break;
                    }

                    parent = temp;
                    temp   = result > 0
                        ? temp.Right  // move to right sub-tree
                        : temp.Left;  // move to left sub-tree
                }

                // position found (result == 0 means a duplicate, so nothing is attached)
                if (result > 0) // add as right child
                {
                    parent.Right = newNode;
                    NodeCount++;
                }
                else if (result < 0) // add as left child
                {
                    parent.Left = newNode;
                    NodeCount++;
                }
            }

            // Surface any exception raised while generating k-mers before the
            // collection is disposed and the graph is marked as complete.
            producer.Wait();

            // Done adding - we can throw away the kmer collection as we now have the graph
            kmerDataCollection.Dispose();
            this.GraphBuildCompleted = true;

            // Generate the links
            this.GenerateLinks();
        }