示例#1
0
        /// <summary>
        /// Validates that the ambiguous alphabet selected by <paramref name="option"/>
        /// reports every expected ambiguous character through GetAmbiguousSymbols().
        /// </summary>
        /// <param name="option">Alphabet type whose ambiguous symbols are validated.</param>
        void ValidateGetAmbiguousCharacters(AlphabetsTypes option)
        {
            string    referenceCharacters = "";
            IAlphabet alphabetInstance    = null;

            switch (option)
            {
            case AlphabetsTypes.Protein:
                referenceCharacters = "BZJX";
                alphabetInstance    = AmbiguousProteinAlphabet.Instance;
                break;

            case AlphabetsTypes.Rna:
                alphabetInstance    = AmbiguousRnaAlphabet.Instance;
                referenceCharacters = "MRSWYKVHDBN";
                break;

            case AlphabetsTypes.Dna:
                alphabetInstance    = AmbiguousDnaAlphabet.Instance;
                referenceCharacters = "MRSWYKVHDBN";
                break;

            default:
                // Fail with a clear message instead of a NullReferenceException below.
                Assert.Fail("Unsupported alphabet type: " + option);
                break;
            }

            HashSet <byte> ambiguousSymbols = alphabetInstance.GetAmbiguousSymbols();

            // Iterate over the expected characters (not over the returned set) so that
            // every reference character is checked and the index can never run past
            // the end of the reference list.
            foreach (char expected in referenceCharacters)
            {
                Assert.IsTrue(
                    ambiguousSymbols.Contains((byte)expected),
                    "Expected ambiguous symbol '" + expected + "' was not returned.");
            }
        }
示例#2
0
        /// <summary>
        /// Validate input sequences: lazily yields only the reads that contain
        /// neither ambiguous symbols nor gap symbols.
        /// The alphabet used for the check is derived from the first read.
        /// </summary>
        /// <param name="reads">The Reads</param>
        /// <returns>Valid reads. Nothing is yielded when the input is empty.</returns>
        private IEnumerable <ISequence> ValidateReads(IEnumerable <ISequence> reads)
        {
            // FirstOrDefault returns null for an empty input (or a null first element);
            // in that case there is nothing to validate, so yield nothing instead of
            // letting First() throw.
            ISequence firstRead = reads.FirstOrDefault();
            if (firstRead == null)
            {
                yield break;
            }

            IAlphabet      readAlphabet     = Alphabets.GetAmbiguousAlphabet(firstRead.Alphabet);
            HashSet <byte> ambiguousSymbols = readAlphabet.GetAmbiguousSymbols();
            HashSet <byte> gapSymbols;

            // The Try-pattern may fail and leave no usable set; fall back to an empty
            // set so the membership test below cannot dereference null.
            if (!readAlphabet.TryGetGapSymbols(out gapSymbols) || gapSymbols == null)
            {
                gapSymbols = new HashSet <byte>();
            }

            foreach (ISequence read in reads)
            {
                if (read.All(c => !ambiguousSymbols.Contains(c) && !gapSymbols.Contains(c)))
                {
                    yield return read;
                }
            }
        }
示例#3
0
        /// <summary>
        /// Validate input sequences: lazily yields the reads whose ID is a valid
        /// paired-read ID, whose alphabet has no ambiguity and which contain no
        /// gap symbols. The ID of each yielded read is rewritten to exclude the
        /// "other info" part.
        /// The gap symbols are taken from the ambiguous alphabet of the first read.
        /// </summary>
        /// <param name="reads">The Reads</param>
        /// <returns>Valid reads. Nothing is yielded when the input is empty.</returns>
        private IEnumerable <ISequence> ValidateReads(IEnumerable <ISequence> reads)
        {
            // FirstOrDefault returns null for an empty input (or a null first element);
            // in that case there is nothing to validate, so yield nothing instead of
            // letting First() throw.
            ISequence firstRead = reads.FirstOrDefault();
            if (firstRead == null)
            {
                yield break;
            }

            IAlphabet      readAlphabet = Alphabets.GetAmbiguousAlphabet(firstRead.Alphabet);
            HashSet <byte> gapSymbols;

            // The Try-pattern may fail and leave no usable set; fall back to an empty
            // set so the membership test below cannot dereference null.
            if (!readAlphabet.TryGetGapSymbols(out gapSymbols) || gapSymbols == null)
            {
                gapSymbols = new HashSet <byte>();
            }

            foreach (ISequence read in reads)
            {
                string originalSequenceId;
                string pairedReadType;
                bool   forward;
                string libraryName;

                // Only the success flag is used here; the out values are discarded.
                if (!Bio.Util.Helper.ValidatePairedSequenceId(read.ID, out originalSequenceId, out forward, out pairedReadType, out libraryName))
                {
                    continue;
                }

                if (read.Alphabet.HasAmbiguity)
                {
                    continue;
                }

                bool gapSymbolFound = false;
                for (long index = 0; index < read.Count; index++)
                {
                    if (gapSymbols.Contains(read[index]))
                    {
                        // One gap is enough to reject the read; stop scanning.
                        gapSymbolFound = true;
                        break;
                    }
                }

                if (!gapSymbolFound)
                {
                    // Exclude the otherinfo if any.
                    read.ID = Bio.Util.Helper.GetReadIdExcludingOtherInfo(read.ID);
                    yield return read;
                }
            }
        }
示例#4
0
        /// <summary>
        /// Build graph nodes and edges from list of k-mers.
        /// Creates a node for every unique k-mer (and reverse-complement)
        /// in the read. Then, generates adjacency information between nodes
        /// by computing pairs of nodes that have overlapping regions
        /// between node sequences.
        /// Sequences containing an ambiguous or gap symbol are skipped.
        /// </summary>
        /// <param name="sequences">List of input sequences.</param>
        /// <exception cref="ArgumentNullException">
        /// <paramref name="sequences"/> is null.
        /// </exception>
        /// <exception cref="ArgumentException">The k-mer length is not positive.</exception>
        /// <exception cref="InvalidOperationException">
        /// <paramref name="sequences"/> is empty (raised by First()).
        /// </exception>
        /// <exception cref="AggregateException">The background node-building task failed.</exception>
        public void Build(IEnumerable <ISequence> sequences)
        {
            if (sequences == null)
            {
                throw new ArgumentNullException("sequences");
            }

            if (this.kmerLength <= 0)
            {
                throw new ArgumentException(Properties.Resource.KmerLengthShouldBePositive);
            }

            // Resolve the alphabet data BEFORE starting the consumer task, so that a
            // failure here (e.g. First() throwing on an empty input) cannot leave a
            // background task blocked forever on a collection nobody completes.
            IAlphabet alphabet = sequences.First().Alphabet;

            byte[]         symbolMap        = alphabet.GetSymbolValueMap();
            HashSet <byte> ambiguousSymbols = alphabet.GetAmbiguousSymbols();
            HashSet <byte> gapSymbols;

            // The Try-pattern may fail and leave no usable set; fall back to an empty
            // set so the membership test below cannot dereference null.
            if (!alphabet.TryGetGapSymbols(out gapSymbols) || gapSymbols == null)
            {
                gapSymbols = new HashSet <byte>();
            }

            using (BlockingCollection <DeBruijnNode> kmerDataCollection = new BlockingCollection <DeBruijnNode>())
            {
                // Consumer: inserts every produced node into the binary search tree
                // rooted at this.root. It ends once adding is completed and the
                // collection has been drained.
                Task buildKmers = Task.Factory.StartNew(() =>
                {
                    foreach (DeBruijnNode newNode in kmerDataCollection.GetConsumingEnumerable())
                    {
                        // Tree Node Creation
                        if (this.root == null)   // first element being added
                        {
                            this.root = newNode; // set node as root of the tree
                            this.NodeCount++;
                            continue;
                        }

                        int result          = 0;
                        DeBruijnNode temp   = this.root;
                        DeBruijnNode parent = this.root;

                        // Search the tree where the new node should be inserted
                        while (temp != null)
                        {
                            result = newNode.NodeValue.CompareTo(temp.NodeValue);
                            if (result == 0)
                            {
                                // Duplicate k-mer: bump the count, saturating at 255.
                                // Always leave the search loop here; otherwise a node
                                // whose count is at the cap would never advance 'temp'
                                // and the loop would spin forever.
                                // NOTE(review): cap chosen assuming KmerCount is a
                                // byte-sized counter - confirm against DeBruijnNode.
                                if (temp.KmerCount < 255)
                                {
                                    temp.KmerCount++;
                                }

                                break;
                            }

                            parent = temp;
                            temp   = result > 0 ? temp.Right : temp.Left;
                        }

                        // position found (result == 0 means duplicate: nothing to attach)
                        if (result > 0) // add as right child
                        {
                            parent.Right = newNode;
                            this.NodeCount++;
                        }
                        else if (result < 0) // add as left child
                        {
                            parent.Left = newNode;
                            this.NodeCount++;
                        }
                    }
                });

                bool producerSucceeded = false;
                try
                {
                    // Generate the kmers from the sequences
                    foreach (ISequence sequence in sequences)
                    {
                        // If the blocking collection holds more than 2 million nodes,
                        // poll every 5 ms so that the task can remove some kmers and
                        // create the nodes. This avoids an OutOfMemoryException.
                        // Stop waiting if the consumer has already ended: before
                        // CompleteAdding that can only mean it faulted.
                        while (kmerDataCollection.Count > 2000000 && !buildKmers.IsCompleted)
                        {
                            System.Threading.Thread.Sleep(5);
                        }

                        if (buildKmers.IsCompleted)
                        {
                            // Nobody is consuming any more; stop producing and let
                            // Wait() below surface the consumer's exception.
                            break;
                        }

                        long   count            = sequence.Count;
                        byte[] convertedSymbols = new byte[count];
                        bool   skipSequence     = false;

                        for (long index = 0; index < count; index++)
                        {
                            byte symbol = symbolMap[sequence[index]];
                            convertedSymbols[index] = symbol;
                            if (ambiguousSymbols.Contains(symbol) || gapSymbols.Contains(symbol))
                            {
                                skipSequence = true;
                                break;
                            }
                        }

                        if (skipSequence)
                        {
                            continue;
                        }

                        Sequence convertedSequence = new Sequence(sequence.Alphabet, convertedSymbols, false);

                        // generate the kmers from each sequence
                        for (long i = 0; i <= count - this.kmerLength; ++i)
                        {
                            IKmerData kmerData    = this.GetNewKmerData();
                            bool      orientation = kmerData.SetKmerData(convertedSequence, i, this.kmerLength);
                            kmerDataCollection.Add(new DeBruijnNode(kmerData, orientation, 1));
                        }
                    }

                    producerSucceeded = true;
                }
                finally
                {
                    // Always release the consumer, even when producing failed, and wait
                    // for it so the collection is not disposed while still in use.
                    kmerDataCollection.CompleteAdding();

                    try
                    {
                        buildKmers.Wait();
                    }
                    catch (AggregateException)
                    {
                        // When the producer itself failed, its exception is already
                        // propagating and takes precedence over the consumer's.
                        if (producerSucceeded)
                        {
                            throw;
                        }
                    }
                }
            }

            // Generate the links
            this.GenerateLinks();
        }