/// <summary>
        /// Method to do a denovo assembly.
        /// This sample uses Padena Denovo assembler.
        /// </summary>
        /// <param name="sequences">List of sequences to assemble.</param>
        /// <returns>PadenaAssembly which contain the assembled result.</returns>
        public static PadenaAssembly DoDenovoAssembly(List<ISequence> sequences)
        {
            // Create a denovo assembler
            ParallelDeNovoAssembler assembler = new ParallelDeNovoAssembler();

            // Length of kmer
            assembler.KmerLength = 6;

            // Threshold to be used for error correction step where dangling links are removed.
            // All dangling links that have lengths less than specified length will be removed.
            assembler.DanglingLinksThreshold = 3;

            // Threshold to be used for error correction step where redundant paths are removed.
            // Paths that have same start and end points (redundant paths) and whose lengths are less
            // than specified length will be removed. They will be replaced by a single 'best' path
            assembler.RedundantPathLengthThreshold = 7;

            // Enter the name of the library along with mean distance and standard deviation
            CloneLibrary.Instance.AddLibrary("abc", (float)4, (float)5);

            // Assemble
            return (PadenaAssembly)assembler.Assemble(sequences);
        }
Example #2
0
        /// <summary>
        /// Validate Parallel Denovo Assembly Assembled sequences.
        /// </summary>
        /// <param name="nodeName">XML node used to validate different test scenarios</param>
        internal void ValidatePadenaAssembledSeqs(string nodeName)
        {
            // Get values from XML node.
            string filePath = utilityObj.xmlUtil.GetTextValue(
               nodeName, Constants.FilePathNode);
            string kmerLength = utilityObj.xmlUtil.GetTextValue(
                nodeName, Constants.KmerLengthNode);
            string daglingThreshold = utilityObj.xmlUtil.GetTextValue(
                nodeName, Constants.DanglingLinkThresholdNode);
            string redundantThreshold = utilityObj.xmlUtil.GetTextValue(
                nodeName, Constants.RedundantThreshold);
            string libraray = utilityObj.xmlUtil.GetTextValue(nodeName,
                Constants.LibraryName);
            string stdDeviation = utilityObj.xmlUtil.GetTextValue(nodeName,
                Constants.StdDeviation);
            string mean = utilityObj.xmlUtil.GetTextValue(nodeName,
                Constants.Mean);
            string assembledSequences = utilityObj.xmlUtil.GetTextValue(nodeName,
                Constants.SequencePathNode);
            string assembledSeqCount = utilityObj.xmlUtil.GetTextValue(nodeName,
                Constants.AssembledSeqCountNode);
            string[] updatedAssembledSeqs = assembledSequences.Split(',');

            // Get the input reads and build kmers
            IEnumerable<ISequence> sequenceReads = null;
            using (FastAParser parser = new FastAParser(filePath))
            {
                sequenceReads = parser.Parse();

                // Create a ParallelDeNovoAssembler instance.
                ParallelDeNovoAssembler denovoObj = null;
                try
                {
                    denovoObj = new ParallelDeNovoAssembler();

                    denovoObj.KmerLength = Int32.Parse(kmerLength, (IFormatProvider)null);
                    denovoObj.DanglingLinksThreshold = Int32.Parse(daglingThreshold, (IFormatProvider)null);
                    denovoObj.RedundantPathLengthThreshold = Int32.Parse(redundantThreshold, (IFormatProvider)null);

                    CloneLibrary.Instance.AddLibrary(libraray, float.Parse(mean, (IFormatProvider)null),
                    float.Parse(stdDeviation, (IFormatProvider)null));

                    byte[] symbols = sequenceReads.ElementAt(0).Alphabet.GetSymbolValueMap();

                    IDeNovoAssembly assembly =
                        denovoObj.Assemble(sequenceReads.Select(a => new Sequence(Alphabets.DNA, a.Select(b => symbols[b]).ToArray()) { ID = a.ID }), true);

                    IList<ISequence> assembledSequenceList = assembly.AssembledSequences.ToList();

                    // Validate assembled sequences.
                    Assert.AreEqual(assembledSeqCount, assembledSequenceList.Count.ToString((IFormatProvider)null));

                    for (int i = 0; i < assembledSequenceList.Count; i++)
                    {
                        Assert.IsTrue(assembledSequences.Contains(
                       new string(assembledSequenceList[i].Select(a => (char)a).ToArray()))
                        || updatedAssembledSeqs.Contains(
                        new string(assembledSequenceList[i].GetReverseComplementedSequence().Select(a => (char)a).ToArray())));
                    }
                }
                finally
                {
                    if (denovoObj != null)
                        denovoObj.Dispose();
                }
            }

            ApplicationLog.WriteLine("Padena P1 : Assemble() validation for Padena step6:step7 completed successfully");
        }
        /// <summary>
        /// It assembles the sequences.
        /// </summary>
        public override void AssembleSequences()
        {
            TimeSpan algorithmSpan = new TimeSpan();
            Stopwatch runAlgorithm = new Stopwatch();
            FileInfo refFileinfo = new FileInfo(this.Filename);
            long refFileLength = refFileinfo.Length;

            Output.WriteLine(OutputLevel.Information, Resources.AssemblyScaffoldStarting);

            if (!string.IsNullOrEmpty(this.CloneLibraryName))
            {
                CloneLibrary.Instance.AddLibrary(this.CloneLibraryName, (float)this.MeanLengthOfInsert, (float)this.StandardDeviationOfInsert);
            }

            runAlgorithm.Restart();
            IEnumerable<ISequence> reads = ParseFile();
            runAlgorithm.Stop();
            algorithmSpan = algorithmSpan.Add(runAlgorithm.Elapsed);

            if (this.Verbose)
            {
                Output.WriteLine(OutputLevel.Verbose);
                Output.WriteLine(OutputLevel.Verbose, "Processed read file: {0}", Path.GetFullPath(this.Filename));
                Output.WriteLine(OutputLevel.Verbose, "   Read/Processing time: {0}", runAlgorithm.Elapsed);
                Output.WriteLine(OutputLevel.Verbose, "   File Size           : {0}", refFileLength);
                Output.WriteLine(OutputLevel.Verbose, "   k-mer Length        : {0}", this.KmerLength);
            }

            runAlgorithm.Restart();
            if (reads.Any(s => s.Alphabet.HasAmbiguity))
                throw new ArgumentException(Resources.AmbiguousReadsNotSupported);
            runAlgorithm.Stop();

            if (this.Verbose)
            {
                Output.WriteLine(OutputLevel.Verbose);
                Output.WriteLine(OutputLevel.Verbose, "Time taken for Validating reads: {0}", runAlgorithm.Elapsed);
            }

            ParallelDeNovoAssembler assembler = new ParallelDeNovoAssembler();
            assembler.StatusChanged += this.AssemblerStatusChanged;
            assembler.AllowErosion = AllowErosion;
            assembler.AllowKmerLengthEstimation = AllowKmerLengthEstimation;
            if (ContigCoverageThreshold != -1)
            {
                assembler.AllowLowCoverageContigRemoval = true;
                assembler.ContigCoverageThreshold = ContigCoverageThreshold;
            }
            assembler.DanglingLinksThreshold = DangleThreshold;
            assembler.ErosionThreshold = ErosionThreshold;
            if (!this.AllowKmerLengthEstimation)
            {
                assembler.KmerLength = this.KmerLength;
            }

            assembler.RedundantPathLengthThreshold = RedundantPathLengthThreshold;
            runAlgorithm.Restart();
            IDeNovoAssembly assembly = assembler.Assemble(reads, true);
            runAlgorithm.Stop();
            algorithmSpan = algorithmSpan.Add(runAlgorithm.Elapsed);
            if (this.Verbose)
            {
                Output.WriteLine(OutputLevel.Verbose);
                Output.WriteLine(OutputLevel.Verbose, "Compute time: {0}", runAlgorithm.Elapsed);
            }

            runAlgorithm.Restart();
            WriteContigs(assembly);
            runAlgorithm.Stop();
            algorithmSpan = algorithmSpan.Add(runAlgorithm.Elapsed);
            if (this.Verbose)
            {
                Output.WriteLine(OutputLevel.Verbose);
                Output.WriteLine(OutputLevel.Verbose, "Write contigs time: {0}", runAlgorithm.Elapsed);
                Output.WriteLine(OutputLevel.Verbose, "Total runtime: {0}", algorithmSpan);
            }
        }
Example #4
0
        /// <summary>
        /// Validate Parallel Denovo Assembly Assembled sequences.
        /// </summary>
        /// <param name="nodeName">XML node used to validate different test scenarios</param>
        /// <param name="isScaffold"></param>
        /// <param name="enableLowerContigRemoval"></param>
        /// <param name="allowErosion"></param>
        internal void ValidatePadenaAssembledSeqs(string nodeName,
            bool isScaffold, bool enableLowerContigRemoval, bool allowErosion)
        {
            // Get values from XML node.
            string filePath = utilityObj.xmlUtil.GetTextValue(nodeName, Constants.FilePathNode);
            string kmerLength = utilityObj.xmlUtil.GetTextValue(nodeName, Constants.KmerLengthNode);
            string daglingThreshold = utilityObj.xmlUtil.GetTextValue(nodeName, Constants.DanglingLinkThresholdNode);
            string redundantThreshold = utilityObj.xmlUtil.GetTextValue(nodeName, Constants.RedundantThreshold);
            string library = utilityObj.xmlUtil.GetTextValue(nodeName, Constants.LibraryName);
            string stdDeviation = utilityObj.xmlUtil.GetTextValue(nodeName, Constants.StdDeviation);
            string mean = utilityObj.xmlUtil.GetTextValue(nodeName, Constants.Mean);
            string erosionThreshold = utilityObj.xmlUtil.GetTextValue(nodeName, Constants.ErosionNode);
            string lowCCThreshold = utilityObj.xmlUtil.GetTextValue(nodeName, Constants.LowCoverageContigNode);
            string expectedSequences = utilityObj.xmlUtil.GetTextValue(nodeName, Constants.SequencePathNode);

            // Get the input reads and build kmers
            using (FastAParser parser = new FastAParser(filePath))
            {
                IEnumerable<ISequence> sequenceReads = parser.Parse();

                // Create a ParallelDeNovoAssembler instance.
                ParallelDeNovoAssembler assembler = null;
                try
                {
                    assembler = new ParallelDeNovoAssembler
                    {
                        KmerLength = Int32.Parse(kmerLength, null),
                        DanglingLinksThreshold = Int32.Parse(daglingThreshold, null),
                        RedundantPathLengthThreshold = Int32.Parse(redundantThreshold, null)
                    };

                    if (enableLowerContigRemoval)
                    {
                        assembler.AllowLowCoverageContigRemoval = enableLowerContigRemoval;
                        assembler.ContigCoverageThreshold = double.Parse(lowCCThreshold, null);
                    }

                    if (allowErosion)
                    {
                        assembler.AllowErosion = true;
                        assembler.ErosionThreshold = Int32.Parse(erosionThreshold, null);
                    }

                    CloneLibrary.Instance.AddLibrary(library, float.Parse(mean, null),
                        float.Parse(stdDeviation, null));

                    IDeNovoAssembly assembly = assembler.Assemble(sequenceReads.ToList(), isScaffold);
                    IList<ISequence> assembledSequenceList = assembly.AssembledSequences.ToList();

                    HashSet<string> expected = new HashSet<string>(expectedSequences.Split(',').Select(s => s.Trim()));
                    AlignmentHelpers.CompareSequenceLists(expected, assembledSequenceList);

                    ApplicationLog.WriteLine("Padena BVT : Assemble() validation for Padena step6:step7 completed successfully");
                }
                finally
                {
                    if (assembler != null)
                        assembler.Dispose();
                }
            }
        }
Example #5
0
        //TODO: This is really a PADENA Test, needs to be renamed.
        public void ValidateGetReverseComplement()
        {
            const int kmerLength = 6;
            const int dangleThreshold = 3;
            const int redundantThreshold = 7;

            using (ParallelDeNovoAssembler assembler = new ParallelDeNovoAssembler())
            {
                assembler.KmerLength = kmerLength;
                assembler.DanglingLinksThreshold = dangleThreshold;
                assembler.RedundantPathLengthThreshold = redundantThreshold;

                assembler.ScaffoldRedundancy = 0;
                assembler.Depth = 3;
                CloneLibrary.Instance.AddLibrary("abc", 5, 20);

                PadenaAssembly result = (PadenaAssembly)assembler.Assemble(GetReadsForScaffolds(), true);

               

                var expectedContigs = new List<string>
                {
                   "TTTTTT",
                   "TTAGCGCG",
                   "CGCGCCGCGC",
                   "CGCGCG",
                   "GCGCGC",
                   "TTTTTA",
                   "TTTTAGC",
                   "TTTTAA",
                   "TTTAAA",
                   "ATGCCTCCTATCTTAGC",
                };
                
                Assert.AreEqual(10, result.ContigSequences.Count());

                foreach (ISequence contig in result.ContigSequences)
                {
                    string contigSeq = contig.ConvertToString();
                    Assert.IsTrue(
                        expectedContigs.Contains(contigSeq) ||
                        expectedContigs.Contains(contigSeq.GetReverseComplement(new char[contigSeq.Length])), 
                        "Found unknown contig " + contigSeq);
                }

                Assert.AreEqual(8, result.Scaffolds.Count());
                var expectedScaffolds = new List<string>
                {
                    "ATGCCTCCTATCTTAGCGCGC",
                    "CGCGCG",
                    "CGCGCCGCGC",
                    "TTTTTA",
                    "TTTTTT",
                    "TTTTAGC",
                    "TTTTAA",
                    "TTTAAA",
                };

                foreach (ISequence scaffold in result.Scaffolds)
                {
                    string scaffoldSeq = scaffold.ConvertToString();
                    Assert.IsTrue(
                        expectedScaffolds.Contains(scaffoldSeq) ||
                        expectedScaffolds.Contains(scaffoldSeq.GetReverseComplement(new char[scaffoldSeq.Length])), 
                        "Found unknown scaffold " + scaffoldSeq);
                }
            }
        }
Example #6
0
        public void ValidateGetSequenceRange()
        {
            const int kmerLength = 6;
            const int dangleThreshold = 3;
            const int redundantThreshold = 7;

            using (ParallelDeNovoAssembler assembler = new ParallelDeNovoAssembler())
            {
                assembler.KmerLength = kmerLength;
                assembler.DanglingLinksThreshold = dangleThreshold;
                assembler.RedundantPathLengthThreshold = redundantThreshold;

                assembler.ScaffoldRedundancy = 0;
                assembler.Depth = 3;
                CloneLibrary.Instance.AddLibrary("abc", (float)5, (float)20);

                PadenaAssembly result = (PadenaAssembly)assembler.Assemble(GetReadsForScaffolds(), true);
                ISequence sequence = result.ContigSequences[0];
                ISequence seqRange = Helper.GetSequenceRange(sequence, 2, 3);
                Assert.AreEqual(3, seqRange.Count);

                string sequenceStr = new string(sequence.Select(a => (char)a).ToArray());
                string seqRangeStr = new string(seqRange.Select(a => (char)a).ToArray());
                Assert.IsTrue(sequenceStr.Contains(seqRangeStr));

            }
        }