/// <summary>
/// Method to do a de novo assembly.
/// This sample uses the Padena de novo assembler.
/// </summary>
/// <remarks>
/// The assembler is configured with fixed sample values (k-mer length 6, dangling-link
/// threshold 3, redundant-path length threshold 7) and a clone library named "abc" is
/// registered through <c>CloneLibrary.Instance</c> before assembling.
/// </remarks>
/// <param name="sequences">List of sequences to assemble.</param>
/// <returns>PadenaAssembly which contains the assembled result.</returns>
public static PadenaAssembly DoDenovoAssembly(List<ISequence> sequences)
{
    // The assembler is disposable, so scope it with 'using' instead of leaking it.
    // NOTE(review): assumes the returned assembly does not reference assembler state
    // after disposal - confirm against PadenaAssembly.
    using (ParallelDeNovoAssembler assembler = new ParallelDeNovoAssembler())
    {
        // Length of kmer
        assembler.KmerLength = 6;

        // Threshold to be used for error correction step where dangling links are removed.
        // All dangling links that have lengths less than specified length will be removed.
        assembler.DanglingLinksThreshold = 3;

        // Threshold to be used for error correction step where redundant paths are removed.
        // Paths that have same start and end points (redundant paths) and whose lengths are less
        // than specified length will be removed. They will be replaced by a single 'best' path.
        assembler.RedundantPathLengthThreshold = 7;

        // Enter the name of the library along with mean distance and standard deviation.
        CloneLibrary.Instance.AddLibrary("abc", 4f, 5f);

        // Assemble
        return (PadenaAssembly)assembler.Assemble(sequences);
    }
}
/// <summary>
/// Validate Parallel Denovo Assembly Assembled sequences.
/// </summary>
/// <remarks>
/// Reads the assembler settings, clone-library settings and expected results from the XML
/// node, assembles the reads of the configured FASTA file, and asserts that the number of
/// assembled sequences matches the expected count and that every assembled sequence (or its
/// reverse complement) is one of the comma-separated expected sequences.
/// </remarks>
/// <param name="nodeName">XML node used to validate different test scenarios</param>
internal void ValidatePadenaAssembledSeqs(string nodeName)
{
    // Get values from XML node.
    string filePath = utilityObj.xmlUtil.GetTextValue(nodeName, Constants.FilePathNode);
    string kmerLength = utilityObj.xmlUtil.GetTextValue(nodeName, Constants.KmerLengthNode);
    string danglingThreshold = utilityObj.xmlUtil.GetTextValue(nodeName, Constants.DanglingLinkThresholdNode);
    string redundantThreshold = utilityObj.xmlUtil.GetTextValue(nodeName, Constants.RedundantThreshold);
    string library = utilityObj.xmlUtil.GetTextValue(nodeName, Constants.LibraryName);
    string stdDeviation = utilityObj.xmlUtil.GetTextValue(nodeName, Constants.StdDeviation);
    string mean = utilityObj.xmlUtil.GetTextValue(nodeName, Constants.Mean);
    string assembledSequences = utilityObj.xmlUtil.GetTextValue(nodeName, Constants.SequencePathNode);
    string assembledSeqCount = utilityObj.xmlUtil.GetTextValue(nodeName, Constants.AssembledSeqCountNode);

    // The XML values are machine-authored test data, so they are parsed and formatted with
    // the invariant culture rather than whatever culture the test machine happens to use.
    IFormatProvider invariant = System.Globalization.CultureInfo.InvariantCulture;

    // Expected sequences are comma separated. Entries are trimmed and held in a set so that
    // both orientations are checked by exact membership (a substring of the raw expected
    // text must not count as a match).
    HashSet<string> expectedSequences =
        new HashSet<string>(assembledSequences.Split(',').Select(s => s.Trim()));

    // Get the input reads and build kmers
    using (FastAParser parser = new FastAParser(filePath))
    {
        // Materialize once: the reads are needed for the symbol map and again for assembly.
        List<ISequence> sequenceReads = parser.Parse().ToList();

        // Create a ParallelDeNovoAssembler instance.
        using (ParallelDeNovoAssembler denovoObj = new ParallelDeNovoAssembler())
        {
            denovoObj.KmerLength = Int32.Parse(kmerLength, invariant);
            denovoObj.DanglingLinksThreshold = Int32.Parse(danglingThreshold, invariant);
            denovoObj.RedundantPathLengthThreshold = Int32.Parse(redundantThreshold, invariant);

            CloneLibrary.Instance.AddLibrary(
                library,
                float.Parse(mean, invariant),
                float.Parse(stdDeviation, invariant));

            // Re-encode every read through the symbol value map of the first read's alphabet
            // and rebuild it as a DNA sequence that keeps the original ID.
            byte[] symbols = sequenceReads[0].Alphabet.GetSymbolValueMap();

            IDeNovoAssembly assembly = denovoObj.Assemble(
                sequenceReads.Select(a =>
                    new Sequence(Alphabets.DNA, a.Select(b => symbols[b]).ToArray()) { ID = a.ID }),
                true);

            IList<ISequence> assembledSequenceList = assembly.AssembledSequences.ToList();

            // Validate assembled sequences.
            Assert.AreEqual(assembledSeqCount, assembledSequenceList.Count.ToString(invariant));

            foreach (ISequence assembled in assembledSequenceList)
            {
                string forward = new string(assembled.Select(a => (char)a).ToArray());

                // The reverse complement is only computed when the forward form is unknown.
                bool isExpected = expectedSequences.Contains(forward)
                    || expectedSequences.Contains(
                        new string(assembled.GetReverseComplementedSequence().Select(a => (char)a).ToArray()));

                Assert.IsTrue(isExpected, "Found unexpected assembled sequence " + forward);
            }
        }
    }

    ApplicationLog.WriteLine("Padena P1 : Assemble() validation for Padena step6:step7 completed successfully");
}
/// <summary>
/// It assembles the sequences.
/// </summary>
/// <remarks>
/// Steps performed, in order: register the clone library when a library name is set, parse
/// the reads, reject the input if any read's alphabet has ambiguity, configure and run the
/// assembler via <c>Assemble(reads, true)</c>, and write the contigs. Timing details are
/// written to the output when <c>Verbose</c> is set.
/// </remarks>
/// <exception cref="ArgumentException">
/// Thrown when any read uses an alphabet that has ambiguity.
/// </exception>
public override void AssembleSequences()
{
    TimeSpan algorithmSpan = TimeSpan.Zero;
    Stopwatch runAlgorithm = new Stopwatch();
    FileInfo refFileinfo = new FileInfo(this.Filename);
    long refFileLength = refFileinfo.Length;

    Output.WriteLine(OutputLevel.Information, Resources.AssemblyScaffoldStarting);

    if (!string.IsNullOrEmpty(this.CloneLibraryName))
    {
        CloneLibrary.Instance.AddLibrary(
            this.CloneLibraryName,
            (float)this.MeanLengthOfInsert,
            (float)this.StandardDeviationOfInsert);
    }

    runAlgorithm.Restart();
    IEnumerable<ISequence> reads = ParseFile();
    runAlgorithm.Stop();
    algorithmSpan = algorithmSpan.Add(runAlgorithm.Elapsed);

    if (this.Verbose)
    {
        Output.WriteLine(OutputLevel.Verbose);
        Output.WriteLine(OutputLevel.Verbose, "Processed read file: {0}", Path.GetFullPath(this.Filename));
        Output.WriteLine(OutputLevel.Verbose, " Read/Processing time: {0}", runAlgorithm.Elapsed);
        Output.WriteLine(OutputLevel.Verbose, " File Size : {0}", refFileLength);
        Output.WriteLine(OutputLevel.Verbose, " k-mer Length : {0}", this.KmerLength);
    }

    // Validation time is reported below but, as before, is not added to the total runtime.
    runAlgorithm.Restart();
    if (reads.Any(s => s.Alphabet.HasAmbiguity))
    {
        throw new ArgumentException(Resources.AmbiguousReadsNotSupported);
    }

    runAlgorithm.Stop();

    if (this.Verbose)
    {
        Output.WriteLine(OutputLevel.Verbose);
        Output.WriteLine(OutputLevel.Verbose, "Time taken for Validating reads: {0}", runAlgorithm.Elapsed);
    }

    // The assembler is disposable; scope it so it is released even if assembly or
    // contig writing throws.
    using (ParallelDeNovoAssembler assembler = new ParallelDeNovoAssembler())
    {
        assembler.StatusChanged += this.AssemblerStatusChanged;
        assembler.AllowErosion = AllowErosion;
        assembler.AllowKmerLengthEstimation = AllowKmerLengthEstimation;

        // A threshold of -1 leaves low-coverage contig removal at the assembler's default.
        if (ContigCoverageThreshold != -1)
        {
            assembler.AllowLowCoverageContigRemoval = true;
            assembler.ContigCoverageThreshold = ContigCoverageThreshold;
        }

        assembler.DanglingLinksThreshold = DangleThreshold;
        assembler.ErosionThreshold = ErosionThreshold;

        // Only set an explicit k-mer length when estimation is not requested.
        if (!this.AllowKmerLengthEstimation)
        {
            assembler.KmerLength = this.KmerLength;
        }

        assembler.RedundantPathLengthThreshold = RedundantPathLengthThreshold;

        runAlgorithm.Restart();
        IDeNovoAssembly assembly = assembler.Assemble(reads, true);
        runAlgorithm.Stop();
        algorithmSpan = algorithmSpan.Add(runAlgorithm.Elapsed);

        if (this.Verbose)
        {
            Output.WriteLine(OutputLevel.Verbose);
            Output.WriteLine(OutputLevel.Verbose, "Compute time: {0}", runAlgorithm.Elapsed);
        }

        runAlgorithm.Restart();
        WriteContigs(assembly);
        runAlgorithm.Stop();
        algorithmSpan = algorithmSpan.Add(runAlgorithm.Elapsed);

        if (this.Verbose)
        {
            Output.WriteLine(OutputLevel.Verbose);
            Output.WriteLine(OutputLevel.Verbose, "Write contigs time: {0}", runAlgorithm.Elapsed);
            Output.WriteLine(OutputLevel.Verbose, "Total runtime: {0}", algorithmSpan);
        }
    }
}
/// <summary>
/// Validate Parallel Denovo Assembly Assembled sequences.
/// </summary>
/// <remarks>
/// Reads the assembler settings, clone-library settings and the comma-separated expected
/// sequences from the XML node, assembles the reads of the configured FASTA file and
/// compares the assembled sequences with the expected ones through
/// <c>AlignmentHelpers.CompareSequenceLists</c>.
/// </remarks>
/// <param name="nodeName">XML node used to validate different test scenarios</param>
/// <param name="isScaffold">Passed to the assembler's <c>Assemble</c> call as its second argument.</param>
/// <param name="enableLowerContigRemoval">
/// When true, low-coverage contig removal is enabled and the contig coverage threshold is
/// read from the XML node.
/// </param>
/// <param name="allowErosion">
/// When true, erosion is enabled and the erosion threshold is read from the XML node.
/// </param>
internal void ValidatePadenaAssembledSeqs(string nodeName, bool isScaffold, bool enableLowerContigRemoval, bool allowErosion)
{
    // Get values from XML node.
    string filePath = utilityObj.xmlUtil.GetTextValue(nodeName, Constants.FilePathNode);
    string kmerLength = utilityObj.xmlUtil.GetTextValue(nodeName, Constants.KmerLengthNode);
    string danglingThreshold = utilityObj.xmlUtil.GetTextValue(nodeName, Constants.DanglingLinkThresholdNode);
    string redundantThreshold = utilityObj.xmlUtil.GetTextValue(nodeName, Constants.RedundantThreshold);
    string library = utilityObj.xmlUtil.GetTextValue(nodeName, Constants.LibraryName);
    string stdDeviation = utilityObj.xmlUtil.GetTextValue(nodeName, Constants.StdDeviation);
    string mean = utilityObj.xmlUtil.GetTextValue(nodeName, Constants.Mean);
    string erosionThreshold = utilityObj.xmlUtil.GetTextValue(nodeName, Constants.ErosionNode);
    string lowCCThreshold = utilityObj.xmlUtil.GetTextValue(nodeName, Constants.LowCoverageContigNode);
    string expectedSequences = utilityObj.xmlUtil.GetTextValue(nodeName, Constants.SequencePathNode);

    // The XML values are machine-authored test data; parse them with the invariant culture
    // so decimal separators are not reinterpreted by the test machine's current culture.
    IFormatProvider invariant = System.Globalization.CultureInfo.InvariantCulture;

    // Get the input reads and build kmers.
    // The assembler is acquired by 'using' before it is configured, so it is disposed even
    // when one of the Parse calls below throws.
    using (FastAParser parser = new FastAParser(filePath))
    using (ParallelDeNovoAssembler assembler = new ParallelDeNovoAssembler())
    {
        IEnumerable<ISequence> sequenceReads = parser.Parse();

        assembler.KmerLength = Int32.Parse(kmerLength, invariant);
        assembler.DanglingLinksThreshold = Int32.Parse(danglingThreshold, invariant);
        assembler.RedundantPathLengthThreshold = Int32.Parse(redundantThreshold, invariant);

        if (enableLowerContigRemoval)
        {
            assembler.AllowLowCoverageContigRemoval = true;
            assembler.ContigCoverageThreshold = double.Parse(lowCCThreshold, invariant);
        }

        if (allowErosion)
        {
            assembler.AllowErosion = true;
            assembler.ErosionThreshold = Int32.Parse(erosionThreshold, invariant);
        }

        CloneLibrary.Instance.AddLibrary(
            library,
            float.Parse(mean, invariant),
            float.Parse(stdDeviation, invariant));

        IDeNovoAssembly assembly = assembler.Assemble(sequenceReads.ToList(), isScaffold);
        IList<ISequence> assembledSequenceList = assembly.AssembledSequences.ToList();

        // Expected sequences are comma separated; trim each entry before comparing.
        HashSet<string> expected = new HashSet<string>(expectedSequences.Split(',').Select(s => s.Trim()));
        AlignmentHelpers.CompareSequenceLists(expected, assembledSequenceList);

        ApplicationLog.WriteLine("Padena BVT : Assemble() validation for Padena step6:step7 completed successfully");
    }
}
// TODO: This is really a PADENA test, needs to be renamed.
/// <summary>
/// Assembles the scaffold test reads with scaffolds requested and checks that exactly
/// 10 contigs and 8 scaffolds are produced, each of which (or its reverse complement)
/// is one of the known sequences.
/// </summary>
public void ValidateGetReverseComplement()
{
    const int kmerSize = 6;
    const int danglingLimit = 3;
    const int redundantPathLimit = 7;

    // Known contigs; a contig may be reported in either orientation.
    var knownContigs = new HashSet<string>
    {
        "TTTTTT",
        "TTAGCGCG",
        "CGCGCCGCGC",
        "CGCGCG",
        "GCGCGC",
        "TTTTTA",
        "TTTTAGC",
        "TTTTAA",
        "TTTAAA",
        "ATGCCTCCTATCTTAGC",
    };

    // Known scaffolds; a scaffold may be reported in either orientation.
    var knownScaffolds = new HashSet<string>
    {
        "ATGCCTCCTATCTTAGCGCGC",
        "CGCGCG",
        "CGCGCCGCGC",
        "TTTTTA",
        "TTTTTT",
        "TTTTAGC",
        "TTTTAA",
        "TTTAAA",
    };

    using (var assembler = new ParallelDeNovoAssembler())
    {
        assembler.KmerLength = kmerSize;
        assembler.DanglingLinksThreshold = danglingLimit;
        assembler.RedundantPathLengthThreshold = redundantPathLimit;
        assembler.ScaffoldRedundancy = 0;
        assembler.Depth = 3;

        CloneLibrary.Instance.AddLibrary("abc", 5, 20);

        var result = (PadenaAssembly)assembler.Assemble(GetReadsForScaffolds(), true);

        Assert.AreEqual(10, result.ContigSequences.Count());
        foreach (ISequence contig in result.ContigSequences)
        {
            string text = contig.ConvertToString();
            if (knownContigs.Contains(text))
            {
                continue;
            }

            // Forward form is unknown: the reverse complement must be a known contig.
            Assert.IsTrue(
                knownContigs.Contains(text.GetReverseComplement(new char[text.Length])),
                "Found unknown contig " + text);
        }

        Assert.AreEqual(8, result.Scaffolds.Count());
        foreach (ISequence scaffold in result.Scaffolds)
        {
            string text = scaffold.ConvertToString();
            if (knownScaffolds.Contains(text))
            {
                continue;
            }

            // Forward form is unknown: the reverse complement must be a known scaffold.
            Assert.IsTrue(
                knownScaffolds.Contains(text.GetReverseComplement(new char[text.Length])),
                "Found unknown scaffold " + text);
        }
    }
}
/// <summary>
/// Assembles the scaffold test reads, takes a range from the first contig with
/// <c>Helper.GetSequenceRange(sequence, 2, 3)</c> and checks that the range has three
/// symbols and that its text occurs inside the contig's text.
/// </summary>
public void ValidateGetSequenceRange()
{
    using (var assembler = new ParallelDeNovoAssembler())
    {
        // Assembler settings used for this scenario.
        assembler.KmerLength = 6;
        assembler.DanglingLinksThreshold = 3;
        assembler.RedundantPathLengthThreshold = 7;
        assembler.ScaffoldRedundancy = 0;
        assembler.Depth = 3;

        CloneLibrary.Instance.AddLibrary("abc", 5f, 20f);

        var result = (PadenaAssembly)assembler.Assemble(GetReadsForScaffolds(), true);

        ISequence firstContig = result.ContigSequences[0];
        ISequence slice = Helper.GetSequenceRange(firstContig, 2, 3);
        Assert.AreEqual(3, slice.Count);

        // Compare as text: the extracted range must appear somewhere in the contig.
        string contigText = new string(firstContig.Select(symbol => (char)symbol).ToArray());
        string sliceText = new string(slice.Select(symbol => (char)symbol).ToArray());
        Assert.IsTrue(contigText.Contains(sliceText));
    }
}