/// <summary>
/// Assembles the scaffold test reads, takes a range from the first contig with
/// Helper.GetSequenceRange (called with the arguments 2 and 3) and checks that the
/// range holds three symbols and that its text occurs inside the contig text.
/// </summary>
public void ValidateGetSequenceRange()
{
    const int kmerSize = 6;
    const int danglingLimit = 3;
    const int redundantPathLimit = 7;

    using (var denovo = new ParallelDeNovoAssembler())
    {
        // Configure the assembler exactly as the other scaffold tests do.
        denovo.KmerLength = kmerSize;
        denovo.DanglingLinksThreshold = danglingLimit;
        denovo.RedundantPathLengthThreshold = redundantPathLimit;
        denovo.ScaffoldRedundancy = 0;
        denovo.Depth = 3;

        CloneLibrary.Instance.AddLibrary("abc", 5f, 20f);

        var assembly = (PadenaAssembly)denovo.Assemble(GetReadsForScaffolds(), true);

        ISequence firstContig = assembly.ContigSequences[0];
        ISequence slice = Helper.GetSequenceRange(firstContig, 2, 3);
        Assert.AreEqual(3, slice.Count);

        // Convert both sequences to text so the containment check is a plain substring test.
        char[] contigChars = firstContig.Select(symbol => (char)symbol).ToArray();
        char[] sliceChars = slice.Select(symbol => (char)symbol).ToArray();
        string contigText = new string(contigChars);
        string sliceText = new string(sliceChars);

        Assert.IsTrue(contigText.Contains(sliceText));
    }
}
/// <summary>
/// Assembles the scaffold test reads (passing true as the second Assemble argument)
/// and compares the produced contigs and scaffolds against the expected sequence sets
/// using AlignmentHelpers.CompareSequenceLists.
/// </summary>
public void AssemblerTestWithScaffoldBuilder()
{
    const int kmerSize = 6;
    const int danglingLimit = 3;
    const int redundantPathLimit = 7;

    // Expected results are declared up front; they do not depend on the assembler run.
    var contigExpectations = new HashSet<string>
    {
        "TTTTTT",
        "CGCGCG",
        "TTAGCGCG",
        "CGCGCCGCGC",
        "GCGCGC",
        "TTTTTA",
        "TTTTAA",
        "TTTAAA",
        "TTTTAGC",
        "ATGCCTCCTATCTTAGC"
    };

    var scaffoldExpectations = new HashSet<string>
    {
        "ATGCCTCCTATCTTAGCGCGC",
        "TTTAAA",
        "TTTTTT",
        "TTTTAGC",
        "TTTTAA",
        "CGCGCCGCGC",
        "TTTTTA",
        "CGCGCG"
    };

    using (var denovo = new ParallelDeNovoAssembler())
    {
        denovo.KmerLength = kmerSize;
        denovo.DanglingLinksThreshold = danglingLimit;
        denovo.RedundantPathLengthThreshold = redundantPathLimit;
        denovo.ScaffoldRedundancy = 0;
        denovo.Depth = 3;

        CloneLibrary.Instance.AddLibrary("abc", 5, 20);

        var assembly = (PadenaAssembly)denovo.Assemble(TestInputs.GetReadsForScaffolds(), true);

        AlignmentHelpers.CompareSequenceLists(contigExpectations, assembly.ContigSequences);
        AlignmentHelpers.CompareSequenceLists(scaffoldExpectations, assembly.Scaffolds);
    }
}
/// <summary>
/// Writes the assembled sequences in FASTA format. When DiagnosticFilePrefix is set
/// the sequences go to ContigFileName; otherwise they are written to standard output.
/// Nothing is written when the assembly holds no sequences.
/// </summary>
/// <param name="assembly">Assembly result whose AssembledSequences are written.</param>
protected void writeContigs(PadenaAssembly assembly)
{
    if (assembly.AssembledSequences.Count == 0)
    {
        Output.WriteLine(OutputLevel.Results, "\tNo sequences assembled.");
        return;
    }

    ensureContigNames(assembly.AssembledSequences);

    bool writeToConsole = string.IsNullOrEmpty(this.DiagnosticFilePrefix);
    if (writeToConsole)
    {
        // Console path: the summary line is reported before the sequences are emitted.
        Output.WriteLine(OutputLevel.Information, "\tAssembled Sequence Results: {0} sequences", assembly.AssembledSequences.Count);

        using (var consoleFormatter = new FastAFormatter())
        {
            consoleFormatter.Open(new StreamWriter(Console.OpenStandardOutput()));
            consoleFormatter.MaxSymbolsAllowedPerLine = decideOutputWidth();
            consoleFormatter.AutoFlush = true;

            foreach (ISequence contig in assembly.AssembledSequences)
            {
                consoleFormatter.Write(contig);
            }
        }

        return;
    }

    // File path: the sequences are written first, then the summary line is reported.
    using (var fileFormatter = new FastAFormatter(ContigFileName))
    {
        fileFormatter.AutoFlush = true;

        foreach (ISequence contig in assembly.AssembledSequences)
        {
            fileFormatter.Write(contig);
        }
    }

    Output.WriteLine(OutputLevel.Information, "\tWrote {0} sequences to {1}", assembly.AssembledSequences.Count, ContigFileName);
}
/// <summary>
/// Adds two DNA contigs to a PadenaAssembly and checks that ToString() returns the
/// contig texts in insertion order, each followed by "\r\n".
/// </summary>
public void TestPadenaAssemblyToString()
{
    ISequence seq2 = new Sequence(Alphabets.DNA, "ACAAAAGCAAC");
    ISequence seq1 = new Sequence(Alphabets.DNA, "ATGAAGGCAATACTAGTAGT");

    IList<ISequence> contigList = new List<ISequence> { seq1, seq2 };

    PadenaAssembly denovoAssembly = new PadenaAssembly();
    denovoAssembly.AddContigs(contigList);

    string actualString = denovoAssembly.ToString();

    // NOTE(review): the expectation hard-codes CRLF; confirm ToString() does not use
    // Environment.NewLine, otherwise this check is platform dependent.
    string expectedString = "ATGAAGGCAATACTAGTAGT\r\nACAAAAGCAAC\r\n";

    // Expected value goes first so a failure message labels the two values correctly.
    Assert.AreEqual(expectedString, actualString);
}
/// <summary>
/// Checks PadenaAssembly.ToString() twice: once with two hard-coded DNA contigs and
/// once with two sequences (and their alphabet) read from the test XML configuration.
/// In both cases the expected text is the contigs in insertion order, each followed by "\r\n".
/// </summary>
public void ValidatePadenaAssemblyToString()
{
    // Case 1: hard-coded contigs.
    ISequence seq2 = new Sequence(Alphabets.DNA, "ACAAAAGCAAC");
    ISequence seq1 = new Sequence(Alphabets.DNA, "ATGAAGGCAATACTAGTAGT");

    IList<ISequence> contigList = new List<ISequence> { seq1, seq2 };

    var denovoAssembly = new PadenaAssembly();
    denovoAssembly.AddContigs(contigList);

    string actualString = denovoAssembly.ToString();
    string expectedString = "ATGAAGGCAATACTAGTAGT\r\nACAAAAGCAAC\r\n";

    // Expected value goes first, matching the second assertion below, so a failure
    // message labels the two values correctly.
    Assert.AreEqual(expectedString, actualString);

    // Case 2: read sequences from xml.
    string sequence1 = this.utilityObj.xmlUtil.GetTextValue(Constants.AssemblyAlgorithmNodeName, Constants.SequenceNode1);
    string sequence2 = this.utilityObj.xmlUtil.GetTextValue(Constants.AssemblyAlgorithmNodeName, Constants.SequenceNode2);
    IAlphabet alphabet = Utility.GetAlphabet(
        this.utilityObj.xmlUtil.GetTextValue(Constants.AssemblyAlgorithmNodeName, Constants.AlphabetNameNode));

    var seq3 = new Sequence(alphabet, sequence1);
    var seq4 = new Sequence(alphabet, sequence2);

    IList<ISequence> contigList1 = new List<ISequence> { seq3, seq4 };

    var denovoAssembly1 = new PadenaAssembly();
    denovoAssembly1.AddContigs(contigList1);

    string actualString1 = denovoAssembly1.ToString();

    // NOTE(review): this expectation presumably mirrors the sequences stored in the XML
    // nodes; the XML content is not visible here, so keep the two in sync.
    string expectedString1 = "GCCAAAATTTAGGC\r\nTTATGGCGCCCACGGA\r\n";
    Assert.AreEqual(expectedString1, actualString1);
}
/// <summary>
/// Assembles the scaffold test reads (passing true as the second Assemble argument)
/// and verifies that 10 contigs and 8 scaffolds are produced, each of which is either
/// in the expected set or has its reverse complement in the expected set.
/// </summary>
public void AssemblerTestWithScaffoldBuilder()
{
    const int kmerSize = 6;
    const int danglingLimit = 3;
    const int redundantPathLimit = 7;

    // Expected results are declared up front; they do not depend on the assembler run.
    var knownContigs = new HashSet<string>
    {
        "GCGCGC",
        "TTTTTT",
        "TTTTTA",
        "TTTTAA",
        "TTTAAA",
        "ATGCCTCCTATCTTAGC",
        "TTTTAGC",
        "TTAGCGCG",
        "CGCGCCGCGC",
        "CGCGCG"
    };

    var knownScaffolds = new HashSet<string>
    {
        "ATGCCTCCTATCTTAGCGCGC",
        "TTTTTT",
        "TTTTTA",
        "TTTTAA",
        "TTTAAA",
        "CGCGCCGCGC",
        "TTTTAGC",
        "CGCGCG"
    };

    using (var denovo = new ParallelDeNovoAssembler())
    {
        denovo.KmerLength = kmerSize;
        denovo.DanglingLinksThreshold = danglingLimit;
        denovo.RedundantPathLengthThreshold = redundantPathLimit;
        denovo.ScaffoldRedundancy = 0;
        denovo.Depth = 3;

        CloneLibrary.Instance.AddLibrary("abc", 5f, 20f);

        var assembly = (PadenaAssembly)denovo.Assemble(TestInputs.GetReadsForScaffolds(), true);

        Assert.AreEqual(10, assembly.ContigSequences.Count());
        foreach (ISequence contig in assembly.ContigSequences)
        {
            string text = new string(contig.Select(symbol => (char)symbol).ToArray());

            // The reverse complement is only computed when the forward text is not expected.
            bool isKnown = knownContigs.Contains(text);
            if (!isKnown)
            {
                isKnown = knownContigs.Contains(text.GetReverseComplement(new char[text.Length]));
            }

            Assert.IsTrue(isKnown);
        }

        Assert.AreEqual(8, assembly.Scaffolds.Count());
        foreach (ISequence scaffold in assembly.Scaffolds)
        {
            string text = new string(scaffold.Select(symbol => (char)symbol).ToArray());

            bool isKnown = knownScaffolds.Contains(text);
            if (!isKnown)
            {
                isKnown = knownScaffolds.Contains(text.GetReverseComplement(new char[text.Length]));
            }

            Assert.IsTrue(isKnown);
        }
    }
}
/// <summary>
/// Checks PadenaAssembly.ToString() twice: once with two hard-coded DNA contigs and
/// once with two sequences (and their alphabet) read from the test XML configuration.
/// In both cases the expected text is the contigs in insertion order, each followed by "\r\n".
/// </summary>
public void ValidatePadenaAssemblyToString()
{
    // Case 1: hard-coded contigs.
    ISequence seq2 = new Sequence(Alphabets.DNA, "ACAAAAGCAAC");
    ISequence seq1 = new Sequence(Alphabets.DNA, "ATGAAGGCAATACTAGTAGT");

    IList<ISequence> contigList = new List<ISequence> { seq1, seq2 };

    var denovoAssembly = new PadenaAssembly();
    denovoAssembly.AddContigs(contigList);

    string actualString = denovoAssembly.ToString();
    string expectedString = "ATGAAGGCAATACTAGTAGT\r\nACAAAAGCAAC\r\n";

    // Expected value goes first, matching the second assertion below, so a failure
    // message labels the two values correctly.
    Assert.AreEqual(expectedString, actualString);

    // Case 2: read sequences from xml.
    string sequence1 = this.utilityObj.xmlUtil.GetTextValue(Constants.AssemblyAlgorithmNodeName, Constants.SequenceNode1);
    string sequence2 = this.utilityObj.xmlUtil.GetTextValue(Constants.AssemblyAlgorithmNodeName, Constants.SequenceNode2);
    IAlphabet alphabet = Utility.GetAlphabet(
        this.utilityObj.xmlUtil.GetTextValue(Constants.AssemblyAlgorithmNodeName, Constants.AlphabetNameNode));

    var seq3 = new Sequence(alphabet, sequence1);
    var seq4 = new Sequence(alphabet, sequence2);

    IList<ISequence> contigList1 = new List<ISequence> { seq3, seq4 };

    var denovoAssembly1 = new PadenaAssembly();
    denovoAssembly1.AddContigs(contigList1);

    string actualString1 = denovoAssembly1.ToString();

    // NOTE(review): this expectation presumably mirrors the sequences stored in the XML
    // nodes; the XML content is not visible here, so keep the two in sync.
    string expectedString1 = "GCCAAAATTTAGGC\r\nTTATGGCGCCCACGGA\r\n";
    Assert.AreEqual(expectedString1, actualString1);
}
//TODO: This is really a PADENA Test, needs to be renamed.
/// <summary>
/// Assembles the scaffold test reads (passing true as the second Assemble argument)
/// and verifies that 10 contigs and 8 scaffolds are produced, each of which is either
/// in the expected list or has its reverse complement in the expected list.
/// </summary>
public void ValidateGetReverseComplement()
{
    const int kmerSize = 6;
    const int danglingLimit = 3;
    const int redundantPathLimit = 7;

    // Expected results are declared up front; they do not depend on the assembler run.
    var knownContigs = new List<string>
    {
        "TTTTTT",
        "TTAGCGCG",
        "CGCGCCGCGC",
        "CGCGCG",
        "GCGCGC",
        "TTTTTA",
        "TTTTAGC",
        "TTTTAA",
        "TTTAAA",
        "ATGCCTCCTATCTTAGC",
    };

    var knownScaffolds = new List<string>
    {
        "ATGCCTCCTATCTTAGCGCGC",
        "CGCGCG",
        "CGCGCCGCGC",
        "TTTTTA",
        "TTTTTT",
        "TTTTAGC",
        "TTTTAA",
        "TTTAAA",
    };

    using (var denovo = new ParallelDeNovoAssembler())
    {
        denovo.KmerLength = kmerSize;
        denovo.DanglingLinksThreshold = danglingLimit;
        denovo.RedundantPathLengthThreshold = redundantPathLimit;
        denovo.ScaffoldRedundancy = 0;
        denovo.Depth = 3;

        CloneLibrary.Instance.AddLibrary("abc", 5, 20);

        var assembly = (PadenaAssembly)denovo.Assemble(GetReadsForScaffolds(), true);

        Assert.AreEqual(10, assembly.ContigSequences.Count());
        foreach (ISequence contig in assembly.ContigSequences)
        {
            string text = contig.ConvertToString();

            // The reverse complement is only computed when the forward text is not expected.
            bool isKnown = knownContigs.Contains(text);
            if (!isKnown)
            {
                isKnown = knownContigs.Contains(text.GetReverseComplement(new char[text.Length]));
            }

            Assert.IsTrue(isKnown, "Found unknown contig " + text);
        }

        Assert.AreEqual(8, assembly.Scaffolds.Count());
        foreach (ISequence scaffold in assembly.Scaffolds)
        {
            string text = scaffold.ConvertToString();

            bool isKnown = knownScaffolds.Contains(text);
            if (!isKnown)
            {
                isKnown = knownScaffolds.Contains(text.GetReverseComplement(new char[text.Length]));
            }

            Assert.IsTrue(isKnown, "Found unknown scaffold " + text);
        }
    }
}
/// <summary>
/// Runs the reference-guided assembly pipeline on the supplied reads: builds the
/// de Bruijn graph, purges low-coverage, pathological and reference-unconnected nodes,
/// removes dangling links and redundant SNP/indel paths, attempts a mitochondrial
/// assembly, searches for deletions and finally builds contigs. Summary statistics are
/// stored on this instance along the way and progress is reported through RaiseMessage.
/// </summary>
/// <param name="inputSequences">Reads to assemble.</param>
/// <returns>
/// A PadenaAssembly holding the contigs sorted by descending length, or null when the
/// graph holds fewer than 100 nodes right after creation.
/// </returns>
/// <exception cref="ArgumentNullException">Thrown when inputSequences is null.</exception>
public override PadenaAssembly Assemble(IEnumerable<ISequence> inputSequences)
{
    if (inputSequences == null)
    {
        throw new ArgumentNullException("inputSequences");
    }

    // Step 0: Load the reference genome as a fasta file.
    // Remove ambiguous reads and set up fields for assembler process
    this.Initialize();

    // Step 1, 2: Create k-mers from reads and build de bruijn graph and paint them with the reference
    System.Diagnostics.Stopwatch sw = System.Diagnostics.Stopwatch.StartNew();
    this.CreateGraphStarted();
    this.CreateGraph(inputSequences);
    this.CreateGraphEnded();
    sw.Stop();
    this.NodeCountReport();
    this.TaskTimeSpanReport(sw.Elapsed);

    int count = this.Graph.GetNodes().Count(x => x.IsInReference);
    ReferenceNodeCountAfterCreation = count;
    TotalSequencingBP = Graph.GetNodes().Sum(x => x.KmerCount * KmerLength);
    RaiseMessage("A total of: " + count.ToString() + " nodes remain from the reference");
    RaiseMessage("A total of: " + this.Graph.NodeCount + " nodes are in the graph");
    NodeCountAfterCreation = Graph.NodeCount;
    SkippedReadsAfterQCCount = Graph.SkippedSequencesCount;
    ReadCount = Graph.ProcessedSequencesCount;

    // Too small a graph to work with: callers must handle the null result.
    if (NodeCountAfterCreation < 100)
    {
        return null;
    }

    // Step 2.1: Remove nodes that are below the coverage cutoff
    sw.Restart();

    // Estimate and set default value for erosion and coverage thresholds
    this.EstimateDefaultThresholds();
    int sqrtCoverageCutOff = this.CalculateSqrtOfMedianCoverageCutoff();
    var coverageCutOff = ForceSqrtThreshold
        ? sqrtCoverageCutOff
        : Math.Min(sqrtCoverageCutOff, AlternateMinimumNodeCount);

    // An explicitly supplied minimum node count overrides the computed cutoff.
    if (MTAssembleArguments.MinNodeCountSet)
    {
        coverageCutOff = AlternateMinimumNodeCount;
    }

    KmerCutOff = coverageCutOff;
    if (OutputIntermediateGraphSteps)
    {
        OutputNodeCountHistograms("PreFiltered", coverageCutOff);
    }

    long originalNodes = this.Graph.NodeCount;
    ThresholdCoverageNodePurger snr = new ThresholdCoverageNodePurger(coverageCutOff);
    snr.RemoveLowCoverageNodes(Graph);

    // NOTE(review): this stores original/remaining node counts, not a fraction removed
    // as the property name suggests - confirm what downstream consumers expect.
    PercentNodesRemovedByLowCoverageOrThreshold = originalNodes / (double)this.Graph.NodeCount;
    sw.Stop();
    TaskTimeSpanReport(sw.Elapsed);
    RaiseMessage("Finished removing nodes with less than " + snr.CoverageCutOff.ToString() + " counts");
    NodeCountReport();
    NodeCountAfterCoveragePurge = Graph.NodeCount;

    // NOTE(review): the timer restarted here is restarted again at Step 2.3 without its
    // elapsed time ever being reported.
    sw.Restart();
    RaiseMessage("Start removing unconnected nodes");

    // Remove pathological nodes
    var badNodes = PathologicalSequencePurger.RemovePathologicalNodes(Graph);
    if (badNodes > 0)
    {
        RaiseMessage("WARNING!!!! Found and removed " + badNodes.ToString() + " pathological nodes. These were removed.", false);
    }

    // Step 2.2: Remove nodes that are not connected to the reference genome
    UnlinkedToReferencePurger remover = new UnlinkedToReferencePurger();
    remover.RemoveUnconnectedNodes(Graph, referenceNodes);
    RaiseMessage("Finished removing unconnected nodes");
    this.NodeCountReport();

    // NOTE(review): captured before UnDangleGraph runs, despite the name - confirm intent.
    NodeCountAfterUndangle = Graph.NodeCount;

    // Step 2.3: Remove dangling links from graph
    // NIGEL: This also removes the low coverage nodes
    sw.Restart();
    this.UndangleGraphStarted();
    this.UnDangleGraph();
    this.UndangleGraphEnded();
    sw.Stop();
    this.TaskTimeSpanReport(sw.Elapsed);
    this.NodeCountReport();
    if (OutputIntermediateGraphSteps)
    {
        outputVisualization("PostUndangleFilter");
    }

    // Step 4: Remove redundant SNP and indel paths from graph
    RaiseMessage("Starting to remove redundant paths");
    this.RemoveRedundancyStarted();
    RaiseMessage("Starting to remove SNPs");
    this.RemoveRedundancy();
    RaiseMessage("Finished removing SNPs");
    this.NodeCountReport();

    // Now remove redundant indel paths as well
    // TODO: For historic reasons this is largely similar to the snp remover, which isn't so great...
    RaiseMessage("Starting to call INDELs");
    CallAndRemoveIndels();
    RaiseMessage("Finished calling and removing small INDELs paths");
    this.NodeCountReport();

    // Perform dangling link purger step once more.
    // This is done to remove any links created by redundant paths purger.
    this.UnDangleGraph();
    RaiseMessage("Finished removing all redundant paths");
    this.NodeCountReport();

    // Step 4.2: Rerun the unlinked to reference purger after graph is cleaned
    ChangeNodeVisitFlag(false);
    remover = new UnlinkedToReferencePurger();
    remover.RemoveUnconnectedNodes(Graph, referenceNodes);
    this.RemoveRedundancyEnded();
    this.NodeCountReport();
    NodeCountAfterRedundancyRemoval = Graph.NodeCount;
    if (OutputIntermediateGraphSteps)
    {
        outputVisualization("Post-redundant-path-removal");
    }

    // Now attempt to assemble and find deletions
    var attemptedAssembly = new MitochondrialAssembly(Graph, DiagnosticFileOutputPrefix);
    FinalMetaNodeCount = attemptedAssembly.AllNodesInGraph.Count;
    SuccessfulAssembly = attemptedAssembly.SuccessfulAssembly;
    if (SuccessfulAssembly)
    {
        SuccessfulAssemblyLength = attemptedAssembly.AssemblyLength;
        MinSplitPercentage = attemptedAssembly.MinimumGreedySplit;
        if (OutputDiagnosticInformation)
        {
            var outReport = attemptedAssembly.OutputAssembly(DiagnosticFileOutputPrefix);
            if (outReport != null)
            {
                // TODO: This matching is weak; find a better way to propagate these values upward.
                BestHaplotypeScore = outReport.BestHit.Rank;
                SecondBestHaplotypeScore = outReport.SecondBestHit.Rank;
                BestMatchingHaplotype = outReport.BestHit.node.haplogroup.id;
                SecondBestMatchingHaplotype = outReport.SecondBestHit.node.haplogroup.id;
                NumberOfEquallyGoodHaplotypes = outReport.NumberOfEquallyGoodBestHits;
                PolymorphismsMatchingHaplotype = outReport.BestHit.NumberOfMatchingPolymorphisms;
                PolymorphismsMissingFromHaplotype = outReport.BestHit.NumberOfPolymorphismsMissingFromHaplotype;
                PolymorphismsMissingFromGenotype = outReport.BestHit.NumberOfPolymorphismsMissingFromGenotype;
            }
        }
        else
        {
            // NOTE(review): this branch runs when the assembly succeeded but
            // OutputDiagnosticInformation is false, so the message text looks inaccurate.
            // Left unchanged in case logs are consumed elsewhere - confirm and reword.
            RaiseMessage("Greedy assembly skipped as assembly failed.");
        }
    }
    else
    {
        RaiseMessage("Greedy assembly skipped as assembly failed.");
    }

    // Now find deletions
    this.OutputGraphicAndFindDeletion(attemptedAssembly);
    PercentageOfScannedReadsUsed = Graph.GetNodes().Sum(x => x.KmerCount * KmerLength) / (double)TotalSequencingBP;
    RaiseMessage("Used a total of " + PercentageOfScannedReadsUsed.ToString("p") + " of basepairs in reads for assembly");

    // Step 5: Build Contigs - This is essentially independent of deletion finding
    this.BuildContigsStarted();
    List<ISequence> contigSequences = this.BuildContigs().ToList();
    contigSequences.ForEach(x => ReferenceGenome.AssignContigToMTDNA(x));

    // Longest contigs first.
    contigSequences.Sort((x, y) => -x.Count.CompareTo(y.Count));
    this.BuildContigsEnded();

    PadenaAssembly result = new PadenaAssembly();
    result.AddContigs(contigSequences);
    long totalLength = contigSequences.Sum(x => x.Count);
    RaiseMessage("Assembled " + totalLength.ToString() + " bases of sequence in " + contigSequences.Count.ToString() + " contigs.");
    if (contigSequences.Count > 0)
    {
        N50 = CalculateN50(contigSequences, totalLength);
        RaiseMessage("N50: " + N50.ToString());
    }

    count = this.Graph.GetNodes().Count(x => x.IsInReference);
    RaiseMessage("A total of: " + count.ToString() + " nodes remain from the reference");
    RaiseMessage("A total of: " + this.Graph.NodeCount + " nodes are in the graph");

    return result;
}