/// <summary>
/// Checks that <c>PadenaAssembly.ToString()</c> renders the contigs added through
/// <c>AddContigs</c> as one CRLF-terminated line per contig, in insertion order.
/// </summary>
public void TestPadenaAssemblyToString()
{
    ISequence seq2 = new Sequence(Alphabets.DNA, "ACAAAAGCAAC");
    ISequence seq1 = new Sequence(Alphabets.DNA, "ATGAAGGCAATACTAGTAGT");

    // seq1 is added first, so it is expected first in the rendered text.
    IList<ISequence> contigList = new List<ISequence>();
    contigList.Add(seq1);
    contigList.Add(seq2);

    PadenaAssembly denovoAssembly = new PadenaAssembly();
    denovoAssembly.AddContigs(contigList);

    string actualString = denovoAssembly.ToString();
    string expectedString = "ATGAAGGCAATACTAGTAGT\r\nACAAAAGCAAC\r\n";

    // Expected value goes first so that a failure message labels the two values correctly.
    Assert.AreEqual(expectedString, actualString);
}
/// <summary>
/// Checks <c>PadenaAssembly.ToString()</c> twice: once with two hard-coded DNA contigs and
/// once with two contigs whose sequences and alphabet name are read from the XML test
/// configuration. In both cases the expected text is one CRLF-terminated line per contig,
/// in the order the contigs were added.
/// </summary>
public void ValidatePadenaAssemblyToString()
{
    ISequence seq2 = new Sequence(Alphabets.DNA, "ACAAAAGCAAC");
    ISequence seq1 = new Sequence(Alphabets.DNA, "ATGAAGGCAATACTAGTAGT");
    IList<ISequence> contigList = new List<ISequence>();
    contigList.Add(seq1);
    contigList.Add(seq2);

    var denovoAssembly = new PadenaAssembly();
    denovoAssembly.AddContigs(contigList);

    string actualString = denovoAssembly.ToString();
    string expectedString = "ATGAAGGCAATACTAGTAGT\r\nACAAAAGCAAC\r\n";

    // Expected value goes first, matching the second assertion below, so that a
    // failure message labels the two values correctly.
    Assert.AreEqual(expectedString, actualString);

    // read sequences from xml
    string sequence1 = this.utilityObj.xmlUtil.GetTextValue(
        Constants.AssemblyAlgorithmNodeName, Constants.SequenceNode1);
    string sequence2 = this.utilityObj.xmlUtil.GetTextValue(
        Constants.AssemblyAlgorithmNodeName, Constants.SequenceNode2);
    IAlphabet alphabet = Utility.GetAlphabet(
        this.utilityObj.xmlUtil.GetTextValue(
            Constants.AssemblyAlgorithmNodeName, Constants.AlphabetNameNode));

    var seq3 = new Sequence(alphabet, sequence1);
    var seq4 = new Sequence(alphabet, sequence2);
    IList<ISequence> contigList1 = new List<ISequence>();
    contigList1.Add(seq3);
    contigList1.Add(seq4);

    var denovoAssembly1 = new PadenaAssembly();
    denovoAssembly1.AddContigs(contigList1);

    string actualString1 = denovoAssembly1.ToString();

    // NOTE(review): this literal presumably mirrors the SequenceNode1/SequenceNode2 values
    // stored in the XML file - confirm they stay in sync.
    string expectedString1 = "GCCAAAATTTAGGC\r\nTTATGGCGCCCACGGA\r\n";
    Assert.AreEqual(expectedString1, actualString1);
}
/// <summary>
/// Validates the text produced by <c>PadenaAssembly.ToString()</c> for two contig sets:
/// a pair of hard-coded DNA sequences, and a pair built from sequence strings and an
/// alphabet name looked up in the XML test configuration. Each contig is expected on its
/// own CRLF-terminated line, in the order it was added.
/// </summary>
public void ValidatePadenaAssemblyToString()
{
    // Part 1: hard-coded contigs.
    ISequence seq2 = new Sequence(Alphabets.DNA, "ACAAAAGCAAC");
    ISequence seq1 = new Sequence(Alphabets.DNA, "ATGAAGGCAATACTAGTAGT");
    IList<ISequence> contigList = new List<ISequence>();
    contigList.Add(seq1);
    contigList.Add(seq2);

    var denovoAssembly = new PadenaAssembly();
    denovoAssembly.AddContigs(contigList);

    string actualString = denovoAssembly.ToString();
    string expectedString = "ATGAAGGCAATACTAGTAGT\r\nACAAAAGCAAC\r\n";

    // Argument order is (expected, actual), consistent with the assertion at the end
    // of this method; reversed arguments would mislabel values in a failure message.
    Assert.AreEqual(expectedString, actualString);

    // Part 2: read sequences from xml
    string sequence1 = this.utilityObj.xmlUtil.GetTextValue(
        Constants.AssemblyAlgorithmNodeName, Constants.SequenceNode1);
    string sequence2 = this.utilityObj.xmlUtil.GetTextValue(
        Constants.AssemblyAlgorithmNodeName, Constants.SequenceNode2);
    string alphabetName = this.utilityObj.xmlUtil.GetTextValue(
        Constants.AssemblyAlgorithmNodeName, Constants.AlphabetNameNode);
    IAlphabet alphabet = Utility.GetAlphabet(alphabetName);

    var seq3 = new Sequence(alphabet, sequence1);
    var seq4 = new Sequence(alphabet, sequence2);
    IList<ISequence> contigList1 = new List<ISequence>();
    contigList1.Add(seq3);
    contigList1.Add(seq4);

    var denovoAssembly1 = new PadenaAssembly();
    denovoAssembly1.AddContigs(contigList1);

    string actualString1 = denovoAssembly1.ToString();

    // NOTE(review): presumably equal to the two XML sequence values joined with CRLF -
    // verify against the test data file.
    string expectedString1 = "GCCAAAATTTAGGC\r\nTTATGGCGCCCACGGA\r\n";
    Assert.AreEqual(expectedString1, actualString1);
}
/// <summary>
/// Runs the full assembly pipeline on the supplied reads: builds the de Bruijn graph,
/// purges low-coverage, pathological, unconnected, dangling and redundant nodes, attempts
/// a <c>MitochondrialAssembly</c> on the cleaned graph, searches for deletions and finally
/// builds contigs. Progress is reported through <c>RaiseMessage</c> and a number of
/// statistics properties on this instance are filled in along the way.
/// </summary>
/// <param name="inputSequences">Reads to assemble. Must not be null.</param>
/// <returns>
/// A <c>PadenaAssembly</c> holding the built contigs, sorted by descending length;
/// or <c>null</c> when the graph contains fewer than 100 nodes after creation.
/// </returns>
/// <exception cref="ArgumentNullException">
/// Thrown when <paramref name="inputSequences"/> is null.
/// </exception>
public override PadenaAssembly Assemble(IEnumerable<ISequence> inputSequences)
{
    if (inputSequences == null)
    {
        throw new ArgumentNullException("inputSequences");
    }

    // Step 0: Load the reference genome as a fasta file.
    // Remove ambiguous reads and set up fields for assembler process
    this.Initialize();

    // Step 1, 2: Create k-mers from reads and build de bruijn graph and paint them with the reference
    System.Diagnostics.Stopwatch sw = System.Diagnostics.Stopwatch.StartNew();
    this.CreateGraphStarted();
    this.CreateGraph(inputSequences);
    this.CreateGraphEnded();
    sw.Stop();
    this.NodeCountReport();
    this.TaskTimeSpanReport(sw.Elapsed);

    int count = this.Graph.GetNodes().Count(x => x.IsInReference);
    ReferenceNodeCountAfterCreation = count;
    TotalSequencingBP = Graph.GetNodes().Sum(x => x.KmerCount * KmerLength);
    RaiseMessage("A total of: " + count.ToString() + " nodes remain from the reference");
    RaiseMessage("A total of: " + this.Graph.NodeCount + " nodes are in the graph");
    NodeCountAfterCreation = Graph.NodeCount;
    SkippedReadsAfterQCCount = Graph.SkippedSequencesCount;
    ReadCount = Graph.ProcessedSequencesCount;

    // A graph this small is not processed any further; callers receive null.
    if (NodeCountAfterCreation < 100)
    {
        return null;
    }

    // Step 2.1: Remove nodes that are below the coverage cutoff
    sw.Restart();

    // Estimate and set default value for erosion and coverage thresholds
    this.EstimateDefaultThresholds();
    int sqrtCoverageCutOff = this.CalculateSqrtOfMedianCoverageCutoff();
    var coverageCutOff = ForceSqrtThreshold
        ? sqrtCoverageCutOff
        : Math.Min(sqrtCoverageCutOff, AlternateMinimumNodeCount);

    // An explicitly supplied minimum node count overrides the computed cutoff.
    if (MTAssembleArguments.MinNodeCountSet)
    {
        coverageCutOff = AlternateMinimumNodeCount;
    }

    KmerCutOff = coverageCutOff;
    if (OutputIntermediateGraphSteps)
    {
        OutputNodeCountHistograms("PreFiltered", coverageCutOff);
    }

    long originalNodes = this.Graph.NodeCount;
    ThresholdCoverageNodePurger snr = new ThresholdCoverageNodePurger(coverageCutOff);
    snr.RemoveLowCoverageNodes(Graph);

    // NOTE(review): this is the ratio of the node count before purging to the count after
    // purging, not a fraction of nodes removed as the property name suggests - confirm intent.
    PercentNodesRemovedByLowCoverageOrThreshold = originalNodes / (double)this.Graph.NodeCount;
    sw.Stop();
    TaskTimeSpanReport(sw.Elapsed);
    RaiseMessage("Finished removing nodes with less than " + snr.CoverageCutOff.ToString() + " counts");
    NodeCountReport();
    NodeCountAfterCoveragePurge = Graph.NodeCount;

    RaiseMessage("Start removing unconnected nodes");

    // Remove pathological nodes
    var badNodes = PathologicalSequencePurger.RemovePathologicalNodes(Graph);
    if (badNodes > 0)
    {
        RaiseMessage("WARNING!!!! Found and removed " + badNodes.ToString() + " pathological nodes. These were removed.", false);
    }

    // Step 2.2: Remove nodes that are not connected to the reference genome
    UnlinkedToReferencePurger remover = new UnlinkedToReferencePurger();
    remover.RemoveUnconnectedNodes(Graph, referenceNodes);
    RaiseMessage("Finished removing unconnected nodes");
    this.NodeCountReport();

    // NOTE(review): recorded before UnDangleGraph() runs below - confirm this is the intended
    // point in the pipeline for this statistic.
    NodeCountAfterUndangle = Graph.NodeCount;

    // Step 2.3: Remove dangling links from graph
    // NIGEL: This also removes the low coverage nodes
    sw.Restart();
    this.UndangleGraphStarted();
    this.UnDangleGraph();
    this.UndangleGraphEnded();
    sw.Stop();
    this.TaskTimeSpanReport(sw.Elapsed);
    this.NodeCountReport();
    if (OutputIntermediateGraphSteps)
    {
        outputVisualization("PostUndangleFilter");
    }

    // Step 4: Remove redundant SNP and indel paths from graph
    RaiseMessage("Starting to remove redundant paths");
    this.RemoveRedundancyStarted();
    RaiseMessage("Starting to remove SNPs");
    this.RemoveRedundancy();
    RaiseMessage("Finished removing SNPs");
    this.NodeCountReport();

    // Now remove redundant indel paths as well
    // TODO: For historic reasons this is largely similar to the snp remover, which isn't so great...
    RaiseMessage("Starting to call INDELs");
    CallAndRemoveIndels();
    RaiseMessage("Finished calling and removing small INDELs paths");
    this.NodeCountReport();

    // Perform dangling link purger step once more.
    // This is done to remove any links created by redundant paths purger.
    this.UnDangleGraph();
    RaiseMessage("Finished removing all redundant paths");
    this.NodeCountReport();

    // Step 4.2: Rerun the unlinked to reference purger after graph is cleaned
    ChangeNodeVisitFlag(false);
    remover = new UnlinkedToReferencePurger();
    remover.RemoveUnconnectedNodes(Graph, referenceNodes);
    this.RemoveRedundancyEnded();
    this.NodeCountReport();
    NodeCountAfterRedundancyRemoval = Graph.NodeCount;
    if (OutputIntermediateGraphSteps)
    {
        outputVisualization("Post-redundant-path-removal");
    }

    // Now attempt to assemble and find deletions
    var attemptedAssembly = new MitochondrialAssembly(Graph, DiagnosticFileOutputPrefix);
    FinalMetaNodeCount = attemptedAssembly.AllNodesInGraph.Count;
    SuccessfulAssembly = attemptedAssembly.SuccessfulAssembly;
    if (SuccessfulAssembly)
    {
        SuccessfulAssemblyLength = attemptedAssembly.AssemblyLength;
        MinSplitPercentage = attemptedAssembly.MinimumGreedySplit;
        if (OutputDiagnosticInformation)
        {
            var outReport = attemptedAssembly.OutputAssembly(DiagnosticFileOutputPrefix);
            if (outReport != null)
            {
                // TODO: This matching is really crappy, need to find a better way to propagate this on up.
                BestHaplotypeScore = outReport.BestHit.Rank;
                SecondBestHaplotypeScore = outReport.SecondBestHit.Rank;
                BestMatchingHaplotype = outReport.BestHit.node.haplogroup.id;
                SecondBestMatchingHaplotype = outReport.SecondBestHit.node.haplogroup.id;
                NumberOfEquallyGoodHaplotypes = outReport.NumberOfEquallyGoodBestHits;
                PolymorphismsMatchingHaplotype = outReport.BestHit.NumberOfMatchingPolymorphisms;
                PolymorphismsMissingFromHaplotype = outReport.BestHit.NumberOfPolymorphismsMissingFromHaplotype;
                PolymorphismsMissingFromGenotype = outReport.BestHit.NumberOfPolymorphismsMissingFromGenotype;
            }
        }
        else
        {
            // The assembly succeeded on this path; only the diagnostic report is skipped,
            // so the message must not claim that the assembly failed.
            RaiseMessage("Assembly report skipped as diagnostic output is disabled.");
        }
    }
    else
    {
        RaiseMessage("Greedy assembly skipped as assembly failed.");
    }

    // Now find deletions
    this.OutputGraphicAndFindDeletion(attemptedAssembly);
    PercentageOfScannedReadsUsed = Graph.GetNodes().Sum(x => x.KmerCount * KmerLength) / (double)TotalSequencingBP;
    RaiseMessage("Used a total of " + PercentageOfScannedReadsUsed.ToString("p") + " of basepairs in reads for assembly");

    // Step 5: Build Contigs - This is essentially independent of deletion finding
    this.BuildContigsStarted();
    List<ISequence> contigSequences = this.BuildContigs().ToList();
    contigSequences.ForEach(x => ReferenceGenome.AssignContigToMTDNA(x));

    // Longest contigs first.
    contigSequences.Sort((x, y) => y.Count.CompareTo(x.Count));
    this.BuildContigsEnded();

    PadenaAssembly result = new PadenaAssembly();
    result.AddContigs(contigSequences);

    long totalLength = contigSequences.Sum(x => x.Count);
    RaiseMessage("Assembled " + totalLength.ToString() + " bases of sequence in " + contigSequences.Count.ToString() + " contigs.");
    if (contigSequences.Count > 0)
    {
        N50 = CalculateN50(contigSequences, totalLength);
        RaiseMessage("N50: " + N50.ToString());
    }

    count = this.Graph.GetNodes().Count(x => x.IsInReference);
    RaiseMessage("A total of: " + count.ToString() + " nodes remain from the reference");
    RaiseMessage("A total of: " + this.Graph.NodeCount + " nodes are in the graph");
    return result;
}