/// <summary>
/// For a successful assembly: optionally renders the assembly plot, records the
/// assembly length, and - when the assembly graph has fewer than 10 nodes - runs the
/// large-deletion search and writes its report. Does nothing when the assembly failed.
/// </summary>
/// <param name="attemptedAssembly">The assembly attempt to plot and inspect.</param>
protected void OutputGraphicAndFindDeletion(MitochondrialAssembly attemptedAssembly)
{
    // Nothing to plot or search when no assembly was produced.
    if (!attemptedAssembly.SuccessfulAssembly)
    {
        return;
    }

    MitochondrialAssemblyPlotMaker plotMaker = new MitochondrialAssemblyPlotMaker(attemptedAssembly);
#if !NO_R
    if (OutputIntermediateGraphSteps)
    {
        string plotPath = DiagnosticFileOutputPrefix + "AssemblyView.pdf";
        plotMaker.Render(rInt, plotPath);
    }
#endif
    var plottedAssembly = plotMaker.Assembly;
    DecidedAssemblyTotalLength = plottedAssembly.AssemblyLength;

    var containedNodeCount = plottedAssembly.AllNodesInGraph.Count;
    RaiseMessage("Graph contains " + containedNodeCount.ToString() + " Contained Nodes");

    // Graphs with 10 or more nodes are not searched; 999 is stored as the path count
    // in that case (presumably a "not computed" sentinel - confirm with consumers).
    if (containedNodeCount >= 10)
    {
        PossibleAssemblyCount = 999;
        DeletionSearchAttempted = false;
        return;
    }

    // Enumerate every possible assembly path and the deletions found along them.
    DeletionSearchAttempted = true;
    LargeDeletionFinder deletionFinder = new LargeDeletionFinder();
    var deletionReports = deletionFinder.FindAllDeletions(this.Graph, plotMaker.Assembly);

    DeletionsFoundInAssemblyGraph = deletionReports.Count(report => report.HasDeletion);
    PossibleAssemblyCount = deletionFinder.PossibleDeletionPaths.Count;

    string summary = "Found a total of: " + deletionReports.Count
        + " possible mutations in " + deletionFinder.PossibleDeletionPaths.Count.ToString()
        + " possible assembly paths";
    RaiseMessage(summary);

    deletionFinder.OutputReport(this.DiagnosticFileOutputPrefix + "DeletionReport.csv");
}
/// <summary>
/// Resets the visit state of every node in <paramref name="graph"/>, extends paths from
/// each node via <c>ExtendFromStartNode</c>, appends them to <c>PossibleDeletionPaths</c>,
/// and wraps every collected path in a <c>DeletionAnalysis</c>.
/// </summary>
/// <param name="graph">
/// Graph to search. It is also stored in the static <c>LargeDeletionFinder.graph</c> field
/// and its <c>KmerLength</c> is copied to <c>KmerLength</c>.
/// </param>
/// <param name="assembly">Not read by this method; kept so existing callers still compile.</param>
/// <returns>One <c>DeletionAnalysis</c> per entry in <c>PossibleDeletionPaths</c> (also stored in <c>DeletionReports</c>).</returns>
/// <exception cref="ArgumentNullException"><paramref name="graph"/> is null.</exception>
public List<DeletionAnalysis> FindAllDeletions(DeBruijnGraph graph, MitochondrialAssembly assembly)
{
    // Validate before touching the static field so a null argument cannot leave
    // LargeDeletionFinder.graph pointing at null for later users.
    if (graph == null)
    {
        throw new ArgumentNullException("graph");
    }

    LargeDeletionFinder.graph = graph;
    KmerLength = graph.KmerLength;

    // Set all nodes in the graph to not be visited.
    graph.GetNodes().AsParallel().ForAll(x => x.ResetVisitState());

    foreach (DeBruijnNode node in graph.GetNodes())
    {
        // Starting from each node, make any/all paths one can take.
        try
        {
            PossibleDeletionPaths.AddRange(ExtendFromStartNode(node));
        }
        catch (Exception thrown)
        {
            // Deliberately best-effort: a failure extending from one start node is
            // reported and the search continues with the remaining nodes.
            Console.WriteLine(thrown.Message);
        }
    }

    // NOTE(review): PossibleDeletionPaths is appended to and never cleared here, so a
    // second call on the same instance would accumulate paths - confirm this is intended.
    DeletionReports = PossibleDeletionPaths.Select(x => new DeletionAnalysis(x)).ToList();
    return DeletionReports;
}
/// <summary>
/// Runs the full assembly pipeline on the supplied reads: builds the de Bruijn graph,
/// purges low-coverage, pathological, unconnected, dangling and redundant nodes,
/// attempts a mitochondrial assembly, searches for deletions, and finally builds contigs.
/// Progress counters (node counts, read counts, cutoffs, haplotype scores, N50) are
/// written to this instance's properties along the way.
/// </summary>
/// <param name="inputSequences">The reads to assemble.</param>
/// <returns>
/// A <c>PadenaAssembly</c> holding the contigs sorted by descending length, or
/// <c>null</c> when fewer than 100 nodes are in the graph after creation.
/// </returns>
/// <exception cref="ArgumentNullException"><paramref name="inputSequences"/> is null.</exception>
public override PadenaAssembly Assemble(IEnumerable<ISequence> inputSequences)
{
    if (inputSequences == null)
    {
        throw new ArgumentNullException("inputSequences");
    }

    // Step 0: Load the reference genome as a fasta file.
    // Remove ambiguous reads and set up fields for assembler process
    this.Initialize();

    // Step 1, 2: Create k-mers from reads and build de bruijn graph and paint them with the reference
    System.Diagnostics.Stopwatch sw = System.Diagnostics.Stopwatch.StartNew();
    this.CreateGraphStarted();
    this.CreateGraph(inputSequences);
    this.CreateGraphEnded();
    sw.Stop();
    this.NodeCountReport();
    this.TaskTimeSpanReport(sw.Elapsed);

    int count = this.Graph.GetNodes().Count(x => x.IsInReference);
    ReferenceNodeCountAfterCreation = count;
    TotalSequencingBP = Graph.GetNodes().Sum(x => x.KmerCount * KmerLength);
    RaiseMessage("A total of: " + count.ToString() + " nodes remain from the reference");
    RaiseMessage("A total of: " + this.Graph.NodeCount + " nodes are in the graph");
    NodeCountAfterCreation = Graph.NodeCount;
    SkippedReadsAfterQCCount = Graph.SkippedSequencesCount;
    ReadCount = Graph.ProcessedSequencesCount;

    // Too few nodes to attempt an assembly; callers must handle the null result.
    if (NodeCountAfterCreation < 100)
    {
        return null;
    }

    // Step 2.1: Remove nodes that are below the coverage cutoff
    sw.Restart();

    // Estimate and set default value for erosion and coverage thresholds
    this.EstimateDefaultThresholds();
    int sqrtCoverageCutOff = this.CalculateSqrtOfMedianCoverageCutoff();
    var coverageCutOff = ForceSqrtThreshold
        ? sqrtCoverageCutOff
        : Math.Min(sqrtCoverageCutOff, AlternateMinimumNodeCount);

    // An explicitly supplied minimum node count overrides the estimate.
    if (MTAssembleArguments.MinNodeCountSet)
    {
        coverageCutOff = AlternateMinimumNodeCount;
    }
    KmerCutOff = coverageCutOff;

    if (OutputIntermediateGraphSteps)
    {
        OutputNodeCountHistograms("PreFiltered", coverageCutOff);
    }

    long originalNodes = this.Graph.NodeCount;
    ThresholdCoverageNodePurger snr = new ThresholdCoverageNodePurger(coverageCutOff);
    snr.RemoveLowCoverageNodes(Graph);

    // Share of nodes removed by the purge, stored as a fraction (0 = none, 1 = all).
    // The previous expression (original / remaining) was a before/after ratio that is
    // always >= 1 and becomes infinite when every node is removed.
    PercentNodesRemovedByLowCoverageOrThreshold = originalNodes > 0
        ? (originalNodes - this.Graph.NodeCount) / (double)originalNodes
        : 0.0;

    sw.Stop();
    TaskTimeSpanReport(sw.Elapsed);
    RaiseMessage("Finished removing nodes with less than " + snr.CoverageCutOff.ToString() + " counts");
    NodeCountReport();
    NodeCountAfterCoveragePurge = Graph.NodeCount;

    sw.Restart();
    RaiseMessage("Start removing unconnected nodes");

    // Remove pathological nodes
    var badNodes = PathologicalSequencePurger.RemovePathologicalNodes(Graph);
    if (badNodes > 0)
    {
        RaiseMessage("WARNING!!!! Found and removed " + badNodes.ToString() + " pathological nodes. These were removed.", false);
    }

    // Step 2.2: Remove nodes that are not connected to the reference genome
    UnlinkedToReferencePurger remover = new UnlinkedToReferencePurger();
    remover.RemoveUnconnectedNodes(Graph, referenceNodes);
    RaiseMessage("Finished removing unconnected nodes");
    this.NodeCountReport();
    // NOTE(review): this is recorded BEFORE UnDangleGraph runs below, so despite its name
    // it holds the node count after the unconnected-node purge - confirm which is intended.
    NodeCountAfterUndangle = Graph.NodeCount;

    // Step 2.3: Remove dangling links from graph
    // NIGEL: This also removes the low coverage nodes
    sw.Restart();
    this.UndangleGraphStarted();
    this.UnDangleGraph();
    this.UndangleGraphEnded();
    sw.Stop();
    this.TaskTimeSpanReport(sw.Elapsed);
    this.NodeCountReport();
    if (OutputIntermediateGraphSteps)
    {
        outputVisualization("PostUndangleFilter");
    }

    // Step 4: Remove redundant SNP and indel paths from graph
    RaiseMessage("Starting to remove redundant paths");
    this.RemoveRedundancyStarted();
    RaiseMessage("Starting to remove SNPs");
    this.RemoveRedundancy();
    RaiseMessage("Finished removing SNPs");
    this.NodeCountReport();

    // Now remove redundant indel paths as well
    // TODO: For historic reasons this is largely similar to the snp remover, which isn't so great...
    RaiseMessage("Starting to call INDELs");
    CallAndRemoveIndels(); // return value is not used by this method
    RaiseMessage("Finished calling and removing small INDELs paths");
    this.NodeCountReport();

    // Perform dangling link purger step once more.
    // This is done to remove any links created by redundant paths purger.
    this.UnDangleGraph();
    RaiseMessage("Finished removing all redundant paths");
    this.NodeCountReport();

    // Step 4.2: Rerun the unlinked to reference purger after graph is cleaned
    ChangeNodeVisitFlag(false);
    remover = new UnlinkedToReferencePurger();
    remover.RemoveUnconnectedNodes(Graph, referenceNodes);
    this.RemoveRedundancyEnded();
    this.NodeCountReport();
    NodeCountAfterRedundancyRemoval = Graph.NodeCount;
    if (OutputIntermediateGraphSteps)
    {
        outputVisualization("Post-redundant-path-removal");
    }

    // Now attempt to assemble and find deletions
    var attemptedAssembly = new MitochondrialAssembly(Graph, DiagnosticFileOutputPrefix);
    FinalMetaNodeCount = attemptedAssembly.AllNodesInGraph.Count;
    SuccessfulAssembly = attemptedAssembly.SuccessfulAssembly;
    if (SuccessfulAssembly)
    {
        SuccessfulAssemblyLength = attemptedAssembly.AssemblyLength;
        MinSplitPercentage = attemptedAssembly.MinimumGreedySplit;
        if (OutputDiagnosticInformation)
        {
            var outReport = attemptedAssembly.OutputAssembly(DiagnosticFileOutputPrefix);
            if (outReport != null)
            {
                // TODO: This matching is really crappy, need to find a better way to propagate this on up.
                BestHaplotypeScore = outReport.BestHit.Rank;
                SecondBestHaplotypeScore = outReport.SecondBestHit.Rank;
                BestMatchingHaplotype = outReport.BestHit.node.haplogroup.id;
                SecondBestMatchingHaplotype = outReport.SecondBestHit.node.haplogroup.id;
                NumberOfEquallyGoodHaplotypes = outReport.NumberOfEquallyGoodBestHits;
                PolymorphismsMatchingHaplotype = outReport.BestHit.NumberOfMatchingPolymorphisms;
                PolymorphismsMissingFromHaplotype = outReport.BestHit.NumberOfPolymorphismsMissingFromHaplotype;
                PolymorphismsMissingFromGenotype = outReport.BestHit.NumberOfPolymorphismsMissingFromGenotype;
            }
        }
        else
        {
            // The assembly succeeded on this path; only the diagnostic report is skipped.
            // The previous message wrongly claimed the assembly had failed.
            RaiseMessage("Assembly report skipped as diagnostic output is disabled.");
        }
    }
    else
    {
        RaiseMessage("Greedy assembly skipped as assembly failed.");
    }

    // Now find deletions
    this.OutputGraphicAndFindDeletion(attemptedAssembly);
    PercentageOfScannedReadsUsed = Graph.GetNodes().Sum(x => x.KmerCount * KmerLength) / (double)TotalSequencingBP;
    RaiseMessage("Used a total of " + PercentageOfScannedReadsUsed.ToString("p") + " of basepairs in reads for assembly");

    // Step 5: Build Contigs - This is essentially independent of deletion finding
    this.BuildContigsStarted();
    List<ISequence> contigSequences = this.BuildContigs().ToList();
    contigSequences.ForEach(x => ReferenceGenome.AssignContigToMTDNA(x));
    // Longest contig first; operands are swapped rather than negating CompareTo.
    contigSequences.Sort((x, y) => y.Count.CompareTo(x.Count));
    this.BuildContigsEnded();

    PadenaAssembly result = new PadenaAssembly();
    result.AddContigs(contigSequences);
    long totalLength = contigSequences.Sum(x => x.Count);
    RaiseMessage("Assembled " + totalLength.ToString() + " bases of sequence in " + contigSequences.Count.ToString() + " contigs.");
    if (contigSequences.Count > 0)
    {
        N50 = CalculateN50(contigSequences, totalLength);
        RaiseMessage("N50: " + N50.ToString());
    }

    count = this.Graph.GetNodes().Count(x => x.IsInReference);
    RaiseMessage("A total of: " + count.ToString() + " nodes remain from the reference");
    RaiseMessage("A total of: " + this.Graph.NodeCount + " nodes are in the graph");
    return result;
}