예제 #1
0
        /// <summary>
        /// For a successful assembly: optionally renders the assembly plot, records the
        /// assembly length, and runs the deletion search when the assembly graph has
        /// fewer than 10 nodes. Does nothing when the assembly was not successful.
        /// </summary>
        /// <param name="attemptedAssembly">Assembly whose <c>SuccessfulAssembly</c> flag gates all work done here.</param>
        protected void OutputGraphicAndFindDeletion(MitochondrialAssembly attemptedAssembly)
        {
            // Failed assemblies leave every field written below untouched.
            if (!attemptedAssembly.SuccessfulAssembly)
            {
                return;
            }

            var renderer = new MitochondrialAssemblyPlotMaker(attemptedAssembly);
#if !NO_R
            if (OutputIntermediateGraphSteps)
            {
                renderer.Render(rInt, DiagnosticFileOutputPrefix + "AssemblyView.pdf");
            }
#endif
            DecidedAssemblyTotalLength = renderer.Assembly.AssemblyLength;

            // Report the graph size; it also decides whether the deletion search is run.
            var containedNodeCount = renderer.Assembly.AllNodesInGraph.Count;
            RaiseMessage("Graph contains " + containedNodeCount.ToString() + " Contained Nodes");

            if (containedNodeCount >= 10)
            {
                // Graphs with 10 or more nodes are not searched; 999 is stored as the path count.
                PossibleAssemblyCount   = 999;
                DeletionSearchAttempted = false;
                return;
            }

            DeletionSearchAttempted = true;
            var finder   = new LargeDeletionFinder();
            var analyses = finder.FindAllDeletions(this.Graph, renderer.Assembly);

            // Count only the analyses that actually flag a deletion.
            DeletionsFoundInAssemblyGraph = analyses.Count(z => z.HasDeletion);
            PossibleAssemblyCount         = finder.PossibleDeletionPaths.Count;
            RaiseMessage("Found a total of: " + analyses.Count + " possible mutations in " + finder.PossibleDeletionPaths.Count.ToString() + " possible assembly paths");

            // NOTE(review): the original code carried a "not finalized" remark at this point - confirm report status.
            finder.OutputReport(this.DiagnosticFileOutputPrefix + "DeletionReport.csv");
        }
 /// <summary>
 /// Resets the visit state of every node in <paramref name="graph"/>, appends the paths
 /// produced by extending from each node to <c>PossibleDeletionPaths</c>, and builds one
 /// <see cref="DeletionAnalysis"/> per collected path.
 /// </summary>
 /// <param name="graph">Graph to search; must not be null.</param>
 /// <param name="assembly">Not read by this method.</param>
 /// <returns>The list stored in <c>DeletionReports</c>, one entry per element of <c>PossibleDeletionPaths</c>.</returns>
 /// <exception cref="ArgumentNullException">Thrown when <paramref name="graph"/> is null.</exception>
 public List <DeletionAnalysis> FindAllDeletions(DeBruijnGraph graph, MitochondrialAssembly assembly)
 {
     // Validate before touching any state so a null graph never reaches the static field.
     if (graph == null)
     {
         throw new ArgumentNullException("graph");
     }

     LargeDeletionFinder.graph = graph;
     KmerLength = graph.KmerLength;
     //set all edges in the graph to not be visited
     graph.GetNodes().AsParallel().ForAll(x => x.ResetVisitState());
     foreach (DeBruijnNode node in graph.GetNodes())
     {
         //starting from any unused edges in the network, make any/all paths one can
         //take
         try
         {
             // AddRange appends: paths already held in PossibleDeletionPaths are kept.
             PossibleDeletionPaths.AddRange(ExtendFromStartNode(node));
         }
         catch (Exception thrown) {
             // Best effort: a failure on one start node is written to the console and
             // the remaining nodes are still processed.
             Console.WriteLine(thrown.Message);
         }
     }
     DeletionReports = PossibleDeletionPaths.Select(x => new DeletionAnalysis(x)).ToList();
     return(DeletionReports);
 }
예제 #3
0
        /// <summary>
        /// Runs the assembly pipeline on the supplied reads: builds the graph, removes
        /// low-coverage, pathological, unconnected, dangling and redundant nodes, attempts a
        /// mitochondrial assembly with deletion search, and finally builds contigs.
        /// Summary statistics are stored in this instance's properties along the way.
        /// </summary>
        /// <param name="inputSequences">Reads to assemble; must not be null.</param>
        /// <returns>
        /// A <see cref="PadenaAssembly"/> holding the contigs, or <c>null</c> when fewer than
        /// 100 nodes exist after graph creation.
        /// </returns>
        /// <exception cref="ArgumentNullException">Thrown when <paramref name="inputSequences"/> is null.</exception>
        public override PadenaAssembly Assemble(IEnumerable <ISequence> inputSequences)
        {
            if (inputSequences == null)
            {
                throw new ArgumentNullException("inputSequences");
            }
            // Step 0: Load the reference genome as a fasta file.
            // Remove ambiguous reads and set up fields for assembler process
            this.Initialize();

            // Step 1, 2: Create k-mers from reads and build de bruijn graph and paint them with the reference
            System.Diagnostics.Stopwatch sw = System.Diagnostics.Stopwatch.StartNew();
            this.CreateGraphStarted();
            this.CreateGraph(inputSequences);
            this.CreateGraphEnded();
            sw.Stop();
            this.NodeCountReport();
            this.TaskTimeSpanReport(sw.Elapsed);

            int count = this.Graph.GetNodes().Count(x => x.IsInReference);

            ReferenceNodeCountAfterCreation = count;
            TotalSequencingBP = Graph.GetNodes().Sum(x => x.KmerCount * KmerLength);
            RaiseMessage("A total of: " + count.ToString() + " nodes remain from the reference");
            RaiseMessage("A total of: " + this.Graph.NodeCount + " nodes are in the graph");

            NodeCountAfterCreation   = Graph.NodeCount;
            SkippedReadsAfterQCCount = Graph.SkippedSequencesCount;
            ReadCount = Graph.ProcessedSequencesCount;

            // Graphs with fewer than 100 nodes are not processed further; callers receive null.
            if (NodeCountAfterCreation < 100)
            {
                return(null);
            }

            // Step 2.1: Remove nodes that are below the coverage cutoff
            sw.Restart();
            // Estimate and set default value for erosion and coverage thresholds
            this.EstimateDefaultThresholds();
            int sqrtCoverageCutOff = this.CalculateSqrtOfMedianCoverageCutoff();
            var coverageCutOff     = ForceSqrtThreshold ? sqrtCoverageCutOff : Math.Min(sqrtCoverageCutOff, AlternateMinimumNodeCount);

            // An explicitly set minimum node count overrides the computed cutoff.
            if (MTAssembleArguments.MinNodeCountSet)
            {
                coverageCutOff = AlternateMinimumNodeCount;
            }
            KmerCutOff = coverageCutOff;
            if (OutputIntermediateGraphSteps)
            {
                OutputNodeCountHistograms("PreFiltered", coverageCutOff);
            }
            long originalNodes = this.Graph.NodeCount;
            ThresholdCoverageNodePurger snr = new ThresholdCoverageNodePurger(coverageCutOff);

            snr.RemoveLowCoverageNodes(Graph);
            // NOTE(review): this stores the original node count divided by the remaining node
            // count, which is a ratio rather than the removed fraction the property name
            // suggests - confirm intent before changing.
            PercentNodesRemovedByLowCoverageOrThreshold = originalNodes / (double)this.Graph.NodeCount;
            sw.Stop();

            TaskTimeSpanReport(sw.Elapsed);
            RaiseMessage("Finished removing nodes with less than " + snr.CoverageCutOff.ToString() + " counts");
            NodeCountReport();
            NodeCountAfterCoveragePurge = Graph.NodeCount;
            sw.Restart();
            RaiseMessage("Start removing unconnected nodes");

            // Remove pathological nodes
            var badNodes = PathologicalSequencePurger.RemovePathologicalNodes(Graph);

            if (badNodes > 0)
            {
                RaiseMessage("WARNING!!!! Found and removed " + badNodes.ToString() + " pathological nodes.  These were removed.", false);
            }

            // Step 2.2: Remove nodes that are not connected to the reference genome
            UnlinkedToReferencePurger remover = new UnlinkedToReferencePurger();

            remover.RemoveUnconnectedNodes(Graph, referenceNodes);
            RaiseMessage("Finished removing unconnected nodes");
            this.NodeCountReport();
            NodeCountAfterUndangle = Graph.NodeCount;

            // Step 2.3: Remove dangling links from graph
            // NIGEL: This also removes the low coverage nodes
            sw.Restart();
            this.UndangleGraphStarted();
            this.UnDangleGraph();
            this.UndangleGraphEnded();
            sw.Stop();

            this.TaskTimeSpanReport(sw.Elapsed);
            this.NodeCountReport();
            if (OutputIntermediateGraphSteps)
            {
                outputVisualization("PostUndangleFilter");
            }

            // Step 4: Remove redundant SNP and indel paths from graph
            RaiseMessage("Starting to remove redundant paths");
            this.RemoveRedundancyStarted();
            RaiseMessage("Starting to remove SNPs");
            this.RemoveRedundancy();
            RaiseMessage("Finished  removing SNPs");
            this.NodeCountReport();
            //Now remove redundant indel paths as well
            //TODO: For historic reasons this is largely similar to the snp remover, which isn't so great...
            RaiseMessage("Starting to call INDELs");
            // The return value is not used here; the call is kept for its effect on the graph.
            CallAndRemoveIndels();

            RaiseMessage("Finished calling and removing small INDELs paths");
            this.NodeCountReport();
            // Perform dangling link purger step once more.
            // This is done to remove any links created by redundant paths purger.
            this.UnDangleGraph();
            RaiseMessage("Finished removing all redundant paths");
            this.NodeCountReport();

            // Step 4.2: Rerun the unlinked to reference purger after graph is cleaned
            ChangeNodeVisitFlag(false);
            remover = new UnlinkedToReferencePurger();
            remover.RemoveUnconnectedNodes(Graph, referenceNodes);
            this.RemoveRedundancyEnded();
            this.NodeCountReport();

            NodeCountAfterRedundancyRemoval = Graph.NodeCount;
            if (OutputIntermediateGraphSteps)
            {
                outputVisualization("Post-redundant-path-removal");
            }
            //Now attempt to assemble and find deletions
            var attemptedAssembly = new MitochondrialAssembly(Graph, DiagnosticFileOutputPrefix);

            FinalMetaNodeCount = attemptedAssembly.AllNodesInGraph.Count;
            SuccessfulAssembly = attemptedAssembly.SuccessfulAssembly;
            if (SuccessfulAssembly)
            {
                SuccessfulAssemblyLength = attemptedAssembly.AssemblyLength;
                MinSplitPercentage       = attemptedAssembly.MinimumGreedySplit;
                if (OutputDiagnosticInformation)
                {
                    var outReport = attemptedAssembly.OutputAssembly(DiagnosticFileOutputPrefix);
                    if (outReport != null)
                    {
                        //TODO: This matching is really crappy, need to find a better way to propagate this on up.
                        BestHaplotypeScore                = outReport.BestHit.Rank;
                        SecondBestHaplotypeScore          = outReport.SecondBestHit.Rank;
                        BestMatchingHaplotype             = outReport.BestHit.node.haplogroup.id;
                        SecondBestMatchingHaplotype       = outReport.SecondBestHit.node.haplogroup.id;
                        NumberOfEquallyGoodHaplotypes     = outReport.NumberOfEquallyGoodBestHits;
                        PolymorphismsMatchingHaplotype    = outReport.BestHit.NumberOfMatchingPolymorphisms;
                        PolymorphismsMissingFromHaplotype = outReport.BestHit.NumberOfPolymorphismsMissingFromHaplotype;
                        PolymorphismsMissingFromGenotype  = outReport.BestHit.NumberOfPolymorphismsMissingFromGenotype;
                    }
                }
                else
                {
                    // The assembly succeeded on this path; only the diagnostic output is skipped.
                    RaiseMessage("Assembly output skipped as diagnostic information output is disabled.");
                }
            }
            else
            {
                RaiseMessage("Greedy assembly skipped as assembly failed.");
            }

            //Now find deletions
            this.OutputGraphicAndFindDeletion(attemptedAssembly);
            PercentageOfScannedReadsUsed = Graph.GetNodes().Sum(x => x.KmerCount * KmerLength) / (double)TotalSequencingBP;
            RaiseMessage("Used a total of " + PercentageOfScannedReadsUsed.ToString("p") + " of basepairs in reads for assembly");

            // Step 5: Build Contigs - This is essentially independent of deletion finding
            this.BuildContigsStarted();
            List <ISequence> contigSequences = this.BuildContigs().ToList();

            contigSequences.ForEach(x => ReferenceGenome.AssignContigToMTDNA(x));
            // Longest contig first: comparing y to x yields descending order by length.
            contigSequences.Sort((x, y) => y.Count.CompareTo(x.Count));
            this.BuildContigsEnded();

            PadenaAssembly result = new PadenaAssembly();

            result.AddContigs(contigSequences);
            long totalLength = contigSequences.Sum(x => x.Count);

            RaiseMessage("Assembled " + totalLength.ToString() + " bases of sequence in " + contigSequences.Count.ToString() + " contigs.");
            if (contigSequences.Count > 0)
            {
                N50 = CalculateN50(contigSequences, totalLength);
                RaiseMessage("N50: " + N50.ToString());
            }
            count = this.Graph.GetNodes().Count(x => x.IsInReference);
            RaiseMessage("A total of: " + count.ToString() + " nodes remain from the reference");
            RaiseMessage("A total of: " + this.Graph.NodeCount + " nodes are in the graph");
            return(result);
        }