Beispiel #1
0
        /// <summary>
        /// Call SNPs from the sorted list of sequences using the pile-up method.
        /// </summary>
        /// <remarks>
        /// Only positions with a called genotype and a known original position are considered.
        /// Of those, a position is genotyped when its observed base count is at least the
        /// square root of the median observed base count across the considered positions.
        /// </remarks>
        /// <returns>
        /// A report built from every genotype call and the resulting haplotype report, or a
        /// report constructed with <see cref="AlgorithmResult.Failed"/> when no position is usable.
        /// </returns>
        /// <param name="sequences">Sequences.</param>
        /// <exception cref="ArgumentNullException"><paramref name="sequences"/> is null.</exception>
        public static SNPCallerReport CallSNPs(IEnumerable <CompactSAMSequence> sequences)
        {
            if (sequences == null)
            {
                throw new ArgumentNullException("sequences");
            }

            // Get a pile up and convert it to genotypes
            var pileups   = PileUpProducer.CreatePileupFromReads(sequences);
            var genotypes = ContinuousGenotypeCaller.CallContinuousGenotypes(pileups).ToList();

            // Filter down to a usable set: a genotype was called and the position is known
            var usable = genotypes.Where(x => x.ResultType == GenotypeCallResult.GenotypeCalled && x.OriginalPosition.HasValue).ToList();

            if (usable.Count == 0)
            {
                return(new SNPCallerReport(AlgorithmResult.Failed));
            }

            // Get median coverage at sites (for an even count this picks the upper middle element)
            var coverages = usable.Select(x => x.TotalObservedBases).ToList();

            coverages.Sort();
            var median = coverages[coverages.Count / 2];

            // The required coverage is the square root of the median.
            var cutOff = Math.Sqrt(median);

            // Collect the positions that pass the cut-off; where the most frequent genotype
            // does not start with the reference base, record a polymorphism.
            var genotypedPositions    = new HashSet <int> ();
            List <Polymorphism> polys = new List <Polymorphism> ();

            foreach (var geno in usable)
            {
                if (geno.TotalObservedBases < cutOff)
                {
                    continue;
                }

                int position = geno.OriginalPosition.Value;
                genotypedPositions.Add(position);

                var referenceBase = ReferenceGenome.GetReferenceBaseAt_rCRSPosition(position);
                var calledBases   = geno.GetMostFrequentGenotype();
                if (referenceBase != calledBases[0])
                {
                    polys.Add(new Polymorphism(position, MutationAssigner.getBase(calledBases)));
                }
            }

            // Now assign haplotype; the filter accepts SNPs located at genotyped positions.
            HaplotypeSearcher  hts = new HaplotypeSearcher();
            PolymorphismFilter pf  = new PolymorphismFilter(p => p.IsSNP && genotypedPositions.Contains(p.Position));
            var simpSample         = new SimpleSample("Pileup", polys, pf);
            var hapReport          = hts.GetHaplotypeReport(simpSample);

            return(new SNPCallerReport(genotypes, hapReport));
        }
Beispiel #2
0
 /// <summary>
 /// Builds the final sequence from the accumulated contig bytes and, when the
 /// reference alignments call for it, rotates (and possibly reverse complements)
 /// that sequence so it lines up with the reference. Does nothing once finalized.
 /// </summary>
 public void FinalizeAndOrientToReference()
 {
     if (finalized)
     {
         return;
     }

     _sequence = new Bio.Sequence(DnaAlphabet.Instance, contigSequence.ToArray());
     var alignments = ReferenceGenome.GetDeltaAlignments(_sequence).SelectMany(x => x).ToList();

     // Re-orientation is a single rotation, attempted only when several alignments exist.
     if (alignments.Count > 1)
     {
         // Anchor on the alignment that begins earliest on the reference.
         var anchor = alignments.MinBy(x => x.FirstSequenceStart);

         // Shift the query start back by the reference start; the result may be slightly off.
         int referenceOffset = -(int)anchor.FirstSequenceStart;
         int rotation        = referenceOffset + (int)anchor.SecondSequenceStart;
         if (rotation > 0)
         {
             // Move the first `rotation` symbols to the end of the sequence.
             byte[] rotatedBytes = _sequence.Skip(rotation).Concat(_sequence.Take(rotation)).ToArray();
             var    rotated      = new Sequence(DnaAlphabet.Instance, rotatedBytes);
             if (rotated.Count != _sequence.Count)
             {
                 throw new Exception("Screw up when aligning the sequence");
             }
             if (anchor.IsReverseQueryDirection)
             {
                 ReversePath();
                 rotated = rotated.GetReverseComplementedSequence() as Sequence;
             }
             _sequence = rotated;
         }
     }

     // The underlying bytes should only be accessible from the sequence now
     contigSequence = null;
     finalized      = true;
 }
        /// <summary>
        /// Orients the assembly so reference positions increase along it and, when those
        /// positions change monotonically, records the aligned reference span, any deletions
        /// implied by gaps between consecutive alignments, and the k-mer support fraction
        /// at a simple bifurcation.
        /// </summary>
        private void LookForDeletion()
        {
            var inReference = Assembly.Where(x => x.IsInReference).ToList();

            // Count the adjacent pairs of reference nodes whose reference position increases.
            int increases = inReference.Skip(1)
                                       .Zip(inReference, (next, prev) => next.ReferenceGenomePosition > prev.ReferenceGenomePosition)
                                       .Count(wentUp => wentUp);

            // If monotonically changing, should only not change once (when it goes around the circle).
            if (increases < 2 || increases > (inReference.Count - 3))
            {
                ReferenceValuesChangeMonotonically = true;
            }

            // A small count means positions decrease along the path, so flip it to move upwards.
            if (increases <= (inReference.Count / 2))
            {
                Assembly.ReversePath();
            }

            // Only report for sensible assemblies
            if (!ReferenceValuesChangeMonotonically)
            {
                return;
            }

            Assembly.FinalizeAndOrientToReference();

            // Get Alignments
            var alns = HaploGrepSharp.ReferenceGenome.GetDeltaAlignments(Assembly.Sequence).SelectMany(x => x).ToList();
            if (alns.Count > 0)
            {
                StartReference = (int)alns.First().FirstSequenceStart;
                EndReference   = (int)alns.Last().FirstSequenceEnd;
            }

            SizeOfDeletionsSeen = String.Empty;
            if (alns.Count > 1)
            {
                // More than one alignment: each gap between consecutive alignments is reported as a deletion.
                HasDeletion = true;
                StringBuilder regions       = new StringBuilder();
                List <int>    deletionSizes = new List <int>();
                for (int i = 0; i < (alns.Count - 1); i++)
                {
                    var s = ReferenceGenome.ConvertTorCRSPosition((int)alns[i].FirstSequenceEnd);
                    var e = ReferenceGenome.ConvertTorCRSPosition((int)alns[i + 1].FirstSequenceStart);
                    regions.Append(s.ToString()).Append("-").Append(e.ToString()).Append(";");
                    deletionSizes.Add(e - s + 1);
                }
                DeletedRegions      = regions.ToString();
                SizeOfDeletionsSeen = String.Join(";", deletionSizes.Select(x => x.ToString()));
            }

            // Now see if we can get the fractional evidence for this. Only the simple case is
            // handled: exactly two places where a node has two neighbours on one side.
            var    totBifurcations = 0;
            double avg             = 0.0;
            for (int i = 0; i < (Assembly.Count - 1); i++)
            {
                var cnode  = Assembly[i];
                var lefts  = cnode.GetLeftExtensionNodes().ToList();
                var rights = cnode.GetRightExtensionNodes().ToList();
                List <List <DeBruijnNode> > neighbors = new List <List <DeBruijnNode> >()
                {
                    lefts, rights
                };
                foreach (var neighbor in neighbors)
                {
                    if (neighbor.Count == 2)
                    {
                        // Fraction of the k-mer count carried by the branch that stays on the assembly path.
                        // NOTE(review): when i == 0 this evaluates Assembly[i - 1]; confirm the indexer accepts -1.
                        var tot = neighbor.Sum(z => (double)z.KmerCount);
                        var cur = (double)neighbor.Where(z => z == Assembly[i - 1] || z == Assembly[i + 1]).First().KmerCount;
                        avg += cur / tot;
                        totBifurcations++;
                    }
                    else if (neighbor.Count > 2)
                    {
                        totBifurcations = 100; //arbitrarily set to too high a value
                        // NOTE(review): this leaves only the inner foreach; the outer scan continues.
                        break;
                    }
                }
            }
            if (totBifurcations == 2)
            {
                SimpleBifurcation = true;
                avg *= .5;// .5 * (a + b) = Average
            }
            AvgFractionBySplit = SimpleBifurcation ? avg : Double.NaN;
        }
Beispiel #4
0
        /// <summary>
        /// Builds a de Bruijn graph from the reads, prunes it in several passes, attempts a
        /// full assembly with deletion finding, and finally builds contigs from the graph.
        /// Node counts, thresholds and assembly statistics are stored on this instance as it goes.
        /// </summary>
        /// <param name="inputSequences">The reads to assemble.</param>
        /// <returns>
        /// The assembled contigs sorted by decreasing length, or <c>null</c> when the graph
        /// holds fewer than 100 nodes right after creation.
        /// </returns>
        /// <exception cref="ArgumentNullException"><paramref name="inputSequences"/> is null.</exception>
        public override PadenaAssembly Assemble(IEnumerable <ISequence> inputSequences)
        {
            if (inputSequences == null)
            {
                throw new ArgumentNullException("inputSequences");
            }
            // Step 0: Load the reference genome as a fasta file.
            // Remove ambiguous reads and set up fields for assembler process
            this.Initialize();

            // Step 1, 2: Create k-mers from reads and build de bruijn graph and paint them with the reference
            System.Diagnostics.Stopwatch sw = System.Diagnostics.Stopwatch.StartNew();
            this.CreateGraphStarted();
            this.CreateGraph(inputSequences);
            this.CreateGraphEnded();
            sw.Stop();
            this.NodeCountReport();
            this.TaskTimeSpanReport(sw.Elapsed);

            int count = this.Graph.GetNodes().Count(x => x.IsInReference);

            ReferenceNodeCountAfterCreation = count;
            TotalSequencingBP = Graph.GetNodes().Sum(x => x.KmerCount * KmerLength);
            RaiseMessage("A total of: " + count.ToString() + " nodes remain from the reference");
            RaiseMessage("A total of: " + this.Graph.NodeCount + " nodes are in the graph");

            NodeCountAfterCreation   = Graph.NodeCount;
            SkippedReadsAfterQCCount = Graph.SkippedSequencesCount;
            ReadCount = Graph.ProcessedSequencesCount;

            // Too small a graph to be worth assembling.
            if (NodeCountAfterCreation < 100)
            {
                return(null);
            }

            // Step 2.1: Remove nodes that are below the coverage cutoff
            sw.Restart();
            // Estimate and set default value for erosion and coverage thresholds
            this.EstimateDefaultThresholds();
            int sqrtCoverageCutOff = this.CalculateSqrtOfMedianCoverageCutoff();
            var coverageCutOff     = ForceSqrtThreshold ? sqrtCoverageCutOff : Math.Min(sqrtCoverageCutOff, AlternateMinimumNodeCount);

            // An explicitly set minimum node count overrides the computed cutoff.
            if (MTAssembleArguments.MinNodeCountSet)
            {
                coverageCutOff = AlternateMinimumNodeCount;
            }
            KmerCutOff = coverageCutOff;
            if (OutputIntermediateGraphSteps)
            {
                OutputNodeCountHistograms("PreFiltered", coverageCutOff);
            }
            long originalNodes = this.Graph.NodeCount;
            ThresholdCoverageNodePurger snr = new ThresholdCoverageNodePurger(coverageCutOff);

            snr.RemoveLowCoverageNodes(Graph);
            // NOTE(review): this is the ratio of original to remaining nodes, not a fraction removed
            // as the property name suggests; confirm what consumers expect before changing it.
            PercentNodesRemovedByLowCoverageOrThreshold = originalNodes / (double)this.Graph.NodeCount;
            sw.Stop();

            TaskTimeSpanReport(sw.Elapsed);
            RaiseMessage("Finished removing nodes with less than " + snr.CoverageCutOff.ToString() + " counts");
            NodeCountReport();
            NodeCountAfterCoveragePurge = Graph.NodeCount;
            sw.Restart();
            RaiseMessage("Start removing unconnected nodes");

            // Remove pathological nodes
            var badNodes = PathologicalSequencePurger.RemovePathologicalNodes(Graph);

            if (badNodes > 0)
            {
                RaiseMessage("WARNING!!!! Found and removed " + badNodes.ToString() + " pathological nodes.  These were removed.", false);
            }

            // Step 2.2: Remove nodes that are not connected to the reference genome
            UnlinkedToReferencePurger remover = new UnlinkedToReferencePurger();

            remover.RemoveUnconnectedNodes(Graph, referenceNodes);
            RaiseMessage("Finished removing unconnected nodes");
            this.NodeCountReport();
            NodeCountAfterUndangle = Graph.NodeCount;

            // Step 2.3: Remove dangling links from graph
            // NIGEL: This also removes the low coverage nodes
            sw.Restart();
            this.UndangleGraphStarted();
            this.UnDangleGraph();
            this.UndangleGraphEnded();
            sw.Stop();

            this.TaskTimeSpanReport(sw.Elapsed);
            this.NodeCountReport();
            if (OutputIntermediateGraphSteps)
            {
                outputVisualization("PostUndangleFilter");
            }

            // Step 4: Remove redundant SNP and indel paths from graph
            RaiseMessage("Starting to remove redundant paths");
            this.RemoveRedundancyStarted();
            RaiseMessage("Starting to remove SNPs");
            this.RemoveRedundancy();
            RaiseMessage("Finished  removing SNPs");
            this.NodeCountReport();
            // Now remove redundant indel paths as well
            // TODO: For historic reasons this is largely similar to the snp remover, which isn't so great...
            RaiseMessage("Starting to call INDELs");
            CallAndRemoveIndels(); // the returned indel calls are not used here

            RaiseMessage("Finished calling and removing small INDELs paths");
            this.NodeCountReport();
            // Perform dangling link purger step once more.
            // This is done to remove any links created by redundant paths purger.
            this.UnDangleGraph();
            RaiseMessage("Finished removing all redundant paths");
            this.NodeCountReport();

            // Step 4.2: Rerun the unlinked to reference purger after graph is cleaned
            ChangeNodeVisitFlag(false);
            remover = new UnlinkedToReferencePurger();
            remover.RemoveUnconnectedNodes(Graph, referenceNodes);
            this.RemoveRedundancyEnded();
            this.NodeCountReport();

            NodeCountAfterRedundancyRemoval = Graph.NodeCount;
            if (OutputIntermediateGraphSteps)
            {
                outputVisualization("Post-redundant-path-removal");
            }
            // Now attempt to assemble and find deletions
            var attemptedAssembly = new MitochondrialAssembly(Graph, DiagnosticFileOutputPrefix);

            FinalMetaNodeCount = attemptedAssembly.AllNodesInGraph.Count;
            SuccessfulAssembly = attemptedAssembly.SuccessfulAssembly;
            if (SuccessfulAssembly)
            {
                SuccessfulAssemblyLength = attemptedAssembly.AssemblyLength;
                MinSplitPercentage       = attemptedAssembly.MinimumGreedySplit;
                if (OutputDiagnosticInformation)
                {
                    var outReport = attemptedAssembly.OutputAssembly(DiagnosticFileOutputPrefix);
                    if (outReport != null)
                    {
                        // TODO: This matching is really crappy, need to find a better way to propagate this on up.
                        BestHaplotypeScore                = outReport.BestHit.Rank;
                        SecondBestHaplotypeScore          = outReport.SecondBestHit.Rank;
                        BestMatchingHaplotype             = outReport.BestHit.node.haplogroup.id;
                        SecondBestMatchingHaplotype       = outReport.SecondBestHit.node.haplogroup.id;
                        NumberOfEquallyGoodHaplotypes     = outReport.NumberOfEquallyGoodBestHits;
                        PolymorphismsMatchingHaplotype    = outReport.BestHit.NumberOfMatchingPolymorphisms;
                        PolymorphismsMissingFromHaplotype = outReport.BestHit.NumberOfPolymorphismsMissingFromHaplotype;
                        PolymorphismsMissingFromGenotype  = outReport.BestHit.NumberOfPolymorphismsMissingFromGenotype;
                    }
                }
                else
                {
                    // The assembly succeeded here; only the diagnostic output step is skipped.
                    RaiseMessage("Assembly output skipped as diagnostic output is disabled.");
                }
            }
            else
            {
                RaiseMessage("Greedy assembly skipped as assembly failed.");
            }

            // Now find deletions
            this.OutputGraphicAndFindDeletion(attemptedAssembly);
            PercentageOfScannedReadsUsed = Graph.GetNodes().Sum(x => x.KmerCount * KmerLength) / (double)TotalSequencingBP;
            RaiseMessage("Used a total of " + PercentageOfScannedReadsUsed.ToString("p") + " of basepairs in reads for assembly");

            // Step 5: Build Contigs - This is essentially independent of deletion finding
            this.BuildContigsStarted();
            List <ISequence> contigSequences = this.BuildContigs().ToList();

            contigSequences.ForEach(x => ReferenceGenome.AssignContigToMTDNA(x));
            // Longest contigs first.
            contigSequences.Sort((x, y) => - x.Count.CompareTo(y.Count));
            this.BuildContigsEnded();

            PadenaAssembly result = new PadenaAssembly();

            result.AddContigs(contigSequences);
            long totalLength = contigSequences.Sum(x => x.Count);

            RaiseMessage("Assembled " + totalLength.ToString() + " bases of sequence in " + contigSequences.Count.ToString() + " contigs.");
            if (contigSequences.Count > 0)
            {
                N50 = CalculateN50(contigSequences, totalLength);
                RaiseMessage("N50: " + N50.ToString());
            }
            count = this.Graph.GetNodes().Count(x => x.IsInReference);
            RaiseMessage("A total of: " + count.ToString() + " nodes remain from the reference");
            RaiseMessage("A total of: " + this.Graph.NodeCount + " nodes are in the graph");
            return(result);
        }