/// <summary>
/// Calls SNPs from sorted reads using the pile-up method. Genotypes are called
/// per site, filtered to sites whose coverage meets a sqrt-of-median cut-off,
/// and any site whose most frequent genotype differs from the rCRS reference
/// base is reported as a polymorphism, followed by a haplotype assignment.
/// </summary>
/// <param name="sequences">Aligned reads to pile up.</param>
/// <returns>A report with all genotype calls and the haplotype search result,
/// or a failed report when no site could be genotyped.</returns>
public static SNPCallerReport CallSNPs(IEnumerable<CompactSAMSequence> sequences)
{
    // Pile up the reads and call a genotype at every covered position.
    var sitePileups = PileUpProducer.CreatePileupFromReads(sequences);
    var allCalls = ContinuousGenotypeCaller.CallContinuousGenotypes(sitePileups).ToList();

    // Keep only successful calls that map back to a known reference position.
    var callable = allCalls
        .Where(g => g.ResultType == GenotypeCallResult.GenotypeCalled && g.OriginalPosition.HasValue)
        .ToList();
    if (callable.Count == 0)
    {
        return new SNPCallerReport(AlgorithmResult.Failed);
    }

    // Required coverage: square root of the median observed-base count across sites.
    var coverageMedian = callable
        .Select(g => g.TotalObservedBases)
        .OrderBy(c => c)
        .ElementAt(callable.Count / 2);
    var minimumCoverage = Math.Sqrt(coverageMedian);

    // Walk the adequately covered sites; a mismatch between the called genotype's
    // leading base and the reference base becomes a polymorphism.
    var genotypedPositions = new HashSet<int>();
    var polymorphisms = new List<Polymorphism>();
    foreach (var call in callable)
    {
        if (call.TotalObservedBases < minimumCoverage)
        {
            continue;
        }
        int position = call.OriginalPosition.Value;
        genotypedPositions.Add(position);
        var referenceBase = ReferenceGenome.GetReferenceBaseAt_rCRSPosition(position);
        var calledGenotype = call.GetMostFrequentGenotype();
        if (referenceBase != calledGenotype[0])
        {
            polymorphisms.Add(new Polymorphism(position, MutationAssigner.getBase(calledGenotype)));
        }
    }

    // Assign a haplotype, considering only SNPs at positions we actually genotyped.
    var searcher = new HaplotypeSearcher();
    var filter = new PolymorphismFilter(p => p.IsSNP && genotypedPositions.Contains(p.Position));
    var sample = new SimpleSample("Pileup", polymorphisms, filter);
    var haplotypeReport = searcher.GetHaplotypeReport(sample);
    return new SNPCallerReport(allCalls, haplotypeReport);
}
/// <summary>
/// Orient this sequence to the reference, given its MSA with it.
/// Converts the accumulated contig bytes into an immutable sequence, and when
/// the reference alignment comes back in multiple pieces, rotates (and if
/// needed reverse-complements) the sequence with a single flip so that it
/// lines up with the start of the reference. Idempotent: does nothing once
/// <c>finalized</c> is set.
/// </summary>
public void FinalizeAndOrientToReference() {
    if (!finalized) {
        // Freeze the mutable contig bytes into a Bio sequence.
        _sequence = new Bio.Sequence(DnaAlphabet.Instance, contigSequence.ToArray());
        // Flatten all delta alignments against the reference into one list.
        var delts = ReferenceGenome.GetDeltaAlignments(_sequence).SelectMany(x => x).ToList();
        // Now change orientation so that it matches well with the reference;
        // will do a simple single flip (rotation) only.
        // NOTE(review): a single alignment delta (delts.Count == 1) is left
        // unrotated even if it starts at a nonzero offset — confirm intentional.
        if (delts.Count > 1) {
            // Find the alignment with the lowest start on the reference, go from there.
            var deltToUse = delts.MinBy(x => x.FirstSequenceStart);
            // Okay, given the alignment will either be at start or at end, we are
            // going to attempt one flip here; might be off slightly.
            // moveBack shifts the query start back by the reference offset of the
            // chosen alignment (0 when that alignment already starts at reference 0).
            int moveBack = deltToUse.FirstSequenceStart != 0L ? 0 - (int)deltToUse.FirstSequenceStart : 0;
            int start = moveBack + (int)deltToUse.SecondSequenceStart;
            if (start > 0) {
                // Rotate: bytes [start..end] move to the front, [0..start) to the back.
                var front = _sequence.Skip(start);
                var back = _sequence.Take(start);
                var newSeq = new List<byte>(front);
                newSeq.AddRange(back);
                var _n = new Sequence(DnaAlphabet.Instance, newSeq.ToArray());
                // Sanity check: a pure rotation must preserve length.
                if (_n.Count != _sequence.Count) {
                    throw new Exception("Screw up when aligning the sequence");
                }
                // If the query aligned in reverse, flip the node path and
                // reverse-complement the rotated sequence to match reference strand.
                if (deltToUse.IsReverseQueryDirection) {
                    ReversePath();
                    _n = _n.GetReverseComplementedSequence() as Sequence;
                }
                _sequence = _n;
            }
        }
        // The underlying bytes should only be accessible from the sequence now.
        contigSequence = null;
        finalized = true;
    }
}
/// <summary>
/// Inspects the assembled path for a deletion relative to the reference genome.
/// Orients the assembly so reference positions increase along it, records the
/// aligned start/end reference coordinates, and when the assembly aligns back
/// to the reference in multiple pieces, records the gaps between pieces as
/// deleted regions. Finally, for assemblies whose graph bifurcates in exactly
/// two places, estimates the average fraction of k-mer evidence that follows
/// the assembled path (stored in <c>AvgFractionBySplit</c>, NaN otherwise).
/// </summary>
private void LookForDeletion() {
    bool movingUp;
    // Nodes of the assembly that were painted as belonging to the reference.
    var v = Assembly.Where(x => x.IsInReference).ToList();
    // Count adjacent node pairs whose reference position increases. For a clean
    // assembly this is ~v.Count (increasing) or ~0 (decreasing), with at most
    // one break where the circular mitochondrial genome wraps around.
    var difs = Enumerable.Zip(v.Skip(1), v.Take(v.Count - 1),
        (x, y) => x.ReferenceGenomePosition > y.ReferenceGenomePosition ? 1 : 0).Sum();
    // If monotonically changing, should only not change once (when it goes around the circle).
    if (difs < 2 || difs > (v.Count - 3)) {
        ReferenceValuesChangeMonotonically = true;
    }
    // Which way is the path oriented: mostly increasing (big sum) or decreasing (small sum)?
    movingUp = difs > (v.Count / 2);
    if (!movingUp) {
        // Flip the path so reference positions increase along it.
        Assembly.ReversePath();
    }
    // Only report for sensible (monotonic) assemblies.
    if (ReferenceValuesChangeMonotonically) {
        Assembly.FinalizeAndOrientToReference();
        // Align the finished assembly back to the reference.
        var alns = HaploGrepSharp.ReferenceGenome.GetDeltaAlignments(Assembly.Sequence).SelectMany(x => x).ToList();
        if (alns.Count > 0) {
            StartReference = (int)alns.First().FirstSequenceStart;
            EndReference = (int)alns.Last().FirstSequenceEnd;
        }
        SizeOfDeletionsSeen = String.Empty;
        if (alns.Count > 1) {
            // More than one alignment block: the gaps between consecutive blocks
            // are the deleted regions, reported in rCRS coordinates.
            HasDeletion = true;
            StringBuilder sb = new StringBuilder();
            List<int> DeletionSizes = new List<int>();
            for (int i = 0; i < (alns.Count - 1); i++) {
                var s = ReferenceGenome.ConvertTorCRSPosition((int)alns[i].FirstSequenceEnd);
                var e = ReferenceGenome.ConvertTorCRSPosition((int)alns[i + 1].FirstSequenceStart);
                sb.Append(s.ToString());
                sb.Append("-");
                sb.Append(e.ToString());
                sb.Append(";");
                DeletionSizes.Add(e - s + 1);
            }
            DeletedRegions = sb.ToString();
            SizeOfDeletionsSeen = String.Join(";", DeletionSizes.Select(x => x.ToString()));
        }
        // Now see if we can get the fractional evidence for this deletion.
        // Note this code can get very nasty as we can have multiple nodes with
        // two neighbours on a side; only simple two-bifurcation cases are scored.
        var totBifurcations = 0;
        double avg = 0.0;
        for (int i = 0; i < (Assembly.Count - 1); i++) {
            var cnode = Assembly[i];
            var lefts = cnode.GetLeftExtensionNodes().ToList();
            var rights = cnode.GetRightExtensionNodes().ToList();
            List<List<DeBruijnNode>> neighbors = new List<List<DeBruijnNode>>() { lefts, rights };
            foreach (var neighbor in neighbors) {
                if (neighbor.Count == 2) {
                    // Fraction of k-mer counts supporting the neighbour that lies
                    // on the assembled path (the previous or next node).
                    // NOTE(review): at i == 0 this reads Assembly[i - 1]; confirm the
                    // Assembly indexer wraps around (circular genome) or that the
                    // first node never bifurcates, otherwise this can throw.
                    var tot = neighbor.Sum(z => (double)z.KmerCount);
                    var cur = (double)neighbor.First(z => z == Assembly[i - 1] || z == Assembly[i + 1]).KmerCount;
                    avg += cur / tot;
                    totBifurcations++;
                } else if (neighbor.Count > 2) {
                    totBifurcations = 100; // arbitrarily set to too high a value so this case is never scored
                    break;
                }
            }
        }
        if (totBifurcations == 2) {
            SimpleBifurcation = true;
            avg *= .5; // .5 * (a + b) = average of the two observed fractions
        }
        AvgFractionBySplit = SimpleBifurcation ? avg : Double.NaN;
    }
}
/// <summary>
/// Runs the full assembly pipeline: builds a de Bruijn graph from the reads
/// (painted with the reference), prunes it in stages (coverage threshold,
/// pathological nodes, nodes unconnected to the reference, dangling links,
/// redundant SNP and indel paths), attempts a greedy mitochondrial assembly
/// with haplotype matching and deletion detection, then builds the contigs.
/// Progress and metrics are reported via RaiseMessage and instance properties.
/// </summary>
/// <param name="inputSequences">The sequencing reads to assemble.</param>
/// <returns>The assembled contigs, or null when fewer than 100 graph nodes
/// survive graph construction.</returns>
/// <exception cref="ArgumentNullException">Thrown when inputSequences is null.</exception>
public override PadenaAssembly Assemble(IEnumerable<ISequence> inputSequences) {
    if (inputSequences == null) {
        throw new ArgumentNullException("inputSequences");
    }
    // Step 0: Load the reference genome as a fasta file.
    // Remove ambiguous reads and set up fields for assembler process.
    this.Initialize();
    // Steps 1-2: Create k-mers from reads, build the de Bruijn graph, and paint
    // nodes that also occur in the reference.
    System.Diagnostics.Stopwatch sw = System.Diagnostics.Stopwatch.StartNew();
    this.CreateGraphStarted();
    this.CreateGraph(inputSequences);
    this.CreateGraphEnded();
    sw.Stop();
    this.NodeCountReport();
    this.TaskTimeSpanReport(sw.Elapsed);
    // Record post-construction metrics before any pruning.
    int count = this.Graph.GetNodes().Where(x => x.IsInReference).Count();
    ReferenceNodeCountAfterCreation = count;
    TotalSequencingBP = Graph.GetNodes().Sum(x => x.KmerCount * KmerLength);
    RaiseMessage("A total of: " + count.ToString() + " nodes remain from the reference");
    RaiseMessage("A total of: " + this.Graph.NodeCount + " nodes are in the graph");
    NodeCountAfterCreation = Graph.NodeCount;
    SkippedReadsAfterQCCount = Graph.SkippedSequencesCount;
    ReadCount = Graph.ProcessedSequencesCount;
    // Too few nodes to assemble anything meaningful.
    if (NodeCountAfterCreation < 100) {
        return(null);
    }
    // Step 2.1: Remove nodes that are below the coverage cutoff.
    sw.Reset();
    sw.Start();
    // Estimate and set default value for erosion and coverage thresholds.
    this.EstimateDefaultThresholds();
    int sqrtCoverageCutOff = this.CalculateSqrtOfMedianCoverageCutoff();
    // Use the sqrt-of-median cutoff, capped by the alternate minimum unless the
    // sqrt threshold is forced; an explicit command-line value overrides both.
    var coverageCutOff = ForceSqrtThreshold ?
        sqrtCoverageCutOff : Math.Min(sqrtCoverageCutOff, AlternateMinimumNodeCount);
    if (MTAssembleArguments.MinNodeCountSet) {
        coverageCutOff = AlternateMinimumNodeCount;
    }
    //var coverageCutOff = sqrtCoverageCutOff;
    KmerCutOff = coverageCutOff;
    if (OutputIntermediateGraphSteps) {
        OutputNodeCountHistograms("PreFiltered", coverageCutOff);
    }
    long originalNodes = this.Graph.NodeCount;
    ThresholdCoverageNodePurger snr = new ThresholdCoverageNodePurger(coverageCutOff);
    snr.RemoveLowCoverageNodes(Graph);
    // NOTE(review): this stores originalNodes / remainingNodes, a ratio >= 1,
    // not a "percent removed" — confirm how downstream consumers interpret it.
    PercentNodesRemovedByLowCoverageOrThreshold = originalNodes / (double)this.Graph.NodeCount;
    sw.Stop();
    TaskTimeSpanReport(sw.Elapsed);
    RaiseMessage("Finished removing nodes with less than " + snr.CoverageCutOff.ToString() + " counts");
    NodeCountReport();
    NodeCountAfterCoveragePurge = Graph.NodeCount;
    sw.Reset();
    sw.Start();
    RaiseMessage("Start removing unconnected nodes");
    // Remove pathological nodes (e.g. degenerate sequence artifacts).
    var badNodes = PathologicalSequencePurger.RemovePathologicalNodes(Graph);
    if (badNodes > 0) {
        RaiseMessage("WARNING!!!! Found and removed " + badNodes.ToString() + " pathological nodes. These were removed.", false);
    }
    // Step 2.2: Remove nodes that are not connected to the reference genome.
    UnlinkedToReferencePurger remover = new UnlinkedToReferencePurger();
    remover.RemoveUnconnectedNodes(Graph, referenceNodes);
    RaiseMessage("Finished removing unconnected nodes");
    this.NodeCountReport();
    NodeCountAfterUndangle = Graph.NodeCount;
    //outputVisualization ("PostUnconnectedFilter");
    // Step 2.3: Remove dangling links from graph.
    // NOTE(NIGEL): this also removes the low coverage nodes.
    sw.Reset();
    sw.Restart();
    this.UndangleGraphStarted();
    this.UnDangleGraph();
    this.UndangleGraphEnded();
    sw.Stop();
    this.TaskTimeSpanReport(sw.Elapsed);
    this.NodeCountReport();
    if (OutputIntermediateGraphSteps) {
        outputVisualization("PostUndangleFilter");
    }
    // Step 4: Remove redundant SNP and indel paths from graph.
    // NOTE(review): these format strings contain no {0} placeholder, so the
    // DateTime.Now argument is silently ignored — confirm intended output.
    RaiseMessage(string.Format(CultureInfo.CurrentCulture, "Starting to remove redundant paths", DateTime.Now));
    this.RemoveRedundancyStarted();
    RaiseMessage(string.Format(CultureInfo.CurrentCulture, "Starting to remove SNPs", DateTime.Now));
    this.RemoveRedundancy();
    RaiseMessage(string.Format(CultureInfo.CurrentCulture, "Finished removing SNPs", DateTime.Now));
    this.NodeCountReport();
    // Now remove redundant indel paths as well.
    // TODO: For historic reasons this is largely similar to the snp remover, which isn't so great...
    RaiseMessage(string.Format(CultureInfo.CurrentCulture, "Starting to call INDELs", DateTime.Now));
    var indels = CallAndRemoveIndels();
    RaiseMessage(string.Format(CultureInfo.CurrentCulture, "Finished calling and removing small INDELs paths", DateTime.Now));
    this.NodeCountReport();
    // Perform dangling link purger step once more.
    // This is done to remove any links created by the redundant paths purger.
    this.UnDangleGraph();
    RaiseMessage(string.Format(CultureInfo.CurrentCulture, "Finished removing all redundant paths", DateTime.Now));
    this.NodeCountReport();
    // Step 4.2: Rerun the unlinked-to-reference purger after the graph is cleaned.
    ChangeNodeVisitFlag(false);
    remover = new UnlinkedToReferencePurger();
    remover.RemoveUnconnectedNodes(Graph, referenceNodes);
    this.RemoveRedundancyEnded();
    this.NodeCountReport();
    NodeCountAfterRedundancyRemoval = Graph.NodeCount;
    if (OutputIntermediateGraphSteps) {
        outputVisualization("Post-redundant-path-removal");
    }
    // Now attempt a greedy assembly and look for deletions.
    var attemptedAssembly = new MitochondrialAssembly(Graph, DiagnosticFileOutputPrefix);
    FinalMetaNodeCount = attemptedAssembly.AllNodesInGraph.Count;
    SuccessfulAssembly = attemptedAssembly.SuccessfulAssembly;
    if (SuccessfulAssembly) {
        SuccessfulAssemblyLength = attemptedAssembly.AssemblyLength;
        MinSplitPercentage = attemptedAssembly.MinimumGreedySplit;
        if (OutputDiagnosticInformation) {
            var outReport = attemptedAssembly.OutputAssembly(DiagnosticFileOutputPrefix);
            if (outReport != null) {
                // TODO: This matching is really crappy, need to find a better way to propagate this on up.
                // Copy haplotype-match scores out of the diagnostic report.
                BestHaplotypeScore = outReport.BestHit.Rank;
                SecondBestHaplotypeScore = outReport.SecondBestHit.Rank;
                BestMatchingHaplotype = outReport.BestHit.node.haplogroup.id;
                SecondBestMatchingHaplotype = outReport.SecondBestHit.node.haplogroup.id;
                NumberOfEquallyGoodHaplotypes = outReport.NumberOfEquallyGoodBestHits;
                PolymorphismsMatchingHaplotype = outReport.BestHit.NumberOfMatchingPolymorphisms;
                PolymorphismsMissingFromHaplotype = outReport.BestHit.NumberOfPolymorphismsMissingFromHaplotype;
                PolymorphismsMissingFromGenotype = outReport.BestHit.NumberOfPolymorphismsMissingFromGenotype;
            }
        } else {
            // NOTE(review): this branch runs when assembly SUCCEEDED but diagnostic
            // output is disabled — the "assembly failed" wording looks wrong.
            RaiseMessage("Greedy assembly skipped as assembly failed.");
        }
    } else {
        RaiseMessage("Greedy assembly skipped as assembly failed.");
    }
    // Now find deletions (and emit the graph visualization).
    this.OutputGraphicAndFindDeletion(attemptedAssembly);
    PercentageOfScannedReadsUsed = Graph.GetNodes().Sum(x => x.KmerCount * KmerLength) / (double)TotalSequencingBP;
    RaiseMessage("Used a total of " + PercentageOfScannedReadsUsed.ToString("p") + " of basepairs in reads for assembly");
    // Step 5: Build contigs — this is essentially independent of deletion finding.
    this.BuildContigsStarted();
    List<ISequence> contigSequences = this.BuildContigs().ToList();
    contigSequences.ForEach(x => ReferenceGenome.AssignContigToMTDNA(x));
    // Sort contigs by descending length.
    contigSequences.Sort((x, y) => - x.Count.CompareTo(y.Count));
    this.BuildContigsEnded();
    PadenaAssembly result = new PadenaAssembly();
    result.AddContigs(contigSequences);
    long totalLength = contigSequences.Sum(x => x.Count);
    RaiseMessage("Assembled " + totalLength.ToString() + " bases of sequence in " + contigSequences.Count.ToString() + " contigs.");
    if (contigSequences.Count > 0) {
        N50 = CalculateN50(contigSequences, totalLength);
        RaiseMessage("N50: " + N50.ToString());
    }
    // Final accounting of reference nodes surviving the full pipeline.
    count = this.Graph.GetNodes().Where(x => x.IsInReference).Count();
    RaiseMessage("A total of: " + count.ToString() + " nodes remain from the reference");
    RaiseMessage("A total of: " + this.Graph.NodeCount + " nodes are in the graph");
    return(result);
}