/// <summary>
/// Asserts that every position from <paramref name="minInBin"/> through <paramref name="maxInBin"/>
/// (both inclusive) reports the expected usable status.
/// </summary>
/// <param name="minInBin">First position to check.</param>
/// <param name="maxInBin">Last position to check (inclusive).</param>
/// <param name="usable">The bins under test.</param>
/// <param name="expected">The usable status every position in the range must have.</param>
private void VerifyStatusForPositionsInBin(int minInBin, int maxInBin, UsableBins usable, bool expected)
{
    var position = minInBin;
    while (position <= maxInBin)
    {
        VerifyUsableStatus(usable, position, expected);
        position++;
    }
}
/// <summary>
/// Builds a <see cref="UsableBins"/> over a mocked <see cref="IBinConclusions"/> (1000 bins, 10 positions per bin),
/// finalizes it with an extension of 2, and checks the usable status of each position range in order.
/// </summary>
public void FinalizeConclusions()
{
    const int positionsPerBin = 10;
    const int binsToExtendTo = 2;

    // Arrange: mocked conclusions that map a position to its bin by integer division.
    var conclusionsMock = new Mock<IBinConclusions>();
    conclusionsMock.SetupGet(x => x.NumBins).Returns(1000);
    conclusionsMock.Setup(x => x.GetBinId(It.IsAny<int>())).Returns<int>(i => i / positionsPerBin);

    MockBinResult(conclusionsMock, 1, true, true, false);
    MockBinResult(conclusionsMock, 10, true, true, false);
    MockBinResult(conclusionsMock, 15, true, false, false);
    MockBinResult(conclusionsMock, 25, true, true, false);
    MockBinResult(conclusionsMock, 26, true, true, true);

    // Act
    var usableBins = new UsableBins(conclusionsMock.Object);
    usableBins.FinalizeConclusions(binsToExtendTo);

    // Assert: ranges are verified in the listed order, so the first failing range is the one reported.
    var expectations = new[]
    {
        new { Min = 0, Max = 9, Usable = true },        // Bin 0 - propagate from 1
        new { Min = 10, Max = 19, Usable = true },      // Bin 1 - explicitly set
        new { Min = 20, Max = 29, Usable = true },      // Bin 2 - propagate from 1
        new { Min = 30, Max = 39, Usable = false },     // Bin 3 - outside range of bin 1 propagation
        new { Min = 40, Max = 89, Usable = false },     // Bin 4-8 - false
        new { Min = 90, Max = 119, Usable = true },     // Bin 9, 10, 11 - propagate from 10
        new { Min = 120, Max = 239, Usable = false },   // Bin 12 - 23 - false
        new { Min = 240, Max = 249, Usable = true },    // Bin 24 - propagate from 25
        new { Min = 250, Max = 259, Usable = true },    // Bin 25 - explicitly set
        new { Min = 260, Max = 269, Usable = false },   // Bin 26 - would have propagated from 25, but has likely true snp
        new { Min = 270, Max = 10000, Usable = false }  // Everything else - false - not explicitly set
    };

    foreach (var range in expectations)
    {
        VerifyStatusForPositionsInBin(range.Min, range.Max, usableBins, range.Usable);
    }
}
/// <summary>
/// Aggregates the pair results collected for one region: records mess evidence per bin, sets aside pairs that
/// reach into the far edge of the region for the next block, runs every remaining category through
/// <c>ProcessCategory</c> (snowball categories first), updates the master lookups and progress counters, and
/// returns the alignments that are ready to be flushed together with the edge state for the next block.
/// </summary>
/// <param name="indelLookup">Indel evidence for this region; synced with the master indel lookup before finalizing.</param>
/// <param name="startPosition">Region start; used for logging, the edge state name, and passed to <c>ProcessCategory</c>.</param>
/// <param name="endPosition">Region end; used for logging, the edge state name, and passed to <c>ProcessCategory</c>.</param>
/// <param name="isFinalTask">When true, no pairs are deferred and the returned edge state is just named "Final".</param>
/// <param name="regionData">Effective min/max positions, pair results, bin evidence and incoming edge state for the region.</param>
/// <returns>The flushed alignments and the edge state to hand to the next block.</returns>
public AggregateRegionResults GetAggregateRegionResults(ConcurrentDictionary<string, IndelEvidence> indelLookup,
    int startPosition, int endPosition, bool isFinalTask, RegionDataForAggregation regionData)
{
    // Pairs whose max position lies within this many positions of the effective region end are deferred.
    const int edgeBufferSize = 5000;
    const int binsToExtendTo = 2; // Treated as <, so 2 means we get to extend status to one on either side

    if (_geminiOptions.LightDebug)
    {
        Logger.WriteToLog(
            $"Started processing for region {_chrom}:{startPosition}-{endPosition}.");
    }

    var adjustedStartPosition = regionData.EffectiveMinPosition;
    var edgeThresholdOrig = Math.Max(1, regionData.EffectiveMaxPosition - edgeBufferSize);
    var finalIndelLookup = GetAndSyncFinalIndelLookup(indelLookup, _masterIndelLookup);
    var edgeState = regionData.EdgeState;
    var nextEdgeMinPosition = int.MaxValue;

    var finalizedIndels = FinalizeIndels(finalIndelLookup, _chrReference, regionData.EffectiveMaxPosition);
    var finalizedIndelsForChrom = GetFinalizedIndelsForChrom(_chrom, finalizedIndels, edgeState);

    var messySiteWidth = _geminiOptions.MessySiteWidth;
    var binEvidence = regionData.BinEvidence;
    var binConclusions = new BinConclusions(binEvidence, _geminiOptions.CollectDepth,
        trackDirectionalMess: _geminiOptions.SilenceDirectionalMessReads,
        trackMapqMess: _geminiOptions.SilenceMessyMapMessReads);
    var numBins = binEvidence.NumBins;

    bool shouldRealignAtAll = finalizedIndelsForChrom.Any();

    var numRetrievedFromLastBlock = 0;
    var numPairsSentToNextBlock = 0;
    var pairResultsForNextBlock = new Dictionary<PairClassification, List<PairResult>>();

    // Work on copies of the region's lists: they are pruned and cleared below.
    var pairResultLookup = new Dictionary<PairClassification, List<PairResult>>();
    foreach (var key in regionData.PairResultLookup.Keys)
    {
        pairResultLookup[key] = new List<PairResult>(regionData.PairResultLookup[key]);
    }

    foreach (var category in pairResultLookup)
    {
        var isMessy = TypeClassifier.MessyTypes.Contains(category.Key);
        var isIndel = TypeClassifier._indelTypes.Contains(category.Key);
        var isSingleMismatch = _geminiOptions.AvoidLikelySnvs &&
                               (category.Key == PairClassification.SingleMismatchStitched ||
                                category.Key == PairClassification.UnstitchSingleMismatch);
        var isForwardOnlyMessy = IsForwardMessy(category.Key);
        var isReverseOnlyMessy = IsReverseMessy(category.Key);
        var isMapMessy = IsSuspiciousMapping(category.Key);

        foreach (var pairResult in category.Value)
        {
            // If on the edge, kick it over to the edge lookup.
            if (!isFinalTask && pairResult.ReadPair.MaxPosition > edgeThresholdOrig)
            {
                numPairsSentToNextBlock++;

                if (!pairResultsForNextBlock.TryGetValue(category.Key, out var pairsForNextBlock))
                {
                    pairsForNextBlock = new List<PairResult>();
                    pairResultsForNextBlock.Add(category.Key, pairsForNextBlock);
                }

                pairsForNextBlock.Add(pairResult);
                nextEdgeMinPosition = Math.Min(nextEdgeMinPosition, pairResult.ReadPair.MinPosition);
            }

            // Still collect evidence even if it's edge, because that could impact this block as well as next block.
            binEvidence.AddMessEvidence(isMessy, pairResult, isIndel, isSingleMismatch, isForwardOnlyMessy,
                isReverseOnlyMessy, isMapMessy);
        }
    }

    // Edge alignments from the incoming edge state are merged in only after evidence collection,
    // so they are neither re-checked against the edge threshold nor counted as evidence here.
    numRetrievedFromLastBlock = AddAlignmentsFromEdgeState(edgeState, pairResultLookup, numRetrievedFromLastBlock);

    var finalizedBins = new UsableBins(binConclusions);

    if (shouldRealignAtAll)
    {
        binConclusions.AddIndelEvidence(finalizedIndelsForChrom, binsToExtendTo);
        binConclusions.ProcessRegions(_geminiOptions.MessySiteThreshold, _geminiOptions.ImperfectFreqThreshold,
            _geminiOptions.RegionDepthThreshold, _geminiOptions.IndelRegionFreqThreshold, binsToExtendTo,
            _geminiOptions.DirectionalMessThreshold);
        finalizedBins.FinalizeConclusions(binsToExtendTo);
    }

    IChromosomeIndelSource indelSource;
    using (var snippetSource = _dataSourceFactory.CreateGenomeSnippetSource(_chrom, _chrReference))
    {
        indelSource = _dataSourceFactory.GetChromosomeIndelSource(finalizedIndelsForChrom, snippetSource);
    }

    // Drop the deferred pairs from this block's work lists in a single pass per category.
    // HashSet<T> uses the same default equality comparer that List<T>.Remove relied on.
    foreach (var deferredCategory in pairResultsForNextBlock)
    {
        var deferredPairs = new HashSet<PairResult>(deferredCategory.Value);
        pairResultLookup[deferredCategory.Key].RemoveAll(pair => deferredPairs.Contains(pair));
    }

    var allAlignments = new List<BamAlignment>();
    var outcomesLookup = new Dictionary<HashableIndel, int[]>();

    var numSkippedDueToSites = 0;
    var numKept = 0;
    var numRealigned = 0;
    var numSilenced = 0;

    var snowballCategories = _realignmentOptions.CategoriesForSnowballing;
    var doSnowball = snowballCategories.Any();

    // Snowball categories go first; their outcomes are used to narrow the indel list for everything else.
    foreach (var category in snowballCategories)
    {
        if (pairResultLookup.Remove(category, out var categoryReads))
        {
            allAlignments.AddRange(ProcessCategory(_categoriesForRealignment, indelSource, shouldRealignAtAll,
                outcomesLookup, ref numSkippedDueToSites, ref numKept, ref numRealigned, ref numSilenced,
                categoryReads, category, binEvidence, _progressTracker, binConclusions, finalizedBins,
                startPosition, endPosition));
        }
    }

    List<HashableIndel> superFinalizedIndels;

    if (doSnowball)
    {
        superFinalizedIndels = GetSuperFinalizedIndelsAfterSnowball(finalizedIndelsForChrom, outcomesLookup);

        if (_geminiOptions.Debug)
        {
            Logger.WriteToLog(
                $"After snowballing for region {_chrom}:{startPosition}-{endPosition}, filtered down to {superFinalizedIndels.Count} indels from {finalizedIndelsForChrom.Count} ({finalIndelLookup.Count} preliminary indels).");
        }

        using (var snippetSource = _dataSourceFactory.CreateGenomeSnippetSource(_chrom, _chrReference))
        {
            indelSource = _dataSourceFactory.GetChromosomeIndelSource(superFinalizedIndels, snippetSource);
        }

        if (_geminiOptions.RecalculateUsableSitesAfterSnowball)
        {
            binConclusions.ResetIndelRegions();

            foreach (var indel in superFinalizedIndels)
            {
                var bin = (indel.ReferencePosition - adjustedStartPosition) / messySiteWidth;
                binConclusions.SetIndelRegionTrue(bin);

                // Extend downwards until we run off the start of the bins.
                for (int j = 0; j < binsToExtendTo; j++)
                {
                    var binIndex = bin - j;
                    if (binIndex >= 0)
                    {
                        binConclusions.SetIndelRegionTrue(binIndex);
                    }
                    else
                    {
                        break;
                    }
                }

                // Extend upwards until SetIndelRegionTrue reports that the bin could not be set.
                for (int j = 0; j < binsToExtendTo; j++)
                {
                    var binIndex = bin + j;
                    if (!binConclusions.SetIndelRegionTrue(binIndex))
                    {
                        break;
                    }
                }
            }

            finalizedBins.FinalizeConclusions(binsToExtendTo);
        }
    }
    else
    {
        superFinalizedIndels = finalizedIndelsForChrom;
    }

    // TODO pull out the allocs below, or ideally actually remove them from realign pair handler or use something different altogether
    foreach (var category in pairResultLookup)
    {
        // Snowball categories were already removed above; this check is kept as a safeguard.
        if (snowballCategories.Contains(category.Key))
        {
            continue;
        }

        allAlignments.AddRange(ProcessCategory(_categoriesForRealignment, indelSource, shouldRealignAtAll,
            outcomesLookup, ref numSkippedDueToSites, ref numKept, ref numRealigned, ref numSilenced,
            category.Value, category.Key, binEvidence, _progressTracker, binConclusions, finalizedBins,
            startPosition, endPosition));
    }

    var edgeHits = new Dictionary<int, int>();
    var edgeSingleMismatchHits = new Dictionary<int, int>();
    var edgeIndelHits = new Dictionary<int, int>();
    var edgeMessyHits = new Dictionary<int, int>();
    PopulateEdgeHitsAndLogBins(numBins, adjustedStartPosition, messySiteWidth, nextEdgeMinPosition, binEvidence,
        edgeHits, edgeSingleMismatchHits, edgeIndelHits, edgeMessyHits, startPosition, binConclusions,
        finalizedBins);

    UpdateMasterOutcomes(_masterOutcomesLookup, outcomesLookup);

    foreach (var hashableIndel in superFinalizedIndels)
    {
        _masterFinalIndels.AddOrUpdate(hashableIndel, 1, (h, n) => n + 1);
    }

    var numFlushed = allAlignments.Count;

    _progressTracker.AddOrUpdate("Flushed", numFlushed,
        (x, currentCount) => currentCount + numFlushed);
    _progressTracker.AddOrUpdate("Sent To Next Block", numPairsSentToNextBlock,
        (x, currentCount) => currentCount + numPairsSentToNextBlock);
    _progressTracker.AddOrUpdate("Retrieved from Past Block", numRetrievedFromLastBlock,
        (x, currentCount) => currentCount + numRetrievedFromLastBlock);
    _progressTracker.AddOrUpdate("Realigned", numRealigned,
        (x, currentCount) => currentCount + numRealigned);
    _progressTracker.AddOrUpdate("Attempts", numKept,
        (x, currentCount) => currentCount + numKept);
    _progressTracker.AddOrUpdate("Skipped", numSkippedDueToSites,
        (x, currentCount) => currentCount + numSkippedDueToSites);
    _progressTracker.AddOrUpdate("Silenced", numSilenced,
        (x, currentCount) => currentCount + numSilenced);

    pairResultLookup.Clear();

    Logger.WriteToLog(
        $"Finished processing for region {_chrom}:{startPosition}-{endPosition}. {numFlushed} alignments flushed, " +
        $"{numPairsSentToNextBlock} sent to next block, {numRetrievedFromLastBlock} retrieved from {regionData.EdgeState?.Name}. " +
        $"Realigned {numRealigned}/{numKept} attempts ({numSkippedDueToSites} pairs skipped realignment), silenced {numSilenced} messy mates.");

    return new AggregateRegionResults()
    {
        EdgeState = isFinalTask
            ? new EdgeState() { Name = "Final" }
            : new EdgeState()
            {
                EdgeAlignments = pairResultsForNextBlock,
                EdgeIndels = finalizedIndelsForChrom.Where(y => y.ReferencePosition > nextEdgeMinPosition)
                    .ToList(),
                EffectiveMinPosition = nextEdgeMinPosition,
                Name = $"{startPosition}-{endPosition}",
                BinEvidence = binEvidence
            },
        AlignmentsReadyToBeFlushed = allAlignments
    };
}
/// <summary>
/// When <c>LogRegionsAndRealignments</c> is enabled, writes one "BINCOUNTS" log line for every bin that has
/// more than 10 total hits, listing the bin's evidence counts alongside the corresponding conclusions.
/// Does nothing when that option is disabled.
/// </summary>
/// <remarks>
/// <paramref name="edgeThreshold"/>, the four edge-hit dictionaries and <paramref name="startPosition"/> are
/// accepted but not currently read or populated by this method.
/// </remarks>
/// <param name="numBins">Number of bins to inspect (bin ids 0 to numBins - 1).</param>
/// <param name="adjustedStartPosition">Position of the start of bin 0.</param>
/// <param name="messySiteWidth">Width of each bin in positions.</param>
/// <param name="edgeThreshold">Unused.</param>
/// <param name="binEvidence">Source of the per-bin hit counts.</param>
/// <param name="edgeHits">Unused.</param>
/// <param name="edgeSingleMismatchHits">Unused.</param>
/// <param name="edgeIndelHits">Unused.</param>
/// <param name="edgeMessyHits">Unused.</param>
/// <param name="startPosition">Unused.</param>
/// <param name="binConclusions">Source of the per-bin conclusions.</param>
/// <param name="usableBins">Used to report whether the bin's start position is usable.</param>
private void PopulateEdgeHitsAndLogBins(int numBins, int adjustedStartPosition, int messySiteWidth,
    int edgeThreshold, IBinEvidence binEvidence, Dictionary<int, int> edgeHits,
    Dictionary<int, int> edgeSingleMismatchHits, Dictionary<int, int> edgeIndelHits,
    Dictionary<int, int> edgeMessyHits, int startPosition, BinConclusions binConclusions, UsableBins usableBins)
{
    // Nothing below has any effect unless bin logging is switched on, so bail out before looping.
    if (!_geminiOptions.LogRegionsAndRealignments)
    {
        return;
    }

    // Only bins with strictly more than this many total hits are logged.
    const int hitThresholdForLogging = 10;

    // No bin is ever flagged as being in the edge here; the column is kept so the log format is unchanged.
    const bool inEdge = false;

    for (int binId = 0; binId < numBins; binId++)
    {
        var allHits = binEvidence.GetAllHits(binId);
        if (allHits <= hitThresholdForLogging)
        {
            continue;
        }

        var binStart = adjustedStartPosition + (binId * messySiteWidth);

        var binCounts =
            $"{binId},{inEdge},{binStart},{binStart + messySiteWidth},{allHits},{usableBins.IsPositionUsable(binStart)},{binEvidence.GetSingleMismatchHit(binId)}," +
            $"{binConclusions.GetProbableTrueSnvRegion(binId)},{binEvidence.GetIndelHit(binId)},{binConclusions.GetIndelRegionHit(binId)}," +
            $"{binEvidence.GetMessyHit(binId)},{binConclusions.GetIsMessyEnough(binId)},{binEvidence.GetForwardMessyRegionHit(binId)},{binConclusions.GetFwdMessyStatus(binId)},{binEvidence.GetReverseMessyRegionHit(binId)},{binConclusions.GetRevMessyStatus(binId)},{binEvidence.GetMapqMessyHit(binId)},{binConclusions.GetMapqMessyStatus(binId)}";

        // TODO consider writing this to a proper output file
        Logger.WriteToLog("BINCOUNTS\t" + binCounts);
    }
}
/// <summary>
/// Processes all pair results of one classification: optionally realigns and/or stitches each pair through a
/// realign pair handler, tags every resulting alignment with its read treatment ("XT") and classification
/// ("XP"), updates the running counters, and returns the resulting alignments.
/// The supplied <paramref name="pairResults"/> list is cleared before returning.
/// </summary>
/// <remarks>
/// <paramref name="binEvidence"/>, <paramref name="startPosition"/> and <paramref name="endPosition"/> are
/// accepted but not currently read by this method.
/// </remarks>
/// <param name="categoriesForRealignment">Classifications that are eligible for realignment.</param>
/// <param name="indelSource">Indel source handed to the realign pair handler.</param>
/// <param name="shouldRealignAtAll">When false, no pair in this category is realigned.</param>
/// <param name="outcomesLookup">Outcome lookup handed to the realign pair handler.</param>
/// <param name="numSkippedDueToSites">Incremented for each pair sent to the handler without realignment.</param>
/// <param name="numKept">Incremented for each pair for which realignment is attempted.</param>
/// <param name="numRealigned">Incremented for each pair flagged as realigned (pair, R1 or R2) after extraction.</param>
/// <param name="numSilenced">Incremented for each pair with at least one read to silence.</param>
/// <param name="pairResults">The pairs to process; cleared on return.</param>
/// <param name="classification">The classification shared by all pairs in <paramref name="pairResults"/>.</param>
/// <param name="binEvidence">Unused.</param>
/// <param name="progressTracker">Receives a count per "classification:readTreatment" key.</param>
/// <param name="binConclusions">Passed to <c>ReadsToSilence</c>.</param>
/// <param name="usableBins">Decides whether a pair's min or max position is usable for realignment.</param>
/// <param name="startPosition">Unused.</param>
/// <param name="endPosition">Unused.</param>
/// <returns>All alignments produced for this category, in processing order.</returns>
private List<BamAlignment> ProcessCategory(
    List<PairClassification> categoriesForRealignment, IChromosomeIndelSource indelSource,
    bool shouldRealignAtAll, Dictionary<HashableIndel, int[]> outcomesLookup, ref int numSkippedDueToSites,
    ref int numKept, ref int numRealigned, ref int numSilenced, List<PairResult> pairResults,
    PairClassification classification, IBinEvidence binEvidence,
    ConcurrentDictionary<string, int> progressTracker, BinConclusions binConclusions, UsableBins usableBins,
    int startPosition, int endPosition)
{
    var allAlignments = new List<BamAlignment>();

    var isHighLikelihoodForRealign = false;
    if (_geminiOptions.ForceHighLikelihoodRealigners)
    {
        var highLikelihoodCategories = new List<PairClassification>()
        {
            PairClassification.Disagree,
            PairClassification.MessyStitched,
            PairClassification.MessySplit,
            PairClassification.UnstitchMessy,
            PairClassification.UnstitchIndel
        };
        isHighLikelihoodForRealign = highLikelihoodCategories.Contains(classification);
    }

    var doRealign = false;
    ReadPairRealignerAndCombiner realignHandler = null;
    var alreadyStitched = ClassificationIsStitched(classification);
    var doStitch = !_geminiOptions.SkipStitching && TypeClassifier.ClassificationIsStitchable(classification);
    var categoryIsRealignable = categoriesForRealignment.Contains(classification);

    // A handler is only needed when the category can be realigned or stitched.
    if (categoryIsRealignable || doStitch)
    {
        doRealign = true;

        realignHandler = _bamRealignmentFactory.GetRealignPairHandler(doStitch,
            alreadyStitched,
            _realignmentOptions.PairAwareEverything || ClassificationIsPairAwareRealignable(classification),
            _refIdMapping,
            new ReadStatusCounter(), false, indelSource, _chrom, new Dictionary<string, IndelEvidence>(),
            ClassificationHasIndels(classification), outcomesLookup,
            SkipRestitchIfUnchanged(classification));
    }

    using (var snippetSource = _dataSourceFactory.CreateGenomeSnippetSource(_chrom, _chrReference))
    using (var singleSnippetSource = new ReusableSnippetSource(snippetSource))
    {
        var nmCalculator = new NmCalculator(singleSnippetSource);
        var classificationString = classification.ToString();

        foreach (var pairResult in pairResults)
        {
            int toSilence = 0;
            IEnumerable<BamAlignment> alignments;

            if (!doRealign)
            {
                alignments = pairResult.Alignments;
            }
            else
            {
                // Realign only if there is anything to realign against, and the category is either forced
                // or realignable with at least one end of the pair in a usable bin.
                bool doRealignPair =
                    shouldRealignAtAll && (isHighLikelihoodForRealign ||
                                           (categoryIsRealignable &&
                                            (usableBins.IsPositionUsable(pairResult.ReadPair.MinPosition) ||
                                             usableBins.IsPositionUsable(pairResult.ReadPair.MaxPosition))));

                if (!doRealignPair)
                {
                    numSkippedDueToSites++;
                }
                else
                {
                    numKept++;
                }

                toSilence = ReadsToSilence(classification, binConclusions, pairResult);
                if (toSilence > 0)
                {
                    numSilenced++;
                }

                alignments = realignHandler.ExtractReads(pairResult, nmCalculator, doRealignPair, toSilence);

                if (pairResult.ReadPair.Realigned || pairResult.ReadPair.RealignedR1 ||
                    pairResult.ReadPair.RealignedR2)
                {
                    numRealigned++;
                }
            }

            // toSilence: 1 and 3 cover R1, 2 and 3 cover R2; a realigned read is not reported as silenced.
            var silencedR1 = (toSilence == 1 || toSilence == 3) && !pairResult.ReadPair.RealignedR1;
            var silencedR2 = (toSilence == 2 || toSilence == 3) && !pairResult.ReadPair.RealignedR2;
            var readTreatment = ReadTreatment(silencedR1, silencedR2, pairResult);

            progressTracker.AddOrUpdate(classificationString + ":" + readTreatment, 1,
                (x, currentCount) => currentCount + 1);

            // Materialized only after the read treatment has been determined, as in the original ordering.
            var alignmentsList = alignments.ToList();
            foreach (var bamAlignment in alignmentsList)
            {
                if (_geminiOptions.LightDebug)
                {
                    AddMdTagCountsTags(bamAlignment, pairResult);
                }

                bamAlignment.ReplaceOrAddStringTag("XT", readTreatment);
                bamAlignment.ReplaceOrAddStringTag("XP", classificationString);
            }

            allAlignments.AddRange(alignmentsList);
        }
    }

    realignHandler?.Finish();

    pairResults.Clear();
    return allAlignments;
}
/// <summary>
/// Asserts that <paramref name="usable"/> reports the expected usable status for a single position.
/// </summary>
/// <param name="usable">The bins under test.</param>
/// <param name="position">The position to query.</param>
/// <param name="expected">The status the position is expected to have.</param>
private void VerifyUsableStatus(UsableBins usable, int position, bool expected)
{
    var actual = usable.IsPositionUsable(position);
    Assert.Equal(expected, actual);
}