/// <summary>
/// Creates a realignment evaluator. Every argument is stored unchanged in the matching private field;
/// no validation or other work is performed here.
/// </summary>
/// <param name="indelSource">Indel source kept in <c>_indelSource</c>.</param>
/// <param name="statusCounter">Status handler kept in <c>_statusCounter</c>.</param>
/// <param name="readRealinger">Read realigner kept in <c>_readRealigner</c> (parameter name spelling is part of the existing signature).</param>
/// <param name="judger">Realignment judger kept in <c>_judger</c>.</param>
/// <param name="chromosome">Chromosome name kept in <c>_chromosome</c>.</param>
/// <param name="trackActualMismatches">Flag kept in <c>_trackActualMismatches</c>.</param>
/// <param name="checkSoftclipsForMismatches">Flag kept in <c>_checkSoftclipsForMismatches</c>.</param>
/// <param name="allowRescoringOrig0">Flag kept in <c>_allowRescoringOrig0</c>.</param>
/// <param name="softclipUnknownIndels">Flag kept in <c>_softclipUnknownIndels</c>.</param>
/// <param name="regionFilterer">Region filterer kept in <c>_regionFilterer</c>.</param>
/// <param name="lightDebug">Flag kept in <c>_lightDebug</c>.</param>
public RealignmentEvaluator(
    IChromosomeIndelSource indelSource,
    IStatusHandler statusCounter,
    IReadRealigner readRealinger,
    IRealignmentJudger judger,
    string chromosome,
    bool trackActualMismatches,
    bool checkSoftclipsForMismatches,
    bool allowRescoringOrig0,
    bool softclipUnknownIndels,
    IRegionFilterer regionFilterer,
    bool lightDebug)
{
    // Collaborating services.
    _indelSource = indelSource;
    _readRealigner = readRealinger;
    _judger = judger;
    _regionFilterer = regionFilterer;
    _statusCounter = statusCounter;

    // Region identity.
    _chromosome = chromosome;

    // Behaviour switches.
    _trackActualMismatches = trackActualMismatches;
    _checkSoftclipsForMismatches = checkSoftclipsForMismatches;
    _allowRescoringOrig0 = allowRescoringOrig0;
    _softclipUnknownIndels = softclipUnknownIndels;
    _lightDebug = lightDebug;
}
/// <summary>
/// Assembles a <see cref="ReadPairRealignerAndCombiner"/>: a stitcher-backed pair handler, a realignment judger,
/// a Gemini read realigner configured from the current options, a status handler, and a realignment evaluator
/// working on a deep copy of <paramref name="indelSource"/>.
/// </summary>
/// <param name="tryRestitch">Passed to the pair handler as <c>tryStitch</c>.</param>
/// <param name="alreadyStitched">Whether the reads were stitched to begin with; affects soft-clip handling and is forwarded to the result.</param>
/// <param name="pairAwareRealign">Selects the indel finder and is forwarded to the result.</param>
/// <param name="refIdMapping">Reference id mapping handed to the pair handler.</param>
/// <param name="statusCounter">Counter wrapped by the status handler.</param>
/// <param name="isSnowball">Passed to <c>GetCollector</c>.</param>
/// <param name="indelSource">Indel source; the evaluator receives <c>indelSource.DeepCopy()</c>.</param>
/// <param name="chromosome">Chromosome name forwarded to the evaluator, indel finder and result.</param>
/// <param name="masterLookup">Forwarded as <c>masterLookup</c>.</param>
/// <param name="hasIndels">Forwarded as <c>hasExistingIndels</c>.</param>
/// <param name="outcomesLookup">Forwarded as <c>masterOutcomesLookup</c>.</param>
/// <param name="skipRestitchIfNothingChanged">Forwarded under the same name.</param>
/// <returns>The fully wired realigner/combiner.</returns>
public ReadPairRealignerAndCombiner GetRealignPairHandler(bool tryRestitch, bool alreadyStitched,
    bool pairAwareRealign, Dictionary<int, string> refIdMapping, ReadStatusCounter statusCounter, bool isSnowball,
    IChromosomeIndelSource indelSource, string chromosome, Dictionary<string, IndelEvidence> masterLookup,
    bool hasIndels, Dictionary<HashableIndel, int[]> outcomesLookup, bool skipRestitchIfNothingChanged)
{
    var pairHandler = new PairHandler(refIdMapping, GetStitcher(), tryStitch: tryRestitch);
    var realignmentJudger = new RealignmentJudger(GetAlignmentComparer());

    // The realigner gets its own comparer instance, separate from the judger's.
    var realignerComparer = GetAlignmentComparer();

    var remaskMessySoftclips = _geminiOptions.RemaskMessySoftclips;
    var keepProbeSoftclip = _geminiOptions.KeepProbeSoftclip;
    var keepBothSideSoftclips = _geminiOptions.KeepBothSideSoftclips;
    var trackActualMismatches = _realignmentAssessmentOptions.TrackActualMismatches;
    var checkSoftclipsForMismatches = _realignmentAssessmentOptions.CheckSoftclipsForMismatches;

    // TODO (carried over): the original author flagged something to fix in this realigner setup,
    // but the specifics were not recorded.
    var geminiRealigner = new GeminiReadRealigner(
        realignerComparer,
        remaskSoftclips: remaskMessySoftclips,
        keepProbeSoftclips: keepProbeSoftclip,
        keepBothSideSoftclips: keepBothSideSoftclips || (keepProbeSoftclip && alreadyStitched),
        trackActualMismatches: trackActualMismatches,
        checkSoftclipsForMismatches: checkSoftclipsForMismatches,
        debug: _geminiOptions.Debug,
        // Only mask Ns when none of the soft-clip retention options is switched on.
        maskNsOnly: !remaskMessySoftclips && !keepProbeSoftclip && !keepBothSideSoftclips,
        maskPartialInsertion: _realignmentOptions.MaskPartialInsertion,
        minimumUnanchoredInsertionLength: _realignmentOptions.MinimumUnanchoredInsertionLength,
        minInsertionSizeToAllowMismatchingBases: 4,
        maxProportionInsertSequenceMismatch: 0.2);

    // Summary handler by default; swapped for the full debug handler when Debug is on.
    IStatusHandler statusHandler = new DebugSummaryStatusHandler(statusCounter);
    if (_geminiOptions.Debug)
    {
        statusHandler = new DebugStatusHandler(statusCounter);
    }

    // Only softclip unknowns if it is not stitched to begin with (we believe in these more, plus it makes
    // our lives simpler for dealing with stitched directions).
    var softclipUnknownIndels = !alreadyStitched && _geminiOptions.SoftclipUnknownIndels;

    // A DummyRegionFilterer is supplied; the RegionFilterer(chromosome, indelSource.Indels) alternative
    // is left disabled.
    var regionFilterer = new DummyRegionFilterer();

    var collector = GetCollector(isSnowball);

    var evaluator = new RealignmentEvaluator(
        indelSource.DeepCopy(),
        statusHandler,
        geminiRealigner,
        realignmentJudger,
        chromosome,
        trackActualMismatches,
        checkSoftclipsForMismatches,
        _geminiOptions.AllowRescoringOrigZero,
        softclipUnknownIndels,
        regionFilterer,
        _geminiOptions.LightDebug);

    var restitcher = GetRestitcher(pairHandler, statusHandler);
    var indelFinder = GetIndelFinder(pairAwareRealign, chromosome, indelSource);

    return new ReadPairRealignerAndCombiner(
        collector,
        restitcher,
        evaluator,
        indelFinder,
        chromosome,
        alreadyStitched,
        pairAwareRealign,
        masterLookup: masterLookup,
        hasExistingIndels: hasIndels,
        masterOutcomesLookup: outcomesLookup,
        skipRestitchIfNothingChanged: skipRestitchIfNothingChanged,
        allowedToStitch: !_geminiOptions.SkipStitching);
}
/// <summary>
/// Aggregates one region's worth of pair results. Finalizes the indels for the chromosome, defers pairs that
/// reach past the edge threshold to the next block (unless this is the final task), records mess evidence per
/// bin, runs <c>ProcessCategory</c> for the snowball categories first and then for the remaining categories,
/// updates the master outcome / final-indel / progress counters, and returns the alignments ready to flush
/// together with the edge state for the next block.
/// </summary>
/// <param name="indelLookup">Indel evidence synced with the master lookup via <c>GetAndSyncFinalIndelLookup</c>.</param>
/// <param name="startPosition">Region start; used in log messages, the edge state name, and passed through to helpers.</param>
/// <param name="endPosition">Region end; used in log messages, the edge state name, and passed through to helpers.</param>
/// <param name="isFinalTask">When true, no pairs are deferred and the returned edge state is just named "Final".</param>
/// <param name="regionData">Pair results, bin evidence, effective min/max positions and the previous edge state.</param>
/// <returns>The alignments to flush and the edge state for the following block.</returns>
public AggregateRegionResults GetAggregateRegionResults(ConcurrentDictionary<string, IndelEvidence> indelLookup,
    int startPosition, int endPosition, bool isFinalTask, RegionDataForAggregation regionData)
{
    if (_geminiOptions.LightDebug)
    {
        Logger.WriteToLog(
            $"Started processing for region {_chrom}:{startPosition}-{endPosition}.");
    }

    var adjustedStartPosition = regionData.EffectiveMinPosition;
    // Pairs whose max position exceeds this are handed to the next block (when not the final task).
    var edgeThresholdOrig = Math.Max(1, regionData.EffectiveMaxPosition - 5000);
    var finalIndelLookup = GetAndSyncFinalIndelLookup(indelLookup, _masterIndelLookup);
    var edgeState = regionData.EdgeState;
    var nextEdgeMinPosition = int.MaxValue;

    var finalizedIndels = FinalizeIndels(finalIndelLookup, _chrReference, regionData.EffectiveMaxPosition);
    var finalizedIndelsForChrom = GetFinalizedIndelsForChrom(_chrom, finalizedIndels, edgeState);

    var messySiteWidth = _geminiOptions.MessySiteWidth;
    const int binsToExtendTo = 2; // Treated as <, so 2 means we get to extend status to one on either side

    var binEvidence = regionData.BinEvidence;
    var binConclusions = new BinConclusions(
        binEvidence,
        _geminiOptions.CollectDepth,
        trackDirectionalMess: _geminiOptions.SilenceDirectionalMessReads,
        trackMapqMess: _geminiOptions.SilenceMessyMapMessReads);
    var numBins = binEvidence.NumBins;

    // With no finalized indels there is nothing to realign against.
    var shouldRealignAtAll = finalizedIndelsForChrom.Any();

    var numPairsSentToNextBlock = 0;
    var pairResultsForNextBlock = new Dictionary<PairClassification, List<PairResult>>();
    var pairResultLookup = new Dictionary<PairClassification, List<PairResult>>();

    // Copy the region's pair results into a local, mutable lookup.
    foreach (var sourceKey in regionData.PairResultLookup.Keys)
    {
        if (!pairResultLookup.TryGetValue(sourceKey, out var pooledResults))
        {
            pooledResults = new List<PairResult>();
            pairResultLookup.Add(sourceKey, pooledResults);
        }

        pooledResults.AddRange(regionData.PairResultLookup[sourceKey]);
    }

    foreach (var category in pairResultLookup)
    {
        var categoryKey = category.Key;
        var isMessy = TypeClassifier.MessyTypes.Contains(categoryKey);
        var isIndel = TypeClassifier._indelTypes.Contains(categoryKey);
        var isSingleMismatch = _geminiOptions.AvoidLikelySnvs &&
                               (categoryKey == PairClassification.SingleMismatchStitched ||
                                categoryKey == PairClassification.UnstitchSingleMismatch);
        var isForwardOnlyMessy = IsForwardMessy(categoryKey);
        var isReverseOnlyMessy = IsReverseMessy(categoryKey);
        var isMapMessy = IsSuspiciousMapping(categoryKey);

        foreach (var pairResult in category.Value)
        {
            // If on the edge, kick it over to the edge lookup.
            if (!isFinalTask && pairResult.ReadPair.MaxPosition > edgeThresholdOrig)
            {
                numPairsSentToNextBlock++;

                if (!pairResultsForNextBlock.TryGetValue(categoryKey, out var deferredResults))
                {
                    deferredResults = new List<PairResult>();
                    pairResultsForNextBlock.Add(categoryKey, deferredResults);
                }

                deferredResults.Add(pairResult);
                nextEdgeMinPosition = Math.Min(nextEdgeMinPosition, pairResult.ReadPair.MinPosition);
            }

            // Still collect evidence even if it's edge, because that could impact this block as well as next block.
            binEvidence.AddMessEvidence(isMessy, pairResult, isIndel, isSingleMismatch, isForwardOnlyMessy,
                isReverseOnlyMessy, isMapMessy);
        }
    }

    var numRetrievedFromLastBlock = AddAlignmentsFromEdgeState(edgeState, pairResultLookup, 0);

    var finalizedBins = new UsableBins(binConclusions);

    if (shouldRealignAtAll)
    {
        binConclusions.AddIndelEvidence(finalizedIndelsForChrom, binsToExtendTo);
        binConclusions.ProcessRegions(
            _geminiOptions.MessySiteThreshold,
            _geminiOptions.ImperfectFreqThreshold,
            _geminiOptions.RegionDepthThreshold,
            _geminiOptions.IndelRegionFreqThreshold,
            binsToExtendTo,
            _geminiOptions.DirectionalMessThreshold);
        finalizedBins.FinalizeConclusions(binsToExtendTo);
    }

    IChromosomeIndelSource indelSource;
    using (var snippetSource = _dataSourceFactory.CreateGenomeSnippetSource(_chrom, _chrReference))
    {
        indelSource = _dataSourceFactory.GetChromosomeIndelSource(finalizedIndelsForChrom, snippetSource);
    }

    // Pairs deferred to the next block must not also be processed in this one.
    foreach (var deferredEntry in pairResultsForNextBlock)
    {
        var remainingForCategory = pairResultLookup[deferredEntry.Key];
        foreach (var deferredPair in deferredEntry.Value)
        {
            remainingForCategory.Remove(deferredPair);
        }
    }

    var allAlignments = new List<BamAlignment>();
    var outcomesLookup = new Dictionary<HashableIndel, int[]>();

    var numSkippedDueToSites = 0;
    var numKept = 0;
    var numRealigned = 0;
    var numSilenced = 0;

    var snowballCategories = _realignmentOptions.CategoriesForSnowballing;
    var doSnowball = snowballCategories.Any();

    // Snowball categories are pulled out of the lookup and processed first.
    foreach (var snowballCategory in snowballCategories)
    {
        if (!pairResultLookup.Remove(snowballCategory, out var categoryReads))
        {
            continue;
        }

        allAlignments.AddRange(ProcessCategory(_categoriesForRealignment, indelSource, shouldRealignAtAll,
            outcomesLookup, ref numSkippedDueToSites, ref numKept, ref numRealigned, ref numSilenced,
            categoryReads, snowballCategory, binEvidence, _progressTracker, binConclusions, finalizedBins,
            startPosition, endPosition));
    }

    List<HashableIndel> superFinalizedIndels;

    if (!doSnowball)
    {
        superFinalizedIndels = finalizedIndelsForChrom;
    }
    else
    {
        superFinalizedIndels = GetSuperFinalizedIndelsAfterSnowball(finalizedIndelsForChrom, outcomesLookup);

        if (_geminiOptions.Debug)
        {
            Logger.WriteToLog(
                $"After snowballing for region {_chrom}:{startPosition}-{endPosition}, filtered down to {superFinalizedIndels.Count} indels from {finalizedIndelsForChrom.Count} ({finalIndelLookup.Count} preliminary indels).");
        }

        // Rebuild the indel source from the indels that survived snowballing.
        using (var snippetSource = _dataSourceFactory.CreateGenomeSnippetSource(_chrom, _chrReference))
        {
            indelSource = _dataSourceFactory.GetChromosomeIndelSource(superFinalizedIndels, snippetSource);
        }

        if (_geminiOptions.RecalculateUsableSitesAfterSnowball)
        {
            binConclusions.ResetIndelRegions();

            foreach (var indel in superFinalizedIndels)
            {
                var bin = (indel.ReferencePosition - adjustedStartPosition) / messySiteWidth;
                binConclusions.SetIndelRegionTrue(bin);

                // Extend downward, stopping once the bin index would go negative.
                for (var offset = 0; offset < binsToExtendTo && bin - offset >= 0; offset++)
                {
                    binConclusions.SetIndelRegionTrue(bin - offset);
                }

                // Extend upward, stopping as soon as SetIndelRegionTrue reports false.
                for (var offset = 0; offset < binsToExtendTo; offset++)
                {
                    if (!binConclusions.SetIndelRegionTrue(bin + offset))
                    {
                        break;
                    }
                }
            }

            finalizedBins.FinalizeConclusions(binsToExtendTo);
        }
    }

    // TODO pull out the allocs below, or ideally actually remove them from realign pair handler or use something different altogether
    foreach (var category in pairResultLookup)
    {
        if (snowballCategories.Contains(category.Key))
        {
            continue;
        }

        allAlignments.AddRange(ProcessCategory(_categoriesForRealignment, indelSource, shouldRealignAtAll,
            outcomesLookup, ref numSkippedDueToSites, ref numKept, ref numRealigned, ref numSilenced,
            category.Value, category.Key, binEvidence, _progressTracker, binConclusions, finalizedBins,
            startPosition, endPosition));
    }

    var edgeHits = new Dictionary<int, int>();
    var edgeSingleMismatchHits = new Dictionary<int, int>();
    var edgeIndelHits = new Dictionary<int, int>();
    var edgeMessyHits = new Dictionary<int, int>();
    PopulateEdgeHitsAndLogBins(numBins, adjustedStartPosition, messySiteWidth, nextEdgeMinPosition, binEvidence,
        edgeHits, edgeSingleMismatchHits, edgeIndelHits, edgeMessyHits, startPosition, binConclusions,
        finalizedBins);

    UpdateMasterOutcomes(_masterOutcomesLookup, outcomesLookup);

    foreach (var hashableIndel in superFinalizedIndels)
    {
        _masterFinalIndels.AddOrUpdate(hashableIndel, 1, (unusedKey, seen) => seen + 1);
    }

    // Adds `amount` to the named progress counter, creating it at `amount` when absent.
    void Tally(string counterName, int amount)
    {
        _progressTracker.AddOrUpdate(counterName, amount, (unusedKey, running) => running + amount);
    }

    var flushedCount = allAlignments.Count;
    Tally("Flushed", flushedCount);
    Tally("Sent To Next Block", numPairsSentToNextBlock);
    Tally("Retrieved from Past Block", numRetrievedFromLastBlock);
    Tally("Realigned", numRealigned);
    Tally("Attempts", numKept);
    Tally("Skipped", numSkippedDueToSites);
    Tally("Silenced", numSilenced);

    pairResultLookup.Clear();

    Logger.WriteToLog(
        $"Finished processing for region {_chrom}:{startPosition}-{endPosition}. {flushedCount} alignments flushed, " +
        $"{numPairsSentToNextBlock} sent to next block, {numRetrievedFromLastBlock} retrieved from {regionData.EdgeState?.Name}. " +
        $"Realigned {numRealigned}/{numKept} attempts ({numSkippedDueToSites} pairs skipped realignment), silenced {numSilenced} messy mates.");

    EdgeState nextEdgeState;
    if (isFinalTask)
    {
        nextEdgeState = new EdgeState { Name = "Final" };
    }
    else
    {
        nextEdgeState = new EdgeState
        {
            EdgeAlignments = pairResultsForNextBlock,
            EdgeIndels = finalizedIndelsForChrom
                .Where(y => y.ReferencePosition > nextEdgeMinPosition)
                .ToList(),
            EffectiveMinPosition = nextEdgeMinPosition,
            Name = $"{startPosition}-{endPosition}",
            BinEvidence = binEvidence
        };
    }

    return new AggregateRegionResults
    {
        EdgeState = nextEdgeState,
        AlignmentsReadyToBeFlushed = allAlignments
    };
}
/// <summary>
/// Processes every pair result of one classification: optionally realigns and/or restitches each pair through a
/// realign handler, tags the resulting alignments with the read treatment ("XT") and classification ("XP"),
/// updates the running counters, and returns all resulting alignments. <paramref name="pairResults"/> is
/// cleared before returning.
/// </summary>
/// <param name="categoriesForRealignment">Classifications eligible for realignment.</param>
/// <param name="indelSource">Indel source handed to the realign handler.</param>
/// <param name="shouldRealignAtAll">When false, no pair is realigned (every handled pair counts as skipped).</param>
/// <param name="outcomesLookup">Outcome lookup handed to the realign handler.</param>
/// <param name="numSkippedDueToSites">Incremented for each handled pair that is not sent for realignment.</param>
/// <param name="numKept">Incremented for each pair that is sent for realignment.</param>
/// <param name="numRealigned">Incremented for each pair whose read pair reports being realigned.</param>
/// <param name="numSilenced">Incremented for each pair with a non-zero silence code.</param>
/// <param name="pairResults">Pair results to process; emptied on return.</param>
/// <param name="classification">Classification shared by all <paramref name="pairResults"/>.</param>
/// <param name="binEvidence">Not used by this method.</param>
/// <param name="progressTracker">Receives one "classification:treatment" increment per pair.</param>
/// <param name="binConclusions">Passed to <c>ReadsToSilence</c>.</param>
/// <param name="usableBins">Decides whether a pair's min/max position is usable for realignment.</param>
/// <param name="startPosition">Not used by this method.</param>
/// <param name="endPosition">Not used by this method.</param>
/// <returns>All alignments produced for this category.</returns>
private List<BamAlignment> ProcessCategory(
    List<PairClassification> categoriesForRealignment, IChromosomeIndelSource indelSource,
    bool shouldRealignAtAll, Dictionary<HashableIndel, int[]> outcomesLookup, ref int numSkippedDueToSites,
    ref int numKept, ref int numRealigned, ref int numSilenced,
    List<PairResult> pairResults, PairClassification classification, IBinEvidence binEvidence,
    ConcurrentDictionary<string, int> progressTracker, BinConclusions binConclusions, UsableBins usableBins,
    int startPosition, int endPosition)
{
    var allAlignments = new List<BamAlignment>();

    var isHighLikelihoodForRealign = false;
    if (_geminiOptions.ForceHighLikelihoodRealigners)
    {
        var highLikelihoodCategories = new List<PairClassification>()
        {
            PairClassification.Disagree,
            PairClassification.MessyStitched,
            PairClassification.MessySplit,
            PairClassification.UnstitchMessy,
            PairClassification.UnstitchIndel
        };

        isHighLikelihoodForRealign = highLikelihoodCategories.Contains(classification);
    }

    var doRealign = false;
    ReadPairRealignerAndCombiner realignHandler = null;
    var alreadyStitched = ClassificationIsStitched(classification);
    var doStitch = !_geminiOptions.SkipStitching && TypeClassifier.ClassificationIsStitchable(classification);
    var categoryIsRealignable = categoriesForRealignment.Contains(classification);

    // A handler is needed whenever the category may be realigned or (re)stitched.
    if (categoryIsRealignable || doStitch)
    {
        doRealign = true;

        realignHandler = _bamRealignmentFactory.GetRealignPairHandler(doStitch,
            alreadyStitched,
            _realignmentOptions.PairAwareEverything ||
            ClassificationIsPairAwareRealignable(classification),
            _refIdMapping,
            new ReadStatusCounter(), false, indelSource, _chrom, new Dictionary<string, IndelEvidence>(),
            ClassificationHasIndels(classification), outcomesLookup,
            SkipRestitchIfUnchanged(classification));
    }

    using (var snippetSource = _dataSourceFactory.CreateGenomeSnippetSource(_chrom, _chrReference))
    using (var singleSnippetSource = new ReusableSnippetSource(snippetSource))
    {
        var nmCalculator = new NmCalculator(singleSnippetSource);
        var classificationString = classification.ToString();

        foreach (var pairResult in pairResults)
        {
            int toSilence = 0;

            IEnumerable<BamAlignment> alignments;
            if (!doRealign)
            {
                // No handler for this category: pass the pair's alignments through untouched.
                alignments = pairResult.Alignments;
            }
            else
            {
                // Realign only when indels exist at all and either the category is forced as high-likelihood
                // or it is realignable with at least one end of the pair in a usable bin.
                bool doRealignPair =
                    shouldRealignAtAll && (isHighLikelihoodForRealign ||
                                           (categoryIsRealignable &&
                                            (usableBins.IsPositionUsable(pairResult.ReadPair.MinPosition) ||
                                             usableBins.IsPositionUsable(pairResult.ReadPair.MaxPosition))));

                if (!doRealignPair)
                {
                    numSkippedDueToSites++;
                }
                else
                {
                    numKept++;
                }

                toSilence = ReadsToSilence(classification, binConclusions, pairResult);
                if (toSilence > 0)
                {
                    numSilenced++;
                }

                alignments = realignHandler.ExtractReads(pairResult, nmCalculator, doRealignPair, toSilence);

                if (pairResult.ReadPair.Realigned || pairResult.ReadPair.RealignedR1 ||
                    pairResult.ReadPair.RealignedR2)
                {
                    numRealigned++;
                }
            }

            // Silence code: 1 = R1, 2 = R2, 3 = both. A read that ended up realigned is not reported as silenced.
            var silencedR1 = (toSilence == 1 || toSilence == 3) && !pairResult.ReadPair.RealignedR1;
            var silencedR2 = (toSilence == 2 || toSilence == 3) && !pairResult.ReadPair.RealignedR2;
            var readTreatment = ReadTreatment(silencedR1, silencedR2, pairResult);

            progressTracker.AddOrUpdate(classificationString + ":" + readTreatment, 1,
                (x, currentCount) => { return currentCount + 1; });

            // Materialize once so the sequence is enumerated a single time.
            var alignmentsList = alignments.ToList();
            foreach (var bamAlignment in alignmentsList)
            {
                if (_geminiOptions.LightDebug)
                {
                    AddMdTagCountsTags(bamAlignment, pairResult);
                }

                bamAlignment.ReplaceOrAddStringTag("XT", readTreatment);
                bamAlignment.ReplaceOrAddStringTag("XP", classificationString);
            }

            allAlignments.AddRange(alignmentsList);
        }
    }

    realignHandler?.Finish();

    pairResults.Clear();
    return allAlignments;
}
/// <summary>
/// Picks the indel finder implementation: <see cref="PairSpecificIndelFinder"/> when
/// <paramref name="pairAwareRealign"/> is true, otherwise <see cref="NonPairSpecificIndelFinder"/>.
/// </summary>
/// <param name="pairAwareRealign">Selects the pair-specific finder when true.</param>
/// <param name="chromosome">Not used by this method.</param>
/// <param name="indelSource">Not used by this method.</param>
/// <returns>A newly constructed indel finder.</returns>
private IPairSpecificIndelFinder GetIndelFinder(bool pairAwareRealign, string chromosome,
    IChromosomeIndelSource indelSource)
{
    if (!pairAwareRealign)
    {
        return new NonPairSpecificIndelFinder();
    }

    return new PairSpecificIndelFinder();
}