Exemple #1
0
 /// <summary>
 /// Creates an evaluator by capturing every supplied collaborator and option flag in the
 /// private field of the matching name. The constructor performs no other work.
 /// </summary>
 /// <param name="indelSource">Indel source stored for later use by the evaluator.</param>
 /// <param name="statusCounter">Status handler stored for later use by the evaluator.</param>
 /// <param name="readRealinger">
 /// The read realigner. The parameter name carries a historical misspelling that is kept
 /// so that callers using named arguments continue to compile.
 /// </param>
 /// <param name="judger">Realignment judger stored for later use by the evaluator.</param>
 /// <param name="chromosome">Chromosome name stored for later use by the evaluator.</param>
 /// <param name="trackActualMismatches">Option flag, stored as-is.</param>
 /// <param name="checkSoftclipsForMismatches">Option flag, stored as-is.</param>
 /// <param name="allowRescoringOrig0">Option flag, stored as-is.</param>
 /// <param name="softclipUnknownIndels">Option flag, stored as-is.</param>
 /// <param name="regionFilterer">Region filterer stored for later use by the evaluator.</param>
 /// <param name="lightDebug">Option flag, stored as-is.</param>
 public RealignmentEvaluator(
     IChromosomeIndelSource indelSource,
     IStatusHandler statusCounter,
     IReadRealigner readRealinger,
     IRealignmentJudger judger,
     string chromosome,
     bool trackActualMismatches,
     bool checkSoftclipsForMismatches,
     bool allowRescoringOrig0,
     bool softclipUnknownIndels,
     IRegionFilterer regionFilterer,
     bool lightDebug)
 {
     // Collaborating services.
     _indelSource = indelSource;
     _statusCounter = statusCounter;
     _readRealigner = readRealinger;
     _judger = judger;
     _regionFilterer = regionFilterer;

     // Context.
     _chromosome = chromosome;

     // Behaviour switches.
     _trackActualMismatches = trackActualMismatches;
     _checkSoftclipsForMismatches = checkSoftclipsForMismatches;
     _allowRescoringOrig0 = allowRescoringOrig0;
     _softclipUnknownIndels = softclipUnknownIndels;
     _lightDebug = lightDebug;
 }
        /// <summary>
        /// Wires together the pair handler, judger, read realigner, status handler and realignment
        /// evaluator, and wraps them in a new <see cref="ReadPairRealignerAndCombiner"/>.
        /// </summary>
        /// <param name="tryRestitch">Forwarded to the pair handler as its <c>tryStitch</c> argument.</param>
        /// <param name="alreadyStitched">
        /// Whether the reads are already stitched. When true, unknown indels are never softclipped and,
        /// if probe softclips are kept, both-side softclips are kept as well.
        /// </param>
        /// <param name="pairAwareRealign">Selects the indel finder and is forwarded to the returned handler.</param>
        /// <param name="refIdMapping">Forwarded to the pair handler.</param>
        /// <param name="statusCounter">Counter wrapped by the status handler created here.</param>
        /// <param name="isSnowball">Forwarded to <c>GetCollector</c>.</param>
        /// <param name="indelSource">A deep copy of this source is handed to the realignment evaluator.</param>
        /// <param name="chromosome">Chromosome name forwarded to the evaluator, indel finder and returned handler.</param>
        /// <param name="masterLookup">Forwarded to the returned handler.</param>
        /// <param name="hasIndels">Forwarded to the returned handler as <c>hasExistingIndels</c>.</param>
        /// <param name="outcomesLookup">Forwarded to the returned handler as <c>masterOutcomesLookup</c>.</param>
        /// <param name="skipRestitchIfNothingChanged">Forwarded to the returned handler.</param>
        /// <returns>A freshly constructed <see cref="ReadPairRealignerAndCombiner"/>.</returns>
        public ReadPairRealignerAndCombiner GetRealignPairHandler(bool tryRestitch, bool alreadyStitched,
                                                                  bool pairAwareRealign,
                                                                  Dictionary <int, string> refIdMapping, ReadStatusCounter statusCounter, bool isSnowball,
                                                                  IChromosomeIndelSource indelSource, string chromosome, Dictionary <string, IndelEvidence> masterLookup,
                                                                  bool hasIndels, Dictionary <HashableIndel, int[]> outcomesLookup, bool skipRestitchIfNothingChanged)
        {
            var pairHandler = new PairHandler(refIdMapping, GetStitcher(), tryStitch: tryRestitch);

            var realignmentJudger = new RealignmentJudger(GetAlignmentComparer());

            // The realigner gets its own comparer instance, requested before any option is read.
            var realignerComparer = GetAlignmentComparer();

            var remaskMessySoftclips        = _geminiOptions.RemaskMessySoftclips;
            var keepProbeSoftclip           = _geminiOptions.KeepProbeSoftclip;
            var keepBothSideSoftclips       = _geminiOptions.KeepBothSideSoftclips;
            var trackActualMismatches       = _realignmentAssessmentOptions.TrackActualMismatches;
            var checkSoftclipsForMismatches = _realignmentAssessmentOptions.CheckSoftclipsForMismatches;
            var debugEnabled                = _geminiOptions.Debug;

            // Keeping probe softclips on an already-stitched read implies keeping both sides.
            var keepBothSides = keepBothSideSoftclips || (keepProbeSoftclip && alreadyStitched);

            // Only Ns are masked when none of the softclip-preserving options is switched on.
            var maskNsOnly = !remaskMessySoftclips && !keepProbeSoftclip && !keepBothSideSoftclips;

            // TODO: the original author flagged the two hard-coded insertion-mismatch settings below
            // for a fix, but did not record what the intended fix was.
            var geminiRealigner = new GeminiReadRealigner(
                realignerComparer,
                remaskSoftclips: remaskMessySoftclips,
                keepProbeSoftclips: keepProbeSoftclip,
                keepBothSideSoftclips: keepBothSides,
                trackActualMismatches: trackActualMismatches,
                checkSoftclipsForMismatches: checkSoftclipsForMismatches,
                debug: debugEnabled,
                maskNsOnly: maskNsOnly,
                maskPartialInsertion: _realignmentOptions.MaskPartialInsertion,
                minimumUnanchoredInsertionLength: _realignmentOptions.MinimumUnanchoredInsertionLength,
                minInsertionSizeToAllowMismatchingBases: 4,
                maxProportionInsertSequenceMismatch: 0.2);

            // The summary handler is the default; full debug mode replaces it with the detailed one.
            IStatusHandler statusHandler = new DebugSummaryStatusHandler(statusCounter);
            if (debugEnabled)
            {
                statusHandler = new DebugStatusHandler(statusCounter);
            }

            // Only softclip unknowns if it is not stitched to begin with (we believe in these more,
            // plus it makes our lives simpler for dealing with stitched directions).
            var softclipUnknowns = !alreadyStitched && _geminiOptions.SoftclipUnknownIndels;

            // Region filtering is currently disabled: a dummy filterer stands in for a real
            // RegionFilterer built from (chromosome, indelSource.Indels).
            var regionFilterer = new DummyRegionFilterer();
            var collector      = GetCollector(isSnowball);

            var evaluator = new RealignmentEvaluator(
                indelSource.DeepCopy(),
                statusHandler,
                geminiRealigner,
                realignmentJudger,
                chromosome,
                trackActualMismatches,
                checkSoftclipsForMismatches,
                _geminiOptions.AllowRescoringOrigZero,
                softclipUnknowns,
                regionFilterer,
                _geminiOptions.LightDebug);

            var restitcher  = GetRestitcher(pairHandler, statusHandler);
            var indelFinder = GetIndelFinder(pairAwareRealign, chromosome, indelSource);

            return new ReadPairRealignerAndCombiner(
                collector,
                restitcher,
                evaluator,
                indelFinder,
                chromosome,
                alreadyStitched,
                pairAwareRealign,
                masterLookup: masterLookup,
                hasExistingIndels: hasIndels,
                masterOutcomesLookup: outcomesLookup,
                skipRestitchIfNothingChanged: skipRestitchIfNothingChanged,
                allowedToStitch: !_geminiOptions.SkipStitching);
        }
        /// <summary>
        /// Aggregates the read pairs collected for one region. Pairs close to the region's effective end
        /// are deferred to the next block (unless this is the final task), mess/indel evidence is added
        /// to the bins, the remaining pairs are processed category by category (snowball categories
        /// first), and the master outcome/indel lookups and progress counters are updated.
        /// </summary>
        /// <param name="indelLookup">Indel evidence for the region; synced against the master indel lookup.</param>
        /// <param name="startPosition">Region start; used for logging, forwarded to helpers, and used to name the edge state.</param>
        /// <param name="endPosition">Region end; used for logging, forwarded to helpers, and used to name the edge state.</param>
        /// <param name="isFinalTask">
        /// When true no pair is deferred to a following block and the returned edge state is the
        /// placeholder named "Final".
        /// </param>
        /// <param name="regionData">
        /// Supplies the effective min/max positions, the edge state inherited from the previous block,
        /// the bin evidence and the pair results grouped by classification.
        /// </param>
        /// <returns>
        /// The alignments ready to be flushed together with the edge state to hand to the next block.
        /// </returns>
        public AggregateRegionResults GetAggregateRegionResults(ConcurrentDictionary <string, IndelEvidence> indelLookup,
                                                                int startPosition,
                                                                int endPosition, bool isFinalTask, RegionDataForAggregation regionData)
        {
            // Pairs whose max position lies within this many bases of the effective region end are
            // handed over to the next block instead of being processed here.
            const int edgeBufferSize = 5000;
            const int binsToExtendTo = 2; // Treated as <, so 2 means we get to extend status to one on either side

            if (_geminiOptions.LightDebug)
            {
                Logger.WriteToLog(
                    $"Started processing for region {_chrom}:{startPosition}-{endPosition}.");
            }

            var adjustedStartPosition = regionData.EffectiveMinPosition;
            var edgeThresholdOrig     = Math.Max(1, regionData.EffectiveMaxPosition - edgeBufferSize);
            var finalIndelLookup      = GetAndSyncFinalIndelLookup(indelLookup, _masterIndelLookup);
            var edgeState             = regionData.EdgeState;
            var nextEdgeMinPosition   = int.MaxValue;

            var finalizedIndels         = FinalizeIndels(finalIndelLookup, _chrReference, regionData.EffectiveMaxPosition);
            var finalizedIndelsForChrom = GetFinalizedIndelsForChrom(_chrom, finalizedIndels, edgeState);

            var messySiteWidth = _geminiOptions.MessySiteWidth;

            var binEvidence    = regionData.BinEvidence;
            var binConclusions = new BinConclusions(binEvidence, _geminiOptions.CollectDepth, trackDirectionalMess: _geminiOptions.SilenceDirectionalMessReads, trackMapqMess: _geminiOptions.SilenceMessyMapMessReads);
            var numBins        = binEvidence.NumBins;

            // With no finalized indels on this chromosome there is nothing to realign against.
            bool shouldRealignAtAll = finalizedIndelsForChrom.Any();

            var imperfectFreqThreshold   = _geminiOptions.ImperfectFreqThreshold;
            var indelRegionfreqThreshold = _geminiOptions.IndelRegionFreqThreshold;
            var messySiteThreshold       = _geminiOptions.MessySiteThreshold;

            var numPairsSentToNextBlock = 0;
            var pairResultsForNextBlock = new Dictionary <PairClassification, List <PairResult> >();

            // Work on private copies of the per-category lists so that the lists owned by regionData
            // are not mutated by the removals and clears performed further down.
            var pairResultLookup = new Dictionary <PairClassification, List <PairResult> >();

            foreach (var key in regionData.PairResultLookup.Keys)
            {
                if (!pairResultLookup.TryGetValue(key, out var copiedPairs))
                {
                    copiedPairs = new List <PairResult>();
                    pairResultLookup.Add(key, copiedPairs);
                }

                copiedPairs.AddRange(regionData.PairResultLookup[key]);
            }

            foreach (var category in pairResultLookup)
            {
                var isMessy          = TypeClassifier.MessyTypes.Contains(category.Key);
                var isIndel          = TypeClassifier._indelTypes.Contains(category.Key);
                var isSingleMismatch = _geminiOptions.AvoidLikelySnvs &&
                                       (category.Key == PairClassification.SingleMismatchStitched ||
                                        category.Key == PairClassification.UnstitchSingleMismatch);
                var isForwardOnlyMessy = IsForwardMessy(category.Key);
                var isReverseOnlyMessy = IsReverseMessy(category.Key);
                var isMapMessy         = IsSuspiciousMapping(category.Key);

                foreach (var pairResult in category.Value)
                {
                    // If on the edge, kick it over to the edge lookup.
                    if (!isFinalTask && pairResult.ReadPair.MaxPosition > edgeThresholdOrig)
                    {
                        numPairsSentToNextBlock++;

                        if (!pairResultsForNextBlock.TryGetValue(category.Key, out var deferredPairs))
                        {
                            deferredPairs = new List <PairResult>();
                            pairResultsForNextBlock.Add(category.Key, deferredPairs);
                        }

                        deferredPairs.Add(pairResult);

                        nextEdgeMinPosition = Math.Min(nextEdgeMinPosition, pairResult.ReadPair.MinPosition);
                    }

                    // Still collect evidence even if it's edge, because that could impact this block as well as next block.
                    binEvidence.AddMessEvidence(isMessy, pairResult, isIndel, isSingleMismatch, isForwardOnlyMessy,
                                                isReverseOnlyMessy, isMapMessy);
                }
            }

            // Pairs inherited from the previous block are merged in only after the edge scan above,
            // so they are not considered for deferral a second time.
            int numRetrievedFromLastBlock = AddAlignmentsFromEdgeState(edgeState, pairResultLookup, 0);

            var finalizedBins = new UsableBins(binConclusions);

            if (shouldRealignAtAll)
            {
                binConclusions.AddIndelEvidence(finalizedIndelsForChrom, binsToExtendTo);
                binConclusions.ProcessRegions(messySiteThreshold, imperfectFreqThreshold,
                                              _geminiOptions.RegionDepthThreshold, indelRegionfreqThreshold, binsToExtendTo, _geminiOptions.DirectionalMessThreshold);
                finalizedBins.FinalizeConclusions(binsToExtendTo);
            }

            IChromosomeIndelSource indelSource;
            using (var snippetSource = _dataSourceFactory.CreateGenomeSnippetSource(_chrom, _chrReference))
            {
                indelSource =
                    _dataSourceFactory.GetChromosomeIndelSource(finalizedIndelsForChrom, snippetSource);
            }

            // Deferred pairs belong to the next block: take them out of this block's working lists.
            foreach (var deferred in pairResultsForNextBlock)
            {
                var remainingPairs = pairResultLookup[deferred.Key];
                foreach (var pairResult in deferred.Value)
                {
                    remainingPairs.Remove(pairResult);
                }
            }

            var allAlignments  = new List <BamAlignment>();
            var outcomesLookup = new Dictionary <HashableIndel, int[]>();

            var numSkippedDueToSites = 0;
            var numKept      = 0;
            var numRealigned = 0;
            var numSilenced  = 0;

            var snowballCategories = _realignmentOptions.CategoriesForSnowballing;
            var doSnowball         = snowballCategories.Any();

            // Snowball categories go first; their outcomes are used below to narrow the indel set
            // before the remaining categories are processed.
            foreach (var category in snowballCategories)
            {
                if (pairResultLookup.Remove(category, out var categoryReads))
                {
                    allAlignments.AddRange(ProcessCategory(_categoriesForRealignment,
                                                           indelSource, shouldRealignAtAll,
                                                           outcomesLookup, ref numSkippedDueToSites, ref numKept, ref numRealigned, ref numSilenced,
                                                           categoryReads, category, binEvidence, _progressTracker, binConclusions, finalizedBins, startPosition, endPosition));
                }
            }

            List <HashableIndel> superFinalizedIndels;

            if (doSnowball)
            {
                superFinalizedIndels = GetSuperFinalizedIndelsAfterSnowball(finalizedIndelsForChrom, outcomesLookup);

                if (_geminiOptions.Debug)
                {
                    Logger.WriteToLog(
                        $"After snowballing for region {_chrom}:{startPosition}-{endPosition}, filtered down to {superFinalizedIndels.Count} indels from {finalizedIndelsForChrom.Count} ({finalIndelLookup.Count} preliminary indels).");
                }

                // Rebuild the indel source from the narrowed indel set.
                using (var snippetSource = _dataSourceFactory.CreateGenomeSnippetSource(_chrom, _chrReference))
                {
                    indelSource =
                        _dataSourceFactory.GetChromosomeIndelSource(superFinalizedIndels, snippetSource);
                }

                if (_geminiOptions.RecalculateUsableSitesAfterSnowball)
                {
                    binConclusions.ResetIndelRegions();

                    foreach (var indel in superFinalizedIndels)
                    {
                        var bin = (indel.ReferencePosition - adjustedStartPosition) / messySiteWidth;
                        binConclusions.SetIndelRegionTrue(bin);

                        // Extend to lower bins, stopping at the first negative index.
                        // NOTE(review): j == 0 marks `bin` itself again in both loops; this mirrors the
                        // original behaviour and is presumably harmless - confirm SetIndelRegionTrue is idempotent.
                        for (int j = 0; j < binsToExtendTo; j++)
                        {
                            var binIndex = bin - j;
                            if (binIndex >= 0)
                            {
                                binConclusions.SetIndelRegionTrue(binIndex);
                            }
                            else
                            {
                                break;
                            }
                        }

                        // Extend to higher bins, stopping as soon as SetIndelRegionTrue reports failure.
                        for (int j = 0; j < binsToExtendTo; j++)
                        {
                            var binIndex = bin + j;
                            if (!binConclusions.SetIndelRegionTrue(binIndex))
                            {
                                break;
                            }
                        }
                    }

                    finalizedBins.FinalizeConclusions(binsToExtendTo);
                }
            }
            else
            {
                superFinalizedIndels = finalizedIndelsForChrom;
            }

            // TODO pull out the allocs below, or ideally actually remove them from realign pair handler or use something different altogether
            foreach (var category in pairResultLookup)
            {
                if (snowballCategories.Contains(category.Key))
                {
                    continue;
                }

                allAlignments.AddRange(ProcessCategory(_categoriesForRealignment, indelSource,
                                                       shouldRealignAtAll,
                                                       outcomesLookup, ref numSkippedDueToSites, ref numKept, ref numRealigned, ref numSilenced, category.Value,
                                                       category.Key, binEvidence, _progressTracker, binConclusions, finalizedBins, startPosition, endPosition));
            }

            var edgeHits = new Dictionary <int, int>();
            var edgeSingleMismatchHits = new Dictionary <int, int>();
            var edgeIndelHits          = new Dictionary <int, int>();
            var edgeMessyHits          = new Dictionary <int, int>();

            PopulateEdgeHitsAndLogBins(numBins, adjustedStartPosition, messySiteWidth, nextEdgeMinPosition, binEvidence,
                                       edgeHits, edgeSingleMismatchHits, edgeIndelHits, edgeMessyHits, startPosition, binConclusions, finalizedBins);

            UpdateMasterOutcomes(_masterOutcomesLookup, outcomesLookup);

            foreach (var hashableIndel in superFinalizedIndels)
            {
                _masterFinalIndels.AddOrUpdate(hashableIndel, 1, (indel, timesSeen) => timesSeen + 1);
            }

            // allAlignments is not modified past this point, so its size can be captured once.
            var flushedCount = allAlignments.Count;

            // Adds `amount` to the named progress counter, creating the counter when it is missing.
            void AddToProgress(string counterName, int amount)
            {
                _progressTracker.AddOrUpdate(counterName, amount, (name, currentCount) => currentCount + amount);
            }

            AddToProgress("Flushed", flushedCount);
            AddToProgress("Sent To Next Block", numPairsSentToNextBlock);
            AddToProgress("Retrieved from Past Block", numRetrievedFromLastBlock);
            AddToProgress("Realigned", numRealigned);
            AddToProgress("Attempts", numKept);
            AddToProgress("Skipped", numSkippedDueToSites);
            AddToProgress("Silenced", numSilenced);

            pairResultLookup.Clear();
            Logger.WriteToLog(
                $"Finished processing for region {_chrom}:{startPosition}-{endPosition}. {flushedCount} alignments flushed, " +
                $"{numPairsSentToNextBlock} sent to next block, {numRetrievedFromLastBlock} retrieved from {regionData.EdgeState?.Name}. " +
                $"Realigned {numRealigned}/{numKept} attempts ({numSkippedDueToSites} pairs skipped realignment), silenced {numSilenced} messy mates.");

            // The final task has nothing to hand over; otherwise pass on the deferred pairs, the indels
            // located past the earliest deferred pair, and the bin evidence.
            var resultingEdgeState = isFinalTask
                ? new EdgeState()
                {
                    Name = "Final"
                }
                : new EdgeState()
                {
                    EdgeAlignments = pairResultsForNextBlock,
                    EdgeIndels = finalizedIndelsForChrom.Where(y => y.ReferencePosition > nextEdgeMinPosition)
                                 .ToList(),
                    EffectiveMinPosition = nextEdgeMinPosition,
                    Name = $"{startPosition}-{endPosition}",
                    BinEvidence = binEvidence
                };

            return new AggregateRegionResults()
            {
                EdgeState = resultingEdgeState,
                AlignmentsReadyToBeFlushed = allAlignments
            };
        }
        /// <summary>
        /// Processes every pair result of a single classification: optionally realigns and/or stitches
        /// the pair, decides which mates to silence, tags each resulting alignment with its treatment
        /// ("XT") and classification ("XP"), and updates the per-classification progress counters.
        /// </summary>
        /// <param name="categoriesForRealignment">Classifications that are eligible for realignment.</param>
        /// <param name="indelSource">Indel source handed to the realign pair handler.</param>
        /// <param name="shouldRealignAtAll">When false, no pair in this call is realigned.</param>
        /// <param name="outcomesLookup">Outcome lookup handed to the realign pair handler.</param>
        /// <param name="numSkippedDueToSites">Incremented for each pair that goes through the handler without a realignment attempt.</param>
        /// <param name="numKept">Incremented for each pair for which realignment is attempted.</param>
        /// <param name="numRealigned">Incremented for each pair whose read pair reports a realignment after extraction.</param>
        /// <param name="numSilenced">Incremented for each pair with at least one mate marked for silencing.</param>
        /// <param name="pairResults">Pairs to process. The list is cleared before this method returns.</param>
        /// <param name="classification">Classification shared by all <paramref name="pairResults"/>.</param>
        /// <param name="binEvidence">Not read by this method; kept for signature compatibility.</param>
        /// <param name="progressTracker">Receives one "classification:treatment" increment per pair.</param>
        /// <param name="binConclusions">Consulted when deciding which reads to silence.</param>
        /// <param name="usableBins">Consulted to decide whether a pair overlaps a usable position.</param>
        /// <param name="startPosition">Not read by this method; kept for signature compatibility.</param>
        /// <param name="endPosition">Not read by this method; kept for signature compatibility.</param>
        /// <returns>All alignments produced for the supplied pairs, in processing order.</returns>
        private List <BamAlignment> ProcessCategory(
            List <PairClassification> categoriesForRealignment, IChromosomeIndelSource indelSource,
            bool shouldRealignAtAll, Dictionary <HashableIndel, int[]> outcomesLookup, ref int numSkippedDueToSites,
            ref int numKept, ref int numRealigned, ref int numSilenced,
            List <PairResult> pairResults, PairClassification classification, IBinEvidence binEvidence,
            ConcurrentDictionary <string, int> progressTracker, BinConclusions binConclusions, UsableBins usableBins, int startPosition, int endPosition)
        {
            var allAlignments = new List <BamAlignment>();

            // When the option is on, these classifications are realigned regardless of bin usability.
            var isHighLikelihoodForRealign =
                _geminiOptions.ForceHighLikelihoodRealigners &&
                (classification == PairClassification.Disagree ||
                 classification == PairClassification.MessyStitched ||
                 classification == PairClassification.MessySplit ||
                 classification == PairClassification.UnstitchMessy ||
                 classification == PairClassification.UnstitchIndel);

            var alreadyStitched       = ClassificationIsStitched(classification);
            var doStitch              = !_geminiOptions.SkipStitching && TypeClassifier.ClassificationIsStitchable(classification);
            var categoryIsRealignable = categoriesForRealignment.Contains(classification);

            // A handler is needed whenever the category may be realigned or stitched.
            var doRealign = categoryIsRealignable || doStitch;
            ReadPairRealignerAndCombiner realignHandler = null;

            if (doRealign)
            {
                realignHandler = _bamRealignmentFactory.GetRealignPairHandler(doStitch,
                                                                              alreadyStitched,
                                                                              _realignmentOptions.PairAwareEverything ||
                                                                              ClassificationIsPairAwareRealignable(classification),
                                                                              _refIdMapping,
                                                                              new ReadStatusCounter(), false, indelSource, _chrom, new Dictionary <string, IndelEvidence>(),
                                                                              ClassificationHasIndels(classification), outcomesLookup,
                                                                              SkipRestitchIfUnchanged(classification));
            }

            using (var snippetSource = _dataSourceFactory.CreateGenomeSnippetSource(_chrom, _chrReference))
            using (var singleSnippetSource = new ReusableSnippetSource(snippetSource))
            {
                var nmCalculator = new NmCalculator(singleSnippetSource);

                var classificationString = classification.ToString();
                foreach (var pairResult in pairResults)
                {
                    int toSilence = 0;

                    IEnumerable <BamAlignment> alignments;
                    if (!doRealign)
                    {
                        // Pass-through: the pair's alignments are emitted unchanged.
                        alignments = pairResult.Alignments;
                    }
                    else
                    {
                        // Realign only when there is something to realign against, and either the category
                        // is forced or at least one end of the pair sits at a usable position.
                        bool doRealignPair =
                            shouldRealignAtAll && (isHighLikelihoodForRealign ||
                                                   (categoryIsRealignable &&
                                                    (usableBins.IsPositionUsable(pairResult.ReadPair.MinPosition) ||
                                                     usableBins.IsPositionUsable(pairResult.ReadPair.MaxPosition))));

                        if (doRealignPair)
                        {
                            numKept++;
                        }
                        else
                        {
                            numSkippedDueToSites++;
                        }

                        toSilence = ReadsToSilence(classification, binConclusions, pairResult);
                        if (toSilence > 0)
                        {
                            numSilenced++;
                        }

                        alignments = realignHandler.ExtractReads(pairResult, nmCalculator, doRealignPair, toSilence);

                        if (pairResult.ReadPair.Realigned || pairResult.ReadPair.RealignedR1 ||
                            pairResult.ReadPair.RealignedR2)
                        {
                            numRealigned++;
                        }
                    }

                    // toSilence: 1 targets R1, 2 targets R2, 3 targets both. A mate that ended up
                    // realigned is not reported as silenced.
                    var silencedR1    = (toSilence == 1 || toSilence == 3) && !pairResult.ReadPair.RealignedR1;
                    var silencedR2    = (toSilence == 2 || toSilence == 3) && !pairResult.ReadPair.RealignedR2;
                    var readTreatment = ReadTreatment(silencedR1, silencedR2, pairResult);

                    progressTracker.AddOrUpdate(classificationString + ":" + readTreatment, 1,
                                                (key, currentCount) => currentCount + 1);

                    // Materialize once so the sequence is enumerated a single time.
                    var alignmentsList = alignments.ToList();
                    foreach (var bamAlignment in alignmentsList)
                    {
                        if (_geminiOptions.LightDebug)
                        {
                            AddMdTagCountsTags(bamAlignment, pairResult);
                        }

                        bamAlignment.ReplaceOrAddStringTag("XT", readTreatment);
                        bamAlignment.ReplaceOrAddStringTag("XP", classificationString);
                    }

                    allAlignments.AddRange(alignmentsList);
                }
            }

            realignHandler?.Finish();

            pairResults.Clear();
            return allAlignments;
        }
 /// <summary>
 /// Chooses the indel finder implementation that matches the requested realignment mode.
 /// </summary>
 /// <param name="pairAwareRealign">
 /// True to get a <see cref="PairSpecificIndelFinder"/>; false to get a
 /// <see cref="NonPairSpecificIndelFinder"/>.
 /// </param>
 /// <param name="chromosome">Not read by this method.</param>
 /// <param name="indelSource">Not read by this method.</param>
 /// <returns>A new indel finder instance.</returns>
 private IPairSpecificIndelFinder GetIndelFinder(bool pairAwareRealign, string chromosome, IChromosomeIndelSource indelSource)
 {
     if (!pairAwareRealign)
     {
         return new NonPairSpecificIndelFinder();
     }

     return new PairSpecificIndelFinder();
 }