/// <summary>
/// Asserts that every position in the inclusive range [<paramref name="minInBin"/>, <paramref name="maxInBin"/>]
/// reports the expected usable status.
/// </summary>
/// <param name="minInBin">First position to check (inclusive).</param>
/// <param name="maxInBin">Last position to check (inclusive).</param>
/// <param name="usable">The bins object whose per-position status is being verified.</param>
/// <param name="expected">The usable status every position in the range should report.</param>
private void VerifyStatusForPositionsInBin(int minInBin, int maxInBin, UsableBins usable, bool expected)
 {
     var position = minInBin;

     // Walk the range one position at a time; the upper bound is included.
     while (position <= maxInBin)
     {
         VerifyUsableStatus(usable, position, expected);
         position++;
     }
 }
        /// <summary>
        /// Exercises <c>UsableBins.FinalizeConclusions</c> with an extension of 2 against a mocked
        /// <c>IBinConclusions</c> and checks, position by position, which bins end up usable.
        /// </summary>
        public void FinalizeConclusions()
        {
            // The mock reports 1000 bins and maps a position to its bin id by integer-dividing by 10.
            var conclusionsMock = new Mock <IBinConclusions>();

            conclusionsMock.SetupGet(x => x.NumBins).Returns(1000);
            conclusionsMock.Setup(x => x.GetBinId(It.IsAny <int>())).Returns <int>(i => i / 10);

            // Configure per-bin results for bins 1, 10, 15, 25 and 26 (flag meanings are defined by MockBinResult).
            MockBinResult(conclusionsMock, 1, true, true, false);
            MockBinResult(conclusionsMock, 10, true, true, false);
            MockBinResult(conclusionsMock, 15, true, false, false);
            MockBinResult(conclusionsMock, 25, true, true, false);
            MockBinResult(conclusionsMock, 26, true, true, true);

            var usableBins = new UsableBins(conclusionsMock.Object);

            usableBins.FinalizeConclusions(2);

            VerifyStatusForPositionsInBin(0, 9, usableBins, true);        // Bin 0: status propagated from bin 1
            VerifyStatusForPositionsInBin(10, 19, usableBins, true);      // Bin 1: explicitly set
            VerifyStatusForPositionsInBin(20, 29, usableBins, true);      // Bin 2: status propagated from bin 1
            VerifyStatusForPositionsInBin(30, 39, usableBins, false);     // Bin 3: beyond the reach of bin 1's propagation
            VerifyStatusForPositionsInBin(40, 89, usableBins, false);     // Bins 4-8: not usable
            VerifyStatusForPositionsInBin(90, 119, usableBins, true);     // Bins 9, 10, 11: bin 10 and its propagation
            VerifyStatusForPositionsInBin(120, 239, usableBins, false);   // Bins 12-23: not usable
            VerifyStatusForPositionsInBin(240, 249, usableBins, true);    // Bin 24: status propagated from bin 25
            VerifyStatusForPositionsInBin(250, 259, usableBins, true);    // Bin 25: explicitly set
            VerifyStatusForPositionsInBin(260, 269, usableBins, false);   // Bin 26: bin 25 would propagate here, but it has a likely true SNP
            VerifyStatusForPositionsInBin(270, 10000, usableBins, false); // Remainder: never explicitly set, so not usable
        }
        /// <summary>
        /// Aggregates the evidence gathered for one region: finalizes the indels, records mess evidence per bin,
        /// decides which bins are usable, runs every pair-result category through <c>ProcessCategory</c>, updates
        /// the master outcome/indel/progress trackers, and returns the alignments ready to be flushed together
        /// with the edge state for the next region.
        /// </summary>
        /// <param name="indelLookup">Indel evidence for this region; synced with the master lookup via <c>GetAndSyncFinalIndelLookup</c>.</param>
        /// <param name="startPosition">Region start; used in log messages and the edge-state name, and passed through to helpers.</param>
        /// <param name="endPosition">Region end; used in log messages and the edge-state name, and passed through to helpers.</param>
        /// <param name="isFinalTask">
        /// When true, no pairs are deferred to a following region and the returned edge state is an empty one named "Final".
        /// </param>
        /// <param name="regionData">
        /// Supplies the effective min/max positions, the incoming edge state, the bin evidence and the pair results for the region.
        /// </param>
        /// <returns>
        /// The alignments ready to be flushed, plus the edge state (deferred pairs, edge indels, bin evidence) for the next region.
        /// </returns>
        public AggregateRegionResults GetAggregateRegionResults(ConcurrentDictionary <string, IndelEvidence> indelLookup,
                                                                int startPosition,
                                                                int endPosition, bool isFinalTask, RegionDataForAggregation regionData)
        {
            if (_geminiOptions.LightDebug)
            {
                Logger.WriteToLog(
                    $"Started processing for region {_chrom}:{startPosition}-{endPosition}.");
            }

            var adjustedStartPosition = regionData.EffectiveMinPosition;
            // Pairs whose max position lies beyond this threshold are handed to the next block (unless this is the final task).
            var edgeThresholdOrig     = Math.Max(1, regionData.EffectiveMaxPosition - 5000);
            var finalIndelLookup      = GetAndSyncFinalIndelLookup(indelLookup, _masterIndelLookup);
            var edgeState             = regionData.EdgeState;
            var nextEdgeMinPosition   = int.MaxValue;

            var finalizedIndels         = FinalizeIndels(finalIndelLookup, _chrReference, regionData.EffectiveMaxPosition);
            var finalizedIndelsForChrom = GetFinalizedIndelsForChrom(_chrom, finalizedIndels, edgeState);

            IChromosomeIndelSource indelSource = null;

            var       messySiteWidth = _geminiOptions.MessySiteWidth;
            const int binsToExtendTo = 2; // Treated as <, so 2 means we get to extend status to one on either side

            var binEvidence    = regionData.BinEvidence;
            var binConclusions = new BinConclusions(binEvidence, _geminiOptions.CollectDepth, trackDirectionalMess: _geminiOptions.SilenceDirectionalMessReads, trackMapqMess: _geminiOptions.SilenceMessyMapMessReads);
            var numBins        = binEvidence.NumBins;

            // With no finalized indels there is nothing to realign against.
            bool shouldRealignAtAll = finalizedIndelsForChrom.Any();

            var imperfectFreqThreshold   = _geminiOptions.ImperfectFreqThreshold;
            var indelRegionfreqThreshold = _geminiOptions.IndelRegionFreqThreshold;
            var messySiteThreshold       = _geminiOptions.MessySiteThreshold;

            var numRetrievedFromLastBlock = 0;
            var numPairsSentToNextBlock   = 0;
            var pairResultsForNextBlock   = new Dictionary <PairClassification, List <PairResult> >();

            // Work on a private copy of the region's pair results so the lists in regionData are not mutated here.
            var pairResultLookup = new Dictionary <PairClassification, List <PairResult> >();

            foreach (var key in regionData.PairResultLookup.Keys)
            {
                if (!pairResultLookup.TryGetValue(key, out var copiedResults))
                {
                    copiedResults = new List <PairResult>();
                    pairResultLookup.Add(key, copiedResults);
                }

                copiedResults.AddRange(regionData.PairResultLookup[key]);
            }

            foreach (var category in pairResultLookup)
            {
                // Classification-level flags are the same for every pair in the category, so compute them once.
                var isMessy          = TypeClassifier.MessyTypes.Contains(category.Key);
                var isIndel          = TypeClassifier._indelTypes.Contains(category.Key);
                var isSingleMismatch = _geminiOptions.AvoidLikelySnvs &&
                                       (category.Key == PairClassification.SingleMismatchStitched ||
                                        category.Key == PairClassification.UnstitchSingleMismatch);
                var isForwardOnlyMessy = IsForwardMessy(category.Key);
                var isReverseOnlyMessy = IsReverseMessy(category.Key);
                var isMapMessy         = IsSuspiciousMapping(category.Key);

                foreach (var pairResult in category.Value)
                {
                    // If on the edge, kick it over to the edge lookup.
                    if (!isFinalTask && pairResult.ReadPair.MaxPosition > edgeThresholdOrig)
                    {
                        numPairsSentToNextBlock++;
                        if (!pairResultsForNextBlock.TryGetValue(category.Key, out var deferredResults))
                        {
                            deferredResults = new List <PairResult>();
                            pairResultsForNextBlock.Add(category.Key, deferredResults);
                        }

                        deferredResults.Add(pairResult);

                        nextEdgeMinPosition = Math.Min(nextEdgeMinPosition, pairResult.ReadPair.MinPosition);
                    }

                    // Still collect evidence even if it's edge, because that could impact this block as well as next block.
                    binEvidence.AddMessEvidence(isMessy, pairResult, isIndel, isSingleMismatch, isForwardOnlyMessy,
                                                isReverseOnlyMessy, isMapMessy);
                }
            }

            numRetrievedFromLastBlock = AddAlignmentsFromEdgeState(edgeState, pairResultLookup, numRetrievedFromLastBlock);

            var finalizedBins = new UsableBins(binConclusions);

            if (shouldRealignAtAll)
            {
                binConclusions.AddIndelEvidence(finalizedIndelsForChrom, binsToExtendTo);
                binConclusions.ProcessRegions(messySiteThreshold, imperfectFreqThreshold,
                                              _geminiOptions.RegionDepthThreshold, indelRegionfreqThreshold, binsToExtendTo, _geminiOptions.DirectionalMessThreshold);
                finalizedBins.FinalizeConclusions(binsToExtendTo);
            }

            using (var snippetSource = _dataSourceFactory.CreateGenomeSnippetSource(_chrom, _chrReference))
            {
                indelSource =
                    _dataSourceFactory.GetChromosomeIndelSource(finalizedIndelsForChrom, snippetSource);
            }

            // Pairs deferred to the next block must not be processed (or flushed) by this block.
            foreach (var kvp in pairResultsForNextBlock)
            {
                foreach (var pairResult in kvp.Value)
                {
                    pairResultLookup[kvp.Key].Remove(pairResult);
                }
            }

            var allAlignments  = new List <BamAlignment>();
            var outcomesLookup = new Dictionary <HashableIndel, int[]>();

            var numSkippedDueToSites = 0;
            var numKept      = 0;
            var numRealigned = 0;
            var numSilenced  = 0;

            var snowballCategories = _realignmentOptions.CategoriesForSnowballing;
            var doSnowball         = snowballCategories.Any();

            // Snowball categories are processed first (and removed from the lookup) so that their outcomes
            // can be used to filter the indel set before the remaining categories are processed.
            foreach (var category in snowballCategories)
            {
                if (pairResultLookup.Remove(category, out var categoryReads))
                {
                    allAlignments.AddRange(ProcessCategory(_categoriesForRealignment,
                                                           indelSource, shouldRealignAtAll,
                                                           outcomesLookup, ref numSkippedDueToSites, ref numKept, ref numRealigned, ref numSilenced,
                                                           categoryReads, category, binEvidence, _progressTracker, binConclusions, finalizedBins, startPosition, endPosition));
                }
            }

            List <HashableIndel> superFinalizedIndels;

            if (doSnowball)
            {
                superFinalizedIndels = GetSuperFinalizedIndelsAfterSnowball(finalizedIndelsForChrom, outcomesLookup);

                if (_geminiOptions.Debug)
                {
                    Logger.WriteToLog(
                        $"After snowballing for region {_chrom}:{startPosition}-{endPosition}, filtered down to {superFinalizedIndels.Count} indels from {finalizedIndelsForChrom.Count} ({finalIndelLookup.Count} preliminary indels).");
                }

                // Rebuild the indel source from the filtered indel set for the remaining categories.
                using (var snippetSource = _dataSourceFactory.CreateGenomeSnippetSource(_chrom, _chrReference))
                {
                    indelSource =
                        _dataSourceFactory.GetChromosomeIndelSource(superFinalizedIndels, snippetSource);
                }

                if (_geminiOptions.RecalculateUsableSitesAfterSnowball)
                {
                    binConclusions.ResetIndelRegions();

                    foreach (var indel in superFinalizedIndels)
                    {
                        var bin = (indel.ReferencePosition - adjustedStartPosition) / messySiteWidth;
                        binConclusions.SetIndelRegionTrue(bin);

                        // Extend the indel-region flag to lower-numbered bins, stopping at the first negative index.
                        for (int j = 0; j < binsToExtendTo; j++)
                        {
                            var binIndex = bin - j;
                            if (binIndex >= 0)
                            {
                                binConclusions.SetIndelRegionTrue(binIndex);
                            }
                            else
                            {
                                break;
                            }
                        }

                        // Extend to higher-numbered bins, stopping as soon as SetIndelRegionTrue reports failure.
                        for (int j = 0; j < binsToExtendTo; j++)
                        {
                            var binIndex = bin + j;
                            if (!binConclusions.SetIndelRegionTrue(binIndex))
                            {
                                break;
                            }
                        }
                    }

                    finalizedBins.FinalizeConclusions(binsToExtendTo);
                }
            }
            else
            {
                superFinalizedIndels = finalizedIndelsForChrom;
            }

            // TODO pull out the allocs below, or ideally actually remove them from realign pair handler or use something different altogether
            foreach (var category in pairResultLookup)
            {
                if (snowballCategories.Contains(category.Key))
                {
                    continue;
                }

                allAlignments.AddRange(ProcessCategory(_categoriesForRealignment, indelSource,
                                                       shouldRealignAtAll,
                                                       outcomesLookup, ref numSkippedDueToSites, ref numKept, ref numRealigned, ref numSilenced, category.Value,
                                                       category.Key, binEvidence, _progressTracker, binConclusions, finalizedBins, startPosition, endPosition));
            }

            var edgeHits = new Dictionary <int, int>();
            var edgeSingleMismatchHits = new Dictionary <int, int>();
            var edgeIndelHits          = new Dictionary <int, int>();
            var edgeMessyHits          = new Dictionary <int, int>();

            PopulateEdgeHitsAndLogBins(numBins, adjustedStartPosition, messySiteWidth, nextEdgeMinPosition, binEvidence,
                                       edgeHits, edgeSingleMismatchHits, edgeIndelHits, edgeMessyHits, startPosition, binConclusions, finalizedBins);

            UpdateMasterOutcomes(_masterOutcomesLookup, outcomesLookup);

            foreach (var hashableIndel in superFinalizedIndels)
            {
                _masterFinalIndels.AddOrUpdate(hashableIndel, 1, (h, n) => n + 1);
            }

            // Each tracker entry is created with this region's count, or incremented by it if already present.
            var numFlushed = allAlignments.Count;

            _progressTracker.AddOrUpdate("Flushed", numFlushed,
                                         (x, currentCount) => currentCount + numFlushed);
            _progressTracker.AddOrUpdate("Sent To Next Block", numPairsSentToNextBlock,
                                         (x, currentCount) => currentCount + numPairsSentToNextBlock);
            _progressTracker.AddOrUpdate("Retrieved from Past Block", numRetrievedFromLastBlock,
                                         (x, currentCount) => currentCount + numRetrievedFromLastBlock);
            _progressTracker.AddOrUpdate("Realigned", numRealigned,
                                         (x, currentCount) => currentCount + numRealigned);
            _progressTracker.AddOrUpdate("Attempts", numKept,
                                         (x, currentCount) => currentCount + numKept);
            _progressTracker.AddOrUpdate("Skipped", numSkippedDueToSites,
                                         (x, currentCount) => currentCount + numSkippedDueToSites);
            _progressTracker.AddOrUpdate("Silenced", numSilenced,
                                         (x, currentCount) => currentCount + numSilenced);

            pairResultLookup.Clear();
            Logger.WriteToLog(
                $"Finished processing for region {_chrom}:{startPosition}-{endPosition}. {numFlushed} alignments flushed, " +
                $"{numPairsSentToNextBlock} sent to next block, {numRetrievedFromLastBlock} retrieved from {regionData.EdgeState?.Name}. " +
                $"Realigned {numRealigned}/{numKept} attempts ({numSkippedDueToSites} pairs skipped realignment), silenced {numSilenced} messy mates.");


            return(new AggregateRegionResults()
            {
                // The final task has nothing to hand over; otherwise pass along the deferred pairs, the indels
                // positioned after the earliest deferred pair, and the bin evidence collected here.
                EdgeState = isFinalTask
                    ? new EdgeState()
                {
                    Name = "Final"
                }
                    : new EdgeState()
                {
                    EdgeAlignments = pairResultsForNextBlock,
                    EdgeIndels = finalizedIndelsForChrom.Where(y => y.ReferencePosition > nextEdgeMinPosition)
                                 .ToList(),
                    EffectiveMinPosition = nextEdgeMinPosition,
                    Name = $"{startPosition}-{endPosition}",
                    BinEvidence = binEvidence
                },
                AlignmentsReadyToBeFlushed = allAlignments
            });
        }
        /// <summary>
        /// When <c>LogRegionsAndRealignments</c> is enabled, writes one comma-separated "BINCOUNTS" log line for
        /// every bin with more than 10 total hits, listing the bin's evidence counts alongside the conclusions
        /// drawn for it. Does nothing when the option is disabled.
        /// </summary>
        /// <remarks>
        /// Despite the name, this method does not populate any of the edge-hit dictionaries: <paramref name="edgeThreshold"/>,
        /// <paramref name="edgeHits"/>, <paramref name="edgeSingleMismatchHits"/>, <paramref name="edgeIndelHits"/>,
        /// <paramref name="edgeMessyHits"/> and <paramref name="startPosition"/> are accepted but not read.
        /// </remarks>
        /// <param name="numBins">Number of bins to scan, starting from bin 0.</param>
        /// <param name="adjustedStartPosition">Position of the start of bin 0.</param>
        /// <param name="messySiteWidth">Width of each bin in positions.</param>
        /// <param name="edgeThreshold">Not read by this method.</param>
        /// <param name="binEvidence">Source of the per-bin hit counts.</param>
        /// <param name="edgeHits">Not read or written by this method.</param>
        /// <param name="edgeSingleMismatchHits">Not read or written by this method.</param>
        /// <param name="edgeIndelHits">Not read or written by this method.</param>
        /// <param name="edgeMessyHits">Not read or written by this method.</param>
        /// <param name="startPosition">Not read by this method.</param>
        /// <param name="binConclusions">Source of the per-bin conclusions.</param>
        /// <param name="usableBins">Used to report whether each bin's start position is usable.</param>
        private void PopulateEdgeHitsAndLogBins(int numBins, int adjustedStartPosition, int messySiteWidth, int edgeThreshold,
                                                IBinEvidence binEvidence, Dictionary <int, int> edgeHits, Dictionary <int, int> edgeSingleMismatchHits, Dictionary <int, int> edgeIndelHits,
                                                Dictionary <int, int> edgeMessyHits, int startPosition, BinConclusions binConclusions, UsableBins usableBins)
        {
            // Logging is the only effect of this method, so skip the whole scan when it is switched off.
            if (!_geminiOptions.LogRegionsAndRealignments)
            {
                return;
            }

            // No bin is ever flagged as in-edge here; the value is kept so the logged column layout is unchanged.
            const bool inEdge = false;

            for (int binId = 0; binId < numBins; binId++)
            {
                // Read once and reuse for both the filter and the log line (assumes GetAllHits is a side-effect-free read).
                var allHits = binEvidence.GetAllHits(binId);

                if (allHits > 10)
                {
                    var binStart = adjustedStartPosition + (binId * messySiteWidth);

                    var binCounts =
                        $"{binId},{inEdge},{binStart},{binStart + messySiteWidth},{allHits},{usableBins.IsPositionUsable(binStart)},{binEvidence.GetSingleMismatchHit(binId)}," +
                        $"{binConclusions.GetProbableTrueSnvRegion(binId)},{binEvidence.GetIndelHit(binId)},{binConclusions.GetIndelRegionHit(binId)}," +
                        $"{binEvidence.GetMessyHit(binId)},{binConclusions.GetIsMessyEnough(binId)},{binEvidence.GetForwardMessyRegionHit(binId)},{binConclusions.GetFwdMessyStatus(binId)},{binEvidence.GetReverseMessyRegionHit(binId)},{binConclusions.GetRevMessyStatus(binId)},{binEvidence.GetMapqMessyHit(binId)},{binConclusions.GetMapqMessyStatus(binId)}";

                    // TODO consider writing this to a proper output file
                    Logger.WriteToLog("BINCOUNTS\t" + binCounts);
                }
            }
        }
        /// <summary>
        /// Processes every pair result of a single classification: optionally realigns and/or stitches the pair,
        /// determines which mates to silence, tags the resulting alignments with their treatment ("XT") and
        /// classification ("XP"), and returns all resulting alignments. The input list is cleared before returning.
        /// </summary>
        /// <param name="categoriesForRealignment">Classifications that are eligible for realignment.</param>
        /// <param name="indelSource">Indel source handed to the realign pair handler.</param>
        /// <param name="shouldRealignAtAll">When false, no pair is realigned regardless of its classification.</param>
        /// <param name="outcomesLookup">Per-indel outcome lookup handed to the realign pair handler.</param>
        /// <param name="numSkippedDueToSites">Incremented for each pair that goes through the handler without realignment.</param>
        /// <param name="numKept">Incremented for each pair for which realignment is attempted.</param>
        /// <param name="numRealigned">Incremented for each pair flagged as realigned (as a pair, or on R1 or R2) after extraction.</param>
        /// <param name="numSilenced">Incremented for each pair with at least one mate selected for silencing.</param>
        /// <param name="pairResults">The pair results to process; cleared on exit.</param>
        /// <param name="classification">The classification shared by all of <paramref name="pairResults"/>.</param>
        /// <param name="binEvidence">Not read by this method.</param>
        /// <param name="progressTracker">Receives a per-"classification:treatment" count for every processed pair.</param>
        /// <param name="binConclusions">Passed to <c>ReadsToSilence</c> to decide which mates to silence.</param>
        /// <param name="usableBins">Used to check whether a pair's min or max position is usable for realignment.</param>
        /// <param name="startPosition">Not read by this method.</param>
        /// <param name="endPosition">Not read by this method.</param>
        /// <returns>All alignments produced for the category, in processing order.</returns>
        private List <BamAlignment> ProcessCategory(
            List <PairClassification> categoriesForRealignment, IChromosomeIndelSource indelSource,
            bool shouldRealignAtAll, Dictionary <HashableIndel, int[]> outcomesLookup, ref int numSkippedDueToSites,
            ref int numKept, ref int numRealigned, ref int numSilenced,
            List <PairResult> pairResults, PairClassification classification, IBinEvidence binEvidence,
            ConcurrentDictionary <string, int> progressTracker, BinConclusions binConclusions, UsableBins usableBins, int startPosition, int endPosition)
        {
            var allAlignments = new List <BamAlignment>();

            // When the option is on, these classifications are realigned even if their positions are not in usable bins.
            var isHighLikelihoodForRealign = false;

            if (_geminiOptions.ForceHighLikelihoodRealigners)
            {
                switch (classification)
                {
                    case PairClassification.Disagree:
                    case PairClassification.MessyStitched:
                    case PairClassification.MessySplit:
                    case PairClassification.UnstitchMessy:
                    case PairClassification.UnstitchIndel:
                        isHighLikelihoodForRealign = true;
                        break;
                }
            }

            var doRealign = false;
            ReadPairRealignerAndCombiner realignHandler = null;
            var alreadyStitched       = ClassificationIsStitched(classification);
            var doStitch              = !_geminiOptions.SkipStitching && TypeClassifier.ClassificationIsStitchable(classification);
            var categoryIsRealignable = categoriesForRealignment.Contains(classification);

            // A handler is needed whenever the category may be realigned or stitched.
            if (categoryIsRealignable || doStitch)
            {
                doRealign = true;

                realignHandler = _bamRealignmentFactory.GetRealignPairHandler(doStitch,
                                                                              alreadyStitched,
                                                                              _realignmentOptions.PairAwareEverything ||
                                                                              ClassificationIsPairAwareRealignable(classification),
                                                                              _refIdMapping,
                                                                              new ReadStatusCounter(), false, indelSource, _chrom, new Dictionary <string, IndelEvidence>(),
                                                                              ClassificationHasIndels(classification), outcomesLookup
                                                                              , SkipRestitchIfUnchanged(classification));
            }

            using (var snippetSource = _dataSourceFactory.CreateGenomeSnippetSource(_chrom, _chrReference))
                using (var singleSnippetSource = new ReusableSnippetSource(snippetSource))
                {
                    var nmCalculator = new NmCalculator(singleSnippetSource);

                    var classificationString = classification.ToString();
                    foreach (var pairResult in pairResults)
                    {
                        // 0 = silence nothing, 1 = R1, 2 = R2, 3 = both (see the silencedR1/silencedR2 checks below).
                        int toSilence = 0;

                        IEnumerable <BamAlignment> alignments;
                        if (!doRealign)
                        {
                            // Nothing to do for this category: pass the original alignments through untouched.
                            alignments = pairResult.Alignments;
                        }
                        else
                        {
                            // Realign only if realignment is enabled for the region and the pair is either in a
                            // force-realign category or is realignable with at least one end in a usable bin.
                            bool doRealignPair =
                                shouldRealignAtAll && (isHighLikelihoodForRealign ||
                                                       (categoryIsRealignable &&
                                                        (usableBins.IsPositionUsable(pairResult.ReadPair.MinPosition) ||
                                                         usableBins.IsPositionUsable(pairResult.ReadPair.MaxPosition))));

                            if (doRealignPair)
                            {
                                numKept++;
                            }
                            else
                            {
                                numSkippedDueToSites++;
                            }

                            toSilence = ReadsToSilence(classification, binConclusions, pairResult);
                            if (toSilence > 0)
                            {
                                numSilenced++;
                            }

                            alignments = realignHandler.ExtractReads(pairResult, nmCalculator, doRealignPair, toSilence);

                            if (pairResult.ReadPair.Realigned || pairResult.ReadPair.RealignedR1 ||
                                pairResult.ReadPair.RealignedR2)
                            {
                                numRealigned++;
                            }
                        }

                        // A mate only counts as silenced if it was selected for silencing and was not realigned.
                        var silencedR1    = (toSilence == 1 || toSilence == 3) && !pairResult.ReadPair.RealignedR1;
                        var silencedR2    = (toSilence == 2 || toSilence == 3) && !pairResult.ReadPair.RealignedR2;
                        var readTreatment = ReadTreatment(silencedR1, silencedR2, pairResult);

                        progressTracker.AddOrUpdate(classificationString + ":" + readTreatment, 1,
                                                    (x, currentCount) => currentCount + 1);

                        // Materialize once so the alignments are enumerated a single time for tagging and collection.
                        var alignmentsList = alignments.ToList();
                        foreach (var bamAlignment in alignmentsList)
                        {
                            if (_geminiOptions.LightDebug)
                            {
                                AddMdTagCountsTags(bamAlignment, pairResult);
                            }

                            bamAlignment.ReplaceOrAddStringTag("XT", readTreatment);
                            bamAlignment.ReplaceOrAddStringTag("XP", classificationString);
                        }

                        allAlignments.AddRange(alignmentsList);
                    }
                }

            realignHandler?.Finish();

            pairResults.Clear();
            return(allAlignments);
        }
 /// <summary>
 /// Asserts that <paramref name="usable"/> reports the expected usable status for a single position.
 /// </summary>
 /// <param name="usable">The bins object being queried.</param>
 /// <param name="position">The position whose status is checked.</param>
 /// <param name="expected">The status the position is expected to report.</param>
 private void VerifyUsableStatus(UsableBins usable, int position, bool expected)
 {
     var actual = usable.IsPositionUsable(position);

     Assert.Equal(expected, actual);
 }