/// <summary>
/// Creates and links the per-classification blocks for one region, wires up the
/// aggregate -> intermediate writer -> writer chain, and schedules a continuation that
/// prepares the region's bin evidence and posts it to the aggregation block once every
/// linked block (and the previous region's edge task, if one is registered) has finished.
/// </summary>
/// <param name="pairClassifierBlock">Source block whose output is routed to the per-classification blocks.</param>
/// <param name="startPosition">Region start; also the key under which this region's edge state and edge task are stored.</param>
/// <param name="endPosition">Region end.</param>
/// <param name="edgeStates">Shared edge states keyed by region start. The entry for <paramref name="prevBlockStart"/> is removed and consumed here; the entry for <paramref name="startPosition"/> is written by the intermediate writer block.</param>
/// <param name="edgeToWaitOn">Shared tasks keyed by region start. The task registered for <paramref name="prevBlockStart"/> is waited on before the region data is prepared.</param>
/// <param name="prevBlockStart">Start position of the preceding region.</param>
/// <param name="isFinalTask">When true, no edge task is registered for a following region.</param>
/// <returns>Two tasks: the continuation that prepares and posts the region data, and the completion of the writer block.</returns>
/// <exception cref="Exception">A task could not be added to the set of tasks to wait for.</exception>
public Task[] GetAndLinkAllClassificationBlocksWithEcFinalization(
    ISourceBlock<PairResult> pairClassifierBlock,
    int startPosition,
    int endPosition,
    ConcurrentDictionary<int, EdgeState> edgeStates,
    ConcurrentDictionary<int, Task> edgeToWaitOn,
    int prevBlockStart,
    bool isFinalTask = false)
{
    if (_lightDebug)
    {
        Logger.WriteToLog(
            $"Creating tasks for region {_chrom}:{startPosition}-{endPosition}.");
    }

    var allToWaitFor = new ConcurrentDictionary<Task, int>();
    var messySiteWidth = _geminiOptions.MessySiteWidth;
    var effectiveMax = 0;
    int adjustedStartPosition = RoundedStartPosition(startPosition, messySiteWidth);

    var pairResultLookup = new ConcurrentDictionary<PairClassification, List<PairResult>>();
    var indelLookup = new ConcurrentDictionary<string, IndelEvidence>();

    var regionLength = endPosition - adjustedStartPosition;
    var numBins = (regionLength / messySiteWidth) + 1000;

    var totalBinCounts = InitializeTotalBinCounts(numBins);
    var singleMismatchBinCounts = InitializeSingleMismatchBinCounts(numBins);

    var actBlockFactory = _actionBlockFactoryProvider.GetFactory(startPosition, endPosition,
        adjustedStartPosition, totalBinCounts, singleMismatchBinCounts, numBins, allToWaitFor);

    foreach (var classification in classifications)
    {
        var toWaitFor = GetAndLinkPerClassificationBlocksWithEcFinalization(pairClassifierBlock, classification, indelLookup);

        var doStitch = !_geminiOptions.SkipStitching && TypeClassifier.ClassificationIsStitchable(classification);
        var categoryIsRealignable = _categoriesForRealignment.Contains(classification);

        // Even if we're not going to realign these reads, they may still be useful for bin evidence, so don't give them the immediate flush
        var shouldCollectBinEvidence = TypeClassifier.MessyTypes.Contains(classification) ||
                                       TypeClassifier._indelTypes.Contains(classification);

        var isSingleMismatch = _geminiOptions.AvoidLikelySnvs &&
                               (classification == PairClassification.SingleMismatchStitched ||
                                classification == PairClassification.UnstitchSingleMismatch);

        if (!(categoryIsRealignable || doStitch || shouldCollectBinEvidence))
        {
            var actBlock = actBlockFactory.GetEarlyFlushBlock(classification, isSingleMismatch);
            foreach (var transformBlock in toWaitFor)
            {
                transformBlock.LinkTo(actBlock, new DataflowLinkOptions() { PropagateCompletion = true });
            }

            // Drop tasks that have already completed before registering the new one.
            var toRemove = allToWaitFor.Keys.Where(x => x.IsCompleted);
            foreach (var task in toRemove)
            {
                allToWaitFor.TryRemove(task, out _);
            }

            if (!allToWaitFor.TryAdd(actBlock.Completion, 1))
            {
                throw new Exception("Failed to add task.");
            }
        }
        else
        {
            var actBlock = actBlockFactory.GetActionablePairsBlock(classification, pairResultLookup);
            foreach (var transformBlock in toWaitFor)
            {
                transformBlock.LinkTo(actBlock, new DataflowLinkOptions() { PropagateCompletion = true });
            }

            if (!allToWaitFor.TryAdd(actBlock.Completion, 1))
            {
                throw new Exception("Failed to add task.");
            }
        }
    }

    var finalTask = AggregateTask(indelLookup, startPosition, endPosition, isFinalTask, _progressTracker);

    // Publishes this region's edge state under its start position and forwards the alignments to flush.
    var intermediateWriterTask = new TransformBlock<AggregateRegionResults, List<BamAlignment>>(results =>
    {
        edgeStates.AddOrUpdate(startPosition, results.EdgeState, (s, e) =>
        {
            Logger.WriteWarningToLog($"Edge state already exists: {s}.");
            return results.EdgeState;
        });
        return results.AlignmentsReadyToBeFlushed;
    }, new ExecutionDataflowBlockOptions() { EnsureOrdered = false });

    var finalWriteTask = actBlockFactory.GetWriterBlock();

    finalTask.LinkTo(intermediateWriterTask, new DataflowLinkOptions() { PropagateCompletion = true });
    intermediateWriterTask.LinkTo(finalWriteTask, new DataflowLinkOptions() { PropagateCompletion = true });

    // Single atomic lookup: the dictionary is shared, so ContainsKey followed by the indexer
    // could throw if the entry were removed in between.
    if (edgeToWaitOn.TryGetValue(prevBlockStart, out var previousEdgeTask))
    {
        if (!allToWaitFor.TryAdd(previousEdgeTask, 1))
        {
            throw new Exception("Failed to add task for previous edge.");
        }
    }
    else
    {
        Logger.WriteToLog($"At {startPosition}, prev block is {prevBlockStart}, nothing to wait on.");
    }

    if (!isFinalTask)
    {
        edgeToWaitOn.AddOrUpdate(startPosition, intermediateWriterTask.Completion, (s, e) =>
        {
            Logger.WriteWarningToLog($"Edge state task already exists: {s}.");
            return intermediateWriterTask.Completion;
        });
    }

    var allTasks = allToWaitFor.Keys.ToList();

    var t = Task.WhenAll(allTasks)
        .ContinueWith(_ =>
        {
            if (_lightDebug)
            {
                Logger.WriteToLog($"Preparing for aggregation for region {_chrom}:{startPosition}-{endPosition}.");
            }

            if (allTasks.Any(x => x.Status != TaskStatus.RanToCompletion))
            {
                Logger.WriteToLog("ERROR: Task did not complete.");
                foreach (var task in allTasks)
                {
                    Logger.WriteToLog($"{task.Id}\t{task.Status}\t{task.Exception}");
                    if (task.Status == TaskStatus.Faulted)
                    {
                        // Pass the exception along to the final task so it can be forced to error out.
                        finalTask = ForceFailFinalTask(intermediateWriterTask, task.Exception);
                    }
                }
            }

            var numStillToProcess = 0;
            foreach (var item in pairResultLookup)
            {
                // Max() throws on an empty sequence; an exception here would fault this
                // continuation before the region data is posted to the aggregation block.
                if (item.Value.Count == 0)
                {
                    continue;
                }

                effectiveMax = Math.Max(effectiveMax, item.Value.Max(x => x.ReadPair.MaxPosition));
                numStillToProcess += item.Value.Count;
            }

            if (_lightDebug)
            {
                Logger.WriteToLog($"Preparing edge state info for region {_chrom}:{startPosition}-{endPosition}.");
            }

            EdgeState edgeState = null;
            var extraBins = 0;

            // Atomic take: only dereference the edge state if this call actually removed it.
            if (edgeStates.TryRemove(prevBlockStart, out edgeState))
            {
                if (edgeState.EdgeIndels.Any() || edgeState.EdgeAlignments.Any())
                {
                    // Extend the region start backwards so the bins also cover the previous region's edge.
                    var newAdjustedStartPosition = RoundedStartPosition(
                        Math.Min(adjustedStartPosition, edgeState.EffectiveMinPosition), messySiteWidth);
                    extraBins = (adjustedStartPosition - newAdjustedStartPosition) / messySiteWidth;
                    adjustedStartPosition = newAdjustedStartPosition;
                }
            }

            allToWaitFor.Clear();
            allTasks.Clear();

            if (_lightDebug)
            {
                var totalReadsInRegion = _categoryLookup.Values.Sum();
                Console.WriteLine($"STILL TO PROCESS IN REGION ({startPosition}-{endPosition} (eff:{effectiveMax})): {numStillToProcess}");
                Console.WriteLine(
                    $"READS IN REGION ({startPosition}-{endPosition} (eff:{effectiveMax})): {totalReadsInRegion}");
                foreach (var kvp in _categoryLookup)
                {
                    Console.WriteLine(
                        $"CATEGORYCOUNT ({startPosition}-{endPosition} (eff:{effectiveMax})): {kvp.Key}: {kvp.Value} ({Math.Round(kvp.Value * 100 / (float)totalReadsInRegion)}%)");
                }
            }

            // Copy the collected counts into arrays sized for the (possibly extended) region,
            // shifted right by the number of bins that were prepended.
            var totalNumBins = numBins + extraBins;
            var allHits = new uint[totalNumBins];
            var singleMismatchHits = new uint[totalNumBins];

            for (var i = 0; i < totalNumBins; i++)
            {
                var newBin = i + extraBins;
                if (newBin >= totalNumBins)
                {
                    break;
                }

                if (totalBinCounts[i] > 0)
                {
                    allHits[newBin] = totalBinCounts[i];
                    singleMismatchHits[newBin] = singleMismatchBinCounts[i];
                }
            }

            if (_lightDebug)
            {
                Logger.WriteToLog(
                    $"Creating bin evidence for region {_chrom}:{startPosition}-{endPosition}.");
            }

            var binEvidence = _binEvidenceFactory.GetBinEvidence(totalNumBins, adjustedStartPosition);
            binEvidence.SetSingleMismatchHits(singleMismatchHits);
            binEvidence.AddAllHits(allHits);

            if (_lightDebug)
            {
                Logger.WriteToLog($"Adding edge hits for region {_chrom}:{startPosition}-{endPosition}.");
            }

            if (edgeState != null)
            {
                var edgeBinInNew = binEvidence.GetBinId(edgeState.EffectiveMinPosition);
                var edgeBinInOld = edgeState.BinEvidence.GetBinId(edgeState.EffectiveMinPosition);
                AddEdgeHits(edgeState, binEvidence, edgeBinInOld, edgeBinInOld - edgeBinInNew);
            }

            if (_lightDebug)
            {
                Logger.WriteToLog($"Done adding edge hits for region {_chrom}:{startPosition}-{endPosition}.");
            }

            var finalState = new RegionDataForAggregation()
            {
                BinEvidence = binEvidence,
                PairResultLookup = pairResultLookup,
                EdgeState = edgeState,
                EffectiveMaxPosition = effectiveMax,
                EffectiveMinPosition = adjustedStartPosition
            };

            finalTask.Post(finalState);
            finalTask.Complete();
        });

    return new[] { t, finalWriteTask.Completion };
}
/// <summary>
/// Aggregates one region: finalizes the region's indels, collects mess evidence for every pair,
/// sets aside pairs near the region's end for the next block, processes the remaining pairs
/// per category (snowball categories first), and updates the shared outcome, indel and
/// progress trackers.
/// </summary>
/// <param name="indelLookup">Indel evidence collected for this region; synced with the master lookup.</param>
/// <param name="startPosition">Region start (used for logging, processing calls and the edge state name).</param>
/// <param name="endPosition">Region end (used for logging, processing calls and the edge state name).</param>
/// <param name="isFinalTask">When true, no pairs are held back for a following block and the returned edge state is just named "Final".</param>
/// <param name="regionData">Bin evidence, pair results, previous edge state and effective min/max positions for the region.</param>
/// <returns>The alignments ready to be flushed and the edge state to hand to the next block.</returns>
public AggregateRegionResults GetAggregateRegionResults(
    ConcurrentDictionary<string, IndelEvidence> indelLookup,
    int startPosition,
    int endPosition,
    bool isFinalTask,
    RegionDataForAggregation regionData)
{
    if (_geminiOptions.LightDebug)
    {
        Logger.WriteToLog(
            $"Started processing for region {_chrom}:{startPosition}-{endPosition}.");
    }

    var adjustedStartPosition = regionData.EffectiveMinPosition;
    // Pairs whose max position is past this threshold are handed to the next block.
    var edgeThresholdOrig = Math.Max(1, regionData.EffectiveMaxPosition - 5000);
    var finalIndelLookup = GetAndSyncFinalIndelLookup(indelLookup, _masterIndelLookup);
    var edgeState = regionData.EdgeState;
    var nextEdgeMinPosition = int.MaxValue;

    var finalizedIndels = FinalizeIndels(finalIndelLookup, _chrReference, regionData.EffectiveMaxPosition);
    var finalizedIndelsForChrom = GetFinalizedIndelsForChrom(_chrom, finalizedIndels, edgeState);

    IChromosomeIndelSource indelSource = null;

    var messySiteWidth = _geminiOptions.MessySiteWidth;
    const int binsToExtendTo = 2; // Treated as <, so 2 means we get to extend status to one on either side

    var binEvidence = regionData.BinEvidence;
    var binConclusions = new BinConclusions(binEvidence, _geminiOptions.CollectDepth,
        trackDirectionalMess: _geminiOptions.SilenceDirectionalMessReads,
        trackMapqMess: _geminiOptions.SilenceMessyMapMessReads);
    var numBins = binEvidence.NumBins;

    bool shouldRealignAtAll = finalizedIndelsForChrom.Any();

    var imperfectFreqThreshold = _geminiOptions.ImperfectFreqThreshold;
    var indelRegionfreqThreshold = _geminiOptions.IndelRegionFreqThreshold;
    var messySiteThreshold = _geminiOptions.MessySiteThreshold;

    var numRetrievedFromLastBlock = 0;
    var numPairsSentToNextBlock = 0;
    var pairResultsForNextBlock = new Dictionary<PairClassification, List<PairResult>>();

    // Work on a private copy of the lookup so the lists can be mutated freely below.
    var pairResultLookup = new Dictionary<PairClassification, List<PairResult>>();
    foreach (var source in regionData.PairResultLookup)
    {
        pairResultLookup.Add(source.Key, new List<PairResult>(source.Value));
    }

    foreach (var category in pairResultLookup)
    {
        var isMessy = TypeClassifier.MessyTypes.Contains(category.Key);
        var isIndel = TypeClassifier._indelTypes.Contains(category.Key);
        var isSingleMismatch = _geminiOptions.AvoidLikelySnvs &&
                               (category.Key == PairClassification.SingleMismatchStitched ||
                                category.Key == PairClassification.UnstitchSingleMismatch);
        var isForwardOnlyMessy = IsForwardMessy(category.Key);
        var isReverseOnlyMessy = IsReverseMessy(category.Key);
        var isMapMessy = IsSuspiciousMapping(category.Key);

        foreach (var pairResult in category.Value)
        {
            // If on the edge, kick it over to the edge lookup.
            if (!isFinalTask && pairResult.ReadPair.MaxPosition > edgeThresholdOrig)
            {
                numPairsSentToNextBlock++;
                if (!pairResultsForNextBlock.TryGetValue(category.Key, out var pairsForNextBlock))
                {
                    pairsForNextBlock = new List<PairResult>();
                    pairResultsForNextBlock.Add(category.Key, pairsForNextBlock);
                }

                pairsForNextBlock.Add(pairResult);
                nextEdgeMinPosition = Math.Min(nextEdgeMinPosition, pairResult.ReadPair.MinPosition);
            }

            // Still collect evidence even if it's edge, because that could impact this block as well as next block.
            binEvidence.AddMessEvidence(isMessy, pairResult, isIndel, isSingleMismatch,
                isForwardOnlyMessy, isReverseOnlyMessy, isMapMessy);
        }
    }

    numRetrievedFromLastBlock = AddAlignmentsFromEdgeState(edgeState, pairResultLookup, numRetrievedFromLastBlock);

    var finalizedBins = new UsableBins(binConclusions);

    if (shouldRealignAtAll)
    {
        binConclusions.AddIndelEvidence(finalizedIndelsForChrom, binsToExtendTo);
        binConclusions.ProcessRegions(messySiteThreshold, imperfectFreqThreshold,
            _geminiOptions.RegionDepthThreshold, indelRegionfreqThreshold, binsToExtendTo,
            _geminiOptions.DirectionalMessThreshold);
        finalizedBins.FinalizeConclusions(binsToExtendTo);
    }

    using (var snippetSource = _dataSourceFactory.CreateGenomeSnippetSource(_chrom, _chrReference))
    {
        indelSource = _dataSourceFactory.GetChromosomeIndelSource(finalizedIndelsForChrom, snippetSource);
    }

    // Take the pairs destined for the next block out of this block's work lists.
    // A set plus one RemoveAll pass avoids a linear List.Remove scan per pair.
    // NOTE(review): assumes PairResult equality/hashing is consistent (default reference
    // equality, or matching Equals/GetHashCode overrides) - confirm.
    foreach (var kvp in pairResultsForNextBlock)
    {
        var pairsToHandOff = new HashSet<PairResult>(kvp.Value);
        pairResultLookup[kvp.Key].RemoveAll(pairsToHandOff.Contains);
    }

    var allAlignments = new List<BamAlignment>();
    var outcomesLookup = new Dictionary<HashableIndel, int[]>();

    var numSkippedDueToSites = 0;
    var numKept = 0;
    var numRealigned = 0;
    var numSilenced = 0;

    var snowballCategories = _realignmentOptions.CategoriesForSnowballing;
    var doSnowball = snowballCategories.Any();

    // Snowball categories are processed first and removed from the lookup.
    foreach (var category in snowballCategories)
    {
        if (pairResultLookup.Remove(category, out var categoryReads))
        {
            allAlignments.AddRange(ProcessCategory(_categoriesForRealignment, indelSource,
                shouldRealignAtAll, outcomesLookup, ref numSkippedDueToSites, ref numKept,
                ref numRealigned, ref numSilenced, categoryReads, category, binEvidence,
                _progressTracker, binConclusions, finalizedBins, startPosition, endPosition));
        }
    }

    List<HashableIndel> superFinalizedIndels;

    if (doSnowball)
    {
        superFinalizedIndels = GetSuperFinalizedIndelsAfterSnowball(finalizedIndelsForChrom, outcomesLookup);

        if (_geminiOptions.Debug)
        {
            Logger.WriteToLog(
                $"After snowballing for region {_chrom}:{startPosition}-{endPosition}, filtered down to {superFinalizedIndels.Count} indels from {finalizedIndelsForChrom.Count} ({finalIndelLookup.Count} preliminary indels).");
        }

        // Rebuild the indel source from the indels that remain after snowballing.
        using (var snippetSource = _dataSourceFactory.CreateGenomeSnippetSource(_chrom, _chrReference))
        {
            indelSource = _dataSourceFactory.GetChromosomeIndelSource(superFinalizedIndels, snippetSource);
        }

        if (_geminiOptions.RecalculateUsableSitesAfterSnowball)
        {
            binConclusions.ResetIndelRegions();
            foreach (var indel in superFinalizedIndels)
            {
                var bin = (indel.ReferencePosition - adjustedStartPosition) / messySiteWidth;
                binConclusions.SetIndelRegionTrue(bin);

                // Extend downwards, stopping at the first bin index below zero.
                for (int j = 0; j < binsToExtendTo; j++)
                {
                    var binIndex = bin - j;
                    if (binIndex >= 0)
                    {
                        binConclusions.SetIndelRegionTrue(binIndex);
                    }
                    else
                    {
                        break;
                    }
                }

                // Extend upwards, stopping as soon as SetIndelRegionTrue reports false.
                for (int j = 0; j < binsToExtendTo; j++)
                {
                    var binIndex = bin + j;
                    if (!binConclusions.SetIndelRegionTrue(binIndex))
                    {
                        break;
                    }
                }
            }

            finalizedBins.FinalizeConclusions(binsToExtendTo);
        }
    }
    else
    {
        superFinalizedIndels = finalizedIndelsForChrom;
    }

    // TODO pull out the allocs below, or ideally actually remove them from realign pair handler or use something different altogether
    foreach (var category in pairResultLookup)
    {
        if (snowballCategories.Contains(category.Key))
        {
            continue;
        }

        allAlignments.AddRange(ProcessCategory(_categoriesForRealignment, indelSource,
            shouldRealignAtAll, outcomesLookup, ref numSkippedDueToSites, ref numKept,
            ref numRealigned, ref numSilenced, category.Value, category.Key, binEvidence,
            _progressTracker, binConclusions, finalizedBins, startPosition, endPosition));
    }

    var edgeHits = new Dictionary<int, int>();
    var edgeSingleMismatchHits = new Dictionary<int, int>();
    var edgeIndelHits = new Dictionary<int, int>();
    var edgeMessyHits = new Dictionary<int, int>();
    PopulateEdgeHitsAndLogBins(numBins, adjustedStartPosition, messySiteWidth, nextEdgeMinPosition,
        binEvidence, edgeHits, edgeSingleMismatchHits, edgeIndelHits, edgeMessyHits, startPosition,
        binConclusions, finalizedBins);

    UpdateMasterOutcomes(_masterOutcomesLookup, outcomesLookup);

    foreach (var hashableIndel in superFinalizedIndels)
    {
        _masterFinalIndels.AddOrUpdate(hashableIndel, 1, (h, n) => n + 1);
    }

    // List<T>.Count is a property; read it once instead of calling LINQ Count() repeatedly.
    var numFlushed = allAlignments.Count;

    // Adds 'amount' to the named progress counter, creating the counter on first use.
    void BumpProgress(string counterName, int amount)
    {
        _progressTracker.AddOrUpdate(counterName, amount, (x, currentCount) => currentCount + amount);
    }

    BumpProgress("Flushed", numFlushed);
    BumpProgress("Sent To Next Block", numPairsSentToNextBlock);
    BumpProgress("Retrieved from Past Block", numRetrievedFromLastBlock);
    BumpProgress("Realigned", numRealigned);
    BumpProgress("Attempts", numKept);
    BumpProgress("Skipped", numSkippedDueToSites);
    BumpProgress("Silenced", numSilenced);

    pairResultLookup.Clear();

    Logger.WriteToLog(
        $"Finished processing for region {_chrom}:{startPosition}-{endPosition}. {numFlushed} alignments flushed, " +
        $"{numPairsSentToNextBlock} sent to next block, {numRetrievedFromLastBlock} retrieved from {regionData.EdgeState?.Name}. " +
        $"Realigned {numRealigned}/{numKept} attempts ({numSkippedDueToSites} pairs skipped realignment), silenced {numSilenced} messy mates.");

    return new AggregateRegionResults()
    {
        EdgeState = isFinalTask
            ? new EdgeState() { Name = "Final" }
            : new EdgeState()
            {
                EdgeAlignments = pairResultsForNextBlock,
                EdgeIndels = finalizedIndelsForChrom
                    .Where(y => y.ReferencePosition > nextEdgeMinPosition)
                    .ToList(),
                EffectiveMinPosition = nextEdgeMinPosition,
                Name = $"{startPosition}-{endPosition}",
                BinEvidence = binEvidence
            },
        AlignmentsReadyToBeFlushed = allAlignments
    };
}