public void Execute()
{
    // Map BAM reference ids to chromosome names; needed by the block factories downstream.
    var mapping = _dataSourceFactory.GetRefIdMapping(_geminiSampleOptions.InputBam);

    // Wire up the factory bundle and run the full dataflow evaluation over the input BAM.
    var blockFactories = new BlockFactorySource(_stitcherOptions, _geminiOptions, mapping,
        _bamRealignmentFactory, _dataSourceFactory, _geminiSampleOptions, _realignmentOptions,
        _geminiFactory);
    var evaluator = new DataflowReadEvaluator(_geminiOptions, _dataSourceFactory,
        _geminiSampleOptions, _dataOutputFactory, blockFactories);
    var evaluationResults = evaluator.ProcessBam();

    // Persist the collected indel evidence.
    WriteIndelsCsv(_geminiSampleOptions.OutputFolder, _geminiOptions.IndelsCsvName,
        evaluationResults.IndelEvidence);

    // RefId == null means we processed the whole BAM rather than a single chromosome;
    // in that case always index/sort the merged result, otherwise defer to the per-chrom options.
    var wholeBamRun = _geminiSampleOptions.RefId == null;
    MergeAndFinalizeBam(evaluationResults.CategorizedBams, _samtoolsWrapper,
        _geminiSampleOptions.OutputFolder, _geminiOptions.KeepUnmergedBams,
        indexFinalBam: wholeBamRun || _geminiOptions.IndexPerChrom,
        doSort: wholeBamRun || _geminiOptions.SortPerChrom);
}
/// <summary>
/// Streams read pairs from the input BAM through a TPL-Dataflow pipeline in genomic
/// position "blocks" (windows of <c>RegionSize</c>), classifying and writing them out,
/// then aggregates indel evidence and categorized BAM paths into the returned results.
/// </summary>
/// <returns>
/// An <see cref="EvidenceAndClassificationResults"/> carrying the per-indel evidence
/// dictionary and the per-chromosome categorized BAM file lists.
/// </returns>
public EvidenceAndClassificationResults ProcessBam()
{
    var refIdMapping = _dataSourceFactory.GetRefIdMapping(_geminiSampleOptions.InputBam);
    // RefId == null means no single chromosome was requested; "Unk" is used as the label then.
    var chrom = _geminiSampleOptions.RefId == null ? "Unk" : refIdMapping[_geminiSampleOptions.RefId.Value];

    // Shared, concurrently-updated state handed to the classification block provider below.
    var indelLookup = new ConcurrentDictionary<string, IndelEvidence>();

    // Pre-seed every classification with a zero count.
    // NOTE(review): classifications1 is never read again in this method — possibly dead code; verify.
    var classifications1 = new ConcurrentDictionary<PairClassification, int>();
    foreach (var value in Enum.GetValues(typeof(PairClassification)))
    {
        classifications1.AddOrUpdate((PairClassification)value, 0, (x, y) => 0);
    }

    var progressTracker = new ConcurrentDictionary<string, int>();
    var masterOutcomesLookup = new ConcurrentDictionary<HashableIndel, int[]>();
    var masterFinalIndels = new ConcurrentDictionary<HashableIndel, int>();
    // chrom -> classification -> output BAM paths; populated once the writer source finishes.
    var categorizedAlignments = new Dictionary<string, Dictionary<PairClassification, List<string>>>();
    // Pairs whose mate was never found in the BAM; written out as-is at the end.
    var borderlinePairs = new ConcurrentDictionary<string, ReadPair>();
    var categoryLookup = new ConcurrentDictionary<PairClassification, int>();

    using (var bamReader = _dataSourceFactory.CreateBamReader(_geminiSampleOptions.InputBam))
    {
        using (var readPairSource = _dataSourceFactory.CreateReadPairSource(bamReader, new ReadStatusCounter()))
        {
            var pairBatchBlockFactory = _blockFactorySource.GetBatchBlockFactory();
            var classifierBlockFactory = _blockFactorySource.GetClassifierBlockFactory();
            ReadPair readPairEntry;
            // Edge state carries reads/indels straddling a block boundary over to the next block.
            var edgeStates = new ConcurrentDictionary<int, EdgeState>();
            var edgeStatesTasks = new ConcurrentDictionary<int, Task>();
            var writerSource = _dataOutputFactory.GetWriterSource(_geminiSampleOptions.InputBam, _geminiSampleOptions.OutputBam);
            var tasksToWaitOn = new List<Task>();

            // Genomic windowing state: [currentBlockStart, currentBlockEnd] is the active window.
            var blockSize = _geminiOptions.RegionSize;
            var currentBlockEnd = blockSize;
            var currentBlockStart = 0;
            var prevBlockStart = -1;
            var chrReference = _dataSourceFactory.GetChrReference(chrom);
            var classificationBlockProvider = _blockFactorySource.GetBlockProvider(refIdMapping, chrom, writerSource,
                progressTracker, categoryLookup, indelLookup, masterOutcomesLookup, masterFinalIndels, chrReference);

            // First pipeline segment: batch buffer -> classifier -> classification blocks.
            var lineBuffer = pairBatchBlockFactory.GetBlock();
            var pairClassifierBlock = classifierBlockFactory.GetClassifierBlock();
            lineBuffer.LinkTo(pairClassifierBlock, new DataflowLinkOptions { PropagateCompletion = true });
            var currentTask = classificationBlockProvider.GetAndLinkAllClassificationBlocksWithEcFinalization(
                pairClassifierBlock, currentBlockStart, currentBlockEnd, edgeStates, edgeStatesTasks, prevBlockStart);
            tasksToWaitOn.AddRange(currentTask);

            var numReadsRead = 0;
            var readsSinceLastTime = 0;
            var numReadsFlushedAsSingles = 0;

            // Main read loop: post each pair into the current window's buffer, rolling the
            // window (and building a fresh pipeline segment) whenever a read starts past it.
            while ((readPairEntry = readPairSource.GetNextEntryUntilNull()) != null)
            {
                numReadsRead++;
                readsSinceLastTime++;
                var pastCurrentBlock = readPairEntry.MinPosition > currentBlockEnd;
                if (pastCurrentBlock)
                {
                    var readPosition = readPairEntry.MinPosition;
                    // Flush pairs still waiting for a mate within the finished window as singles
                    // (or stash them in borderlinePairs — see NumReadsFlushedAsSingles).
                    var waiting = readPairSource.GetWaitingEntries(currentBlockEnd);
                    foreach (var rp in waiting)
                    {
                        numReadsFlushedAsSingles = NumReadsFlushedAsSingles(rp, borderlinePairs, numReadsFlushedAsSingles, lineBuffer);
                        numReadsRead++;
                        readsSinceLastTime++;
                    }
                    // Push the final partial batch through and complete the old segment so
                    // completion propagates down its linked classifier.
                    lineBuffer.TriggerBatch();
                    lineBuffer.Complete();
                    if (_geminiOptions.LightDebug)
                    {
                        Logger.WriteToLog(
                            $"Processing block {currentBlockStart}-{currentBlockEnd}. Next block will start at {readPosition}. Currently processing " +
                            tasksToWaitOn.Count + " tasks " + $"({tasksToWaitOn.Count(x => x.IsCompleted)} completed)");
                    }
                    // Throttle: keep clearing finished tasks until the in-flight count is at
                    // most 3x the configured region concurrency.
                    while (true)
                    {
                        tasksToWaitOn = CheckAndClearTasks(tasksToWaitOn, null);
                        if (tasksToWaitOn.Count <= _geminiOptions.NumConcurrentRegions * 3)
                        {
                            break;
                        }
                    }
                    // Advance the window: new window ends one blockSize past the triggering read.
                    prevBlockStart = currentBlockStart;
                    currentBlockStart = currentBlockEnd + 1;
                    currentBlockEnd = readPosition + blockSize;
                    // Build a fresh buffer->classifier segment for the new window.
                    var lineBufferNew = pairBatchBlockFactory.GetBlock();
                    var newPairClassifierBlock = classifierBlockFactory.GetClassifierBlock();
                    lineBufferNew.LinkTo(newPairClassifierBlock, new DataflowLinkOptions { PropagateCompletion = true });
                    var newTasks = classificationBlockProvider.GetAndLinkAllClassificationBlocksWithEcFinalization(
                        newPairClassifierBlock, currentBlockStart, currentBlockEnd, edgeStates, edgeStatesTasks, prevBlockStart);
                    tasksToWaitOn.AddRange(newTasks);
                    readsSinceLastTime = 0;
                    lineBuffer = lineBufferNew;
                }
                lineBuffer.Post(readPairEntry);
            }

            tasksToWaitOn = ClearCompletedTasks(tasksToWaitOn);

            // End of BAM: flush everything still waiting for a mate, regardless of position.
            var finalEntries = readPairSource.GetWaitingEntries().ToList();
            Console.WriteLine($"Got {finalEntries.Count()} final entries.");
            foreach (var rp in finalEntries)
            {
                numReadsFlushedAsSingles = NumReadsFlushedAsSingles(rp, borderlinePairs, numReadsFlushedAsSingles, lineBuffer);
                numReadsRead++;
                readsSinceLastTime++;
                //lineBuffer.Post(rp);
            }
            Logger.WriteToLog($"Borderline pairs left: {borderlinePairs.Count}");

            var writerHandle = writerSource.BamWriterHandle(chrom, PairClassification.Unknown, 0);
            var missingMatesWritten = 0;
            foreach (var borderlinePair in borderlinePairs.Values)
            {
                // These pairs (claim to have a mate nearby but the mate is not found in the bam) shouldn't exist in a real, well-formed bam.
                // They do come up if we're dealing with subsetted bams (e.g. for testing purposes).
                // Instead of passing these reads through the normal processing, pass them straight to the bam. We don't really know what else to do with them.
                Logger.WriteWarningToLog($"WARNING: Unable to properly process pair: {borderlinePair.Name} - never found mate. Writing to bam as-is.");
                foreach (var alignment in borderlinePair.GetAlignments())
                {
                    writerHandle.WriteAlignment(alignment);
                    missingMatesWritten++;
                }
            }
            writerSource.DoneWithWriter(chrom, PairClassification.Unknown, 0, missingMatesWritten, writerHandle);
            Logger.WriteToLog($"Wrote {missingMatesWritten} reads with missing mates to bam.");

            // Close out the last main-loop window, then run one trailing window (note the
            // extra 'true' argument — presumably a final/EC-finalization flag; confirm against
            // GetAndLinkAllClassificationBlocksWithEcFinalization) to drain remaining edge state.
            lineBuffer.TriggerBatch();
            lineBuffer.Complete();
            prevBlockStart = currentBlockStart;
            currentBlockStart = currentBlockEnd + 1;
            currentBlockEnd = currentBlockEnd + blockSize;
            var lineBufferFinal = pairBatchBlockFactory.GetBlock();
            var finalPairClassifierBlock = classifierBlockFactory.GetClassifierBlock();
            lineBufferFinal.LinkTo(finalPairClassifierBlock, new DataflowLinkOptions { PropagateCompletion = true });
            var finalTasks = classificationBlockProvider.GetAndLinkAllClassificationBlocksWithEcFinalization(
                finalPairClassifierBlock, currentBlockStart, currentBlockEnd, edgeStates, edgeStatesTasks, prevBlockStart, true);
            tasksToWaitOn.AddRange(finalTasks);
            lineBuffer = lineBufferFinal;
            Logger.WriteToLog($"Triggering last buffer batch. Read {numReadsRead} read pairs. Flushed {numReadsFlushedAsSingles} singles.");
            lineBuffer.TriggerBatch();
            Logger.WriteToLog("Completing buffer");
            lineBuffer.Complete();

            // Wait for every outstanding pipeline task; on failure, log each task's status
            // before rethrowing the original AggregateException.
            tasksToWaitOn = tasksToWaitOn.Where(x => !x.IsCompleted).ToList();
            Logger.WriteToLog($"Now waiting on {tasksToWaitOn.Count} tasks.");
            try
            {
                Task.WaitAll(tasksToWaitOn.ToArray());
                tasksToWaitOn = tasksToWaitOn.Where(x => !x.IsCompleted).ToList();
                if (tasksToWaitOn.Any())
                {
                    throw new Exception($"Some tasks did not complete");
                }
            }
            catch (AggregateException e)
            {
                Logger.WriteExceptionToLog(e);
                Logger.WriteToLog("Status of tasks:\n");
                foreach (var t in tasksToWaitOn)
                {
                    Logger.WriteToLog(t.Status.ToString());
                }
                throw;
            }
            Logger.WriteToLog("Done waiting on tasks.");

            writerSource.Finish();
            // All output files for this chromosome are recorded under the Unknown classification.
            categorizedAlignments[chrom] = new Dictionary<PairClassification, List<string>>();
            categorizedAlignments[chrom][PairClassification.Unknown] = writerSource.GetBamFiles();
        }
    }

    // Force GC because we're about to hand off to samtools, which doesn't fall under our purview
    GC.Collect();

    // Snapshot the concurrent indel evidence into a plain dictionary for the caller.
    var indelEvidence = new Dictionary<string, IndelEvidence>();
    foreach (var kvp in indelLookup)
    {
        indelEvidence.Add(kvp.Key, kvp.Value);
    }
    Logger.WriteToLog(
        $"Found {indelLookup.Keys.Count} total indels, and {masterFinalIndels.Count} eligible for realignment.");

    // Write outcome/indel summary files and log per-category counts (sorted by name for stable output).
    var outcomesWriter = new OutcomesWriter(_geminiSampleOptions.OutputFolder, _dataOutputFactory);
    outcomesWriter.CategorizeProgressTrackerAndWriteCategoryOutcomesFile(progressTracker);
    foreach (var item in categoryLookup.Keys.OrderBy(x => x.ToString()))
    {
        Logger.WriteToLog($"CATEGORY {item}: {categoryLookup[item]}");
    }
    outcomesWriter.WriteIndelOutcomesFile(masterOutcomesLookup);
    outcomesWriter.WriteIndelsFile(masterFinalIndels);

    return (new EvidenceAndClassificationResults()
    {
        IndelEvidence = indelEvidence,
        CategorizedBams = categorizedAlignments
    });
}