/// <summary>
/// Initializes a rewriter by storing the supplied collaborators and options,
/// creating an empty alignment buffer, and defaulting <c>OnLog</c> to write to the console.
/// </summary>
/// <param name="bamReader">Reader stored in <c>_bamReader</c>.</param>
/// <param name="bamWriter">Writer stored in <c>_bamWriter</c>.</param>
/// <param name="filter">Pair filter stored in <c>_filter</c>.</param>
/// <param name="pairHandler">Pair handler stored in <c>_pairHandler</c>.</param>
/// <param name="bufferSize">Value stored in <c>_bufferSize</c>; may be null. Defaults to 100000.</param>
/// <param name="getUnpaired">Flag stored in <c>_getUnpaired</c>. Defaults to false.</param>
/// <param name="chrFilter">Value stored in <c>_chrFilter</c>; may be null (the default).</param>
public BamRewriter(
    IBamReader bamReader,
    IBamWriter bamWriter,
    IAlignmentPairFilter filter,
    IReadPairHandler pairHandler,
    long? bufferSize = 100000,
    bool getUnpaired = false,
    string chrFilter = null)
{
    // Collaborators.
    _bamReader = bamReader;
    _bamWriter = bamWriter;
    _filter = filter;
    _pairHandler = pairHandler;

    // Options.
    _bufferSize = bufferSize;
    _getUnpaired = getUnpaired;
    _chrFilter = chrFilter;

    // Internal state.
    _alignmentBuffer = new List<BamAlignment>();

    // Default log sink: echo every message to standard output.
    OnLog = text => Console.WriteLine(text);
}
/// <summary>
/// Asks <paramref name="pairHandler"/> to extract the alignments for a read pair and classifies
/// the outcome as a successful or failed stitch.
/// </summary>
/// <param name="readPair">
/// The pair to process. Its <c>Stitched</c> flag is set to true when extraction yields exactly one alignment.
/// </param>
/// <param name="pairHandler">Handler whose <c>ExtractReads</c> produces the alignments for the pair.</param>
/// <param name="classification">
/// Receives <c>PairClassification.FailStitch</c> unless exactly one alignment was extracted, in which case it
/// receives <c>PairClassification.PerfectStitched</c> (the imperfect/messy refinements are currently
/// compiled out via <c>allowStitchingOnImperfectReads</c>).
/// </param>
/// <returns>
/// The extracted alignments, materialized exactly once. On a successful stitch, every string tag named in
/// <c>_tagsToKeepFromR1</c> that is present on Read1 has been copied onto the returned alignment.
/// </returns>
private IEnumerable<BamAlignment> TryStitch(ReadPair readPair, IReadPairHandler pairHandler, out PairClassification classification)
{
    // TODO if we end up allowing NM calculation in here, this will become true.
    const bool allowStitchingOnImperfectReads = false;

    // Materialize once and use this list everywhere below. Previously the raw IEnumerable was
    // re-enumerated for the tag copy and again by the caller; with a deferred sequence that would
    // apply the tags to throw-away instances and hand back untagged alignments.
    var extractedAlignments = pairHandler.ExtractReads(readPair).ToList();

    if (extractedAlignments.Count != 1)
    {
        classification = PairClassification.FailStitch;
        return extractedAlignments;
    }

    readPair.Stitched = true;
    classification = PairClassification.PerfectStitched;

    if (allowStitchingOnImperfectReads)
    {
        var stitchedResult = extractedAlignments[0];
        int? nm = 0;

        // TODO handle this if it is a hit on performance. Making it simple for now because the
        // previous logic where we were lazy evaluating was a bit skewed
        var containsImperfections = ReadContainsImperfections(stitchedResult, _trustSoftclips);

        // nm = stitchedResult.GetIntTag("NM"); // TODO reinstate this if stitched read has proper NM
        var numMismatchesInR1 = readPair.Read1.GetIntTag("NM");
        var numMismatchesInR2 = readPair.Read2.GetIntTag("NM");

        if (containsImperfections || (nm > 0 || numMismatchesInR1 > 0 || numMismatchesInR2 > 0))
        {
            classification = PairClassification.ImperfectStitched;

            if (numMismatchesInR1 <= NumMismatchesToBeConsideredLikelySnvInStitched &&
                numMismatchesInR2 <= NumMismatchesToBeConsideredLikelySnvInStitched &&
                !containsImperfections)
            {
                classification = PairClassification.SingleMismatchStitched;
            }
            else if (nm >= _numMismatchesToBeConsideredMessy ||
                     numMismatchesInR1 >= _numMismatchesToBeConsideredMessy ||
                     numMismatchesInR2 >= _numMismatchesToBeConsideredMessy)
            {
                classification = PairClassification.MessyStitched;
            }
        }
    }

    // Carry the configured string tags over from Read1 onto the stitched alignment.
    foreach (var alignment in extractedAlignments)
    {
        foreach (var tag in _tagsToKeepFromR1)
        {
            var r1Tag = readPair.Read1.GetStringTag(tag);
            if (r1Tag != null)
            {
                alignment.ReplaceOrAddStringTag(tag, r1Tag);
            }
        }
    }

    return extractedAlignments;
}
/// <summary>
/// Classifies a read pair and returns a <c>PairResult</c> carrying the alignments to use for it.
/// </summary>
/// <remarks>
/// Duplicates, incomplete pairs, and indel-containing pairs are handed to their dedicated handlers and
/// return early. For the remaining completed pairs the classification is derived from the "NM" tags,
/// and stitching via <c>TryStitch</c> is only attempted when the pair classifies as
/// <c>UnstitchPerfect</c> and <c>_skipStitch</c> is not set.
/// </remarks>
/// <param name="readPair">The pair to classify.</param>
/// <param name="pairHandler">Handler forwarded to <c>TryStitch</c> when stitching is attempted.</param>
/// <returns>The result describing the pair's alignments and classification.</returns>
public PairResult GetBamAlignmentsAndClassification(ReadPair readPair, IReadPairHandler pairHandler)
{
    if (readPair.PairStatus == PairStatus.Duplicate)
    {
        // TODO hasIndels and numMismatches and split don't have meaning in an unusable dup, but it's a bit misleading to set them..
        // Also, it's kind of silly to extract those alignments if we're going to set it to unusable anyway.
        var duplicateAlignments = readPair.GetAlignments().ToList();
        return new PairResult(
            alignments: duplicateAlignments,
            readPair: readPair,
            classification: PairClassification.Duplicate,
            hasIndels: false,
            isSplit: false,
            numMismatchesInSingleton: 0,
            softclipLengthForIndelRead: 0)
        {
            IsReputableIndelContaining = false
        };
    }

    var r1HasIndels = OverlappingIndelHelpers.ReadContainsIndels(readPair.Read1);
    var r2HasIndels = OverlappingIndelHelpers.ReadContainsIndels(readPair.Read2);
    var hasIndels = r1HasIndels || r2HasIndels;

    // Incomplete pairs have their own classifier; a missing read or missing NM tag counts as 0 mismatches.
    if (!IsCompletedPairedPair(readPair))
    {
        var incompleteNmR1 = readPair.Read1?.GetIntTag("NM") ?? 0;
        var incompleteNmR2 = readPair.Read2?.GetIntTag("NM") ?? 0;
        return ClassifyIncompletePair(readPair, r1HasIndels, r2HasIndels, incompleteNmR1, incompleteNmR2);
    }

    var classification = PairClassification.Unknown;
    IEnumerable<BamAlignment> bamAlignmentList = null;
    int? nmR1 = null;
    int? nmR2 = null;

    if (BothReadsHighQuality(readPair))
    {
        nmR1 = readPair.Read1.GetIntTag("NM");
        nmR2 = readPair.Read2.GetIntTag("NM");

        if (hasIndels)
        {
            if (nmR1 == null && nmR2 == null)
            {
                Logger.WriteWarningToLog(
                    $"Found indel-containing read without NM: '{readPair.Name}', likely indicating that NM is not set on any read. Consider preprocessing the BAM to calculate NM tags for best results.");
            }

            return HandleIndelPairIfStitchUnallowed(readPair, nmR1 ?? 0, nmR2 ?? 0, r1HasIndels, r2HasIndels);
        }

        var tryStitch = true;

        // TODO if not realigning anything (or not realigning imperfects), go ahead and stitch immediately
        // TODO why are we using this bool multiple times
        // ^ Because there's no point checking for imperfections if we don't care about softclips -- we
        //   already know there are no indels because the indel case returned above.
        var distrustedSoftclipPresent =
            !_trustSoftclips &&
            (ReadContainsImperfections(readPair.Read1, _trustSoftclips) ||
             ReadContainsImperfections(readPair.Read2, _trustSoftclips));

        if (distrustedSoftclipPresent)
        {
            tryStitch = false;
            classification = ClassifySoftclipContainingPairGivenSoftclipDistrust(readPair, nmR1, nmR2);
        }
        else
        {
            // Retry the tag lookup for any count that came back null above.
            nmR1 = nmR1 ?? readPair.Read1.GetIntTag("NM");
            nmR2 = nmR2 ?? readPair.Read2.GetIntTag("NM");

            if (nmR1 >= _numMismatchesToBeConsideredMessy || nmR2 >= _numMismatchesToBeConsideredMessy)
            {
                tryStitch = false;
                classification = PairClassification.UnstitchMessy;

                if (nmR1 <= 1 || nmR2 <= 1)
                {
                    // One of the reads is clean: the strand of the other read picks the classification.
                    // When R1 is the clean one we look at R2, otherwise at R1.
                    var otherRead = nmR1 <= 1 ? readPair.Read2 : readPair.Read1;
                    classification = otherRead.IsReverseStrand()
                        ? PairClassification.UnstitchReverseMessy
                        : PairClassification.UnstitchForwardMessy;
                }
            }
            else if (nmR1 + nmR2 == 0)
            {
                classification = PairClassification.UnstitchPerfect;
            }
            else if (nmR1 <= 1 && nmR2 <= 1)
            {
                classification = PairClassification.UnstitchSingleMismatch;
            }
            else
            {
                classification = PairClassification.UnstitchImperfect;
            }
        }

        // Every non-indel branch above falls back to the pair's own alignments.
        bamAlignmentList = readPair.GetAlignments();
        classification = AdjustClassificationForMultimapper(readPair, classification);

        // Stitching vetoes. For now we can't stitch anything other than perfect pairs because we
        // can't properly calculate NM!!
        if (classification == PairClassification.UnstitchMessySuspiciousRead ||
            _skipStitch ||
            classification != PairClassification.UnstitchPerfect)
        {
            tryStitch = false;
        }

        if (tryStitch)
        {
            bamAlignmentList = TryStitch(readPair, pairHandler, out classification);
        }
        else if (bamAlignmentList == null)
        {
            bamAlignmentList = readPair.GetAlignments().ToList();
            classification = PairClassification.Unstitchable;
        }
    }
    else if (OneReadIsHighQuality(readPair))
    {
        classification = PairClassification.Split;

        if (hasIndels)
        {
            var splitNmR1 = readPair.Read1?.GetIntTag("NM") ?? 0;
            var splitNmR2 = readPair.Read2?.GetIntTag("NM") ?? 0;
            return HandlePairContainingIndels(readPair, r1HasIndels, r2HasIndels, splitNmR1, splitNmR2, true,
                PairClassification.Split, true);
        }
    }
    else
    {
        classification = PairClassification.Unusable;
        bamAlignmentList = readPair.GetAlignments().ToList();
    }

    // TODO - not sure why I originally had this double-check on whether pairs were split? shouldn't this already be evident from the pair status?
    //var isSplit = bamAlignmentList?.Count() > 0 && bamAlignmentList?.Select(x => x.RefID).Distinct().Count() > 1;
    var isSplit = false;

    var status = readPair.PairStatus;
    var treatAsSplit =
        isSplit ||
        classification == PairClassification.Split ||
        status == PairStatus.SplitChromosomes ||
        status == PairStatus.MateNotFound ||
        status == PairStatus.MateUnmapped;

    if (treatAsSplit)
    {
        return HandleSplitNonIndelPair(readPair, bamAlignmentList, hasIndels, isSplit);
    }

    var pr = new PairResult(bamAlignmentList.ToList(), readPair, classification, hasIndels, isSplit);

    // Messy pairs are re-labelled when MD checking is enabled and the MD check flags the pair.
    var isMessy = classification == PairClassification.UnstitchMessy ||
                  classification == PairClassification.UnstitchMessySuspiciousRead;
    if (isMessy && _checkMd && HasSuspiciousMd(readPair, nmR1, nmR2, pr))
    {
        classification = PairClassification.UnstitchMessySuspiciousMd;
        pr.Classification = classification;
    }

    return pr;
}
/// <summary>
/// Initializes the stitcher with its pair handler, its status handler, and the list of tags to keep from R1.
/// </summary>
/// <param name="stitchedPairHandler">Handler stored in <c>_stitchedPairHandler</c>.</param>
/// <param name="statusHandler">Handler stored in <c>_statusHandler</c>.</param>
/// <param name="tagsToKeepFromR1">
/// Tag names stored in <c>_tagsToKeepFromR1</c>; when null (the default) an empty list is stored instead.
/// </param>
public PostRealignmentStitcher(
    IReadPairHandler stitchedPairHandler,
    IStatusHandler statusHandler,
    List<string> tagsToKeepFromR1 = null)
{
    _stitchedPairHandler = stitchedPairHandler;
    _statusHandler = statusHandler;

    // Never leave the tag list null so later loops over it need no null check.
    if (tagsToKeepFromR1 == null)
    {
        _tagsToKeepFromR1 = new List<string>();
    }
    else
    {
        _tagsToKeepFromR1 = tagsToKeepFromR1;
    }
}