/// <summary>
/// Computes the NM (edit distance) value for an alignment: the number of bases
/// mismatching the reference plus the number of inserted/deleted bases in the CIGAR.
/// </summary>
/// <param name="alignment">Alignment whose bases, position and CIGAR are evaluated.</param>
/// <returns>Mismatch count plus indel base count.</returns>
/// <exception cref="InvalidDataException">
/// Thrown when the mismatch count cannot be computed (e.g. the position map does not
/// line up with the retrieved genome snippet).
/// </exception>
public int GetNm(BamAlignment alignment)
{
    var positionMap = new PositionMap(alignment.Bases.Length);
    // Alignment.Position is zero-based; the position map is one-based, hence the +1.
    Read.UpdatePositionMap(alignment.Position + 1, alignment.CigarData, positionMap);

    var snippet = _genomeSnippetSource.GetGenomeSnippet(alignment.Position);
    var numMismatches = Helper.GetNumMismatches(alignment.Bases, positionMap,
        snippet.Sequence, snippet.StartPosition);

    if (numMismatches == null)
    {
        // Throw InvalidDataException (consistent with the other methods in this file)
        // rather than a bare Exception, and include context to aid debugging.
        throw new InvalidDataException(
            "Num mismatches is null for alignment at position " + alignment.Position);
    }

    var numIndelBases = alignment.CigarData.NumIndelBases();
    return numMismatches.Value + numIndelBases;
}
/// <summary>
/// Attempts to realign a read against the given set of indels, anchoring either on the
/// left or the right end of the read. Returns the populated RealignmentResult on
/// success, or null if the indels could not be layered on, the realignment looks like
/// a false deletion at the read end, or any exception occurs.
/// </summary>
/// <param name="indels">Candidate indels, already sorted by ascending position.</param>
/// <param name="indelContexts">Genome snippet context per indel.</param>
/// <param name="read">The read being realigned.</param>
/// <param name="anchorOnLeft">True to anchor layering at the left end of the read.</param>
/// <param name="details">Precomputed per-read realignment details (fresh CIGAR/position map, N-clip lengths, softclips).</param>
/// <param name="pairSpecific">Passed through to indel layering; relaxes some rejections.</param>
/// <param name="indexes">Maps sub-list indel indices back to indices in the caller's full indel list.</param>
private RealignmentResult RealignForAnchor(HashableIndel[] indels, Dictionary <HashableIndel, GenomeSnippet> indelContexts, Read read, bool anchorOnLeft, ReadToRealignDetails details, bool pairSpecific, int[] indexes)
{
    try
    {
        // Work on fresh copies of the CIGAR and position map so the cached details are not mutated.
        var freshCigarWithoutTerminalNs = new CigarAlignment(details.FreshCigarWithoutTerminalNs);
        var freshPositionMap = new PositionMap(details.PositionMapLength);

        for (int i = 0; i < details.PositionMapLength; i++)
        {
            freshPositionMap.UpdatePositionAtIndex(i, details.PositionMapWithoutTerminalNs.GetPositionAtIndex(i));
        }

        var result = new RealignmentResult();

        // layer on indels one by one, indels already sorted by ascending position
        // (LayerOnIndels returning true signals failure to apply the indels)
        if (LayerOnIndels(indels, indelContexts, anchorOnLeft, details.SequenceWithoutTerminalNs,
            freshPositionMap, ref result, pairSpecific))
        {
            return(null);
        }

        // All indels share a context; use the first one's snippet.
        var context = indelContexts[indels[0]];

        // Softclip partial insertions at read ends
        if (_maskPartialInsertion || _minimumUnanchoredInsertionLength > 0)
        {
            MaskPartialInsertion(indels, read, context.Sequence, result, context.StartPosition);
        }

        // Restore the N-prefix/suffix and original softclips around the realigned core.
        _softclipReapplier.ReapplySoftclips(read, details.NPrefixLength, details.NSuffixLength, freshPositionMap,
            result, context, details.PrefixSoftclip, details.SuffixSoftclip, freshCigarWithoutTerminalNs);

        result.AcceptedIndels = new List <int>();
        result.AcceptedHashableIndels = new List <HashableIndel>();

        for (int i = 0; i < result.AcceptedIndelsInSubList.Count; i++)
        {
            // TODO do we need to be more nuanced about this and only do it in duplication areas?
            var currentSubIndex = result.AcceptedIndelsInSubList[i];
            result.AcceptedIndels.Add(indexes[currentSubIndex]);
            var currentIndel = indels[currentSubIndex];
            result.AcceptedHashableIndels.Add(currentIndel);

            if (currentIndel.Type == AlleleCategory.Deletion)
            {
                // Guard against a spurious deletion at the read end: if the remaining
                // right-anchor bases exactly match the start of the deleted reference
                // sequence, the "deletion" may actually just be a truncated read.
                var addedAt = result.IndelsAddedAt[i];
                var anchorStart = addedAt + 1;
                var lastOp = result.Cigar[result.Cigar.Count - 1];
                var rightSoftclipLength = lastOp.Type == 'S' ?
                    (int)lastOp.Length : 0;
                // NOTE(review): rightAnchorLength can be negative if the softclip overlaps
                // anchorStart; Substring would then throw (swallowed by the catch below) — confirm intended.
                var rightAnchorLength = read.Sequence.Length - anchorStart - rightSoftclipLength;

                if (rightAnchorLength < currentIndel.Length && anchorStart < read.Sequence.Length)
                {
                    if (read.Sequence.Substring(anchorStart, rightAnchorLength) ==
                        currentIndel.ReferenceAllele.Substring(1, rightAnchorLength))
                    {
                        return(null);
                    }
                }
            }
        }

        // ReapplySoftclips may have set this already; only compute if still missing.
        if (result.SumOfMismatchingQualities == null)
        {
            result.SumOfMismatchingQualities = Helper.GetSumOfMismatchQualities(read.Qualities,
                read.Sequence, freshPositionMap, context.Sequence, context.StartPosition);
        }

        result.Indels = string.Join("|", indels.Select(x => StringifyIndel(x)));
        return(result);
    }
    catch (Exception e)
    {
        // Realignment failure for one anchor is non-fatal: log (if debugging) and report no result.
        if (_debug)
        {
            Logger.WriteExceptionToLog(new Exception($"Realign for anchor failed: read '{read.Name}' with indels {(string.Join("|", indels.Select(x => StringifyIndel(x))))}, anchoring on {(anchorOnLeft ? "left" : "right")}.", e));
        }

        return(null);
    }
}
/// <summary>
/// Re-applies terminal-N softclips and (optionally) re-masks softclips that realignment
/// turned into mismatching M bases. Mutates <paramref name="result"/> in place: its
/// CIGAR, position, mismatch counts, IndelsAddedAt/NifiedAt offsets and alignment
/// summary fields are all updated to reflect the final, softclip-restored alignment.
/// </summary>
/// <param name="read">The original read (sequence, qualities, original CIGAR).</param>
/// <param name="nPrefixLength">Number of N bases trimmed from the read start.</param>
/// <param name="nSuffixLength">Number of N bases trimmed from the read end.</param>
/// <param name="positionMapWithoutTerminalNs">Position map for the N-trimmed core of the read.</param>
/// <param name="result">Realignment result to finalize (mutated).</param>
/// <param name="context">Genome snippet the read was realigned against.</param>
/// <param name="prefixSoftclip">Original prefix softclip length to consider re-masking.</param>
/// <param name="suffixSoftclip">Original suffix softclip length to consider re-masking.</param>
/// <param name="freshCigarWithoutTerminalNs">Pre-realignment CIGAR (used for error reporting only).</param>
/// <exception cref="InvalidDataException">If the edit distance cannot be computed or no base remains mappable.</exception>
public void ReapplySoftclips(Read read, int nPrefixLength, int nSuffixLength, PositionMap positionMapWithoutTerminalNs, RealignmentResult result, GenomeSnippet context, uint prefixSoftclip, uint suffixSoftclip, CigarAlignment freshCigarWithoutTerminalNs)
{
    // Re-append the N-prefix
    // Terminal Ns have no reference position; -1 marks unmapped in the position map.
    var nPrefixPositionMap = Enumerable.Repeat(-1, nPrefixLength);
    var nSuffixPositionMap = Enumerable.Repeat(-1, nSuffixLength);
    // TODO maybe have a function for combining pos maps instead
    var finalPositionMap = new PositionMap(nPrefixPositionMap.Concat(positionMapWithoutTerminalNs.Map).Concat(nSuffixPositionMap).ToArray());

    // Rebuild the CIGAR as S(prefix Ns) + realigned core + S(suffix Ns); Compress()
    // merges adjacent ops and drops the zero-length softclips when there were no Ns.
    var finalCigar = new CigarAlignment { new CigarOp('S', (uint)nPrefixLength) };
    foreach (CigarOp op in result.Cigar)
    {
        finalCigar.Add(op);
    }

    finalCigar.Add(new CigarOp('S', (uint)nSuffixLength));
    finalCigar.Compress();
    result.Cigar = finalCigar;

    // In case realignment introduced a bunch of mismatch-Ms where there was previously softclipping, optionally re-mask them.
    // NOTE(review): the "result != null" check is dead — result was already dereferenced above.
    if (result != null && _remaskSoftclips)
    {
        var mismatchMap = Helper.GetMismatchMap(read.Sequence, finalPositionMap, context.Sequence, context.StartPosition);

        var softclipAdjustedCigar = Helper.SoftclipCigar(result.Cigar, mismatchMap, prefixSoftclip, suffixSoftclip,
            maskNsOnly: _maskNsOnly,
            prefixNs: Helper.GetCharacterBookendLength(read.Sequence, 'N', false),
            suffixNs: Helper.GetCharacterBookendLength(read.Sequence, 'N', true),
            softclipEvenIfMatch: _keepProbeSoftclips || _keepBothSideSoftclips,
            softclipRepresentsMess: (!(_keepBothSideSoftclips || _keepProbeSoftclips)));

        // Update position map to account for any softclipping added
        // (-2 marks a softclipped base, as opposed to -1 for unmapped/inserted).
        var adjustedPrefixClip = softclipAdjustedCigar.GetPrefixClip();
        for (var i = 0; i < adjustedPrefixClip; i++)
        {
            finalPositionMap.UpdatePositionAtIndex(i, -2, true);
        }

        var adjustedSuffixClip = softclipAdjustedCigar.GetSuffixClip();
        for (var i = 0; i < adjustedSuffixClip; i++)
        {
            finalPositionMap.UpdatePositionAtIndex(finalPositionMap.Length - 1 - i, -2, true);
        }

        var editDistance =
            Helper.GetNumMismatches(read.Sequence, finalPositionMap, context.Sequence, context.StartPosition);

        if (editDistance == null)
        {
            // This shouldn't happen at this point - we already have a successful result
            throw new InvalidDataException("Edit distance is null for :" + read.Name + " with position map " +
                string.Join(",", finalPositionMap) + " and CIGAR " + softclipAdjustedCigar);
        }

        // TODO PERF - See how much this really helps analytically. I'm thinking maybe kill this altogether and remove from eval
        var sumOfMismatching = Helper.GetSumOfMismatchQualities(mismatchMap, read.Qualities);

        var readHasPosition = finalPositionMap.HasAnyMappableBases();
        if (!readHasPosition)
        {
            throw new InvalidDataException(string.Format(
                "Read does not have any alignable bases. ({2} --> {0} --> {3}, {1})",
                freshCigarWithoutTerminalNs, string.Join(",", finalPositionMap), read.CigarData, softclipAdjustedCigar));
        }

        result.Position = finalPositionMap.FirstMappableBase(); // TODO this used to be >= 0 but changed to > 0. Confirm correct.
        result.Cigar = softclipAdjustedCigar;
        result.NumMismatches = editDistance.Value;

        // Shift indel/Nified read offsets by the re-appended N-prefix so they index into
        // the full-length read rather than the N-trimmed core.
        var addedAtFinal = new List <int>();
        foreach (var i in result.IndelsAddedAt)
        {
            addedAtFinal.Add(i + nPrefixLength);
        }

        result.IndelsAddedAt = addedAtFinal;

        var nifiedAtFinal = new List <int>();
        foreach (var i in result.NifiedAt)
        {
            nifiedAtFinal.Add(i + nPrefixLength);
        }

        result.NifiedAt = nifiedAtFinal;

        // Recompute summary stats against the final CIGAR/position.
        var newSummary = Extensions.GetAlignmentSummary(result.Position - 1 - context.StartPosition, result.Cigar,
            context.Sequence, read.Sequence, _trackActualMismatches, _checkSoftclipsForMismatches);

        result.NumNonNMismatches = newSummary.NumNonNMismatches;
        result.NumNonNSoftclips = newSummary.NumNonNSoftclips;
        result.NumSoftclips = newSummary.NumSoftclips;
        result.NumInsertedBases = newSummary.NumInsertedBases;
        result.NumMismatchesIncludeSoftclip = newSummary.NumMismatchesIncludeSoftclip;
        //result.MismatchesIncludeSoftclip = newSummary.MismatchesIncludeSoftclip;
        result.SumOfMismatchingQualities = sumOfMismatching;
        result.AnchorLength = newSummary.AnchorLength;
    }
}
/// <summary>
/// Tries to introduce a single candidate indel into the read's position map (anchored
/// from the left or the right), validates the result, and builds a RealignmentResult
/// from the adjusted map. Mutates <paramref name="positionMap"/> in place. Returns null
/// whenever the indel cannot be placed or the placement fails a sanity check (invalid
/// map, sketchy repeat anchor, end-of-read false deletion, partial duplication,
/// mismatching inserted sequence, or no alignment summary).
/// </summary>
/// <param name="readSequence">The read's base sequence.</param>
/// <param name="priorIndel">The candidate indel to lay onto the read.</param>
/// <param name="refSequence">Reference snippet sequence to align against.</param>
/// <param name="anchorLeft">True to scan the position map left-to-right; false to scan right-to-left.</param>
/// <param name="positionMap">One-based read-to-reference position map; mutated in place.</param>
/// <param name="refSequenceStartIndex">Reference coordinate of the snippet's first base.</param>
/// <param name="pairSpecific">If true, sketchy-anchor placements are flagged (IsSketchy) instead of rejected.</param>
private RealignmentResult AddIndelAndGetResult(string readSequence, HashableIndel priorIndel,
    string refSequence, bool anchorLeft, PositionMap positionMap, int refSequenceStartIndex, bool pairSpecific)
{
    var foundIndel = false;
    var insertionPostionInReadStart = -1;
    var insertionPositionInReadEnd = -1;
    var deletionPositionInRead = -1;
    bool anyPositionsAfterDeletionMapped = false;

    // TODO PERF can we bail out early if it's not possible that the indel could be inserted in the read, based on position?
    if (anchorLeft)
    {
        // move along position map to see if we can insert indel
        for (var i = 0; i < positionMap.Length; i++)
        {
            if (positionMap.GetPositionAtIndex(i) == priorIndel.ReferencePosition && i != positionMap.Length - 1) // make sure we dont end right before indel
            {
                foundIndel = true;

                if (priorIndel.Type == AlleleCategory.Insertion)
                {
                    insertionPostionInReadStart = i + 1;

                    // stick in -1 for insertion length, then adjust positions after
                    for (var j = i + 1; j < positionMap.Length; j++)
                    {
                        if (j - i <= priorIndel.Length)
                        {
                            positionMap.UpdatePositionAtIndex(j, -1, true);
                            if (j - i == priorIndel.Length || j == positionMap.Length - 1)
                            {
                                insertionPositionInReadEnd = j;
                            }
                        }
                        else
                        {
                            if (positionMap.GetPositionAtIndex(j) != -1) // preserve existing insertions
                            {
                                positionMap.UpdatePositionAtIndex(j, positionMap.GetPositionAtIndex(j) - priorIndel.Length);
                            }
                        }
                    }

                    break;
                }

                if (priorIndel.Type == AlleleCategory.Deletion)
                {
                    deletionPositionInRead = i;

                    // offset positions after deletion
                    for (var j = i + 1; j < positionMap.Length; j++)
                    {
                        if (positionMap.GetPositionAtIndex(j) != -1) // preserve existing insertions
                        {
                            anyPositionsAfterDeletionMapped = true;
                            positionMap.UpdatePositionAtIndex(j, positionMap.GetPositionAtIndex(j) + priorIndel.Length);
                        }
                    }

                    break;
                }
            }
        }
    }
    else
    {
        // walk backwards along position map to see if we can insert indel
        if (priorIndel.Type == AlleleCategory.Insertion)
        {
            for (var i = positionMap.Length - 1; i >= 0; i--)
            {
                // Either the base after the insertion site (insertion ends at i-1)
                // or the insertion site itself (insertion ends at i).
                if (positionMap.GetPositionAtIndex(i) ==
                    priorIndel.ReferencePosition + 1 && i != 0)
                {
                    foundIndel = true;
                    insertionPositionInReadEnd = i - 1;
                }
                else if (positionMap.GetPositionAtIndex(i) == priorIndel.ReferencePosition && i != positionMap.Length - 1)
                {
                    foundIndel = true;
                    insertionPositionInReadEnd = i;
                }

                if (foundIndel)
                {
                    // stick in -1 for insertion length, then adjust positions
                    for (var j = insertionPositionInReadEnd; j >= 0; j--)
                    {
                        if (insertionPositionInReadEnd - j + 1 <= priorIndel.Length)
                        {
                            positionMap.UpdatePositionAtIndex(j, -1, true);
                            if (insertionPositionInReadEnd - j + 1 == priorIndel.Length || j == 0)
                            {
                                insertionPostionInReadStart = j;
                            }
                        }
                        else
                        {
                            if (positionMap.GetPositionAtIndex(j) != -1) // Don't update position map for things that were already -1
                            {
                                positionMap.UpdatePositionAtIndex(j, positionMap.GetPositionAtIndex(j) + priorIndel.Length);
                            }
                        }
                    }

                    break;
                }
            }
        }
        else if (priorIndel.Type == AlleleCategory.Deletion)
        {
            for (var i = positionMap.Length - 1; i >= 1; i--)
            {
                if (positionMap.GetPositionAtIndex(i) == priorIndel.ReferencePosition + priorIndel.Length + 1) //deletions must be fully anchored to be observed
                {
                    foundIndel = true;
                    deletionPositionInRead = i - 1;

                    // offset positions after deletion
                    for (var j = i - 1; j >= 0; j--)
                    {
                        if (positionMap.GetPositionAtIndex(j) != -1) // preserve existing insertions
                        {
                            anyPositionsAfterDeletionMapped = true;
                            positionMap.UpdatePositionAtIndex(j, positionMap.GetPositionAtIndex(j) - priorIndel.Length);
                        }
                    }

                    break;
                }
            }
        }
    }

    //if (!foundIndel || !Helper.IsValidMap(positionMap, refSequence)) //TODO changed this just for tailor
    // Reject if the indel site wasn't found, a deletion left no mapped bases beyond it,
    // or the mutated map is no longer monotonically valid.
    if (!foundIndel || (priorIndel.Type == AlleleCategory.Deletion && !anyPositionsAfterDeletionMapped) || !Helper.IsValidMap(positionMap.Map))
    {
        return(null);
    }

    var isSketchy = false;

    // Repeat indels with short anchors are unreliable: reject them, or merely flag
    // them as sketchy when the evidence is pair-specific.
    if (priorIndel.IsRepeat)
    {
        //if (priorIndel.Type == AlleleCategory.Deletion)
        //{
        //    if (Helper.RepeatDeletionFlankedByRepeats(readSequence, priorIndel, deletionPositionInRead))
        //    {
        //        return null;
        //    }
        //}
        //// TODO in the case of using sketchy anchor test:
        //// Ideally, we'd check the anchor length against how many repeats are in the reference vs the variant,
        //// ... Or maybe just always check the whole anchor if it's a repeat.
        var anchorLength = priorIndel.Type == AlleleCategory.Insertion ?
            Math.Min(insertionPostionInReadStart, readSequence.Length - insertionPositionInReadEnd) :
            Math.Min(deletionPositionInRead, readSequence.Length - deletionPositionInRead);

        if (anchorLength >= readSequence.Length)
        {
            throw new Exception("Anchor should never be longer than read length."); // TODO remove after dev.
        }

        if (anchorLength < Math.Max(10, priorIndel.Length))
        {
            if (priorIndel.Type == AlleleCategory.Deletion)
            {
                if (Helper.DeletionHasSketchyAnchor(readSequence, priorIndel, deletionPositionInRead))
                {
                    if (pairSpecific)
                    {
                        isSketchy = true;
                    }
                    else
                    {
                        return(null);
                    }
                }
            }
            else
            {
                if (priorIndel.NumBasesInReferenceSuffixBeforeUnique >= anchorLength)
                {
                    if (pairSpecific)
                    {
                        isSketchy = true;
                    }
                    else
                    {
                        return(null);
                    }
                }
            }
        }
    }

    // TODO do we need to be more nuanced about this and only do it in duplication areas?
    // Guard against a false deletion at the read end: if the remaining right-anchor
    // bases exactly match the start of the deleted reference allele, reject.
    if (priorIndel.Type == AlleleCategory.Deletion)
    {
        var anchorStart = deletionPositionInRead + 1;
        var rightAnchorLength = readSequence.Length - anchorStart;

        if (rightAnchorLength < priorIndel.Length)
        {
            if (anchorStart < readSequence.Length)
            {
                if (readSequence.Substring(anchorStart) == priorIndel.ReferenceAllele.Substring(1, rightAnchorLength))
                {
                    return(null);
                }
            }
        }
    }

    if (priorIndel.IsDuplication && priorIndel.Type == AlleleCategory.Insertion)
    {
        // TODO return to this - I think the thought was to prevent FP dups, but the implementation may have been wrong
        // No partial duplications?
        //if (readSequence.Length - insertionPositionInReadEnd <= priorIndel.Length)
        if (readSequence.Length - insertionPositionInReadEnd <= 3)
        {
            // Assumes priors are left-aligned
            return(null);
        }
    }

    //verify insertion matches
    var newReadSequence = readSequence;
    var nifiedAt = new List <int>();

    if (priorIndel.Type == AlleleCategory.Insertion)
    {
        if (insertionPostionInReadStart == -1 || insertionPositionInReadEnd == -1)
        {
            return(null); // weird, this shouldnt ever happen
        }

        var readInsertedSequence = readSequence.Substring(insertionPostionInReadStart,
            insertionPositionInReadEnd - insertionPostionInReadStart + 1);
        // Alleles carry a leading anchor base; strip it before comparing.
        var indelSequence = priorIndel.AlternateAllele.Substring(1);

        if (anchorLeft && readInsertedSequence.Length < indelSequence.Length && priorIndel.NumApproxDupsRight > 0)
        {
            // Don't allow partial realignment to dups
            return(null);
        }

        if (!anchorLeft && readInsertedSequence.Length < indelSequence.Length && priorIndel.NumApproxDupsLeft > 0)
        {
            // Don't allow partial realignment to dups
            return(null);
        }

        // Compare against the end of the insertion that is actually present in the read.
        var clippedPriorSequence = anchorLeft ?
            indelSequence.Substring(0, readInsertedSequence.Length) :
            indelSequence.Substring(indelSequence.Length - readInsertedSequence.Length);

        var isMismatch = readInsertedSequence != clippedPriorSequence;
        if (isMismatch)
        {
            int? mismatches = null;
            var mismatchesToDq = 0d;

            // For long insertions (and non-dup sites), tolerate a bounded proportion of
            // mismatching inserted bases by "Nifying" them instead of rejecting outright.
            if (priorIndel.Length >= _minInsertionSizeToAllowMismatchingBases &&
                !(priorIndel.NumApproxDupsLeft + priorIndel.NumApproxDupsRight > 0))
            {
                mismatches = Helper.GetHammingNumMismatches(readInsertedSequence, clippedPriorSequence);
                mismatchesToDq = priorIndel.Length * _maxProportionInsertSequenceMismatch;

                if (mismatches > mismatchesToDq)
                {
                    //Console.WriteLine(
                    //    $"Too many mismatches between insertions: {mismatches} > {maxAllowedMismatches} ({clippedPriorSequence} vs {readInsertedSequence})");
                }
                else
                {
                    //Console.WriteLine(
                    //    $"Able to Nify mismatches between insertions: {mismatches} <= {maxAllowedMismatches} ({clippedPriorSequence} vs {readInsertedSequence})");
                    var newSequence = Helper.NifyMismatches(clippedPriorSequence, readInsertedSequence, nifiedAt);
                    // TODO PERF is this actually necessary now that we're not actually Nifying? We can just keep the bases that we're Nifying at.
                    newReadSequence = readSequence.Substring(0, insertionPostionInReadStart) +
                        newSequence.ToLower() + readSequence.Substring(insertionPositionInReadEnd + 1);
                    // Shift Nified offsets from insertion-local to read coordinates.
                    nifiedAt = nifiedAt.Select(x => x + insertionPostionInReadStart).ToList();
                }
            }

            if (mismatches == null || (mismatches > mismatchesToDq))
            {
                return(null); // inserted sequence doesn't match read
            }
        }
    }

    // TODO update to use PositionMap class
    var newCigar = Helper.ConstructCigar(positionMap.Map);

    // TODO moved this, and probably should in original Hygea too?
    // Also, can cut down the calls to positionmap.First() in the original
    //var readHasPosition = positionMap.Any(p => p > 0); // Position map is one-based, so should be >, not >= 0.
    if (!positionMap.HasAnyMappableBases())
    {
        throw new InvalidDataException(string.Format("Trying to generate result and read does not have any alignable bases. ({0}, {1})",
            newCigar, string.Join(",", positionMap)));
    }

    var startIndexInReference = positionMap.FirstMappableBase() - 1; // Position map is one-based, so should be >, not >= 0.
    var startIndexInRefSequenceSnippet = startIndexInReference - refSequenceStartIndex;

    var newSummary = Extensions.GetAlignmentSummary(startIndexInRefSequenceSnippet, newCigar, refSequence,
        newReadSequence, _trackActualMismatches, _checkSoftclipsForMismatches);

    if (newSummary == null)
    {
        return(null);
    }

    return(new RealignmentResult()
    {
        Cigar = newCigar,
        NumIndels = newCigar.NumIndels(),
        Position = startIndexInReference + 1,
        NumMismatches = newSummary.NumMismatches,
        NumNonNMismatches = newSummary.NumNonNMismatches,
        NumSoftclips = newSummary.NumSoftclips,
        NumNonNSoftclips = newSummary.NumNonNSoftclips,
        NumDeletedBases = newSummary.NumDeletedBases,
        NumInsertedBases = newSummary.NumInsertedBases,
        NumMatches = newSummary.NumMatches,
        NumIndelBases = newSummary.NumIndelBases,
        NumMismatchesIncludeSoftclip = newSummary.NumMismatchesIncludeSoftclip,
        MismatchesIncludeSoftclip = newSummary.MismatchesIncludeSoftclip,
        Indels = StringifyIndel(priorIndel),
        NifiedAt = nifiedAt,
        IndelsAddedAt = new List <int> { priorIndel.Type == AlleleCategory.Insertion ? insertionPostionInReadStart : deletionPositionInRead },
        IsSketchy = isSketchy
    });
}