Esempio n. 1
0
        /// <summary>
        /// Computes an edit-distance style value for an alignment: the number of mismatching bases reported by
        /// <c>Helper.GetNumMismatches</c> plus the number of indel bases in the alignment's CIGAR.
        /// </summary>
        /// <param name="alignment">Alignment whose bases, position and CIGAR data are evaluated.</param>
        /// <returns>The mismatch count plus the indel base count.</returns>
        /// <exception cref="Exception">Thrown when the helper returns null for the mismatch count.</exception>
        public int GetNm(BamAlignment alignment)
        {
            // One position-map slot per read base; the alignment position is shifted by one before being
            // handed to the map (presumably 0-based -> 1-based; confirm against Read.UpdatePositionMap).
            var readToReference = new PositionMap(alignment.Bases.Length);
            Read.UpdatePositionMap(alignment.Position + 1, alignment.CigarData, readToReference);

            var reference     = _genomeSnippetSource.GetGenomeSnippet(alignment.Position);
            var mismatchCount = Helper.GetNumMismatches(alignment.Bases, readToReference,
                                                        reference.Sequence, reference.StartPosition);
            if (mismatchCount == null)
            {
                throw new Exception("Num mismatches is null");
            }

            return mismatchCount.Value + alignment.CigarData.NumIndelBases();
        }
Esempio n. 2
0
        /// <summary>
        /// Attempts to realign a read against the supplied indels, anchoring on one side of the read.
        /// </summary>
        /// <param name="indels">Candidate indels; the first entry is used to look up the reference context.</param>
        /// <param name="indelContexts">Reference snippet for each indel.</param>
        /// <param name="read">The read being realigned.</param>
        /// <param name="anchorOnLeft">Forwarded to <c>LayerOnIndels</c>; also reported in the debug log message.</param>
        /// <param name="details">Pre-computed details of the read (cigar, position map, N-prefix/suffix lengths, softclips).</param>
        /// <param name="pairSpecific">Forwarded to <c>LayerOnIndels</c>.</param>
        /// <param name="indexes">Maps a position within <paramref name="indels"/> to the index recorded in <c>AcceptedIndels</c>.</param>
        /// <returns>
        /// The populated result, or null when <c>LayerOnIndels</c> returns true, when an accepted deletion fails
        /// the right-anchor check, or when any exception is thrown along the way.
        /// </returns>
        private RealignmentResult RealignForAnchor(HashableIndel[] indels, Dictionary <HashableIndel, GenomeSnippet> indelContexts,
                                                   Read read, bool anchorOnLeft, ReadToRealignDetails details, bool pairSpecific, int[] indexes)
        {
            try
            {
                var cigarCopy = new CigarAlignment(details.FreshCigarWithoutTerminalNs);

                // Copy the position map entry by entry so that all later updates are applied to the copy.
                var mapLength  = details.PositionMapLength;
                var sourceMap  = details.PositionMapWithoutTerminalNs;
                var workingMap = new PositionMap(mapLength);
                for (var idx = 0; idx < mapLength; idx++)
                {
                    workingMap.UpdatePositionAtIndex(idx, sourceMap.GetPositionAtIndex(idx));
                }

                var result = new RealignmentResult();

                // Indels are already sorted by ascending position and get layered on one by one.
                // A true return value means this attempt is abandoned.
                var abandon = LayerOnIndels(indels, indelContexts, anchorOnLeft, details.SequenceWithoutTerminalNs,
                                            workingMap, ref result, pairSpecific);
                if (abandon)
                {
                    return null;
                }

                var context = indelContexts[indels[0]];

                // Softclip partial insertions at read ends
                if (_maskPartialInsertion || _minimumUnanchoredInsertionLength > 0)
                {
                    MaskPartialInsertion(indels, read, context.Sequence, result, context.StartPosition);
                }

                _softclipReapplier.ReapplySoftclips(read, details.NPrefixLength, details.NSuffixLength, workingMap, result, context,
                                                    details.PrefixSoftclip, details.SuffixSoftclip, cigarCopy);

                result.AcceptedIndels         = new List<int>();
                result.AcceptedHashableIndels = new List<HashableIndel>();
                for (var accepted = 0; accepted < result.AcceptedIndelsInSubList.Count; accepted++)
                {
                    var subIndex = result.AcceptedIndelsInSubList[accepted];
                    result.AcceptedIndels.Add(indexes[subIndex]);

                    var acceptedIndel = indels[subIndex];
                    result.AcceptedHashableIndels.Add(acceptedIndel);

                    if (acceptedIndel.Type != AlleleCategory.Deletion)
                    {
                        continue;
                    }

                    // TODO do we need to be more nuanced about this and only do it in duplication areas?
                    // Bases to the right of the deletion, excluding any trailing softclip.
                    var anchorStart      = result.IndelsAddedAt[accepted] + 1;
                    var finalOp          = result.Cigar[result.Cigar.Count - 1];
                    var trailingSoftclip = 0;
                    if (finalOp.Type == 'S')
                    {
                        trailingSoftclip = (int)finalOp.Length;
                    }
                    var rightAnchorLength = read.Sequence.Length - anchorStart - trailingSoftclip;

                    // When the right anchor is shorter than the deletion and reads exactly like the start of the
                    // deleted reference bases, the realignment is rejected.
                    var anchorIsShort = rightAnchorLength < acceptedIndel.Length && anchorStart < read.Sequence.Length;
                    if (anchorIsShort &&
                        read.Sequence.Substring(anchorStart, rightAnchorLength) ==
                        acceptedIndel.ReferenceAllele.Substring(1, rightAnchorLength))
                    {
                        return null;
                    }
                }

                // Only compute the mismatch-quality sum here if nothing upstream has filled it in.
                if (result.SumOfMismatchingQualities == null)
                {
                    result.SumOfMismatchingQualities = Helper.GetSumOfMismatchQualities(read.Qualities, read.Sequence,
                                                                                        workingMap, context.Sequence,
                                                                                        context.StartPosition);
                }

                result.Indels = string.Join("|", indels.Select(candidate => StringifyIndel(candidate)));

                return result;
            }
            catch (Exception ex)
            {
                // Best effort: any failure means "no realignment"; the details are only logged in debug mode.
                if (_debug)
                {
                    Logger.WriteExceptionToLog(new Exception($"Realign for anchor failed: read '{read.Name}' with indels {(string.Join("|", indels.Select(x => StringifyIndel(x))))}, anchoring on {(anchorOnLeft ? "left" : "right")}.", ex));
                }
                return null;
            }
        }
Esempio n. 3
0
        /// <summary>
        /// Re-attaches the N-prefix and N-suffix to a realignment result as softclip operations and, when
        /// softclip re-masking is enabled, re-applies softclipping and refreshes the result's position, CIGAR
        /// and mismatch/softclip statistics.
        /// </summary>
        /// <param name="read">The read being realigned; its sequence, qualities, name and CIGAR are read.</param>
        /// <param name="nPrefixLength">Number of leading positions to re-add (as -1 map entries and an 'S' op).</param>
        /// <param name="nSuffixLength">Number of trailing positions to re-add (as -1 map entries and an 'S' op).</param>
        /// <param name="positionMapWithoutTerminalNs">Position map to be padded with the prefix/suffix entries.</param>
        /// <param name="result">Result to update in place. A null result is a no-op.</param>
        /// <param name="context">Reference snippet (sequence and start position) the read is compared against.</param>
        /// <param name="prefixSoftclip">Forwarded to <c>Helper.SoftclipCigar</c>.</param>
        /// <param name="suffixSoftclip">Forwarded to <c>Helper.SoftclipCigar</c>.</param>
        /// <param name="freshCigarWithoutTerminalNs">Only used in the "no alignable bases" error message.</param>
        /// <exception cref="InvalidDataException">
        /// Thrown (re-masking path only) when the mismatch count comes back null or the final position map has
        /// no mappable bases.
        /// </exception>
        public void ReapplySoftclips(Read read, int nPrefixLength, int nSuffixLength, PositionMap positionMapWithoutTerminalNs,
                                     RealignmentResult result, GenomeSnippet context, uint prefixSoftclip, uint suffixSoftclip,
                                     CigarAlignment freshCigarWithoutTerminalNs)
        {
            // Guard before the first dereference of result so that a null result is a no-op instead of a
            // NullReferenceException.
            if (result == null)
            {
                return;
            }

            // Re-append the N-prefix and N-suffix: each re-added base gets a -1 entry in the position map.
            var nPrefixPositionMap = Enumerable.Repeat(-1, nPrefixLength);
            var nSuffixPositionMap = Enumerable.Repeat(-1, nSuffixLength);
            // TODO maybe have a function for combining pos maps instead
            var finalPositionMap = new PositionMap(nPrefixPositionMap.Concat(positionMapWithoutTerminalNs.Map).Concat(nSuffixPositionMap).ToArray());

            // Wrap the existing CIGAR in softclips of the prefix/suffix lengths, then compress the op list.
            var finalCigar = new CigarAlignment {
                new CigarOp('S', (uint)nPrefixLength)
            };

            foreach (CigarOp op in result.Cigar)
            {
                finalCigar.Add(op);
            }

            finalCigar.Add(new CigarOp('S', (uint)nSuffixLength));
            finalCigar.Compress();
            result.Cigar = finalCigar;

            // Without re-masking only the CIGAR is updated.
            if (!_remaskSoftclips)
            {
                return;
            }

            // In case realignment introduced a bunch of mismatch-Ms where there was previously softclipping, re-mask them.
            // The mismatch map is taken before the position map is marked with softclip entries below.
            var mismatchMap =
                Helper.GetMismatchMap(read.Sequence, finalPositionMap, context.Sequence, context.StartPosition);

            var keepSoftclips = _keepProbeSoftclips || _keepBothSideSoftclips;
            var softclipAdjustedCigar = Helper.SoftclipCigar(result.Cigar, mismatchMap, prefixSoftclip, suffixSoftclip,
                                                             maskNsOnly: _maskNsOnly,
                                                             prefixNs: Helper.GetCharacterBookendLength(read.Sequence, 'N', false),
                                                             suffixNs: Helper.GetCharacterBookendLength(read.Sequence, 'N', true),
                                                             softclipEvenIfMatch: keepSoftclips,
                                                             softclipRepresentsMess: !keepSoftclips);

            // Update position map to account for any softclipping added: clipped positions are set to -2.
            var adjustedPrefixClip = softclipAdjustedCigar.GetPrefixClip();
            for (var i = 0; i < adjustedPrefixClip; i++)
            {
                finalPositionMap.UpdatePositionAtIndex(i, -2, true);
            }

            var adjustedSuffixClip = softclipAdjustedCigar.GetSuffixClip();
            for (var i = 0; i < adjustedSuffixClip; i++)
            {
                finalPositionMap.UpdatePositionAtIndex(finalPositionMap.Length - 1 - i, -2, true);
            }

            var editDistance =
                Helper.GetNumMismatches(read.Sequence, finalPositionMap, context.Sequence, context.StartPosition);
            if (editDistance == null)
            {
                // This shouldn't happen at this point - we already have a successful result
                // NOTE(review): the PositionMap object itself is passed to string.Join here; unless PositionMap
                // is enumerable or overrides ToString, finalPositionMap.Map may have been intended - confirm.
                throw new InvalidDataException("Edit distance is null for :" + read.Name + " with position map " +
                                               string.Join(",", finalPositionMap) + " and CIGAR " + softclipAdjustedCigar);
            }

            // TODO PERF - See how much this really helps analytically. I'm thinking maybe kill this altogether and remove from eval
            var sumOfMismatching = Helper.GetSumOfMismatchQualities(mismatchMap, read.Qualities);

            if (!finalPositionMap.HasAnyMappableBases())
            {
                throw new InvalidDataException(string.Format(
                                                   "Read does not have any alignable bases. ({2} --> {0} --> {3}, {1})", freshCigarWithoutTerminalNs,
                                                   string.Join(",", finalPositionMap), read.CigarData, softclipAdjustedCigar));
            }

            result.Position      = finalPositionMap.FirstMappableBase(); // TODO this used to be >= 0 but changed to > 0. Confirm correct.
            result.Cigar         = softclipAdjustedCigar;
            result.NumMismatches = editDistance.Value;

            // Read-relative indexes were recorded without the N-prefix, so shift them by its length.
            result.IndelsAddedAt = result.IndelsAddedAt.Select(index => index + nPrefixLength).ToList();
            result.NifiedAt      = result.NifiedAt.Select(index => index + nPrefixLength).ToList();

            var newSummary = Extensions.GetAlignmentSummary(result.Position - 1 - context.StartPosition, result.Cigar,
                                                            context.Sequence,
                                                            read.Sequence, _trackActualMismatches, _checkSoftclipsForMismatches);

            result.NumNonNMismatches            = newSummary.NumNonNMismatches;
            result.NumNonNSoftclips             = newSummary.NumNonNSoftclips;
            result.NumSoftclips                 = newSummary.NumSoftclips;
            result.NumInsertedBases             = newSummary.NumInsertedBases;
            result.NumMismatchesIncludeSoftclip = newSummary.NumMismatchesIncludeSoftclip;
            result.SumOfMismatchingQualities    = sumOfMismatching;
            result.AnchorLength                 = newSummary.AnchorLength;
        }
Esempio n. 4
0
        /// <summary>
        /// Tries to introduce a single prior indel into a read by rewriting the read's position map, validates
        /// the placement, and builds a <c>RealignmentResult</c> describing the new alignment.
        /// </summary>
        /// <param name="readSequence">Bases of the read.</param>
        /// <param name="priorIndel">The indel to introduce (insertion or deletion).</param>
        /// <param name="refSequence">Reference snippet sequence, passed to <c>Extensions.GetAlignmentSummary</c>.</param>
        /// <param name="anchorLeft">
        /// When true the position map is scanned left-to-right and positions after the indel are shifted;
        /// when false it is scanned right-to-left and positions before the indel are shifted.
        /// </param>
        /// <param name="positionMap">
        /// Read-index to reference-position map. It is modified in place, and the modifications are NOT rolled
        /// back when the method subsequently returns null.
        /// </param>
        /// <param name="refSequenceStartIndex">Subtracted from the zero-based alignment start to index into <paramref name="refSequence"/>.</param>
        /// <param name="pairSpecific">When true, a sketchy anchor is flagged on the result (<c>IsSketchy</c>) instead of causing rejection.</param>
        /// <returns>The realignment result, or null when the indel cannot be placed or is rejected by one of the checks.</returns>
        /// <exception cref="Exception">Thrown for repeat indels when the computed anchor length is not smaller than the read length.</exception>
        /// <exception cref="InvalidDataException">Thrown when the updated position map has no mappable bases.</exception>
        private RealignmentResult AddIndelAndGetResult(string readSequence, HashableIndel priorIndel,
                                                       string refSequence, bool anchorLeft, PositionMap positionMap, int refSequenceStartIndex, bool pairSpecific)
        {
            // Read-relative indexes of where the indel lands; -1 means "not set".
            var  foundIndel = false;
            var  insertionPostionInReadStart     = -1;
            var  insertionPositionInReadEnd      = -1;
            var  deletionPositionInRead          = -1;
            bool anyPositionsAfterDeletionMapped = false;

            // TODO PERF can we bail out early if it's not possible that the indel could be inserted in the read, based on position?

            if (anchorLeft)
            {
                // move along position map to see if we can insert indel
                for (var i = 0; i < positionMap.Length; i++)
                {
                    if (positionMap.GetPositionAtIndex(i) == priorIndel.ReferencePosition && i != positionMap.Length - 1)  // make sure we don't end right before indel
                    {
                        foundIndel = true;

                        if (priorIndel.Type == AlleleCategory.Insertion)
                        {
                            insertionPostionInReadStart = i + 1;

                            // stick in -1 for insertion length, then adjust positions after
                            for (var j = i + 1; j < positionMap.Length; j++)
                            {
                                if (j - i <= priorIndel.Length)
                                {
                                    positionMap.UpdatePositionAtIndex(j, -1, true);
                                    // The insertion ends either at its full length or at the end of the read
                                    // (partial insertion), whichever comes first.
                                    if (j - i == priorIndel.Length || j == positionMap.Length - 1)
                                    {
                                        insertionPositionInReadEnd = j;
                                    }
                                }
                                else
                                {
                                    if (positionMap.GetPositionAtIndex(j) != -1) // preserve existing insertions
                                    {
                                        positionMap.UpdatePositionAtIndex(j, positionMap.GetPositionAtIndex(j) - priorIndel.Length);
                                    }
                                }
                            }
                            break;
                        }

                        if (priorIndel.Type == AlleleCategory.Deletion)
                        {
                            deletionPositionInRead = i;
                            // offset positions after deletion
                            for (var j = i + 1; j < positionMap.Length; j++)
                            {
                                if (positionMap.GetPositionAtIndex(j) != -1) // preserve existing insertions
                                {
                                    anyPositionsAfterDeletionMapped = true;
                                    positionMap.UpdatePositionAtIndex(j, positionMap.GetPositionAtIndex(j) + priorIndel.Length);
                                }
                            }
                            break;
                        }
                    }
                }
            }
            else
            {
                // walk backwards along position map to see if we can insert indel
                if (priorIndel.Type == AlleleCategory.Insertion)
                {
                    for (var i = positionMap.Length - 1; i >= 0; i--)
                    {
                        // Two ways to locate the insertion end: index i holds the reference position right
                        // after the insertion point (insertion ends at i - 1), or index i holds the insertion
                        // point itself (insertion ends at i).
                        if (positionMap.GetPositionAtIndex(i) == priorIndel.ReferencePosition + 1 && i != 0)
                        {
                            foundIndel = true;
                            insertionPositionInReadEnd = i - 1;
                        }
                        else if (positionMap.GetPositionAtIndex(i) == priorIndel.ReferencePosition && i != positionMap.Length - 1)
                        {
                            foundIndel = true;
                            insertionPositionInReadEnd = i;
                        }

                        if (foundIndel)
                        {
                            // stick in -1 for insertion length, then adjust positions
                            for (var j = insertionPositionInReadEnd; j >= 0; j--)
                            {
                                if (insertionPositionInReadEnd - j + 1 <= priorIndel.Length)
                                {
                                    positionMap.UpdatePositionAtIndex(j, -1, true);
                                    // The insertion starts either at its full length or at the start of the
                                    // read (partial insertion), whichever comes first.
                                    if (insertionPositionInReadEnd - j + 1 == priorIndel.Length || j == 0)
                                    {
                                        insertionPostionInReadStart = j;
                                    }
                                }
                                else
                                {
                                    if (positionMap.GetPositionAtIndex(j) != -1) // Don't update position map for things that were already -1
                                    {
                                        positionMap.UpdatePositionAtIndex(j, positionMap.GetPositionAtIndex(j) + priorIndel.Length);
                                    }
                                }
                            }

                            break;
                        }
                    }
                }
                else if (priorIndel.Type == AlleleCategory.Deletion)
                {
                    // Index 0 is never considered as the first base after the deletion (loop stops at i == 1).
                    for (var i = positionMap.Length - 1; i >= 1; i--)
                    {
                        if (positionMap.GetPositionAtIndex(i) == priorIndel.ReferencePosition + priorIndel.Length + 1) //deletions must be fully anchored to be observed
                        {
                            foundIndel = true;

                            deletionPositionInRead = i - 1;
                            // offset positions before the deletion (we are walking backwards)
                            for (var j = i - 1; j >= 0; j--)
                            {
                                if (positionMap.GetPositionAtIndex(j) != -1) // preserve existing insertions
                                {
                                    anyPositionsAfterDeletionMapped = true;
                                    positionMap.UpdatePositionAtIndex(j, positionMap.GetPositionAtIndex(j) - priorIndel.Length);
                                }
                            }

                            break;
                        }
                    }
                }
            }

            // Reject when the indel was not found, when a deletion left no mapped base on its shifted side,
            // or when the helper considers the resulting map invalid.
            //if (!foundIndel || !Helper.IsValidMap(positionMap, refSequence))
            //TODO changed this just for tailor
            if (!foundIndel || (priorIndel.Type == AlleleCategory.Deletion && !anyPositionsAfterDeletionMapped) || !Helper.IsValidMap(positionMap.Map))
            {
                return(null);
            }

            var isSketchy = false;

            if (priorIndel.IsRepeat)
            {
                //if (priorIndel.Type == AlleleCategory.Deletion)
                //{
                //    if (Helper.RepeatDeletionFlankedByRepeats(readSequence, priorIndel, deletionPositionInRead))
                //    {
                //        return null;
                //    }
                //}

                //// TODO in the case of using sketchy anchor test:
                //// Ideally, we'd check the anchor length against how many repeats are in the reference vs the variant,
                //// ... Or maybe just always check the whole anchor if it's a repeat.
                // Anchor length is the shorter of the two read segments on either side of the indel.
                var anchorLength = priorIndel.Type == AlleleCategory.Insertion ? Math.Min(insertionPostionInReadStart, readSequence.Length - insertionPositionInReadEnd) : Math.Min(deletionPositionInRead, readSequence.Length - deletionPositionInRead);
                if (anchorLength >= readSequence.Length)
                {
                    throw new Exception("Anchor should never be longer than read length."); // TODO remove after dev.
                }
                // Short anchors (under 10 bases, or under the indel length if that is larger) get extra scrutiny:
                // a sketchy anchor is rejected outright unless pairSpecific, in which case it is only flagged.
                if (anchorLength < Math.Max(10, priorIndel.Length))
                {
                    if (priorIndel.Type == AlleleCategory.Deletion)
                    {
                        if (Helper.DeletionHasSketchyAnchor(readSequence, priorIndel, deletionPositionInRead))
                        {
                            if (pairSpecific)
                            {
                                isSketchy = true;
                            }
                            else
                            {
                                return(null);
                            }
                        }
                    }
                    else
                    {
                        if (priorIndel.NumBasesInReferenceSuffixBeforeUnique >= anchorLength)
                        {
                            if (pairSpecific)
                            {
                                isSketchy = true;
                            }
                            else
                            {
                                return(null);
                            }
                        }
                    }
                }
            }

            // TODO do we need to be more nuanced about this and only do it in duplication areas?
            // Reject a deletion whose right-hand read remainder is shorter than the deletion and is identical
            // to the leading deleted reference bases (ReferenceAllele minus its first base).
            if (priorIndel.Type == AlleleCategory.Deletion)
            {
                var anchorStart       = deletionPositionInRead + 1;
                var rightAnchorLength = readSequence.Length - anchorStart;
                if (rightAnchorLength < priorIndel.Length)
                {
                    if (anchorStart < readSequence.Length)
                    {
                        if (readSequence.Substring(anchorStart) ==
                            priorIndel.ReferenceAllele.Substring(1, rightAnchorLength))
                        {
                            return(null);
                        }
                    }
                }
            }

            if (priorIndel.IsDuplication && priorIndel.Type == AlleleCategory.Insertion)
            {
                // TODO return to this - I think the thought was to prevent FP dups, but the implementation may have been wrong
                // No partial duplications?
                //if (readSequence.Length - insertionPositionInReadEnd <= priorIndel.Length)

                // NOTE(review): 3 is an unexplained threshold on the distance from the insertion end to the
                // end of the read - confirm its origin.
                if (readSequence.Length - insertionPositionInReadEnd <= 3)
                {
                    // Assumes priors are left-aligned
                    return(null);
                }
            }

            //verify insertion matches
            var newReadSequence = readSequence;
            var nifiedAt        = new List <int>();

            if (priorIndel.Type == AlleleCategory.Insertion)
            {
                if (insertionPostionInReadStart == -1 || insertionPositionInReadEnd == -1)
                {
                    return(null); // weird, this shouldn't ever happen
                }
                var readInsertedSequence = readSequence.Substring(insertionPostionInReadStart,
                                                                  insertionPositionInReadEnd - insertionPostionInReadStart + 1);

                // Inserted bases of the prior: the alternate allele without its first base.
                var indelSequence = priorIndel.AlternateAllele.Substring(1);

                if (anchorLeft && readInsertedSequence.Length < indelSequence.Length && priorIndel.NumApproxDupsRight > 0)
                {
                    // Don't allow partial realignment to dups
                    return(null);
                }
                if (!anchorLeft && readInsertedSequence.Length < indelSequence.Length && priorIndel.NumApproxDupsLeft > 0)
                {
                    // Don't allow partial realignment to dups
                    return(null);
                }

                // For a partial insertion compare against the matching end of the prior's inserted bases:
                // its head when anchored left, its tail when anchored right.
                var clippedPriorSequence = anchorLeft
                    ? indelSequence.Substring(0, readInsertedSequence.Length)
                    : indelSequence.Substring(indelSequence.Length - readInsertedSequence.Length);

                var isMismatch = readInsertedSequence != clippedPriorSequence;
                if (isMismatch)
                {
                    // mismatches stays null unless mismatching bases are allowed for this insertion; a null
                    // value is rejected below.
                    int?mismatches     = null;
                    var mismatchesToDq = 0d;
                    if (priorIndel.Length >= _minInsertionSizeToAllowMismatchingBases && !(priorIndel.NumApproxDupsLeft + priorIndel.NumApproxDupsRight > 0))
                    {
                        mismatches = Helper.GetHammingNumMismatches(readInsertedSequence, clippedPriorSequence);

                        // Disqualification threshold scales with the full prior length, not the clipped length.
                        mismatchesToDq = priorIndel.Length * _maxProportionInsertSequenceMismatch;

                        if (mismatches > mismatchesToDq)
                        {
                            //Console.WriteLine(
                            //    $"Too many mismatches between insertions: {mismatches} > {maxAllowedMismatches} ({clippedPriorSequence} vs {readInsertedSequence})");
                        }
                        else
                        {
                            //Console.WriteLine(
                            //    $"Able to Nify mismatches between insertions: {mismatches} <= {maxAllowedMismatches} ({clippedPriorSequence} vs {readInsertedSequence})");

                            var newSequence =
                                Helper.NifyMismatches(clippedPriorSequence, readInsertedSequence, nifiedAt);
                            // TODO PERF is this actually necessary now that we're not actually Nifying? We can just keep the bases that we're Nifying at.
                            // The replacement inserted sequence is spliced into the read in lower case.
                            newReadSequence = readSequence.Substring(0, insertionPostionInReadStart) +
                                              newSequence.ToLower() +
                                              readSequence.Substring(insertionPositionInReadEnd + 1);
                            // Convert indexes relative to the inserted sequence into read-relative indexes.
                            nifiedAt = nifiedAt.Select(x => x + insertionPostionInReadStart).ToList();
                        }
                    }

                    if (mismatches == null || (mismatches > mismatchesToDq))
                    {
                        return(null); // inserted sequence doesn't match read
                    }
                }
            }

            // TODO update to use PositionMap class
            var newCigar = Helper.ConstructCigar(positionMap.Map);

            // TODO moved this, and probably should in original Hygea too?
            // Also, can cut down the calls to positionmap.First() in the original
            //var readHasPosition = positionMap.Any(p => p > 0); // Position map is one-based, so should be >, not >= 0.
            if (!positionMap.HasAnyMappableBases())
            {
                // NOTE(review): the PositionMap object itself is passed to string.Join here; unless PositionMap
                // is enumerable or overrides ToString, positionMap.Map may have been intended - confirm.
                throw new InvalidDataException(string.Format("Trying to generate result and read does not have any alignable bases. ({0}, {1})", newCigar, string.Join(",", positionMap)));
            }

            var startIndexInReference          = positionMap.FirstMappableBase() - 1; // Position map is one-based, so should be >, not >= 0.
            var startIndexInRefSequenceSnippet = startIndexInReference - refSequenceStartIndex;

            var newSummary = Extensions.GetAlignmentSummary(startIndexInRefSequenceSnippet, newCigar, refSequence,
                                                            newReadSequence, _trackActualMismatches, _checkSoftclipsForMismatches);

            if (newSummary == null)
            {
                return(null);
            }

            // Position is reported one-based again; IndelsAddedAt holds the read index where the indel was placed.
            return(new RealignmentResult()
            {
                Cigar = newCigar,
                NumIndels = newCigar.NumIndels(),
                Position = startIndexInReference + 1,
                NumMismatches = newSummary.NumMismatches,
                NumNonNMismatches = newSummary.NumNonNMismatches,
                NumSoftclips = newSummary.NumSoftclips,
                NumNonNSoftclips = newSummary.NumNonNSoftclips,
                NumDeletedBases = newSummary.NumDeletedBases,
                NumInsertedBases = newSummary.NumInsertedBases,
                NumMatches = newSummary.NumMatches,
                NumIndelBases = newSummary.NumIndelBases,
                NumMismatchesIncludeSoftclip = newSummary.NumMismatchesIncludeSoftclip,
                MismatchesIncludeSoftclip = newSummary.MismatchesIncludeSoftclip,
                Indels = StringifyIndel(priorIndel),
                NifiedAt = nifiedAt,
                IndelsAddedAt = new List <int> {
                    priorIndel.Type == AlleleCategory.Insertion ? insertionPostionInReadStart : deletionPositionInRead
                },
                IsSketchy = isSketchy
            });
        }