Exemplo n.º 1
0
        /// <summary>
        /// Exercises <c>UpdateIntTagData</c> on an alignment whose tag data starts without an "NM" tag:
        /// the tag must stay absent unless the add-if-missing flag is passed, and an existing tag
        /// must be overwritten with the new value.
        /// </summary>
        public void UpdateIntTagData_Tests()
        {
            var tagUtils = new TagUtils();

            byte[] tagData = tagUtils.ToBytes();
            var alignment = new BamAlignment
            {
                TagData = tagData
            };

            // No NM tag to begin with, and adding was not requested: the tag must remain absent.
            alignment.UpdateIntTagData("NM", 4);
            Assert.Null(alignment.GetIntTag("NM"));

            // No NM tag to begin with, but adding was requested: the tag is created with the given value.
            alignment.UpdateIntTagData("NM", 4, true);
            Assert.Equal(4, alignment.GetIntTag("NM"));

            // An NM tag now exists: it is updated in place without needing the add flag.
            alignment.UpdateIntTagData("NM", 3);
            Assert.Equal(3, alignment.GetIntTag("NM"));
        }
Exemplo n.º 2
0
        /// <summary>
        /// Checks whether indels present in only one read of a pair are contradicted by the mate, and
        /// optionally softclips the weaker read after the disagreeing indel.
        /// </summary>
        /// <param name="r1IndelPositions">Indel sites of read 1; may be null.</param>
        /// <param name="r2IndelPositions">Indel sites of read 2; may be null.</param>
        /// <param name="read1">First read of the pair.</param>
        /// <param name="read2">Second read of the pair.</param>
        /// <param name="disagree">Set to true when an indel unique to one read is reported by
        /// <c>AnyIndelCoveredInMate</c> against the other read; false otherwise.</param>
        /// <param name="mismatchesAllowed">Maximum NM value for a read to count as clean.</param>
        /// <param name="r1IndelAdjustment">Amount subtracted from read 1's NM to get its adjusted NM.</param>
        /// <param name="r2IndelAdjustment">Amount subtracted from read 2's NM to get its adjusted NM.</param>
        /// <param name="softclipWeakOne">When true, <c>SoftclipAfterIndel</c> is called on the read that is
        /// not considered good, at the disagreeing position.</param>
        /// <param name="r1Nm">NM value for read 1; when null the "NM" tag is read from <paramref name="read1"/>.</param>
        /// <param name="r2Nm">NM value for read 2; when null the "NM" tag is read from <paramref name="read2"/>.</param>
        /// <returns>A new list holding <paramref name="read1"/> followed by <paramref name="read2"/>.</returns>
        private static List<BamAlignment> IndelsDisagreeWithStrongMate(List<IndelSite> r1IndelPositions,
                                                                       List<IndelSite> r2IndelPositions, BamAlignment read1,
                                                                       BamAlignment read2, out bool disagree, int mismatchesAllowed = 1, int r1IndelAdjustment = 0,
                                                                       int r2IndelAdjustment = 0, bool softclipWeakOne = true, int? r1Nm = null, int? r2Nm = null)
        {
            // TODO maybe also check if one of the reads has ins AND del
            // TODO if we've grabbed this info here, propagate it out so we don't do it twice
            // TODO indel adjustment should only actually remove insertions, no??
            var read1Nm = r1Nm ?? read1.GetIntTag("NM");
            var read2Nm = r2Nm ?? read2.GetIntTag("NM");
            var read1AdjustedNm = read1Nm - r1IndelAdjustment;
            var read2AdjustedNm = read2Nm - r2IndelAdjustment;

            disagree = false;

            // Uniqueness can only be computed when both lists exist; otherwise each list stands as-is.
            // A missing list is treated as "no indels" so the emptiness checks below cannot throw on null.
            var bothListsPresent = r1IndelPositions != null && r2IndelPositions != null;

            var r1IndelPositionsUnique = (bothListsPresent
                                             ? GetUniqueIndelSites(r1IndelPositions, r2IndelPositions)
                                             : r1IndelPositions) ?? new List<IndelSite>();

            var r2IndelPositionsUnique = (bothListsPresent
                                             ? GetUniqueIndelSites(r2IndelPositions, r1IndelPositions)
                                             : r2IndelPositions) ?? new List<IndelSite>();

            // No sense doing further checks if there's nothing to disagree over...
            if (r1IndelPositionsUnique.Any() || r2IndelPositionsUnique.Any())
            {
                // Comparisons on a null NM / null count evaluate to false, so a read without NM is never "good".
                var r1AdjustedClean = read1AdjustedNm <= mismatchesAllowed;
                var r2AdjustedClean = read2AdjustedNm <= mismatchesAllowed;
                var r1Clean = read1Nm <= mismatchesAllowed;
                var r2Clean = read2Nm <= mismatchesAllowed;
                var r1NumIndels = r1IndelPositions?.Count;
                var r2NumIndels = r2IndelPositions?.Count;
                var r1IsGood = r1AdjustedClean && (r1Clean || r1NumIndels <= 1);
                var r2IsGood = r2AdjustedClean && (r2Clean || r2NumIndels <= 1);

                if ((read1Nm != null && read2Nm != null) && (r1IsGood || r2IsGood))
                {
                    if (r1IsGood)
                    {
                        // Read 1 is trusted: first look for read 2 indels that read 1 contradicts.
                        var disagreeingPos = AnyIndelCoveredInMate(r2IndelPositionsUnique, read1, read2);

                        if (disagreeingPos != null)
                        {
                            disagree = true;
                            if (softclipWeakOne && !r2IsGood)
                            {
                                SoftclipAfterIndel(read2, read2.IsReverseStrand(), disagreeingPos.Value);
                            }
                        }
                        else if (AnyIndelCoveredInMate(r1IndelPositionsUnique, read2, read1) != null)
                        {
                            // Nothing wrong with read 2's indels; the opposite direction still counts
                            // as a disagreement, but nothing is softclipped.
                            disagree = true;
                        }
                    }
                    else
                    {
                        // Only read 2 is trusted here (read 1 is known not to be good).
                        var disagreeingPos = AnyIndelCoveredInMate(r1IndelPositionsUnique, read2, read1);

                        if (disagreeingPos != null)
                        {
                            disagree = true;
                            if (softclipWeakOne)
                            {
                                SoftclipAfterIndel(read1, read1.IsReverseStrand(), disagreeingPos.Value);
                            }
                        }
                        else if (AnyIndelCoveredInMate(r2IndelPositionsUnique, read1, read2) != null)
                        {
                            disagree = true;
                        }
                    }
                }
            }

            // If both are good, and they disagree, should still say they disagree?

            return new List<BamAlignment>
            {
                read1, read2
            };
        }
Exemplo n.º 3
0
        /// <summary>
        /// Finds the indels in a read via <paramref name="targetFinder"/> and updates the metrics kept in
        /// <paramref name="lookup"/> for each indel and, when the read has several indels, for the combined
        /// ("|"-joined) multi-indel key.
        /// </summary>
        /// <param name="bamAlignment">Read to inspect.</param>
        /// <param name="targetFinder">Finder used to extract indels from the read.</param>
        /// <param name="lookup">Evidence keyed by indel string; passed to <c>IndelMetrics</c> for each key.</param>
        /// <param name="isReputable">Forwarded to the metric-update helpers.</param>
        /// <param name="chrom">Chromosome name handed to the finder.</param>
        /// <param name="minMapQuality">The read's map quality must be strictly greater than this value.</param>
        /// <param name="stitched">Forwarded to the metric-update helpers.</param>
        /// <returns>
        /// The indels returned by the finder, or null when the read is skipped (map quality too low,
        /// at most one CIGAR operation, or not a primary alignment).
        /// </returns>
        public static List<PreIndel> FindIndelsAndRecordEvidence(BamAlignment bamAlignment, IndelTargetFinder targetFinder, Dictionary<string, IndelEvidence> lookup,
                                                                 bool isReputable, string chrom, int minMapQuality, bool stitched = false)
        {
            // TODO define whether we want to collect indels from supplementaries. I think we probably do...
            // TODO do we want to collect indels from duplicates?
            // Requiring more than one CIGAR op is a cheap pre-filter; it assumes no read is entirely I or entirely D.
            var isCandidateRead = bamAlignment.MapQuality > minMapQuality
                                  && bamAlignment.CigarData.Count > 1
                                  && bamAlignment.IsPrimaryAlignment();
            if (!isCandidateRead)
            {
                return null;
            }

            var foundIndels = targetFinder.FindIndels(bamAlignment, chrom);
            if (!foundIndels.Any())
            {
                return foundIndels;
            }

            // TODO this doesn't support nm from stitched, which is not in a tag. Need to pass it in!!
            var totalNm = bamAlignment.GetIntTag("NM") ?? 0;

            var hasSeveralIndels = foundIndels.Count > 1;
            var spanCoveringAllIndels = 0;
            if (hasSeveralIndels)
            {
                var finalIndel = foundIndels[foundIndels.Count - 1];
                var variationStart = foundIndels[0].ReferencePosition;
                var variationEnd = finalIndel.Type == AlleleCategory.Deletion
                    ? finalIndel.ReferencePosition + 1
                    : finalIndel.ReferencePosition + finalIndel.Length;
                spanCoveringAllIndels = variationEnd - variationStart;
            }

            // TODO do we want to collect info here for individual indels if they are only seen in multis?
            // Individual indels are only recorded for a multi-indel read when the indels are far enough apart
            // that other reads are likely to cover just one of them.
            var recordIndividually = !hasSeveralIndels || spanCoveringAllIndels > 25; // TODO magic number
            if (recordIndividually)
            {
                foreach (var single in foundIndels)
                {
                    // TODO less gnarly
                    var singleMetrics = IndelMetrics(lookup, single.ToString());
                    UpdateIndelMetrics(bamAlignment, isReputable, stitched, singleMetrics, single, totalNm);
                }
            }

            if (hasSeveralIndels)
            {
                // TODO less gnarly
                var combinedKey = string.Join("|", foundIndels.Select(i => i.ToString()));
                var combinedMetrics = IndelMetrics(lookup, combinedKey);

                // TODO - are read-level repeats that informative? Because this is kind of a perf burden
                // Read-level repeat detection is switched off for that reason, so the flag is always false.
                var readLevelRepeat = false;

                AddReadLevelIndelMetrics(bamAlignment, isReputable, stitched, combinedMetrics, readLevelRepeat);
                AddMultiIndelMetrics(combinedMetrics, foundIndels, totalNm);
            }

            return foundIndels;
        }
Exemplo n.º 4
0
        /// <summary>
        /// Decides whether a read should be realigned around nearby candidate indels and returns either the
        /// original alignment or the result of <c>AcceptRealignment</c>. Along the way it records status
        /// counts/tags through <c>_statusCounter</c> and updates indel outcomes.
        /// </summary>
        /// <param name="origBamAlignment">The read as originally aligned.</param>
        /// <param name="changed">Set to false on every path visible here; on the accepted-realignment path it is
        /// set by <c>AcceptRealignment</c>.</param>
        /// <param name="forcedSoftclip">Starts false; passed by ref to <c>HandleFailedRealignment</c> and set to
        /// true when an existing indel is softclipped out in the null-result block.</param>
        /// <param name="confirmed">True when the read's existing indel is already the top candidate, when the
        /// realignment is judged unchanged, or when a failed realignment still leaves a very good / hard-to-call
        /// existing match; on the accepted-realignment path it is set by <c>AcceptRealignment</c>.</param>
        /// <param name="sketchy">Set to true only when an accepted realignment reports <c>IsSketchy</c>.</param>
        /// <param name="selectedIndels">Forwarded to <c>AcceptRealignment</c>; being non-null also enables the
        /// <c>RealignmentBetterOrEqual</c> acceptance test.</param>
        /// <param name="existingIndels">Indels already present in the read, matched against the candidates.</param>
        /// <param name="assumeImperfect">NOTE(review): not referenced anywhere in this method body.</param>
        /// <param name="confirmedAccepteds">Indels already confirmed/accepted (presumably for the mate — confirm
        /// with caller); newly confirmed indels are appended to it when it is available.</param>
        /// <param name="mateIndels">Forwarded to <c>_indelSource.GetRelevantIndels</c>.</param>
        /// <returns>The original alignment when nothing is accepted, otherwise whatever
        /// <c>AcceptRealignment</c> returns.</returns>
        public BamAlignment GetFinalAlignment(BamAlignment origBamAlignment, out bool changed, out bool forcedSoftclip, out bool confirmed, out bool sketchy,
                                              List <PreIndel> selectedIndels = null, List <PreIndel> existingIndels          = null,
                                              bool assumeImperfect           = true, List <HashableIndel> confirmedAccepteds = null, List <PreIndel> mateIndels = null)
        {
            sketchy        = false;
            forcedSoftclip = false;
            bool forcedAlignment = false;
            var  presumeStartPositionForForcedAlignment = 0;

            // A read with an empty CIGAR is always treated as a "forced alignment"; only the presumed
            // start position differs between the two branches below.
            if (origBamAlignment.CigarData.Count == 0)
            {
                // This was something weird that came up in the halo dataset... mapq is 0 but is still mapped, no cigar

                if (origBamAlignment.Position <= 0 && origBamAlignment.FragmentLength != 0) // No sense trying to fiddle with the position otherwise
                {
                    // TODO does this really even move the needle? Is it helping enough to outweigh its weirdness?
                    var presumedEndPosition = origBamAlignment.MatePosition < origBamAlignment.Position
                        ? origBamAlignment.MatePosition - origBamAlignment.FragmentLength
                        : origBamAlignment.MatePosition + origBamAlignment.FragmentLength;
                    presumeStartPositionForForcedAlignment = presumedEndPosition - origBamAlignment.Bases.Length;
                    forcedAlignment = true;
                }
                else
                {
                    presumeStartPositionForForcedAlignment = origBamAlignment.Position;
                    forcedAlignment = true;
                }
            }

            var  anyIndelsAtAll = _regionFilterer.AnyIndelsNearby(origBamAlignment.Position);
            bool isRealignable  = true;

            if (anyIndelsAtAll)
            {
                // Imperfect = has a suspect CIGAR op, a positive NM tag, or needed a forced alignment.
                // (Presumably GetIntTag returns a nullable int, in which case a missing NM compares as false — confirm.)
                var isImperfectRead = false || ((origBamAlignment.ContainsDisallowedCigarOps(_suspectCigarOps) ||
                                                 origBamAlignment.GetIntTag("NM") > 0 || forcedAlignment));
                var isReadWorthCaringAbout = !origBamAlignment.IsDuplicate() && !origBamAlignment.IsSecondary();
                // Reads made of a single repeated base are never realigned.
                isRealignable = isImperfectRead && isReadWorthCaringAbout && origBamAlignment.Bases.Distinct().Count() > 1;
            }
            else
            {
                _statusCounter.AddStatusCount("No indels nearby at all");
                isRealignable = false;
            }

            if (!isRealignable)
            {
                confirmed = false;
                changed   = false;
                sketchy   = false;
                return(origBamAlignment);
            }

            // TODO maybe flag (or return all) if there's a lot or high quality stuff that we're missing! Esp with pair specific
            var indels = _indelSource.GetRelevantIndels(forcedAlignment ? presumeStartPositionForForcedAlignment : origBamAlignment.Position,
                                                        mateIndels, confirmedAccepteds);

            // Don't realign around single indels if we already have them
            bool          hasExistingUnsanctionedIndels = false;
            bool          existingSanctionedIndelIsBest = false;
            bool          hasVeryGoodIndel       = false;
            bool          hasHardToCallIndel     = false;
            var           existingMatches        = new List <PreIndel>();
            HashableIndel existingConfirmedIndel = new HashableIndel();
            var           existingMatchHashables = new List <HashableIndel>();

            // Compare the candidate indels against the indels the read already carries.
            if (indels.Any() && existingIndels != null && existingIndels.Any())
            {
                var topScore             = (float)(indels.Max(x => x.Key.Score));
                var matchesFound         = 0;
                var nonPreExistingIndels = new List <KeyValuePair <HashableIndel, GenomeSnippet> >();

                var index = 0;
                foreach (var kvp in indels)
                {
                    var indel   = kvp.Key;
                    var matches = existingIndels.Where(e => Helper.IsMatch(e, indel));
                    var isMatch = matches.Any();
                    if (isMatch)
                    {
                        matchesFound++;

                        // Only the first candidate in enumeration order (index 0) can mark the existing indel
                        // as "best", and only if it is not part of a multi-indel.
                        if (!indel.InMulti && index == 0)
                        {
                            existingSanctionedIndelIsBest = true;
                            existingConfirmedIndel        = indel;
                        }

                        var proportionOfTopScore = indel.Score / (float)topScore;
                        if (proportionOfTopScore >= 0.75)
                        {
                            hasVeryGoodIndel = true;
                        }

                        if (indel.HardToCall)
                        {
                            hasHardToCallIndel = true;
                        }

                        existingMatches.AddRange(matches);

                        // TODO do we need special handling of multis?
                        existingMatchHashables.Add(indel);
                    }

                    // Candidates that are part of a multi-indel stay eligible for realignment even when matched.
                    if (!isMatch || indel.InMulti)
                    {
                        nonPreExistingIndels.Add(kvp);
                    }


                    index++;
                }

                // TODO do we actually want to replace indels with non-pre-existing only?
                indels = nonPreExistingIndels;

                // The read has indels, but none of them match any candidate.
                if (matchesFound == 0)
                {
                    hasExistingUnsanctionedIndels = true;
                }
            }

            // TODO this precludes us from having good multis
            if (existingSanctionedIndelIsBest)
            {
                // If it already had the top ranked indel, there's not really any point in trying to realign around others (here we assume that it's also the best fitting indel for the read, hence why it was originally called by the regular aligner).
                _statusCounter.AddStatusCount("Existing indel is already the best available");
                changed   = false;
                confirmed = true;

                UpdateOutcomeForConfirmed(existingConfirmedIndel);

                // Reassigning the parameter here only affects this method's local reference; a caller that
                // passed null does not see the new list.
                if (confirmedAccepteds == null)
                {
                    confirmedAccepteds = new List <HashableIndel>();
                }

                confirmedAccepteds.Add(existingConfirmedIndel);

                return(origBamAlignment);
            }


            // Give up when there is nothing left to realign to, or the read spans more than 500 reference positions.
            if (!indels.Any() || origBamAlignment.EndPosition - origBamAlignment.Position > 500)
            {
                if (!indels.Any())
                {
                    // TODO maybe do the forced softclip here if the read did have indels?
                    _statusCounter.AddStatusCount("No indels to realign to");
                    _statusCounter.AppendStatusStringTag("RX", $"{origBamAlignment.GetStringTag("RX")},No indels to realign to", origBamAlignment);
                }
                else
                {
                    _statusCounter.AddStatusCount("Alignment reference span longer than we can realign to");
                }
                changed   = false;
                confirmed = false;
                return(origBamAlignment);
            }



            // TODO this should relate to cap on indel size... introducing too large of an indel will make us go beyond this context.
            var context       = indels.First().Value;
            var orderedIndels = indels.Select(x => x.Key).ToList();
            var numIndels     = orderedIndels.Count;

            _statusCounter.AddStatusCount("Realigning to " + numIndels);

            // Work on a copy so the original alignment is left as-is while attempting the realignment.
            var bamAlignment = new BamAlignment(origBamAlignment);

            if (forcedAlignment)
            {
                // Give the CIGAR-less read an all-match CIGAR at the presumed start position.
                bamAlignment.CigarData = new CigarAlignment(origBamAlignment.Bases.Length + "M");
                bamAlignment.Position  = presumeStartPositionForForcedAlignment;
            }

            var realignResult = _readRealigner.Realign(new Read(_chromosome, bamAlignment),
                                                       orderedIndels, indels.ToDictionary(x => x.Key, x => x.Value), confirmedAccepteds != null && confirmedAccepteds.Any());

            var acceptedIndels = realignResult?.AcceptedIndels;
            var hasAnyIndels   = acceptedIndels != null && acceptedIndels.Any();

            if (realignResult != null)
            {
                _statusCounter.AddStatusCount("Able to realign at all (may still be worse than original)");
                _statusCounter.AppendStatusStringTag("RX", "Able to realign at all(may still be worse than original)", bamAlignment);
            }
            else
            {
                _statusCounter.AddStatusCount("Not able to realign at all");
                _statusCounter.AppendStatusStringTag("RX", "Not able to realign at all", origBamAlignment);
            }

            AlignmentSummary originalAlignmentSummary = null;
            // Stays true when realignResult is null, because it is only reassigned inside the block below.
            var realignmentUnchanged = true;

            if (realignResult != null)
            {
                originalAlignmentSummary =
                    Extensions.GetAlignmentSummary((new Read(_chromosome, origBamAlignment)), context.Sequence,
                                                   _trackActualMismatches, _checkSoftclipsForMismatches, context.StartPosition);

                realignmentUnchanged = _judger.RealignmentIsUnchanged(realignResult, origBamAlignment);

                if (originalAlignmentSummary.NumMismatches > 0)
                {
                    // TODO PERF do we still want to use this ever?
                    var sumMismatch = Helper.GetSumOfMismatchQualities(origBamAlignment.Qualities,
                                                                       origBamAlignment.Bases, new Read(_chromosome, origBamAlignment).PositionMap, context.Sequence,
                                                                       context.StartPosition);
                    originalAlignmentSummary.SumOfMismatchingQualities = sumMismatch;
                }

                // Within this logic also checking the same as "!realignmentUnchanged" above.. consolidate this.
                // NOTE(review): && binds tighter than ||, so this reads as
                // (selectedIndels != null && RealignmentBetterOrEqual(...)) || ResultIsGoodEnough(...);
                // ResultIsGoodEnough alone is sufficient even when selectedIndels is null — confirm that is intended.
                if (selectedIndels != null &&
                    (_judger.RealignmentBetterOrEqual(realignResult, originalAlignmentSummary, confirmedAccepteds != null && confirmedAccepteds.Any())) ||
                    ResultIsGoodEnough(realignResult, origBamAlignment, originalAlignmentSummary,
                                       realignmentUnchanged, confirmedAccepteds != null && confirmedAccepteds.Any()))
                {
                    UpdateIndelOutcomes(numIndels, orderedIndels, hasAnyIndels, acceptedIndels, confirmedAccepteds, true, realignResult);

                    if (realignResult.IsSketchy)
                    {
                        sketchy = true;
                    }
                    return(AcceptRealignment(origBamAlignment, out changed, selectedIndels, existingIndels, realignResult, originalAlignmentSummary, bamAlignment, hasExistingUnsanctionedIndels, out confirmed));
                }
            }


            // At this point, any good realignment would have been returned. If it's realigned and changed now, it's an unaccepted (not good enough) realignment.
            // If it had an indel to begin with, it's basically a vote that we don't trust that indel. Optionally softclip it out.

            if (!realignmentUnchanged)
            {
                changed   = false;
                confirmed = false;

                HandleFailedRealignment(origBamAlignment, ref forcedSoftclip, existingIndels, realignResult, hasExistingUnsanctionedIndels, existingMatches);

                if ((hasVeryGoodIndel || (hasHardToCallIndel && _judger.IsVeryConfident(originalAlignmentSummary))) && !hasExistingUnsanctionedIndels && existingMatchHashables.Any())
                {
                    // It didn't have the tip-top indel, but it had one that was very close, and we tried realigning around the top guys and failed - this one looks better. Give it credit.
                    confirmed = true;
                    foreach (var indel in existingMatchHashables)
                    {
                        UpdateOutcomeForConfirmed(indel);

                        if (confirmedAccepteds != null)
                        {
                            confirmedAccepteds.Add(indel);
                        }
                    }
                }
                UpdateIndelOutcomes(numIndels, orderedIndels, hasAnyIndels, acceptedIndels, confirmedAccepteds, false, realignResult);
            }
            else
            {
                // NOTE(review): this branch is also taken when realignResult is null (realignmentUnchanged keeps
                // its initial value of true), so a read that could not be realigned at all is reported as
                // "Unchanged" with confirmed = true — confirm that is intended.
                if (acceptedIndels != null)
                {
                    // acceptedIndels holds indices into orderedIndels.
                    foreach (var indelNum in acceptedIndels)
                    {
                        var indel = orderedIndels[indelNum];

                        UpdateOutcomeForConfirmed(indel);
                    }
                }

                _statusCounter.AddStatusCount("INDEL STATUS\tUnchanged\t" + realignResult?.Indels);
                _statusCounter.AppendStatusStringTag("RX", "Unchanged: " + realignResult?.Indels, origBamAlignment);

                confirmed = true;
                changed   = false;
                return(origBamAlignment);
            }

            // NOTE(review): execution only gets here via the !realignmentUnchanged branch, which requires a
            // non-null realignResult, so the block below appears to be unreachable as written.
            if (realignResult == null)
            {
                if (_softclipUnknownIndels && hasExistingUnsanctionedIndels)
                {
                    var unsanctioned = existingIndels.Where(x => !existingMatches.Contains(x));

                    foreach (var preIndel in unsanctioned.OrderBy(x => x.ReferencePosition))
                    {
                        // Clip on the side with the shorter anchor.
                        var reverseClip = false;
                        var clipLength  = preIndel.RightAnchor;
                        if (preIndel.LeftAnchor < preIndel.RightAnchor)
                        {
                            reverseClip = true;
                            clipLength  = preIndel.LeftAnchor;
                        }

                        // TODO arbitrary number here...
                        // If it's pretty well-anchored, don't remove the indel
                        if (clipLength > 20)
                        {
                            continue;
                        }

                        forcedSoftclip = true;
                        _statusCounter.AddStatusCount("Softclipped out bad indel");
                        _statusCounter.AppendStatusStringTag("RX",
                                                             $"Softclipped out bad indel({origBamAlignment.CigarData},{string.Join(",", existingIndels)}... No realignment",
                                                             origBamAlignment);
                        _statusCounter.AddStatusCount("INDEL STATUS\tRemoved\t" + string.Join("|", existingIndels));
                        OverlappingIndelHelpers.SoftclipAfterIndel(origBamAlignment,
                                                                   reverseClip, preIndel.ReferencePosition);
                    }
                }
            }

            _statusCounter.AppendStatusStringTag("RX", "Realignment failed", origBamAlignment);
            _statusCounter.AddStatusCount("Realignment failed");

            return(origBamAlignment);
        }