Exemple #1
0
        /// <summary>
        /// Reads the reference allele for an indel straight out of the genome snippet,
        /// using the indel's reference position and the length of its recorded reference allele.
        /// </summary>
        /// <param name="snippet">Snippet whose <c>Sequence</c> is sliced.</param>
        /// <param name="preIndel">Indel supplying <c>ReferencePosition</c> and <c>ReferenceAllele</c>.</param>
        /// <param name="contextStart">Value subtracted from the reference position to index into the snippet.</param>
        /// <returns>The slice of the snippet sequence covering the reference allele.</returns>
        private static string ActualReferenceAllele(GenomeSnippet snippet, PreIndel preIndel, int contextStart)
        {
            // The "- 1" presumably converts a one-based reference position to a zero-based index - TODO confirm.
            var alleleStart  = preIndel.ReferencePosition - 1 - contextStart;
            var alleleLength = preIndel.ReferenceAllele.Length;

            return snippet.Sequence.Substring(alleleStart, alleleLength);
        }
        /// <summary>
        /// Builds a mocked <c>IGeminiDataSourceFactory</c> wired to the supplied BAM reader and read-pair source mocks.
        /// The factory also hands out a fixed ref-id mapping, a snippet source that always returns a
        /// 1,000,000-base all-'A' "chr1" snippet, and real <c>ChromosomeIndelSource</c> instances.
        /// </summary>
        /// <param name="mockReader">Mock whose object is returned from <c>CreateBamReader</c>.</param>
        /// <param name="mockReadPairSource">Mock whose object is returned from <c>CreateReadPairSource</c>.</param>
        /// <returns>The configured factory mock.</returns>
        public static Mock<IGeminiDataSourceFactory> MockDataSourceFactory(Mock<IBamReader> mockReader, Mock<IDataSource<ReadPair>> mockReadPairSource)
        {
            // Fixed fixtures handed out by the factory.
            var refIdMapping = new Dictionary<int, string>
            {
                [1]  = "chr1",
                [2]  = "chr2",
                [-1] = "Unknown"
            };
            var allAsSnippet = new GenomeSnippet
            {
                Chromosome    = "chr1",
                Sequence      = new string('A', 1000000),
                StartPosition = 0
            };

            var snippetSource = new Mock<IGenomeSnippetSource>();
            snippetSource
                .Setup(source => source.GetGenomeSnippet(It.IsAny<int>()))
                .Returns(allAsSnippet);

            var factory = new Mock<IGeminiDataSourceFactory>();

            factory
                .Setup(f => f.CreateBamReader(It.IsAny<string>()))
                .Returns(mockReader.Object);
            factory
                .Setup(f => f.CreateReadPairSource(It.IsAny<IBamReader>(), It.IsAny<ReadStatusCounter>()))
                .Returns(mockReadPairSource.Object);
            factory
                .Setup(f => f.GetRefIdMapping(It.IsAny<string>()))
                .Returns(refIdMapping);
            factory
                .Setup(f => f.CreateGenomeSnippetSource(It.IsAny<string>(), It.IsAny<ChrReference>(), It.IsAny<int>()))
                .Returns(snippetSource.Object);

            // Indel sources are real objects built from whatever arguments the caller passes in.
            factory
                .Setup(f => f.GetChromosomeIndelSource(It.IsAny<List<HashableIndel>>(), It.IsAny<IGenomeSnippetSource>()))
                .Returns<List<HashableIndel>, IGenomeSnippetSource>(
                    (indels, source) => new ChromosomeIndelSource(indels, source));

            return factory;
        }
Exemple #3
0
        /// <summary>
        /// Extracts the reference bases that follow the indel's reference allele.
        /// </summary>
        /// <param name="snippet">Snippet whose <c>Sequence</c> is sliced.</param>
        /// <param name="preIndel">Indel supplying the reference position, reference allele and length.</param>
        /// <param name="contextStart">Value subtracted from the reference position to index into the snippet.</param>
        /// <returns>A slice of <c>max(10, 3 * preIndel.Length)</c> bases starting just past the reference allele.</returns>
        private static string ReferenceSuffix(GenomeSnippet snippet, PreIndel preIndel, int contextStart)
        {
            // Take at least 10 bases, or three times the indel length when that is larger.
            var suffixLength = Math.Max(10, 3 * preIndel.Length);
            var suffixStart  = preIndel.ReferencePosition + preIndel.ReferenceAllele.Length - 1 - contextStart;

            return snippet.Sequence.Substring(suffixStart, suffixLength);
        }
Exemple #4
0
        /// <summary>
        /// Extracts the reference bases leading up to the indel, ending with the base at the indel's
        /// reference position (the same base <c>ActualReferenceAllele</c> starts from).
        /// </summary>
        /// <param name="snippet">Snippet whose <c>Sequence</c> is sliced.</param>
        /// <param name="preIndel">Indel supplying the reference position and length.</param>
        /// <param name="contextStart">Value subtracted from the reference position to index into the snippet.</param>
        /// <returns>
        /// Up to <c>max(10, 3 * preIndel.Length) + 2</c> bases; fewer when the prefix would start before the snippet.
        /// </returns>
        private static string ReferencePrefix(GenomeSnippet snippet, PreIndel preIndel, int contextStart)
        {
            var offset = Math.Max(10, 3 * preIndel.Length);

            // Work entirely in snippet-relative coordinates. Previously the start was snippet-relative but the
            // length was derived from the raw reference position, so any non-zero contextStart made the prefix
            // contextStart bases too long (or pushed Substring out of range). Results for contextStart == 0
            // are unchanged.
            var prefixEnd    = preIndel.ReferencePosition - contextStart; // exclusive end index within the snippet
            var prefixStart  = Math.Max(0, prefixEnd - 1 - offset - 1);   // clamp to the start of the snippet
            var prefixLength = prefixEnd - prefixStart;

            return snippet.Sequence.Substring(prefixStart, prefixLength);
        }
        /// <summary>
        /// Collects the candidate indels near <paramref name="position"/>, each paired with the genome snippet
        /// for the bucket it came from, then orders and filters them.
        /// </summary>
        /// <param name="position">Position to look around; candidates must lie within 250 of it.</param>
        /// <param name="preSelectedIndels">Passed to <c>IsFavored</c>, <c>IsPreSelected</c> and <c>FilterIndels</c>.</param>
        /// <param name="confirmedIndels">Passed to <c>IsFavored</c> and <c>FilterIndels</c>.</param>
        /// <param name="existingIndels">Not read by this method.</param>
        /// <param name="mateIndels">Not read by this method.</param>
        /// <returns>
        /// <c>_emptyHashablesList</c> when there are no indels or the position lies more than one bucket size
        /// outside the lowest/highest indel positions; otherwise the result of <c>FilterIndels</c>.
        /// </returns>
        public IEnumerable <KeyValuePair <HashableIndel, GenomeSnippet> > GetRelevantIndels(int position, List <PreIndel> preSelectedIndels = null, List <HashableIndel> confirmedIndels = null, List <PreIndel> existingIndels = null, List <PreIndel> mateIndels = null)
        {
            // TODO make this calculation right
            // TODO figure out what that ^ means. I don't see what's not "right" about this but I'll leave the comment til I figure it out or determine that it is not meant to be there
            if (_numIndels == 0 || position > HighestPosition + _bucketSize || position < LowestPosition - _bucketSize)
            {
                return(_emptyHashablesList);
            }

            var indelsToReturn = new Dictionary <HashableIndel, GenomeSnippet>();

            // Candidates further than this from the requested position are ignored.
            const int maxDistance = 250;

            // Bucket that the requested position itself falls into.
            var indelExactBucketNum = (position - LowestPosition) / _bucketSize;

            // TODO see how many are actually being used
            const int maxNumTopScorersToReturn           = 5;
            const int maxNumExtraTopScorerMultisToReturn = 3;

            // Visit the bucket before, the exact bucket, and the bucket after (i = 0, 1, 2).
            for (int i = 0; i <= 2; i++)
            {
                var peripheralBucketNum = indelExactBucketNum - 1 + i;
                if (_positionalBucketsOfIndels.TryGetValue(peripheralBucketNum, out var bucket))
                {
                    var           addedForBucket   = 0;
                    GenomeSnippet snippetForBucket = null;
                    // Highest-scoring indels first, so the per-bucket caps keep the top scorers.
                    foreach (var item in bucket.OrderByDescending(v => v.Score))
                    {
                        // Once the base cap is reached only InMulti items are still accepted,
                        // and nothing is accepted once the base cap plus the extra-multi cap is reached.
                        if ((addedForBucket >= maxNumTopScorersToReturn && !item.InMulti) || (addedForBucket >= maxNumTopScorersToReturn + maxNumExtraTopScorerMultisToReturn))
                        {
                            continue;
                        }
                        if (Math.Abs(item.ReferencePosition - position) <= maxDistance)
                        {
                            addedForBucket++;
                            // Look the bucket's snippet up only when the first indel from it is accepted.
                            if (snippetForBucket == null)
                            {
                                snippetForBucket = _genomeSnippetsLookup[peripheralBucketNum];
                            }
                            indelsToReturn[item] = snippetForBucket;
                        }
                    }
                }
            }

            // Sort keys, in order: favored first, higher score, pre-selected first, then string representation ascending.
            var filteredIndelsRaw = indelsToReturn.OrderByDescending(x => IsFavored(preSelectedIndels, confirmedIndels, x)).
                                    ThenByDescending(x => x.Key.Score).ThenByDescending(x => IsPreSelected(preSelectedIndels, x)).ThenBy(x => x.Key.StringRepresentation).ToList();

            var filteredIndels = FilterIndels(preSelectedIndels, filteredIndelsRaw, maxNumTopScorersToReturn,
                                              maxNumExtraTopScorerMultisToReturn, confirmedIndels, position);

            return(filteredIndels);
        }
        /// <summary>
        /// Test helper: builds a realigner/combiner pipeline over a synthetic "chr1" reference and runs
        /// <c>ExtractReads</c> on the given pair.
        /// </summary>
        /// <param name="pair">Pair result passed to the combiner.</param>
        /// <param name="refSeq">Reference bases placed after the leading 'A' padding.</param>
        /// <param name="refSeqOffset">Number of 'A' bases placed before <paramref name="refSeq"/>.</param>
        /// <param name="preIndels">Indels converted to hashable indels and offered to the realigner.</param>
        /// <param name="hasExistingIndels">Forwarded to the combiner's <c>hasExistingIndels</c> argument.</param>
        /// <returns>Whatever the combiner's <c>ExtractReads</c> returns.</returns>
        private static List<BamAlignment> ExtractReadsFromRealignerAndCombiner(PairResult pair, string refSeq,
                                                                               int refSeqOffset, List<PreIndel> preIndels, bool hasExistingIndels = false)
        {
            var pairHandler = new PairHandler(
                new Dictionary<int, string> { { 1, "chr1" } },
                new BasicStitcher(0),
                tryStitch: true);

            // Synthetic reference: 'A' padding, the caller's sequence, then 1000 'T's.
            var reference = new GenomeSnippet
            {
                Chromosome    = "chr1",
                Sequence      = new string('A', refSeqOffset) + refSeq + new string('T', 1000),
                StartPosition = 0
            };
            var referenceSource = new Mock<IGenomeSnippetSource>();
            referenceSource.Setup(s => s.GetGenomeSnippet(It.IsAny<int>())).Returns(reference);

            var statusHandler     = new Mock<IStatusHandler>();
            var alignmentComparer = new GemBasicAlignmentComparer(false, false);

            // TODO (carried over from the original author): something here was flagged "to fix", details unknown.
            var realigner = new GeminiReadRealigner(
                alignmentComparer,
                remaskSoftclips: false,
                keepProbeSoftclips: false,
                keepBothSideSoftclips: false,
                trackActualMismatches: false,
                checkSoftclipsForMismatches: true,
                debug: false,
                maskNsOnly: false,
                maskPartialInsertion: false,
                minimumUnanchoredInsertionLength: 1,
                minInsertionSizeToAllowMismatchingBases: 4,
                maxProportionInsertSequenceMismatch: 0.2);

            // Region filter that reports indels nearby for every position.
            var regionFilterer = new Mock<IRegionFilterer>();
            regionFilterer.Setup(f => f.AnyIndelsNearby(It.IsAny<int>())).Returns(true);

            var hashableIndels = preIndels
                .Select(preIndel => HashableIndelSource.GetHashableIndel(reference, preIndel, 0, false))
                .ToList();
            var indelSource = new ChromosomeIndelSource(hashableIndels, referenceSource.Object);
            var evaluator   = new RealignmentEvaluator(
                indelSource, statusHandler.Object, realigner,
                new RealignmentJudger(alignmentComparer), "chr1", false, true, true, true, regionFilterer.Object, false);

            var evidenceCollector = new NonSnowballEvidenceCollector();
            var postStitcher      = new PostRealignmentStitcher(pairHandler, new DebugStatusHandler(new ReadStatusCounter()));
            var pairRealigner     = new ReadPairRealignerAndCombiner(
                evidenceCollector, postStitcher, evaluator,
                new PairSpecificIndelFinder(), "chr1", false, hasExistingIndels: hasExistingIndels);

            var nmCalculator = new NmCalculator(referenceSource.Object);

            return pairRealigner.ExtractReads(pair, nmCalculator);
        }
 /// <summary>
 /// Returns a genome snippet for <paramref name="position"/>, reusing the previously fetched snippet
 /// when the position is close to the last request and far enough from the snippet's end.
 /// </summary>
 /// <param name="position">Reference position; must be non-negative.</param>
 /// <returns>The cached snippet, or a freshly fetched one from the underlying source.</returns>
 /// <exception cref="ArgumentException">Thrown when <paramref name="position"/> is negative.</exception>
 public GenomeSnippet GetGenomeSnippet(int position)
 {
     if (position < 0)
     {
         throw new ArgumentException(
                   $"Invalid snippet reference position ({position}): must be non-negative.");
     }

     // Reuse the cached snippet only when the request is within the buffer of the last fetch position
     // AND more than a buffer's length before the cached snippet's end.
     var canReuseCached = Math.Abs(position - _lastPosition) < _snippetBuffer
                          && _currentEndPos - position > _snippetBuffer;

     if (!canReuseCached)
     {
         _snippet       = _snippetSource.GetGenomeSnippet(position);
         _lastPosition  = position;
         _currentEndPos = _snippet.StartPosition + _snippet.Sequence.Length;
     }

     return _snippet;
 }
        /// <summary>
        /// Checks <c>NmCalculator.GetNm</c> against a fixed reference snippet for plain matches, mismatches,
        /// insertions, deletions and softclips.
        /// </summary>
        public void GetNm()
        {
            var reference = new GenomeSnippet
            {
                Chromosome    = "chr1",
                Sequence      = "NNNNNAAAAATTTTTGGGGGCCCCC",
                StartPosition = 94 // 0 based
            };
            var referenceSource = new Mock<IGenomeSnippetSource>();
            referenceSource.Setup(s => s.GetGenomeSnippet(It.IsAny<int>())).Returns(reference);

            var calculator = new NmCalculator(referenceSource.Object);

            // Positions passed to CreateBamAlignment are one based bc it adjusts by one in the helper

            // Perfect match
            var perfectMatch = TestHelpers.CreateBamAlignment("AAAAA", 100, 0, 30, true);
            Assert.Equal(0, calculator.GetNm(perfectMatch));

            // Single mismatch in the middle
            var oneMismatch = TestHelpers.CreateBamAlignment("AATAA", 100, 0, 30, true);
            Assert.Equal(1, calculator.GetNm(oneMismatch));

            // Four mismatches
            var fourMismatches = TestHelpers.CreateBamAlignment("AGTGT", 100, 0, 30, true);
            Assert.Equal(4, calculator.GetNm(fourMismatches));

            // Same bases, aligned as a four-base insertion
            var withInsertion = TestHelpers.CreateBamAlignment("AGTGT", 100, 0, 30, true, cigar: new CigarAlignment("1M4I"));
            Assert.Equal(4, calculator.GetNm(withInsertion));

            // Four-base deletion with bases matching the reference after it
            var deletionThenMatches = TestHelpers.CreateBamAlignment("ATTTT", 100, 0, 30, true, cigar: new CigarAlignment("1M4D4M"));
            Assert.Equal(4, calculator.GetNm(deletionThenMatches));

            // Four-base deletion followed by four mismatches
            var deletionThenMismatches = TestHelpers.CreateBamAlignment("ACCCC", 100, 0, 30, true, cigar: new CigarAlignment("1M4D4M"));
            Assert.Equal(8, calculator.GetNm(deletionThenMismatches));

            // Mismatch on the first base
            var leadingMismatch = TestHelpers.CreateBamAlignment("GAAAA", 100, 0, 30, true);
            Assert.Equal(1, calculator.GetNm(leadingMismatch));

            // The mismatching base sits inside the softclip and is expected not to count
            var softclippedMismatch = TestHelpers.CreateBamAlignment("AATAA", 100, 0, 30, true, cigar: new CigarAlignment("2M3S"));
            Assert.Equal(0, calculator.GetNm(softclippedMismatch));
        }
Exemple #9
0
        /// <summary>
        /// Checks the reference prefix and suffix that <c>HashableIndelSource.GetHashableIndel</c> records
        /// for a deletion and for an insertion anchored at the same position.
        /// </summary>
        public void GetHashableIndel()
        {
            const string refSequence = "ZZXXXXXCAGCAGCAGCAGXYZ";

            var reference = new GenomeSnippet
            {
                Chromosome    = "chr1",
                Sequence      = refSequence + "TTTTT",
                StartPosition = 0
            };

            // Deletion of "CAG" anchored at position 7
            var deletion         = new PreIndel(new CandidateAllele("chr1", 7, "XCAG", "X", AlleleCategory.Deletion));
            var hashableDeletion = HashableIndelSource.GetHashableIndel(reference, deletion, 0, false);

            Assert.Equal("ZZXXXXX", hashableDeletion.RefPrefix);
            Assert.Equal("CAGCAGCAGX", hashableDeletion.RefSuffix);

            // Insertion of "CAG" anchored at the same position
            var insertion         = new PreIndel(new CandidateAllele("chr1", 7, "X", "XCAG", AlleleCategory.Insertion));
            var hashableInsertion = HashableIndelSource.GetHashableIndel(reference, insertion, 0, false);

            Assert.Equal("ZZXXXXX", hashableInsertion.RefPrefix);
            Assert.Equal("CAGCAGCAGC", hashableInsertion.RefSuffix);
        }
        /// <summary>
        /// Cuts a snippet out of the chromosome reference around <paramref name="position"/>.
        /// The snippet starts <c>_genomeContextSize + _buffer</c> bases before the position (clamped to 0)
        /// and spans at most <c>2 * _buffer + 2 * _genomeContextSize</c> bases, truncated at the chromosome end.
        /// </summary>
        /// <param name="position">Reference position; must be non-negative.</param>
        /// <returns>A snippet carrying the chromosome name, the extracted sequence and its start position.</returns>
        /// <exception cref="Exception">Thrown when the chromosome reference is null.</exception>
        /// <exception cref="ArgumentException">
        /// Thrown when <paramref name="position"/> is negative or the snippet would start at or beyond the chromosome end.
        /// </exception>
        public GenomeSnippet GetGenomeSnippet(int position)
        {
            if (_chrReference == null)
            {
                // TODO optionally could open the genome back up?
                throw new Exception("Already disposed of the chr reference.");
            }

            if (position < 0)
            {
                throw new ArgumentException(
                          $"Invalid snippet reference position ({position}): must be non-negative.");
            }

            var chromosomeLength = _chrReference.Sequence.Length;

            // Back up by the context size plus buffer, but never before the start of the chromosome.
            var snippetStart = Math.Max(0, position - _genomeContextSize - _buffer);

            if (snippetStart >= chromosomeLength)
            {
                throw new ArgumentException(
                          $"Snippet would go off the end of the chromosome: {position} vs {chromosomeLength}.");
            }

            // Cap the window at whatever remains of the chromosome.
            var maxWindow     = 2 * _buffer + _genomeContextSize * 2;
            var snippetLength = Math.Min(chromosomeLength - snippetStart, maxWindow);

            return new GenomeSnippet
            {
                Chromosome    = _chrReference.Name,
                Sequence      = _chrReference.Sequence.Substring(snippetStart, snippetLength),
                StartPosition = snippetStart
            };
        }
Exemple #11
0
        /// <summary>
        /// Exercises <c>SoftclipReapplier.ReapplySoftclips</c> for a read without softclips, a read with
        /// N softclips, a read with non-N softclips, and the N-only reapplier on non-N softclips.
        /// </summary>
        public void ReapplySoftclips()
        {
            var remaskAll   = new SoftclipReapplier(true, false, false, false, false, true);
            var remaskNOnly = new SoftclipReapplier(true, true, false, false, false, true);

            var reference = new GenomeSnippet
            {
                Chromosome    = "chr1",
                Sequence      = "GTACGTACGT",
                StartPosition = 20
            };

            // No softclips on the original read: the realigned CIGAR is left alone
            var unclippedRead = new Read("chr", new BamAlignment
            {
                Position  = 20, // zero based
                CigarData = new CigarAlignment("10M"),
                Bases     = "GTACGTACGT",
                Qualities = new byte[] { 20, 20, 20, 20, 20, 20, 20, 20, 20, 20 }
            });
            var unclippedResult = GetResult("8M2I");
            var fullMap         = new PositionMap(new[] { 21, 22, 23, 24, 25, 26, 27, 28, 29, 30 });

            remaskAll.ReapplySoftclips(unclippedRead, 0, 0, fullMap, unclippedResult, reference, 0, 0, new CigarAlignment("10M"));
            Assert.Equal("8M2I", unclippedResult.Cigar.ToString());

            // reapply N softclips
            var nClippedRead = new Read("chr", new BamAlignment
            {
                Position  = 22, // zero based
                CigarData = new CigarAlignment("2S8M"),
                Bases     = "NNACGTACGT",
                Qualities = new byte[] { 20, 20, 20, 20, 20, 20, 20, 20, 20, 20 }
            });
            // At this point, the position map doesn't include the Ns. They get re-added.
            var nClippedResult = GetResult("6M2I");
            var mapWithoutNs   = new PositionMap(new[] { 23, 24, 25, 26, 27, 28, 29, 30 });

            remaskAll.ReapplySoftclips(nClippedRead, 2, 0, mapWithoutNs, nClippedResult, reference, 2, 0, new CigarAlignment("10M"));
            Assert.Equal("2S6M2I", nClippedResult.Cigar.ToString());

            // reapply non-N softclips
            var baseClippedRead = new Read("chr", new BamAlignment
            {
                Position  = 22, // zero based
                CigarData = new CigarAlignment("2S8M"),
                Bases     = "CCACGTACGT",
                Qualities = new byte[] { 20, 20, 20, 20, 20, 20, 20, 20, 20, 20 }
            });
            var baseClippedResult = GetResult("8M2I");
            var baseClippedMap    = new PositionMap(new[] { 21, 22, 23, 24, 25, 26, 27, 28, 29, 30 });

            remaskAll.ReapplySoftclips(baseClippedRead, 0, 0, baseClippedMap, baseClippedResult, reference, 2, 0, new CigarAlignment("8M"));
            Assert.Equal("2S6M2I", baseClippedResult.Cigar.ToString());

            // if only remasking Ns, don't reapply non-N softclips
            var nOnlyResult = GetResult("8M2I");
            var nOnlyMap    = new PositionMap(new[] { 21, 22, 23, 24, 25, 26, 27, 28, 29, 30 });

            remaskNOnly.ReapplySoftclips(baseClippedRead, 0, 0, nOnlyMap, nOnlyResult, reference, 2, 0, new CigarAlignment("8M"));
            Assert.Equal("8M2I", nOnlyResult.Cigar.ToString());

            //// if the bases match, don't reapply softclips
            //read = new Read("chr", new BamAlignment
            //{
            //    Position = 22, // zero based
            //    CigarData = new CigarAlignment("2S8M"),
            //    Bases = "CTACGTACGT",
            //    Qualities = new byte[] { 20, 20, 20, 20, 20, 20, 20, 20, 20, 20 }
            //});
            //result = GetResult("8M2I");
            //reapplier.ReapplySoftclips(read, 0, 0, new PositionMap(new int[] { 21, 22, 23, 24, 25, 26, 27, 28, 29, 30 }), result, snippet, 2, 0, new CigarAlignment("8M"));
            //Assert.Equal("1S7M2I", result.Cigar.ToString());
        }
Exemple #12
0
        /// <summary>
        /// Re-attaches the terminal N softclips that were stripped before realignment and, when softclip
        /// remasking is enabled, re-softclips the ends of the realigned read and refreshes the result's
        /// position, CIGAR, mismatch counts and alignment summary fields.
        /// </summary>
        /// <param name="read">Original read; its sequence, qualities, name and CIGAR are consulted.</param>
        /// <param name="nPrefixLength">Number of leading positions to re-add as softclip.</param>
        /// <param name="nSuffixLength">Number of trailing positions to re-add as softclip.</param>
        /// <param name="positionMapWithoutTerminalNs">Position map of the realigned read without the terminal Ns.</param>
        /// <param name="result">Realignment result, updated in place. A null result is left untouched.</param>
        /// <param name="context">Reference snippet used for mismatch and summary calculations.</param>
        /// <param name="prefixSoftclip">Prefix softclip length forwarded to <c>Helper.SoftclipCigar</c>.</param>
        /// <param name="suffixSoftclip">Suffix softclip length forwarded to <c>Helper.SoftclipCigar</c>.</param>
        /// <param name="freshCigarWithoutTerminalNs">Only used in the "no alignable bases" error message.</param>
        /// <exception cref="InvalidDataException">
        /// Thrown when the edit distance cannot be computed or the read ends up with no mappable bases.
        /// </exception>
        public void ReapplySoftclips(Read read, int nPrefixLength, int nSuffixLength, PositionMap positionMapWithoutTerminalNs,
                                     RealignmentResult result, GenomeSnippet context, uint prefixSoftclip, uint suffixSoftclip,
                                     CigarAlignment freshCigarWithoutTerminalNs)
        {
            // Re-append the N-prefix: the re-added terminal positions are marked with -1 in the position map
            var nPrefixPositionMap = Enumerable.Repeat(-1, nPrefixLength);
            var nSuffixPositionMap = Enumerable.Repeat(-1, nSuffixLength);
            // TODO maybe have a function for combining pos maps instead
            var finalPositionMap = new PositionMap(nPrefixPositionMap.Concat(positionMapWithoutTerminalNs.Map).Concat(nSuffixPositionMap).ToArray());

            // The original null check only appeared after result.Cigar had already been dereferenced below,
            // so it could never take effect. Bail out before touching the result instead.
            if (result == null)
            {
                return;
            }

            // Wrap the realigned CIGAR in softclips for the re-added prefix and suffix, then merge adjacent ops.
            var finalCigar = new CigarAlignment {
                new CigarOp('S', (uint)nPrefixLength)
            };

            foreach (CigarOp op in result.Cigar)
            {
                finalCigar.Add(op);
            }

            finalCigar.Add(new CigarOp('S', (uint)nSuffixLength));
            finalCigar.Compress();
            result.Cigar = finalCigar;

            // In case realignment introduced a bunch of mismatch-Ms where there was previously softclipping, optionally re-mask them.
            if (_remaskSoftclips)
            {
                var mismatchMap =
                    Helper.GetMismatchMap(read.Sequence, finalPositionMap, context.Sequence, context.StartPosition);

                var softclipAdjustedCigar = Helper.SoftclipCigar(result.Cigar, mismatchMap, prefixSoftclip, suffixSoftclip,
                                                                 maskNsOnly: _maskNsOnly, prefixNs: Helper.GetCharacterBookendLength(read.Sequence, 'N', false),
                                                                 suffixNs: Helper.GetCharacterBookendLength(read.Sequence, 'N', true), softclipEvenIfMatch: _keepProbeSoftclips || _keepBothSideSoftclips, softclipRepresentsMess: (!(_keepBothSideSoftclips || _keepProbeSoftclips)));

                // Update position map to account for any softclipping added (softclipped positions are set to -2)
                var adjustedPrefixClip = softclipAdjustedCigar.GetPrefixClip();
                for (var i = 0; i < adjustedPrefixClip; i++)
                {
                    finalPositionMap.UpdatePositionAtIndex(i, -2, true);
                }

                var adjustedSuffixClip = softclipAdjustedCigar.GetSuffixClip();
                for (var i = 0; i < adjustedSuffixClip; i++)
                {
                    finalPositionMap.UpdatePositionAtIndex(finalPositionMap.Length - 1 - i, -2, true);
                }

                var editDistance =
                    Helper.GetNumMismatches(read.Sequence, finalPositionMap, context.Sequence, context.StartPosition);
                if (editDistance == null)
                {
                    // This shouldn't happen at this point - we already have a successful result
                    throw new InvalidDataException("Edit distance is null for :" + read.Name + " with position map " +
                                                   string.Join(",", finalPositionMap) + " and CIGAR " + softclipAdjustedCigar);
                }

                // TODO PERF - See how much this really helps analytically. I'm thinking maybe kill this altogether and remove from eval
                var sumOfMismatching = Helper.GetSumOfMismatchQualities(mismatchMap, read.Qualities);

                var readHasPosition = finalPositionMap.HasAnyMappableBases();
                if (!readHasPosition)
                {
                    throw new InvalidDataException(string.Format(
                                                       "Read does not have any alignable bases. ({2} --> {0} --> {3}, {1})", freshCigarWithoutTerminalNs,
                                                       string.Join(",", finalPositionMap), read.CigarData, softclipAdjustedCigar));
                }

                result.Position      = finalPositionMap.FirstMappableBase(); // TODO this used to be >= 0 but changed to > 0. Confirm correct.
                result.Cigar         = softclipAdjustedCigar;
                result.NumMismatches = editDistance.Value;

                // Indices recorded before the prefix was re-added must be shifted by the prefix length.
                var addedAtFinal = new List <int>();
                foreach (var i in result.IndelsAddedAt)
                {
                    addedAtFinal.Add(i + nPrefixLength);
                }
                result.IndelsAddedAt = addedAtFinal;
                var nifiedAtFinal = new List <int>();
                foreach (var i in result.NifiedAt)
                {
                    nifiedAtFinal.Add(i + nPrefixLength);
                }
                result.NifiedAt = nifiedAtFinal;

                // Recompute the summary statistics against the final position and CIGAR.
                var newSummary = Extensions.GetAlignmentSummary(result.Position - 1 - context.StartPosition, result.Cigar,
                                                                context.Sequence,
                                                                read.Sequence, _trackActualMismatches, _checkSoftclipsForMismatches);

                result.NumNonNMismatches            = newSummary.NumNonNMismatches;
                result.NumNonNSoftclips             = newSummary.NumNonNSoftclips;
                result.NumSoftclips                 = newSummary.NumSoftclips;
                result.NumInsertedBases             = newSummary.NumInsertedBases;
                result.NumMismatchesIncludeSoftclip = newSummary.NumMismatchesIncludeSoftclip;
                //result.MismatchesIncludeSoftclip = newSummary.MismatchesIncludeSoftclip;
                result.SumOfMismatchingQualities = sumOfMismatching;
                result.AnchorLength = newSummary.AnchorLength;
            }
        }
        public void GetFinalAlignment_NonMock()
        {
            var snippetSource = new Mock <IGenomeSnippetSource>();
            var genomeSnippet = new GenomeSnippet()
            {
                Chromosome    = "chr1",
                Sequence      = new string('A', 1000) + "ATCGATTGA" + new string('T', 1000),
                StartPosition = 1000
            };

            snippetSource.Setup(x => x.GetGenomeSnippet(It.IsAny <int>())).Returns(genomeSnippet);
            var mockStatusHandler = new Mock <IStatusHandler>();
            var comparer          = new GemBasicAlignmentComparer(false, false);

            var readRealigner = new GeminiReadRealigner(comparer, remaskSoftclips: false,
                                                        keepProbeSoftclips: false, keepBothSideSoftclips: false,
                                                        trackActualMismatches: false, checkSoftclipsForMismatches: true,
                                                        debug: false, maskNsOnly: false, maskPartialInsertion: false,
                                                        minimumUnanchoredInsertionLength: 1,
                                                        minInsertionSizeToAllowMismatchingBases: 4, maxProportionInsertSequenceMismatch: 0.2); // TODO fix // TODO figure out what I was saying to fix here...

            var filterer = GetMockRegionFilterer();

            var indels               = new List <HashableIndel>();
            var indelSource          = new ChromosomeIndelSource(indels, snippetSource.Object);
            var realignmentEvaluator = new RealignmentEvaluator(indelSource, mockStatusHandler.Object, readRealigner,
                                                                new RealignmentJudger(comparer), "chr1", false, true, true, true, filterer.Object, false);

            var origBamAlignment =
                TestHelpers.CreateBamAlignment("AAAAAAATTCA", 1500, 1500, 30, true, cigar: new CigarAlignment("11M"));
            var realigned = realignmentEvaluator.GetFinalAlignment(origBamAlignment, out bool changed, out bool forcedSoftclip,
                                                                   out bool confirmed, out bool sketchy);

            // No indels
            Assert.False(changed);
            Assert.False(confirmed);

            indels = new List <HashableIndel>()
            {
                new HashableIndel()
                {
                    Chromosome        = "chr1",
                    ReferencePosition = 1506,
                    ReferenceAllele   = "A",
                    AlternateAllele   = "ATT",
                    Type   = AlleleCategory.Insertion,
                    Length = 2
                }
            };
            indelSource          = new ChromosomeIndelSource(indels, snippetSource.Object);
            realignmentEvaluator = new RealignmentEvaluator(indelSource, mockStatusHandler.Object, readRealigner,
                                                            new RealignmentJudger(comparer), "chr1", false, true, true, true, filterer.Object, false);
            realigned = realignmentEvaluator.GetFinalAlignment(origBamAlignment, out changed, out forcedSoftclip,
                                                               out confirmed, out sketchy);
            Assert.True(changed);
            Assert.False(confirmed);
            Assert.Equal("7M2I2M", realigned.CigarData.ToString());

            var confirmedAccepteds = new List <HashableIndel>();

            realignmentEvaluator = new RealignmentEvaluator(indelSource, mockStatusHandler.Object, readRealigner,
                                                            new RealignmentJudger(comparer), "chr1", false, true, true, true, filterer.Object, false);
            var reRealigned = realignmentEvaluator.GetFinalAlignment(realigned, out changed, out forcedSoftclip,
                                                                     out confirmed, out sketchy, confirmedAccepteds: confirmedAccepteds);

            Assert.False(changed);
            Assert.True(confirmed);
            Assert.Equal("7M2I2M", reRealigned.CigarData.ToString());

            // Existing indel is best (and only)
            realignmentEvaluator = new RealignmentEvaluator(indelSource, mockStatusHandler.Object, readRealigner,
                                                            new RealignmentJudger(comparer), "chr1", false, true, true, true, filterer.Object, false);
            reRealigned = realignmentEvaluator.GetFinalAlignment(realigned, out changed, out forcedSoftclip,
                                                                 out confirmed, out sketchy, confirmedAccepteds: confirmedAccepteds, existingIndels: new List <PreIndel>()
            {
                new PreIndel(new CandidateAllele("chr1", 1506, "A", "ATT", AlleleCategory.Insertion))
            });
            Assert.False(changed);
            Assert.True(confirmed);
            Assert.Equal("7M2I2M", reRealigned.CigarData.ToString());

            // Existing indel is unsanctioned but good fit - keep it
            var alignmentWithInsertion =
                TestHelpers.CreateBamAlignment("AAAAAAATTCA", 1500, 1500, 30, true, cigar: new CigarAlignment("7M3I1M"));

            realignmentEvaluator = new RealignmentEvaluator(indelSource, mockStatusHandler.Object, readRealigner,
                                                            new RealignmentJudger(comparer), "chr1", false, true, true, false, filterer.Object, false);

            var realignedExistingIns = realignmentEvaluator.GetFinalAlignment(alignmentWithInsertion, out changed, out forcedSoftclip,
                                                                              out confirmed, out sketchy, confirmedAccepteds: confirmedAccepteds, existingIndels: new List <PreIndel>()
            {
                new PreIndel(new CandidateAllele("chr1", 1506, "A", "ATTC", AlleleCategory.Insertion))
            });

            Assert.False(changed);
            Assert.False(confirmed);
            Assert.Equal("7M3I1M", realignedExistingIns.CigarData.ToString());

            // Existing indel is unsanctioned and we're softclipping unknowns - softclip it
            realignmentEvaluator = new RealignmentEvaluator(indelSource, mockStatusHandler.Object, readRealigner,
                                                            new RealignmentJudger(comparer), "chr1", false, true, true, true, filterer.Object, false);

            realignedExistingIns = realignmentEvaluator.GetFinalAlignment(alignmentWithInsertion, out changed, out forcedSoftclip,
                                                                          out confirmed, out sketchy, confirmedAccepteds: confirmedAccepteds, existingIndels: new List <PreIndel>()
            {
                new PreIndel(new CandidateAllele("chr1", 1506, "A", "ATTC", AlleleCategory.Insertion))
            });
            Assert.False(changed);
            Assert.False(confirmed);
            Assert.Equal("7M4S", realignedExistingIns.CigarData.ToString());

            indels = new List <HashableIndel>()
            {
                new HashableIndel()
                {
                    Chromosome        = "chr1",
                    ReferencePosition = 1506,
                    ReferenceAllele   = "A",
                    AlternateAllele   = "ATT",
                    Type   = AlleleCategory.Insertion,
                    Length = 2,
                    Score  = 1000
                },

                new HashableIndel()
                {
                    Chromosome        = "chr1",
                    ReferencePosition = 1506,
                    ReferenceAllele   = "A",
                    AlternateAllele   = "ATTC",
                    Type   = AlleleCategory.Insertion,
                    Length = 3,
                    Score  = 760
                },
                new HashableIndel()
                {
                    Chromosome        = "chr1",
                    ReferencePosition = 1506,
                    ReferenceAllele   = "A",
                    AlternateAllele   = "ATTG",
                    Type   = AlleleCategory.Insertion,
                    Length = 3,
                    Score  = 10
                }
            };
            indelSource          = new ChromosomeIndelSource(indels, snippetSource.Object);
            realignmentEvaluator = new RealignmentEvaluator(indelSource, mockStatusHandler.Object, readRealigner,
                                                            new RealignmentJudger(comparer), "chr1", false, true, true, true, filterer.Object, false);
            realigned = realignmentEvaluator.GetFinalAlignment(origBamAlignment, out changed, out forcedSoftclip,
                                                               out confirmed, out sketchy);
            Assert.True(changed);
            Assert.False(confirmed);
            Assert.Equal("7M3I1M", realigned.CigarData.ToString());

            confirmedAccepteds   = new List <HashableIndel>();
            realignmentEvaluator = new RealignmentEvaluator(indelSource, mockStatusHandler.Object, readRealigner,
                                                            new RealignmentJudger(comparer), "chr1", false, true, true, true, filterer.Object, false);
            reRealigned = realignmentEvaluator.GetFinalAlignment(realigned, out changed, out forcedSoftclip,
                                                                 out confirmed, out sketchy, confirmedAccepteds: confirmedAccepteds);
            Assert.False(changed);
            Assert.True(confirmed);
            Assert.Equal("7M3I1M", reRealigned.CigarData.ToString());

            // Existing indel is not the top one but is the best fit, keep it
            realignmentEvaluator = new RealignmentEvaluator(indelSource, mockStatusHandler.Object, readRealigner,
                                                            new RealignmentJudger(comparer), "chr1", false, true, true, true, filterer.Object, false);
            reRealigned = realignmentEvaluator.GetFinalAlignment(realigned, out changed, out forcedSoftclip,
                                                                 out confirmed, out sketchy, confirmedAccepteds: confirmedAccepteds, existingIndels: new List <PreIndel>()
            {
                new PreIndel(new CandidateAllele("chr1", 1506, "A", "ATTC", AlleleCategory.Insertion))
            });
            Assert.False(changed);
            Assert.True(confirmed);
            Assert.Equal("7M3I1M", reRealigned.CigarData.ToString());


            // Has existing unsanctioned indel and there are better ones to realign around - ignore the bad one, take the good
            realignmentEvaluator = new RealignmentEvaluator(indelSource, mockStatusHandler.Object, readRealigner,
                                                            new RealignmentJudger(comparer), "chr1", false, true, true, true, filterer.Object, false);
            reRealigned = realignmentEvaluator.GetFinalAlignment(realigned, out changed, out forcedSoftclip,
                                                                 out confirmed, out sketchy, confirmedAccepteds: confirmedAccepteds, existingIndels: new List <PreIndel>()
            {
                new PreIndel(new CandidateAllele("chr1", 1507, "A", "ATC", AlleleCategory.Insertion))
            });
            Assert.False(changed);
            Assert.True(confirmed);
            Assert.Equal("7M3I1M", reRealigned.CigarData.ToString());
        }
Exemple #14
0
        /// <summary>
        /// Builds the <see cref="HashableIndel"/> for a candidate indel: re-derives its reference and
        /// alternate alleles from <paramref name="snippet"/>, classifies it as an insertion or deletion,
        /// and annotates it with repeat, duplication and flanking-reference information.
        /// </summary>
        /// <param name="snippet">Reference sequence (and its start position) the indel is evaluated against.</param>
        /// <param name="preIndel">Candidate indel to convert.</param>
        /// <param name="contextStart">Offset passed through to the allele/prefix/suffix helpers.</param>
        /// <param name="debug">When true, duplications and repeats are reported on the console.</param>
        /// <returns>The annotated indel, as returned by <c>Helper.CopyHashable</c>.</returns>
        public static HashableIndel GetHashableIndel(GenomeSnippet snippet, PreIndel preIndel, int contextStart, bool debug)
        {
            var actualReferenceAllele = ActualReferenceAllele(snippet, preIndel, contextStart);
            var actualAltAllele       = ActualAltAllele(preIndel, actualReferenceAllele);

            // Anything that is not strictly shorter on the alt side is treated as an insertion.
            var indelType = actualReferenceAllele.Length > actualAltAllele.Length
                ? AlleleCategory.Deletion
                : AlleleCategory.Insertion;

            // The inserted/deleted bases: the longer allele without its first base.
            var variantBases = indelType == AlleleCategory.Insertion
                ? actualAltAllele.Substring(1)
                : actualReferenceAllele.Substring(1);

            const int maxRepeatUnitLength = 3;
            var       isRepeat            = StitchingLogic.OverlapEvaluator.IsRepeat(variantBases, maxRepeatUnitLength, out var repeatUnit);

            var isDuplication = Helper.IsDuplication(snippet.Sequence, preIndel.ReferencePosition, isRepeat, repeatUnit, actualAltAllele);

            var numRepeatsLeft = 0;
            var numRepeats     = 0;

            if (indelType == AlleleCategory.Insertion && preIndel.Length > 3)
            {
                // Need to go both directions because we're allowing inexact matches.
                var insertionSite = preIndel.ReferencePosition - snippet.StartPosition;

                numRepeats = CountApproximateDuplications(
                    snippet.Sequence, variantBases, insertionSite, preIndel.Length, preIndel.Length);
                numRepeatsLeft = CountApproximateDuplications(
                    snippet.Sequence, variantBases, insertionSite - preIndel.Length, preIndel.Length, -preIndel.Length);
            }

            var repeats = Helper.ComputeRMxNLengthForIndel(preIndel.ReferencePosition - snippet.StartPosition, variantBases, snippet.Sequence, 6, out var newRepeatUnit);

            if (repeats >= 6) // TODO make this configurable?
            {
                isRepeat   = true;
                repeatUnit = newRepeatUnit;
            }

            string otherIndel = "";

            if (preIndel.InMulti)
            {
                // Re-derive the partner indel's alleles from the reference in the same way as the primary one.
                var otherAsPre = GetIndelKey(preIndel.OtherIndel);
                otherAsPre.ReferenceAllele = ActualReferenceAllele(snippet, otherAsPre, contextStart);
                otherAsPre.AlternateAllele = ActualAltAllele(otherAsPre, otherAsPre.ReferenceAllele);
                otherIndel = Helper.CandidateToString(otherAsPre);
            }

            var length = Math.Abs(actualReferenceAllele.Length - actualAltAllele.Length);
            var isUntrustworthyInRepeatRegion = false;

            if (length == 1)
            {
                isUntrustworthyInRepeatRegion = Helper.IsInHomopolymerStretch(snippet.Sequence, preIndel.ReferencePosition);
            }

            // TODO ADD TESTS!!
            var refPrefix = ReferencePrefix(snippet, preIndel, contextStart);
            var refSuffix = ReferenceSuffix(snippet, preIndel, contextStart);

            //Read-end repeats of this repeat unit that are this length or smaller should not be trusted as insertion evidence, but larger ones can
            var numBasesBeforeInsertionUnique = 0;

            if (indelType == AlleleCategory.Insertion)
            {
                var sequenceToCheckFor = isRepeat ? repeatUnit : actualAltAllele;

                // Count how many back-to-back copies of the unit the reference suffix starts with.
                for (int i = 0; i < refSuffix.Length - sequenceToCheckFor.Length; i += sequenceToCheckFor.Length)
                {
                    if (refSuffix.Substring(i, sequenceToCheckFor.Length) == sequenceToCheckFor)
                    {
                        numBasesBeforeInsertionUnique++;
                    }
                    else
                    {
                        break;
                    }
                }
            }

            var indelIdentifier = new HashableIndel
            {
                Chromosome        = preIndel.Chromosome,
                ReferencePosition = preIndel.ReferencePosition,
                ReferenceAllele   = actualReferenceAllele,
                AlternateAllele   = actualAltAllele,
                Type          = indelType,
                Length        = length,
                Score         = preIndel.Score,
                InMulti       = preIndel.InMulti,
                OtherIndel    = otherIndel,
                IsRepeat      = isRepeat,
                RepeatUnit    = repeatUnit,
                IsDuplication = isDuplication,
                IsUntrustworthyInRepeatRegion = isUntrustworthyInRepeatRegion,
                RefPrefix = refPrefix,
                RefSuffix = refSuffix,
                NumBasesInReferenceSuffixBeforeUnique = numBasesBeforeInsertionUnique,
                NumRepeatsNearby   = repeats,
                NumApproxDupsLeft  = numRepeatsLeft,
                NumApproxDupsRight = numRepeats
            };

            indelIdentifier = Helper.CopyHashable(indelIdentifier, otherIndel);

            if (isDuplication && debug)
            {
                Console.WriteLine($"Found a duplication: {indelIdentifier.StringRepresentation}");
            }

            if (isRepeat && debug)
            {
                Console.WriteLine($"Found a repeat: {indelIdentifier.StringRepresentation}, {repeatUnit}");
            }

            return(indelIdentifier);
        }

        /// <summary>
        /// Counts consecutive windows of <paramref name="windowLength"/> bases in
        /// <paramref name="sequence"/> that either equal <paramref name="variantBases"/> or for which
        /// <c>Helper.GetHammingNumMismatches</c> reports at most one mismatch. Scanning starts at
        /// <paramref name="startIndex"/>, advances by <paramref name="step"/> (negative to walk left),
        /// and stops at the first non-matching window or when the window would leave the sequence.
        /// </summary>
        private static int CountApproximateDuplications(string sequence, string variantBases, int startIndex, int windowLength, int step)
        {
            var matches = 0;

            // Substring(pos, windowLength) is valid exactly when 0 <= pos and pos + windowLength <= Length,
            // so a window that ends on the last base of the sequence is still examined.
            for (var pos = startIndex; pos >= 0 && pos + windowLength <= sequence.Length; pos += step)
            {
                var window = sequence.Substring(pos, windowLength);

                if (window != variantBases && Helper.GetHammingNumMismatches(window, variantBases) > 1)
                {
                    break;
                }

                matches++;
            }

            return matches;
        }
Exemple #15
0
        /// <summary>
        /// Turns the candidate indels of one chromosome into the final list of indels to realign around.
        /// Candidates are grouped by their <see cref="HashableIndel"/>; zero-score candidates and weak
        /// candidates flagged as untrustworthy in a repeat region are dropped; much weaker nearby indels of
        /// the same type and length whose resulting sequence differs by at most one base from a stronger
        /// indel are removed; finally, competing variants at the same position are pruned.
        /// </summary>
        /// <param name="indelsForChrom">Candidate indels for the chromosome.</param>
        /// <param name="chrReference">Name and sequence of the chromosome.</param>
        /// <param name="debug">When true, each filtering decision and a summary are logged.</param>
        /// <returns>The indels that survived all filters.</returns>
        private static List <HashableIndel> GetFinalIndelsForChromosome(List <PreIndel> indelsForChrom, ChrReference chrReference, bool debug)
        {
            int numSkippedWeakShortComplex = 0;
            int numRepeatLotsCompetitors   = 0;

            var indelsdict = new Dictionary <HashableIndel, List <PreIndel> >();

            var snippet = new GenomeSnippet
            {
                Chromosome    = chrReference.Name,
                Sequence      = chrReference.Sequence,
                StartPosition = 0
            };
            var contextStart = 0;

            // TODO REFACTOR OUT FILTERING, TO MATCH SPEC STRUCTURE
            var numCandidates = indelsForChrom.Count;
            // TODO consider changing how this threshold is calculated
            var medianIndelSupport = numCandidates > 0 ?
                                     indelsForChrom.Select(x => x.Observations).OrderBy(x => x).ToList()[numCandidates / 2] : 0;
            var thresholdForUntrustworthyRepeat = medianIndelSupport / 5;

            foreach (var candidateIndel in indelsForChrom)
            {
                var indelIdentifier = GetHashableIndel(snippet, candidateIndel, contextStart, debug);

                if (indelIdentifier.Score == 0)
                {
                    continue;
                }
                if (indelIdentifier.IsUntrustworthyInRepeatRegion && candidateIndel.Observations < thresholdForUntrustworthyRepeat && !indelIdentifier.InMulti)
                {
                    if (debug)
                    {
                        Logger.WriteToLog(
                            $"Skipping variant {candidateIndel} because it is a weak, short variant in a complex region (Support: {candidateIndel.Observations}).");
                    }

                    numSkippedWeakShortComplex++;
                    continue;
                }

                if (!indelsdict.TryGetValue(indelIdentifier, out var indelsForIdentifier))
                {
                    indelsForIdentifier = new List <PreIndel>();
                    indelsdict.Add(indelIdentifier, indelsForIdentifier);
                }

                indelsForIdentifier.Add(candidateIndel);
            }

            int numSkippedEffectiveSame = 0;
            // A set gives constant-time membership checks and makes de-duplication before removal unnecessary.
            var toRemove = new HashSet <HashableIndel>();

            // Strongest first, so that weaker look-alikes are removed in favour of the stronger indel.
            foreach (var indel in indelsdict.Keys.OrderByDescending(x => x.Score))
            {
                // Collapse neighbor deletions that have essentially the same consequence (todo should we do this with insertions too?)

                if (indel.InMulti)
                {
                    continue;
                }
                if (toRemove.Contains(indel))
                {
                    continue;
                }

                // TODO should threshold relate to num repeats nearby?
                var thresholdForNearby     = 75;
                var nearbySameLengthIndels =
                    indelsdict.Keys.Where(x => !x.Equals(indel) && !x.InMulti && Math.Abs(indel.ReferencePosition - x.ReferencePosition) <= thresholdForNearby &&
                                          x.Type == indel.Type && x.Length == indel.Length && x.Score * 2 < indel.Score)
                    .ToList();

                if (nearbySameLengthIndels.Count > 0)
                {
                    // Build the sequence that results from applying this indel to a window of reference around it.
                    var snipWidth         = thresholdForNearby * 2;
                    var snipStart         = Math.Max(indel.ReferencePosition - snipWidth - snippet.StartPosition, 0);
                    var snipEndAdjustment = indel.Type == AlleleCategory.Deletion ? indel.Length : 0;
                    var snipEnd           = Math.Min(indel.ReferencePosition - snippet.StartPosition + snipWidth + snipEndAdjustment, snippet.Sequence.Length);
                    var preLength         = indel.ReferencePosition - snippet.StartPosition - snipStart;
                    var postStart         = snipStart + preLength + snipEndAdjustment;
                    var variantSeq        = indel.Type == AlleleCategory.Deletion
                        ? ""
                        : indel.AlternateAllele.Substring(1);
                    var effectiveSequence = snippet.Sequence.Substring(snipStart, preLength) + variantSeq + snippet.Sequence.Substring(postStart, snipEnd - postStart);

                    foreach (var nearIndel in nearbySameLengthIndels)
                    {
                        // Same window, but with the neighbouring indel applied instead.
                        var snipEndAdjustment2 = nearIndel.Type == AlleleCategory.Deletion ? nearIndel.Length : 0;
                        var preLength2         = nearIndel.ReferencePosition - snippet.StartPosition - snipStart;
                        var postStart2         = snipStart + preLength2 + snipEndAdjustment2;
                        var variantSeq2        = nearIndel.Type == AlleleCategory.Deletion
                            ? ""
                            : nearIndel.AlternateAllele.Substring(1);
                        var effectiveSequence2 = snippet.Sequence.Substring(snipStart, preLength2) + variantSeq2 + snippet.Sequence.Substring(postStart2, snipEnd - postStart2);
                        var mismatches         = 0;
                        for (int i = 0; i < effectiveSequence.Length; i++)
                        {
                            if (effectiveSequence[i] != effectiveSequence2[i])
                            {
                                mismatches++;
                            }
                        }

                        if (debug)
                        {
                            Console.WriteLine(
                                $"{indel.StringRepresentation} ({indel.Score}) vs {nearIndel.StringRepresentation} ({nearIndel.Score})");
                            Console.WriteLine(effectiveSequence);
                            Console.WriteLine(effectiveSequence2);

                            Console.WriteLine($"Mismatches: {mismatches}");
                            Console.WriteLine();
                        }

                        if (mismatches <= 1)
                        {
                            numSkippedEffectiveSame++;

                            if (debug)
                            {
                                Logger.WriteToLog(
                                    $"Removing {nearIndel.StringRepresentation} ({nearIndel.Score}) from contention as its consequence is extremely similar to {indel.StringRepresentation} ({indel.Score})");
                            }
                            toRemove.Add(nearIndel);
                            // TODO do we want to add the score from the removed indels to the kept one?
                        }
                    }
                }
            }

            foreach (var removeIndel in toRemove)
            {
                indelsdict.Remove(removeIndel);
            }

            toRemove.Clear();

            foreach (var indel in indelsdict.Keys)
            {
                if (indel.InMulti)
                {
                    continue;
                }
                if (toRemove.Contains(indel))
                {
                    continue;
                }

                var variantsAtSamePos =
                    indelsdict.Keys.Where(x => x.ReferencePosition == indel.ReferencePosition && x.Type == indel.Type &&
                                          !x.Equals(indel) && !x.InMulti).ToList();

                var numVariantsAtSamePos       = variantsAtSamePos.Count;
                var variantsRemovedFromSamePos = 0;
                if (numVariantsAtSamePos > 0)
                {
                    foreach (var variantsAtSamePo in variantsAtSamePos)
                    {
                        if (variantsAtSamePo.Score * 2 < indel.Score && !variantsAtSamePo.HardToCall)
                        {
                            toRemove.Add(variantsAtSamePo);
                            variantsRemovedFromSamePos++;
                        }
                    }

                    if (numVariantsAtSamePos - variantsRemovedFromSamePos > 2)
                    {
                        // This indel and every competitor at the position are dropped. Once they are all in
                        // toRemove, no other member of the group reaches this branch, so each group is
                        // counted exactly once.
                        numRepeatLotsCompetitors += 1 + numVariantsAtSamePos;

                        toRemove.Add(indel);
                        toRemove.UnionWith(variantsAtSamePos);

                        if (debug)
                        {
                            Logger.WriteToLog(
                                $"Skipping variant {indel.StringRepresentation} ({indel.Score}) and {numVariantsAtSamePos} competitors because it's a repeat with lots of competitors and there is no clear strong candidate ({(string.Join(",", variantsAtSamePos.Select(x => x.Score)))}).");
                        }
                    }
                    else
                    {
                        // Note that this could be an issue if there are somatic indels at the same position as germline indels

                        if (debug)
                        {
                            Logger.WriteToLog(
                                $"Removing {variantsRemovedFromSamePos} of {numVariantsAtSamePos} variants at same position as {indel.StringRepresentation} ({indel.Score}) ({(string.Join(",", variantsAtSamePos.Select(x => x.Score)))}).");
                        }
                    }
                }
            }

            foreach (var removeIndel in toRemove)
            {
                indelsdict.Remove(removeIndel);
            }

            if (debug)
            {
                Logger.WriteToLog(
                    $"Skipped {numRepeatLotsCompetitors} for being a repeat with lots of competitors and there is no clear strong candidate.");
                Logger.WriteToLog(
                    $"Skipped {numSkippedWeakShortComplex} for being a weak, short variant in a complex region.");
                Logger.WriteToLog(
                    $"Skipped {numSkippedEffectiveSame} for being effectively the same as a much stronger variant.");
            }

            return(indelsdict.Keys.ToList());
        }