예제 #1
        /// <summary>
        /// Reads the reference allele bases for <paramref name="preIndel"/> out of the snippet sequence:
        /// a substring of preIndel.ReferenceAllele.Length characters starting at index
        /// (preIndel.ReferencePosition - 1 - contextStart).
        /// </summary>
        // NOTE(review): the "- 1" suggests ReferencePosition is 1-based and contextStart is the 0-based
        // coordinate of the first snippet base -- confirm against callers.
        // NOTE(review): braces and the return statement are not present in this listing.
        private static string ActualReferenceAllele(GenomeSnippet snippet, PreIndel preIndel, int contextStart)
            var actualReferenceAllele = snippet.Sequence.Substring(
                preIndel.ReferencePosition - 1 - contextStart, preIndel.ReferenceAllele.Length);

예제 #2
        /// <summary>
        /// Test helper that builds a Moq mock of IGeminiDataSourceFactory with its factory methods stubbed.
        /// </summary>
        /// <param name="mockReader">Mock whose Object is returned from CreateBamReader for any path.</param>
        /// <param name="mockReadPairSource">Mock read-pair source; not referenced in the visible lines
        /// (presumably returned from CreateReadPairSource -- the Returns call is missing from this listing).</param>
        // NOTE(review): braces, several ".Returns(...)" continuations and the final return statement are
        // not present in this listing; confirm against the original file.
        public static Mock <IGeminiDataSourceFactory> MockDataSourceFactory(Mock <IBamReader> mockReader, Mock <IDataSource <ReadPair> > mockReadPairSource)
            var mockDataSourceFactory = new Mock <IGeminiDataSourceFactory>();

            // Any BAM path yields the supplied reader mock.
            mockDataSourceFactory.Setup(x => x.CreateBamReader(It.IsAny <string>())).Returns(mockReader.Object);
            mockDataSourceFactory.Setup(x => x.CreateReadPairSource(It.IsAny <IBamReader>(), It.IsAny <ReadStatusCounter>()))
            // Fixed reference-id mapping: 1 -> chr1, 2 -> chr2, -1 -> Unknown.
            mockDataSourceFactory.Setup(x => x.GetRefIdMapping(It.IsAny <string>()))
            .Returns(new Dictionary <int, string>()
                { 1, "chr1" }, { 2, "chr2" }, { -1, "Unknown" }
            // Snippet source that always returns the same chr1 snippet: one million 'A's starting at 0.
            var mockSnippetSource = new Mock <IGenomeSnippetSource>();
            var genomeSnippet     = new GenomeSnippet()
                Chromosome = "chr1", Sequence = new string('A', 1000000), StartPosition = 0

            mockSnippetSource.Setup(x => x.GetGenomeSnippet(It.IsAny <int>())).Returns(genomeSnippet);
            mockDataSourceFactory.Setup(x => x.CreateGenomeSnippetSource(It.IsAny <string>(), It.IsAny <ChrReference>(), It.IsAny <int>()))
            // The indel source stub builds a real ChromosomeIndelSource from the arguments it is called with.
            .Setup(x => x.GetChromosomeIndelSource(It.IsAny <List <HashableIndel> >(),
                                                   It.IsAny <IGenomeSnippetSource>())).Returns <List <HashableIndel>, IGenomeSnippetSource>((x, y) => new ChromosomeIndelSource(x, y));
예제 #3
        /// <summary>
        /// Reads the reference bases that follow the indel's reference allele: a substring of
        /// max(10, 3 * preIndel.Length) characters starting at index
        /// (preIndel.ReferencePosition + preIndel.ReferenceAllele.Length - 1 - contextStart).
        /// </summary>
        // NOTE(review): the local is named prefixSequence although it holds the suffix.
        // NOTE(review): Substring is given a fixed length with no clamp to the end of the sequence
        // in the visible lines; confirm callers guarantee enough trailing bases.
        // NOTE(review): braces and the return statement are not present in this listing.
        private static string ReferenceSuffix(GenomeSnippet snippet, PreIndel preIndel, int contextStart)
            var offset         = Math.Max(10, 3 * preIndel.Length);
            var prefixSequence = snippet.Sequence.Substring(
                preIndel.ReferencePosition + preIndel.ReferenceAllele.Length - 1 - contextStart, offset);

예제 #4
        /// <summary>
        /// Reads the reference bases leading up to the indel position. The start index is
        /// (preIndel.ReferencePosition - 1 - contextStart - offset - 1) clamped to 0, where offset is
        /// max(10, 3 * preIndel.Length); the length is (preIndel.ReferencePosition - prefixStart).
        /// </summary>
        // NOTE(review): prefixStart is snippet-relative (contextStart is subtracted) but prefixLength is
        // computed from the unadjusted ReferencePosition, so the two only line up when contextStart is 0.
        // Confirm whether non-zero contextStart is ever passed.
        // NOTE(review): braces and the return statement are not present in this listing.
        private static string ReferencePrefix(GenomeSnippet snippet, PreIndel preIndel, int contextStart)
            var offset         = Math.Max(10, 3 * preIndel.Length);
            var prefixStart    = Math.Max(0, preIndel.ReferencePosition - 1 - contextStart - offset - 1);
            var prefixLength   = preIndel.ReferencePosition - prefixStart;
            var prefixSequence = snippet.Sequence.Substring(prefixStart, prefixLength);

        /// <summary>
        /// Collects candidate indels near <paramref name="position"/> from the positional buckets, each paired
        /// with the genome snippet of the bucket it came from, then orders and filters them.
        /// </summary>
        /// <param name="position">Reference position to search around.</param>
        /// <param name="preSelectedIndels">Passed to IsFavored, IsPreSelected and FilterIndels.</param>
        /// <param name="confirmedIndels">Passed to IsFavored and FilterIndels.</param>
        /// <param name="existingIndels">Not referenced in the visible lines.</param>
        /// <param name="mateIndels">Not referenced in the visible lines.</param>
        // NOTE(review): braces, the bodies of the early-exit checks and the final return are not present
        // in this listing; confirm against the original file.
        public IEnumerable <KeyValuePair <HashableIndel, GenomeSnippet> > GetRelevantIndels(int position, List <PreIndel> preSelectedIndels = null, List <HashableIndel> confirmedIndels = null, List <PreIndel> existingIndels = null, List <PreIndel> mateIndels = null)
            // TODO make this calculation right
            // TODO figure out what that ^ means. I don't see what's not "right" about this but I'll leave the comment til I figure it out or determine that it is not meant to be there
            // Guard: no indels at all, or position more than one bucket width outside [LowestPosition, HighestPosition].
            if (_numIndels == 0 || position > HighestPosition + _bucketSize || position < LowestPosition - _bucketSize)

            var indelsToReturn = new Dictionary <HashableIndel, GenomeSnippet>();

            // Only indels within this many bases of position are considered.
            const int maxDistance = 250;

            // Bucket index of position itself (integer division by the bucket width).
            var indelExactBucketNum = (position - LowestPosition) / _bucketSize;

            // TODO see how many are actually being used
            const int maxNumTopScorersToReturn           = 5;
            const int maxNumExtraTopScorerMultisToReturn = 3;

            // Visit the bucket before, the exact bucket, and the bucket after.
            for (int i = 0; i <= 2; i++)
                var peripheralBucketNum = indelExactBucketNum - 1 + i;
                if (_positionalBucketsOfIndels.TryGetValue(peripheralBucketNum, out var bucket))
                    var           addedForBucket   = 0;
                    GenomeSnippet snippetForBucket = null;
                    // Highest-scoring indels first.
                    foreach (var item in bucket.OrderByDescending(v => v.Score))
                        // Per-bucket cap: 5 for ordinary indels, 5 + 3 when the item is part of a multi.
                        // NOTE(review): the statement guarded by this check and any increment of
                        // addedForBucket are not visible in this listing.
                        if ((addedForBucket >= maxNumTopScorersToReturn && !item.InMulti) || (addedForBucket >= maxNumTopScorersToReturn + maxNumExtraTopScorerMultisToReturn))
                        if (Math.Abs(item.ReferencePosition - position) <= maxDistance)
                            // Look the bucket's snippet up once, on first use.
                            if (snippetForBucket == null)
                                snippetForBucket = _genomeSnippetsLookup[peripheralBucketNum];
                            indelsToReturn[item] = snippetForBucket;

            // Sort keys, in order: IsFavored (descending), Score (descending), IsPreSelected (descending),
            // StringRepresentation (ascending).
            var filteredIndelsRaw = indelsToReturn.OrderByDescending(x => IsFavored(preSelectedIndels, confirmedIndels, x)).
                                    ThenByDescending(x => x.Key.Score).ThenByDescending(x => IsPreSelected(preSelectedIndels, x)).ThenBy(x => x.Key.StringRepresentation).ToList();

            var filteredIndels = FilterIndels(preSelectedIndels, filteredIndelsRaw, maxNumTopScorersToReturn,
                                              maxNumExtraTopScorerMultisToReturn, confirmedIndels, position);

        /// <summary>
        /// Test helper that wires up a ReadPairRealignerAndCombiner with real realignment components and a
        /// mocked snippet source, then runs ExtractReads on <paramref name="pair"/>.
        /// </summary>
        /// <param name="pair">Read pair handed to combiner.ExtractReads.</param>
        /// <param name="refSeq">Reference bases placed after the 'A' padding in the mocked snippet.</param>
        /// <param name="refSeqOffset">Number of 'A' bases placed before refSeq in the mocked snippet.</param>
        /// <param name="preIndels">Candidate indels, converted to HashableIndels for the indel source.</param>
        /// <param name="hasExistingIndels">Forwarded to the ReadPairRealignerAndCombiner constructor.</param>
        // NOTE(review): braces, initializer closers and the return statement are not present in this listing.
        private static List <BamAlignment> ExtractReadsFromRealignerAndCombiner(PairResult pair, string refSeq,
                                                                                int refSeqOffset, List <PreIndel> preIndels, bool hasExistingIndels = false)
            var stitchedPairHandler =
                new PairHandler(new Dictionary <int, string>()
                { 1, "chr1" }
            }, new BasicStitcher(0), tryStitch: true);

            // Mocked reference: refSeqOffset 'A's, then refSeq, then 1000 'T's, starting at position 0.
            var snippetSource = new Mock <IGenomeSnippetSource>();
            var genomeSnippet = new GenomeSnippet()
                Chromosome    = "chr1",
                Sequence      = new string('A', refSeqOffset) + refSeq + new string('T', 1000),
                StartPosition = 0

            snippetSource.Setup(x => x.GetGenomeSnippet(It.IsAny <int>())).Returns(genomeSnippet);
            var mockStatusHandler = new Mock <IStatusHandler>();
            var comparer          = new GemBasicAlignmentComparer(false, false);

            var readRealigner = new GeminiReadRealigner(comparer, remaskSoftclips: false,
                                                        keepProbeSoftclips: false, keepBothSideSoftclips: false,
                                                        trackActualMismatches: false, checkSoftclipsForMismatches: true,
                                                        debug: false, maskNsOnly: false, maskPartialInsertion: false,
                                                        minimumUnanchoredInsertionLength: 1,
                                                        minInsertionSizeToAllowMismatchingBases: 4,
                                                        maxProportionInsertSequenceMismatch: 0.2); // TODO fix // TODO figure out what I was saying to fix here...

            // Region filter stub that reports nearby indels for every position.
            var filterer = new Mock <IRegionFilterer>();

            filterer.Setup(x => x.AnyIndelsNearby(It.IsAny <int>())).Returns(true);

            var indels               = preIndels.Select(x => HashableIndelSource.GetHashableIndel(genomeSnippet, x, 0, false)).ToList();
            var indelSource          = new ChromosomeIndelSource(indels, snippetSource.Object);
            // NOTE(review): the positional boolean arguments below are not named here; check the
            // RealignmentEvaluator constructor for their meaning.
            var realignmentEvaluator = new RealignmentEvaluator(indelSource, mockStatusHandler.Object, readRealigner,
                                                                new RealignmentJudger(comparer), "chr1", false, true, true, true, filterer.Object, false);

            var combiner = new ReadPairRealignerAndCombiner(new NonSnowballEvidenceCollector(),
                                                            new PostRealignmentStitcher(stitchedPairHandler, new DebugStatusHandler(new ReadStatusCounter())),
                                                            realignmentEvaluator, new PairSpecificIndelFinder(), "chr1", false, hasExistingIndels: hasExistingIndels);
            var nmCalc = new NmCalculator(snippetSource.Object);

            var result = combiner.ExtractReads(pair, nmCalc);

 /// <summary>
 /// Returns a genome snippet for <paramref name="position"/>, keeping cached state
 /// (_snippet, _lastPosition, _currentEndPos) in front of the wrapped _snippetSource.
 /// </summary>
 /// <exception cref="ArgumentException">Thrown when position is negative.</exception>
 // NOTE(review): braces and the return statement(s) are not present in this listing.
 public GenomeSnippet GetGenomeSnippet(int position)
     if (position < 0)
         throw new ArgumentException(
                   $"Invalid snippet reference position ({position}): must be non-negative.");
     // True when the request is within _snippetBuffer of the last fetched position and more than
     // _snippetBuffer before the end of the cached snippet.
     // NOTE(review): as listed, the refresh below is indented under this condition, which would
     // re-fetch exactly when the cached snippet still covers the request. That reads inverted --
     // a "return _snippet;" / "else" line has probably been dropped. Confirm against the original.
     if (Math.Abs(position - _lastPosition) < _snippetBuffer && _currentEndPos - position > _snippetBuffer)
         // Fetch a fresh snippet and record where it was fetched for and where it ends
         // (StartPosition + sequence length).
         _snippet       = _snippetSource.GetGenomeSnippet(position);
         _lastPosition  = position;
         _currentEndPos = _snippet.StartPosition + _snippet.Sequence.Length;
        /// <summary>
        /// Checks NmCalculator.GetNm against a mocked snippet source for matching reads, mismatches,
        /// insertions, deletions and soft clips.
        /// </summary>
        // NOTE(review): braces, initializer closers and any test attribute are not present in this listing.
        public void GetNm()
            var snippetSource = new Mock <IGenomeSnippetSource>();
            // With StartPosition 94 and five leading Ns, the first 'A' sits at 0-based 99,
            // which is the 1-based position 100 used by the alignments below.
            var genomeSnippet = new GenomeSnippet()
                Chromosome    = "chr1",
                Sequence      = "NNNNNAAAAATTTTTGGGGGCCCCC",
                StartPosition = 94 // 0 based

            snippetSource.Setup(x => x.GetGenomeSnippet(It.IsAny <int>())).Returns(genomeSnippet);
            var nmCalculator = new NmCalculator(snippetSource.Object);

            // Positions passed to CreateBamAlignment are one based bc it adjusts by one in the helper
            // Perfect match against AAAAA.
            var alignment = TestHelpers.CreateBamAlignment("AAAAA", 100, 0, 30, true);

            Assert.Equal(0, nmCalculator.GetNm(alignment));

            // One mismatching base.
            alignment = TestHelpers.CreateBamAlignment("AATAA", 100, 0, 30, true);
            Assert.Equal(1, nmCalculator.GetNm(alignment));

            // Four mismatching bases.
            alignment = TestHelpers.CreateBamAlignment("AGTGT", 100, 0, 30, true);
            Assert.Equal(4, nmCalculator.GetNm(alignment));

            // One matched base followed by a 4-base insertion: expected NM is 4.
            alignment = TestHelpers.CreateBamAlignment("AGTGT", 100, 0, 30, true, cigar: new CigarAlignment("1M4I"));
            Assert.Equal(4, nmCalculator.GetNm(alignment));

            // 4-base deletion with the remaining bases matching the reference Ts: expected NM is 4.
            alignment = TestHelpers.CreateBamAlignment("ATTTT", 100, 0, 30, true, cigar: new CigarAlignment("1M4D4M"));
            Assert.Equal(4, nmCalculator.GetNm(alignment));

            // Same deletion, but the bases after it are Cs against reference Ts: expected NM is 8.
            alignment = TestHelpers.CreateBamAlignment("ACCCC", 100, 0, 30, true, cigar: new CigarAlignment("1M4D4M"));
            Assert.Equal(8, nmCalculator.GetNm(alignment));

            // Mismatch at the first base.
            alignment = TestHelpers.CreateBamAlignment("GAAAA", 100, 0, 30, true);
            Assert.Equal(1, nmCalculator.GetNm(alignment));

            // The mismatching T falls inside the 3-base soft clip: expected NM is 0.
            alignment = TestHelpers.CreateBamAlignment("AATAA", 100, 0, 30, true, cigar: new CigarAlignment("2M3S"));
            Assert.Equal(0, nmCalculator.GetNm(alignment));
예제 #9
        /// <summary>
        /// Checks the RefPrefix and RefSuffix that HashableIndelSource.GetHashableIndel derives for a
        /// deletion and for an insertion at position 7 of a short reference.
        /// </summary>
        // NOTE(review): braces, initializer closers and any test attribute are not present in this listing.
        public void GetHashableIndel()
            var refSequence = "ZZXXXXXCAGCAGCAGCAGXYZ";

            // Deletion of "CAG" anchored on the X at position 7.
            var indel = new PreIndel(new CandidateAllele("chr1", 7, "XCAG", "X", AlleleCategory.Deletion));

            // Snippet is the reference plus five trailing Ts, starting at 0.
            var genomeSnippet = new GenomeSnippet()
                Chromosome    = "chr1",
                Sequence      = refSequence + "TTTTT",
                StartPosition = 0

            var hashable = HashableIndelSource.GetHashableIndel(genomeSnippet, indel, 0, false);

            // Prefix is the first seven reference bases; suffix is the ten bases after the deleted "XCAG".
            Assert.Equal("ZZXXXXX", hashable.RefPrefix);
            Assert.Equal("CAGCAGCAGX", hashable.RefSuffix);

            // Insertion of "CAG" at the same anchor: same prefix, suffix starts right after the anchor base.
            indel    = new PreIndel(new CandidateAllele("chr1", 7, "X", "XCAG", AlleleCategory.Insertion));
            hashable = HashableIndelSource.GetHashableIndel(genomeSnippet, indel, 0, false);
            Assert.Equal("ZZXXXXX", hashable.RefPrefix);
            Assert.Equal("CAGCAGCAGC", hashable.RefSuffix);
        /// <summary>
        /// Cuts a window of the chromosome reference around <paramref name="position"/> and wraps it in
        /// a GenomeSnippet. The window starts at (position - _genomeContextSize - _buffer), clamped to 0,
        /// and is at most (2 * _buffer + 2 * _genomeContextSize) bases long, truncated at the chromosome end.
        /// </summary>
        /// <exception cref="Exception">Thrown when _chrReference is null.</exception>
        /// <exception cref="ArgumentException">Thrown when position is negative or the window would start
        /// at or beyond the end of the chromosome sequence.</exception>
        // NOTE(review): braces, the initializer closer and the return statement are not present in this listing.
        public GenomeSnippet GetGenomeSnippet(int position)
            if (_chrReference == null)
                // TODO optionally could open the genome back up?
                // NOTE(review): throws the base Exception type; a more specific type would be clearer,
                // but callers may depend on the current one.
                throw new Exception("Already disposed of the chr reference.");

            if (position < 0)
                throw new ArgumentException(
                          $"Invalid snippet reference position ({position}): must be non-negative.");

            var contextStart = position - _genomeContextSize;

            contextStart -= _buffer;
            contextStart  = Math.Max(0, contextStart);

            // NOTE(review): the check is on contextStart but the message reports position.
            if (contextStart >= _chrReference.Sequence.Length)
                throw new ArgumentException(
                          $"Snippet would go off the end of the chromosome: {position} vs {_chrReference.Sequence.Length}.");

            // Never read past the end of the chromosome.
            var contextLength = Math.Min(_chrReference.Sequence.Length - contextStart, 2 * _buffer + _genomeContextSize * 2);
            // contextStart was already clamped to >= 0 above, so this Math.Max is redundant.
            var context       = _chrReference.Sequence.Substring(Math.Max(0, contextStart), contextLength);

            var snippet = new GenomeSnippet
                Chromosome    = _chrReference.Name,
                Sequence      = context,
                StartPosition = contextStart

예제 #11
        /// <summary>
        /// Exercises SoftclipReapplier.ReapplySoftclips for a read without soft clips, a read with
        /// N soft clips, a read with non-N soft clips, and the non-N case on a second reapplier
        /// built with a different flag.
        /// </summary>
        // NOTE(review): braces, initializer closers and any test attribute are not present in this listing.
        public void ReapplySoftclips()
            // The two reappliers differ only in the second constructor argument.
            // NOTE(review): going by the variable name and the comment further down, that argument
            // is presumably the "mask Ns only" switch -- confirm against the constructor.
            var reapplier      = new SoftclipReapplier(true, false, false, false, false, true);
            var reapplierNonly = new SoftclipReapplier(true, true, false, false, false, true);
            var read           = new Read("chr", new BamAlignment
                Position  = 20, // zero based
                CigarData = new CigarAlignment("10M"),
                Bases     = "GTACGTACGT",
                Qualities = new byte[] { 20, 20, 20, 20, 20, 20, 20, 20, 20, 20 }

            var result  = GetResult("8M2I");
            var snippet = new GenomeSnippet()
                Chromosome = "chr1", Sequence = "GTACGTACGT", StartPosition = 20

            // No original soft clips: the realigned CIGAR is expected to come back unchanged.
            reapplier.ReapplySoftclips(read, 0, 0, new PositionMap(new int[] { 21, 22, 23, 24, 25, 26, 27, 28, 29, 30 }), result, snippet, 0, 0, new CigarAlignment("10M"));
            Assert.Equal("8M2I", result.Cigar.ToString());

            // reapply N softclips
            read = new Read("chr", new BamAlignment
                Position  = 22, // zero based
                CigarData = new CigarAlignment("2S8M"),
                Bases     = "NNACGTACGT",
                Qualities = new byte[] { 20, 20, 20, 20, 20, 20, 20, 20, 20, 20 }
            // At this point, the position map doesn't include the Ns. They get re-added.
            result = GetResult("6M2I");
            reapplier.ReapplySoftclips(read, 2, 0, new PositionMap(new int[] { 23, 24, 25, 26, 27, 28, 29, 30 }), result, snippet, 2, 0, new CigarAlignment("10M"));
            Assert.Equal("2S6M2I", result.Cigar.ToString());

            // reapply non-N softclips
            read = new Read("chr", new BamAlignment
                Position  = 22, // zero based
                CigarData = new CigarAlignment("2S8M"),
                Bases     = "CCACGTACGT",
                Qualities = new byte[] { 20, 20, 20, 20, 20, 20, 20, 20, 20, 20 }
            result = GetResult("8M2I");
            reapplier.ReapplySoftclips(read, 0, 0, new PositionMap(new int[] { 21, 22, 23, 24, 25, 26, 27, 28, 29, 30 }), result, snippet, 2, 0, new CigarAlignment("8M"));
            Assert.Equal("2S6M2I", result.Cigar.ToString());

            // if only remasking Ns, don't reapply non-N softclips
            result = GetResult("8M2I");
            reapplierNonly.ReapplySoftclips(read, 0, 0, new PositionMap(new int[] { 21, 22, 23, 24, 25, 26, 27, 28, 29, 30 }), result, snippet, 2, 0, new CigarAlignment("8M"));
            Assert.Equal("8M2I", result.Cigar.ToString());

            // NOTE(review): the scenario below is commented out; restore it or delete it.
            //// if the bases match, don't reapply softclips
            //read = new Read("chr", new BamAlignment
            //    Position = 22, // zero based
            //    CigarData = new CigarAlignment("2S8M"),
            //    Bases = "CTACGTACGT",
            //    Qualities = new byte[] { 20, 20, 20, 20, 20, 20, 20, 20, 20, 20 }
            //result = GetResult("8M2I");
            //reapplier.ReapplySoftclips(read, 0, 0, new PositionMap(new int[] { 21, 22, 23, 24, 25, 26, 27, 28, 29, 30 }), result, snippet, 2, 0, new CigarAlignment("8M"));
            //Assert.Equal("1S7M2I", result.Cigar.ToString());
예제 #12
        /// <summary>
        /// Puts the terminal N stretches back onto a realignment result (position map and CIGAR) and,
        /// when _remaskSoftclips is set, re-softclips the ends and recomputes the result's position,
        /// mismatch counts and summary fields. Mutates <paramref name="result"/> in place.
        /// </summary>
        /// <param name="read">Read being realigned; its Sequence, Qualities, Name and CigarData are read.</param>
        /// <param name="nPrefixLength">Number of leading entries to re-add as -1 in the position map and as a leading S op.</param>
        /// <param name="nSuffixLength">Number of trailing entries to re-add as -1 in the position map and as a trailing S op.</param>
        /// <param name="positionMapWithoutTerminalNs">Position map for the read with the terminal Ns excluded.</param>
        /// <param name="result">Realignment result to update.</param>
        /// <param name="context">Reference snippet; its Sequence and StartPosition are used for mismatch calculations.</param>
        /// <param name="prefixSoftclip">Forwarded to Helper.SoftclipCigar.</param>
        /// <param name="suffixSoftclip">Forwarded to Helper.SoftclipCigar.</param>
        /// <param name="freshCigarWithoutTerminalNs">Only used in the "no alignable bases" error message.</param>
        /// <exception cref="InvalidDataException">Thrown when the edit distance comes back null or the
        /// final position map has no mappable bases.</exception>
        // NOTE(review): braces and some statement lines (for example the body of the foreach over
        // result.Cigar) are not present in this listing; confirm against the original file.
        public void ReapplySoftclips(Read read, int nPrefixLength, int nSuffixLength, PositionMap positionMapWithoutTerminalNs,
                                     RealignmentResult result, GenomeSnippet context, uint prefixSoftclip, uint suffixSoftclip,
                                     CigarAlignment freshCigarWithoutTerminalNs)
            // Re-append the N-prefix
            var nPrefixPositionMap = Enumerable.Repeat(-1, nPrefixLength);
            var nSuffixPositionMap = Enumerable.Repeat(-1, nSuffixLength);
            // TODO maybe have a function for combining pos maps instead
            var finalPositionMap = new PositionMap(nPrefixPositionMap.Concat(positionMapWithoutTerminalNs.Map).Concat(nSuffixPositionMap).ToArray());

            // Rebuild the CIGAR as: S(nPrefixLength), the ops of result.Cigar, S(nSuffixLength).
            var finalCigar = new CigarAlignment {
                new CigarOp('S', (uint)nPrefixLength)

            foreach (CigarOp op in result.Cigar)

            finalCigar.Add(new CigarOp('S', (uint)nSuffixLength));
            result.Cigar = finalCigar;

            // In case realignment introduced a bunch of mismatch-Ms where there was previously softclipping, optionally re-mask them.
            // NOTE(review): result has already been dereferenced above (result.Cigar), so the null
            // check here cannot protect against a null result.
            if (result != null && _remaskSoftclips)
                var mismatchMap =
                    Helper.GetMismatchMap(read.Sequence, finalPositionMap, context.Sequence, context.StartPosition);

                var softclipAdjustedCigar = Helper.SoftclipCigar(result.Cigar, mismatchMap, prefixSoftclip, suffixSoftclip,
                                                                 maskNsOnly: _maskNsOnly, prefixNs: Helper.GetCharacterBookendLength(read.Sequence, 'N', false),
                                                                 suffixNs: Helper.GetCharacterBookendLength(read.Sequence, 'N', true), softclipEvenIfMatch: _keepProbeSoftclips || _keepBothSideSoftclips, softclipRepresentsMess: (!(_keepBothSideSoftclips || _keepProbeSoftclips)));

                // Update position map to account for any softclipping added
                // Entries covered by the new leading/trailing clips are overwritten with -2.
                var adjustedPrefixClip = softclipAdjustedCigar.GetPrefixClip();
                for (var i = 0; i < adjustedPrefixClip; i++)
                    finalPositionMap.UpdatePositionAtIndex(i, -2, true);

                var adjustedSuffixClip = softclipAdjustedCigar.GetSuffixClip();
                for (var i = 0; i < adjustedSuffixClip; i++)
                    finalPositionMap.UpdatePositionAtIndex(finalPositionMap.Length - 1 - i, -2, true);

                var editDistance =
                    Helper.GetNumMismatches(read.Sequence, finalPositionMap, context.Sequence, context.StartPosition);
                if (editDistance == null)
                    // This shouldn't happen at this point - we already have a successful result
                    // NOTE(review): string.Join is given finalPositionMap itself, whereas the map's
                    // values are read through .Map earlier in this method; confirm the message
                    // prints the positions as intended.
                    throw new InvalidDataException("Edit distance is null for :" + read.Name + " with position map " +
                                                   string.Join(",", finalPositionMap) + " and CIGAR " + softclipAdjustedCigar);

                // TODO PERF - See how much this really helps analytically. I'm thinking maybe kill this altogether and remove from eval
                // Computed from the mismatch map built before the soft clips were adjusted.
                var sumOfMismatching = Helper.GetSumOfMismatchQualities(mismatchMap, read.Qualities);

                var readHasPosition = finalPositionMap.HasAnyMappableBases();
                if (!readHasPosition)
                    throw new InvalidDataException(string.Format(
                                                       "Read does not have any alignable bases. ({2} --> {0} --> {3}, {1})", freshCigarWithoutTerminalNs,
                                                       string.Join(",", finalPositionMap), read.CigarData, softclipAdjustedCigar));

                result.Position      = finalPositionMap.FirstMappableBase(); // TODO this used to be >= 0 but changed to > 0. Confirm correct.
                result.Cigar         = softclipAdjustedCigar;
                result.NumMismatches = editDistance.Value;

                // Shift the recorded read indexes by the re-added N prefix.
                var addedAtFinal = new List <int>();
                foreach (var i in result.IndelsAddedAt)
                    addedAtFinal.Add(i + nPrefixLength);
                result.IndelsAddedAt = addedAtFinal;
                var nifiedAtFinal = new List <int>();
                foreach (var i in result.NifiedAt)
                    nifiedAtFinal.Add(i + nPrefixLength);
                result.NifiedAt = nifiedAtFinal;

                // Recompute the summary from the final position (made snippet-relative) and final CIGAR.
                var newSummary = Extensions.GetAlignmentSummary(result.Position - 1 - context.StartPosition, result.Cigar,
                                                                read.Sequence, _trackActualMismatches, _checkSoftclipsForMismatches);

                result.NumNonNMismatches            = newSummary.NumNonNMismatches;
                result.NumNonNSoftclips             = newSummary.NumNonNSoftclips;
                result.NumSoftclips                 = newSummary.NumSoftclips;
                result.NumInsertedBases             = newSummary.NumInsertedBases;
                result.NumMismatchesIncludeSoftclip = newSummary.NumMismatchesIncludeSoftclip;
                //result.MismatchesIncludeSoftclip = newSummary.MismatchesIncludeSoftclip;
                result.SumOfMismatchingQualities = sumOfMismatching;
                result.AnchorLength = newSummary.AnchorLength;
        /// <summary>
        /// End-to-end check of RealignmentEvaluator.GetFinalAlignment using real realigner components and a
        /// mocked snippet source. Covers a single candidate insertion, re-realignment of an already
        /// realigned read, existing indels that are or are not among the candidates, and several
        /// competing candidates with different scores.
        /// </summary>
        // NOTE(review): braces, initializer closers and any test attribute are not present in this listing.
        // NOTE(review): the positional boolean arguments to RealignmentEvaluator are not named here;
        // check the constructor for their meaning.
        public void GetFinalAlignment_NonMock()
            var snippetSource = new Mock <IGenomeSnippetSource>();
            // Reference: 1000 'A's, "ATCGATTGA", 1000 'T's, with the snippet starting at 1000.
            var genomeSnippet = new GenomeSnippet()
                Chromosome    = "chr1",
                Sequence      = new string('A', 1000) + "ATCGATTGA" + new string('T', 1000),
                StartPosition = 1000

            snippetSource.Setup(x => x.GetGenomeSnippet(It.IsAny <int>())).Returns(genomeSnippet);
            var mockStatusHandler = new Mock <IStatusHandler>();
            var comparer          = new GemBasicAlignmentComparer(false, false);

            var readRealigner = new GeminiReadRealigner(comparer, remaskSoftclips: false,
                                                        keepProbeSoftclips: false, keepBothSideSoftclips: false,
                                                        trackActualMismatches: false, checkSoftclipsForMismatches: true,
                                                        debug: false, maskNsOnly: false, maskPartialInsertion: false,
                                                        minimumUnanchoredInsertionLength: 1,
                                                        minInsertionSizeToAllowMismatchingBases: 4, maxProportionInsertSequenceMismatch: 0.2); // TODO fix // TODO figure out what I was saying to fix here...

            var filterer = GetMockRegionFilterer();

            // Start with an empty candidate list.
            var indels               = new List <HashableIndel>();
            var indelSource          = new ChromosomeIndelSource(indels, snippetSource.Object);
            var realignmentEvaluator = new RealignmentEvaluator(indelSource, mockStatusHandler.Object, readRealigner,
                                                                new RealignmentJudger(comparer), "chr1", false, true, true, true, filterer.Object, false);

            var origBamAlignment =
                TestHelpers.CreateBamAlignment("AAAAAAATTCA", 1500, 1500, 30, true, cigar: new CigarAlignment("11M"));
            var realigned = realignmentEvaluator.GetFinalAlignment(origBamAlignment, out bool changed, out bool forcedSoftclip,
                                                                   out bool confirmed, out bool sketchy);

            // No indels
            // NOTE(review): nothing is asserted about the result of the no-indel call above.

            // Single candidate: 2-base insertion (A -> ATT) at 1506.
            indels = new List <HashableIndel>()
                new HashableIndel()
                    Chromosome        = "chr1",
                    ReferencePosition = 1506,
                    ReferenceAllele   = "A",
                    AlternateAllele   = "ATT",
                    Type   = AlleleCategory.Insertion,
                    Length = 2
            indelSource          = new ChromosomeIndelSource(indels, snippetSource.Object);
            realignmentEvaluator = new RealignmentEvaluator(indelSource, mockStatusHandler.Object, readRealigner,
                                                            new RealignmentJudger(comparer), "chr1", false, true, true, true, filterer.Object, false);
            realigned = realignmentEvaluator.GetFinalAlignment(origBamAlignment, out changed, out forcedSoftclip,
                                                               out confirmed, out sketchy);
            Assert.Equal("7M2I2M", realigned.CigarData.ToString());

            var confirmedAccepteds = new List <HashableIndel>();

            // Feeding the realigned read back in is expected to leave the CIGAR as it is.
            realignmentEvaluator = new RealignmentEvaluator(indelSource, mockStatusHandler.Object, readRealigner,
                                                            new RealignmentJudger(comparer), "chr1", false, true, true, true, filterer.Object, false);
            var reRealigned = realignmentEvaluator.GetFinalAlignment(realigned, out changed, out forcedSoftclip,
                                                                     out confirmed, out sketchy, confirmedAccepteds: confirmedAccepteds);

            Assert.Equal("7M2I2M", reRealigned.CigarData.ToString());

            // Existing indel is best (and only)
            realignmentEvaluator = new RealignmentEvaluator(indelSource, mockStatusHandler.Object, readRealigner,
                                                            new RealignmentJudger(comparer), "chr1", false, true, true, true, filterer.Object, false);
            reRealigned = realignmentEvaluator.GetFinalAlignment(realigned, out changed, out forcedSoftclip,
                                                                 out confirmed, out sketchy, confirmedAccepteds: confirmedAccepteds, existingIndels: new List <PreIndel>()
                new PreIndel(new CandidateAllele("chr1", 1506, "A", "ATT", AlleleCategory.Insertion))
            Assert.Equal("7M2I2M", reRealigned.CigarData.ToString());

            // Existing indel is unsanctioned but good fit - keep it
            var alignmentWithInsertion =
                TestHelpers.CreateBamAlignment("AAAAAAATTCA", 1500, 1500, 30, true, cigar: new CigarAlignment("7M3I1M"));

            // This evaluator passes false for the boolean just before the filterer (true everywhere else).
            // NOTE(review): going by the next scenario's comment, that flag presumably controls
            // softclipping of unknown indels -- confirm against the constructor.
            realignmentEvaluator = new RealignmentEvaluator(indelSource, mockStatusHandler.Object, readRealigner,
                                                            new RealignmentJudger(comparer), "chr1", false, true, true, false, filterer.Object, false);

            var realignedExistingIns = realignmentEvaluator.GetFinalAlignment(alignmentWithInsertion, out changed, out forcedSoftclip,
                                                                              out confirmed, out sketchy, confirmedAccepteds: confirmedAccepteds, existingIndels: new List <PreIndel>()
                new PreIndel(new CandidateAllele("chr1", 1506, "A", "ATTC", AlleleCategory.Insertion))

            Assert.Equal("7M3I1M", realignedExistingIns.CigarData.ToString());

            // Existing indel is unsanctioned and we're softclipping unknowns - softclip it
            realignmentEvaluator = new RealignmentEvaluator(indelSource, mockStatusHandler.Object, readRealigner,
                                                            new RealignmentJudger(comparer), "chr1", false, true, true, true, filterer.Object, false);

            realignedExistingIns = realignmentEvaluator.GetFinalAlignment(alignmentWithInsertion, out changed, out forcedSoftclip,
                                                                          out confirmed, out sketchy, confirmedAccepteds: confirmedAccepteds, existingIndels: new List <PreIndel>()
                new PreIndel(new CandidateAllele("chr1", 1506, "A", "ATTC", AlleleCategory.Insertion))
            Assert.Equal("7M4S", realignedExistingIns.CigarData.ToString());

            // Three competing insertions at 1506 with scores 1000 (ATT), 760 (ATTC) and 10 (ATTG).
            indels = new List <HashableIndel>()
                new HashableIndel()
                    Chromosome        = "chr1",
                    ReferencePosition = 1506,
                    ReferenceAllele   = "A",
                    AlternateAllele   = "ATT",
                    Type   = AlleleCategory.Insertion,
                    Length = 2,
                    Score  = 1000

                new HashableIndel()
                    Chromosome        = "chr1",
                    ReferencePosition = 1506,
                    ReferenceAllele   = "A",
                    AlternateAllele   = "ATTC",
                    Type   = AlleleCategory.Insertion,
                    Length = 3,
                    Score  = 760
                new HashableIndel()
                    Chromosome        = "chr1",
                    ReferencePosition = 1506,
                    ReferenceAllele   = "A",
                    AlternateAllele   = "ATTG",
                    Type   = AlleleCategory.Insertion,
                    Length = 3,
                    Score  = 10
            indelSource          = new ChromosomeIndelSource(indels, snippetSource.Object);
            realignmentEvaluator = new RealignmentEvaluator(indelSource, mockStatusHandler.Object, readRealigner,
                                                            new RealignmentJudger(comparer), "chr1", false, true, true, true, filterer.Object, false);
            realigned = realignmentEvaluator.GetFinalAlignment(origBamAlignment, out changed, out forcedSoftclip,
                                                               out confirmed, out sketchy);
            // The 3-base insertion is expected to win even though the 2-base one has the top score.
            Assert.Equal("7M3I1M", realigned.CigarData.ToString());

            confirmedAccepteds   = new List <HashableIndel>();
            realignmentEvaluator = new RealignmentEvaluator(indelSource, mockStatusHandler.Object, readRealigner,
                                                            new RealignmentJudger(comparer), "chr1", false, true, true, true, filterer.Object, false);
            reRealigned = realignmentEvaluator.GetFinalAlignment(realigned, out changed, out forcedSoftclip,
                                                                 out confirmed, out sketchy, confirmedAccepteds: confirmedAccepteds);
            Assert.Equal("7M3I1M", reRealigned.CigarData.ToString());

            // Existing indel is not the top one but is the best fit, keep it
            realignmentEvaluator = new RealignmentEvaluator(indelSource, mockStatusHandler.Object, readRealigner,
                                                            new RealignmentJudger(comparer), "chr1", false, true, true, true, filterer.Object, false);
            reRealigned = realignmentEvaluator.GetFinalAlignment(realigned, out changed, out forcedSoftclip,
                                                                 out confirmed, out sketchy, confirmedAccepteds: confirmedAccepteds, existingIndels: new List <PreIndel>()
                new PreIndel(new CandidateAllele("chr1", 1506, "A", "ATTC", AlleleCategory.Insertion))
            Assert.Equal("7M3I1M", reRealigned.CigarData.ToString());

            // Has existing unsanctioned indel and there are better ones to realign around - ignore the bad one, take the good
            realignmentEvaluator = new RealignmentEvaluator(indelSource, mockStatusHandler.Object, readRealigner,
                                                            new RealignmentJudger(comparer), "chr1", false, true, true, true, filterer.Object, false);
            reRealigned = realignmentEvaluator.GetFinalAlignment(realigned, out changed, out forcedSoftclip,
                                                                 out confirmed, out sketchy, confirmedAccepteds: confirmedAccepteds, existingIndels: new List <PreIndel>()
                new PreIndel(new CandidateAllele("chr1", 1507, "A", "ATC", AlleleCategory.Insertion))
            Assert.Equal("7M3I1M", reRealigned.CigarData.ToString());
예제 #14
        /// <summary>
        /// Builds a <see cref="HashableIndel"/> for <paramref name="preIndel"/> using the reference sequence in
        /// <paramref name="snippet"/>. The reference/alternate alleles are re-derived through
        /// ActualReferenceAllele / ActualAltAllele, the indel is classified as insertion or deletion, and
        /// repeat, duplication, homopolymer and flanking-reference annotations are attached.
        /// </summary>
        /// <param name="snippet">Reference snippet; its Sequence and StartPosition are read.</param>
        /// <param name="preIndel">Candidate indel supplying position, length, score and multi-indel partner.</param>
        /// <param name="contextStart">Forwarded unchanged to ActualReferenceAllele, ReferencePrefix and ReferenceSuffix.</param>
        /// <param name="debug">When true, duplication and repeat findings are written to the console.</param>
        /// <returns>The annotated indel (declared return type; the return statement is not visible in this listing).</returns>
        // NOTE(review): this listing appears to have lost its brace lines and some one-line statements
        // (several `if`/`while` headers below have no visible body). Comments describe only what is visible.
        public static HashableIndel GetHashableIndel(GenomeSnippet snippet, PreIndel preIndel, int contextStart, bool debug)
            var actualReferenceAllele = ActualReferenceAllele(snippet, preIndel, contextStart);

            var actualAltAllele = ActualAltAllele(preIndel, actualReferenceAllele);

            // A longer reference allele means bases were removed (deletion); anything else is treated as an insertion.
            var indelType = actualReferenceAllele.Length > actualAltAllele.Length
                ? AlleleCategory.Deletion
                : AlleleCategory.Insertion;

            string repeatUnit;
            // The inserted/deleted bases are the allele with its first base dropped.
            var    variantBases = indelType == AlleleCategory.Insertion
                ? actualAltAllele.Substring(1)
                : actualReferenceAllele.Substring(1);

            // First repeat check: is variantBases a repeat of a unit of at most 3 bases?
            const int maxRepeatUnitLength = 3;
            var       isRepeat            = StitchingLogic.OverlapEvaluator.IsRepeat(variantBases, maxRepeatUnitLength
                                                                                     , out repeatUnit);

            // NOTE(review): the position is passed here without subtracting snippet.StartPosition, unlike the
            // ComputeRMxNLengthForIndel call further down — equivalent only when StartPosition is 0; confirm.
            var isDuplication = Helper.IsDuplication(snippet.Sequence, preIndel.ReferencePosition, isRepeat, repeatUnit, actualAltAllele);

            // Counters reported as NumApproxDupsLeft / NumApproxDupsRight on the result.
            // NOTE(review): no increment of either counter is visible in this listing — presumably dropped; confirm.
            var numRepeatsLeft = 0;
            var numRepeats     = 0;

            // Approximate-duplicate scan: only for insertions whose preIndel.Length exceeds 3.
            if (indelType == AlleleCategory.Insertion && preIndel.Length > 3)
                // Rightward scan: walk the reference in windows of preIndel.Length starting at the indel position.
                var currentPos = preIndel.ReferencePosition - snippet.StartPosition;
                while (true)
                    // TODO < or <=
                    // Bounds guard for the Substring below (guarded statement not visible — presumably a loop exit).
                    if (snippet.Sequence.Length <= currentPos + preIndel.Length)
                    // Need to go both directions because we're allowing inexact.
                    var referenceAfterInsertion = snippet.Sequence.Substring(currentPos, preIndel.Length);

                    // A window matches when it equals variantBases or differs by at most one mismatch.
                    bool stillMatch = false;
                    if (referenceAfterInsertion != variantBases)
                        var numMismatches = Helper.GetHammingNumMismatches(referenceAfterInsertion, variantBases);
                        if (numMismatches <= 1)
                            stillMatch = true;
                        // NOTE(review): presumably the `else` (exact match) branch of the outer `if`; confirm.
                        stillMatch = true;

                    // Advance one window to the right while the reference keeps matching.
                    if (stillMatch)
                        currentPos += preIndel.Length;

                // Leftward scan: same comparison, starting one window before the indel position.
                var currentPosLeft = preIndel.ReferencePosition - preIndel.Length - snippet.StartPosition;
                while (true)
                    // Need to go both directions because we're allowing inexact.
                    // Bounds guard for the Substring below (guarded statement not visible — presumably a loop exit).
                    if (currentPosLeft < 0)
                    var referenceAfterInsertion = snippet.Sequence.Substring(currentPosLeft, preIndel.Length);

                    bool stillMatch = false;
                    if (referenceAfterInsertion != variantBases)
                        var numMismatches = Helper.GetHammingNumMismatches(referenceAfterInsertion, variantBases);
                        if (numMismatches <= 1)
                            stillMatch = true;
                        // NOTE(review): presumably the `else` (exact match) branch of the outer `if`; confirm.
                        stillMatch = true;

                    // Step one window to the left while the reference keeps matching.
                    if (stillMatch)
                        currentPosLeft -= preIndel.Length;

            // Second repeat check against the surrounding reference; the result is also stored as NumRepeatsNearby.
            string newRepeatUnit;
            var    repeats = Helper.ComputeRMxNLengthForIndel(preIndel.ReferencePosition - snippet.StartPosition, variantBases, snippet.Sequence, 6, out newRepeatUnit);

            // A value of 6 or more overrides the earlier IsRepeat verdict and repeat unit.
            if (repeats >= 6) // TODO make this configurable?
                isRepeat   = true;
                repeatUnit = newRepeatUnit;

            // String form of the partner indel; stays empty unless preIndel is part of a multi-indel.
            string otherIndel = "";

            if (preIndel.InMulti)
                // Re-derive the partner's alleles the same way as the primary indel, then stringify it.
                var otherAsPre = GetIndelKey(preIndel.OtherIndel);
                otherAsPre.ReferenceAllele = ActualReferenceAllele(snippet, otherAsPre, contextStart);
                otherAsPre.AlternateAllele = ActualAltAllele(otherAsPre, otherAsPre.ReferenceAllele);
                otherIndel = Helper.CandidateToString(otherAsPre);

            // Indel length is the size difference between the two alleles.
            var length = Math.Abs(actualReferenceAllele.Length - actualAltAllele.Length);
            var isUntrustworthyInRepeatRegion = false;

            // Only single-base indels are checked against homopolymer stretches.
            if (length == 1)
                isUntrustworthyInRepeatRegion = Helper.IsInHomopolymerStretch(snippet.Sequence, preIndel.ReferencePosition);

            // TODO ADD TESTS!!
            var refPrefix = ReferencePrefix(snippet, preIndel, contextStart);
            var refSuffix = ReferenceSuffix(snippet, preIndel, contextStart);

            //Read-end repeats of this repeat unit that are this length or smaller should not be trusted as insertion evidence, but larger ones can
            // NOTE(review): no assignment to this variable is visible below — the loop body appears truncated; confirm.
            var numBasesBeforeInsertionUnique = 0;

            if (indelType == AlleleCategory.Insertion)
                // Compare against the repeat unit when the indel is a repeat, otherwise the whole alternate allele.
                var sequenceToCheckFor = isRepeat ? repeatUnit : actualAltAllele;

                // Step through the reference suffix in unit-sized windows, testing each for an exact match.
                for (int i = 0; i < refSuffix.Length - sequenceToCheckFor.Length; i += sequenceToCheckFor.Length)
                    if (refSuffix.Substring(i, sequenceToCheckFor.Length) == sequenceToCheckFor)

            // Assemble the result from the values computed above.
            var indelIdentifier = new HashableIndel
                Chromosome        = preIndel.Chromosome,
                ReferencePosition = preIndel.ReferencePosition,
                ReferenceAllele   = actualReferenceAllele,
                AlternateAllele   = actualAltAllele,
                Type          = indelType,
                Length        = length,
                Score         = preIndel.Score,
                InMulti       = preIndel.InMulti,
                OtherIndel    = otherIndel,
                IsRepeat      = isRepeat,
                RepeatUnit    = repeatUnit,
                IsDuplication = isDuplication,
                IsUntrustworthyInRepeatRegion = isUntrustworthyInRepeatRegion,
                RefPrefix = refPrefix,
                RefSuffix = refSuffix,
                NumBasesInReferenceSuffixBeforeUnique = numBasesBeforeInsertionUnique,
                NumRepeatsNearby   = repeats,
                NumApproxDupsLeft  = numRepeatsLeft,
                NumApproxDupsRight = numRepeats

            // Replace the freshly built instance with the copy produced by Helper.CopyHashable.
            indelIdentifier = Helper.CopyHashable(indelIdentifier, otherIndel);

            // Diagnostics only; emitted when the caller asked for debug output.
            if (isDuplication && debug)
                Console.WriteLine($"Found a duplication: {indelIdentifier.StringRepresentation}");

            if (isRepeat && debug)
                Console.WriteLine($"Found a repeat: {indelIdentifier.StringRepresentation}, {repeatUnit}");

예제 #15
        /// <summary>
        /// Converts the candidate indels of one chromosome into <see cref="HashableIndel"/> keys and filters
        /// them in three visible stages: weak single-base candidates in untrustworthy repeat regions, weaker
        /// nearby indels whose resulting sequence is nearly identical to a stronger one, and crowded
        /// same-position competitors.
        /// </summary>
        /// <param name="indelsForChrom">Candidate indels for the chromosome.</param>
        /// <param name="chrReference">Chromosome reference; its Name and Sequence are read.</param>
        /// <param name="debug">Gates the diagnostic messages in this method.</param>
        /// <returns>Declared as a list of HashableIndel; the return statement is not visible in this listing.</returns>
        // NOTE(review): this listing appears to have lost its brace lines and a number of one-line statements
        // (several `if`/`foreach` headers have no visible body, and `Console.WriteLine(` openers are missing
        // before some message strings). Comments describe only what is visible.
        private static List <HashableIndel> GetFinalIndelsForChromosome(List <PreIndel> indelsForChrom, ChrReference chrReference, bool debug)
            // Skip counters reported in the debug summary at the end.
            // NOTE(review): their increments are not visible in this listing.
            int numSkippedWeakShortComplex = 0;
            int numRepeatLotsCompetitors   = 0;

            // Maps each distinct HashableIndel to the candidates that produced it.
            var indelsdict         = new Dictionary <HashableIndel, List <PreIndel> >();
            var chromIndelContexts = new List <HashableIndel>();

            // The snippet is the whole chromosome sequence, starting at position 0.
            var snippet = new GenomeSnippet
                Chromosome    = chrReference.Name,
                Sequence      = chrReference.Sequence,
                StartPosition = 0
            var contextStart = 0;

            var numCandidates = indelsForChrom.Count();
            // TODO consider changing how this threshold is calculated
            // Median support = element at index numCandidates / 2 of the ascending Observations (0 when empty).
            var medianIndelSupport = indelsForChrom.Any() ?
                                     indelsForChrom.Select(x => x.Observations).OrderBy(x => x).ToList()[numCandidates / 2] : 0;
            // Candidates in untrustworthy repeat regions need at least a fifth of the median support.
            var thresholdForUntrustworthyRepeat = medianIndelSupport / 5;

            // Stage 1: build the hashable key for every candidate and drop weak ones.
            foreach (var candidateIndel in indelsForChrom)
                var indelIdentifier = GetHashableIndel(snippet, candidateIndel, contextStart, debug);

                // NOTE(review): the statement guarded by the zero-score check is not visible — presumably a skip; confirm.
                if (indelIdentifier.Score == 0)
                // Weak, non-multi candidate flagged as untrustworthy in a repeat region.
                if (indelIdentifier.IsUntrustworthyInRepeatRegion && candidateIndel.Observations < thresholdForUntrustworthyRepeat && !indelIdentifier.InMulti)
                    if (debug)
                            $"Skipping variant {candidateIndel} because it is a weak, short variant in a complex region (Support: {candidateIndel.Observations}).");


                // Get or create the candidate list for this key.
                // NOTE(review): appending candidateIndel to the list is not visible here; confirm.
                if (!indelsdict.TryGetValue(indelIdentifier, out var indelsForIdentifier))
                    indelsForIdentifier = new List <PreIndel>();
                    indelsdict.Add(indelIdentifier, indelsForIdentifier);


            int numSkippedEffectiveSame = 0;
            var toRemove = new List <HashableIndel>();

            // Stage 2: strongest first, look for weaker neighbours with an effectively identical outcome.
            foreach (var indel in indelsdict.Keys.OrderByDescending(x => x.Score))
                // Collapse neighbor deletions that have essentially the same consequence (todo should we do this with insertions too?)

                // NOTE(review): bodies of the next two checks are not visible — presumably they skip this indel.
                if (indel.InMulti)
                if (toRemove.Contains(indel))

                // TODO should threshold relate to num repeats nearby?
                var thresholdForNearby     = 75;
                // Neighbours: other non-multi indels of the same type and length, within thresholdForNearby
                // positions, scoring less than half of this indel's score.
                var nearbySameLengthIndels =
                    indelsdict.Keys.Where(x => !x.Equals(indel) && !x.InMulti && Math.Abs(indel.ReferencePosition - x.ReferencePosition) <= thresholdForNearby &&
                                          x.Type == indel.Type && x.Length == indel.Length && x.Score * 2 < indel.Score);

                if (nearbySameLengthIndels.Any())
                    // Window of twice the nearby threshold on each side of the indel, clamped to the sequence bounds.
                    var snipWidth         = thresholdForNearby * 2;
                    var snipStart         = Math.Max(indel.ReferencePosition - snipWidth - snippet.StartPosition, 0);
                    // Deletions consume indel.Length reference bases; insertions consume none.
                    var snipEndAdjustment = indel.Type == AlleleCategory.Deletion ? indel.Length : 0;
                    var snipEnd           = Math.Min(indel.ReferencePosition - snippet.StartPosition + snipWidth + snipEndAdjustment, snippet.Sequence.Length);
                    var preLength         = indel.ReferencePosition - snippet.StartPosition - snipStart;
                    var postStart         = snipStart + preLength + snipEndAdjustment;
                    // Inserted bases are the alternate allele minus its first base; deletions add nothing.
                    var variantSeq        = indel.Type == AlleleCategory.Deletion
                        ? ""
                        : indel.AlternateAllele.Substring(1);
                    // Window sequence with this indel applied: prefix + inserted bases + remaining reference.
                    var effectiveSequence = snippet.Sequence.Substring(snipStart, preLength) + variantSeq + snippet.Sequence.Substring(postStart, snipEnd - postStart);

                    foreach (var nearIndel in nearbySameLengthIndels)
                        // Same construction for the neighbour, over the same window (snipStart..snipEnd).
                        var snipEndAdjustment2 = nearIndel.Type == AlleleCategory.Deletion ? nearIndel.Length : 0;
                        var preLength2         = nearIndel.ReferencePosition - snippet.StartPosition - snipStart;
                        var postStart2         = snipStart + preLength2 + snipEndAdjustment2;
                        var variantSeq2        = nearIndel.Type == AlleleCategory.Deletion
                            ? ""
                            : nearIndel.AlternateAllele.Substring(1);
                        var effectiveSequence2 = snippet.Sequence.Substring(snipStart, preLength2) + variantSeq2 + snippet.Sequence.Substring(postStart2, snipEnd - postStart2);
                        // Position-by-position comparison of the two resulting sequences.
                        // NOTE(review): the increment of `mismatches` is not visible in this listing.
                        var mismatches         = 0;
                        for (int i = 0; i < effectiveSequence.Length; i++)
                            if (effectiveSequence[i] != effectiveSequence2[i])

                        if (debug)
                                $"{indel.StringRepresentation} ({indel.Score}) vs {nearIndel.StringRepresentation} ({nearIndel.Score})");

                            Console.WriteLine($"Mismatches: {mismatches}");

                        // At most one differing base: the neighbour is treated as equivalent to the stronger indel.
                        // NOTE(review): the removal bookkeeping is not visible — presumably adds to toRemove; confirm.
                        if (mismatches <= 1)

                            if (debug)
                                    $"Removing {nearIndel.StringRepresentation} ({nearIndel.Score}) from contention as its consequence is extremely similar to {indel.StringRepresentation} ({indel.Score})");
                            // TODO do we want to add the score from the removed indels to the kept one?

            // NOTE(review): loop body not visible — presumably removes each entry from indelsdict; confirm.
            foreach (var removeIndel in toRemove.Distinct())


            // Stage 3: resolve several same-type variants competing at the same reference position.
            foreach (var indel in indelsdict.Keys)
                // NOTE(review): bodies of the next two checks are not visible — presumably they skip this indel.
                if (indel.InMulti)
                if (toRemove.Contains(indel))

                // Other non-multi variants of the same type at exactly this position.
                var variantsAtSamePos =
                    indelsdict.Keys.Where(x => x.ReferencePosition == indel.ReferencePosition && x.Type == indel.Type &&
                                          !x.Equals(indel) && !x.InMulti).ToList();

                var numVariantsAtSamePos       = variantsAtSamePos.Count();
                var variantsRemovedFromSamePos = 0;
                if (numVariantsAtSamePos > 0)
                    foreach (var variantsAtSamePo in variantsAtSamePos)
                        // Clearly weaker competitor (under half the score) that is not flagged HardToCall.
                        // NOTE(review): the guarded statements are not visible in this listing.
                        if (variantsAtSamePo.Score * 2 < indel.Score && !variantsAtSamePo.HardToCall)

                    // More than two competitors left after discounting the weak ones.
                    if (numVariantsAtSamePos - variantsRemovedFromSamePos > 2)

                        if (debug)
                                $"Skipping variant {indel.StringRepresentation} ({indel.Score}) and {numVariantsAtSamePos} competitors because it's a repeat with lots of competitors and there is no clear strong candidate ({(string.Join(",", variantsAtSamePos.Select(x => x.Score)))}).");
                        // Note that this could be an issue if there are somatic indels at the same position as germline indels

                        // NOTE(review): presumably belongs to an `else` branch that is not visible; confirm.
                        if (debug)
                                $"Removing {variantsRemovedFromSamePos} of {numVariantsAtSamePos} variants at same position as {indel.StringRepresentation} ({indel.Score}) ({(string.Join(",", variantsAtSamePos.Select(x => x.Score)))}).");

            // NOTE(review): loop body not visible — presumably removes each entry from indelsdict; confirm.
            foreach (var removeIndel in toRemove.Distinct())

            // Debug summary of the three skip counters.
            if (debug)
                    $"Skipped {numRepeatLotsCompetitors} for being a repeat with lots of competitors and there is no clear strong candidate.");
                    $"Skipped {numSkippedWeakShortComplex} for being a weak, short variant in a complex region.");
                    $"Skipped {numSkippedEffectiveSame} for being effectively the same as a much stronger variant.");

            // The keys still present in the dictionary are the final indels for this chromosome.
            chromIndelContexts = indelsdict.Keys.ToList();
