/// <summary>
/// Reads, from the snippet sequence, the bases covered by the indel's reference allele.
/// </summary>
/// <param name="snippet">Snippet whose <c>Sequence</c> is sliced.</param>
/// <param name="preIndel">Indel providing the reference position and the reference allele length.</param>
/// <param name="contextStart">Offset subtracted from the indel position when indexing into the snippet sequence.</param>
/// <returns>
/// The substring of the snippet sequence that starts at index
/// (<c>ReferencePosition - 1 - contextStart</c>) and is as long as the indel's reference allele.
/// </returns>
private static string ActualReferenceAllele(GenomeSnippet snippet, PreIndel preIndel, int contextStart)
{
    var sequence = snippet.Sequence;
    var alleleStartIndex = preIndel.ReferencePosition - 1 - contextStart;
    var alleleLength = preIndel.ReferenceAllele.Length;

    return sequence.Substring(alleleStartIndex, alleleLength);
}
/// <summary>
/// Builds a mocked <c>IGeminiDataSourceFactory</c> wired to the supplied reader and read-pair source mocks.
/// </summary>
/// <param name="mockReader">Mock whose object is returned from <c>CreateBamReader</c> for any path.</param>
/// <param name="mockReadPairSource">Mock whose object is returned from <c>CreateReadPairSource</c> for any arguments.</param>
/// <returns>
/// The factory mock. It additionally returns a fixed ref-id mapping (1 = chr1, 2 = chr2, -1 = Unknown),
/// a snippet source that always yields one 1,000,000-base all-'A' "chr1" snippet starting at 0, and a real
/// <c>ChromosomeIndelSource</c> built from whatever arguments are passed to <c>GetChromosomeIndelSource</c>.
/// </returns>
public static Mock<IGeminiDataSourceFactory> MockDataSourceFactory(Mock<IBamReader> mockReader, Mock<IDataSource<ReadPair>> mockReadPairSource)
{
    var factory = new Mock<IGeminiDataSourceFactory>();

    // Reader and read-pair source are passed straight through from the caller's mocks.
    factory.Setup(f => f.CreateBamReader(It.IsAny<string>()))
        .Returns(mockReader.Object);
    factory.Setup(f => f.CreateReadPairSource(It.IsAny<IBamReader>(), It.IsAny<ReadStatusCounter>()))
        .Returns(mockReadPairSource.Object);

    var refIdMapping = new Dictionary<int, string>
    {
        { 1, "chr1" },
        { 2, "chr2" },
        { -1, "Unknown" }
    };
    factory.Setup(f => f.GetRefIdMapping(It.IsAny<string>()))
        .Returns(refIdMapping);

    // Every snippet request, regardless of position, gets the same all-'A' snippet.
    var allASnippet = new GenomeSnippet
    {
        Chromosome = "chr1",
        Sequence = new string('A', 1000000),
        StartPosition = 0
    };
    var snippetSource = new Mock<IGenomeSnippetSource>();
    snippetSource.Setup(s => s.GetGenomeSnippet(It.IsAny<int>()))
        .Returns(allASnippet);
    factory.Setup(f => f.CreateGenomeSnippetSource(It.IsAny<string>(), It.IsAny<ChrReference>(), It.IsAny<int>()))
        .Returns(snippetSource.Object);

    // The indel source is not mocked: a real one is built from the call's own arguments.
    factory.Setup(f => f.GetChromosomeIndelSource(It.IsAny<List<HashableIndel>>(), It.IsAny<IGenomeSnippetSource>()))
        .Returns<List<HashableIndel>, IGenomeSnippetSource>((indels, source) => new ChromosomeIndelSource(indels, source));

    return factory;
}
/// <summary>
/// Returns the reference bases that immediately follow the indel's reference allele in the snippet.
/// </summary>
/// <param name="snippet">Snippet whose <c>Sequence</c> is sliced.</param>
/// <param name="preIndel">Indel providing the reference position, reference allele and length.</param>
/// <param name="contextStart">Offset subtracted from the indel position when indexing into the snippet sequence.</param>
/// <returns>
/// Up to <c>max(10, 3 * preIndel.Length)</c> bases starting right after the reference allele. The result is
/// shorter when the snippet ends before that many bases are available.
/// </returns>
/// <exception cref="ArgumentOutOfRangeException">
/// Thrown by <c>Substring</c> when the computed start index lies outside the snippet sequence.
/// </exception>
private static string ReferenceSuffix(GenomeSnippet snippet, PreIndel preIndel, int contextStart)
{
    var desiredLength = Math.Max(10, 3 * preIndel.Length);
    var suffixStart = preIndel.ReferencePosition + preIndel.ReferenceAllele.Length - 1 - contextStart;

    // Clamp to what the snippet actually holds so an indel close to the snippet end yields a
    // shorter suffix instead of an out-of-range failure (the prefix is clamped at the other edge).
    // A start index beyond the sequence still produces a negative length and therefore still throws.
    var availableLength = snippet.Sequence.Length - suffixStart;
    var suffixLength = Math.Min(desiredLength, availableLength);

    return snippet.Sequence.Substring(suffixStart, suffixLength);
}
/// <summary>
/// Returns the reference bases leading up to, and including, the base at the indel's reference position.
/// </summary>
/// <param name="snippet">Snippet whose <c>Sequence</c> is sliced.</param>
/// <param name="preIndel">Indel providing the reference position and length.</param>
/// <param name="contextStart">Offset subtracted from the indel position when indexing into the snippet sequence.</param>
/// <returns>
/// The bases from <c>max(0, ReferencePosition - 1 - contextStart - offset - 1)</c> through the snippet index
/// <c>ReferencePosition - 1 - contextStart</c>, where <c>offset = max(10, 3 * preIndel.Length)</c>.
/// </returns>
private static string ReferencePrefix(GenomeSnippet snippet, PreIndel preIndel, int contextStart)
{
    var offset = Math.Max(10, 3 * preIndel.Length);

    // Snippet-relative start, clamped so a variant near the snippet start gets a shorter prefix.
    var prefixStart = Math.Max(0, preIndel.ReferencePosition - 1 - contextStart - offset - 1);

    // prefixStart is snippet-relative, so the end must be as well. The original subtracted it from the
    // absolute position, which over-extended the prefix by contextStart bases whenever contextStart != 0.
    var prefixEndExclusive = preIndel.ReferencePosition - contextStart;
    var prefixLength = prefixEndExclusive - prefixStart;

    return snippet.Sequence.Substring(prefixStart, prefixLength);
}
/// <summary>
/// Collects the indels stored in the positional bucket containing <paramref name="position"/> and in its two
/// neighbouring buckets, ranks them, and hands them to <c>FilterIndels</c>.
/// </summary>
/// <param name="position">Position to look around.</param>
/// <param name="preSelectedIndels">Passed to the ranking helpers and to <c>FilterIndels</c>; may be null.</param>
/// <param name="confirmedIndels">Passed to the ranking helper and to <c>FilterIndels</c>; may be null.</param>
/// <param name="existingIndels">Not used by this method.</param>
/// <param name="mateIndels">Not used by this method.</param>
/// <returns>
/// The shared empty list when there are no indels or the position is more than one bucket size outside
/// [LowestPosition, HighestPosition]; otherwise the result of <c>FilterIndels</c>.
/// </returns>
public IEnumerable<KeyValuePair<HashableIndel, GenomeSnippet>> GetRelevantIndels(int position, List<PreIndel> preSelectedIndels = null, List<HashableIndel> confirmedIndels = null, List<PreIndel> existingIndels = null, List<PreIndel> mateIndels = null)
{
    const int maxDistance = 250;
    // TODO see how many are actually being used
    const int maxNumTopScorersToReturn = 5;
    const int maxNumExtraTopScorerMultisToReturn = 3;

    // TODO make this calculation right
    // TODO figure out what that ^ means. I don't see what's not "right" about this but I'll leave the comment til I figure it out or determine that it is not meant to be there
    if (_numIndels == 0)
    {
        return _emptyHashablesList;
    }

    if (position > HighestPosition + _bucketSize || position < LowestPosition - _bucketSize)
    {
        return _emptyHashablesList;
    }

    var candidates = new Dictionary<HashableIndel, GenomeSnippet>();
    var centerBucketNum = (position - LowestPosition) / _bucketSize;

    // Visit the bucket before, the bucket at, and the bucket after the position, in that order.
    for (var bucketNum = centerBucketNum - 1; bucketNum <= centerBucketNum + 1; bucketNum++)
    {
        if (!_positionalBucketsOfIndels.TryGetValue(bucketNum, out var bucketIndels))
        {
            continue;
        }

        var takenFromBucket = 0;
        GenomeSnippet bucketSnippet = null;

        foreach (var indel in bucketIndels.OrderByDescending(v => v.Score))
        {
            // Non-multi indels stop being taken at the base quota; multis get a few extra slots.
            var baseQuotaSpent = takenFromBucket >= maxNumTopScorersToReturn && !indel.InMulti;
            var allSlotsSpent = takenFromBucket >= maxNumTopScorersToReturn + maxNumExtraTopScorerMultisToReturn;
            if (baseQuotaSpent || allSlotsSpent)
            {
                continue;
            }

            if (Math.Abs(indel.ReferencePosition - position) > maxDistance)
            {
                continue;
            }

            takenFromBucket++;

            // The bucket's snippet is looked up only once, and only if something is actually taken.
            if (bucketSnippet == null)
            {
                bucketSnippet = _genomeSnippetsLookup[bucketNum];
            }

            candidates[indel] = bucketSnippet;
        }
    }

    var rankedCandidates = candidates
        .OrderByDescending(x => IsFavored(preSelectedIndels, confirmedIndels, x))
        .ThenByDescending(x => x.Key.Score)
        .ThenByDescending(x => IsPreSelected(preSelectedIndels, x))
        .ThenBy(x => x.Key.StringRepresentation)
        .ToList();

    return FilterIndels(preSelectedIndels, rankedCandidates, maxNumTopScorersToReturn,
        maxNumExtraTopScorerMultisToReturn, confirmedIndels, position);
}
/// <summary>
/// Test helper: assembles a <c>ReadPairRealignerAndCombiner</c> over a synthetic "chr1" reference and runs
/// <c>ExtractReads</c> on the given pair.
/// </summary>
/// <param name="pair">Pair handed to <c>ExtractReads</c>.</param>
/// <param name="refSeq">Reference bases placed after <paramref name="refSeqOffset"/> 'A's and before 1000 'T's.</param>
/// <param name="refSeqOffset">Number of leading 'A' bases in the synthetic reference.</param>
/// <param name="preIndels">Indels converted with <c>HashableIndelSource.GetHashableIndel</c> and offered for realignment.</param>
/// <param name="hasExistingIndels">Forwarded to the combiner's <c>hasExistingIndels</c> argument.</param>
/// <returns>The alignments returned by <c>ExtractReads</c>.</returns>
private static List<BamAlignment> ExtractReadsFromRealignerAndCombiner(PairResult pair, string refSeq, int refSeqOffset, List<PreIndel> preIndels, bool hasExistingIndels = false)
{
    const string chromosome = "chr1";

    var refIdMapping = new Dictionary<int, string>() { { 1, "chr1" } };
    var pairHandler = new PairHandler(refIdMapping, new BasicStitcher(0), tryStitch: true);

    // Synthetic reference: A-padding, the caller's sequence, then T-padding.
    var reference = new GenomeSnippet()
    {
        Chromosome = "chr1",
        Sequence = new string('A', refSeqOffset) + refSeq + new string('T', 1000),
        StartPosition = 0
    };
    var mockSnippetSource = new Mock<IGenomeSnippetSource>();
    mockSnippetSource.Setup(x => x.GetGenomeSnippet(It.IsAny<int>())).Returns(reference);

    var statusHandler = new Mock<IStatusHandler>();
    var alignmentComparer = new GemBasicAlignmentComparer(false, false);

    // TODO fix
    // TODO figure out what I was saying to fix here...
    var realigner = new GeminiReadRealigner(
        alignmentComparer,
        remaskSoftclips: false,
        keepProbeSoftclips: false,
        keepBothSideSoftclips: false,
        trackActualMismatches: false,
        checkSoftclipsForMismatches: true,
        debug: false,
        maskNsOnly: false,
        maskPartialInsertion: false,
        minimumUnanchoredInsertionLength: 1,
        minInsertionSizeToAllowMismatchingBases: 4,
        maxProportionInsertSequenceMismatch: 0.2);

    // Region filter that reports nearby indels for every position.
    var regionFilterer = new Mock<IRegionFilterer>();
    regionFilterer.Setup(x => x.AnyIndelsNearby(It.IsAny<int>())).Returns(true);

    var hashableIndels = preIndels
        .Select(x => HashableIndelSource.GetHashableIndel(reference, x, 0, false))
        .ToList();
    var chromosomeIndels = new ChromosomeIndelSource(hashableIndels, mockSnippetSource.Object);

    var evaluator = new RealignmentEvaluator(chromosomeIndels, statusHandler.Object, realigner,
        new RealignmentJudger(alignmentComparer), chromosome, false, true, true, true, regionFilterer.Object, false);

    var restitcher = new PostRealignmentStitcher(pairHandler, new DebugStatusHandler(new ReadStatusCounter()));
    var realignerAndCombiner = new ReadPairRealignerAndCombiner(new NonSnowballEvidenceCollector(), restitcher,
        evaluator, new PairSpecificIndelFinder(), chromosome, false, hasExistingIndels: hasExistingIndels);

    var nmCalculator = new NmCalculator(mockSnippetSource.Object);
    return realignerAndCombiner.ExtractReads(pair, nmCalculator);
}
/// <summary>
/// Returns a genome snippet for the position, reusing the previously fetched snippet when possible.
/// </summary>
/// <param name="position">Non-negative reference position.</param>
/// <returns>
/// The held snippet when the position is within <c>_snippetBuffer</c> of the last fetched position and more
/// than <c>_snippetBuffer</c> before the held snippet's end; otherwise a snippet freshly requested from the
/// underlying source (which then becomes the held one).
/// </returns>
/// <exception cref="ArgumentException">Thrown when <paramref name="position"/> is negative.</exception>
public GenomeSnippet GetGenomeSnippet(int position)
{
    if (position < 0)
    {
        throw new ArgumentException(
            $"Invalid snippet reference position ({position}): must be non-negative.");
    }

    var closeToLastFetch = Math.Abs(position - _lastPosition) < _snippetBuffer;
    var canReuse = closeToLastFetch && _currentEndPos - position > _snippetBuffer;

    if (!canReuse)
    {
        // Fetch a new snippet and remember where it was fetched for and where it ends.
        _snippet = _snippetSource.GetGenomeSnippet(position);
        _lastPosition = position;
        _currentEndPos = _snippet.StartPosition + _snippet.Sequence.Length;
    }

    return _snippet;
}
// Checks NmCalculator.GetNm against a fixed 25-base snippet ("NNNNN AAAAA TTTTT GGGGG CCCCC") that starts
// at 0-based position 94, for plain matches, mismatches, insertions, deletions and softclips.
public void GetNm()
{
    var reference = new GenomeSnippet()
    {
        Chromosome = "chr1",
        Sequence = "NNNNNAAAAATTTTTGGGGGCCCCC",
        StartPosition = 94 // 0 based
    };
    var mockSnippetSource = new Mock<IGenomeSnippetSource>();
    mockSnippetSource.Setup(x => x.GetGenomeSnippet(It.IsAny<int>())).Returns(reference);

    var calculator = new NmCalculator(mockSnippetSource.Object);

    // Positions passed to CreateBamAlignment are one based bc it adjusts by one in the helper

    // Perfect match.
    var perfectMatch = TestHelpers.CreateBamAlignment("AAAAA", 100, 0, 30, true);
    Assert.Equal(0, calculator.GetNm(perfectMatch));

    // Single mismatch in the middle.
    var oneMismatch = TestHelpers.CreateBamAlignment("AATAA", 100, 0, 30, true);
    Assert.Equal(1, calculator.GetNm(oneMismatch));

    // Four mismatches.
    var fourMismatches = TestHelpers.CreateBamAlignment("AGTGT", 100, 0, 30, true);
    Assert.Equal(4, calculator.GetNm(fourMismatches));

    // Same bases, but four of them are an insertion.
    var fourInserted = TestHelpers.CreateBamAlignment("AGTGT", 100, 0, 30, true, cigar: new CigarAlignment("1M4I"));
    Assert.Equal(4, calculator.GetNm(fourInserted));

    // Four-base deletion followed by matching bases.
    var deletionThenMatches = TestHelpers.CreateBamAlignment("ATTTT", 100, 0, 30, true, cigar: new CigarAlignment("1M4D4M"));
    Assert.Equal(4, calculator.GetNm(deletionThenMatches));

    // Four-base deletion followed by four mismatching bases.
    var deletionThenMismatches = TestHelpers.CreateBamAlignment("ACCCC", 100, 0, 30, true, cigar: new CigarAlignment("1M4D4M"));
    Assert.Equal(8, calculator.GetNm(deletionThenMismatches));

    // Mismatch on the first base.
    var leadingMismatch = TestHelpers.CreateBamAlignment("GAAAA", 100, 0, 30, true);
    Assert.Equal(1, calculator.GetNm(leadingMismatch));

    // The mismatching base sits in the softclipped tail.
    var mismatchInSoftclip = TestHelpers.CreateBamAlignment("AATAA", 100, 0, 30, true, cigar: new CigarAlignment("2M3S"));
    Assert.Equal(0, calculator.GetNm(mismatchInSoftclip));
}
// Checks the reference prefix/suffix that HashableIndelSource.GetHashableIndel reports for a deletion and
// for an insertion anchored at the same position of a CAG-repeat reference.
public void GetHashableIndel()
{
    const string repeatReference = "ZZXXXXXCAGCAGCAGCAGXYZ";
    var snippet = new GenomeSnippet()
    {
        Chromosome = "chr1",
        Sequence = repeatReference + "TTTTT",
        StartPosition = 0
    };

    // Deletion of one CAG unit.
    var deletion = new PreIndel(new CandidateAllele("chr1", 7, "XCAG", "X", AlleleCategory.Deletion));
    var hashableDeletion = HashableIndelSource.GetHashableIndel(snippet, deletion, 0, false);

    Assert.Equal("ZZXXXXX", hashableDeletion.RefPrefix);
    Assert.Equal("CAGCAGCAGX", hashableDeletion.RefSuffix);

    // Insertion of one CAG unit at the same anchor.
    var insertion = new PreIndel(new CandidateAllele("chr1", 7, "X", "XCAG", AlleleCategory.Insertion));
    var hashableInsertion = HashableIndelSource.GetHashableIndel(snippet, insertion, 0, false);

    Assert.Equal("ZZXXXXX", hashableInsertion.RefPrefix);
    Assert.Equal("CAGCAGCAGC", hashableInsertion.RefSuffix);
}
/// <summary>
/// Cuts a window of the chromosome reference around the given position.
/// </summary>
/// <param name="position">Non-negative reference position.</param>
/// <returns>
/// A snippet named after the chromosome that starts at <c>max(0, position - _genomeContextSize - _buffer)</c>
/// and spans at most <c>2 * _buffer + 2 * _genomeContextSize</c> bases, truncated at the end of the
/// chromosome sequence.
/// </returns>
/// <exception cref="Exception">Thrown when the chromosome reference is null.</exception>
/// <exception cref="ArgumentException">
/// Thrown when the position is negative, or when the window start is at or beyond the end of the chromosome sequence.
/// </exception>
public GenomeSnippet GetGenomeSnippet(int position)
{
    if (_chrReference == null)
    {
        // TODO optionally could open the genome back up?
        throw new Exception("Already disposed of the chr reference.");
    }

    if (position < 0)
    {
        throw new ArgumentException(
            $"Invalid snippet reference position ({position}): must be non-negative.");
    }

    // Step back by the context size and then the buffer, but never before the start of the chromosome.
    var windowStart = Math.Max(0, position - _genomeContextSize - _buffer);

    if (windowStart >= _chrReference.Sequence.Length)
    {
        throw new ArgumentException(
            $"Snippet would go off the end of the chromosome: {position} vs {_chrReference.Sequence.Length}.");
    }

    var basesRemaining = _chrReference.Sequence.Length - windowStart;
    var fullWindowLength = 2 * _buffer + _genomeContextSize * 2;
    var windowLength = Math.Min(basesRemaining, fullWindowLength);

    return new GenomeSnippet
    {
        Chromosome = _chrReference.Name,
        Sequence = _chrReference.Sequence.Substring(windowStart, windowLength),
        StartPosition = windowStart
    };
}
// Exercises SoftclipReapplier.ReapplySoftclips for a read without softclips, a read with an N prefix,
// a read with a non-N softclipped prefix, and the same read when only Ns may be re-masked.
public void ReapplySoftclips()
{
    var remaskAnything = new SoftclipReapplier(true, false, false, false, false, true);
    var remaskNsOnly = new SoftclipReapplier(true, true, false, false, false, true);

    var reference = new GenomeSnippet()
    {
        Chromosome = "chr1",
        Sequence = "GTACGTACGT",
        StartPosition = 20
    };

    // Ten-base read with uniform base qualities of 20.
    Read BuildRead(int zeroBasedPosition, string cigar, string bases)
    {
        return new Read("chr", new BamAlignment
        {
            Position = zeroBasedPosition, // zero based
            CigarData = new CigarAlignment(cigar),
            Bases = bases,
            Qualities = new byte[] { 20, 20, 20, 20, 20, 20, 20, 20, 20, 20 }
        });
    }

    PositionMap AllTenPositions()
    {
        return new PositionMap(new int[] { 21, 22, 23, 24, 25, 26, 27, 28, 29, 30 });
    }

    // No softclips anywhere: the realigned cigar is left alone.
    var unclippedRead = BuildRead(20, "10M", "GTACGTACGT");
    var unclippedResult = GetResult("8M2I");
    remaskAnything.ReapplySoftclips(unclippedRead, 0, 0, AllTenPositions(), unclippedResult, reference, 0, 0, new CigarAlignment("10M"));
    Assert.Equal("8M2I", unclippedResult.Cigar.ToString());

    // reapply N softclips
    var nPrefixedRead = BuildRead(22, "2S8M", "NNACGTACGT");
    // At this point, the position map doesn't include the Ns. They get re-added.
    var nPrefixedResult = GetResult("6M2I");
    var positionsWithoutNs = new PositionMap(new int[] { 23, 24, 25, 26, 27, 28, 29, 30 });
    remaskAnything.ReapplySoftclips(nPrefixedRead, 2, 0, positionsWithoutNs, nPrefixedResult, reference, 2, 0, new CigarAlignment("10M"));
    Assert.Equal("2S6M2I", nPrefixedResult.Cigar.ToString());

    // reapply non-N softclips
    var clippedRead = BuildRead(22, "2S8M", "CCACGTACGT");
    var clippedResult = GetResult("8M2I");
    remaskAnything.ReapplySoftclips(clippedRead, 0, 0, AllTenPositions(), clippedResult, reference, 2, 0, new CigarAlignment("8M"));
    Assert.Equal("2S6M2I", clippedResult.Cigar.ToString());

    // if only remasking Ns, don't reapply non-N softclips
    var nsOnlyResult = GetResult("8M2I");
    remaskNsOnly.ReapplySoftclips(clippedRead, 0, 0, AllTenPositions(), nsOnlyResult, reference, 2, 0, new CigarAlignment("8M"));
    Assert.Equal("8M2I", nsOnlyResult.Cigar.ToString());

    // NOTE: a further case ("if the bases match, don't reapply softclips": bases "CTACGTACGT", cigar 2S8M,
    // expecting "1S7M2I") was present only as commented-out code and is still not exercised.
}
/// <summary>
/// Puts the terminal N stretches back onto a realignment result and, when re-masking is enabled,
/// re-softclips the ends and refreshes the result's position, cigar and summary statistics.
/// </summary>
/// <param name="read">Original read; its sequence, qualities, name and cigar are read.</param>
/// <param name="nPrefixLength">Number of leading bases re-added as softclip (position -1 in the map).</param>
/// <param name="nSuffixLength">Number of trailing bases re-added as softclip (position -1 in the map).</param>
/// <param name="positionMapWithoutTerminalNs">Position map of the read without those terminal stretches.</param>
/// <param name="result">
/// Realignment result, updated in place. Must not be null: its cigar is read unconditionally.
/// </param>
/// <param name="context">Reference snippet used for mismatch counting and the alignment summary.</param>
/// <param name="prefixSoftclip">Prefix softclip length forwarded to <c>Helper.SoftclipCigar</c>.</param>
/// <param name="suffixSoftclip">Suffix softclip length forwarded to <c>Helper.SoftclipCigar</c>.</param>
/// <param name="freshCigarWithoutTerminalNs">Only used in the "no alignable bases" error message.</param>
/// <exception cref="InvalidDataException">
/// Thrown when re-masking is enabled and either the mismatch count comes back null or no base of the final
/// position map is mappable.
/// </exception>
public void ReapplySoftclips(Read read, int nPrefixLength, int nSuffixLength, PositionMap positionMapWithoutTerminalNs,
    RealignmentResult result, GenomeSnippet context, uint prefixSoftclip, uint suffixSoftclip, CigarAlignment freshCigarWithoutTerminalNs)
{
    // Re-append the N-prefix and N-suffix: unmapped (-1) in the position map...
    var nPrefixPositionMap = Enumerable.Repeat(-1, nPrefixLength);
    var nSuffixPositionMap = Enumerable.Repeat(-1, nSuffixLength);
    // TODO maybe have a function for combining pos maps instead
    var finalPositionMap = new PositionMap(nPrefixPositionMap.Concat(positionMapWithoutTerminalNs.Map).Concat(nSuffixPositionMap).ToArray());

    // ...and softclipped in the cigar.
    var finalCigar = new CigarAlignment { new CigarOp('S', (uint)nPrefixLength) };
    foreach (CigarOp op in result.Cigar)
    {
        finalCigar.Add(op);
    }
    finalCigar.Add(new CigarOp('S', (uint)nSuffixLength));
    finalCigar.Compress();

    result.Cigar = finalCigar;

    // In case realignment introduced a bunch of mismatch-Ms where there was previously softclipping, optionally re-mask them.
    // (The former "result != null" test here was dead code: result has already been dereferenced above.)
    if (!_remaskSoftclips)
    {
        return;
    }

    // The mismatch map is taken before the position map is adjusted for the new softclips below.
    var mismatchMap =
        Helper.GetMismatchMap(read.Sequence, finalPositionMap, context.Sequence, context.StartPosition);

    var softclipAdjustedCigar = Helper.SoftclipCigar(result.Cigar, mismatchMap, prefixSoftclip, suffixSoftclip,
        maskNsOnly: _maskNsOnly, prefixNs: Helper.GetCharacterBookendLength(read.Sequence, 'N', false),
        suffixNs: Helper.GetCharacterBookendLength(read.Sequence, 'N', true), softclipEvenIfMatch: _keepProbeSoftclips || _keepBothSideSoftclips,
        softclipRepresentsMess: (!(_keepBothSideSoftclips || _keepProbeSoftclips)));

    // Update position map to account for any softclipping added
    var adjustedPrefixClip = softclipAdjustedCigar.GetPrefixClip();
    for (var i = 0; i < adjustedPrefixClip; i++)
    {
        finalPositionMap.UpdatePositionAtIndex(i, -2, true);
    }

    var adjustedSuffixClip = softclipAdjustedCigar.GetSuffixClip();
    for (var i = 0; i < adjustedSuffixClip; i++)
    {
        finalPositionMap.UpdatePositionAtIndex(finalPositionMap.Length - 1 - i, -2, true);
    }

    var editDistance =
        Helper.GetNumMismatches(read.Sequence, finalPositionMap, context.Sequence, context.StartPosition);
    if (editDistance == null)
    {
        // This shouldn't happen at this point - we already have a successful result
        // Join the map's positions (its Map member) so the message lists them rather than the map object.
        throw new InvalidDataException("Edit distance is null for :" + read.Name + " with position map " +
                                       string.Join(",", finalPositionMap.Map) + " and CIGAR " + softclipAdjustedCigar);
    }

    // TODO PERF - See how much this really helps analytically. I'm thinking maybe kill this altogether and remove from eval
    var sumOfMismatching = Helper.GetSumOfMismatchQualities(mismatchMap, read.Qualities);

    var readHasPosition = finalPositionMap.HasAnyMappableBases();
    if (!readHasPosition)
    {
        throw new InvalidDataException(string.Format(
            "Read does not have any alignable bases. ({2} --> {0} --> {3}, {1})", freshCigarWithoutTerminalNs,
            string.Join(",", finalPositionMap.Map), read.CigarData, softclipAdjustedCigar));
    }

    result.Position = finalPositionMap.FirstMappableBase(); // TODO this used to be >= 0 but changed to > 0. Confirm correct.
    result.Cigar = softclipAdjustedCigar;
    result.NumMismatches = editDistance.Value;

    // Indices recorded against the N-trimmed read shift right by the re-added prefix.
    result.IndelsAddedAt = result.IndelsAddedAt.Select(index => index + nPrefixLength).ToList();
    result.NifiedAt = result.NifiedAt.Select(index => index + nPrefixLength).ToList();

    var newSummary = Extensions.GetAlignmentSummary(result.Position - 1 - context.StartPosition, result.Cigar,
        context.Sequence,
        read.Sequence, _trackActualMismatches, _checkSoftclipsForMismatches);

    result.NumNonNMismatches = newSummary.NumNonNMismatches;
    result.NumNonNSoftclips = newSummary.NumNonNSoftclips;
    result.NumSoftclips = newSummary.NumSoftclips;
    result.NumInsertedBases = newSummary.NumInsertedBases;
    result.NumMismatchesIncludeSoftclip = newSummary.NumMismatchesIncludeSoftclip;
    result.SumOfMismatchingQualities = sumOfMismatching;
    result.AnchorLength = newSummary.AnchorLength;
}
// Runs RealignmentEvaluator.GetFinalAlignment with a real realigner over a synthetic reference
// (1000 A's + "ATCGATTGA" + 1000 T's, snippet start 1000), first with no indels, then with one insertion,
// then with three competing insertions, with and without pre-existing indels on the read.
public void GetFinalAlignment_NonMock()
{
    var reference = new GenomeSnippet()
    {
        Chromosome = "chr1",
        Sequence = new string('A', 1000) + "ATCGATTGA" + new string('T', 1000),
        StartPosition = 1000
    };
    var mockSnippetSource = new Mock<IGenomeSnippetSource>();
    mockSnippetSource.Setup(x => x.GetGenomeSnippet(It.IsAny<int>())).Returns(reference);

    var statusHandler = new Mock<IStatusHandler>();
    var alignmentComparer = new GemBasicAlignmentComparer(false, false);

    // TODO fix
    // TODO figure out what I was saying to fix here...
    var realigner = new GeminiReadRealigner(alignmentComparer, remaskSoftclips: false,
        keepProbeSoftclips: false, keepBothSideSoftclips: false, trackActualMismatches: false,
        checkSoftclipsForMismatches: true, debug: false, maskNsOnly: false, maskPartialInsertion: false,
        minimumUnanchoredInsertionLength: 1, minInsertionSizeToAllowMismatchingBases: 4,
        maxProportionInsertSequenceMismatch: 0.2);

    var regionFilterer = GetMockRegionFilterer();

    var currentIndelSource = new ChromosomeIndelSource(new List<HashableIndel>(), mockSnippetSource.Object);

    // Every scenario gets a brand-new evaluator over the current indel source. The only argument that
    // varies between scenarios is the fourth boolean; judging by the scenario comments below it
    // presumably controls softclipping of unknown indels - confirm against RealignmentEvaluator.
    RealignmentEvaluator NewEvaluator(bool fourthFlag = true)
    {
        return new RealignmentEvaluator(currentIndelSource, statusHandler.Object, realigner,
            new RealignmentJudger(alignmentComparer), "chr1", false, true, true, fourthFlag, regionFilterer.Object, false);
    }

    List<PreIndel> SingleExistingInsertion(int position, string alternate)
    {
        return new List<PreIndel>()
        {
            new PreIndel(new CandidateAllele("chr1", position, "A", alternate, AlleleCategory.Insertion))
        };
    }

    var originalAlignment = TestHelpers.CreateBamAlignment("AAAAAAATTCA", 1500, 1500, 30, true, cigar: new CigarAlignment("11M"));

    // No indels
    var evaluator = NewEvaluator();
    var realigned = evaluator.GetFinalAlignment(originalAlignment, out bool changed, out bool forcedSoftclip, out bool confirmed, out bool sketchy);
    Assert.False(changed);
    Assert.False(confirmed);

    // One known insertion (A > ATT at 1506): the read is realigned around it.
    var singleInsertion = new List<HashableIndel>()
    {
        new HashableIndel()
        {
            Chromosome = "chr1",
            ReferencePosition = 1506,
            ReferenceAllele = "A",
            AlternateAllele = "ATT",
            Type = AlleleCategory.Insertion,
            Length = 2
        }
    };
    currentIndelSource = new ChromosomeIndelSource(singleInsertion, mockSnippetSource.Object);
    evaluator = NewEvaluator();
    realigned = evaluator.GetFinalAlignment(originalAlignment, out changed, out forcedSoftclip, out confirmed, out sketchy);
    Assert.True(changed);
    Assert.False(confirmed);
    Assert.Equal("7M2I2M", realigned.CigarData.ToString());

    // Feeding the realigned read back in: nothing changes and the indel is confirmed.
    var confirmedAccepteds = new List<HashableIndel>();
    evaluator = NewEvaluator();
    var reRealigned = evaluator.GetFinalAlignment(realigned, out changed, out forcedSoftclip, out confirmed, out sketchy, confirmedAccepteds: confirmedAccepteds);
    Assert.False(changed);
    Assert.True(confirmed);
    Assert.Equal("7M2I2M", reRealigned.CigarData.ToString());

    // Existing indel is best (and only)
    evaluator = NewEvaluator();
    reRealigned = evaluator.GetFinalAlignment(realigned, out changed, out forcedSoftclip, out confirmed, out sketchy,
        confirmedAccepteds: confirmedAccepteds, existingIndels: SingleExistingInsertion(1506, "ATT"));
    Assert.False(changed);
    Assert.True(confirmed);
    Assert.Equal("7M2I2M", reRealigned.CigarData.ToString());

    // Existing indel is unsanctioned but good fit - keep it
    var alignmentWithInsertion = TestHelpers.CreateBamAlignment("AAAAAAATTCA", 1500, 1500, 30, true, cigar: new CigarAlignment("7M3I1M"));
    evaluator = NewEvaluator(fourthFlag: false);
    var realignedExistingIns = evaluator.GetFinalAlignment(alignmentWithInsertion, out changed, out forcedSoftclip, out confirmed, out sketchy,
        confirmedAccepteds: confirmedAccepteds, existingIndels: SingleExistingInsertion(1506, "ATTC"));
    Assert.False(changed);
    Assert.False(confirmed);
    Assert.Equal("7M3I1M", realignedExistingIns.CigarData.ToString());

    // Existing indel is unsanctioned and we're softclipping unknowns - softclip it
    evaluator = NewEvaluator();
    realignedExistingIns = evaluator.GetFinalAlignment(alignmentWithInsertion, out changed, out forcedSoftclip, out confirmed, out sketchy,
        confirmedAccepteds: confirmedAccepteds, existingIndels: SingleExistingInsertion(1506, "ATTC"));
    Assert.False(changed);
    Assert.False(confirmed);
    Assert.Equal("7M4S", realignedExistingIns.CigarData.ToString());

    // Three competing insertions at the same position with different scores.
    var competingInsertions = new List<HashableIndel>()
    {
        new HashableIndel()
        {
            Chromosome = "chr1",
            ReferencePosition = 1506,
            ReferenceAllele = "A",
            AlternateAllele = "ATT",
            Type = AlleleCategory.Insertion,
            Length = 2,
            Score = 1000
        },
        new HashableIndel()
        {
            Chromosome = "chr1",
            ReferencePosition = 1506,
            ReferenceAllele = "A",
            AlternateAllele = "ATTC",
            Type = AlleleCategory.Insertion,
            Length = 3,
            Score = 760
        },
        new HashableIndel()
        {
            Chromosome = "chr1",
            ReferencePosition = 1506,
            ReferenceAllele = "A",
            AlternateAllele = "ATTG",
            Type = AlleleCategory.Insertion,
            Length = 3,
            Score = 10
        }
    };
    currentIndelSource = new ChromosomeIndelSource(competingInsertions, mockSnippetSource.Object);
    evaluator = NewEvaluator();
    realigned = evaluator.GetFinalAlignment(originalAlignment, out changed, out forcedSoftclip, out confirmed, out sketchy);
    Assert.True(changed);
    Assert.False(confirmed);
    Assert.Equal("7M3I1M", realigned.CigarData.ToString());

    // Feeding that result back in confirms it unchanged.
    confirmedAccepteds = new List<HashableIndel>();
    evaluator = NewEvaluator();
    reRealigned = evaluator.GetFinalAlignment(realigned, out changed, out forcedSoftclip, out confirmed, out sketchy, confirmedAccepteds: confirmedAccepteds);
    Assert.False(changed);
    Assert.True(confirmed);
    Assert.Equal("7M3I1M", reRealigned.CigarData.ToString());

    // Existing indel is not the top one but is the best fit, keep it
    evaluator = NewEvaluator();
    reRealigned = evaluator.GetFinalAlignment(realigned, out changed, out forcedSoftclip, out confirmed, out sketchy,
        confirmedAccepteds: confirmedAccepteds, existingIndels: SingleExistingInsertion(1506, "ATTC"));
    Assert.False(changed);
    Assert.True(confirmed);
    Assert.Equal("7M3I1M", reRealigned.CigarData.ToString());

    // Has existing unsanctioned indel and there are better ones to realign around - ignore the bad one, take the good
    evaluator = NewEvaluator();
    reRealigned = evaluator.GetFinalAlignment(realigned, out changed, out forcedSoftclip, out confirmed, out sketchy,
        confirmedAccepteds: confirmedAccepteds, existingIndels: SingleExistingInsertion(1507, "ATC"));
    Assert.False(changed);
    Assert.True(confirmed);
    Assert.Equal("7M3I1M", reRealigned.CigarData.ToString());
}
/// <summary>
/// Builds the <c>HashableIndel</c> for a candidate indel, annotating it with repeat, duplication and
/// flanking-reference information taken from the snippet.
/// </summary>
/// <param name="snippet">Reference snippet the indel is evaluated against.</param>
/// <param name="preIndel">Candidate indel.</param>
/// <param name="contextStart">Offset forwarded to the reference allele / prefix / suffix helpers.</param>
/// <param name="debug">When true, detected duplications and repeats are written to the console.</param>
/// <returns>The annotated indel, passed through <c>Helper.CopyHashable</c>.</returns>
public static HashableIndel GetHashableIndel(GenomeSnippet snippet, PreIndel preIndel, int contextStart, bool debug)
{
    const int maxRepeatUnitLength = 3;

    var refAllele = ActualReferenceAllele(snippet, preIndel, contextStart);
    var altAllele = ActualAltAllele(preIndel, refAllele);

    // Anything that is not strictly shorter on the alt side is treated as an insertion.
    var isInsertion = refAllele.Length <= altAllele.Length;
    var indelType = isInsertion ? AlleleCategory.Insertion : AlleleCategory.Deletion;

    // Inserted or deleted bases, i.e. the longer allele without its first base.
    var variantBases = isInsertion ? altAllele.Substring(1) : refAllele.Substring(1);

    var isRepeat = StitchingLogic.OverlapEvaluator.IsRepeat(variantBases, maxRepeatUnitLength, out var repeatUnit);

    // Duplication status is decided with the repeat call as it stands here, before the RMxN check below may override it.
    var isDuplication = Helper.IsDuplication(snippet.Sequence, preIndel.ReferencePosition, isRepeat, repeatUnit, altAllele);

    var approxDupsLeft = 0;
    var approxDupsRight = 0;
    if (isInsertion && preIndel.Length > 3)
    {
        var unitLength = preIndel.Length;

        // A reference window counts as a copy of the inserted bases when it is identical or differs by at most one mismatch.
        bool IsApproximateCopyAt(int windowStart)
        {
            var window = snippet.Sequence.Substring(windowStart, unitLength);
            if (window == variantBases)
            {
                return true;
            }

            return Helper.GetHammingNumMismatches(window, variantBases) <= 1;
        }

        // Need to go both directions because we're allowing inexact.
        // Rightwards, one unit at a time. TODO < or <=
        var rightCursor = preIndel.ReferencePosition - snippet.StartPosition;
        while (rightCursor + unitLength < snippet.Sequence.Length && IsApproximateCopyAt(rightCursor))
        {
            approxDupsRight++;
            rightCursor += unitLength;
        }

        // Leftwards, one unit at a time, stopping at the snippet start.
        var leftCursor = preIndel.ReferencePosition - unitLength - snippet.StartPosition;
        while (leftCursor >= 0 && IsApproximateCopyAt(leftCursor))
        {
            approxDupsLeft++;
            leftCursor -= unitLength;
        }
    }

    var repeats = Helper.ComputeRMxNLengthForIndel(preIndel.ReferencePosition - snippet.StartPosition, variantBases,
        snippet.Sequence, 6, out var rmxnRepeatUnit);
    if (repeats >= 6) // TODO make this configurable?
    {
        isRepeat = true;
        repeatUnit = rmxnRepeatUnit;
    }

    // For multi-indels, describe the partner with alleles re-read from the reference.
    var otherIndel = "";
    if (preIndel.InMulti)
    {
        var partner = GetIndelKey(preIndel.OtherIndel);
        partner.ReferenceAllele = ActualReferenceAllele(snippet, partner, contextStart);
        partner.AlternateAllele = ActualAltAllele(partner, partner.ReferenceAllele);
        otherIndel = Helper.CandidateToString(partner);
    }

    var length = Math.Abs(refAllele.Length - altAllele.Length);
    var isUntrustworthyInRepeatRegion =
        length == 1 && Helper.IsInHomopolymerStretch(snippet.Sequence, preIndel.ReferencePosition);

    // TODO ADD TESTS!!
    var refPrefix = ReferencePrefix(snippet, preIndel, contextStart);
    var refSuffix = ReferenceSuffix(snippet, preIndel, contextStart);

    //Read-end repeats of this repeat unit that are this length or smaller should not be trusted as insertion evidence, but larger ones can
    var leadingUnitsInSuffix = 0;
    if (isInsertion)
    {
        var probe = isRepeat ? repeatUnit : altAllele;
        var scanLimit = refSuffix.Length - probe.Length;

        // Count back-to-back copies of the probe at the very start of the suffix.
        for (var offset = 0;
            offset < scanLimit && refSuffix.Substring(offset, probe.Length) == probe;
            offset += probe.Length)
        {
            leadingUnitsInSuffix++;
        }
    }

    var annotated = new HashableIndel
    {
        Chromosome = preIndel.Chromosome,
        ReferencePosition = preIndel.ReferencePosition,
        ReferenceAllele = refAllele,
        AlternateAllele = altAllele,
        Type = indelType,
        Length = length,
        Score = preIndel.Score,
        InMulti = preIndel.InMulti,
        OtherIndel = otherIndel,
        IsRepeat = isRepeat,
        RepeatUnit = repeatUnit,
        IsDuplication = isDuplication,
        IsUntrustworthyInRepeatRegion = isUntrustworthyInRepeatRegion,
        RefPrefix = refPrefix,
        RefSuffix = refSuffix,
        NumBasesInReferenceSuffixBeforeUnique = leadingUnitsInSuffix,
        NumRepeatsNearby = repeats,
        NumApproxDupsLeft = approxDupsLeft,
        NumApproxDupsRight = approxDupsRight
    };

    annotated = Helper.CopyHashable(annotated, otherIndel);

    if (debug)
    {
        if (isDuplication)
        {
            Console.WriteLine($"Found a duplication: {annotated.StringRepresentation}");
        }

        if (isRepeat)
        {
            Console.WriteLine($"Found a repeat: {annotated.StringRepresentation}, {repeatUnit}");
        }
    }

    return annotated;
}
/// <summary>
/// Converts a chromosome's candidate indels to hashable indels and filters out weak or redundant ones.
/// </summary>
/// <remarks>
/// Filtering happens in three passes:
/// 1. drop zero-score indels and weakly supported variants flagged as untrustworthy in a repeat region;
/// 2. drop much weaker nearby indels of the same type and length whose resulting sequence differs from a
///    stronger indel's by at most one base;
/// 3. among non-multi indels of the same type at the same position, drop the much weaker ones, or drop
///    the whole group when more than two competitors remain.
/// </remarks>
/// <param name="indelsForChrom">Candidate indels for one chromosome.</param>
/// <param name="chrReference">Chromosome reference; the whole sequence is used as a snippet starting at 0.</param>
/// <param name="debug">When true, filtering decisions and summary counts are logged.</param>
/// <returns>The hashable indels that survive all passes.</returns>
private static List<HashableIndel> GetFinalIndelsForChromosome(List<PreIndel> indelsForChrom, ChrReference chrReference, bool debug)
{
    int numSkippedWeakShortComplex = 0;
    int numRepeatLotsCompetitors = 0;
    int numSkippedEffectiveSame = 0;

    var indelsdict = new Dictionary<HashableIndel, List<PreIndel>>();

    var snippet = new GenomeSnippet
    {
        Chromosome = chrReference.Name,
        Sequence = chrReference.Sequence,
        StartPosition = 0
    };
    var contextStart = 0;

    // TODO REFACTOR OUT FILTERING, TO MATCH SPEC STRUCTURE
    var numCandidates = indelsForChrom.Count;

    // TODO consider changing how this threshold is calculated
    var medianIndelSupport = numCandidates > 0
        ? indelsForChrom.Select(x => x.Observations).OrderBy(x => x).ToList()[numCandidates / 2]
        : 0;
    var thresholdForUntrustworthyRepeat = medianIndelSupport / 5;

    // Pass 1: build hashable indels, skipping zero-score and weak untrustworthy ones.
    foreach (var candidateIndel in indelsForChrom)
    {
        var indelIdentifier = GetHashableIndel(snippet, candidateIndel, contextStart, debug);

        if (indelIdentifier.Score == 0)
        {
            continue;
        }

        if (indelIdentifier.IsUntrustworthyInRepeatRegion && candidateIndel.Observations < thresholdForUntrustworthyRepeat && !indelIdentifier.InMulti)
        {
            if (debug)
            {
                Logger.WriteToLog(
                    $"Skipping variant {candidateIndel} because it is a weak, short variant in a complex region (Support: {candidateIndel.Observations}).");
            }

            numSkippedWeakShortComplex++;
            continue;
        }

        if (!indelsdict.TryGetValue(indelIdentifier, out var indelsForIdentifier))
        {
            indelsForIdentifier = new List<PreIndel>();
            indelsdict.Add(indelIdentifier, indelsForIdentifier);
        }

        indelsForIdentifier.Add(candidateIndel);
    }

    // Sequence of the window [snipStart, snipEnd) after applying the variant: reference up to the variant
    // position, then the inserted bases (nothing for a deletion), then the reference after the variant
    // (skipping the deleted bases for a deletion).
    string EffectiveSequence(HashableIndel variant, int snipStart, int snipEnd)
    {
        var deletedLength = variant.Type == AlleleCategory.Deletion ? variant.Length : 0;
        var preLength = variant.ReferencePosition - snippet.StartPosition - snipStart;
        var postStart = snipStart + preLength + deletedLength;
        var variantSeq = variant.Type == AlleleCategory.Deletion ? "" : variant.AlternateAllele.Substring(1);

        return snippet.Sequence.Substring(snipStart, preLength) + variantSeq +
               snippet.Sequence.Substring(postStart, snipEnd - postStart);
    }

    // Pass 2: strongest first, remove much weaker neighbours with essentially the same consequence.
    var toRemove = new HashSet<HashableIndel>();
    foreach (var indel in indelsdict.Keys.OrderByDescending(x => x.Score))
    {
        // Collapse neighbor deletions that have essentially the same consequence (todo should we do this with insertions too?)
        if (indel.InMulti || toRemove.Contains(indel))
        {
            continue;
        }

        // TODO should threshold relate to num repeats nearby?
        const int thresholdForNearby = 75;

        // Materialized once; the original re-ran this query for Any() and again for the loop.
        var nearbySameLengthIndels = indelsdict.Keys.Where(x =>
            !x.Equals(indel) && !x.InMulti &&
            Math.Abs(indel.ReferencePosition - x.ReferencePosition) <= thresholdForNearby &&
            x.Type == indel.Type && x.Length == indel.Length && x.Score * 2 < indel.Score).ToList();

        if (nearbySameLengthIndels.Count == 0)
        {
            continue;
        }

        var snipWidth = thresholdForNearby * 2;
        var snipStart = Math.Max(indel.ReferencePosition - snipWidth - snippet.StartPosition, 0);
        var snipEndAdjustment = indel.Type == AlleleCategory.Deletion ? indel.Length : 0;
        var snipEnd = Math.Min(indel.ReferencePosition - snippet.StartPosition + snipWidth + snipEndAdjustment,
            snippet.Sequence.Length);

        var effectiveSequence = EffectiveSequence(indel, snipStart, snipEnd);

        foreach (var nearIndel in nearbySameLengthIndels)
        {
            var effectiveSequence2 = EffectiveSequence(nearIndel, snipStart, snipEnd);

            var mismatches = 0;
            for (int i = 0; i < effectiveSequence.Length; i++)
            {
                if (effectiveSequence[i] != effectiveSequence2[i])
                {
                    mismatches++;
                }
            }

            if (debug)
            {
                Console.WriteLine(
                    $"{indel.StringRepresentation} ({indel.Score}) vs {nearIndel.StringRepresentation} ({nearIndel.Score})");
                Console.WriteLine(effectiveSequence);
                Console.WriteLine(effectiveSequence2);
                Console.WriteLine($"Mismatches: {mismatches}");
                Console.WriteLine();
            }

            if (mismatches <= 1)
            {
                numSkippedEffectiveSame++;
                if (debug)
                {
                    Logger.WriteToLog(
                        $"Removing {nearIndel.StringRepresentation} ({nearIndel.Score}) from contention as its consequence is extremely similar to {indel.StringRepresentation} ({indel.Score})");
                }

                toRemove.Add(nearIndel);
                // TODO do we want to add the score from the removed indels to the kept one?
            }
        }
    }

    foreach (var removeIndel in toRemove)
    {
        indelsdict.Remove(removeIndel);
    }

    toRemove.Clear();

    // Pass 3: resolve competition between indels of the same type at the same position.
    foreach (var indel in indelsdict.Keys)
    {
        if (indel.InMulti || toRemove.Contains(indel))
        {
            continue;
        }

        var variantsAtSamePos = indelsdict.Keys.Where(x =>
            x.ReferencePosition == indel.ReferencePosition && x.Type == indel.Type && !x.Equals(indel) &&
            !x.InMulti).ToList();
        var numVariantsAtSamePos = variantsAtSamePos.Count;
        if (numVariantsAtSamePos == 0)
        {
            continue;
        }

        var variantsRemovedFromSamePos = 0;
        foreach (var competitor in variantsAtSamePos)
        {
            if (competitor.Score * 2 < indel.Score && !competitor.HardToCall)
            {
                toRemove.Add(competitor);
                variantsRemovedFromSamePos++;
            }
        }

        if (numVariantsAtSamePos - variantsRemovedFromSamePos > 2)
        {
            // Too many serious competitors: drop this indel and all of them. The counter (previously never
            // incremented, so the summary always reported 0) counts each indel newly dropped here.
            if (toRemove.Add(indel))
            {
                numRepeatLotsCompetitors++;
            }

            foreach (var competitor in variantsAtSamePos)
            {
                if (toRemove.Add(competitor))
                {
                    numRepeatLotsCompetitors++;
                }
            }

            if (debug)
            {
                Logger.WriteToLog(
                    $"Skipping variant {indel.StringRepresentation} ({indel.Score}) and {numVariantsAtSamePos} competitors because it's a repeat with lots of competitors and there is no clear strong candidate ({(string.Join(",", variantsAtSamePos.Select(x => x.Score)))}).");
            }
        }
        else
        {
            // Note that this could be an issue if there are somatic indels at the same position as germline indels
            if (debug)
            {
                Logger.WriteToLog(
                    $"Removing {variantsRemovedFromSamePos} of {numVariantsAtSamePos} variants at same position as {indel.StringRepresentation} ({indel.Score}) ({(string.Join(",", variantsAtSamePos.Select(x => x.Score)))}).");
            }
        }
    }

    foreach (var removeIndel in toRemove)
    {
        indelsdict.Remove(removeIndel);
    }

    if (debug)
    {
        Logger.WriteToLog(
            $"Skipped {numRepeatLotsCompetitors} for being a repeat with lots of competitors and there is no clear strong candidate.");
        Logger.WriteToLog(
            $"Skipped {numSkippedWeakShortComplex} for being a weak, short variant in a complex region.");
        Logger.WriteToLog(
            $"Skipped {numSkippedEffectiveSame} for being effectively the same as a much stronger variant.");
    }

    return indelsdict.Keys.ToList();
}