/// <summary>
/// Creates a realigner for one chromosome, storing its collaborators and realignment settings.
/// </summary>
/// <remarks>
/// Known indels, when supplied, are converted to <c>CandidateIndel</c> instances; a null list stays null.
/// Alignments are compared with a <c>ScoredAlignmentComparer</c> when <paramref name="alignmentScorer"/>
/// is provided and with a <c>BasicAlignmentComparer</c> otherwise.
/// </remarks>
public ChrRealigner(ChrReference chrReference, IAlignmentExtractor extractorForCandidates, IAlignmentExtractor extractorForRealign, IIndelCandidateFinder indelFinder, IIndelRanker indelRanker, ITargetCaller caller, RealignStateManager stateManager, IRealignmentWriter writer, List<CandidateAllele> knownIndels = null, int maxIndelSize = 25, bool tryThree = false, int anchorSizeThreshold = 10, bool skipDuplicates = false, bool skipAndRemoveDuplicates = false, bool remaskSoftclips = true, bool maskPartialInsertion = false, int minimumUnanchoredInsertionLength = 0, bool tryRealignCleanSoftclippedReads = true, bool allowRescoringOrig0 = true, int maxRealignShift = 250, AlignmentScorer alignmentScorer = null, bool debug = false)
{
    // Collaborators
    _chrReference = chrReference;
    _extractorForCandidates = extractorForCandidates;
    _extractorForRealign = extractorForRealign;
    _indelFinder = indelFinder;
    _indelRanker = indelRanker;
    _caller = caller;
    _stateManager = stateManager;
    _writer = writer;

    // Known indels are optional
    if (knownIndels == null)
    {
        _knownIndels = null;
    }
    else
    {
        _knownIndels = knownIndels.Select(i => new CandidateIndel(i)).ToList();
    }

    // Scalar settings
    _maxIndelSize = maxIndelSize;
    _anchorSizeThreshold = anchorSizeThreshold;
    _maxRealignShift = maxRealignShift;
    _skipDuplicates = skipDuplicates;
    _skipAndRemoveDuplicates = skipAndRemoveDuplicates;
    _allowRescoringOrig0 = allowRescoringOrig0;
    _tryRealignCleanSoftclippedReads = tryRealignCleanSoftclippedReads;
    _debug = debug;

    // Comparer selection depends on whether a scorer was provided
    _alignmentScorer = alignmentScorer;
    if (alignmentScorer == null)
    {
        _alignmentComparer = new BasicAlignmentComparer();
    }
    else
    {
        _alignmentComparer = new ScoredAlignmentComparer(alignmentScorer);
    }

    _readRealigner = new ReadRealigner(
        _alignmentComparer,
        tryThree,
        remaskSoftclips,
        maskPartialInsertion,
        minimumUnanchoredInsertionLength);
}
/// <summary>
/// Builds the allele caller for a chromosome from the current application options.
/// </summary>
/// <param name="chrReference">Reference for the chromosome being called; stored on the caller config and used to name the variant collapser.</param>
/// <param name="intervalSet">Interval set handed to the allele caller.</param>
/// <returns>A new <c>AlleleCaller</c> configured from <c>_options</c>.</returns>
protected virtual IAlleleCaller CreateVariantCaller(ChrReference chrReference, ChrIntervalSet intervalSet)
{
    var coverageCalculator = CreateCoverageCalculator();

    var genotypeCalculator = GenotypeCreator.CreateGenotypeCalculator(
        _options.PloidyModel,
        _options.FilteredVariantFrequency,
        _options.MinimumDepth,
        _options.DiploidThresholdingParameters,
        _options.MinimumGenotypeQScore,
        _options.MaximumGenotypeQScore);

    var callerConfig = new VariantCallerConfig
    {
        IncludeReferenceCalls = _options.OutputgVCFFiles,
        MinVariantQscore = _options.MinimumVariantQScore,
        MaxVariantQscore = _options.MaximumVariantQScore,
        MinGenotypeQscore = _options.MinimumGenotypeQScore,
        MaxGenotypeQscore = _options.MaximumGenotypeQScore,
        VariantQscoreFilterThreshold = _options.FilteredVariantQScore,
        MinCoverage = _options.MinimumDepth,
        MinFrequency = _options.MinimumFrequency,
        EstimatedBaseCallQuality = GetEstimatedBaseCallQuality(),
        StrandBiasModel = _options.StrandBiasModel,
        StrandBiasFilterThreshold = _options.StrandBiasAcceptanceCriteria,
        FilterSingleStrandVariants = _options.FilterOutVariantsPresentOnlyOneStrand,
        GenotypeCalculator = genotypeCalculator,
        VariantFreqFilter = _options.FilteredVariantFrequency,
        LowGTqFilter = _options.LowGenotypeQualityFilter,
        IndelRepeatFilter = _options.IndelRepeatFilter,
        LowDepthFilter = _options.LowDepthFilter,
        ChrReference = chrReference,
        RMxNFilterSettings = new RMxNFilterSettings
        {
            RMxNFilterMaxLengthRepeat = _options.RMxNFilterMaxLengthRepeat,
            RMxNFilterMinRepetitions = _options.RMxNFilterMinRepetitions,
            RMxNFilterFrequencyLimit = _options.RMxNFilterFrequencyLimit
        },
        NoiseModel = _options.NoiseModel
    };

    // The collapser and the caller share the same coverage calculator instance.
    var variantCollapser = CreateVariantCollapser(chrReference.Name, coverageCalculator);

    return new AlleleCaller(callerConfig, intervalSet, variantCollapser, coverageCalculator);
}
/// <summary>
/// Creates the region mapper used to pad reference calls, or returns null when none is needed.
/// </summary>
/// <remarks>
/// A mapper is only produced when an interval set is supplied and gVCF output is enabled in the
/// VCF writing parameters. <paramref name="includeReference"/> is not read by this implementation.
/// </remarks>
protected virtual IRegionMapper CreateRegionPadder(ChrReference chrReference, ChrIntervalSet intervalSet, bool includeReference)
{
    // padder is only required if there are intervals and we are including reference calls
    var padderRequired = intervalSet != null && _options.VcfWritingParameters.OutputGvcfFile;
    if (!padderRequired)
    {
        return null;
    }

    return new RegionMapper(chrReference, intervalSet, _options.BamFilterParameters.MinimumBaseCallQuality);
}
/// <summary>
/// Given a list of raw (non-genome-contextualized) indels to realign around, returns a list of hashable, contextualized indels.
/// </summary>
/// <param name="chrom">Chromosome name. Not read by this overload; the call is forwarded without it.</param>
/// <param name="indelsForChrom">Raw indels found for the chromosome.</param>
/// <param name="chrReference">Reference for the chromosome, forwarded to the underlying overload.</param>
/// <returns>The list produced by the underlying overload, invoked with this instance's debug flag.</returns>
public List<HashableIndel> GetFinalIndelsForChromosome(string chrom, List<PreIndel> indelsForChrom, ChrReference chrReference)
{
    var finalIndels = GetFinalIndelsForChromosome(indelsForChrom, chrReference, _debug);
    return finalIndels;
}
/// <summary>
/// Builds the allele caller for a chromosome from the variant calling and VCF writing parameters.
/// </summary>
/// <param name="chrReference">Reference for the chromosome being called.</param>
/// <param name="intervalSet">Interval set handed to the allele caller.</param>
/// <param name="alignmentSource">Alignment source used to create the coverage calculator.</param>
/// <param name="forceGtAlleles">Optional forced-genotype alleles, passed to <c>AddForcedGtAlleles</c> as-is (including null).</param>
/// <returns>The configured <c>AlleleCaller</c>.</returns>
protected virtual IAlleleCaller CreateVariantCaller(ChrReference chrReference, ChrIntervalSet intervalSet, IAlignmentSource alignmentSource, HashSet<Tuple<string, int, string, string>> forceGtAlleles = null)
{
    var callingParameters = _options.VariantCallingParameters;
    var coverageCalculator = CreateCoverageCalculator(alignmentSource);

    var genotypeCalculator = GenotypeCreator.CreateGenotypeCalculator(
        callingParameters.PloidyModel,
        callingParameters.MinimumFrequencyFilter,
        callingParameters.MinimumCoverage,
        callingParameters.DiploidSNVThresholdingParameters,
        callingParameters.DiploidINDELThresholdingParameters,
        callingParameters.AdaptiveGenotypingParameters,
        callingParameters.MinimumGenotypeQScore,
        callingParameters.MaximumGenotypeQScore,
        callingParameters.TargetLODFrequency,
        callingParameters.MinimumFrequency,
        chrReference.Name,
        callingParameters.IsMale);
    genotypeCalculator.SetMinFreqFilter(callingParameters.MinimumFrequencyFilter);

    // Diploid-by-thresholding gets its own locus processor; everything else is treated as somatic.
    ILocusProcessor locusProcessor;
    if (callingParameters.PloidyModel == PloidyModel.DiploidByThresholding)
    {
        locusProcessor = new DiploidLocusProcessor();
    }
    else
    {
        locusProcessor = new SomaticLocusProcessor();
    }

    var variantCallerConfig = new VariantCallerConfig
    {
        IncludeReferenceCalls = _options.VcfWritingParameters.OutputGvcfFile,
        MinVariantQscore = callingParameters.MinimumVariantQScore,
        MaxVariantQscore = callingParameters.MaximumVariantQScore,
        MinGenotypeQscore = callingParameters.MinimumGenotypeQScore,
        MaxGenotypeQscore = callingParameters.MaximumGenotypeQScore,
        VariantQscoreFilterThreshold = callingParameters.MinimumVariantQScoreFilter,
        NoCallFilterThreshold = callingParameters.NoCallFilterThreshold,
        AmpliconBiasFilterThreshold = callingParameters.AmpliconBiasFilterThreshold,
        MinCoverage = callingParameters.MinimumCoverage,
        // Frequency thresholds come from the genotype calculator rather than directly from the options.
        MinFrequency = genotypeCalculator.MinVarFrequency,
        NoiseLevelUsedForQScoring = callingParameters.NoiseLevelUsedForQScoring,
        StrandBiasModel = callingParameters.StrandBiasModel,
        StrandBiasFilterThreshold = callingParameters.StrandBiasAcceptanceCriteria,
        FilterSingleStrandVariants = callingParameters.FilterOutVariantsPresentOnlyOneStrand,
        GenotypeCalculator = genotypeCalculator,
        VariantFreqFilter = genotypeCalculator.MinVarFrequencyFilter,
        LowGTqFilter = callingParameters.LowGenotypeQualityFilter,
        IndelRepeatFilter = callingParameters.IndelRepeatFilter,
        LowDepthFilter = callingParameters.LowDepthFilter,
        ChrReference = chrReference,
        RMxNFilterSettings = new RMxNFilterSettings
        {
            RMxNFilterMaxLengthRepeat = callingParameters.RMxNFilterMaxLengthRepeat,
            RMxNFilterMinRepetitions = callingParameters.RMxNFilterMinRepetitions,
            RMxNFilterFrequencyLimit = callingParameters.RMxNFilterFrequencyLimit
        },
        NoiseModel = callingParameters.NoiseModel,
        LocusProcessor = locusProcessor
    };

    var variantCollapser = CreateVariantCollapser(chrReference.Name, coverageCalculator);
    var alleleCaller = new AlleleCaller(variantCallerConfig, intervalSet, variantCollapser, coverageCalculator);
    alleleCaller.AddForcedGtAlleles(forceGtAlleles);

    return alleleCaller;
}
/// <summary>
/// Returns the injected mock somatic variant caller when one is set; otherwise defers to the base factory.
/// </summary>
public override ISomaticVariantCaller CreateSomaticVariantCaller(ChrReference chrReference, string bamFilePath, IVcfWriter vcfWriter, IStrandBiasFileWriter biasFileWriter = null, string intervalFilePath = null)
{
    if (MockSomaticVariantCaller == null)
    {
        return base.CreateSomaticVariantCaller(chrReference, bamFilePath, vcfWriter, biasFileWriter, intervalFilePath);
    }

    return MockSomaticVariantCaller.Object;
}
/// <summary>
/// Loads a region with two SNV candidates plus allele counts, then checks that
/// <c>GetAllCandidates</c> returns the expected candidates for the given combination of
/// reference inclusion and interval restriction.
/// </summary>
/// <param name="withReference">Whether reference alleles are requested from the region.</param>
/// <param name="withIntervals">Whether an interval set (3-6 and 16-16) is supplied.</param>
public void ExecuteTest_GetCandidates(bool withReference, bool withIntervals)
{
    var region = new RegionState(1, 50);
    var chrReference = new ChrReference() { Name = "chr1", Sequence = new string('A', 50) };

    var snv1 = new CandidateAllele("chr1", 5, "A", "T", AlleleCategory.Snv) { SupportByDirection = new[] { 10, 5, 0 } };
    var snv2 = new CandidateAllele("chr1", 15, "A", "T", AlleleCategory.Snv) { SupportByDirection = new[] { 10, 5, 0 } };
    region.AddCandidate(snv1);
    region.AddCandidate(snv2);

    for (var repeat = 0; repeat < 5; repeat++)
    {
        // ref @ variant position
        region.AddAlleleCount(5, AlleleType.A, DirectionType.Stitched);
        // ref by itself
        region.AddAlleleCount(6, AlleleType.A, DirectionType.Stitched);
        // nonref by itself (no ref)
        region.AddAlleleCount(10, AlleleType.C, DirectionType.Stitched);
        // ref (multiple directions) + nonref
        region.AddAlleleCount(15, AlleleType.A, DirectionType.Reverse);
        region.AddAlleleCount(15, AlleleType.A, DirectionType.Forward);
        region.AddAlleleCount(15, AlleleType.T, DirectionType.Reverse);
    }

    ChrIntervalSet intervals = withIntervals
        ? new ChrIntervalSet(
            new List<CallSomaticVariants.Logic.RegionState.Region>()
            {
                new CallSomaticVariants.Logic.RegionState.Region(3, 6),
                new CallSomaticVariants.Logic.RegionState.Region(16, 16)
            },
            "chr1")
        : null;

    // Variant candidates are always expected.
    var expected = new List<CandidateAllele> { snv1, snv2 };

    if (withReference)
    {
        expected.AddRange(new[]
        {
            new CandidateAllele("chr1", 5, "A", "A", AlleleCategory.Reference) { SupportByDirection = new[] { 0, 0, 5 } },
            new CandidateAllele("chr1", 6, "A", "A", AlleleCategory.Reference) { SupportByDirection = new[] { 0, 0, 5 } },
            new CandidateAllele("chr1", 10, "A", "A", AlleleCategory.Reference) { SupportByDirection = new[] { 0, 0, 0 } },
            new CandidateAllele("chr1", 15, "A", "A", AlleleCategory.Reference) { SupportByDirection = new[] { 5, 5, 0 } }
        });
    }

    if (withIntervals)
    {
        // Only reference candidates at positions 5 and 6 survive the interval restriction;
        // non-reference candidates are kept regardless of position.
        expected.RemoveAll(c => c.Type == AlleleCategory.Reference && c.Coordinate != 5 && c.Coordinate != 6);

        if (withReference)
        {
            // Zero-coverage reference positions inside the intervals are expected as well.
            expected.AddRange(new[]
            {
                new CandidateAllele("chr1", 3, "A", "A", AlleleCategory.Reference) { SupportByDirection = new[] { 0, 0, 0 } },
                new CandidateAllele("chr1", 4, "A", "A", AlleleCategory.Reference) { SupportByDirection = new[] { 0, 0, 0 } },
                new CandidateAllele("chr1", 16, "A", "A", AlleleCategory.Reference) { SupportByDirection = new[] { 0, 0, 0 } }
            });
        }
    }

    var actual = region.GetAllCandidates(withReference, chrReference, intervals);

    VerifyCandidates(expected, actual);
}
/// <summary>
/// Exercises <c>HashableIndelSource.GetFinalIndelsForChromosome</c> across several scenarios:
/// similar insertions with different scores, homopolymer stutter, long insertions, long deletions,
/// and homopolymer deletions whose relative scores are varied between calls.
/// </summary>
public void GetFinalIndelsForChromosome()
{
    var insertion1 = new PreIndel(new CandidateAllele("chr1", 100, "N", "NGA", AlleleCategory.Insertion));
    insertion1.Score = 100;

    var deletion = new PreIndel(new CandidateAllele("chr1", 5, "NNNN", "N", AlleleCategory.Deletion));
    deletion.Score = 100;

    var insertionSimilarToIns1 = new PreIndel(new CandidateAllele("chr1", 100, "N", "NGC", AlleleCategory.Insertion));
    insertionSimilarToIns1.Score = 20;

    var insertion2 = new PreIndel(new CandidateAllele("chr1", 302, "N", "NTCATCA", AlleleCategory.Insertion));
    insertion2.Score = 100;

    var insertionSimilarConsequenceToIns2 = new PreIndel(new CandidateAllele("chr1", 305, "N", "NTCATGA", AlleleCategory.Insertion));
    insertionSimilarConsequenceToIns2.Score = 20;

    var insertionNotSimilarEnoughConsequenceToIns2 = new PreIndel(new CandidateAllele("chr1", 305, "N", "NTCAGTA", AlleleCategory.Insertion));
    insertionNotSimilarEnoughConsequenceToIns2.Score = 20;

    var insertionContainingInsertion2 = new PreIndel(new CandidateAllele("chr1", 302, "N", "NTCATCATCATCA", AlleleCategory.Insertion));
    insertionContainingInsertion2.Score = 20;

    // TODO add edge cases in terms of score, negative cases in terms of different variant types

    var preIndels = new List<PreIndel>()
    {
        deletion,
        insertion1,
        insertionSimilarToIns1,
        insertion2,
        insertionSimilarConsequenceToIns2,
        insertionNotSimilarEnoughConsequenceToIns2,
        insertionContainingInsertion2
    };

    // insertionSimilarToIns1 is removed for being very similar to insertion 1 and much lower quality
    // insertionSimilarConsequenceToIns2 is removed for having almost the exact same consequence as insertion 2 and much lower quality
    // insertionNotSimilarEnoughConsequenceToIns2 is pretty close to insertion 2 in terms of consequence, and weaker, but not similar enough, so can stay
    // insertionContainingInsertion2 has exact same nearby consequence and position as insertion 2 but it is hard to call, being a long dup. so it gets to stay.

    var indelSource = new HashableIndelSource();
    var chrReference = new ChrReference()
    {
        FastaPath = "abc",
        Name = "chr1",
        Sequence = new string('A', 99) + new string('T', 5) + new string('C', 195) + // 299
                   string.Join("", Enumerable.Repeat("TCA", 20)) + new string('G', 300)
    };

    // Rehydrate with reference sequence and keep the right ones
    var finalIndels = indelSource.GetFinalIndelsForChromosome("chr1", preIndels, chrReference);
    Assert.Equal(5, finalIndels.Count);

    // Position 100 is the first 'T' of the reference (99 A's precede it), so the hydrated form of
    // insertionSimilarToIns1 would be T>TGC. Checking with an 'A' reference base could never match.
    EnsureIndelNotPresent(finalIndels, insertionSimilarToIns1.ReferencePosition, "T", "TGC");
    EnsureIndelNotPresent(finalIndels, insertionSimilarConsequenceToIns2.ReferencePosition, "A", "ATCATGA");

    var ins1 = CheckForIndel(finalIndels, 100, "T", "TGA", 100);
    Assert.False(ins1.IsDuplication);
    Assert.False(ins1.IsRepeat);

    var del = CheckForIndel(finalIndels, 5, "AAAA", "A", 100);
    Assert.False(del.IsDuplication);
    Assert.True(del.IsRepeat);

    var ins2 = CheckForIndel(finalIndels, 302, "A", "ATCATCA", 100);
    Assert.True(ins2.IsRepeat);
    Assert.True(ins2.IsDuplication);

    var ins2NotSimilarEnough = CheckForIndel(finalIndels, 305, "A", "ATCAGTA", 20);
    Assert.True(ins2NotSimilarEnough.IsRepeat);
    Assert.False(ins2NotSimilarEnough.IsDuplication);

    var longerInsertion = CheckForIndel(finalIndels, 302, "A", "ATCATCATCATCA", 20);
    Assert.True(longerInsertion.IsRepeat);
    Assert.True(longerInsertion.IsDuplication);
    Assert.True(longerInsertion.HardToCall);

    // Should handle scenario of stutter
    // 012345678901234567890
    // ...CCCCCCGGGGGTTTTTAAAAATATATA
    //   ins GGG at position 300 (after the last C)
    //   ins TGG at position 305 (after the last G)
    // ...CCCCCCGGGGGTGGTTTTTAAAAATATATA
    // ...CCCCCCGGGGGGGGTTTTTAAAAATATATA
    var homopolymerIns = new PreIndel(new CandidateAllele("chr1", 300, "N", "NGGG", AlleleCategory.Insertion));
    homopolymerIns.Score = 100;

    var homopolymerInsWithStutter = new PreIndel(new CandidateAllele("chr1", 305, "N", "NTGG", AlleleCategory.Insertion));
    homopolymerInsWithStutter.Score = 10;

    preIndels = new List<PreIndel>() { homopolymerIns, homopolymerInsWithStutter };
    indelSource = new HashableIndelSource();
    chrReference = new ChrReference()
    {
        FastaPath = "abc",
        Name = "chr1",
        Sequence = new string('C', 300) + "GGGGGTTTTTAAAAATATATA" + new string('G', 300)
    };

    finalIndels = indelSource.GetFinalIndelsForChromosome("chr1", preIndels, chrReference);
    Assert.Equal(1, finalIndels.Count);

    //chr1: 125080780 N > NTTTGATTCCATTCGATGATCACTACATTCAGTTCCATTCAATGATGATTCCAACAGATTCCATTTGGTGACTCCATTCGATTCTATTCATTGATGATTCCA
    //chr1: 125080854 N > NATTCGATTCTATTCATTGATGATTCCATTTGATTCCATTCGATGATGACTGCCTTCAGTTCCATTCGGTGATGATTCCAACAGATTCCATTTGGTGACTCA
    var realLongIns1 = new PreIndel(new CandidateAllele("chr1", 780, "N", "NTTTGATTCCATTCGATGATCACTACATTCAGTTCCATTCAATGATGATTCCAACAGATTCCATTTGGTGACTCCATTCGATTCTATTCATTGATGATTCCA", AlleleCategory.Insertion));
    realLongIns1.Score = 100;

    var realLongIns2 = new PreIndel(new CandidateAllele("chr1", 854, "N", "NATTCGATTCTATTCATTGATGATTCCATTTGATTCCATTCGATGATGACTGCCTTCAGTTCCATTCGGTGATGATTCCAACAGATTCCATTTGGTGACTCA", AlleleCategory.Insertion));
    realLongIns2.Score = 20;

    preIndels = new List<PreIndel>() { realLongIns1, realLongIns2 };
    indelSource = new HashableIndelSource();
    chrReference = new ChrReference()
    {
        FastaPath = "abc",
        Name = "chr1",
        Sequence = new string('A', 3000)
    };

    finalIndels = indelSource.GetFinalIndelsForChromosome("chr1", preIndels, chrReference);
    Assert.Equal(2, finalIndels.Count);

    // Long deletion - should adjust snippet width to accommodate
    var longDel1 = new PreIndel(new CandidateAllele("chr1", 100, new string('N', 200), "N", AlleleCategory.Deletion));
    longDel1.Score = 100;

    var longDel2 = new PreIndel(new CandidateAllele("chr1", 150, new string('N', 200), "N", AlleleCategory.Deletion));
    longDel2.Score = 20;

    preIndels = new List<PreIndel>() { longDel1, longDel2 };
    indelSource = new HashableIndelSource();
    chrReference = new ChrReference()
    {
        FastaPath = "abc",
        Name = "chr1",
        Sequence = new string('A', 100) + new string('T', 100) + new string('C', 1000)
    };

    finalIndels = indelSource.GetFinalIndelsForChromosome("chr1", preIndels, chrReference);
    Assert.Equal(2, finalIndels.Count);

    // Same deletions against a reference with a much longer run of T's: only one is kept.
    chrReference = new ChrReference()
    {
        FastaPath = "abc",
        Name = "chr1",
        Sequence = new string('A', 100) + new string('T', 500) + new string('C', 1000)
    };

    finalIndels = indelSource.GetFinalIndelsForChromosome("chr1", preIndels, chrReference);
    Assert.Equal(1, finalIndels.Count);

    // 012345678901234567890
    // ...CCCCCCGGGGGGGGAGGTTTTTAAAAATATATA
    // ...CCCCCC---GGGGGAGGTTTTTAAAAATATATA // del 1
    // ...CCCCCCGGGGGGGG---TTTTTAAAAATATATA // del 2
    // ...CCCCCCGGGGGGGGA---TTTTAAAAATATATA // del 3
    // ...CCCCCCGGGGGAGGTTTTTAAAAATATATA // effective 1
    // ...CCCCCCGGGGGGGGTTTTTAAAAATATATA // effective 2
    // ...CCCCCCGGGGGGGGATTTTAAAAATATATA // effective 3 - edit distance of 2 from eff1, 1 from eff2
    var homopolymerDel = new PreIndel(new CandidateAllele("chr1", 300, "NNNN", "N", AlleleCategory.Deletion));
    homopolymerDel.Score = 100;

    var homopolymerDelMuchWeakerOneMismatch = new PreIndel(new CandidateAllele("chr1", 308, "NNNN", "N", AlleleCategory.Deletion));
    homopolymerDelMuchWeakerOneMismatch.Score = 10;

    var homopolymerDelMuchWeakerTwoMismatch = new PreIndel(new CandidateAllele("chr1", 309, "NNNN", "N", AlleleCategory.Deletion));
    homopolymerDelMuchWeakerTwoMismatch.Score = 10;

    preIndels = new List<PreIndel>()
    {
        homopolymerDel,
        homopolymerDelMuchWeakerOneMismatch,
        homopolymerDelMuchWeakerTwoMismatch
    };
    indelSource = new HashableIndelSource();
    chrReference = new ChrReference()
    {
        FastaPath = "abc",
        Name = "chr1",
        Sequence = new string('C', 300) + "GGGGGGGGAGGTTTTTAAAAATATATA" + new string('G', 300)
    };

    finalIndels = indelSource.GetFinalIndelsForChromosome("chr1", preIndels, chrReference);
    Assert.Equal(2, finalIndels.Count);
    CheckForIndel(finalIndels, 300, "CGGG", "C", 100);
    EnsureIndelNotPresent(finalIndels, 308, "GAGG", "G");
    CheckForIndel(finalIndels, 309, "AGGT", "A", 10);

    // Same deletions but flip the scores -- The deletions have very similar consequences, but there is not a clear
    // stronger deletion, which makes us less confident that these are mismatching versions of the same deletion. Keep all.
    // NOTE: the PreIndel instances are shared with preIndels, so mutating Score here changes the next call's input.
    homopolymerDelMuchWeakerTwoMismatch.Score = 60;
    homopolymerDelMuchWeakerOneMismatch.Score = 60;

    finalIndels = indelSource.GetFinalIndelsForChromosome("chr1", preIndels, chrReference);
    Assert.Equal(3, finalIndels.Count);
    CheckForIndel(finalIndels, 300, "CGGG", "C", 100);
    CheckForIndel(finalIndels, 308, "GAGG", "G", 60);
    CheckForIndel(finalIndels, 309, "AGGT", "A", 60);

    // Same deletions but flip the scores -- The strongest deletion is edit distance of 1 away from both of the others
    homopolymerDel.Score = 40;
    homopolymerDelMuchWeakerTwoMismatch.Score = 10;
    homopolymerDelMuchWeakerOneMismatch.Score = 100;

    finalIndels = indelSource.GetFinalIndelsForChromosome("chr1", preIndels, chrReference);
    Assert.Equal(1, finalIndels.Count);
    EnsureIndelNotPresent(finalIndels, 300, "CGGG", "C");
    CheckForIndel(finalIndels, 308, "GAGG", "G", 100);
    EnsureIndelNotPresent(finalIndels, 309, "AGGT", "A");
}
/// <summary>
/// Returns the injected mock variant caller when one is set; otherwise defers to the base factory.
/// </summary>
protected override IAlleleCaller CreateVariantCaller(ChrReference chrReference, ChrIntervalSet intervalSet)
{
    if (MockVariantCaller == null)
    {
        return base.CreateVariantCaller(chrReference, intervalSet);
    }

    return MockVariantCaller.Object;
}
/// <summary>
/// Collects every candidate allele tracked by this region and, when requested, reference
/// candidates built from the stored allele counts.
/// </summary>
/// <param name="includeRefAlleles">When true, reference alleles are added using <paramref name="intervals"/> (or the whole block if null).</param>
/// <param name="chrReference">Reference used for base lookup and chromosome name.</param>
/// <param name="intervals">Optional intervals restricting which reference positions are emitted.</param>
/// <param name="forcesGtAlleles">Optional forced alleles; when reference alleles are not requested, intervals are derived from these instead.</param>
/// <returns>Variant candidates followed by any reference candidates.</returns>
public List<CandidateAllele> GetAllCandidates(bool includeRefAlleles, ChrReference chrReference, ChrIntervalSet intervals = null, HashSet<Tuple<string, int, string, string>> forcesGtAlleles = null)
{
    var candidates = new List<CandidateAllele>();

    // add all candidates - these are potentially collapsable targets
    foreach (var candidatesAtPosition in _candidateVariantsLookup)
    {
        if (candidatesAtPosition == null)
        {
            continue;
        }

        candidates.AddRange(candidatesAtPosition);
    }

    var intervalsInUse = includeRefAlleles
        ? intervals
        : CreateIntervalsFromAllels(chrReference, forcesGtAlleles);

    // Reference alleles are only gathered when asked for, or when forced alleles were supplied.
    var hasForcedAlleles = forcesGtAlleles != null && forcesGtAlleles.Count != 0;
    if (!includeRefAlleles && !hasForcedAlleles)
    {
        return candidates;
    }

    var regionsToFetch = intervalsInUse == null
        ? new List<Region> { this }            // fetch whole block region
        : intervalsInUse.GetClipped(this);     // clip intervals to block region

    foreach (var region in regionsToFetch)
    {
        for (var position = region.StartPosition; position <= region.EndPosition; position++)
        {
            // Positions past the end of the reference cannot be looked up; stop scanning this region.
            if (position > chrReference.Sequence.Length)
            {
                break;
            }

            var offsetInBlock = position - StartPosition;

            // add ref alleles within region to fetch - note that zero coverage ref positions are only added if input intervals provided
            var refBase = chrReference.Sequence[position - 1].ToString();
            var refBaseIndex = (int)AlleleHelper.GetAlleleType(refBase);
            var refAllele = new CandidateAllele(chrReference.Name, position, refBase, refBase, AlleleCategory.Reference);

            // gather support for allele
            var totalSupport = 0;
            for (var alleleTypeIndex = 0; alleleTypeIndex < Constants.NumAlleleTypes; alleleTypeIndex++)
            {
                for (var directionIndex = 0; directionIndex < Constants.NumDirectionTypes; directionIndex++)
                {
                    // Sum across anchor types for this allele/direction.
                    var directionSupport = 0;
                    for (int anchorIndex = 0; anchorIndex < NumAnchorIndexes; anchorIndex++)
                    {
                        directionSupport += _alleleCounts[offsetInBlock, alleleTypeIndex, directionIndex, anchorIndex];
                    }

                    if (alleleTypeIndex == refBaseIndex)
                    {
                        refAllele.SupportByDirection[directionIndex] = directionSupport;
                        // TODO this isn't really proven to be well-anchored, nor is it proven not to be
                        //refAllele.WellAnchoredSupportByDirection[directionIndex] = count;
                    }

                    totalSupport += directionSupport;
                }
            }

            if (intervalsInUse != null || totalSupport > 0)
            {
                candidates.Add(refAllele);
            }
        }
    }

    return candidates;
}
/// <summary>
/// Clears and recomputes the filters on a called allele.
/// </summary>
/// <remarks>
/// Depth and variant Q-score filters apply to every allele. All remaining filters (no-call, strand bias,
/// amplicon bias, indel repeat, RMxN, low frequency, stitched-source 'N' check) apply only to
/// non-reference alleles. <paramref name="lowGenotypeqFilter"/> is not read by this method.
/// </remarks>
private static void ApplyFilters(CalledAllele allele, int? minCoverageFilter, int? variantQscoreThreshold, bool filterSingleStrandVariants, float? variantFreqFilter, float? lowGenotypeqFilter, int? indelRepeatFilter, RMxNFilterSettings rMxNFilterSettings, float? noCallFilter, float? ampliconBiasFilter, bool hasStitchedSource, ChrReference chrReference)
{
    //Reset filters
    allele.Filters.Clear();

    if (minCoverageFilter.HasValue && allele.TotalCoverage < minCoverageFilter)
    {
        allele.AddFilter(FilterType.LowDepth);
    }

    //note we wont flag it for Qscore, if its got zero depth, because in that case, the Q score calc was not made anyway.
    if (variantQscoreThreshold.HasValue && allele.VariantQscore < variantQscoreThreshold && (allele.TotalCoverage != 0))
    {
        allele.AddFilter(FilterType.LowVariantQscore);
    }

    // Everything below is variant-only.
    if (allele.Type == AlleleCategory.Reference)
    {
        return;
    }

    //No call filter
    if (noCallFilter.HasValue && allele.FractionNoCalls > noCallFilter)
    {
        allele.AddFilter(FilterType.NoCall);
    }

    var failsStrandBias = !allele.StrandBiasResults.BiasAcceptable
                          || (filterSingleStrandVariants && !allele.StrandBiasResults.VarPresentOnBothStrands);
    if (failsStrandBias)
    {
        allele.AddFilter(FilterType.StrandBias);
    }

    if (allele.AmpliconBiasResults != null && allele.AmpliconBiasResults.BiasDetected && ampliconBiasFilter.HasValue)
    {
        allele.AddFilter(FilterType.AmpliconBias);
    }

    if (indelRepeatFilter.HasValue && indelRepeatFilter > 0)
    {
        var repeatLength = ComputeIndelRepeatLength(allele, chrReference.Sequence);
        if (indelRepeatFilter <= repeatLength)
        {
            allele.AddFilter(FilterType.IndelRepeatLength);
        }
    }

    if (RMxNCalculator.ShouldFilter(allele, rMxNFilterSettings, chrReference.Sequence))
    {
        allele.AddFilter(FilterType.RMxN);
    }

    if (variantFreqFilter.HasValue && allele.Frequency < variantFreqFilter)
    {
        allele.AddFilter(FilterType.LowVariantFrequency);
    }

    //can only happen for insertions and MNVs
    if (hasStitchedSource && allele.AlternateAllele.Contains("N"))
    {
        allele.AddFilter(FilterType.StrandBias);
    }
}
/// <summary>
/// Documents how the indel-repeat (R8) filter behaves for an insertion at various positions of a
/// 75-base homopolymer reference, including at and beyond the chromosome end.
/// </summary>
public void IndelRepeat_ChromosomeEdgeCases()
{
    var chrReference = new ChrReference() { Sequence = new string('A', 75) };
    var chromosomeLength = chrReference.Sequence.Length;

    // Wherever the variant is in the reference, as long as it's within it, R8 filter should be ok.
    // See exception below.
    for (int position = 0; position < chromosomeLength - 1; position++)
    {
        var insertion = TestHelper.CreatePassingVariant(false);
        insertion.ReferencePosition = position;
        insertion.Type = AlleleCategory.Insertion;
        insertion.ReferenceAllele = "A";
        insertion.AlternateAllele = "AA";

        AlleleProcessor.Process(insertion, 0.01f, 0, 0, true, 0, 0, 2, null, 0.6f, chrReference);

        Assert.True(insertion.Filters.Contains(FilterType.IndelRepeatLength));
    }

    // Quirk: A variant at the last or second-to-last position of the chromosome does not get R8 filtered
    // This comes from the legacy code that we implemented _only_ to maintain continuity with Isas, and it appears
    // that this was an intentional behavior (comment in the code is "this handles cases where a deletion is larger
    // than the number of downstream flanking bases").
    // This test is here to show the behavior.

    // Last position of chrom (variant positions are 1-based)
    var insertionAtLastPosition = TestHelper.CreatePassingVariant(false);
    insertionAtLastPosition.ReferencePosition = chromosomeLength;
    insertionAtLastPosition.Type = AlleleCategory.Insertion;
    insertionAtLastPosition.ReferenceAllele = "A";
    insertionAtLastPosition.AlternateAllele = "AA";

    AlleleProcessor.Process(insertionAtLastPosition, 0.01f, 0, 0, true, 0, 0, 2, null, 0.6f, chrReference);

    Assert.False(insertionAtLastPosition.Filters.Contains(FilterType.IndelRepeatLength));

    // Second to last position of chrom (variant positions are 1-based)
    var insertionAtSecondToLastPosition = TestHelper.CreatePassingVariant(false);
    insertionAtSecondToLastPosition.ReferencePosition = chromosomeLength - 1;
    insertionAtSecondToLastPosition.Type = AlleleCategory.Insertion;
    insertionAtSecondToLastPosition.ReferenceAllele = "A";
    insertionAtSecondToLastPosition.AlternateAllele = "AA";

    AlleleProcessor.Process(insertionAtSecondToLastPosition, 0.01f, 0, 0, true, 0, 0, 2, null, 0.6f, chrReference);

    Assert.False(insertionAtSecondToLastPosition.Filters.Contains(FilterType.IndelRepeatLength));

    // Variant decidedly outside of chromosome - throws exception because trying to substring at nonsensical positions of chromosome.
    // This is a non-sensical scenario, just demonstrating that it throws exception.
    // Note that if the variant was just one base outside of the chromosome, it wouldn't throw this exception
    // (returns false -- again, not a real scenario, just documenting it)... .NET behavior for substring:
    // returns empty string "if startIndex is equal to the length of this instance and length is zero."
    var insertionOutsideChromosome = TestHelper.CreatePassingVariant(false);
    insertionOutsideChromosome.ReferencePosition = chromosomeLength + 2;
    insertionOutsideChromosome.Type = AlleleCategory.Insertion;
    insertionOutsideChromosome.ReferenceAllele = "A";
    insertionOutsideChromosome.AlternateAllele = "AA";

    Assert.Throws<ArgumentOutOfRangeException>(
        () => AlleleProcessor.Process(insertionOutsideChromosome, 0.01f, 0, 0, true, 0, 0, 2, null, 0.6f, chrReference));
}
/// <summary>
/// Handles one BAM work request for the given chromosome reference.
/// Abstract: the actual work is defined by the derived class.
/// </summary>
/// <param name="workRequest">The work request to handle.</param>
/// <param name="chrReference">Reference for the chromosome the request is handled against.</param>
protected abstract void Process(BamWorkRequest workRequest, ChrReference chrReference);
/// <summary>
/// Returns the injected mock variant caller when one is set; otherwise defers to the base factory
/// with all arguments passed through unchanged.
/// </summary>
protected override IAlleleCaller CreateVariantCaller(ChrReference chrReference, ChrIntervalSet intervalSet, IAlignmentSource alignmentSource, HashSet<Tuple<string, int, string, string>> forcedGtAlleles = null)
{
    if (MockVariantCaller == null)
    {
        return base.CreateVariantCaller(chrReference, intervalSet, alignmentSource, forcedGtAlleles);
    }

    return MockVariantCaller.Object;
}
/// <summary>
/// Returns the injected mock alignment source when one is set; otherwise defers to the base factory.
/// </summary>
protected override IAlignmentSource CreateAlignmentSource(ChrReference chrReference, string bamFilePath, bool expectStitchedDirections, List<string> chrsToProcess)
{
    if (MockAlignmentSource == null)
    {
        return base.CreateAlignmentSource(chrReference, bamFilePath, expectStitchedDirections, chrsToProcess);
    }

    return MockAlignmentSource.Object;
}
/// <summary>
/// Creates a mock extractor with an empty read list for the named chromosome.
/// </summary>
/// <param name="chrInfo">Chromosome reference; only its <c>Name</c> is stored.</param>
/// <param name="SourceIsStitched">Stored as the extractor's stitched-source flag.</param>
public MockAlignmentExtractor(ChrReference chrInfo, bool SourceIsStitched = false)
{
    _chrName = chrInfo.Name;
    _sourceIsStitched = SourceIsStitched;

    // Start with no reads.
    _reads = new List<Read>();
}
/// <summary>
/// Creates a somatic variant caller from its collaborators.
/// </summary>
/// <exception cref="ArgumentException">
/// Thrown when the alignment source's chromosome filter differs from the chromosome reference name.
/// </exception>
public SomaticVariantCaller(IAlignmentSource alignmentSource, ICandidateVariantFinder variantFinder, IAlleleCaller alleleCaller, IVcfWriter vcfWriter, IStateManager stateManager, ChrReference chrReference, IRegionPadder regionMapper, IStrandBiasFileWriter biasFileWriter)
{
    // Input side
    _alignmentSource = alignmentSource;
    _chrReference = chrReference;

    // Processing
    _variantFinder = variantFinder;
    _alleleCaller = alleleCaller;
    _stateManager = stateManager;
    _regionMapper = regionMapper;

    // Output side
    _vcfWriter = vcfWriter;
    _biasFileWriter = biasFileWriter;

    // The alignment source must be filtered to the chromosome this caller works on.
    if (_alignmentSource.ChromosomeFilter != _chrReference.Name)
    {
        var message = string.Format(
            "Chromosome filter in alignment source '{0}' does not match to current chromosome '{1}'",
            _alignmentSource.ChromosomeFilter,
            _chrReference.Name);

        throw new ArgumentException(message);
    }
}
/// <summary>
/// Assembles a <c>SomaticVariantCaller</c> for tests, reading from a mock alignment extractor and
/// configured from the supplied application options.
/// </summary>
/// <remarks>
/// No mate finder and no region padder are used. <paramref name="intervalFilePath"/> is not read.
/// The method also asserts that a freshly created state manager reports a zero allele count.
/// </remarks>
public static ISomaticVariantCaller CreateMockVariantCaller(VcfFileWriter vcfWriter, ApplicationOptions options, ChrReference chrRef, MockAlignmentExtractor mockAlignmentExtractor, IStrandBiasFileWriter biasFileWriter = null, string intervalFilePath = null)
{
    var sourceConfig = new AlignmentSourceConfig
    {
        MinimumMapQuality = options.MinimumMapQuality,
        OnlyUseProperPairs = options.OnlyUseProperPairs,
    };

    //var mateFinder = options.StitchReads ? new AlignmentMateFinder() : null;
    AlignmentMateFinder mateFinder = null;
    var alignmentSource = new AlignmentSource(mockAlignmentExtractor, mateFinder, sourceConfig);

    var variantFinder = new CandidateVariantFinder(
        options.MinimumBaseCallQuality,
        options.MaxSizeMNV,
        options.MaxGapBetweenMNV,
        options.CallMNVs);

    var coverageCalculator = new CoverageCalculator();

    var callerConfig = new VariantCallerConfig
    {
        IncludeReferenceCalls = options.OutputgVCFFiles,
        MinVariantQscore = options.MinimumVariantQScore,
        MaxVariantQscore = options.MaximumVariantQScore,
        // Only filter on Q-score when the filter threshold is stricter than the emit threshold.
        VariantQscoreFilterThreshold = options.FilteredVariantQScore > options.MinimumVariantQScore ? options.FilteredVariantQScore : (int?)null,
        MinCoverage = options.MinimumDepth,
        MinFrequency = options.MinimumFrequency,
        // -1 means "no applied noise level": fall back to the minimum base call quality.
        EstimatedBaseCallQuality = options.AppliedNoiseLevel == -1 ? options.MinimumBaseCallQuality : options.AppliedNoiseLevel,
        StrandBiasModel = options.StrandBiasModel,
        StrandBiasFilterThreshold = options.StrandBiasAcceptanceCriteria,
        FilterSingleStrandVariants = options.FilterOutVariantsPresentOnlyOneStrand,
        ChrReference = chrRef
    };

    VariantCollapser collapser = null;
    if (options.Collapse)
    {
        collapser = new VariantCollapser(null, coverageCalculator);
    }

    var alleleCaller = new AlleleCaller(
        callerConfig,
        coverageCalculator: coverageCalculator,
        variantCollapser: collapser);

    var stateManager = new RegionStateManager(
        expectStitchedReads: mockAlignmentExtractor.SourceIsStitched,
        trackOpenEnded: options.Collapse,
        trackReadSummaries: options.CoverageMethod == CoverageMethod.Approximate);

    //statmanager is an allele source
    Assert.Equal(0, stateManager.GetAlleleCount(1, AlleleType.A, DirectionType.Forward));

    return new SomaticVariantCaller(
        alignmentSource,
        variantFinder,
        alleleCaller,
        vcfWriter,
        stateManager,
        chrRef,
        null,
        biasFileWriter);
}
/// <summary>
/// Updates the allele's no-call fraction and then re-applies all filters to it.
/// </summary>
/// <remarks>
/// <paramref name="minFrequency"/> is accepted but not read by this method. The remaining
/// arguments are forwarded to <c>ApplyFilters</c>.
/// </remarks>
public static void Process(CalledAllele allele, float minFrequency, int? lowDepthFilter, int? filterVariantQscore, bool filterSingleStrandVariants, float? variantFreqFilter, float? lowGqFilter, int? indelRepeatFilter, RMxNFilterSettings rMxNFilterSettings, float? noCallFilter, float? ampliconBiasFilter, ChrReference chrReference, bool isStitchedSource = false)
{
    // The no-call fraction must be current before the no-call filter is evaluated.
    allele.SetFractionNoCalls();

    ApplyFilters(
        allele,
        lowDepthFilter,
        filterVariantQscore,
        filterSingleStrandVariants,
        variantFreqFilter,
        lowGqFilter,
        indelRepeatFilter,
        rMxNFilterSettings,
        noCallFilter,
        ampliconBiasFilter,
        isStitchedSource,
        chrReference);
}
/// <summary>
/// Creates the region padder, which is only needed when intervals are supplied and
/// reference calls are being written.
/// </summary>
/// <param name="chrReference">Chromosome reference handed to the padder.</param>
/// <param name="intervalSet">Intervals to pad; when null no padder is created.</param>
/// <param name="includeReference">
/// Not read here; the decision is driven by <c>_options.OutputgVCFFiles</c>.
/// NOTE(review): confirm callers always pass the same value as that option.
/// </param>
/// <returns>A new <c>RegionPadder</c>, or null when none is required.</returns>
protected virtual IRegionPadder CreateRegionPadder(ChrReference chrReference, ChrIntervalSet intervalSet, bool includeReference)
{
    if (intervalSet == null)
    {
        return null;
    }

    if (!_options.OutputgVCFFiles)
    {
        return null;
    }

    return new RegionPadder(chrReference, intervalSet);
}
/// <summary>
/// Loads a single read and checks the resulting loading and candidate direction against
/// the expectations carried by <paramref name="scenario"/>.
/// </summary>
/// <param name="read">The read to load.</param>
/// <param name="options">Options forwarded to <c>LoadReads</c>.</param>
/// <param name="chrInfo">Chromosome reference forwarded to <c>LoadReads</c>.</param>
/// <param name="isVariant">
/// When true the scenario's variant expectations are used; otherwise the reference loading
/// and a candidate direction of "0" are expected.
/// </param>
/// <param name="scenario">Source of the expected loading and direction values.</param>
/// <returns>
/// A two-element array (loading result, direction result), or a one-element array holding a
/// failure message when loading or either check yields null.
/// </returns>
public static string[] CheckReadLoading(BamAlignment read, PiscesApplicationOptions options, ChrReference chrInfo, bool isVariant, StitchingScenario scenario)
{
    string expectedLoading = scenario.RefLoading;
    string expectedDirection = "0";

    if (isVariant)
    {
        expectedLoading = scenario.VarLoading;
        expectedDirection = scenario.CandidateDirection;
    }

    var singleRead = new List<BamAlignment> { read };
    var loaded = LoadReads(singleRead, options, chrInfo, isVariant, expectedLoading, expectedDirection);

    if (loaded == null)
    {
        return new[] { "total fail to parse variant reads" };
    }

    // Both checks are evaluated up front, before either result is inspected.
    var loadingOutcome = CheckLoading(scenario, 1, loaded.Item1, isVariant);
    var directionOutcome = CheckCandidateDirection(isVariant, loaded.Item2, expectedDirection);

    if (loadingOutcome == null)
    {
        return new[] { "total fail to check loading" };
    }

    if (directionOutcome == null)
    {
        return new[] { "total fail to check direction" };
    }

    return new string[] { loadingOutcome, directionOutcome };
}
/// <summary>
/// Sole job is to pad empty reference calls when using intervals. Assumes the batch has already
/// included reference calls (either empty or not) for cleared regions.
/// </summary>
/// <param name="chrReference">Chromosome reference stored for later use by the padder.</param>
/// <param name="intervals">Interval set exposed through <c>IntervalSet</c>.</param>
public RegionPadder(ChrReference chrReference, ChrIntervalSet intervals)
{
    this._chrReference = chrReference;
    this.IntervalSet = intervals;
}
/// <summary>
/// Returns the injected mock alignment source when one has been set; otherwise defers to the
/// base implementation.
/// </summary>
/// <param name="chrReference">Forwarded to the base implementation when no mock is set.</param>
/// <param name="bamFilePath">Forwarded to the base implementation when no mock is set.</param>
protected override IAlignmentSource CreateAlignmentSource(ChrReference chrReference, string bamFilePath)
{
    if (MockAlignmentSource != null)
    {
        return MockAlignmentSource.Object;
    }

    return base.CreateAlignmentSource(chrReference, bamFilePath);
}
/// <summary>
/// Wires together the action-block factory provider, the aggregate region processor, the batch
/// block factory and the bin-evidence factory, and returns a <c>ClassificationBlockProvider</c>
/// built from them for the given chromosome.
/// </summary>
/// <param name="refIdMapping">Reference-id to name mapping passed to the aggregate processor.</param>
/// <param name="chrom">Chromosome name shared by all constructed components.</param>
/// <param name="writerSource">Writer source passed to the action-block factory provider.</param>
/// <param name="progressTracker">Shared progress dictionary passed to each component that accepts one.</param>
/// <param name="categoryLookup">Shared per-classification counts.</param>
/// <param name="masterIndelLookup">Shared indel evidence lookup passed to the aggregate processor.</param>
/// <param name="masterOutcomesLookup">Shared outcomes lookup passed to the aggregate processor.</param>
/// <param name="masterFinalIndels">Shared final-indel lookup passed to the aggregate processor.</param>
/// <param name="chrReference">Chromosome reference passed to the aggregate processor.</param>
public IClassificationBlockProvider GetBlockProvider(Dictionary<int, string> refIdMapping, string chrom, IWriterSource writerSource, ConcurrentDictionary<string, int> progressTracker, ConcurrentDictionary<PairClassification, int> categoryLookup, ConcurrentDictionary<string, IndelEvidence> masterIndelLookup, ConcurrentDictionary<HashableIndel, int[]> masterOutcomesLookup, ConcurrentDictionary<HashableIndel, int> masterFinalIndels, ChrReference chrReference)
{
    var actionBlocks = new PairResultActionBlockFactoryProvider(
        writerSource,
        _geminiOptions.Debug,
        _geminiOptions.LightDebug,
        chrom,
        _geminiSampleOptions.RefId.Value,
        _maxDegreeOfParallelism,
        _stitcherOptions.FilterForProperPairs,
        _geminiOptions.MessySiteWidth,
        progressTracker,
        categoryLookup);

    var regionProcessor = new AggregateRegionProcessor(
        chrReference,
        refIdMapping,
        _bamRealignmentFactory,
        _geminiOptions,
        _geminiFactory,
        chrom,
        _dataSourceFactory,
        _realignmentOptions,
        masterIndelLookup,
        masterOutcomesLookup,
        masterFinalIndels,
        _realignmentOptions.CategoriesForRealignment,
        progressTracker);

    // Batch size is one fifth of the configured read cache size (integer division).
    var batchBlocks = new PairResultBatchBlockFactory(_geminiOptions.ReadCacheSize / 5);

    var binEvidence = new BinEvidenceFactory(_geminiOptions, _geminiSampleOptions);

    return new ClassificationBlockProvider(
        _geminiOptions,
        chrom,
        progressTracker,
        categoryLookup,
        actionBlocks,
        regionProcessor,
        _geminiOptions.LightDebug,
        batchBlocks,
        binEvidence,
        _realignmentOptions.CategoriesForRealignment,
        _maxDegreeOfParallelism);
}
/// <summary>
/// Returns the injected mock region mapper when one has been set; otherwise defers to the
/// base implementation.
/// </summary>
/// <param name="chrReference">Forwarded to the base implementation when no mock is set.</param>
/// <param name="intervalSet">Forwarded to the base implementation when no mock is set.</param>
/// <param name="includeReferences">Forwarded to the base implementation when no mock is set.</param>
protected override IRegionPadder CreateRegionPadder(ChrReference chrReference, ChrIntervalSet intervalSet, bool includeReferences)
{
    if (MockRegionMapper != null)
    {
        return MockRegionMapper.Object;
    }

    return base.CreateRegionPadder(chrReference, intervalSet, includeReferences);
}
/// <summary>
/// Assembles a <c>SomaticVariantCaller</c> over a mock alignment extractor, optionally with
/// read stitching and mate finding when <c>options.StitchReads</c> is set.
/// </summary>
/// <param name="vcfWriter">Writer passed straight through to the caller.</param>
/// <param name="options">Options used to configure the source, stitcher, candidate finder and allele caller.</param>
/// <param name="chrRef">Chromosome reference given to the region padder and the caller.</param>
/// <param name="mae">Mock extractor wrapped by the alignment source.</param>
/// <param name="biasFileWriter">Optional strand-bias writer passed through to the caller.</param>
/// <param name="intervalFilePath">Not read by this method.</param>
private ISomaticVariantCaller CreateMockVariantCaller(VcfFileWriter vcfWriter, ApplicationOptions options, ChrReference chrRef, MockAlignmentExtractor mae, IStrandBiasFileWriter biasFileWriter = null, string intervalFilePath = null)
{
    var sourceConfig = new AlignmentSourceConfig
    {
        MinimumMapQuality = options.MinimumMapQuality,
        OnlyUseProperPairs = options.OnlyUseProperPairs,
    };

    // A stitcher exists only when stitching is requested; the XC variant is chosen by option.
    IAlignmentStitcher stitcher = null;
    if (options.StitchReads)
    {
        stitcher = options.UseXCStitcher
            ? (IAlignmentStitcher)new XCStitcher(options.MinimumBaseCallQuality)
            : new BasicStitcher(options.MinimumBaseCallQuality);
    }

    var mateFinder = options.StitchReads ? new AlignmentMateFinder(MAX_FRAGMENT_SIZE) : null;

    // The padder is built without an interval set.
    var padder = new RegionPadder(chrRef, null);
    var source = new AlignmentSource(mae, mateFinder, stitcher, sourceConfig);

    var candidateFinder = new CandidateVariantFinder(
        options.MinimumBaseCallQuality,
        options.MaxSizeMNV,
        options.MaxGapBetweenMNV,
        options.CallMNVs);

    var callerConfig = new VariantCallerConfig
    {
        IncludeReferenceCalls = options.OutputgVCFFiles,
        MinVariantQscore = options.MinimumVariantQScore,
        MaxVariantQscore = options.MaximumVariantQScore,
        // The filter threshold only applies when it is stricter than the minimum q-score.
        VariantQscoreFilterThreshold = options.FilteredVariantQScore > options.MinimumVariantQScore
            ? options.FilteredVariantQScore
            : (int?)null,
        MinCoverage = options.MinimumCoverage,
        MinFrequency = options.MinimumFrequency,
        // -1 means no explicit noise level was applied; fall back to the minimum base call quality.
        EstimatedBaseCallQuality = options.AppliedNoiseLevel == -1
            ? options.MinimumBaseCallQuality
            : options.AppliedNoiseLevel,
        StrandBiasModel = options.StrandBiasModel,
        StrandBiasFilterThreshold = options.StrandBiasAcceptanceCriteria,
        FilterSingleStrandVariants = options.FilterOutVariantsPresentOnlyOneStrand,
        GenotypeModel = options.GTModel
    };
    var caller = new AlleleCaller(callerConfig);

    var regionState = new RegionStateManager();

    return new SomaticVariantCaller(
        source,
        candidateFinder,
        caller,
        vcfWriter,
        regionState,
        chrRef,
        padder,
        biasFileWriter);
}
/// <summary>
/// Creates a somatic variant caller from its collaborators. The constructor only stores the
/// supplied dependencies; no work is performed here.
/// </summary>
/// <param name="alignmentSource">Source of alignments.</param>
/// <param name="variantFinder">Candidate variant finder.</param>
/// <param name="alleleCaller">Allele caller.</param>
/// <param name="vcfWriter">Writer for called alleles.</param>
/// <param name="stateManager">State manager.</param>
/// <param name="chrReference">Chromosome reference.</param>
/// <param name="regionMapper">Region mapper.</param>
/// <param name="biasFileWriter">Strand-bias file writer.</param>
/// <param name="intervalSet">Optional interval set; defaults to null.</param>
public SomaticVariantCaller(IAlignmentSource alignmentSource, ICandidateVariantFinder variantFinder, IAlleleCaller alleleCaller, IVcfWriter<CalledAllele> vcfWriter, IStateManager stateManager, ChrReference chrReference, IRegionMapper regionMapper, IStrandBiasFileWriter biasFileWriter, ChrIntervalSet intervalSet = null)
{
    // Read-processing pipeline.
    _alignmentSource = alignmentSource;
    _variantFinder = variantFinder;
    _alleleCaller = alleleCaller;

    // Output and bookkeeping.
    _vcfWriter = vcfWriter;
    _stateManager = stateManager;

    // Reference and region context.
    _chrReference = chrReference;
    _regionMapper = regionMapper;
    _biasFileWriter = biasFileWriter;
    _intervalSet = intervalSet;
}
/// <summary>
/// Intentional no-op override: the work request and chromosome reference are ignored.
/// </summary>
/// <param name="workRequest">Unused.</param>
/// <param name="chrReference">Unused.</param>
protected override void Process(BamWorkRequest workRequest, ChrReference chrReference)
{
    // Deliberately left empty.
}
/// <summary>
/// Creates a small variant caller from its collaborators, then derives the forced-allele
/// position lookup and decides whether bias files will be written.
/// </summary>
/// <param name="alignmentSource">Source of alignments.</param>
/// <param name="variantFinder">Candidate variant finder.</param>
/// <param name="alleleCaller">Allele caller.</param>
/// <param name="vcfWriter">Writer for called alleles.</param>
/// <param name="stateManager">State manager.</param>
/// <param name="chrReference">Chromosome reference.</param>
/// <param name="regionMapper">Region mapper.</param>
/// <param name="strandBiasFileWriter">Strand-bias file writer; may be null.</param>
/// <param name="ampBiasFileWriter">Amplicon-bias file writer; may be null.</param>
/// <param name="intervalSet">Optional interval set; defaults to null.</param>
/// <param name="forcedGTAlleles">Optional set of forced alleles; defaults to null.</param>
public SmallVariantCaller(IAlignmentSource alignmentSource, ICandidateVariantFinder variantFinder, IAlleleCaller alleleCaller, IVcfWriter<CalledAllele> vcfWriter, IStateManager stateManager, ChrReference chrReference, IRegionMapper regionMapper, IStrandBiasFileWriter strandBiasFileWriter, IAmpliconBiasFileWriter ampBiasFileWriter, ChrIntervalSet intervalSet = null, HashSet<Tuple<string, int, string, string>> forcedGTAlleles = null)
{
    // Read-processing pipeline.
    _alignmentSource = alignmentSource;
    _variantFinder = variantFinder;
    _alleleCaller = alleleCaller;

    // Output and bookkeeping.
    _vcfWriter = vcfWriter;
    _stateManager = stateManager;

    // Reference and region context.
    _chrReference = chrReference;
    _regionMapper = regionMapper;

    // Bias writers.
    _strandBiasFileWriter = strandBiasFileWriter;
    _ampliconBiasFileWriter = ampBiasFileWriter;

    _intervalSet = intervalSet;

    // Forced alleles are stored first, then turned into the not-yet-processed lookup.
    _forcedGtAlleles = forcedGTAlleles;
    _unProcessedForcedAllelesByPos = CreateForcedAllelePos(_forcedGtAlleles);

    // Bias files are written only when both writers were supplied.
    _writeBiasFiles = strandBiasFileWriter != null && ampBiasFileWriter != null;
}
/// <summary>
/// Exercises <c>ConvertToCallableNeighborhoods</c> in three situations: no genome, a genome
/// that lacks the neighborhood's chromosome, and a genome that contains it.
/// </summary>
public void CreateCallableNbhdsTests()
{
    var vcfPath = Path.Combine(TestPaths.LocalTestDataDirectory, "VeryMutated.genome.vcf");
    var alleleSource = new AlleleReader(vcfPath);

    var singleNeighborhood = new VcfNeighborhood(0, "chr1", new VariantSite(123), new VariantSite(125));
    var inputNeighborhoods = new List<VcfNeighborhood>() { singleNeighborhood };

    // Case 1: no genome is supplied.
    var builder = new NeighborhoodBuilder(
        new PhasableVariantCriteria(),
        new VariantCallingParameters(),
        alleleSource,
        null,
        20);
    var callable = builder.ConvertToCallableNeighborhoods(inputNeighborhoods);

    Assert.Equal(1, callable.Count());
    Assert.Equal(2, callable.First().VcfVariantSites.Count());
    Assert.Equal("chr1", callable[0].ReferenceName);
    Assert.Equal("RRR", callable[0].NbhdReferenceSequenceSubstring);

    // Case 2: a genome is supplied, but it is loaded with a chromosome name that does not
    // match the neighborhood.
    var genomeDirectory = Path.Combine(TestPaths.SharedGenomesDirectory, "Bacillus_cereus", "Sequence", "WholeGenomeFasta");
    var chromosomeName = "chr_wrong";
    var genome = new Genome(genomeDirectory, new List<string>() { chromosomeName });
    ChrReference chrReference = genome.GetChrReference(chromosomeName);

    builder = new NeighborhoodBuilder(
        new PhasableVariantCriteria(),
        new VariantCallingParameters(),
        alleleSource,
        genome,
        20);
    callable = builder.ConvertToCallableNeighborhoods(inputNeighborhoods);

    Assert.Equal(1, callable.Count());
    Assert.Equal(2, callable.First().VcfVariantSites.Count());
    Assert.Equal("chr1", callable[0].ReferenceName);
    Assert.Equal("RRR", callable[0].NbhdReferenceSequenceSubstring);

    // Case 3: a genome is supplied and it contains the neighborhood's chromosome.
    chromosomeName = "chr";
    genome = new Genome(genomeDirectory, new List<string>() { chromosomeName });
    chrReference = genome.GetChrReference(chromosomeName);

    builder = new NeighborhoodBuilder(
        new PhasableVariantCriteria(),
        new VariantCallingParameters(),
        alleleSource,
        genome,
        20);

    singleNeighborhood = new VcfNeighborhood(0, "chr", new VariantSite(123), new VariantSite(125));
    inputNeighborhoods = new List<VcfNeighborhood>() { singleNeighborhood };
    callable = builder.ConvertToCallableNeighborhoods(inputNeighborhoods);

    Assert.Equal(1, callable.Count());
    Assert.Equal(2, callable.First().VcfVariantSites.Count());
    Assert.Equal("chr", callable[0].ReferenceName);
    Assert.Equal("TAT", callable[0].NbhdReferenceSequenceSubstring);
}