/// <summary>
///     Scores strand bias from the overall, forward-strand and reverse-strand statistics.
///     Background (see the section on Strand Bias):
///     http://www.broadinstitute.org/gsa/wiki/index.php/Understanding_the_Unified_Genotyper%27s_VCF_files
/// </summary>
/// <param name="overallStats">stats over both strands; its ChanceVarFreqGreaterThanZero is the denominator</param>
/// <param name="fwdStats">stats for the forward strand</param>
/// <param name="rvsStats">stats for the reverse strand</param>
/// <returns>
///     A two-element array: [0] the larger of the forward and reverse bias values,
///     [1] that same value passed through MathOperations.PtoGATKBiasScale.
/// </returns>
// The GATK source this mirrors (there it is done in log10 space, hence sums instead of products):
//
//   double forwardLod = forwardLog10PofF + reverseLog10PofNull - overallLog10PofF;
//   double reverseLod = reverseLog10PofF + forwardLog10PofNull - overallLog10PofF;
//
//   // strand score is max bias between forward and reverse strands
//   double strandScore = Math.max(forwardLod, reverseLod);
//
//   // rescale by a factor of 10
//   strandScore *= 10.0;
//
//   attributes.put("SB", strandScore);
private static double[] AssignBiasScore(StrandBiasStats overallStats, StrandBiasStats fwdStats, StrandBiasStats rvsStats)
{
    var overallChance = overallStats.ChanceVarFreqGreaterThanZero;

    // With a zero denominator both ratios are meaningless, so both are pinned to 1.
    var denominatorUsable = overallChance != 0;

    var forwardBias = denominatorUsable
        ? (fwdStats.ChanceVarFreqGreaterThanZero * rvsStats.ChanceFalsePos) / overallChance
        : 1;

    var reverseBias = denominatorUsable
        ? (rvsStats.ChanceVarFreqGreaterThanZero * fwdStats.ChanceFalsePos) / overallChance
        : 1;

    // The reported bias is the worse (larger) of the two strand-specific values.
    var worstBias = Math.Max(forwardBias, reverseBias);

    return new[] { worstBias, MathOperations.PtoGATKBiasScale(worstBias) };
}
/// <summary>
///     Computes an integer genotype quality score for a called allele, clamped to
///     [minGTQScore, maxGTQScore].
/// </summary>
/// <param name="allele">the called allele being scored</param>
/// <param name="targetLimitOfDetectionVF">the variant frequency used to size the expected number of non-allele observations</param>
/// <param name="minGTQScore">lower clamp; also returned directly for no-calls, zero coverage, and homozygous calls with too many non-allele observations</param>
/// <param name="maxGTQScore">upper clamp</param>
/// <returns>the rounded, clamped score</returns>
public static int Compute(CalledAllele allele, float targetLimitOfDetectionVF, int minGTQScore, int maxGTQScore)
{
    double unclampedQ = allele.VariantQscore;

    // Nothing to score without coverage or without a call.
    if (allele.TotalCoverage == 0 || allele.IsNocall)
    {
        return minGTQScore;
    }

    var genotype = allele.Genotype;
    bool isHomozygousCall = genotype == Genotype.HomozygousRef || genotype == Genotype.HomozygousAlt;

    if (isHomozygousCall)
    {
        // A homozygous somatic genotype is a strong statement. It requires both that
        //  (1) the allele itself is really there (the VariantQscore), and
        //  (2) we are unlikely to have missed any other allele. That second part is the chance
        //      of a false negative, given VF = limit of detection and the coverage we have.
        var pAlleleCallWrong = MathOperations.QtoP(allele.VariantQscore);

        // Deliberately typed as float: this keeps float -> double conversions out of the
        // downstream arguments so Windows and Linux builds cannot drift apart.
        float observedOtherF = (1f - allele.Frequency) * allele.TotalCoverage;
        float expectedOtherF = targetLimitOfDetectionVF * allele.TotalCoverage;

        // Bail out with the minimum score when either
        //  (a) the depth is too low to ever observe another allele at the limit of detection
        //      (e.g. at depth 10 a 5% variant would never be seen), or
        //  (b) we saw more non-allele signal than the limit of detection
        //      (e.g. 6% other when 5% is the minimum safe calling frequency).
        if (observedOtherF >= expectedOtherF)
        {
            return minGTQScore;
        }

        // Note: a generic poissonDist.CumulativeDistribution(...) behaved badly for values below
        // the mean, which is why Poisson.Cdf is used here.
        var pMissedOtherAllele = Poisson.Cdf(observedOtherF, expectedOtherF);

        unclampedQ = MathOperations.PtoQ(pAlleleCallWrong + pMissedOtherAllele);
    }

    // Clamp from above first, then from below, and round to the nearest integer.
    var clampedQ = Math.Max(Math.Min(maxGTQScore, unclampedQ), minGTQScore);
    return (int)Math.Round(clampedQ);
}
/// <summary>
///     This method looks for bias in the variant support / total coverage ratios, by amplicon.
///     This method is agnostic about where these support and coverage calculations come from, so it is up to the user
///     to make sure the counts are appropriate for the variant in question.
///     Note that for SNPs this is fairly straightforward, but for indels and MNVs it can become terribly difficult.
///     This method should be used with appropriate caution.
/// </summary>
/// <param name="supportByAmplicon">the support counts, for each named amplicon</param>
/// <param name="coverageByAmplicon">the coverage counts, for each named amplicon</param>
/// <param name="acceptanceCriteria">the minimum probability we accept for the variant being real, given the model</param>
/// <param name="maxQScore">the max cap for a qscore. This parameter safeguards against reporting insanely high confidence, given the limitations of a simple model that only addresses sampling error</param>
/// <returns>
///     null when there is no amplicon information to work with (no support names, no coverage names,
///     or fewer than two coverage entries); otherwise the per-amplicon results, with BiasDetected set
///     when any amplicon fell below <paramref name="acceptanceCriteria"/>.
/// </returns>
public static BiasResultsAcrossAmplicons CalculateAmpliconBias(AmpliconCounts supportByAmplicon, AmpliconCounts coverageByAmplicon, float acceptanceCriteria, int maxQScore)
{
    //if we have no amplicon information, don't worry about it.
    var supportNames = supportByAmplicon.AmpliconNames;
    if ((supportNames == null) || (supportNames.Length == 0) || (supportNames[0] == null))
    {
        return null;
    }

    //If we only have coverage on one amplicon, don't worry about it. There is no "bias" to detect.
    //We might later on, add a check to require extra evidence for variants only covered by one amplicon. TBD
    //(The null test mirrors the guard on the support names, so a missing name array is
    // treated as "no information" rather than dereferenced.)
    var coverageNames = coverageByAmplicon.AmpliconNames;
    if ((coverageNames == null) || (coverageNames.Length < 2))
    {
        return null;
    }

    var resultDict = new BiasResultsAcrossAmplicons()
    {
        ResultsByAmpliconName = new Dictionary<string, AmpliconBiasResult>()
    };

    // Pass 1: record support / coverage / frequency per amplicon and find the highest frequency.
    var maxFreq = 0.0;

    for (int i = 0; i < coverageNames.Length; i++)
    {
        var name = coverageNames[i];

        //a null name marks the end of the populated entries
        if (name == null)
        {
            break;
        }

        double support = supportByAmplicon.GetCountsForAmplicon(name);
        double coverage = coverageByAmplicon.CountsForAmplicon[i];
        double freq = (coverage > 0) ? support / coverage : 0;

        // ">=" means that on ties the later amplicon is the one reported.
        if (freq >= maxFreq)
        {
            resultDict.AmpliconWithCandidateArtifact = name;
            maxFreq = freq;
        }

        resultDict.ResultsByAmpliconName.Add(name, new AmpliconBiasResult()
        {
            Frequency = freq,
            Name = name,
            ObservedSupport = support,
            Coverage = coverage
        });
    }

    // Pass 2: judge every amplicon against the highest frequency seen on any amplicon.
    bool shouldFailVariant = false;

    foreach (var result in resultDict.ResultsByAmpliconName.Values)
    {
        double coverage = result.Coverage;
        double support = result.ObservedSupport;
        double freq = result.Frequency;

        int qScore;
        var pChanceItsReal = 1.0;
        double expectedNumObservationsOfVariant = maxFreq * coverage;

        if (expectedNumObservationsOfVariant < Constants.MinNumObservations)
        {
            //we'd never see it anyway. Seems fine.
            qScore = maxQScore;
        }
        else if ((expectedNumObservationsOfVariant <= support) || (freq > Constants.FreePassObservationFreq))
        {
            //we saw this variant quite a lot for this amplicon.
            //it certainly seems to be in this amplicon!
            qScore = maxQScore;
        }
        else
        {
            //we didn't see this variant much for this amplicon. Hm.... Perhaps it's not real...?

            //What is the chance this variant exists but just happened not to show up much on this amplicon's reads?
            //Let's look at the chance that we observed it at "support" or less, given the estimated frequency.
            pChanceItsReal = Math.Max(0.0, Poisson.Cdf(support, expectedNumObservationsOfVariant));

            //biasProb = 1.0 - pChanceItsReal;
            var q = MathOperations.PtoQ(1.0 - pChanceItsReal);

            //cap before the integer cast, so the reported score never exceeds maxQScore
            //and an out-of-range value is never cast to int.
            qScore = (int)Math.Min(q, maxQScore);
        }

        //if acceptanceCriteria = Q20, that's 1/100.
        //so, if we even have 1/100 chance of this happening, let's allow it.
        // I.e., a true variant would (generally) 50% of the time show up at its expected frequency.
        // Sometimes, it would show up less. At (1/100)% of the time, it only shows up at Z frequency.
        // So, if the observation is less likely than (1/100) for a real variant, we fail it.
        bool biasDetected = pChanceItsReal < acceptanceCriteria;
        if (biasDetected)
        {
            shouldFailVariant = true;
        }

        result.ChanceItsReal = pChanceItsReal;
        result.ConfidenceQScore = qScore;
        result.BiasDetected = biasDetected;
        result.ExpectedSupport = expectedNumObservationsOfVariant;
    }

    // One failing amplicon is enough to flag the whole variant.
    resultDict.BiasDetected = shouldFailVariant;

    return resultDict;
}