/// <summary>
/// Combines the per-strand statistics into a single strand-bias probability and its
/// GATK-scaled equivalent. Modeled on the "SB" annotation described in the Strand Bias section of
/// http://www.broadinstitute.org/gsa/wiki/index.php/Understanding_the_Unified_Genotyper%27s_VCF_files
/// </summary>
/// <param name="overallStats">Statistics over both strands; supplies the denominator.</param>
/// <param name="fwdStats">Statistics for the forward strand.</param>
/// <param name="rvsStats">Statistics for the reverse strand.</param>
/// <returns>
/// A two-element array: index 0 is the larger of the forward/reverse bias values,
/// index 1 is that value passed through <c>MathOperations.PtoGATKBiasScale</c>.
/// </returns>
// Reference computation from the GATK source (log10 space, hence sums where we multiply/divide):
//   double forwardLod = forwardLog10PofF + reverseLog10PofNull - overallLog10PofF;
//   double reverseLod = reverseLog10PofF + forwardLog10PofNull - overallLog10PofF;
//
//   // strand score is max bias between forward and reverse strands
//   double strandScore = Math.max(forwardLod, reverseLod);
//
//   // rescale by a factor of 10
//   strandScore *= 10.0;
//
//   attributes.put("SB", strandScore);
private static double[] AssignBiasScore(StrandBiasStats overallStats, StrandBiasStats fwdStats, StrandBiasStats rvsStats)
{
    // "Variant on this strand AND nothing but noise on the other strand".
    var forwardNumerator = fwdStats.ChanceVarFreqGreaterThanZero * rvsStats.ChanceFalsePos;
    var reverseNumerator = rvsStats.ChanceVarFreqGreaterThanZero * fwdStats.ChanceFalsePos;
    var denominator = overallStats.ChanceVarFreqGreaterThanZero;

    // A zero denominator would make the ratios meaningless, so both biases are pinned to 1.
    var worstBias = denominator == 0
        ? 1.0
        : Math.Max(forwardNumerator / denominator, reverseNumerator / denominator);

    return new[] { worstBias, MathOperations.PtoGATKBiasScale(worstBias) };
}
/// <summary>
/// Produces a new <c>StrandBiasStats</c> carrying the same support, coverage,
/// probabilities and frequency as <paramref name="originalStats"/>.
/// </summary>
/// <param name="originalStats">The instance to duplicate; may be null.</param>
/// <returns>A separate instance with the same values, or null when the input is null.</returns>
public static StrandBiasStats DeepCopy(StrandBiasStats originalStats)
{
    if (originalStats == null)
    {
        return null;
    }

    // Support and coverage go through the constructor; everything else is copied afterwards.
    var copy = new StrandBiasStats(originalStats.Support, originalStats.Coverage);
    copy.ChanceFalseNeg = originalStats.ChanceFalseNeg;
    copy.ChanceFalsePos = originalStats.ChanceFalsePos;
    copy.ChanceVarFreqGreaterThanZero = originalStats.ChanceVarFreqGreaterThanZero;
    copy.Frequency = originalStats.Frequency;
    return copy;
}
/// <summary>
/// Fills in <c>ChanceVarFreqGreaterThanZero</c>, <c>ChanceFalsePos</c> and <c>ChanceFalseNeg</c>
/// on <paramref name="stats"/> from its support and coverage.
/// </summary>
/// <param name="stats">Statistics object to update in place.</param>
/// <param name="noiseFreq">Expected noise frequency; scales coverage to give the noise-model mean.</param>
/// <param name="minDetectableSNP">Minimum detectable variant frequency.</param>
/// <param name="strandBiasModel">
/// Chooses how the zero-support case is handled. Only <c>Poisson</c> and <c>Extended</c> are
/// handled; for any other value with zero support, <paramref name="stats"/> is left untouched.
/// </param>
/// <remarks>
/// Type 1 error: rejecting the null hypothesis when we should not have (it was noise, but we
/// called a SNP). This is ChanceFalsePos, a.k.a. the p-value.
/// Type 2 error: failing to reject the null hypothesis when we should have (there is a variant,
/// but we did not call it). This is ChanceFalseNeg.
/// </remarks>
public static void PopulateStats(StrandBiasStats stats, double noiseFreq, double minDetectableSNP, StrandBiasModel strandBiasModel)
{
    if (stats.Support != 0)
    {
        // Poisson.Cdf is presumably P(X <= k) for the given mean (confirm against its definition).
        // With k = Support - 1 and mean = Coverage * noiseFreq, that is the chance of seeing
        // fewer observations than we actually did if only noise were at work.
        stats.ChanceVarFreqGreaterThanZero = Poisson.Cdf(stats.Support - 1, stats.Coverage * noiseFreq); // used in SB metric
        stats.ChanceFalsePos = 1 - stats.ChanceVarFreqGreaterThanZero; // used in SB metric

        // Chance of these observations or fewer, given the minimum observable variant distribution.
        stats.ChanceFalseNeg = Poisson.Cdf(stats.Support, stats.Coverage * minDetectableSNP);
        return;
    }

    // No supporting observations at all: the answer depends on the chosen model.
    switch (strandBiasModel)
    {
        case StrandBiasModel.Poisson:
            stats.ChanceFalsePos = 1;
            stats.ChanceVarFreqGreaterThanZero = 0;
            stats.ChanceFalseNeg = 0;
            break;

        case StrandBiasModel.Extended:
            // The chance that we observe the SNP in one observation is minDetectableSNP, so the
            // chance that we miss it is (1 - minDetectableSNP). Missing it N times in a row:
            stats.ChanceVarFreqGreaterThanZero = Math.Pow(1 - minDetectableSNP, stats.Coverage); // used in SB metric

            // Likelihood that the variant really does not exist
            // = 1 - chance that it does exist but we did not see it.
            stats.ChanceFalsePos = 1 - stats.ChanceVarFreqGreaterThanZero; // used in SB metric

            // Chance a low-frequency variant is at work in the model and we did not observe it.
            stats.ChanceFalseNeg = stats.ChanceVarFreqGreaterThanZero;
            break;
    }
}
/// <summary>
/// Produces a new <c>StrandBiasResults</c> with the same scores and flags as
/// <paramref name="originalSBresults"/>. The forward, overall, reverse and stitched statistics
/// are duplicated through <c>StrandBiasStats.DeepCopy</c> rather than shared.
/// </summary>
/// <param name="originalSBresults">The results to duplicate; may be null.</param>
/// <returns>A separate instance with the same values, or null when the input is null.</returns>
public static StrandBiasResults DeepCopy(StrandBiasResults originalSBresults)
{
    if (originalSBresults == null)
    {
        return null;
    }

    return new StrandBiasResults
    {
        // Scalar scores and flags are copied by value.
        BiasAcceptable = originalSBresults.BiasAcceptable,
        BiasScore = originalSBresults.BiasScore,
        GATKBiasScore = originalSBresults.GATKBiasScore,
        VarPresentOnBothStrands = originalSBresults.VarPresentOnBothStrands,
        CovPresentOnBothStrands = originalSBresults.CovPresentOnBothStrands,
        TestAcceptable = originalSBresults.TestAcceptable,
        TestScore = originalSBresults.TestScore,

        // Nested statistics get their own copies (null stays null).
        ForwardStats = StrandBiasStats.DeepCopy(originalSBresults.ForwardStats),
        OverallStats = StrandBiasStats.DeepCopy(originalSBresults.OverallStats),
        ReverseStats = StrandBiasStats.DeepCopy(originalSBresults.ReverseStats),
        StitchedStats = StrandBiasStats.DeepCopy(originalSBresults.StitchedStats),
    };
}
/// <summary>
/// Fills in <c>ChanceFalseNeg</c>, <c>ChanceFalsePos</c> and <c>ChanceVarFreqGreaterThanZero</c>
/// on <paramref name="stats"/> for the diploid case, i.e. asking whether the observed support
/// is plausible for a variant expected at <paramref name="minDetectableSNP"/>.
/// </summary>
/// <param name="stats">Statistics object to update in place.</param>
/// <param name="noiseFreq">
/// Not read by this method; the noise model below uses a fixed rate instead.
/// </param>
/// <param name="minDetectableSNP">
/// Expected variant frequency on this strand (e.g. 20%); used both as the shortcut threshold
/// and as the binomial success probability.
/// </param>
public static void PopulateDiploidStats(StrandBiasStats stats, double noiseFreq, double minDetectableSNP)
{
    // Fixed rate multiplied by coverage to get the mean of the noise model.
    // NOTE(review): noiseFreq is ignored in favour of this constant - confirm that is intended.
    const double fixedNoiseRate = 0.1;

    if (stats.Frequency >= minDetectableSNP)
    {
        // Save ourselves some time: the variant is at or above the expected frequency.
        stats.ChanceFalseNeg = 1; // a true positive if we called it
        stats.ChanceFalsePos = 0; // a false positive if we called it
        stats.ChanceVarFreqGreaterThanZero = 1;
    }
    else
    {
        // Trickier case: we barely see the variant, but we may not have enough reads.
        // Coverage is truncated to an integer trial count for the binomial model.
        var hetAltModel = new MathNet.Numerics.Distributions.Binomial(minDetectableSNP, (int)stats.Coverage);

        // If this were a het variant, would we ever see it this low?
        // (This is a real variant => a false negative if we filtered it.)
        stats.ChanceFalseNeg = Math.Max(hetAltModel.CumulativeDistribution(stats.Support), 0);

        // Chance the support is due to noise, i.e. noise could produce this much or more
        // (a false positive if we left it in).
        var noiseMean = stats.Coverage * fixedNoiseRate;
        stats.ChanceFalsePos = Math.Max(0.0, 1 - Poisson.Cdf(stats.Support, noiseMean));

        stats.ChanceVarFreqGreaterThanZero = stats.ChanceFalseNeg;
    }
}
/// <summary>
/// Verifies <c>StrandBiasCalculator.PopulateDiploidStats</c> across obvious variants, borderline
/// variants, pathological inputs and a range of depths. All expectations are compared to three
/// decimal places, with noiseFreq = 0.01 and a diploid threshold of 0.20.
/// </summary>
public void TestPopulateDiploidStats()
{
    // Arguments: support (#observations), coverage, then the expected
    // ChanceFalseNeg, ChanceFalsePos and ChanceVarFreqGreaterThanZero.

    // Cases where the variant obviously exists (frequency at or above the threshold).
    AssertDiploidStats(100, 100, 1, 0, 1);
    AssertDiploidStats(50, 100, 1, 0, 1);
    AssertDiploidStats(20, 100, 1, 0, 1);

    // Cases where the variant becomes less obvious.
    // 15%: it could still be real (0.129), and noise is not a very likely explanation (0.049).
    AssertDiploidStats(15, 100, 0.129, 0.049, 0.129);
    // 10%: very unlikely to be a real het variant (0.006); noise is now quite plausible (0.417).
    AssertDiploidStats(10, 100, 0.006, 0.417, 0.006);
    // 1%: essentially certain to be noise.
    AssertDiploidStats(1, 100, 0, 1, 0);

    // A few pathological cases.
    AssertDiploidStats(0, 100, 0, 1, 0);
    AssertDiploidStats(10, 0, 1, 0, 1);
    AssertDiploidStats(0, 0, 1, 0, 1); // not a meaningful answer, but at least nothing explodes
    AssertDiploidStats(101, 100, 1, 0, 1);

    // Check it reacts properly to depth: a 15% variant in N of 20 isn't a big deal,
    // but a 15% variant in N of 100000 seems rather low.

    // N = 20: believability of the variant goes up from 0.129 to 0.411, but it is also more
    // possible to be noise (0.143). Basically, the whole picture is more murky.
    AssertDiploidStats(20.0 * 0.15, 20, 0.411, 0.143, 0.411);

    // N = 100: baseline, same as the 15/100 case above.
    AssertDiploidStats(15, 100, 0.129, 0.049, 0.129);

    // N = 500: slightly more likely to be a variant than noise, but neither hypothesis fits.
    AssertDiploidStats(500.0 * 0.15, 500, 0.002, 0, 0.002);

    // N = 100000: it doesn't look like noise or a variant; no hypothesis is reasonable.
    AssertDiploidStats(100000.0 * 0.15, 100000, 0, 0, 0);
}

/// <summary>
/// Builds a <c>StrandBiasStats</c> from the given support and coverage, runs
/// <c>PopulateDiploidStats</c> on it, and checks the three resulting probabilities.
/// Expected values are passed first to <c>Assert.Equal</c> so failure messages read correctly.
/// </summary>
private static void AssertDiploidStats(
    double support,
    double coverage,
    double expectedChanceFalseNeg,
    double expectedChanceFalsePos,
    double expectedChanceVarFreqGreaterThanZero)
{
    const double noiseFreq = 0.01;
    const double diploidThreshold = 0.20;
    const int precision = 3; // decimal places compared

    var stats = new StrandBiasStats(support, coverage);

    StrandBiasCalculator.PopulateDiploidStats(stats, noiseFreq, diploidThreshold);

    Assert.Equal(expectedChanceFalseNeg, stats.ChanceFalseNeg, precision);
    Assert.Equal(expectedChanceFalsePos, stats.ChanceFalsePos, precision);
    Assert.Equal(expectedChanceVarFreqGreaterThanZero, stats.ChanceVarFreqGreaterThanZero, precision);
}