Exemplo n.º 1
0
        /// <summary>
        ///     http://www.broadinstitute.org/gsa/wiki/index.php/Understanding_the_Unified_Genotyper%27s_VCF_files
        ///     See section on Strand Bias
        /// </summary>
        // From GATK source:
        //double forwardLod = forwardLog10PofF + reverseLog10PofNull - overallLog10PofF;
        //double reverseLod = reverseLog10PofF + forwardLog10PofNull - overallLog10PofF;
        //
        //// strand score is max bias between forward and reverse strands
        //double strandScore = Math.max(forwardLod, reverseLod);
        //
        //// rescale by a factor of 10
        //strandScore *= 10.0;
        //
        //attributes.put("SB", strandScore);
        private static double[] AssignBiasScore(StrandBiasStats overallStats, StrandBiasStats fwdStats, StrandBiasStats rvsStats)
        {
            // Returns a two-element array: [0] is the raw bias value, [1] is the same value
            // mapped through MathOperations.PtoGATKBiasScale.
            var overallChance = overallStats.ChanceVarFreqGreaterThanZero;

            // With a zero denominator the ratios below are undefined, so the bias value is pinned to 1.
            if (overallChance == 0)
            {
                const double pinnedBias = 1;
                return new[] { pinnedBias, MathOperations.PtoGATKBiasScale(pinnedBias) };
            }

            // Each direction pairs "variant present on this strand" with "false positive on the
            // other strand", normalised by the overall chance of the variant being present.
            var biasTowardForward = (fwdStats.ChanceVarFreqGreaterThanZero * rvsStats.ChanceFalsePos) / overallChance;
            var biasTowardReverse = (rvsStats.ChanceVarFreqGreaterThanZero * fwdStats.ChanceFalsePos) / overallChance;

            // The reported value is the larger of the two directional values.
            var worstBias = Math.Max(biasTowardForward, biasTowardReverse);

            return new[] { worstBias, MathOperations.PtoGATKBiasScale(worstBias) };
        }
        public static int Compute(CalledAllele allele, float targetLimitOfDetectionVF, int minGTQScore, int maxGTQScore)
        {
            // Computes a genotype Q-score for the allele, bounded to [minGTQScore, maxGTQScore]
            // and rounded to an int.

            // No coverage or a no-call: report the floor straight away.
            if (allele.TotalCoverage == 0 || allele.IsNocall)
            {
                return minGTQScore;
            }

            // Default: the genotype score is the variant Q-score. Only homozygous calls are adjusted.
            double unboundedQ = allele.VariantQscore;

            var calledGenotype = allele.Genotype;
            bool isHomozygous = calledGenotype == Genotype.HomozygousRef ||
                                calledGenotype == Genotype.HomozygousAlt;

            if (isHomozygous)
            {
                // A homozygous call claims two things at once:
                //  1) the called allele really is there (the variant Q-score), and
                //  2) no other allele was missed at the target limit-of-detection frequency.
                var pAlleleIsWrong = MathOperations.QtoP(allele.VariantQscore);

                // Deliberately typed as float (not var/double) so that float -> double
                // conversions in downstream arguments behave the same on Windows and Linux.
                float observedOtherReads = (1f - allele.Frequency) * allele.TotalCoverage;
                float expectedOtherReads = targetLimitOfDetectionVF * allele.TotalCoverage;

                // Seeing at least as many non-allele observations as a variant at the limit of
                // detection would produce means a homozygous call cannot be trusted.
                if (observedOtherReads >= expectedOtherReads)
                {
                    return minGTQScore;
                }

                // Chance of observing this few (or fewer) non-allele reads given the expected count.
                var pMissedOtherAllele = Poisson.Cdf(observedOtherReads, expectedOtherReads);

                unboundedQ = MathOperations.PtoQ(pAlleleIsWrong + pMissedOtherAllele);
            }

            // Apply the ceiling first, then the floor, then round to the nearest int.
            var boundedQ = Math.Max(Math.Min(maxGTQScore, unboundedQ), minGTQScore);

            return (int)Math.Round(boundedQ);
        }
Exemplo n.º 3
0
        /// <summary>
        /// This method looks for bias in the variant support / total coverage ratios, by amplicon.
        /// This method is agnostic about where these support and coverage calculations come from, so it is up to the user
        /// to make sure the counts are appropriate for the variant in question.
        /// Note that for SNPs this is fairly straight forward, but for indels and MNVs it can become terribly difficult.
        /// This method should be used with appropriate caution.
        /// </summary>
        /// <param name="supportByAmplicon">the support counts, for each named amplicon</param>
        /// <param name="coverageByAmplicon">the coverage counts, for each named amplicon</param>
        /// <param name="acceptanceCriteria">the minimum probability we accept for the variant being real, given the model</param>
        /// <param name="maxQScore">the max cap for a qscore. This parameter safeguards against reporting insanely high confidence, given the limitations of a simple model that only addresses sampling error</param>
        /// <returns>null when there is no amplicon information to evaluate or fewer than two coverage amplicon entries; otherwise the per-amplicon bias results</returns>
        public static BiasResultsAcrossAmplicons CalculateAmpliconBias(AmpliconCounts supportByAmplicon,
                                                                       AmpliconCounts coverageByAmplicon, float acceptanceCriteria, int maxQScore)
        {
            // If we have no amplicon information on the support side, there is nothing to evaluate.
            if ((supportByAmplicon.AmpliconNames == null) ||
                (supportByAmplicon.AmpliconNames.Length == 0) ||
                (supportByAmplicon.AmpliconNames[0] == null))
            {
                return null;
            }

            // Same for the coverage side: missing names are treated like "no information" instead of
            // throwing, and with fewer than two entries there is no "bias" to detect.
            // NOTE(review): this tests the array length, not the number of non-null names; the loop
            // below stops at the first null name, which suggests the arrays may be padded - confirm.
            if ((coverageByAmplicon.AmpliconNames == null) ||
                (coverageByAmplicon.AmpliconNames.Length < 2))
            {
                return null;
            }

            var resultDict = new BiasResultsAcrossAmplicons()
            {
                ResultsByAmpliconName = new Dictionary<string, AmpliconBiasResult>()
            };

            // Pass 1: per-amplicon frequency, tracking the highest frequency seen. That highest
            // frequency is the estimate every amplicon is measured against in pass 2.
            var maxFreq = 0.0;

            for (int i = 0; i < coverageByAmplicon.AmpliconNames.Length; i++)
            {
                var name = coverageByAmplicon.AmpliconNames[i];
                if (name == null)
                {
                    break; // a null name ends the list of populated amplicons
                }

                double support  = supportByAmplicon.GetCountsForAmplicon(name);
                double coverage = coverageByAmplicon.CountsForAmplicon[i];
                double freq     = (coverage > 0) ? support / coverage : 0;

                // ">=" means that on ties the later amplicon is recorded as the candidate.
                if (freq >= maxFreq)
                {
                    resultDict.AmpliconWithCandidateArtifact = name;
                    maxFreq = freq;
                }

                resultDict.ResultsByAmpliconName.Add(name, new AmpliconBiasResult()
                {
                    Frequency = freq, Name = name, ObservedSupport = support, Coverage = coverage
                });
            }

            // Pass 2: score each amplicon against the support expected at maxFreq.
            bool shouldFailVariant = false;

            foreach (var result in resultDict.ResultsByAmpliconName.Values)
            {
                double support = result.ObservedSupport;
                double expectedNumObservationsOfVariant = maxFreq * result.Coverage;

                int  qScore         = maxQScore;
                var  pChanceItsReal = 1.0;

                bool tooShallowToJudge = expectedNumObservationsOfVariant < Constants.MinNumObservations;
                bool clearlyPresent    = (expectedNumObservationsOfVariant <= support) ||
                                         (result.Frequency > Constants.FreePassObservationFreq);

                // Too shallow: we'd never see it anyway. Clearly present: it is in this amplicon.
                // Either way the amplicon keeps the maximum score and pChanceItsReal stays at 1.
                if (!tooShallowToJudge && !clearlyPresent)
                {
                    // We did not see the variant much on this amplicon. What is the chance that it
                    // exists but showed up at "support" or less, given the estimated frequency?
                    pChanceItsReal = Math.Max(0.0, Poisson.Cdf(support, expectedNumObservationsOfVariant));

                    var q = MathOperations.PtoQ(1.0 - pChanceItsReal);

                    // Cap before the cast: honours maxQScore and keeps a very large or infinite
                    // value out of the double -> int conversion.
                    qScore = (int)Math.Min(maxQScore, q);
                }

                // If acceptanceCriteria corresponds to 1/100, then an observation that a real variant
                // would produce less often than 1 time in 100 is treated as bias and fails the variant.
                bool biasDetected = pChanceItsReal < acceptanceCriteria;
                if (biasDetected)
                {
                    shouldFailVariant = true;
                }

                result.ChanceItsReal    = pChanceItsReal;
                result.ConfidenceQScore = qScore;
                result.BiasDetected     = biasDetected;
                result.ExpectedSupport  = expectedNumObservationsOfVariant;
            }

            // The variant fails overall if any single amplicon showed bias.
            resultDict.BiasDetected = shouldFailVariant;

            return resultDict;
        }