private ComparisonDecision MakeDecision(MetricValue baselineMetric, MetricValue candidateMetric)
{
    // We're doing a t-test to make a decision on the test metrics, for now.
    return TTest.Run(baselineMetric, candidateMetric, m_pvalue);
}
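// The types below are referenced in this file but defined elsewhere in the tool.
// This is a minimal sketch of their assumed shape, inferred only from how they are
// used here; the real definitions may carry additional members.
public enum ComparisonDecision
{
    Indeterminate,  // the test could not distinguish the two samples
    Improvement,    // the candidate is significantly better than the baseline
    Regression      // the candidate is significantly worse than the baseline
}

public enum Direction
{
    HigherIsBetter, // e.g. a throughput metric
    LowerIsBetter   // e.g. a latency or allocation metric
}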
/// <summary>
/// Runs an unpaired t-test on the sample data and makes a decision based on
/// the results of the test. The null hypothesis (that the means are the same
/// between the baseline and the candidate) is rejected if the test statistic
/// exceeds the critical value for the given p-value.
/// </summary>
/// <param name="baseline">The baseline metric.</param>
/// <param name="candidate">The candidate metric.</param>
/// <param name="pvalue">The p-value to use for this test. Must be specified up-front
/// in order to avoid bias.</param>
/// <returns>A <see cref="ComparisonDecision"/> made using the data.</returns>
public static ComparisonDecision Run(MetricValue baseline, MetricValue candidate, double pvalue = StandardPValue)
{
    // This statistical test does not require the baseline and candidate to have
    // the same number of samples, but it /does/ require that their variances
    // be the same.
    //
    // TODO(segilles) - if, throughout the course of using this tool, we find that its
    // false positive rate is high due to large variance differences, we should consider
    // switching to Welch's t-test, which is more robust in the case of unequal variances.

    // We use a table for the t-distribution, and we only have entries for certain p-values.
    if (pvalue != StandardPValue && Array.IndexOf(AllowedPValues, pvalue) < 0)
    {
        throw new ArgumentException($"invalid pvalue: {pvalue}");
    }

    // First, we calculate the test statistic.
    double testStatistic = CalculateStatistic(baseline, candidate);
    int degreesOfFreedom = baseline.SampleSize + candidate.SampleSize - 2;
    if (!s_tdistTable.ContainsKey(degreesOfFreedom))
    {
        // TODO(segilles) if this happens a lot, we may consider adding more keys to the table.
        int closest = int.MaxValue;
        foreach (var key in s_tdistTable.Keys)
        {
            if (Math.Abs(key - degreesOfFreedom) <= Math.Abs(closest - degreesOfFreedom))
            {
                closest = key;
            }
        }

        Logger.LogVerbose($"DOF {degreesOfFreedom} not found in table, rounding to closest DOF: {closest}");
        degreesOfFreedom = closest;
    }

    double targetStatistic = s_tdistTable[degreesOfFreedom][pvalue];
    Logger.LogVerbose($"metric {baseline.Name} has test statistic {testStatistic} vs. target {targetStatistic}");
    if (double.IsInfinity(testStatistic) || double.IsNaN(testStatistic))
    {
        // If this occurred, there was something funny about the data we used to calculate.
        return ComparisonDecision.Indeterminate;
    }

    if (Math.Abs(testStatistic) < targetStatistic)
    {
        // There is not enough evidence to reject the null hypothesis.
        return ComparisonDecision.Indeterminate;
    }

    // We've got a statistically significant difference!
    switch (baseline.Direction)
    {
        case Direction.HigherIsBetter:
            return baseline.Value < candidate.Value
                ? ComparisonDecision.Improvement
                : ComparisonDecision.Regression;
        case Direction.LowerIsBetter:
            return baseline.Value > candidate.Value
                ? ComparisonDecision.Improvement
                : ComparisonDecision.Regression;
        default:
            throw new InvalidOperationException("invalid value for Direction enum");
    }
}
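// A minimal sketch of the pooled-variance (Student's) t-statistic that
// CalculateStatistic presumably computes; the real implementation is not shown in
// this file. The MetricValue member Variance is an assumption made for illustration:
// only Value (the sample mean), SampleSize, Name, and Direction appear above.
private static double CalculateStatisticSketch(MetricValue baseline, MetricValue candidate)
{
    int n1 = baseline.SampleSize;
    int n2 = candidate.SampleSize;

    // Pooled variance: a weighted average of the two sample variances. Pooling is
    // the reason this test assumes equal variances (see the comment in Run).
    double pooledVariance =
        ((n1 - 1) * baseline.Variance + (n2 - 1) * candidate.Variance) / (n1 + n2 - 2);

    // Standard error of the difference of the means.
    double standardError = Math.Sqrt(pooledVariance * (1.0 / n1 + 1.0 / n2));

    // The statistic is the difference of the means measured in standard errors.
    // If standardError is zero this yields Infinity or NaN, which Run reports
    // as ComparisonDecision.Indeterminate.
    return (baseline.Value - candidate.Value) / standardError;
}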