/// <summary>
/// Two-sided or one-sided test for whether the means of two paired (dependent) variables are equal in the true population.
///
/// Hypotheses:
/// H_0: mu_var1 - mu_var2 = 0
/// H_1: mu_var1 - mu_var2 != 0
///
/// Every pair is reduced to its difference (Item1 - Item2). The mean of those differences is the point estimate,
/// the null value is 0, and the standard error is computed from the differences.
///
/// The test statistic is the absolute distance between the point estimate and the null value divided by the standard error,
/// so a one-sided p-value is always measured in the direction of the observed difference.
/// </summary>
/// <param name="sample_for_paired_data">a random sample of paired observations (var1, var2); var1 and var2 are not independent</param>
/// <param name="pValue">receives the computed p-value</param>
/// <param name="significance_level">H_0 is rejected when the p-value is strictly below this level</param>
/// <param name="one_sided">True if the test is one-sided; otherwise the tail probability is doubled</param>
/// <param name="useStudentT">True to use Student's t distribution even when the sample has 30 or more pairs</param>
/// <returns>True if the p-value is below the significance level (H_0 is rejected)</returns>
public bool RejectH0_PairedData(Tuple<double, double>[] sample_for_paired_data, out double pValue, double significance_level = 0.05, bool one_sided = false, bool useStudentT = false)
{
    int pairCount = sample_for_paired_data.Length;

    // Collapse each pair into a single observation: the within-pair difference.
    double[] differences = new double[pairCount];
    int next = 0;
    foreach (Tuple<double, double> pair in sample_for_paired_data)
    {
        differences[next++] = pair.Item1 - pair.Item2;
    }

    const double nullValue = 0;
    double meanDifference = Mean.GetMean(differences);
    double standardError = StandardError.GetStandardError(differences);
    double statistic = System.Math.Abs(meanDifference - nullValue) / standardError;

    // With fewer than 30 pairs the normal approximation is not relied upon; Student's t with
    // (pairCount - 1) degrees of freedom is used instead. The caller can also force it.
    bool studentT = useStudentT || pairCount < 30;
    double percentile = studentT
        ? StudentT.GetPercentile(statistic, pairCount - 1)
        : Gaussian.GetPercentile(statistic);

    double tailFactor = one_sided ? 1 : 2;
    pValue = (1 - percentile) * tailFactor;

    return pValue < significance_level;
}
/// <summary>
/// Return the confidence interval of the population mean (measured on a continuous random variable) given the summary statistics of a random sample.
///
/// The interval is sampleMean + critical_value * standard_error, evaluated at the lower and upper tail quantiles
/// (1 - confidence_level) / 2 and 1 - (1 - confidence_level) / 2.
/// </summary>
/// <param name="sampleMean">point estimate sample mean given by the random sample</param>
/// <param name="sampleStdDev">point estimate sample standard deviation given by the random sample</param>
/// <param name="sampleSize">size of the random sample</param>
/// <param name="confidence_level">the confidence level used to derive the two tail probabilities</param>
/// <param name="useStudentT">True to use Student's t quantiles even when the sample size is 30 or more</param>
/// <returns>A two-element array: [0] is the bound at the lower tail quantile, [1] the bound at the upper tail quantile</returns>
public static double[] GetConfidenceInterval(double sampleMean, double sampleStdDev, int sampleSize, double confidence_level, bool useStudentT = false)
{
    double se = StandardError.GetStandardError(sampleStdDev, sampleSize);

    double lowerTail = (1 - confidence_level) / 2.0;
    double upperTail = 1 - lowerTail;

    double lowerCritical;
    double upperCritical;

    // Samples smaller than 30 (or an explicit request) use Student's t with (sampleSize - 1)
    // degrees of freedom; otherwise the normal quantiles are used.
    if (useStudentT || sampleSize < 30)
    {
        int degreesOfFreedom = sampleSize - 1;
        lowerCritical = StudentT.GetQuantile(lowerTail, degreesOfFreedom);
        upperCritical = StudentT.GetQuantile(upperTail, degreesOfFreedom);
    }
    else
    {
        lowerCritical = Gaussian.GetQuantile(lowerTail);
        upperCritical = Gaussian.GetQuantile(upperTail);
    }

    return new double[]
    {
        sampleMean + lowerCritical * se,
        sampleMean + upperCritical * se
    };
}
/// <summary>
/// Return the confidence interval of the population mean at a given confidence level, when the point estimates
/// (sample mean, sample standard deviation, sample size) are known for multiple groups / classes.
///
/// Note that each class should be a continuous variable.
///
/// Per-group standard errors are combined through StandardError.GetStandardErrorForWeightAverages and the per-group
/// means through Mean.GetMeanForWeightedAverage; the interval is then built around that combined mean.
/// </summary>
/// <param name="sampleMeans">point estimate sample means from different groups / classes</param>
/// <param name="sampleStdDev">point estimate sample standard deviations from different groups / classes</param>
/// <param name="sampleSizes">sample sizes from different groups / classes</param>
/// <param name="confidence_level">the given confidence level</param>
/// <param name="useStudentT">True to use Student's t quantiles even when every group has 30 or more observations</param>
/// <returns>A two-element array holding the confidence interval of the population mean: [0] at the lower tail quantile, [1] at the upper tail quantile</returns>
public static double[] GetConfidenceInterval(double[] sampleMeans, double[] sampleStdDev, int[] sampleSizes, double confidence_level, bool useStudentT = false)
{
    int groupCount = sampleMeans.Length;
    double[] groupStandardErrors = new double[groupCount];
    for (int g = 0; g < groupCount; ++g)
    {
        groupStandardErrors[g] = StandardError.GetStandardError(sampleStdDev[g], sampleSizes[g]);
    }

    double combinedStandardError = StandardError.GetStandardErrorForWeightAverages(sampleSizes, groupStandardErrors);
    double combinedMean = Mean.GetMeanForWeightedAverage(sampleMeans, sampleSizes);

    double lowerTail = (1 - confidence_level) / 2.0;
    double upperTail = 1 - lowerTail;

    // The smallest group decides both whether Student's t is needed (any group below 30)
    // and the degrees of freedom used for it.
    int smallestSampleSize = int.MaxValue;
    foreach (int size in sampleSizes)
    {
        if (size < smallestSampleSize)
        {
            smallestSampleSize = size;
        }
    }

    double lowerCritical;
    double upperCritical;
    if (useStudentT || smallestSampleSize < 30)
    {
        int degreesOfFreedom = smallestSampleSize - 1;
        lowerCritical = StudentT.GetQuantile(lowerTail, degreesOfFreedom);
        upperCritical = StudentT.GetQuantile(upperTail, degreesOfFreedom);
    }
    else
    {
        lowerCritical = Gaussian.GetQuantile(lowerTail);
        upperCritical = Gaussian.GetQuantile(upperTail);
    }

    return new double[]
    {
        combinedMean + lowerCritical * combinedStandardError,
        combinedMean + upperCritical * combinedStandardError
    };
}
/// <summary>
/// Two-sided or one-sided test for a single mean.
///
/// Given that:
/// H_0 : mu = null_value
/// H_A : mu != null_value
///
/// By the Central Limit Theorem:
/// sample_mean ~ N(mu, SE)
///
/// p-value = P(sample_mean is at least |null_value - point_estimate| away from the null_value | mu = null_value)
/// if (p-value &lt; significance_level) reject H_0
///
/// The test statistic is an absolute distance, so a one-sided p-value is always measured in the direction of the observed deviation.
/// </summary>
/// <param name="sample">the random sample whose mean is tested</param>
/// <param name="null_value">the value of the population mean under H_0</param>
/// <param name="pValue">receives the computed p-value</param>
/// <param name="significance_level">H_0 is rejected when the p-value is strictly below this level</param>
/// <param name="one_sided">True if the test is one-sided; otherwise the tail probability is doubled</param>
/// <param name="useStudentT">True to use Student's t distribution even when the sample has 30 or more observations</param>
/// <returns>True if the p-value is below the significance level (H_0 is rejected)</returns>
public static bool RejectH0(double[] sample, double null_value, out double pValue, double significance_level = 0.05, bool one_sided = false, bool useStudentT = false)
{
    double pointEstimate = Mean.GetMean(sample);

    // SE is the estimated standard deviation of the sampling distribution of the mean.
    double standardError = StandardError.GetStandardError(sample);

    // Computed under the assumption that H_0 is true, i.e. the true population mean equals null_value.
    double test_statistic = System.Math.Abs(pointEstimate - null_value) / standardError;

    double percentile;
    if (sample.Length < 30 || useStudentT)
    {
        // With fewer than 30 observations the normal approximation is not relied upon;
        // Student's t with (n - 1) degrees of freedom is used instead.
        percentile = StudentT.GetPercentile(test_statistic, sample.Length - 1);
    }
    else
    {
        percentile = Gaussian.GetPercentile(test_statistic);
    }

    // Upper-tail probability, doubled for a two-sided test.
    pValue = (1 - percentile) * (one_sided ? 1 : 2);
    return pValue < significance_level;
}
/// <summary>
/// Two-sided or one-sided test for whether the means of two independent variables are equal in the true population.
///
/// Hypotheses:
/// H_0: mu_var1 - mu_var2 = 0
/// H_1: mu_var1 - mu_var2 != 0
///
/// The point estimate is the difference of the two sample means, the null value is 0, and the standard error
/// is obtained from both samples via StandardError.GetStandardError.
///
/// The test statistic is the absolute distance between the point estimate and the null value divided by the standard error,
/// so a one-sided p-value is always measured in the direction of the observed difference.
/// </summary>
/// <param name="sample_for_var1">value sample for variable 1</param>
/// <param name="sample_for_var2">value sample for variable 2</param>
/// <param name="pValue">receives the computed p-value</param>
/// <param name="significance_level">H_0 is rejected when the p-value is strictly below this level</param>
/// <param name="one_sided">True if the test is one-sided; otherwise the tail probability is doubled</param>
/// <param name="useStudentT">True to use Student's t distribution even when both samples have 30 or more observations</param>
/// <returns>True if the p-value is below the significance level (H_0 is rejected)</returns>
public bool RejectH0(double[] sample_for_var1, double[] sample_for_var2, out double pValue, double significance_level = 0.05, bool one_sided = false, bool useStudentT = false)
{
    const double nullValue = 0;

    double meanDifference = Mean.GetMean(sample_for_var1) - Mean.GetMean(sample_for_var2);
    double standardError = StandardError.GetStandardError(sample_for_var1, sample_for_var2);
    double statistic = System.Math.Abs(meanDifference - nullValue) / standardError;

    // The smaller sample drives both the small-sample check and the degrees of freedom.
    int smallerLength = System.Math.Min(sample_for_var1.Length, sample_for_var2.Length);

    double percentile;
    if (useStudentT || smallerLength < 30)
    {
        // Student's t replaces the normal distribution when either sample has fewer than 30 observations.
        percentile = StudentT.GetPercentile(statistic, smallerLength - 1);
    }
    else
    {
        percentile = Gaussian.GetPercentile(statistic);
    }

    double tailFactor = one_sided ? 1 : 2;
    pValue = (1 - percentile) * tailFactor;

    return pValue < significance_level;
}
/// <summary>
/// Estimate the normal distribution of a sample mean (for a continuous variable).
///
/// The Central Limit Theorem (CLT) states that:
/// The distribution of sample statistics (e.g., sample mean) is nearly normal, centered at the population mean, and with a standard deviation
/// equal to the population standard deviation divided by the square root of the sample size.
///
/// With the CLT, we can estimate the normal distribution of a sample mean, given its estimated mean and standard deviation as well as the sample size.
///
/// For the CLT to hold for a sample, the following conditions must be met:
/// 1. Independence: sample observations must be independent.
///    - random sample / assignment
///    - if sampling without replacement, the sample size is less than 10% of the population
/// 2. Sample size / skew: either the population distribution is normal, or, if the population distribution is skewed,
///    the sample size is large (rule of thumb: sample size greater than 30).
/// </summary>
/// <param name="sampleMean">point estimate of sample mean</param>
/// <param name="sampleStdDev">standard deviation of a random sample</param>
/// <param name="sampleSize">the size of the random sample</param>
/// <returns>A Gaussian constructed from the sample mean and the standard error derived from sampleStdDev and sampleSize</returns>
public static Gaussian EstimateSampleMeanDistribution(double sampleMean, double sampleStdDev, int sampleSize)
{
    return new Gaussian(
        sampleMean,
        StandardError.GetStandardError(sampleStdDev, sampleSize));
}