/// <summary>
/// Builds a simulated distribution of sample means starting from an observed sample.
/// The mean and standard deviation of the sample are computed here and forwarded,
/// together with the sample size, to the overload that works from those summary values.
/// </summary>
/// <param name="originalSample">The observed sample that seeds the simulation</param>
/// <param name="bootstrapSampleCount">The number of simulated means to produce</param>
/// <returns>The simulated means produced by the summary-value overload</returns>
public static double[] SimulateSampleMeans(double[] originalSample, int bootstrapSampleCount)
{
    double center = Mean.GetMean(originalSample);
    double spread = StdDev.GetStdDev(originalSample, center);
    int size = originalSample.Length;

    return SimulateSampleMeans(center, spread, size, bootstrapSampleCount);
}
/// <summary>
/// Return the correlation of paired observations (x_1, y_1), (x_2, y_2), ... (x_n, y_n), where n is the sample size.
///
/// The value computed is the average product of standardized deviations:
///   correlation(x, y) = (1 / n) * sum_i( ((x_i - mu_x) / sigma_x) * ((y_i - mu_y) / sigma_y) )
/// where mu_x, mu_y come from Mean.GetMean and sigma_x, sigma_y come from StdDev.GetStdDev.
/// NOTE(review): whether StdDev.GetStdDev divides by n or n - 1 is not visible here; the 1 / n factor above
/// matches a population standard deviation - confirm against StdDev.
/// </summary>
/// <param name="observations">The observations (x_1, y_1), (x_2, y_2), ... (x_n, y_n)</param>
/// <returns>The correlation value for variables x and y</returns>
public double GetCorrelation(Tuple <double, double>[] observations)
{
    int count = observations.Length;
    double[] xs = new double[count];
    double[] ys = new double[count];

    // Split the pairs into two parallel arrays so the shared helpers can be used.
    int position = 0;
    foreach (Tuple<double, double> pair in observations)
    {
        xs[position] = pair.Item1;
        ys[position] = pair.Item2;
        position++;
    }

    double meanX = Mean.GetMean(xs);
    double meanY = Mean.GetMean(ys);
    double spreadX = StdDev.GetStdDev(xs, meanX);
    double spreadY = StdDev.GetStdDev(ys, meanY);

    double total = 0;
    for (int k = 0; k < count; ++k)
    {
        double standardizedX = (xs[k] - meanX) / spreadX;
        double standardizedY = (ys[k] - meanY) / spreadY;
        total += standardizedX * standardizedY;
    }

    return total / count;
}
/// <summary>
/// Return the standard error of the sampling distribution for a random sample.
/// The sample's standard deviation is computed here and passed, with the sample size,
/// to the overload that works from those two values.
/// </summary>
/// <param name="sample">The random sample</param>
/// <returns>The standard error returned by the (standard deviation, sample size) overload</returns>
public static double GetStandardError(double[] sample)
{
    double center = Mean.GetMean(sample);
    double spread = StdDev.GetStdDev(sample, center);
    int size = sample.Length;

    return GetStandardError(spread, size);
}
/// <summary>
/// Get the confidence interval of a continuous variable from a random sample.
/// The sample mean and standard deviation are computed here and forwarded, with the sample size,
/// to the overload that works from those summary values.
/// </summary>
/// <param name="sample">The random sample</param>
/// <param name="confidence_level">The confidence level, forwarded unchanged</param>
/// <param name="useStudentT">Forwarded unchanged to the summary-value overload</param>
/// <returns>The interval returned by the summary-value overload</returns>
public static double[] GetConfidenceInterval(double[] sample, double confidence_level, bool useStudentT = false)
{
    double center = Mean.GetMean(sample);
    double spread = StdDev.GetStdDev(sample, center);
    int size = sample.Length;

    return GetConfidenceInterval(center, spread, size, confidence_level, useStudentT);
}
/// <summary>
/// Two-sided or one-sided test for whether two paired (dependent) variables have the same mean in the population.
///
/// Hypotheses, expressed on the per-pair difference d = var1 - var2:
/// H_0: mu_d = 0
/// H_1: mu_d != 0
///
/// The test statistic is |mean(d) - 0| / SE, where SE is StandardError.GetStandardError of the differences.
/// The statistic is looked up in Student's t distribution (n - 1 degrees of freedom) when n is below 30 or
/// when useStudentT is set, and in the Gaussian distribution otherwise.
/// p-value = (1 - percentile) for a one-sided test, doubled for a two-sided test.
/// </summary>
/// <param name="sample_for_paired_data">A random sample of pairs (var1, var2); var1 and var2 are not independent</param>
/// <param name="pValue">Receives the computed p-value</param>
/// <param name="significance_level">Threshold the p-value is compared against</param>
/// <param name="one_sided">True if the test is one-sided</param>
/// <param name="useStudentT">True to force the Student's t lookup regardless of sample size</param>
/// <returns>True when the p-value is below the significance level (H_0 is rejected)</returns>
public bool RejectH0_PairedData(Tuple <double, double>[] sample_for_paired_data, out double pValue, double significance_level = 0.05, bool one_sided = false, bool useStudentT = false)
{
    int pairCount = sample_for_paired_data.Length;

    // Reduce the paired sample to a single sample of differences.
    double[] differences = new double[pairCount];
    for (int k = 0; k < pairCount; ++k)
    {
        Tuple<double, double> pair = sample_for_paired_data[k];
        differences[k] = pair.Item1 - pair.Item2;
    }

    const double nullValue = 0;
    double meanDifference = Mean.GetMean(differences);
    double standardError = StandardError.GetStandardError(differences);
    double statistic = System.Math.Abs(meanDifference - nullValue) / standardError;

    double cumulative;
    if (!useStudentT && pairCount >= 30)
    {
        cumulative = Gaussian.GetPercentile(statistic);
    }
    else
    {
        // Small samples (or an explicit request) use Student's t with n - 1 degrees of freedom.
        cumulative = StudentT.GetPercentile(statistic, pairCount - 1);
    }

    int tails = one_sided ? 1 : 2;
    pValue = (1 - cumulative) * tails;
    return pValue < significance_level;
}
/// <summary>
/// Return the confidence interval of the difference (class 1 minus class 2) in the proportion of SUCCESS at a given confidence level.
///
/// Each class should be a categorical variable with two levels: {SUCCESS, FAILURE}.
/// Class 1 and class 2 are not paired or dependent.
/// Simulation is used when requested, or when any of p_hat * n or (1 - p_hat) * n is below 10 for either class.
/// </summary>
/// <param name="p_hat1">point estimate of the proportion of SUCCESS in class 1</param>
/// <param name="p_hat2">point estimate of the proportion of SUCCESS in class 2</param>
/// <param name="n1">sample size in class 1</param>
/// <param name="n2">sample size in class 2</param>
/// <param name="confidence_level">The given confidence level</param>
/// <param name="useSimulation">Flag for whether simulation should be used instead of the normal distribution for proportion of SUCCESS</param>
/// <param name="simulationCount">Number of simulated proportions generated per class when simulation is used</param>
/// <returns>A two-element array { lower bound, upper bound } for the difference in proportion of SUCCESS</returns>
public static double[] GetConfidenceInterval(double p_hat1, double p_hat2, int n1, int n2, double confidence_level, bool useSimulation = false, int simulationCount = 500)
{
    // Cumulative probabilities of the two bounds, e.g. 0.025 and 0.975 for a 0.95 confidence level.
    double p1 = (1 - confidence_level) / 2;
    double p2 = 1 - p1;

    bool shouldUseSimulation = useSimulation
        || p_hat1 * n1 < 10 || (1 - p_hat1) * n1 < 10
        || p_hat2 * n2 < 10 || (1 - p_hat2) * n2 < 10;

    if (shouldUseSimulation)
    {
        double[] sim_sample1 = new double[simulationCount];
        double[] sim_sample2 = new double[simulationCount];

        // NOTE(review): the simulated sample size depends only on the proportions (not on n1 / n2) and is not
        // guarded against a proportion of exactly 0 or 1, where the division yields infinity before the int cast.
        int simulationSampleSize = System.Math.Max((int)System.Math.Max(10 / p_hat1, 10 / (1 - p_hat1)) * 2, (int)System.Math.Max(10 / p_hat2, 10 / (1 - p_hat2)) * 2);

        for (int i = 0; i < simulationCount; ++i)
        {
            int successCount1 = 0;
            int successCount2 = 0;
            for (int j = 0; j < simulationSampleSize; ++j)
            {
                if (DistributionModel.GetUniform() <= p_hat1)
                {
                    successCount1++;
                }
                if (DistributionModel.GetUniform() <= p_hat2)
                {
                    successCount2++;
                }
            }
            sim_sample1[i] = (double)successCount1 / simulationSampleSize;
            sim_sample2[i] = (double)successCount2 / simulationSampleSize;
        }

        double sim_mu1 = Mean.GetMean(sim_sample1);
        double sim_sigma1 = StdDev.GetStdDev(sim_sample1, sim_mu1);
        double sim_mu2 = Mean.GetMean(sim_sample2);
        double sim_sigma2 = StdDev.GetStdDev(sim_sample2, sim_mu2);

        double sim_mud = sim_mu1 - sim_mu2;
        double sim_SE = System.Math.Sqrt(sim_sigma1 * sim_sigma1 + sim_sigma2 * sim_sigma2);

        // Fix: the lower bound previously called Gaussian.GetPercentile(p1). Both bounds need the critical value
        // for a cumulative probability, which is what GetQuantile supplies for the upper bound and in the
        // analytic branch below.
        return new double[]
        {
            sim_mud + Gaussian.GetQuantile(p1) * sim_SE,
            sim_mud + Gaussian.GetQuantile(p2) * sim_SE
        };
    }

    double SE = System.Math.Sqrt((p_hat1 * (1 - p_hat1) / n1 + (p_hat2 * (1 - p_hat2)) / n2));
    double pd_hat = p_hat1 - p_hat2;
    return new double[]
    {
        pd_hat + Gaussian.GetQuantile(p1) * SE,
        pd_hat + Gaussian.GetQuantile(p2) * SE
    };
}
/// <summary>
/// Two-sided or one-sided test for a single median.
///
/// H_0 : median = null_value
/// H_A : median != null_value
///
/// A set of simulated medians is obtained from SimulateSampleMedians; their mean is used as the point estimate
/// and their standard deviation as the standard error. The decision and p-value are delegated to
/// HypothesisTesting.RejectH0, which also receives the size of the original sample.
/// </summary>
/// <param name="originalSample">The original sample</param>
/// <param name="bootstrapSampleCount">Number of simulated medians requested from SimulateSampleMedians</param>
/// <param name="null_value">The median value assumed under H_0</param>
/// <param name="pValue">Receives the p-value computed by HypothesisTesting.RejectH0</param>
/// <param name="significance_level">Forwarded unchanged to HypothesisTesting.RejectH0</param>
/// <param name="one_sided">Forwarded unchanged to HypothesisTesting.RejectH0</param>
/// <returns>The decision returned by HypothesisTesting.RejectH0</returns>
public static bool RejectH0_ForMedian(double[] originalSample, int bootstrapSampleCount, double null_value, out double pValue, double significance_level = 0.05, bool one_sided = false)
{
    double[] simulatedMedians = SimulateSampleMedians(originalSample, bootstrapSampleCount);

    double centerOfMedians = Mean.GetMean(simulatedMedians);
    double spreadOfMedians = StdDev.GetStdDev(simulatedMedians, centerOfMedians);
    int originalSize = originalSample.Length;

    bool reject = HypothesisTesting.RejectH0(centerOfMedians, null_value, spreadOfMedians, originalSize, out pValue, significance_level, one_sided);
    return reject;
}
/// <summary>
/// Return the standard error of the difference between the sample statistics of var1 and var2,
/// assuming var1 and var2 are independent.
///
/// Computed as sqrt(sigma_1^2 / n_1 + sigma_2^2 / n_2), with each sigma taken from StdDev.GetStdDev.
/// </summary>
/// <param name="sample_for_var1">random sample for var1</param>
/// <param name="sample_for_var2">random sample for var2</param>
/// <returns>The standard error of the difference</returns>
public static double GetStandardError(double[] sample_for_var1, double[] sample_for_var2)
{
    double meanFirst = Mean.GetMean(sample_for_var1);
    double meanSecond = Mean.GetMean(sample_for_var2);

    double spreadFirst = StdDev.GetStdDev(sample_for_var1, meanFirst);
    double spreadSecond = StdDev.GetStdDev(sample_for_var2, meanSecond);

    // Each term is one sample's contribution to the variance of the difference.
    double contributionFirst = spreadFirst * spreadFirst / sample_for_var1.Length;
    double contributionSecond = spreadSecond * spreadSecond / sample_for_var2.Length;

    return System.Math.Sqrt(contributionFirst + contributionSecond);
}
/// <summary>
/// Compute the mean of every group in a grouped sample.
/// </summary>
/// <param name="groupSample">Values of the sample keyed by group id</param>
/// <returns>A new dictionary mapping each group id to the Mean.GetMean of that group's values</returns>
public static Dictionary <int, double> GetMeanWithinGroup(Dictionary <int, List <double> > groupSample)
{
    Dictionary<int, double> meanByGroup = new Dictionary<int, double>();

    foreach (KeyValuePair<int, List<double>> entry in groupSample)
    {
        meanByGroup[entry.Key] = Mean.GetMean(entry.Value);
    }

    return meanByGroup;
}
/// <summary>
/// Pairwise comparison of group1 and group2 using values obtained from a prior ANOVA.
/// The t statistic is built by GetTStatistic from the two group means, the two group sizes,
/// a null value of 0 and anova.MSE; the p-value is then obtained from GetPValue with anova.dfE.
/// </summary>
/// <param name="group1">random sample from class 1</param>
/// <param name="group2">random sample from class 2</param>
/// <param name="anova">parameters obtained after ANOVA (MSE and dfE are read)</param>
/// <returns>The p-value returned by GetPValue</returns>
public static double PairwiseCompare(List <double> group1, List <double> group2, ANOVA anova)
{
    int nullValue = 0;

    double meanOfGroup1 = Mean.GetMean(group1);
    double meanOfGroup2 = Mean.GetMean(group2);

    double tStatistic = GetTStatistic(meanOfGroup1, meanOfGroup2, group1.Count, group2.Count, nullValue, anova.MSE);

    return GetPValue(tStatistic, anova.dfE);
}
/// <summary>
/// Return the sum of squares group (SSG).
///
/// SSG measures the variability between groups (explained variability): for every group, the squared
/// deviation of the group mean from the grand mean, weighted by the group size, summed over all groups.
/// </summary>
/// <param name="groupedSample">The sample grouped by class id</param>
/// <param name="grand_mean">The mean of all data points across all groups</param>
/// <returns>The sum over groups of size * (group mean - grand mean)^2</returns>
public static double GetSSG(Dictionary <int, List <double> > groupedSample, double grand_mean)
{
    double total = 0;

    foreach (List<double> members in groupedSample.Values)
    {
        double deviation = Mean.GetMean(members) - grand_mean;
        double weight = members.Count;
        total += weight * deviation * deviation;
    }

    return total;
}
/// <summary>
/// Return a confidence interval for the mean based on simulated sample means.
/// The simulated means come from SimulateSampleMeans; their mean is the interval center and their
/// standard deviation is the standard error. The bounds are center + z * SE, with z taken from
/// Gaussian.GetQuantile at (1 - confidence_level) / 2 and at one minus that value.
/// </summary>
/// <param name="originalSample">The original sample</param>
/// <param name="bootstrapSampleCount">Number of simulated means to generate</param>
/// <param name="confidence_level">The confidence level of the interval</param>
/// <returns>A two-element array { lower bound, upper bound }</returns>
public static double[] GetConfidenceIntervalForMean(double[] originalSample, int bootstrapSampleCount, double confidence_level)
{
    double[] simulatedMeans = SimulateSampleMeans(originalSample, bootstrapSampleCount);
    double center = Mean.GetMean(simulatedMeans);
    double standardError = StdDev.GetStdDev(simulatedMeans, center);

    double lowerTail = (1 - confidence_level) / 2;
    double upperTail = 1 - lowerTail;
    double lowerCritical = Gaussian.GetQuantile(lowerTail);
    double upperCritical = Gaussian.GetQuantile(upperTail);

    double[] interval = new double[2];
    interval[0] = center + lowerCritical * standardError;
    interval[1] = center + upperCritical * standardError;
    return interval;
}
/// <summary>
/// Return the sum of squares total (SST).
///
/// SST measures the total variability in the response variable: the sum of squared deviations of
/// every data point from the grand mean.
/// </summary>
/// <param name="totalSample">all the data points in the sample containing all classes</param>
/// <param name="grand_mean">Receives the mean of all the data points, as computed by Mean.GetMean</param>
/// <returns>The sum of squared deviations from the grand mean</returns>
public static double GetSST(double[] totalSample, out double grand_mean)
{
    grand_mean = Mean.GetMean(totalSample);

    double total = 0;
    foreach (double value in totalSample)
    {
        double deviation = value - grand_mean;
        total += deviation * deviation;
    }

    return total;
}
/// <summary>
/// Return the mean and standard error of the statistic (a*x + b*y) for correlated random variables x and y.
///
/// result_mean = a * mean(x) + b * mean(y)
/// result_SE   = sqrt( (a * sigma_x)^2 / n_x + (b * sigma_y)^2 / n_y
///                     + 2 * correlation * a * sigma_x * b * sigma_y / sqrt(n_x * n_y) )
/// </summary>
/// <param name="x">random sample for random variable x</param>
/// <param name="y">random sample for random variable y</param>
/// <param name="x_coefficient">a, the coefficient of x</param>
/// <param name="y_coefficient">b, the coefficient of y</param>
/// <param name="correlation">correlation between x and y</param>
/// <param name="result_mean">output mean for a*x + b*y</param>
/// <param name="result_SE">output standard error for a*x + b*y</param>
public static void Sum(double[] x, double[] y, int x_coefficient, double y_coefficient, double correlation, out double result_mean, out double result_SE)
{
    // NOTE(review): x_coefficient is declared int while y_coefficient is double; the signature is kept
    // as-is for callers, but the asymmetry looks unintentional - confirm.
    double mean_x = Mean.GetMean(x);
    double mean_y = Mean.GetMean(y);

    double stddev_x = StdDev.GetStdDev(x, mean_x);
    double stddev_y = StdDev.GetStdDev(y, mean_y);

    result_mean = x_coefficient * mean_x + y_coefficient * mean_y;

    // Fix: multiply the lengths as doubles. The original int * int product silently wraps for large
    // samples (unchecked arithmetic), which can hand a negative value to Sqrt and turn the result into NaN.
    double pooledSize = System.Math.Sqrt((double)x.Length * y.Length);

    double varianceFromX = System.Math.Pow(x_coefficient * stddev_x, 2) / x.Length;
    double varianceFromY = System.Math.Pow(y_coefficient * stddev_y, 2) / y.Length;
    double covarianceTerm = 2 * correlation * x_coefficient * stddev_x * y_coefficient * stddev_y / pooledSize;

    result_SE = System.Math.Sqrt(varianceFromX + varianceFromY + covarianceTerm);
}
/// <summary>
/// Return a set of simulated sample means that form the bootstrap distribution for the mean.
///
/// Each simulated sample has originalSampleSize values drawn from a Gaussian constructed with the
/// original sample's mean and standard deviation; the mean of each simulated sample is recorded.
/// </summary>
/// <param name="originalSampleMean">point estimate of sample mean from the original sample</param>
/// <param name="originalSampleStdDev">standard deviation of the original sample</param>
/// <param name="originalSampleSize">size of the original sample, used as the size of every simulated sample</param>
/// <param name="bootstrapSampleCount">The number of simulated samples (and therefore means) to produce</param>
/// <returns>An array of bootstrapSampleCount simulated means</returns>
public static double[] SimulateSampleMeans(double originalSampleMean, double originalSampleStdDev, int originalSampleSize, int bootstrapSampleCount)
{
    Gaussian source = new Gaussian(originalSampleMean, originalSampleStdDev);
    double[] simulatedMeans = new double[bootstrapSampleCount];

    // Scratch buffer reused for every simulated sample; it is fully overwritten on each round.
    double[] draws = new double[originalSampleSize];

    for (int round = 0; round < simulatedMeans.Length; ++round)
    {
        for (int slot = 0; slot < draws.Length; ++slot)
        {
            draws[slot] = source.Next();
        }

        simulatedMeans[round] = Mean.GetMean(draws);
    }

    return simulatedMeans;
}
/// <summary>
/// Return the confidence interval for the proportion of SUCCESS in the population at a given confidence level,
/// given the sample proportion point estimate.
///
/// Simulation is used when requested, or when the expected success or failure count
/// (proportion * sampleSize or (1 - proportion) * sampleSize, truncated to int) is below 10.
/// </summary>
/// <param name="proportion">sample proportion point estimate</param>
/// <param name="sampleSize">sample size</param>
/// <param name="confidence_level">The given confidence level</param>
/// <param name="useSimulation">Flag for whether simulation should be used instead of the normal distribution</param>
/// <param name="simulationCount">Number of simulated proportions generated when simulation is used</param>
/// <returns>A two-element array { lower bound, upper bound } for the proportion of SUCCESS</returns>
public static double[] GetConfidenceInterval(double proportion, int sampleSize, double confidence_level, bool useSimulation = false, int simulationCount = 500)
{
    double standard_error = StandardError.GetStandardErrorForProportion(proportion, sampleSize);

    // Cumulative probabilities of the two bounds, e.g. 0.025 and 0.975 for a 0.95 confidence level.
    double p1 = (1 - confidence_level) / 2;
    double p2 = 1 - p1;

    int expected_success_count = (int)(proportion * sampleSize);
    int expected_failure_count = (int)((1 - proportion) * sampleSize);

    // If np < 10 or n(1-p) < 10, the CLT for proportions no longer holds and simulation is used
    // in place of the normal distribution.
    if (expected_failure_count < 10 || expected_success_count < 10 || useSimulation)
    {
        double[] sampleProportions = new double[simulationCount];

        // NOTE(review): the simulated sample size depends only on the proportion (not on sampleSize) and is not
        // guarded against a proportion of exactly 0 or 1, where the division yields infinity before the int cast.
        int simulationSampleSize = (int)System.Math.Max(10 / proportion, 10 / (1 - proportion)) * 2;

        for (int i = 0; i < simulationCount; ++i)
        {
            int successCount = 0;
            for (int j = 0; j < simulationSampleSize; ++j)
            {
                if (DistributionModel.GetUniform() <= proportion)
                {
                    successCount++;
                }
            }
            sampleProportions[i] = (double)successCount / simulationSampleSize;
        }

        double proportion_mu = Mean.GetMean(sampleProportions);
        double proportion_sigma = StdDev.GetStdDev(sampleProportions, proportion_mu);

        // Fix: the lower bound previously called Gaussian.GetPercentile(p1). Both bounds need the critical value
        // for a cumulative probability, which is what GetQuantile supplies for the upper bound and in the
        // analytic branch below.
        return new double[]
        {
            proportion_mu + Gaussian.GetQuantile(p1) * proportion_sigma,
            proportion_mu + Gaussian.GetQuantile(p2) * proportion_sigma
        };
    }

    double critical_value1 = Gaussian.GetQuantile(p1);
    double critical_value2 = Gaussian.GetQuantile(p2);

    double[] confidence_interval = new double[2];
    confidence_interval[0] = proportion + critical_value1 * standard_error;
    confidence_interval[1] = proportion + critical_value2 * standard_error;
    return confidence_interval;
}
/// <summary>
/// Two-sided or one-sided test for a single mean.
///
/// Given that:
/// H_0 : mu = null_value
/// H_A : mu != null_value
///
/// The test statistic is |sample_mean - null_value| / SE, where SE is StandardError.GetStandardError of the sample.
/// The statistic is looked up in Student's t distribution (n - 1 degrees of freedom) when n is below 30 or
/// when useStudentT is set, and in the Gaussian distribution otherwise.
/// p-value = (1 - percentile) for a one-sided test, doubled for a two-sided test.
/// if (p-value &lt; significance_level) reject H_0
/// </summary>
/// <param name="sample">The random sample</param>
/// <param name="null_value">The mean assumed under H_0</param>
/// <param name="pValue">Receives the computed p-value</param>
/// <param name="significance_level">Threshold the p-value is compared against</param>
/// <param name="one_sided">True if the test is one_sided</param>
/// <param name="useStudentT">True to force the Student's t lookup regardless of sample size</param>
/// <returns>True when the p-value is below the significance level (H_0 is rejected)</returns>
public static bool RejectH0(double[] sample, double null_value, out double pValue, double significance_level = 0.05, bool one_sided = false, bool useStudentT = false)
{
    double pointEstimate = Mean.GetMean(sample);

    // SE is the estimated standard deviation of the sample mean.
    double standardError = StandardError.GetStandardError(sample);

    // Distance of the point estimate from the null value in units of SE; this assumes H_0 is true.
    double test_statistic = System.Math.Abs(pointEstimate - null_value) / standardError;

    double percentile;
    if (sample.Length < 30 || useStudentT)
    {
        // With fewer than 30 observations the normal approximation is not relied on;
        // Student's t distribution with n - 1 degrees of freedom is used instead.
        percentile = StudentT.GetPercentile(test_statistic, sample.Length - 1);
    }
    else
    {
        percentile = Gaussian.GetPercentile(test_statistic);
    }

    // Fix: the original read "pValue = pValue = ...", a duplicated chained assignment.
    pValue = (1 - percentile) * (one_sided ? 1 : 2);
    return pValue < significance_level;
}
/// <summary>
/// Compute, for every group, the mean of its values in each dimension.
///
/// The set of groups is taken from the first dimension's dictionary; every group gets an array with one
/// slot per dimension, filled with Mean.GetMean of that group's values in that dimension.
/// </summary>
/// <param name="groupSampleWithDim">One grouped sample (values keyed by group id) per dimension</param>
/// <returns>
/// A dictionary mapping each group id to its per-dimension means; empty when no dimensions are supplied.
/// A group that is absent from a later dimension keeps 0 in that slot.
/// </returns>
public static Dictionary <int, double[]> GetMeanWithinGroup(Dictionary <int, List <double> >[] groupSampleWithDim)
{
    Dictionary<int, double[]> means = new Dictionary<int, double[]>();
    int D = groupSampleWithDim.Length;

    // Fix: with no dimensions there is no first dictionary to read the group ids from;
    // return the empty result instead of failing on groupSampleWithDim[0].
    if (D == 0)
    {
        return means;
    }

    foreach (int groupId in groupSampleWithDim[0].Keys)
    {
        means[groupId] = new double[D];
    }

    for (int d = 0; d < D; ++d)
    {
        foreach (KeyValuePair<int, List<double>> entry in groupSampleWithDim[d])
        {
            // A group id that did not appear in dimension 0 has no array and still raises KeyNotFoundException here.
            means[entry.Key][d] = Mean.GetMean(entry.Value);
        }
    }

    return means;
}
/// <summary>
/// Two-sided or one-sided test for whether two independent variables have the same mean in the population.
///
/// Hypotheses:
/// H_0: mu_var1 - mu_var2 = 0
/// H_1: mu_var1 - mu_var2 != 0
///
/// The test statistic is |mean(var1) - mean(var2) - 0| / SE, where SE is the two-sample
/// StandardError.GetStandardError. The statistic is looked up in Student's t distribution when either sample
/// has fewer than 30 values or when useStudentT is set (degrees of freedom: the smaller sample size minus one),
/// and in the Gaussian distribution otherwise.
/// p-value = (1 - percentile) for a one-sided test, doubled for a two-sided test.
/// </summary>
/// <param name="sample_for_var1">value sample for variable 1</param>
/// <param name="sample_for_var2">value sample for variable 2</param>
/// <param name="pValue">Receives the computed p-value</param>
/// <param name="significance_level">Threshold the p-value is compared against</param>
/// <param name="one_sided">True if the test is one-sided</param>
/// <param name="useStudentT">True to force the Student's t lookup regardless of sample sizes</param>
/// <returns>True when the p-value is below the significance level (H_0 is rejected)</returns>
public bool RejectH0(double[] sample_for_var1, double[] sample_for_var2, out double pValue, double significance_level = 0.05, bool one_sided = false, bool useStudentT = false)
{
    const double nullValue = 0;

    double meanGap = Mean.GetMean(sample_for_var1) - Mean.GetMean(sample_for_var2);
    double standardError = StandardError.GetStandardError(sample_for_var1, sample_for_var2);
    double statistic = System.Math.Abs(meanGap - nullValue) / standardError;

    int size1 = sample_for_var1.Length;
    int size2 = sample_for_var2.Length;

    double cumulative;
    if (!useStudentT && size1 >= 30 && size2 >= 30)
    {
        cumulative = Gaussian.GetPercentile(statistic);
    }
    else
    {
        // Degrees of freedom follow the smaller of the two samples.
        int degreesOfFreedom = System.Math.Min(size1 - 1, size2 - 1);
        cumulative = StudentT.GetPercentile(statistic, degreesOfFreedom);
    }

    int tails = one_sided ? 1 : 2;
    pValue = (1 - cumulative) * tails;
    return pValue < significance_level;
}
/// <summary>
/// Calculate the confidence interval for the proportion of SUCCESS in the population at a given confidence level,
/// given point estimate proportions from multiple groups.
///
/// This is only for a categorical variable with two levels: SUCCESS, FAILURE.
/// Simulation is used when requested, or when any group's expected success or failure count
/// (proportion * size or (1 - proportion) * size, truncated to int) is below 10. The simulation uses the
/// pooled proportion (total expected successes divided by total sample size).
/// </summary>
/// <param name="proportions">The point estimate proportion of SUCCESS obtained from each group</param>
/// <param name="sampleSizes">The sample size of each group</param>
/// <param name="confidence_level">The given confidence level</param>
/// <param name="useSimulation">Flag for whether simulation should be used instead of the normal distribution</param>
/// <param name="simulationCount">Number of simulated proportions generated when simulation is used</param>
/// <returns>A two-element array { lower bound, upper bound } for the proportion of SUCCESS</returns>
public static double[] GetConfidenceInterval(double[] proportions, int[] sampleSizes, double confidence_level, bool useSimulation = false, int simulationCount = 500)
{
    // Cumulative probabilities of the two bounds, e.g. 0.025 and 0.975 for a 0.95 confidence level.
    double p1 = (1 - confidence_level) / 2;
    double p2 = 1 - p1;

    bool shouldUseSimulation = useSimulation;
    if (!shouldUseSimulation)
    {
        for (int i = 0; i < sampleSizes.Length; ++i)
        {
            int n_i = sampleSizes[i];
            int expected_success_count = (int)(proportions[i] * n_i);
            int expected_failure_count = (int)((1 - proportions[i]) * n_i);
            if (expected_failure_count < 10 || expected_success_count < 10)
            {
                shouldUseSimulation = true;
                break;
            }
        }
    }

    if (shouldUseSimulation)
    {
        // Pool the groups into a single proportion estimate.
        double success_count = 0;
        double total_count = 0;
        for (int i = 0; i < sampleSizes.Length; ++i)
        {
            int n_i = sampleSizes[i];
            success_count += proportions[i] * n_i;
            total_count += n_i;
        }
        double p_hat = success_count / total_count;

        double[] sampleProportions = new double[simulationCount];

        // NOTE(review): the simulated sample size depends only on p_hat (not on the group sizes) and is not
        // guarded against p_hat of exactly 0 or 1, where the division yields infinity before the int cast.
        int simulationSampleSize = (int)System.Math.Max(10 / p_hat, 10 / (1 - p_hat)) * 2;

        for (int i = 0; i < simulationCount; ++i)
        {
            int successCount = 0;
            for (int j = 0; j < simulationSampleSize; ++j)
            {
                if (DistributionModel.GetUniform() <= p_hat)
                {
                    successCount++;
                }
            }
            sampleProportions[i] = (double)successCount / simulationSampleSize;
        }

        double proportion_mu = Mean.GetMean(sampleProportions);
        double proportion_sigma = StdDev.GetStdDev(sampleProportions, proportion_mu);

        // Fix: the lower bound previously called Gaussian.GetPercentile(p1). Both bounds need the critical value
        // for a cumulative probability, which is what GetQuantile supplies for the upper bound and in the
        // analytic branch below.
        return new double[]
        {
            proportion_mu + Gaussian.GetQuantile(p1) * proportion_sigma,
            proportion_mu + Gaussian.GetQuantile(p2) * proportion_sigma
        };
    }

    double[] standardErrors = new double[proportions.Length];
    for (int i = 0; i < proportions.Length; ++i)
    {
        standardErrors[i] = StandardError.GetStandardErrorForProportion(proportions[i], sampleSizes[i]);
    }

    double standardError = StandardError.GetStandardErrorForWeightAverages(sampleSizes, standardErrors);
    double sampleMean = Mean.GetMeanForWeightedAverage(proportions, sampleSizes);

    double critical_value1 = Gaussian.GetQuantile(p1);
    double critical_value2 = Gaussian.GetQuantile(p2);

    double[] confidence_interval = new double[2];
    confidence_interval[0] = sampleMean + critical_value1 * standardError;
    confidence_interval[1] = sampleMean + critical_value2 * standardError;
    return confidence_interval;
}