/// <summary>
/// Computes a confidence interval for a continuous variable from a random sample.
/// The sample mean and standard deviation are derived here and handed, together with the
/// sample size, to the summary-statistics overload of GetConfidenceInterval.
/// </summary>
/// <param name="sample">The observed values of the random sample</param>
/// <param name="confidence_level">The requested confidence level, forwarded unchanged (presumably a fraction such as 0.95 - confirm against the overload)</param>
/// <param name="useStudentT">Forwarded unchanged to the overload; presumably switches from the normal to the Student's t distribution - confirm against the overload</param>
/// <returns>The interval produced by the summary-statistics overload</returns>
public static double[] GetConfidenceInterval(double[] sample, double confidence_level, bool useStudentT = false)
{
    double mean = Mean.GetMean(sample);
    double stdDev = StdDev.GetStdDev(sample, mean);
    int sampleSize = sample.Length;

    return GetConfidenceInterval(mean, stdDev, sampleSize, confidence_level, useStudentT);
}
/// <summary>
/// Return the correlation for observations (x_1, y_1), (x_2, y_2), ... (x_n, y_n), where n is the sample size.
/// The value is the average product of the standardized coordinates:
/// correlation(x, y) = (1 / n) * sum_i(((x_i - mu_x) / sigma_x) * ((y_i - mu_y) / sigma_y))
/// where mu_x and mu_y come from Mean.GetMean and sigma_x and sigma_y come from StdDev.GetStdDev.
/// NOTE(review): this equals Pearson's r only if StdDev.GetStdDev divides by n (population form) - confirm.
/// </summary>
/// <param name="observations">The observations (x_1, y_1), (x_2, y_2), ... (x_n, y_n), where n is the sample size</param>
/// <returns>The correlation value for variable x and y</returns>
public double GetCorrelation(Tuple<double, double>[] observations)
{
    int count = observations.Length;
    double[] xs = new double[count];
    double[] ys = new double[count];

    // Split the pairs into one array per variable so the mean / std-dev helpers can be reused.
    int index = 0;
    foreach (Tuple<double, double> pair in observations)
    {
        xs[index] = pair.Item1;
        ys[index] = pair.Item2;
        index++;
    }

    double meanX = Mean.GetMean(xs);
    double meanY = Mean.GetMean(ys);
    double sdX = StdDev.GetStdDev(xs, meanX);
    double sdY = StdDev.GetStdDev(ys, meanY);

    // Accumulate the products of the standardized x and y values.
    double accumulated = 0;
    for (int k = 0; k != count; k++)
    {
        double standardizedX = (xs[k] - meanX) / sdX;
        double standardizedY = (ys[k] - meanY) / sdY;
        accumulated += standardizedX * standardizedY;
    }

    return accumulated / count;
}
/// <summary>
/// Return the standard error of the sampling distribution for a random sample.
/// The sample mean is only needed to obtain the sample standard deviation; the standard
/// deviation and the sample size are then passed to the (stdDev, size) overload of GetStandardError.
/// </summary>
/// <param name="sample">The random sample given</param>
/// <returns>The standard error returned by the (stdDev, size) overload for this sample</returns>
public static double GetStandardError(double[] sample)
{
    double mean = Mean.GetMean(sample);
    double stdDev = StdDev.GetStdDev(sample, mean);
    int sampleSize = sample.Length;

    return GetStandardError(stdDev, sampleSize);
}
/// <summary>
/// Return a set of simulated bootstrap statistics that form the bootstrap distribution for medians via simulation given the original sample.
/// The original sample is reduced to its median, its standard deviation and its size, which are
/// forwarded to the summary-statistics overload of SimulateSampleMedians.
/// </summary>
/// <param name="originalSample">The original sample</param>
/// <param name="bootstrapSampleCount">The number of bootstrap samples collected to form the bootstrap distribution</param>
/// <returns>The simulated medians produced by the summary-statistics overload</returns>
public static double[] SimulateSampleMedians(double[] originalSample, int bootstrapSampleCount)
{
    double originalSampleMedian = Median.GetMedian(originalSample);

    // The standard deviation has to be measured around the sample mean. Feeding the median in as
    // the centre (as this method used to do) overstates the spread whenever mean and median differ.
    double total = 0;
    foreach (double value in originalSample)
    {
        total += value;
    }
    double originalSampleMean = total / originalSample.Length;
    double originalSampleStdDev = StdDev.GetStdDev(originalSample, originalSampleMean);

    return SimulateSampleMedians(originalSampleMedian, originalSampleStdDev, originalSample.Length, bootstrapSampleCount);
}
/// <summary>
/// Return the confidence interval of the difference (class 1 minus class 2) between two classes in terms of the proportion of SUCCESS in the population at a given confidence level.
///
/// Note that each class should be a categorical variable with two levels : {SUCCESS, FAILURE}
/// Note that class 1 and class 2 are not paired or dependent
///
/// The normal approximation with SE = sqrt(p_hat1 * (1 - p_hat1) / n1 + p_hat2 * (1 - p_hat2) / n2) is used when
/// every expected success and failure count (p_hat * n and (1 - p_hat) * n) is at least 10 and simulation was not requested.
/// Otherwise both proportions are simulated and the interval is built from the simulated means and standard deviations.
/// </summary>
/// <param name="p_hat1">point estimate of the proportion of SUCCESS in class 1</param>
/// <param name="p_hat2">point estimate of the proportion of SUCCESS in class 2</param>
/// <param name="n1">sample size in class 1</param>
/// <param name="n2">sample size in class 2</param>
/// <param name="confidence_level">The given confidence level</param>
/// <param name="useSimulation">Flag for whether simulation should be used instead of the normal distribution for proportion of SUCCESS</param>
/// <param name="simulationCount">Number of simulated proportions generated per class when simulation is used</param>
/// <returns>The confidence interval { lower, upper } of the difference between two classes in terms of the proportion of SUCCESS</returns>
public static double[] GetConfidenceInterval(double p_hat1, double p_hat2, int n1, int n2, double confidence_level, bool useSimulation = false, int simulationCount = 500)
{
    double p1 = (1 - confidence_level) / 2; // probability mass left of the lower bound
    double p2 = 1 - p1;                     // probability mass left of the upper bound

    // Fall back to simulation when any expected success / failure count is below 10.
    bool shouldUseSimulation = useSimulation;
    if (!shouldUseSimulation && (p_hat1 * n1 < 10 || (1 - p_hat1) * n1 < 10 || p_hat2 * n2 < 10 || (1 - p_hat2) * n2 < 10))
    {
        shouldUseSimulation = true;
    }

    if (!shouldUseSimulation)
    {
        double SE = System.Math.Sqrt((p_hat1 * (1 - p_hat1) / n1 + (p_hat2 * (1 - p_hat2)) / n2));
        double pd_hat = p_hat1 - p_hat2;
        return new double[] { pd_hat + Gaussian.GetQuantile(p1) * SE, pd_hat + Gaussian.GetQuantile(p2) * SE };
    }

    double[] sim_sample1 = new double[simulationCount]; // simulated proportions of SUCCESS for class 1 (approximately normal by the CLT for proportions)
    double[] sim_sample2 = new double[simulationCount]; // simulated proportions of SUCCESS for class 2 (approximately normal by the CLT for proportions)

    // The number of trials per simulated sample is derived from the point estimates only
    // (twice the truncated max of 10 / p and 10 / (1 - p), taking the larger of the two classes).
    // NOTE(review): n1 and n2 are not used by the simulation - confirm that this is intended.
    int simulationSampleSize = System.Math.Max(
        (int)System.Math.Max(10 / p_hat1, 10 / (1 - p_hat1)) * 2,
        (int)System.Math.Max(10 / p_hat2, 10 / (1 - p_hat2)) * 2);

    for (int i = 0; i < simulationCount; ++i)
    {
        int successCount1 = 0;
        int successCount2 = 0;
        for (int j = 0; j < simulationSampleSize; ++j)
        {
            // One trial per class; a draw not exceeding the point estimate counts as SUCCESS.
            if (DistributionModel.GetUniform() <= p_hat1)
            {
                successCount1++;
            }
            if (DistributionModel.GetUniform() <= p_hat2)
            {
                successCount2++;
            }
        }
        sim_sample1[i] = (double)(successCount1) / simulationSampleSize;
        sim_sample2[i] = (double)(successCount2) / simulationSampleSize;
    }

    double sim_mu1 = Mean.GetMean(sim_sample1);
    double sim_sigma1 = StdDev.GetStdDev(sim_sample1, sim_mu1);
    double sim_mu2 = Mean.GetMean(sim_sample2);
    double sim_sigma2 = StdDev.GetStdDev(sim_sample2, sim_mu2);

    double sim_mud = sim_mu1 - sim_mu2;
    double sim_SE = System.Math.Sqrt(sim_sigma1 * sim_sigma1 + sim_sigma2 * sim_sigma2);

    // Both bounds need a critical value obtained from a tail probability, exactly as in the
    // normal-approximation branch above. The lower bound previously called Gaussian.GetPercentile,
    // which made the two bounds inconsistent with each other and with the closed-form branch.
    return new double[] { sim_mud + Gaussian.GetQuantile(p1) * sim_SE, sim_mud + Gaussian.GetQuantile(p2) * sim_SE };
}
/// <summary>
/// Two-sided or one-sided test for a single median, based on a bootstrap distribution of medians.
///
/// Given that:
/// H_0 : median = null_value
/// H_A : median != null_value
///
/// The bootstrap medians are simulated from the original sample; their mean is used as the point
/// estimate and their standard deviation as the standard error. The decision and the p-value are
/// delegated to HypothesisTesting.RejectH0.
///
/// NOTE(review): originalSample.Length is passed to HypothesisTesting.RejectH0 together with a value
/// that is already a standard error - confirm that RejectH0 does not scale it by the sample size again.
/// </summary>
/// <param name="originalSample">The original sample</param>
/// <param name="bootstrapSampleCount">The number of bootstrap medians to simulate</param>
/// <param name="null_value">The median value stated by H_0</param>
/// <param name="pValue">Receives the p-value reported by HypothesisTesting.RejectH0</param>
/// <param name="significance_level">The significance level forwarded to HypothesisTesting.RejectH0</param>
/// <param name="one_sided">Forwarded to HypothesisTesting.RejectH0; presumably selects a one-sided test - confirm</param>
/// <returns>The reject / do-not-reject decision returned by HypothesisTesting.RejectH0</returns>
public static bool RejectH0_ForMedian(double[] originalSample, int bootstrapSampleCount, double null_value, out double pValue, double significance_level = 0.05, bool one_sided = false)
{
    double[] simulatedMedians = SimulateSampleMedians(originalSample, bootstrapSampleCount);

    double pointEstimate = Mean.GetMean(simulatedMedians);
    double standardError = StdDev.GetStdDev(simulatedMedians, pointEstimate);
    int sampleSize = originalSample.Length;

    bool reject = HypothesisTesting.RejectH0(pointEstimate, null_value, standardError, sampleSize, out pValue, significance_level, one_sided);
    return reject;
}
/// <summary>
/// Return the standard error of the sampling distribution of the difference between two population statistics var1 and var2, assuming var1 and var2 are independent.
/// Computed as sqrt(sigma_1^2 / n_1 + sigma_2^2 / n_2), where sigma_k is the standard deviation of sample k and n_k its size.
/// </summary>
/// <param name="sample_for_var1">random sample for var1</param>
/// <param name="sample_for_var2">random sample for var2</param>
/// <returns>The standard error of the difference between the two sample statistics</returns>
public static double GetStandardError(double[] sample_for_var1, double[] sample_for_var2)
{
    double mean1 = Mean.GetMean(sample_for_var1);
    double mean2 = Mean.GetMean(sample_for_var2);

    double stdDev1 = StdDev.GetStdDev(sample_for_var1, mean1);
    double stdDev2 = StdDev.GetStdDev(sample_for_var2, mean2);

    // Variance of each sample statistic; independent variances add up.
    double varianceTerm1 = stdDev1 * stdDev1 / sample_for_var1.Length;
    double varianceTerm2 = stdDev2 * stdDev2 / sample_for_var2.Length;

    return System.Math.Sqrt(varianceTerm1 + varianceTerm2);
}
/// <summary>
/// Return the confidence interval for the mean built from a bootstrap distribution of sample means.
/// The bootstrap means are simulated from the original sample; their mean is the centre of the
/// interval and their standard deviation is used as the standard error. The bounds are the centre
/// plus the Gaussian quantiles at (1 - confidence_level) / 2 and 1 - (1 - confidence_level) / 2,
/// each scaled by that standard error.
/// </summary>
/// <param name="originalSample">The original sample</param>
/// <param name="bootstrapSampleCount">The number of bootstrap means to simulate</param>
/// <param name="confidence_level">The given confidence level</param>
/// <returns>A two-element array { lower bound, upper bound }</returns>
public static double[] GetConfidenceIntervalForMean(double[] originalSample, int bootstrapSampleCount, double confidence_level)
{
    double[] simulatedMeans = SimulateSampleMeans(originalSample, bootstrapSampleCount);

    double centre = Mean.GetMean(simulatedMeans);
    double standardError = StdDev.GetStdDev(simulatedMeans, centre);

    double lowerTail = (1 - confidence_level) / 2;
    double upperTail = 1 - lowerTail;

    double lowerCritical = Gaussian.GetQuantile(lowerTail);
    double upperCritical = Gaussian.GetQuantile(upperTail);

    double[] interval = new double[2];
    interval[0] = centre + lowerCritical * standardError;
    interval[1] = centre + upperCritical * standardError;
    return interval;
}
/// <summary>
/// Return the mean and standard error of the sampling distribution of the statistic (a*x + b*y) for correlated random variables x and y.
///
/// result_mean = a * mean(x) + b * mean(y)
/// result_SE   = sqrt((a * sd(x))^2 / n_x + (b * sd(y))^2 / n_y + 2 * correlation * a * sd(x) * b * sd(y) / sqrt(n_x * n_y))
/// </summary>
/// <param name="x">random sample for random variable x</param>
/// <param name="y">random sample for random variable y</param>
/// <param name="x_coefficient">a which is the coefficient of x (an integer in this signature)</param>
/// <param name="y_coefficient">b which is the coefficient of y</param>
/// <param name="correlation">correlation between x and y</param>
/// <param name="result_mean">output mean for the a*x + b*y</param>
/// <param name="result_SE">output standard error for the a*x + b*y</param>
public static void Sum(double[] x, double[] y, int x_coefficient, double y_coefficient, double correlation, out double result_mean, out double result_SE)
{
    double mean_x = Mean.GetMean(x);
    double mean_y = Mean.GetMean(y);

    double stddev_x = StdDev.GetStdDev(x, mean_x);
    double stddev_y = StdDev.GetStdDev(y, mean_y);

    result_mean = x_coefficient * mean_x + y_coefficient * mean_y;

    // The product of the two lengths is formed in double precision: as an int multiplication it
    // overflows once n_x * n_y exceeds int.MaxValue, which corrupts the covariance term.
    double sizeProduct = (double)x.Length * y.Length;

    double variance_x_term = System.Math.Pow(x_coefficient * stddev_x, 2) / x.Length;
    double variance_y_term = System.Math.Pow(y_coefficient * stddev_y, 2) / y.Length;
    double covariance_term = 2 * correlation * x_coefficient * stddev_x * y_coefficient * stddev_y / System.Math.Sqrt(sizeProduct);

    result_SE = System.Math.Sqrt(variance_x_term + variance_y_term + covariance_term);
}
/// <summary>
/// Return the confidence interval for proportion of SUCCESS in the population at a given confidence level given the sample proportion point estimate.
///
/// The normal approximation (proportion plus critical value times standard error) is used when both the expected
/// success count and the expected failure count (truncated to integers) are at least 10 and simulation was not requested.
/// Otherwise sample proportions are simulated and the interval is built from their mean and standard deviation.
/// </summary>
/// <param name="proportion">sample proportion point estimate</param>
/// <param name="sampleSize">sample size</param>
/// <param name="confidence_level">The given confidence level</param>
/// <param name="useSimulation">Flag for whether simulation should be used even when the normal approximation would be allowed</param>
/// <param name="simulationCount">Number of simulated sample proportions generated when simulation is used</param>
/// <returns>confidence interval { lower, upper } for proportion of SUCCESS in the population at a given confidence level</returns>
public static double[] GetConfidenceInterval(double proportion, int sampleSize, double confidence_level, bool useSimulation = false, int simulationCount = 500)
{
    double standard_error = StandardError.GetStandardErrorForProportion(proportion, sampleSize);

    double p1 = (1 - confidence_level) / 2; // probability mass left of the lower bound
    double p2 = 1 - p1;                     // probability mass left of the upper bound

    int expected_success_count = (int)(proportion * sampleSize);
    int expected_failure_count = (int)((1 - proportion) * sampleSize);

    // If np < 10 or n(1-p) < 10, the CLT for proportions no longer holds and simulation
    // is used in place of the normal distribution.
    bool shouldUseSimulation = expected_failure_count < 10 || expected_success_count < 10 || useSimulation;

    if (!shouldUseSimulation)
    {
        double critical_value1 = Gaussian.GetQuantile(p1);
        double critical_value2 = Gaussian.GetQuantile(p2);

        double[] confidence_interval = new double[2];
        confidence_interval[0] = proportion + critical_value1 * standard_error;
        confidence_interval[1] = proportion + critical_value2 * standard_error;
        return confidence_interval;
    }

    double[] sampleProportions = new double[simulationCount];

    // The number of trials per simulated sample is derived from the point estimate only.
    // NOTE(review): sampleSize is not used by the simulation - confirm that this is intended.
    int simulationSampleSize = (int)System.Math.Max(10 / proportion, 10 / (1 - proportion)) * 2;

    for (int i = 0; i < simulationCount; ++i)
    {
        int successCount = 0;
        for (int j = 0; j < simulationSampleSize; ++j)
        {
            // A draw not exceeding the point estimate counts as SUCCESS.
            if (DistributionModel.GetUniform() <= proportion)
            {
                successCount++;
            }
        }
        sampleProportions[i] = (double)successCount / simulationSampleSize;
    }

    double proportion_mu = Mean.GetMean(sampleProportions);
    double proportion_sigma = StdDev.GetStdDev(sampleProportions, proportion_mu);

    // Both bounds need a critical value obtained from a tail probability, exactly as in the
    // normal-approximation branch above. The lower bound previously called Gaussian.GetPercentile,
    // which made the two bounds inconsistent with each other and with the closed-form branch.
    return new double[] { proportion_mu + Gaussian.GetQuantile(p1) * proportion_sigma, proportion_mu + Gaussian.GetQuantile(p2) * proportion_sigma };
}
/// <summary>
/// Calculate the confidence interval for the proportion of SUCCESS in the population at a given confidence level, given that the point estimate proportions are known from multiple groups.
///
/// Note that this is only for categorical variable with two levels : SUCCESS, FAILURE
///
/// The normal approximation is used when, for every group, the expected success count and the expected failure count
/// (truncated to integers) are at least 10 and simulation was not requested. Otherwise the groups are pooled into a
/// single proportion, sample proportions are simulated from it, and the interval is built from their mean and standard deviation.
/// </summary>
/// <param name="proportions">The point estimate proportion of SUCCESS obtained from multiple groups</param>
/// <param name="sampleSizes">The sample size of each group (indexed in parallel with proportions)</param>
/// <param name="confidence_level">The given confidence level</param>
/// <param name="useSimulation">Flag for whether simulation should be used even when the normal approximation would be allowed</param>
/// <param name="simulationCount">Number of simulated sample proportions generated when simulation is used</param>
/// <returns>The confidence interval { lower, upper } for the proportion of SUCCESS in the population at the given confidence level</returns>
public static double[] GetConfidenceInterval(double[] proportions, int[] sampleSizes, double confidence_level, bool useSimulation = false, int simulationCount = 500)
{
    double p1 = (1 - confidence_level) / 2; // probability mass left of the lower bound
    double p2 = 1 - p1;                     // probability mass left of the upper bound

    // Fall back to simulation as soon as one group has fewer than 10 expected successes or failures.
    bool shouldUseSimulation = useSimulation;
    if (!shouldUseSimulation)
    {
        for (int i = 0; i < sampleSizes.Length; ++i)
        {
            int n_i = sampleSizes[i];
            int expected_success_count = (int)(proportions[i] * n_i);
            int expected_failure_count = (int)((1 - proportions[i]) * n_i);
            if (expected_failure_count < 10 || expected_success_count < 10)
            {
                shouldUseSimulation = true;
                break;
            }
        }
    }

    if (!shouldUseSimulation)
    {
        double[] standardErrors = new double[proportions.Length];
        for (int i = 0; i < proportions.Length; ++i)
        {
            standardErrors[i] = StandardError.GetStandardErrorForProportion(proportions[i], sampleSizes[i]);
        }

        double standardError = StandardError.GetStandardErrorForWeightAverages(sampleSizes, standardErrors);
        double sampleMean = Mean.GetMeanForWeightedAverage(proportions, sampleSizes);

        double critical_value1 = Gaussian.GetQuantile(p1);
        double critical_value2 = Gaussian.GetQuantile(p2);

        double[] confidence_interval = new double[2];
        confidence_interval[0] = sampleMean + critical_value1 * standardError;
        confidence_interval[1] = sampleMean + critical_value2 * standardError;
        return confidence_interval;
    }

    // Pool all groups into one overall proportion of SUCCESS.
    double success_total = 0;
    double total_count = 0;
    for (int i = 0; i < sampleSizes.Length; ++i)
    {
        int n_i = sampleSizes[i];
        success_total += proportions[i] * n_i;
        total_count += n_i;
    }
    double p_hat = success_total / total_count;

    double[] sampleProportions = new double[simulationCount];

    // The number of trials per simulated sample is derived from the pooled proportion only.
    // NOTE(review): the group sample sizes are not used for the simulated sample size - confirm that this is intended.
    int simulationSampleSize = (int)System.Math.Max(10 / p_hat, 10 / (1 - p_hat)) * 2;

    for (int i = 0; i < simulationCount; ++i)
    {
        int successCount = 0;
        for (int j = 0; j < simulationSampleSize; ++j)
        {
            // A draw not exceeding the pooled proportion counts as SUCCESS.
            if (DistributionModel.GetUniform() <= p_hat)
            {
                successCount++;
            }
        }
        sampleProportions[i] = (double)successCount / simulationSampleSize;
    }

    double proportion_mu = Mean.GetMean(sampleProportions);
    double proportion_sigma = StdDev.GetStdDev(sampleProportions, proportion_mu);

    // Both bounds need a critical value obtained from a tail probability, exactly as in the
    // normal-approximation branch above. The lower bound previously called Gaussian.GetPercentile,
    // which made the two bounds inconsistent with each other and with the closed-form branch.
    return new double[] { proportion_mu + Gaussian.GetQuantile(p1) * proportion_sigma, proportion_mu + Gaussian.GetQuantile(p2) * proportion_sigma };
}