Exemplo n.º 1
0
        /// <summary>
        /// Build a bootstrap distribution of sample means starting from the raw original sample.
        /// The mean and standard deviation of the sample are computed here and forwarded, together with the
        /// sample length, to the (mean, standard deviation, size, count) overload of SimulateSampleMeans.
        /// </summary>
        /// <param name="originalSample">The original sample the bootstrap distribution is derived from</param>
        /// <param name="bootstrapSampleCount">The number of bootstrap samples collected to form the bootstrap distribution</param>
        /// <returns>Whatever the parameter-based SimulateSampleMeans overload returns for the computed summary statistics</returns>
        public static double[] SimulateSampleMeans(double[] originalSample, int bootstrapSampleCount)
        {
            double mean   = Mean.GetMean(originalSample);
            double stdDev = StdDev.GetStdDev(originalSample, mean);
            int    size   = originalSample.Length;

            double[] simulatedMeans = SimulateSampleMeans(mean, stdDev, size, bootstrapSampleCount);
            return simulatedMeans;
        }
        /// <summary>
        /// Return the correlation for observations (x_1, y_1), (x_2, y_2), ... (x_n, y_n), where n is the sample size.
        /// The value returned is the average product of standardized scores:
        /// correlation(x, y) = (1 / n) * sum_i(((x_i - mu_x) / sigma_x) * ((y_i - mu_y) / sigma_y))
        /// where mu_x and mu_y come from Mean.GetMean and sigma_x and sigma_y come from StdDev.GetStdDev.
        /// NOTE(review): this equals the Pearson coefficient
        /// sum_i((x_i - mu_x) * (y_i - mu_y)) / sqrt(sum_i((x_i - mu_x)^2) * sum_i((y_i - mu_y)^2))
        /// only if StdDev.GetStdDev divides by n (population form) - confirm against its implementation.
        /// </summary>
        /// <param name="observations">The observations (x_1, y_1), (x_2, y_2), ... (x_n, y_n), where n is the sample size</param>
        /// <returns>The correlation value for variable x and y</returns>
        public double GetCorrelation(Tuple <double, double>[] observations)
        {
            int count = observations.Length;

            // Split the pairs into two parallel arrays so the mean / std-dev helpers can be reused.
            double[] xs = new double[count];
            double[] ys = new double[count];
            int slot = 0;
            foreach (Tuple <double, double> pair in observations)
            {
                xs[slot] = pair.Item1;
                ys[slot] = pair.Item2;
                ++slot;
            }

            double meanX = Mean.GetMean(xs);
            double meanY = Mean.GetMean(ys);

            double spreadX = StdDev.GetStdDev(xs, meanX);
            double spreadY = StdDev.GetStdDev(ys, meanY);

            // Accumulate the product of the standardized x and y scores.
            double total = 0;
            for (int k = 0; k < count; ++k)
            {
                double zx = (xs[k] - meanX) / spreadX;
                double zy = (ys[k] - meanY) / spreadY;
                total += zx * zy;
            }

            return total / count;
        }
Exemplo n.º 3
0
        /// <summary>
        /// Return the standard error of the sampling distribution for a random sample.
        /// The sample's mean and standard deviation are computed here and the result is delegated to the
        /// (standard deviation, sample size) overload of GetStandardError.
        /// </summary>
        /// <param name="sample">The random sample given</param>
        /// <returns>The value produced by GetStandardError(sample standard deviation, sample size)</returns>
        public static double GetStandardError(double[] sample)
        {
            double mean   = Mean.GetMean(sample);
            double spread = StdDev.GetStdDev(sample, mean);
            int    size   = sample.Length;

            return GetStandardError(spread, size);
        }
Exemplo n.º 4
0
        /// <summary>
        /// Get the confidence interval of a continuous variable for a random sample.
        /// The sample's mean and standard deviation are computed here; the interval itself is produced by the
        /// (mean, standard deviation, size, confidence level, useStudentT) overload.
        /// </summary>
        /// <param name="sample">The random sample</param>
        /// <param name="confidence_level">The confidence level forwarded to the summary-statistics overload</param>
        /// <param name="useStudentT">Forwarded unchanged to the summary-statistics overload</param>
        /// <returns>The array returned by the summary-statistics overload</returns>
        public static double[] GetConfidenceInterval(double[] sample, double confidence_level, bool useStudentT = false)
        {
            double mean   = Mean.GetMean(sample);
            double spread = StdDev.GetStdDev(sample, mean);
            int    size   = sample.Length;

            double[] interval = GetConfidenceInterval(mean, spread, size, confidence_level, useStudentT);
            return interval;
        }
Exemplo n.º 5
0
        /// <summary>
        /// Two-sided or one-sided test for whether the statistics of two paired (dependent) variables are equal in the true population.
        ///
        /// Hypotheses:
        /// H_0: mu_var1 - mu_var2 = 0
        /// H_1: mu_var1 - mu_var2 != 0
        ///
        /// The test works on the per-pair differences (var1 - var2): the point estimate is their mean, the
        /// standard error comes from StandardError.GetStandardError, and the test statistic is
        /// |point_estimate - 0| / SE.
        ///
        /// p-value = (1 - percentile(test_statistic)) * (1 if one-sided, otherwise 2)
        /// </summary>
        /// <param name="sample_for_paired_data">a random sample consisting of data paired together, var1 and var2; var1 and var2 are not independent</param>
        /// <param name="pValue">output: the computed p-value</param>
        /// <param name="significance_level">threshold the p-value is compared against</param>
        /// <param name="one_sided">True if the test is one-sided</param>
        /// <param name="useStudentT">True to force Student's t distribution even when there are 30 or more pairs</param>
        /// <returns>True if the p-value is strictly below the significance level (H_0 is rejected)</returns>
        public bool RejectH0_PairedData(Tuple <double, double>[] sample_for_paired_data, out double pValue, double significance_level = 0.05, bool one_sided = false, bool useStudentT = false)
        {
            int pairCount = sample_for_paired_data.Length;

            // Reduce the paired data to a single sample of differences.
            double[] differences = new double[pairCount];
            int slot = 0;
            foreach (Tuple <double, double> pair in sample_for_paired_data)
            {
                differences[slot] = pair.Item1 - pair.Item2;
                ++slot;
            }

            double meanDifference = Mean.GetMean(differences);
            double nullValue      = 0;
            double standardError  = StandardError.GetStandardError(differences);
            double statistic      = System.Math.Abs(meanDifference - nullValue) / standardError;

            // With fewer than 30 pairs (or on request) Student's t with n - 1 degrees of freedom replaces the normal distribution.
            bool   studentT   = pairCount < 30 || useStudentT;
            double cumulative = studentT
                ? StudentT.GetPercentile(statistic, pairCount - 1)
                : Gaussian.GetPercentile(statistic);

            int tails = one_sided ? 1 : 2;
            pValue = (1 - cumulative) * tails;

            return pValue < significance_level;
        }
Exemplo n.º 6
0
        /// <summary>
        /// Return the confidence interval of the difference between two classes in terms of the proportion of SUCCESS in the population at a given confidence level
        ///
        /// Note that each class should be a categorical variable with two levels : {SUCCESS, FAILURE}
        /// Note that class 1 and class 2 are not paired or dependent
        ///
        /// Simulation is used when requested, and also automatically when any of
        /// p_hat1 * n1, (1 - p_hat1) * n1, p_hat2 * n2, (1 - p_hat2) * n2 is below 10.
        /// </summary>
        /// <param name="p_hat1">point estimate of the proportion of SUCCESS in class 1</param>
        /// <param name="p_hat2">point estimate of the proportion of SUCCESS in class 2</param>
        /// <param name="n1">sample size in class 1</param>
        /// <param name="n2">sample size in class 2</param>
        /// <param name="confidence_level">The given confidence level</param>
        /// <param name="useSimulation">Flag for whether simulation should be used instead of the normal distribution for proportion of SUCCESS</param>
        /// <param name="simulationCount">Number of simulated sample proportions generated per class when simulation is used</param>
        /// <returns>The confidence interval {lower, upper} of the difference between two classes in terms of the proportion of SUCCESS</returns>
        public static double[] GetConfidenceInterval(double p_hat1, double p_hat2, int n1, int n2, double confidence_level, bool useSimulation = false, int simulationCount = 500)
        {
            // Tail probabilities of the two interval bounds.
            double p1 = (1 - confidence_level) / 2;
            double p2 = 1 - p1;

            // Fall back to simulation when fewer than 10 expected successes or failures exist in either class.
            bool shouldUseSimulation = useSimulation
                                       || p_hat1 * n1 < 10 || (1 - p_hat1) * n1 < 10
                                       || p_hat2 * n2 < 10 || (1 - p_hat2) * n2 < 10;

            if (shouldUseSimulation)
            {
                double[] sim_sample1 = new double[simulationCount]; // simulated sample proportions for class 1
                double[] sim_sample2 = new double[simulationCount]; // simulated sample proportions for class 2

                // Simulated sample size: twice the larger of 10 / p and 10 / (1 - p), maximised over both classes.
                int simulationSampleSize = System.Math.Max((int)System.Math.Max(10 / p_hat1, 10 / (1 - p_hat1)) * 2, (int)System.Math.Max(10 / p_hat2, 10 / (1 - p_hat2)) * 2);

                for (int i = 0; i < simulationCount; ++i)
                {
                    int successCount1 = 0;
                    int successCount2 = 0;
                    for (int j = 0; j < simulationSampleSize; ++j)
                    {
                        if (DistributionModel.GetUniform() <= p_hat1)
                        {
                            successCount1++;
                        }
                        if (DistributionModel.GetUniform() <= p_hat2)
                        {
                            successCount2++;
                        }
                    }
                    sim_sample1[i] = (double)(successCount1) / simulationSampleSize;
                    sim_sample2[i] = (double)(successCount2) / simulationSampleSize;
                }

                double sim_mu1    = Mean.GetMean(sim_sample1);
                double sim_sigma1 = StdDev.GetStdDev(sim_sample1, sim_mu1);

                double sim_mu2    = Mean.GetMean(sim_sample2);
                double sim_sigma2 = StdDev.GetStdDev(sim_sample2, sim_mu2);

                double sim_mud = sim_mu1 - sim_mu2;
                double sim_SE  = System.Math.Sqrt(sim_sigma1 * sim_sigma1 + sim_sigma2 * sim_sigma2);

                // Fix: the lower bound previously called Gaussian.GetPercentile(p1); both bounds must use the same
                // quantile lookup, exactly as the normal-approximation branch below does.
                return(new double[] { sim_mud + Gaussian.GetQuantile(p1) * sim_SE, sim_mud + Gaussian.GetQuantile(p2) * sim_SE });
            }
            else
            {
                double SE = System.Math.Sqrt((p_hat1 * (1 - p_hat1) / n1 + (p_hat2 * (1 - p_hat2)) / n2));

                double pd_hat = p_hat1 - p_hat2;

                return(new double[] { pd_hat + Gaussian.GetQuantile(p1) * SE, pd_hat + Gaussian.GetQuantile(p2) * SE });
            }
        }
Exemplo n.º 7
0
        /// <summary>
        /// Two-sided or one-sided test for a single median
        ///
        /// Given that:
        /// H_0 : median = null_value
        /// H_A : median != null_value
        ///
        /// The medians obtained from SimulateSampleMedians are summarised by their mean and standard deviation,
        /// and the decision is delegated to HypothesisTesting.RejectH0.
        /// NOTE(review): the standard deviation of the simulated medians is passed on together with the original
        /// sample size - confirm that HypothesisTesting.RejectH0 treats that argument as a standard error and
        /// does not scale it by the sample size again.
        /// </summary>
        /// <param name="originalSample">The original sample</param>
        /// <param name="bootstrapSampleCount">The number of simulated medians requested from SimulateSampleMedians</param>
        /// <param name="null_value">The hypothesised median under H_0</param>
        /// <param name="pValue">output: the p-value produced by HypothesisTesting.RejectH0</param>
        /// <param name="significance_level">Forwarded to HypothesisTesting.RejectH0</param>
        /// <param name="one_sided">Forwarded to HypothesisTesting.RejectH0</param>
        /// <returns>The decision returned by HypothesisTesting.RejectH0</returns>
        public static bool RejectH0_ForMedian(double[] originalSample, int bootstrapSampleCount, double null_value, out double pValue, double significance_level = 0.05, bool one_sided = false)
        {
            double[] medians = SimulateSampleMedians(originalSample, bootstrapSampleCount);
            double   center  = Mean.GetMean(medians);
            double   spread  = StdDev.GetStdDev(medians, center);
            int      size    = originalSample.Length;

            bool rejected = HypothesisTesting.RejectH0(center, null_value, spread, size, out pValue, significance_level, one_sided);
            return rejected;
        }
Exemplo n.º 8
0
        /// <summary>
        /// Return the confidence interval of the population mean at a given confidence level, given the point estimate sample means known from multiple groups / classes
        ///
        /// Note that each class should be a continuous variable.
        ///
        /// Student's t distribution is used when requested or when any group has fewer than 30 observations;
        /// its degrees of freedom are (smallest group size - 1). Otherwise the normal distribution is used.
        /// </summary>
        /// <param name="sampleMeans">point estimate sample means from different groups / classes</param>
        /// <param name="sampleStdDev">point estimate sample standard deviations from different groups / classes</param>
        /// <param name="sampleSizes">sample size from different classes</param>
        /// <param name="confidence_level">The given confidence level</param>
        /// <param name="useStudentT">whether Student's t should be used for the critical values</param>
        /// <returns>The confidence interval {lower, upper} of the population mean at the given confidence level</returns>
        public static double[] GetConfidenceInterval(double[] sampleMeans, double[] sampleStdDev, int[] sampleSizes, double confidence_level, bool useStudentT = false)
        {
            // Standard error of each group, then combined across groups.
            int      groupCount = sampleMeans.Length;
            double[] perGroupSE = new double[groupCount];
            for (int g = 0; g < groupCount; ++g)
            {
                perGroupSE[g] = StandardError.GetStandardError(sampleStdDev[g], sampleSizes[g]);
            }

            double combinedSE   = StandardError.GetStandardErrorForWeightAverages(sampleSizes, perGroupSE);
            double combinedMean = Mean.GetMeanForWeightedAverage(sampleMeans, sampleSizes);

            double lowerTail = (1 - confidence_level) / 2.0;
            double upperTail = 1 - lowerTail;

            // One scan finds the smallest group, which decides both the distribution and the degrees of freedom.
            int smallestSize = int.MaxValue;
            foreach (int size in sampleSizes)
            {
                if (size < smallestSize)
                {
                    smallestSize = size;
                }
            }

            double lowerCritical;
            double upperCritical;
            if (useStudentT || smallestSize < 30)
            {
                int degreesOfFreedom = smallestSize - 1;
                lowerCritical = StudentT.GetQuantile(lowerTail, degreesOfFreedom);
                upperCritical = StudentT.GetQuantile(upperTail, degreesOfFreedom);
            }
            else
            {
                lowerCritical = Gaussian.GetQuantile(lowerTail);
                upperCritical = Gaussian.GetQuantile(upperTail);
            }

            return new double[]
            {
                combinedMean + lowerCritical * combinedSE,
                combinedMean + upperCritical * combinedSE
            };
        }
Exemplo n.º 9
0
        /// <summary>
        /// Return the standard error of the sampling distribution of the difference between two population statistics var1 and var2, assuming var1 and var2 are independent.
        /// Computed as sqrt(sigma_1^2 / n_1 + sigma_2^2 / n_2), where sigma_k is StdDev.GetStdDev of sample k and n_k its length.
        /// </summary>
        /// <param name="sample_for_var1">random sample for var1</param>
        /// <param name="sample_for_var2">random sample for var2</param>
        /// <returns>The standard error of the difference between the two sample statistics</returns>
        public static double GetStandardError(double[] sample_for_var1, double[] sample_for_var2)
        {
            double mean1 = Mean.GetMean(sample_for_var1);
            double mean2 = Mean.GetMean(sample_for_var2);

            double spread1 = StdDev.GetStdDev(sample_for_var1, mean1);
            double spread2 = StdDev.GetStdDev(sample_for_var2, mean2);

            // Variance contribution of each sample mean.
            double term1 = spread1 * spread1 / sample_for_var1.Length;
            double term2 = spread2 * spread2 / sample_for_var2.Length;

            return System.Math.Sqrt(term1 + term2);
        }
Exemplo n.º 10
0
        /// <summary>
        /// Compute the mean of every group in a grouped sample.
        /// </summary>
        /// <param name="groupSample">The sample values keyed by group id</param>
        /// <returns>A new dictionary mapping each group id to Mean.GetMean of that group's values</returns>
        public static Dictionary <int, double> GetMeanWithinGroup(Dictionary <int, List <double> > groupSample)
        {
            Dictionary <int, double> result = new Dictionary <int, double>();

            foreach (KeyValuePair <int, List <double> > entry in groupSample)
            {
                result[entry.Key] = Mean.GetMean(entry.Value);
            }

            return result;
        }
        /// <summary>
        /// Pairwise comparison of group1 and group2.
        /// The t statistic is obtained from GetTStatistic using the two group means, the two group sizes,
        /// a null value of 0 and the MSE of the supplied ANOVA; the p-value is then obtained from GetPValue
        /// using the ANOVA's dfE.
        /// </summary>
        /// <param name="group1">random sample from class 1</param>
        /// <param name="group2">random sample from class 2</param>
        /// <param name="anova">parameters obtained after ANOVA (MSE and dfE are read)</param>
        /// <returns>p-value = P(observed or more extreme values | H_0 is true)</returns>
        public static double PairwiseCompare(List <double> group1, List <double> group2, ANOVA anova)
        {
            double mean1 = Mean.GetMean(group1);
            double mean2 = Mean.GetMean(group2);

            int nullValue = 0;

            double tStatistic = GetTStatistic(mean1, mean2, group1.Count, group2.Count, nullValue, anova.MSE);

            return GetPValue(tStatistic, anova.dfE);
        }
Exemplo n.º 12
0
        /// <summary>
        /// Return the sum of squares group (SSG)
        ///
        /// SSG measures the variability between groups.
        /// This is also known as explained variability: the squared deviation of each group mean from the overall mean, weighted by the group's sample size.
        /// </summary>
        /// <param name="groupedSample">The sample grouped based on the classes</param>
        /// <param name="grand_mean">The overall mean that each group mean is compared against</param>
        /// <returns>sum over groups of group_size * (group_mean - grand_mean)^2</returns>
        public static double GetSSG(Dictionary <int, List <double> > groupedSample, double grand_mean)
        {
            double total = 0;

            foreach (List <double> members in groupedSample.Values)
            {
                double deviation = Mean.GetMean(members) - grand_mean;
                double weight    = members.Count;
                total += weight * deviation * deviation;
            }

            return total;
        }
Exemplo n.º 13
0
        /// <summary>
        /// Return the confidence interval for the mean based on simulated sample means.
        /// The interval is centred on the mean of the simulated means and uses their standard deviation as the
        /// standard error, with critical values taken from Gaussian.GetQuantile.
        /// </summary>
        /// <param name="originalSample">The original sample</param>
        /// <param name="bootstrapSampleCount">The number of simulated sample means to generate</param>
        /// <param name="confidence_level">The given confidence level</param>
        /// <returns>The confidence interval {lower, upper}</returns>
        public static double[] GetConfidenceIntervalForMean(double[] originalSample, int bootstrapSampleCount, double confidence_level)
        {
            double[] simulatedMeans = SimulateSampleMeans(originalSample, bootstrapSampleCount);
            double   center         = Mean.GetMean(simulatedMeans);
            double   spread         = StdDev.GetStdDev(simulatedMeans, center);

            double lowerTail = (1 - confidence_level) / 2;
            double upperTail = 1 - lowerTail;

            double lowerBound = center + Gaussian.GetQuantile(lowerTail) * spread;
            double upperBound = center + Gaussian.GetQuantile(upperTail) * spread;

            return new double[] { lowerBound, upperBound };
        }
Exemplo n.º 14
0
        /// <summary>
        /// Compute the mean and standard error of the population statistic (a*x + b*y) for correlated random variables x and y.
        ///
        /// result_mean = a * mean(x) + b * mean(y)
        /// result_SE   = sqrt((a * sigma_x)^2 / n_x + (b * sigma_y)^2 / n_y + 2 * correlation * a * sigma_x * b * sigma_y / sqrt(n_x * n_y))
        /// </summary>
        /// <param name="x">random sample for random variable x</param>
        /// <param name="y">random sample for random variable y</param>
        /// <param name="x_coefficient">a which is the coefficient of x (note: declared as an integer, unlike y_coefficient)</param>
        /// <param name="y_coefficient">b which is the coefficient of y</param>
        /// <param name="correlation">correlation between x and y</param>
        /// <param name="result_mean">output mean for the a*x + b*y</param>
        /// <param name="result_SE">output standard error for the a*x + b*y</param>
        public static void Sum(double[] x, double[] y, int x_coefficient, double y_coefficient, double correlation, out double result_mean, out double result_SE)
        {
            double mean_x = Mean.GetMean(x);
            double mean_y = Mean.GetMean(y);

            double stddev_x = StdDev.GetStdDev(x, mean_x);
            double stddev_y = StdDev.GetStdDev(y, mean_y);

            // Fix: multiply the lengths as doubles. The former int * int product wraps around once
            // x.Length * y.Length exceeds int.MaxValue, which fed a wrong (possibly negative) value to Sqrt.
            double sizeProduct = (double)x.Length * y.Length;

            result_mean = x_coefficient * mean_x + y_coefficient * mean_y;
            result_SE   = System.Math.Sqrt(System.Math.Pow(x_coefficient * stddev_x, 2) / x.Length + System.Math.Pow(y_coefficient * stddev_y, 2) / y.Length + 2 * correlation * x_coefficient * stddev_x * y_coefficient * stddev_y / System.Math.Sqrt(sizeProduct));
        }
Exemplo n.º 15
0
        /// <summary>
        /// Return the sum of squares total (SST)
        ///
        /// SST measures the total variability in the response variable
        /// </summary>
        /// <param name="totalSample">all the data points in the sample containing all classes</param>
        /// <param name="grand_mean">output: the mean of all the data points in the sample containing all classes</param>
        /// <returns>The sum of squares total, i.e. the sum of squared deviations of every data point from the grand mean</returns>
        public static double GetSST(double[] totalSample, out double grand_mean)
        {
            grand_mean = Mean.GetMean(totalSample);

            double total = 0;
            foreach (double value in totalSample)
            {
                double deviation = value - grand_mean;
                total += deviation * deviation;
            }

            return total;
        }
Exemplo n.º 16
0
        /// <summary>
        /// Return a set of simulated sample means that form the bootstrap distribution for means.
        /// Each simulated sample is filled with originalSampleSize values obtained from a Gaussian constructed
        /// with the original sample's mean and standard deviation (presumably random draws - confirm against
        /// Gaussian.Next), and its mean is recorded.
        /// </summary>
        /// <param name="originalSampleMean">point estimate of sample mean from the original sample</param>
        /// <param name="originalSampleStdDev">standard deviation of the original sample</param>
        /// <param name="originalSampleSize">size of the original sample; also the size of every simulated sample</param>
        /// <param name="bootstrapSampleCount">The number of bootstrap samples collected to form the bootstrap distribution</param>
        /// <returns>An array of bootstrapSampleCount simulated sample means</returns>
        public static double[] SimulateSampleMeans(double originalSampleMean, double originalSampleStdDev, int originalSampleSize, int bootstrapSampleCount)
        {
            Gaussian source = new Gaussian(originalSampleMean, originalSampleStdDev);

            double[] means = new double[bootstrapSampleCount];

            // A single buffer is reused for every simulated sample; it is fully overwritten each round.
            double[] draws = new double[originalSampleSize];

            int round = 0;
            while (round < bootstrapSampleCount)
            {
                for (int slot = 0; slot < draws.Length; ++slot)
                {
                    draws[slot] = source.Next();
                }

                means[round] = Mean.GetMean(draws);
                ++round;
            }

            return means;
        }
Exemplo n.º 17
0
        /// <summary>
        /// Return the confidence interval for proportion of SUCCESS in the population at a given confidence level given the sample proportion point estimate
        ///
        /// Simulation is used when requested, and also automatically when the expected success count
        /// (int)(proportion * sampleSize) or the expected failure count (int)((1 - proportion) * sampleSize) is below 10.
        /// </summary>
        /// <param name="proportion">sample proportion point estimate</param>
        /// <param name="sampleSize">sample size</param>
        /// <param name="confidence_level">The given confidence level</param>
        /// <param name="useSimulation">Flag for whether simulation should be used instead of the normal distribution</param>
        /// <param name="simulationCount">Number of simulated sample proportions generated when simulation is used</param>
        /// <returns>confidence interval {lower, upper} for proportion of SUCCESS in the population at a given confidence level</returns>
        public static double[] GetConfidenceInterval(double proportion, int sampleSize, double confidence_level, bool useSimulation = false, int simulationCount = 500)
        {
            double standard_error = StandardError.GetStandardErrorForProportion(proportion, sampleSize);

            // Tail probabilities of the two interval bounds.
            double p1 = (1 - confidence_level) / 2;
            double p2 = 1 - p1;

            int expected_success_count = (int)(proportion * sampleSize);
            int expected_failure_count = (int)((1 - proportion) * sampleSize);

            if (expected_failure_count < 10 || expected_success_count < 10 || useSimulation) //if np < 10 or n(1-p) < 10, then CLT for proportion no longer holds and simulation should be used in place of the normal distribution
            {
                double[] sampleProportions    = new double[simulationCount];
                // Simulated sample size: twice the larger of 10 / p and 10 / (1 - p).
                int      simulationSampleSize = (int)System.Math.Max(10 / proportion, 10 / (1 - proportion)) * 2;
                for (int i = 0; i < simulationCount; ++i)
                {
                    int successCount = 0;
                    for (int j = 0; j < simulationSampleSize; ++j)
                    {
                        if (DistributionModel.GetUniform() <= proportion)
                        {
                            successCount++;
                        }
                    }
                    sampleProportions[i] = (double)successCount / simulationSampleSize;
                }

                double proportion_mu    = Mean.GetMean(sampleProportions);
                double proportion_sigma = StdDev.GetStdDev(sampleProportions, proportion_mu);

                // Fix: the lower bound previously called Gaussian.GetPercentile(p1); both bounds must use the same
                // quantile lookup, exactly as the normal-approximation branch below does.
                return(new double[] { proportion_mu + Gaussian.GetQuantile(p1) * proportion_sigma, proportion_mu + Gaussian.GetQuantile(p2) * proportion_sigma });
            }
            else
            {
                double critical_value1 = Gaussian.GetQuantile(p1);
                double critical_value2 = Gaussian.GetQuantile(p2);

                double[] confidence_interval = new double[2];
                confidence_interval[0] = proportion + critical_value1 * standard_error;
                confidence_interval[1] = proportion + critical_value2 * standard_error;

                return(confidence_interval);
            }
        }
Exemplo n.º 18
0
        /// <summary>
        /// Two-sided or one-sided test for a single mean
        ///
        /// Given that:
        /// H_0 : mu = null_value
        /// H_A : mu != null_value
        ///
        /// The test statistic is |sample_mean - null_value| / SE, where SE comes from StandardError.GetStandardError.
        ///
        /// p-value = (1 - percentile(test_statistic)) * (1 if one-sided, otherwise 2)
        /// if(p-value &lt; significance_level) reject H_0
        /// </summary>
        /// <param name="sample">The random sample</param>
        /// <param name="null_value">The hypothesised population mean under H_0</param>
        /// <param name="pValue">output: the computed p-value</param>
        /// <param name="significance_level">threshold the p-value is compared against</param>
        /// <param name="one_sided">True if the test is one_sided</param>
        /// <param name="useStudentT">True to force Student's t distribution even when the sample has 30 or more values</param>
        /// <returns>True if the p-value is strictly below the significance level (H_0 is rejected)</returns>
        public static bool RejectH0(double[] sample, double null_value, out double pValue, double significance_level = 0.05, bool one_sided = false, bool useStudentT = false)
        {
            double pointEstimate  = Mean.GetMean(sample);
            double standardError  = StandardError.GetStandardError(sample);                      //SE is the estimated standard deviation of the sample mean
            double test_statistic = System.Math.Abs(pointEstimate - null_value) / standardError; //This assumes that H_0 is true, that is, the true population mean, mu = null_value

            double percentile = 0;

            if (sample.Length < 30 || useStudentT) //if sample size is smaller than 30, then CLT for population statistics such as sample mean no longer holds and Student's t distribution should be used in place of the normal distribution
            {
                percentile = StudentT.GetPercentile(test_statistic, sample.Length - 1);
            }
            else
            {
                percentile = Gaussian.GetPercentile(test_statistic);
            }

            // Fix: removed the redundant chained self-assignment (pValue = pValue = ...).
            pValue = (1 - percentile) * (one_sided ? 1 : 2);
            return(pValue < significance_level);
        }
Exemplo n.º 19
0
        /// <summary>
        /// Compute the per-group mean for every dimension of a multi-dimensional grouped sample.
        /// </summary>
        /// <param name="groupSampleWithDim">One grouped sample (group id to values) per dimension</param>
        /// <returns>
        /// A dictionary mapping each group id to an array with one mean per dimension. A group id that is
        /// missing from some dimension keeps 0 at that position. An empty input yields an empty dictionary.
        /// </returns>
        public static Dictionary <int, double[]> GetMeanWithinGroup(Dictionary <int, List <double> >[] groupSampleWithDim)
        {
            Dictionary <int, double[]> means = new Dictionary <int, double[]>();
            int D = groupSampleWithDim.Length;

            for (int d = 0; d < D; ++d)
            {
                foreach (KeyValuePair <int, List <double> > entry in groupSampleWithDim[d])
                {
                    // Allocate the per-group row on first sight. This avoids indexing element 0 of an empty
                    // array and avoids a KeyNotFoundException for group ids absent from the first dimension.
                    double[] groupMeans;
                    if (!means.TryGetValue(entry.Key, out groupMeans))
                    {
                        groupMeans       = new double[D];
                        means[entry.Key] = groupMeans;
                    }

                    groupMeans[d] = Mean.GetMean(entry.Value);
                }
            }
            return(means);
        }
Exemplo n.º 20
0
        /// <summary>
        /// Two-sided or one-sided test for whether the statistics of two independent variables are equal in the true population.
        ///
        /// Hypotheses:
        /// H_0: mu_var1 - mu_var2 = 0
        /// H_1: mu_var1 - mu_var2 != 0
        ///
        /// The point estimate is the difference of the two sample means, the standard error comes from
        /// StandardError.GetStandardError(sample_for_var1, sample_for_var2), and the test statistic is
        /// |point_estimate - 0| / SE.
        ///
        /// p-value = (1 - percentile(test_statistic)) * (1 if one-sided, otherwise 2)
        /// </summary>
        /// <param name="sample_for_var1">value sample for variable 1</param>
        /// <param name="sample_for_var2">value sample for variable 2</param>
        /// <param name="pValue">output: the computed p-value</param>
        /// <param name="significance_level">threshold the p-value is compared against</param>
        /// <param name="one_sided">True if the test is one-sided</param>
        /// <param name="useStudentT">True to force Student's t distribution even when both samples have 30 or more values</param>
        /// <returns>True if the p-value is strictly below the significance level (H_0 is rejected)</returns>
        public bool RejectH0(double[] sample_for_var1, double[] sample_for_var2, out double pValue, double significance_level = 0.05, bool one_sided = false, bool useStudentT = false)
        {
            double meanDifference = Mean.GetMean(sample_for_var1) - Mean.GetMean(sample_for_var2);
            double nullValue      = 0;
            double standardError  = StandardError.GetStandardError(sample_for_var1, sample_for_var2);
            double statistic      = System.Math.Abs(meanDifference - nullValue) / standardError;

            // With fewer than 30 values in either sample (or on request) Student's t replaces the normal distribution.
            bool studentT = sample_for_var1.Length < 30 || sample_for_var2.Length < 30 || useStudentT;

            double cumulative;
            if (studentT)
            {
                // Degrees of freedom: the smaller of the two (n - 1) values.
                int degreesOfFreedom = System.Math.Min(sample_for_var1.Length - 1, sample_for_var2.Length - 1);
                cumulative = StudentT.GetPercentile(statistic, degreesOfFreedom);
            }
            else
            {
                cumulative = Gaussian.GetPercentile(statistic);
            }

            int tails = one_sided ? 1 : 2;
            pValue = (1 - cumulative) * tails;

            return pValue < significance_level;
        }
Exemplo n.º 21
0
        /// <summary>
        /// Calculate the confidence interval for the proportion of SUCCESS in the population at a given confidence level, given the point estimate proportions known from multiple groups
        ///
        /// Note that this is only for categorical variable with two levels : SUCCESS, FAILURE
        ///
        /// Simulation is used when requested, and also automatically when any group has an expected success
        /// count (int)(p_i * n_i) or expected failure count (int)((1 - p_i) * n_i) below 10.
        /// </summary>
        /// <param name="proportions">The point estimate proportion of SUCCESS obtained from multiple groups</param>
        /// <param name="sampleSizes">The sample size of each group</param>
        /// <param name="confidence_level">The given confidence level</param>
        /// <param name="useSimulation">Flag for whether simulation should be used instead of the normal distribution</param>
        /// <param name="simulationCount">Number of simulated sample proportions generated when simulation is used</param>
        /// <returns>The confidence interval {lower, upper} for the proportion of SUCCESS in the population at the given confidence level</returns>
        public static double[] GetConfidenceInterval(double[] proportions, int[] sampleSizes, double confidence_level, bool useSimulation = false, int simulationCount = 500)
        {
            // Tail probabilities of the two interval bounds.
            double p1 = (1 - confidence_level) / 2;
            double p2 = 1 - p1;

            bool shouldUseSimulation = useSimulation;

            if (!shouldUseSimulation)
            {
                // Fall back to simulation as soon as one group has fewer than 10 expected successes or failures.
                for (int i = 0; i < sampleSizes.Length; ++i)
                {
                    int n_i = sampleSizes[i];
                    int expected_success_count = (int)(proportions[i] * n_i);
                    int expected_failure_count = (int)((1 - proportions[i]) * n_i);
                    if (expected_failure_count < 10 || expected_success_count < 10)
                    {
                        shouldUseSimulation = true;
                        break;
                    }
                }
            }

            if (shouldUseSimulation)
            {
                // Pool all groups into a single overall proportion of SUCCESS.
                double sucess_count = 0;
                double total_count  = 0;
                for (int i = 0; i < sampleSizes.Length; ++i)
                {
                    int n_i = sampleSizes[i];
                    sucess_count += proportions[i] * n_i;
                    total_count  += n_i;
                }

                double p_hat = sucess_count / total_count;

                double[] sampleProportions    = new double[simulationCount];
                // Simulated sample size: twice the larger of 10 / p and 10 / (1 - p).
                int      simulationSampleSize = (int)System.Math.Max(10 / p_hat, 10 / (1 - p_hat)) * 2;
                for (int i = 0; i < simulationCount; ++i)
                {
                    int successCount = 0;
                    for (int j = 0; j < simulationSampleSize; ++j)
                    {
                        if (DistributionModel.GetUniform() <= p_hat)
                        {
                            successCount++;
                        }
                    }
                    sampleProportions[i] = (double)successCount / simulationSampleSize;
                }

                double proportion_mu    = Mean.GetMean(sampleProportions);
                double proportion_sigma = StdDev.GetStdDev(sampleProportions, proportion_mu);

                // Fix: the lower bound previously called Gaussian.GetPercentile(p1); both bounds must use the same
                // quantile lookup, exactly as the normal-approximation branch below does.
                return(new double[] { proportion_mu + Gaussian.GetQuantile(p1) * proportion_sigma, proportion_mu + Gaussian.GetQuantile(p2) * proportion_sigma });
            }
            else
            {
                double[] standardErrors = new double[proportions.Length];
                for (int i = 0; i < proportions.Length; ++i)
                {
                    standardErrors[i] = StandardError.GetStandardErrorForProportion(proportions[i], sampleSizes[i]);
                }

                double standardError = StandardError.GetStandardErrorForWeightAverages(sampleSizes, standardErrors);

                double sampleMean = Mean.GetMeanForWeightedAverage(proportions, sampleSizes);

                double critical_value1 = Gaussian.GetQuantile(p1);
                double critical_value2 = Gaussian.GetQuantile(p2);

                double[] confidence_interval = new double[2];
                confidence_interval[0] = sampleMean + critical_value1 * standardError;
                confidence_interval[1] = sampleMean + critical_value2 * standardError;

                return(confidence_interval);
            }
        }