Ejemplo n.º 1
0
        /// <summary>
        /// Return the correlation for observations (x_1, y_1), (x_2, y_2), ... (x_n, y_n), where n is the sample size
        /// The correlation is computed as correlation(x, y) = sum_i((x_i - mu_x) * (y_i - mu_y)) / (sum_i((x_i - mu_x)^2) * sum_i((y_i - mu_y)^2))
        /// which can also be written as n * sum_i((x_i - mu_x) * (y_i - mu_y) / (sigma_x * sigma_y))
        /// where mu_x = sum_i(x_i) / n and sigma_x = sqrt(sum_i((x_i - mu_x)^2) / n)
        /// </summary>
        /// <param name="observations">The observations (x_1, y_1), (x_2, y_2), ... (x_n, y_n), where n is the sample size</param>
        /// <returns>The correlation value for variable x and y</returns>
        public double GetCorrelation(Tuple <double, double>[] observations)
        {
            int n = observations.Length;

            double[] x = new double[n];
            double[] y = new double[n];
            for (int i = 0; i < n; ++i)
            {
                x[i] = observations[i].Item1;
                y[i] = observations[i].Item2;
            }

            double mu_x = Mean.GetMean(x);
            double mu_y = Mean.GetMean(y);

            double sigma_x = StdDev.GetStdDev(x, mu_x);
            double sigma_y = StdDev.GetStdDev(y, mu_y);

            double sum = 0;

            for (int i = 0; i < n; ++i)
            {
                sum += ((x[i] - mu_x) / sigma_x) * ((y[i] - mu_y) / sigma_y);
            }
            return(sum / n);
        }
Ejemplo n.º 2
0
        /// <summary>
        /// Return a set of simulated bootstrap statistics that form the bootstrap distribution for medians via simulation given the original sample
        /// </summary>
        /// <param name="originalSample">The original sample</param>
        /// <param name="bootstrapSampleCount">The number of bootstrap samples collected to form the bootstrap distribution</param>
        /// <returns></returns>
        public static double[] SimulateSampleMedians(double[] originalSample, int bootstrapSampleCount)
        {
            double originalSampleMedian = Median.GetMedian(originalSample);
            double originalSampleStdDev = StdDev.GetStdDev(originalSample, originalSampleMedian);

            return(SimulateSampleMedians(originalSampleMedian, originalSampleStdDev, originalSample.Length, bootstrapSampleCount));
        }
Ejemplo n.º 3
0
        /// <summary>
        /// Get the confidence interval of a continuous variable for a random sample
        /// </summary>
        /// <param name="sample"></param>
        /// <param name="confidence_level"></param>
        /// <returns></returns>
        public static double[] GetConfidenceInterval(double[] sample, double confidence_level, bool useStudentT = false)
        {
            double sampleMean   = Mean.GetMean(sample);
            double sampleStdDev = StdDev.GetStdDev(sample, sampleMean);

            return(GetConfidenceInterval(sampleMean, sampleStdDev, sample.Length, confidence_level, useStudentT));
        }
Ejemplo n.º 4
0
        /// <summary>
        /// Return the standard error of the sampling distribution given a random sample
        /// </summary>
        /// <param name="sample">The random sample given</param>
        /// <returns>Standard error of a random sample, which is the standard deviation of the sample statistic normal distribution by CLT</returns>
        public static double GetStandardError(double[] sample)
        {
            double sampleMean   = Mean.GetMean(sample);
            double sampleStdDev = StdDev.GetStdDev(sample, sampleMean);

            return(GetStandardError(sampleStdDev, sample.Length));
        }
Ejemplo n.º 5
0
        /// <summary>
        /// Return the confidence interval of the difference between two classes in terms of the proportion of SUCCESS in the population at a given confidence level
        ///
        /// Note that each class should be a categorical variable with two levels : {SUCCESS, FAILURE}
        /// Note that class 1 and class 2 are not paired or dependent
        /// </summary>
        /// <param name="p_hat1">point estimate of the proportion of SUCCESS in class 1</param>
        /// <param name="p_hat2">point estimate of the proportion of SUCCESS in class 2</param>
        /// <param name="n1">sample size in class 1</param>
        /// <param name="n2">sample size in class 2</param>
        /// <param name="confidence_level">The given confidence level</param>
        /// <param name="useSimulation">Flag for whether simulation should be used instead of the normal distribution for proportion of SUCCESS</param>
        /// <returns>The confidence interval of the difference between two classes in terms of the proportion of SUCCESS</returns>
        public static double[] GetConfidenceInterval(double p_hat1, double p_hat2, int n1, int n2, double confidence_level, bool useSimulation = false, int simulationCount = 500)
        {
            bool shouldUseSimulation = useSimulation;

            double p1 = (1 - confidence_level) / 2;
            double p2 = 1 - p1;

            if (!shouldUseSimulation && (p_hat1 * n1 < 10 || (1 - p_hat1) * n1 < 10 || p_hat2 * n2 < 10 || (1 - p_hat2) * n2 < 10))
            {
                shouldUseSimulation = true;
            }

            if (shouldUseSimulation)
            {
                double[] sim_sample1 = new double[simulationCount]; // this will follow a normal distribution based on CTL for proportion
                double[] sim_sample2 = new double[simulationCount]; // this will follow a normal distribution based on CLT for proportion

                int simulationSampleSize = System.Math.Max((int)System.Math.Max(10 / p_hat1, 10 / (1 - p_hat1)) * 2, (int)System.Math.Max(10 / p_hat2, 10 / (1 - p_hat2)) * 2);

                for (int i = 0; i < simulationCount; ++i)
                {
                    int successCount1 = 0;
                    int successCount2 = 0;
                    for (int j = 0; j < simulationSampleSize; ++j)
                    {
                        if (DistributionModel.GetUniform() <= p_hat1)
                        {
                            successCount1++;
                        }
                        if (DistributionModel.GetUniform() <= p_hat2)
                        {
                            successCount2++;
                        }
                    }
                    sim_sample1[i] = (double)(successCount1) / simulationSampleSize;
                    sim_sample2[i] = (double)(successCount2) / simulationSampleSize;
                }

                double sim_mu1    = Mean.GetMean(sim_sample1);
                double sim_sigma1 = StdDev.GetStdDev(sim_sample1, sim_mu1);

                double sim_mu2    = Mean.GetMean(sim_sample2);
                double sim_sigma2 = StdDev.GetStdDev(sim_sample2, sim_mu2);

                double sim_mud = sim_mu1 - sim_mu2;
                double sim_SE  = System.Math.Sqrt(sim_sigma1 * sim_sigma1 + sim_sigma2 * sim_sigma2);

                return(new double[] { sim_mud + Gaussian.GetPercentile(p1) * sim_SE, sim_mud + Gaussian.GetQuantile(p2) * sim_SE });
            }
            else
            {
                double SE = System.Math.Sqrt((p_hat1 * (1 - p_hat1) / n1 + (p_hat2 * (1 - p_hat2)) / n2));

                double pd_hat = p_hat1 - p_hat2;


                return(new double[] { pd_hat + Gaussian.GetQuantile(p1) * SE, pd_hat + Gaussian.GetQuantile(p2) * SE });
            }
        }
Ejemplo n.º 6
0
        /// <summary>
        /// Two-sided or one-sided test for a single median
        ///
        /// Given that:
        /// H_0 : median = null_value
        /// H_A : median != null_value
        ///
        /// By Central Limit Theorem:
        /// sample_median ~ N(mu, SE)
        ///
        /// p-value = (sample_median is at least ||null_value - point_estimate|| away from the null_value) | median = null_value)
        /// if(p-value < significance_level) reject H_0
        /// </summary>
        /// <param name="originalSample">The original sample</param>
        /// <param name="bootstrapSampleCount"></param>
        /// <param name="null_value"></param>
        /// <param name="significance_level"></param>
        /// <param name="one_sided"></param>
        /// <returns></returns>
        public static bool RejectH0_ForMedian(double[] originalSample, int bootstrapSampleCount, double null_value, out double pValue, double significance_level = 0.05, bool one_sided = false)
        {
            double[] bootstrapMedians = SimulateSampleMedians(originalSample, bootstrapSampleCount);
            double   bootstrap_mean   = Mean.GetMean(bootstrapMedians);
            double   bootstrap_SE     = StdDev.GetStdDev(bootstrapMedians, bootstrap_mean);

            return(HypothesisTesting.RejectH0(bootstrap_mean, null_value, bootstrap_SE, originalSample.Length, out pValue, significance_level, one_sided));
        }
Ejemplo n.º 7
0
        /// <summary>
        /// Return the standard error of the sampling distribution of the difference between two population statistics var1 and var2, assuming var1 and var2 are independent
        /// </summary>
        /// <param name="sample_for_var1">random sample for var1</param>
        /// <param name="sample_for_var2">random sample for var2</param>
        /// <returns>Standard error of a random sample, which is the standard deviation of the sample statistic normal distribution by CLT</returns>
        public static double GetStandardError(double[] sample_for_var1, double[] sample_for_var2)
        {
            double mu_for_var1 = Mean.GetMean(sample_for_var1);
            double mu_for_var2 = Mean.GetMean(sample_for_var2);

            double sigma_for_var1 = StdDev.GetStdDev(sample_for_var1, mu_for_var1);
            double sigma_for_var2 = StdDev.GetStdDev(sample_for_var2, mu_for_var2);

            return(System.Math.Sqrt(sigma_for_var1 * sigma_for_var1 / sample_for_var1.Length + sigma_for_var2 * sigma_for_var2 / sample_for_var2.Length));
        }
Ejemplo n.º 8
0
        public static double[] GetConfidenceIntervalForMean(double[] originalSample, int bootstrapSampleCount, double confidence_level)
        {
            double[] bootstrapMeans = SimulateSampleMeans(originalSample, bootstrapSampleCount);
            double   bootstrap_mean = Mean.GetMean(bootstrapMeans);
            double   bootstrap_SE   = StdDev.GetStdDev(bootstrapMeans, bootstrap_mean);

            double p1 = (1 - confidence_level) / 2;
            double p2 = 1 - p1;

            double z1 = Gaussian.GetQuantile(p1);
            double z2 = Gaussian.GetQuantile(p2);

            return(new double[] { bootstrap_mean + z1 * bootstrap_SE, bootstrap_mean + z2 * bootstrap_SE });
        }
Ejemplo n.º 9
0
        /// <summary>
        /// Return the NormalTable distribution of population statistic (a*x + b*y) for correlated random variables x and y
        /// </summary>
        /// <param name="x">random sample for random variable x</param>
        /// <param name="y">random sample for random variable y</param>
        /// <param name="x_coefficient">a which is the coefficient of x</param>
        /// <param name="y_coefficient">b which is the coefficient of y</param>
        /// <param name="correlation">correlation between x and y</param>
        /// <param name="result_mean">output mean for the a*x + b*y</param>
        /// <param name="result_SE">output standard error for the a*x + b*y</param>
        public static void Sum(double[] x, double[] y, int x_coefficient, double y_coefficient, double correlation, out double result_mean, out double result_SE)
        {
            result_mean = 0;
            result_SE   = 0;

            double mean_x = Mean.GetMean(x);
            double mean_y = Mean.GetMean(y);

            double stddev_x = StdDev.GetStdDev(x, mean_x);
            double stddev_y = StdDev.GetStdDev(y, mean_y);

            result_mean = x_coefficient * mean_x + y_coefficient * mean_y;
            result_SE   = System.Math.Sqrt(System.Math.Pow(x_coefficient * stddev_x, 2) / x.Length + System.Math.Pow(y_coefficient * stddev_y, 2) / y.Length + 2 * correlation * x_coefficient * stddev_x * y_coefficient * stddev_y / System.Math.Sqrt(x.Length * y.Length));
        }
Ejemplo n.º 10
0
        /// <summary>
        /// Return the confidence interval for proportion of SUCCESS in the population at a given confidence level given the sample proportion point estimate
        /// </summary>
        /// <param name="proportion">sample proportion point estimate</param>
        /// <param name="sampleSize">sample size</param>
        /// <param name="confidence_level"></param>
        /// <returns>confidence interval for proportion of SUCCESS in the population at a given confidence level</returns>
        public static double[] GetConfidenceInterval(double proportion, int sampleSize, double confidence_level, bool useSimulation = false, int simulationCount = 500)
        {
            double standard_error = StandardError.GetStandardErrorForProportion(proportion, sampleSize);

            double p1 = (1 - confidence_level) / 2;
            double p2 = 1 - p1;

            int expected_success_count = (int)(proportion * sampleSize);
            int expected_failure_count = (int)((1 - proportion) * sampleSize);

            if (expected_failure_count < 10 || expected_success_count < 10 || useSimulation) //if np < 10 or n(1-p) < 10, then CLT for proportion no longer holds and simulation should be used in place of the normal distribution
            {
                double[] sampleProportions    = new double[simulationCount];
                int      simulationSampleSize = (int)System.Math.Max(10 / proportion, 10 / (1 - proportion)) * 2;
                for (int i = 0; i < simulationCount; ++i)
                {
                    int successCount = 0;
                    for (int j = 0; j < simulationSampleSize; ++j)
                    {
                        if (DistributionModel.GetUniform() <= proportion)
                        {
                            successCount++;
                        }
                    }
                    sampleProportions[i] = (double)successCount / simulationSampleSize;
                }

                double proportion_mu    = Mean.GetMean(sampleProportions);
                double proportion_sigma = StdDev.GetStdDev(sampleProportions, proportion_mu);

                return(new double[] { proportion_mu + Gaussian.GetPercentile(p1) * proportion_sigma, proportion_mu + Gaussian.GetQuantile(p2) * proportion_sigma });
            }
            else
            {
                double critical_value1 = Gaussian.GetQuantile(p1);
                double critical_value2 = Gaussian.GetQuantile(p2);

                double[] confidence_interval = new double[2];
                confidence_interval[0] = proportion + critical_value1 * standard_error;
                confidence_interval[1] = proportion + critical_value2 * standard_error;

                return(confidence_interval);
            }
        }
Ejemplo n.º 11
0
        /// <summary>
        /// Calculate the confidence interval for the proportion of SUCCESS in the population at a given confidence interval, given the point estimate proprotions are known from multiple groups
        ///
        /// Note that this is only for categorical variable with two levels : SUCCESS, FAILURE
        /// </summary>
        /// <param name="proportions">The point estimate proportion of SUCESS obtained from multiple groups</param>
        /// <param name="sampleSizes">The sample size of each group</param>
        /// <param name="confidence_level">The given confidence interval</param>
        /// <returns>The confidence interval for the proportion of SUCCESS in the population at the given confidence level</returns>
        public static double[] GetConfidenceInterval(double[] proportions, int[] sampleSizes, double confidence_level, bool useSimulation = false, int simulationCount = 500)
        {
            double p1 = (1 - confidence_level) / 2;
            double p2 = 1 - p1;

            bool shouldUseSimulation = useSimulation;

            if (!shouldUseSimulation)
            {
                for (int i = 0; i < sampleSizes.Length; ++i)
                {
                    int n_i = sampleSizes[i];
                    int expected_success_count = (int)(proportions[i] * n_i);
                    int expected_failure_count = (int)((1 - proportions[i]) * n_i);
                    if (expected_failure_count < 10 || expected_success_count < 10)
                    {
                        shouldUseSimulation = true;
                        break;
                    }
                }
            }

            if (shouldUseSimulation)
            {
                double sucess_count = 0;
                double total_count  = 0;
                for (int i = 0; i < sampleSizes.Length; ++i)
                {
                    int n_i = sampleSizes[i];
                    sucess_count += proportions[i] * n_i;
                    total_count  += n_i;
                }

                double p_hat = sucess_count / total_count;

                double[] sampleProportions    = new double[simulationCount];
                int      simulationSampleSize = (int)System.Math.Max(10 / p_hat, 10 / (1 - p_hat)) * 2;
                for (int i = 0; i < simulationCount; ++i)
                {
                    int successCount = 0;
                    for (int j = 0; j < simulationSampleSize; ++j)
                    {
                        if (DistributionModel.GetUniform() <= p_hat)
                        {
                            successCount++;
                        }
                    }
                    sampleProportions[i] = (double)successCount / simulationSampleSize;
                }

                double proportion_mu    = Mean.GetMean(sampleProportions);
                double proportion_sigma = StdDev.GetStdDev(sampleProportions, proportion_mu);

                return(new double[] { proportion_mu + Gaussian.GetPercentile(p1) * proportion_sigma, proportion_mu + Gaussian.GetQuantile(p2) * proportion_sigma });
            }
            else
            {
                double[] standardErrors = new double[proportions.Length];
                for (int i = 0; i < proportions.Length; ++i)
                {
                    standardErrors[i] = StandardError.GetStandardErrorForProportion(proportions[i], sampleSizes[i]);
                }

                double standardError = StandardError.GetStandardErrorForWeightAverages(sampleSizes, standardErrors);

                double sampleMean = Mean.GetMeanForWeightedAverage(proportions, sampleSizes);


                double critical_value1 = 0;
                double critical_value2 = 0;

                critical_value1 = Gaussian.GetQuantile(p1);
                critical_value2 = Gaussian.GetQuantile(p2);

                double[] confidence_interval = new double[2];
                confidence_interval[0] = sampleMean + critical_value1 * standardError;
                confidence_interval[1] = sampleMean + critical_value2 * standardError;

                return(confidence_interval);
            }
        }