/// <summary>
        /// Builds a two-sided confidence interval for the population mean of a continuous random variable
        /// from the summary statistics of a single random sample.
        ///
        /// Each bound is computed as sampleMean + criticalValue * standardError, where the standard error is
        /// obtained from StandardError.GetStandardError(sampleStdDev, sampleSize) and the critical values are the
        /// quantiles at (1 - confidence_level) / 2 and 1 - (1 - confidence_level) / 2.
        /// </summary>
        /// <param name="sampleMean">point estimate of the mean taken from the random sample</param>
        /// <param name="sampleStdDev">point estimate of the standard deviation taken from the random sample</param>
        /// <param name="sampleSize">number of observations in the random sample</param>
        /// <param name="confidence_level">desired confidence level (presumably a fraction such as 0.95 - it is subtracted from 1 to get the tail probability)</param>
        /// <param name="useStudentT">forces Student's t quantiles even when the sample has 30 or more observations</param>
        /// <returns>a two-element array: index 0 holds the bound built from the lower-tail quantile, index 1 the bound built from the upper-tail quantile</returns>
        public static double[] GetConfidenceInterval(double sampleMean, double sampleStdDev, int sampleSize, double confidence_level, bool useStudentT = false)
        {
            double se = StandardError.GetStandardError(sampleStdDev, sampleSize);

            // Probability left outside the interval is split evenly between both tails.
            double lowerTail = (1 - confidence_level) / 2.0;
            double upperTail = 1 - lowerTail;

            double lowerCritical;
            double upperCritical;

            // The normal approximation is only used for samples of at least 30 observations,
            // and only when the caller did not explicitly request Student's t.
            bool useNormalApproximation = sampleSize >= 30 && !useStudentT;
            if (useNormalApproximation)
            {
                lowerCritical = Gaussian.GetQuantile(lowerTail);
                upperCritical = Gaussian.GetQuantile(upperTail);
            }
            else
            {
                int degreesOfFreedom = sampleSize - 1;
                lowerCritical = StudentT.GetQuantile(lowerTail, degreesOfFreedom);
                upperCritical = StudentT.GetQuantile(upperTail, degreesOfFreedom);
            }

            return new double[]
            {
                sampleMean + lowerCritical * se,
                sampleMean + upperCritical * se
            };
        }
예제 #2
0
        /// <summary>
        /// Hypothesis test on paired (dependent) observations: decides whether the mean of var1 and the mean of var2
        /// differ in the true population.
        ///
        /// Hypotheses:
        /// H_0: mu_var1 - mu_var2 = 0
        /// H_1: mu_var1 - mu_var2 != 0
        ///
        /// The test works on the per-pair differences (Item1 - Item2). The test statistic is
        /// |mean(differences) - 0| / SE, where SE comes from StandardError.GetStandardError(differences).
        /// Student's t with (number of pairs - 1) degrees of freedom is used when there are fewer than 30 pairs
        /// or when useStudentT is true; otherwise the Gaussian distribution is used.
        ///
        /// p-value = (1 - percentile of the test statistic), doubled for a two-sided test.
        /// Note: because the absolute value of the statistic is used, the one-sided p-value does not depend on the
        /// sign of the observed difference.
        /// </summary>
        /// <param name="sample_for_paired_data">random sample of paired values (var1, var2); the two values of a pair are not independent</param>
        /// <param name="pValue">receives the computed p-value</param>
        /// <param name="significance_level">threshold the p-value is compared against</param>
        /// <param name="one_sided">true if the test is one-sided; false doubles the tail probability</param>
        /// <param name="useStudentT">forces Student's t even when there are 30 or more pairs</param>
        /// <returns>true if the p-value is strictly below the significance level (H_0 is rejected)</returns>
        public bool RejectH0_PairedData(Tuple <double, double>[] sample_for_paired_data, out double pValue, double significance_level = 0.05, bool one_sided = false, bool useStudentT = false)
        {
            int pairCount = sample_for_paired_data.Length;

            // Reduce the paired sample to a single sample of differences.
            double[] differences = new double[pairCount];
            int position = 0;
            foreach (Tuple<double, double> pair in sample_for_paired_data)
            {
                differences[position] = pair.Item1 - pair.Item2;
                ++position;
            }

            const double nullValue = 0;
            double meanDifference = Mean.GetMean(differences);
            double standardError  = StandardError.GetStandardError(differences);
            double testStatistic  = System.Math.Abs(meanDifference - nullValue) / standardError;

            // Small samples (or an explicit request) use Student's t instead of the normal distribution.
            double cumulative = (useStudentT || pairCount < 30)
                ? StudentT.GetPercentile(testStatistic, pairCount - 1)
                : Gaussian.GetPercentile(testStatistic);

            int tailCount = one_sided ? 1 : 2;
            pValue = (1 - cumulative) * tailCount;

            return pValue < significance_level;
        }
        /// <summary>
        /// Builds a two-sided confidence interval for the population mean when only per-group summary statistics
        /// (mean, standard deviation, size) of several groups / classes are available.
        ///
        /// A standard error is computed per group, then combined through
        /// StandardError.GetStandardErrorForWeightAverages; the centre of the interval is
        /// Mean.GetMeanForWeightedAverage(sampleMeans, sampleSizes).
        /// Student's t quantiles are used when useStudentT is true or any group has fewer than 30 observations,
        /// with degrees of freedom equal to the smallest group size minus one; otherwise Gaussian quantiles are used.
        ///
        /// Note that each class should be a continuous variable.
        /// </summary>
        /// <param name="sampleMeans">point estimate sample means, one per group / class</param>
        /// <param name="sampleStdDev">point estimate sample standard deviations, one per group / class</param>
        /// <param name="sampleSizes">sample sizes, one per group / class</param>
        /// <param name="confidence_level">the requested confidence level</param>
        /// <param name="useStudentT">forces Student's t quantiles regardless of the group sizes</param>
        /// <returns>a two-element array holding the confidence interval of the population mean: index 0 is built from the lower-tail quantile, index 1 from the upper-tail quantile</returns>
        public static double[] GetConfidenceInterval(double[] sampleMeans, double[] sampleStdDev, int[] sampleSizes, double confidence_level, bool useStudentT = false)
        {
            int groupCount = sampleMeans.Length;

            double[] groupStandardErrors = new double[groupCount];
            for (int g = 0; g < groupCount; ++g)
            {
                groupStandardErrors[g] = StandardError.GetStandardError(sampleStdDev[g], sampleSizes[g]);
            }

            double combinedStandardError = StandardError.GetStandardErrorForWeightAverages(sampleSizes, groupStandardErrors);
            double combinedMean          = Mean.GetMeanForWeightedAverage(sampleMeans, sampleSizes);

            // Probability left outside the interval is split evenly between both tails.
            double lowerTail = (1 - confidence_level) / 2.0;
            double upperTail = 1 - lowerTail;

            // A single scan yields the smallest group; it tells us both whether any group is "small"
            // (fewer than 30 observations) and which degrees of freedom to use for Student's t.
            int smallestGroup = int.MaxValue;
            foreach (int size in sampleSizes)
            {
                if (size < smallestGroup)
                {
                    smallestGroup = size;
                }
            }

            double lowerCritical;
            double upperCritical;

            if (useStudentT || smallestGroup < 30)
            {
                int degreesOfFreedom = smallestGroup - 1;
                lowerCritical = StudentT.GetQuantile(lowerTail, degreesOfFreedom);
                upperCritical = StudentT.GetQuantile(upperTail, degreesOfFreedom);
            }
            else
            {
                lowerCritical = Gaussian.GetQuantile(lowerTail);
                upperCritical = Gaussian.GetQuantile(upperTail);
            }

            return new double[]
            {
                combinedMean + lowerCritical * combinedStandardError,
                combinedMean + upperCritical * combinedStandardError
            };
        }
예제 #4
0
        /// <summary>
        /// Two-sided or one-sided test for a single mean.
        ///
        /// Given that:
        /// H_0 : mu = null_value
        /// H_A : mu != null_value
        ///
        /// By the Central Limit Theorem:
        /// sample_mean ~ N(mu, SE)
        ///
        /// The test statistic is |sample_mean - null_value| / SE, where SE comes from
        /// StandardError.GetStandardError(sample). Student's t with (sample size - 1) degrees of freedom is used
        /// when the sample has fewer than 30 observations or useStudentT is true; otherwise the Gaussian
        /// distribution is used.
        ///
        /// p-value = P(sample_mean is at least |null_value - point_estimate| away from the null_value | mu = null_value)
        /// if (p-value &lt; significance_level) reject H_0
        ///
        /// Note: because the absolute value of the statistic is used, the one-sided p-value does not depend on the
        /// sign of the observed deviation from null_value.
        /// </summary>
        /// <param name="sample">the observed random sample</param>
        /// <param name="null_value">the population mean assumed under H_0</param>
        /// <param name="pValue">receives the computed p-value</param>
        /// <param name="significance_level">threshold the p-value is compared against</param>
        /// <param name="one_sided">True if the test is one_sided; false doubles the tail probability</param>
        /// <param name="useStudentT">forces Student's t even when the sample has 30 or more observations</param>
        /// <returns>true if the p-value is strictly below the significance level (H_0 is rejected)</returns>
        public static bool RejectH0(double[] sample, double null_value, out double pValue, double significance_level = 0.05, bool one_sided = false, bool useStudentT = false)
        {
            double pointEstimate  = Mean.GetMean(sample);
            double standardError  = StandardError.GetStandardError(sample);                      //SE estimates the standard deviation of the sampling distribution of the sample mean
            double test_statistic = System.Math.Abs(pointEstimate - null_value) / standardError; //Distance from null_value in SE units; this assumes H_0 is true, i.e. mu = null_value

            double percentile;

            if (sample.Length < 30 || useStudentT) //for fewer than 30 observations the normal approximation is not relied upon and Student's t distribution is used instead
            {
                percentile = StudentT.GetPercentile(test_statistic, sample.Length - 1);
            }
            else
            {
                percentile = Gaussian.GetPercentile(test_statistic);
            }

            // Upper-tail probability beyond the statistic, doubled for a two-sided test.
            // (The original statement assigned pValue twice in a chained assignment; once is sufficient.)
            pValue = (1 - percentile) * (one_sided ? 1 : 2);
            return pValue < significance_level;
        }
예제 #5
0
        /// <summary>
        /// Hypothesis test on two independent samples: decides whether the mean of var1 and the mean of var2
        /// differ in the true population.
        ///
        /// Hypotheses:
        /// H_0: mu_var1 - mu_var2 = 0
        /// H_1: mu_var1 - mu_var2 != 0
        ///
        /// The point estimate is mean(sample_for_var1) - mean(sample_for_var2) and the test statistic is
        /// |point estimate - 0| / SE, where SE comes from
        /// StandardError.GetStandardError(sample_for_var1, sample_for_var2).
        /// Student's t is used when either sample has fewer than 30 observations or useStudentT is true, with
        /// degrees of freedom equal to the smaller sample size minus one; otherwise the Gaussian distribution is used.
        ///
        /// p-value = (1 - percentile of the test statistic), doubled for a two-sided test.
        /// Note: because the absolute value of the statistic is used, the one-sided p-value does not depend on the
        /// sign of the observed difference.
        /// </summary>
        /// <param name="sample_for_var1">value sample for variable 1</param>
        /// <param name="sample_for_var2">value sample for variable 2</param>
        /// <param name="pValue">receives the computed p-value</param>
        /// <param name="significance_level">threshold the p-value is compared against</param>
        /// <param name="one_sided">true if the test is one-sided; false doubles the tail probability</param>
        /// <param name="useStudentT">forces Student's t even when both samples have 30 or more observations</param>
        /// <returns>true if the p-value is strictly below the significance level (H_0 is rejected)</returns>
        public bool RejectH0(double[] sample_for_var1, double[] sample_for_var2, out double pValue, double significance_level = 0.05, bool one_sided = false, bool useStudentT = false)
        {
            const double nullValue = 0;

            double meanDifference = Mean.GetMean(sample_for_var1) - Mean.GetMean(sample_for_var2);
            double standardError  = StandardError.GetStandardError(sample_for_var1, sample_for_var2);
            double testStatistic  = System.Math.Abs(meanDifference - nullValue) / standardError;

            int size1 = sample_for_var1.Length;
            int size2 = sample_for_var2.Length;

            // The normal distribution is used only when both samples hold at least 30 observations
            // and the caller did not explicitly request Student's t.
            bool useNormalApproximation = size1 >= 30 && size2 >= 30 && !useStudentT;

            double cumulative;
            if (useNormalApproximation)
            {
                cumulative = Gaussian.GetPercentile(testStatistic);
            }
            else
            {
                // Degrees of freedom are driven by the smaller of the two samples.
                int degreesOfFreedom = System.Math.Min(size1, size2) - 1;
                cumulative = StudentT.GetPercentile(testStatistic, degreesOfFreedom);
            }

            int tailCount = one_sided ? 1 : 2;
            pValue = (1 - cumulative) * tailCount;

            return pValue < significance_level;
        }
예제 #6
0
        /// <summary>
        /// Estimate the normal distribution of a sample mean (for a continuous variable).
        ///
        /// The Central Limit Theorem (CLT) states that:
        /// The distribution of sample statistics (e.g., sample mean) is nearly normal, centered at the population mean,
        /// and with a standard deviation equal to the population standard deviation divided by the square root of the sample size.
        ///
        /// With the CLT, we can estimate the normal distribution of a sample mean, given the sample's estimated mean
        /// and standard deviation as well as the sample size.
        ///
        /// For the CLT to hold true for a sample, the following conditions must be met:
        /// 1. Independence: sample observations must be independent.
        ///    - random sample / assignment
        ///    - if sampling without replacement, the sample size is less than 10% of the population
        /// 2. Sample size / skew: either the population distribution is normal, or, if the population distribution is
        ///    skewed, the sample size is large (rule of thumb: sample size greater than 30)
        /// </summary>
        /// <param name="sampleMean">point estimate of sample mean</param>
        /// <param name="sampleStdDev">standard deviation of a random sample</param>
        /// <param name="sampleSize">the size of the random sample</param>
        /// <returns>a Gaussian constructed from sampleMean and the standard error returned by StandardError.GetStandardError(sampleStdDev, sampleSize)</returns>
        public static Gaussian EstimateSampleMeanDistribution(double sampleMean, double sampleStdDev, int sampleSize)
        {
            // The spread of the sampling distribution is the standard error, not the raw sample standard deviation.
            return new Gaussian(sampleMean, StandardError.GetStandardError(sampleStdDev, sampleSize));
        }