Esempio n. 1
0
        /// <summary>
        /// Chi-square goodness-of-fit (GOF) test for one categorical variable with more than two levels.
        ///
        /// Hypotheses are:
        /// H_0 : actual distribution of each level = expected distribution of each level
        /// H_1 : actual distribution of each level != expected distribution of each level
        ///
        /// Test statistic: ChiSq = sum over levels of (observed - expected)^2 / expected,
        /// where expected = expected proportion of the level * sample size,
        /// evaluated with (number of levels - 1) degrees of freedom.
        ///
        /// p-value = P(observed or more extreme mismatch of expected and actual level distribution | H_0 is true)
        ///
        /// Reject H_0 if p-value &lt; alpha
        /// </summary>
        /// <remarks>
        /// Expected counts are kept as real numbers. Truncating them to integers distorts the
        /// statistic and can turn a small positive expected count into zero.
        /// A level whose expected proportion is 0 still makes its term divide by zero
        /// (yielding Infinity or NaN); callers should not pass such levels.
        /// </remarks>
        /// <param name="observedCountInEachLevel">The count of each level in the sample data for the categorical variable</param>
        /// <param name="expectedPercentageOfEachLevel">The expected distribution of each level in the population, given as a fraction (each value is multiplied directly by the sample size); must have one entry per observed level</param>
        /// <param name="pValue">p-value, which is P(observed or more extreme mismatch of expected and actual level distribution | H_0 is true)</param>
        /// <param name="significance_level">alpha</param>
        /// <returns>True if H_0 is rejected; False if H_0 is failed to be rejected</returns>
        /// <exception cref="System.ArgumentNullException">Either array argument is null</exception>
        /// <exception cref="System.ArgumentException">The two arrays do not have the same number of levels</exception>
        public bool RejectH0(int[] observedCountInEachLevel, double[] expectedPercentageOfEachLevel, out double pValue, double significance_level = 0.05)
        {
            if (observedCountInEachLevel == null)
            {
                throw new System.ArgumentNullException("observedCountInEachLevel");
            }
            if (expectedPercentageOfEachLevel == null)
            {
                throw new System.ArgumentNullException("expectedPercentageOfEachLevel");
            }
            if (expectedPercentageOfEachLevel.Length != observedCountInEachLevel.Length)
            {
                throw new System.ArgumentException("The expected distribution must have exactly one entry per observed level.", "expectedPercentageOfEachLevel");
            }

            int countOfLevels = observedCountInEachLevel.Length;

            int sampleSize = 0;
            for (int i = 0; i < countOfLevels; ++i)
            {
                sampleSize += observedCountInEachLevel[i];
            }

            double ChiSq = 0;
            for (int i = 0; i < countOfLevels; ++i)
            {
                // Keep the expected count fractional: it is a theoretical mean, not an actual tally.
                double expectedCount = expectedPercentageOfEachLevel[i] * sampleSize;
                double deviation     = observedCountInEachLevel[i] - expectedCount;
                ChiSq += deviation * deviation / expectedCount;
            }

            // ChiSquare.GetPercentile is defined elsewhere; presumably it returns the chi-square CDF
            // at ChiSq for the given degrees of freedom, so 1 - CDF is the upper-tail probability.
            pValue = 1 - ChiSquare.GetPercentile(ChiSq, countOfLevels - 1);
            return(pValue < significance_level);
        }
        /// <summary>
        /// Chi-square test of independence between two categorical variables, var1 (rows) and var2 (columns).
        ///
        /// The hypotheses are:
        /// H_0 : variable 1 is independent of variable 2
        /// H_A : variable 1 and variable 2 are dependent
        ///
        /// p-value = P(observed or more extreme events that favor H_A | H_0)
        ///
        /// Under H_0 every level of var2 should show the same split across the levels of var1,
        /// namely the overall share of each var1 level in the sample. The expected count of a cell
        /// is therefore (column total) * (row total / sample size). The statistic sums
        /// (observed - expected)^2 / expected over all cells and is evaluated with
        /// (rows - 1) * (columns - 1) degrees of freedom.
        ///
        /// Reject H_0 if p-value &lt; alpha
        /// </summary>
        /// <param name="contingency_table">Jagged table of counts: cell [row][col] holds the number of sample records matching var1 level "row" and var2 level "col". The column count is taken from the first row.</param>
        /// <param name="pValue">p-value = P(observed or more extreme events that favor H_A | H_0)</param>
        /// <param name="signficance_level">alpha</param>
        /// <returns>True if H_0 is rejected; False if H_0 is failed to be rejected</returns>
        public bool RejectH0(int[][] contingency_table, out double pValue, double signficance_level = 0.05)
        {
            int rowCount = contingency_table.Length;
            int colCount = contingency_table[0].Length;

            // A single sweep over the table gathers both marginals and the grand total.
            int[] rowTotals = new int[rowCount];
            int[] colTotals = new int[colCount];
            int grandTotal  = 0;

            for (int r = 0; r < rowCount; r++)
            {
                int[] cells = contingency_table[r];
                for (int c = 0; c < colCount; c++)
                {
                    int cell = cells[c];
                    rowTotals[r] += cell;
                    colTotals[c] += cell;
                    grandTotal   += cell;
                }
            }

            // Accumulate the statistic row by row; expected counts are computed on the fly
            // instead of being stored in an intermediate table.
            double statistic = 0;

            for (int r = 0; r < rowCount; r++)
            {
                // Share of the whole sample that falls in this var1 level.
                double rowShare = (double)rowTotals[r] / grandTotal;

                for (int c = 0; c < colCount; c++)
                {
                    double expected = colTotals[c] * rowShare;
                    statistic += System.Math.Pow(contingency_table[r][c] - expected, 2) / expected;
                }
            }

            int degreesOfFreedom = (rowCount - 1) * (colCount - 1);

            // ChiSquare.GetPercentile is defined elsewhere; presumably the chi-square CDF.
            pValue = 1 - ChiSquare.GetPercentile(statistic, degreesOfFreedom);
            return pValue < signficance_level;
        }