/// <summary>
/// Chi-square goodness-of-fit (GOF) test for one categorical variable with more than two levels.
///
/// Hypotheses are:
/// H_0 : actual distribution of each level = expected distribution of each level
/// H_1 : actual distribution of each level != expected distribution of each level
///
/// p-value = P(observed or more extreme mismatch of expected and actual level distribution | H_0 is true)
///
/// Reject H_0 if p-value &lt; alpha
/// </summary>
/// <remarks>
/// The expected count of level i is computed as the real number
/// (expectedPercentageOfEachLevel[i] * sample size); it is deliberately NOT rounded to an integer,
/// because rounding distorts the test statistic, especially for small samples.
/// The statistic is sum_i (observed_i - expected_i)^2 / expected_i with (number of levels - 1)
/// degrees of freedom.
/// A level whose expected count is zero makes the division non-finite (Infinity or NaN),
/// which is then passed on to ChiSquare.GetPercentile unchanged.
/// </remarks>
/// <param name="observedCountInEachLevel">The count of each level in the sample data for the categorical variable</param>
/// <param name="expectedPercentageOfEachLevel">The expected distribution of each level in the population for the categorical variable. Each value is multiplied directly by the sample size, so it is used as a proportion (presumably in [0, 1] and summing to 1 - confirm with callers)</param>
/// <param name="pValue">p-value which is P(observed or more extreme mismatch of expected and actual level distribution | H_0 is true)</param>
/// <param name="significance_level">alpha</param>
/// <returns>True if H_0 is rejected; False if H_0 is failed to be rejected</returns>
public bool RejectH0(int[] observedCountInEachLevel, double[] expectedPercentageOfEachLevel, out double pValue, double significance_level = 0.05)
{
    int countOfLevels = observedCountInEachLevel.Length;

    // The sample size is the total number of observations over all levels.
    int sampleSize = 0;
    for (int i = 0; i < countOfLevels; ++i)
    {
        sampleSize += observedCountInEachLevel[i];
    }

    double ChiSq = 0;
    for (int i = 0; i < countOfLevels; ++i)
    {
        // Keep the expected count real-valued: truncating it to an integer (and dumping the
        // remainder into the first level) biases the statistic.
        double expectedCount = expectedPercentageOfEachLevel[i] * sampleSize;
        double deviation = observedCountInEachLevel[i] - expectedCount;
        ChiSq += deviation * deviation / expectedCount;
    }

    // NOTE(review): ChiSquare.GetPercentile is assumed to return the cumulative probability
    // P(X <= ChiSq) for the given degrees of freedom, so 1 - that value is the upper tail - confirm.
    pValue = 1 - ChiSquare.GetPercentile(ChiSq, countOfLevels - 1);
    return pValue < significance_level;
}
/// <summary>
/// Chi-square test of independence between two categorical variables, var1 (rows) and var2 (columns).
///
/// The hypotheses are:
/// H_0 : variable 1 is independent of variable 2
/// H_A : variable 1 and variable 2 are dependent
///
/// p-value = P(observed or more extreme events that favor H_A | H_0)
///
/// Under H_0 every level of var2 should show the same split across the levels of var1,
/// namely the overall share of each var1 level in the sample. The expected count of a cell is
/// therefore (column total) * (row total / sample size), and the statistic is the sum over all
/// cells of (observed - expected)^2 / expected with (rows - 1) * (columns - 1) degrees of freedom.
///
/// Reject H_0 if p-value &lt; alpha
/// </summary>
/// <param name="contingency_table">The contingency table: cell [row][col] holds the number of sample records that match var1 level "row" and var2 level "col". The column count is taken from the first row.</param>
/// <param name="pValue">p-value = P(observed or more extreme events that favor H_A | H_0)</param>
/// <param name="signficance_level">alpha</param>
/// <returns>True if H_0 is rejected; False if H_0 is failed to be rejected</returns>
public bool RejectH0(int[][] contingency_table, out double pValue, double signficance_level = 0.05)
{
    int rowCount = contingency_table.Length;
    int colCount = contingency_table[0].Length;

    // Marginal totals of the table, gathered in one row-major sweep.
    int[] rowTotals = new int[rowCount];
    int[] colTotals = new int[colCount];
    int total = 0;
    for (int r = 0; r < rowCount; ++r)
    {
        int[] cells = contingency_table[r];
        for (int c = 0; c < colCount; ++c)
        {
            int observed = cells[c];
            rowTotals[r] += observed;
            colTotals[c] += observed;
            total += observed;
        }
    }

    // Accumulate the statistic cell by cell; the expected count is derived on the fly
    // from the column total and the row's share of the whole sample.
    double statistic = 0;
    for (int r = 0; r < rowCount; ++r)
    {
        double rowShare = (double)rowTotals[r] / total;
        int[] cells = contingency_table[r];
        for (int c = 0; c < colCount; ++c)
        {
            double expected = colTotals[c] * rowShare;
            statistic += System.Math.Pow(cells[c] - expected, 2) / expected;
        }
    }

    int degreesOfFreedom = (rowCount - 1) * (colCount - 1);
    pValue = 1 - ChiSquare.GetPercentile(statistic, degreesOfFreedom);

    bool rejected = pValue < signficance_level;
    return rejected;
}