/// <summary>
/// Computes Wilks' lambda (det(E) / det(H + E)) and converts it to an approximate
/// F statistic (Rao's approximation), returning the associated p-value.
/// </summary>
/// <param name="H">Hypothesis sum of squares and cross products matrix.</param>
/// <param name="E">Error sum of squares and cross products matrix.</param>
/// <param name="N">Number of data points.</param>
/// <param name="p">Number of variables.</param>
/// <param name="g">Number of groups.</param>
/// <param name="F_crit">
/// Receives the approximate F statistic derived from lambda. Despite the name, this is the
/// observed statistic computed from the data, not a tabulated critical value.
/// </param>
/// <returns>
/// One minus <c>FDistribution.GetPercentile(F, p(g-1), ab-c)</c>, i.e. the p-value of the test
/// (assuming GetPercentile returns the cumulative probability).
/// </returns>
public static double GetWilksLambda(double[][] H, double[][] E, int N, int p, int g, out double F_crit)
{
    double[][] H_plus_E = Add(H, E);
    double E_determinant = MatrixOp.GetDeterminant(E);
    double H_plus_E_determinant = MatrixOp.GetDeterminant(H_plus_E);
    double lambda = E_determinant / H_plus_E_determinant;

    // q is the hypothesis degrees of freedom (number of groups minus one).
    int q = g - 1;

    double a = N - g - (p - g + 2) / 2.0;

    // b = sqrt((p^2 q^2 - 4) / (p^2 + q^2 - 5)) when the denominator is positive, otherwise 1.
    // The numerator needs q squared, and the quotient must be evaluated in floating point:
    // an int / int division truncates (possibly to 0, which would make 1 / b infinite).
    double b = 1;
    int b_threshold = p * p + q * q - 5;
    if (b_threshold > 0)
    {
        double b_numerator = (double)p * p * q * q - 4;
        b = System.Math.Sqrt(b_numerator / b_threshold);
    }

    double c = (p * q - 2) / 2.0;

    double DF1 = p * q;
    double DF2 = a * b - c;

    // lambda^(1/b) appears in both numerator and denominator; evaluate it once.
    double lambdaRoot = System.Math.Pow(lambda, 1 / b);
    F_crit = ((1 - lambdaRoot) / lambdaRoot) * (DF2 / DF1);

    double pValue = 1 - FDistribution.GetPercentile(F_crit, DF1, DF2);
    return pValue;
}
/// <summary>
/// One-way ANOVA: hypothesis test for the equality of means across two or more groups.
///
/// H_0 : mu_1 = mu_2 = ... = mu_k (where k is the number of groups)
/// H_A : at least one group mean differs
///
/// The p-value is the probability of observing a ratio of "between" to "within" group
/// variability at least as large as F if the group means were in fact all equal.
/// H_0 is rejected when the p-value is below <paramref name="significance_level"/>.
/// </summary>
/// <param name="totalSample">All observations, in any order.</param>
/// <param name="grpCat">Group id of each observation; grpCat[i] belongs to totalSample[i].</param>
/// <param name="output">Receives the ANOVA table (sums of squares, degrees of freedom, mean squares, F, p-value, decision).</param>
/// <param name="significance_level">Threshold below which the p-value leads to rejecting H_0.</param>
/// <returns>True if H_0 is rejected; false if H_0 fails to be rejected (including when the p-value could not be computed).</returns>
public static bool RunANOVA(double[] totalSample, int[] grpCat, out ANOVA output, double significance_level = 0.05)
{
    output = new ANOVA();

    // Bucket the observations by group id.
    Dictionary<int, List<double>> groupedSample = new Dictionary<int, List<double>>();
    for (int i = 0; i < totalSample.Length; ++i)
    {
        int grpId = grpCat[i];
        List<double> grp;
        if (!groupedSample.TryGetValue(grpId, out grp))
        {
            grp = new List<double>();
            groupedSample[grpId] = grp;
        }
        grp.Add(totalSample[i]);
    }

    // Sums of squares measure variability.
    double grand_mean;
    output.SST = GetSST(totalSample, out grand_mean); // total
    output.SSG = GetSSG(groupedSample, grand_mean);   // group: variability explained by the group variable
    output.SSE = output.SST - output.SSG;             // error: variability not explained by the group variable

    // Degrees of freedom.
    output.dfT = totalSample.Length - 1;  // total
    output.dfG = groupedSample.Count - 1; // group
    output.dfE = output.dfT - output.dfG; // error

    // Mean squares: sums of squares scaled by their degrees of freedom.
    output.MSG = output.SSG / output.dfG; // between-group variability
    output.MSE = output.SSE / output.dfE; // within-group variability

    output.Intercepts = GetIntercepts(GetMeanWithinGroup(groupedSample));

    // F statistic: ratio of between-group to within-group variability.
    output.F = output.MSG / output.MSE;

    try
    {
        output.pValue = 1 - FDistribution.GetPercentile(output.F, output.dfG, output.dfE);
    }
    catch (System.Exception)
    {
        // Best effort: the rest of the table stays usable when the F distribution cannot be
        // evaluated (e.g. degenerate degrees of freedom). Mark the p-value as unavailable so
        // the comparison below is false, instead of leaving a default value that could be
        // read as a significant result.
        output.pValue = double.NaN;
    }

    bool rejectH0 = output.pValue < significance_level;
    output.RejectH0 = rejectH0;
    return rejectH0;
}
/// <summary>
/// Runs a one-way ANCOVA for the model y = b * x + intercept[j], where j is the group id
/// (b is the common slope shared by all groups, intercept[j] is the group-specific offset).
/// Fills <paramref name="output"/> with:
/// 1. the slope b,
/// 2. the intercept of every group,
/// 3. the adjusted sums of squares, F statistic, p-value and test decision.
/// </summary>
/// <param name="x">Data for the predictor variable (covariate).</param>
/// <param name="y">Data for the response variable.</param>
/// <param name="grpCat">Group id for each (x, y) pair.</param>
/// <param name="output">Receives the result of the ANCOVA.</param>
/// <param name="significance_level">Alpha for the hypothesis test, in which H_0 : y - b * x is independent of the group category.</param>
public static void RunANCOVA(double[] x, double[] y, int[] grpCat, out ANCOVA output, double significance_level = 0.05)
{
    output = new ANCOVA();

    // Bucket x and y by group id; both dictionaries always hold the same set of keys.
    Dictionary<int, List<double>> groupped_x = new Dictionary<int, List<double>>();
    Dictionary<int, List<double>> groupped_y = new Dictionary<int, List<double>>();

    int N = x.Length;
    for (int i = 0; i < N; ++i)
    {
        int grpId = grpCat[i];

        List<double> group_x;
        if (!groupped_x.TryGetValue(grpId, out group_x))
        {
            group_x = new List<double>();
            groupped_x[grpId] = group_x;
        }

        List<double> group_y;
        if (!groupped_y.TryGetValue(grpId, out group_y))
        {
            group_y = new List<double>();
            groupped_y[grpId] = group_y;
        }

        group_x.Add(x[i]);
        group_y.Add(y[i]);
    }

    // Total, between-group and within-group sums of squares for x and y.
    double grand_mean_x;
    double grand_mean_y;
    output.SSTx = GetSST(x, out grand_mean_x);
    output.SSTy = GetSST(y, out grand_mean_y);

    output.SSBGx = GetSSG(groupped_x, grand_mean_x);
    output.SSBGy = GetSSG(groupped_y, grand_mean_y);

    output.SSWGy = output.SSTy - output.SSBGy;
    output.SSWGx = output.SSTx - output.SSBGx;

    // NOTE(review): the formulas below use SCT / SCWG the way sums of cross-products are used
    // (divided by sums of squares); confirm the helpers return unnormalised sums.
    output.SCT = GetCovariance(x, y);
    output.SCWG = GetCovarianceWithinGroup(groupped_x, groupped_y);

    // Overall and within-group correlation between x and y.
    output.rT = output.SCT / System.Math.Sqrt(output.SSTx * output.SSTy);
    output.rWG = output.SCWG / System.Math.Sqrt(output.SSWGx * output.SSWGy);

    // Remove from y the part of its variability that is explained by the covariate x.
    output.SSTy_adj = output.SSTy - System.Math.Pow(output.SCT, 2) / output.SSTx;
    output.SSWGy_adj = output.SSWGy - System.Math.Pow(output.SCWG, 2) / output.SSWGx;
    output.SSBGy_adj = output.SSTy_adj - output.SSWGy_adj;

    // Degrees of freedom; one extra degree is spent on the estimated slope.
    output.dfT = N - 2;
    output.dfBG = groupped_x.Count - 1;
    output.dfWG = N - groupped_x.Count - 1;

    output.MSBGy_adj = output.SSBGy_adj / output.dfBG;
    output.MSWGy_adj = output.SSWGy_adj / output.dfWG;

    // Common within-group slope and the per-group intercepts derived from it.
    output.Slope = output.SCWG / output.SSWGx;
    output.MeanWithinGroups_x = GetMeanWithinGroup(groupped_x);
    output.MeanWithinGroups_y = GetMeanWithinGroup(groupped_y);
    output.Intercepts = GetIntercepts(output.MeanWithinGroups_x, output.MeanWithinGroups_y, grand_mean_x, output.Slope);

    output.F = output.MSBGy_adj / output.MSWGy_adj;

    try
    {
        output.pValue = 1 - FDistribution.GetPercentile(output.F, output.dfBG, output.dfWG);
    }
    catch (System.Exception)
    {
        // Best effort: keep the rest of the result usable when the F distribution cannot be
        // evaluated. Mark the p-value as unavailable so the comparison below is false,
        // instead of leaving a default value that could be read as a significant result.
        output.pValue = double.NaN;
    }

    output.RejectH0 = output.pValue < significance_level;
}
/// <summary>
/// Two-way ANOVA: tests the effect of two grouping factors and of their interaction on a variable.
///
/// For each of grpCat1, grpCat2 and the interaction grpCat1 * grpCat2:
/// H_0 : the effect is absent (all corresponding means are equal)
/// H_A : the effect is present
///
/// Each p-value is the probability of observing a ratio of "between" to "within" group
/// variability at least as large as the F statistic if H_0 were true.
/// H_0 is rejected when the p-value is below <paramref name="significance_level"/>.
///
/// The interaction sum of squares is obtained as SS(cells) - SS(factor 1) - SS(factor 2).
/// This partition is exact for balanced designs (equal observations per cell); for
/// unbalanced data it is only an approximation.
/// </summary>
/// <param name="sample">All observations.</param>
/// <param name="grpCat1">Level of the first factor for each observation.</param>
/// <param name="grpCat2">Level of the second factor for each observation.</param>
/// <param name="output">Receives the two-way ANOVA table and the three test decisions.</param>
/// <param name="significance_level">Threshold below which a p-value leads to rejecting the corresponding H_0.</param>
public static void RunANOVA(double[] sample, int[] grpCat1, int[] grpCat2, out TwoWayANOVA output, double significance_level = 0.05)
{
    output = new TwoWayANOVA();

    // Observations bucketed by factor 1, by factor 2, and by cell (factor 1 x factor 2).
    Dictionary<int, List<double>> grpSample1 = new Dictionary<int, List<double>>();
    Dictionary<int, List<double>> grpSample2 = new Dictionary<int, List<double>>();
    Dictionary<string, List<double>> grpSample12 = new Dictionary<string, List<double>>();

    int N = sample.Length;
    for (int i = 0; i < N; ++i)
    {
        double sampleVal = sample[i];
        int grpId1 = grpCat1[i];
        int grpId2 = grpCat2[i];
        string grpId12 = string.Format("{0};{1}", grpId1, grpId2);

        List<double> grp1;
        if (!grpSample1.TryGetValue(grpId1, out grp1))
        {
            grp1 = new List<double>();
            grpSample1[grpId1] = grp1;
        }

        List<double> grp2;
        if (!grpSample2.TryGetValue(grpId2, out grp2))
        {
            grp2 = new List<double>();
            grpSample2[grpId2] = grp2;
        }

        List<double> grp12;
        if (!grpSample12.TryGetValue(grpId12, out grp12))
        {
            grp12 = new List<double>();
            grpSample12[grpId12] = grp12;
        }

        grp1.Add(sampleVal);
        grp2.Add(sampleVal);
        grp12.Add(sampleVal);
    }

    // Sums of squares measure variability.
    double grand_mean;
    output.SST = GetSST(sample, out grand_mean);     // total
    output.SSG1 = GetSSG(grpSample1, grand_mean);    // grpCat1: variability explained by factor 1
    output.SSG2 = GetSSG(grpSample2, grand_mean);    // grpCat2: variability explained by factor 2

    // The between-cell sum of squares already contains both main effects, so the interaction
    // is what remains after removing them. Using the raw cell value as the interaction would
    // count the main effects twice and could drive the error sum of squares negative.
    double ssCells = GetSSG(grpSample12, grand_mean);
    output.SSG12 = ssCells - output.SSG1 - output.SSG2; // grpCat1 * grpCat2: interaction
    output.SSE = output.SST - ssCells;                  // error: variability within the cells

    // Degrees of freedom.
    output.dfT = sample.Length - 1;    // total
    output.dfG1 = grpSample1.Count - 1; // grpCat1
    output.dfG2 = grpSample2.Count - 1; // grpCat2
    // grpCat1 * grpCat2: cell degrees of freedom minus the main effects,
    // which equals (a - 1)(b - 1) when every combination of levels occurs.
    output.dfG12 = (grpSample12.Count - 1) - (grpSample1.Count - 1) - (grpSample2.Count - 1);
    output.dfE = output.dfT - output.dfG1 - output.dfG2 - output.dfG12; // error: N - number of cells

    // Mean squares: sums of squares scaled by their degrees of freedom.
    output.MSG1 = output.SSG1 / output.dfG1;    // grpCat1: between-group variability
    output.MSG2 = output.SSG2 / output.dfG2;    // grpCat2: between-group variability
    output.MSG12 = output.SSG12 / output.dfG12; // grpCat1 * grpCat2: interaction variability
    output.MSE = output.SSE / output.dfE;       // within-cell variability

    // F statistics: ratio of each effect's variability to the within-cell variability.
    output.F1 = output.MSG1 / output.MSE;
    output.F2 = output.MSG2 / output.MSE;
    output.F12 = output.MSG12 / output.MSE;

    // p-values: upper-tail probability of each F statistic under H_0.
    output.pValue1 = 1 - FDistribution.GetPercentile(output.F1, output.dfG1, output.dfE);
    output.pValue2 = 1 - FDistribution.GetPercentile(output.F2, output.dfG2, output.dfE);
    output.pValue12 = 1 - FDistribution.GetPercentile(output.F12, output.dfG12, output.dfE);

    output.RejectH0_Var1 = output.pValue1 < significance_level;
    output.RejectH0_Var2 = output.pValue2 < significance_level;
    output.RejectH0_Interaction = output.pValue12 < significance_level;
}