/// <summary>
        /// Tests Wilks' lambda for significance using Rao's F approximation.
        ///
        /// Wilks' lambda is computed as det(E) / det(H + E) and converted into an approximate
        /// F statistic with DF1 = p * (g - 1) and DF2 = a * b - c, where
        ///   a = N - g - (p - g + 2) / 2
        ///   b = sqrt((p^2 * (g - 1)^2 - 4) / (p^2 + (g - 1)^2 - 5)) when that denominator is positive, otherwise 1
        ///   c = (p * (g - 1) - 2) / 2
        /// </summary>
        /// <param name="H">Hypothesis Sum of Squares and Cross Products.</param>
        /// <param name="E">Error Sum of Squares and Cross Products</param>
        /// <param name="N">number of data points</param>
        /// <param name="p">number of variables</param>
        /// <param name="g">number of groups</param>
        /// <param name="F_crit">Receives the approximate F statistic derived from Wilks' lambda</param>
        /// <returns>The p-value, computed as 1 - FDistribution.GetPercentile(F_crit, DF1, DF2)</returns>
        public static double GetWilksLambda(double[][] H, double[][] E, int N, int p, int g, out double F_crit)
        {
            // Wilks' lambda: ratio of the error determinant to the total (hypothesis + error) determinant.
            double errorDet = MatrixOp.GetDeterminant(E);
            double totalDet = MatrixOp.GetDeterminant(Add(H, E));
            double lambda   = errorDet / totalDet;

            int hypothesisDf = g - 1;

            double a = N - g - (p - g + 2) / 2.0;
            double c = (p * hypothesisDf - 2) / 2.0;

            // b falls back to 1 whenever p^2 + (g - 1)^2 - 5 is not positive.
            int    bDenominator = p * p + hypothesisDf * hypothesisDf - 5;
            double b            = 1;

            if (bDenominator > 0)
            {
                // The numerator is p^2 * (g - 1)^2 - 4; the cast forces floating-point division so the
                // ratio is not truncated to an integer before the square root is taken.
                int bNumerator = p * p * hypothesisDf * hypothesisDf - 4;
                b = System.Math.Sqrt(bNumerator / (double)bDenominator);
            }

            double DF1 = p * hypothesisDf;
            double DF2 = a * b - c;

            // lambda^(1/b) appears in both the numerator and the denominator of the statistic.
            double lambdaRoot = System.Math.Pow(lambda, 1 / b);

            F_crit = ((1 - lambdaRoot) / lambdaRoot) * (DF2 / DF1);

            // Upper-tail probability of the approximate F statistic.
            double pValue = 1 - FDistribution.GetPercentile(F_crit, DF1, DF2);

            return(pValue);
        }
Пример #2
0
        /// <summary>
        /// Hypothesis testing for more than two classes using one-way ANOVA.
        ///
        /// Given that:
        /// H_0 : mu_1 = mu_2 = ... mu_k (where k is the number of classes)
        /// H_A : at least one group mean differs
        ///
        /// The p-value is the probability of observing a ratio between the "between" and "within" group
        /// variability at least as large as F if in fact the means of all groups are equal.
        /// H_0 is rejected when the p-value is below significance_level.
        /// </summary>
        /// <param name="totalSample">All observations, in the same order as grpCat</param>
        /// <param name="grpCat">Group id of each observation; grpCat[i] is read for every index i of totalSample</param>
        /// <param name="output">Receives the ANOVA table (sums of squares, degrees of freedom, mean squares, F, p-value, intercepts, RejectH0)</param>
        /// <param name="significance_level">Threshold the p-value is compared against (alpha)</param>
        /// <returns>True if H_0 is rejected; False if H_0 is failed to be rejected (including when the p-value could not be computed)</returns>
        public static bool RunANOVA(double[] totalSample, int[] grpCat, out ANOVA output, double significance_level = 0.05)
        {
            output = new ANOVA();

            // Bucket the observations by group id.
            Dictionary <int, List <double> > groupedSample = new Dictionary <int, List <double> >();

            for (int i = 0; i < totalSample.Length; ++i)
            {
                int           grpId = grpCat[i];
                List <double> grp;
                if (!groupedSample.TryGetValue(grpId, out grp))
                {
                    grp = new List <double>();
                    groupedSample[grpId] = grp;
                }
                grp.Add(totalSample[i]);
            }

            double grand_mean;

            //Sum of squares measures the total variability
            output.SST = GetSST(totalSample, out grand_mean); //sum of squares total
            output.SSG = GetSSG(groupedSample, grand_mean);   //sum of squares group: variability explained by the group variable
            output.SSE = output.SST - output.SSG;             //sum of squares error: variability not explained by the group variable

            //Degrees of freedom
            output.dfT = totalSample.Length - 1;  //degrees of freedom total
            output.dfG = groupedSample.Count - 1; //degrees of freedom group
            output.dfE = output.dfT - output.dfG; //degrees of freedom error

            //Mean squares: sum of squares scaled by the associated degrees of freedom
            output.MSG = output.SSG / output.dfG; //mean squares group : between group variability
            output.MSE = output.SSE / output.dfE; //mean squares error : within group variability

            output.Intercepts = GetIntercepts(GetMeanWithinGroup(groupedSample));

            //f statistic: ratio of the between group variability and within group variability
            output.F = output.MSG / output.MSE;

            try
            {
                //upper-tail probability of F under H_0
                output.pValue = 1 - FDistribution.GetPercentile(output.F, output.dfG, output.dfE);
            }
            catch
            {
                // The F distribution could not be evaluated. Mark the p-value as unavailable instead of
                // keeping the value a fresh ANOVA carries (presumably 0), which the comparison below
                // would read as a significant result. NaN compares false, so H_0 is not rejected.
                output.pValue = double.NaN;
            }

            return(output.RejectH0 = output.pValue < significance_level);
        }
Пример #3
0
        /// <summary>
        /// Suppose the regression is given by y = b * x + intercept[j], where j is the group id (in other words, b is the fixed effect, intercept is the random effect)
        /// Run the ANCOVA which calculates the following:
        ///  1. the slope, b, of y = b * x + intercept[j]
        ///  2. the intercept, of y = b * x + intercept[j]
        /// It also fills in the total / between-group / within-group sums of squares for x and y, the
        /// covariate-adjusted sums of squares and mean squares for y, the F statistic and its p-value.
        /// </summary>
        /// <param name="x">data for the predictor variable</param>
        /// <param name="y">data for the response variable; y[i] is read for every index i of x</param>
        /// <param name="grpCat">group id for each (x, y)</param>
        /// <param name="output">the result of ANCOVA</param>
        /// <param name="significance_level">alpha for the hypothesis testing, in which H_0 : y - b * x is independent of group category</param>
        public static void RunANCOVA(double[] x, double[] y, int[] grpCat, out ANCOVA output, double significance_level = 0.05)
        {
            output = new ANCOVA();

            // x and y values bucketed by group id; both dictionaries always hold the same keys.
            Dictionary <int, List <double> > groupped_x = new Dictionary <int, List <double> >();
            Dictionary <int, List <double> > groupped_y = new Dictionary <int, List <double> >();

            int N = x.Length;

            for (int i = 0; i < N; ++i)
            {
                int grpId = grpCat[i];

                List <double> group_x;
                if (!groupped_x.TryGetValue(grpId, out group_x))
                {
                    group_x           = new List <double>();
                    groupped_x[grpId] = group_x;
                }

                List <double> group_y;
                if (!groupped_y.TryGetValue(grpId, out group_y))
                {
                    group_y           = new List <double>();
                    groupped_y[grpId] = group_y;
                }

                group_x.Add(x[i]);
                group_y.Add(y[i]);
            }

            double grand_mean_x;
            double grand_mean_y;

            // Total sums of squares (the grand means come back through the out parameters).
            output.SSTx = GetSST(x, out grand_mean_x);
            output.SSTy = GetSST(y, out grand_mean_y);

            // Between-group sums of squares.
            output.SSBGx = GetSSG(groupped_x, grand_mean_x);
            output.SSBGy = GetSSG(groupped_y, grand_mean_y);

            // Within-group sums of squares: total minus between-group.
            output.SSWGy = output.SSTy - output.SSBGy;
            output.SSWGx = output.SSTx - output.SSBGx;

            // Total and within-group co-variation of x and y.
            output.SCT  = GetCovariance(x, y);
            output.SCWG = GetCovarianceWithinGroup(groupped_x, groupped_y);

            // Overall and within-group correlation between x and y.
            output.rT  = output.SCT / System.Math.Sqrt(output.SSTx * output.SSTy);
            output.rWG = output.SCWG / System.Math.Sqrt(output.SSWGx * output.SSWGy);

            // Remove from the y sums of squares the part accounted for by the covariate x.
            output.SSTy_adj  = output.SSTy - System.Math.Pow(output.SCT, 2) / output.SSTx;
            output.SSWGy_adj = output.SSWGy - System.Math.Pow(output.SCWG, 2) / output.SSWGx;
            output.SSBGy_adj = output.SSTy_adj - output.SSWGy_adj;

            // Degrees of freedom; total and within-group each lose one more for the covariate.
            output.dfT  = N - 2;
            output.dfBG = groupped_x.Count - 1;
            output.dfWG = N - groupped_x.Count - 1;

            output.MSBGy_adj = output.SSBGy_adj / output.dfBG;
            output.MSWGy_adj = output.SSWGy_adj / output.dfWG;

            // Common within-group slope b.
            output.Slope = output.SCWG / output.SSWGx;

            output.MeanWithinGroups_x = GetMeanWithinGroup(groupped_x);
            output.MeanWithinGroups_y = GetMeanWithinGroup(groupped_y);

            output.Intercepts = GetIntercepts(output.MeanWithinGroups_x, output.MeanWithinGroups_y, grand_mean_x, output.Slope);

            // F statistic: adjusted between-group mean square over adjusted within-group mean square.
            output.F = output.MSBGy_adj / output.MSWGy_adj;
            try
            {
                output.pValue = 1 - FDistribution.GetPercentile(output.F, output.dfBG, output.dfWG);
            }
            catch
            {
                // The F distribution could not be evaluated. Mark the p-value as unavailable instead of
                // keeping the value a fresh ANCOVA carries (presumably 0), which the comparison below
                // would read as a significant result. NaN compares false, so H_0 is not rejected.
                output.pValue = double.NaN;
            }

            output.RejectH0 = output.pValue < significance_level;
        }
        /// <summary>
        /// Hypothesis testing for a variable based on two grouping factors using two-way ANOVA.
        ///
        /// Given that (for each of grpCat1, grpCat2, and the interaction grpCat1 * grpCat2):
        /// H_0 : mu_1 = mu_2 = ... mu_k (where k is the number of classes)
        /// H_A : at least one group mean differs
        ///
        /// Each p-value is the probability of observing a ratio between the "between" and "within" group
        /// variability at least as large as the corresponding F if in fact H_0 holds.
        /// H_0 is rejected when the p-value is below significance_level.
        /// </summary>
        /// <param name="sample">All observations, in the same order as grpCat1 and grpCat2</param>
        /// <param name="grpCat1">Level of the first factor for each observation</param>
        /// <param name="grpCat2">Level of the second factor for each observation</param>
        /// <param name="output">Receives the two-way ANOVA table (sums of squares, degrees of freedom, mean squares, F statistics, p-values and rejection flags)</param>
        /// <param name="significance_level">Threshold each p-value is compared against (alpha)</param>
        public static void RunANOVA(double[] sample, int[] grpCat1, int[] grpCat2, out TwoWayANOVA output, double significance_level = 0.05)
        {
            output = new TwoWayANOVA();

            // Observations bucketed by factor 1, by factor 2, and by (factor 1, factor 2) cell.
            Dictionary <int, List <double> >    grpSample1  = new Dictionary <int, List <double> >();
            Dictionary <int, List <double> >    grpSample2  = new Dictionary <int, List <double> >();
            Dictionary <string, List <double> > grpSample12 = new Dictionary <string, List <double> >();
            int N = sample.Length;

            for (int i = 0; i < N; ++i)
            {
                double sampleVal = sample[i];
                int    grpId1    = grpCat1[i];
                int    grpId2    = grpCat2[i];
                string grpId12   = string.Format("{0};{1}", grpId1, grpId2); // key identifying the cell

                List <double> grp1;
                if (!grpSample1.TryGetValue(grpId1, out grp1))
                {
                    grp1 = new List <double>();
                    grpSample1[grpId1] = grp1;
                }

                List <double> grp2;
                if (!grpSample2.TryGetValue(grpId2, out grp2))
                {
                    grp2 = new List <double>();
                    grpSample2[grpId2] = grp2;
                }

                List <double> grp12;
                if (!grpSample12.TryGetValue(grpId12, out grp12))
                {
                    grp12 = new List <double>();
                    grpSample12[grpId12] = grp12;
                }

                grp1.Add(sampleVal);
                grp2.Add(sampleVal);
                grp12.Add(sampleVal);
            }

            double grand_mean;

            //Sum of squares measures the total variability
            output.SST  = GetSST(sample, out grand_mean); //sum of squares total
            output.SSG1 = GetSSG(grpSample1, grand_mean); //grpCat1: variability explained by the first factor
            output.SSG2 = GetSSG(grpSample2, grand_mean); //grpCat2: variability explained by the second factor

            // grpCat1 * grpCat2: the between-cell sum of squares already contains both main effects,
            // so they are removed to leave only the interaction.
            // NOTE(review): assumes GetSSG returns the between-group sum of squares around grand_mean;
            // the decomposition is exact for balanced designs - confirm the intended handling of unbalanced data.
            output.SSG12 = GetSSG(grpSample12, grand_mean) - output.SSG1 - output.SSG2;

            //sum of squares error: what is left once both main effects and the interaction are removed
            //(equal to SST minus the between-cell sum of squares)
            output.SSE = output.SST - output.SSG1 - output.SSG2 - output.SSG12;

            //Degrees of freedom
            output.dfT  = sample.Length - 1;    //degrees of freedom total
            output.dfG1 = grpSample1.Count - 1; //grpCat1: degrees of freedom group
            output.dfG2 = grpSample2.Count - 1; //grpCat2: degrees of freedom group

            //grpCat1 * grpCat2: between-cell degrees of freedom minus those of the two main effects
            //(equals dfG1 * dfG2 when every combination of levels is observed)
            output.dfG12 = (grpSample12.Count - 1) - (grpSample1.Count - 1) - (grpSample2.Count - 1);

            //degrees of freedom error (equals N minus the number of observed cells)
            output.dfE = output.dfT - output.dfG1 - output.dfG2 - output.dfG12;

            //Mean squares: sum of squares scaled by the associated degrees of freedom
            output.MSG1  = output.SSG1 / output.dfG1;   //grpCat1: mean squares group : between group variability
            output.MSG2  = output.SSG2 / output.dfG2;   //grpCat2: mean squares group : between group variability
            output.MSG12 = output.SSG12 / output.dfG12; //grpCat1 * grpCat2: mean squares interaction
            output.MSE   = output.SSE / output.dfE;     //mean squares error : within group variability

            //f statistic: ratio of the between group variability and within group variability
            output.F1  = output.MSG1 / output.MSE;
            output.F2  = output.MSG2 / output.MSE;
            output.F12 = output.MSG12 / output.MSE;

            //upper-tail probability of each F under its H_0
            output.pValue1  = 1 - FDistribution.GetPercentile(output.F1, output.dfG1, output.dfE);
            output.pValue2  = 1 - FDistribution.GetPercentile(output.F2, output.dfG2, output.dfE);
            output.pValue12 = 1 - FDistribution.GetPercentile(output.F12, output.dfG12, output.dfE);

            output.RejectH0_Var1        = output.pValue1 < significance_level;
            output.RejectH0_Var2        = output.pValue2 < significance_level;
            output.RejectH0_Interaction = output.pValue12 < significance_level;
        }