Beispiel #1
0
        private void initialize(double[][] samples)
        {
            DFb = groupCount - 1;
            DFw = totalSize - groupCount;
            DFt = totalSize - 1;

            // Step 1. Calculate the mean within each group
            means = Statistics.Tools.Mean(samples, 1);

            // Step 2. Calculate the overall mean
            totalMean = Statistics.Tools.GrandMean(means, sizes);


            // Step 3. Calculate the "between-group" sum of squares
            for (int i = 0; i < samples.Length; i++)
            {
                //  between-group sum of squares
                double u = (means[i] - totalMean);
                SSb += sizes[i] * u * u;
            }


            // Step 4. Calculate the "within-group" sum of squares
            for (int i = 0; i < samples.Length; i++)
            {
                for (int j = 0; j < samples[i].Length; j++)
                {
                    double u = samples[i][j] - means[i];
                    SSw += u * u;
                }
            }

            SSt = SSb + SSw; // total sum of squares


            // Step 5. Calculate the F statistic
            MSb = SSb / DFb; // between-group mean square
            MSw = SSw / DFw; // within-group mean square
            FTest = new FTest(MSb / MSw, DFb, DFw);


            // Step 6. Create the ANOVA table
            List<AnovaVariationSource> table = new List<AnovaVariationSource>();
            table.Add(new AnovaVariationSource(this, "Between-Groups", SSb, DFb, FTest));
            table.Add(new AnovaVariationSource(this, "Within-Groups", SSw, DFw, null));
            table.Add(new AnovaVariationSource(this, "Total", SSt, DFt, null));
            this.Table = new AnovaSourceCollection(table);
        }
Beispiel #2
0
        private void compute(double[][] x, double[] y)
        {
            int n = x.Length;
            int p = NumberOfInputs;

            SSt             = 0;
            SSe             = 0;
            outputMean      = 0.0;
            NumberOfSamples = x.Length;

            // Compute the regression
            OrdinaryLeastSquares.Token = Token;
            regression        = OrdinaryLeastSquares.Learn(x, y);
            informationMatrix = OrdinaryLeastSquares.GetInformationMatrix();

            // Calculate mean of the expected outputs
            outputMean = y.Mean();

            // Calculate actual outputs (results)
#pragma warning disable 612, 618
            results = regression.Transform(x);

            // Calculate SSe and SSt
            for (int i = 0; i < x.Length; i++)
            {
                double d;
                d    = y[i] - results[i];
                SSe += d * d;

                d    = y[i] - outputMean;
                SSt += d * d;
            }


            // Calculate SSr
            SSr = SSt - SSe;

            // Calculate R-Squared
            rSquared = (SSt != 0) ? 1.0 - (SSe / SSt) : 1.0;

            // Calculated Adjusted R-Squared
            if (rSquared == 1)
            {
                rAdjusted = 1;
            }
            else
            {
                if (n - p == 1)
                {
                    rAdjusted = double.NaN;
                }
                else
                {
                    rAdjusted = 1.0 - (1.0 - rSquared) * ((n - 1.0) / (n - p - 1.0));
                }
            }

            // Calculate Degrees of Freedom
            DFr = p;
            DFe = n - (p + 1);
            DFt = DFr + DFe;

            // Calculate Sum of Squares Mean
            MSe = SSe / DFe;
            MSr = SSr / DFr;
            MSt = SSt / DFt;

            // Calculate the F statistic
            ftest    = new FTest(MSr / MSe, DFr, DFe);
            stdError = Math.Sqrt(MSe);

            // Create the ANOVA table
            List <AnovaVariationSource> table = new List <AnovaVariationSource>();
            table.Add(new AnovaVariationSource(this, "Regression", SSr, DFr, MSr, ftest));
            table.Add(new AnovaVariationSource(this, "Error", SSe, DFe, MSe, null));
            table.Add(new AnovaVariationSource(this, "Total", SSt, DFt, MSt, null));
            this.anovaTable = new AnovaSourceCollection(table);

            // Compute coefficient standard errors;
            standardErrors = new double[NumberOfInputs + 1];
            for (int i = 0; i < informationMatrix.Length; i++)
            {
                standardErrors[i] = Math.Sqrt(MSe * informationMatrix[i][i]);
            }

            // Compute coefficient tests
            for (int i = 0; i < CoefficientValues.Length; i++)
            {
                double tStatistic = CoefficientValues[i] / standardErrors[i];

                ttests[i] = new TTest(estimatedValue: CoefficientValues[i],
                                      standardError: standardErrors[i], degreesOfFreedom: DFe);

                ftests[i] = new FTest(tStatistic * tStatistic, 1, DFe);

                confidences[i] = ttests[i].GetConfidenceInterval(confidencePercent);
            }


            // Compute model performance tests
            ttest         = new TTest(results, outputMean);
            ztest         = new ZTest(results, outputMean);
            chiSquareTest = new ChiSquareTest(y, results, n - p - 1);
#pragma warning restore 612, 618
        }
        public void gh_937()
        {
            #region doc_learn_database
            // Note: this example uses a System.Data.DataTable to represent input data,
            // but note that this is not required. The data could have been represented
            // as jagged double matrices (double[][]) directly.

            // If you have to handle heterogeneus data in your application, such as user records
            // in a database, this data is best represented within the framework using a .NET's
            // DataTable object. In order to try to learn a classification or regression model
            // using this datatable, first we will need to convert the table into a representation
            // that the machine learning model can understand. Such representation is quite often,
            // a matrix of doubles (double[][]).
            var data = new DataTable("Customer Revenue Example");

            data.Columns.Add("Day", "CustomerId", "Time (hour)", "Weather", "Revenue");
            data.Rows.Add("D1", 0, 8, "Sunny", 101.2);
            data.Rows.Add("D2", 1, 10, "Sunny", 24.1);
            data.Rows.Add("D3", 2, 10, "Rain", 107);
            data.Rows.Add("D4", 3, 16, "Rain", 223);
            data.Rows.Add("D5", 4, 15, "Rain", 1);
            data.Rows.Add("D6", 5, 20, "Rain", 42);
            data.Rows.Add("D7", 6, 12, "Cloudy", 123);
            data.Rows.Add("D8", 7, 12, "Sunny", 64);

            // One way to perform this conversion is by using a Codification filter. The Codification
            // filter can take care of converting variables that actually denote symbols (i.e. the
            // weather in the example above) into representations that make more sense given the assumption
            // of a real vector-based classifier.

            // Create a codification codebook
            var codebook = new Codification()
            {
                { "Weather", CodificationVariable.Categorical },
                { "Time (hour)", CodificationVariable.Continuous },
                { "Revenue", CodificationVariable.Continuous },
            };

            // Learn from the data
            codebook.Learn(data);

            // Now, we will use the codebook to transform the DataTable into double[][] vectors. Due
            // the way the conversion works, we can end up with more columns in your output vectors
            // than the ones started with. If you would like more details about what those columns
            // represent, you can pass then as 'out' parameters in the methods that follow below.
            string[] inputNames;  // (note: if you do not want to run this example yourself, you
            string   outputName;  // can see below the new variable names that will be generated)

            // Now, we can translate our training data into integer symbols using our codebook:
            double[][] inputs  = codebook.Apply(data, "Weather", "Time (hour)").ToJagged(out inputNames);
            double[]   outputs = codebook.Apply(data, "Revenue").ToVector(out outputName);
            // (note: the Apply method transform a DataTable into another DataTable containing the codified
            //  variables. The ToJagged and ToVector methods are then used to transform those tables into
            //  double[][] matrices and double[] vectors, respectively.

            // If we would like to learn a linear regression model for this data, there are two possible
            // ways depending on which aspect of the linear regression we are interested the most. If we
            // are interested in interpreting the linear regression, performing hypothesis tests with the
            // coefficients and performing an actual _linear regression analysis_, then we can use the
            // MultipleLinearRegressionAnalysis class for this. If however we are only interested in using
            // the learned model directly to predict new values for the dataset, then we could be using the
            // MultipleLinearRegression and OrdinaryLeastSquares classes directly instead.

            // This example deals with the former case. For the later, please see the documentation page
            // for the MultipleLinearRegression class.

            // We can create a new multiple linear analysis for the variables
            var mlra = new MultipleLinearRegressionAnalysis(intercept: true)
            {
                // We can also inform the names of the new variables that have been created by the
                // codification filter. Those can help in the visualizing the analysis once it is
                // data-bound to a visual control such a Windows.Forms.DataGridView or WPF DataGrid:

                Inputs = inputNames, // will be { "Weather: Sunny", "Weather: Rain, "Weather: Cloudy", "Time (hours)" }
                Output = outputName  // will be "Revenue"
            };

            // To overcome linear dependency errors
            mlra.OrdinaryLeastSquares.IsRobust = true;

            // Compute the analysis and obtain the estimated regression
            MultipleLinearRegression regression = mlra.Learn(inputs, outputs);

            // And then predict the label using
            double predicted = mlra.Transform(inputs[0]); // result will be ~72.3

            // Because we opted for doing a MultipleLinearRegressionAnalysis instead of a simple
            // linear regression, we will have further information about the regression available:
            int    inputCount       = mlra.NumberOfInputs;    // should be 4
            int    outputCount      = mlra.NumberOfOutputs;   // should be 1
            double r2               = mlra.RSquared;          // should be 0.12801838425195311
            AnovaSourceCollection a = mlra.Table;             // ANOVA table (bind to a visual control for quick inspection)
            double[][]            h = mlra.InformationMatrix; // should contain Fisher's information matrix for the problem
            ZTest z = mlra.ZTest;                             // should be 0 (p=0.999, non-significant)
            #endregion

            Assert.AreEqual(72.279574468085144d, predicted, 1e-8);
            Assert.AreEqual(4, inputCount, 1e-8);
            Assert.AreEqual(1, outputCount, 1e-8);
            Assert.AreEqual(0.12801838425195311, r2, 1e-8);
            Assert.AreEqual(0.11010987669344097, a[0].Statistic, 1e-8);

            string     str       = h.ToCSharp();
            double[][] expectedH = new double[][]
            {
                new double[] { 0.442293243337911, -0.069833718526197, -0.228692384542512, -0.0141758263063635, 0.143767140269202 },
                new double[] { -0.0698337185261971, 0.717811616891116, -0.112258662892007, -0.0655549422852099, 0.535719235472913 },
                new double[] { -0.228692384542512, -0.112258662892007, 0.717434922237013, -0.0232803210243207, 0.376483874802496 },
                new double[] { -0.0141758263063635, -0.0655549422852099, -0.0232803210243207, 0.0370082984668314, -0.103011089615894 },
                new double[] { 0.143767140269202, 0.535719235472913, 0.376483874802496, -0.103011089615894, 1.05597025054461 }
            };

            Assert.IsTrue(expectedH.IsEqual(h, 1e-8));
            Assert.AreEqual(0, z.Statistic, 1e-8);
            Assert.AreEqual(1, z.PValue, 1e-8);
        }
        /// <summary>
        ///   Computes the Multiple Linear Regression Analysis.
        /// </summary>
        ///
        public void Compute()
        {
            int n = inputData.Length;
            int p = inputCount;

            SSt = SSe = outputMean = 0.0;

            // Compute the regression
            double[,] informationMatrix;
            regression.Regress(inputData, outputData,
                               out informationMatrix);


            // Calculate mean of the expected outputs
            for (int i = 0; i < outputData.Length; i++)
            {
                outputMean += outputData[i];
            }
            outputMean /= outputData.Length;

            // Calculate actual outputs (results)
            results = new double[inputData.Length];
            for (int i = 0; i < inputData.Length; i++)
            {
                results[i] = regression.Compute(inputData[i]);
            }

            // Calculate SSe and SSt
            for (int i = 0; i < inputData.Length; i++)
            {
                double d;

                d    = outputData[i] - results[i];
                SSe += d * d;

                d    = outputData[i] - outputMean;
                SSt += d * d;
            }

            // Calculate SSr
            SSr = SSt - SSe;

            // Calculate R-Squared
            rSquared = (SSt != 0) ? 1.0 - (SSe / SSt) : 1.0;

            // Calculated Adjusted R-Squared
            if (rSquared == 1)
            {
                rAdjusted = 1;
            }
            else
            {
                if (n - p == 1)
                {
                    rAdjusted = double.NaN;
                }
                else
                {
                    rAdjusted = 1.0 - (1.0 - rSquared) * ((n - 1.0) / (n - p - 1.0));
                }
            }

            // Calculate Degrees of Freedom
            DFr = p;
            DFe = n - (p + 1);
            DFt = DFr + DFe;

            // Calculate Sum of Squares Mean
            MSe = SSe / DFe;
            MSr = SSr / DFr;
            MSt = SSt / DFt;

            // Calculate the F statistic
            ftest    = new FTest(MSr / MSe, DFr, DFe);
            stdError = Math.Sqrt(MSe);


            // Create the ANOVA table
            List <AnovaVariationSource> table = new List <AnovaVariationSource>();

            table.Add(new AnovaVariationSource(this, "Regression", SSr, DFr, MSr, ftest));
            table.Add(new AnovaVariationSource(this, "Error", SSe, DFe, MSe, null));
            table.Add(new AnovaVariationSource(this, "Total", SSt, DFt, MSt, null));
            this.anovaTable = new AnovaSourceCollection(table);


            // Compute coefficient standard errors;
            standardErrors = new double[coefficientCount];
            for (int i = 0; i < standardErrors.Length; i++)
            {
                standardErrors[i] = Math.Sqrt(MSe * informationMatrix[i, i]);
            }


            // Compute coefficient tests
            for (int i = 0; i < regression.Coefficients.Length; i++)
            {
                double tStatistic = regression.Coefficients[i] / standardErrors[i];

                ttests[i] = new TTest(estimatedValue: regression.Coefficients[i],
                                      standardError: standardErrors[i], degreesOfFreedom: DFe);

                ftests[i] = new FTest(tStatistic * tStatistic, 1, DFe);

                confidences[i] = ttests[i].GetConfidenceInterval(confidencePercent);
            }


            // Compute model performance tests
            ttest         = new TTest(results, outputMean);
            ztest         = new ZTest(results, outputMean);
            chiSquareTest = new ChiSquareTest(outputData, results, n - p - 1);
        }
        private void initialize(double[][][] samples, TwoWayAnovaModel type)
        {
            // References:
            // -  http://www.smi.hst.aau.dk/~cdahl/BiostatPhD/ANOVA.pdf

            ModelType = type;
            Observations = FirstFactorSamples * SecondFactorSamples * Replications;

            // Step 1. Initialize all degrees of freedom
            int cellDegreesOfFreedom = FirstFactorSamples * SecondFactorSamples - 1;
            int aDegreesOfFreedom = FirstFactorSamples - 1;
            int bDegreesOfFreedom = SecondFactorSamples - 1;
            int abDegreesOfFreedom = cellDegreesOfFreedom - aDegreesOfFreedom - bDegreesOfFreedom;
            int errorDegreesOfFreedom = FirstFactorSamples * SecondFactorSamples * (Replications - 1);
            int totalDegreesOfFreedom = Observations - 1;

            // Step 1. Calculate cell means
            cellMeans = new double[FirstFactorSamples, SecondFactorSamples];

            double sum = 0;
            for (int i = 0; i < samples.Length; i++)
                for (int j = 0; j < samples[i].Length; j++)
                    sum += cellMeans[i, j] = Statistics.Tools.Mean(samples[i][j]);

            // Step 2. Calculate the total mean (grand mean)
            totalMean = sum / (FirstFactorSamples * SecondFactorSamples);

            // Step 3. Calculate factor means
            aMean = new double[FirstFactorSamples];
            for (int i = 0; i < samples.Length; i++)
            {
                sum = 0;
                for (int j = 0; j < samples[i].Length; j++)
                    for (int k = 0; k < samples[i][j].Length; k++)
                        sum += samples[i][j][k];

                aMean[i] = sum / (SecondFactorSamples * Replications);
            }

            bMean = new double[SecondFactorSamples];
            for (int j = 0; j < samples[0].Length; j++)
            {
                sum = 0;
                for (int i = 0; i < samples.Length; i++)
                    for (int k = 0; k < samples[i][j].Length; k++)
                        sum += samples[i][j][k];

                bMean[j] = sum / (FirstFactorSamples * Replications);
            }

            // Step 4. Calculate total sum of squares
            double ssum = 0;
            for (int i = 0; i < samples.Length; i++)
            {
                for (int j = 0; j < samples[i].Length; j++)
                {
                    for (int k = 0; k < samples[i][j].Length; k++)
                    {
                        double u = samples[i][j][k] - totalMean;
                        ssum += u * u;
                    }
                }
            }
            double totalSumOfSquares = ssum;

            // Step 5. Calculate the cell sum of squares
            ssum = 0;
            for (int i = 0; i < FirstFactorSamples; i++)
            {
                for (int j = 0; j < SecondFactorSamples; j++)
                {
                    double u = cellMeans[i, j] - totalMean;
                    ssum += u * u;
                }
            }
            double cellSumOfSquares = ssum * Replications;

            // Step 6. Compute within-cells error sum of squares
            ssum = 0;
            for (int i = 0; i < samples.Length; i++)
            {
                for (int j = 0; j < samples[i].Length; j++)
                {
                    for (int k = 0; k < samples[i][j].Length; k++)
                    {
                        double u = samples[i][j][k] - cellMeans[i, j];
                        ssum += u * u;
                    }
                }
            }
            double errorSumOfSquares = ssum;

            // Step 7. Compute factors sum of squares
            ssum = 0;
            for (int i = 0; i < aMean.Length; i++)
            {
                double u = aMean[i] - totalMean;
                ssum += u * u;
            }
            double aSumOfSquares = ssum * SecondFactorSamples * Replications;

            ssum = 0;
            for (int i = 0; i < bMean.Length; i++)
            {
                double u = bMean[i] - totalMean;
                ssum += u * u;
            }
            double bSumOfSquares = ssum * FirstFactorSamples * Replications;

            // Step 9. Compute interaction sum of squares
            double abSumOfSquares = cellSumOfSquares - aSumOfSquares - bSumOfSquares;

            // Step 10. Compute mean squares
            double aMeanSquares = aSumOfSquares / aDegreesOfFreedom;
            double bMeanSquares = bSumOfSquares / bDegreesOfFreedom;
            double abMeanSquares = abSumOfSquares / abDegreesOfFreedom;
            double errorMeanSquares = errorSumOfSquares / errorDegreesOfFreedom;

            // Step 10. Create the F-Statistics
            FTest aSignificance, bSignificance, abSignificance;

            if (type == TwoWayAnovaModel.Fixed)
            {
                // Model 1: Factors A and B fixed
                aSignificance = new FTest(aMeanSquares / abMeanSquares, aDegreesOfFreedom, abDegreesOfFreedom);
                bSignificance = new FTest(bMeanSquares / abMeanSquares, bDegreesOfFreedom, abDegreesOfFreedom);
                abSignificance = new FTest(abMeanSquares / errorMeanSquares, abDegreesOfFreedom, errorDegreesOfFreedom);
            }
            else if (type == TwoWayAnovaModel.Mixed)
            {
                // Model 2: Factors A and B random
                aSignificance = new FTest(aMeanSquares / errorMeanSquares, aDegreesOfFreedom, errorDegreesOfFreedom);
                bSignificance = new FTest(bMeanSquares / errorMeanSquares, bDegreesOfFreedom, errorDegreesOfFreedom);
                abSignificance = new FTest(abMeanSquares / errorMeanSquares, abDegreesOfFreedom, errorDegreesOfFreedom);
            }
            else if (type == TwoWayAnovaModel.Random)
            {
                // Model 3: Factor A fixed, factor B random
                aSignificance = new FTest(aMeanSquares / abMeanSquares, aDegreesOfFreedom, abDegreesOfFreedom);
                bSignificance = new FTest(bMeanSquares / errorMeanSquares, bDegreesOfFreedom, errorDegreesOfFreedom);
                abSignificance = new FTest(abMeanSquares / errorMeanSquares, abDegreesOfFreedom, errorDegreesOfFreedom);
            }
            else throw new ArgumentException("Unhandled analysis type.","type");

            // Step 11. Create the ANOVA table and sources
            AnovaVariationSource cell  = new AnovaVariationSource(this, "Cells", cellSumOfSquares, cellDegreesOfFreedom);
            AnovaVariationSource a     = new AnovaVariationSource(this, "Factor A", aSumOfSquares, aDegreesOfFreedom, aMeanSquares, aSignificance);
            AnovaVariationSource b     = new AnovaVariationSource(this, "Factor B", bSumOfSquares, bDegreesOfFreedom, bMeanSquares, bSignificance);
            AnovaVariationSource ab    = new AnovaVariationSource(this, "Interaction AxB", abSumOfSquares, abDegreesOfFreedom, abMeanSquares, abSignificance);
            AnovaVariationSource error = new AnovaVariationSource(this, "Within-cells (error)", errorSumOfSquares, errorDegreesOfFreedom, errorMeanSquares);
            AnovaVariationSource total = new AnovaVariationSource(this, "Total", totalSumOfSquares, totalDegreesOfFreedom);

            this.Sources = new TwoWayAnovaVariationSources()
            {
                Cells = cell,
                FactorA = a,
                FactorB = b,
                Interaction = ab,
                Error = error,
                Total = total
            };

            this.Table = new AnovaSourceCollection(cell, a, b, ab, error, total);
        }