/// <summary>
///   Runs the one-way ANOVA computations for the given groups: degrees of
///   freedom, group and grand means, sums of squares, mean squares, the
///   F-test, and finally the ANOVA table.
/// </summary>
/// <param name="samples">The observations, one inner array per group.</param>
private void initialize(double[][] samples)
{
    // Degrees of freedom: between groups, within groups, and total.
    DFb = groupCount - 1;
    DFw = totalSize - groupCount;
    DFt = totalSize - 1;

    // Mean of each group, followed by the grand mean over all groups.
    means = Statistics.Tools.Mean(samples, 1);
    totalMean = Statistics.Tools.GrandMean(means, sizes);

    // Between-group sum of squares: size-weighted squared distance of each
    // group mean from the grand mean. Note that SSb is accumulated into,
    // not reset, by this method.
    for (int group = 0; group < samples.Length; group++)
    {
        double offset = means[group] - totalMean;
        SSb += sizes[group] * offset * offset;
    }

    // Within-group sum of squares: squared distance of every observation
    // from the mean of its own group. SSw is likewise accumulated into.
    for (int group = 0; group < samples.Length; group++)
    {
        foreach (double observation in samples[group])
        {
            double deviation = observation - means[group];
            SSw += deviation * deviation;
        }
    }

    // The total sum of squares is the sum of both components.
    SSt = SSb + SSw;

    // Mean squares and the F statistic built from their ratio.
    MSb = SSb / DFb;
    MSw = SSw / DFw;
    FTest = new FTest(MSb / MSw, DFb, DFw);

    // Assemble the ANOVA table; only the between-groups row carries a test.
    var rows = new List<AnovaVariationSource>
    {
        new AnovaVariationSource(this, "Between-Groups", SSb, DFb, FTest),
        new AnovaVariationSource(this, "Within-Groups", SSw, DFw, null),
        new AnovaVariationSource(this, "Total", SSt, DFt, null)
    };

    this.Table = new AnovaSourceCollection(rows);
}
/// <summary>
///   Fits the regression through <c>OrdinaryLeastSquares</c> and derives the
///   analysis results from it: sums of squares, R² and adjusted R², degrees
///   of freedom, mean squares, the overall F-test, the ANOVA table, the
///   per-coefficient standard errors and tests, and the model-level tests.
/// </summary>
/// <param name="x">The input vectors, one per sample.</param>
/// <param name="y">The expected output for each sample.</param>
private void compute(double[][] x, double[] y)
{
    int n = x.Length;
    int p = NumberOfInputs;

    // Start from a clean slate before accumulating.
    SSt = 0;
    SSe = 0;
    outputMean = 0.0;
    NumberOfSamples = n;

    // Learn the regression and fetch the information matrix of the fit.
    OrdinaryLeastSquares.Token = Token;
    regression = OrdinaryLeastSquares.Learn(x, y);
    informationMatrix = OrdinaryLeastSquares.GetInformationMatrix();

    // Mean of the expected outputs.
    outputMean = y.Mean();

    // Obsolete-member warnings are silenced from here to the end of the method.
#pragma warning disable 612, 618
    results = regression.Transform(x);

    // Error (residual) and total sums of squares.
    for (int row = 0; row < n; row++)
    {
        double residual = y[row] - results[row];
        SSe += residual * residual;

        double deviation = y[row] - outputMean;
        SSt += deviation * deviation;
    }

    // Regression sum of squares is what remains of the total.
    SSr = SSt - SSe;

    // R²; defined as 1 when the outputs have no variation at all.
    if (SSt != 0)
    {
        rSquared = 1.0 - (SSe / SSt);
    }
    else
    {
        rSquared = 1.0;
    }

    // Adjusted R²; NaN when its denominator (n - p - 1) would be zero.
    if (rSquared == 1)
    {
        rAdjusted = 1;
    }
    else if (n - p == 1)
    {
        rAdjusted = double.NaN;
    }
    else
    {
        rAdjusted = 1.0 - (1.0 - rSquared) * ((n - 1.0) / (n - p - 1.0));
    }

    // Degrees of freedom: regression, error and total.
    DFr = p;
    DFe = n - (p + 1);
    DFt = DFr + DFe;

    // Mean squares.
    MSe = SSe / DFe;
    MSr = SSr / DFr;
    MSt = SSt / DFt;

    // Overall F-test and the standard error of the regression.
    ftest = new FTest(MSr / MSe, DFr, DFe);
    stdError = Math.Sqrt(MSe);

    // ANOVA table; only the regression row carries a test.
    var rows = new List<AnovaVariationSource>
    {
        new AnovaVariationSource(this, "Regression", SSr, DFr, MSr, ftest),
        new AnovaVariationSource(this, "Error", SSe, DFe, MSe, null),
        new AnovaVariationSource(this, "Total", SSt, DFt, MSt, null)
    };
    this.anovaTable = new AnovaSourceCollection(rows);

    // Coefficient standard errors, from the diagonal of the information matrix.
    standardErrors = new double[NumberOfInputs + 1];
    for (int k = 0; k < informationMatrix.Length; k++)
    {
        standardErrors[k] = Math.Sqrt(MSe * informationMatrix[k][k]);
    }

    // Per-coefficient t-test, F-test (the squared t statistic) and
    // confidence interval.
    int coefficientTotal = CoefficientValues.Length;
    for (int k = 0; k < coefficientTotal; k++)
    {
        double estimate = CoefficientValues[k];
        double error = standardErrors[k];
        double t = estimate / error;

        ttests[k] = new TTest(estimatedValue: estimate,
            standardError: error, degreesOfFreedom: DFe);
        ftests[k] = new FTest(t * t, 1, DFe);
        confidences[k] = ttests[k].GetConfidenceInterval(confidencePercent);
    }

    // Model-level tests computed from the fitted outputs.
    ttest = new TTest(results, outputMean);
    ztest = new ZTest(results, outputMean);
    chiSquareTest = new ChiSquareTest(y, results, n - p - 1);
#pragma warning restore 612, 618
}
// Regression test for issue gh_937 that doubles as the documentation sample
// "doc_learn_database": codifies a DataTable, runs a multiple linear
// regression analysis on it and checks the reported values.
public void gh_937()
{
    #region doc_learn_database
    // Note: this example uses a System.Data.DataTable to represent input data,
    // but note that this is not required. The data could have been represented
    // as jagged double matrices (double[][]) directly.

    // If you have to handle heterogeneous data in your application, such as user records
    // in a database, this data is best represented within the framework using a .NET
    // DataTable object. In order to learn a classification or regression model from
    // this DataTable, we first need to convert the table into a representation that
    // the machine learning model can understand. Such a representation is quite often
    // a matrix of doubles (double[][]).
    var data = new DataTable("Customer Revenue Example");

    data.Columns.Add("Day", "CustomerId", "Time (hour)", "Weather", "Revenue");
    data.Rows.Add("D1", 0, 8, "Sunny", 101.2);
    data.Rows.Add("D2", 1, 10, "Sunny", 24.1);
    data.Rows.Add("D3", 2, 10, "Rain", 107);
    data.Rows.Add("D4", 3, 16, "Rain", 223);
    data.Rows.Add("D5", 4, 15, "Rain", 1);
    data.Rows.Add("D6", 5, 20, "Rain", 42);
    data.Rows.Add("D7", 6, 12, "Cloudy", 123);
    data.Rows.Add("D8", 7, 12, "Sunny", 64);

    // One way to perform this conversion is by using a Codification filter. The Codification
    // filter can take care of converting variables that actually denote symbols (i.e. the
    // weather in the example above) into representations that make more sense given the
    // assumption of a real vector-based classifier.

    // Create a codification codebook
    var codebook = new Codification()
    {
        { "Weather", CodificationVariable.Categorical },
        { "Time (hour)", CodificationVariable.Continuous },
        { "Revenue", CodificationVariable.Continuous },
    };

    // Learn from the data
    codebook.Learn(data);

    // Now, we will use the codebook to transform the DataTable into double[][] vectors. Due
    // to the way the conversion works, we can end up with more columns in the output vectors
    // than the ones we started with. If you would like more details about what those columns
    // represent, you can pass them as 'out' parameters in the methods that follow below.
    string[] inputNames; // (note: if you do not want to run this example yourself, you
    string outputName;   // can see below the new variable names that will be generated)

    // Now, we can translate our training data into numeric vectors using our codebook:
    double[][] inputs = codebook.Apply(data, "Weather", "Time (hour)").ToJagged(out inputNames);
    double[] outputs = codebook.Apply(data, "Revenue").ToVector(out outputName);
    // (note: the Apply method transforms a DataTable into another DataTable containing the
    // codified variables. The ToJagged and ToVector methods are then used to transform those
    // tables into double[][] matrices and double[] vectors, respectively.)

    // If we would like to learn a linear regression model for this data, there are two possible
    // ways depending on which aspect of the linear regression we are most interested in. If we
    // are interested in interpreting the linear regression, performing hypothesis tests with the
    // coefficients and performing an actual _linear regression analysis_, then we can use the
    // MultipleLinearRegressionAnalysis class for this. If however we are only interested in using
    // the learned model directly to predict new values for the dataset, then we could be using the
    // MultipleLinearRegression and OrdinaryLeastSquares classes directly instead.

    // This example deals with the former case. For the latter, please see the documentation page
    // for the MultipleLinearRegression class.

    // We can create a new multiple linear analysis for the variables
    var mlra = new MultipleLinearRegressionAnalysis(intercept: true)
    {
        // We can also inform the names of the new variables that have been created by the
        // codification filter. Those can help in visualizing the analysis once it is
        // data-bound to a visual control such as a Windows.Forms.DataGridView or WPF DataGrid:

        // expected to be { "Weather: Sunny", "Weather: Rain", "Weather: Cloudy", "Time (hour)" }
        Inputs = inputNames,
        Output = outputName // will be "Revenue"
    };

    // To overcome linear dependency errors
    mlra.OrdinaryLeastSquares.IsRobust = true;

    // Compute the analysis and obtain the estimated regression
    MultipleLinearRegression regression = mlra.Learn(inputs, outputs);

    // And then predict the output for the first sample using
    double predicted = mlra.Transform(inputs[0]); // result will be ~72.3

    // Because we opted for doing a MultipleLinearRegressionAnalysis instead of a simple
    // linear regression, we will have further information about the regression available:
    int inputCount = mlra.NumberOfInputs;   // should be 4
    int outputCount = mlra.NumberOfOutputs; // should be 1
    double r2 = mlra.RSquared;              // should be 0.12801838425195311
    AnovaSourceCollection a = mlra.Table;   // ANOVA table (bind to a visual control for quick inspection)
    double[][] h = mlra.InformationMatrix;  // should contain Fisher's information matrix for the problem
    ZTest z = mlra.ZTest;                   // statistic should be 0 (p-value of about 1, non-significant)
    #endregion

    Assert.AreEqual(72.279574468085144d, predicted, 1e-8);

    // The counts are integers, so they are compared exactly rather than
    // through the floating-point tolerance overload.
    Assert.AreEqual(4, inputCount);
    Assert.AreEqual(1, outputCount);

    Assert.AreEqual(0.12801838425195311, r2, 1e-8);
    Assert.AreEqual(0.11010987669344097, a[0].Statistic, 1e-8);

    double[][] expectedH = new double[][]
    {
        new double[] { 0.442293243337911, -0.069833718526197, -0.228692384542512, -0.0141758263063635, 0.143767140269202 },
        new double[] { -0.0698337185261971, 0.717811616891116, -0.112258662892007, -0.0655549422852099, 0.535719235472913 },
        new double[] { -0.228692384542512, -0.112258662892007, 0.717434922237013, -0.0232803210243207, 0.376483874802496 },
        new double[] { -0.0141758263063635, -0.0655549422852099, -0.0232803210243207, 0.0370082984668314, -0.103011089615894 },
        new double[] { 0.143767140269202, 0.535719235472913, 0.376483874802496, -0.103011089615894, 1.05597025054461 }
    };

    Assert.IsTrue(expectedH.IsEqual(h, 1e-8));
    Assert.AreEqual(0, z.Statistic, 1e-8);
    Assert.AreEqual(1, z.PValue, 1e-8);
}
/// <summary>
///   Computes the Multiple Linear Regression Analysis.
/// </summary>
///
/// <remarks>
///   Fits the regression on the stored input and output data, then derives
///   the sums of squares, R² and adjusted R², degrees of freedom, mean
///   squares, the overall F-test, the ANOVA table, the per-coefficient
///   standard errors and tests, and the model-level tests.
/// </remarks>
///
public void Compute()
{
    int n = inputData.Length;
    int p = inputCount;

    // Reset the accumulators.
    outputMean = 0.0;
    SSe = 0.0;
    SSt = 0.0;

    // Fit the regression; the fit also yields the information matrix.
    double[,] informationMatrix;
    regression.Regress(inputData, outputData, out informationMatrix);

    // Mean of the expected outputs.
    double outputTotal = 0.0;
    foreach (double expected in outputData)
    {
        outputTotal += expected;
    }
    outputMean = outputTotal / outputData.Length;

    // Outputs produced by the fitted model for every input.
    results = new double[n];
    for (int row = 0; row < n; row++)
    {
        results[row] = regression.Compute(inputData[row]);
    }

    // Error (residual) and total sums of squares.
    for (int row = 0; row < n; row++)
    {
        double residual = outputData[row] - results[row];
        SSe += residual * residual;

        double deviation = outputData[row] - outputMean;
        SSt += deviation * deviation;
    }

    // Regression sum of squares is what remains of the total.
    SSr = SSt - SSe;

    // R²; defined as 1 when the outputs have no variation at all.
    if (SSt != 0)
    {
        rSquared = 1.0 - (SSe / SSt);
    }
    else
    {
        rSquared = 1.0;
    }

    // Adjusted R²; NaN when its denominator (n - p - 1) would be zero.
    if (rSquared == 1)
    {
        rAdjusted = 1;
    }
    else if (n - p == 1)
    {
        rAdjusted = double.NaN;
    }
    else
    {
        rAdjusted = 1.0 - (1.0 - rSquared) * ((n - 1.0) / (n - p - 1.0));
    }

    // Degrees of freedom: regression, error and total.
    DFr = p;
    DFe = n - (p + 1);
    DFt = DFr + DFe;

    // Mean squares.
    MSe = SSe / DFe;
    MSr = SSr / DFr;
    MSt = SSt / DFt;

    // Overall F-test and the standard error of the regression.
    ftest = new FTest(MSr / MSe, DFr, DFe);
    stdError = Math.Sqrt(MSe);

    // ANOVA table; only the regression row carries a test.
    var rows = new List<AnovaVariationSource>
    {
        new AnovaVariationSource(this, "Regression", SSr, DFr, MSr, ftest),
        new AnovaVariationSource(this, "Error", SSe, DFe, MSe, null),
        new AnovaVariationSource(this, "Total", SSt, DFt, MSt, null)
    };
    this.anovaTable = new AnovaSourceCollection(rows);

    // Coefficient standard errors, from the diagonal of the information matrix.
    standardErrors = new double[coefficientCount];
    for (int k = 0; k < standardErrors.Length; k++)
    {
        standardErrors[k] = Math.Sqrt(MSe * informationMatrix[k, k]);
    }

    // Per-coefficient t-test, F-test (the squared t statistic) and
    // confidence interval.
    for (int k = 0; k < regression.Coefficients.Length; k++)
    {
        double estimate = regression.Coefficients[k];
        double error = standardErrors[k];
        double t = estimate / error;

        ttests[k] = new TTest(estimatedValue: estimate,
            standardError: error, degreesOfFreedom: DFe);
        ftests[k] = new FTest(t * t, 1, DFe);
        confidences[k] = ttests[k].GetConfidenceInterval(confidencePercent);
    }

    // Model-level tests computed from the fitted outputs.
    ttest = new TTest(results, outputMean);
    ztest = new ZTest(results, outputMean);
    chiSquareTest = new ChiSquareTest(outputData, results, n - p - 1);
}
/// <summary>
///   Runs the two-way ANOVA computations: degrees of freedom, cell, factor
///   and grand means, the sums of squares, mean squares, the F-tests for
///   both factors and their interaction, and the resulting ANOVA table.
/// </summary>
/// <param name="samples">
///   The observations, indexed as [first factor][second factor][replication].
/// </param>
/// <param name="type">The model type, which selects the F-ratio denominators.</param>
/// <exception cref="ArgumentException">
///   Thrown when <paramref name="type"/> is not one of the handled models.
/// </exception>
private void initialize(double[][][] samples, TwoWayAnovaModel type)
{
    // References:
    // - http://www.smi.hst.aau.dk/~cdahl/BiostatPhD/ANOVA.pdf

    ModelType = type;

    int levelsA = FirstFactorSamples;
    int levelsB = SecondFactorSamples;
    int replications = Replications;

    Observations = levelsA * levelsB * replications;

    // Degrees of freedom for every source of variation.
    int cellDegreesOfFreedom = levelsA * levelsB - 1;
    int aDegreesOfFreedom = levelsA - 1;
    int bDegreesOfFreedom = levelsB - 1;
    int abDegreesOfFreedom = cellDegreesOfFreedom - aDegreesOfFreedom - bDegreesOfFreedom;
    int errorDegreesOfFreedom = levelsA * levelsB * (replications - 1);
    int totalDegreesOfFreedom = Observations - 1;

    // Cell means, and the grand mean as the average of the cell means.
    cellMeans = new double[levelsA, levelsB];
    double cellMeanTotal = 0;
    for (int i = 0; i < samples.Length; i++)
    {
        for (int j = 0; j < samples[i].Length; j++)
        {
            double cellMean = Statistics.Tools.Mean(samples[i][j]);
            cellMeans[i, j] = cellMean;
            cellMeanTotal += cellMean;
        }
    }
    totalMean = cellMeanTotal / (levelsA * levelsB);

    // Mean of every level of the first factor.
    aMean = new double[levelsA];
    for (int i = 0; i < samples.Length; i++)
    {
        double levelTotal = 0;
        for (int j = 0; j < samples[i].Length; j++)
        {
            foreach (double observation in samples[i][j])
            {
                levelTotal += observation;
            }
        }
        aMean[i] = levelTotal / (levelsB * replications);
    }

    // Mean of every level of the second factor.
    bMean = new double[levelsB];
    for (int j = 0; j < samples[0].Length; j++)
    {
        double levelTotal = 0;
        for (int i = 0; i < samples.Length; i++)
        {
            foreach (double observation in samples[i][j])
            {
                levelTotal += observation;
            }
        }
        bMean[j] = levelTotal / (levelsA * replications);
    }

    // Total sum of squares: every observation against the grand mean.
    double totalSumOfSquares = 0;
    for (int i = 0; i < samples.Length; i++)
    {
        for (int j = 0; j < samples[i].Length; j++)
        {
            foreach (double observation in samples[i][j])
            {
                double deviation = observation - totalMean;
                totalSumOfSquares += deviation * deviation;
            }
        }
    }

    // Cell sum of squares: every cell mean against the grand mean,
    // scaled by the number of replications.
    double cellAccumulator = 0;
    foreach (double cellMean in cellMeans)
    {
        double deviation = cellMean - totalMean;
        cellAccumulator += deviation * deviation;
    }
    double cellSumOfSquares = cellAccumulator * replications;

    // Within-cells (error) sum of squares: every observation against
    // the mean of its own cell.
    double errorSumOfSquares = 0;
    for (int i = 0; i < samples.Length; i++)
    {
        for (int j = 0; j < samples[i].Length; j++)
        {
            foreach (double observation in samples[i][j])
            {
                double deviation = observation - cellMeans[i, j];
                errorSumOfSquares += deviation * deviation;
            }
        }
    }

    // Factor sums of squares: every level mean against the grand mean,
    // scaled by the number of observations behind each level.
    double aAccumulator = 0;
    foreach (double levelMean in aMean)
    {
        double deviation = levelMean - totalMean;
        aAccumulator += deviation * deviation;
    }
    double aSumOfSquares = aAccumulator * levelsB * replications;

    double bAccumulator = 0;
    foreach (double levelMean in bMean)
    {
        double deviation = levelMean - totalMean;
        bAccumulator += deviation * deviation;
    }
    double bSumOfSquares = bAccumulator * levelsA * replications;

    // Interaction sum of squares: what is left of the cell variation
    // once both main effects have been removed.
    double abSumOfSquares = cellSumOfSquares - aSumOfSquares - bSumOfSquares;

    // Mean squares.
    double aMeanSquares = aSumOfSquares / aDegreesOfFreedom;
    double bMeanSquares = bSumOfSquares / bDegreesOfFreedom;
    double abMeanSquares = abSumOfSquares / abDegreesOfFreedom;
    double errorMeanSquares = errorSumOfSquares / errorDegreesOfFreedom;

    // F-tests. The interaction is always tested against the error mean
    // square; the denominators for the two factors depend on the model.
    //
    // NOTE(review): the pairing of enum values and denominators below is
    // reproduced as found. It does not appear to match the usual textbook
    // assignment (fixed effects tested against the error term, random effects
    // against the interaction term) -- confirm against the documentation of
    // TwoWayAnovaModel before relying on a specific model type.
    FTest aSignificance, bSignificance, abSignificance;

    switch (type)
    {
        case TwoWayAnovaModel.Fixed:
            // Both factors are tested against the interaction mean square.
            aSignificance = new FTest(aMeanSquares / abMeanSquares, aDegreesOfFreedom, abDegreesOfFreedom);
            bSignificance = new FTest(bMeanSquares / abMeanSquares, bDegreesOfFreedom, abDegreesOfFreedom);
            abSignificance = new FTest(abMeanSquares / errorMeanSquares, abDegreesOfFreedom, errorDegreesOfFreedom);
            break;

        case TwoWayAnovaModel.Mixed:
            // Both factors are tested against the error mean square.
            aSignificance = new FTest(aMeanSquares / errorMeanSquares, aDegreesOfFreedom, errorDegreesOfFreedom);
            bSignificance = new FTest(bMeanSquares / errorMeanSquares, bDegreesOfFreedom, errorDegreesOfFreedom);
            abSignificance = new FTest(abMeanSquares / errorMeanSquares, abDegreesOfFreedom, errorDegreesOfFreedom);
            break;

        case TwoWayAnovaModel.Random:
            // Factor A is tested against the interaction, factor B against the error.
            aSignificance = new FTest(aMeanSquares / abMeanSquares, aDegreesOfFreedom, abDegreesOfFreedom);
            bSignificance = new FTest(bMeanSquares / errorMeanSquares, bDegreesOfFreedom, errorDegreesOfFreedom);
            abSignificance = new FTest(abMeanSquares / errorMeanSquares, abDegreesOfFreedom, errorDegreesOfFreedom);
            break;

        default:
            throw new ArgumentException("Unhandled analysis type.", "type");
    }

    // Sources of variation and the ANOVA table built from them.
    AnovaVariationSource cell = new AnovaVariationSource(this, "Cells",
        cellSumOfSquares, cellDegreesOfFreedom);
    AnovaVariationSource a = new AnovaVariationSource(this, "Factor A",
        aSumOfSquares, aDegreesOfFreedom, aMeanSquares, aSignificance);
    AnovaVariationSource b = new AnovaVariationSource(this, "Factor B",
        bSumOfSquares, bDegreesOfFreedom, bMeanSquares, bSignificance);
    AnovaVariationSource ab = new AnovaVariationSource(this, "Interaction AxB",
        abSumOfSquares, abDegreesOfFreedom, abMeanSquares, abSignificance);
    AnovaVariationSource error = new AnovaVariationSource(this, "Within-cells (error)",
        errorSumOfSquares, errorDegreesOfFreedom, errorMeanSquares);
    AnovaVariationSource total = new AnovaVariationSource(this, "Total",
        totalSumOfSquares, totalDegreesOfFreedom);

    this.Sources = new TwoWayAnovaVariationSources()
    {
        Cells = cell,
        FactorA = a,
        FactorB = b,
        Interaction = ab,
        Error = error,
        Total = total
    };

    this.Table = new AnovaSourceCollection(cell, a, b, ab, error, total);
}