        public void LinearLogisticRegressionSimple()
        {
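            // Simulate data from a logistic model: the polynomial -1 + 2 x is the log-odds,
            // so z is true with probability p = 1 / (1 + exp(-(-1 + 2 x))).
            // The fitted intercept and slope should then recover the polynomial's coefficients.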
            Polynomial m = Polynomial.FromCoefficients(-1.0, 2.0);

            FrameTable table = new FrameTable();

            table.AddColumn <double>("x");
            table.AddColumn <string>("z");

            Random rng = new Random(2);
            ContinuousDistribution xDistribution = new CauchyDistribution(4.0, 2.0);

            for (int i = 0; i < 24; i++)
            {
                double x = xDistribution.GetRandomValue(rng);
                double y = m.Evaluate(x);
                double p = 1.0 / (1.0 + Math.Exp(-y));
                bool   z = (rng.NextDouble() < p);
                table.AddRow(x, z.ToString());
            }

            LinearLogisticRegressionResult fit = table["z"].As((string s) => Boolean.Parse(s)).LinearLogisticRegression(table["x"].As <double>());

            Assert.IsTrue(fit.Intercept.ConfidenceInterval(0.99).ClosedContains(m.Coefficient(0)));
            Assert.IsTrue(fit.Slope.ConfidenceInterval(0.99).ClosedContains(m.Coefficient(1)));
        }
        public void MultivariateLinearRegressionVariances()
        {
            // define model y = a + b0 * x0 + b1 * x1 + noise
            double a  = -3.0;
            double b0 = 2.0;
            double b1 = -1.0;
            ContinuousDistribution x0distribution = new LaplaceDistribution();
            ContinuousDistribution x1distribution = new CauchyDistribution();
            ContinuousDistribution eDistribution  = new NormalDistribution(0.0, 4.0);

            FrameTable data = new FrameTable();

            data.AddColumns <double>("a", "da", "b0", "db0", "b1", "db1", "ab1Cov", "p", "dp");
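            // Each row records, for one fit: the estimated intercept and coefficients (a, b0, b1),
            // their claimed uncertainties (da, db0, db1), the claimed covariance of the intercept
            // with b1 (ab1Cov), and the prediction at (x0, x1) = (-5, 6) with its claimed uncertainty (p, dp).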

            // draw a sample from the model
            Random rng = new Random(4);

            for (int j = 0; j < 64; j++)
            {
                List <double> x0s = new List <double>();
                List <double> x1s = new List <double>();
                List <double> ys  = new List <double>();

                for (int i = 0; i < 16; i++)
                {
                    double x0 = x0distribution.GetRandomValue(rng);
                    double x1 = x1distribution.GetRandomValue(rng);
                    double e  = eDistribution.GetRandomValue(rng);
                    double y  = a + b0 * x0 + b1 * x1 + e;
                    x0s.Add(x0);
                    x1s.Add(x1);
                    ys.Add(y);
                }

                // do a linear regression fit on the model
                MultiLinearRegressionResult result = ys.MultiLinearRegression(
                    new Dictionary <string, IReadOnlyList <double> > {
                        { "x0", x0s }, { "x1", x1s }
                    }
                );
                UncertainValue pp = result.Predict(-5.0, 6.0);

                data.AddRow(
                    result.Intercept.Value, result.Intercept.Uncertainty,
                    result.CoefficientOf("x0").Value, result.CoefficientOf("x0").Uncertainty,
                    result.CoefficientOf("x1").Value, result.CoefficientOf("x1").Uncertainty,
                    result.Parameters.CovarianceOf("Intercept", "x1"),
                    pp.Value, pp.Uncertainty
                    );
            }

            // The spread of the parameter estimates across the repeated fits should agree with the
            // uncertainties and covariances claimed by the individual fits, and likewise for the
            // prediction and its claimed uncertainty.
            Assert.IsTrue(data["a"].As <double>().PopulationStandardDeviation().ConfidenceInterval(0.99).ClosedContains(data["da"].As <double>().Mean()));
            Assert.IsTrue(data["b0"].As <double>().PopulationStandardDeviation().ConfidenceInterval(0.99).ClosedContains(data["db0"].As <double>().Mean()));
            Assert.IsTrue(data["b1"].As <double>().PopulationStandardDeviation().ConfidenceInterval(0.99).ClosedContains(data["db1"].As <double>().Mean()));
            Assert.IsTrue(data["a"].As <double>().PopulationCovariance(data["b1"].As <double>()).ConfidenceInterval(0.99).ClosedContains(data["ab1Cov"].As <double>().Mean()));
            Assert.IsTrue(data["p"].As <double>().PopulationStandardDeviation().ConfidenceInterval(0.99).ClosedContains(data["dp"].As <double>().Median()));
        }
        public void SpearmanNullDistributionTest()
        {
            // Pick independent distributions for x and y, which needn't be normal and needn't be related.
            ContinuousDistribution xDistribution = new UniformDistribution();
            ContinuousDistribution yDistribution = new CauchyDistribution();
            Random rng = new Random(1);

            // Generate bivariate samples of various sizes
            foreach (int n in TestUtilities.GenerateIntegerValues(4, 64, 8))
            {
                Sample testStatistics = new Sample();
                ContinuousDistribution testDistribution = null;
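
                // Collect the Spearman rho statistic from many independent samples of size n;
                // each test result also reports the null distribution claimed for that statistic.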

                for (int i = 0; i < 128; i++)
                {
                    BivariateSample sample = new BivariateSample();
                    for (int j = 0; j < n; j++)
                    {
                        sample.Add(xDistribution.GetRandomValue(rng), yDistribution.GetRandomValue(rng));
                    }

                    TestResult result = sample.SpearmanRhoTest();
                    testStatistics.Add(result.Statistic);
                    testDistribution = result.Distribution;
                }
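
                // Since x and y are independent, the collected statistics are draws from the null
                // distribution; compare their empirical distribution to the claimed one.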

                TestResult r2 = testStatistics.KolmogorovSmirnovTest(testDistribution);
                Assert.IsTrue(r2.Probability > 0.05);

                Assert.IsTrue(testStatistics.PopulationMean.ConfidenceInterval(0.99).ClosedContains(testDistribution.Mean));
                Assert.IsTrue(testStatistics.PopulationVariance.ConfidenceInterval(0.99).ClosedContains(testDistribution.Variance));
            }
        }
        public void MultivariateLinearRegressionSimple()
        {
            // define model y = a + b0 * x0 + b1 * x1 + noise
            double a  = 1.0;
            double b0 = -2.0;
            double b1 = 3.0;
            ContinuousDistribution x0distribution = new CauchyDistribution(10.0, 5.0);
            ContinuousDistribution x1distribution = new UniformDistribution(Interval.FromEndpoints(-10.0, 20.0));
            ContinuousDistribution noise          = new NormalDistribution(0.0, 10.0);

            // draw a sample from the model
            Random             rng    = new Random(1);
            MultivariateSample sample = new MultivariateSample("x0", "x1", "y");
            FrameTable         table  = new FrameTable();

            table.AddColumns <double>("x0", "x1", "y");
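
            // Fill the older MultivariateSample container and the newer FrameTable with the same
            // draws, so that both the legacy and the current regression APIs are exercised below.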

            for (int i = 0; i < 100; i++)
            {
                double x0  = x0distribution.GetRandomValue(rng);
                double x1  = x1distribution.GetRandomValue(rng);
                double eps = noise.GetRandomValue(rng);
                double y   = a + b0 * x0 + b1 * x1 + eps;
                sample.Add(x0, x1, y);
                table.AddRow(x0, x1, y);
            }

            // do a linear regression fit on the model
            ParameterCollection         oldResult = sample.LinearRegression(2).Parameters;
            MultiLinearRegressionResult newResult = table["y"].As <double>().MultiLinearRegression(
                table["x0"].As <double>(), table["x1"].As <double>()
                );

            // the result should have the appropriate dimension
            Assert.IsTrue(oldResult.Count == 3);
            Assert.IsTrue(newResult.Parameters.Count == 3);

            // The parameters should match the model
            Assert.IsTrue(oldResult[0].Estimate.ConfidenceInterval(0.90).ClosedContains(b0));
            Assert.IsTrue(oldResult[1].Estimate.ConfidenceInterval(0.90).ClosedContains(b1));
            Assert.IsTrue(oldResult[2].Estimate.ConfidenceInterval(0.90).ClosedContains(a));

            Assert.IsTrue(newResult.CoefficientOf(0).ConfidenceInterval(0.99).ClosedContains(b0));
            Assert.IsTrue(newResult.CoefficientOf("x1").ConfidenceInterval(0.99).ClosedContains(b1));
            Assert.IsTrue(newResult.Intercept.ConfidenceInterval(0.99).ClosedContains(a));

            // The residuals should be compatible with the model predictions
            for (int i = 0; i < table.Rows.Count; i++)
            {
                FrameRow row = table.Rows[i];
                double   x0  = (double)row["x0"];
                double   x1  = (double)row["x1"];
                double   yp  = newResult.Predict(x0, x1).Value;
                double   y   = (double)row["y"];
                Assert.IsTrue(TestUtilities.IsNearlyEqual(newResult.Residuals[i], y - yp));
            }
        }
        public void BivariateAssociationDiscreteNullDistribution()
        {
            Random rng = new Random(1);

            // Pick very non-normal distributions for our non-parametric tests
            ContinuousDistribution xd = new FrechetDistribution(1.0);
            ContinuousDistribution yd = new CauchyDistribution();

            // Pick small sample sizes to get exact distributions
            foreach (int n in TestUtilities.GenerateIntegerValues(4, 24, 4))
            {
                // Do a bunch of test runs, recording reported statistic for each.
                List <int>           spearmanStatistics   = new List <int>();
                List <int>           kendallStatistics    = new List <int>();
                DiscreteDistribution spearmanDistribution = null;
                DiscreteDistribution kendallDistribution  = null;

                for (int i = 0; i < 512; i++)
                {
                    List <double> x = new List <double>();
                    List <double> y = new List <double>();
                    for (int j = 0; j < n; j++)
                    {
                        x.Add(xd.GetRandomValue(rng));
                        y.Add(yd.GetRandomValue(rng));
                    }
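
                    // For these small samples the tests expose a discrete underlying statistic and its
                    // exact null distribution; UnderlyingStatistic may be null, so guard before recording.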

                    DiscreteTestStatistic spearman = Bivariate.SpearmanRhoTest(x, y).UnderlyingStatistic;
                    if (spearman != null)
                    {
                        spearmanStatistics.Add(spearman.Value);
                        spearmanDistribution = spearman.Distribution;
                    }
                    DiscreteTestStatistic kendall = Bivariate.KendallTauTest(x, y).UnderlyingStatistic;
                    if (kendall != null)
                    {
                        kendallStatistics.Add(kendall.Value);
                        kendallDistribution = kendall.Distribution;
                    }
                }

                // Test whether statistics are actually distributed as claimed
                if (spearmanDistribution != null)
                {
                    TestResult spearmanChiSquared = spearmanStatistics.ChiSquaredTest(spearmanDistribution);
                    Assert.IsTrue(spearmanChiSquared.Probability > 0.01);
                }
                if (kendallDistribution != null)
                {
                    TestResult kendallChiSquared = kendallStatistics.ChiSquaredTest(kendallDistribution);
                    Assert.IsTrue(kendallChiSquared.Probability > 0.01);
                }
            }
        }
        public void BivariateNullAssociation()
        {
            Random rng = new Random(31415926);

            // Create a data structure to hold the results of Pearson, Spearman, and Kendall tests.
            FrameTable data = new FrameTable();

            data.AddColumn <double>("r");
            data.AddColumn <double>("ρ");
            data.AddColumn <double>("τ");

            // Create variables to hold the claimed distribution of each test statistic.
            ContinuousDistribution PRD = null;
            ContinuousDistribution SRD = null;
            ContinuousDistribution KTD = null;

            // Generate a large number of bivariate samples and conduct our three tests on each.
            ContinuousDistribution xDistribution = new LognormalDistribution();
            ContinuousDistribution yDistribution = new CauchyDistribution();
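
            // Because x and y are drawn independently, each test statistic is a draw from its null distribution.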

            for (int j = 0; j < 100; j++)
            {
                List <double> x = new List <double>();
                List <double> y = new List <double>();
                for (int i = 0; i < 100; i++)
                {
                    x.Add(xDistribution.GetRandomValue(rng));
                    y.Add(yDistribution.GetRandomValue(rng));
                }

                TestResult PR = Bivariate.PearsonRTest(x, y);
                TestResult SR = Bivariate.SpearmanRhoTest(x, y);
                TestResult KT = Bivariate.KendallTauTest(x, y);

                PRD = PR.Statistic.Distribution;
                SRD = SR.Statistic.Distribution;
                KTD = KT.Statistic.Distribution;

                data.AddRow(new Dictionary <string, object>()
                {
                    { "r", PR.Statistic.Value }, { "ρ", SR.Statistic.Value }, { "τ", KT.Statistic.Value }
                });
            }
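
            // Under independence, each statistic's empirical distribution should match the claimed null distribution.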

            Assert.IsTrue(data["r"].As <double>().KolmogorovSmirnovTest(PRD).Probability > 0.05);
            Assert.IsTrue(data["ρ"].As <double>().KolmogorovSmirnovTest(SRD).Probability > 0.05);
            Assert.IsTrue(data["τ"].As <double>().KolmogorovSmirnovTest(KTD).Probability > 0.05);
        }
        public void BivariatePolynomialRegressionCovariance()
        {
            // do a set of polynomial regression fits
            // make sure not only that the fit parameters are what they should be, but that their variances/covariances are as claimed

            Random rng = new Random(271828);

            // define the true polynomial coefficients
            double[] a = new double[] { 0.0, -1.0, 2.0, -3.0 };

            // keep track of sample of returned a and b fit parameters
            MultivariateSample A = new MultivariateSample(a.Length);

            // also keep track of returned covariance estimates
            // since these vary slightly from fit to fit, we will average them
            SymmetricMatrix C = new SymmetricMatrix(a.Length);

            // also keep track of test statistics
            Sample F = new Sample();

            // do 100 fits
            for (int k = 0; k < 100; k++)
            {
                // we should be able to draw x's from any distribution; noise should be drawn from a normal distribution
                ContinuousDistribution xd = new CauchyDistribution();
                ContinuousDistribution nd = new NormalDistribution(0.0, 4.0);

                // generate a synthetic data set
                BivariateSample s = new BivariateSample();
                for (int j = 0; j < 20; j++)
                {
                    double x = xd.GetRandomValue(rng);
                    double y = nd.GetRandomValue(rng);
                    for (int i = 0; i < a.Length; i++)
                    {
                        y += a[i] * MoreMath.Pow(x, i);
                    }
                    s.Add(x, y);
                }

                // do the regression
                PolynomialRegressionResult r = s.PolynomialRegression(a.Length - 1);

                ColumnVector ps = r.Parameters.ValuesVector;

                // record best fit parameters
                A.Add(ps);

                // record estimated covariances
                C += r.Parameters.CovarianceMatrix;

                // record the fit statistic
                F.Add(r.F.Statistic.Value);
            }

            C = (1.0 / A.Count) * C; // average the recorded covariance estimates (scalar multiplication in lieu of matrix division)

            // check that mean parameter estimates are what they should be: the underlying population parameters
            for (int i = 0; i < A.Dimension; i++)
            {
                Assert.IsTrue(A.Column(i).PopulationMean.ConfidenceInterval(0.95).ClosedContains(a[i]));
            }

            // check that parameter covariances are what they should be: the reported covariance estimates
            for (int i = 0; i < A.Dimension; i++)
            {
                for (int j = i; j < A.Dimension; j++)
                {
                    Assert.IsTrue(A.TwoColumns(i, j).PopulationCovariance.ConfidenceInterval(0.95).ClosedContains(C[i, j]));
                }
            }

            // A check that the F statistic follows its claimed Fisher distribution is omitted here.
        }
        public void LinearRegressionSimple()
        {
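            // define model y = a + b * x + noise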
            double a = -1.0;
            double b = 2.0;

            ContinuousDistribution xDistribution = new CauchyDistribution();
            ContinuousDistribution eDistribution = new NormalDistribution();

            int    n   = 16;
            Random rng = new Random(1);

            double[] x = new double[n];
            double[] y = new double[n];
            for (int i = 0; i < n; i++)
            {
                x[i] = xDistribution.GetRandomValue(rng);
                y[i] = a + b * x[i] + eDistribution.GetRandomValue(rng);
            }

            LinearRegressionResult result = y.LinearRegression(x);

            // Parameters should be right
            Assert.IsTrue(result.Intercept.ConfidenceInterval(0.95).ClosedContains(a));
            Assert.IsTrue(result.Slope.ConfidenceInterval(0.95).ClosedContains(b));

            // Reported values should be consistent
            Assert.IsTrue(result.Intercept == result.Parameters["Intercept"].Estimate);
            Assert.IsTrue(result.Intercept.Value == result.Parameters.ValuesVector[result.Parameters.IndexOf("Intercept")]);
            Assert.IsTrue(TestUtilities.IsNearlyEqual(result.Intercept.Uncertainty, Math.Sqrt(result.Parameters.VarianceOf("Intercept"))));
            Assert.IsTrue(result.Slope == result.Parameters["Slope"].Estimate);
            Assert.IsTrue(result.Slope.Value == result.Parameters.ValuesVector[result.Parameters.IndexOf("Slope")]);
            Assert.IsTrue(TestUtilities.IsNearlyEqual(result.Slope.Uncertainty, Math.Sqrt(result.Parameters.VarianceOf("Slope"))));

            // Residuals should agree with definition
            for (int i = 0; i < x.Length; i++)
            {
                double yp = result.Predict(x[i]).Value;
                Assert.IsTrue(TestUtilities.IsNearlyEqual(result.Residuals[i], y[i] - yp));
            }

            // R and R-squared agree
            Assert.IsTrue(TestUtilities.IsNearlyEqual(result.RSquared, MoreMath.Sqr(result.R.Statistic.Value)));

            // F-test and R-test agree
            Assert.IsTrue(TestUtilities.IsNearlyEqual(result.F.Probability, result.R.Probability));

            // ANOVA's sums of squares are correct
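            // (assuming Variance() is the population, 1/n, variance, so multiplying by the sample
            // size recovers the total sum of squares about the mean)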
            double SST = y.Variance() * y.Length;

            Assert.IsTrue(TestUtilities.IsNearlyEqual(SST, result.Anova.Total.SumOfSquares));
            double SSR = 0.0;

            foreach (double z in result.Residuals)
            {
                SSR += z * z;
            }
            Assert.IsTrue(TestUtilities.IsNearlyEqual(SSR, result.Anova.Residual.SumOfSquares));
            Assert.IsTrue(TestUtilities.IsNearlyEqual(SSR, result.SumOfSquaredResiduals));

            // R is same as correlation coefficient
            Assert.IsTrue(TestUtilities.IsNearlyEqual(x.CorrelationCoefficient(y), result.R.Statistic.Value));
        }