/// <summary>
/// Finds the parameterized function that best fits the data.
/// </summary>
/// <param name="f">The parameterized function.</param>
/// <param name="start">An initial guess for the parameters.</param>
/// <returns>The fit result.</returns>
/// <remarks>
/// <para>
/// In the returned <see cref="FitResult"/>, the parameters appear in the same order as in
/// the supplied fit function and initial guess vector. No goodness-of-fit test is returned.
/// </para>
/// </remarks>
/// <exception cref="ArgumentNullException"><paramref name="f"/> or <paramref name="start"/> is null.</exception>
/// <exception cref="InsufficientDataException">There are not more data points than fit parameters.</exception>
/// <exception cref="DivideByZeroException">The curvature matrix is singular, indicating that the data is independent of
/// one or more parameters, or that two or more parameters are linearly dependent.</exception>
public NonlinearRegressionResult NonlinearRegression(Func<IReadOnlyList<double>, double, double> f, IReadOnlyList<double> start) {
    // The documented contract promises ArgumentNullException for either argument,
    // but previously only start was checked; validate f explicitly as well.
    if (f == null) {
        throw new ArgumentNullException(nameof(f));
    }
    if (start == null) {
        throw new ArgumentNullException(nameof(start));
    }
    return Bivariate.NonlinearRegression(yData, xData, f, start);
}
/// <summary>
/// Computes the best-fit linear logistic regression from the data.
/// </summary>
/// <returns>The fit result.</returns>
/// <remarks>
/// <para>Linear logistic regression is a way to fit binary outcome data to a linear model.</para>
/// <para>The method assumes that binary outcomes are encoded as 0 and 1. If any y-values other than
/// 0 and 1 are encountered, it throws an <see cref="InvalidOperationException"/>.</para>
/// <para>The fit result is two-dimensional. The first parameter is a, the second b.</para>
/// </remarks>
/// <exception cref="InsufficientDataException">There are fewer than three data points.</exception>
/// <exception cref="InvalidOperationException">There is a y-value other than 0 or 1.</exception>
public LinearLogisticRegressionResult LinearLogisticRegression() {
    // Map stored y-values onto booleans. Exact comparison against 0.0 and 1.0 is
    // intentional: the documented contract requires outcomes encoded as exactly 0 or 1.
    // The exception now carries a descriptive message (previously it was message-less).
    List<bool> y = yData.Select(v => {
        if (v == 0.0) {
            return false;
        } else if (v == 1.0) {
            return true;
        } else {
            throw new InvalidOperationException("Linear logistic regression requires all y-values to be encoded as 0 or 1.");
        }
    }).ToList();
    return Bivariate.LinearLogisticRegression(y, xData);
}
// Fits y = a + b x by least squares and precomputes the quantities needed to report
// parameter covariances, residuals, sums of squares, and the (lazy) Pearson-r test.
internal LinearRegressionResult(IReadOnlyList <double> x, IReadOnlyList <double> y) : base() {
    double yMean, xxSum, xySum, yySum;
    // n and xMean are instance fields populated via out; the sums are second moments about the means.
    Bivariate.ComputeBivariateMomentsUpToTwo(x, y, out n, out xMean, out yMean, out xxSum, out yySum, out xySum);
    // Least-squares slope and intercept.
    b = xySum / xxSum;
    a = yMean - b * xMean;
    // Accumulate residuals, the residual sum of squares (SSR), and the sum of squares
    // explained by the fit (SSF).
    residuals = new List <double>(n);
    SSR = 0.0;
    SSF = 0.0;
    for (int i = 0; i < n; i++) {
        double yi = y[i];
        double ypi = a + b * x[i]; // predicted value at x[i]
        double zi = yi - ypi;      // residual
        residuals.Add(zi);
        SSR += zi * zi;
        SSF += MoreMath.Sqr(ypi - yMean);
    }
    SST = yySum; // total sum of squares about the y mean
    xVariance = xxSum / n;
    // Residual variance with (n - 2) degrees of freedom: the two fit parameters a and b
    // each consume one degree of freedom.
    sigma2 = SSR / (n - 2);
    sigma = Math.Sqrt(sigma2);
    // Covariance matrix entries for (a, b); cbb must be computed first since the others use it.
    cbb = sigma2 / xVariance / n;
    cab = -xMean * cbb;
    caa = (xVariance + xMean * xMean) * cbb;
    // The Pearson-r test is computed lazily, since not every caller asks for it;
    // the closure captures the moment sums computed above.
    rTest = new Lazy <TestResult>(() => {
        double r = xySum / Math.Sqrt(xxSum * yySum);
        TestResult rTest = new TestResult("r", r, new PearsonRDistribution(n), TestType.TwoTailed);
        return(rTest);
    });
}
/// <summary>
/// Computes the polynomial of given degree which best fits the data.
/// </summary>
/// <param name="m">The degree, which must be non-negative.</param>
/// <returns>The fit result.</returns>
/// <exception cref="ArgumentOutOfRangeException"><paramref name="m"/> is negative.</exception>
/// <exception cref="InsufficientDataException">There are fewer data points than coefficients to be fit.</exception>
public PolynomialRegressionResult PolynomialRegression(int m) =>
    Bivariate.PolynomialRegression(yData, xData, m);
/// <summary>
/// Computes the best-fit linear regression from the data.
/// </summary>
/// <returns>The result of the fit.</returns>
/// <remarks>
/// <para>Linear regression assumes that the data have been generated by a function y = a + b x + e, where e is
/// normally distributed noise, and determines the values of a and b that best fit the data. It also
/// determines an error matrix on the parameters a and b, and does an F-test to assess the quality of the fit.</para>
/// <para>The fit result is two-dimensional. The first parameter is the intercept a, the second is the slope b.
/// The goodness-of-fit test is a F-test comparing the variance accounted for by the model to the remaining,
/// unexplained variance.</para>
/// </remarks>
/// <exception cref="InsufficientDataException">There are fewer than three data points.</exception>
public LinearRegressionResult LinearRegression() {
    return Bivariate.LinearRegression(yData, xData);
}
/// <summary>
/// Performs a Wilcoxon signed rank test.
/// </summary>
/// <returns>The result of the test.</returns>
/// <remarks>
/// <para>This is a non-parametric alternative to the paired t-test
/// (<see cref="PairedStudentTTest"/>). Given two measurements on the same subjects,
/// it tests for a change in the distribution between the two measurements, and is
/// sensitive primarily to shifts in the median. The individual measurement
/// distributions need not be normal, and may even differ from subject to subject.</para>
/// </remarks>
/// <seealso href="https://en.wikipedia.org/wiki/Wilcoxon_signed-rank_test"/>
public TestResult WilcoxonSignedRankTest() =>
    Bivariate.WilcoxonSignedRankTest(xData, yData);
/// <summary>
/// Performs a paired Student t-test.
/// </summary>
/// <returns>The result of the test.</returns>
/// <remarks>
/// <para>Like a two-sample, unpaired t-test (<see cref="Sample.StudentTTest(Sample,Sample)" />),
/// a paired t-test compares two samples to detect a difference in means.
/// Unlike the unpaired version, the paired version assumes that each pair of values
/// was measured on the same subject, and is therefore sensitive only to the
/// within-pair differences.</para>
/// </remarks>
/// <exception cref="InsufficientDataException">There are fewer than two data points.</exception>
public TestResult PairedStudentTTest() {
    return Bivariate.PairedStudentTTest(xData, yData);
}
/// <summary>
/// Performs a Kendall concordance test for association.
/// </summary>
/// <returns>The result of the test.</returns>
/// <remarks>
/// <para>Kendall's τ is a non-parametric and robust test of association
/// between two variables. It simply measures the number of cases where an increase
/// in one variable is associated with an increase in the other (concordant pairs),
/// compared with the number of cases where an increase in one variable is associated
/// with a decrease in the other (discordant pairs).</para>
/// <para>Because τ depends only on the sign
/// of a change and not its magnitude, it is not skewed by outliers exhibiting very large
/// changes, nor by cases where the degree of change in one variable associated with
/// a given change in the other changes over the range of the variables. Of course, it may
/// still miss an association whose sign changes over the range of the variables. For example,
/// if data points lie along a semi-circle in the plane, an increase in the first variable
/// is associated with an increase in the second variable along the rising arc and a decrease in
/// the second variable along the falling arc. No test that looks for single-signed correlation
/// will catch this association.
/// </para>
/// <para>Because it examines all pairs of data points, the Kendall test requires
/// O(N<sup>2</sup>) operations. It is thus impractical for very large data sets. While
/// not quite as robust as the Kendall test, the Spearman test is a good fall-back in such cases.</para>
/// </remarks>
/// <exception cref="InsufficientDataException"><see cref="Count"/> is less than two.</exception>
/// <seealso cref="PearsonRTest"/>
/// <seealso cref="SpearmanRhoTest"/>
/// <seealso href="http://en.wikipedia.org/wiki/Kendall_tau_test" />
public TestResult KendallTauTest() {
    return Bivariate.KendallTauTest(xData, yData);
}
/// <summary>
/// Performs a Spearman rank-order test of association between the two variables.
/// </summary>
/// <returns>The result of the test.</returns>
/// <remarks>
/// <para>The Spearman rank-order test is a non-parametric test for association between
/// two variables. Its statistic rho is the correlation coefficient of the <em>rank</em>
/// of each sample entry, so it is invariant under monotonic re-parameterizations of the
/// data; it will, for example, detect a quadratic or exponential association just as
/// readily as a linear one.</para>
/// <para>The Spearman rank-order test requires O(N log N) operations.</para>
/// </remarks>
/// <exception cref="InsufficientDataException">There are fewer than three data points.</exception>
/// <seealso cref="PearsonRTest"/>
/// <seealso cref="KendallTauTest"/>
/// <seealso href="http://en.wikipedia.org/wiki/Spearman%27s_rank_correlation_coefficient"/>
public TestResult SpearmanRhoTest() =>
    Bivariate.SpearmanRhoTest(xData, yData);
/// <summary>
/// Performs a Pearson correlation test for association.
/// </summary>
/// <returns>The result of the test.</returns>
/// <remarks>
/// <para>This test measures the strength of linear correlation between two variables.
/// The statistic r is the covariance of the two variables, scaled by their respective
/// standard deviations, giving a number between -1 (perfect linear anti-correlation)
/// and +1 (perfect linear correlation).</para>
/// <para>The Pearson test cannot reliably detect or rule out non-linear associations;
/// for example, variables with a perfect quadratic association may show only weak
/// linear correlation. To test for associations that may not be linear, consider the
/// Spearman or Kendall tests instead.</para>
/// <para>The Pearson correlation test requires O(N) operations.</para>
/// <para>The Pearson test requires at least three bivariate values.</para>
/// </remarks>
/// <exception cref="InsufficientDataException"><see cref="Count"/> is less than three.</exception>
/// <seealso cref="SpearmanRhoTest"/>
/// <seealso cref="KendallTauTest"/>
/// <seealso href="http://en.wikipedia.org/wiki/Pearson_correlation_coefficient" />
public TestResult PearsonRTest() =>
    Bivariate.PearsonRTest(xData, yData);
// Fits the linear logistic model p = sigma(a + b x) to binary outcome data by
// maximum likelihood, then inverts the curvature (second-derivative) matrix at the
// maximum to obtain the parameter covariance matrix.
// We need a goodness-of-fit measurement
internal LinearLogisticRegressionResult(IReadOnlyList <double> x, IReadOnlyList <bool> y) {
    Debug.Assert(x != null);
    Debug.Assert(y != null);
    Debug.Assert(x.Count == y.Count);
    // check size of data set
    int n = x.Count;
    if (n < 3) {
        throw new InsufficientDataException();
    }
    // The linear logistic model is:
    //   p_i = \sigma(t_i) \quad t_i = a + b x_i
    // So the log likelihood of the data set under the model is:
    //   \ln L = \sum_{{\rm true} i} \ln p_i + \sum_{{\rm false} i} \ln (1 - p_i)
    //         = \sum_{{\rm true} i} \ln \sigma(t_i) + \sum_{{\rm false} i} \ln (1 - \sigma(t_i))
    // Taking derivatives:
    //   \frac{\partial L}{\partial a} = \sum_{{\rm true} i} \frac{\sigma'(t_i)}{\sigma(t_i)}
    //     + \sum_{{\rm false} i} \frac{-\sigma'(t_i)}{1 - \sigma(t_i)}
    //   \frac{\partial L}{\partial b} = \sum_{{\rm true} i} \frac{\sigma'(t_i)}{\sigma(t_i)} x_i
    //     + \sum_{{\rm false} i} \frac{-\sigma'(t_i)}{1 - \sigma(t_i)} x_i
    // Using \sigma(t) = \frac{1}{1 + e^{-t}}, we can derive:
    //   \frac{\sigma'(t)}{\sigma(t)} = \sigma(-t)
    //   \frac{\sigma'(t)}{1 - \sigma(t)} = \sigma(t)
    // So this becomes
    //   \frac{\partial L}{\partial a} = \sum_i \pm \sigma(\mp t_i)
    //   \frac{\partial L}{\partial b} = \sum_i \pm \sigma(\mp t_i) x_i
    // where the upper sign is for true values and the lower sign is for false values.
    // Find the simultaneous zeros of these equations to obtain the likelihood-maximizing a, b.
    // To get the curvature matrix, we need the second derivatives.
    //   \frac{\partial^2 L}{\partial a^2} = - \sum_i \sigma'(\mp t_i)
    //   \frac{\partial^2 L}{\partial a \partial b} = - \sum_i \sigma'(\mp t_i) x_i
    //   \frac{\partial^2 L}{\partial b^2} = - \sum_i \sigma'(\mp t_i) x_i^2
    // We need an initial guess at the parameters. Begin with the Ansatz of the logistic model:
    //   \frac{p}{1-p} = e^{\alpha + \beta x}
    // Differentiate and do some algebra to get:
    //   \frac{\partial p}{\partial x} = \beta p ( 1 - p)
    // Evaluating at means, and noting that p (1 - p) = var(y) and that, in a development around the means,
    //   cov(p, x) = \frac{\partial p}{\partial x} var(x)
    // we get
    //   \beta = \frac{cov(y, x)}{var(x) var(y)}
    // This approximation gets the sign right, but it looks like it usually gets the magnitude quite wrong.
    // The problem with the approach is that var(y) = p (1 - p) assumes y are chosen with fixed p, but they aren't.
    // We need to re-visit this analysis.
    double xMean, yMean, xxSum, yySum, xySum;
    // Booleans are mapped onto 0/1 so bivariate moments can be computed; note n is
    // re-assigned through the out parameter here.
    Bivariate.ComputeBivariateMomentsUpToTwo(x, y.Select(z => z ? 1.0 : 0.0), out n, out xMean, out yMean, out xxSum, out yySum, out xySum);
    double p = yMean;
    // Initial guesses a0, b0 from the approximation derived above.
    double b0 = xySum / xxSum / yySum * n;
    double a0 = Math.Log(p / (1.0 - p)) - b0 * xMean;
    // The gradient of the log likelihood with respect to (a, b); its simultaneous
    // zero is the maximum-likelihood estimate.
    Func <IReadOnlyList <double>, IReadOnlyList <double> > J = (IReadOnlyList <double> a) => {
        double dLda = 0.0;
        double dLdb = 0.0;
        for (int i = 0; i < n; i++) {
            double t = a[0] + a[1] * x[i];
            if (y[i]) {
                double s = Sigma(-t);
                dLda += s;
                dLdb += s * x[i];
            } else {
                double s = Sigma(t);
                dLda -= s;
                dLdb -= s * x[i];
            }
        }
        return(new double[] { dLda, dLdb });
    };
    ColumnVector b = MultiFunctionMath.FindZero(J, new double[] { a0, b0 });
    // Accumulate the curvature matrix (negative second derivatives) at the maximum;
    // sp = \sigma'(\mp t_i), computed from the stable exponential form.
    SymmetricMatrix C = new SymmetricMatrix(2);
    for (int i = 0; i < n; i++) {
        double t = b[0] + b[1] * x[i];
        if (y[i]) {
            t = -t;
        }
        double e = Math.Exp(-t);
        double sp = e / MoreMath.Sqr(1.0 + e);
        C[0, 0] += sp;
        C[0, 1] += sp * x[i];
        C[1, 1] += sp * x[i] * x[i];
    }
    // A failed Cholesky decomposition means the curvature matrix is singular,
    // i.e. the data do not determine both parameters independently.
    CholeskyDecomposition CD = C.CholeskyDecomposition();
    if (CD == null) {
        throw new DivideByZeroException();
    }
    // The parameter covariance matrix is the inverse of the curvature matrix.
    C = CD.Inverse();
    best = b;
    covariance = C;
}