Example #1
 /// <summary>
 /// Finds the parameterized function that best fits the data.
 /// </summary>
 /// <param name="f">The parameterized function.</param>
 /// <param name="start">An initial guess for the parameters.</param>
 /// <returns>The fit result.</returns>
 /// <remarks>
 /// <para>
 /// In the returned <see cref="FitResult"/>, the parameters appear in the same order as in
 /// the supplied fit function and initial guess vector. No goodness-of-fit test is returned.
 /// </para>
 /// </remarks>
 /// <exception cref="ArgumentNullException"><paramref name="f"/> or <paramref name="start"/> is null.</exception>
 /// <exception cref="InsufficientDataException">There are not more data points than fit parameters.</exception>
 /// <exception cref="DivideByZeroException">The curvature matrix is singular, indicating that the data is independent of
 /// one or more parameters, or that two or more parameters are linearly dependent.</exception>
 public NonlinearRegressionResult NonlinearRegression(Func<IReadOnlyList<double>, double, double> f, IReadOnlyList<double> start)
 {
     if (start == null)
     {
         throw new ArgumentNullException(nameof(start));
     }
     return Bivariate.NonlinearRegression(yData, xData, f, start);
 }
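
A minimal usage sketch of the static overload this method forwards to. The Meta.Numerics.Statistics namespace is an assumption, and the exponential model, the synthetic data, and the starting guess are purely illustrative.

using System;
using System.Collections.Generic;
using Meta.Numerics.Statistics; // namespace assumed for the Bivariate static class

public static class NonlinearFitSketch
{
    public static void Main()
    {
        // Synthetic data following y = 2 exp(-x / 3), just to exercise the call.
        List<double> xData = new List<double>();
        List<double> yData = new List<double>();
        for (int i = 0; i < 20; i++)
        {
            double xi = 0.5 * i;
            xData.Add(xi);
            yData.Add(2.0 * Math.Exp(-xi / 3.0));
        }

        // Parameterized model f(p, x) = p[0] exp(p[1] x); the fitted parameters come back
        // in the same order as in this delegate and the starting guess, per the remarks above.
        Func<IReadOnlyList<double>, double, double> f = (p, t) => p[0] * Math.Exp(p[1] * t);

        // The same static overload the instance method above delegates to.
        NonlinearRegressionResult result = Bivariate.NonlinearRegression(yData, xData, f, new double[] { 1.0, -0.5 });
    }
}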
Example #2
        /// <summary>
        /// Computes the best-fit linear logistic regression from the data.
        /// </summary>
        /// <returns>The fit result.</returns>
        /// <remarks>
        /// <para>Linear logistic regression is a way to fit binary outcome data to a linear model.</para>
        /// <para>The method assumes that binary outcomes are encoded as 0 and 1. If any y-values other than
        /// 0 and 1 are encountered, it throws an <see cref="InvalidOperationException"/>.</para>
         /// <para>The fit result is two-dimensional. The first parameter is the intercept a and the second is the
         /// slope b in the linear logistic model p = &#x3C3;(a + b x), where &#x3C3; is the logistic function.</para>
        /// </remarks>
        /// <exception cref="InsufficientDataException">There are fewer than three data points.</exception>
        /// <exception cref="InvalidOperationException">There is a y-value other than 0 or 1.</exception>
        public LinearLogisticRegressionResult LinearLogisticRegression()
        {
             List<bool> y = yData.Select(v => {
                 if (v == 0.0)
                 {
                     return false;
                 }
                 else if (v == 1.0)
                 {
                     return true;
                 }
                 else
                 {
                     throw new InvalidOperationException();
                 }
             }).ToList();

             return Bivariate.LinearLogisticRegression(y, xData);
        }
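
A similar hedged sketch of the underlying static call, with the same namespace assumption and synthetic data: outcomes are supplied as booleans and the x-values as doubles.

using System;
using System.Collections.Generic;
using Meta.Numerics.Statistics; // namespace assumed

public static class LogisticFitSketch
{
    public static void Main()
    {
        // Synthetic binary outcomes whose probability of being true rises with x,
        // generated from p = sigma(0.5 + 1.5 x).
        List<double> x = new List<double>();
        List<bool> y = new List<bool>();
        Random rng = new Random(1);
        for (int i = 0; i < 200; i++)
        {
            double xi = 10.0 * rng.NextDouble() - 5.0;
            double p = 1.0 / (1.0 + Math.Exp(-(0.5 + 1.5 * xi)));
            x.Add(xi);
            y.Add(rng.NextDouble() < p);
        }

        // The same static overload the instance method above delegates to.
        LinearLogisticRegressionResult result = Bivariate.LinearLogisticRegression(y, x);
    }
}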
Example #3
        internal LinearRegressionResult(IReadOnlyList<double> x, IReadOnlyList<double> y) :
            base()
        {
            double yMean, xxSum, xySum, yySum;

            Bivariate.ComputeBivariateMomentsUpToTwo(x, y, out n, out xMean, out yMean, out xxSum, out yySum, out xySum);

            // Least-squares slope and intercept from the central second moments.
            b = xySum / xxSum;
            a = yMean - b * xMean;

            // Residuals, residual sum of squares (SSR), and model ("fit") sum of squares (SSF).
            residuals = new List<double>(n);
            SSR = 0.0;
            SSF = 0.0;
            for (int i = 0; i < n; i++)
            {
                double yi = y[i];
                double ypi = a + b * x[i];
                double zi = yi - ypi;
                residuals.Add(zi);
                SSR += zi * zi;
                SSF += MoreMath.Sqr(ypi - yMean);
            }
            SST = yySum;

            // Residual variance (n - 2 degrees of freedom remain after fitting two parameters)
            // and the covariance matrix of the parameter estimates.
            xVariance = xxSum / n;
            sigma2 = SSR / (n - 2);
            sigma = Math.Sqrt(sigma2);
            cbb = sigma2 / xVariance / n;
            cab = -xMean * cbb;
            caa = (xVariance + xMean * xMean) * cbb;

            // The Pearson r statistic and its significance test are computed lazily, on demand.
            rTest = new Lazy<TestResult>(() => {
                double r = xySum / Math.Sqrt(xxSum * yySum);
                return new TestResult("r", r, new PearsonRDistribution(n), TestType.TwoTailed);
            });
        }
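
The quantities cached by this constructor follow the textbook least-squares formulas; the standalone sketch below (not library code) mirrors them for reference.

using System.Collections.Generic;

public static class OlsFormulaSketch
{
    // Mirrors the constructor above: slope b = S_xy / S_xx, intercept a = ybar - b * xbar,
    // and residual variance sigma^2 = SSR / (n - 2).
    public static (double a, double b, double sigma2) Fit(IReadOnlyList<double> x, IReadOnlyList<double> y)
    {
        int n = x.Count;
        double xMean = 0.0, yMean = 0.0;
        for (int i = 0; i < n; i++) { xMean += x[i]; yMean += y[i]; }
        xMean /= n;
        yMean /= n;

        // Central second moments S_xx and S_xy.
        double xxSum = 0.0, xySum = 0.0;
        for (int i = 0; i < n; i++)
        {
            xxSum += (x[i] - xMean) * (x[i] - xMean);
            xySum += (x[i] - xMean) * (y[i] - yMean);
        }

        double b = xySum / xxSum;
        double a = yMean - b * xMean;

        // Residual sum of squares, with n - 2 degrees of freedom left after fitting a and b.
        double ssr = 0.0;
        for (int i = 0; i < n; i++)
        {
            double r = y[i] - (a + b * x[i]);
            ssr += r * r;
        }
        return (a, b, ssr / (n - 2));
    }
}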
Example #4
 /// <summary>
 /// Computes the polynomial of given degree which best fits the data.
 /// </summary>
 /// <param name="m">The degree, which must be non-negative.</param>
 /// <returns>The fit result.</returns>
 /// <exception cref="ArgumentOutOfRangeException"><paramref name="m"/> is negative.</exception>
 /// <exception cref="InsufficientDataException">There are fewer data points than coefficients to be fit.</exception>
 public PolynomialRegressionResult PolynomialRegression(int m)
 {
      return Bivariate.PolynomialRegression(yData, xData, m);
 }
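
A brief usage sketch of the static overload called in the body, assuming the same using directives as the earlier sketches; the quadratic data and the degree are illustrative only.

// Fit a quadratic (m = 2) to points lying exactly on y = 1 + 2 x + 3 x^2.
List<double> xs = new List<double>();
List<double> ys = new List<double>();
for (int i = 0; i < 10; i++)
{
    xs.Add(i);
    ys.Add(1.0 + 2.0 * i + 3.0 * i * i);
}
PolynomialRegressionResult quadraticFit = Bivariate.PolynomialRegression(ys, xs, 2);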
Example #5
 /// <summary>
 /// Computes the best-fit linear regression from the data.
 /// </summary>
 /// <returns>The result of the fit.</returns>
 /// <remarks>
 /// <para>Linear regression assumes that the data have been generated by a function y = a + b x + e, where e is
 /// normally distributed noise, and determines the values of a and b that best fit the data. It also
 /// determines an error matrix on the parameters a and b, and does an F-test to assess the significance of the fit.</para>
 /// <para>The fit result is two-dimensional. The first parameter is the intercept a, the second is the slope b.
 /// The goodness-of-fit test is an F-test comparing the variance accounted for by the model to the remaining,
 /// unexplained variance.</para>
 /// </remarks>
 /// <exception cref="InsufficientDataException">There are fewer than three data points.</exception>
 public LinearRegressionResult LinearRegression()
 {
      return Bivariate.LinearRegression(yData, xData);
 }
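
Another brief usage sketch of the static overload in the body, under the same assumptions as above; note the y-then-x argument order used throughout these snippets.

List<double> heights = new List<double> { 1.60, 1.65, 1.70, 1.75, 1.80, 1.85 };
List<double> weights = new List<double> { 55.0, 61.0, 64.0, 70.0, 74.0, 80.0 };
LinearRegressionResult line = Bivariate.LinearRegression(weights, heights); // y first, then x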
Example #6
 /// <summary>
 /// Performs a Wilcoxon signed rank test.
 /// </summary>
 /// <returns>The result of the test.</returns>
 /// <remarks>
 /// <para>The Wilcoxon signed rank test is a non-parametric alternative to the
 /// paired t-test (<see cref="PairedStudentTTest"/>). Given two measurements on
 /// the same subjects, this method tests for changes in the distribution between
 /// the two measurements. It is sensitive primarily to shifts in the median.
 /// Note that the distributions of the individual measurements
 /// may be far from normal, and may be different for each subject.</para>
 /// </remarks>
 /// <seealso href="https://en.wikipedia.org/wiki/Wilcoxon_signed-rank_test"/>
 public TestResult WilcoxonSignedRankTest()
 {
      return Bivariate.WilcoxonSignedRankTest(xData, yData);
 }
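
To make the mechanics concrete, here is a standalone sketch (not the library's implementation) of the signed-rank statistic for the simple case of no zero and no tied differences: rank the absolute differences, then sum the ranks attached to positive differences.

using System;
using System.Linq;

public static class SignedRankSketch
{
    // W+ statistic for paired samples, assuming no zero differences and no ties among |d|.
    public static double WPlus(double[] x, double[] y)
    {
        double[] d = x.Zip(y, (xi, yi) => yi - xi).ToArray();
        // Indices of differences ordered by absolute size; rank 1 goes to the smallest |d|.
        int[] order = Enumerable.Range(0, d.Length).OrderBy(i => Math.Abs(d[i])).ToArray();
        double w = 0.0;
        for (int rank = 1; rank <= order.Length; rank++)
        {
            if (d[order[rank - 1]] > 0.0) w += rank;
        }
        return w;
    }
}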
Example #7
 /// <summary>
 /// Performs a paired Student t-test.
 /// </summary>
 /// <returns>The result of the test.</returns>
 /// <remarks>
 /// <para>Like a two-sample, unpaired t-test (<see cref="Sample.StudentTTest(Sample,Sample)" />),
 /// a paired t-test compares two samples to detect a difference in means.
 /// Unlike the unpaired version, the paired version assumes that each value in one sample is paired with a
 /// corresponding value in the other, and tests whether the mean of the within-pair differences is consistent with zero.</para>
 /// </remarks>
 /// <exception cref="InsufficientDataException">There are fewer than two data points.</exception>
 public TestResult PairedStudentTTest()
 {
      return Bivariate.PairedStudentTTest(xData, yData);
 }
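
A standalone sketch of the paired t statistic itself, not the library's code: the per-pair differences are reduced to their mean and standard error.

using System;

public static class PairedTSketch
{
    // t = dbar / (s_d / sqrt(n)), where d_i = y_i - x_i are the per-pair differences.
    public static double PairedT(double[] x, double[] y)
    {
        int n = x.Length;
        double mean = 0.0;
        for (int i = 0; i < n; i++) mean += y[i] - x[i];
        mean /= n;

        double ss = 0.0;
        for (int i = 0; i < n; i++)
        {
            double d = (y[i] - x[i]) - mean;
            ss += d * d;
        }
        double sd = Math.Sqrt(ss / (n - 1)); // sample standard deviation of the differences
        return mean / (sd / Math.Sqrt(n));   // distributed as Student t with n - 1 degrees of freedom
    }
}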
Example #8
 /// <summary>
 /// Performs a Kendall concordance test for association.
 /// </summary>
 /// <returns>The result of the test.</returns>
 /// <remarks>
 /// <para>Kendall's &#x3C4; is a non-parametric and robust test of association
 /// between two variables. It simply measures the number of cases where an increase
 /// in one variable is associated with an increase in the other (concordant pairs),
 /// compared with the number of cases where an increase in one variable is associated
 /// with a decrease in the other (discordant pairs).</para>
 /// <para>Because &#x3C4; depends only on the sign
 /// of a change and not its magnitude, it is not skewed by outliers exhibiting very large
 /// changes, nor by cases where the degree of change in one variable associated with
 /// a given change in the other changes over the range of the variables. Of course, it may
 /// still miss an association whose sign changes over the range of the variables. For example,
 /// if data points lie along a semi-circle in the plane, an increase in the first variable
 /// is associated with an increase in the second variable along the rising arc and a decrease in
 /// the second variable along the falling arc. No test that looks for single-signed correlation
 /// will catch this association.
 /// </para>
 /// <para>Because it examines all pairs of data points, the Kendall test requires
 /// O(N<sup>2</sup>) operations. It is thus impractical for very large data sets. While
 /// not quite as robust as the Kendall test, the Spearman test is a good fall-back in such cases.</para>
 /// </remarks>
 /// <exception cref="InsufficientDataException"><see cref="Count"/> is less than two.</exception>
 /// <seealso cref="PearsonRTest"/>
 /// <seealso cref="SpearmanRhoTest"/>
 /// <seealso href="http://en.wikipedia.org/wiki/Kendall_tau_test" />
 public TestResult KendallTauTest()
 {
      return Bivariate.KendallTauTest(xData, yData);
 }
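
A standalone sketch of the pair counting described above, the O(N²) loop, with ties ignored for simplicity.

public static class KendallTauSketch
{
    // tau = (concordant - discordant) / (n (n - 1) / 2), ties ignored for simplicity.
    public static double Tau(double[] x, double[] y)
    {
        int n = x.Length;
        int concordant = 0, discordant = 0;
        for (int i = 0; i < n; i++)
        {
            for (int j = i + 1; j < n; j++)
            {
                double s = (x[j] - x[i]) * (y[j] - y[i]);
                if (s > 0.0) concordant++;
                else if (s < 0.0) discordant++;
            }
        }
        return (concordant - discordant) / (0.5 * n * (n - 1));
    }
}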
Example #9
 /// <summary>
 /// Performs a Spearman rank-order test of association between the two variables.
 /// </summary>
 /// <returns>The result of the test.</returns>
 /// <remarks>
 /// <para>The Spearman rank-order test of association is a non-parametric test for association between
 /// two variables. The test statistic rho is the correlation coefficient of the <em>ranks</em> of
 /// the entries in the sample. It is thus invariant under monotonic re-parameterizations of the data,
 /// and will, for example, detect a quadratic or exponential association just as well as a linear
 /// association.</para>
 /// <para>The Spearman rank-order test requires O(N log N) operations.</para>
 /// </remarks>
 /// <exception cref="InsufficientDataException">There are fewer than three data points.</exception>
 /// <seealso cref="PearsonRTest"/>
 /// <seealso cref="KendallTauTest"/>
 /// <seealso href="http://en.wikipedia.org/wiki/Spearman%27s_rank_correlation_coefficient"/>
 public TestResult SpearmanRhoTest()
 {
      return Bivariate.SpearmanRhoTest(xData, yData);
 }
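
A standalone sketch of the rank-then-correlate recipe described above, assuming no tied values (the library handles ties; this sketch does not).

using System;
using System.Linq;

public static class SpearmanRhoSketch
{
    // Assign rank 0..n-1 by sorted order (no tie handling).
    private static double[] Ranks(double[] v)
    {
        int[] order = Enumerable.Range(0, v.Length).OrderBy(i => v[i]).ToArray();
        double[] ranks = new double[v.Length];
        for (int r = 0; r < order.Length; r++) ranks[order[r]] = r;
        return ranks;
    }

    // rho is the Pearson correlation coefficient of the two rank sequences.
    public static double Rho(double[] x, double[] y)
    {
        double[] rx = Ranks(x);
        double[] ry = Ranks(y);
        double mx = rx.Average(), my = ry.Average();
        double sxy = 0.0, sxx = 0.0, syy = 0.0;
        for (int i = 0; i < rx.Length; i++)
        {
            sxy += (rx[i] - mx) * (ry[i] - my);
            sxx += (rx[i] - mx) * (rx[i] - mx);
            syy += (ry[i] - my) * (ry[i] - my);
        }
        return sxy / Math.Sqrt(sxx * syy);
    }
}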
Example #10
 /// <summary>
 /// Performs a Pearson correlation test for association.
 /// </summary>
 /// <returns>The result of the test.</returns>
 /// <remarks>
 /// <para>This test measures the strength of the linear correlation between two variables. The
 /// test statistic r is simply the covariance of the two variables, scaled by their respective
 /// standard deviations so as to obtain a number between -1 (perfect linear anti-correlation)
 /// and +1 (perfect linear correlation).</para>
 /// <para>The Pearson test cannot reliably detect or rule out non-linear associations. For example,
 /// variables with a perfect quadratic association may have only a weak linear correlation. If
 /// you wish to test for associations that may not be linear, consider using the Spearman or
 /// Kendall tests instead.</para>
 /// <para>The Pearson correlation test requires O(N) operations.</para>
 /// <para>The Pearson test requires at least three bivariate values.</para>
 /// </remarks>
 /// <exception cref="InsufficientDataException"><see cref="Count"/> is less than three.</exception>
 /// <seealso cref="SpearmanRhoTest"/>
 /// <seealso cref="KendallTauTest"/>
 /// <seealso href="http://en.wikipedia.org/wiki/Pearson_correlation_coefficient" />
 public TestResult PearsonRTest()
 {
      return Bivariate.PearsonRTest(xData, yData);
 }
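
A standalone sketch of the statistic described above: the covariance of the two variables scaled by their standard deviations.

using System;

public static class PearsonRSketch
{
    // r = sum (x_i - xbar)(y_i - ybar) / sqrt( sum (x_i - xbar)^2 * sum (y_i - ybar)^2 )
    public static double R(double[] x, double[] y)
    {
        int n = x.Length;
        double mx = 0.0, my = 0.0;
        for (int i = 0; i < n; i++) { mx += x[i]; my += y[i]; }
        mx /= n;
        my /= n;

        double sxy = 0.0, sxx = 0.0, syy = 0.0;
        for (int i = 0; i < n; i++)
        {
            sxy += (x[i] - mx) * (y[i] - my);
            sxx += (x[i] - mx) * (x[i] - mx);
            syy += (y[i] - my) * (y[i] - my);
        }
        return sxy / Math.Sqrt(sxx * syy);
    }
}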
        // We need a goodness-of-fit measurement

        internal LinearLogisticRegressionResult(IReadOnlyList<double> x, IReadOnlyList<bool> y)
        {
            Debug.Assert(x != null);
            Debug.Assert(y != null);
            Debug.Assert(x.Count == y.Count);

            // check size of data set
            int n = x.Count;

            if (n < 3)
            {
                throw new InsufficientDataException();
            }

            // The linear logistic model is:
            //   p_i = \sigma(t_i) \quad t_i = a + b x_i
            // So the log likelihood of the data set under the model is:
            //   \ln L = \sum_{{\rm true} i} \ln p_i + \sum_{{\rm false} i} \ln (1 - p_i)
            //         = \sum_{{\rm true} i} \ln \sigma(t_i) + \sum_{{\rm false} i} \ln (1 - \sigma(t_i))
            // Taking derivatives:
            //   \frac{\partial L}{\partial a} = \sum_{{\rm true} i} \frac{\sigma'(t_i)}{\sigma(t_i)}
            //     + \sum_{{\rm false} i} \frac{-\sigma'(t_i)}{1 - \sigma(t_i)}
            //   \frac{\partial L}{\partial b} = \sum_{{\rm true} i} \frac{\sigma'(t_i)}{\sigma(t_i)} x_i
            //     + \sum_{{\rm false} i} \frac{-\sigma'(t_i)}{1 - \sigma(t_i)} x_i
            // Using \sigma(t) = \frac{1}{1 + e^{-t}}, we can derive:
            //   \frac{\sigma'(t)}{\sigma(t)} = \sigma(-t)
            //   \frac{\sigma'(t)}{1 - \sigma(t)} = \sigma(t)
            // So this becomes
            //   \frac{\partial L}{\partial a} = \sum_i \pm \sigma(\mp t_i)
            //   \frac{\partial L}{\partial b} = \sum_i \pm \sigma(\mp t_i) x_i
            // where the upper sign is for true values and the lower sign is for false values.
            // Find the simultaneous zeros of these equations to obtain the likelihood-maximizing a, b.

            // To get the curvature matrix, we need the second derivatives.
            //   \frac{\partial^2 L}{\partial a^2} = - \sum_i \sigma'(\mp t_i)
            //   \frac{\partial^2 L}{\partial a \partial b} = - \sum_i \sigma'(\mp t_i) x_i
            //   \frac{\partial^2 L}{\partial b^2} = - \sum_i \sigma'(\mp t_i) x_i^2

            // We need an initial guess at the parameters. Begin with the Ansatz of the logistic model:
            //    \frac{p}{1-p} = e^{\alpha + \beta x}
            // Differentiate and do some algebra to get:
            //    \frac{\partial p}{\partial x} = \beta p ( 1 - p)
            // Evaluating at means, and noting that p (1 - p) = var(y) and that, in a development around the means,
            //    cov(p, x) = \frac{\partial p}{\partial x} var(x)
            // we get
            //    \beta = \frac{cov(y, x)}{var(x) var(y)}
            // This approximation gets the sign right, but it looks like it usually gets the magnitude quite wrong.
            // The problem with the approach is that var(y) = p (1 - p) assumes y are chosen with fixed p, but they aren't.
            // We need to re-visit this analysis.

            double xMean, yMean, xxSum, yySum, xySum;

            Bivariate.ComputeBivariateMomentsUpToTwo(x, y.Select(z => z ? 1.0 : 0.0), out n, out xMean, out yMean, out xxSum, out yySum, out xySum);
            double p  = yMean;
            double b0 = xySum / xxSum / yySum * n;
            double a0 = Math.Log(p / (1.0 - p)) - b0 * xMean;

             Func<IReadOnlyList<double>, IReadOnlyList<double>> J = (IReadOnlyList<double> a) => {
                double dLda = 0.0;
                double dLdb = 0.0;
                for (int i = 0; i < n; i++)
                {
                    double t = a[0] + a[1] * x[i];
                    if (y[i])
                    {
                        double s = Sigma(-t);
                        dLda += s;
                        dLdb += s * x[i];
                    }
                    else
                    {
                        double s = Sigma(t);
                        dLda -= s;
                        dLdb -= s * x[i];
                    }
                }
                 return new double[] { dLda, dLdb };
            };

            ColumnVector b = MultiFunctionMath.FindZero(J, new double[] { a0, b0 });

            SymmetricMatrix C = new SymmetricMatrix(2);

            for (int i = 0; i < n; i++)
            {
                double t = b[0] + b[1] * x[i];
                if (y[i])
                {
                    t = -t;
                }
                double e  = Math.Exp(-t);
                double sp = e / MoreMath.Sqr(1.0 + e);
                C[0, 0] += sp;
                C[0, 1] += sp * x[i];
                C[1, 1] += sp * x[i] * x[i];
            }
            CholeskyDecomposition CD = C.CholeskyDecomposition();

            if (CD == null)
            {
                throw new DivideByZeroException();
            }
            C = CD.Inverse();

            best       = b;
            covariance = C;
        }
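
The Sigma helper invoked in the gradient above is not included in this excerpt; based on the derivation comments, it is presumably the logistic function. A straightforward sketch under that assumption:

// Presumed logistic helper (not shown in the excerpt): Sigma(t) = 1 / (1 + e^{-t}).
private static double Sigma(double t)
{
    return 1.0 / (1.0 + Math.Exp(-t));
}

The quantity sp computed in the curvature loop is then sigma'(t) = e^{-t} / (1 + e^{-t})^2, which by the identities quoted in the comments equals Sigma(t) * Sigma(-t).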