Example #1
        private static void FitToNormalInternal(IEnumerable <double> sample,
                                                out double m, out double dm, out double s, out double ds)
        {
            // We factor out this method because it is used by both the normal and the log-normal fit methods.

            // Maximum likelihood estimate is straightforward.
            //   p_i = \frac{1}{\sqrt{2\pi}\sigma} \exp \left[ -\frac{1}{2} \left( \frac{x_i - \mu}{\sigma} \right)^2 \right]
            //   \ln p_i = -\ln (\sqrt{2\pi} \sigma) - \frac{1}{2} \left( \frac{x_i - \mu}{\sigma} \right)^2
            //   \ln L = \sum_i \ln p_i = -n \ln (\sqrt{2\pi} \sigma) - \frac{1}{2 \sigma^2} \sum_i (x_i - \mu)^2
            //  so
            //    \frac{\partial \ln L}{\partial \mu} = \sum_i \frac{x_i - \mu}{\sigma^2}
            //    \frac{\partial \ln L}{\partial \sigma} = -\frac{n}{\sigma} + \frac{1}{\sigma^3} \sum_i (x_i - \mu)^2
            //  Setting equal to zero and solving gives the unsurprising result
            //     \mu = n^{-1} \sum_i x_i
            //     \sigma^2 = n^{-1} \sum_i (x_i - \mu)^2
            //  i.e., MLE says to estimate the model mean and variance by the sample mean and variance.

            // MLE estimators are guaranteed to be asymptotically unbiased, but they can be biased for finite n.
            // You can see that this must be the case for \sigma^2: its denominator has n where the unbiased sample variance has n - 1.

            // To un-bias our estimators, we will derive exact distributions for these quantities.

            // First the mean estimator. Start from x_i \sim N(\mu, \sigma). By the addition of normal deviates,
            //   \sum_i x_i \sim N(n \mu, \sqrt{n} \sigma). So
            //   m  = \frac{1}{n} \sum_i x_i \sim N(\mu, \sigma / \sqrt{n}),
            // which means the estimator m is normally distributed with mean \mu and standard deviation
            // \sigma / \sqrt{n}. Now we know that m is unbiased and we know its variance.

            int    n;
            double ss;

            Univariate.ComputeMomentsUpToSecond(sample, out n, out m, out ss);
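            // Note that Math.Sqrt(ss) / n = Math.Sqrt(ss / n) / Math.Sqrt(n), i.e. the MLE standard deviation over \sqrt{n}.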
            dm = Math.Sqrt(ss) / n;

            // Next the variance estimator. By the definition of the chi squared distribution and a bit of algebra that
            // reduces the degrees of freedom by one, u^2 = \sum_i ( \frac{x_i - m}{\sigma} )^2 \sim \chi^2(n - 1), which has
            // mean n - 1 and variance 2(n-1). Therefore the estimator
            //   v = \sigma^2 u^2 / (n-1) = \frac{1}{n-1} \sum_i ( x_i - m )^2
            // has mean \sigma^2 and variance 2 \sigma^4 / (n-1).

            // If we consider \sigma^2 the parameter, we are done -- we have derived an estimator that is unbiased and
            // whose variance we know. But we don't consider \sigma^2 the parameter; we consider \sigma the parameter.
            // The mean of the square root is not the square root of the mean, so the square root of an unbiased
            // estimator of \sigma^2 will not be an unbiased estimator of \sigma. If we want an unbiased estimator of \sigma
            // itself, we need to go a bit further. Since u^2 \sim \chi^2(n-1), u \sim \chi(n-1). Its mean is a complicated ratio
            // of Gamma functions and its variance is an even more complicated difference whose evaluation can be delicate,
            // but our machinery in the ChiDistribution class handles that. To get an unbiased estimator of \sigma, we just
            // need to apply the same principle of dividing by the mean of this distribution:
            //   s = \sigma u / <u> = \sqrt{\sum_i (x_i - m)^2} / <u>
            // to get an estimator with mean \sigma and known variance.

            ChiDistribution d = new ChiDistribution(n - 1);

            s  = Math.Sqrt(ss) / d.Mean;
            ds = d.StandardDeviation / d.Mean * s;
        }
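To see the size of the effect this correction addresses, here is a minimal standalone sketch (illustrative only, not library code; the NormalDeviate helper is a hypothetical stand-in for a normal random generator). For n = 5, the naive estimator \sqrt{\sum_i (x_i - m)^2 / (n - 1)} has mean about 0.940 \sigma, i.e. it underestimates \sigma by about 6 percent:

using System;

internal static class SigmaBiasSketch
{
    internal static void Main()
    {
        // Average the naive estimator sqrt(ss / (n - 1)) over many samples
        // of size n drawn from N(0, 1); an unbiased estimator would average 1.
        Random rng = new Random(1);
        int n = 5, trials = 200000;
        double sum = 0.0;
        for (int t = 0; t < trials; t++)
        {
            double[] x = new double[n];
            for (int i = 0; i < n; i++) x[i] = NormalDeviate(rng);
            double m = 0.0;
            for (int i = 0; i < n; i++) m += x[i];
            m /= n;
            double ss = 0.0;
            for (int i = 0; i < n; i++) ss += (x[i] - m) * (x[i] - m);
            sum += Math.Sqrt(ss / (n - 1));
        }
        // Prints about 0.940, not 1: the mean of chi(4) divided by sqrt(4).
        Console.WriteLine(sum / trials);
    }

    // Box-Muller transform: one standard normal deviate per call.
    private static double NormalDeviate(Random rng)
    {
        double u = 1.0 - rng.NextDouble(), v = rng.NextDouble();
        return Math.Sqrt(-2.0 * Math.Log(u)) * Math.Cos(2.0 * Math.PI * v);
    }
}

The 0.940 here is exactly the mean of \chi(4) over \sqrt{n - 1}, which is the factor the ChiDistribution-based estimator divides out.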
Example #2
        internal MultiLinearRegressionResult(IReadOnlyList <double> yColumn, IReadOnlyList <IReadOnlyList <double> > xColumns, IReadOnlyList <string> xNames) : base()
        {
            Debug.Assert(yColumn != null);
            Debug.Assert(xColumns != null);
            Debug.Assert(xColumns.Count > 0);
            Debug.Assert(xNames.Count == xColumns.Count);

            n = yColumn.Count;
            m = xColumns.Count;
            if (n <= m)
            {
                throw new InsufficientDataException();
            }

            // Compute the design matrix X.
            interceptIndex = -1;
            RectangularMatrix X = new RectangularMatrix(n, m);

            for (int c = 0; c < m; c++)
            {
                IReadOnlyList <double> xColumn = xColumns[c];
                if (xColumn == null)
                {
                    Debug.Assert(xNames[c] == "Intercept");
                    Debug.Assert(interceptIndex < 0);
                    for (int r = 0; r < n; r++)
                    {
                        X[r, c] = 1.0;
                    }
                    interceptIndex = c;
                }
                else
                {
                    Debug.Assert(xNames[c] != null);
                    if (xColumn.Count != n)
                    {
                        throw new DimensionMismatchException();
                    }
                    for (int r = 0; r < n; r++)
                    {
                        X[r, c] = xColumn[r];
                    }
                }
            }
            Debug.Assert(interceptIndex >= 0);
            ColumnVector v = new ColumnVector(yColumn);

            // Use X = QR to solve X b = y and compute C.
            QRDecomposition.SolveLinearSystem(X, v, out b, out C);

            // For ANOVA, we will need mean and variance of y
            int    yn;
            double ym;

            Univariate.ComputeMomentsUpToSecond(yColumn, out yn, out ym, out SST);

            // Compute residuals
            SSR = 0.0;
            SSF = 0.0;
            ColumnVector yHat = X * b;

            residuals = new List <double>(n);
            for (int i = 0; i < n; i++)
            {
                double z = yColumn[i] - yHat[i];
                residuals.Add(z);
                SSR += z * z;
                SSF += MoreMath.Sqr(yHat[i] - ym);
            }
            sigma2 = SSR / (n - m);

            // Scale up C by \sigma^2
            // (It sure would be great to be able to overload *=.)
            for (int i = 0; i < m; i++)
            {
                for (int j = i; j < m; j++)
                {
                    C[i, j] = C[i, j] * sigma2;
                }
            }

            names = xNames;
        }
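For intuition about the quantities this constructor computes, here is a tiny self-contained sketch (illustrative, not library code) of the single-predictor-plus-intercept case, solved via explicit normal equations instead of the QR route used above: b = (X^T X)^{-1} X^T y, and the parameter covariance is \sigma^2 (X^T X)^{-1}, which is exactly what the C-scaling loop produces. (QR is numerically preferable; normal equations are used here only to make the algebra visible.)

using System;

internal static class NormalEquationsSketch
{
    internal static void Main()
    {
        // Fit y = b0 + b1 * x to a tiny data set. The design matrix X = [1 x]
        // has an all-ones intercept column, like the interceptIndex column above.
        double[] x = { 1, 2, 3, 4, 5 };
        double[] y = { 1.1, 1.9, 3.2, 3.9, 5.1 };
        int n = x.Length, m = 2; // m counts parameters, including the intercept

        // Accumulate the entries of X^T X and X^T y.
        double sx = 0, sxx = 0, sy = 0, sxy = 0;
        for (int i = 0; i < n; i++)
        {
            sx += x[i]; sxx += x[i] * x[i];
            sy += y[i]; sxy += x[i] * y[i];
        }

        // Solve the 2x2 system (X^T X) b = X^T y by Cramer's rule.
        double det = n * sxx - sx * sx;
        double b0 = (sxx * sy - sx * sxy) / det;
        double b1 = (n * sxy - sx * sy) / det;

        // sigma^2 = SSR / (n - m); the diagonal of sigma^2 (X^T X)^{-1}
        // gives the parameter variances, matching the C-scaling loop above.
        double ssr = 0.0;
        for (int i = 0; i < n; i++)
        {
            double z = y[i] - (b0 + b1 * x[i]);
            ssr += z * z;
        }
        double sigma2 = ssr / (n - m);
        double varB0 = sigma2 * sxx / det;
        double varB1 = sigma2 * n / det;

        Console.WriteLine($"b0 = {b0} +/- {Math.Sqrt(varB0)}");
        Console.WriteLine($"b1 = {b1} +/- {Math.Sqrt(varB1)}");
    }
}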
Example #3
        /// <summary>
        /// Performs a Spearman rank-order test of association between the two variables.
        /// </summary>
        /// <param name="x">The values of the first variable.</param>
        /// <param name="y">The values of the second variable.</param>
        /// <returns>The result of the test.</returns>
        /// <remarks>
        /// <para>The Spearman rank-order test of association is a non-parametric test for association between
        /// two variables. The test statistic rho is the correlation coefficient of the <em>rank</em> of
        /// each entry in the sample. It is thus invariant under monotonic re-parameterizations of the data,
        /// and will, for example, detect an exponential or logarithmic association just as well as a linear
        /// association.</para>
        /// <para>The Spearman rank-order test requires O(N log N) operations.</para>
        /// </remarks>
        /// <exception cref="ArgumentNullException"><paramref name="x"/> or <paramref name="y"/> is <see langword="null"/>.</exception>
        /// <exception cref="DimensionMismatchException"><paramref name="x"/> and <paramref name="y"/> do not contain the same number of entries.</exception>
        /// <exception cref="InsufficientDataException">There are fewer than three data points.</exception>
        /// <seealso cref="PearsonRTest(IReadOnlyList{double},IReadOnlyList{double})"/>
        /// <seealso cref="KendallTauTest"/>
        /// <seealso href="http://en.wikipedia.org/wiki/Spearman%27s_rank_correlation_coefficient"/>
        public static TestResult SpearmanRhoTest(IReadOnlyList <double> x, IReadOnlyList <double> y)
        {
            if (x == null)
            {
                throw new ArgumentNullException(nameof(x));
            }
            if (y == null)
            {
                throw new ArgumentNullException(nameof(y));
            }
            if (x.Count != y.Count)
            {
                throw new DimensionMismatchException();
            }

            int n = x.Count;

            if (n < 3)
            {
                throw new InsufficientDataException();
            }

            // Find the ranks.
            int[] rx = Univariate.GetRanks(x);
            int[] ry = Univariate.GetRanks(y);

            // Compute the statistic and its null distribution.
            // Use analytic expressions for the mean M and variance V of the ranks.
            // C is the covariance of the ranks, rho is just the corresponding correlation coefficient.
            // S encodes the same information, but as an integer that varies in steps of one, so
            // its null distribution can be described by a DiscreteDistribution.
            double M = (n - 1) / 2.0;
            double V = (n + 1) * (n - 1) / 12.0;
            int    S = 0;
            double C = 0.0;

            for (int i = 0; i < n; i++)
            {
                // Statisticians define S using 1-based ranks, so add 1 to each
                // rank when computing S. This isn't important for C, because
                // we are subtracting off the mean.
                S += (rx[i] + 1) * (ry[i] + 1);
                C += (rx[i] - M) * (ry[i] - M);
            }
            C = C / n;
            double rho = C / V;

            // Compute the null distribution.
            if (n < 12)
            {
                // For small enough samples, use the exact distribution.
                // It would be nice to do this for at least slightly higher n, but the time to compute the exact
                // distribution grows dramatically with n. I would like to return in less than about 100ms.
                // Current timings: 35 ms for n = 10, 72 ms for n = 11, and 190 ms for n = 12.
                DiscreteDistribution   sDistribution   = new SpearmanExactDistribution(n);
                ContinuousDistribution rhoDistribution = new DiscreteAsContinuousDistribution(new SpearmanExactDistribution(n), Interval.FromEndpoints(-1.0, 1.0));
                return(new TestResult("s", S, sDistribution, "ρ", rho, rhoDistribution, TestType.TwoTailed));
            }
            else
            {
                // For larger samples, use the normal approximation.
                // It would be nice to fit support and/or fourth cumulant.
                // I was not happy with an Edgeworth expansion, which can fit the fourth cumulant, but screws up the tails
                // badly, even giving negative probabilities for extreme values, which are quite likely for null-violating samples.
                // Look into bounded quasi-normal distributions such as the logit-normal and truncated normal.
                ContinuousDistribution rhoDistribution = new NormalDistribution(0.0, 1.0 / Math.Sqrt(n - 1));
                return(new TestResult("ρ", rho, rhoDistribution, TestType.TwoTailed));
            }
        }
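A hypothetical usage sketch. It assumes, as the identifiers here suggest, that this is Meta.Numerics, that the method lives on the static Bivariate class, and that TestResult exposes Statistic and Probability members; verify those names against your version of the library.

using System;
using System.Collections.Generic;
using Meta.Numerics.Statistics; // assumed namespace

internal static class SpearmanSketch
{
    internal static void Main()
    {
        // A noisy exponential association: monotonic but very non-linear.
        Random rng = new Random(1);
        List<double> x = new List<double>();
        List<double> y = new List<double>();
        for (int i = 0; i < 50; i++)
        {
            double xi = rng.NextDouble();
            x.Add(xi);
            y.Add(Math.Exp(3.0 * xi) + 0.5 * rng.NextDouble());
        }

        // rho should be close to 1 and the two-tailed P very small.
        TestResult result = Bivariate.SpearmanRhoTest(x, y);
        Console.WriteLine(result.Statistic.Value); // member names assumed
        Console.WriteLine(result.Probability);     // member names assumed
    }
}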
Example #4
        internal PolynomialRegressionResult(IReadOnlyList <double> x, IReadOnlyList <double> y, int degree) : base()
        {
            Debug.Assert(x != null);
            Debug.Assert(y != null);
            Debug.Assert(x.Count == y.Count);
            Debug.Assert(degree >= 0);

            m = degree;
            n = x.Count;
            if (n < (m + 1))
            {
                throw new InsufficientDataException();
            }

            // Construct the n x (m + 1) design matrix X_{ij} = x_i^j
            RectangularMatrix X = new RectangularMatrix(n, m + 1);
            ColumnVector      Y = new ColumnVector(n);

            for (int i = 0; i < n; i++)
            {
                double x_i = x[i];
                X[i, 0] = 1.0;
                for (int j = 1; j <= m; j++)
                {
                    X[i, j] = X[i, j - 1] * x_i;
                }
                double y_i = y[i];
                Y[i] = y_i;
            }

            // Use X = QR to solve X b = y and compute C
            QRDecomposition.SolveLinearSystem(X, Y, out b, out C);

            // Compute mean and total sum of squares.
            // This could be done inside loop above, but this way we get to re-use code from Univariate.
            double yMean;

            Univariate.ComputeMomentsUpToSecond(y, out n, out yMean, out SST);

            // Compute residuals
            SSR = 0.0;
            SSF = 0.0;
            ColumnVector yHat = X * b;

            residuals = new List <double>(n);
            for (int i = 0; i < n; i++)
            {
                double z = y[i] - yHat[i];
                residuals.Add(z);
                SSR += z * z;
                SSF += MoreMath.Sqr(yHat[i] - yMean);
            }
            sigma2 = SSR / (n - (m + 1));

            // Scale up C by \sigma^2
            // (It sure would be great to be able to overload *=.)
            for (int i = 0; i <= m; i++)
            {
                for (int j = i; j <= m; j++)
                {
                    C[i, j] = C[i, j] * sigma2;
                }
            }
        }
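Two small standalone helpers (illustrative, not library code; DesignMatrix and Evaluate are hypothetical names) that mirror what the constructor does: the running-product construction of the Vandermonde design matrix X_{ij} = x_i^j, and Horner evaluation of the fitted polynomial \sum_j b_j x^j from the coefficient vector b:

using System;

internal static class VandermondeSketch
{
    // Build the n x (degree + 1) Vandermonde design matrix using the same
    // running-product recurrence X[i, j] = X[i, j - 1] * x_i as above.
    internal static double[,] DesignMatrix(double[] x, int degree)
    {
        int n = x.Length;
        double[,] X = new double[n, degree + 1];
        for (int i = 0; i < n; i++)
        {
            X[i, 0] = 1.0;
            for (int j = 1; j <= degree; j++) X[i, j] = X[i, j - 1] * x[i];
        }
        return X;
    }

    // Evaluate sum_j b[j] x^j by Horner's rule.
    internal static double Evaluate(double[] b, double x)
    {
        double y = 0.0;
        for (int j = b.Length - 1; j >= 0; j--) y = y * x + b[j];
        return y;
    }

    internal static void Main()
    {
        double[,] X = DesignMatrix(new double[] { 1.0, 2.0, 3.0 }, 2);
        Console.WriteLine(X[2, 2]); // 9
        Console.WriteLine(Evaluate(new double[] { 1.0, 0.0, 2.0 }, 3.0)); // 1 + 2 * 9 = 19
    }
}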
Example #5
        /// <summary>
        /// Find the Gumbel distribution that best fits the given sample.
        /// </summary>
        /// <param name="sample">The sample to fit.</param>
        /// <returns>The fit result.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="sample"/> is <see langword="null"/>.</exception>
        /// <exception cref="InsufficientDataException"><paramref name="sample"/> contains fewer than three values.</exception>
        public static GumbelFitResult FitToGumbel(this IReadOnlyList <double> sample)
        {
            if (sample == null)
            {
                throw new ArgumentNullException(nameof(sample));
            }
            if (sample.Count < 3)
            {
                throw new InsufficientDataException();
            }

            // To do a maximum likelihood fit, start from the log probability of each data point and aggregate to
            // obtain the log likelihood of the sample
            //   z_i = \frac{x_i - m}{s}
            //   -\ln p_i = \ln s + ( z_i + e^{-z_i})
            //   \ln L = \sum_i \ln p_i

            // Take derivatives wrt m and s.
            //   \frac{\partial \ln L}{\partial m} = \frac{1}{s} \sum_i ( 1 - e^{-z_i} )
            //   \frac{\partial \ln L}{\partial s} = \frac{1}{s} \sum_i ( -1 + z_i - z_i e^{-z_i} )

            // Set derivatives to zero to get a system of equations for the maximum.
            //    n = \sum_i e^{-z_i}
            //    n = \sum_i ( z_i - z_i e^{-z_i} )
            // that is, <e^{-z}> = 1 and <z> - <z e^{-z}> = 1.

            // To solve this system, pull e^{m/s} out of the sum in the first equation and solve for m
            //    n = e^{m / s} \sum_i e^{-x_i / s}
            //    m = -s \ln \left( \frac{1}{n} \sum_i e^{-x_i / s} \right) = -s \ln <e^{-x/s}>
            // Substituting this result into the second equation gets us to
            //    s = \bar{x} - \frac{ <x e^{-x/s}> }{ <e^{-x/s}> }
            // which involves only s. We can use a one-dimensional root-finder to determine s, then determine m
            // from the first equation.

            // To avoid exponentiating potentially large x_i, it's better to write the problem in terms
            // of d_i, where x_i = \bar{x} + d_i.
            //    m = \bar{x} - s \ln <e^{-d/s}>
            //    s = -\frac{ <d e^{-d/s}> }{ <e^{-d/s}> }

            // To get the covariance matrix, we need the curvature matrix at the minimum, so take more derivatives
            //    \frac{\partial^2 \ln L}{\partial m^2} = - \frac{1}{s^2} \sum_i e^{-z_i} = - \frac{n}{s^2}
            //    \frac{\partial^2 \ln L}{\partial m \partial s} = - \frac{n}{s^2} <z e^{-z}>
            //    \frac{\partial^2 \ln L}{\partial s^2} = - \frac{n}{s^2} ( <z^2 e^{-z}> + 1 )

            // Several crucial pieces of this analysis are taken from Mahdi and Cenac, "Estimating Parameters of Gumbel Distribution
            // "using the method of moments, probability weighted moments, and maximum likelihood", Revista de Mathematica:
            // Teoria y Aplicaciones 12 (2005) 151-156 (http://revistas.ucr.ac.cr/index.php/matematica/article/viewFile/259/239)

            // We will need the sample mean and standard deviation.
            int    n;
            double mean, stdDev;

            Univariate.ComputeMomentsUpToSecond(sample, out n, out mean, out stdDev);
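            // The last out parameter receives the sum of squared deviations; the next line converts it to the (biased, MLE) standard deviation.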
            stdDev = Math.Sqrt(stdDev / n);

            // Use the method of moments to get an initial estimate of s.
            double s0 = Math.Sqrt(6.0) / Math.PI * stdDev;

            // Define the function to zero
            Func <double, double> fnc = (double s) => {
                double u, v;
                MaximumLikelihoodHelper(sample, n, mean, s, out u, out v);
                return(s + v / u);
            };

            // Zero it to compute the best-fit s
            double s1 = FunctionMath.FindZero(fnc, s0);

            // Compute the corresponding best-fit m
            double u1, v1;

            MaximumLikelihoodHelper(sample, n, mean, s1, out u1, out v1);
            double m1 = mean - s1 * Math.Log(u1);

            // Compute the curvature matrix
            double w1 = 0.0;
            double w2 = 0.0;

            foreach (double x in sample)
            {
                double z = (x - m1) / s1;
                double e = Math.Exp(-z);
                w1 += z * e;
                w2 += z * z * e;
            }
            w1 /= sample.Count;
            w2 /= sample.Count;
            SymmetricMatrix C = new SymmetricMatrix(2);

            // The use of (n - 2) here in place of n is a very ad hoc attempt to increase accuracy.
            C[0, 0] = (n - 2) / (s1 * s1);
            C[0, 1] = (n - 2) / (s1 * s1) * w1;
            C[1, 1] = (n - 2) / (s1 * s1) * (w2 + 1.0);
            SymmetricMatrix CI = C.CholeskyDecomposition().Inverse();

            // Compute goodness-of-fit
            GumbelDistribution dist = new GumbelDistribution(m1, s1);
            TestResult         test = sample.KolmogorovSmirnovTest(dist);

            return(new GumbelFitResult(m1, s1, CI[0, 0], CI[1, 1], CI[0, 1], test));
        }
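A hypothetical end-to-end check of the fit (assuming the Meta.Numerics namespace suggested by the identifiers above, and that GumbelFitResult exposes Location and Scale estimates; those member names are assumptions). The sample is drawn by CDF inversion: if u ~ U(0, 1), then m - s \ln(-\ln u) is Gumbel(m, s) distributed:

using System;
using System.Collections.Generic;
using Meta.Numerics.Statistics; // assumed namespace

internal static class GumbelFitSketch
{
    internal static void Main()
    {
        // Draw 1000 points from Gumbel(m = 1, s = 2) by inverting the CDF
        // F(x) = exp(-exp(-(x - m)/s)).
        Random rng = new Random(2);
        List<double> sample = new List<double>();
        for (int i = 0; i < 1000; i++)
        {
            sample.Add(1.0 - 2.0 * Math.Log(-Math.Log(rng.NextDouble())));
        }

        // The fitted location and scale should come out near 1 and 2.
        GumbelFitResult fit = sample.FitToGumbel();
        Console.WriteLine(fit.Location); // assumed property name
        Console.WriteLine(fit.Scale);    // assumed property name
    }
}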