// Shared implementation for both the normal and the log-normal fit methods.
//
// Maximum likelihood gives the familiar estimators
//   \mu = n^{-1} \sum_i x_i
//   \sigma^2 = n^{-1} \sum_i (x_i - \mu)^2
// i.e. estimate the model mean and variance by the sample mean and variance. MLE estimators
// are guaranteed to be asymptotically unbiased, but can be biased at finite n; the n (rather
// than n-1) in the \sigma^2 denominator shows that must be the case here. To de-bias, we use
// the exact sampling distributions of these quantities.
//
// Mean: since x_i ~ N(\mu, \sigma), addition of normal deviates gives
// \sum_i x_i ~ N(n \mu, \sqrt{n} \sigma), so
//   m = n^{-1} \sum_i x_i ~ N(\mu, \sigma / \sqrt{n})
// i.e. m is unbiased with known standard deviation \sigma / \sqrt{n}.
//
// Standard deviation: by the definition of the chi-squared distribution (and a bit of algebra
// that reduces the degrees of freedom by one),
//   u^2 = \sum_i ((x_i - m) / \sigma)^2 ~ \chi^2(n-1)
// with mean n-1 and variance 2(n-1). That de-biases an estimator of \sigma^2, but we want an
// unbiased estimator of \sigma itself: the mean of a square root is not the square root of the
// mean. Since u^2 ~ \chi^2(n-1), u ~ \chi(n-1), whose mean is a ratio of Gamma functions and
// whose variance is a delicate difference -- both handled by the ChiDistribution class. Dividing
// by that mean,
//   s = \sigma u / <u> = \sqrt{\sum_i (x_i - m)^2} / <u>
// gives an estimator with mean \sigma and known variance.
private static void FitToNormalInternal(IEnumerable<double> sample, out double m, out double dm, out double s, out double ds) {
    int count;
    double sumSquaredDeviations;
    Univariate.ComputeMomentsUpToSecond(sample, out count, out m, out sumSquaredDeviations);
    // Uncertainty of the mean: \sqrt{ss} / n = \sqrt{ss / n} / \sqrt{n}, the sample-based
    // estimate of \sigma / \sqrt{n}.
    dm = Math.Sqrt(sumSquaredDeviations) / count;
    // De-biased estimate of \sigma and its uncertainty via the \chi(n-1) distribution.
    ChiDistribution chi = new ChiDistribution(count - 1);
    s = Math.Sqrt(sumSquaredDeviations) / chi.Mean;
    ds = chi.StandardDeviation / chi.Mean * s;
}
// Fits y = X b by least squares, where the design matrix X is assembled from the given
// x-columns (a null column denotes the all-ones intercept column). Also computes the
// parameter covariance matrix, ANOVA sums of squares, and the residuals.
internal MultiLinearRegressionResult(IReadOnlyList<double> yColumn, IReadOnlyList<IReadOnlyList<double>> xColumns, IReadOnlyList<string> xNames) : base() {
    Debug.Assert(yColumn != null);
    Debug.Assert(xColumns != null);
    Debug.Assert(xColumns.Count > 0);
    Debug.Assert(xNames.Count == xColumns.Count);

    n = yColumn.Count;
    m = xColumns.Count;
    // Need more observations than parameters for the fit (and residual variance) to be defined.
    if (n <= m) throw new InsufficientDataException();

    // Assemble the n x m design matrix. A null x-column stands for the intercept and is
    // filled with ones; exactly one such column is expected.
    interceptIndex = -1;
    RectangularMatrix X = new RectangularMatrix(n, m);
    for (int column = 0; column < m; column++) {
        IReadOnlyList<double> xColumn = xColumns[column];
        if (xColumn == null) {
            Debug.Assert(xNames[column] == "Intercept");
            Debug.Assert(interceptIndex < 0);
            for (int row = 0; row < n; row++) {
                X[row, column] = 1.0;
            }
            interceptIndex = column;
        } else {
            Debug.Assert(xNames[column] != null);
            if (xColumn.Count != n) throw new DimensionMismatchException();
            for (int row = 0; row < n; row++) {
                X[row, column] = xColumn[row];
            }
        }
    }
    Debug.Assert(interceptIndex >= 0);

    // Solve X b = y via QR decomposition; also yields the (unscaled) covariance matrix C.
    ColumnVector y = new ColumnVector(yColumn);
    QRDecomposition.SolveLinearSystem(X, y, out b, out C);

    // For ANOVA we need the mean of y and the total sum of squares.
    int yCount;
    double yMean;
    Univariate.ComputeMomentsUpToSecond(yColumn, out yCount, out yMean, out SST);

    // Residual and fit sums of squares.
    SSR = 0.0;
    SSF = 0.0;
    ColumnVector yHat = X * b;
    residuals = new List<double>(n);
    for (int row = 0; row < n; row++) {
        double residual = yColumn[row] - yHat[row];
        residuals.Add(residual);
        SSR += residual * residual;
        SSF += MoreMath.Sqr(yHat[row] - yMean);
    }
    sigma2 = SSR / (n - m);

    // Scale C (upper triangle; C is symmetric) by \sigma^2 to obtain the parameter covariances.
    // (It sure would be great to be able to overload *=.)
    for (int row = 0; row < m; row++) {
        for (int column = row; column < m; column++) {
            C[row, column] = C[row, column] * sigma2;
        }
    }

    names = xNames;
}
// Fits y to a polynomial of the given degree in x by least squares, via the Vandermonde
// design matrix X_{ij} = x_i^j. Also computes the parameter covariance matrix, ANOVA
// sums of squares, and the residuals.
internal PolynomialRegressionResult(IReadOnlyList<double> x, IReadOnlyList<double> y, int degree) : base() {
    Debug.Assert(x != null);
    Debug.Assert(y != null);
    Debug.Assert(x.Count == y.Count);
    Debug.Assert(degree >= 0);

    m = degree;
    n = x.Count;
    // A degree-m polynomial has m + 1 coefficients; need at least that many points.
    if (n < (m + 1)) throw new InsufficientDataException();

    // Build the n x (m + 1) design matrix by accumulating powers of each x value.
    RectangularMatrix X = new RectangularMatrix(n, m + 1);
    ColumnVector Y = new ColumnVector(n);
    for (int row = 0; row < n; row++) {
        double xValue = x[row];
        X[row, 0] = 1.0;
        for (int power = 1; power <= m; power++) {
            X[row, power] = X[row, power - 1] * xValue;
        }
        Y[row] = y[row];
    }

    // Solve X b = y via QR decomposition; also yields the (unscaled) covariance matrix C.
    QRDecomposition.SolveLinearSystem(X, Y, out b, out C);

    // Mean and total sum of squares for ANOVA. This could be folded into the loop above,
    // but this way we re-use the Univariate code.
    double yMean;
    Univariate.ComputeMomentsUpToSecond(y, out n, out yMean, out SST);

    // Residual and fit sums of squares.
    SSR = 0.0;
    SSF = 0.0;
    ColumnVector yHat = X * b;
    residuals = new List<double>(n);
    for (int row = 0; row < n; row++) {
        double residual = y[row] - yHat[row];
        residuals.Add(residual);
        SSR += residual * residual;
        SSF += MoreMath.Sqr(yHat[row] - yMean);
    }
    sigma2 = SSR / (n - (m + 1));

    // Scale C (upper triangle; C is symmetric) by \sigma^2 to obtain the parameter covariances.
    // (It sure would be great to be able to overload *=.)
    for (int row = 0; row <= m; row++) {
        for (int column = row; column <= m; column++) {
            C[row, column] = C[row, column] * sigma2;
        }
    }
}
/// <summary>
/// Find the Gumbel distribution that best fit the given sample.
/// </summary>
/// <param name="sample">The sample to fit.</param>
/// <returns>The fit result.</returns>
/// <exception cref="ArgumentNullException"><paramref name="sample"/> is <see langword="null"/>.</exception>
/// <exception cref="InsufficientDataException"><paramref name="sample"/> contains fewer than three values.</exception>
public static GumbelFitResult FitToGumbel(this IReadOnlyList<double> sample) {
    if (sample == null) throw new ArgumentNullException(nameof(sample));
    if (sample.Count < 3) throw new InsufficientDataException();

    // Maximum likelihood fit. With z_i = (x_i - m) / s, each point contributes
    //   -\ln p_i = \ln s + ( z_i + e^{-z_i} )
    // Setting \partial \ln L / \partial m and \partial \ln L / \partial s to zero gives
    //   n = \sum_i e^{-z_i}
    //   n = \sum_i ( z_i - z_i e^{-z_i} )
    // i.e. <e^{-z}> = 1 and <z> - <z e^{-z}> = 1. Pulling e^{m/s} out of the first equation
    // and substituting into the second reduces the system to a single equation in s,
    //   s = \bar{x} - <x e^{-x/s}> / <e^{-x/s}>
    // which we solve with a one-dimensional root-finder, then recover m from
    //   m = -s \ln <e^{-x/s}>
    // To avoid exponentiating potentially large x_i, the helper works with deviations
    // d_i = x_i - \bar{x}, in terms of which
    //   m = \bar{x} - s \ln <e^{-d/s}>
    //   s = - <d e^{-d/s}> / <e^{-d/s}>
    // For the covariance matrix we need the curvature at the maximum:
    //   \partial^2 \ln L / \partial m^2 = -n / s^2
    //   \partial^2 \ln L / \partial m \partial s = -(n / s^2) <z e^{-z}>
    //   \partial^2 \ln L / \partial s^2 = -(n / s^2) ( <z^2 e^{-z}> + 1 )
    // Several crucial pieces of this analysis are taken from Mahdi and Cenac, "Estimating
    // Parameters of Gumbel Distribution using the method of moments, probability weighted
    // moments, and maximum likelihood", Revista de Mathematica: Teoria y Aplicaciones 12
    // (2005) 151-156 (http://revistas.ucr.ac.cr/index.php/matematica/article/viewFile/259/239)

    // We need the sample mean and standard deviation.
    int n;
    double mean, sumSquaredDeviations;
    Univariate.ComputeMomentsUpToSecond(sample, out n, out mean, out sumSquaredDeviations);
    double stdDev = Math.Sqrt(sumSquaredDeviations / n);

    // Method-of-moments value of s as the starting guess for the root-finder.
    double sGuess = Math.Sqrt(6.0) / Math.PI * stdDev;

    // The one-dimensional function of s whose zero is the maximum-likelihood scale.
    Func<double, double> scaleEquation = (double s) => {
        double u, v;
        MaximumLikelihoodHelper(sample, n, mean, s, out u, out v);
        return (s + v / u);
    };
    double sFit = FunctionMath.FindZero(scaleEquation, sGuess);

    // Recover the corresponding best-fit location m.
    double uFit, vFit;
    MaximumLikelihoodHelper(sample, n, mean, sFit, out uFit, out vFit);
    double mFit = mean - sFit * Math.Log(uFit);

    // Accumulate <z e^{-z}> and <z^2 e^{-z}> for the curvature matrix.
    double meanZE = 0.0;
    double meanZZE = 0.0;
    foreach (double x in sample) {
        double z = (x - mFit) / sFit;
        double e = Math.Exp(-z);
        meanZE += z * e;
        meanZZE += z * z * e;
    }
    meanZE /= sample.Count;
    meanZZE /= sample.Count;

    // Curvature matrix at the maximum; its inverse is the parameter covariance matrix.
    // The use of (n-2) here in place of n is a very ad hoc attempt to increase accuracy.
    SymmetricMatrix C = new SymmetricMatrix(2);
    C[0, 0] = (n - 2) / (sFit * sFit);
    C[0, 1] = (n - 2) / (sFit * sFit) * meanZE;
    C[1, 1] = (n - 2) / (sFit * sFit) * (meanZZE + 1.0);
    SymmetricMatrix CI = C.CholeskyDecomposition().Inverse();

    // Goodness-of-fit against the fitted distribution.
    GumbelDistribution distribution = new GumbelDistribution(mFit, sFit);
    TestResult test = sample.KolmogorovSmirnovTest(distribution);

    return new GumbelFitResult(mFit, sFit, CI[0, 0], CI[1, 1], CI[0, 1], test);
}