/// <summary>
/// Performs a Kendall concordance test for association.
/// </summary>
/// <returns>The result of the test.</returns>
/// <remarks>
/// <para>Kendall's τ is a non-parametric and robust test of association
/// between two variables. It simply measures the number of cases where an increase
/// in one variable is associated with an increase in the other (concordant pairs),
/// compared with the number of cases where an increase in one variable is associated
/// with a decrease in the other (discordant pairs).</para>
/// <para>Because τ depends only on the sign
/// of a change and not its magnitude, it is not skewed by outliers exhibiting very large
/// changes, nor by cases where the degree of change in one variable associated with
/// a given change in the other changes over the range of the variables. Of course, it may
/// still miss an association whose sign changes over the range of the variables. For example,
/// if data points lie along a semi-circle in the plane, an increase in the first variable
/// is associated with an increase in the second variable along the rising arc and a decrease in
/// the second variable along the falling arc. No test that looks for single-signed correlation
/// will catch this association.
/// </para>
/// <para>Ties are not treated specially: a pair is counted as concordant whenever the signs of
/// the two differences are equal (including the case where both differences are zero), and as
/// discordant otherwise.</para>
/// <para>Because it examines all pairs of data points, the Kendall test requires
/// O(N<sup>2</sup>) operations. It is thus impractical for very large data sets. While
/// not quite as robust as the Kendall test, the Spearman test is a good fall-back in such cases.</para>
/// </remarks>
/// <exception cref="InsufficientDataException"><see cref="Count"/> is less than two.</exception>
/// <seealso cref="PearsonRTest"/>
/// <seealso cref="SpearmanRhoTest"/>
/// <seealso href="http://en.wikipedia.org/wiki/Kendall_tau_test" />
public TestResult KendallTauTest() {

    int n = xData.Count;
    if (n < 2) {
        throw new InsufficientDataException();
    }

    // Loop over all pairs, counting concordant and discordant ones.
    // The counters are 64-bit because the number of pairs, n(n-1)/2,
    // exceeds int.MaxValue once n is larger than 65536.
    long concordant = 0;
    long discordant = 0;
    for (int i = 0; i < n; i++) {
        for (int j = 0; j < i; j++) {
            // Note the direction in which each variable changes within the pair.
            int sx = Math.Sign(xData[i] - xData[j]);
            int sy = Math.Sign(yData[i] - yData[j]);
            // Equal signs mean concordant, anything else discordant.
            // Note this does not count ties specially, as is sometimes done.
            if (sx == sy) {
                concordant++;
            } else {
                discordant++;
            }
        }
    }

    // Compute tau; the denominator is the total number of pairs, which is at least one here.
    double t = 1.0 * (concordant - discordant) / (concordant + discordant);

    // Compute the null distribution of tau: exact for small samples, normal approximation otherwise.
    Distribution tauDistribution;
    if (n <= 20) {
        tauDistribution = new DiscreteAsContinuousDistribution(new KendallExactDistribution(n), Interval.FromEndpoints(-1.0, 1.0));
    } else {
        double dt = Math.Sqrt((4 * n + 10) / 9.0 / n / (n - 1));
        tauDistribution = new NormalDistribution(0.0, dt);
    }

    return (new TestResult(t, tauDistribution));

}
/// <summary>
/// Performs a Spearman rank-order test of association between the two variables.
/// </summary>
/// <returns>The result of the test.</returns>
/// <remarks>
/// <para>The Spearman rank-order test of association is a non-parametric test for association between
/// two variables. The test statistic rho is the correlation coefficient of the <em>rank</em> of
/// each entry in the sample. It is thus invariant over monotonic reparameterizations of the data,
/// and will, for example, detect a quadratic or exponential association just as well as a linear
/// association.</para>
/// <para>The Spearman rank-order test requires O(N log N) operations.</para>
/// </remarks>
/// <exception cref="InsufficientDataException">There are fewer than three data points.</exception>
/// <seealso cref="PearsonRTest"/>
/// <seealso cref="KendallTauTest"/>
/// <seealso href="http://en.wikipedia.org/wiki/Spearman%27s_rank_correlation_coefficient"/>
public TestResult SpearmanRhoTest() {

    int n = Count;
    if (n < 3) {
        throw new InsufficientDataException();
    }

    // Analytic expressions for the mean and variance of the ranks.
    // The variance is evaluated in floating point: the integer product (n + 1) * (n - 1)
    // overflows Int32 for n > 46341, which would silently corrupt rho.
    double M = (n - 1) / 2.0;
    double V = (n + 1.0) * (n - 1.0) / 12.0;

    // Compute the covariance of the ranks.
    // NOTE(review): the mean (n - 1) / 2 presumes GetRanks returns 0-based ranks -- confirm.
    int[] rx = xData.GetRanks();
    int[] ry = yData.GetRanks();
    double C = 0.0;
    for (int i = 0; i < n; i++) {
        C += (rx[i] - M) * (ry[i] - M);
    }
    C = C / n;

    // rho is the correlation coefficient of the ranks.
    double rho = C / V;

    Distribution rhoDistribution;
    if (n <= 10) {
        // For small enough samples, use the exact distribution.
        // It would be nice to do this for at least slightly higher n, but computation time grows dramatically.
        // Would like to ensure return in less than 100ms; current timings n=10 35ms, n=11 72ms, n=12 190ms.
        rhoDistribution = new DiscreteAsContinuousDistribution(new SpearmanExactDistribution(n), Interval.FromEndpoints(-1.0, 1.0));
    } else {
        // For larger samples, use the normal approximation.
        // Would like to fit support and C_4 too; look into logit-normal.
        // I was not happy with the Edgeworth expansion, which can fit C_4 but screws up the tails badly,
        // even giving negative probabilities.
        rhoDistribution = new NormalDistribution(0.0, 1.0 / Math.Sqrt(n - 1));
    }

    return (new TestResult(rho, rhoDistribution));

}
/// <summary>
/// Performs a Wilcoxon signed rank test.
/// </summary>
/// <returns>The result of the test.</returns>
/// <remarks>
/// <para>The Wilcoxon signed rank test is a non-parametric alternative to the
/// paired t-test (<see cref="PairedStudentTTest"/>). Given two measurements on
/// the same subjects, this method tests for changes in the distribution between
/// the two measurements. It is sensitive primarily to shifts in the median.
/// Note that the distributions of the individual measurements
/// may be far from normal, and may be different for each subject.</para>
/// <para>The statistic W is the sum of the ranks (by absolute value) of the positive
/// differences. Differences that are exactly zero keep their rank but add nothing to W,
/// and differences of equal magnitude are not assigned mid-ranks.</para>
/// </remarks>
/// <exception cref="InsufficientDataException">There are fewer than two data points.</exception>
/// <seealso href="https://en.wikipedia.org/wiki/Wilcoxon_signed-rank_test"/>
public TestResult WilcoxonSignedRankTest() {

    int n = this.Count;
    if (n < 2) {
        throw new InsufficientDataException();
    }

    // Form the paired differences and order them by magnitude.
    double[] z = new double[n];
    for (int i = 0; i < z.Length; i++) {
        z[i] = xData[i] - yData[i];
    }
    Array.Sort(z, (a, b) => Math.Abs(a).CompareTo(Math.Abs(b)));

    // Sum the 1-based ranks of the positive differences.
    // The sum can reach n(n+1)/2, which exceeds int.MaxValue for n >= 65536, so accumulate in 64 bits.
    long W = 0;
    for (int i = 0; i < z.Length; i++) {
        if (z[i] > 0.0) {
            W += (i + 1);
        }
    }

    // Null distribution: exact for small samples, normal approximation otherwise.
    ContinuousDistribution nullDistribution;
    if (n < 32) {
        DiscreteDistribution wilcoxon = new WilcoxonDistribution(n);
        nullDistribution = new DiscreteAsContinuousDistribution(wilcoxon);
    } else {
        // Under the null hypothesis, W has mean n(n+1)/4 and variance n(n+1)(2n+1)/24.
        double mu = n * (n + 1.0) / 4.0;
        double sigma = Math.Sqrt(mu * (2.0 * n + 1.0) / 6.0);
        nullDistribution = new NormalDistribution(mu, sigma);
    }

    return (new TestResult("W", W, TestType.TwoTailed, nullDistribution));

}
/// <summary>
/// Performs a Kendall concordance test for association.
/// </summary>
/// <param name="x">The values of the first variable.</param>
/// <param name="y">The values of the second variable.</param>
/// <returns>The result of the test.</returns>
/// <remarks>
/// <para>Kendall's τ is a non-parametric and robust test of association
/// between two variables. It simply measures the number of cases where an increase
/// in one variable is associated with an increase in the other (concordant pairs),
/// compared with the number of cases where an increase in one variable is associated
/// with a decrease in the other (discordant pairs).</para>
/// <para>Because τ depends only on the sign
/// of the difference and not its magnitude, it is not skewed by outliers exhibiting very large
/// changes, nor by cases where the degree of difference
/// changes over the ranges of the variables. Of course, it may
/// still miss an association whose sign changes over the range of the variables. For example,
/// if data points lie along a semi-circle in the plane, an increase in the first variable
/// is associated with an increase in the second variable along the rising arc and a decrease in
/// the second variable along the falling arc.
/// </para>
/// <para>Ties are not treated specially: a pair is counted as concordant whenever the signs of
/// the two differences are equal (including the case where both differences are zero), and as
/// discordant otherwise.</para>
/// <para>Because it examines all pairs of data points, the Kendall test requires
/// O(N<sup>2</sup>) operations. It is thus impractical for very large data sets. While
/// not quite as robust as the Kendall test, the Spearman test is a good fall-back in such cases.</para>
/// </remarks>
/// <exception cref="ArgumentNullException"><paramref name="x"/> or <paramref name="y"/> is <see langword="null"/>.</exception>
/// <exception cref="DimensionMismatchException"><paramref name="x"/> and <paramref name="y"/> do not contain the same number of entries.</exception>
/// <exception cref="InsufficientDataException">There are fewer than two entries in the sample.</exception>
/// <seealso cref="PearsonRTest(IReadOnlyList{double},IReadOnlyList{double})"/>
/// <seealso cref="SpearmanRhoTest(IReadOnlyList{double},IReadOnlyList{double})"/>
/// <seealso href="http://en.wikipedia.org/wiki/Kendall_tau_test" />
public static TestResult KendallTauTest(IReadOnlyList<double> x, IReadOnlyList<double> y) {

    if (x == null) {
        throw new ArgumentNullException(nameof(x));
    }
    if (y == null) {
        throw new ArgumentNullException(nameof(y));
    }
    if (x.Count != y.Count) {
        throw new DimensionMismatchException();
    }

    int n = x.Count;
    if (n < 2) {
        throw new InsufficientDataException();
    }

    // Loop over all pairs, counting concordant and discordant ones.
    // The counters are 64-bit because the number of pairs, n(n-1)/2,
    // exceeds int.MaxValue once n is larger than 65536.
    long C = 0;
    long D = 0;
    for (int i = 0; i < n; i++) {
        for (int j = 0; j < i; j++) {
            // Note the direction in which each variable changes within the pair.
            int sx = Math.Sign(x[i] - x[j]);
            int sy = Math.Sign(y[i] - y[j]);
            // Equal signs mean concordant, anything else discordant.
            // Note this does not count ties specially, as is sometimes done.
            if (sx == sy) {
                C++;
            } else {
                D++;
            }
        }
    }

    // Concordant and discordant counts should sum to the total number of pairs.
    // The product is formed in 64-bit arithmetic; as an int it would overflow for n > 46341
    // and trigger a spurious assertion failure.
    Debug.Assert(C + D == (long) n * (n - 1) / 2);

    double tau = 1.0 * (C - D) / (C + D);

    // Compute the null distribution.
    if (n <= 20) {
        // For small samples the exact distribution is used.
        // Here D is at most 20 * 19 / 2 = 190, so narrowing back to int is lossless.
        DiscreteDistribution dDistribution = new KendallExactDistribution(n);
        ContinuousDistribution tauDistribution = new DiscreteAsContinuousDistribution(dDistribution, Interval.FromEndpoints(-1.0, +1.0));
        return (new TestResult("D", (int) D, dDistribution, "τ", tau, tauDistribution, TestType.TwoTailed));
    } else {
        // For larger samples, use the normal approximation with variance (4n + 10) / (9 n (n - 1)).
        double dTau = Math.Sqrt((4 * n + 10) / 9.0 / n / (n - 1));
        ContinuousDistribution tauDistribution = new NormalDistribution(0.0, dTau);
        return (new TestResult("τ", tau, tauDistribution, TestType.TwoTailed));
    }

}
/// <summary>
/// Performs a Spearman rank-order test of association between the two variables.
/// </summary>
/// <param name="x">The values of the first variable.</param>
/// <param name="y">The values of the second variable.</param>
/// <returns>The result of the test.</returns>
/// <remarks>
/// <para>The Spearman rank-order test of association is a non-parametric test for association between
/// two variables. The test statistic rho is the correlation coefficient of the <em>rank</em> of
/// each entry in the sample. It is thus invariant over monotonic re-parameterizations of the data,
/// and will, for example, detect a quadratic or exponential association just as well as a linear
/// association.</para>
/// <para>The Spearman rank-order test requires O(N log N) operations.</para>
/// </remarks>
/// <exception cref="ArgumentNullException"><paramref name="x"/> or <paramref name="y"/> is <see langword="null"/>.</exception>
/// <exception cref="DimensionMismatchException"><paramref name="x"/> and <paramref name="y"/> do not contain the same number of entries.</exception>
/// <exception cref="InsufficientDataException">There are fewer than three data points.</exception>
/// <seealso cref="PearsonRTest(IReadOnlyList{double},IReadOnlyList{double})"/>
/// <seealso cref="KendallTauTest(IReadOnlyList{double},IReadOnlyList{double})"/>
/// <seealso href="http://en.wikipedia.org/wiki/Spearman%27s_rank_correlation_coefficient"/>
public static TestResult SpearmanRhoTest(IReadOnlyList<double> x, IReadOnlyList<double> y) {

    if (x == null) {
        throw new ArgumentNullException(nameof(x));
    }
    if (y == null) {
        throw new ArgumentNullException(nameof(y));
    }
    if (x.Count != y.Count) {
        throw new DimensionMismatchException();
    }

    int n = x.Count;
    if (n < 3) {
        throw new InsufficientDataException();
    }

    // Find the ranks.
    int[] rx = Univariate.GetRanks(x);
    int[] ry = Univariate.GetRanks(y);

    // Compute the statistic.
    // Use analytic expressions for the mean M and variance V of the ranks.
    // C is the covariance of the ranks, rho is just the corresponding correlation coefficient.
    // V is evaluated in floating point: the integer product (n + 1) * (n - 1) overflows
    // Int32 for n > 46341, which would silently corrupt rho.
    double M = (n - 1) / 2.0;
    double V = (n + 1.0) * (n - 1.0) / 12.0;
    double C = 0.0;
    for (int i = 0; i < n; i++) {
        C += (rx[i] - M) * (ry[i] - M);
    }
    C = C / n;
    double rho = C / V;

    // Compute the null distribution.
    if (n < 12) {
        // For small enough samples, use the exact distribution.
        // It would be nice to do this for at least slightly higher n, but the time to compute the exact
        // distribution grows dramatically with n. I would like to return in less than about 100ms.
        // Current timings are n = 10 35ms, n = 11, 72ms, n = 12 190ms.

        // S encodes the same information as rho, but as an integer that varies in steps of one, so
        // its null distribution can be described by a DiscreteDistribution.
        // Statisticians define S using 1-based ranks, so add 1 to each rank when computing S.
        // (This isn't important for C, because there we are subtracting off the mean.)
        // S is only needed in this branch, so it is only computed here; for large n the
        // sum of rank products would not fit in an int anyway.
        int S = 0;
        for (int i = 0; i < n; i++) {
            S += (rx[i] + 1) * (ry[i] + 1);
        }

        // Build the (expensive) exact distribution once and share it between the two views.
        DiscreteDistribution sDistribution = new SpearmanExactDistribution(n);
        ContinuousDistribution rhoDistribution = new DiscreteAsContinuousDistribution(sDistribution, Interval.FromEndpoints(-1.0, 1.0));
        return (new TestResult("s", S, sDistribution, "ρ", rho, rhoDistribution, TestType.TwoTailed));
    } else {
        // For larger samples, use the normal approximation.
        // It would be nice to fit support and/or fourth cumulant.
        // I was not happy with an Edgeworth expansion, which can fit the fourth cumulant, but screws up the tails
        // badly, even giving negative probabilities for extreme values, which are quite likely for null-violating samples.
        // Look into bounded quasi-normal distributions such as the logit-normal and truncated normal.
        ContinuousDistribution rhoDistribution = new NormalDistribution(0.0, 1.0 / Math.Sqrt(n - 1));
        return (new TestResult("ρ", rho, rhoDistribution, TestType.TwoTailed));
    }

}