protected override void ComputeTrainingStatistics(IChannel ch, FloatLabelCursor.Factory cursorFactory, Float loss, int numParams)
{
    Contracts.AssertValue(ch);
    Contracts.AssertValue(cursorFactory);
    Contracts.Assert(NumGoodRows > 0);
    Contracts.Assert(WeightSum > 0);
    Contracts.Assert(BiasCount == 1);
    Contracts.Assert(loss >= 0);
    Contracts.Assert(numParams >= BiasCount);
    Contracts.Assert(CurrentWeights.IsDense);

    ch.Info("Model trained with {0} training examples.", NumGoodRows);

    // Compute deviance: start with the loss function.
    Float deviance = (Float)(2 * loss * WeightSum);

    if (L2Weight > 0)
    {
        // Need to subtract the L2 regularization loss.
        // The bias term is not regularized.
        var regLoss = VectorUtils.NormSquared(CurrentWeights.Values, 1, CurrentWeights.Length - 1) * L2Weight;
        deviance -= regLoss;
    }

    if (L1Weight > 0)
    {
        // Need to subtract the L1 regularization loss.
        // The bias term is not regularized.
        Double regLoss = 0;
        VBufferUtils.ForEachDefined(ref CurrentWeights, (ind, value) =>
        {
            if (ind >= BiasCount)
                regLoss += Math.Abs(value);
        });
        deviance -= (Float)regLoss * L1Weight * 2;
    }

    ch.Info("Residual Deviance: \t{0} (on {1} degrees of freedom)", deviance, Math.Max(NumGoodRows - numParams, 0));

    // Compute the null deviance, i.e., the deviance under the null hypothesis.
    // Cap the prior positive rate at 1e-15.
    Double priorPosRate = _posWeight / WeightSum;
    Contracts.Assert(0 <= priorPosRate && priorPosRate <= 1);
    Float nullDeviance = (priorPosRate <= 1e-15 || 1 - priorPosRate <= 1e-15) ?
        0f : (Float)(2 * WeightSum * MathUtils.Entropy(priorPosRate, true));
    ch.Info("Null Deviance: \t{0} (on {1} degrees of freedom)", nullDeviance, NumGoodRows - 1);

    // Compute AIC.
    ch.Info("AIC: \t{0}", 2 * numParams + deviance);

    // Show the coefficients statistics table.
    var featureColIdx = cursorFactory.Data.Schema.Feature.Index;
    var schema = cursorFactory.Data.Data.Schema;
    var featureLength = CurrentWeights.Length - BiasCount;
    var namesSpans = VBufferUtils.CreateEmpty<DvText>(featureLength);
    if (schema.HasSlotNames(featureColIdx, featureLength))
        schema.GetMetadata(MetadataUtils.Kinds.SlotNames, featureColIdx, ref namesSpans);
    Host.Assert(namesSpans.Length == featureLength);

    // Inverse mapping of non-zero weight slots.
    Dictionary<int, int> weightIndicesInvMap = null;
    // Indices of the bias and non-zero weight slots.
    int[] weightIndices = null;
    // Whether all weights are non-zero.
    bool denseWeight = numParams == CurrentWeights.Length;

    // Extract the indices of non-zero weights.
    if (!denseWeight)
    {
        weightIndices = new int[numParams];
        weightIndicesInvMap = new Dictionary<int, int>(numParams);
        weightIndices[0] = 0;
        weightIndicesInvMap[0] = 0;
        int j = 1;
        for (int i = 1; i < CurrentWeights.Length; i++)
        {
            if (CurrentWeights.Values[i] != 0)
            {
                weightIndices[j] = i;
                weightIndicesInvMap[i] = j++;
            }
        }
        Contracts.Assert(j == numParams);
    }

    // Compute the standard errors of the coefficients.
    long hessianDimension = (long)numParams * (numParams + 1) / 2;
    if (hessianDimension > int.MaxValue)
    {
        ch.Warning("The number of parameters is too large. Cannot hold the variance-covariance matrix in memory. " +
            "Skipping computation of standard errors and z-statistics of coefficients. Consider choosing a larger L1 regularizer " +
            "to reduce the number of parameters.");
        _stats = new LinearModelStatistics(Host, NumGoodRows, numParams, deviance, nullDeviance);
        return;
    }

    // Build the variance-covariance matrix of the parameters.
    // The layout of this algorithm is a packed row-major lower triangular matrix.
    // E.g., layout of indices for 4-by-4:
    // 0
    // 1 2
    // 3 4 5
    // 6 7 8 9
    var hessian = new Double[hessianDimension];

    // Initialize diagonal elements with the L2 regularizer, except for the first entry (index 0),
    // since the bias is not regularized.
    if (L2Weight > 0)
    {
        // i is the array index of the diagonal entry at the iRow-th row and iRow-th column.
        // iRow is one-based.
        int i = 0;
        for (int iRow = 2; iRow <= numParams; iRow++)
        {
            i += iRow;
            hessian[i] = L2Weight;
        }
        Contracts.Assert(i == hessian.Length - 1);
    }

    // Initialize the remaining entries.
    var bias = CurrentWeights.Values[0];
    using (var cursor = cursorFactory.Create())
    {
        while (cursor.MoveNext())
        {
            var label = cursor.Label;
            var weight = cursor.Weight;
            var score = bias + VectorUtils.DotProductWithOffset(ref CurrentWeights, 1, ref cursor.Features);
            // Compute the Bernoulli variance n_i * p_i * (1 - p_i) for the i-th training example.
            var variance = weight / (2 + 2 * Math.Cosh(score));

            // Increment the first entry of the hessian.
            hessian[0] += variance;

            var values = cursor.Features.Values;
            if (cursor.Features.IsDense)
            {
                int ioff = 1;

                // Increment the remaining entries of the hessian.
                for (int i = 1; i < numParams; i++)
                {
                    ch.Assert(ioff == i * (i + 1) / 2);
                    int wi = weightIndices == null ? i - 1 : weightIndices[i] - 1;
                    Contracts.Assert(0 <= wi && wi < cursor.Features.Length);
                    var val = values[wi] * variance;
                    // Add the implicit first bias term to X'X.
                    hessian[ioff++] += val;
                    // Add the remainder of X'X.
                    for (int j = 0; j < i; j++)
                    {
                        int wj = weightIndices == null ? j : weightIndices[j + 1] - 1;
                        Contracts.Assert(0 <= wj && wj < cursor.Features.Length);
                        hessian[ioff++] += val * values[wj];
                    }
                }
                ch.Assert(ioff == hessian.Length);
            }
            else
            {
                var indices = cursor.Features.Indices;
                for (int ii = 0; ii < cursor.Features.Count; ++ii)
                {
                    int i = indices[ii];
                    int wi = i + 1;
                    if (weightIndicesInvMap != null && !weightIndicesInvMap.TryGetValue(i + 1, out wi))
                        continue;
                    Contracts.Assert(0 < wi && wi <= cursor.Features.Length);
                    int ioff = wi * (wi + 1) / 2;
                    var val = values[ii] * variance;
                    // Add the implicit first bias term to X'X.
                    hessian[ioff] += val;
                    // Add the remainder of X'X.
                    for (int jj = 0; jj <= ii; jj++)
                    {
                        int j = indices[jj];
                        int wj = j + 1;
                        if (weightIndicesInvMap != null && !weightIndicesInvMap.TryGetValue(j + 1, out wj))
                            continue;
                        Contracts.Assert(0 < wj && wj <= cursor.Features.Length);
                        hessian[ioff + wj] += val * values[jj];
                    }
                }
            }
        }
    }

    // Apply the Cholesky decomposition to find the inverse of the Hessian.
    Double[] invHessian = null;
    try
    {
        // First, find the Cholesky decomposition LL' of the Hessian.
        Mkl.Pptrf(Mkl.Layout.RowMajor, Mkl.UpLo.Lo, numParams, hessian);
        // Note that hessian is already modified at this point. It is no longer the original Hessian,
        // but instead represents the Cholesky decomposition L.
        // Also note that the following routine is supposed to consume the Cholesky decomposition L instead
        // of the original information matrix.
        Mkl.Pptri(Mkl.Layout.RowMajor, Mkl.UpLo.Lo, numParams, hessian);
        // At this point, hessian should contain the inverse of the original Hessian matrix.
        // Swap hessian with invHessian to avoid confusion in the following context.
        Utils.Swap(ref hessian, ref invHessian);
        Contracts.Assert(hessian == null);
    }
    catch (DllNotFoundException)
    {
        throw ch.ExceptNotSupp("The MKL library (Microsoft.ML.MklImports.dll) or one of its dependencies is missing.");
    }

    Float[] stdErrorValues = new Float[numParams];
    stdErrorValues[0] = (Float)Math.Sqrt(invHessian[0]);

    for (int i = 1; i < numParams; i++)
    {
        // Initialize with the diagonal of the inverse Hessian.
        stdErrorValues[i] = (Single)invHessian[i * (i + 1) / 2 + i];
    }

    if (L2Weight > 0)
    {
        // Iterate through all entries of the inverse Hessian to adjust the variance.
        // A discussion of the covariance matrix of ridge-regularized LR coefficients can be found here:
        // http://www.ncbi.nlm.nih.gov/pmc/articles/PMC3228544/
        // http://www.inf.unibz.it/dis/teaching/DWDM/project2010/LogisticRegression.pdf
        int ioffset = 1;
        for (int iRow = 1; iRow < numParams; iRow++)
        {
            for (int iCol = 0; iCol <= iRow; iCol++)
            {
                var entry = (Single)invHessian[ioffset];
                var adjustment = -L2Weight * entry * entry;
                stdErrorValues[iRow] -= adjustment;
                if (0 < iCol && iCol < iRow)
                    stdErrorValues[iCol] -= adjustment;
                ioffset++;
            }
        }
        Contracts.Assert(ioffset == invHessian.Length);
    }

    for (int i = 1; i < numParams; i++)
        stdErrorValues[i] = (Float)Math.Sqrt(stdErrorValues[i]);

    VBuffer<Float> stdErrors = new VBuffer<Float>(CurrentWeights.Length, numParams, stdErrorValues, weightIndices);
    _stats = new LinearModelStatistics(Host, NumGoodRows, numParams, deviance, nullDeviance, ref stdErrors);
}
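
// The statistics computation above addresses the Hessian through a packed row-major lower triangular
// layout: element (row, col), with col <= row, lives at offset row * (row + 1) / 2 + col. A minimal
// illustrative sketch of that mapping follows; PackedLowerTriangularIndex is a hypothetical helper,
// not part of the trainer.
private static int PackedLowerTriangularIndex(int row, int col)
{
    Contracts.Assert(0 <= col && col <= row);
    // Row 'row' starts after row * (row + 1) / 2 stored entries; then advance 'col' within the row.
    return row * (row + 1) / 2 + col;
}
// E.g., for the 4-by-4 layout sketched above, PackedLowerTriangularIndex(3, 1) == 7, and an n-by-n
// matrix needs n * (n + 1) / 2 stored entries, matching hessianDimension.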
private OlsLinearRegressionPredictor TrainCore(IChannel ch, FloatLabelCursor.Factory cursorFactory, int featureCount)
{
    Host.AssertValue(ch);
    ch.AssertValue(cursorFactory);

    int m = featureCount + 1;

    // Check for memory conditions first.
    if ((long)m * (m + 1) / 2 > int.MaxValue)
        throw ch.Except("Cannot hold covariance matrix in memory with {0} features", m - 1);

    // Track the number of examples.
    long n = 0;
    // Since we are accumulating over many values, we use Double even for the single precision build.
    var xty = new Double[m];
    // The layout of this algorithm is a packed row-major lower triangular matrix.
    var xtx = new Double[m * (m + 1) / 2];

    // Build X'X (lower triangular) and X'y incrementally (X'X += X'X_i; X'y += X'y_i):
    using (var cursor = cursorFactory.Create())
    {
        while (cursor.MoveNext())
        {
            var yi = cursor.Label;
            // Increment the first element of X'y.
            xty[0] += yi;
            // Increment the first element of the lower triangular X'X.
            xtx[0] += 1;
            var values = cursor.Features.GetValues();

            if (cursor.Features.IsDense)
            {
                int ioff = 1;
                ch.Assert(values.Length + 1 == m);
                // Increment the rest of the first column of the lower triangular X'X.
                for (int i = 1; i < m; i++)
                {
                    ch.Assert(ioff == i * (i + 1) / 2);
                    var val = values[i - 1];
                    // Add the implicit first bias term to X'X.
                    xtx[ioff++] += val;
                    // Add the remainder of X'X.
                    for (int j = 0; j < i; j++)
                        xtx[ioff++] += val * values[j];
                    // X'y
                    xty[i] += val * yi;
                }
                ch.Assert(ioff == xtx.Length);
            }
            else
            {
                var fIndices = cursor.Features.GetIndices();
                for (int ii = 0; ii < values.Length; ++ii)
                {
                    int i = fIndices[ii] + 1;
                    int ioff = i * (i + 1) / 2;
                    var val = values[ii];
                    // Add the implicit first bias term to X'X.
                    xtx[ioff++] += val;
                    // Add the remainder of X'X.
                    for (int jj = 0; jj <= ii; jj++)
                        xtx[ioff + fIndices[jj]] += val * values[jj];
                    // X'y
                    xty[i] += val * yi;
                }
            }
            n++;
        }
        ch.Check(n > 0, "No training examples in dataset.");
        if (cursor.BadFeaturesRowCount > 0)
            ch.Warning("Skipped {0} instances with missing features/label during training", cursor.SkippedRowCount);

        if (_l2Weight > 0)
        {
            // Skip the bias term for regularization, in the ridge regression case.
            // So start at [1,1] instead of [0,0].
            // REVIEW: There are two ways to view this. Firstly, it is more user friendly to make this
            // scaling factor behave similarly regardless of data size, so that with the same parameters
            // you get the same model whether you feed in your data once or duplicate it 10 times.
            // This is what I have now. The alternate point of view is to see this L2 regularization
            // parameter as providing some sort of prior, in which case duplicating the data 10 times
            // should in fact be treated differently! (That is, we should not multiply by n below.)
            // Both interpretations seem correct, in their way.
            Double squared = _l2Weight * _l2Weight * n;
            int ioff = 0;
            for (int i = 1; i < m; ++i)
                xtx[ioff += i + 1] += squared;
            ch.Assert(ioff == xtx.Length - 1);
        }
    }

    if (!(_l2Weight > 0) && n < m)
        throw ch.Except("Ordinary least squares requires more examples than parameters. There are {0} parameters, but {1} examples. To enable training, use a positive L2 weight so this behaves as ridge regression.", m, n);

    Double yMean = n == 0 ? 0 : xty[0] / n;

    ch.Info("Trainer solving for {0} parameters across {1} examples", m, n);
    // Cholesky decomposition of X'X into LL'.
    try
    {
        Mkl.Pptrf(Mkl.Layout.RowMajor, Mkl.UpLo.Lo, m, xtx);
    }
    catch (DllNotFoundException)
    {
        // REVIEW: Is there no better way?
        throw ch.ExceptNotSupp("The MKL library (libMklImports) or one of its dependencies is missing.");
    }
    // Solve for beta in (LL')beta = X'y:
    Mkl.Pptrs(Mkl.Layout.RowMajor, Mkl.UpLo.Lo, m, 1, xtx, xty, 1);
    // Note that the solver overwrote xty so it contains the solution. To be more clear,
    // we effectively change its name (through reassignment) so we don't get confused that
    // this is somehow xty in the remaining calculation.
    var beta = xty;
    xty = null;
    // Check that the solution is valid.
    for (int i = 0; i < beta.Length; ++i)
        ch.Check(FloatUtils.IsFinite(beta[i]), "Non-finite values detected in OLS solution");

    var weights = VBufferUtils.CreateDense<float>(beta.Length - 1);
    for (int i = 1; i < beta.Length; ++i)
        weights.Values[i - 1] = (float)beta[i];
    var bias = (float)beta[0];
    if (!(_l2Weight > 0) && m == n)
    {
        // We would expect the solution to the problem to be exact in this case.
        ch.Info("Number of examples equals number of parameters, solution is exact but no statistics can be derived");
        return new OlsLinearRegressionPredictor(Host, in weights, bias, null, null, null, 1, float.NaN);
    }

    Double rss = 0; // residual sum of squares
    Double tss = 0; // total sum of squares
    using (var cursor = cursorFactory.Create())
    {
        var lrPredictor = new LinearRegressionPredictor(Host, in weights, bias);
        var lrMap = lrPredictor.GetMapper<VBuffer<float>, float>();
        float yh = default;
        while (cursor.MoveNext())
        {
            var features = cursor.Features;
            lrMap(in features, ref yh);
            var e = cursor.Label - yh;
            rss += e * e;
            var ydm = cursor.Label - yMean;
            tss += ydm * ydm;
        }
    }
    var rSquared = ProbClamp(1 - (rss / tss));
    // R^2 adjusted differs from the normal formula on account of the bias term, by Said's reckoning.
    double rSquaredAdjusted;
    if (n > m)
    {
        rSquaredAdjusted = ProbClamp(1 - (1 - rSquared) * (n - 1) / (n - m));
        ch.Info("Coefficient of determination R2 = {0:g}, or {1:g} (adjusted)", rSquared, rSquaredAdjusted);
    }
    else
        rSquaredAdjusted = Double.NaN;

    // The per-parameter significance is compute intensive and may not be required for all practitioners.
    // Also, we can't estimate it unless we can estimate the variance, which requires more examples than
    // parameters.
    if (!_perParameterSignificance || m >= n)
        return new OlsLinearRegressionPredictor(Host, in weights, bias, null, null, null, rSquared, rSquaredAdjusted);

    ch.Assert(!Double.IsNaN(rSquaredAdjusted));
    var standardErrors = new Double[m];
    var tValues = new Double[m];
    var pValues = new Double[m];
    // Invert X'X:
    Mkl.Pptri(Mkl.Layout.RowMajor, Mkl.UpLo.Lo, m, xtx);
    var s2 = rss / (n - m); // estimate of variance of y

    for (int i = 0; i < m; i++)
    {
        // Initialize with the diagonal of the inverse of X'X.
        standardErrors[i] = (Single)xtx[i * (i + 1) / 2 + i];
    }

    if (_l2Weight > 0)
    {
        // Iterate through all entries of the inverse of X'X to adjust the variance.
        int ioffset = 1;
        float reg = _l2Weight * _l2Weight * n;
        for (int iRow = 1; iRow < m; iRow++)
        {
            for (int iCol = 0; iCol <= iRow; iCol++)
            {
                var entry = (Single)xtx[ioffset];
                var adjustment = -reg * entry * entry;
                standardErrors[iRow] -= adjustment;
                if (0 < iCol && iCol < iRow)
                    standardErrors[iCol] -= adjustment;
                ioffset++;
            }
        }
        Contracts.Assert(ioffset == xtx.Length);
    }

    for (int i = 0; i < m; i++)
    {
        // sqrt of the diagonal entries of s2 * inverse(X'X + reg * I) * X'X * inverse(X'X + reg * I).
        standardErrors[i] = Math.Sqrt(s2 * standardErrors[i]);
        ch.Check(FloatUtils.IsFinite(standardErrors[i]), "Non-finite standard error detected from OLS solution");
        tValues[i] = beta[i] / standardErrors[i];
        pValues[i] = (float)MathUtils.TStatisticToPValue(tValues[i], n - m);
        ch.Check(0 <= pValues[i] && pValues[i] <= 1, "p-Value calculated outside expected [0,1] range");
    }

    return new OlsLinearRegressionPredictor(Host, in weights, bias, standardErrors, tValues, pValues, rSquared, rSquaredAdjusted);
}
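
// For reference, the accumulation in TrainCore is the standard normal-equations build-up with an
// implicit leading 1 for the bias: X'X += x_i * x_i' and X'y += y_i * x_i with x_i = (1, features_i).
// The sketch below shows the same update on a full (non-packed) matrix for dense features only;
// it is illustrative, and AccumulateNormalEquations is a hypothetical name, not used by the trainer.
private static void AccumulateNormalEquations(float[] features, float label, double[,] xtx, double[] xty)
{
    int m = features.Length + 1;
    // x = (1, features): the leading 1 produces the bias row/column of X'X and the first entry of X'y.
    for (int i = 0; i < m; i++)
    {
        double xi = i == 0 ? 1 : features[i - 1];
        xty[i] += xi * label;
        for (int j = 0; j <= i; j++)
        {
            double xj = j == 0 ? 1 : features[j - 1];
            xtx[i, j] += xi * xj; // lower triangle only, mirroring the packed layout used above
        }
    }
}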
/// <summary>
/// Computes the standard deviation matrix of each of the non-zero training weights, needed to further calculate the standard deviation,
/// p-value and z-score.
/// If you need faster calculations, use the ComputeStd method from the Microsoft.ML.HALLearners package, which makes use of hardware acceleration.
/// Due to the existence of regularization, an approximation is used to compute the variances of the trained linear coefficients.
/// </summary>
/// <param name="hessian">The Hessian matrix of the selected parameters, in packed row-major lower triangular form.</param>
/// <param name="weightIndices">The indices of the bias and the non-zero weight slots.</param>
/// <param name="numSelectedParams">The number of selected parameters (the bias plus the non-zero weights).</param>
/// <param name="currentWeightsCount">The length of the full weight vector, including the bias.</param>
/// <param name="ch">The <see cref="IChannel"/> used for messaging.</param>
/// <param name="l2Weight">The L2Weight used for training. (Supply the same one that got used during training.)</param>
public override VBuffer<float> ComputeStd(double[] hessian, int[] weightIndices, int numSelectedParams, int currentWeightsCount, IChannel ch, float l2Weight)
{
    Contracts.AssertValue(ch);
    Contracts.AssertValue(hessian, nameof(hessian));
    Contracts.Assert(numSelectedParams > 0);
    Contracts.Assert(currentWeightsCount > 0);
    Contracts.Assert(l2Weight > 0);

    // Apply the Cholesky decomposition to find the inverse of the Hessian.
    Double[] invHessian = null;
    try
    {
        // First, find the Cholesky decomposition LL' of the Hessian.
        Mkl.Pptrf(Mkl.Layout.RowMajor, Mkl.UpLo.Lo, numSelectedParams, hessian);
        // Note that hessian is already modified at this point. It is no longer the original Hessian,
        // but instead represents the Cholesky decomposition L.
        // Also note that the following routine is supposed to consume the Cholesky decomposition L instead
        // of the original information matrix.
        Mkl.Pptri(Mkl.Layout.RowMajor, Mkl.UpLo.Lo, numSelectedParams, hessian);
        // At this point, hessian should contain the inverse of the original Hessian matrix.
        // Swap hessian with invHessian to avoid confusion in the following context.
        Utils.Swap(ref hessian, ref invHessian);
        Contracts.Assert(hessian == null);
    }
    catch (DllNotFoundException)
    {
        throw ch.ExceptNotSupp("The MKL library (MklImports.dll) or one of its dependencies is missing.");
    }

    float[] stdErrorValues = new float[numSelectedParams];
    stdErrorValues[0] = (float)Math.Sqrt(invHessian[0]);

    for (int i = 1; i < numSelectedParams; i++)
    {
        // Initialize with the diagonal of the inverse Hessian.
        stdErrorValues[i] = (float)invHessian[i * (i + 1) / 2 + i];
    }

    if (l2Weight > 0)
    {
        // Iterate through all entries of the inverse Hessian to adjust the variance.
        // A discussion of the covariance matrix of ridge-regularized LR coefficients can be found here:
        // http://www.aloki.hu/pdf/0402_171179.pdf (Equations 11 and 25)
        // http://www.inf.unibz.it/dis/teaching/DWDM/project2010/LogisticRegression.pdf (Section "Significance testing in ridge logistic regression")
        int ioffset = 1;
        for (int iRow = 1; iRow < numSelectedParams; iRow++)
        {
            for (int iCol = 0; iCol <= iRow; iCol++)
            {
                var entry = (float)invHessian[ioffset++];
                AdjustVariance(entry, iRow, iCol, l2Weight, stdErrorValues);
            }
        }
        Contracts.Assert(ioffset == invHessian.Length);
    }

    for (int i = 1; i < numSelectedParams; i++)
        stdErrorValues[i] = (float)Math.Sqrt(stdErrorValues[i]);

    // The currentWeights vector size is Weights2 + the bias.
    return new VBuffer<float>(currentWeightsCount, numSelectedParams, stdErrorValues, weightIndices);
}
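
// The standard errors returned by ComputeStd combine with the trained weights to give the per-coefficient
// z-scores mentioned in the summary above: z_i = w_i / stdError_i, with a two-sided p-value of
// 2 * (1 - Phi(|z_i|)) under the standard normal CDF Phi. A minimal sketch, assuming both arrays are
// dense and of equal length; ComputeZScores is a hypothetical helper, not part of this class.
private static float[] ComputeZScores(float[] weightsWithBias, float[] stdErrors)
{
    Contracts.Assert(weightsWithBias.Length == stdErrors.Length);
    var zScores = new float[weightsWithBias.Length];
    for (int i = 0; i < zScores.Length; i++)
        zScores[i] = weightsWithBias[i] / stdErrors[i];
    return zScores;
}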