protected TrainStateBase(IChannel ch, int numFeatures, LinearModelParameters predictor, OnlineLinearTrainer<TTransformer, TModel> parent)
{
    Contracts.CheckValue(ch, nameof(ch));
    ch.Check(numFeatures > 0, "Cannot train with zero features!");
    ch.AssertValueOrNull(predictor);
    ch.AssertValue(parent);
    ch.Assert(Iteration == 0);
    ch.Assert(Bias == 0);
    ParentHost = parent.Host;

    ch.Trace("{0} Initializing {1} on {2} features", DateTime.UtcNow, parent.Name, numFeatures);

    // We want a dense vector, to prevent memory creation during training
    // unless we have a lot of features.
    if (predictor != null)
    {
        ((IHaveFeatureWeights)predictor).GetFeatureWeights(ref Weights);
        VBufferUtils.Densify(ref Weights);
        Bias = predictor.Bias;
    }
    else if (!string.IsNullOrWhiteSpace(parent.OnlineLinearTrainerOptions.InitialWeights))
    {
        ch.Info("Initializing weights and bias to " + parent.OnlineLinearTrainerOptions.InitialWeights);
        string[] weightStr = parent.OnlineLinearTrainerOptions.InitialWeights.Split(',');
        if (weightStr.Length != numFeatures + 1)
        {
            throw ch.Except(
                "Could not initialize weights from 'initialWeights': expecting {0} values to initialize {1} weights and the intercept",
                numFeatures + 1, numFeatures);
        }

        var weightValues = new float[numFeatures];
        for (int i = 0; i < numFeatures; i++)
            weightValues[i] = float.Parse(weightStr[i], CultureInfo.InvariantCulture);
        Weights = new VBuffer<float>(numFeatures, weightValues);
        Bias = float.Parse(weightStr[numFeatures], CultureInfo.InvariantCulture);
    }
    else if (parent.OnlineLinearTrainerOptions.InitialWeightsDiameter > 0)
    {
        var weightValues = new float[numFeatures];
        for (int i = 0; i < numFeatures; i++)
            weightValues[i] = parent.OnlineLinearTrainerOptions.InitialWeightsDiameter * (parent.Host.Rand.NextSingle() - (float)0.5);
        Weights = new VBuffer<float>(numFeatures, weightValues);
        Bias = parent.OnlineLinearTrainerOptions.InitialWeightsDiameter * (parent.Host.Rand.NextSingle() - (float)0.5);
    }
    else if (numFeatures <= 1000)
        Weights = VBufferUtils.CreateDense<float>(numFeatures);
    else
        Weights = VBufferUtils.CreateEmpty<float>(numFeatures);

    WeightsScale = 1;
}
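The constructor above accepts the InitialWeights option as a comma-separated string containing one value per feature followed by a final intercept value, which is why it demands numFeatures + 1 entries. Below is a minimal standalone sketch of that parsing convention; the names ParseInitialWeights and InitialWeightsExample are hypothetical and used only for illustration, they are not part of ML.NET.

using System;
using System.Globalization;

static class InitialWeightsExample
{
    // Parses "w0,w1,...,w{n-1},bias" into a weight array and a bias, mirroring
    // the numFeatures + 1 convention enforced by the constructor above.
    public static (float[] Weights, float Bias) ParseInitialWeights(string text, int numFeatures)
    {
        string[] parts = text.Split(',');
        if (parts.Length != numFeatures + 1)
            throw new ArgumentException($"Expected {numFeatures + 1} values: {numFeatures} weights plus the intercept.");

        var weights = new float[numFeatures];
        for (int i = 0; i < numFeatures; i++)
            weights[i] = float.Parse(parts[i], CultureInfo.InvariantCulture);
        float bias = float.Parse(parts[numFeatures], CultureInfo.InvariantCulture);
        return (weights, bias);
    }

    public static void Main()
    {
        // "0.5,-1.25,2" initializes two feature weights and the bias term.
        var (w, b) = ParseInitialWeights("0.5,-1.25,2", numFeatures: 2);
        Console.WriteLine($"w = [{w[0]}, {w[1]}], bias = {b}"); // w = [0.5, -1.25], bias = 2
    }
}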
public Counters(int size)
{
    Contracts.Assert(size > 0);
    _size = size;
    TotalL1Loss = VBufferUtils.CreateDense<Double>(size);
    TotalL2Loss = VBufferUtils.CreateDense<Double>(size);
    TotalLoss = VBufferUtils.CreateDense<Double>(size);
}
protected virtual void InitCore(IChannel ch, int numFeatures, LinearPredictor predictor)
{
    Contracts.Check(numFeatures > 0, "Can't train with zero features!");
    Contracts.Check(NumFeatures == 0, "Can't re-use trainer!");
    Contracts.Assert(Iteration == 0);
    Contracts.Assert(Bias == 0);

    ch.Trace("{0} Initializing {1} on {2} features", DateTime.UtcNow, Name, numFeatures);
    NumFeatures = numFeatures;

    // We want a dense vector, to prevent memory creation during training
    // unless we have a lot of features.
    // REVIEW: make a setting
    if (predictor != null)
    {
        predictor.GetFeatureWeights(ref Weights);
        VBufferUtils.Densify(ref Weights);
        Bias = predictor.Bias;
    }
    else if (!string.IsNullOrWhiteSpace(Args.InitialWeights))
    {
        ch.Info("Initializing weights and bias to " + Args.InitialWeights);
        string[] weightStr = Args.InitialWeights.Split(',');
        if (weightStr.Length != NumFeatures + 1)
        {
            throw Contracts.Except(
                "Could not initialize weights from 'initialWeights': expecting {0} values to initialize {1} weights and the intercept",
                NumFeatures + 1, NumFeatures);
        }

        Weights = VBufferUtils.CreateDense<Float>(NumFeatures);
        for (int i = 0; i < NumFeatures; i++)
            Weights.Values[i] = Float.Parse(weightStr[i], CultureInfo.InvariantCulture);
        Bias = Float.Parse(weightStr[NumFeatures], CultureInfo.InvariantCulture);
    }
    else if (Args.InitWtsDiameter > 0)
    {
        Weights = VBufferUtils.CreateDense<Float>(NumFeatures);
        for (int i = 0; i < NumFeatures; i++)
            Weights.Values[i] = Args.InitWtsDiameter * (Host.Rand.NextSingle() - (Float)0.5);
        Bias = Args.InitWtsDiameter * (Host.Rand.NextSingle() - (Float)0.5);
    }
    else if (NumFeatures <= 1000)
        Weights = VBufferUtils.CreateDense<Float>(NumFeatures);
    else
        Weights = VBufferUtils.CreateEmpty<Float>(NumFeatures);

    WeightsScale = 1;
}
private static VBuffer<Float>[] Zeros(int k, int d)
{
    var rv = new VBuffer<Float>[k];
    for (var i = 0; i < k; ++i)
        rv[i] = VBufferUtils.CreateDense<Float>(d);
    return rv;
}
private OlsLinearRegressionPredictor TrainCore(IChannel ch, FloatLabelCursor.Factory cursorFactory, int featureCount)
{
    Host.AssertValue(ch);
    ch.AssertValue(cursorFactory);

    int m = featureCount + 1;

    // Check for memory conditions first.
    if ((long)m * (m + 1) / 2 > int.MaxValue)
        throw ch.Except("Cannot hold covariance matrix in memory with {0} features", m - 1);

    // Track the number of examples.
    long n = 0;
    // Since we are accumulating over many values, we use Double even for the single precision build.
    var xty = new Double[m];
    // The layout of this algorithm is a packed row-major lower triangular matrix.
    var xtx = new Double[m * (m + 1) / 2];

    // Build X'X (lower triangular) and X'y incrementally (X'X+=X'X_i; X'y+=X'y_i):
    using (var cursor = cursorFactory.Create())
    {
        while (cursor.MoveNext())
        {
            var yi = cursor.Label;
            // Increment first element of X'y
            xty[0] += yi;
            // Increment first element of lower triangular X'X
            xtx[0] += 1;
            var values = cursor.Features.GetValues();

            if (cursor.Features.IsDense)
            {
                int ioff = 1;
                ch.Assert(values.Length + 1 == m);
                // Increment rest of first column of lower triangular X'X
                for (int i = 1; i < m; i++)
                {
                    ch.Assert(ioff == i * (i + 1) / 2);
                    var val = values[i - 1];
                    // Add the implicit first bias term to X'X
                    xtx[ioff++] += val;
                    // Add the remainder of X'X
                    for (int j = 0; j < i; j++)
                        xtx[ioff++] += val * values[j];
                    // X'y
                    xty[i] += val * yi;
                }
                ch.Assert(ioff == xtx.Length);
            }
            else
            {
                var fIndices = cursor.Features.GetIndices();
                for (int ii = 0; ii < values.Length; ++ii)
                {
                    int i = fIndices[ii] + 1;
                    int ioff = i * (i + 1) / 2;
                    var val = values[ii];
                    // Add the implicit first bias term to X'X
                    xtx[ioff++] += val;
                    // Add the remainder of X'X
                    for (int jj = 0; jj <= ii; jj++)
                        xtx[ioff + fIndices[jj]] += val * values[jj];
                    // X'y
                    xty[i] += val * yi;
                }
            }
            n++;
        }
        ch.Check(n > 0, "No training examples in dataset.");
        if (cursor.BadFeaturesRowCount > 0)
            ch.Warning("Skipped {0} instances with missing features/label during training", cursor.SkippedRowCount);

        if (_l2Weight > 0)
        {
            // Skip the bias term for regularization, in the ridge regression case.
            // So start at [1,1] instead of [0,0].
            // REVIEW: There are two ways to view this, firstly, it is more
            // user friendly to make this scaling factor behave similarly regardless
            // of data size, so that if you have the same parameters, you get the same
            // model if you feed in your data than if you duplicate your data 10 times.
            // This is what I have now. The alternate point of view is to view this
            // L2 regularization parameter as providing some sort of prior, in which
            // case duplication 10 times should in fact be treated differently! (That
            // is, we should not multiply by n below.) Both interpretations seem
            // correct, in their way.
            Double squared = _l2Weight * _l2Weight * n;
            int ioff = 0;
            for (int i = 1; i < m; ++i)
                xtx[ioff += i + 1] += squared;
            ch.Assert(ioff == xtx.Length - 1);
        }
    }

    if (!(_l2Weight > 0) && n < m)
        throw ch.Except("Ordinary least squares requires more examples than parameters. There are {0} parameters, but {1} examples. To enable training, use a positive L2 weight so this behaves as ridge regression.", m, n);

    Double yMean = n == 0 ? 0 : xty[0] / n;

    ch.Info("Trainer solving for {0} parameters across {1} examples", m, n);
    // Cholesky Decomposition of X'X into LL'
    try
    {
        Mkl.Pptrf(Mkl.Layout.RowMajor, Mkl.UpLo.Lo, m, xtx);
    }
    catch (DllNotFoundException)
    {
        // REVIEW: Is there no better way?
        throw ch.ExceptNotSupp("The MKL library (libMklImports) or one of its dependencies is missing.");
    }

    // Solve for beta in (LL')beta = X'y:
    Mkl.Pptrs(Mkl.Layout.RowMajor, Mkl.UpLo.Lo, m, 1, xtx, xty, 1);
    // Note that the solver overwrote xty so it contains the solution. To be more clear,
    // we effectively change its name (through reassignment) so we don't get confused that
    // this is somehow xty in the remaining calculation.
    var beta = xty;
    xty = null;
    // Check that the solution is valid.
    for (int i = 0; i < beta.Length; ++i)
        ch.Check(FloatUtils.IsFinite(beta[i]), "Non-finite values detected in OLS solution");

    var weights = VBufferUtils.CreateDense<float>(beta.Length - 1);
    for (int i = 1; i < beta.Length; ++i)
        weights.Values[i - 1] = (float)beta[i];
    var bias = (float)beta[0];
    if (!(_l2Weight > 0) && m == n)
    {
        // We would expect the solution to the problem to be exact in this case.
        ch.Info("Number of examples equals number of parameters, solution is exact but no statistics can be derived");
        return new OlsLinearRegressionPredictor(Host, in weights, bias, null, null, null, 1, float.NaN);
    }

    Double rss = 0; // residual sum of squares
    Double tss = 0; // total sum of squares
    using (var cursor = cursorFactory.Create())
    {
        var lrPredictor = new LinearRegressionPredictor(Host, in weights, bias);
        var lrMap = lrPredictor.GetMapper<VBuffer<float>, float>();
        float yh = default;
        while (cursor.MoveNext())
        {
            var features = cursor.Features;
            lrMap(in features, ref yh);
            var e = cursor.Label - yh;
            rss += e * e;
            var ydm = cursor.Label - yMean;
            tss += ydm * ydm;
        }
    }
    var rSquared = ProbClamp(1 - (rss / tss));
    // R^2 adjusted differs from the normal formula on account of the bias term, by Said's reckoning.
    double rSquaredAdjusted;
    if (n > m)
    {
        rSquaredAdjusted = ProbClamp(1 - (1 - rSquared) * (n - 1) / (n - m));
        ch.Info("Coefficient of determination R2 = {0:g}, or {1:g} (adjusted)", rSquared, rSquaredAdjusted);
    }
    else
        rSquaredAdjusted = Double.NaN;

    // The per parameter significance is compute intensive and may not be required for all practitioners.
    // Also we can't estimate it, unless we can estimate the variance, which requires more examples than
    // parameters.
    if (!_perParameterSignificance || m >= n)
        return new OlsLinearRegressionPredictor(Host, in weights, bias, null, null, null, rSquared, rSquaredAdjusted);

    ch.Assert(!Double.IsNaN(rSquaredAdjusted));
    var standardErrors = new Double[m];
    var tValues = new Double[m];
    var pValues = new Double[m];
    // Invert X'X:
    Mkl.Pptri(Mkl.Layout.RowMajor, Mkl.UpLo.Lo, m, xtx);
    var s2 = rss / (n - m); // estimate of variance of y

    for (int i = 0; i < m; i++)
    {
        // Initialize with inverse Hessian.
        standardErrors[i] = (Single)xtx[i * (i + 1) / 2 + i];
    }

    if (_l2Weight > 0)
    {
        // Iterate through all entries of inverse Hessian to make adjustment to variance.
        int ioffset = 1;
        float reg = _l2Weight * _l2Weight * n;
        for (int iRow = 1; iRow < m; iRow++)
        {
            for (int iCol = 0; iCol <= iRow; iCol++)
            {
                var entry = (Single)xtx[ioffset];
                var adjustment = -reg * entry * entry;
                standardErrors[iRow] -= adjustment;
                if (0 < iCol && iCol < iRow)
                    standardErrors[iCol] -= adjustment;
                ioffset++;
            }
        }
        Contracts.Assert(ioffset == xtx.Length);
    }

    for (int i = 0; i < m; i++)
    {
        // sqrt of diagonal entries of s2 * inverse(X'X + reg * I) * X'X * inverse(X'X + reg * I).
        standardErrors[i] = Math.Sqrt(s2 * standardErrors[i]);
        ch.Check(FloatUtils.IsFinite(standardErrors[i]), "Non-finite standard error detected from OLS solution");
        tValues[i] = beta[i] / standardErrors[i];
        pValues[i] = (float)MathUtils.TStatisticToPValue(tValues[i], n - m);
        ch.Check(0 <= pValues[i] && pValues[i] <= 1, "p-Value calculated outside expected [0,1] range");
    }

    return new OlsLinearRegressionPredictor(Host, in weights, bias, standardErrors, tValues, pValues, rSquared, rSquaredAdjusted);
}
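The OLS TrainCore above stores X'X as a packed row-major lower-triangular array: row i of the triangle starts at offset i*(i+1)/2, so entry (i, j) with j <= i lives at i*(i+1)/2 + j, which is exactly what the ch.Assert(ioff == i * (i + 1) / 2) checks rely on. Here is a minimal standalone sketch of that indexing scheme; PackedLowerTriangularExample is a hypothetical class used only for illustration.

using System;

static class PackedLowerTriangularExample
{
    public static void Main()
    {
        const int m = 4;
        // m*(m+1)/2 entries hold the lower triangle of an m-by-m symmetric matrix.
        var packed = new double[m * (m + 1) / 2];

        // Fill entry (i, j), j <= i, with a recognizable value 10*i + j.
        for (int i = 0; i < m; i++)
            for (int j = 0; j <= i; j++)
                packed[i * (i + 1) / 2 + j] = 10 * i + j;

        // Read the diagonal back: entry (i, i) sits at i*(i+1)/2 + i.
        for (int i = 0; i < m; i++)
            Console.WriteLine($"({i},{i}) -> {packed[i * (i + 1) / 2 + i]}"); // 0, 11, 22, 33
    }
}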
private TPredictor TrainCore(IChannel ch, RoleMappedData data, LinearModelParameters predictor, int weightSetCount)
{
    int numFeatures = data.Schema.Feature.Value.Type.GetVectorSize();
    var cursorFactory = new FloatLabelCursor.Factory(data, CursOpt.Label | CursOpt.Features);
    int numThreads = 1;
    ch.CheckUserArg(numThreads > 0, nameof(_options.NumberOfThreads),
        "The number of threads must be either null or a positive integer.");

    var positiveInstanceWeight = _options.PositiveInstanceWeight;
    VBuffer<float> weights = default;
    float bias = 0.0f;
    if (predictor != null)
    {
        predictor.GetFeatureWeights(ref weights);
        VBufferUtils.Densify(ref weights);
        bias = predictor.Bias;
    }
    else
        weights = VBufferUtils.CreateDense<float>(numFeatures);

    var weightsEditor = VBufferEditor.CreateFromBuffer(ref weights);

    // Reference: Parasail. SymSGD.
    bool tuneLR = _options.LearningRate == null;
    var lr = _options.LearningRate ?? 1.0f;

    bool tuneNumLocIter = (_options.UpdateFrequency == null);
    var numLocIter = _options.UpdateFrequency ?? 1;

    var l2Const = _options.L2Regularization;
    var piw = _options.PositiveInstanceWeight;

    // This is state of the learner that is shared with the native code.
    State state = new State();
    GCHandle stateGCHandle = default;
    try
    {
        stateGCHandle = GCHandle.Alloc(state, GCHandleType.Pinned);

        state.TotalInstancesProcessed = 0;
        using (InputDataManager inputDataManager = new InputDataManager(this, cursorFactory, ch))
        {
            bool shouldInitialize = true;
            using (var pch = Host.StartProgressChannel("Preprocessing"))
                inputDataManager.LoadAsMuchAsPossible();

            int iter = 0;
            if (inputDataManager.IsFullyLoaded)
                ch.Info("Data fully loaded into memory.");
            using (var pch = Host.StartProgressChannel("Training"))
            {
                if (inputDataManager.IsFullyLoaded)
                {
                    pch.SetHeader(new ProgressHeader(new[] { "iterations" }),
                        entry => entry.SetProgress(0, state.PassIteration, _options.NumberOfIterations));
                    // If fully loaded, call the SymSGDNative and do not come back until learned for all iterations.
                    Native.LearnAll(inputDataManager, tuneLR, ref lr, l2Const, piw, weightsEditor.Values, ref bias, numFeatures,
                        _options.NumberOfIterations, numThreads, tuneNumLocIter, ref numLocIter, _options.Tolerance, _options.Shuffle,
                        shouldInitialize, stateGCHandle, ch.Info);
                    shouldInitialize = false;
                }
                else
                {
                    pch.SetHeader(new ProgressHeader(new[] { "iterations" }),
                        entry => entry.SetProgress(0, iter, _options.NumberOfIterations));

                    // Since we loaded data in batch sizes, multiple passes over the loaded data is feasible.
                    int numPassesForABatch = inputDataManager.Count / 10000;
                    while (iter < _options.NumberOfIterations)
                    {
                        // We want to train on the final passes thoroughly (without learning on the same batch multiple times).
                        // This is for fine tuning the AUC. Experimentally, we found that 1 or 2 passes is enough.
                        int numFinalPassesToTrainThoroughly = 2;
                        // We also do not want to learn for more passes than what the user asked.
                        int numPassesForThisBatch = Math.Min(numPassesForABatch, _options.NumberOfIterations - iter - numFinalPassesToTrainThoroughly);

                        // If all of this leaves us with 0 passes, then set numPassesForThisBatch to 1.
                        numPassesForThisBatch = Math.Max(1, numPassesForThisBatch);
                        state.PassIteration = iter;
                        Native.LearnAll(inputDataManager, tuneLR, ref lr, l2Const, piw, weightsEditor.Values, ref bias, numFeatures,
                            numPassesForThisBatch, numThreads, tuneNumLocIter, ref numLocIter, _options.Tolerance, _options.Shuffle,
                            shouldInitialize, stateGCHandle, ch.Info);
                        shouldInitialize = false;

                        // Check if we are done with going through the data.
                        if (inputDataManager.FinishedTheLoad)
                        {
                            iter += numPassesForThisBatch;
                            // Check if more passes are left.
                            if (iter < _options.NumberOfIterations)
                                inputDataManager.RestartLoading(_options.Shuffle, Host);
                        }

                        // If more passes are left, load as much as possible.
                        if (iter < _options.NumberOfIterations)
                            inputDataManager.LoadAsMuchAsPossible();
                    }
                }

                // Maps back the dense features that are mislocated.
                if (numThreads > 1)
                    Native.MapBackWeightVector(weightsEditor.Values, stateGCHandle);
                Native.DeallocateSequentially(stateGCHandle);
            }
        }
    }
    finally
    {
        if (stateGCHandle.IsAllocated)
            stateGCHandle.Free();
    }

    return CreatePredictor(weights, bias);
}
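The batching loop above schedules roughly inputDataManager.Count / 10000 passes per loaded batch, holds back two final iterations so the last passes run on fresh data, and always runs at least one pass. A minimal sketch of that arithmetic with assumed values follows; PassSchedulingExample and the literal numbers are illustrative only, not the trainer's actual state.

using System;

static class PassSchedulingExample
{
    public static void Main()
    {
        int numberOfIterations = 50;   // stands in for _options.NumberOfIterations
        int loadedCount = 30000;       // stands in for inputDataManager.Count
        int iter = 47;                 // iterations completed so far
        const int numFinalPassesToTrainThoroughly = 2;

        int numPassesForABatch = loadedCount / 10000;                       // 3
        int numPassesForThisBatch = Math.Min(numPassesForABatch,
            numberOfIterations - iter - numFinalPassesToTrainThoroughly);   // min(3, 1) = 1
        numPassesForThisBatch = Math.Max(1, numPassesForThisBatch);         // never fewer than one pass

        Console.WriteLine(numPassesForThisBatch); // 1
    }
}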
private PcaPredictor TrainCore(IChannel ch, RoleMappedData data, int dimension)
{
    Host.AssertValue(ch);
    ch.AssertValue(data);

    if (_rank > dimension)
        throw ch.Except("Rank ({0}) cannot be larger than the original dimension ({1})", _rank, dimension);
    int oversampledRank = Math.Min(_rank + _oversampling, dimension);

    // Exact: (size of the 2 big matrices + other minor allocations) / (2^30)
    Double memoryUsageEstimate = 2.0 * dimension * oversampledRank * sizeof(Float) / 1e9;
    if (memoryUsageEstimate > 2)
        ch.Info("Estimate memory usage: {0:G2} GB. If running out of memory, reduce rank and oversampling factor.", memoryUsageEstimate);

    var y = Zeros(oversampledRank, dimension);
    var mean = _center ? VBufferUtils.CreateDense<Float>(dimension) : VBufferUtils.CreateEmpty<Float>(dimension);

    var omega = GaussianMatrix(oversampledRank, dimension, _seed);

    var cursorFactory = new FeatureFloatVectorCursor.Factory(data, CursOpt.Features | CursOpt.Weight);
    long numBad;
    Project(Host, cursorFactory, ref mean, omega, y, out numBad);
    if (numBad > 0)
        ch.Warning("Skipped {0} instances with missing features/weights during training", numBad);

    // Orthonormalize Y in-place using stabilized Gram Schmidt algorithm.
    // Ref: https://en.wikipedia.org/wiki/Gram-Schmidt#Algorithm
    for (var i = 0; i < oversampledRank; ++i)
    {
        var v = y[i];
        VectorUtils.ScaleBy(ref v, 1 / VectorUtils.Norm(y[i]));

        // Make the next vectors in the queue orthogonal to the orthonormalized vectors.
        for (var j = i + 1; j < oversampledRank; ++j)
        {
            // Subtract the projection of y[j] on v.
            VectorUtils.AddMult(ref v, -VectorUtils.DotProduct(ref v, ref y[j]), ref y[j]);
        }
    }
    var q = y; // q in QR decomposition.

    var b = omega; // Reuse the memory allocated by Omega.
    Project(Host, cursorFactory, ref mean, q, b, out numBad);

    // Compute B2 = B' * B
    var b2 = new Float[oversampledRank * oversampledRank];
    for (var i = 0; i < oversampledRank; ++i)
    {
        for (var j = i; j < oversampledRank; ++j)
            b2[i * oversampledRank + j] = b2[j * oversampledRank + i] = VectorUtils.DotProduct(ref b[i], ref b[j]);
    }

    Float[] smallEigenvalues;  // Eigenvalues of the small matrix B2.
    Float[] smallEigenvectors; // Eigenvectors of the small matrix B2.
    EigenUtils.EigenDecomposition(b2, out smallEigenvalues, out smallEigenvectors);
    PostProcess(b, smallEigenvalues, smallEigenvectors, dimension, oversampledRank);

    return new PcaPredictor(Host, _rank, b, ref mean);
}
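The orthonormalization step above is stabilized (modified) Gram-Schmidt: normalize row i of Y, then immediately subtract its component from every later row before those rows take their own turn. Below is a minimal sketch of the same idea on plain float[] rows instead of VBuffer<Float>; GramSchmidtExample is a hypothetical illustration, not ML.NET code.

using System;

static class GramSchmidtExample
{
    static float Dot(float[] a, float[] b)
    {
        float s = 0;
        for (int k = 0; k < a.Length; k++)
            s += a[k] * b[k];
        return s;
    }

    public static void Main()
    {
        var y = new[]
        {
            new float[] { 1, 1, 0 },
            new float[] { 1, 0, 1 },
        };

        for (int i = 0; i < y.Length; i++)
        {
            // Normalize row i.
            float norm = (float)Math.Sqrt(Dot(y[i], y[i]));
            for (int k = 0; k < y[i].Length; k++)
                y[i][k] /= norm;

            // Make every later row orthogonal to row i (the "stabilized" part:
            // projections are removed as soon as each basis vector is fixed).
            for (int j = i + 1; j < y.Length; j++)
            {
                float proj = Dot(y[i], y[j]);
                for (int k = 0; k < y[j].Length; k++)
                    y[j][k] -= proj * y[i][k];
            }
        }

        Console.WriteLine(Dot(y[0], y[1])); // ~0: the rows are now orthonormal
    }
}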
/// <summary>
/// Initialize weights by running SGD up to specified tolerance.
/// </summary>
protected virtual VBuffer<float> InitializeWeightsSgd(IChannel ch, FloatLabelCursor.Factory cursorFactory)
{
    if (!Quiet)
        ch.Info("Running SGD initialization with tolerance {0}", SgdInitializationTolerance);

    int numExamples = 0;
    var oldWeights = VBufferUtils.CreateEmpty<float>(BiasCount + WeightCount);
    DTerminate terminateSgd =
        (in VBuffer<float> x) =>
        {
            if (++numExamples % 1000 != 0)
                return false;
            VectorUtils.AddMult(in x, -1, ref oldWeights);
            float normDiff = VectorUtils.Norm(oldWeights);
            x.CopyTo(ref oldWeights);
            // #if OLD_TRACING // REVIEW: How should this be ported?
            if (!Quiet)
            {
                Console.Write(".");
                if (numExamples % 50000 == 0)
                    Console.WriteLine("\t{0}\t{1}", numExamples, normDiff);
            }
            // #endif
            return normDiff < SgdInitializationTolerance;
        };

    VBuffer<float> result = default(VBuffer<float>);
    FloatLabelCursor cursor = null;
    try
    {
        float[] scratch = null;

        SgdOptimizer.DStochasticGradient lossSgd =
            (in VBuffer<float> x, ref VBuffer<float> grad) =>
            {
                // Zero out the gradient by sparsifying.
                grad = new VBuffer<float>(grad.Length, 0, grad.Values, grad.Indices);
                EnsureBiases(ref grad);

                if (cursor == null || !cursor.MoveNext())
                {
                    if (cursor != null)
                        cursor.Dispose();
                    cursor = cursorFactory.Create();
                    if (!cursor.MoveNext())
                        return;
                }
                AccumulateOneGradient(in cursor.Features, cursor.Label, cursor.Weight, in x, ref grad, ref scratch);
            };

        VBuffer<float> sgdWeights;
        if (DenseOptimizer)
            sgdWeights = VBufferUtils.CreateDense<float>(BiasCount + WeightCount);
        else
            sgdWeights = VBufferUtils.CreateEmpty<float>(BiasCount + WeightCount);
        SgdOptimizer sgdo = new SgdOptimizer(terminateSgd);
        sgdo.Minimize(lossSgd, ref sgdWeights, ref result);
        // #if OLD_TRACING // REVIEW: How should this be ported?
        if (!Quiet)
            Console.WriteLine();
        // #endif
        ch.Info("SGD initialization done in {0} rounds", numExamples);
    }
    finally
    {
        if (cursor != null)
            cursor.Dispose();
    }

    return result;
}
/// <summary>
/// Convenience function to construct a working vector of length <c>Dim</c>.
/// </summary>
/// <returns></returns>
protected VBuffer<Float> CreateWorkingVector()
{
    // Owing to the way the operations are structured, if the "x", "newX", and "dir" vectors
    // start out (or somehow naturally become) dense, they will remain dense.
    return _keepDense ? VBufferUtils.CreateDense<Float>(Dim) : VBufferUtils.CreateEmpty<Float>(Dim);
}
public override Delegate[] CreateGetters(IRow input, Func<int, bool> activeCols, out Action disposer)
{
    Host.Assert(LabelIndex >= 0);
    Host.Assert(ScoreIndex >= 0);

    disposer = null;

    long cachedPosition = -1;
    Float label = 0;
    var score = default(VBuffer<Float>);
    var l1 = VBufferUtils.CreateDense<Double>(_scoreSize);

    ValueGetter<Float> nanGetter = (ref Float value) => value = Single.NaN;
    var labelGetter = activeCols(L1Col) || activeCols(L2Col) ? RowCursorUtils.GetLabelGetter(input, LabelIndex) : nanGetter;
    ValueGetter<VBuffer<Float>> scoreGetter;
    if (activeCols(L1Col) || activeCols(L2Col))
        scoreGetter = input.GetGetter<VBuffer<Float>>(ScoreIndex);
    else
        scoreGetter = (ref VBuffer<Float> dst) => dst = default(VBuffer<Float>);

    Action updateCacheIfNeeded =
        () =>
        {
            if (cachedPosition != input.Position)
            {
                labelGetter(ref label);
                scoreGetter(ref score);
                var lab = (Double)label;
                foreach (var s in score.Items(all: true))
                    l1.Values[s.Key] = Math.Abs(lab - s.Value);
                cachedPosition = input.Position;
            }
        };

    var getters = new Delegate[2];
    if (activeCols(L1Col))
    {
        ValueGetter<VBuffer<Double>> l1Fn =
            (ref VBuffer<Double> dst) =>
            {
                updateCacheIfNeeded();
                l1.CopyTo(ref dst);
            };
        getters[L1Col] = l1Fn;
    }
    if (activeCols(L2Col))
    {
        VBufferUtils.PairManipulator<Double, Double> sqr =
            (int slot, Double x, ref Double y) => y = x * x;

        ValueGetter<VBuffer<Double>> l2Fn =
            (ref VBuffer<Double> dst) =>
            {
                updateCacheIfNeeded();
                dst = new VBuffer<Double>(_scoreSize, 0, dst.Values, dst.Indices);
                VBufferUtils.ApplyWith(ref l1, ref dst, sqr);
            };
        getters[L2Col] = l2Fn;
    }
    return getters;
}
protected override VBuffer<Double> Zero()
{
    return VBufferUtils.CreateDense<Double>(_size);
}