protected internal OptimizerState(IChannel ch, IProgressChannelProvider progress, ref VBuffer<Float> initial,
    int m, long totalMemLimit, bool keepDense, bool enforceNonNegativity)
{
    Contracts.AssertValue(ch);
    Ch = ch;
    ch.AssertValueOrNull(progress);
    ProgressProvider = progress;
    Iter = 1;

    _keepDense = keepDense;
    Dim = initial.Length;

    _x = CreateWorkingVector();
    initial.CopyTo(ref _x);
    _m = m;
    _totalMemLimit = totalMemLimit;

    _grad = CreateWorkingVector();
    _dir = CreateWorkingVector();
    _newX = CreateWorkingVector();
    _newGrad = CreateWorkingVector();
    _steepestDescDir = CreateWorkingVector();

    _sList = new VBuffer<Float>[_m];
    _yList = new VBuffer<Float>[_m];
    _roList = new List<Float>();

    EnforceNonNegativity = enforceNonNegativity;
}
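For orientation: `_sList`, `_yList`, and `_roList` are the L-BFGS memory, a window of at most `m` recent correction pairs. Assuming the usual L-BFGS convention (my reading of the field names, not stated in the snippet), at step k these hold

$$s_k = x_{k+1} - x_k, \qquad y_k = \nabla f(x_{k+1}) - \nabla f(x_k), \qquad \rho_k = \frac{1}{\langle y_k, s_k \rangle}.$$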
internal FunctionOptimizerState(IChannel ch, IProgressChannelProvider progress, DifferentiableFunction function,
    ref VBuffer<Float> initial, int m, long totalMemLimit, bool keepDense, bool enforceNonNegativity)
    : base(ch, progress, ref initial, m, totalMemLimit, keepDense, enforceNonNegativity)
{
    Function = function;
    Init();
}
/// <summary>
/// The differentiable function used by the optimizer: computes the regularized loss at
/// <paramref name="x"/> and fills in <paramref name="gradient"/>.
/// </summary>
protected virtual float DifferentiableFunction(ref VBuffer<float> x, ref VBuffer<float> gradient, IProgressChannelProvider progress)
{
    Contracts.Assert((_numChunks == 0) != (_data == null));
    Contracts.Assert((_cursorFactory == null) == (_data == null));
    Contracts.Assert(x.Length == BiasCount + WeightCount);
    Contracts.Assert(gradient.Length == BiasCount + WeightCount);
    // REVIEW: if/when LBFGS test code is removed, the progress provider needs to become required.
    Contracts.AssertValueOrNull(progress);

    float scaleFactor = 1 / (float)WeightSum;
    VBuffer<float> xDense = default(VBuffer<float>);
    if (x.IsDense)
        xDense = x;
    else
        x.CopyToDense(ref xDense);

    IProgressChannel pch = progress != null ? progress.StartProgressChannel("Gradient") : null;
    float loss;
    using (pch)
    {
        loss = _data == null
            ? DifferentiableFunctionMultithreaded(ref xDense, ref gradient, pch)
            : DifferentiableFunctionStream(_cursorFactory, ref xDense, ref gradient, pch);
    }

    float regLoss = 0;
    if (L2Weight > 0)
    {
        Contracts.Assert(xDense.IsDense);
        var values = xDense.Values;
        Double r = 0;
        for (int i = BiasCount; i < values.Length; i++)
        {
            var xx = values[i];
            r += xx * xx;
        }
        regLoss = (float)(r * L2Weight * 0.5);

        // Here we probably want to use sparse x
        VBufferUtils.ApplyWithEitherDefined(ref x, ref gradient,
            (int ind, float v1, ref float v2) => { if (ind >= BiasCount) v2 += L2Weight * v1; });
    }
    VectorUtils.ScaleBy(ref gradient, scaleFactor);

    // REVIEW: The regularization component of the loss is being scaled as well,
    // but it's unclear that it should be scaled.
    return (loss + regLoss) * scaleFactor;
}
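Written out (my reading of the code above, with $s = 1/\mathrm{WeightSum}$, $b = \mathrm{BiasCount}$, and $\lambda_2 = \mathrm{L2Weight}$), the returned value and the gradient left in `gradient` are

$$f(x) = s\Big(L(x) + \frac{\lambda_2}{2}\sum_{i \ge b} x_i^2\Big), \qquad \nabla f(x)_i = s\big(\nabla L(x)_i + \lambda_2\, x_i\,[i \ge b]\big),$$

where $L$ is the unregularized data loss and the bias coordinates ($i < b$) are exempt from the penalty. The REVIEW comment flags that scaling the penalty term by $s$ as well may not be intended.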
internal L1OptimizerState(IChannel ch, IProgressChannelProvider progress, DifferentiableFunction function,
    ref VBuffer<Float> initial, int m, long totalMemLimit, int biasCount, Float l1Weight, bool keepDense, bool enforceNonNegativity)
    : base(ch, progress, ref initial, m, totalMemLimit, keepDense, enforceNonNegativity)
{
    Contracts.AssertValue(ch);
    ch.Assert(0 <= biasCount && biasCount < initial.Length);
    ch.Assert(l1Weight > 0);
    _biasCount = biasCount;
    _l1weight = l1Weight;
    _function = function;
    Init();
}
internal override OptimizerState MakeState(IChannel ch, IProgressChannelProvider progress, DifferentiableFunction function, ref VBuffer<Float> initial)
{
    Contracts.AssertValue(ch);
    ch.AssertValue(progress);

    if (EnforceNonNegativity)
    {
        VBufferUtils.Apply(ref initial,
            delegate(int ind, ref Float initialVal)
            {
                if (initialVal < 0.0 && ind >= _biasCount)
                    initialVal = 0;
            });
    }

    if (_l1weight > 0 && _biasCount < initial.Length)
        return new L1OptimizerState(ch, progress, function, in initial, M, TotalMemoryLimit, _biasCount, _l1weight, KeepDense, EnforceNonNegativity);
    return new FunctionOptimizerState(ch, progress, function, in initial, M, TotalMemoryLimit, KeepDense, EnforceNonNegativity);
}
/// <inheritdoc/>
private protected override void TrainWithoutLock(IProgressChannelProvider progress, FloatLabelCursor.Factory cursorFactory, Random rand,
    IdToIdxLookup idToIdx, int numThreads, DualsTableBase duals, float[] biasReg, float[] invariants, float lambdaNInv,
    VBuffer<float>[] weights, float[] biasUnreg, VBuffer<float>[] l1IntermediateWeights, float[] l1IntermediateBias, float[] featureNormSquared)
{
    Contracts.AssertValueOrNull(progress);
    Contracts.Assert(SdcaTrainerOptions.L1Threshold.HasValue);
    Contracts.AssertValueOrNull(idToIdx);
    Contracts.AssertValueOrNull(invariants);
    Contracts.AssertValueOrNull(featureNormSquared);
    int numClasses = Utils.Size(weights);
    Contracts.Assert(Utils.Size(biasReg) == numClasses);
    Contracts.Assert(Utils.Size(biasUnreg) == numClasses);

    int maxUpdateTrials = 2 * numThreads;
    var l1Threshold = SdcaTrainerOptions.L1Threshold.Value;
    bool l1ThresholdZero = l1Threshold == 0;
    var lr = SdcaTrainerOptions.BiasLearningRate * SdcaTrainerOptions.L2Regularization.Value;

    var pch = progress != null ? progress.StartProgressChannel("Dual update") : null;
    using (pch)
    using (var cursor = SdcaTrainerOptions.Shuffle ? cursorFactory.Create(rand) : cursorFactory.Create())
    {
        long rowCount = 0;
        if (pch != null)
            pch.SetHeader(new ProgressHeader("examples"), e => e.SetProgress(0, rowCount));

        Func<DataViewRowId, long> getIndexFromId = GetIndexFromIdGetter(idToIdx, biasReg.Length);
        while (cursor.MoveNext())
        {
            long idx = getIndexFromId(cursor.Id);
            long dualIndexInitPos = idx * numClasses;
            var features = cursor.Features;
            var label = (int)cursor.Label;
            float invariant;
            float normSquared;
            if (invariants != null)
            {
                invariant = invariants[idx];
                Contracts.AssertValue(featureNormSquared);
                normSquared = featureNormSquared[idx];
            }
            else
            {
                normSquared = VectorUtils.NormSquared(in features);
                if (SdcaTrainerOptions.BiasLearningRate == 0)
                    normSquared += 1;

                invariant = _loss.ComputeDualUpdateInvariant(2 * normSquared * lambdaNInv * GetInstanceWeight(cursor));
            }

            // The output for the label class using current weights and bias.
            var labelOutput = WDot(in features, in weights[label], biasReg[label] + biasUnreg[label]);
            var instanceWeight = GetInstanceWeight(cursor);

            // This will be the new dual variable corresponding to the label class.
            float labelDual = 0;

            // This will be used to update the weights and regularized bias corresponding to the label class.
            float labelPrimalUpdate = 0;

            // This will be used to update the unregularized bias corresponding to the label class.
            float labelAdjustment = 0;

            // Iterates through all classes.
            for (int iClass = 0; iClass < numClasses; iClass++)
            {
                // Skip the dual/weights/bias update for the label class; it is taken care of at the end.
                if (iClass == label)
                    continue;

                var weightsEditor = VBufferEditor.CreateFromBuffer(ref weights[iClass]);
                var l1IntermediateWeightsEditor =
                    !l1ThresholdZero ? VBufferEditor.CreateFromBuffer(ref l1IntermediateWeights[iClass]) : default;

                // Loop trials for compare-and-swap updates of duals.
                // In general, concurrent update conflicts on the same dual variable are rare
                // if the data is shuffled.
                for (int numTrials = 0; numTrials < maxUpdateTrials; numTrials++)
                {
                    long dualIndex = iClass + dualIndexInitPos;
                    var dual = duals[dualIndex];
                    var output = labelOutput + labelPrimalUpdate * normSquared - WDot(in features, in weights[iClass], biasReg[iClass] + biasUnreg[iClass]);
                    var dualUpdate = _loss.DualUpdate(output, 1, dual, invariant, numThreads);

                    // The successive over-relaxation approach to adjust the sum of dual variables (biasReg) to zero.
                    // Reference to details: http://stat.rutgers.edu/home/tzhang/papers/ml02_dual.pdf, pp. 16-17.
                    var adjustment = l1ThresholdZero ? lr * biasReg[iClass] : lr * l1IntermediateBias[iClass];
                    dualUpdate -= adjustment;
                    bool success = false;
                    duals.ApplyAt(dualIndex, (long index, ref float value) =>
                        success = Interlocked.CompareExchange(ref value, dual + dualUpdate, dual) == dual);

                    if (success)
                    {
                        // Note: dualConstraint[iClass] = lambdaNInv * (sum of duals[iClass])
                        var primalUpdate = dualUpdate * lambdaNInv * instanceWeight;
                        labelDual -= dual + dualUpdate;
                        labelPrimalUpdate += primalUpdate;
                        biasUnreg[iClass] += adjustment * lambdaNInv * instanceWeight;
                        labelAdjustment -= adjustment;

                        if (l1ThresholdZero)
                        {
                            VectorUtils.AddMult(in features, weightsEditor.Values, -primalUpdate);
                            biasReg[iClass] -= primalUpdate;
                        }
                        else
                        {
                            // Iterative shrinkage-thresholding (aka soft-thresholding):
                            // update v = denseWeights as if there were no L1.
                            // Thresholding: if |v[j]| < threshold, turn off weights[j];
                            // if not, shrink: w[j] = v[j] - sign(v[j]) * threshold.
                            l1IntermediateBias[iClass] -= primalUpdate;
                            if (SdcaTrainerOptions.BiasLearningRate == 0)
                            {
                                biasReg[iClass] = Math.Abs(l1IntermediateBias[iClass]) - l1Threshold > 0.0
                                    ? l1IntermediateBias[iClass] - Math.Sign(l1IntermediateBias[iClass]) * l1Threshold
                                    : 0;
                            }

                            var featureValues = features.GetValues();
                            if (features.IsDense)
                                CpuMathUtils.SdcaL1UpdateDense(-primalUpdate, featureValues.Length, featureValues, l1Threshold, l1IntermediateWeightsEditor.Values, weightsEditor.Values);
                            else if (featureValues.Length > 0)
                                CpuMathUtils.SdcaL1UpdateSparse(-primalUpdate, featureValues.Length, featureValues, features.GetIndices(), l1Threshold, l1IntermediateWeightsEditor.Values, weightsEditor.Values);
                        }

                        break;
                    }
                }
            }

            // Updating with label class weights and dual variable.
            duals[label + dualIndexInitPos] = labelDual;
            biasUnreg[label] += labelAdjustment * lambdaNInv * instanceWeight;
            if (l1ThresholdZero)
            {
                var weightsEditor = VBufferEditor.CreateFromBuffer(ref weights[label]);
                VectorUtils.AddMult(in features, weightsEditor.Values, labelPrimalUpdate);
                biasReg[label] += labelPrimalUpdate;
            }
            else
            {
                l1IntermediateBias[label] += labelPrimalUpdate;
                var intermediateBias = l1IntermediateBias[label];
                biasReg[label] = Math.Abs(intermediateBias) - l1Threshold > 0.0
                    ? intermediateBias - Math.Sign(intermediateBias) * l1Threshold
                    : 0;

                var weightsEditor = VBufferEditor.CreateFromBuffer(ref weights[label]);
                var l1IntermediateWeightsEditor = VBufferEditor.CreateFromBuffer(ref l1IntermediateWeights[label]);
                var featureValues = features.GetValues();
                if (features.IsDense)
                    CpuMathUtils.SdcaL1UpdateDense(labelPrimalUpdate, featureValues.Length, featureValues, l1Threshold, l1IntermediateWeightsEditor.Values, weightsEditor.Values);
                else if (featureValues.Length > 0)
                    CpuMathUtils.SdcaL1UpdateSparse(labelPrimalUpdate, featureValues.Length, featureValues, features.GetIndices(), l1Threshold, l1IntermediateWeightsEditor.Values, weightsEditor.Values);
            }

            rowCount++;
        }
    }
}
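The thresholding rule appearing in both branches above is the standard soft-thresholding operator. A minimal standalone sketch (the `SoftThreshold` helper and class name are hypothetical illustrations, not ML.NET APIs):

using System;

internal static class SoftThresholdSketch
{
    // w = sign(v) * max(|v| - threshold, 0): values within the threshold band
    // collapse to zero; everything else shrinks toward zero by the threshold.
    // This mirrors the biasReg update above and what CpuMathUtils.SdcaL1Update*
    // applies element-wise to the intermediate weights.
    internal static float SoftThreshold(float v, float threshold)
        => Math.Abs(v) - threshold > 0f ? v - Math.Sign(v) * threshold : 0f;

    private static void Main()
    {
        Console.WriteLine(SoftThreshold(0.7f, 0.5f));  // ~0.2 (shrunk toward zero)
        Console.WriteLine(SoftThreshold(-0.3f, 0.5f)); // 0 (inside the band, zeroed)
    }
}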
// Make sure _lossNormalizer is added only once.
protected override float DifferentiableFunction(ref VBuffer<float> x, ref VBuffer<float> gradient, IProgressChannelProvider progress)
{
    return base.DifferentiableFunction(ref x, ref gradient, progress) + (float)(_lossNormalizer / NumGoodRows);
}
internal virtual OptimizerState MakeState(IChannel ch, IProgressChannelProvider progress, DifferentiableFunction function, ref VBuffer<float> initial)
{
    return new FunctionOptimizerState(ch, progress, function, in initial, M, TotalMemoryLimit, KeepDense, EnforceNonNegativity);
}
private static Float QuadTest2D(ref VBuffer<Float> x, ref VBuffer<Float> grad, IProgressChannelProvider progress = null)
{
    Float d1 = VectorUtils.DotProduct(ref x, ref _c1);
    Float d2 = VectorUtils.DotProduct(ref x, ref _c2);
    Float d3 = VectorUtils.DotProduct(ref x, ref _c3);
    _c3.CopyTo(ref grad);
    VectorUtils.AddMult(ref _c1, d1, ref grad);
    VectorUtils.AddMult(ref _c2, d2, ref grad);
    return (Float)0.5 * (d1 * d1 + d2 * d2) + d3 + 55;
}
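For reference, reading the code directly (with $c_1, c_2, c_3$ the static `_c1`, `_c2`, `_c3` vectors), this test function and the gradient it leaves in `grad` are

$$f(x) = \tfrac{1}{2}\big(\langle c_1, x\rangle^2 + \langle c_2, x\rangle^2\big) + \langle c_3, x\rangle + 55, \qquad \nabla f(x) = \langle c_1, x\rangle\, c_1 + \langle c_2, x\rangle\, c_2 + c_3.$$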
/// <summary>
/// This is the original differentiable function with the injected L1 term.
/// </summary>
private Float EvalCore(ref VBuffer<Float> input, ref VBuffer<Float> gradient, IProgressChannelProvider progress)
{
    // REVIEW: Leverage Vector methods that use SSE.
    Float res = 0;

    if (!EnforceNonNegativity)
    {
        if (_biasCount > 0)
        {
            VBufferUtils.ForEachDefined(ref input,
                (ind, value) => { if (ind >= _biasCount) res += Math.Abs(value); });
        }
        else
            VBufferUtils.ForEachDefined(ref input, (ind, value) => res += Math.Abs(value));
    }
    else
    {
        if (_biasCount > 0)
        {
            VBufferUtils.ForEachDefined(ref input,
                (ind, value) => { if (ind >= _biasCount) res += value; });
        }
        else
            VBufferUtils.ForEachDefined(ref input, (ind, value) => res += value);
    }
    res = _l1weight * res + _function(ref input, ref gradient, progress);
    return res;
}
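Restating the code (with $\lambda_1 = \mathtt{\_l1weight}$ and $b = \mathtt{\_biasCount}$), the injected objective is

$$F(x) = f(x) + \lambda_1 \sum_{i \ge b} |x_i|,$$

with $|x_i|$ replaced by $x_i$ when non-negativity is enforced (the coordinates are then kept non-negative), and the bias coordinates ($i < b$) never penalized. Note that only the returned value, not `gradient`, picks up the L1 term here; presumably the term, being non-differentiable at zero, is handled by the L1-aware update logic elsewhere (not shown in this snippet).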