/// <summary>
/// Constructs a stochastic gradient descent optimizer with the supplied
/// optimization parameters. All parameters are simply recorded; no work is
/// performed until <c>Minimize</c> is called.
/// </summary>
/// <param name="terminate">Termination criterion</param>
/// <param name="rateSchedule">Annealing schedule type for learning rate</param>
/// <param name="averaging">If true, all iterates are averaged</param>
/// <param name="t0">Base for learning rate schedule</param>
/// <param name="batchSize">Average this number of stochastic gradients for each update</param>
/// <param name="momentum">Momentum parameter</param>
/// <param name="maxSteps">Maximum number of updates (0 for no max)</param>
public SgdOptimizer(DTerminate terminate, RateScheduleType rateSchedule = RateScheduleType.Sqrt, bool averaging = false, Float t0 = 1, int batchSize = 1, Float momentum = 0, int maxSteps = 0)
{
    // Record every configuration knob; fields are consumed later by Minimize.
    _terminate = terminate;
    _rateSchedule = rateSchedule;
    _t0 = t0;
    _batchSize = batchSize;
    _momentum = momentum;
    _maxSteps = maxSteps;
    _averaging = averaging;
}
/// <summary>
/// Makes a new GDOptimizer with the given optimization parameters.
/// </summary>
/// <param name="terminate">Termination criterion</param>
/// <param name="lineSearch">Line search to use; if null, a default is constructed based on <paramref name="useCG"/></param>
/// <param name="useCG">Use Cubic interpolation line search or Backtracking line search with Armijo condition</param>
/// <param name="maxSteps">Maximum number of updates (0 for no max)</param>
public GDOptimizer(DTerminate terminate, IDiffLineSearch lineSearch = null, bool useCG = false, int maxSteps = 0)
{
    Terminate = terminate;
    // BUG FIX: the original tested the LineSearch *property* (always null at
    // this point of construction) instead of the lineSearch *parameter*, so a
    // caller-supplied line search was silently discarded and a default was
    // always built. Test the parameter so an explicit argument is honored.
    if (lineSearch == null)
    {
        if (useCG)
        {
            LineSearch = new CubicInterpLineSearch((Float)0.01);
        }
        else
        {
            LineSearch = new BacktrackingLineSearch();
        }
    }
    else
    {
        LineSearch = lineSearch;
    }
    _maxSteps = maxSteps;
    UseCG = useCG;
}
/// <summary>
/// Entry point: exercises the optimizers. Runs the quadratic and logistic
/// test harnesses, then minimizes a 2-D quadratic twice — first with SGD
/// (constant rate, heavy momentum), then with gradient descent using the
/// cubic-interpolation line search — printing gradient norms as it goes.
/// </summary>
public static void Main(string[] argv)
{
    RunTest(QuadTest);
    RunTest(LogTest);

    VBuffer<Float> gradient = VBufferUtils.CreateEmpty<Float>(2);
    int evalCount = 0;
    bool verbose = false;
    // Termination check: recompute the gradient at x and stop once its norm
    // drops below 1e-5. Captures gradient/evalCount/verbose by reference.
    DTerminate terminate =
        (ref VBuffer<Float> x) =>
        {
            QuadTest2D(ref x, ref gradient);
            Float gradNorm = VectorUtils.Norm(gradient);
            bool shouldLog = (++evalCount % 1000 == 0) || verbose;
            if (shouldLog)
            {
                Console.WriteLine("{0}\t{1}", evalCount, gradNorm);
            }
            return gradNorm < 1e-5;
        };

    // Stochastic pass: constant learning rate, t0 = 100, momentum = 0.99.
    SgdOptimizer sgd = new SgdOptimizer(terminate, SgdOptimizer.RateScheduleType.Constant, false, 100, 1, (Float)0.99);
    VBuffer<Float> start;
    CreateWrapped(out start, 0, 0);
    VBuffer<Float> solution = default(VBuffer<Float>);
    sgd.Minimize(StochasticQuadTest2D, ref start, ref solution);
    QuadTest2D(ref solution, ref gradient);
    Console.WriteLine(VectorUtils.Norm(gradient));
    Console.WriteLine();
    Console.WriteLine();

    // Deterministic pass: gradient descent with cubic line search, logging
    // every termination check this time.
    evalCount = 0;
    GDOptimizer gd = new GDOptimizer(terminate, null, true);
    verbose = true;
    CreateWrapped(out start, 0, 0);
    gd.Minimize(QuadTest2D, ref start, ref solution);
    QuadTest2D(ref solution, ref gradient);
    Console.WriteLine(VectorUtils.Norm(gradient));
}
/// <summary>
/// Initialize weights by running SGD up to specified tolerance.
/// </summary>
/// <param name="ch">Channel used for informational logging.</param>
/// <param name="cursorFactory">Factory producing cursors over the labeled training examples.</param>
/// <returns>The weight vector produced by the SGD pass.</returns>
protected virtual VBuffer<float> InitializeWeightsSgd(IChannel ch, FloatLabelCursor.Factory cursorFactory)
{
    if (!Quiet)
        ch.Info("Running SGD initialization with tolerance {0}", SgdInitializationTolerance);

    // Termination test: every 1000 examples, measure how far the iterate has
    // moved since the last check (norm of x - oldWeights) and stop once that
    // movement falls below SgdInitializationTolerance. Captures numExamples
    // and oldWeights across calls.
    int numExamples = 0;
    var oldWeights = VBufferUtils.CreateEmpty<float>(BiasCount + WeightCount);
    DTerminate terminateSgd =
        (in VBuffer<float> x) =>
        {
            if (++numExamples % 1000 != 0)
                return false;
            // oldWeights <- oldWeights - x, so its norm is the step distance.
            VectorUtils.AddMult(in x, -1, ref oldWeights);
            float normDiff = VectorUtils.Norm(oldWeights);
            // Snapshot the current iterate for the next comparison.
            x.CopyTo(ref oldWeights);
            // #if OLD_TRACING // REVIEW: How should this be ported?
            if (!Quiet)
            {
                Console.Write(".");
                if (numExamples % 50000 == 0)
                    Console.WriteLine("\t{0}\t{1}", numExamples, normDiff);
            }
            // #endif
            return normDiff < SgdInitializationTolerance;
        };

    VBuffer<float> result = default(VBuffer<float>);
    FloatLabelCursor cursor = null;
    try
    {
        float[] scratch = null;

        // Stochastic gradient callback: evaluates the loss gradient on the
        // next training example, restarting the cursor when it is exhausted.
        // Captures cursor and scratch so both persist across invocations.
        SgdOptimizer.DStochasticGradient lossSgd =
            (in VBuffer<float> x, ref VBuffer<float> grad) =>
            {
                // Zero out the gradient by sparsifying.
                // (Reuses the existing Values/Indices buffers with zero count.)
                grad = new VBuffer<float>(grad.Length, 0, grad.Values, grad.Indices);
                EnsureBiases(ref grad);

                if (cursor == null || !cursor.MoveNext())
                {
                    // End of data (or first call): dispose and open a fresh
                    // cursor so SGD can keep cycling over the examples.
                    if (cursor != null)
                        cursor.Dispose();
                    cursor = cursorFactory.Create();
                    if (!cursor.MoveNext())
                        return; // no examples at all — leave gradient zeroed
                }
                AccumulateOneGradient(in cursor.Features, cursor.Label, cursor.Weight, in x, ref grad, ref scratch);
            };

        // Dense vs. sparse starting point, per trainer configuration.
        VBuffer<float> sgdWeights;
        if (DenseOptimizer)
            sgdWeights = VBufferUtils.CreateDense<float>(BiasCount + WeightCount);
        else
            sgdWeights = VBufferUtils.CreateEmpty<float>(BiasCount + WeightCount);
        SgdOptimizer sgdo = new SgdOptimizer(terminateSgd);
        sgdo.Minimize(lossSgd, ref sgdWeights, ref result);
        // #if OLD_TRACING // REVIEW: How should this be ported?
        if (!Quiet)
            Console.WriteLine();
        // #endif
        ch.Info("SGD initialization done in {0} rounds", numExamples);
    }
    finally
    {
        // The cursor may still be open if Minimize exited mid-pass.
        if (cursor != null)
            cursor.Dispose();
    }

    return result;
}