public void Optimize(TensorOld target, TensorOld gradient)
{
    if (!last.ContainsKey(gradient))
    {
        // First update for this gradient: initialize the stored velocity
        // with LearningRate * g and take a plain gradient-descent step.
        last[gradient] = gradient.GetSameShape();
        TensorOld.Apply(gradient, last[gradient], g => LearningRate * g);
        target.Minus(last[gradient]);
        return;
    }

    // Subsequent updates: accumulate the momentum velocity
    // v = Moment * v_prev + LearningRate * g, then step target -= v.
    var prev = last[gradient];
    TensorOld.Apply(prev, gradient, prev, (p, g) => p * Moment + g * LearningRate);
    target.Minus(prev);
}
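// A minimal, self-contained sketch of the same momentum-SGD update on plain
// double[] arrays, for readers who want the rule without the TensorOld API.
// The names MomentumSketch, MomentumStep, learningRate, and moment are
// illustrative assumptions, not part of the library above.
static class MomentumSketch
{
    // One in-place momentum step: velocity = moment * velocity + lr * gradient,
    // then weights -= velocity.
    public static void MomentumStep(double[] weights, double[] gradient,
        double[] velocity, double learningRate, double moment)
    {
        for (int i = 0; i < weights.Length; i++)
        {
            velocity[i] = moment * velocity[i] + learningRate * gradient[i];
            weights[i] -= velocity[i];
        }
    }
}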
public void Optimize(TensorOld target, TensorOld gradient)
{
    if (!dict.ContainsKey(gradient))
    {
        dict[gradient] = new AdamCache(gradient.Shape);
    }
    var c = dict[gradient];

    // First moment estimate: m = Beta1 * m + (1 - Beta1) * g
    TensorOld.Apply(c.M, gradient, c.M, (m, g) => Beta1 * m + (1 - Beta1) * g);
    // Second moment estimate: v = Beta2 * v + (1 - Beta2) * g^2
    TensorOld.Apply(c.V, gradient, c.V, (v, g) => Beta2 * v + (1 - Beta2) * g * g);
    // Parameter step: T = Alpha * m / (sqrt(v) + E); note that this variant
    // skips Adam's bias correction of m and v.
    TensorOld.Apply(c.M, c.V, c.T, (m, v) => Alpha * m / (Math.Sqrt(v) + E));
    target.Minus(c.T);
}
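// For comparison, a minimal sketch of the textbook Adam step on plain
// double[] arrays, including the bias correction that the method above omits.
// All names here (AdamSketch, AdamStep, alpha, beta1, beta2, eps, step) are
// illustrative assumptions, not part of the library above.
static class AdamSketch
{
    // One in-place Adam step; step counts from 1 for the bias correction.
    public static void AdamStep(double[] weights, double[] gradient,
        double[] m, double[] v, int step,
        double alpha = 0.001, double beta1 = 0.9, double beta2 = 0.999, double eps = 1e-8)
    {
        for (int i = 0; i < weights.Length; i++)
        {
            // Exponential moving averages of the gradient and its square.
            m[i] = beta1 * m[i] + (1 - beta1) * gradient[i];
            v[i] = beta2 * v[i] + (1 - beta2) * gradient[i] * gradient[i];

            // Bias-corrected estimates.
            double mHat = m[i] / (1 - System.Math.Pow(beta1, step));
            double vHat = v[i] / (1 - System.Math.Pow(beta2, step));

            weights[i] -= alpha * mHat / (System.Math.Sqrt(vHat) + eps);
        }
    }
}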