public override Volume <T> Evaluate(Session <T> session)
        {
            if (!this.IsDirty)
            {
                return(base.Evaluate(session));
            }

            this.IsDirty = false;

            var y = this.Parents[0].Evaluate(session);

            if (this.Result == null)
            {
                this.Result = this._builder.SameAs(this.Index == -1 ? new Shape(4) : new Shape(1));
            }

            if (this.Index == -1)
            {
                this.Result.Set(0, Ops <T> .Cast(y.Shape.Dimensions[0]));
                this.Result.Set(1, Ops <T> .Cast(y.Shape.Dimensions[1]));
                this.Result.Set(2, Ops <T> .Cast(y.Shape.Dimensions[2]));
                this.Result.Set(3, Ops <T> .Cast(y.Shape.Dimensions[3]));
            }
            else
            {
                this.Result.Set(0, Ops <T> .Cast(y.Shape.Dimensions[this.Index]));
            }

            return(base.Evaluate(session));
        }
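
A minimal sketch of the same shape-extraction dispatch on plain arrays (hypothetical helper, not part of the snippet's library): an index of -1 yields all four dimensions, any other index yields the single requested one.

        static int[] ShapeOrDimension(int[] dims, int index)
        {
            // index == -1 mirrors the "whole shape" branch above
            return index == -1 ? (int[])dims.Clone() : new[] { dims[index] };
        }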
        protected virtual void Backward(Volume <T> y)
        {
            var chrono = Stopwatch.StartNew();

            var batchSize = y.Shape.Dimensions[3];

            this.Loss = Ops <T> .Divide(this.Net.Backward(y), Ops <T> .Cast(batchSize));

            this.BackwardTimeMs = chrono.Elapsed.TotalMilliseconds / batchSize;
        }
Example #3
        protected override void TrainImplem()
        {
            var parametersAndGradients    = this.Net.GetParametersAndGradients();
            var isMomentumGreaterThanZero = Ops <T> .GreaterThan(this.Momentum, Ops <T> .Zero);

            // initialize lists for accumulators. Will only be done once on first iteration
            if (this.gsum.Count == 0 && isMomentumGreaterThanZero)
            {
                foreach (var t in parametersAndGradients)
                {
                    this.gsum.Add(BuilderInstance <T> .Volume.SameAs(t.Volume.Shape));
                }
            }

            T factor = Ops <T> .Divide(this.LearningRate, Ops <T> .Cast(this.BatchSize));

            // perform an update for all sets of weights
            for (var i = 0; i < parametersAndGradients.Count; i++)
            {
                var parametersAndGradient = parametersAndGradients[i];
                var vol  = parametersAndGradient.Volume;
                var grad = parametersAndGradient.Gradient;

                // per-parameter L1/L2 weight-decay multipliers
                var l2DecayMul = parametersAndGradient.L2DecayMul ?? Ops <T> .One;
                var l1DecayMul = parametersAndGradient.L1DecayMul ?? Ops <T> .One;
                var l2Decay    = Ops <T> .Multiply(this.L2Decay, l2DecayMul);

                var l1Decay = Ops <T> .Multiply(this.L1Decay, l1DecayMul);

                //  this.L2DecayLoss += l2Decay * vol.Get(j) * vol.Get(j) / 2; // accumulate weight decay loss
                //  this.L1DecayLoss += l1Decay * Math.Abs(vol.Get(j));
                //  var l1Grad = l1Decay * (vol.Get(j) > 0 ? 1 : -1);

                var l2Grad = vol * l2Decay;
                var gij    = grad + l2Grad;

                //  var gij = (l2Grad + l1Grad + vol.GetGradient(j)) / this.BatchSize; // raw batch gradient

                if (isMomentumGreaterThanZero)
                {
                    // momentum update
                    var dx = this.gsum[i] * this.Momentum + gij * factor; // step
                    this.gsum[i] = dx.Clone();                            // back this up for next iteration of momentum
                    vol.MapInplace((v, d) => d, vol - dx);                // apply corrected gradient
                }
                else
                {
                    // vanilla sgd
                    vol.MapInplace((v, d) => d, vol - gij * factor);
                }

                grad.Clear(); // zero out gradient so that we can begin accumulating anew
            }
        }
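
For reference, a self-contained sketch of the same momentum update on flat float arrays (hypothetical helper; the L1/L2 decay terms above are omitted): dx = momentum * gsum + (lr / batchSize) * grad, applied as w -= dx.

        static void MomentumSgdStep(float[] w, float[] grad, float[] gsum, float momentum, float lr, int batchSize)
        {
            var factor = lr / batchSize;
            for (var j = 0; j < w.Length; j++)
            {
                var dx = momentum * gsum[j] + factor * grad[j]; // step
                gsum[j] = dx;                                   // back this up for the next iteration
                w[j] -= dx;                                     // apply the update
                grad[j] = 0f;                                   // zero out so we can begin accumulating anew
            }
        }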
        public override Volume <T> Evaluate(Session <T> session)
        {
            var variables = session.LearnableVariables;

            var volumes   = new Dictionary <Variable <T>, Volume <T> >();
            var gradients = new Dictionary <Variable <T>, Volume <T> >();

            if (this._updaters.Count == 0)
            {
                foreach (var variable in variables.Values)
                {
                    var lr   = this.Graph.PlaceHolder("lr");   // learning rate
                    var grad = this.Graph.PlaceHolder("grad"); // gradients
                    var v    = this.Graph.PlaceHolder("v");    // volume

                    this._updaters[variable] = v - grad * lr;
                }
            }

            this._learningRate.Set(0, Ops <T> .Divide(this._lr, Ops <T> .Cast(session.BatchSize)));

            // Prepare updated variables
            foreach (var variable in variables.Values)
            {
                volumes[variable]   = variable.Evaluate(session);
                gradients[variable] = variable.Derivate.Evaluate(session);
            }

            // Apply updated variables
            foreach (var variable in variables.Values)
            {
                var grad = gradients[variable];
                var v    = volumes[variable];

                var variableV = session.Run(this._updaters[variable],
                                            new Dictionary <string, Volume <T> >
                {
                    { "lr", this._learningRate },
                    { "grad", grad },
                    { "v", v }
                }, false);

                variable.Result.Storage.CopyFrom(variableV.Storage);
                variable.SetDirty();
            }

            return(null);
        }
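
Note the design here: the symbolic updater v - grad * lr is built once per variable on the first call; every later call only re-binds the lr, grad and v placeholders through session.Run, so no new graph nodes are allocated during training.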
Example #5
        public void TestCast()
        {
            Assert.AreEqual(5, Ops.Cast <int, int>(5));
            Assert.AreEqual(5L, Ops.Cast <int, long>(5));
            Assert.AreEqual(5, Ops.Cast <long, int>(5));
            Assert.AreEqual("abc", Ops.Cast <string, object>("abc"));
            Assert.AreEqual(new MyLong(5), Ops.Cast <int, MyLong>(5));

            Assert.ThrowsException <NotSupportedException>(() =>
            {
                Ops.Cast <string, long>("5");
            });

            Assert.AreEqual((int)new BigInteger(12), Ops.Cast <BigInteger, int>(new BigInteger(12)));
        }
Example #6
        protected override void TrainImplem()
        {
            var parametersAndGradients    = this.Net.GetParametersAndGradients();
            var isMomentumGreaterThanZero = Ops <T> .GreaterThan(this.Momentum, Ops <T> .Zero);

            // initialize lists for accumulators. Will only be done once on first iteration
            if (this.velocities.Count == 0)
            {
                foreach (var parameter in parametersAndGradients)
                {
                    this.velocities.Add(BuilderInstance <T> .Volume.SameAs(parameter.Volume.Shape));
                    this.regGrads.Add(BuilderInstance <T> .Volume.SameAs(parameter.Volume.Shape));
                }
            }

            // perform an update for all sets of weights
            for (var i = 0; i < parametersAndGradients.Count; i++)
            {
                var parametersAndGradient = parametersAndGradients[i];
                var parameters            = parametersAndGradient.Volume;
                var gradients             = parametersAndGradient.Gradient;
                var velocity = this.velocities[i];

                var batchAdjustedLearningRate = Ops <T> .Divide(this.LearningRate, Ops <T> .Cast(this.BatchSize));

                // delta = gradient + regularization;
                gradients.Multiply(batchAdjustedLearningRate, gradients);

                if (isMomentumGreaterThanZero)
                {
                    // sgd with momentum update
                    velocity.Multiply(this.Momentum, velocity);    // step
                    velocity.Add(gradients, velocity);
                    velocity.SubtractFrom(parameters, parameters); // apply corrected gradient
                }
                else
                {
                    // vanilla sgd
                    gradients.SubtractFrom(parameters, parameters);
                }

                // zero out gradient so that we can begin accumulating anew
                gradients.Clear();
            }
        }
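
This variant performs the same momentum update as the operator-based trainer above, but through in-place Multiply/Add/SubtractFrom calls on preallocated velocity volumes, so no temporary volumes are created on each step.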
Example #7
        public override Volume <T> Evaluate(Session <T> session)
        {
            if (!this.IsDirty)
            {
                return(base.Evaluate(session));
            }

            this.IsDirty = false;

            var y = this.Parents[1].Evaluate(session);
            var outputActivation = this.Parents[0].Evaluate(session);

            var loss = Ops <T> .Zero;

            for (var n = 0; n < y.Shape.Dimensions[3]; n++)
            {
                for (var d = 0; d < y.Shape.Dimensions[2]; d++)
                {
                    for (var h = 0; h < y.Shape.Dimensions[1]; h++)
                    {
                        for (var w = 0; w < y.Shape.Dimensions[0]; w++)
                        {
                            var expected = y.Get(w, h, d, n);
                            var actual   = outputActivation.Get(w, h, d, n);
                            if (Ops <T> .Zero.Equals(actual))
                            {
                                actual = Ops <T> .Epsilon;
                            }

                            var current = Ops <T> .Multiply(expected, Ops <T> .Log(actual));

                            loss = Ops <T> .Add(loss, current);
                        }
                    }
                }
            }

            var batchSize = outputActivation.Shape.Dimensions[3];

            loss = Ops <T> .Divide(Ops <T> .Negate(loss), Ops <T> .Cast(batchSize));

            this.Result.Set(0, loss);

            return(base.Evaluate(session));
        }
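
A minimal, self-contained sketch of the same cross-entropy computation on flat float arrays (hypothetical helper; MathF from .NET Core is assumed, and the epsilon value is illustrative), clamping zero predictions exactly as the loop above does:

        static float CrossEntropyLoss(float[] expected, float[] actual, int batchSize)
        {
            var loss = 0f;
            for (var j = 0; j < expected.Length; j++)
            {
                var p = actual[j] == 0f ? 1e-7f : actual[j]; // guard against log(0)
                loss += expected[j] * MathF.Log(p);
            }
            return -loss / batchSize; // negate and average over the batch
        }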
Example #8
        public override void Backward(Volume <T> y, out T loss)
        {
            y.DoSubtractFrom(this.OutputActivation, this.InputActivationGradients.ReShape(this.OutputActivation.Shape.Dimensions.ToArray()));

            if (this._result == null)
            {
                this._result = BuilderInstance <T> .Volume.SameAs(this.OutputActivation.Shape);

                this._sum = BuilderInstance <T> .Volume.SameAs(new Shape(1));
            }

            this._sum.Clear();
            this.OutputActivation.DoMultiply(this.OutputActivation, this._result); // dy * dy
            var half = (T)Convert.ChangeType(0.5, typeof(T));

            this._result.DoMultiply(this._result, half); // dy * dy * 0.5
            this._result.DoSum(this._sum);               // sum over all batch
            var batchSize = y.Shape.GetDimension(3);

            loss = Ops <T> .Divide(this._sum.Get(0), Ops <T> .Cast(batchSize)); // average
        }
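
The loss computed here is the batch average of 0.5 * dy * dy, where dy = output - target is also the input gradient. A flat-array sketch (hypothetical helper):

        static float RegressionLoss(float[] expected, float[] actual, int batchSize)
        {
            var sum = 0f;
            for (var j = 0; j < expected.Length; j++)
            {
                var dy = actual[j] - expected[j]; // doubles as the input gradient
                sum += 0.5f * dy * dy;
            }
            return sum / batchSize; // average over the batch
        }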
Example #9
        public override Volume <T> Evaluate(Session <T> session)
        {
            if (!this.IsDirty)
            {
                return(this.Result);
            }
            this.IsDirty = false;

            var y = this.Parents[0].Evaluate(session);

            if (this.Result == null)
            {
                this.Result = this._builder.SameAs(new Shape(4));
            }

            this.Result.Set(0, Ops <T> .Cast(y.Shape.GetDimension(0)));
            this.Result.Set(1, Ops <T> .Cast(y.Shape.GetDimension(1)));
            this.Result.Set(2, Ops <T> .Cast(y.Shape.GetDimension(2)));
            this.Result.Set(3, Ops <T> .Cast(y.Shape.GetDimension(3)));

            return(this.Result);
        }
Example #10
        public override void Backward(Volume <T> y, out T loss)
        {
            var reshape = y.ReShape(new Shape(1, 1, -1, Shape.Keep));
            var dy      = this.InputActivationGradients.ReShape(this.OutputActivation.Shape.Dimensions);

            reshape.SubtractFrom(this.OutputActivation, dy);

            if (this._result == null)
            {
                this._result = BuilderInstance <T> .Volume.SameAs(this.OutputActivation.Shape);

                this._sum = BuilderInstance <T> .Volume.SameAs(new Shape(1));
            }

            this._sum.Clear();
            dy.Multiply(dy, this._result); // dy * dy
            var half = (T)Convert.ChangeType(0.5, typeof(T));

            this._result.Multiply(half, this._result); // dy * dy * 0.5
            this._result.Sum(this._sum);               // sum over all batch
            var batchSize = y.Shape.Dimensions[3];

            loss = Ops <T> .Divide(this._sum.Get(0), Ops <T> .Cast(batchSize)); // average
        }
Example #11
        public override Volume <T> Evaluate(Session <T> session)
        {
            var variables = session.LearnableVariables;

            var dico = new Dictionary <Variable <T>, Volume <T> >();

            if (this._updaters.Count == 0)
            {
                foreach (var variable in variables.Values)
                {
                    var lr   = this._cns.PlaceHolder("lr");   // learning rate
                    var grad = this._cns.PlaceHolder("grad"); // gradients
                    var v    = this._cns.PlaceHolder("v");    // volume

                    this._updaters[variable] = v - grad * lr;
                }
            }

            this._learningRate.Set(0, Ops <T> .Divide(this._lr, Ops <T> .Cast(session.BatchSize)));

            // Prepare updated variables
            foreach (var variable in variables.Values)
            {
                var grad   = variable.Derivate.Evaluate(session);
                var volume = variable.Evaluate(session);

                var gradBatchSize   = grad.Shape.GetDimension(3);
                var volumeBatchSize = volume.Shape.GetDimension(3);

                if (gradBatchSize != volumeBatchSize && gradBatchSize != 1)
                {
                    // Batch size > 1

                    var gradShape = new Shape(grad.Shape);
                    gradShape.SetDimension(0, variable.Result.Shape.GetDimension(0));
                    gradShape.SetDimension(1, variable.Result.Shape.GetDimension(1));
                    gradShape.SetDimension(3, 1);

                    Volume <T> tempGrad;
                    if (!this._tempGrads.TryGetValue(variable, out tempGrad) || !tempGrad.Shape.Equals(gradShape))
                    {
                        tempGrad = BuilderInstance <T> .Volume.SameAs(gradShape);

                        this._tempGrads[variable] = tempGrad;
                    }

                    grad.DoSum(tempGrad); // sum gradient batch
                    grad = tempGrad;
                }

                var variableV = session.Run(this._updaters[variable],
                                            new Dictionary <string, Volume <T> >
                {
                    { "lr", this._learningRate },
                    { "grad", grad },
                    { "v", volume }
                });

                dico[variable] = variableV;
            }

            // Apply updated variables
            foreach (var pair in dico)
            {
                pair.Key.Result.Storage.CopyFrom(pair.Value.Storage);
            }

            return(null);
        }
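
When the gradient still carries a batch dimension larger than 1, it is summed down to a single gradient before the update is applied. A flat sketch of that reduction (hypothetical helper):

        static float[] SumOverBatch(float[][] gradPerSample)
        {
            var summed = new float[gradPerSample[0].Length];
            foreach (var g in gradPerSample)
            {
                for (var j = 0; j < g.Length; j++)
                {
                    summed[j] += g[j]; // accumulate across the batch dimension
                }
            }
            return summed;
        }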
Example #12
        protected override void TrainImplem()
        {
            var parametersAndGradients = this.Net.GetParametersAndGradients();

            // initialize lists for accumulators. Will only be done once on first iteration
            if (this.gsum.Count == 0)
            {
                foreach (var t in parametersAndGradients)
                {
                    this.gsum.Add(BuilderInstance <T> .Volume.SameAs(t.Volume.Shape));
                    this.xsum.Add(BuilderInstance <T> .Volume.SameAs(t.Volume.Shape));
                }
            }

            // perform an update for all sets of weights
            for (var i = 0; i < parametersAndGradients.Count; i++)
            {
                var parametersAndGradient = parametersAndGradients[i];
                var vol  = parametersAndGradient.Volume;
                var grad = parametersAndGradient.Gradient;

                grad.Multiply(Ops <T> .Divide(Ops <T> .One, Ops <T> .Cast(this.BatchSize)), grad); // grad *= 1 / BatchSize

                using (var temp1 = BuilderInstance <T> .Volume.SameAs(vol.Shape))
                    using (var temp2 = BuilderInstance <T> .Volume.SameAs(vol.Shape))
                        using (var gradgrad = BuilderInstance <T> .Volume.SameAs(vol.Shape))
                            using (var two = BuilderInstance <T> .Volume.From(new[] { Ops <T> .Cast(2.0) }, new Shape(1)))
                                using (var epsilon = BuilderInstance <T> .Volume.From(new[] { this.Eps }, new Shape(1)))
                                {
                                    // momentum update

                                    // update biased first moment estimate: gsum[i] = gsum[i] * Beta1 +  (1 - Beta1) * grad
                                    this.gsum[i].Multiply(this.Beta1, temp1);                                             // temp1 = this.gsum[i] * this.Beta1
                                    grad.Multiply(Ops <T> .Add(Ops <T> .One, Ops <T> .Negate(this.Beta1)), this.gsum[i]); //  this.gsum[i] =  grad * (1 - Beta1)
                                    temp1.Add(this.gsum[i]);                                                              //  this.gsum[i] += temp1

                                    grad.Power(two, gradgrad);                                                            // gradgrad = grad * grad

                                    // update biased second moment estimate: xsum[i] = xsum[i] * Beta2 +  (1 - Beta2) * grad * grad
                                    this.xsum[i].Multiply(this.Beta2, temp1);                                                 // temp1 = this.xsum[i] * this.Beta2
                                    gradgrad.Multiply(Ops <T> .Add(Ops <T> .One, Ops <T> .Negate(this.Beta2)), this.xsum[i]); // this.xsum[i] = gradgrad * (1 - Beta2)
                                    temp1.Add(this.xsum[i]);                                                                  //  this.xsum[i] += temp1

                                    var biasCorr1 = temp1;
                                    var biasCorr2 = temp2;

                                    this.gsum[i].Multiply(Ops <T> .Add(Ops <T> .One, Ops <T> .Negate(Ops <T> .Pow(this.Beta1, Ops <T> .Cast(this.k)))), biasCorr1); // correct bias first moment estimate
                                    this.xsum[i].Multiply(Ops <T> .Add(Ops <T> .One, Ops <T> .Negate(Ops <T> .Pow(this.Beta2, Ops <T> .Cast(this.k)))), biasCorr2); // correct bias second moment estimate

                                    biasCorr2.Sqrt(biasCorr2);                                                                                                      // biasCorr2 = sqrt(biasCorr2)
                                    epsilon.Add(biasCorr2);                                                                                                         // biasCorr2 += epsilon

                                    var dx = biasCorr1;
                                    dx.Multiply(this.LearningRate, dx);
                                    dx.Divide(biasCorr2, dx);

                                    dx.SubtractFrom(vol, vol);
                                }

                grad.Clear(); // zero out gradient so that we can begin accumulating anew


                this.k += this.BatchSize;
            }
        }
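
For comparison, a self-contained sketch of the textbook Adam rule on flat float arrays (hypothetical helper; note the standard formulation divides by the bias-correction terms 1 - beta^t, whereas the snippet above multiplies by them; t is the 1-based step count):

        static void AdamStep(float[] w, float[] grad, float[] m, float[] v,
                             float beta1, float beta2, float eps, float lr, int t)
        {
            for (var j = 0; j < w.Length; j++)
            {
                m[j] = beta1 * m[j] + (1 - beta1) * grad[j];           // biased first moment
                v[j] = beta2 * v[j] + (1 - beta2) * grad[j] * grad[j]; // biased second moment
                var mHat = m[j] / (1 - MathF.Pow(beta1, t));           // bias-corrected first moment
                var vHat = v[j] / (1 - MathF.Pow(beta2, t));           // bias-corrected second moment
                w[j] -= lr * mHat / (MathF.Sqrt(vHat) + eps);          // apply corrected gradient
                grad[j] = 0f;                                          // zero out so we can begin accumulating anew
            }
        }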
Example #13
        protected override void TrainImplem()
        {
            var parametersAndGradients = this.Net.GetParametersAndGradients();

            // initialize lists for accumulators. Will only be done once on first iteration
            if (this.gsum.Count == 0)
            {
                foreach (var t in parametersAndGradients)
                {
                    this.gsum.Add(BuilderInstance <T> .Volume.SameAs(t.Volume.Shape));
                    this.xsum.Add(BuilderInstance <T> .Volume.SameAs(t.Volume.Shape));
                }
            }

            var factor = Ops <T> .Divide(Ops <T> .One, Ops <T> .Cast(this.BatchSize));

            // perform an update for all sets of weights
            for (var i = 0; i < parametersAndGradients.Count; i++)
            {
                var parametersAndGradient = parametersAndGradients[i];
                var vol  = parametersAndGradient.Volume;
                var grad = parametersAndGradient.Gradient;

                // per-parameter L1/L2 weight-decay multipliers
                var l2DecayMul = parametersAndGradient.L2DecayMul ?? Ops <T> .One;
                var l1DecayMul = parametersAndGradient.L1DecayMul ?? Ops <T> .One;
                var l2Decay    = Ops <T> .Multiply(this.L2Decay, l2DecayMul);

                var l1Decay = Ops <T> .Multiply(this.L1Decay, l1DecayMul);

                //  this.L2DecayLoss += l2Decay * vol.Get(j) * vol.Get(j) / 2; // accumulate weight decay loss
                //  this.L1DecayLoss += l1Decay * Math.Abs(vol.Get(j));

                var l1Grad = vol.Clone();
                l1Grad.MapInplace(x => Ops <T> .GreaterThan(x, Ops <T> .Zero) ? Ops <T> .One : Ops <T> .Negate(Ops <T> .One));
                l1Grad = l1Grad * l1Decay;

                var l2Grad = vol * l2Decay;

                var gij = (grad + l2Grad + l1Grad) * factor;

                // momentum update
                this.gsum[i] = this.gsum[i] * this.Beta1 + gij * Ops <T> .Add(Ops <T> .One, Ops <T> .Negate(this.Beta1)); // update biased first moment estimate

                var gijgij = gij.Clone();
                gijgij.MapInplace(x => Ops <T> .Multiply(x, x));
                this.xsum[i] = this.xsum[i] * this.Beta2 + gijgij * Ops <T> .Add(Ops <T> .One, Ops <T> .Negate(this.Beta2));                 // update biased second moment estimate

                var biasCorr1 = this.gsum[i] * Ops <T> .Add(Ops <T> .One, Ops <T> .Negate(Ops <T> .Pow(this.Beta1, Ops <T> .Cast(this.k)))); // correct bias first moment estimate

                var biasCorr2 = this.xsum[i] * Ops <T> .Add(Ops <T> .One, Ops <T> .Negate(Ops <T> .Pow(this.Beta2, Ops <T> .Cast(this.k)))); // correct bias second moment estimate

                biasCorr2.MapInplace(x => Ops <T> .Add(Ops <T> .Sqrt(x), this.Eps));

                var dx = biasCorr1 * this.LearningRate;
                dx.MapInplace((l, r) => Ops <T> .Divide(l, r), biasCorr2);

                vol.MapInplace((v, d) => d, vol - dx); // apply corrected gradient

                grad.Clear();                          // zero out gradient so that we can begin accumulating anew
            }

            this.k += this.BatchSize;
        }
Example #14
        public Var <T> Cast <T>() => Ops.Cast <E, T>(Value);

        protected override void TrainImplem()
        {
            var parametersAndGradients    = this.Net.GetParametersAndGradients();
            var isMomentumGreaterThanZero = Ops <T> .GreaterThan(this.Momentum, Ops <T> .Zero);

            // initialize lists for accumulators. Will only be done once on first iteration
            if (this.velocities.Count == 0)
            {
                foreach (var parameter in parametersAndGradients)
                {
                    this.velocities.Add(BuilderInstance <T> .Volume.SameAs(parameter.Volume.Shape));
                    this.deltas.Add(BuilderInstance <T> .Volume.SameAs(parameter.Volume.Shape));
                    this.regGrads.Add(BuilderInstance <T> .Volume.SameAs(parameter.Volume.Shape));
                }
            }

            // perform an update for all sets of weights
            for (var i = 0; i < parametersAndGradients.Count; i++)
            {
                var parametersAndGradient = parametersAndGradients[i];
                var parameters            = parametersAndGradient.Volume;
                var gradients             = parametersAndGradient.Gradient;
                var delta = this.deltas[i];
                var regularizationGradients = this.regGrads[i];
                var velocity = this.velocities[i];

                // per-parameter L1/L2 weight-decay multipliers
                var l2DecayMul = parametersAndGradient.L2DecayMul ?? Ops <T> .One;
                var l1DecayMul = parametersAndGradient.L1DecayMul ?? Ops <T> .One;
                var l2Decay    = Ops <T> .Multiply(this.L2Decay, l2DecayMul);

                var l1Decay = Ops <T> .Multiply(this.L1Decay, l1DecayMul);

                //  this.L2DecayLoss += l2Decay * vol.Get(j) * vol.Get(j) / 2; // accumulate weight decay loss
                //  this.L1DecayLoss += l1Decay * Math.Abs(vol.Get(j));

                //L1 regularization
                if (Ops <T> .GreaterThan(l1Decay, Ops <T> .Zero))
                {
                    //l1Grad = l1Grad * l1Decay;
                    parameters.Storage.Map(x => Ops <T> .GreaterThan(x, Ops <T> .Zero) ? Ops <T> .One : Ops <T> .Negate(Ops <T> .One), regularizationGradients.Storage);
                    regularizationGradients.DoMultiply(delta, l1Decay);
                }
                else
                {
                    delta.Clear();
                }

                //L2 regularization
                if (Ops <T> .GreaterThan(l2Decay, Ops <T> .Zero))
                {
                    //l2Grad = vol * l2Decay;
                    parameters.DoMultiply(regularizationGradients, l2Decay);
                    delta.DoAdd(regularizationGradients, delta);
                }

                T batchAdjustedLearningRate = Ops <T> .Divide(this.LearningRate, Ops <T> .Cast(this.BatchSize));

                //delta = gradient + regularization;
                gradients.DoMultiply(gradients, batchAdjustedLearningRate);
                delta.DoMultiply(delta, this.LearningRate);
                delta.DoAdd(gradients, delta);

                if (isMomentumGreaterThanZero)
                {
                    // sgd with momentum update
                    velocity.DoMultiply(velocity, this.Momentum);    // step
                    velocity.DoAdd(delta, velocity);
                    velocity.DoSubtractFrom(parameters, parameters); // apply corrected gradient
                }
                else
                {
                    // vanilla sgd
                    delta.DoSubtractFrom(parameters, parameters);
                }

                // zero out gradient so that we can begin accumulating anew
                gradients.Clear();
            }
        }
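
Per weight, the regularization terms above reduce to l1Decay * sign(w) and l2Decay * w added to the raw gradient. A scalar sketch (hypothetical helper):

        static float RegularizedGradient(float w, float grad, float l1Decay, float l2Decay)
        {
            var l1Grad = l1Decay * (w > 0f ? 1f : -1f); // sign of the weight
            var l2Grad = l2Decay * w;                   // weight decay
            return grad + l1Grad + l2Grad;
        }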
        public override Volume <T> Evaluate(Session <T> session)
        {
            var variables = session.LearnableVariables;

            var volumes   = new Dictionary <Variable <T>, Volume <T> >();
            var gradients = new Dictionary <Variable <T>, Volume <T> >();

            if (this._updaters.Count == 0)
            {
                foreach (var variable in variables.Values)
                {
                    var one          = this.Graph.Const(Ops <T> .One, "one");
                    var epsilon      = this.Graph.PlaceHolder("epsilon");
                    var beta1        = this.Graph.PlaceHolder("beta1");
                    var beta2        = this.Graph.PlaceHolder("beta2");
                    var m            = this.Graph.Variable(Ops <T> .Zero, "m");
                    var v            = this.Graph.Variable(Ops <T> .Zero, "v");
                    var t            = this.Graph.Variable(Ops <T> .Zero, "t");
                    var grad         = this.Graph.PlaceHolder("grad");                // gradients
                    var learningRate = this.Graph.PlaceHolder("lr");                  // learning rate

                    var m_t = this.Graph.Assign(m, beta1 * m + (one - beta1) * grad); // m_t <- beta1 * m_{t-1} + (1 - beta1) * g
                    //m_t.Evaluated += (sender, args) => { Console.WriteLine($"m[{variable}]={ ((Op<T>)sender).Result.Get(0)}"); };

                    var v_t = this.Graph.Assign(v, beta2 * v + (one - beta2) * grad * grad);  // beta2 * v_{t-1} + (1 - beta2) * g * g
                    //v_t.Evaluated += (sender, args) => { Console.WriteLine($"v[{variable}]={ ((Op<T>)sender).Result.Get(0)}"); };

                    var t_plus_1 = this.Graph.Assign(t, t + one); // t = t + 1
                    //t_plus_1.Evaluated += (sender, args) => { Console.WriteLine($"t[{variable}]={ ((Op<T>)sender).Result.Get(0)}"); };

                    var lr = learningRate * this.Graph.Sqrt(one - (beta2 ^ t_plus_1)) / (one - (beta1 ^ t_plus_1)); // lr_t <- learning_rate * sqrt(1 - beta2^t) / (1 - beta1^t)
                    //lr.Evaluated += (sender, args) => { Console.WriteLine($"lr[{variable}]={ ((Op<T>)sender).Result.Get(0)}"); };

                    var vol = this.Graph.PlaceHolder("vol");

                    var delta = lr * (m_t / (this.Graph.Sqrt(v_t) + epsilon));
                    //delta.Evaluated += (sender, args) => { Console.WriteLine($"delta[{variable}]={ ((Op<T>)sender).Result.Get(0)}"); };

                    this._updaters[variable] = vol - delta;
                }
            }

            this._learningRate.Set(0, Ops <T> .Divide(this._lr, Ops <T> .Cast(session.BatchSize)));

            // Prepare updated variables
            foreach (var variable in variables.Values)
            {
                volumes[variable]   = variable.Evaluate(session);
                gradients[variable] = variable.Derivate.Evaluate(session);
            }

            // Apply updated variables
            foreach (var variable in variables.Values)
            {
                var grad = gradients[variable];
                var v    = volumes[variable];

                var variableV = session.Run(this._updaters[variable],
                                            new Dictionary <string, Volume <T> >
                {
                    { "epsilon", this._epsilon },
                    { "beta1", this._beta1 },
                    { "beta2", this._beta2 },
                    { "lr", this._learningRate },
                    { "grad", grad },
                    { "vol", v }
                }, false);

                variable.Result.Storage.CopyFrom(variableV.Storage);
                variable.SetDirty();

                //    Console.WriteLine("-----------------");
            }

            return(null);
        }