Example #1
        override protected void CalcWeightUpdate(Connection p_connection, bool p_tensor)
        {
            /*
             * ADAM update (per weight, element-wise):
             * m = beta1*m + (1-beta1)*dx
             * v = beta2*v + (1-beta2)*(dx**2)
             * x += - learning_rate * m / (np.sqrt(v) + eps)
             *
             * dx is the gradient; the bias-correction terms of the original
             * ADAM formulation (m_hat, v_hat) are not applied here.
             */

            string id = p_connection.Id;

            if (p_tensor)
            {
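                // Lazily allocate the first/second-moment tensors for this connection on first use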
                if (!_m_t.ContainsKey(id))
                {
                    int batch = (int)p_connection.GradientT.GetShape(0);
                    _m_t[id] = new Tensor(batch, p_connection.OutDim, p_connection.InDim);
                    _v_t[id] = new Tensor(batch, p_connection.OutDim, p_connection.InDim);
                    //_eps_t[id] = new Tensor(_eps[id], batch);
                }

                // Element-wise squared gradient: g = dx**2
                Tensor g = new Tensor(_gradient_t[id]);
                g.Apply(Math.Pow, 2);

                // Update the running first- and second-moment estimates
                _m_t[id].RecurrentSum(_beta1, (1 - _beta1) * _gradient_t[id]);
                _v_t[id].RecurrentSum(_beta2, (1 - _beta2) * g);

                Tensor v = new Tensor(_v_t[id]);
                v.Apply(Math.Sqrt);

                // 1 / (sqrt(v) + eps); _eps_t[id] must have been initialized elsewhere,
                // otherwise this lookup throws (see the commented-out allocation above)
                Tensor gdiv = (v + _eps_t[id]);
                gdiv.Inv();

                // Combine into m / (sqrt(v) + eps), as in the ADAM step in the header comment
                Tensor m = new Tensor(_m_t[id]);
                m.Dot(gdiv);

                // Apply the scaled ADAM step to the weights and the bias
                p_connection.Update(_alpha * m);
                p_connection.OutGroup.UpdateBias(_alpha * p_connection.OutGroup.DeltaT);
            }
            else
            {
                // Update the moment caches; the previous matrices are released back to the pool
                Matrix m = Matrix.ADAM_mCache(_beta1, _m[id], _gradient[id]);
                Matrix.Release(_m[id]);
                _m[id] = m;

                Matrix v = Matrix.ADAM_vCache(_beta2, _v[id], _gradient[id]);
                Matrix.Release(_v[id]);
                _v[id] = v;

                if (_batchingUnit.IsActive)
                {
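                    // Mini-batch mode: accumulate the weight and bias deltas instead of overwriting them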
                    Matrix gu = Matrix.ADAM_gradientUpdate(_alpha, _v[id], _m[id], _epsilon);
                    Matrix dW = _dW[id] + gu;
                    Matrix.Release(gu);
                    Matrix.Release(_dW[id]);
                    _dW[id] = dW;

                    Vector bu = _alpha * p_connection.OutGroup.Delta;
                    Vector db = _db[id] + bu;
                    Vector.Release(bu);
                    Vector.Release(_db[id]);
                    _db[id] = db;
                }
                else
                {
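                    // Otherwise replace the pending deltas with this step's update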
                    Matrix.Release(_dW[id]);
                    _dW[id] = Matrix.ADAM_gradientUpdate(_alpha, _v[id], _m[id], _epsilon);

                    Vector.Release(_db[id]);
                    _db[id] = _alpha * p_connection.OutGroup.Delta;
                }
            }
        }
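
The same three formulas from the header comment, written out on plain arrays. This is a minimal, self-contained sketch for illustration only; the class and method names (AdamState, Step) are not part of the original code, the default hyperparameters are common ADAM values rather than values taken from the example above, and bias correction is again omitted to mirror the method shown.

using System;

public sealed class AdamState
{
    private readonly double[] _m;      // first moment estimate (per weight)
    private readonly double[] _v;      // second moment estimate (per weight)
    private readonly double _alpha;    // learning rate
    private readonly double _beta1;
    private readonly double _beta2;
    private readonly double _epsilon;

    public AdamState(int size, double alpha = 0.001, double beta1 = 0.9,
                     double beta2 = 0.999, double epsilon = 1e-8)
    {
        _m = new double[size];
        _v = new double[size];
        _alpha = alpha;
        _beta1 = beta1;
        _beta2 = beta2;
        _epsilon = epsilon;
    }

    // One ADAM step, applied in place:
    //   m = beta1*m + (1-beta1)*dx
    //   v = beta2*v + (1-beta2)*dx^2
    //   x += -alpha * m / (sqrt(v) + eps)
    public void Step(double[] weights, double[] gradient)
    {
        for (int i = 0; i < weights.Length; i++)
        {
            _m[i] = _beta1 * _m[i] + (1 - _beta1) * gradient[i];
            _v[i] = _beta2 * _v[i] + (1 - _beta2) * gradient[i] * gradient[i];
            weights[i] -= _alpha * _m[i] / (Math.Sqrt(_v[i]) + _epsilon);
        }
    }
}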