Example #1
        public double Train(Vector p_state0, int p_action0, Vector p_state1, float p_reward, bool p_final = false)
        {
            float mse     = 0;
            float maxQs1a = CalcMaxQa(p_state1);

            // forward pass for Q(s0, ·); a copy of the outputs seeds the training target
            _networkQ.Activate(p_state0);

            Vector target = Vector.Copy(_networkQ.Output);

            if (p_final)
            {
                // terminal transition: the target is the immediate reward
                target[p_action0] = p_reward;
            }
            else
            {
                // Bellman target: r + gamma * max_a Q(s1, a)
                target[p_action0] = p_reward + _gamma * maxQs1a;
            }

            mse = _optimizer.Train(p_state0, target);
            Vector.Release(target);

            _asyncUnit.Update(p_final);

            // apply the deferred asynchronous update once the unit signals readiness
            if (_asyncUnit.IsAsyncReady)
            {
                _optimizer.AsyncUpdate();
            }

            return mse;
        }
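
CalcMaxQa is referenced above but not shown. A minimal sketch of what it plausibly computes, assuming the same _networkQ/Vector API as the example; the body here is an assumption, not the original code:

        // Hypothetical sketch: greedy maximum over the Q-values predicted for a state.
        private float CalcMaxQa(Vector p_state)
        {
            _networkQ.Activate(p_state);

            float maxQ = float.MinValue;

            for (int a = 0; a < _networkQ.Output.Size; a++)
            {
                if (_networkQ.Output[a] > maxQ)
                {
                    maxQ = _networkQ.Output[a];
                }
            }

            return maxQ;
        }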
Example #2
        public float Train(Vector p_state0, int p_action0, Vector p_state1, float p_reward, bool p_final = false)
        {
            float mse = 0;

            // buffer full: evict the oldest transition and release its pooled vectors
            if (_replayBuffer.Count == _replayBufferCapacity)
            {
                ReplayBufferElem e = _replayBuffer.Dequeue();
                Vector.Release(e.S0);
                Vector.Release(e.S1);
            }

            // store defensive copies of both states so the caller can reuse its vectors
            _replayBuffer.Enqueue(new ReplayBufferElem {
                S0 = Vector.Copy(p_state0), S1 = Vector.Copy(p_state1), Action = p_action0, Reward = p_reward, Final = p_final
            });

            // train once every _batchSize calls, on a random minibatch drawn from the buffer
            if (_replayBuffer.Count >= _batchSize && _batchIndex == _batchSize)
            {
                _batchIndex = 0;
                _sample     = _replayBuffer.OrderBy(x => _rnd.Next()).Take(_batchSize);

                int i = 0;

                Stopwatch watch = Stopwatch.StartNew();

                foreach (ReplayBufferElem b in _sample)
                {
                    Vector t = CalcTarget(b.S0, b.S1, b.Action, b.Reward, b.Final);
                    mse += _optimizer.Train(b.S0, t);   // accumulate the per-sample loss
                    Vector.Release(t);
                    i++;
                }

                mse /= i;   // mean loss over the minibatch

                watch.Stop();
#if !DEBUG
                Console.WriteLine("Training time [ms] " + watch.ElapsedMilliseconds);
#endif
            }

            _batchIndex++;

            // periodically overwrite the frozen target network with the online weights
            if (_qtUpdateIndex == _qtUpdateSize)
            {
                _qtUpdateIndex = 0;
                _QTnetwork.OverrideParams(_Qnetwork);
            }

            _qtUpdateIndex++;

            return mse;
        }
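
CalcTarget is not shown above. A minimal sketch of a standard DQN target built against the frozen target network _QTnetwork; the body (and the _gamma field) is an assumption based on the names the example uses, not the original code:

        // Hypothetical sketch: y[a0] = r + gamma * max_a QT(s1, a); other outputs stay as predicted.
        private Vector CalcTarget(Vector p_state0, Vector p_state1, int p_action, float p_reward, bool p_final)
        {
            _QTnetwork.Activate(p_state1);

            float maxQs1a = float.MinValue;

            for (int a = 0; a < _QTnetwork.Output.Size; a++)
            {
                if (_QTnetwork.Output[a] > maxQs1a)
                {
                    maxQs1a = _QTnetwork.Output[a];
                }
            }

            _Qnetwork.Activate(p_state0);
            Vector target = Vector.Copy(_Qnetwork.Output);

            target[p_action] = p_final ? p_reward : p_reward + _gamma * maxQs1a;

            return target;
        }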
Example #3
        override public double Train(Vector p_state0, int p_action, Vector p_state1, float p_reward, bool p_final)
        {
            double mse = 0;

            // defer the update: keep a copy of this transition on the stack
            _stack.Push(new StackItem {
                State = Vector.Copy(p_state0), Action = p_action, Reward = p_reward
            });

            _asyncUnit.Update(p_final);

            if (_asyncUnit.IsAsyncReady)
            {
                // the critic value occupies the last output element; bootstrap from V(s1)
                int   criticOutput = _thNetwork.Output.Size - 1;
                float value1       = _thNetwork.Activate(p_state1)[criticOutput];

                // no bootstrapping on terminal states
                float R = p_final ? 0f : value1;

                // pop transitions newest-first, accumulating the n-step return R
                while (_stack.Count > 0)
                {
                    StackItem item = (StackItem)_stack.Pop();
                    R = item.Reward + _gamma * R;

                    Vector target = Vector.Copy(_thNetwork.Activate(item.State));

                    // the copied critic head already holds V(item.State)
                    float value0 = target[criticOutput];

                    target[item.Action]  = R - value0; // advantage estimate for the actor head
                    target[criticOutput] = R;          // n-step return target for the critic head

                    mse += _thOptimizer.Train(item.State, target);

                    Vector.Release(item.State);
                    Vector.Release(target);
                }

                // push the thread-local updates to the shared optimizer, then resync the local network
                _optimizer.AsyncUpdate(_thOptimizer);
                _thNetwork.OverrideParams(_network);
            }

            return mse;
        }
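
For reference, the while-loop above unrolls the n-step return backwards through the stacked transitions: R starts at V(s1) (or 0 when the episode ended) and each pop applies R = reward + gamma * R, so the oldest transition is trained against the deepest bootstrapped return, in the usual A3C n-step actor-critic fashion.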
Example #4
        virtual public double Train(Vector p_state0, int p_action, Vector p_state1, float p_reward, bool p_final)
        {
            double mse          = 0;
            int    criticOutput = _network.Output.Size - 1; // critic value sits in the last output element

            // V(s1) for bootstrapping; zero when the episode terminated
            _network.Activate(p_state1);
            float value1 = p_final ? 0f : _network.Output[criticOutput];

            // V(s0), plus a copy of the outputs to serve as the training target
            _network.Activate(p_state0);
            float value0 = _network.Output[criticOutput];

            Vector target = Vector.Copy(_network.Output);

            target[p_action]     = p_reward + _gamma * value1 - value0; // TD error, used as the advantage
            target[criticOutput] = p_reward + _gamma * value1;          // one-step TD target for the critic

            mse = _optimizer.Train(p_state0, target);

            Vector.Release(target);

            return mse;
        }
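
For reference, the single output vector in this one-step variant carries both heads: the last element (criticOutput) is trained toward the TD target reward + gamma * V(s1), while the chosen action's element is trained toward the TD error reward + gamma * V(s1) - V(s0), which acts as the advantage estimate for the actor.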
Example #5
        public double Train(Vector p_state0, int p_action0, Vector p_state1, float p_reward, bool final = false)
        {
            float mse     = 0;
            float maxQs1a = CalcMaxQa(p_state1);

            // forward pass for Q(s0, ·); copy the outputs so the target does not alias the network's live output buffer
            _network.Activate(p_state0);

            Vector target = Vector.Copy(_network.Output);

            if (final)
            {
                target[p_action0] = p_reward;
            }
            else
            {
                target[p_action0] = p_reward + _gamma * maxQs1a;
            }

            mse = _optimizer.Train(p_state0, target);
            Vector.Release(target);

            return mse;
        }
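
Finally, a minimal sketch of how a Train method with this signature might be driven from an episode loop. Everything here apart from the Train signature (the agent and env objects, ChooseAction, Reset, Step, and StepResult) is hypothetical, shown only to illustrate the calling convention:

            // Hypothetical episode loop; only the Train signature comes from the examples above.
            Vector state = env.Reset();
            bool done = false;

            while (!done)
            {
                int action = agent.ChooseAction(state);   // hypothetical policy, e.g. epsilon-greedy
                StepResult s = env.Step(action);          // hypothetical: next state, reward, terminal flag

                agent.Train(state, action, s.State, s.Reward, s.Final);

                state = s.State;
                done  = s.Final;
            }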