public double Train(Vector p_state0, int p_action0, Vector p_state1, float p_reward, bool p_final = false)
{
    float mse = 0;
    float maxQs1a = CalcMaxQa(p_state1);

    // updating phase for Q(s,a): start from the current prediction and replace the value of the taken action
    _networkQ.Activate(p_state0);
    Vector target = Vector.Copy(_networkQ.Output);

    if (p_final)
    {
        // terminal transition: no bootstrap from the next state
        target[p_action0] = p_reward;
    }
    else
    {
        target[p_action0] = p_reward + _gamma * maxQs1a;
    }

    mse = _optimizer.Train(p_state0, target);
    Vector.Release(target);

    // push the accumulated gradients to the shared parameters once enough steps have been collected
    _asyncUnit.Update(p_final);

    if (_asyncUnit.IsAsyncReady)
    {
        _optimizer.AsyncUpdate();
    }

    return mse;
}
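// Both Q-learning variants in this section bootstrap through CalcMaxQa, which is not shown here.
// A minimal sketch, assuming it simply evaluates the value network on the next state and returns
// the largest action value; the body is an illustration, not the project's actual implementation.
private float CalcMaxQa(Vector p_state)
{
    _networkQ.Activate(p_state);

    float maxQa = float.MinValue;
    for (int i = 0; i < _networkQ.Output.Size; i++)
    {
        if (_networkQ.Output[i] > maxQa)
        {
            maxQa = _networkQ.Output[i];
        }
    }

    return maxQa;
}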
public float Train(Vector p_state0, int p_action0, Vector p_state1, float p_reward, bool p_final = false)
{
    float mse = 0;

    // keep the replay buffer bounded: drop (and release) the oldest transition when full
    if (_replayBuffer.Count == _replayBufferCapacity)
    {
        ReplayBufferElem e = _replayBuffer.Dequeue();
        Vector.Release(e.S0);
        Vector.Release(e.S1);
    }

    _replayBuffer.Enqueue(new ReplayBufferElem
    {
        S0 = Vector.Copy(p_state0),
        S1 = Vector.Copy(p_state1),
        Action = p_action0,
        Reward = p_reward,
        Final = p_final
    });

    // train on a random mini-batch once enough transitions are stored and a full batch interval has elapsed
    if (_replayBuffer.Count >= _batchSize && _batchIndex == _batchSize)
    {
        _batchIndex = 0;
        _sample = _replayBuffer.OrderBy(x => _rnd.Next()).Take(_batchSize);

        Stopwatch watch = Stopwatch.StartNew();

        foreach (ReplayBufferElem b in _sample)
        {
            Vector t = CalcTarget(b.S0, b.S1, b.Action, b.Reward, b.Final);
            mse += _optimizer.Train(b.S0, t);   // accumulate the batch error so the caller gets a meaningful value
            Vector.Release(t);
        }

        watch.Stop();
#if !DEBUG
        Console.WriteLine("Training time [ms] " + watch.ElapsedMilliseconds);
#endif
    }

    _batchIndex++;

    // periodically copy the online network into the frozen target network
    if (_qtUpdateIndex == _qtUpdateSize)
    {
        _qtUpdateIndex = 0;
        _QTnetwork.OverrideParams(_Qnetwork);
    }

    _qtUpdateIndex++;

    return mse;
}
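// The mini-batch loop above delegates target construction to CalcTarget. A sketch under the
// assumption that the frozen copy _QTnetwork supplies the bootstrap value while the online
// _Qnetwork provides the remaining (unchanged) action values; member names follow the snippet,
// the body itself is an assumption.
private Vector CalcTarget(Vector p_state0, Vector p_state1, int p_action, float p_reward, bool p_final)
{
    // max_a' Q_target(s1, a') from the target network
    _QTnetwork.Activate(p_state1);
    float maxQs1a = float.MinValue;
    for (int i = 0; i < _QTnetwork.Output.Size; i++)
    {
        maxQs1a = Math.Max(maxQs1a, _QTnetwork.Output[i]);
    }

    // start from the online prediction and overwrite only the taken action
    _Qnetwork.Activate(p_state0);
    Vector target = Vector.Copy(_Qnetwork.Output);
    target[p_action] = p_final ? p_reward : p_reward + _gamma * maxQs1a;

    return target;
}

// The buffered transition itself; the field names are taken directly from the object initializer above,
// the type declaration is a reconstruction.
public class ReplayBufferElem
{
    public Vector S0;
    public Vector S1;
    public int Action;
    public float Reward;
    public bool Final;
}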
override public double Train(Vector p_state0, int p_action, Vector p_state1, float p_reward, bool p_final)
{
    double mse = 0;

    // buffer the transition; the n-step update is performed in one pass over the stack
    _stack.Push(new StackItem
    {
        State = Vector.Copy(p_state0),
        Action = p_action,
        Reward = p_reward
    });

    _asyncUnit.Update(p_final);

    if (_asyncUnit.IsAsyncReady)
    {
        // the last output of the shared network is the critic value, the rest belongs to the actor
        int criticOutput = _thNetwork.Output.Size - 1;
        float value1 = _thNetwork.Activate(p_state1)[criticOutput];

        // bootstrap the n-step return from the next state unless the episode ended
        float R = p_final ? 0f : value1;

        while (_stack.Count > 0)
        {
            StackItem item = (StackItem)_stack.Pop();
            R = item.Reward + _gamma * R;

            Vector target = Vector.Copy(_thNetwork.Activate(item.State));
            float value0 = target[criticOutput];    // critic value of the stored state

            // advantage for the stored action, n-step return for the critic head
            target[item.Action] = R - value0;
            target[criticOutput] = R;

            mse += _thOptimizer.Train(item.State, target);

            Vector.Release(item.State);
            Vector.Release(target);
        }

        // push the accumulated gradients to the shared optimizer and re-sync the thread-local copy
        _optimizer.AsyncUpdate(_thOptimizer);
        _thNetwork.OverrideParams(_network);
    }

    return mse;
}
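// The per-step record pushed onto _stack in the n-step update above; the field names follow
// the object initializer in the snippet, the type declaration itself is a reconstruction.
public class StackItem
{
    public Vector State;
    public int Action;
    public float Reward;
}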
virtual public double Train(Vector p_state0, int p_action, Vector p_state1, float p_reward, bool p_final)
{
    double mse = 0;

    // the last output of the network is the critic value, the remaining outputs belong to the actor
    int criticOutput = _network.Output.Size - 1;

    // critic value of the next state; a terminal state contributes no bootstrap value
    _network.Activate(p_state1);
    float value1 = p_final ? 0f : _network.Output[criticOutput];

    _network.Activate(p_state0);
    float value0 = _network.Output[criticOutput];

    Vector target = Vector.Copy(_network.Output);

    // one-step advantage for the taken action, TD target for the critic head
    target[p_action] = p_reward + _gamma * value1 - value0;
    target[criticOutput] = p_reward + _gamma * value1;

    mse = _optimizer.Train(p_state0, target);
    Vector.Release(target);

    return mse;
}
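// Both actor-critic variants treat the last network output as the critic value and the first
// Size - 1 outputs as the actor head. A hypothetical action-selection helper under that layout,
// sampling from a softmax over the actor outputs; it is not part of the original snippets, and
// _rnd is assumed to be a System.Random instance.
private int ChooseAction(Vector p_state)
{
    _network.Activate(p_state);
    int actionCount = _network.Output.Size - 1;

    // softmax over the actor outputs
    double sum = 0;
    double[] p = new double[actionCount];
    for (int i = 0; i < actionCount; i++)
    {
        p[i] = Math.Exp(_network.Output[i]);
        sum += p[i];
    }

    // sample an action index proportionally to its probability
    double r = _rnd.NextDouble() * sum;
    for (int i = 0; i < actionCount; i++)
    {
        r -= p[i];
        if (r <= 0)
        {
            return i;
        }
    }

    return actionCount - 1;
}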
public double Train(Vector p_state0, int p_action0, Vector p_state1, float p_reward, bool p_final = false)
{
    float mse = 0;
    float maxQs1a = CalcMaxQa(p_state1);

    // updating phase for Q(s,a): copy the current prediction and replace the value of the taken action
    _network.Activate(p_state0);
    Vector target = Vector.Copy(_network.Output);

    if (p_final)
    {
        // terminal transition: no bootstrap from the next state
        target[p_action0] = p_reward;
    }
    else
    {
        target[p_action0] = p_reward + _gamma * maxQs1a;
    }

    mse = _optimizer.Train(p_state0, target);
    Vector.Release(target);

    return mse;
}
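// A sketch of a typical call site for these Train methods. The environment API (Reset, Step,
// Observation, Reward, Done) and the agent's GetAction method are assumptions used only to show
// how the (s0, action, s1, reward, final) arguments line up with an episode loop.
Vector state = env.Reset();
bool done = false;

while (!done)
{
    int action = agent.GetAction(state);    // e.g. epsilon-greedy over the Q-values
    env.Step(action);

    Vector nextState = env.Observation;
    done = env.Done;

    // one learning step per environment transition
    agent.Train(state, action, nextState, env.Reward, done);

    state = nextState;
}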