/// <summary>
/// Performs one Q-learning update for the transition (s0, a0, r, s1).
/// Builds a TD target from the target network's max-Q estimate, trains the
/// online network on it, and triggers an async parameter sync when due.
/// </summary>
/// <param name="p_state0">State in which the action was taken.</param>
/// <param name="p_action0">Index of the action taken in p_state0.</param>
/// <param name="p_state1">Resulting state after the action.</param>
/// <param name="p_reward">Immediate reward received.</param>
/// <param name="p_final">True when p_state1 is terminal (no bootstrapping).</param>
/// <returns>Mean squared error of the training step.</returns>
public double Train(Vector p_state0, int p_action0, Vector p_state1, float p_reward, bool p_final = false)
{
    // Bootstrapped estimate of the best achievable value from the next state.
    float maxQs1a = CalcMaxQa(p_state1);

    // Start the target from the network's own predictions so that only the
    // taken action's entry contributes to the loss.
    _networkQ.Activate(p_state0);
    Vector target = Vector.Copy(_networkQ.Output);

    // Terminal transitions receive the raw reward; otherwise bootstrap.
    target[p_action0] = p_final ? p_reward : p_reward + _gamma * maxQs1a;

    float mse = _optimizer.Train(p_state0, target);
    Vector.Release(target);

    // Let the async unit decide whether shared parameters should be synced now.
    _asyncUnit.Update(p_final);
    if (_asyncUnit.IsAsyncReady)
    {
        _optimizer.AsyncUpdate();
    }

    return mse;
}
/// <summary>
/// Computes the TD training target vector for Q(s,a): a copy of the network's
/// current predictions for p_s0 with the taken action's entry replaced by the
/// (optionally bootstrapped) TD target.
/// </summary>
/// <param name="p_s0">State in which the action was taken.</param>
/// <param name="p_s1">Resulting state after the action.</param>
/// <param name="p_action">Index of the taken action.</param>
/// <param name="p_reward">Immediate reward received.</param>
/// <param name="p_final">True when p_s1 is terminal (no bootstrapping).</param>
/// <returns>Target vector; caller is responsible for releasing it.</returns>
virtual protected Vector CalcTarget(Vector p_s0, Vector p_s1, int p_action, float p_reward, bool p_final)
{
    // Bootstrapped estimate of the next state's best action value.
    float maxQs1a = CalcMaxQa(p_s1);

    // Copy the current predictions so only the taken action incurs a loss.
    _Qnetwork.Activate(p_s0);
    Vector target = Vector.Copy(_Qnetwork.Output);

    // Terminal transitions get the raw reward; otherwise bootstrap with gamma.
    target[p_action] = p_final ? p_reward : p_reward + _gamma * maxQs1a;

    return target;
}
/// <summary>
/// n-step actor-critic training step (async/A3C-style). Transitions are pushed
/// onto a stack and, once the async unit signals readiness (or the episode
/// ends), unwound newest-first while accumulating the discounted return R.
/// The last output slot of the network is the critic head.
/// </summary>
/// <param name="p_state0">State in which the action was taken.</param>
/// <param name="p_action">Index of the taken action.</param>
/// <param name="p_state1">Resulting state after the action.</param>
/// <param name="p_reward">Immediate reward received.</param>
/// <param name="p_final">True when p_state1 is terminal.</param>
/// <returns>Accumulated MSE over all trained transitions (0 if none trained).</returns>
override public double Train(Vector p_state0, int p_action, Vector p_state1, float p_reward, bool p_final)
{
    double mse = 0;

    // Defer the update: remember the transition (with its own action) and
    // train in a batch when the async unit says so.
    _stack.Push(new StackItem { State = Vector.Copy(p_state0), Action = p_action, Reward = p_reward });

    _asyncUnit.Update(p_final);

    if (_asyncUnit.IsAsyncReady)
    {
        int criticOutput = _thNetwork.Output.Size - 1;

        // Bootstrap the return from the critic's value of the newest state,
        // unless the episode terminated there.
        float value1 = _thNetwork.Activate(p_state1)[criticOutput];
        float R = p_final ? 0f : value1;

        // Unwind stored transitions newest-first, accumulating the return.
        while (_stack.Count > 0)
        {
            StackItem item = (StackItem)_stack.Pop();
            R = item.Reward + _gamma * R;

            Vector target = Vector.Copy(_thNetwork.Activate(item.State));
            // BUGFIX: the advantage baseline must be the critic's value of the
            // popped item's state, not a re-activation of p_state0. It is
            // already in the critic slot of target before we overwrite it.
            float value0 = target[criticOutput];
            // BUGFIX: credit the action stored with this transition, not the
            // action of the most recent call (p_action), which was wrong for
            // every older item on the stack.
            target[item.Action] = R - value0;
            target[criticOutput] = R;

            mse += _thOptimizer.Train(item.State, target);

            Vector.Release(item.State);
            Vector.Release(target);
        }

        // Push thread-local gradients to the shared optimizer, then pull the
        // shared parameters back into the thread network.
        _optimizer.AsyncUpdate(_thOptimizer);
        _thNetwork.OverrideParams(_network);
    }

    return mse;
}
/// <summary>
/// One-step actor-critic (A2C-style) training step. The last output slot of
/// the network is the critic head; the actor head for the taken action is
/// trained toward the TD advantage, the critic toward the TD target.
/// </summary>
/// <param name="p_state0">State in which the action was taken.</param>
/// <param name="p_action">Index of the taken action.</param>
/// <param name="p_state1">Resulting state after the action.</param>
/// <param name="p_reward">Immediate reward received.</param>
/// <param name="p_final">True when p_state1 is terminal (no bootstrapping).</param>
/// <returns>Mean squared error of the training step.</returns>
virtual public double Train(Vector p_state0, int p_action, Vector p_state1, float p_reward, bool p_final)
{
    double mse = 0;
    int criticOutput = _network.Output.Size - 1;

    // BUGFIX: p_final was previously ignored, so terminal transitions still
    // bootstrapped _gamma * V(s1) past the end of the episode. A terminal
    // state has no future value; all sibling Train() methods zero it.
    _network.Activate(p_state1);
    float value1 = p_final ? 0f : _network.Output[criticOutput];

    _network.Activate(p_state0);
    float value0 = _network.Output[criticOutput];

    Vector target = Vector.Copy(_network.Output);
    target[p_action] = p_reward + _gamma * value1 - value0; // advantage for the actor head
    target[criticOutput] = p_reward + _gamma * value1;      // TD target for the critic head

    mse = _optimizer.Train(p_state0, target);
    Vector.Release(target);

    return mse;
}
/// <summary>
/// One Q-learning update for the transition (s0, a0, r, s1): trains the
/// network toward a TD target built from the max-Q estimate of the next state.
/// </summary>
/// <param name="p_state0">State in which the action was taken.</param>
/// <param name="p_action0">Index of the action taken in p_state0.</param>
/// <param name="p_state1">Resulting state after the action.</param>
/// <param name="p_reward">Immediate reward received.</param>
/// <param name="final">True when p_state1 is terminal (no bootstrapping).</param>
/// <returns>Mean squared error of the training step.</returns>
public double Train(Vector p_state0, int p_action0, Vector p_state1, float p_reward, bool final = false)
{
    // Bootstrapped estimate of the best achievable value from the next state.
    float maxQs1a = CalcMaxQa(p_state1);

    _network.Activate(p_state0);
    // BUGFIX: copy the output before mutating it. The original aliased the
    // network's live Output vector, so writing the target clobbered the
    // activation handed to the optimizer alongside it. This also mirrors the
    // sibling Train() which copies and releases the target.
    Vector target = Vector.Copy(_network.Output);

    // Terminal transitions receive the raw reward; otherwise bootstrap.
    target[p_action0] = final ? p_reward : p_reward + _gamma * maxQs1a;

    float mse = _optimizer.Train(p_state0, target);
    Vector.Release(target);

    return mse;
}
/// <summary>
/// Returns max over actions of the target network's Q-estimate for p_state,
/// i.e. the largest component of the output after activating the network.
/// </summary>
/// <param name="p_state">State to evaluate.</param>
/// <returns>Maximum Q-value across all output entries.</returns>
virtual protected float CalcMaxQa(Vector p_state)
{
    _networkQt.Activate(p_state);
    Vector output = _networkQt.Output;

    // Seed with the first entry, then scan the rest for a larger one.
    float best = output[0];
    for (int i = 1; i < output.Size; i++)
    {
        if (output[i] > best)
        {
            best = output[i];
        }
    }

    return best;
}