// One-step Q-learning update: target Q(s0, a0) = r + gamma * max_a Q(s1, a)
public double Train(Vector p_state0, int p_action0, Vector p_state1, float p_reward, bool p_final = false)
{
    float mse = 0;
    float maxQs1a = CalcMaxQa(p_state1);

    // updating phase for Q(s,a): copy the current network outputs and
    // overwrite only the entry for the action actually taken
    _networkQ.Activate(p_state0);
    Vector target = Vector.Copy(_networkQ.Output);

    if (p_final)
    {
        // terminal transition: no bootstrapped future value
        target[p_action0] = p_reward;
    }
    else
    {
        target[p_action0] = p_reward + _gamma * maxQs1a;
    }

    mse = _optimizer.Train(p_state0, target);
    Vector.Release(target);

    // accumulate gradients locally; flush them to the shared network
    // once the asynchronous update interval (or episode end) is reached
    _asyncUnit.Update(p_final);
    if (_asyncUnit.IsAsyncReady)
    {
        _optimizer.AsyncUpdate();
    }

    return mse;
}
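The `_asyncUnit` gate is used here but not defined in this listing. Below is a minimal sketch of how such a gate could work, assuming it simply counts steps and signals readiness every `t_max` steps or at episode end; the class name and `Update`/`IsAsyncReady` members are taken from the usage above, while the interval parameter and internals are assumptions, not the library's actual implementation.

// Hypothetical async-update gate (assumption: the real AsyncUnit may differ).
// It signals readiness every tMax steps or when an episode ends, which is
// when the accumulated local gradients are pushed to the shared network.
public class AsyncUnit
{
    private readonly int _tMax;   // assumed update interval
    private int _steps;

    public bool IsAsyncReady { get; private set; }

    public AsyncUnit(int tMax = 5)
    {
        _tMax = tMax;
    }

    public void Update(bool final)
    {
        _steps++;
        IsAsyncReady = final || _steps >= _tMax;
        if (IsAsyncReady)
        {
            _steps = 0;
        }
    }
}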
// A3C-style n-step actor-critic update: transitions are buffered on a stack,
// then unrolled in reverse to build n-step return targets for the critic and
// advantage targets for the actor
override public double Train(Vector p_state0, int p_action, Vector p_state1, float p_reward, bool p_final)
{
    double mse = 0;

    _stack.Push(new StackItem { State = Vector.Copy(p_state0), Action = p_action, Reward = p_reward });

    _asyncUnit.Update(p_final);
    if (_asyncUnit.IsAsyncReady)
    {
        // the critic value is the last output of the shared actor-critic network
        int criticOutput = _thNetwork.Output.Size - 1;

        // bootstrap from V(s1) unless the episode terminated
        float R = p_final ? 0f : _thNetwork.Activate(p_state1)[criticOutput];

        // unroll the buffered transitions newest-first, accumulating the n-step return
        while (_stack.Count > 0)
        {
            StackItem item = (StackItem)_stack.Pop();
            R = item.Reward + _gamma * R;

            Vector target = Vector.Copy(_thNetwork.Activate(item.State));
            float value0 = target[criticOutput];   // V(s) for the stored state

            target[item.Action] = R - value0;      // advantage target for the actor head
            target[criticOutput] = R;              // n-step return target for the critic head

            mse += _thOptimizer.Train(item.State, target);
            Vector.Release(item.State);
            Vector.Release(target);
        }

        // push the accumulated thread-local gradients to the shared optimizer
        // and resynchronize the thread network with the shared parameters
        _optimizer.AsyncUpdate(_thOptimizer);
        _thNetwork.OverrideParams(_network);
    }

    return mse;
}
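The reverse unrolling above implements the standard n-step return R_t = r_t + gamma * R_{t+1}, seeded with the critic's bootstrap estimate V(s_{t+n}) (or 0 on a terminal state). The following self-contained sketch isolates just that accumulation; the reward values and bootstrap estimate are illustrative, not taken from the library.

using System;
using System.Collections.Generic;

// Demonstrates the n-step return accumulation used in the Train method:
// R = r + gamma * R, applied newest-first so each earlier step folds in
// the discounted value of everything that came after it.
public static class NStepReturnDemo
{
    public static void Main()
    {
        const float gamma = 0.99f;

        // rewards in collection order oldest..newest; the Stack constructor
        // pushes in that order, so Pop() yields the newest reward first
        var rewards = new Stack<float>(new[] { 1f, 0f, 0f, 2f });

        float R = 0.5f; // bootstrap: critic estimate V(s_n) of the last state

        while (rewards.Count > 0)
        {
            R = rewards.Pop() + gamma * R;
            Console.WriteLine($"n-step return: {R}");
        }
    }
}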