/// <summary>
/// Performs one Q-learning step: observes the current state and reward, selects the
/// next action via <paramref name="policy"/>, and applies the temporal-difference
/// update Q(s,a) += Alpha * (r + Discount * max_a' Q(s',a') - Q(s,a)) to the
/// previously taken state/action pair.
/// </summary>
/// <param name="env">Environment providing the current state and the reward for the last transition.</param>
/// <param name="policy">Policy used to choose the next action given the current state and Q-function.</param>
/// <returns>The action selected for the current state.</returns>
/// <exception cref="InvalidOperationException">Thrown when the computed Q-update is NaN (diverging values).</exception>
public virtual ReinforcementLearningAction Step(ReinforcementLearningEnvironment env, ReinforcementLearningPolicy policy)
{
    ReinforcementLearningState state = env.State();
    float reward = env.Reward();
    ReinforcementLearningAction action = policy.ActionForState(state, qFunction);

    // Skip the update on the very first step: there is no previous transition yet.
    if ((prevState != null) && (prevAction != null))
    {
        // Only the max utility is needed for the TD target; the best action itself is discarded.
        ReinforcementLearningAction bestAction;
        float Qtp1max;
        qFunction.GetBestActionAndUtilityForState(state, out bestAction, out Qtp1max);

        float Qt = qFunction.Evaluate(prevState, prevAction);
        float deltaQ = Alpha * (reward + Discount * Qtp1max - Qt);

        // Guard against numerical divergence; a NaN here would silently poison the Q-store.
        if (float.IsNaN(deltaQ))
        {
            throw new InvalidOperationException("Q-learning update produced NaN (reward=" + reward + ", Qtp1max=" + Qtp1max + ", Qt=" + Qt + ").");
        }

        qFunction.ModifyValue(prevState, prevAction, deltaQ);
    }

    prevAction = action;
    prevState = state;
    return action;
}
/// <summary>
/// Inverted-pendulum specialization of the Q-learning step. Computes the updated
/// Q-value directly (newQ = Qt + Alpha * (r + Discount * max_a' Q(s',a') - Qt))
/// and writes it into the pendulum-specific Q-store via <c>ReplaceValue</c>
/// instead of the generic <c>ModifyValue</c> delta update.
/// </summary>
/// <param name="env">Environment providing the current state and the reward for the last transition.</param>
/// <param name="policy">Policy used to choose the next action given the current state and Q-function.</param>
/// <returns>The action selected for the current state.</returns>
/// <exception cref="InvalidOperationException">Thrown when the computed Q-value is NaN (diverging values).</exception>
/// <exception cref="InvalidCastException">Thrown when <c>qFunction</c> is not an <c>InvertedPendulumESIGMNQStore</c>.</exception>
public override ReinforcementLearningAction Step(ReinforcementLearningEnvironment env, ReinforcementLearningPolicy policy)
{
    ReinforcementLearningState state = env.State();
    float reward = env.Reward();
    ReinforcementLearningAction action = policy.ActionForState(state, qFunction);

    // Skip the update on the very first step: there is no previous transition yet.
    if ((prevState != null) && (prevAction != null))
    {
        // Only the max utility is needed for the TD target; the best action itself is discarded.
        ReinforcementLearningAction bestAction;
        float Qtp1max;
        qFunction.GetBestActionAndUtilityForState(state, out bestAction, out Qtp1max);

        float Qt = qFunction.Evaluate(prevState, prevAction);
        float newQ = Alpha * (reward + Discount * Qtp1max - Qt) + Qt;

        // Match the base implementation's divergence guard: a NaN here would
        // otherwise be written straight into the Q-store undetected.
        if (float.IsNaN(newQ))
        {
            throw new InvalidOperationException("Q-learning update produced NaN (reward=" + reward + ", Qtp1max=" + Qtp1max + ", Qt=" + Qt + ").");
        }

        // This specialization requires the pendulum-specific store; a mismatch throws InvalidCastException.
        InvertedPendulumESIGMNQStore qs = ((InvertedPendulumESIGMNQStore)qFunction);
        qs.ReplaceValue((InvertedPendulumState)prevState, (InvertedPendulumAction2)prevAction, newQ);
    }

    prevAction = action;
    prevState = state;
    return action;
}