// One-step tabular Q-learning update:
//   Q(s1, a) += alpha * (reward + gamma * max_a' Q(s2, a') - Q(s1, a))
void Learn(PolarCoord s1, HandleAction a, PolarCoord s2, float reward)
{
    const float alpha = 0.1f;  // learning rate
    const float gamma = 0.99f; // discount factor
    EnsureState(s1);           // guard against unseen states (helper sketched below)
    EnsureState(s2);
    QTable[s1][a] += alpha * (reward + gamma * QTable[s2].Values.Max() - QTable[s1][a]);
}
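// RunStep below calls a parameterless Learn() that the original snippet does
// not include. A minimal reconstruction, assuming it replays every transition
// in the History buffer through the one-step update above, re-scoring each
// transition with Reward(); the full-sweep replay is a guess, not confirmed
// by the source.
void Learn()
{
    foreach (var step in History)
    {
        Learn(step.S1, step.A, step.S2, Reward(step.S1, step.A, step.S2));
    }
}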
public override Actions RunStep(States states)
{
    PolarCoord nowState = (PolarCoord)StateFactory.FromRawState(states);
    HandleAction decision = (HandleAction)ActionFactory.Random(); // default: act randomly
    Debug(nowState.DebugStr());

    if (states.episode_i == CurrentEpisode)
    {
        // Same episode: record the latest transition, replay-learn, then act on policy.
        Store(PrevState, PrevAction, nowState);
        Learn(); // parameterless replay overload, sketched above
        decision = Policy(nowState);
    }
    else
    {
        // Episode boundary: the previous transition spans two episodes, so skip it.
        CurrentEpisode = states.episode_i;
    }

    PrevState = nowState;
    PrevAction = decision;

    // Fire only while ammo remains and the target sits within a 60-degree cone.
    bool shoot = states.bullet_num2 != 0 && nowState.RawAbsPhi < 60;
    return decision.ToRawAction(shoot);
}
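// Policy() is called above but not shown in the original snippet. A minimal
// epsilon-greedy sketch over the assumed QTable layout; the 0.1 exploration
// rate, the Rng field, and the assumption that HandleAction is an enum are
// all illustrative, not taken from the original.
HandleAction Policy(PolarCoord s)
{
    const float epsilon = 0.1f; // exploration probability (assumed value)
    EnsureState(s);
    if (Rng.NextDouble() < epsilon)
    {
        return (HandleAction)ActionFactory.Random(); // explore
    }
    // Exploit: the action with the highest Q-value in this state.
    return QTable[s].OrderByDescending(kv => kv.Value).First().Key;
}

// Seed a zero-valued action row for unseen states so indexing never throws.
void EnsureState(PolarCoord s)
{
    if (!QTable.ContainsKey(s))
    {
        QTable[s] = Enum.GetValues(typeof(HandleAction)).Cast<HandleAction>()
                        .ToDictionary(act => act, act => 0f);
    }
}

readonly Random Rng = new Random();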
// Shaped reward computed from the successor state: an angular penalty, an
// angular bonus, and a flat penalty for straying beyond 800 distance units.
float Reward(PolarCoord s1, HandleAction a, PolarCoord s2)
{
    float reward = 0;
    reward += -(s2.RawAbsPhi / 180f * 10f);   // up to -10 as |phi| grows
    reward += s2.RawAbsTheta / 180f * 10f;    // up to +10 as |theta| grows
    reward += s2.RawDistance > 800 ? -20 : 0; // flat penalty when too far away
    return reward;
}
// Bounded transition buffer: append the newest step, evict the oldest once
// the buffer exceeds bufsize.
void Store(PolarCoord s1, HandleAction a, PolarCoord s2)
{
    const int bufsize = 1000;
    History.Add(new Step<PolarCoord, HandleAction>(s1, a, s2));
    if (History.Count > bufsize)
    {
        History.RemoveAt(0);
    }
}
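// Supporting declarations assumed by the methods above. The original snippet
// does not show them, so the member names and initial values here are
// illustrative; the Step constructor shape is taken from the Store() call.
// Requires: using System; using System.Collections.Generic; using System.Linq;
class Step<TState, TAction>
{
    public TState  S1; // state before the action
    public TAction A;  // action taken
    public TState  S2; // state observed afterwards
    public Step(TState s1, TAction a, TState s2) { S1 = s1; A = a; S2 = s2; }
}

// Tabular Q-values: state -> (action -> estimated value).
readonly Dictionary<PolarCoord, Dictionary<HandleAction, float>> QTable =
    new Dictionary<PolarCoord, Dictionary<HandleAction, float>>();
readonly List<Step<PolarCoord, HandleAction>> History = new List<Step<PolarCoord, HandleAction>>();
PolarCoord PrevState;
HandleAction PrevAction;
int CurrentEpisode = -1; // sentinel so the first step is treated as a new episode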
// Alternative reward shaping. The original declares this as a second Reward()
// with a signature identical to the one above, which would not compile; the
// distinct name here is illustrative. Instead of a flat distance cutoff it
// uses a continuous distance term whose sign flips at |theta| = 90.
float RewardAlt(PolarCoord s1, HandleAction a, PolarCoord s2)
{
    return -s2.AbsPhi + s2.AbsTheta + s2.Distance * (s2.RawAbsTheta < 90 ? -1 : 1);
}