// Applies one Q-learning (temporal-difference) update for the observed
// transition (s, a) -> next_s with the given reward:
//   Q(s,a) <- Q(s,a) + alpha * (reward + gamma * max_a' Q(next_s, a') - Q(s,a))
void Learn(Stat s, ActIdx a, Stat next_s, float reward)
{
    int stateIdx = s.Idx();
    int actionIdx = a.Idx();

    // Greedy estimate of the next state's value (off-policy target).
    float bestNextQ = MaxQActIdx(next_s).q;
    float tdError = reward + gamma * bestNextQ - q_table[stateIdx, actionIdx];

    q_table[stateIdx, actionIdx] += alpha * tdError;
}
// Advances the agent by one step: builds the new state from the raw game
// states, learns from the previous (state, action) pair if one exists,
// then selects and returns the action for the new state.
public Actions RunStep(States states)
{
    next_stat = new Stat(states, isRed);

    // On the very first step there is no previous transition to learn from.
    if (is_started)
    {
        float reward = Reward(stat, next_stat);
        Learn(stat, act_idx, next_stat, reward);
    }
    is_started = true;

    // Choose the next action and roll the bookkeeping forward.
    act_idx = Policy(next_stat);
    stat = next_stat;
    return act_idx.Actions(next_stat);
}
// Scans every action for state s and returns the greedy choice: the
// highest Q-value together with its action index. Ties are broken in
// favor of the lowest index (strict '>' keeps the first maximum seen).
QActIdx MaxQActIdx(Stat s)
{
    int row = s.Idx();
    float bestQ = float.NegativeInfinity; // seed below any real Q-value
    ActIdx bestAction = new ActIdx(0);

    for (int action = 0; action < ActIdx.max_idx; action++)
    {
        float q = q_table[row, action];
        if (q > bestQ)
        {
            bestQ = q;
            bestAction = new ActIdx(action);
        }
    }

    return new QActIdx(bestQ, bestAction);
}
// Pairs a Q-value with the action index that produced it.
public QActIdx(float q, ActIdx a_idx)
{
    this.a_idx = a_idx;
    this.q = q;
}