/// <summary>
/// Applies one temporal-difference update to the Q-table entry for the
/// state/action pair (<paramref name="s"/>, <paramref name="a"/>):
/// <c>Q[s, a] += alpha * (reward + gamma * q' - Q[s, a])</c>, where <c>q'</c>
/// is the <c>q</c> field of the value <c>MaxQActIdx</c> reports for
/// <paramref name="next_s"/>.
/// </summary>
/// <param name="s">State in which the action was taken.</param>
/// <param name="a">Action that was taken in <paramref name="s"/>.</param>
/// <param name="next_s">State observed after taking the action.</param>
/// <param name="reward">Reward received for the transition.</param>
void Learn(Stat s, ActIdx a, Stat next_s, float reward) {
    // Presumably the best action/value pair available from next_s; verify against MaxQActIdx.
    QActIdx bestNext = MaxQActIdx(next_s);

    // Resolve the table coordinates once; they are used for both the read and the write.
    var row = s.Idx();
    var col = a.Idx();

    // TD error: bootstrapped target minus the current estimate.
    var tdError = reward + gamma * bestNext.q - q_table[row, col];

    // Move the current estimate toward the target by the step size alpha.
    q_table[row, col] += alpha * tdError;
}
/// <summary>
/// Chooses an action for state <paramref name="s"/>: on a fraction of calls
/// a randomly drawn action (exploration), otherwise the action reported by
/// <c>MaxQActIdx</c> for <paramref name="s"/> (exploitation).
/// </summary>
/// <param name="s">State for which an action is requested.</param>
/// <returns>
/// Either a new <c>ActIdx</c> built from a random index, or the
/// <c>a_idx</c> field of the value returned by <c>MaxQActIdx</c>.
/// </returns>
ActIdx Policy(Stat s) {
    // FIXME (carried over from the original, which did not say what needs fixing;
    // presumably the hard-coded exploration rate below -- confirm with the author).
    const double explorationRate = 0.2;

    // NOTE(review): assumes Random is UnityEngine.Random, where Random.value is a
    // float in [0, 1] and the int overload of Random.Range excludes its upper bound.
    // If ActIdx.max_idx is itself a valid (inclusive) index, it can never be drawn
    // here -- verify against the definition of ActIdx.
    if (Random.value < explorationRate) {
        return new ActIdx(Random.Range(0, ActIdx.max_idx));
    }

    QActIdx best = MaxQActIdx(s);
    return best.a_idx;
}