예제 #1
0
    /// <summary>
    /// Adjusts the table entry for the pair (<paramref name="s"/>, <paramref name="a"/>)
    /// toward <c>reward + gamma * (largest table value found for next_s)</c>,
    /// moving it by a fraction <c>alpha</c> of the difference.
    /// </summary>
    /// <param name="s">State whose table row is updated.</param>
    /// <param name="a">Action whose table column is updated.</param>
    /// <param name="next_s">State used to look up the best follow-up value.</param>
    /// <param name="reward">Reward added to the discounted follow-up value.</param>
    void Learn(Stat s, ActIdx a, Stat next_s, float reward)
    {
        // Largest table value reachable from next_s, as reported by MaxQActIdx.
        float best_next_q = MaxQActIdx(next_s).q;

        int row = s.Idx();
        int col = a.Idx();

        // Same arithmetic order as a single expression:
        // (reward + gamma * best) - current, then scaled by alpha.
        float target = reward + gamma * best_next_q;
        float error  = target - q_table[row, col];

        q_table[row, col] += alpha * error;
    }
예제 #2
0
    /// <summary>
    /// Processes one step: builds a <c>Stat</c> from <paramref name="states"/> and
    /// <c>isRed</c>, runs a learning update against the previously stored state and
    /// action when <c>is_started</c> is already set, then picks the next action via
    /// <c>Policy</c> and stores the new state and action for the following call.
    /// </summary>
    /// <param name="states">Input used to construct the new <c>Stat</c>.</param>
    /// <returns>The result of <c>Actions(...)</c> on the chosen action index for the new state.</returns>
    public Actions RunStep(States states)
    {
        Stat observed = new Stat(states, isRed);
        next_stat = observed;

        // No learning update while is_started is still false: the stored
        // stat / act_idx pair is only written further down in this method.
        if (is_started)
        {
            float step_reward = Reward(stat, observed);
            Learn(stat, act_idx, observed, step_reward);
        }
        is_started = true;

        // Choose the action for the new state, then remember both for the next call.
        act_idx = Policy(observed);
        stat    = observed;

        return act_idx.Actions(observed);
    }
예제 #3
0
    /// <summary>
    /// Scans the table row for <paramref name="s"/> over action indices
    /// <c>0 .. ActIdx.max_idx - 1</c> and returns the largest value found together
    /// with its action index. On ties the lowest index wins. If no entry is
    /// strictly greater than negative infinity, the result is negative infinity
    /// paired with action index 0.
    /// </summary>
    /// <param name="s">State whose table row is scanned.</param>
    /// <returns>A <c>QActIdx</c> holding the best value and the matching action index.</returns>
    QActIdx MaxQActIdx(Stat s)
    {
        int row = s.Idx();

        int   best_idx = 0;
        float best_q   = float.NegativeInfinity;

        for (int idx = 0; idx < ActIdx.max_idx; ++idx)
        {
            float candidate = q_table[row, idx];

            // Negated strict '>' on purpose: a NaN candidate compares false and is
            // skipped, and equal values never replace an earlier index.
            if (!(candidate > best_q))
            {
                continue;
            }

            best_q   = candidate;
            best_idx = idx;
        }

        return new QActIdx(best_q, new ActIdx(best_idx));
    }
예제 #4
0
 /// <summary>
 /// Creates a value/action-index pair.
 /// </summary>
 /// <param name="q">Value stored in <c>q</c>.</param>
 /// <param name="a_idx">Action index stored in <c>a_idx</c>.</param>
 public QActIdx(float q, ActIdx a_idx)
 {
     this.a_idx = a_idx;
     this.q     = q;
 }