Example #1
 protected void Reset()
 {
     // reset the agent's policy and value function
     Q = Tembo.ArrayOfZeros(NS * NA);
     if (Options.QInitVal != 0)
     {
         Tembo.SetConst(Q, Options.QInitVal);
     }
     P = Tembo.ArrayOfZeros(NS * NA);
     E = Tembo.ArrayOfZeros(NS * NA);
     // model/planning vars
     EnvModelS = Tembo.ArrayOfZeros(NS * NA);
     Tembo.SetConst(EnvModelS, -1); // init to -1 so we can test if we saw the state before
     EnvModelR = Tembo.ArrayOfZeros(NS * NA);
     SaSeen    = new double[] { };            // state-action pairs actually experienced, used by planning
     PQ        = Tembo.ArrayOfZeros(NS * NA); // planning priority for each (s,a) pair
     // initialize uniform random policy
     for (var s = 0; s < NS; s++)
     {
         var poss = AllowedActions(s);
         for (var i = 0; i < poss.Length; i++)
         {
             P[poss[i] * NS + s] = 1.0 / poss.Length;
         }
     }
     // agent memory, needed for streaming updates
     // (s0,a0,r0,s1,a1,r1,...); 999999999 is a sentinel for "not yet observed"
     r0 = 999999999;
     s0 = 999999999;
     s1 = 999999999;
     a0 = 999999999;
     a1 = 999999999;
 }
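
The Q, P, E, and model tables above are flat arrays of length NS * NA, indexed as action * NS + state (visible in the policy-initialization loop). A minimal sketch of the two Tembo helpers this snippet leans on, inferred purely from how they are called here rather than from the library's actual source:

 public static class Tembo
 {
     // allocate a zero-filled double array of length n
     // (C# arrays are zero-initialized, so no explicit fill is needed)
     public static double[] ArrayOfZeros(int n)
     {
         return new double[n];
     }

     // overwrite every element of arr with the constant c, e.g.
     // SetConst(EnvModelS, -1) marks every (s,a) pair as "never seen"
     public static void SetConst(double[] arr, double c)
     {
         for (var i = 0; i < arr.Length; i++)
         {
             arr[i] = c;
         }
     }
 }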
Example #2
 public Matrix(int numberOfRows, int numberOfColumns, string id = "")
 {
     Rows    = numberOfRows;
     Columns = numberOfColumns;
     W       = Tembo.ArrayOfZeros(numberOfRows * numberOfColumns); // matrix values, stored flat
     DW      = Tembo.ArrayOfZeros(numberOfRows * numberOfColumns); // gradient buffer of the same shape
     Id      = id != "" ? id : Tembo.GetId(); // fall back to a generated id when none is supplied
 }
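
A short usage sketch of this constructor; the only assumption is that callers treat W and DW as flat Rows * Columns buffers, which the allocation implies:

 // explicit id
 var w1 = new Matrix(3, 4, "W1");
 // omitting the id falls back to Tembo.GetId()
 var w2 = new Matrix(4, 1);
 // both buffers hold Rows * Columns entries
 Console.WriteLine(w1.W.Length);  // 12
 Console.WriteLine(w2.DW.Length); // 4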
Example #3
        // update Q(s0,a0) from one observed (s0,a0,r0,s1,a1) transition;
        // lambda in [0,1] is the eligibility-trace decay (0 disables traces)
        private void LearnFromTuple(Experience exp, double lambda)
        {
            var sa     = exp.PreviousAction * NS + exp.PreviousStateInt;
            var target = 0.0;

            // calculate the target for Q(s,a)
            if (Options.Update == "qlearn")
            {
                // Q learning target is Q(s0,a0) = r0 + gamma * max_a Q[s1,a]
                var poss = AllowedActions(exp.CurrentStateInt);
                var qmax = 0.0;
                for (var i = 0; i < poss.Length; i++)
                {
                    var s1a  = poss[i] * NS + exp.CurrentStateInt;
                    var qval = Q[s1a];
                    if (i == 0 || qval > qmax)
                    {
                        qmax = qval;
                    }
                }
                target = exp.PreviousReward + Options.Gamma * qmax;
            }
            else if (Options.Update == "sarsa")
            {
                // SARSA target is Q(s0,a0) = r0 + gamma * Q[s1,a1]
                var s1a1 = exp.CurrentAction * NS + exp.CurrentStateInt;
                target = exp.PreviousReward + Options.Gamma * Q[s1a1];
            }
            if (lambda > 0)
            {
                // perform an eligibility trace update
                if (Options.ReplacingTraces)
                {
                    E[sa] = 1;
                }
                else
                {
                    E[sa] += 1;
                }
                var edecay       = lambda * Options.Gamma;
                var state_update = Tembo.ArrayOfZeros(NS);
                for (var s = 0; s < NS; s++)
                {
                    var poss = AllowedActions(s);
                    for (var i = 0; i < poss.Length; i++)
                    {
                        var a      = poss[i];
                        var saloop = a * NS + s;
                        var esa    = E[saloop];
                        var update = Options.Alpha * esa * (target - Q[saloop]);
                        Q[saloop] += update;
                        UpdatePriority(s, a, update);
                        E[saloop] *= edecay;
                        var u = Math.Abs(update);
                        if (u > state_update[s])
                        {
                            state_update[s] = u;
                        }
                    }
                }
                for (var s = 0; s < NS; s++)
                {
                    if (state_update[s] > 1e-5)
                    { // only refresh the policy where Q actually moved
                        UpdatePolicy(s);
                    }
                }
                if (Explored && Options.Update == "qlearn")
                {
                    // have to wipe the trace since q learning is off-policy :(
                    E = Tembo.ArrayOfZeros(NS * NA);
                }
            }
            else
            {
                // simpler and faster update without eligibility trace
                // update Q[sa] towards it with some step size
                var update = Options.Alpha * (target - Q[sa]);
                Q[sa] += update;
                UpdatePriority(exp.PreviousStateInt, exp.PreviousAction, update);
                // update the policy to reflect the change (if appropriate)
                UpdatePolicy(exp.PreviousStateInt);
            }
        }
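
A usage sketch for the method above. The Experience shape is hypothetical, reconstructed from the five fields the method reads; the call is assumed to happen inside the same agent class:

        // hypothetical data holder, inferred from the fields accessed above
        public class Experience
        {
            public int    PreviousStateInt { get; set; } // s0
            public int    PreviousAction   { get; set; } // a0
            public double PreviousReward   { get; set; } // r0
            public int    CurrentStateInt  { get; set; } // s1
            public int    CurrentAction    { get; set; } // a1, only read for the "sarsa" target
        }

        // inside the agent, after observing (s0,a0,r0,s1,a1):
        var exp = new Experience
        {
            PreviousStateInt = 3,
            PreviousAction   = 1,
            PreviousReward   = -1.0,
            CurrentStateInt  = 4,
            CurrentAction    = 2
        };
        LearnFromTuple(exp, 0.0); // lambda = 0 takes the fast, trace-free path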