protected void Reset()
{
    // reset the agent's policy and value function
    Q = Tembo.ArrayOfZeros(NS * NA);
    if (Options.QInitVal != 0)
    {
        Tembo.SetConst(Q, Options.QInitVal);
    }
    P = Tembo.ArrayOfZeros(NS * NA);
    E = Tembo.ArrayOfZeros(NS * NA);

    // model/planning variables
    EnvModelS = Tembo.ArrayOfZeros(NS * NA);
    Tembo.SetConst(EnvModelS, -1); // init to -1 so we can test whether a state-action pair has been seen before
    EnvModelR = Tembo.ArrayOfZeros(NS * NA);
    SaSeen = new double[] { };
    PQ = Tembo.ArrayOfZeros(NS * NA);

    // initialize a uniform random policy over the allowed actions of each state
    for (var s = 0; s < NS; s++)
    {
        var poss = AllowedActions(s);
        for (var i = 0; i < poss.Length; i++)
        {
            P[poss[i] * NS + s] = 1.0 / poss.Length;
        }
    }

    // agent memory, needed for streaming updates: (s0, a0, r0, s1, a1, r1, ...)
    // 999999999 acts as a sentinel for "not yet set"
    r0 = 999999999;
    s0 = 999999999;
    s1 = 999999999;
    a0 = 999999999;
    a1 = 999999999;
}
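All of the flat arrays above are indexed as action * NS + state, so the uniform initialization gives each allowed action of a state probability 1 / (number of allowed actions). The following standalone sketch shows just that initialization in isolation; the NS, NA values and the AllowedActions rule are made up for illustration and are not part of the agent above.

using System;

class UniformPolicyDemo
{
    const int NS = 4; // number of states (hypothetical)
    const int NA = 3; // number of actions (hypothetical)

    // hypothetical rule: action 2 is forbidden in state 0, everything else is allowed
    static int[] AllowedActions(int s) => s == 0 ? new[] { 0, 1 } : new[] { 0, 1, 2 };

    static void Main()
    {
        var P = new double[NS * NA];
        for (var s = 0; s < NS; s++)
        {
            var poss = AllowedActions(s);
            for (var i = 0; i < poss.Length; i++)
            {
                P[poss[i] * NS + s] = 1.0 / poss.Length; // equal probability over allowed actions
            }
        }
        Console.WriteLine(P[0 * NS + 0]); // 0.5       (state 0 has two allowed actions)
        Console.WriteLine(P[2 * NS + 1]); // 0.333...  (state 1 has three allowed actions)
    }
}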
public Matrix(int numberOfRows, int numberOfColumns, string id = "")
{
    Rows = numberOfRows;
    Columns = numberOfColumns;
    W = Tembo.ArrayOfZeros(Rows * Columns);  // weights, stored as a flat buffer of length Rows * Columns
    DW = Tembo.ArrayOfZeros(Rows * Columns); // gradients with respect to the weights, same shape
    Id = id != "" ? id : Tembo.GetId();      // use the supplied id, or generate one
}
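A brief usage sketch of this constructor. It relies only on what the listing shows (flat W and DW buffers of length Rows * Columns and an auto-generated Id); the row-major element formula in the last comment is an assumption about the layout, not something the constructor itself defines.

// Hypothetical usage of the Matrix constructor above.
var m = new Matrix(2, 3);       // 2 rows, 3 columns, Id generated by Tembo.GetId()
Console.WriteLine(m.W.Length);  // 6 -- flat weight buffer
Console.WriteLine(m.DW.Length); // 6 -- gradient buffer, same shape
// element (r, c) would conventionally live at W[r * m.Columns + c] (assumed row-major layout)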
private void LearnFromTuple(Experience exp /*int s0, int a0, double r0, int s1, int a1*/, double lambda)
{
    // lambda is the eligibility-trace decay parameter in [0, 1]; it must be a double,
    // otherwise fractional values such as 0.7 cannot be expressed
    var sa = exp.PreviousAction * NS + exp.PreviousStateInt;
    var target = 0.0;

    // calculate the target for Q(s,a)
    if (Options.Update == "qlearn")
    {
        // Q-learning target is Q(s0,a0) = r0 + gamma * max_a Q[s1,a]
        var poss = AllowedActions(exp.CurrentStateInt);
        var qmax = 0.0;
        for (var i = 0; i < poss.Length; i++)
        {
            var s1a = poss[i] * NS + exp.CurrentStateInt;
            var qval = Q[s1a];
            if (i == 0 || qval > qmax)
            {
                qmax = qval;
            }
        }
        target = exp.PreviousReward + Options.Gamma * qmax;
    }
    else if (Options.Update == "sarsa")
    {
        // SARSA target is Q(s0,a0) = r0 + gamma * Q[s1,a1]
        var s1a1 = exp.CurrentAction * NS + exp.CurrentStateInt;
        target = exp.PreviousReward + Options.Gamma * Q[s1a1];
    }

    if (lambda > 0)
    {
        // perform an eligibility trace update
        if (Options.ReplacingTraces)
        {
            E[sa] = 1;
        }
        else
        {
            E[sa] += 1;
        }

        var edecay = lambda * Options.Gamma;
        var stateUpdate = Tembo.ArrayOfZeros(NS);
        for (var s = 0; s < NS; s++)
        {
            var poss = AllowedActions(s);
            for (var i = 0; i < poss.Length; i++)
            {
                var a = poss[i];
                var saloop = a * NS + s;
                var esa = E[saloop];
                var update = Options.Alpha * esa * (target - Q[saloop]);
                Q[saloop] += update;
                UpdatePriority(s, a, update);
                E[saloop] *= edecay;
                var u = Math.Abs(update);
                if (u > stateUpdate[s])
                {
                    stateUpdate[s] = u;
                }
            }
        }

        // for efficiency, only recompute the policy for states whose Q-values changed noticeably
        for (var s = 0; s < NS; s++)
        {
            if (stateUpdate[s] > 1e-5)
            {
                UpdatePolicy(s);
            }
        }

        if (Explored && Options.Update == "qlearn")
        {
            // the trace must be wiped after an exploratory action, since Q-learning is off-policy
            E = Tembo.ArrayOfZeros(NS * NA);
        }
    }
    else
    {
        // simpler and faster update without an eligibility trace:
        // move Q[sa] towards the target with step size alpha
        var update = Options.Alpha * (target - Q[sa]);
        Q[sa] += update;
        UpdatePriority(exp.PreviousStateInt, exp.PreviousAction, update);
        // update the policy to reflect the change (if appropriate)
        UpdatePolicy(exp.PreviousStateInt);
    }
}
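To make the no-trace branch concrete, here is a standalone numeric sketch of the Q-learning update it performs. The Q values, gamma and alpha below are made-up numbers, and the flat indexing follows the a * NS + s convention used above; nothing here comes from the actual agent state.

using System;

class QLearnUpdateDemo
{
    static void Main()
    {
        const int NS = 2, NA = 2;          // tiny hypothetical MDP
        var Q = new double[NS * NA];
        Q[0 * NS + 1] = 1.0;               // Q(s1 = 1, a = 0)
        Q[1 * NS + 1] = 3.0;               // Q(s1 = 1, a = 1)

        int s0 = 0, a0 = 1, s1 = 1;
        double r0 = 0.5, gamma = 0.9, alpha = 0.1;

        // Q-learning target: r0 + gamma * max_a Q[s1, a]
        var qmax = Math.Max(Q[0 * NS + s1], Q[1 * NS + s1]); // 3.0
        var target = r0 + gamma * qmax;                      // 0.5 + 0.9 * 3.0 = 3.2

        // move Q[s0, a0] towards the target with step size alpha
        var sa = a0 * NS + s0;
        Q[sa] += alpha * (target - Q[sa]);                   // 0 + 0.1 * 3.2 = 0.32
        Console.WriteLine(Q[sa]);                            // 0.32
    }
}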