/// <summary>
/// Returns an action from a state.
/// </summary>
/// <param name="state">State vector; its length must equal NumberOfStates.</param>
/// <returns>Index of the chosen action.</returns>
public int Act(double[] state)
{
    Tembo.Assert(state.Length == NumberOfStates, $"Current state({state.Length}) not equal to NS({NumberOfStates})");
    var a = 0;
    // convert to a Mat column vector
    var s = new Matrix(NumberOfStates, 1);
    s.Set(state);
    // epsilon greedy policy
    if (Tembo.Random() < Options.Epsilon)
    {
        a = Tembo.RandomInt(0, NumberOfActions);
    }
    else
    {
        // greedy wrt Q function
        var amat = ForwardQ(Network, s, false);
        a = Tembo.Maxi(amat.W); // returns index of argmax action
    }
    // shift state memory
    this.s0 = this.s1;
    this.a0 = this.a1;
    this.s1 = s;
    this.a1 = a;
    return (a);
}
/// <summary>
/// OOP advantages adopted during the translation: the tuple (s0, a0, r0, s1, a1)
/// is wrapped in an Experience object.
/// </summary>
/// <param name="experience">See Experience</param>
/// <returns>The clamped TD error for this experience.</returns>
private double LearnFromExperience(Experience experience)
{
    // want: Q(s,a) = r + gamma * max_a' Q(s',a')
    // compute the target Q value
    var tmat = ForwardQ(Network, experience.CurrentState, false);
    var qmax = experience.PreviousReward + Options.Gamma * tmat.W[Tembo.Maxi(tmat.W)];
    // now predict
    var pred = ForwardQ(Network, experience.PreviousState, true);
    var tderror = pred.W[experience.PreviousAction] - qmax;
    var clamp = Options.ErrorClamp;
    if (Math.Abs(tderror) > clamp)
    {
        // clamp the TD error to robustify (Huber-style)
        if (tderror > clamp)
        {
            tderror = clamp;
        }
        if (tderror < -clamp)
        {
            tderror = -clamp;
        }
    }
    pred.DW[experience.PreviousAction] = tderror;
    LastGraph.Backward(); // compute gradients on net params
    // update net
    Tembo.UpdateNetwork(Network, Options.Alpha);
    return (tderror);
}
public int Act(double[] state)
{
    var s = StateKey(state);
    // act according to epsilon greedy policy
    var a = 0;
    var poss = AllowedActions(state);
    var probs = new List<double>();
    for (var i = 0; i < poss.Length; i++)
    {
        probs.Add(P[poss[i] * NS + s]);
    }
    // epsilon greedy policy
    if (Tembo.Random() < Options.Epsilon)
    {
        a = poss[Tembo.RandomInt(0, poss.Length)]; // random available action
        Explored = true;
    }
    else
    {
        a = poss[Tembo.SampleWeighted(probs.ToArray())];
        Explored = false;
    }
    // shift state memory
    s0 = s1;
    a0 = a1;
    s1 = s;
    a1 = a;
    return (a);
}
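// A minimal sketch of what a weighted sampler like Tembo.SampleWeighted is assumed to do here:
// return an index with probability proportional to its (unnormalized) weight. The helper name,
// signature and use of System.Random are assumptions based on the call site above, not the
// library's actual implementation.
private static int SampleWeightedSketch(double[] weights, Random rng)
{
    var total = 0.0;
    for (var i = 0; i < weights.Length; i++)
    {
        total += weights[i];
    }
    // draw r in [0, total) and return the first index whose running sum exceeds it
    var r = rng.NextDouble() * total;
    var cumulative = 0.0;
    for (var i = 0; i < weights.Length; i++)
    {
        cumulative += weights[i];
        if (r < cumulative)
        {
            return i;
        }
    }
    return weights.Length - 1; // guard against floating point round-off
}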
protected void Reset()
{
    // reset the agent's policy and value function
    Q = Tembo.ArrayOfZeros(NS * NA);
    if (Options.QInitVal != 0)
    {
        Tembo.SetConst(Q, Options.QInitVal);
    }
    P = Tembo.ArrayOfZeros(NS * NA);
    E = Tembo.ArrayOfZeros(NS * NA);
    // model/planning vars
    EnvModelS = Tembo.ArrayOfZeros(NS * NA);
    Tembo.SetConst(EnvModelS, -1); // init to -1 so we can test if we saw the state before
    EnvModelR = Tembo.ArrayOfZeros(NS * NA);
    SaSeen = new double[] { };
    PQ = Tembo.ArrayOfZeros(NS * NA);
    // initialize uniform random policy
    for (var s = 0; s < NS; s++)
    {
        var poss = AllowedActions(s);
        for (var i = 0; i < poss.Length; i++)
        {
            P[poss[i] * NS + s] = 1.0 / poss.Length;
        }
    }
    // agent memory, needed for streaming updates
    // (s0,a0,r0,s1,a1,r1,...); 999999999 is used as a "not yet set" sentinel
    r0 = 999999999;
    s0 = 999999999;
    s1 = 999999999;
    a0 = 999999999;
    a1 = 999999999;
}
public void Set(int row, int col, double value)
{
    var ix = (Columns * row) + col;
    Tembo.Assert(ix >= 0 && ix < W.Length);
    W[ix] = value;
}
public double Get(int row, int col)
{
    var ix = (Columns * row) + col;
    Tembo.Assert(ix >= 0 && ix < W.Length);
    return (W[ix]);
}
public Matrix(int numberOfRows, int numberOfColumns, string id = "")
{
    Rows = numberOfRows;
    Columns = numberOfColumns;
    W = Tembo.ArrayOfZeros(Rows * Columns);
    DW = Tembo.ArrayOfZeros(Rows * Columns);
    Id = id != "" ? id : Tembo.GetId();
}
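// A minimal usage sketch of the Matrix class above: storage is row-major, so the element
// at (row, col) lives at W[Columns * row + col]. Names and values below are illustrative only.
var m = new Matrix(2, 3);   // 2x3 matrix; W and DW are zero-initialized
m.Set(1, 2, 0.5);           // writes W[1 * 3 + 2] = W[5]
var value = m.Get(1, 2);    // reads the same element back (0.5)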
private void Plan()
{
    // order the states based on current priority queue information
    var spq = new List<dynamic>();
    for (var i = 0; i < SaSeen.Length; i++)
    {
        var sa = SaSeen[i].ToInt();
        var sap = PQ[sa];
        if (sap > 1e-5)
        {
            // gain a bit of efficiency
            dynamic dy = new ExpandoObject();
            dy.sa = sa;
            dy.p = sap;
            spq.Add(dy);
        }
    }
    // highest priority first
    spq = spq.OrderByDescending(a => a.p).ToList();
    // perform the updates
    var nsteps = Math.Min(Options.PlanN, spq.Count);
    for (var k = 0; k < nsteps; k++)
    {
        var s0a0 = (int)spq[k].sa;
        PQ[s0a0] = 0; // erase priority, since we're backing up this state
        var s0 = s0a0 % NS;
        var a0 = s0a0 / NS; // integer division recovers the action index
        var r0 = EnvModelR[s0a0];
        var s1 = EnvModelS[s0a0].ToInt();
        var a1 = -1; // not used for Q learning
        if (Options.Update == "sarsa")
        {
            // sample a random allowed action for the SARSA target
            var poss = AllowedActions(s1);
            a1 = poss[Tembo.RandomInt(0, poss.Length)];
        }
        var exp = new Experience
        {
            PreviousStateInt = s0,
            PreviousAction = a0,
            PreviousReward = r0,
            CurrentStateInt = s1,
            CurrentAction = a1
        };
        LearnFromTuple(exp, 0); // lambda = 0: shouldn't use an eligibility trace for planning backups
    }
}
public int Direction(double[] state)
{
    var sta = new State
    {
        Values = state,
        Occurrence = 1,
        Output = -1
    };
    lastKey = Tembo.GetId();
    Historical.Add(lastKey, sta);
    lastAction = dqnAgent.Act(state);
    return (lastAction);
}
private void BLearning()
{
    while (true)
    {
        if (Historical.Count < 20000)
        {
            // not enough history collected yet
            // Thread.Sleep(TimeSpan.FromMinutes(30));
        }
        var correct = 0.0;
        var total = 0.0;
        var options = new AgentOptions
        {
            Gamma = Tembo.Random(0.01, 0.99),
            Epsilon = Tembo.Random(0.01, 0.75),
            Alpha = Tembo.Random(0.01, 0.99),
            ExperinceAddEvery = Tembo.RandomInt(1, 10000),
            ExperienceSize = 0,
            LearningSteps = Tembo.RandomInt(1, 10),
            HiddenUnits = Tembo.RandomInt(100000, 100000000),
            ErrorClamp = Tembo.Random(0.01, 1.0),
            AdaptiveLearningSteps = true
        };
        var agent = new DQN(dqnAgent.NumberOfStates, dqnAgent.NumberOfActions, options);
        for (var i = 0; i < Historical.Count; i++)
        {
            var spi = Historical.ElementAt(i);
            var action = agent.Act(spi.Value.Values);
            if (action == spi.Value.Output)
            {
                correct += 1;
                agent.Learn(1);
            }
            else
            {
                agent.Learn(-1);
            }
            total += 1;
        }
        var winrate = (correct / total) * 100;
        if (winrate > WinRate)
        {
            CN.Log($"NEW AGENT DISCOVERED --> WINRATE {winrate:0.00}%, CLASS: {AgentName}", 2);
            Save();
            dqnAgent = agent;
            WinRate = winrate;
        }
    }
}
/// <summary>
/// Rewards the agent for performing an action,
/// memorizes the experience and learns from it.
/// </summary>
/// <param name="reward">Positive or negative reward signal.</param>
public void Learn(double reward)
{
    // perform an update on Q function
    if (this.r0 > 0 && Options.Alpha > 0)
    {
        // learn from this tuple to get a sense of how "surprising" it is to the agent
        var exp = new Experience
        {
            PreviousState = s0,
            PreviousAction = a0,
            PreviousReward = r0,
            CurrentState = s1,
            CurrentAction = a1
        };
        var tderror = LearnFromExperience(exp);
        TDError = tderror; // a measure of surprise
        // decide if we should keep this experience in the replay
        if (t % Options.ExperinceAddEvery == 0)
        {
            Memory.Add(new Experience
            {
                PreviousState = s0,
                PreviousAction = a0,
                PreviousReward = r0,
                CurrentState = s1,
                CurrentAction = a1
            });
            if (Options.ExperienceSize > 0 && Memory.Count > Options.ExperienceSize)
            {
                // forget the oldest experience
                Memory.RemoveAt(0);
            }
        }
        this.t += 1;
        // sample some additional experience from replay memory and learn from it
        if (Options.AdaptiveLearningSteps)
        {
            var op = Memory.Count * 0.005;
            if (op > 0)
            {
                Options.LearningSteps = op.ToInt();
            }
        }
        for (var k = 0; k < Options.LearningSteps; k++)
        {
            var ri = Tembo.RandomInt(0, Memory.Count); // todo: priority sweeps?
            var e = Memory[ri];
            LearnFromExperience(e);
        }
    }
    this.r0 = reward; // store for next update
}
public DQN(int numberOfStates, int numberOfActions, AgentOptions options)
{
    NumberOfStates = numberOfStates;
    NumberOfActions = numberOfActions;
    Options = options;
    HiddenUnits = options.HiddenUnits;
    // two-layer network: state -> hidden (W1, B1) -> action values (W2, B2)
    Network = new Dictionary<string, Matrix>
    {
        { "W1", Tembo.RandomMatrix(HiddenUnits, NumberOfStates, 0, 0.01) },
        { "B1", Tembo.RandomMatrix(HiddenUnits, 1, 0, 0.01) },
        { "W2", Tembo.RandomMatrix(NumberOfActions, HiddenUnits, 0, 0.01) },
        { "B2", Tembo.RandomMatrix(NumberOfActions, 1, 0, 0.01) }
    };
    Memory = new List<Experience>();
}
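// A minimal usage sketch of the DQN agent above. GetState() and Step() are hypothetical
// environment hooks (not part of this library), and the option values are illustrative only.
public void RunEpisodeSketch()
{
    var opts = new AgentOptions
    {
        Gamma = 0.9, Epsilon = 0.2, Alpha = 0.01, HiddenUnits = 100,
        ExperinceAddEvery = 25, ExperienceSize = 5000, LearningSteps = 10, ErrorClamp = 1.0
    };
    var agent = new DQN(numberOfStates: 8, numberOfActions: 4, options: opts);
    for (var step = 0; step < 1000; step++)
    {
        var state = GetState();        // hypothetical: observe the environment
        var action = agent.Act(state); // epsilon-greedy action from the network
        var reward = Step(action);     // hypothetical: apply the action, obtain a reward
        agent.Learn(reward);           // TD update + experience replay
    }
}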
private void LearnFromTuple(Experience exp, int lambda)
{
    var sa = exp.PreviousAction * NS + exp.PreviousStateInt;
    var target = 0.0;
    // calculate the target for Q(s,a)
    if (Options.Update == "qlearn")
    {
        // Q learning target is Q(s0,a0) = r0 + gamma * max_a Q[s1,a]
        var poss = AllowedActions(exp.CurrentStateInt);
        var qmax = 0.0;
        for (var i = 0; i < poss.Length; i++)
        {
            var s1a = poss[i] * NS + exp.CurrentStateInt;
            var qval = Q[s1a];
            if (i == 0 || qval > qmax)
            {
                qmax = qval;
            }
        }
        target = exp.PreviousReward + Options.Gamma * qmax;
    }
    else if (Options.Update == "sarsa")
    {
        // SARSA target is Q(s0,a0) = r0 + gamma * Q[s1,a1]
        var s1a1 = exp.CurrentAction * NS + exp.CurrentStateInt;
        target = exp.PreviousReward + Options.Gamma * Q[s1a1];
    }
    if (lambda > 0)
    {
        // perform an eligibility trace update
        if (Options.ReplacingTraces)
        {
            E[sa] = 1;
        }
        else
        {
            E[sa] += 1;
        }
        var edecay = lambda * Options.Gamma;
        var state_update = Tembo.ArrayOfZeros(NS);
        for (var s = 0; s < NS; s++)
        {
            var poss = AllowedActions(s);
            for (var i = 0; i < poss.Length; i++)
            {
                var a = poss[i];
                var saloop = a * NS + s;
                var esa = E[saloop];
                var update = Options.Alpha * esa * (target - Q[saloop]);
                Q[saloop] += update;
                UpdatePriority(s, a, update);
                E[saloop] *= edecay;
                var u = Math.Abs(update);
                if (u > state_update[s])
                {
                    state_update[s] = u;
                }
            }
        }
        for (var s = 0; s < NS; s++)
        {
            if (state_update[s] > 1e-5)
            {
                // save efficiency here
                UpdatePolicy(s);
            }
        }
        if (Explored && Options.Update == "qlearn")
        {
            // have to wipe the trace since q learning is off-policy :(
            E = Tembo.ArrayOfZeros(NS * NA);
        }
    }
    else
    {
        // simpler and faster update without eligibility trace
        // update Q[sa] towards the target with some step size
        var update = Options.Alpha * (target - Q[sa]);
        Q[sa] += update;
        UpdatePriority(exp.PreviousStateInt, exp.PreviousAction, update);
        // update the policy to reflect the change (if appropriate)
        UpdatePolicy(exp.PreviousStateInt);
    }
}
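// A minimal helper sketch making the flat state-action indexing used above explicit:
// Q, P, E and the model arrays store the entry for (state s, action a) at index a * NS + s.
// The helper name is illustrative, not part of the library.
private int StateActionIndex(int s, int a)
{
    // one flat double[NS * NA] array, laid out action-major
    return a * NS + s;
}
// e.g. with NS = 10, Q[StateActionIndex(7, 2)] == Q[27] is Q(s = 7, a = 2), and the
// Q-learning target above is r0 + Gamma * max over allowed a' of Q[StateActionIndex(s1, a')].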