/// <summary>
/// Selects an action for the given state with an epsilon-greedy policy:
/// with probability <c>Options.Epsilon</c> a uniformly random action is
/// chosen, otherwise the argmax action of the Q network. Also shifts the
/// one-step (state, action) memory so the previous transition is available
/// to the learning update.
/// </summary>
/// <param name="stateArray">Raw observation; copied into a NumberOfStates x 1 column vector.</param>
/// <returns>Index of the selected action in [0, NumberOfActions).</returns>
public int Act(int[] stateArray)
{
    // Convert the raw observation to a column vector the network accepts.
    var state = new Matrix(NumberOfStates, 1);
    state.Set(stateArray);

    int a;

    // Epsilon-greedy: explore with probability Epsilon, otherwise exploit.
    // Use the process-wide shared RNG instead of `new Random()` per call:
    // time-seeded instances created in quick succession yield identical
    // sequences, which correlates successive exploration decisions.
    if (Random.Shared.NextDouble() < Options.Epsilon)
    {
        a = Util.Random(0, NumberOfActions);
    }
    else
    {
        // Greedy w.r.t. the Q function: forward pass without gradient
        // tracking, then take the argmax action index.
        var amat = Forward(Net, state, false);
        a = Util.ActionFromWeights(amat.Weights);
    }

    // Shift the (state, action) memory: what was "next" becomes "previous"
    // so LearnFromTuple can consume the completed transition.
    previousStateCache = nextStateCache;
    previousAction = nextAction;
    nextStateCache = state;
    nextAction = a;

    return a;
}
/// <summary>
/// Performs one Q-learning update on the transition
/// (prevState, prevAction, reward, nextState) against the target
/// Q(s,a) = r + gamma * max_a' Q(s',a').
/// </summary>
/// <param name="prevState">State in which <paramref name="prevAction"/> was taken.</param>
/// <param name="prevAction">Index of the action taken in <paramref name="prevState"/>.</param>
/// <param name="reward">Reward observed after taking the action.</param>
/// <param name="nextState">Resulting state of the environment.</param>
/// <returns>The temporal-difference error, clamped to [-ErrorClamp, +ErrorClamp].</returns>
private double LearnFromTuple(Matrix prevState, int prevAction, double reward, Matrix nextState)
{
    // Target value: r + gamma * max_a' Q(s', a').
    // Forward pass without gradient tracking — the target is treated as fixed.
    var nextQ = Forward(Net, nextState, false);
    var bestNext = Util.ActionFromWeights(nextQ.Weights);
    var target = reward + Options.Gamma * nextQ.Weights[bestNext];

    // Prediction Q(s, a) with gradient tracking enabled for backprop.
    var predicted = Forward(Net, prevState, true);
    var tdError = predicted.Weights[prevAction] - target;

    // Clamp the TD error to [-ErrorClamp, +ErrorClamp] (gradient clipping
    // for robustness; values inside the band pass through unchanged).
    var limit = Options.ErrorClamp;
    tdError = Math.Max(-limit, Math.Min(limit, tdError));

    // Seed the gradient on the taken action only, backpropagate, and apply
    // the parameter update with learning rate Alpha.
    predicted.BackPropWeights[prevAction] = tdError;
    LastGraph.Backward();
    Util.UpdateNetwork(Net, Options.Alpha);

    return tdError;
}