Example #1
        public int Act(int[] stateArray)
        {
            var r = new Random(); // note: reusing a single Random instance avoids repeated re-seeding
            // convert the state array into a Matrix column vector
            var state = new Matrix(NumberOfStates, 1);

            state.Set(stateArray);
            var a = 0;
            var y = r.NextDouble();

            // epsilon-greedy policy: explore with probability Epsilon, otherwise act greedily
            if (y < Options.Epsilon)
            {
                a = Util.Random(0, NumberOfActions); // uniformly random action
            }
            }
            else
            {
                // greedy wrt Q function
                var amat = Forward(Net, state, false);
                a = Util.ActionFromWeights(amat.Weights); // returns index of argmax action
            }
            // shift state memory so the previous (s, a) pair and the new state are cached for learning
            previousStateCache = nextStateCache;
            previousAction     = nextAction;
            nextStateCache     = state;
            nextAction         = a;
            return a;
        }
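For context, Act is typically driven from an episode loop that feeds the chosen action to the environment and reports the reward back to the agent. The sketch below is a minimal, hypothetical driver: the IEnvironment interface, the Learn(reward) call (sketched after Example #2), and the epsilon decay schedule are assumptions for illustration, not part of the original example.

        // Hypothetical environment contract used only for this sketch.
        public interface IEnvironment
        {
            int[] Observe();          // current state as a flat array
            double Step(int action);  // apply the action, return the reward
            bool Done { get; }
        }

        // Minimal driver loop (assumed API): act, step, report reward, anneal epsilon.
        public void RunEpisode(IEnvironment env)
        {
            while (!env.Done)
            {
                var action = Act(env.Observe()); // epsilon-greedy action; also caches the transition
                var reward = env.Step(action);
                Learn(reward);                   // sketched after Example #2

                // slowly shift from exploration to exploitation (assumed schedule)
                Options.Epsilon = Math.Max(0.05, Options.Epsilon * 0.999);
            }
        }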
Example #2
        private double LearnFromTuple(Matrix prevState, int prevAction, double reward, Matrix nextState)
        {
            // want: Q(s,a) = r + gamma * max_a' Q(s',a')

            // compute the target Q value
            var tmat = Forward(Net, nextState, false);
            var qmax = reward + Options.Gamma * tmat.Weights[Util.ActionFromWeights(tmat.Weights)];

            // forward pass with backprop enabled so the TD error can be pushed back through the net
            var pred = Forward(Net, prevState, true);

            var tderror = pred.Weights[prevAction] - qmax;
            var clamp   = Options.ErrorClamp;

            // clamp the TD error (Huber-style robustness against large targets)
            if (tderror > clamp)
            {
                tderror = clamp;
            }
            else if (tderror < -clamp)
            {
                tderror = -clamp;
            }
            // write the TD error as the gradient on the taken action's output, then backpropagate
            pred.BackPropWeights[prevAction] = tderror;
            LastGraph.Backward();

            // update net
            Util.UpdateNetwork(Net, Options.Alpha);
            return tderror;
        }
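The two examples connect through the caches that Act maintains: a plausible Learn wrapper keeps the last reward for one step so that, after the next Act call has shifted the caches, a complete (s, a, r, s') tuple is available. The previousReward field and the Learn method below are assumptions sketched from those caches, not code from the original.

        // Assumed field: the reward observed after the previously cached action.
        private double? previousReward;

        // Hypothetical glue between Act and LearnFromTuple (assumed API).
        public void Learn(double reward)
        {
            // only update once a full (s, a, r, s') tuple has been cached
            if (previousReward.HasValue && Options.Alpha > 0)
            {
                LearnFromTuple(previousStateCache, previousAction,
                               previousReward.Value, nextStateCache);
            }
            // remember this reward; it pairs with the transition cached by the next Act call
            previousReward = reward;
        }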