Example #1
File: DQN.cs Project: mbithy/TemboRL
        /// <summary>
        /// Returns an action from a state
        /// </summary>
        /// <param name="state">state size must be equal to NumberOfStates</param>
        /// <returns></returns>
        public int Act(double[] state)
        {
            Tembo.Assert(state.Length == NumberOfStates, $"Current state({state.Length}) not equal to NS({NumberOfStates})");
            var a = 0;
            // convert to a Matrix column vector
            var s = new Matrix(NumberOfStates, 1);

            s.Set(state);
            // epsilon greedy policy
            if (Tembo.Random() < Options.Epsilon)
            {
                a = Tembo.RandomInt(0, NumberOfActions);
            }
            else
            {
                // greedy wrt Q function
                var amat = ForwardQ(Network, s, false);
                a = Tembo.Maxi(amat.W); // returns index of argmax action
            }
            // shift state memory
            this.s0 = this.s1;
            this.a0 = this.a1;
            this.s1 = s;
            this.a1 = a;
            return(a);
        }
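Act implements a standard epsilon-greedy policy: with probability Options.Epsilon a random action is taken, otherwise the action with the largest predicted Q value is chosen. A minimal standalone sketch of the same selection rule in plain C# (the Q values and epsilon below are illustrative, not TemboRL types):

        // epsilon-greedy selection, top-level-statement sketch
        using System;

        var rng     = new Random();
        var qValues = new[] { 0.1, 0.7, 0.3 }; // pretend network output Q(s, a) per action
        var epsilon = 0.1;

        int action;
        if (rng.NextDouble() < epsilon)
        {
            action = rng.Next(qValues.Length); // explore: uniform random action
        }
        else
        {
            action = 0; // exploit: argmax over qValues
            for (var i = 1; i < qValues.Length; i++)
            {
                if (qValues[i] > qValues[action]) action = i;
            }
        }
        Console.WriteLine(action); // usually prints 1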
Example #2
File: DQN.cs Project: mbithy/TemboRL
        /// <summary>
        /// OOP advantages adopted during translation...
        /// </summary>
        /// <param name="experience">See Experience</param>
        /// <returns></returns>
        private double LearnFromExperience(Experience experience /*Matrix s0, int a0, double r0, Matrix s1, int a1*/)
        {
            // want: Q(s,a) = r + gamma * max_a' Q(s',a')
            // compute the target Q value
            var tmat = ForwardQ(Network, s1, false);
            var qmax = r0 + Options.Gamma * tmat.W[Tembo.Maxi(tmat.W)];
            // now predict
            var pred    = ForwardQ(Network, s0, true);
            var tderror = pred.W[a0] - qmax;
            var clamp   = Options.ErrorClamp;

            if (Math.Abs(tderror) > clamp)
            {  // huber loss to robustify
                if (tderror > clamp)
                {
                    tderror = clamp;
                }
                if (tderror < -clamp)
                {
                    tderror = -clamp;
                }
            }
            pred.DW[a0] = tderror;
            LastGraph.Backward(); // compute gradients on net params
            // update net
            Tembo.UpdateNetwork(Network, Options.Alpha);
            return(tderror);
        }
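LearnFromExperience forms the one-step target Q(s,a) = r + gamma * max_a' Q(s',a'), takes the TD error as the difference between the predicted Q value and that target, and clamps the error to +/- Options.ErrorClamp before backpropagating (a gradient-clipping stand-in for the Huber loss). A small numeric sketch of that computation with made-up values:

        using System;

        var gamma      = 0.9;
        var reward     = 1.0;
        var nextQ      = new[] { 0.2, 0.5, 0.1 }; // Q(s', a') per next action, illustrative
        var predicted  = 0.3;                     // current estimate of Q(s, a)
        var errorClamp = 1.0;

        var qmax    = Math.Max(nextQ[0], Math.Max(nextQ[1], nextQ[2]));
        var target  = reward + gamma * qmax;      // 1.45
        var tdError = predicted - target;         // -1.15
        if (tdError > errorClamp)  tdError = errorClamp;   // clamp, as above
        if (tdError < -errorClamp) tdError = -errorClamp;
        Console.WriteLine(tdError);               // -1.0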
Example #3
File: TD.cs Project: mbithy/TemboRL
        public int Act(double[] state)
        {
            var s = StateKey(state);
            // act according to epsilon greedy policy
            var a     = 0;
            var poss  = AllowedActions(state);
            var probs = new List <double>();

            for (var i = 0; i < poss.Length; i++)
            {
                probs.Add(P[poss[i] * NS + s]);
            }
            // epsilon greedy policy
            if (Tembo.Random() < Options.Epsilon)
            {
                a        = poss[Tembo.RandomInt(0, poss.Length)]; // random available action
                Explored = true;
            }
            else
            {
                a        = poss[Tembo.SampleWeighted(probs.ToArray())];
                Explored = false;
            }
            // shift state memory
            s0 = s1;
            a0 = a1;
            s1 = s;
            a1 = a;
            return(a);
        }
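In the exploitation branch the TD agent samples an allowed action in proportion to the current policy probabilities via Tembo.SampleWeighted. Its implementation is not shown on this page; one common way to write such a sampler (an illustrative sketch, not TemboRL's own code) is:

        using System;

        var rng = new Random();

        // returns index i with probability probs[i] / sum(probs)
        int SampleWeighted(double[] probs)
        {
            var sum = 0.0;
            foreach (var p in probs) sum += p;
            var r   = rng.NextDouble() * sum;
            var acc = 0.0;
            for (var i = 0; i < probs.Length; i++)
            {
                acc += probs[i];
                if (r <= acc) return i;
            }
            return probs.Length - 1; // guard against floating-point round-off
        }

        Console.WriteLine(SampleWeighted(new[] { 0.1, 0.6, 0.3 })); // prints 1 most often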
Example #4
File: TD.cs Project: mbithy/TemboRL
 protected void Reset()
 {
     // reset the agent's policy and value function
     Q = Tembo.ArrayOfZeros(NS * NA);
     if (Options.QInitVal != 0)
     {
         Tembo.SetConst(Q, Options.QInitVal);
     }
     P = Tembo.ArrayOfZeros(NS * NA);
     E = Tembo.ArrayOfZeros(NS * NA);
     // model/planning vars
     EnvModelS = Tembo.ArrayOfZeros(NS * NA);
     Tembo.SetConst(EnvModelS, -1); // init to -1 so we can test if we saw the state before
     EnvModelR = Tembo.ArrayOfZeros(NS * NA);
     SaSeen    = new double[] { };
     PQ        = Tembo.ArrayOfZeros(NS * NA);
     // initialize uniform random policy
     for (var s = 0; s < NS; s++)
     {
         var poss = AllowedActions(s);
         for (var i = 0; i < poss.Length; i++)
         {
             P[poss[i] * NS + s] = 1.0 / poss.Length;
         }
     }
     // agent memory, needed for streaming updates
     // (s0,a0,r0,s1,a1,r1,...)
     r0 = 999999999;
     s0 = 999999999;
     s1 = 999999999;
     a0 = 999999999;
     a1 = 999999999;
 }
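Reset stores Q, P, E and the model arrays as flat arrays of length NS * NA, indexed action-major as a * NS + s; the sentinel 999999999 simply marks the (s0, a0, r0, s1, a1) memory as not yet set. A short standalone sketch of the flat indexing with made-up sizes:

        using System;

        const int NS = 4; // number of states, illustrative
        const int NA = 2; // number of actions, illustrative
        var q = new double[NS * NA];

        var s = 3;
        var a = 1;
        q[a * NS + s] = 0.5;          // write Q(s = 3, a = 1)

        var sa = a * NS + s;          // flat index 7
        Console.WriteLine(sa % NS);   // 3 -> recovers the state
        Console.WriteLine(sa / NS);   // 1 -> recovers the action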
Example #5
        public void Set(int row, int col, double value)
        {
            var ix = (Columns * row) + col;

            Tembo.Assert(ix >= 0 && ix < W.Length);
            W[ix] = value;
        }
Example #6
        public double Get(int row, int col)
        {
            var ix = (Columns * row) + col;

            Tembo.Assert(ix >= 0 && ix < W.Length);
            return(W[ix]);
        }
Example #7
 public Matrix(int numberOfRows, int numberOfColumns, string id = "")
 {
     Rows    = numberOfRows;
     Columns = numberOfColumns;
     W       = Tembo.ArrayOfZeros(Rows * numberOfColumns);
     DW      = Tembo.ArrayOfZeros(Rows * numberOfColumns);
     Id      = id != "" ? id : Tembo.GetId();
 }
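Matrix stores its values row-major in the flat W array (index ix = Columns * row + col) with a parallel gradient array DW of the same length. A small usage sketch, assuming only the constructor and the Set/Get members shown above are in scope:

        var m = new Matrix(2, 3);     // 2 rows, 3 columns, zero-initialized
        m.Set(1, 2, 5.0);             // writes W[3 * 1 + 2] = W[5]
        var v = m.Get(1, 2);          // reads the same slot back: 5.0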
Example #8
File: TD.cs Project: mbithy/TemboRL
        private void Plan()
        {
            // order the states based on current priority queue information
            var spq = new List <dynamic>();

            for (var i = 0; i < SaSeen.Length; i++)
            {
                var sa  = SaSeen[i].ToInt();
                var sap = PQ[sa];
                if (sap > 1e-5)
                { // gain a bit of efficiency
                    dynamic dy = new ExpandoObject();
                    dy.sa = sa;
                    dy.p  = sap;
                    spq.Add(dy);
                }
            }
            var spqSorted = spq.OrderByDescending(a => a.p).ToList();

            spq = spqSorted;

            /*spq.sort(function (a, b) {
             *  return a.p < b.p ? 1 : -1
             * });*/
            // perform the updates
            var nsteps = Math.Min(Options.PlanN, spq.Count);

            for (var k = 0; k < nsteps; k++)
            {
                // random exploration
                //var i = randi(0, SaSeen.Length); // pick random prev seen state action
                //var s0a0 = SaSeen[i];
                var s0a0 = spq[k].sa;
                PQ[s0a0] = 0; // erase priority, since we're backing up this state
                var s0 = s0a0 % NS;
                var a0 = Math.Floor(s0a0 / NS);
                var r0 = EnvModelR[s0a0];
                var s1 = EnvModelS[s0a0].ToInt();
                var a1 = -1; // not used for Q learning
                if (Options.Update == "sarsa")
                {
                    // generate random action?...
                    var poss = AllowedActions(s1);
                    a1 = poss[Tembo.RandomInt(0, poss.Length)];
                }
                var exp = new Experience
                {
                    PreviousStateInt = s0,
                    PreviousAction   = a0,
                    PreviousReward   = r0,
                    CurrentStateInt  = s1,
                    CurrentAction    = a1
                };
                LearnFromTuple(exp, 0); // note Options.Lambda = 0 - shouldn't use the eligibility trace here
            }
        }
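Plan orders the previously seen state-action indices by their stored priority and performs at most Options.PlanN backups on the highest-priority entries. The dynamic ExpandoObject pairs could equally be written with value tuples; a sketch of the same ordering step under that assumption:

        using System;
        using System.Collections.Generic;
        using System.Linq;

        // (state-action index, priority) pairs, illustrative values
        var pq = new List<(int Sa, double P)> { (3, 0.2), (7, 0.9), (1, 0.000001) };

        var ordered = pq
            .Where(x => x.P > 1e-5)      // skip negligible priorities, as above
            .OrderByDescending(x => x.P) // highest priority first
            .ToList();

        foreach (var (sa, p) in ordered)
        {
            Console.WriteLine($"{sa}: {p}"); // 7: 0.9 then 3: 0.2
        }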
Example #9
        public int Direction(double[] state)
        {
            var sta = new State
            {
                Values     = state,
                Occurrence = 1,
                Output     = -1
            };

            lastKey = Tembo.GetId();
            Historical.Add(lastKey, sta);
            lastAction = dqnAgent.Act(state);
            return(lastAction);
        }
Example #10
 private void BLearning()
 {
     while (true)
     {
         if (Historical.Count < 20000)
         {
             // not enough history yet; wait and re-check before training
             Thread.Sleep(TimeSpan.FromMinutes(30));
             continue;
         }
         var correct = 0.0;
         var total   = 0.0;
         var options = new AgentOptions
         {
             Gamma                 = Tembo.Random(0.01, 0.99),
             Epsilon               = Tembo.Random(0.01, 0.75),
             Alpha                 = Tembo.Random(0.01, 0.99),
             ExperinceAddEvery     = Tembo.RandomInt(1, 10000),
             ExperienceSize        = 0,
             LearningSteps         = Tembo.RandomInt(1, 10),
             HiddenUnits           = Tembo.RandomInt(100000, 100000000),
             ErrorClamp            = Tembo.Random(0.01, 1.0),
             AdaptiveLearningSteps = true
         };
         var agent = new DQN(dqnAgent.NumberOfStates, dqnAgent.NumberOfActions, options);
         for (var i = 0; i < Historical.Count; i++)
         {
             var spi    = Historical.ElementAt(i);
             var action = agent.Act(spi.Value.Values);
             if (action == spi.Value.Output)
             {
                 correct += 1;
                 agent.Learn(1);
             }
             else
             {
                 agent.Learn(-1);
             }
             total += 1;
         }
         var winrate = (correct / total) * 100;
         if (winrate > WinRate)
         {
             CN.Log($"NEW AGENT DISCOVERED --> WINRATE {winrate.ToString("p")}, CLASS: {AgentName}", 2);
             Save();
             dqnAgent = agent;
             WinRate  = winrate;
         }
     }
 }
Example #11
File: DQN.cs Project: mbithy/TemboRL
 /// <summary>
 /// Rewards the agent for performing an action,
 /// memorizes and learns from the experience
 /// </summary>
 /// <param name="reward">positive or negative reward</param>
 public void Learn(double reward)
 {
     // perform an update on Q function
     if (this.r0 > 0 && Options.Alpha > 0)
     {
         // learn from this tuple to get a sense of how "surprising" it is to the agent
         var exp = new Experience
         {
             PreviousState  = s0,
             PreviousAction = a0,
             PreviousReward = r0,
             CurrentState   = s1,
             CurrentAction  = a1
         };
         var tderror = LearnFromExperience(exp);
         TDError = tderror; // a measure of surprise
         // decide if we should keep this experience in the replay
         if (t % Options.ExperinceAddEvery == 0)
         {
             Memory.Add(new Experience {
                 PreviousState = s0, PreviousAction = a0, PreviousReward = r0, CurrentState = s1, CurrentAction = a1
             });
             if (Options.ExperienceSize > 0 && Memory.Count > Options.ExperienceSize)
             {
                 //forget oldest
                 Memory.RemoveAt(0);
             }
         }
         this.t += 1;
         // sample some additional experience from replay memory and learn from it
         if (Options.AdaptiveLearningSteps)
         {
             var op = Memory.Count * 0.005;
             if (op > 0)
             {
                 Options.LearningSteps = op.ToInt();
             }
         }
         for (var k = 0; k < Options.LearningSteps; k++)
         {
             var ri = Tembo.RandomInt(0, Memory.Count); // todo: priority sweeps?
             var e  = Memory[ri];
             LearnFromExperience(e);
         }
     }
     this.r0 = reward; // store for next update
 }
Example #12
File: DQN.cs Project: mbithy/TemboRL
 public DQN(int numberOfStates, int numberOfActions, AgentOptions options)
 {
     NumberOfStates  = numberOfStates;
     NumberOfActions = numberOfActions;
     Options         = options;
     HiddenUnits     = options.HiddenUnits;
     Network         = new Dictionary <string, Matrix>
     {
         //w1
         { "W1", Tembo.RandomMatrix(HiddenUnits, NumberOfStates, 0, 0.01) },
         //b1
         { "B1", Tembo.RandomMatrix(HiddenUnits, 1, 0, 0.01) },
         //w2
         { "W2", Tembo.RandomMatrix(NumberOfActions, HiddenUnits, 0, 0.01) },
         //b2
         { "B2", Tembo.RandomMatrix(NumberOfActions, 1, 0, 0.01) }
     };
     Memory = new List <Experience>();
 }
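Putting the constructor together with Act (Example #1) and Learn (Example #11), a typical driver loop looks roughly like the sketch below. Only the DQN, AgentOptions, Act and Learn members shown in these examples are assumed (namespace import included); the environment is a trivial stand-in:

        using System;

        // toy environment stand-ins, purely for illustration
        var rng = new Random();
        double[] GetState() => new[] { rng.NextDouble(), rng.NextDouble(), rng.NextDouble(), rng.NextDouble() };
        double GetReward(int action) => action == 0 ? 1.0 : -1.0;

        var options = new AgentOptions
        {
            Gamma             = 0.9,
            Epsilon           = 0.2,
            Alpha             = 0.01,
            HiddenUnits       = 100,
            ErrorClamp        = 1.0,
            LearningSteps     = 10,
            ExperinceAddEvery = 25,   // property name as spelled in AgentOptions
            ExperienceSize    = 5000
        };
        var agent = new DQN(4, 2, options);

        for (var step = 0; step < 1000; step++)
        {
            var state  = GetState();        // observe
            var action = agent.Act(state);  // epsilon-greedy action
            var reward = GetReward(action); // act on the environment
            agent.Learn(reward);            // TD update plus experience replay
        }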
Example #13
File: TD.cs Project: mbithy/TemboRL
        private void LearnFromTuple(Experience exp /*int s0, int a0, double r0, int s1, int a1*/, int lambda)
        {
            var sa     = exp.PreviousAction * NS + exp.PreviousStateInt;
            var target = 0.0;

            // calculate the target for Q(s,a)
            if (Options.Update == "qlearn")
            {
                // Q learning target is Q(s0,a0) = r0 + gamma * max_a Q[s1,a]
                var poss = AllowedActions(exp.CurrentStateInt);
                var qmax = 0.0;
                for (var i = 0; i < poss.Length; i++)
                {
                    var s1a  = poss[i] * NS + exp.CurrentStateInt;
                    var qval = Q[s1a];
                    if (i == 0 || qval > qmax)
                    {
                        qmax = qval;
                    }
                }
                target = exp.PreviousReward + Options.Gamma * qmax;
            }
            else if (Options.Update == "sarsa")
            {
                // SARSA target is Q(s0,a0) = r0 + gamma * Q[s1,a1]
                var s1a1 = exp.CurrentAction * NS + exp.CurrentStateInt;
                target = exp.PreviousReward + Options.Gamma * Q[s1a1];
            }
            if (lambda > 0)
            {
                // perform an eligibility trace update
                if (Options.ReplacingTraces)
                {
                    E[sa] = 1;
                }
                else
                {
                    E[sa] += 1;
                }
                var edecay       = lambda * Options.Gamma;
                var state_update = Tembo.ArrayOfZeros(NS);
                for (var s = 0; s < NS; s++)
                {
                    var poss = AllowedActions(s);
                    for (var i = 0; i < poss.Length; i++)
                    {
                        var a      = poss[i];
                        var saloop = a * NS + s;
                        var esa    = E[saloop];
                        var update = Options.Alpha * esa * (target - Q[saloop]);
                        Q[saloop] += update;
                        UpdatePriority(s, a, update);
                        E[saloop] *= edecay;
                        var u = Math.Abs(update);
                        if (u > state_update[s])
                        {
                            state_update[s] = u;
                        }
                    }
                }
                for (var s = 0; s < NS; s++)
                {
                    if (state_update[s] > 1e-5)
                    { // save efficiency here
                        UpdatePolicy(s);
                    }
                }
                if (Explored && Options.Update == "qlearn")
                {
                    // have to wipe the trace since q learning is off-policy :(
                    E = Tembo.ArrayOfZeros(NS * NA);
                }
            }
            else
            {
                // simpler and faster update without eligibility trace
                // update Q[sa] towards it with some step size
                var update = Options.Alpha * (target - Q[sa]);
                Q[sa] += update;
                UpdatePriority(exp.PreviousStateInt, exp.PreviousAction, update);
                // update the policy to reflect the change (if appropriate)
                UpdatePolicy(exp.PreviousStateInt);
            }
        }
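The two update rules differ only in their target: Q-learning bootstraps off-policy from max_a Q(s1, a), while SARSA bootstraps on-policy from Q(s1, a1) for the action actually taken. A small numeric sketch of the two targets with illustrative values:

        using System;

        var gamma  = 0.9;
        var reward = 1.0;
        var qNext  = new[] { 0.2, 0.8 }; // Q(s1, a) for the two allowed actions
        var a1     = 0;                  // action actually taken in s1

        var qlearnTarget = reward + gamma * Math.Max(qNext[0], qNext[1]); // 1.72
        var sarsaTarget  = reward + gamma * qNext[a1];                    // 1.18

        Console.WriteLine(qlearnTarget);
        Console.WriteLine(sarsaTarget);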