Example No. 1
        /// <summary>
        /// Returns an action for the given state using an epsilon-greedy policy
        /// </summary>
        /// <param name="state">State vector; its length must equal NumberOfStates</param>
        /// <returns>Index of the selected action</returns>
        public int Act(double[] state)
        {
            Tembo.Assert(state.Length == NumberOfStates, $"Current state({state.Length}) not equal to NS({NumberOfStates})");
            var a = 0;
            // convert the state to a Matrix column vector
            var s = new Matrix(NumberOfStates, 1);

            s.Set(state);
            // epsilon greedy policy
            if (Tembo.Random() < Options.Epsilon)
            {
                a = Tembo.RandomInt(0, NumberOfActions);
            }
            else
            {
                // greedy wrt Q function
                var amat = ForwardQ(Network, s, false);
                a = Tembo.Maxi(amat.W); // returns index of argmax action
            }
            // shift state memory
            this.s0 = this.s1;
            this.a0 = this.a1;
            this.s1 = s;
            this.a1 = a;
            return a;
        }
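A minimal usage sketch for this method (the option values, sizes and feature values are illustrative assumptions, not TemboRL defaults; the DQN constructor signature follows Example No. 4 below):

        var options = new AgentOptions { Epsilon = 0.1, Gamma = 0.9, Alpha = 0.01 };
        var agent   = new DQN(4, 3, options);                // 4 state features, 3 actions (illustrative sizes)
        var state   = new double[] { 0.2, -0.5, 1.0, 0.0 };  // length must equal NumberOfStates
        var action  = agent.Act(state);                      // random with probability Epsilon, otherwise argmax Q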
Example No. 2
File: TD.cs Project: mbithy/TemboRL
        public int Act(double[] state)
        {
            var s = StateKey(state);
            // act according to epsilon greedy policy
            var a     = 0;
            var poss  = AllowedActions(state);
            var probs = new List <double>();

            for (var i = 0; i < poss.Length; i++)
            {
                probs.Add(P[poss[i] * NS + s]);
            }
            // epsilon greedy policy
            if (Tembo.Random() < Options.Epsilon)
            {
                a        = poss[Tembo.RandomInt(0, poss.Length)]; // random available action
                Explored = true;
            }
            else
            {
                a        = poss[Tembo.SampleWeighted(probs.ToArray())];
                Explored = false;
            }
            // shift state memory
            s0 = s1;
            a0 = a1;
            s1 = s;
            a1 = a;
            return a;
        }
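When not exploring, the action is sampled in proportion to the policy probabilities collected in probs. The Tembo.SampleWeighted implementation is not shown on this page; a plausible sketch of such weighted sampling (an assumption, not the actual TemboRL source) is:

        // Hypothetical sketch only - not the actual Tembo.SampleWeighted code.
        // Draw r in [0, sum(weights)) and return the first index whose cumulative weight exceeds r.
        static int SampleWeighted(double[] weights)
        {
            var total = 0.0;
            foreach (var w in weights) total += w;
            var r   = new Random().NextDouble() * total;
            var cum = 0.0;
            for (var i = 0; i < weights.Length; i++)
            {
                cum += weights[i];
                if (r < cum) return i;
            }
            return weights.Length - 1; // guard against floating-point rounding
        }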
Example No. 3
File: TD.cs Project: mbithy/TemboRL
        private void Plan()
        {
            // order the states based on current priority queue information
            var spq = new List <dynamic>();

            for (var i = 0; i < SaSeen.Length; i++)
            {
                var sa  = SaSeen[i].ToInt();
                var sap = PQ[sa];
                if (sap > 1e-5)
                { // gain a bit of efficiency
                    dynamic dy = new ExpandoObject();
                    dy.sa = sa;
                    dy.p  = sap;
                    spq.Add(dy);
                }
            }
            // sort by priority, highest first
            spq = spq.OrderByDescending(a => a.p).ToList();
            // perform the updates
            var nsteps = Math.Min(Options.PlanN, spq.Count);

            for (var k = 0; k < nsteps; k++)
            {
                // random exploration
                //var i = randi(0, SaSeen.Length); // pick random prev seen state action
                //var s0a0 = SaSeen[i];
                int s0a0 = spq[k].sa;
                PQ[s0a0] = 0; // erase priority, since we're backing up this state
                var s0 = s0a0 % NS;
                var a0 = s0a0 / NS; // integer division recovers the action index
                var r0 = EnvModelR[s0a0];
                var s1 = EnvModelS[s0a0].ToInt();
                var a1 = -1; // not used for Q learning
                if (Options.Update == "sarsa")
                {
                    // SARSA is on-policy: sample a random allowed next action
                    var poss = AllowedActions(s1);
                    a1 = poss[Tembo.RandomInt(0, poss.Length)];
                }
                var exp = new Experience
                {
                    PreviousStateInt = s0,
                    PreviousAction   = a0,
                    PreviousReward   = r0,
                    CurrentStateInt  = s1,
                    CurrentAction    = a1
                };
                LearnFromTuple(exp, 0); // note Options.Lambda = 0 - shouldn't use an eligibility trace here
            }
        }
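Plan relies on the same flattened state-action index used in Act above (poss[i] * NS + s): a pair is packed as sa = a * NS + s and recovered with s = sa % NS and a = sa / NS. A small worked example with illustrative sizes:

        // Illustrative values only: NS = 10 states, action a = 3, state s = 7.
        var NS = 10;
        var sa = 3 * NS + 7;   // 37: flattened key into PQ, EnvModelR and EnvModelS
        var s  = sa % NS;      // 7  -> the state
        var a  = sa / NS;      // 3  -> the action (integer division floors for non-negative values)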
Example No. 4
 private void BLearning()
 {
     while (true)
     {
         if (Historical.Count < 20000)
         {
             // not enough historical samples yet; wait and re-check
             Thread.Sleep(TimeSpan.FromMinutes(30));
             continue;
         }
         var correct = 0.0;
         var total   = 0.0;
         var options = new AgentOptions
         {
             Gamma                 = Tembo.Random(0.01, 0.99),
             Epsilon               = Tembo.Random(0.01, 0.75),
             Alpha                 = Tembo.Random(0.01, 0.99),
             ExperinceAddEvery     = Tembo.RandomInt(1, 10000),
             ExperienceSize        = 0,
             LearningSteps         = Tembo.RandomInt(1, 10),
             HiddenUnits           = Tembo.RandomInt(100000, 100000000),
             ErrorClamp            = Tembo.Random(0.01, 1.0),
             AdaptiveLearningSteps = true
         };
         var agent = new DQN(dqnAgent.NumberOfStates, dqnAgent.NumberOfActions, options);
         for (var i = 0; i < Historical.Count; i++)
         {
             var spi    = Historical.ElementAt(i);
             var action = agent.Act(spi.Value.Values);
             if (action == spi.Value.Output)
             {
                 correct += 1;
                 agent.Learn(1);
             }
             else
             {
                 agent.Learn(-1);
             }
             total += 1;
         }
         var winrate = (correct / total) * 100;
         if (winrate > WinRate)
         {
             CN.Log($"NEW AGENT DISCOVERED --> WINRATE {winrate.ToString("p")}, CLASS: {AgentName}", 2);
             Save();
             dqnAgent = agent;
             WinRate  = winrate;
         }
     }
 }
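BLearning loops forever and sleeps for long stretches, so it is presumably meant to run off the calling thread. A minimal sketch of starting it as a background worker (the call site is an assumption, not code from the project):

     // Assumed call site: run the random hyperparameter search without blocking the caller.
     var worker = new Thread(BLearning) { IsBackground = true };
     worker.Start();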
Example No. 5
 /// <summary>
 /// Rewards the agent for performing an action,
 /// memorizes the experience and learns from it
 /// </summary>
 /// <param name="reward">Positive or negative reward signal</param>
 public void Learn(double reward)
 {
     // perform an update on Q function
     if (this.r0 > 0 && Options.Alpha > 0)
     {
         // learn from this tuple to get a sense of how "surprising" it is to the agent
         var exp = new Experience
         {
             PreviousState  = s0,
             PreviousAction = a0,
             PreviousReward = r0,
             CurrentState   = s1,
             CurrentAction  = a1
         };
         var tderror = LearnFromExperience(exp);
         TDError = tderror; // a measure of surprise
         // decide if we should keep this experience in the replay
         if (t % Options.ExperinceAddEvery == 0)
         {
             Memory.Add(new Experience {
                 PreviousState = s0, PreviousAction = a0, PreviousReward = r0, CurrentState = s1, CurrentAction = a1
             });
             if (Options.ExperienceSize > 0 && Memory.Count > Options.ExperienceSize)
             {
                 //forget oldest
                 Memory.RemoveAt(0);
             }
         }
         this.t += 1;
         // sample some additional experience from replay memory and learn from it
         if (Options.AdaptiveLearningSteps)
         {
             var op = Memory.Count * 0.005;
             if (op > 0)
             {
                 Options.LearningSteps = op.ToInt();
             }
         }
         for (var k = 0; k < Options.LearningSteps; k++)
         {
             var ri = Tembo.RandomInt(0, Memory.Count); // todo: priority sweeps?
             var e  = Memory[ri];
             LearnFromExperience(e);
         }
     }
     this.r0 = reward; // store for next update
 }
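Taken together with Act (Example No. 1), the intended cycle is: select an action, apply it, then feed the resulting reward back through Learn. With AdaptiveLearningSteps enabled, each call replays roughly 0.5% of the stored memory (about 100 replay updates once Memory holds 20,000 experiences). A minimal sketch, where environment, GetState and Step are placeholders rather than TemboRL APIs:

     // Hypothetical interaction loop - environment, GetState and Step are placeholders.
     while (running)
     {
         var state  = environment.GetState();   // double[] of length NumberOfStates
         var action = agent.Act(state);         // epsilon-greedy action selection
         var reward = environment.Step(action); // positive or negative feedback
         agent.Learn(reward);                   // TD update plus experience replay
     }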