Example #1
        /// <summary>
        /// Updates the Q-Learner model by reinforcing with the new state/action feedback values.
        /// </summary>
        /// <param name="x">State vector.</param>
        /// <param name="y">Action label.</param>
        /// <param name="r">Reward value.</param>
        public override void Learn(Vector x, double y, double r)
        {
            var state  = this.Q.Keys.Last();
            var stateP = MDPConverter.GetState(x, this.FeatureProperties, this.FeatureDiscretizer);
            var action = MDPConverter.GetAction(y, state.Id, stateP.Id);

            this.Q.AddOrUpdate(stateP, action, r);

            this.Q[state, action] = (1.0 - this.LearningRate) * Q[state, action]
                                    + this.LearningRate * (r + this.Lambda * Q[stateP, Q.GetMaxAction(stateP)]);
        }
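The assignment above is the standard one-step Q-learning update, with LearningRate as the step size and Lambda playing the role of the discount factor. As a library-free illustration (a plain dictionary rather than numl's QTable; every name in this sketch is invented for the example), the same rule applied to a single transition looks like this:

        using System;
        using System.Collections.Generic;
        using System.Linq;

        // Illustrative sketch only (not numl's QTable): one-step tabular Q-learning,
        // Q(s,a) <- (1 - alpha) * Q(s,a) + alpha * (r + gamma * max_a' Q(s',a')).
        var q = new Dictionary<(int s, int a), double>();
        double alpha = 0.1;   // plays the role of LearningRate above
        double gamma = 0.9;   // plays the role of Lambda above

        int s = 0, a = 1, sNext = 2;
        double reward = 5.0;

        // Best known value reachable from the successor state (0.0 if nothing is known yet).
        double bestNext = q.Where(kv => kv.Key.s == sNext)
                           .Select(kv => kv.Value)
                           .DefaultIfEmpty(0.0)
                           .Max();

        q.TryGetValue((s, a), out double current);
        q[(s, a)] = (1.0 - alpha) * current + alpha * (reward + gamma * bestNext);

        Console.WriteLine(q[(s, a)]);   // 0.9 * 0 + 0.1 * (5 + 0.9 * 0) = 0.5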
Example #2
        /// <summary>
        /// Updates the Q-Learner model by reinforcing with the new state/action and transition state feedback values.
        /// </summary>
        /// <param name="x1">Item features, i.e. the original State.</param>
        /// <param name="y">Action label.</param>
        /// <param name="x2">Transition state value.</param>
        /// <param name="r">Reward value.</param>
        public override void Learn(Vector x1, double y, Vector x2, double r)
        {
            var state  = MDPConverter.GetState(x1, this.FeatureProperties, this.FeatureDiscretizer);
            var stateP = MDPConverter.GetState(x2, this.FeatureProperties, this.FeatureDiscretizer);
            var action = MDPConverter.GetAction(y, state.Id, stateP.Id);

            if (!Q.ContainsKey(state))
            {
                Q.AddOrUpdate(state, action, r);
            }

            if (!Q.ContainsKey(stateP))
            {
                Q.AddKey(stateP);
            }

            this.Q[state, action] = (1.0 - this.LearningRate) * Q[state, action]
                                    + this.LearningRate * (r + this.Lambda * Q[stateP, Q.GetMaxAction(stateP)]);
        }
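Compared to the overload in Example #1, this one receives both the source state x1 and the transition state x2 explicitly, and seeds the Q table for any state it has not seen yet before applying the same update. A minimal usage sketch, reusing the objects built in the path-finder test of Example #4 and assuming the model returned by Generate is meant to keep learning online (these overrides suggest as much, but that is an assumption): the caller reports one observed move from the kitchen to the entrance hall (action 3, reward -30).

        // Sketch only: fold a single observed transition into an already-generated model.
        // `model`, `kitchen` and `entrance` are the objects from the test in Example #4.
        model.Learn(kitchen.ToVector(), 3, entrance.ToVector(), -30);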
Example #3
        /// <summary>
        /// Generates a <see cref="QLearnerModel"/> based on states/actions with transitions and rewards.
        /// </summary>
        /// <param name="X1">Initial State matrix.</param>
        /// <param name="y">Action label vector.</param>
        /// <param name="X2">Transition State matrix.</param>
        /// <param name="r">Reward values.</param>
        /// <returns>QLearnerModel.</returns>
        public override IReinforcementModel Generate(Matrix X1, Vector y, Matrix X2, Vector r)
        {
            this.Preprocess(X1, y, X2, r);

            var examples = MDPConverter.GetStates(X1, y, X2, this.FeatureProperties, this.FeatureDiscretizer);

            var states  = examples.Item1;
            var actions = examples.Item2;
            var statesP = examples.Item3;

            QTable Q = new QTable();

            // construct Q table
            for (int i = 0; i < states.Count(); i++)
            {
                var state  = states.ElementAt(i);
                var action = actions.ElementAt(i);
                var stateP = statesP.ElementAt(i);

                Q.AddOrUpdate(state, action, r[i]);

                if (!Q.ContainsKey(stateP))
                {
                    Q.AddKey(stateP);
                }
            }

            double count = states.Select(s => s.Id).Distinct().Count();

            double change = 0;

            for (int pass = 0; pass < this.MaxIterations; pass++)
            {
                change = 0;

                for (int i = 0; i < states.Count(); i++)
                {
                    IState  state  = states.ElementAt(i);
                    IAction action = actions.ElementAt(i);
                    IState  stateP = statesP.ElementAt(i);
                    double  reward = r[i];

                    double q = (1.0 - this.LearningRate) * Q[state, action]
                               + this.LearningRate * (reward + this.Lambda * Q[stateP, Q.GetMaxAction(stateP)]);

                    change += (1.0 / count) * System.Math.Abs((Q[state, action] - q));

                    Q[state, action] = q;
                }

                if (change <= this.Epsilon)
                {
                    break;
                }
            }

            return(new QLearnerModel()
            {
                Descriptor = this.Descriptor,
                TransitionDescriptor = this.TransitionDescriptor,
                NormalizeFeatures = this.NormalizeFeatures,
                FeatureNormalizer = this.FeatureNormalizer,
                FeatureProperties = this.FeatureProperties,
                FeatureDiscretizer = this.FeatureDiscretizer,
                LearningRate = this.LearningRate,
                Lambda = this.Lambda,
                Q = Q
            });
        }
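The batch Generate above seeds the Q table from every observed (state, action, reward) triple, then sweeps all transitions repeatedly, stopping once the summed absolute Q-value change of a sweep, scaled by the number of distinct states, drops to Epsilon or below, or MaxIterations is reached. A configuration sketch follows: the property names all appear in the code above, but that they are publicly settable, and the particular values chosen here, are assumptions; X1, y, X2 and r stand for the caller's state/action/reward data.

        var generator = new Reinforcement.QLearning.QLearnerGenerator()
        {
            LearningRate  = 0.9,    // step size used in the Q update
            Lambda        = 0.9,    // discount factor
            MaxIterations = 100,    // upper bound on sweeps over the transitions
            Epsilon       = 1e-4    // convergence threshold on the per-sweep change
        };

        var model = (Reinforcement.QLearning.QLearnerModel)generator.Generate(X1, y, X2, r);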
Example #4
        public void Test_QLearning_Path_Finder()
        {
            // start
            var master  = new MDPState(2);
            var kitchen = new MDPState(3);

            master.Successors.Add(new MDPSuccessorState(new AI.Action(1, "Goto Kitchen"), 0.1, kitchen, 0));

            var entrance = new MDPState(1);
            var lounge   = new MDPState(4);

            kitchen.Successors.Add(new MDPSuccessorState(new AI.Action(2, "Goto Lounge"), 0.1, lounge, -15));
            kitchen.Successors.Add(new MDPSuccessorState(new AI.Action(3, "Goto Entrance Hall"), 0, entrance, -30));

            var spare = new MDPState(0);

            lounge.Successors.Add(new MDPSuccessorState(new AI.Action(4, "Goto Spare Room"), 0.1, spare, -10));

            var outside = new MDPState(5);

            lounge.Successors.Add(new MDPSuccessorState(new AI.Action(5, "Go Outside"), 0.1, outside, 30));
            entrance.Successors.Add(new MDPSuccessorState(new AI.Action(6, "Go Outside"), 0.1, outside, 50));
            outside.Successors.Add(new MDPSuccessorState(new AI.Action(7, "Stay Outside"), 0.2, outside, 50));

            var examples = MDPConverter.ToExamples(master);

            Assert.Equal(7, examples.Item1.Rows);
            Assert.Equal(7, examples.Item2.Length);
            Assert.Equal(7, examples.Item3.Rows);
            Assert.Equal(7, examples.Item4.Length);

            var generator = new Reinforcement.QLearning.QLearnerGenerator()
            {
                Lambda = 0.9
            };

            Reinforcement.QLearning.QLearnerModel model = (Reinforcement.QLearning.QLearnerModel)generator.Generate(examples.Item1, examples.Item2, examples.Item3, examples.Item4);

            Assert.Equal(3, (int)model.Predict(kitchen.ToVector()) /*, "Expected to move from kitchen to entrance hall"*/);
            Assert.Equal(5, (int)model.Predict(lounge.ToVector()) /*, "Expected to move from lounge to outside"*/);
            Assert.Equal(7, (int)model.Predict(outside.ToVector()) /*, "Expected to stay outside"*/);

            string path = "Start: " + master.Id; IMDPState current = master;
            int    counter = 0;

            while (current.Id != outside.Id)
            {
                if (counter > 20)
                {
                    break;
                }

                double v    = model.Predict(current.ToVector());
                var    next = current.GetSuccessors().Where(w => w.Action.Id == (int)v).FirstOrDefault() as IMDPSuccessor;
                if (next == null)
                {
                    break;
                }

                current = next.State as IMDPState;

                counter++;

                path += $"\n next: { current.Id } ({ next.Reward.ToString("N2") })";
            }

            Console.Write(path);
        }
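A rough hand check of why the Predict assertions hold, taken at the Bellman fixed point with gamma = Lambda = 0.9 (this ignores the LearningRate averaging and assumes the sweeps in Generate have effectively converged):

        double gamma = 0.9;
        double qStayOutside   = 50 / (1 - gamma);            // "Stay Outside" loops forever: ~500
        double qEntranceOut   = 50 + gamma * qStayOutside;   // entrance -> outside: ~500
        double qLoungeOut     = 30 + gamma * qStayOutside;   // lounge -> outside: ~480
        double qKitchenToHall = -30 + gamma * qEntranceOut;  // kitchen -> entrance hall: ~420
        double qKitchenLounge = -15 + gamma * qLoungeOut;    // kitchen -> lounge: ~417
        // 420 > 417, so the converged table prefers action 3 ("Goto Entrance Hall") from the
        // kitchen despite the larger immediate penalty, which is what the first assertion checks.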
Example #5
        /// <summary>
        ///   Generates a <see cref="QLearnerModel" /> based on states/actions with transitions and rewards.
        /// </summary>
        /// <param name="X1">Initial State matrix.</param>
        /// <param name="y">Action label vector.</param>
        /// <param name="X2">Transition State matrix.</param>
        /// <param name="r">Reward values.</param>
        /// <returns>QLearnerModel.</returns>
        public override IReinforcementModel Generate(Matrix X1, Vector y, Matrix X2, Vector r)
        {
            Preprocess(X1, y, X2, r);

            var examples = MDPConverter.GetStates(X1, y, X2, FeatureProperties, FeatureDiscretizer);

            var states  = examples.Item1;
            var actions = examples.Item2;
            var statesP = examples.Item3;

            var Q = new QTable();

            // construct Q table
            for (var i = 0; i < states.Count(); i++)
            {
                var state  = states.ElementAt(i);
                var action = actions.ElementAt(i);
                var stateP = statesP.ElementAt(i);

                Q.AddOrUpdate(state, action, r[i]);

                if (!Q.ContainsKey(stateP))
                {
                    Q.AddKey(stateP);
                }
            }

            double count = states.Select(s => s.Id).Distinct().Count();

            for (var pass = 0; pass < MaxIterations; pass++)
            {
                double change = 0;

                for (var i = 0; i < states.Count(); i++)
                {
                    var state  = states.ElementAt(i);
                    var action = actions.ElementAt(i);
                    var stateP = statesP.ElementAt(i);
                    var reward = r[i];

                    var q = (1.0 - LearningRate) * Q[state, action]
                            + LearningRate * (reward + Lambda * Q[stateP, Q.GetMaxAction(stateP)]);

                    change += 1.0 / count * System.Math.Abs(Q[state, action] - q);

                    Q[state, action] = q;
                }

                if (change <= Epsilon)
                {
                    break;
                }
            }

            return(new QLearnerModel
            {
                Descriptor = Descriptor,
                TransitionDescriptor = TransitionDescriptor,
                NormalizeFeatures = NormalizeFeatures,
                FeatureNormalizer = FeatureNormalizer,
                FeatureProperties = FeatureProperties,
                FeatureDiscretizer = FeatureDiscretizer,
                LearningRate = LearningRate,
                Lambda = Lambda,
                Q = Q
            });
        }