/// <summary>
/// Updates the Q-Learner model by reinforcing with the new state/action feedback values.
/// </summary>
/// <param name="x">State vector.</param>
/// <param name="y">Action label.</param>
/// <param name="r">Reward value.</param>
public override void Learn(Vector x, double y, double r)
{
    // treat the most recently added state in the Q table as the current state
    var state = this.Q.Keys.Last();
    var stateP = MDPConverter.GetState(x, this.FeatureProperties, this.FeatureDiscretizer);
    var action = MDPConverter.GetAction(y, state.Id, stateP.Id);

    // register the observed successor state/action with its immediate reward
    this.Q.AddOrUpdate(stateP, action, r);

    // one-step temporal-difference update of the current state/action value
    this.Q[state, action] = (1.0 - this.LearningRate) * Q[state, action]
                          + this.LearningRate * (r + this.Lambda * Q[stateP, Q.GetMaxAction(stateP)]);
}
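// The assignment above is the standard one-step Q-learning (temporal-difference) update,
//   Q(s, a) <- (1 - alpha) * Q(s, a) + alpha * (r + gamma * max_a' Q(s', a')),
// with LearningRate playing the role of the step size alpha and Lambda the discount factor gamma.
// Below is a minimal, self-contained sketch of the same rule over a plain dictionary-backed table;
// the tuple key, helper name and default hyperparameters are illustrative only, not library API
// (requires System.Collections.Generic and System.Linq):
static double TdUpdate(
    IDictionary<(int State, int Action), double> q,
    int state, int action, int stateP, double reward,
    double learningRate = 0.1, double lambda = 0.9)
{
    // best available value from the successor state (0 if the state is unseen)
    double maxNext = q.Where(kv => kv.Key.State == stateP)
                      .Select(kv => kv.Value)
                      .DefaultIfEmpty(0.0)
                      .Max();

    double current = q.TryGetValue((state, action), out var existing) ? existing : 0.0;

    // blend the old estimate with the reward plus the discounted best successor value
    double updated = (1.0 - learningRate) * current + learningRate * (reward + lambda * maxNext);
    q[(state, action)] = updated;
    return updated;
}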
/// <summary>
/// Updates the Q-Learner model by reinforcing with the new state/action and transition state feedback values.
/// </summary>
/// <param name="x1">Item features, i.e. the original state.</param>
/// <param name="y">Action label.</param>
/// <param name="x2">Transition state vector.</param>
/// <param name="r">Reward value.</param>
public override void Learn(Vector x1, double y, Vector x2, double r)
{
    var state = MDPConverter.GetState(x1, this.FeatureProperties, this.FeatureDiscretizer);
    var stateP = MDPConverter.GetState(x2, this.FeatureProperties, this.FeatureDiscretizer);
    var action = MDPConverter.GetAction(y, state.Id, stateP.Id);

    // make sure both the current and transition states exist in the Q table before updating
    if (!Q.ContainsKey(state))
    {
        Q.AddOrUpdate(state, action, r);
    }

    if (!Q.ContainsKey(stateP))
    {
        Q.AddKey(stateP);
    }

    // one-step temporal-difference update of the current state/action value
    this.Q[state, action] = (1.0 - this.LearningRate) * Q[state, action]
                          + this.LearningRate * (r + this.Lambda * Q[stateP, Q.GetMaxAction(stateP)]);
}
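// Note: the single-vector overload above infers the "current" state from the most recently added
// key in the Q table, so it only suits strictly sequential online feedback; this overload takes the
// transition state explicitly and is the safer choice whenever both state vectors are available.
// A hypothetical incremental call, assuming a QLearnerModel instance named model produced by
// QLearnerGenerator.Generate (as in the path-finder test further down); the feature values are
// illustrative only:
var x1 = new Vector(new[] { 4.0 });   // features of the current state
var x2 = new Vector(new[] { 5.0 });   // features of the observed transition state
model.Learn(x1, 5.0, x2, 30.0);       // action label 5 was taken and a reward of 30 was observed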
/// <summary>
/// Generates a <see cref="QLearnerModel"/> based on states/actions with transitions and rewards.
/// </summary>
/// <param name="X1">Initial state matrix.</param>
/// <param name="y">Action label vector.</param>
/// <param name="X2">Transition state matrix.</param>
/// <param name="r">Reward values.</param>
/// <returns>QLearnerModel.</returns>
public override IReinforcementModel Generate(Matrix X1, Vector y, Matrix X2, Vector r)
{
    this.Preprocess(X1, y, X2, r);

    var examples = MDPConverter.GetStates(X1, y, X2, this.FeatureProperties, this.FeatureDiscretizer);

    var states = examples.Item1;
    var actions = examples.Item2;
    var statesP = examples.Item3;

    QTable Q = new QTable();

    // construct Q table from the observed state/action/transition examples
    for (int i = 0; i < states.Count(); i++)
    {
        var state = states.ElementAt(i);
        var action = actions.ElementAt(i);
        var stateP = statesP.ElementAt(i);

        Q.AddOrUpdate(state, action, r[i]);

        if (!Q.ContainsKey(stateP))
        {
            Q.AddKey(stateP);
        }
    }

    double count = states.Select(s => s.Id).Distinct().Count();
    double change = 0;

    // repeatedly sweep the examples, applying Q updates until the absolute change
    // (normalized by the number of distinct states) falls below Epsilon
    for (int pass = 0; pass < this.MaxIterations; pass++)
    {
        change = 0;

        for (int i = 0; i < states.Count(); i++)
        {
            IState state = states.ElementAt(i);
            IAction action = actions.ElementAt(i);
            IState stateP = statesP.ElementAt(i);
            double reward = r[i];

            double q = (1.0 - this.LearningRate) * Q[state, action]
                     + this.LearningRate * (reward + this.Lambda * Q[stateP, Q.GetMaxAction(stateP)]);

            change += (1.0 / count) * System.Math.Abs(Q[state, action] - q);

            Q[state, action] = q;
        }

        if (change <= this.Epsilon)
        {
            break;
        }
    }

    return new QLearnerModel()
    {
        Descriptor = this.Descriptor,
        TransitionDescriptor = this.TransitionDescriptor,
        NormalizeFeatures = this.NormalizeFeatures,
        FeatureNormalizer = this.FeatureNormalizer,
        FeatureProperties = this.FeatureProperties,
        FeatureDiscretizer = this.FeatureDiscretizer,
        LearningRate = this.LearningRate,
        Lambda = this.Lambda,
        Q = Q
    };
}
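// Worked example of a single update in the loop above (values are illustrative): with
// LearningRate = 0.1, Lambda = 0.9, an existing entry Q[state, action] = 0, a reward of 50 and a
// best successor value Q[stateP, GetMaxAction(stateP)] = 50, the new entry becomes
//   0.9 * 0 + 0.1 * (50 + 0.9 * 50) = 9.5,
// and |0 - 9.5| / count is added to the change tracked for the Epsilon stopping test.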
public void Test_QLearning_Path_Finder()
{
    // start
    var master = new MDPState(2);
    var kitchen = new MDPState(3);
    master.Successors.Add(new MDPSuccessorState(new AI.Action(1, "Goto Kitchen"), 0.1, kitchen, 0));

    var entrance = new MDPState(1);
    var lounge = new MDPState(4);
    kitchen.Successors.Add(new MDPSuccessorState(new AI.Action(2, "Goto Lounge"), 0.1, lounge, -15));
    kitchen.Successors.Add(new MDPSuccessorState(new AI.Action(3, "Goto Entrance Hall"), 0, entrance, -30));

    var spare = new MDPState(0);
    lounge.Successors.Add(new MDPSuccessorState(new AI.Action(4, "Goto Spare Room"), 0.1, spare, -10));

    var outside = new MDPState(5);
    lounge.Successors.Add(new MDPSuccessorState(new AI.Action(5, "Go Outside"), 0.1, outside, 30));
    entrance.Successors.Add(new MDPSuccessorState(new AI.Action(6, "Go Outside"), 0.1, outside, 50));
    outside.Successors.Add(new MDPSuccessorState(new AI.Action(7, "Stay Outside"), 0.2, outside, 50));

    var examples = MDPConverter.ToExamples(master);

    Assert.Equal(7, examples.Item1.Rows);
    Assert.Equal(7, examples.Item2.Length);
    Assert.Equal(7, examples.Item3.Rows);
    Assert.Equal(7, examples.Item4.Length);

    var generator = new Reinforcement.QLearning.QLearnerGenerator() { Lambda = 0.9 };

    Reinforcement.QLearning.QLearnerModel model = (Reinforcement.QLearning.QLearnerModel)
        generator.Generate(examples.Item1, examples.Item2, examples.Item3, examples.Item4);

    Assert.Equal(3, (int)model.Predict(kitchen.ToVector()));  // expected to move from kitchen to entrance hall
    Assert.Equal(5, (int)model.Predict(lounge.ToVector()));   // expected to move from lounge to outside
    Assert.Equal(7, (int)model.Predict(outside.ToVector()));  // expected to stay outside

    string path = "Start: " + master.Id;

    IMDPState current = master;
    int counter = 0;
    while (current.Id != outside.Id)
    {
        if (counter > 20)
        {
            break;
        }

        double v = model.Predict(current.ToVector());
        var next = current.GetSuccessors().Where(w => w.Action.Id == (int)v).FirstOrDefault() as IMDPSuccessor;
        if (next == null)
        {
            break;
        }

        current = next.State as IMDPState;
        counter++;

        path += $"\n next: {current.Id} ({next.Reward.ToString("N2")})";
    }

    Console.Write(path);
}
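// Sanity check on the asserted predictions (assuming the learner converges close to the optimal
// values): with Lambda = 0.9 the recurring "Stay Outside" reward of 50 gives
// V(outside) ~ 50 / (1 - 0.9) = 500, so from the kitchen the entrance-hall route is worth about
// -30 + 0.9 * (50 + 0.9 * 500) = 420 while the lounge route is worth about
// -15 + 0.9 * (30 + 0.9 * 500) = 417, which is why action 3 (Goto Entrance Hall) is expected.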
/// <summary>
/// Generates a <see cref="QLearnerModel" /> based on states/actions with transitions and rewards.
/// </summary>
/// <param name="X1">Initial state matrix.</param>
/// <param name="y">Action label vector.</param>
/// <param name="X2">Transition state matrix.</param>
/// <param name="r">Reward values.</param>
/// <returns>QLearnerModel.</returns>
public override IReinforcementModel Generate(Matrix X1, Vector y, Matrix X2, Vector r)
{
    Preprocess(X1, y, X2, r);

    var examples = MDPConverter.GetStates(X1, y, X2, FeatureProperties, FeatureDiscretizer);

    var states = examples.Item1;
    var actions = examples.Item2;
    var statesP = examples.Item3;

    var Q = new QTable();

    // construct Q table
    for (var i = 0; i < states.Count(); i++)
    {
        var state = states.ElementAt(i);
        var action = actions.ElementAt(i);
        var stateP = statesP.ElementAt(i);

        Q.AddOrUpdate(state, action, r[i]);

        if (!Q.ContainsKey(stateP))
        {
            Q.AddKey(stateP);
        }
    }

    double count = states.Select(s => s.Id).Distinct().Count();

    for (var pass = 0; pass < MaxIterations; pass++)
    {
        double change = 0;

        for (var i = 0; i < states.Count(); i++)
        {
            var state = states.ElementAt(i);
            var action = actions.ElementAt(i);
            var stateP = statesP.ElementAt(i);
            var reward = r[i];

            var q = (1.0 - LearningRate) * Q[state, action]
                  + LearningRate * (reward + Lambda * Q[stateP, Q.GetMaxAction(stateP)]);

            change += 1.0 / count * System.Math.Abs(Q[state, action] - q);

            Q[state, action] = q;
        }

        if (change <= Epsilon)
        {
            break;
        }
    }

    return new QLearnerModel
    {
        Descriptor = Descriptor,
        TransitionDescriptor = TransitionDescriptor,
        NormalizeFeatures = NormalizeFeatures,
        FeatureNormalizer = FeatureNormalizer,
        FeatureProperties = FeatureProperties,
        FeatureDiscretizer = FeatureDiscretizer,
        LearningRate = LearningRate,
        Lambda = Lambda,
        Q = Q
    };
}