/// <summary>
/// Updates the Q-Learner model by reinforcing with the new state/action feedback values.
/// </summary>
/// <param name="x">State vector.</param>
/// <param name="y">Action label.</param>
/// <param name="r">Reward value.</param>
public override void Learn(Vector x, double y, double r)
{
    // The "current" state is the most recently recorded one; the incoming
    // sample becomes the transition state. NOTE(review): Keys.Last() assumes
    // the Q table enumerates keys in insertion order — confirm against the
    // table implementation.
    var state = this.Q.Keys.Last();
    var stateP = MDPConverter.GetState(x, this.FeatureProperties, this.FeatureDiscretizer);
    var action = MDPConverter.GetAction(y, state.Id, stateP.Id);

    // Seed the transition state only when it is unseen. The previous
    // unconditional AddOrUpdate overwrote an already-learned Q[stateP, action]
    // with the raw reward on every call; the two-state overload of Learn
    // guards its seeding with ContainsKey, so this overload now matches it.
    if (!this.Q.ContainsKey(stateP))
    {
        this.Q.AddOrUpdate(stateP, action, r);
    }

    // Standard Q-learning (Bellman) update:
    // Q(s,a) <- (1 - α) * Q(s,a) + α * (r + λ * max_a' Q(s',a'))
    this.Q[state, action] = (1.0 - this.LearningRate) * this.Q[state, action]
        + this.LearningRate * (r + this.Lambda * this.Q[stateP, this.Q.GetMaxAction(stateP)]);
}
/// <summary>
/// Updates the Q-Learner model by reinforcing with the new state/action and transition state feedback values.
/// </summary>
/// <param name="x1">Item features, i.e. the original State.</param>
/// <param name="y">Action label.</param>
/// <param name="x2">Transition state value.</param>
/// <param name="r">Reward value.</param>
public override void Learn(Vector x1, double y, Vector x2, double r)
{
    // Discretize both feature vectors into MDP states and resolve the
    // action that links them.
    var current = MDPConverter.GetState(x1, this.FeatureProperties, this.FeatureDiscretizer);
    var next = MDPConverter.GetState(x2, this.FeatureProperties, this.FeatureDiscretizer);
    var action = MDPConverter.GetAction(y, current.Id, next.Id);

    // Seed any unseen states. The order of these guards matters: when the
    // two states discretize to the same key, the first add makes the second
    // guard a no-op.
    if (!this.Q.ContainsKey(current))
    {
        this.Q.AddOrUpdate(current, action, r);
    }

    if (!this.Q.ContainsKey(next))
    {
        this.Q.AddKey(next);
    }

    // Q-learning (Bellman) update: blend the old estimate with the reward
    // plus the discounted best follow-up value, weighted by the learning rate.
    var previousEstimate = this.Q[current, action];
    var bestFollowUp = this.Q[next, this.Q.GetMaxAction(next)];
    this.Q[current, action] =
        (1.0 - this.LearningRate) * previousEstimate
        + this.LearningRate * (r + this.Lambda * bestFollowUp);
}