/// <summary>
/// Builds the action-selection distribution for <paramref name="state"/>: the action the
/// Q-store reports as best receives probability <c>Epsilon</c>, and the remaining
/// <c>1 - Epsilon</c> is shared equally among all other actions. When the state offers
/// exactly one action, that action receives probability 1.
/// </summary>
/// <param name="state">State whose available actions are obtained via <c>GetActions()</c>.</param>
/// <param name="qFunction">Q-store queried for the best action in <paramref name="state"/>.</param>
/// <param name="actions">Receives the array returned by <c>state.GetActions()</c>.</param>
/// <param name="probabilities">
/// Receives one probability per entry of <paramref name="actions"/>, index-aligned with it.
/// </param>
public override void ActionProbabilities(ReinforcementLearningState state, ReinforcementLearningQStore qFunction, out ReinforcementLearningAction[] actions, out float[] probabilities)
{
    actions = state.GetActions();
    probabilities = new float[actions.Length];

    // A forced choice is certain. Without this guard the single action would be assigned
    // Epsilon (a distribution that does not sum to 1), or the general formula below would
    // divide by (Length - 1) == 0.
    if (actions.Length == 1)
    {
        probabilities[0] = 1f;
        return;
    }

    ReinforcementLearningAction action;
    float utility; // required by the out signature; not used here
    qFunction.GetBestActionAndUtilityForState(state, out action, out utility);

    // NOTE(review): as written, Epsilon is the probability of the *best* action, not the
    // exploration rate used in the textbook epsilon-greedy formulation. Left unchanged;
    // confirm it matches how this policy actually samples actions.
    float otherProbability = (1 - Epsilon) / (actions.Length - 1);

    for (int i = 0; i < actions.Length; ++i)
    {
        probabilities[i] = actions[i].Equals(action) ? Epsilon : otherProbability;
    }
}
/// <summary>
/// Produces a softmax distribution over the actions of <paramref name="state"/>: each
/// action's weight is <c>exp((Q - maxQ) / Temperature)</c>, and the weights are then
/// divided by their sum.
/// </summary>
/// <param name="state">State whose available actions are obtained via <c>GetActions()</c>.</param>
/// <param name="qFunction">Q-store evaluated once per (state, action) pair.</param>
/// <param name="actions">Receives the array returned by <c>state.GetActions()</c>.</param>
/// <param name="probabilities">
/// Receives one normalized weight per entry of <paramref name="actions"/>, index-aligned with it.
/// </param>
public override void ActionProbabilities(ReinforcementLearningState state, ReinforcementLearningQStore qFunction, out ReinforcementLearningAction[] actions, out float[] probabilities)
{
    actions = state.GetActions();
    int count = actions.Length;
    probabilities = new float[count];

    // Pass 1: park the raw Q-values in the output array and track the largest one.
    float largestQ = float.MinValue;
    for (int a = 0; a < count; ++a)
    {
        float qValue = qFunction.Evaluate(state, actions[a]);
        probabilities[a] = qValue;
        largestQ = qValue > largestQ ? qValue : largestQ;
    }

    // Pass 2: replace each Q-value with its exponential weight. Subtracting the largest
    // Q-value first means the difference passed to Exp is never positive.
    float total = 0;
    for (int a = 0; a < count; ++a)
    {
        float weight = (float)Math.Exp((probabilities[a] - largestQ) / Temperature);
        probabilities[a] = weight;
        total += weight;
    }

    // Pass 3: normalize by the accumulated total.
    for (int a = 0; a < count; ++a)
    {
        probabilities[a] /= total;
    }
}