/// <summary>
/// Approximates the best action for <paramref name="state"/> by sampling 10
/// random actions uniformly in [-1, 1] and keeping the one with the highest Q-value.
/// </summary>
/// <param name="state">State to evaluate actions against.</param>
/// <param name="action">Best sampled action (null only if every Q-value equals float.MinValue).</param>
/// <param name="value">Q-value of the returned action.</param>
public void GetBestActionAndUtilityForState(ReinforcementLearningState state, out ReinforcementLearningAction action, out float value)
{
    InvertedPendulumAction2 best = null;
    float bestQ = float.MinValue;
    for (int trial = 0; trial < 10; ++trial)
    {
        // r.NextDouble() is in [0, 1); scale/shift into the action range [-1, 1).
        InvertedPendulumAction2 candidate = new InvertedPendulumAction2((float)r.NextDouble() * 2 - 1);
        float q = Evaluate(state, candidate);
        if (q > bestQ)
        {
            bestQ = q;
            best = candidate;
        }
    }
    action = best;
    value = bestQ;
}
/// <summary>
/// Epsilon-greedy selection: the greedy (highest-Q) action is chosen with
/// probability 1 - Epsilon, and the remaining Epsilon mass is split uniformly
/// over the other actions.
/// </summary>
/// <param name="state">Current state; its action set is queried via GetActions().</param>
/// <param name="qFunction">Q-store used to identify the greedy action.</param>
/// <param name="actions">All actions available in <paramref name="state"/>.</param>
/// <param name="probabilities">Selection probability per action (sums to 1).</param>
public override void ActionProbabilities(ReinforcementLearningState state, ReinforcementLearningQStore qFunction, out ReinforcementLearningAction[] actions, out float[] probabilities)
{
    ReinforcementLearningAction action;
    float utility;
    actions = state.GetActions();
    probabilities = new float[actions.Length];
    qFunction.GetBestActionAndUtilityForState(state, out action, out utility);
    if (actions.Length == 1)
    {
        // Single action: take it with certainty (also avoids dividing by zero below).
        probabilities[0] = 1f;
        return;
    }
    for (int i = 0; i < actions.Length; ++i)
    {
        if (actions[i].Equals(action))
        {
            // NOTE(review): the original assigned Epsilon here and (1-Epsilon)/(n-1)
            // to the others, which makes the greedy action the LEAST likely for small
            // Epsilon — inverted relative to the standard epsilon-greedy convention.
            probabilities[i] = 1 - Epsilon;
        }
        else
        {
            probabilities[i] = Epsilon / (actions.Length - 1);
        }
    }
}
/// <summary>
/// Value equality on the underlying continuous action value.
/// Returns false (instead of throwing an InvalidCastException, as the original
/// unguarded cast did) when <paramref name="obj"/> is null or not an
/// InvertedPendulumAction2.
/// </summary>
public bool Equals(ReinforcementLearningAction obj)
{
    InvertedPendulumAction2 other = obj as InvertedPendulumAction2;
    // Exact float comparison is intentional: two actions are "equal" only when
    // they carry the identical stored value.
    return other != null && other.action == action;
}
/// <summary>
/// Exhaustively scans the three discrete actions (-1, 0, +1) and returns the
/// one with the highest stored Q-value for <paramref name="state"/>.
/// Ties involving the neutral action (table row 1) resolve to action 0.
/// </summary>
/// <param name="state">State whose table indices are resolved via GetStateIndices.</param>
/// <param name="action">Best (or tie-broken neutral) action.</param>
/// <param name="retval">Maximum Q-value found.</param>
public void GetBestActionAndUtilityForState(ReinforcementLearningState state, out ReinforcementLearningAction action, out float retval)
{
    int sj;
    int sk;
    GetStateIndices((InvertedPendulumState)state, out sj, out sk);

    float bestQ = float.MinValue;
    int bestAct = 0;
    for (int a = -1; a <= 1; ++a)
    {
        int actIdx;
        GetActionIndices(new InvertedPendulumAction(a), out actIdx);
        if (value[actIdx, sj, sk] > bestQ)
        {
            bestQ = value[actIdx, sj, sk];
            bestAct = a;
        }
    }

    // Tie-break: if the neutral action's entry (row 1) matches the maximum,
    // prefer doing nothing over pushing either way.
    if (value[1, sj, sk] == bestQ)
        bestAct = 0;

    action = new InvertedPendulumAction(bestAct);
    retval = bestQ;
}
/// <summary>
/// Shifts the stored Q-value for the given state/action pair by
/// <paramref name="delta"/>. (An earlier, commented-out variant spread the
/// update over neighboring cells with a Gaussian kernel; the current behavior
/// is a plain single-cell update.)
/// </summary>
/// <param name="state">State half of the table key.</param>
/// <param name="action">Action half of the table key.</param>
/// <param name="delta">Amount added to the stored value (may be negative).</param>
public void ModifyValue(ReinforcementLearningState state, ReinforcementLearningAction action, float delta)
{
    int actIdx;
    int sj;
    int sk;
    GetIndices((InvertedPendulumState)state, (InvertedPendulumAction)action, out actIdx, out sj, out sk);
    value[actIdx, sj, sk] += delta;
}
/// <summary>
/// Returns the stored Q-value for the given state/action pair by resolving
/// the pair to its table indices and reading that cell.
/// </summary>
public float Evaluate(ReinforcementLearningState state, ReinforcementLearningAction action)
{
    int actIdx;
    int sj;
    int sk;
    GetIndices((InvertedPendulumState)state, (InvertedPendulumAction)action, out actIdx, out sj, out sk);
    return value[actIdx, sj, sk];
}
/// <summary>
/// Value equality on the steering angle.
/// Returns false (instead of throwing an InvalidCastException, as the original
/// unguarded cast did) when <paramref name="obj"/> is null or not a
/// CarNavigationAction.
/// </summary>
public bool Equals(ReinforcementLearningAction obj)
{
    CarNavigationAction other = obj as CarNavigationAction;
    return other != null && other.ang == ang;
}
/// <summary>
/// Scans all LENACTION discrete actions and returns the one with the highest
/// stored Q-value for <paramref name="state"/>. When the middle action's entry
/// ties with the maximum, the middle action is preferred.
/// </summary>
/// <param name="state">State whose table indices are resolved via GetStateIndices.</param>
/// <param name="action">Best (or tie-broken middle) action, built via GetAction.</param>
/// <param name="retval">Maximum Q-value found.</param>
public void GetBestActionAndUtilityForState(ReinforcementLearningState state, out ReinforcementLearningAction action, out float retval)
{
    int sj;
    int sk;
    int sl;
    GetStateIndices((CarNavigationState)state, out sj, out sk, out sl);

    float bestQ = float.MinValue;
    int bestIdx = 0;
    for (int a = 0; a < LENACTION; ++a)
    {
        float q = value[a, sj, sk, sl];
        if (q > bestQ)
        {
            bestQ = q;
            bestIdx = a;
        }
    }

    // Tie-break toward the middle action index (hoisted: the original computed
    // Math.Ceiling(LENACTION / 2.0) twice).
    int middle = (int)Math.Ceiling(LENACTION / 2.0);
    if (value[middle, sj, sk, sl] == bestQ)
        bestIdx = middle;

    CarNavigationAction bestAction;
    GetAction(bestIdx, out bestAction);
    action = bestAction;
    retval = bestQ;
}
/// <summary>
/// Returns the stored Q-value for the given state/action pair by resolving
/// the pair to its four table indices and reading that cell.
/// </summary>
public float Evaluate(ReinforcementLearningState state, ReinforcementLearningAction action)
{
    int actIdx;
    int sj;
    int sk;
    int sl;
    GetIndices((CarNavigationState)state, (CarNavigationAction)action, out actIdx, out sj, out sk, out sl);
    return value[actIdx, sj, sk, sl];
}
/// <summary>
/// Fills <paramref name="probabilities"/> with one selection probability per
/// action available in <paramref name="state"/>, judged via
/// <paramref name="qFunction"/>. Implementations define the exploration
/// strategy (e.g. epsilon-greedy or softmax elsewhere in this file).
/// </summary>
/// <param name="state">State whose action set is being weighted.</param>
/// <param name="qFunction">Q-store consulted for action utilities.</param>
/// <param name="actions">Receives the available actions.</param>
/// <param name="probabilities">Receives one probability per action, index-aligned with <paramref name="actions"/>.</param>
public abstract void ActionProbabilities(ReinforcementLearningState state, ReinforcementLearningQStore qFunction, out ReinforcementLearningAction[] actions, out float[] probabilities);
/// <summary>
/// Resets the remembered transition so that the first Step of the new episode
/// performs no value update (Step skips the update while either is null).
/// </summary>
public void EpisodeBegin()
{
    prevState = null;
    prevAction = null;
}
/// <summary>
/// Performs one Q-learning step: observes the current state and reward, picks
/// the next action from <paramref name="policy"/>, and applies the off-policy
/// TD update Q(s,a) += Alpha * (r + Discount * max_a' Q(s',a') - Q(s,a)) to
/// the previous transition.
/// </summary>
/// <param name="env">Environment supplying the current state and reward.</param>
/// <param name="policy">Policy used to choose the next action.</param>
/// <returns>The action chosen for the current state.</returns>
/// <exception cref="InvalidOperationException">The computed update is NaN.</exception>
public virtual ReinforcementLearningAction Step(ReinforcementLearningEnvironment env, ReinforcementLearningPolicy policy)
{
    ReinforcementLearningState state = env.State();
    float reward = env.Reward();
    ReinforcementLearningAction action = policy.ActionForState(state, qFunction);
    // No update on the first step of an episode: there is no previous transition yet.
    if ((prevState != null) && (prevAction != null))
    {
        ReinforcementLearningAction bestAction;
        float Qtp1max;
        // Off-policy bootstrap: value of the best action in the new state,
        // regardless of what the policy will actually pick.
        qFunction.GetBestActionAndUtilityForState(state, out bestAction, out Qtp1max);
        float Qt = qFunction.Evaluate(prevState, prevAction);
        float deltaQ = Alpha * (reward + Discount * Qtp1max - Qt);
        if (float.IsNaN(deltaQ))
            // Specific exception + message instead of the original bare `new Exception()`.
            throw new InvalidOperationException("Q-learning update produced NaN (check reward, Alpha, Discount, and stored Q-values).");
        qFunction.ModifyValue(prevState, prevAction, deltaQ);
    }
    prevAction = action;
    prevState = state;
    return action;
}
/// <summary>
/// Softmax (Boltzmann) selection: p_i is proportional to exp(Q_i / Temperature).
/// Q-values are shifted by their maximum before exponentiation for numerical
/// stability; the result is normalized to sum to 1.
/// </summary>
/// <param name="state">State whose action set is being weighted.</param>
/// <param name="qFunction">Q-store supplying the action utilities.</param>
/// <param name="actions">Receives the available actions.</param>
/// <param name="probabilities">Receives the normalized softmax distribution.</param>
public override void ActionProbabilities(ReinforcementLearningState state, ReinforcementLearningQStore qFunction, out ReinforcementLearningAction[] actions, out float[] probabilities)
{
    actions = state.GetActions();
    int n = actions.Length;
    probabilities = new float[n];

    // Pass 1: cache raw Q-values and find the maximum.
    float maxQ = float.MinValue;
    for (int idx = 0; idx < n; ++idx)
    {
        float q = qFunction.Evaluate(state, actions[idx]);
        probabilities[idx] = q;
        if (q > maxQ)
            maxQ = q;
    }

    // Pass 2: exponentiate the shifted values and accumulate the normalizer.
    // The maximal entry contributes exp(0) = 1, so total >= 1 (no divide-by-zero).
    float total = 0;
    for (int idx = 0; idx < n; ++idx)
    {
        probabilities[idx] = (float)Math.Exp((probabilities[idx] - maxQ) / Temperature);
        total += probabilities[idx];
    }

    // Pass 3: normalize into a probability distribution.
    for (int idx = 0; idx < n; ++idx)
    {
        probabilities[idx] /= total;
    }
}
/// <summary>
/// Not supported by this policy; always throws <see cref="NotImplementedException"/>.
/// </summary>
public override void ActionProbabilities(ReinforcementLearningState state, ReinforcementLearningQStore qFunction, out ReinforcementLearningAction[] actions, out float[] probabilities) { throw new NotImplementedException(); }
/// <summary>
/// Not supported by this Q-store; always throws <see cref="NotImplementedException"/>.
/// </summary>
public void ModifyValue(ReinforcementLearningState state, ReinforcementLearningAction action, float delta) { throw new NotImplementedException(); }
/// <summary>
/// Estimates the Q-value for a state/action pair by querying the IGMN with the
/// vector (state.a, state.w, action.action, 0) and reading the first element
/// of the recalled output.
/// </summary>
public float Evaluate(ReinforcementLearningState state, ReinforcementLearningAction action)
{
    InvertedPendulumState s = (InvertedPendulumState)state;
    InvertedPendulumAction2 a = (InvertedPendulumAction2)action;
    // Last component is a placeholder slot for the value the network recalls.
    double[] query = new double[] { s.a, s.w, a.action, 0 };
    return (float)igmn.Recall(new Vector(query)).Elements[0];
}