/// <summary>
/// Epsilon-greedy action selection for the continuous pendulum task: with
/// probability EPSILON the Q-store's best action is returned (exploit),
/// otherwise a uniformly random force in [0, 1) is drawn (explore).
/// </summary>
/// <remarks>
/// NOTE(review): a new <see cref="Random"/> is constructed on every call;
/// time-seeded instances created in quick succession can yield correlated
/// sequences. Sibling policies use a shared field `r` — consider the same here.
/// </remarks>
public override ReinforcementLearningAction ActionForState(ReinforcementLearningState state, ReinforcementLearningQStore qFunction)
{
    Random rng = new Random();
    double draw = rng.NextDouble();
    if (draw <= EPSILON)
    {
        // Exploit: delegate to the Q-store for its best known action.
        ReinforcementLearningAction best;
        float bestUtility;
        qFunction.GetBestActionAndUtilityForState(state, out best, out bestUtility);
        return best;
    }
    // Explore: uniformly random action parameter in [0, 1).
    return new InvertedPendulumAction2((float)rng.NextDouble());
}
/// <summary>
/// Samples an action according to the distribution produced by
/// <see cref="ActionProbabilities"/> (roulette-wheel selection): a single
/// uniform draw is matched against the cumulative probabilities.
/// </summary>
public virtual ReinforcementLearningAction ActionForState(ReinforcementLearningState state, ReinforcementLearningQStore qFunction)
{
    ReinforcementLearningAction[] actions;
    float[] probabilities;
    ActionProbabilities(state, qFunction, out actions, out probabilities);

    // Walk the cumulative distribution until it covers the draw.
    float threshold = (float)r.NextDouble();
    float cumulative = 0;
    for (int idx = 0; idx < actions.Length; ++idx)
    {
        cumulative += probabilities[idx];
        if (cumulative >= threshold)
            return actions[idx];
    }
    // Rounding may leave the cumulative sum just below the draw: use the last action.
    return actions[actions.Length - 1];
}
/// <summary>
/// Builds the epsilon-greedy distribution over the actions available in
/// <paramref name="state"/>: the Q-store's best action receives probability
/// Epsilon, every other action an equal share of (1 - Epsilon).
/// </summary>
/// <param name="actions">All actions available in the state.</param>
/// <param name="probabilities">Per-action probabilities, parallel to <paramref name="actions"/>.</param>
public override void ActionProbabilities(ReinforcementLearningState state, ReinforcementLearningQStore qFunction, out ReinforcementLearningAction[] actions, out float[] probabilities)
{
    ReinforcementLearningAction greedy;
    float greedyUtility;
    actions = state.GetActions();
    probabilities = new float[actions.Length];
    qFunction.GetBestActionAndUtilityForState(state, out greedy, out greedyUtility);

    // NOTE(review): with a single available action the non-greedy share divides
    // by zero (float infinity); presumably states always offer >= 2 actions — confirm.
    for (int idx = 0; idx < actions.Length; ++idx)
    {
        probabilities[idx] = actions[idx].Equals(greedy)
            ? Epsilon
            : (1 - Epsilon) / (actions.Length - 1);
    }
}
/// <summary>
/// Approximates the best action for <paramref name="state"/> by random search:
/// evaluates 10 uniformly random actions in [-1, 1) and returns the arg-max
/// together with its Q-value.
/// </summary>
public void GetBestActionAndUtilityForState(ReinforcementLearningState state, out ReinforcementLearningAction action, out float value)
{
    InvertedPendulumAction2 best = null;
    float bestQ = float.MinValue;
    // Exactly 10 candidate draws from the shared RNG, as before.
    for (int sample = 0; sample < 10; ++sample)
    {
        InvertedPendulumAction2 candidate = new InvertedPendulumAction2((float)r.NextDouble() * 2 - 1);
        float q = Evaluate(state, candidate);
        if (q > bestQ)
        {
            bestQ = q;
            best = candidate;
        }
    }
    action = best;
    value = bestQ;
}
/// <summary>
/// Looks up the tabular Q-value stored for the given state/action pair.
/// </summary>
public float Evaluate(ReinforcementLearningState state, ReinforcementLearningAction action)
{
    int actionIdx;
    int stateIdxA;
    int stateIdxB;
    // GetIndices maps the pair onto the table's (action, state, state) coordinates.
    GetIndices((InvertedPendulumState)state, (InvertedPendulumAction)action, out actionIdx, out stateIdxA, out stateIdxB);
    return value[actionIdx, stateIdxA, stateIdxB];
}
/// <summary>
/// Returns the full discrete action set, enumerating all LENACTION actions on
/// first use and caching the array for later calls.
/// </summary>
/// <remarks>The <paramref name="state"/> argument is ignored; the action set is the same for every state.</remarks>
public ReinforcementLearningAction[] GetAllActionsInState(ReinforcementLearningState state)
{
    if (allActions == null)
    {
        // Build the cache once; GetAction maps each index to its action object.
        CarNavigationAction[] cache = new CarNavigationAction[LENACTION];
        for (int idx = 0; idx < LENACTION; ++idx)
        {
            GetAction(idx, out cache[idx]);
        }
        allActions = cache;
    }
    return allActions;
}
/// <summary>
/// Scans all LENACTION discrete actions for the given state and returns the
/// one with the highest tabular Q-value together with that value. When the
/// middle ("neutral") action ties with the maximum it is preferred.
/// </summary>
/// <remarks>
/// Fix: the tie-break index was computed as Ceiling(LENACTION / 2.0), which
/// for an odd LENACTION points one slot PAST the middle action (e.g. index 3
/// for LENACTION = 5, whose middle index is 2). Integer division LENACTION / 2
/// selects the true middle element, consistent with the pendulum store, which
/// prefers index 1 of its 3 actions on ties. Assumes the neutral action sits
/// at the middle index — confirm against GetAction's index mapping.
/// </remarks>
public void GetBestActionAndUtilityForState(ReinforcementLearningState state, out ReinforcementLearningAction action, out float retval)
{
    int j;
    int k;
    int l;
    GetStateIndices((CarNavigationState)state, out j, out k, out l);

    float max = float.MinValue;
    int maxact = 0;
    for (int i = 0; i < LENACTION; ++i)
    {
        if (value[i, j, k, l] > max)
        {
            max = value[i, j, k, l];
            maxact = i;
        }
    }

    // Prefer the middle action whenever its value ties with the maximum.
    int middle = LENACTION / 2;
    if (value[middle, j, k, l] == max)
        maxact = middle;

    CarNavigationAction retaction;
    GetAction(maxact, out retaction);
    action = retaction;
    retval = max;
}
/// <summary>
/// Looks up the tabular Q-value stored for the given state/action pair.
/// </summary>
public float Evaluate(ReinforcementLearningState state, ReinforcementLearningAction action)
{
    int actionIdx;
    int stateIdxA;
    int stateIdxB;
    int stateIdxC;
    // GetIndices maps the pair onto the 4-D table's coordinates.
    GetIndices((CarNavigationState)state, (CarNavigationAction)action, out actionIdx, out stateIdxA, out stateIdxB, out stateIdxC);
    return value[actionIdx, stateIdxA, stateIdxB, stateIdxC];
}
/// <summary>
/// Purely greedy policy: always returns the Q-store's best-valued action for
/// <paramref name="state"/>; no exploration. The utility reported alongside
/// the action is discarded.
/// </summary>
public new ReinforcementLearningAction ActionForState(ReinforcementLearningState state, ReinforcementLearningQStore qFunction)
{
    ReinforcementLearningAction best;
    float ignoredUtility;
    qFunction.GetBestActionAndUtilityForState(state, out best, out ignoredUtility);
    return best;
}
/// <summary>
/// Clears the learner's one-step memory at the start of an episode, so the
/// first Step() of the new episode performs no Q-update (there is no previous
/// transition to learn from).
/// </summary>
public void EpisodeBegin()
{
    prevState = null;
    prevAction = null;
}
/// <summary>
/// Performs one Q-learning step: selects an action via the policy, and — once
/// a previous state/action pair exists — applies the TD update
/// Q(s,a) += Alpha * (reward + Discount * max_a' Q(s',a') - Q(s,a)).
/// </summary>
/// <param name="env">Environment supplying the current state and the reward for the last transition.</param>
/// <param name="policy">Policy used to pick the next action from the Q-store.</param>
/// <returns>The action selected for the current state.</returns>
/// <exception cref="InvalidOperationException">Thrown when the TD error is NaN
/// (indicates a corrupted Q-table or an invalid reward).</exception>
public virtual ReinforcementLearningAction Step(ReinforcementLearningEnvironment env, ReinforcementLearningPolicy policy)
{
    ReinforcementLearningState state = env.State();
    float reward = env.Reward();
    ReinforcementLearningAction action = policy.ActionForState(state, qFunction);

    // Skip the update on the first step of an episode: no previous transition yet.
    if ((prevState != null) && (prevAction != null))
    {
        ReinforcementLearningAction bestAction;
        float Qtp1max;
        qFunction.GetBestActionAndUtilityForState(state, out bestAction, out Qtp1max);
        float Qt = qFunction.Evaluate(prevState, prevAction);
        float deltaQ = Alpha * (reward + Discount * Qtp1max - Qt);
        // Fix: was 'throw new Exception()' with no message; a specific, descriptive
        // exception (still catchable as Exception) makes the failure diagnosable.
        if (float.IsNaN(deltaQ))
            throw new InvalidOperationException("Q-learning update produced NaN (reward=" + reward + ", Qt=" + Qt + ", Qt+1max=" + Qtp1max + ").");
        qFunction.ModifyValue(prevState, prevAction, deltaQ);
    }

    prevAction = action;
    prevState = state;
    return action;
}
/// <summary>
/// Softmax (Boltzmann) action distribution: each action's probability is
/// proportional to exp(Q / Temperature). The maximum Q-value is subtracted
/// before exponentiation so the exponentials cannot overflow.
/// </summary>
/// <param name="actions">All actions available in the state.</param>
/// <param name="probabilities">Normalised probabilities, parallel to <paramref name="actions"/>.</param>
public override void ActionProbabilities(ReinforcementLearningState state, ReinforcementLearningQStore qFunction, out ReinforcementLearningAction[] actions, out float[] probabilities)
{
    actions = state.GetActions();
    int count = actions.Length;
    probabilities = new float[count];

    // Pass 1: stash raw Q-values and track their maximum.
    float maxQ = float.MinValue;
    for (int idx = 0; idx < count; ++idx)
    {
        float q = qFunction.Evaluate(state, actions[idx]);
        probabilities[idx] = q;
        if (q > maxQ)
            maxQ = q;
    }

    // Pass 2: shifted exponentials and their total.
    float total = 0;
    for (int idx = 0; idx < count; ++idx)
    {
        probabilities[idx] = (float)Math.Exp((probabilities[idx] - maxQ) / Temperature);
        total += probabilities[idx];
    }

    // Pass 3: normalise so the probabilities sum to 1.
    for (int idx = 0; idx < count; ++idx)
    {
        probabilities[idx] /= total;
    }
}
/// <summary>
/// Not supported by this policy: it exposes no explicit action distribution.
/// </summary>
/// <exception cref="NotImplementedException">Always thrown.</exception>
public override void ActionProbabilities(ReinforcementLearningState state, ReinforcementLearningQStore qFunction, out ReinforcementLearningAction[] actions, out float[] probabilities) { throw new NotImplementedException(); }
/// <summary>
/// Not supported by this Q-store: its values cannot be incrementally modified.
/// </summary>
/// <exception cref="NotImplementedException">Always thrown.</exception>
public void ModifyValue(ReinforcementLearningState state, ReinforcementLearningAction action, float delta) { throw new NotImplementedException(); }
/// <summary>
/// Adds <paramref name="delta"/> to the tabular Q-value of the given
/// state/action pair.
/// </summary>
/// <remarks>
/// A commented-out Gaussian neighbourhood-smoothing variant of this update
/// (spreading delta over nearby table cells) was removed here as dead code.
/// </remarks>
public void ModifyValue(ReinforcementLearningState state, ReinforcementLearningAction action, float delta)
{
    int actionIdx;
    int stateIdxA;
    int stateIdxB;
    GetIndices((InvertedPendulumState)state, (InvertedPendulumAction)action, out actionIdx, out stateIdxA, out stateIdxB);
    value[actionIdx, stateIdxA, stateIdxB] += delta;
}
/// <summary>
/// Computes this policy's probability distribution over the actions available
/// in <paramref name="state"/>, given the current Q-function.
/// </summary>
/// <param name="state">State whose action distribution is requested.</param>
/// <param name="qFunction">Q-store used to value the candidate actions.</param>
/// <param name="actions">All actions available in the state.</param>
/// <param name="probabilities">Per-action probabilities, parallel to <paramref name="actions"/>.</param>
public abstract void ActionProbabilities(ReinforcementLearningState state, ReinforcementLearningQStore qFunction, out ReinforcementLearningAction[] actions, out float[] probabilities);
/// <summary>
/// Returns the best of the three discrete pendulum actions (-1, 0, +1) for the
/// given state, plus its tabular Q-value. When the neutral action (0) ties
/// with the maximum it is preferred.
/// </summary>
/// <remarks>
/// Fix: the tie-break previously read value[1, j, k], hard-coding the
/// assumption that action 0 maps to table index 1. The index is now obtained
/// from GetActionIndices so the action-to-index mapping lives in one place.
/// </remarks>
public void GetBestActionAndUtilityForState(ReinforcementLearningState state, out ReinforcementLearningAction action, out float retval)
{
    int j;
    int k;
    GetStateIndices((InvertedPendulumState)state, out j, out k);

    float max = float.MinValue;
    int maxact = 0;
    for (int act = -1; act <= 1; ++act)
    {
        int i;
        GetActionIndices(new InvertedPendulumAction(act), out i);
        if (value[i, j, k] > max)
        {
            max = value[i, j, k];
            maxact = act;
        }
    }

    // Prefer the neutral action whenever its value ties with the maximum.
    int neutralIdx;
    GetActionIndices(new InvertedPendulumAction(0), out neutralIdx);
    if (value[neutralIdx, j, k] == max)
        maxact = 0;

    action = new InvertedPendulumAction(maxact);
    retval = max;
}
/// <summary>
/// Evaluates the Q-function approximated by the IGMN: recalls the network at
/// (state.a, state.w, action, 0) and returns the first output element.
/// </summary>
/// <remarks>
/// NOTE(review): s.a / s.w are presumably the pendulum angle and angular
/// velocity — confirm against InvertedPendulumState.
/// </remarks>
public float Evaluate(ReinforcementLearningState state, ReinforcementLearningAction action)
{
    InvertedPendulumState s = (InvertedPendulumState)state;
    InvertedPendulumAction2 a = (InvertedPendulumAction2)action;
    double[] query = new double[] { s.a, s.w, a.action, 0 };
    return (float)igmn.Recall(new Vector(query)).Elements[0];
}