/// <summary>
/// Called after Step() once the environment is resolved. Records each actor's latest
/// transition, and at episode end (or when the step horizon is reached) processes the
/// episode history for training.
/// </summary>
/// <param name="environment">The resolved environment to read rewards and termination from.</param>
/// <returns>Whether the environment should be reset.</returns>
public bool Record(IRLEnvironment environment)
{
    Debug.Assert(environment.IsResolved());
    bool isEnd = environment.IsEnd();
    for (int i = 0; i < NumberOfActor; ++i)
    {
        // Fix: use the per-actor reward. The previous code called LastReward() with no
        // index, recording the same reward for every actor — inconsistent with the DQN
        // trainer's Record(), which uses LastReward(i).
        float reward = environment.LastReward(i);
        AddHistory(LastState[i], reward, LastAction[i], LastActionProbs[i], LastValue[i], i);
    }

    if (isEnd || environment.CurrentStep() >= MaxStepHorizon)
    {
        // Bootstrap values: zero for terminal states (new float[] is already
        // zero-initialized, so no explicit fill is needed); otherwise estimate V(s')
        // from the model for the truncated episode.
        float[] nextValues = new float[NumberOfActor];
        if (!isEnd)
        {
            nextValues = Model.EvaluateValue(environment.CurrentState());
        }
        for (int i = 0; i < NumberOfActor; ++i)
        {
            ProcessEpisodeHistory(nextValues[i], i);
        }
        return true;
    }
    return false;
}
/// <summary>
/// Steps the environment once for training: batches every actor's current state,
/// evaluates the policy (continuous or discrete) and the value function, caches the
/// results for the subsequent Record() call, then advances the environment.
/// </summary>
/// <param name="environment">The environment to step.</param>
public void Step(IRLEnvironment environment)
{
    float[][] actions = new float[NumberOfActor][];

    // Gather all actors' states into one flat array so the network runs one batched pass.
    float[] statesAll = new float[NumberOfActor * Model.StateSize];
    for (int i = 0; i < NumberOfActor; ++i)
    {
        var states = environment.CurrentState(i).CopyToArray();
        LastState[i] = states;
        Array.Copy(states, 0, statesAll, i * Model.StateSize, Model.StateSize);
    }

    if (Model.IsActionContinuous)
    {
        // Continuous policy: each actor gets ActionSize floats plus matching probabilities.
        float[] actionProbs = null;
        float[] tempAction = Model.EvaluateActionContinuous(statesAll, out actionProbs);
        for (int i = 0; i < NumberOfActor; ++i)
        {
            actions[i] = new float[Model.ActionSize];
            Array.Copy(tempAction, i * Model.ActionSize, actions[i], 0, Model.ActionSize);
            LastAction[i] = actions[i];
            LastActionProbs[i] = new float[Model.ActionSize];
            Array.Copy(actionProbs, i * Model.ActionSize, LastActionProbs[i], 0, Model.ActionSize);
        }
    }
    else
    {
        // Discrete policy: one sampled action index per actor.
        float[] actionProbs = null;
        int[] tempAction = Model.EvaluateActionDiscrete(statesAll, out actionProbs, true);
        for (int i = 0; i < NumberOfActor; ++i)
        {
            actions[i] = new float[] { tempAction[i] };
            LastAction[i] = actions[i];
            LastActionProbs[i] = new float[] { actionProbs[i] };
        }
    }

    // Fix: evaluate the value network once for the whole batch. The previous code called
    // Model.EvaluateValue(statesAll) inside the loop, repeating the identical batched
    // evaluation NumberOfActor times and discarding all but one element each time.
    float[] values = Model.EvaluateValue(statesAll);
    for (int i = 0; i < NumberOfActor; ++i)
    {
        LastValue[i] = values[i];
    }

    environment.Step(actions);
    Steps++;
}
/// <summary>
/// Called after Step() once the environment is resolved. Appends each actor's latest
/// transition to its history, and when the episode ends (or the step horizon is reached)
/// flushes every actor's history into its replay buffer.
/// </summary>
/// <param name="environment">The resolved environment to read rewards and termination from.</param>
/// <returns>Whether the environment should be reset.</returns>
public virtual bool Record(IRLEnvironment environment)
{
    Debug.Assert(environment.IsResolved());

    bool episodeEnded = environment.IsEnd();

    // Store this step's transition for every actor.
    for (int actor = 0; actor < NumberOfActor; ++actor)
    {
        AddHistory(actor, LastState[actor], environment.LastReward(actor), LastAction[actor], episodeEnded);
    }

    // Keep collecting unless the episode terminated or ran past the horizon.
    // (CurrentStep() is only queried when the episode has not already ended.)
    if (!episodeEnded && environment.CurrentStep() < MaxStepHorizon)
    {
        return false;
    }

    // Episode over (or truncated): move each actor's history into the replay buffer.
    for (int actor = 0; actor < NumberOfActor; ++actor)
    {
        UpdateReplayBuffer(actor);
    }
    return true;
}
/// <summary>
/// Steps the environment once for training using epsilon-greedy exploration: with
/// probability CurrentRandomChance every actor takes a uniformly random action,
/// otherwise each actor takes the greedy action returned by the model.
/// </summary>
/// <param name="environment">The environment to step.</param>
public void Step(IRLEnvironment environment)
{
    float[][] actions = new float[NumberOfActor][];

    // Gather all actors' states into one flat array for a single batched evaluation.
    float[] batchedStates = new float[NumberOfActor * Model.StateSize];
    for (int actor = 0; actor < NumberOfActor; ++actor)
    {
        var state = environment.CurrentState(actor).CopyToArray();
        LastState[actor] = state;
        Array.Copy(state, 0, batchedStates, actor * Model.StateSize, Model.StateSize);
    }

    // Epsilon-greedy: one roll decides explore-vs-exploit for all actors this step.
    bool explore = UnityEngine.Random.Range(0, 1.0f) < CurrentRandomChance;
    if (explore)
    {
        // Exploration: uniformly random discrete action per actor.
        for (int actor = 0; actor < NumberOfActor; ++actor)
        {
            actions[actor] = new float[] { UnityEngine.Random.Range(0, Model.ActionSize) };
            LastAction[actor] = Mathf.RoundToInt(actions[actor][0]);
        }
    }
    else
    {
        // Exploitation: greedy action from the model for each actor.
        float[] maxQs;
        int[] greedyActions = Model.EvaluateAction(batchedStates, out maxQs);
        for (int actor = 0; actor < NumberOfActor; ++actor)
        {
            actions[actor] = new float[] { greedyActions[actor] };
            LastAction[actor] = greedyActions[actor];
        }
    }

    environment.Step(actions);
    Steps++;
}