// SARSA (on-policy TD control): the TD target bootstraps from the action
// that will actually be executed in the next state.
public void Step(double reward, int nextState)
{
    var nextAction = ExplorationPolicy.SelectAction(_q[nextState]);
    var target = reward + DiscountFactor * _q[nextState][nextAction];
    var delta = target - _q[CurrentState][SelectedAction];
    _q[CurrentState][SelectedAction] += LearningRate * delta;

    CurrentState = nextState;
    // On-policy: commit to the same action that was used in the bootstrap
    // target; re-sampling here could pick a different action and break the
    // SARSA update.
    SelectedAction = nextAction;
}
// Dyna-Q: learn from the real transition, then run planning sweeps over
// previously visited state-action pairs.
public void Step(double reward, int nextState)
{
    // Record that SelectedAction has been tried in CurrentState on every
    // visit, so the planning step can replay this pair later.
    if (!_visited.TryGetValue(CurrentState, out var actions))
    {
        actions = new HashSet<int>();
        _visited[CurrentState] = actions;
    }
    actions.Add(SelectedAction);

    UpdateQ(reward, nextState);
    Plan();

    CurrentState = nextState;
    SelectedAction = ExplorationPolicy.SelectAction(_q[CurrentState]);
}
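// A minimal sketch of what Plan() could look like; the body below is an
// assumption, not shown in this listing. It presumes a learned one-step
// model _model mapping (state, action) to the last observed
// (reward, nextState), a PlanningSteps count, a _random source, and
// System.Linq for ElementAt/ToArray/Max.
private void Plan()
{
    for (var n = 0; n < PlanningSteps; n++)
    {
        // Replay a previously visited state-action pair against the model.
        var state = _visited.Keys.ElementAt(_random.Next(_visited.Count));
        var actions = _visited[state].ToArray();
        var action = actions[_random.Next(actions.Length)];
        var (reward, nextState) = _model[(state, action)];

        // One-step Q-learning backup on the simulated transition.
        var bestNext = _q[nextState].Max();
        var delta = reward + DiscountFactor * bestNext - _q[state][action];
        _q[state][action] += LearningRate * delta;
    }
}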
// Q-learning (off-policy TD control): the TD target bootstraps from the
// greedy value of the next state, regardless of which action is taken next.
public void Step(double reward, int nextState)
{
    var bestNext = _q[nextState][0];
    for (var i = 1; i < ActionCount; i++)
    {
        if (_q[nextState][i] > bestNext)
        {
            bestNext = _q[nextState][i];
        }
    }

    var target = reward + DiscountFactor * bestNext;
    var delta = target - _q[CurrentState][SelectedAction];
    _q[CurrentState][SelectedAction] += LearningRate * delta;

    CurrentState = nextState;
    SelectedAction = ExplorationPolicy.SelectAction(_q[CurrentState]);
}
// Starts an episode: observe the initial state and pick the first action.
public void Begin(int state)
{
    CurrentState = state;
    SelectedAction = ExplorationPolicy.SelectAction(_q[CurrentState]);
}
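// A minimal training-loop sketch for the Begin/Step contract above.
// Environment and its Reset/Act/Done members are hypothetical stand-ins,
// as is TdAgent, a placeholder name for any of the agents in this listing.
static void Train(TdAgent agent, Environment env, int episodes)
{
    for (var episode = 0; episode < episodes; episode++)
    {
        agent.Begin(env.Reset());   // observe start state, pick first action
        while (!env.Done)
        {
            // Execute the agent's current action; the environment returns
            // the reward and the successor state, which drive the next update.
            var (reward, nextState) = env.Act(agent.SelectedAction);
            agent.Step(reward, nextState);
        }
    }
}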