public void RunTraining()
{
    QMethod.Validate(this);

    /*
     * For each episode: select a random initial state.
     * Do while the goal state is not reached:
     *   Select one among all possible actions for the current state.
     *   Using this possible action, consider going to the next state.
     *   Get the maximum Q value of this next state based on all possible actions.
     *   Set the next state as the current state.
     */

    // For each episode
    var rand = new Random();
    long maxloopEventCount = 0;

    // Train episodes
    for (long i = 0; i < Episodes; i++)
    {
        long maxloop = 0;

        // Select random initial state
        int stateIndex = rand.Next(States.Count);
        QState state = States[stateIndex];
        QAction action = null;

        do
        {
            if (++maxloop > MaxExploreStepsWithinOneEpisode)
            {
                if (ShowWarning)
                {
                    string msg = string.Format(
                        "{0} !! MAXLOOP state: {1} action: {2}, " +
                        "maybe your path setup is wrong or the end state is too difficult to reach?",
                        ++maxloopEventCount, state, action);
                    QMethod.Log(msg);
                }
                break;
            }

            // No actions: skip this state
            if (state.Actions.Count == 0)
            {
                break;
            }

            // Selection strategy: pick one of the state's actions uniformly at random
            int index = rand.Next(state.Actions.Count);
            action = state.Actions[index];

            // Using this possible action, consider going to the next state:
            // pick a random action outcome, weighted by its probability
            QActionResult nextStateResult = action.PickActionByProbability();
            string nextStateName = nextStateResult.StateName;

            double q = nextStateResult.QEstimated;
            double r = nextStateResult.Reward;
            double maxQ = MaxQ(nextStateName);

            // Q(s,a) = Q(s,a) + alpha * (R(s,a) + gamma * max(Q(s',a')) - Q(s,a))
            double value = q + Alpha * (r + Gamma * maxQ - q); // Q-learning update
            nextStateResult.QValue = value;                    // update estimate

            // If this is an end state, go to the next episode
            if (EndStates.Contains(nextStateResult.StateName))
            {
                break;
            }

            // Set the next state as the current state
            state = StateLookup[nextStateResult.StateName];

        } while (true);
    }
}
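The assignment to nextStateResult.QValue above is the standard one-step Q-learning update, Q(s,a) = Q(s,a) + alpha * (R(s,a) + gamma * max Q(s',a') - Q(s,a)). As a quick sanity check, here is the same arithmetic in isolation; the numbers are made up for illustration and are not defaults of the class above.

using System;

class QUpdateDemo
{
    static void Main()
    {
        // Standalone illustration of the Q-learning update used in RunTraining.
        // alpha/gamma are typical values; q, r and maxQ are invented sample numbers.
        double alpha = 0.1, gamma = 0.9;
        double q = 0.5;    // current estimate Q(s,a)
        double r = 1.0;    // reward observed for taking action a in state s
        double maxQ = 2.0; // best estimated Q value over the next state's actions

        double updated = q + alpha * (r + gamma * maxQ - q);
        Console.WriteLine(updated); // prints 0.73
    }
}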
// Register one possible outcome of taking this action
public void AddActionResult(QActionResult actionResult)
{
    ActionsResult.Add(actionResult);
}
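How the states, actions, and outcomes are wired together is not shown in this section. The sketch below is only one plausible way to do it: the QLearning container class name, the StateName property on QState, and the object initializers are assumptions made for illustration, inferred from the members RunTraining uses.

// Hypothetical setup of a tiny two-state problem where "B" is the goal state.
// Treat any class or property name that does not appear above as a placeholder.
var toB = new QAction();
toB.AddActionResult(new QActionResult { StateName = "B", Reward = 1.0 });

var stateA = new QState { StateName = "A" };
stateA.Actions.Add(toB);
var stateB = new QState { StateName = "B" };

var learner = new QLearning
{
    Episodes = 1000,
    Alpha = 0.1,
    Gamma = 0.9,
    MaxExploreStepsWithinOneEpisode = 1000,
    ShowWarning = true
};
learner.States.Add(stateA);
learner.States.Add(stateB);
learner.EndStates.Add("B");

learner.RunTraining();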