//Update traces -- Q-learning (Peng's Q(λ))
private bool updateQTraces(Observation obs, Monopoly.RLClasses.Action a, double reward)
{
    bool found = false;

    //Since the state space is huge, we use a similarity function to decide whether two states are close enough to count as the same
    for (int i = 0; i < traces.Count; i++)
    {
        if (checkStateSimilarity(obs, traces[i].observation) && (!a.action.Equals(traces[i].action.action)))
        {
            //Similar state but a different action: cut the trace and remove it
            traces[i].value = 0;
            traces.RemoveAt(i);
            i--;
        }
        else if (checkStateSimilarity(obs, traces[i].observation) && (a.action.Equals(traces[i].action.action)))
        {
            //Similar state and the same action: this is the visited pair, so reset its eligibility to 1
            found = true;
            traces[i].value = 1;

            //Q[t](s,a)
            double qT = network.Run(createInput(traces[i].observation, traces[i].action.action))[0];

            //max Q[t](s[t+1], a)
            int act = findMaxValues(calculateQValues(obs));
            double maxQt = network.Run(createInput(obs, act))[0];

            //max Q[t](s[t], a)
            act = findMaxValues(calculateQValues(lastState));
            double maxQ = network.Run(createInput(lastState, act))[0];

            //Q[t+1](s,a) = Q[t](s,a) + alpha * traces[i].value * (reward + gamma * maxQ[t](s[t+1],a) - maxQ[t](s[t],a))
            double qVal = qT + alpha * traces[i].value * (reward + gamma * maxQt - maxQ);

            trainNeural(createInput(traces[i].observation, traces[i].action.action), qVal);
        }
        else
        {
            //Any other trace decays by gamma * lambda
            traces[i].value = gamma * lamda * traces[i].value;

            //Q[t](s,a)
            double qT = network.Run(createInput(traces[i].observation, traces[i].action.action))[0];

            //max Q[t](s[t+1], a)
            int act = findMaxValues(calculateQValues(obs));
            double maxQt = network.Run(createInput(obs, act))[0];

            //max Q[t](s[t], a)
            act = findMaxValues(calculateQValues(lastState));
            double maxQ = network.Run(createInput(lastState, act))[0];

            //Q[t+1](s,a) = Q[t](s,a) + alpha * traces[i].value * (reward + gamma * maxQ[t](s[t+1],a) - maxQ[t](s[t],a))
            double qVal = qT + alpha * traces[i].value * (reward + gamma * maxQt - maxQ);

            trainNeural(createInput(traces[i].observation, traces[i].action.action), qVal);
        }
    }
    return found;
}
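For reference, updateQTraces assumes each element of traces carries the visited observation, the action taken, and an eligibility value. A minimal sketch of such a container is shown below; the class and field comments are illustrative assumptions, not taken from the original project:

//Minimal sketch of the eligibility-trace entry assumed above (names are illustrative)
public class TraceEntry
{
    public Observation observation;          //observation (state) in which the action was taken
    public Monopoly.RLClasses.Action action; //action taken in that state
    public double value;                     //eligibility value, reset to 1 on a revisit and decayed by gamma * lambda otherwise
}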
//Sarsa algorithm
private double Sarsa(Observation lastState, Monopoly.RLClasses.Action lastAction, Observation newState, Monopoly.RLClasses.Action newAction, double reward)
{
    //Run the network for the last state and last action
    double QValue = network.Run(createInput(lastState, lastAction.action)).First();
    double previousQ = QValue;

    //Run the network for the new state and the action actually taken in it (SARSA is on-policy)
    double newQ = network.Run(createInput(newState, newAction.action)).First();

    //Q(s,a) = Q(s,a) + alpha * (reward + gamma * Q(s',a') - Q(s,a))
    QValue += alpha * (reward + gamma * newQ - previousQ);

    return QValue;
}
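A rough sketch of how this SARSA update might be wired into the agent's per-step logic, assuming the createInput, trainNeural, lastState and lastAction members used in the snippets above (the surrounding method name is hypothetical):

//Hypothetical call site: after observing the reward and choosing the next action on-policy,
//compute the SARSA target and train the network towards it for the previous state-action pair.
private void sarsaStep(Observation newState, Monopoly.RLClasses.Action newAction, double reward)
{
    double target = Sarsa(lastState, lastAction, newState, newAction, reward);
    trainNeural(createInput(lastState, lastAction.action), target);

    //The new state-action pair becomes the "last" pair for the next step
    lastState = newState;
    lastAction = newAction;
}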
//Update traces -- Sarsa (currently a stub: no trace bookkeeping, always reports no match)
private bool updateSTraces(Observation obs, Monopoly.RLClasses.Action a)
{
    return false;
}
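If eligibility traces were also added for SARSA, a SARSA(λ)-style update could mirror updateQTraces, except that traces are not cut when a different action is chosen, since SARSA evaluates the policy actually being followed. The sketch below is an assumption about how that might look with the same helpers, not the original implementation:

//Hypothetical SARSA(λ)-style trace update (assumption, not part of the original code):
//reset the trace of the visited state-action pair to 1 and decay all other traces.
private bool updateSTracesSketch(Observation obs, Monopoly.RLClasses.Action a)
{
    bool found = false;
    for (int i = 0; i < traces.Count; i++)
    {
        if (checkStateSimilarity(obs, traces[i].observation) && a.action.Equals(traces[i].action.action))
        {
            found = true;
            traces[i].value = 1;              //replacing traces: reset the visited pair
        }
        else
        {
            traces[i].value *= gamma * lamda; //decay every other trace without cutting it
        }
    }
    return found;
}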