//Set info about agent (full details)
public void setDetailedInfo(string agentName, Observation obs, int[] action, int position)
{
    string info = Environment.NewLine + "---------------------------------"
        + Environment.NewLine + agentName + " is next. Currently on position: " + position.ToString();
    info += Environment.NewLine + "Observation received: " + obs.printInfo();

    info += Environment.NewLine + "Action selected: ";
    for (int i = 0; i < action.Length; i++)
    {
        info += action[i].ToString() + ",";
    }
    //String.Remove returns a new string, so the result must be assigned back
    //to drop the trailing comma
    info = info.Remove(info.Length - 1);

    MainWindow mw = (MainWindow)Application.Current.MainWindow;
    mw.obsInfo.Text = info;
    // MessageBox.Show(info);
}
//Receive an observation and a reward and send an action back to the environment
public virtual int agent_step(Observation obs, double reward)
{
    return 0;
}
//Receive the first observation of the game
//No reward is expected at this point
//Send an action back to the environment
public virtual int agent_start(Observation obs)
{
    return 0;
}
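//For context, the two virtual methods above imply the standard reinforcement-learning
//episode loop. A minimal sketch of how an environment might drive the agent;
//'env', 'execute' and 'gameOver' are assumed names for illustration, not part of the original code.
Observation obs = env.createObservation();
int action = agent.agent_start(obs); //first move: observation in, no reward yet
while (!gameOver)
{
    double reward = env.execute(action); //apply the action and collect the reward (assumed helper)
    obs = env.createObservation();
    action = agent.agent_step(obs, reward); //subsequent moves: observation + reward in, action out
}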
public EligibilityTrace(Observation o, Action a, double v)
{
    this.observation = o;
    this.action = a;
    this.value = v;
}
//Update traces -- Sarsa
private bool updateSTraces(Observation obs, Monopoly.RLClasses.Action a)
{
    return false;
}
//Change the agent's current observation based on what the agent receives
public void agent_changeCurrentState(Observation obs)
{
    this.lastState = obs;
}
//Sarsa algorithm
private double Sarsa(Observation lastState, Monopoly.RLClasses.Action lastAction, Observation newState, Monopoly.RLClasses.Action newAction, double reward)
{
    //Run the network for the last state and last action
    double previousQ = network.Run(createInput(lastState, lastAction.action)).First();

    //Run the network for the new state and the action actually selected (on-policy)
    double newQ = network.Run(createInput(newState, newAction.action)).First();

    //Sarsa update: Q(s,a) += alpha * (r + gamma * Q(s',a') - Q(s,a))
    double QValue = previousQ + alpha * (reward + gamma * newQ - previousQ);
    return QValue;
}
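//agent_step further down also calls a Qlearning(...) helper that is not included in this
//section. A minimal sketch, assuming it mirrors the Sarsa method above but bootstraps from
//the greedy action (agent_step already passes findMaxValues(QValues) as newAction):
private double Qlearning(Observation lastState, Monopoly.RLClasses.Action lastAction, Observation newState, Monopoly.RLClasses.Action newAction, double reward)
{
    //Q(s,a) for the last state-action pair
    double previousQ = network.Run(createInput(lastState, lastAction.action)).First();

    //Q(s',a*) for the new state and the greedy action
    double maxQ = network.Run(createInput(newState, newAction.action)).First();

    //Q-learning update: Q(s,a) += alpha * (r + gamma * max_a' Q(s',a') - Q(s,a))
    return previousQ + alpha * (reward + gamma * maxQ - previousQ);
}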
//Update traces -- Q-learning (Peng's Q(λ))
private bool updateQTraces(Observation obs, Monopoly.RLClasses.Action a, double reward)
{
    bool found = false;

    //Since the state space is huge, a similarity function decides whether
    //two states are close enough to be treated as the same
    for (int i = 0; i < traces.Count; i++)
    {
        if (checkStateSimilarity(obs, traces[i].observation) && !a.action.Equals(traces[i].action.action))
        {
            //Similar state but different action: cut the trace off
            traces[i].value = 0;
            traces.RemoveAt(i);
            i--;
        }
        else
        {
            if (checkStateSimilarity(obs, traces[i].observation) && a.action.Equals(traces[i].action.action))
            {
                //Similar state and same action: reset the trace
                found = true;
                traces[i].value = 1;
            }
            else
            {
                //Unrelated trace: decay it
                traces[i].value = gamma * lamda * traces[i].value;
            }

            //Q[t](s,a)
            double qT = network.Run(createInput(traces[i].observation, traces[i].action.action))[0];

            //maxQ[t](s[t+1],a)
            int act = findMaxValues(calculateQValues(obs));
            double maxQt = network.Run(createInput(obs, act))[0];

            //maxQ[t](s[t],a)
            act = findMaxValues(calculateQValues(lastState));
            double maxQ = network.Run(createInput(lastState, act))[0];

            //Q[t+1](s,a) = Q[t](s,a) + alpha * traces[i].value * (reward + gamma * maxQ[t](s[t+1],a) - maxQ[t](s[t],a))
            double qVal = qT + alpha * traces[i].value * (reward + gamma * maxQt - maxQ);
            trainNeural(createInput(traces[i].observation, traces[i].action.action), qVal);
        }
    }
    return found;
}
//Calculate the network's output for each of the three possible actions
private double[] calculateQValues(Observation obs)
{
    double[] tempQ = new double[3];
    for (int i = 0; i < tempQ.Length; i++)
    {
        //Run the network for the given observation and action i - 1
        //(actions are encoded as -1, 0, 1)
        double[] input = createInput(obs, i - 1);
        tempQ[i] = network.Run(input)[0];
    }
    return tempQ;
}
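//e_greedySelection and findMaxValues, used by agent_step and agent_start below, are not
//shown in this section. Minimal sketches, assuming actions are encoded as -1, 0, 1
//(matching the i - 1 encoding above) and that 'rand' is an assumed shared System.Random field:
private int findMaxValues(double[] QValues)
{
    //Return the action whose Q-value is highest, shifting the 0..2 index back to -1..1
    int best = 0;
    for (int i = 1; i < QValues.Length; i++)
        if (QValues[i] > QValues[best])
            best = i;
    return best - 1;
}

private int e_greedySelection(double[] QValues)
{
    //With probability epsilon pick a random action, otherwise exploit the greedy one
    if (rand.NextDouble() < epsilon)
        return rand.Next(3) - 1; //random action in {-1, 0, 1}
    return findMaxValues(QValues);
}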
//Calculate the similarity of two states
private bool checkStateSimilarity(Observation obs1, Observation obs2)
{
    bool similar = true;

    //Check money similarity
    double moneyDif = Math.Abs(obs1.finance.relativeAssets - obs2.finance.relativeAssets)
        + Math.Abs(obs1.finance.relativePlayersMoney - obs2.finance.relativePlayersMoney);
    if (moneyDif >= 0.1)
        similar = false;

    //Check area similarity
    if (!obs1.position.relativePlayersArea.Equals(obs2.position.relativePlayersArea))
        similar = false;

    double countDif = 0;
    for (int i = 0; i < obs1.area.gameGroupInfo.GetLength(0); i++)
    {
        if (!similar)
            break;

        countDif = 0;
        for (int j = 0; j < obs1.area.gameGroupInfo.GetLength(1); j++)
        {
            if (!obs1.area.gameGroupInfo[i, j].Equals(obs2.area.gameGroupInfo[i, j]))
            {
                countDif += Math.Abs(obs1.area.gameGroupInfo[i, j] - obs2.area.gameGroupInfo[i, j]);
                if (countDif >= 0.1)
                {
                    similar = false;
                    break;
                }
            }
        }
    }
    return similar;
}
//Initialize local parameters for a new game
public void initParams()
{
    if (this.policyFrozen)
    {
        this.alpha = 0;
        this.epsilon = 0;
        this.lamda = 0;
        this.gamma = 0;
    }

    //numberOfProperties = 28
    base.propertiesPurchased = new int[28];
    base.mortgagedProperties = new int[28];
    base.buildingsBuilt = new int[28];

    this.agent_changeCurrentState(new Observation());

    //Initialize arrays
    for (int i = 0; i < 28; i++)
    {
        propertiesPurchased[i] = 0;
        mortgagedProperties[i] = 0;
        buildingsBuilt[i] = 0;
    }

    this.isAlive = true;
    base.inJail = false;
    base.money = 1500;
    base.position = 0;

    lastAction = 0;
    lastState = new Observation();
    traces = new List<EligibilityTrace>();
}
//Create the input vector for the neural network
public double[] createInput(Observation observation, int action)
{
    List<double> input = new List<double>();

    //Add the action, normalized from {-1, 0, 1} to {1/3, 2/3, 1}
    input.Add(((double)(action + 2)) / 3);

    //Add every variable of the observation to the input list
    for (int k = 0; k < observation.area.gameGroupInfo.GetLength(0); k++)
    {
        for (int kk = 0; kk < observation.area.gameGroupInfo.GetLength(1); kk++)
            input.Add(observation.area.gameGroupInfo[k, kk]);
    }
    input.Add(observation.finance.relativeAssets);
    input.Add(observation.finance.relativePlayersMoney);
    input.Add(observation.position.relativePlayersArea);

    //Return the input array
    return input.ToArray();
}
//Receive an observation and a reward from the environment and send back the appropriate action
public override int agent_step(Observation observation, double reward)
{
    int action = 0;

    //If this isn't a random agent, calculate the Q-values for every possible action
    if (!agentType.Equals('r'))
    {
        //Calculate Q-values
        double[] QValues = calculateQValues(observation);

        //Select an action
        action = e_greedySelection(QValues);

        //If the policy of the agent isn't frozen, train the neural network
        if (!policyFrozen)
        {
            //If the agent is learning, update its Q-value for the selected action
            double QValue = 0;
            bool exists = false;

            //Calculate the Q-value using either the Q-learning or the Sarsa algorithm
            if (this.agentType.Equals('q'))
            {
                exists = updateQTraces(observation, new Monopoly.RLClasses.Action(action), reward);
                QValue = Qlearning(lastState, new Monopoly.RLClasses.Action(lastAction), observation, new Monopoly.RLClasses.Action(findMaxValues(QValues)), reward);
            }
            else
            {
                exists = updateSTraces(observation, new Monopoly.RLClasses.Action(action));
                QValue = Sarsa(lastState, new Monopoly.RLClasses.Action(lastAction), observation, new Monopoly.RLClasses.Action(action), reward);
            }

            trainNeural(createInput(lastState, lastAction), QValue);

            //Add a trace to the list if no matching one was found
            if (!exists)
                traces.Add(new EligibilityTrace(lastState, new RLClasses.Action(lastAction), 1));
        }

        //Update local values
        lastAction = action;
        lastState = observation;

        return action;
    }
    //Random agent: pick a random action
    else
    {
        return randomAction();
    }
}
//First action of the agent, where no reward is to be expected from the environment
public override int agent_start(Observation observation)
{
    //Increase the currentEpoch parameter (used only in the neural-network training)
    currentEpoch++;

    //Initialize the agent's parameters
    initParams();

    int action = 0;
    if (!agentType.Equals('r'))
    {
        //Calculate Q-values
        double[] QValues = calculateQValues(observation);

        //Select the final action based on the ε-greedy algorithm
        action = e_greedySelection(QValues);

        //Update local values
        lastAction = action;
        lastState = observation;
        traces.Add(new EligibilityTrace(observation, new RLClasses.Action(action), 1));

        return action;
    }
    else
    {
        return randomAction();
    }
}
//Create an instance of the Observation class
//representing the current state of the environment
public Observation createObservation()
{
    Observation obs = new Observation();

    //Create the specific instances of the classes that make up the Observation
    Obs_Finance finance = createFinance();
    Obs_Position position = createPosition();
    Obs_Area area = createArea();

    obs.area = area;
    obs.finance = finance;
    obs.position = position;

    return obs;
}