/// <summary>
/// Sweeps the whole grid <paramref name="steps"/> times, overwriting each cell of
/// <c>grid.world</c> with the sum, over every action in <c>actions</c>, of
/// <c>PolicityProb(action, cell) * (reward of successor + discount * value of successor)</c>.
/// </summary>
/// <remarks>
/// Values are updated in place, so cells visited later in a sweep already read the
/// values written earlier in that same sweep.
/// </remarks>
/// <param name="steps">Number of full sweeps; zero or a negative number performs no sweep.</param>
public void PolicyEvaluation(int steps) {
    Position cell = new Position();

    for (int sweep = 0; sweep < steps; sweep++) {
        for (int row = 0; row < grid.height; row++) {
            for (int col = 0; col < grid.length; col++) {
                cell.Set(col, row);
                double expected = 0;

                foreach (AgentAction action in actions) {
                    // A cell whose own reward is exactly 0 is evaluated against itself
                    // rather than against the state reached by the action
                    // (presumably these are the terminal cells - confirm against the grid setup).
                    Position successor = grid.Reward(cell) == 0
                        ? cell
                        : grid.NextState(cell, action);

                    expected += PolicityProb(action, cell)
                        * (grid.Reward(successor) + discount * grid.world[successor.y, successor.x]);
                }

                grid.world[row, col] = expected;
            }
        }
    }
}
/// <summary>
/// For every grid cell, replaces the cell's policy with a one-element array holding the
/// highest-scoring action among the actions currently listed in <c>policy</c> for that cell.
/// </summary>
/// <remarks>
/// An action's score is
/// <c>PolicityProb(action, cell) * (reward of successor + discount * value of successor)</c>.
/// Null entries and actions whose <c>PolicityProb</c> is exactly 0 are skipped. When two
/// actions score the same, the one enumerated later wins. A cell for which no action was
/// scored keeps its existing policy.
/// NOTE(review): only actions already present in the cell's policy are considered, and the
/// score is weighted by the policy probability - confirm that this is the intended form of
/// greedy improvement.
/// </remarks>
public void PolicyImprovement() {
    Position cell = new Position();

    for (int row = 0; row < grid.height; row++) {
        for (int col = 0; col < grid.length; col++) {
            cell.Set(col, row);

            AgentAction winner = null;
            double winnerScore = Double.MinValue;

            foreach (AgentAction candidate in policy[row, col]) {
                if (candidate == null) {
                    continue;
                }

                // A cell whose own reward is exactly 0 is scored against itself
                // instead of the state the action would lead to.
                Position successor = grid.Reward(cell) == 0
                    ? cell
                    : grid.NextState(cell, candidate);

                if (PolicityProb(candidate, cell) == 0) {
                    continue;
                }

                double score = PolicityProb(candidate, cell)
                    * (grid.Reward(successor) + discount * grid.world[successor.y, successor.x]);

                // ">=" (not ">") so that a later action replaces an earlier one on a tie.
                if (score >= winnerScore) {
                    winner = candidate;
                    winnerScore = score;
                }
            }

            if (winner != null) {
                policy[row, col] = new AgentAction[] { winner };
            }
        }
    }
}
/// <summary>
/// Runs value iteration over the grid: repeatedly sweeps every cell, setting its entry in
/// <c>grid.world</c> to the best one-step return over all actions in <c>actions</c>, and
/// stops once the largest change made during a sweep no longer exceeds the threshold.
/// </summary>
/// <remarks>
/// The backup applied to each cell is <c>max over actions of (reward of successor +
/// discount * value of successor)</c>. The policy probability is deliberately not part of
/// the backup: value iteration maximises over actions regardless of the current policy.
/// Multiplying by <c>PolicityProb</c> scaled every value by the action probability and made
/// any zero-probability action contribute 0, which wins the maximum whenever the real
/// returns are negative.
/// Values are updated in place, so later cells in a sweep read values written earlier in
/// the same sweep.
/// NOTE(review): termination relies on the sweeps converging (e.g. discount below 1, or
/// every cell able to reach a zero-reward cell) - confirm for the grids in use.
/// </remarks>
public void ValueIteration() {
    // omega is the minimum change of the value function, within one sweep,
    // that still triggers another sweep.
    const double omega = 0.001;

    Position p = new Position();

    // delta tracks how much the value function changed during the current sweep.
    double delta;

    do {
        delta = 0;

        for (int j = 0; j < grid.height; j++) {
            for (int k = 0; k < grid.length; k++) {
                p.Set(k, j);
                double best = double.MinValue;

                foreach (AgentAction a in actions) {
                    // A cell whose own reward is exactly 0 is backed up from itself
                    // instead of from the state the action leads to.
                    Position next = grid.Reward(p) == 0 ? p : grid.NextState(p, a);

                    double actionValue = grid.Reward(next) + discount * grid.world[next.y, next.x];
                    if (actionValue > best) {
                        best = actionValue;
                    }
                }

                // Remember the largest change seen in this sweep before overwriting the cell.
                delta = Math.Max(delta, Math.Abs(best - grid.world[j, k]));
                grid.world[j, k] = best;
            }
        }
    } while (delta > omega);
}