/* * From "Play to test" * Value iteration is the most widely used algorithm for solving discounted Markov decision * problems (see e.g. [21]). Reachability games give rise to non-discounted Markov * decision problems. Nevertheless the value iteration algorithm applies; this is a practical * approach for computing strategies for transient test graphs. Test graphs, modified by inserting * a zero-cost edge (0; 0), correspond to a subclass of negative stationary Markov * decision processes (MDPs) with an infinite horizon, where rewards are negative and * thus regarded as costs, strategies are stationary, i.e. time independent, and there is no * finite upper bound on the number of steps in the process. The optimization criterion * for our strategies corresponds to the expected total reward criterion, rather than the * expected discounted reward criterion used in discounted Markov decision problems. * Let G = (V;E; V a; V p; g; p; c) be a test graph modified by inserting a zero-cost * edge (0; 0). The classical value iteration algorithm works as follows on G. * * Value iteration Let n = 0 and let M0 be the zero vector with coordinates V so that * every M0[u] = 0. Given n and Mn, we compute Mn+1 (and then increment n): * Mn+1[u] ={ min {c(u,v) +Mn[v]:(u,v) in E} if u is an active state} * or sum {p(u,v)*(c(u,v) +Mn[v]); if u is a choice point * * Value iteration for negative MDPs with the expected total reward criterion, or negative * Markov decision problems for short, does not in general converge to an optimal * solution, even if one exists. However, if there exists a strategy for which the the expected * cost is finite for all states [21, Assumption 7.3.1], then value iteration does converge for * negative Markov decision problems [21, Theorem 7.3.10]. In light of lemmas 2 and 3, * this implies that value iteration converges for transient test graphs. Let us make this * more precise, as a corollary of Theorem 7.3.10 in [21]. */ //nStates marks the end of active states, choice points start after that static double[] ValueIteration(Graph graph, HSet targets, int nStates) //ValueIteration(Graph graph,int[] sources,HSet targets,int nStates) { graph.InitEdgeProbabilities(); double[] v0 = new double[graph.NumberOfVertices]; double[] v1 = new double[graph.NumberOfVertices]; double eps = 1.0E-6; double delta; double[] v = v0; //double[] vnew=v1; // CheckTransience(graph,targets); int nOfIter = 0; do { delta = 0; for (int i = 0; i < nStates && i < graph.NumberOfVertices; i++) { if (targets.Contains(i)) { continue; } double min = Double.MaxValue; foreach (Edge l in graph.EdgesAtVertex(i)) { double r = ((double)l.weight) + v[l.target]; if (r < min) { min = r; } } if (min != Double.MaxValue) { v1[i] = min; if (delta < min - v[i]) { delta = min - v[i]; } } } for (int i = nStates; i < graph.NumberOfVertices; i++) { if (targets.Contains(i)) { continue; } double r = 0; foreach (Edge l in graph.EdgesAtVertex(i)) { r += graph.EdgeProbability(l) * (((double)l.weight) + v[l.target]); } v1[i] = r; if (delta < r - v[i]) { delta = r - v[i]; } } nOfIter++; //swap v and v1 double[] vtmp = v; v = v1; v1 = vtmp; }while(delta > eps && nOfIter < 10000); if (delta > eps) { return(null); //the result is erroneous } return(v); }