Example #1
        /*
         * From "Play to test"
         * Value iteration is the most widely used algorithm for solving discounted Markov decision
         * problems (see e.g. [21]). Reachability games give rise to non-discounted Markov
         * decision problems. Nevertheless the value iteration algorithm applies; this is a practical
         * approach for computing strategies for transient test graphs. Test graphs, modified by inserting
         * a zero-cost edge (0, 0), correspond to a subclass of negative stationary Markov
         * decision processes (MDPs) with an infinite horizon, where rewards are negative and
         * thus regarded as costs, strategies are stationary, i.e. time independent, and there is no
         * finite upper bound on the number of steps in the process. The optimization criterion
         * for our strategies corresponds to the expected total reward criterion, rather than the
         * expected discounted reward criterion used in discounted Markov decision problems.
         * Let G = (V, E, V^a, V^p, g, p, c) be a test graph modified by inserting a zero-cost
         * edge (0, 0). The classical value iteration algorithm works as follows on G.
         *
         * Value iteration. Let n = 0 and let M_0 be the zero vector with coordinates V, so that
         * every M_0[u] = 0. Given n and M_n, we compute M_{n+1} (and then increment n):
         *
         *   M_{n+1}[u] = min { c(u,v) + M_n[v] : (u,v) in E }            if u is an active state
         *   M_{n+1}[u] = sum { p(u,v) * (c(u,v) + M_n[v]) : (u,v) in E } if u is a choice point
         *
         * Value iteration for negative MDPs with the expected total reward criterion, or negative
         * Markov decision problems for short, does not in general converge to an optimal
         * solution, even if one exists. However, if there exists a strategy for which the expected
         * cost is finite for all states [21, Assumption 7.3.1], then value iteration does converge for
         * negative Markov decision problems [21, Theorem 7.3.10]. In light of Lemmas 2 and 3,
         * this implies that value iteration converges for transient test graphs. Let us make this
         * more precise, as a corollary of Theorem 7.3.10 in [21].
         */
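
        /*
         * A small worked illustration of the update above (not from the paper; the numbers are made up):
         * take one active state u0 with an edge to a choice point u1 of cost 1 and an edge to the
         * goal g of cost 5, and let u1 move to g or back to u0 with probability 0.5 each at cost 0.
         * Starting from M_0 = 0 (and keeping M_n[g] = 0), the synchronous updates give
         *   M_1[u0] = min(1 + 0, 5 + 0) = 1      M_1[u1] = 0.5*0 + 0.5*0   = 0
         *   M_2[u0] = min(1 + 0, 5 + 0) = 1      M_2[u1] = 0.5*0 + 0.5*1   = 0.5
         *   M_3[u0] = min(1 + 0.5, 5)   = 1.5    M_3[u1] = 0.5*0 + 0.5*1   = 0.5
         *   M_4[u0] = min(1 + 0.5, 5)   = 1.5    M_4[u1] = 0.5*0 + 0.5*1.5 = 0.75
         * which converges to the fixpoint M[u0] = 2, M[u1] = 1 (the expected cost to reach the goal).
         */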

        // Vertices [0, nStates) are active states; choice points start at index nStates.
        static double[] ValueIteration(Graph graph, HSet targets, int nStates) //ValueIteration(Graph graph,int[] sources,HSet targets,int nStates)
        {
            graph.InitEdgeProbabilities();

            double[] v0  = new double[graph.NumberOfVertices];
            double[] v1  = new double[graph.NumberOfVertices];
            double   eps = 1.0E-6; // convergence threshold on the largest per-vertex change
            double   delta;

            double[] v = v0;
            //double[] vnew=v1;


            //      CheckTransience(graph,targets);


            int nOfIter = 0;

            do
            {
                delta = 0;


                // Active states: take the cheapest outgoing edge (the minimization case of the update).
                for (int i = 0; i < nStates && i < graph.NumberOfVertices; i++)
                {
                    if (targets.Contains(i))
                    {
                        continue;
                    }

                    double min = Double.MaxValue;

                    foreach (Edge l in graph.EdgesAtVertex(i))
                    {
                        double r = ((double)l.weight) + v[l.target];
                        if (r < min)
                        {
                            min = r;
                        }
                    }
                    if (min != Double.MaxValue)
                    {
                        v1[i] = min;
                        if (delta < min - v[i])
                        {
                            delta = min - v[i];
                        }
                    }
                }

                // Choice points: expected cost over outgoing edges, weighted by the edge probabilities.
                for (int i = nStates; i < graph.NumberOfVertices; i++)
                {
                    if (targets.Contains(i))
                    {
                        continue;
                    }

                    double r = 0;
                    foreach (Edge l in graph.EdgesAtVertex(i))
                    {
                        r += graph.EdgeProbability(l) * (((double)l.weight) + v[l.target]);
                    }

                    v1[i] = r;
                    if (delta < r - v[i])
                    {
                        delta = r - v[i];
                    }
                }


                nOfIter++;

                //swap v and v1
                double[] vtmp = v;
                v  = v1;
                v1 = vtmp;
            } while (delta > eps && nOfIter < 10000); // stop once the largest change is below eps, or after 10000 iterations

            if (delta > eps)
            {
                return null; // did not converge within the iteration bound; the result would be unreliable
            }

            return v;
        }
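
        // A minimal follow-up sketch (not part of the original snippet): once ValueIteration has
        // produced a value vector, a stationary strategy for the active states can be read off by
        // greedily picking, at each active state, an outgoing edge that attains the minimum of the
        // Bellman update. It assumes the same Graph/Edge/HSet API used above (EdgesAtVertex,
        // Edge.weight, Edge.target); the helper name ExtractGreedyStrategy is hypothetical.
        static int[] ExtractGreedyStrategy(Graph graph, HSet targets, int nStates, double[] values)
        {
            int[] choice = new int[nStates];

            for (int i = 0; i < nStates && i < graph.NumberOfVertices; i++)
            {
                choice[i] = -1; // no move recorded: either a target state or no outgoing edge

                if (targets.Contains(i))
                {
                    continue;
                }

                double min = Double.MaxValue;
                foreach (Edge l in graph.EdgesAtVertex(i))
                {
                    double r = ((double)l.weight) + values[l.target];
                    if (r < min)
                    {
                        min       = r;
                        choice[i] = l.target; // remember the cheapest successor seen so far
                    }
                }
            }

            return choice;
        }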