// function VALUE-ITERATION(mdp, ε) returns a utility function

        /**
         * The value iteration algorithm for calculating the utility of states.
         *
         * @param mdp
         *            an MDP with states S, actions A(s), <br>
         *            transition model P(s' | s, a), rewards R(s)
         * @param epsilon
         *            the maximum error allowed in the utility of any state
         * @return a vector of utilities for states in S
         */
        public IMap <S, double> valueIteration(IMarkovDecisionProcess <S, A> mdp, double epsilon)
        {
            //
            // local variables: U, U', vectors of utilities for states in S,
            // initially zero
            IMap <S, double> U      = Util.create(mdp.states(), 0D);
            IMap <S, double> Udelta = Util.create(mdp.states(), 0D);
            // &delta; the maximum change in the utility of any state in an
            // iteration
            double delta = 0;
            // Note: Just calculate this once for efficiency purposes:
            // &epsilon;(1 - &gamma;)/&gamma;
            double minDelta = epsilon * (1 - gamma) / gamma;

            // repeat
            do
            {
                // U <- U'; &delta; <- 0
                U.PutAll(Udelta);
                delta = 0;
                // for each state s in S do
                foreach (S s in mdp.states())
                {
                    // max<sub>a &isin; A(s)</sub>
                    ISet <A> actions = mdp.actions(s);
                    // Handle terminal states (i.e. no actions).
                    double aMax = 0;
                    if (actions.Size() > 0)
                    {
                        aMax = double.NegativeInfinity;
                    }
                    foreach (A a in actions)
                    {
                        // &Sigma;<sub>s'</sub>P(s' | s, a) U[s']
                        double aSum = 0;
                        foreach (S sDelta in mdp.states())
                        {
                            aSum += mdp.transitionProbability(sDelta, s, a) * U.Get(sDelta);
                        }
                        if (aSum > aMax)
                        {
                            aMax = aSum;
                        }
                    }
                    // U'[s] <- R(s) + &gamma;
                    // max<sub>a &isin; A(s)</sub>
                    Udelta.Put(s, mdp.reward(s) + gamma * aMax);
                    // if |U'[s] - U[s]| > &delta; then &delta; <- |U'[s] - U[s]|
                    double aDiff = System.Math.Abs(Udelta.Get(s) - U.Get(s));
                    if (aDiff > delta)
                    {
                        delta = aDiff;
                    }
                }
                // until &delta; < &epsilon;(1 - &gamma;)/&gamma;
            } while (delta > minDelta);

            // return U
            return(U);
        }
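To see the Bellman update in isolation from the library's interfaces, here is a minimal sketch of value iteration over a hypothetical two-state MDP using plain arrays. The states, rewards, and transition probabilities are invented for illustration only, and the sketch assumes gamma < 1 so that the termination bound epsilon * (1 - gamma) / gamma is well defined.

// Minimal value-iteration sketch on a made-up 2-state MDP (not the Fig 17.3 cell world).
// States 0 and 1; a single action per state that moves to the other state with probability 0.9.
public static class ValueIterationSketch
{
    public static double[] Run(double gamma, double epsilon)
    {
        double[] R = { -0.04, 1.0 };                  // assumed rewards
        double[,] P = { { 0.1, 0.9 }, { 0.9, 0.1 } }; // P[s, s'] under the single action
        double[] U = { 0.0, 0.0 };
        double minDelta = epsilon * (1 - gamma) / gamma; // assumes gamma < 1
        double delta;
        do
        {
            delta = 0;
            double[] Unew = new double[2];
            for (int s = 0; s < 2; s++)
            {
                // Bellman update: U'[s] = R(s) + gamma * sum_s' P(s'|s,a) U[s']
                double expected = P[s, 0] * U[0] + P[s, 1] * U[1];
                Unew[s] = R[s] + gamma * expected;
                delta = System.Math.Max(delta, System.Math.Abs(Unew[s] - U[s]));
            }
            U = Unew;
        } while (delta > minDelta);
        return U;
    }
}

Note that the demo further below constructs ValueIteration with gamma = 1.0; with the bound computed as in the library method above, that makes minDelta zero, so the loop continues until delta drops to exactly zero.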
 // Test fixture setup: the 4x3 cell world of Figure 17.1 and the MDP of Figure 17.3,
 // solved with policy iteration using modified policy evaluation (k = 50, gamma = 1.0).
 public void setUp()
 {
     cw  = CellWorldFactory.CreateCellWorldForFig17_1();
     mdp = MDPFactory.createMDPForFigure17_3(cw);
     pi  = new PolicyIteration <Cell <double>, CellWorldAction>(
         new ModifiedPolicyEvaluation <Cell <double>, CellWorldAction>(50, 1.0));
 }
        public IMap <S, double> evaluate(IMap <S, A> pi_i, IMap <S, double> U, IMarkovDecisionProcess <S, A> mdp)
        {
            IMap <S, double> U_i   = CollectionFactory.CreateMap <S, double>(U);
            IMap <S, double> U_ip1 = CollectionFactory.CreateMap <S, double>(U);

            // repeat k times to produce the next utility estimate
            for (int i = 0; i < k; ++i)
            {
                // U<sub>i+1</sub>(s) <- R(s) +
                // &gamma;&Sigma;<sub>s'</sub>P(s'|s,&pi;<sub>i</sub>(s))U<sub>i</sub>(s')
                foreach (S s in U.GetKeys())
                {
                    A      ap_i = pi_i.Get(s);
                    double aSum = 0;
                    // Handle terminal states (i.e. no actions)
                    if (null != ap_i)
                    {
                        foreach (S sDelta in U.GetKeys())
                        {
                            aSum += mdp.transitionProbability(sDelta, s, ap_i) * U_i.Get(sDelta);
                        }
                    }
                    U_ip1.Put(s, mdp.reward(s) + gamma * aSum);
                }

                U_i.PutAll(U_ip1);
            }
            return(U_ip1);
        }
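As a rough illustration of the k-step update above, the following sketch performs the simplified Bellman update U_{i+1}(s) = R(s) + gamma * sum_s' P(s'|s, pi(s)) U_i(s') for a fixed policy on a hypothetical two-state, two-action MDP. All numbers and the policy itself are invented for illustration and are not part of the Fig 17.3 world.

// Simplified (fixed-policy) evaluation sketch on an invented 2-state, 2-action MDP.
public static class PolicyEvaluationSketch
{
    public static double[] Evaluate(int k, double gamma)
    {
        double[] R = { -0.04, 1.0 };            // assumed rewards
        int[] pi = { 1, 0 };                    // assumed fixed policy: action index per state
        // P[a, s, s']: assumed transition model for the two actions
        double[,,] P =
        {
            { { 0.8, 0.2 }, { 0.6, 0.4 } },     // action 0
            { { 0.3, 0.7 }, { 0.1, 0.9 } }      // action 1
        };
        double[] U = { 0.0, 0.0 };
        for (int i = 0; i < k; i++)
        {
            double[] Unext = new double[2];
            for (int s = 0; s < 2; s++)
            {
                int a = pi[s];
                // U_{i+1}(s) = R(s) + gamma * sum_s' P(s'|s, pi(s)) U_i(s')
                double expected = P[a, s, 0] * U[0] + P[a, s, 1] * U[1];
                Unext[s] = R[s] + gamma * expected;
            }
            U = Unext;
        }
        return U;
    }
}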
        static void policyIterationDemo()
        {
            System.Console.WriteLine("DEMO: Policy Iteration");
            System.Console.WriteLine("======================");
            System.Console.WriteLine("Figure 17.3");
            System.Console.WriteLine("-----------");

            CellWorld <double> cw = CellWorldFactory.CreateCellWorldForFig17_1();
            IMarkovDecisionProcess <Cell <double>, CellWorldAction> mdp = MDPFactory.createMDPForFigure17_3(cw);
            PolicyIteration <Cell <double>, CellWorldAction>
            pi = new PolicyIteration <Cell <double>, CellWorldAction>(
                new ModifiedPolicyEvaluation <Cell <double>, CellWorldAction>(50, 1.0));

            IPolicy <Cell <double>, CellWorldAction> policy = pi.policyIteration(mdp);

            System.Console.WriteLine("(1,1) = " + policy.action(cw.GetCellAt(1, 1)));
            System.Console.WriteLine("(1,2) = " + policy.action(cw.GetCellAt(1, 2)));
            System.Console.WriteLine("(1,3) = " + policy.action(cw.GetCellAt(1, 3)));

            System.Console.WriteLine("(2,1) = " + policy.action(cw.GetCellAt(2, 1)));
            System.Console.WriteLine("(2,3) = " + policy.action(cw.GetCellAt(2, 3)));

            System.Console.WriteLine("(3,1) = " + policy.action(cw.GetCellAt(3, 1)));
            System.Console.WriteLine("(3,2) = " + policy.action(cw.GetCellAt(3, 2)));
            System.Console.WriteLine("(3,3) = " + policy.action(cw.GetCellAt(3, 3)));

            System.Console.WriteLine("(4,1) = " + policy.action(cw.GetCellAt(4, 1)));
            System.Console.WriteLine("(4,2) = " + policy.action(cw.GetCellAt(4, 2)));
            System.Console.WriteLine("(4,3) = " + policy.action(cw.GetCellAt(4, 3)));

            System.Console.WriteLine("=========================");
        }
        static void valueIterationDemo()
        {
            System.Console.WriteLine("DEMO: Value Iteration");
            System.Console.WriteLine("=====================");
            System.Console.WriteLine("Figure 17.3");
            System.Console.WriteLine("-----------");

            CellWorld <double> cw = CellWorldFactory.CreateCellWorldForFig17_1();
            IMarkovDecisionProcess <Cell <double>, CellWorldAction> mdp = MDPFactory.createMDPForFigure17_3(cw);
            ValueIteration <Cell <double>, CellWorldAction>
            vi = new ValueIteration <Cell <double>, CellWorldAction>(1.0);

            IMap <Cell <double>, double> U = vi.valueIteration(mdp, 0.0001);

            System.Console.WriteLine("(1,1) = " + U.Get(cw.GetCellAt(1, 1)));
            System.Console.WriteLine("(1,2) = " + U.Get(cw.GetCellAt(1, 2)));
            System.Console.WriteLine("(1,3) = " + U.Get(cw.GetCellAt(1, 3)));

            System.Console.WriteLine("(2,1) = " + U.Get(cw.GetCellAt(2, 1)));
            System.Console.WriteLine("(2,3) = " + U.Get(cw.GetCellAt(2, 3)));

            System.Console.WriteLine("(3,1) = " + U.Get(cw.GetCellAt(3, 1)));
            System.Console.WriteLine("(3,2) = " + U.Get(cw.GetCellAt(3, 2)));
            System.Console.WriteLine("(3,3) = " + U.Get(cw.GetCellAt(3, 3)));

            System.Console.WriteLine("(4,1) = " + U.Get(cw.GetCellAt(4, 1)));
            System.Console.WriteLine("(4,2) = " + U.Get(cw.GetCellAt(4, 2)));
            System.Console.WriteLine("(4,3) = " + U.Get(cw.GetCellAt(4, 3)));

            System.Console.WriteLine("=========================");
        }
        /**
         * Create a policy vector indexed by state, initially random.
         *
         * @param mdp
         *            an MDP with states S, actions A(s), transition model P(s'|s,a)
         * @return a policy vector indexed by state, initially random.
         */
        public static IMap <S, A> initialPolicyVector(IMarkovDecisionProcess <S, A> mdp)
        {
            IMap <S, A>     pi      = CollectionFactory.CreateInsertionOrderedMap <S, A>();
            ICollection <A> actions = CollectionFactory.CreateQueue <A>();

            foreach (S s in mdp.states())
            {
                actions.Clear();
                actions.AddAll(mdp.actions(s));
                // Handle terminal states (i.e. no actions).
                if (actions.Size() > 0)
                {
                    pi.Put(s, Util.selectRandomlyFromList(actions));
                }
            }
            return(pi);
        }
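The same initialization can be sketched without the library's CollectionFactory and Util helpers. The version below is a hypothetical stand-alone analogue that uses BCL collections; the Func delegate standing in for mdp.actions(s) is an assumption made for this sketch, not part of the library API.

// Requires: using System; using System.Collections.Generic; using System.Linq;
public static class PolicySketch
{
    // Hypothetical stand-alone analogue of initialPolicyVector.
    // The Func<S, IEnumerable<A>> parameter stands in for mdp.actions(s).
    public static Dictionary<S, A> RandomInitialPolicy<S, A>(
        IEnumerable<S> states, Func<S, IEnumerable<A>> actionsFor, Random rng)
    {
        var pi = new Dictionary<S, A>();
        foreach (S s in states)
        {
            var actions = actionsFor(s).ToList();
            // Terminal states (no actions) get no policy entry, mirroring the code above.
            if (actions.Count > 0)
            {
                pi[s] = actions[rng.Next(actions.Count)];
            }
        }
        return pi;
    }
}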
 // Test fixture setup: the same cell world and MDP, solved with value iteration
 // using gamma = 1.0.
 public void setUp()
 {
     cw = CellWorldFactory.CreateCellWorldForFig17_1();
     mdp = MDPFactory.createMDPForFigure17_3(cw);
     vi = new ValueIteration<Cell<double>, CellWorldAction>(1.0);
 }
 // Test fixture setup: cell world and MDP only.
 public void setUp()
 {
     cw  = CellWorldFactory.CreateCellWorldForFig17_1();
     mdp = MDPFactory.createMDPForFigure17_3(cw);
 }
        // function POLICY-ITERATION(mdp) returns a policy

        /**
         * The policy iteration algorithm for calculating an optimal policy.
         *
         * @param mdp
         *            an MDP with states S, actions A(s), transition model P(s'|s,a)
         * @return an optimal policy
         */
        public IPolicy <S, A> policyIteration(IMarkovDecisionProcess <S, A> mdp)
        {
            // local variables: U, a vector of utilities for states in S, initially
            // zero
            IMap <S, double> U = Util.create(mdp.states(), 0D);
            // &pi;, a policy vector indexed by state, initially random
            IMap <S, A> pi = initialPolicyVector(mdp);
            bool        unchanged;

            // repeat
            do
            {
                // U <- POLICY-EVALUATION(&pi;, U, mdp)
                U = policyEvaluation.evaluate(pi, U, mdp);
                // unchanged? <- true
                unchanged = true;
                // for each state s in S do
                foreach (S s in mdp.states())
                {
                    // calculate:
                    // max<sub>a &isin; A(s)</sub>
                    // &Sigma;<sub>s'</sub>P(s'|s,a)U[s']
                    double aMax = double.NegativeInfinity, piVal = 0;
                    A      aArgmax = pi.Get(s);
                    foreach (A a in mdp.actions(s))
                    {
                        double aSum = 0;
                        foreach (S sDelta in mdp.states())
                        {
                            aSum += mdp.transitionProbability(sDelta, s, a) * U.Get(sDelta);
                        }
                        if (aSum > aMax)
                        {
                            aMax    = aSum;
                            aArgmax = a;
                        }
                        // track:
                        // &Sigma;<sub>s'</sub>P(s'|s,&pi;[s])U[s']
                        if (a.Equals(pi.Get(s)))
                        {
                            piVal = aSum;
                        }
                    }
                    // if max<sub>a &isin; A(s)</sub>
                    // &Sigma;<sub>s'</sub>P(s'|s,a)U[s']
                    // > &Sigma;<sub>s'</sub>P(s'|s,&pi;[s])U[s'] then do
                    if (aMax > piVal)
                    {
                        // &pi;[s] <- argmax<sub>a &isin;A(s)</sub>
                        // &Sigma;<sub>s'</sub>P(s'|s,a)U[s']
                        pi.Put(s, aArgmax);
                        // unchanged? <- false
                        unchanged = false;
                    }
                }
                // until unchanged?
            } while (!unchanged);

            // return &pi;
            return(new LookupPolicy <S, A>(pi));
        }
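To make the improvement step concrete, here is a minimal sketch of the per-state comparison on plain arrays: it computes the best expected utility over actions and switches the policy entry only when that strictly beats the expected utility of the current action, mirroring the aMax > piVal test above. The array layout and indices are invented for illustration and are not part of the library API.

// Policy-improvement sketch for one state s (invented arrays, not the library interfaces).
// P[a, sPrime] = P(s' | s, a) for the state s under consideration; U = current utilities.
public static class PolicyImprovementSketch
{
    public static bool ImproveAt(int s, int[] pi, double[,] P, double[] U)
    {
        int nActions = P.GetLength(0);
        int nStates  = P.GetLength(1);
        double best = double.NegativeInfinity, piVal = 0;
        int argmax = pi[s];
        for (int a = 0; a < nActions; a++)
        {
            double sum = 0;
            for (int sPrime = 0; sPrime < nStates; sPrime++)
            {
                sum += P[a, sPrime] * U[sPrime];
            }
            if (sum > best) { best = sum; argmax = a; }
            if (a == pi[s]) { piVal = sum; } // expected utility of the current policy's action
        }
        if (best > piVal)
        {
            pi[s] = argmax;   // policy changed, so the outer loop is not yet "unchanged"
            return true;
        }
        return false;
    }
}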