Пример #1
0
        /// <summary>
        /// Simulates the policy for a number of trials and returns the average reward accumulated per trial.
        /// </summary>
        /// <remarks>
        /// To generate a single trial:
        ///   1. Sample a starting state s from the initial belief state.
        ///   2. Repeat until a goal state is reached or the step budget is used up:
        ///        a) compute the action a for the current belief state.
        ///        b) sample the result of applying a to s, obtaining s'.
        ///        c) sample an observation o based on a and s'.
        ///        d) compute the new belief state given the old belief state, a, and o.
        ///        e) accumulate the reward.
        /// NOTE(review): despite the method name, no discount factor is applied to the
        /// rewards that are summed here - confirm whether discounting is intended.
        /// </remarks>
        /// <param name="p">Policy that is queried for an action at every step.</param>
        /// <param name="cTrials">Number of trials to run.</param>
        /// <param name="cStepsPerTrial">Maximum number of steps taken in each trial.</param>
        /// <returns>
        /// The reward summed over all trials divided by <paramref name="cTrials"/>.
        /// When <paramref name="cTrials"/> is 0 the division is 0.0 / 0 and the result is NaN.
        /// </returns>
        public double ComputeAverageDiscountedReward(Policy p, int cTrials, int cStepsPerTrial)
        {
            double accumulatedReward = 0;

            for (int i = 1; i <= cTrials; i++)
            {
                int         remainingSteps = cStepsPerTrial;
                BeliefState bs             = InitialBelief;
                // step 1: sample a starting state s from the initial belief state.
                State s = bs.RandomState();
                // step 2: repeat until the goal is reached or the step budget is exhausted.
                while (!IsGoalState(s) && remainingSteps > 0)
                {
                    // step 2a: compute the action a for the belief state.
                    Action a = p.GetAction(bs);
                    // step 2b: sample the result of applying a to s, obtaining s'.
                    State sTag = s.Apply(a);
                    // The reward is read from the belief state before it is updated below.
                    double reward = bs.Reward(a);
                    // step 2c: sample an observation o based on a and s'.
                    // The observation must come from the state reached AFTER the action (s'),
                    // not from the state the action was taken in.
                    Observation o = sTag.RandomObservation(a);
                    // step 2d: compute the new belief state given the old belief state, a, and o.
                    bs = bs.Next(a, o);
                    // step 2e: accumulate the reward.
                    accumulatedReward += reward;
                    s = sTag;
                    remainingSteps--;
                }
            }
            return(accumulatedReward / cTrials);
        }
Пример #2
0
        /// <summary>
        /// Runs a single simulated trial under the given policy and records the belief
        /// state obtained after every step.
        /// </summary>
        /// <param name="p">Policy that is queried for an action at every step.</param>
        /// <param name="cMaxSteps">Upper bound on the number of steps (and recorded beliefs).</param>
        /// <returns>
        /// The belief states produced by the successive updates, in order. The initial belief
        /// itself is not included, so the list is empty when the sampled start state is
        /// already a goal state or when <paramref name="cMaxSteps"/> is not positive.
        /// </returns>
        private List <BeliefState> SimulateTrial(Policy p, int cMaxSteps)
        {
            BeliefState belief = m_dDomain.InitialBelief;
            State       state  = belief.RandomState();

            List<BeliefState> visited = new List<BeliefState>();

            for (;;)
            {
                // Stop once the hidden state is a goal state or the step budget is used up.
                if (m_dDomain.IsGoalState(state) || visited.Count >= cMaxSteps)
                {
                    break;
                }

                Action action = p.GetAction(belief);
                // Advance the hidden state first; the observation is sampled from the successor.
                State       successor   = state.Apply(action);
                Observation observation = successor.RandomObservation(action);

                belief = belief.Next(action, observation);
                visited.Add(belief);
                state = successor;
            }

            return visited;
        }
Пример #3
0
        /// <summary>
        /// Runs a single simulated trial under the given policy and pushes every step to
        /// the viewer, pausing 500 ms between steps.
        /// </summary>
        /// <remarks>
        /// There is no step limit: the loop only ends once the current state is a goal state.
        /// </remarks>
        /// <param name="p">Policy that is queried for an action at every step.</param>
        /// <param name="viewer">Viewer whose current state, belief and observation are updated.</param>
        private void SimulateTrial(Policy p, MazeViewer viewer)
        {
            BeliefState belief = InitialBelief;
            State       state  = belief.RandomState();

            // Show the starting situation before any action is taken.
            viewer.CurrentState  = (MazeState)state;
            viewer.CurrentBelief = belief;

            while (true)
            {
                if (IsGoalState(state))
                {
                    return;
                }

                Action action = p.GetAction(belief);
                // Advance the hidden state first; the observation is sampled from the successor.
                State       successor   = state.Apply(action);
                Observation observation = successor.RandomObservation(action);

                belief = belief.Next(action, observation);
                state  = successor;

                viewer.CurrentState       = (MazeState)state;
                viewer.CurrentBelief      = belief;
                viewer.CurrentObservation = (MazeObservation)observation;

                // Pause between steps (500 ms) before the next update.
                Thread.Sleep(500);
            }
        }