Example #1
        /* Simulates the policy for multiple trials and computes the average discounted reward obtained.
         * To generate a single trial:
         *   1. Sample a starting state s from the initial belief state.
         *   2. Repeat until the goal is reached or the step limit is exhausted:
         *        a) compute the action a for the belief state.
         *        b) sample the result of applying a to s, obtaining s'.
         *        c) sample an observation o based on a and s'.
         *        d) compute the new belief state given the old belief state, a, and o.
         *        e) accumulate the (discounted) reward.
         * cStepsPerTrial = maximum number of steps per trial.
         * cTrials = number of trials to average over. */
        public double ComputeAverageDiscountedReward(Policy p, int cTrials, int cStepsPerTrial)
        {
            double accumulatedReward = 0;

            for (int i = 1; i <= cTrials; i++)
            {
                int         remainingSteps = cStepsPerTrial;
                BeliefState bs             = InitialBelief;
                // step 1: Sample a starting state s from the initial belief state.
                State s = bs.RandomState();
                // step 2: Repeat until goal is reached
                while (!IsGoalState(s) && remainingSteps > 0)
                {
                    //step 2a: compute the action a for the belief state.
                    Action a = p.GetAction(bs);
                    //step 2b: sample the result of applying a to s, obtaining s'.
                    State  sTag   = s.Apply(a);
                    double reward = bs.Reward(a);
                    // step 2c: sample an observation o based on a and s' (the state reached after applying a)
                    Observation o = sTag.RandomObservation(a);
                    //step 2d: compute the new belief state given your old belief state, a, and o.
                    BeliefState newBeliefState = bs.Next(a, o);
                    bs = newBeliefState; //change bs for next iteration
                    //step 2e: accumulate the reward, discounted by the number of steps taken so far
                    accumulatedReward += reward * Math.Pow(DiscountFactor, cStepsPerTrial - remainingSteps);
                    s = sTag;
                    remainingSteps--;
                }
            }
            return(accumulatedReward / cTrials);
        }
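
A minimal usage sketch for this evaluation routine, assuming it sits on a domain class as in the examples below; the domain class name, its constructor, and the policy class are hypothetical, and only ComputeAverageDiscountedReward itself comes from the example above:

        // Hypothetical call site: evaluate a policy over 100 trials of at most 250 steps each.
        // (MazeDomain and RandomPolicy are assumed names, not shown in these examples.)
        MazeDomain domain = new MazeDomain("maze5x5.txt");
        Policy pi = new RandomPolicy(domain);
        double avgReward = domain.ComputeAverageDiscountedReward(pi, 100, 250);
        Console.WriteLine("Average discounted reward: " + avgReward);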
Example #2
        // Generates a set of cBeliefs belief states by repeatedly applying random actions and
        // sampled observations, starting from the domain's initial belief.
        private List <BeliefState> GenerateB(int cBeliefs, Random rand)
        {
            List <BeliefState> B     = new List <BeliefState>();
            BeliefState        initB = m_dDomain.InitialBelief;
            int n = cBeliefs;

            while (n > 0)
            {
                Action      a     = m_dDomain.GetRandomAction(rand);
                Observation oCurr = initB.RandomObservation(a);
                BeliefState bNext = initB.Next(a, oCurr);
                B.Add(bNext);
                initB = bNext;
                n--;
            }
            return(B);
        }
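
A minimal usage sketch, assuming this method lives in a point-based solver class that already holds m_dDomain; the call below is illustrative and not part of the example:

        // Illustrative: build a set of 100 belief points reachable from the initial belief,
        // e.g. to serve as the point set over which value backups are later performed.
        Random rand = new Random(0);
        List<BeliefState> beliefPoints = GenerateB(100, rand);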
Example #3
        // Simulates a single trial of policy p for at most cMaxSteps steps and
        // returns the sequence of belief states visited along the way.
        private List <BeliefState> SimulateTrial(Policy p, int cMaxSteps)
        {
            BeliefState        bsCurrent = m_dDomain.InitialBelief, bsNext = null;
            State              sCurrent = bsCurrent.RandomState(), sNext = null;
            Action             a        = null;
            Observation        o        = null;
            List <BeliefState> lBeliefs = new List <BeliefState>();

            while (!m_dDomain.IsGoalState(sCurrent) && lBeliefs.Count < cMaxSteps)
            {
                a         = p.GetAction(bsCurrent);
                sNext     = sCurrent.Apply(a);
                o         = sNext.RandomObservation(a);
                bsNext    = bsCurrent.Next(a, o);
                bsCurrent = bsNext;
                lBeliefs.Add(bsCurrent);
                sCurrent = sNext;
            }
            return(lBeliefs);
        }
Example #4
        public double ComputeAverageDiscountedReward(Policy p, int cTrials, int cStepsPerTrial)
        {
            double        to_ret  = 0.0;
            List <double> rewards = new List <double>();

            for (int i = 0; i < cTrials; i++)
            {
                State       target             = sampleInitialState();
                BeliefState currentBeliefState = InitialBelief;
                double      sumRewards         = 0.0;
                int         counter            = 0;
                while (!IsGoalState(target) && counter < cStepsPerTrial)
                {
                    Action a        = p.GetAction(currentBeliefState);
                    State  newState = target.Apply(a: a);
                    // Build a cumulative distribution over observation probabilities, for sampling below
                    List <KeyValuePair <Observation, double> > probabilitiesForObservation = new List <KeyValuePair <Observation, double> >();
                    double sum = 0.0;
                    foreach (Observation obs in Observations)
                    {
                        double prob = newState.ObservationProbability(a: a, o: obs);
                        sum += prob;
                        probabilitiesForObservation.Add(new KeyValuePair <Observation, double>(obs, sum));
                    }
                    Observation newObservation = samplingObservations(probabilitiesForObservation);
                    double      reward         = currentBeliefState.Reward(a);
                    currentBeliefState = currentBeliefState.Next(a: a, o: newObservation);
                    sumRewards        += reward * Math.Pow(DiscountFactor, counter);
                    counter++;
                    target = newState;
                }
                rewards.Add(sumRewards);
            }

            foreach (double r in rewards)
            {
                to_ret += r;
            }
            return((to_ret) / cTrials);
        }
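
The helpers sampleInitialState and samplingObservations are not shown in this example. A minimal sketch of what samplingObservations might look like, drawing from the cumulative list built above; the Random field m_rand is an assumption:

        // Hypothetical sketch: sample an observation from (observation, cumulative probability) pairs.
        private Observation samplingObservations(List <KeyValuePair <Observation, double> > cumulative)
        {
            // The last cumulative value is the total probability mass (normally 1.0).
            double total = cumulative[cumulative.Count - 1].Value;
            double r = m_rand.NextDouble() * total; // m_rand: assumed Random member of the class
            foreach (KeyValuePair <Observation, double> pair in cumulative)
            {
                if (r <= pair.Value)
                    return pair.Key;
            }
            return cumulative[cumulative.Count - 1].Key; // guard against floating-point rounding
        }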
Example #5
        // Simulates a single trial of policy p on the maze, pushing the true state,
        // belief state, and observation to the viewer after every step.
        private void SimulateTrial(Policy p, MazeViewer viewer)
        {
            BeliefState bsCurrent = InitialBelief, bsNext = null;
            State       sCurrent = bsCurrent.sampleState(), sNext = null;
            Action      a = null;
            Observation o = null;

            viewer.CurrentState  = (MazeState)sCurrent;
            viewer.CurrentBelief = bsCurrent;
            while (!IsGoalState(sCurrent))
            {
                a                         = p.GetAction(bsCurrent);
                sNext                     = sCurrent.Apply(a);
                o                         = sNext.RandomObservation(a);
                bsNext                    = bsCurrent.Next(a, o);
                bsCurrent                 = bsNext;
                sCurrent                  = sNext;
                viewer.CurrentState       = (MazeState)sCurrent;
                viewer.CurrentBelief      = bsCurrent;
                viewer.CurrentObservation = (MazeObservation)o;
                Thread.Sleep(500);
            }
        }
Example #6
        /**
         * A recursive function performing one trial with stepsLeft steps. We are given a policy p,
         * stepsLeft, the current state, and the current belief state bs.
         *
         * a) compute the action a for the belief state bs.
         * b) sample the result of applying a to the current state, obtaining nextState.
         * c) sample an observation o based on a and nextState.
         * d) compute the new belief state given the old belief state, a, and o.
         * e) call the function recursively with the same policy p, stepsLeft-1, the new state, and the
         *    new belief state, and accumulate the discounted reward.
         */
        private double calcTrialReward(Policy p, int stepsLeft, State state, BeliefState bs)
        {
            // If we are already in a goal state or no steps are left then the reward is 0
            if (IsGoalState(state) || stepsLeft == 0)
                return 0;

            //Calculating the action for the belief state based on the policy
            Action a = p.GetAction(bs);
            
            //applying a on state, resulting in a new state nextState
            State nextState = state.Apply(a);
            //The reward for performing a, as evaluated at the resulting state nextState
            double reward = nextState.Reward(a);
            // We sample an observation based on nextState and a
            Observation o = nextState.RandomObservation(a);

            // Updating the reward, calling the function recursively so we continue the "forward search" to goal state 
            reward += this.DiscountFactor * calcTrialReward(p, stepsLeft-1, nextState, bs.Next(a,o));
            return reward;
        }
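
A sketch of how this recursive helper might be driven from the same averaging routine as in Examples #1 and #4; the wrapper below is illustrative and assumes InitialBelief and RandomState() as used in those examples:

        // Illustrative wrapper (not part of the example): average calcTrialReward over cTrials trials.
        public double ComputeAverageDiscountedReward(Policy p, int cTrials, int cStepsPerTrial)
        {
            double accumulated = 0;
            for (int i = 0; i < cTrials; i++)
            {
                BeliefState bs = InitialBelief;    // start each trial from the initial belief
                State       s  = bs.RandomState(); // sample a starting state from that belief
                accumulated += calcTrialReward(p, cStepsPerTrial, s, bs);
            }
            return accumulated / cTrials;
        }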