/* Simulates your policy for a number of steps, multiple times, and computes the average discounted reward obtained.
 * To generate a single trial:
 * 1. Sample a starting state s from the initial belief state.
 * 2. Repeat until the goal is reached:
 *    a) compute the action a for the belief state.
 *    b) sample the result of applying a to s, obtaining s'.
 *    c) sample an observation o based on a and s'.
 *    d) compute the new belief state given the old belief state, a, and o.
 *    e) accumulate the (discounted) reward.
 * cStepsPerTrial = maximal number of steps per trial.
 * cTrials = number of trials over which the reward is averaged.
 */
public double ComputeAverageDiscountedReward(Policy p, int cTrials, int cStepsPerTrial)
{
    double accumulatedReward = 0;
    for (int i = 1; i <= cTrials; i++)
    {
        int remainingSteps = cStepsPerTrial;
        BeliefState bs = InitialBelief;
        // Step 1: sample a starting state s from the initial belief state.
        State s = bs.RandomState();
        // Step 2: repeat until the goal is reached or the step budget is exhausted.
        while (!IsGoalState(s) && remainingSteps > 0)
        {
            // Step 2a: compute the action a for the belief state.
            Action a = p.GetAction(bs);
            // Step 2b: sample the result of applying a to s, obtaining s'.
            State sTag = s.Apply(a);
            double reward = bs.Reward(a);
            // Step 2c: sample an observation o based on a and s'.
            Observation o = sTag.RandomObservation(a);
            // Step 2d: compute the new belief state given the old belief state, a, and o.
            bs = bs.Next(a, o);
            // Step 2e: accumulate the reward, discounted by the number of steps taken so far.
            accumulatedReward += reward * Math.Pow(DiscountFactor, cStepsPerTrial - remainingSteps);
            s = sTag;
            remainingSteps--;
        }
    }
    return accumulatedReward / cTrials;
}
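/* A hypothetical usage sketch (not part of the original code): assuming this method and a
 * Policy implementation live on the same domain class, an evaluation run might be wrapped
 * as follows. The method name EvaluatePolicy and the trial counts are illustrative only.
 */
public void EvaluatePolicy(Policy p)
{
    double avgReward = ComputeAverageDiscountedReward(p, 100, 50);
    Console.WriteLine("Average discounted reward over 100 trials: " + avgReward);
}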
private List<BeliefState> GenerateB(int cBeliefs, Random rand)
{
    List<BeliefState> B = new List<BeliefState>();
    BeliefState initB = m_dDomain.InitialBelief;
    int n = cBeliefs;
    while (n > 0)
    {
        Action a = m_dDomain.GetRandomAction(rand);
        Observation oCurr = initB.RandomObservation(a);
        BeliefState bNext = initB.Next(a, oCurr);
        B.Add(bNext);
        initB = bNext;
        n--;
    }
    return B;
}
private List<BeliefState> SimulateTrial(Policy p, int cMaxSteps)
{
    BeliefState bsCurrent = m_dDomain.InitialBelief, bsNext = null;
    State sCurrent = bsCurrent.RandomState(), sNext = null;
    Action a = null;
    Observation o = null;
    List<BeliefState> lBeliefs = new List<BeliefState>();
    while (!m_dDomain.IsGoalState(sCurrent) && lBeliefs.Count < cMaxSteps)
    {
        a = p.GetAction(bsCurrent);
        sNext = sCurrent.Apply(a);
        o = sNext.RandomObservation(a);
        bsNext = bsCurrent.Next(a, o);
        bsCurrent = bsNext;
        lBeliefs.Add(bsCurrent);
        sCurrent = sNext;
    }
    return lBeliefs;
}
public double ComputeAverageDiscountedReward(Policy p, int cTrials, int cStepsPerTrial)
{
    double to_ret = 0.0;
    List<double> rewards = new List<double>();
    for (int i = 0; i < cTrials; i++)
    {
        // Sample a hidden starting state from the initial belief.
        State target = sampleInitialState();
        BeliefState currentBeliefState = InitialBelief;
        double sumRewards = 0.0;
        int counter = 0;
        while (!IsGoalState(target) && counter < cStepsPerTrial)
        {
            // Choose an action for the current belief and apply it to the hidden state.
            Action a = p.GetAction(currentBeliefState);
            State newState = target.Apply(a: a);
            // Build the cumulative observation distribution for the resulting state.
            List<KeyValuePair<Observation, double>> probabilitiesForObservation = new List<KeyValuePair<Observation, double>>();
            double sum = 0.0;
            foreach (Observation obs in Observations)
            {
                double prob = newState.ObservationProbability(a: a, o: obs);
                sum += prob;
                probabilitiesForObservation.Add(new KeyValuePair<Observation, double>(obs, sum));
            }
            // Sample an observation from the cumulative distribution and update the belief.
            Observation newObservation = samplingObservations(probabilitiesForObservation);
            double reward = currentBeliefState.Reward(a);
            currentBeliefState = currentBeliefState.Next(a: a, o: newObservation);
            // Accumulate the discounted reward.
            sumRewards += reward * Math.Pow(DiscountFactor, counter);
            counter++;
            target = newState;
        }
        rewards.Add(sumRewards);
    }
    foreach (double r in rewards)
    {
        to_ret += r;
    }
    return to_ret / cTrials;
}
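/* A hedged sketch (an assumption, not part of the original submission) of the
 * samplingObservations helper referenced above: given (observation, cumulative probability)
 * pairs, it performs inverse-transform sampling. The Random field name m_rndObservations
 * is hypothetical.
 */
private static readonly Random m_rndObservations = new Random();

private Observation samplingObservations(List<KeyValuePair<Observation, double>> cumulative)
{
    // The last cumulative value is the total mass (it may fall slightly below 1.0 due to rounding).
    double total = cumulative[cumulative.Count - 1].Value;
    double r = m_rndObservations.NextDouble() * total;
    foreach (KeyValuePair<Observation, double> pair in cumulative)
    {
        // Return the first observation whose cumulative probability covers the sampled point.
        if (r <= pair.Value)
            return pair.Key;
    }
    // Numerical fallback: return the last observation.
    return cumulative[cumulative.Count - 1].Key;
}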
private void SimulateTrial(Policy p, MazeViewer viewer)
{
    BeliefState bsCurrent = InitialBelief, bsNext = null;
    State sCurrent = bsCurrent.sampleState(), sNext = null;
    Action a = null;
    Observation o = null;
    viewer.CurrentState = (MazeState)sCurrent;
    viewer.CurrentBelief = bsCurrent;
    while (!IsGoalState(sCurrent))
    {
        a = p.GetAction(bsCurrent);
        sNext = sCurrent.Apply(a);
        o = sNext.RandomObservation(a);
        bsNext = bsCurrent.Next(a, o);
        bsCurrent = bsNext;
        sCurrent = sNext;
        viewer.CurrentState = (MazeState)sCurrent;
        viewer.CurrentBelief = bsCurrent;
        viewer.CurrentObservation = (MazeObservation)o;
        Thread.Sleep(500);
    }
}
/**
 * A recursive function performing one trial with stepsLeft steps, given a policy p,
 * stepsLeft, the current state, and the current belief state bs:
 * a) compute the action for the belief state bs.
 * b) sample the result of applying a to s, obtaining nextState.
 * c) sample an observation o based on a and nextState.
 * d) compute the new belief state given the old belief state, a, and o.
 * e) call the function recursively with the same policy p, stepsLeft-1, the new state,
 *    and the new belief state, and finally accumulate the reward.
 */
private double calcTrialReward(Policy p, int stepsLeft, State state, BeliefState bs)
{
    // If we are already in a goal state or no steps are left, the reward is 0.
    if (IsGoalState(state) || stepsLeft == 0)
        return 0;
    // Compute the action for the belief state based on the policy.
    Action a = p.GetAction(bs);
    // Apply a to the current state, obtaining nextState.
    State nextState = state.Apply(a);
    // The reward of performing a in this step.
    double reward = nextState.Reward(a);
    // Sample an observation based on nextState and a.
    Observation o = nextState.RandomObservation(a);
    // Continue the forward simulation recursively toward a goal state,
    // discounting the reward accumulated from future steps.
    reward += this.DiscountFactor * calcTrialReward(p, stepsLeft - 1, nextState, bs.Next(a, o));
    return reward;
}
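/* A hedged sketch (not part of the original code) of how calcTrialReward could be driven to
 * estimate the average discounted reward: for each trial, sample a starting state from the
 * initial belief and average the recursively computed trial rewards.
 */
public double ComputeAverageDiscountedReward(Policy p, int cTrials, int cStepsPerTrial)
{
    double accumulated = 0.0;
    for (int iTrial = 0; iTrial < cTrials; iTrial++)
    {
        // Sample a hidden starting state from the initial belief and run one trial.
        State sStart = InitialBelief.RandomState();
        accumulated += calcTrialReward(p, cStepsPerTrial, sStart, InitialBelief);
    }
    return accumulated / cTrials;
}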