/*
 * Simulates the policy for cTrials trials of at most cStepsPerTrial steps each,
 * and computes the average discounted reward per trial.
 * To generate a single trial:
 * 1. Sample a starting state s from the initial belief state.
 * 2. Repeat until the goal is reached (or the step budget is exhausted):
 *    a) Compute the action a for the current belief state.
 *    b) Sample the result of applying a to s, obtaining s'.
 *    c) Sample an observation o based on a and s'.
 *    d) Compute the new belief state given the old belief state, a, and o.
 *    e) Accumulate the discounted reward.
 * cStepsPerTrial = maximal number of steps per trial.
 * cTrials = number of trials to run.
 */
public double ComputeAverageDiscountedReward(Policy p, int cTrials, int cStepsPerTrial)
{
    double accumulatedReward = 0;
    for (int i = 1; i <= cTrials; i++)
    {
        int remainingSteps = cStepsPerTrial;
        double discount = 1.0; // running discount, gamma^t at step t
        BeliefState bs = InitialBelief;
        // Step 1: sample a starting state s from the initial belief state.
        State s = bs.RandomState();
        // Step 2: repeat until the goal is reached or the step budget is exhausted.
        while (!IsGoalState(s) && remainingSteps > 0)
        {
            // Step 2a: compute the action a for the current belief state.
            Action a = p.GetAction(bs);
            // Step 2b: sample the result of applying a to s, obtaining s'.
            State sTag = s.Apply(a);
            double reward = bs.Reward(a);
            // Step 2c: sample an observation o based on a and s' (not the pre-action state s).
            Observation o = sTag.RandomObservation(a);
            // Step 2d: compute the new belief state given the old belief state, a, and o.
            bs = bs.Next(a, o);
            // Step 2e: accumulate the discounted reward.
            // Assumes the domain exposes a DiscountFactor property (gamma).
            accumulatedReward += discount * reward;
            discount *= DiscountFactor;
            s = sTag;
            remainingSteps--;
        }
    }
    return accumulatedReward / cTrials;
}
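/*
 * For reference, a minimal sketch of the Bayes filter that a belief update such as
 * bs.Next(a, o) typically implements: b'(s') ∝ O(o | a, s') * sum_s T(s' | s, a) * b(s),
 * normalized over successor states. Everything below is illustrative: the delegate
 * parameters standing in for T and O, and the dictionary representation of a belief,
 * are hypothetical and not part of the framework above. Requires using System and
 * System.Collections.Generic.
 */
public static Dictionary<State, double> UpdateBelief(
    Dictionary<State, double> belief,                                  // b(s)
    Action a, Observation o,
    IEnumerable<State> allStates,
    Func<State, Action, State, double> transitionProbability,          // T(s' | s, a)
    Func<Action, State, Observation, double> observationProbability)   // O(o | a, s')
{
    var next = new Dictionary<State, double>();
    double normalizer = 0.0;
    foreach (State sTag in allStates)
    {
        // Predict: probability mass of reaching s' from the current belief under a.
        double mass = 0.0;
        foreach (var kv in belief)
            mass += transitionProbability(kv.Key, a, sTag) * kv.Value;
        // Correct: weigh the predicted mass by the observation likelihood in s'.
        double weighted = observationProbability(a, sTag, o) * mass;
        if (weighted > 0.0)
        {
            next[sTag] = weighted;
            normalizer += weighted;
        }
    }
    // Normalize so the updated belief sums to 1 (assumes o is possible, i.e. normalizer > 0).
    foreach (State sTag in new List<State>(next.Keys))
        next[sTag] /= normalizer;
    return next;
}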
/*
 * Simulates a single trial of policy p, for at most cMaxSteps steps or until a
 * goal state is reached, and returns the sequence of belief states visited.
 */
private List<BeliefState> SimulateTrial(Policy p, int cMaxSteps)
{
    BeliefState bsCurrent = m_dDomain.InitialBelief, bsNext = null;
    State sCurrent = bsCurrent.RandomState(), sNext = null;
    Action a = null;
    Observation o = null;
    List<BeliefState> lBeliefs = new List<BeliefState>();
    while (!m_dDomain.IsGoalState(sCurrent) && lBeliefs.Count < cMaxSteps)
    {
        a = p.GetAction(bsCurrent);      // choose an action for the current belief
        sNext = sCurrent.Apply(a);       // sample the successor state
        o = sNext.RandomObservation(a);  // sample an observation from the successor state
        bsNext = bsCurrent.Next(a, o);   // update the belief given a and o
        bsCurrent = bsNext;
        lBeliefs.Add(bsCurrent);
        sCurrent = sNext;
    }
    return lBeliefs;
}
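/*
 * A possible usage sketch (hypothetical, not part of the framework above): estimate
 * the average number of steps a policy needs before hitting the goal or the step cap,
 * by running SimulateTrial repeatedly and averaging the trajectory lengths.
 */
private double AverageTrialLength(Policy p, int cTrials, int cMaxSteps)
{
    double totalSteps = 0;
    for (int i = 0; i < cTrials; i++)
    {
        List<BeliefState> lTrajectory = SimulateTrial(p, cMaxSteps);
        totalSteps += lTrajectory.Count; // one belief state is recorded per step
    }
    return totalSteps / cTrials;
}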
/*
 * Simulates a single trial of policy p, pushing the current state, belief, and
 * observation to the MazeViewer after every step so the trial can be watched.
 * Runs until a goal state is reached (no step cap). Thread.Sleep requires
 * using System.Threading; it pauses between steps so each update is visible.
 */
private void SimulateTrial(Policy p, MazeViewer viewer)
{
    BeliefState bsCurrent = InitialBelief, bsNext = null;
    State sCurrent = bsCurrent.RandomState(), sNext = null;
    Action a = null;
    Observation o = null;
    viewer.CurrentState = (MazeState)sCurrent;
    viewer.CurrentBelief = bsCurrent;
    while (!IsGoalState(sCurrent))
    {
        a = p.GetAction(bsCurrent);      // choose an action for the current belief
        sNext = sCurrent.Apply(a);       // sample the successor state
        o = sNext.RandomObservation(a);  // sample an observation
        bsNext = bsCurrent.Next(a, o);   // update the belief given a and o
        bsCurrent = bsNext;
        sCurrent = sNext;
        viewer.CurrentState = (MazeState)sCurrent;
        viewer.CurrentBelief = bsCurrent;
        viewer.CurrentObservation = (MazeObservation)o;
        Thread.Sleep(500);               // pause so each step is visible in the viewer
    }
}
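/*
 * Because Thread.Sleep blocks the calling thread, running SimulateTrial on the GUI
 * thread would freeze the viewer between steps. A minimal sketch of running the trial
 * on a background thread instead (assumes MazeViewer's properties are safe to set
 * from a non-UI thread, which depends on how the viewer is implemented):
 */
public void SimulateTrialAsync(Policy p, MazeViewer viewer)
{
    Thread tSimulation = new Thread(() => SimulateTrial(p, viewer));
    tSimulation.IsBackground = true; // do not keep the process alive after the UI closes
    tSimulation.Start();
}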