public BeliefState Next(Action a, Observation o)
{
    BeliefState bsNext = new BeliefState(m_dDomain);
    double normalizationFactor = 0.0;
    // b'(s') is proportional to O(s', a, o) * sum over s of T(s, a, s') * b(s)
    foreach (State stateTag in m_dDomain.States)
    {
        double updateProbabilityForState = 0.0;
        foreach (State state in m_dDomain.States)
        {
            if (state.Successors(a).Contains(stateTag))
            {
                double transitionProbability = state.TransitionProbability(a: a, sTag: stateTag);
                double beliefOfState = this.m_dBeliefs[state];
                updateProbabilityForState += transitionProbability * beliefOfState;
            }
        }
        updateProbabilityForState *= stateTag.ObservationProbability(a: a, o: o);
        bsNext.AddBelief(stateTag, updateProbabilityForState);
        normalizationFactor += updateProbabilityForState;
    }
    // Normalize so that the new belief is a probability distribution.
    foreach (State stateToNormalize in bsNext.m_dBeliefs.Keys.ToList())
    {
        bsNext.m_dBeliefs[stateToNormalize] /= normalizationFactor;
    }
    Debug.Assert(bsNext.Validate());
    return bsNext;
}
// t(b, a, b') = Pr(b' | a, b) = sum over all o in Omega of Pr(b' | a, o, b) * Pr(o | a, b)  (lecture 13, page 3)
public BeliefState Next(Action a, Observation o)
{
    BeliefState bsNext = new BeliefState(m_dDomain);
    foreach (State sTag in m_dDomain.States)
    {
        // b'(s') = O(s', a, o) * sum over s of T(s, a, s') * b(s), normalized by Pr(o | a, b)
        double stateProbabilityInBTag =
            sTag.ObservationProbability(a, o) * transitionProbabilityForEachState(sTag, a) / probabilityOfObservationGivenAB(a, o);
        bsNext.AddBelief(sTag, stateProbabilityInBTag);
    }
    return bsNext;
}
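For reference, a reconstruction in standard POMDP notation of the belief update that the comment above and the two Next implementations rely on; b is the current belief, T the transition model, and O the observation model:

\[
b_{a,o}(s') \;=\; \frac{O(s',a,o)\sum_{s} T(s,a,s')\,b(s)}{\Pr(o \mid a,b)},
\qquad
\Pr(o \mid a,b) \;=\; \sum_{s'} O(s',a,o)\sum_{s} T(s,a,s')\,b(s).
\]

Presumably transitionProbabilityForEachState(sTag, a) computes the inner sum for s' = sTag, and probabilityOfObservationGivenAB(a, o) computes the denominator.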
// Note: if m_lVectors is null or empty, ValueOf leaves avBest null and avBest.Action will throw.
public override Action GetAction(BeliefState bs)
{
    AlphaVector avBest = null;
    ValueOf(bs, m_lVectors, out avBest);
    return avBest.Action;
}
/* Simulates the policy for a number of steps over multiple trials and computes the average reward obtained.
 * To generate a single trial:
 * 1. Sample a starting state s from the initial belief state.
 * 2. Repeat until the goal is reached:
 *    a) compute the action a for the belief state.
 *    b) sample the result of applying a to s, obtaining s'.
 *    c) sample an observation o based on a and s'.
 *    d) compute the new belief state given the old belief state, a, and o.
 *    e) accumulate the (discounted) reward.
 * cStepsPerTrial = number of steps per trial
 * cTrials = number of trials of cStepsPerTrial steps each
 */
public double ComputeAverageDiscountedReward(Policy p, int cTrials, int cStepsPerTrial)
{
    double accumulatedReward = 0;
    for (int i = 1; i <= cTrials; i++)
    {
        int remainingSteps = cStepsPerTrial;
        BeliefState bs = InitialBelief;
        // step 1: sample a starting state s from the initial belief state
        State s = bs.RandomState();
        // step 2: repeat until the goal is reached or the step budget runs out
        while (!IsGoalState(s) && remainingSteps > 0)
        {
            // step 2a: compute the action a for the belief state
            Action a = p.GetAction(bs);
            // step 2b: sample the result of applying a to s, obtaining s'
            State sTag = s.Apply(a);
            double reward = bs.Reward(a);
            // step 2c: sample an observation o based on a and s'
            Observation o = sTag.RandomObservation(a);
            // step 2d: compute the new belief state given the old belief state, a, and o
            bs = bs.Next(a, o);
            // step 2e: accumulate the reward, discounted by the number of steps taken so far
            accumulatedReward += reward * Math.Pow(DiscountFactor, cStepsPerTrial - remainingSteps);
            s = sTag;
            remainingSteps--;
        }
    }
    return accumulatedReward / cTrials;
}
// Generates a new alpha vector from a belief state and an action.
private AlphaVector G(BeliefState bs, Action a)
{
    AlphaVector avSum = new AlphaVector(a);
    AlphaVector avGMax = null;
    double dValue = 0.0, dMaxValue = double.NegativeInfinity;
    foreach (Observation o in m_dDomain.Observations)
    {
        dMaxValue = double.NegativeInfinity;
        avGMax = null;
        foreach (AlphaVector avCurrent in m_lVectors)
        {
            AlphaVector avG = G(a, o, avCurrent);
            dValue = avG.InnerProduct(bs);
            if (dValue > dMaxValue)
            {
                dMaxValue = dValue;
                avGMax = avG;
            }
        }
        avSum += avGMax;
    }
    avSum *= m_dDomain.DiscountFactor;
    AlphaVector avResult = new AlphaVector(a);
    foreach (State s in m_dDomain.States)
    {
        avResult[s] = avSum[s] + s.Reward(a);
    }
    return avResult;
}
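For context, the point-based backup that G (above) and computeBestAlpha (below) appear to implement, in the usual PBVI notation where the alpha_i range over the current vector set m_lVectors, r_a is the immediate-reward vector of action a, and gamma is the discount factor:

\[
g_{a,o}^{i}(s) \;=\; \sum_{s'} O(s',a,o)\,T(s,a,s')\,\alpha_i(s'),
\qquad
g_{a}^{b} \;=\; r_a + \gamma \sum_{o} \arg\max_{g_{a,o}^{i}} \big(b \cdot g_{a,o}^{i}\big),
\qquad
\text{backup}(b) \;=\; \arg\max_{g_{a}^{b},\; a \in A} \big(b \cdot g_{a}^{b}\big).
\]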
public BeliefState(BeliefState bs)
{
    this.m_dDomain = bs.m_dDomain;
    this.m_dBeliefs = new Dictionary<State, double>();
    foreach (KeyValuePair<State, double> p in bs.m_dBeliefs)
    {
        m_dBeliefs.Add(p.Key, p.Value);
    }
}
private void pruneAlphaVector(List<BeliefState> bsSet)
{
    List<BeliefState> copyBset = new List<BeliefState>(bsSet);
    List<AlphaVector> temp_lVectors = new List<AlphaVector>();
    while (copyBset.Any())
    {
        BeliefState _bs = copyBset.ElementAt(0);
        AlphaVector _alpha = backup(_bs);
        double _reward = _alpha.InnerProduct(_bs);
        if (this.m_valueFunction[_bs].InnerProduct(_bs) < _reward)
        {
            // The backed-up vector improves _bs: adopt it, and also remove every other
            // belief state whose value it improves.
            this.m_valueFunction[_bs] = _alpha;
            temp_lVectors.Add(_alpha);
            copyBset.Remove(_bs);
            List<BeliefState> copyBset_inner = new List<BeliefState>(copyBset);
            foreach (BeliefState temp_bs in copyBset_inner)
            {
                double __reward = _alpha.InnerProduct(temp_bs);
                double curr_val = this.m_valueFunction[temp_bs].InnerProduct(temp_bs);
                if (curr_val < __reward)
                {
                    this.m_valueFunction[temp_bs] = _alpha;
                    copyBset.Remove(temp_bs);
                }
            }
        }
        else
        {
            // No improvement: keep the best existing vector for _bs.
            copyBset.Remove(_bs);
            double max_reward = double.NegativeInfinity;
            AlphaVector max_alpha = null;
            foreach (AlphaVector alpha in m_lVectors)
            {
                double reward = alpha.InnerProduct(_bs);
                if (reward > max_reward)
                {
                    max_reward = reward;
                    max_alpha = alpha;
                }
            }
            if (!temp_lVectors.Contains(max_alpha))
            {
                temp_lVectors.Add(max_alpha);
            }
            this.m_valueFunction[_bs] = max_alpha;
        }
    }
    this.m_lVectors = temp_lVectors;
}
public double InnerProduct(BeliefState bs)
{
    double dSum = 0.0;
    foreach (KeyValuePair<State, double> p in m_dValues)
    {
        dSum += p.Value * bs[p.Key];
    }
    return dSum;
}
private List<BeliefState> CopyB(List<BeliefState> B)
{
    List<BeliefState> Btag = new List<BeliefState>();
    foreach (BeliefState bs in B)
    {
        BeliefState newBeliefState = new BeliefState(bs);
        Btag.Add(newBeliefState);
    }
    return Btag;
}
/**
 * Computes the best alpha vector with action at its root for belief state bs
 * (alpha_action_bs).
 */
private AlphaVector computeBestAlpha(Action action, BeliefState bs)
{
    // Initializing an alpha vector with action at its root.
    AlphaVector discountedRewardVector = new AlphaVector(action);
    // For each observation obs, we loop over all alpha vectors and find the one
    // maximizing dot(bs, alpha_action_obs); the sum of these maximizers is used
    // to compute alpha_action_bs.
    foreach (Observation obs in m_dDomain.Observations)
    {
        // We compute alpha_a_o for every observation o, according to the equation in the slides.
        AlphaVector cur_alpha_ao = null;
        AlphaVector best_alpha_ao = new AlphaVector();
        double best_val = double.NegativeInfinity;
        double cur_val = 0;
        // Looping over all alpha vectors, finding the one that maximizes dot(bs, alpha_action_obs).
        foreach (AlphaVector av in m_lVectors)
        {
            // We compute av_action_obs for every av.
            cur_alpha_ao = computeAlphaAO(av, action, obs);
            // Dot product between av_action_obs and the belief state bs.
            cur_val = cur_alpha_ao.InnerProduct(bs);
            // We take the vector maximizing the dot product.
            if (cur_val > best_val)
            {
                best_alpha_ao = cur_alpha_ao;
                best_val = cur_val;
            }
        }
        // We accumulate the sum of these vectors: SUM(arg max(dot(bs, alpha_action_obs))).
        discountedRewardVector += best_alpha_ao;
    }
    // Multiplying the sum by the discount factor.
    discountedRewardVector = discountedRewardVector * m_dDomain.DiscountFactor;
    // The action's reward vector (cached per action); we add it to the sum and return the result.
    AlphaVector rA;
    if (rewardsVectors.ContainsKey(action))
    {
        rA = rewardsVectors[action];
    }
    else
    {
        rA = new AlphaVector();
        foreach (State s in m_dDomain.States)
        {
            rA[s] = s.Reward(action);
        }
        rewardsVectors[action] = rA;
    }
    return discountedRewardVector + rA;
}
private AlphaVector ArgMax(List<AlphaVector> m_lVectors, BeliefState b)
{
    AlphaVector maxAlphaVector = new AlphaVector();
    foreach (AlphaVector aVector in m_lVectors)
    {
        if (aVector.InnerProduct(b) > maxAlphaVector.InnerProduct(b))
        {
            maxAlphaVector = aVector;
        }
    }
    return maxAlphaVector;
}
private double ValueOf(BeliefState bs, List<AlphaVector> lVectors, out AlphaVector avBest)
{
    double dValue = 0.0, dMaxValue = double.NegativeInfinity;
    avBest = null;
    foreach (AlphaVector av in lVectors)
    {
        dValue = av.InnerProduct(bs);
        if (dValue > dMaxValue)
        {
            dMaxValue = dValue;
            avBest = av;
        }
    }
    return dMaxValue;
}
public BeliefState Next(Action a, Observation o)
{
    // Represents the new belief state b_a_o.
    BeliefState bsNext = new BeliefState(m_dDomain);
    // We divide the resulting belief by this factor instead of calculating Pr(o|a,b) explicitly.
    double normalizing_factor = 0;
    HashSet<State> reachableStates = new HashSet<State>();
    // The reachable states are the union of all successor states of states with positive
    // probability in the current belief. When we calculate the new distribution over states,
    // we only need to look at states s' such that Tr(s, a, s') > 0.
    foreach (KeyValuePair<State, double> entry in m_dBeliefs)
    {
        if (entry.Value > 0)
        {
            foreach (State s in entry.Key.Successors(a))
            {
                reachableStates.Add(s);
                // We optimize the calculation by accumulating the weighted transition value while
                // building the reachableStates set, instead of first building the set and only then
                // finding all its predecessors and performing the calculation.
                bsNext.AddBelief(s, entry.Value * entry.Key.TransitionProbability(a, s));
            }
        }
    }
    foreach (State s_prime in reachableStates)
    {
        double trans_prob = 0;
        double obs_prob = s_prime.ObservationProbability(a, o);
        // We calculate O(s', a, o) * dot(b, Tr(s, a, s')).
        trans_prob = bsNext[s_prime]; // for each s', this equals dot(b, Tr(s, a, s'))
        trans_prob *= obs_prob;
        // The normalizing factor is the sum of all values; dividing by it makes the vector a distribution.
        normalizing_factor += trans_prob;
        // Updating the new belief state.
        bsNext[s_prime] = trans_prob;
    }
    foreach (State s in reachableStates)
    {
        bsNext[s] /= normalizing_factor;
    }
    Debug.Assert(bsNext.Validate());
    return bsNext;
}
/**
 * Calculates the value of a belief state bs with respect to a list of alpha vectors,
 * i.e. finds the alpha vector that maximizes dot(bs, alpha), returns the value of
 * this dot product, and returns the maximizing vector through avBest.
 */
private double ValueOf(BeliefState bs, List<AlphaVector> lVectors, out AlphaVector avBest)
{
    double dValue = 0.0, dMaxValue = double.NegativeInfinity;
    avBest = null;
    // We loop over all alpha vectors.
    foreach (AlphaVector av in lVectors)
    {
        dValue = av.InnerProduct(bs);
        if (dValue > dMaxValue) // taking the maximum dot product
        {
            dMaxValue = dValue;
            avBest = av;
        }
    }
    return dMaxValue;
}
// Collects cBeliefs belief states by a random forward walk from the initial belief:
// repeatedly pick a random action, sample an observation, and advance the belief.
private List<BeliefState> GenerateB(int cBeliefs, Random rand)
{
    List<BeliefState> B = new List<BeliefState>();
    BeliefState initB = m_dDomain.InitialBelief;
    int n = cBeliefs;
    while (n > 0)
    {
        Action a = m_dDomain.GetRandomAction(rand);
        Observation oCurr = initB.RandomObservation(a);
        BeliefState bNext = initB.Next(a, oCurr);
        B.Add(bNext);
        initB = bNext;
        n--;
    }
    return B;
}
private AlphaVector backup(BeliefState bs)
{
    AlphaVector avBest = null, avCurrent = null;
    double dMaxValue = double.NegativeInfinity, dValue = 0.0;
    // For each action, generate its candidate vector and keep the one maximizing dot(bs, alpha).
    foreach (var action in m_dDomain.Actions)
    {
        avCurrent = G(bs, action);
        dValue = avCurrent.InnerProduct(bs);
        if (dValue > dMaxValue)
        {
            dMaxValue = dValue;
            avBest = avCurrent;
        }
    }
    return avBest;
}
/**
 * The backup operation: receives a belief state bs and returns Backup(m_lVectors, bs),
 * i.e. the alpha vector alpha_a_bs maximizing dot(bs, alpha_a_bs).
 */
private AlphaVector backup(BeliefState bs)
{
    AlphaVector avBest = new AlphaVector(), avCurrent = new AlphaVector();
    double dMaxValue = double.NegativeInfinity, dValue = 0.0;
    // We loop over all actions in the domain; for every action a we take the best
    // alpha vector with a at its root.
    foreach (Action a in m_dDomain.Actions)
    {
        avCurrent = computeBestAlpha(a, bs); // alpha_a_b
        dValue = avCurrent.InnerProduct(bs); // dot product with bs
        if (dValue > dMaxValue)
        {
            // Taking the vector alpha_a_b that maximizes the dot product.
            avBest = avCurrent;
            dMaxValue = dValue;
        }
    }
    return avBest; // the best alpha_a_bs
}
private List<BeliefState> SimulateTrial(Policy p, int cMaxSteps)
{
    BeliefState bsCurrent = m_dDomain.InitialBelief, bsNext = null;
    State sCurrent = bsCurrent.RandomState(), sNext = null;
    Action a = null;
    Observation o = null;
    List<BeliefState> lBeliefs = new List<BeliefState>();
    while (!m_dDomain.IsGoalState(sCurrent) && lBeliefs.Count < cMaxSteps)
    {
        a = p.GetAction(bsCurrent);
        sNext = sCurrent.Apply(a);
        o = sNext.RandomObservation(a);
        bsNext = bsCurrent.Next(a, o);
        bsCurrent = bsNext;
        lBeliefs.Add(bsCurrent);
        sCurrent = sNext;
    }
    return lBeliefs;
}
/**
 * A recursive function performing one trial with stepsLeft steps. Given a policy p,
 * stepsLeft, the current state, and the current belief state bs:
 *
 * a) compute the action for the belief state bs.
 * b) sample the result of applying a to state, obtaining nextState.
 * c) sample an observation o based on a and nextState.
 * d) compute the new belief state given the old belief state, a, and o.
 * e) call the function recursively with the same policy p, stepsLeft - 1, the new state,
 *    and the new belief state, and accumulate the discounted reward.
 */
private double calcTrialReward(Policy p, int stepsLeft, State state, BeliefState bs)
{
    // If we are already in a goal state or no steps are left, the reward is 0.
    if (IsGoalState(state) || stepsLeft == 0)
        return 0;
    // Calculating the action for the belief state based on the policy.
    Action a = p.GetAction(bs);
    // Applying a to state, resulting in a new state nextState.
    State nextState = state.Apply(a);
    // The reward for performing a, evaluated at the resulting state.
    double reward = nextState.Reward(a);
    // We sample an observation based on nextState and a.
    Observation o = nextState.RandomObservation(a);
    // Updating the reward and calling the function recursively, continuing the forward search toward a goal state.
    reward += this.DiscountFactor * calcTrialReward(p, stepsLeft - 1, nextState, bs.Next(a, o));
    return reward;
}
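A trial helper like calcTrialReward is typically averaged over several independent trials. Below is a minimal, hedged sketch of such a wrapper; it assumes only members that appear elsewhere in these snippets (InitialBelief on the domain class and BeliefState.RandomState() for sampling a start state), and is not the original author's implementation.

// Hedged sketch: averages calcTrialReward over cTrials independent trials.
// Assumes InitialBelief and BeliefState.RandomState() exist, as in the snippets above.
public double ComputeAverageDiscountedReward(Policy p, int cTrials, int cStepsPerTrial)
{
    double dSum = 0.0;
    for (int iTrial = 0; iTrial < cTrials; iTrial++)
    {
        BeliefState bsInitial = InitialBelief;  // initial belief of the domain
        State sStart = bsInitial.RandomState(); // sample a concrete start state
        dSum += calcTrialReward(p, cStepsPerTrial, sStart, bsInitial);
    }
    return dSum / cTrials; // average discounted reward across trials
}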
public double ComputeAverageDiscountedReward(Policy p, int cTrials, int cStepsPerTrial)
{
    double to_ret = 0.0;
    List<double> rewards = new List<double>();
    for (int i = 0; i < cTrials; i++)
    {
        // Sample a starting state from the initial belief.
        State target = sampleInitialState();
        BeliefState currentBeliefState = InitialBelief;
        double sumRewards = 0.0;
        int counter = 0;
        while (!IsGoalState(target) && counter < cStepsPerTrial)
        {
            Action a = p.GetAction(currentBeliefState);
            State newState = target.Apply(a: a);
            // Build the cumulative observation distribution for the new state, then sample from it.
            List<KeyValuePair<Observation, double>> probabilitiesForObservation = new List<KeyValuePair<Observation, double>>();
            double sum = 0.0;
            foreach (Observation obs in Observations)
            {
                double prob = newState.ObservationProbability(a: a, o: obs);
                sum += prob;
                probabilitiesForObservation.Add(new KeyValuePair<Observation, double>(obs, sum));
            }
            Observation newObservation = samplingObservations(probabilitiesForObservation);
            // Accumulate the discounted reward and advance the belief state.
            double reward = currentBeliefState.Reward(a);
            currentBeliefState = currentBeliefState.Next(a: a, o: newObservation);
            sumRewards += reward * Math.Pow(DiscountFactor, counter);
            counter++;
            target = newState;
        }
        rewards.Add(sumRewards);
    }
    foreach (double r in rewards)
    {
        to_ret += r;
    }
    return to_ret / cTrials;
}
// Returns the best alpha vector corresponding to a given belief state.
private AlphaVector Backup(BeliefState bs)
{
    AlphaVector avBest = null;
    double dMaxValue = double.NegativeInfinity, dValue = 0.0;
    foreach (Action aCurr in m_dDomain.Actions)
    {
        // G(bs, aCurr) already maximizes over m_lVectors per observation,
        // so one candidate vector per action suffices.
        AlphaVector avBA = G(bs, aCurr);
        dValue = avBA.InnerProduct(bs);
        if (dMaxValue < dValue)
        {
            dMaxValue = dValue;
            avBest = avBA;
        }
    }
    return avBest;
}
public void PointBasedVI(int cBeliefs, int cMaxIterations)
{
    Random rand = new Random();
    List<BeliefState> B = GenerateB(cBeliefs, rand);
    InitV();
    m_dGCache = new Dictionary<AlphaVector, Dictionary<Action, Dictionary<Observation, AlphaVector>>>();
    List<BeliefState> BTag;
    while (cMaxIterations > 0)
    {
        BTag = CopyB(B);
        List<AlphaVector> VTag = new List<AlphaVector>();
        while (BTag.Count != 0)
        {
            // Choose an arbitrary point in BTag to improve.
            BeliefState bCurr = RandomBeliefState(BTag, rand);
            AlphaVector newAV = Backup(bCurr);
            AlphaVector avBest = new AlphaVector();
            double currValue = ValueOf(bCurr, m_lVectors, out avBest);
            double alphaDotB = newAV.InnerProduct(bCurr);
            if (alphaDotB > currValue)
            {
                // Keep only the points whose value was not improved by newAV.
                BTag = BTag.Where(b => newAV.InnerProduct(b) < ValueOf(b, m_lVectors, out AlphaVector avTmp)).ToList();
                avBest = newAV;
            }
            else
            {
                BTag.Remove(bCurr);
                avBest = ArgMax(m_lVectors, b: bCurr);
            }
            VTag.Add(avBest);
        }
        m_lVectors = VTag;
        cMaxIterations--;
    }
}
private void SimulateTrial(Policy p, MazeViewer viewer)
{
    BeliefState bsCurrent = InitialBelief, bsNext = null;
    State sCurrent = bsCurrent.sampleState(), sNext = null;
    Action a = null;
    Observation o = null;
    viewer.CurrentState = (MazeState)sCurrent;
    viewer.CurrentBelief = bsCurrent;
    while (!IsGoalState(sCurrent))
    {
        a = p.GetAction(bsCurrent);
        sNext = sCurrent.Apply(a);
        o = sNext.RandomObservation(a);
        bsNext = bsCurrent.Next(a, o);
        bsCurrent = bsNext;
        sCurrent = sNext;
        viewer.CurrentState = (MazeState)sCurrent;
        viewer.CurrentBelief = bsCurrent;
        viewer.CurrentObservation = (MazeObservation)o;
        Thread.Sleep(500);
    }
}
public abstract Action GetAction(BeliefState bs);
public override Action GetAction(BeliefState bs)
{
    int idx = RandomGenerator.Next(m_lActions.Count);
    return m_lActions[idx];
}
public override Action GetAction(BeliefState bs)
{
    // Not yet implemented; a possible greedy implementation is sketched below.
    throw new NotImplementedException();
}
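A minimal sketch of how this stub could be filled in, assuming the policy class holds a non-empty alpha-vector list m_lVectors and that AlphaVector exposes InnerProduct and Action, as in the other GetAction snippet above. This is an assumption-based sketch, not the intended solution.

// Hedged sketch: greedy alpha-vector policy.
// Assumes a List<AlphaVector> m_lVectors field on this class.
public override Action GetAction(BeliefState bs)
{
    AlphaVector avBest = null;
    double dMaxValue = double.NegativeInfinity;
    foreach (AlphaVector av in m_lVectors)
    {
        double dValue = av.InnerProduct(bs);
        if (dValue > dMaxValue)
        {
            dMaxValue = dValue;
            avBest = av;
        }
    }
    return avBest.Action; // root action of the best vector for this belief
}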
/**
 * Performs value iteration using the Perseus update algorithm:
 * generates a set containing cBeliefs belief states and performs value iteration
 * for at most cMaxIterations iterations.
 */
public void PointBasedVI(int cBeliefs, int cMaxIterations)
{
    // Generate an initial set containing cBeliefs belief states.
    List<BeliefState> beliefStates = CollectBeliefs(cBeliefs);
    List<AlphaVector> vTag; // V'
    int iterationsLeft = cMaxIterations;
    while (iterationsLeft > 0)
    {
        vTag = new List<AlphaVector>();
        List<BeliefState> beliefStatesLeftToImprove = new List<BeliefState>(beliefStates); // B'
        while (beliefStatesLeftToImprove.Count() > 0)
        {
            // While there are belief states left to improve,
            // select a random index of a belief state to improve.
            int ri = RandomGenerator.Next(beliefStatesLeftToImprove.Count());
            // Iterate over the belief state set to retrieve the ri'th item.
            List<BeliefState>.Enumerator e = beliefStatesLeftToImprove.GetEnumerator();
            for (int i = 0; i < ri + 1; i++) // iterating until the belief state at index ri
            {
                e.MoveNext();
            }
            BeliefState sampledBS = e.Current; // a randomly chosen belief state to improve
            // We calculate the backup of sampledBS.
            AlphaVector alpha = backup(sampledBS);
            AlphaVector alphaToAdd; // will contain the alpha vector to add to V'
            AlphaVector prevBestAlphaVector = null;
            // Calculating the value of sampledBS (V(sampledBS)), i.e. the best dot product alpha * b.
            double prevValue = ValueOf(sampledBS, m_lVectors, out prevBestAlphaVector);
            if (alpha.InnerProduct(sampledBS) >= prevValue)
            {
                // alpha is dominating: remove all belief states that are improved by it.
                List<BeliefState> beliefStatesToKeep = new List<BeliefState>();
                foreach (BeliefState b_prime in beliefStatesLeftToImprove)
                {
                    AlphaVector a = null;
                    if (alpha.InnerProduct(b_prime) < ValueOf(b_prime, m_lVectors, out a))
                    {
                        beliefStatesToKeep.Add(b_prime);
                    }
                }
                beliefStatesLeftToImprove = beliefStatesToKeep;
                // Since alpha is dominating, we add alpha to V'.
                alphaToAdd = alpha;
            }
            else
            {
                // alpha does not improve; we remove sampledBS from the set.
                beliefStatesLeftToImprove.Remove(sampledBS);
                alphaToAdd = prevBestAlphaVector;
            }
            if (!vTag.Contains(alphaToAdd))
            {
                vTag.Add(alphaToAdd);
            }
        }
        // An optional epsilon-convergence check (estimateDiff between m_lVectors and vTag
        // over beliefStates) was disabled here.
        Console.WriteLine("Iterations left {0}", iterationsLeft);
        m_lVectors = vTag;
        iterationsLeft--;
    }
}
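A hedged sketch of how these pieces might be wired together end to end. The concrete type names (MyPomdpDomain, PerseusSolver) and parameter values are placeholders and do not appear in the snippets above; only PointBasedVI, GetAction, and ComputeAverageDiscountedReward are assumed, since they do.

// Hedged usage sketch: placeholder domain/solver types, assumption-based.
MyPomdpDomain domain = new MyPomdpDomain();              // hypothetical POMDP domain
PerseusSolver solver = new PerseusSolver(domain);        // hypothetical Policy holding m_lVectors
solver.PointBasedVI(cBeliefs: 100, cMaxIterations: 50);  // build the alpha-vector value function
double avg = domain.ComputeAverageDiscountedReward(solver, cTrials: 100, cStepsPerTrial: 200);
Console.WriteLine("Average discounted reward: {0}", avg);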