/// <summary>
/// Searches for a policy by alternating policy evaluation and greedy
/// policy improvement until a full improvement pass changes nothing.
/// </summary>
/// <remarks>
/// The search starts from <c>initialUtilityFunction()</c> and
/// <c>randomPolicy()</c>. Each round re-estimates the utilities with
/// three evaluation sweeps, then visits every non-final state and
/// switches its action whenever the best available action has a strictly
/// greater expected utility than the action the policy currently picks.
/// </remarks>
/// <param name="gamma">Discount factor forwarded to <c>policyEvaluation</c>.</param>
/// <returns>The policy reached once no state's action was changed in a round.</returns>
public MDPPolicy<STATE_TYPE, ACTION_TYPE> policyIteration(double gamma)
{
    MDPUtilityFunction<STATE_TYPE> utilities = initialUtilityFunction();
    MDPPolicy<STATE_TYPE, ACTION_TYPE> pi = randomPolicy();

    bool policyStable;
    do
    {
        // Assume the policy is stable until some state proves otherwise.
        policyStable = true;
        utilities = policyEvaluation(pi, utilities, gamma, 3);

        foreach (STATE_TYPE state in nonFinalstates)
        {
            Pair<ACTION_TYPE, Double> bestOverall =
                transitionModel.getTransitionWithMaximumExpectedUtility(state, utilities);
            Pair<ACTION_TYPE, Double> bestUnderPolicy =
                transitionModel.getTransitionWithMaximumExpectedUtilityUsingPolicy(pi, state, utilities);

            double bestUtility = bestOverall.getSecond();
            double policyUtility = bestUnderPolicy.getSecond();

            // Strict comparison: ties keep the current action.
            if (bestUtility > policyUtility)
            {
                pi.setAction(state, bestOverall.getFirst());
                policyStable = false;
            }
        }
    } while (!policyStable);

    return pi;
}
/// <summary>
/// Builds a fresh policy that assigns every non-final state the action
/// returned by <c>transitionModel.randomActionFor</c> for that state.
/// </summary>
/// <returns>A newly created policy covering each state in <c>nonFinalstates</c>.</returns>
public MDPPolicy<STATE_TYPE, ACTION_TYPE> randomPolicy()
{
    MDPPolicy<STATE_TYPE, ACTION_TYPE> result = new MDPPolicy<STATE_TYPE, ACTION_TYPE>();

    foreach (STATE_TYPE state in nonFinalstates)
    {
        ACTION_TYPE chosen = transitionModel.randomActionFor(state);
        result.setAction(state, chosen);
    }

    return result;
}
/// <summary>
/// Estimates the utilities obtained by following a fixed policy, by
/// applying <c>valueIterateOnceWith</c> a given number of times.
/// </summary>
/// <param name="pi">Policy whose actions are used in every sweep.</param>
/// <param name="U">Starting utilities; the method works on a copy of it.</param>
/// <param name="gamma">Discount factor forwarded to each sweep.</param>
/// <param name="iterations">
/// Number of sweeps to run. Zero or a negative value runs none, so the
/// result is just the copy of <paramref name="U"/>.
/// </param>
/// <returns>The utility function produced after the last sweep.</returns>
public MDPUtilityFunction<STATE_TYPE> policyEvaluation(
    MDPPolicy<STATE_TYPE, ACTION_TYPE> pi,
    MDPUtilityFunction<STATE_TYPE> U,
    double gamma,
    int iterations)
{
    MDPUtilityFunction<STATE_TYPE> current = U.copy();

    int sweepsDone = 0;
    while (sweepsDone < iterations)
    {
        current = valueIterateOnceWith(gamma, pi, current);
        sweepsDone++;
    }

    return current;
}
/// <summary>
/// Looks up the action the policy prescribes for a state and reports the
/// action/expected-utility pair for the transitions that start in that
/// state under that action.
/// </summary>
/// <param name="policy">Policy queried for the action to take in <paramref name="s"/>.</param>
/// <param name="s">State the transitions start from.</param>
/// <param name="uf">Utility function used to score the transitions.</param>
/// <returns>
/// For a terminal state, a pair of <c>default(ACTION_TYPE)</c> and
/// <c>0.0</c>. Otherwise the result of <c>getActionWithMaximumUtility</c>
/// over the expected utilities of the policy-selected transitions.
/// </returns>
public Pair<ACTION_TYPE, Double> getTransitionWithMaximumExpectedUtilityUsingPolicy(
    MDPPolicy<STATE_TYPE, ACTION_TYPE> policy,
    STATE_TYPE s,
    MDPUtilityFunction<STATE_TYPE> uf)
{
    if (!isTerminal(s))
    {
        // Only transitions matching the policy's action for s are considered.
        List<MDPTransition<STATE_TYPE, ACTION_TYPE>> policyTransitions =
            getTransitionsWithStartingStateAndAction(s, policy.getAction(s));

        Dictionary<ACTION_TYPE, Double> utilityByAction =
            getExpectedUtilityForSelectedTransitions(policyTransitions, uf);

        return getActionWithMaximumUtility(utilityByAction);
    }

    // Terminal state: no action to take and no further utility.
    return new Pair<ACTION_TYPE, Double>(default(ACTION_TYPE), 0.0);
}
/// <summary>
/// Runs a single utility-update sweep over all non-final states while
/// following a fixed policy.
/// </summary>
/// <remarks>
/// Each state's new utility is its reward plus <paramref name="gamma"/>
/// times the expected utility of the policy's transition. Expected
/// utilities are always read from the incoming <paramref name="U"/>;
/// results are written into a copy, so updates made during the sweep do
/// not influence other states in the same sweep.
/// </remarks>
/// <param name="gamma">Discount factor applied to the expected utility.</param>
/// <param name="pi">Policy that selects the action for each state.</param>
/// <param name="U">Utilities from before the sweep; not modified by this method's own writes.</param>
/// <returns>A copy of <paramref name="U"/> with the non-final states updated.</returns>
private MDPUtilityFunction<STATE_TYPE> valueIterateOnceWith(
    double gamma,
    MDPPolicy<STATE_TYPE, ACTION_TYPE> pi,
    MDPUtilityFunction<STATE_TYPE> U)
{
    MDPUtilityFunction<STATE_TYPE> updated = U.copy();

    foreach (STATE_TYPE state in nonFinalstates)
    {
        Pair<ACTION_TYPE, Double> policyOutcome =
            transitionModel.getTransitionWithMaximumExpectedUtilityUsingPolicy(pi, state, U);

        double reward = rewardFunction.getRewardFor(state);
        double discountedFuture = gamma * policyOutcome.getSecond();

        updated.setUtility(state, reward + discountedFuture);
    }

    // System.Console.WriteLine("ValueIterationOnce before " + U);
    // System.Console.WriteLine("ValueIterationOnce after " + U_dash);
    return updated;
}