/// <summary>
/// Policy iteration: alternates a bounded policy-evaluation step (3 sweeps)
/// with a greedy policy-improvement step until no action changes.
/// </summary>
/// <param name="gamma">discount factor applied to future utilities</param>
/// <returns>a policy that is greedy with respect to its own evaluated utilities</returns>
public MDPPolicy <STATE_TYPE, ACTION_TYPE> policyIteration(double gamma) {
    MDPUtilityFunction <STATE_TYPE> utility = initialUtilityFunction();
    MDPPolicy <STATE_TYPE, ACTION_TYPE> policy = randomPolicy();
    bool stable;
    do {
        stable = true;
        // Evaluate the current policy for a fixed number of sweeps.
        utility = policyEvaluation(policy, utility, gamma, 3);
        foreach (STATE_TYPE state in nonFinalstates) {
            // Best achievable transition vs. what the current policy prescribes.
            Pair <ACTION_TYPE, Double> bestOverall = transitionModel
                    .getTransitionWithMaximumExpectedUtility(state, utility);
            Pair <ACTION_TYPE, Double> bestUnderPolicy = transitionModel
                    .getTransitionWithMaximumExpectedUtilityUsingPolicy(policy, state, utility);
            if (bestOverall.getSecond() > bestUnderPolicy.getSecond()) {
                // A strictly better action exists: adopt it and keep iterating.
                policy.setAction(state, bestOverall.getFirst());
                stable = false;
            }
        }
    } while (!stable);
    return policy;
}
/// <summary>
/// Value iteration: repeatedly applies the Bellman update to every non-final
/// state until the largest single-state utility change drops to the error
/// threshold delta_max = error * gamma / (1 - gamma).
/// </summary>
/// <param name="gamma">discount factor (must be &lt; 1 for delta_max to be finite)</param>
/// <param name="error">maximum allowed error in the returned utilities</param>
/// <param name="delta">scratch accumulator for the per-sweep maximum change;
/// overwritten on every sweep (kept as a parameter for interface compatibility)</param>
/// <returns>the converged utility function</returns>
public MDPUtilityFunction <STATE_TYPE> valueIteration(double gamma, double error, double delta) {
    MDPUtilityFunction <STATE_TYPE> U = initialUtilityFunction();
    MDPUtilityFunction <STATE_TYPE> U_dash = initialUtilityFunction();
    double delta_max = (error * gamma) / (1 - gamma);
    do {
        U = U_dash.copy();
        // System.Console.WriteLine(U);
        delta = 0.0;
        foreach (STATE_TYPE s in nonFinalstates) {
            Pair <ACTION_TYPE, Double> highestUtilityTransition = transitionModel
                    .getTransitionWithMaximumExpectedUtility(s, U);
            // Bellman update: immediate reward plus discounted best expected utility.
            double utility = rewardFunction.getRewardFor(s)
                    + (gamma * highestUtilityTransition.getSecond());
            U_dash.setUtility(s, utility);
            // Track the largest change over this sweep.
            if ((Math.Abs(U_dash.getUtility(s) - U.getUtility(s))) > delta) {
                delta = Math.Abs(U_dash.getUtility(s) - U.getUtility(s));
            }
        }
        // BUG FIX: was `while (delta < delta_max)`, which stopped the moment the
        // utilities were still CHANGING and looped while they were converged —
        // the exact inverse of the termination test. Iterate while the maximum
        // change still exceeds the threshold.
    } while (delta > delta_max);
    return(U);
}
/// <summary>
/// Performs one Bellman sweep over all non-final states and reports the
/// largest utility change observed during the sweep.
/// </summary>
/// <param name="gamma">discount factor</param>
/// <param name="presentUtilityFunction">utilities from the previous sweep (not modified)</param>
/// <returns>a pair of (new utility function, maximum utility growth this sweep)</returns>
public Pair <MDPUtilityFunction <STATE_TYPE>, Double> valueIterateOnce(
    double gamma, MDPUtilityFunction <STATE_TYPE> presentUtilityFunction) {
    double maxUtilityGrowth = 0.0;
    MDPUtilityFunction <STATE_TYPE> newUtilityFunction =
        new MDPUtilityFunction <STATE_TYPE>();

    foreach (STATE_TYPE s in nonFinalstates) {
        double utility = valueIterateOnceForGivenState(gamma,
                presentUtilityFunction, s);
        double differenceInUtility = Math.Abs(utility
                - presentUtilityFunction.getUtility(s));
        if (differenceInUtility > maxUtilityGrowth) {
            maxUtilityGrowth = differenceInUtility;
        }
        newUtilityFunction.setUtility(s, utility);
    }

    // FIX: this copy was nested inside the loop above, redundantly re-copying
    // every terminal utility once per non-final state and skipping the copy
    // entirely when nonFinalstates was empty. Terminal utilities are invariant
    // across sweeps, so copy them exactly once, unconditionally.
    foreach (STATE_TYPE state in terminalStates) {
        newUtilityFunction.setUtility(state, presentUtilityFunction
                .getUtility(state));
    }

    return(new Pair <MDPUtilityFunction <STATE_TYPE>, Double>(
               newUtilityFunction, maxUtilityGrowth));
}
/// <summary>
/// Sums, per action, the probability-weighted utilities of the destination
/// states over the given transitions: E[U | action] = sum_s' P(s'|s,a) * U(s').
/// </summary>
/// <param name="transitions">transitions to aggregate (typically all those leaving one state)</param>
/// <param name="uf">utility function supplying destination-state utilities</param>
/// <returns>map from each action seen in the transitions to its expected utility</returns>
private Dictionary <ACTION_TYPE, Double> getExpectedUtilityForSelectedTransitions(
    List <MDPTransition <STATE_TYPE, ACTION_TYPE> > transitions,
    MDPUtilityFunction <STATE_TYPE> uf) {
    Dictionary <ACTION_TYPE, Double> actionsToUtilities =
        new Dictionary <ACTION_TYPE, Double>();
    foreach (MDPTransition <STATE_TYPE, ACTION_TYPE> triplet in transitions) {
        STATE_TYPE s = triplet.getInitialState();
        ACTION_TYPE action = triplet.getAction();
        STATE_TYPE destinationState = triplet.getDestinationState();
        double probabilityOfTransition = getTransitionProbability(s, action,
                destinationState);
        double expectedUtility = (probabilityOfTransition * uf
                .getUtility(destinationState));
        // FIX: the original detected key absence by storing Double.MinValue as an
        // in-band sentinel and comparing with floating-point `==` — fragile if a
        // legitimate value ever equals the sentinel. Branch on key presence directly.
        if (actionsToUtilities.ContainsKey(action)) {
            actionsToUtilities[action] += expectedUtility;
        } else {
            actionsToUtilities.Add(action, expectedUtility);
        }
    }
    return(actionsToUtilities);
}
/// <summary>
/// Converts this reward function into a utility function by seeding each
/// known state's utility with its immediate reward.
/// </summary>
/// <returns>a new utility function; the reward function is not modified</returns>
public MDPUtilityFunction <STATE_TYPE> asUtilityFunction() {
    MDPUtilityFunction <STATE_TYPE> result = new MDPUtilityFunction <STATE_TYPE>();
    foreach (STATE_TYPE s in stateToReward.Keys) {
        result.setUtility(s, getRewardFor(s));
    }
    return result;
}
/// <summary>
/// Creates an independent copy of this utility function; later mutations of
/// either copy do not affect the other.
/// </summary>
/// <returns>a new utility function holding the same state-to-utility entries</returns>
public MDPUtilityFunction <STATE_TYPE> copy() {
    MDPUtilityFunction <STATE_TYPE> duplicate = new MDPUtilityFunction <STATE_TYPE>();
    foreach (STATE_TYPE key in hash.Keys) {
        duplicate.setUtility(key, hash[key]);
    }
    return duplicate;
}
//
// PRIVATE METHODS
//

/// <summary>
/// Computes the Bellman update for a single state: immediate reward plus the
/// discounted expected utility of the best available transition.
/// </summary>
/// <param name="gamma">discount factor</param>
/// <param name="presentUtilityFunction">utilities from the previous sweep</param>
/// <param name="state">state whose updated utility is wanted</param>
/// <returns>R(state) + gamma * max_a E[U(s') | state, a]</returns>
private double valueIterateOnceForGivenState(double gamma,
        MDPUtilityFunction <STATE_TYPE> presentUtilityFunction, STATE_TYPE state) {
    Pair <ACTION_TYPE, Double> best = transitionModel
            .getTransitionWithMaximumExpectedUtility(state, presentUtilityFunction);
    return rewardFunction.getRewardFor(state) + (gamma * best.getSecond());
}
/// <summary>
/// Approximate policy evaluation: applies a fixed number of simplified
/// Bellman sweeps under the given policy, starting from U.
/// </summary>
/// <param name="pi">policy whose utilities are being evaluated</param>
/// <param name="U">starting utility estimate (not modified; a copy is updated)</param>
/// <param name="gamma">discount factor</param>
/// <param name="iterations">number of sweeps to perform</param>
/// <returns>the utility estimate after the requested number of sweeps</returns>
public MDPUtilityFunction <STATE_TYPE> policyEvaluation(
    MDPPolicy <STATE_TYPE, ACTION_TYPE> pi, MDPUtilityFunction <STATE_TYPE> U,
    double gamma, int iterations) {
    MDPUtilityFunction <STATE_TYPE> estimate = U.copy();
    for (int sweep = 0; sweep < iterations; sweep++) {
        estimate = valueIterateOnceWith(gamma, pi, estimate);
    }
    return estimate;
}
/// <summary>
/// Finds, among all transitions leaving state s, the action with the highest
/// expected utility under uf. Terminal states yield (default action, 0.0).
/// </summary>
/// <param name="s">the starting state</param>
/// <param name="uf">utility function used to score destination states</param>
/// <returns>pair of (best action, its expected utility)</returns>
public Pair <ACTION_TYPE, Double> getTransitionWithMaximumExpectedUtility(
    STATE_TYPE s, MDPUtilityFunction <STATE_TYPE> uf) {
    // Terminal states have no outgoing transitions worth scoring.
    if (isTerminal(s)) {
        return new Pair <ACTION_TYPE, Double>(default(ACTION_TYPE), 0.0);
    }
    List <MDPTransition <STATE_TYPE, ACTION_TYPE> > outgoing =
        getTransitionsStartingWith(s);
    Dictionary <ACTION_TYPE, Double> utilitiesByAction =
        getExpectedUtilityForSelectedTransitions(outgoing, uf);
    return getActionWithMaximumUtility(utilitiesByAction);
}
/// <summary>
/// Scores only the action prescribed by the given policy for state s,
/// returning that action and its expected utility under uf.
/// Terminal states yield (default action, 0.0).
/// </summary>
/// <param name="policy">policy supplying the action for s</param>
/// <param name="s">the starting state</param>
/// <param name="uf">utility function used to score destination states</param>
/// <returns>pair of (the policy's action, its expected utility)</returns>
public Pair <ACTION_TYPE, Double> getTransitionWithMaximumExpectedUtilityUsingPolicy(
    MDPPolicy <STATE_TYPE, ACTION_TYPE> policy, STATE_TYPE s,
    MDPUtilityFunction <STATE_TYPE> uf) {
    // Terminal states have no outgoing transitions worth scoring.
    if (isTerminal(s)) {
        return new Pair <ACTION_TYPE, Double>(default(ACTION_TYPE), 0.0);
    }
    List <MDPTransition <STATE_TYPE, ACTION_TYPE> > policyTransitions =
        getTransitionsWithStartingStateAndAction(s, policy.getAction(s));
    Dictionary <ACTION_TYPE, Double> utilitiesByAction =
        getExpectedUtilityForSelectedTransitions(policyTransitions, uf);
    return getActionWithMaximumUtility(utilitiesByAction);
}
/// <summary>
/// Runs exactly numberOfIterations Bellman sweeps, ignoring convergence;
/// useful for demos and tests where a fixed amount of work is wanted.
/// </summary>
/// <param name="numberOfIterations">how many sweeps to perform</param>
/// <param name="gamma">discount factor</param>
/// <returns>the utility function after the final sweep</returns>
public MDPUtilityFunction <STATE_TYPE> valueIterationForFixedIterations(
    int numberOfIterations, double gamma) {
    MDPUtilityFunction <STATE_TYPE> utilities = initialUtilityFunction();
    for (int sweep = 0; sweep < numberOfIterations; sweep++) {
        // The sweep's maximum utility growth (result.getSecond()) is discarded here.
        Pair <MDPUtilityFunction <STATE_TYPE>, Double> result =
            valueIterateOnce(gamma, utilities);
        utilities = result.getFirst();
    }
    return utilities;
}
/// <summary>
/// One simplified Bellman sweep under a fixed policy: each non-final state's
/// utility becomes its reward plus the discounted expected utility of the
/// policy's action, all reads taken from the unmodified input U.
/// </summary>
/// <param name="gamma">discount factor</param>
/// <param name="pi">policy supplying the action for each state</param>
/// <param name="U">utilities from the previous sweep (not modified)</param>
/// <returns>a new utility function holding the updated values</returns>
private MDPUtilityFunction <STATE_TYPE> valueIterateOnceWith(double gamma,
        MDPPolicy <STATE_TYPE, ACTION_TYPE> pi, MDPUtilityFunction <STATE_TYPE> U) {
    MDPUtilityFunction <STATE_TYPE> updated = U.copy();
    foreach (STATE_TYPE state in nonFinalstates) {
        Pair <ACTION_TYPE, Double> policyTransition = transitionModel
                .getTransitionWithMaximumExpectedUtilityUsingPolicy(pi, state, U);
        double newUtility = rewardFunction.getRewardFor(state)
                + (gamma * policyTransition.getSecond());
        updated.setUtility(state, newUtility);
    }
    return updated;
}
/// <summary>
/// Repeats Bellman sweeps until the largest single-state utility change in a
/// sweep no longer exceeds errorMargin. (Method name typo "MAximum" is kept
/// for interface compatibility with existing callers.)
/// </summary>
/// <param name="gamma">discount factor</param>
/// <param name="errorMargin">convergence threshold on the per-sweep maximum change</param>
/// <returns>the converged utility function</returns>
public MDPUtilityFunction <STATE_TYPE> valueIterationTillMAximumUtilityGrowthFallsBelowErrorMargin(
    double gamma, double errorMargin) {
    int iterationCounter = 0;
    double growth = 0.0;
    MDPUtilityFunction <STATE_TYPE> utilities = initialUtilityFunction();
    do {
        Pair <MDPUtilityFunction <STATE_TYPE>, Double> result =
            valueIterateOnce(gamma, utilities);
        utilities = result.getFirst();
        growth = result.getSecond();
        iterationCounter++;
    } while (growth > errorMargin);
    return utilities;
}