/// <summary>
/// Runs value iteration starting from <c>initialUtilityFunction()</c>.
/// Each sweep sets the utility of every state in <c>nonFinalstates</c> to
/// its reward plus <paramref name="gamma"/> times the value returned by
/// <c>transitionModel.getTransitionWithMaximumExpectedUtility</c>, and
/// sweeping stops once the largest per-state change in a sweep no longer
/// exceeds <c>error * (1 - gamma) / gamma</c>.
/// </summary>
/// <param name="gamma">
/// Discount factor. With a value of 1 the stopping threshold is 0, so the
/// loop only ends when a sweep changes nothing.
/// </param>
/// <param name="error">Error bound used to derive the stopping threshold.</param>
/// <param name="delta">
/// Not read as an input: it is reset to 0 at the start of every sweep and
/// used as the running maximum change. Kept for signature compatibility.
/// </param>
/// <returns>
/// The utility function as it stood at the start of the final sweep
/// (<c>U</c>, matching the textbook pseudocode), not the last updated copy.
/// </returns>
public MDPUtilityFunction<STATE_TYPE> valueIteration(double gamma, double error, double delta) {
    MDPUtilityFunction<STATE_TYPE> U = initialUtilityFunction();
    MDPUtilityFunction<STATE_TYPE> U_dash = initialUtilityFunction();

    // Stopping threshold from the VALUE-ITERATION pseudocode:
    // repeat ... until delta < error * (1 - gamma) / gamma.
    double threshold = (error * (1 - gamma)) / gamma;

    do {
        U = U_dash.copy();
        delta = 0.0;

        foreach (STATE_TYPE s in nonFinalstates) {
            Pair<ACTION_TYPE, Double> highestUtilityTransition =
                transitionModel.getTransitionWithMaximumExpectedUtility(s, U);
            double utility = rewardFunction.getRewardFor(s)
                             + (gamma * highestUtilityTransition.getSecond());
            U_dash.setUtility(s, utility);

            // Track the largest change produced by this sweep.
            double change = Math.Abs(U_dash.getUtility(s) - U.getUtility(s));
            if (change > delta) {
                delta = change;
            }
        }
        // Keep sweeping only while the utilities are still moving by more
        // than the threshold. Using '>' (rather than '>=') also guarantees
        // termination when the threshold is 0 or NaN and a sweep changes
        // nothing.
    } while (delta > threshold);

    return U;
}
/// <summary>
/// Applies <c>valueIterateOnceWith</c> to a copy of <paramref name="U"/>
/// the requested number of times, feeding each result into the next pass.
/// </summary>
/// <param name="pi">Policy handed to every pass.</param>
/// <param name="U">Starting utility function; it is copied, and the copy is what gets passed on.</param>
/// <param name="gamma">Discount factor handed to every pass.</param>
/// <param name="iterations">Number of passes; zero or a negative value performs none.</param>
/// <returns>The utility function produced by the last pass, or the initial copy when no pass ran.</returns>
public MDPUtilityFunction<STATE_TYPE> policyEvaluation(
    MDPPolicy<STATE_TYPE, ACTION_TYPE> pi,
    MDPUtilityFunction<STATE_TYPE> U,
    double gamma,
    int iterations) {
    MDPUtilityFunction<STATE_TYPE> current = U.copy();

    int remaining = iterations;
    while (remaining > 0) {
        current = valueIterateOnceWith(gamma, pi, current);
        remaining--;
    }

    return current;
}
/// <summary>
/// Performs one sweep over <c>nonFinalstates</c>: for each state, asks
/// <c>transitionModel.getTransitionWithMaximumExpectedUtilityUsingPolicy</c>
/// for a transition evaluated against the unmodified <paramref name="U"/>,
/// and stores reward + gamma * (the transition's second component) into a
/// copy of <paramref name="U"/>.
/// </summary>
/// <param name="gamma">Discount factor applied to the transition's value.</param>
/// <param name="pi">Policy passed through to the transition model.</param>
/// <param name="U">Utility function read during the sweep; it is copied, not written to here.</param>
/// <returns>The updated copy.</returns>
private MDPUtilityFunction<STATE_TYPE> valueIterateOnceWith(
    double gamma,
    MDPPolicy<STATE_TYPE, ACTION_TYPE> pi,
    MDPUtilityFunction<STATE_TYPE> U) {
    MDPUtilityFunction<STATE_TYPE> updated = U.copy();

    foreach (STATE_TYPE state in nonFinalstates) {
        Pair<ACTION_TYPE, Double> policyTransition =
            transitionModel.getTransitionWithMaximumExpectedUtilityUsingPolicy(pi, state, U);

        // Same evaluation order as a single expression: reward first, then the transition value.
        double reward = rewardFunction.getRewardFor(state);
        double discounted = gamma * policyTransition.getSecond();

        updated.setUtility(state, reward + discounted);
    }

    return updated;
}