/// <summary>
/// Runs policy iteration for the MDP and returns the optimal value.
/// </summary>
/// <param name="basePolicy">Starting policy.</param>
/// <param name="optimalPolicy">Resulting optimal policy.</param>
/// <param name="tolerance">Convergence tolerance.</param>
/// <returns>Mean of the optimal value.</returns>
public double GetOptimalValueViaPolicyIteration(
    IDeterministicPolicy<TState> basePolicy,
    out IDeterministicPolicy<TState> optimalPolicy,
    double tolerance = 0.0001)
{
    // Work on a copy of the starting policy, evaluate it, and perform the first improvement step.
    var policy = (IDeterministicPolicy<TState>)basePolicy.Clone();
    var node = Policies.AddFirst(basePolicy);
    var value = GetOptimalValue(policy, tolerance);
    optimalPolicy = policy = PolicyIteration(policy);

    // States whose values are tracked while iterating: the initial state plus the states
    // reachable from it via the policy's allowed actions.
    var outputStates = new List<TState> { _initialState };
    outputStates.AddRange(policy.GetAllowedActions(_initialState)
        .Where(a => a != null)
        .SelectMany(a => a[_initialState]));

    // Keep improving until PolicyIteration no longer modifies the policy.
    while (policy.IsModified)
    {
        node = Policies.AddAfter(node, policy);
        var nextValue = GetOptimalValue(policy, tolerance, false, outputStates.Distinct().ToArray());

        if (LogProgress)
        {
            // Variance appears to store the raw second moment, so Sqrt(Variance - Mean^2) is the standard deviation.
            Log?.Info($"{value.Mean}->{nextValue.Mean} at variance {Math.Sqrt(value.Variance - (value.Mean * value.Mean))}->{Math.Sqrt(nextValue.Variance - (nextValue.Mean * nextValue.Mean))} with {policy.Modifications.Length} modifications");
        }

        // Stop once the improvement falls within the absolute or relative tolerance.
        var ratio = value.Mean > 0 ? nextValue.Mean / value.Mean : value.Mean / nextValue.Mean;
        if (nextValue.Mean - (tolerance * 100) < value.Mean
            || Math.Abs(ratio) < 1 + RelativeOptimalTolerance)
        {
            value = nextValue;
            break;
        }

        value = nextValue;
        policy = PolicyIteration(policy);

        // Release the previous policy's cached value function to limit memory use.
        if (node.Previous == null)
        {
            continue;
        }

        ValueFunctions[node.Previous.Value].Clear();
        ValueFunctions.Remove(node.Previous.Value);
        Policies.Remove(node.Previous);
    }

    // Re-evaluate the final policy and copy its action choices onto a clone of the base policy.
    value = GetOptimalValue(policy, tolerance);
    optimalPolicy = (IDeterministicPolicy<TState>)basePolicy.Clone();
    foreach (var state in AllStateSpace)
    {
        optimalPolicy[state] = policy[state];
    }

    ValueFunctions.Clear();
    Policies.Clear();
    return value.Mean;
}
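
// Usage sketch (assumptions, not part of the original code): a hypothetical solver class
// MdpSolver<TState> exposes the method above, and GridState/GridPolicy is a hypothetical
// state/policy pair implementing IDeterministicPolicy<GridState>; names are illustrative only.
//
//     var solver = new MdpSolver<GridState>(initialState) { LogProgress = true };
//     IDeterministicPolicy<GridState> startingPolicy = new GridPolicy();
//
//     double optimalMean = solver.GetOptimalValueViaPolicyIteration(
//         startingPolicy,
//         out IDeterministicPolicy<GridState> optimalPolicy,
//         tolerance: 1e-4);
//
//     // optimalPolicy now holds the improved action for every state in AllStateSpace;
//     // optimalMean is the expected value of that policy from the initial state.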