/// <summary>
        /// Policy iteration for the MDP: starting from a clone of <paramref name="basePolicy"/>,
        /// alternates <c>PolicyIteration</c> (improvement) and <c>GetOptimalValue</c> (evaluation)
        /// until the policy reports no modifications or the gain in mean value becomes negligible.
        /// </summary>
        /// <remarks>
        /// The loop stops early when the new mean fails to exceed the previous mean by at least
        /// <c>tolerance * 100</c>, or when both means have the same sign and their ratio is below
        /// <c>1 + RelativeOptimalTolerance</c>.
        /// Side effects: <c>Policies</c> and <c>ValueFunctions</c> are used as scratch storage
        /// during the run and are cleared before the method returns normally.
        /// </remarks>
        /// <param name="basePolicy">Starting policy; it is cloned, the iteration works on the clones.</param>
        /// <param name="optimalPolicy">
        /// Resulting policy: a fresh clone of <paramref name="basePolicy"/> whose entry for every
        /// state in <c>AllStateSpace</c> is overwritten with the final policy's entry.
        /// </param>
        /// <param name="tolerance">
        /// Convergence tolerance forwarded to <c>GetOptimalValue</c>; <c>tolerance * 100</c> is also
        /// the minimum absolute gain in mean value required to keep iterating.
        /// </param>
        /// <returns>Mean value of the final policy</returns>
        /// <exception cref="ArgumentNullException"><paramref name="basePolicy"/> is null.</exception>
        public double GetOptimalValueViaPolicyIteration(
            IDeterministicPolicy <TState> basePolicy,
            out IDeterministicPolicy <TState> optimalPolicy,
            double tolerance = 0.0001)
        {
            if (basePolicy == null)
            {
                throw new ArgumentNullException(nameof(basePolicy));
            }

            var policy = (IDeterministicPolicy <TState>)basePolicy.Clone();

            // NOTE(review): the list is seeded with basePolicy while the value is computed for its
            // clone; the cleanup in the loop later indexes ValueFunctions by this list entry.
            // Presumably policies compare by value - confirm against ValueFunctions' key comparer.
            var node  = Policies.AddFirst(basePolicy);
            var value = GetOptimalValue(policy, tolerance);

            policy = PolicyIteration(policy);

            // States handed to every in-loop evaluation: the initial state plus whatever the
            // non-null allowed actions yield when indexed by the initial state.
            var outputStates = new List <TState> {
                _initialState
            };

            outputStates.AddRange(policy.GetAllowedActions(_initialState).Where(a => a != null)
                                  .SelectMany(a => a[_initialState]));

            // The list is not modified inside the loop, so de-duplicate it once up front.
            var distinctOutputStates = outputStates.Distinct().ToArray();

            while (policy.IsModified)
            {
                node = Policies.AddAfter(node, policy);
                var nextValue = GetOptimalValue(policy, tolerance, false, distinctOutputStates);
                if (LogProgress)
                {
                    Log?.Info($"{value.Mean}->{nextValue.Mean} at variance {Math.Sqrt(value.Variance - (value.Mean * value.Mean))}->{Math.Sqrt(nextValue.Variance - (nextValue.Mean * nextValue.Mean))} with {policy.Modifications.Length} modifications");
                }

                var absoluteGainTooSmall = nextValue.Mean - (tolerance * 100) < value.Mean;

                // The relative test is only meaningful when both means share a sign: across a sign
                // change (or from a zero baseline) the ratio says nothing about the size of the
                // gain and previously terminated the iteration despite a large improvement.
                var sameSign = (value.Mean > 0 && nextValue.Mean > 0) ||
                               (value.Mean < 0 && nextValue.Mean < 0);
                var ratio = value.Mean > 0 ? nextValue.Mean / value.Mean : value.Mean / nextValue.Mean;
                var relativeGainTooSmall = sameSign && ratio < 1 + RelativeOptimalTolerance;

                if (absoluteGainTooSmall || relativeGainTooSmall)
                {
                    // The final value is recomputed after the loop, so nothing to store here.
                    break;
                }

                value  = nextValue;
                policy = PolicyIteration(policy);
                if (node.Previous == null)
                {
                    continue;
                }

                // Drop the policy preceding the one just evaluated, together with its value function.
                ValueFunctions[node.Previous.Value].Clear();
                ValueFunctions.Remove(node.Previous.Value);
                Policies.Remove(node.Previous);
            }

            // Final evaluation of the resulting policy (without the restricted output states).
            value         = GetOptimalValue(policy, tolerance);
            optimalPolicy = (IDeterministicPolicy <TState>)basePolicy.Clone();
            foreach (var state in AllStateSpace)
            {
                optimalPolicy[state] = policy[state];
            }

            ValueFunctions.Clear();
            Policies.Clear();

            return(value.Mean);
        }