/// <summary>
        /// Selects, among the actions allowed in <paramref name="state"/>, the one whose
        /// expectation (as returned by <c>GetExpectation</c> against the last value function
        /// of <paramref name="policy"/>) has the largest mean.
        /// </summary>
        /// <remarks>
        /// Before comparing actions, every non-empty dependency layer returned by
        /// <c>GetGreedyDependencies</c> is passed to <c>ComputeValueFunction</c>.
        /// When the baseline mean is not the <c>double.MinValue</c> sentinel and the best mean
        /// differs from it, the difference is written to <c>Changes[state]</c>.
        /// </remarks>
        /// <param name="policy">Policy whose value functions and allowed actions are used.</param>
        /// <param name="state">State for which the action is selected.</param>
        /// <returns>
        /// The best action found, or <c>policy[state]</c> when no allowed action beats the baseline.
        /// </returns>
        private IMarkovAction <TState> GetOptimalActionEx(IDeterministicPolicy <TState> policy, TState state)
        {
            var lastIndex     = ValueFunctions[policy].Count - 1;
            var finalFunction = ValueFunctions[policy][lastIndex];
            var bestAction    = policy[state];

            // Baseline: the value already stored for the state, or a sentinel with the
            // smallest possible mean when the last value function has no entry for it.
            var baseline = new Value(double.MinValue, 0.0);
            if (finalFunction.HasState(state))
            {
                baseline = finalFunction[state];
            }
            var bestValue = baseline;

            // Layers are indexed from the last value function backwards, hence the
            // reversed index when walking the value functions forwards.
            var layers = GetGreedyDependencies(policy, state);
            for (var step = 1; step <= lastIndex; step++)
            {
                var pending = layers[lastIndex - step];
                if (pending.Count > 0)
                {
                    ComputeValueFunction(step, policy, pending);
                }
            }

            // Re-read the last value function after the computations above.
            finalFunction = ValueFunctions[policy][lastIndex];
            foreach (var candidate in policy.GetAllowedActions(state).Where(a => a != null))
            {
                var expected = GetExpectation(candidate, state, finalFunction);

                // Deliberately written as a negated "<=" so that the outcome for
                // unordered means (e.g. NaN, if Mean is a double) stays unchanged.
                if (!(expected.Mean <= bestValue.Mean))
                {
                    bestValue  = new Value(expected.Mean, expected.Variance);
                    bestAction = candidate;
                }
            }

            var hasBaseline = baseline.Mean != double.MinValue;
            if (hasBaseline && bestValue.Mean != baseline.Mean)
            {
                Changes[state] = bestValue.Mean - baseline.Mean;
            }

            return bestAction;
        }
        /// <summary>
        /// Collects, layer by layer, the states whose values are missing from the value
        /// functions of <paramref name="policy"/> and that are reachable from
        /// <paramref name="state"/>.
        /// </summary>
        /// <remarks>
        /// Layer 0 holds the successors of <paramref name="state"/> under every non-null allowed
        /// action that the last value function has no entry for. Each following layer is checked
        /// against the preceding value function and holds the missing successors of the previous
        /// layer under the action the policy stores for each state. The list therefore has one
        /// entry per value function.
        /// When <c>UseReachableStateSpace</c> is false, the policy has no optimal action for a
        /// visited state and a previous policy exists, the action is first filled in through
        /// <c>GetOptimalActionEx</c> on the previous policy.
        /// </remarks>
        /// <param name="policy">Policy whose value functions and actions are inspected (and possibly updated).</param>
        /// <param name="state">State from which the dependencies are expanded.</param>
        /// <returns>The dependency layers, ordered from the last value function to the first.</returns>
        private List <HashSet <TState> > GetGreedyDependencies(IDeterministicPolicy <TState> policy, TState state)
        {
            var layers          = new List <HashSet <TState> >();
            var depth           = ValueFunctions[policy].Count - 1;
            var currentFunction = ValueFunctions[policy][depth];
            var candidates      = policy.GetAllowedActions(state).Where(a => a != null);
            var previousPolicy  = Policies.Find(policy)?.Previous?.Value;

            // Layer 0: successors under any allowed action that are still unknown.
            var frontier = new HashSet <TState>();
            foreach (var candidate in candidates)
            {
                frontier.UnionWith(candidate[state].Where(s => !currentFunction.HasState(s)));
            }
            layers.Add(frontier);

            // Walk the value functions backwards, following the policy's own action.
            for (var level = depth - 1; level >= 0; --level)
            {
                currentFunction = ValueFunctions[policy][level];
                var layer = new HashSet <TState>();

                foreach (var reached in frontier)
                {
                    var needsAction = !UseReachableStateSpace && !policy.HasOptimal(reached) && previousPolicy != null;
                    if (needsAction)
                    {
                        policy[reached] = GetOptimalActionEx(previousPolicy, reached);
                    }

                    var chosen = policy[reached];
                    if (chosen != null)
                    {
                        layer.UnionWith(chosen[reached].Where(s => !currentFunction.HasState(s)));
                    }
                }

                layers.Add(layer);
                frontier = layer;
            }

            return layers;
        }
        /// <summary>
        /// Computes the value dependencies of <paramref name="state"/> for every non-null action
        /// allowed by <paramref name="policy"/>, running <c>GetValueDependencies</c> in parallel.
        /// </summary>
        /// <param name="policy">Policy supplying the allowed actions and the value-function count.</param>
        /// <param name="state">State whose per-action dependencies are requested.</param>
        /// <returns>A dictionary mapping each processed action to its dependency layers.</returns>
        private Dictionary <IMarkovAction <TState>, List <HashSet <TState> > > GetAllowedActionsDependencies(IDeterministicPolicy <TState> policy, TState state)
        {
            var lastIndex = ValueFunctions[policy].Count - 1;
            var result    = new Dictionary <IMarkovAction <TState>, List <HashSet <TState> > >();

            var candidates = policy.GetAllowedActions(state).Where(a => a != null);
            var options    = new ParallelOptions
            {
                MaxDegreeOfParallelism = Configuration.ThreadsPerInstance
            };

            Parallel.ForEach(candidates, options, candidate =>
            {
                var layers = GetValueDependencies(policy, state, lastIndex, candidate);

                // The dictionary is shared between workers, so writes are serialized.
                lock (result)
                {
                    result[candidate] = layers;
                }
            });

            return result;
        }
        /// <summary>
        /// Selects, among the actions allowed in <paramref name="state"/>, the one whose
        /// expectation against the last value function of <paramref name="policy"/> has the
        /// largest mean, using per-action dependencies from <c>GetAllowedActionsDependencies</c>.
        /// </summary>
        /// <remarks>
        /// For each action, the states of every non-empty dependency layer that the matching
        /// value function does not yet contain are passed to <c>ComputeValueFunction</c> before
        /// the action's expectation is evaluated.
        /// </remarks>
        /// <param name="policy">Policy whose value functions and allowed actions are used.</param>
        /// <param name="state">State for which the action is selected.</param>
        /// <returns>
        /// The best action found, or <c>policy[state]</c> when no allowed action beats the
        /// value already stored for the state.
        /// </returns>
        private IMarkovAction <TState> GetOptimalAction(IDeterministicPolicy <TState> policy, TState state)
        {
            var lastIndex     = ValueFunctions[policy].Count - 1;
            var finalFunction = ValueFunctions[policy][lastIndex];
            var bestAction    = policy[state];

            // Start from the stored value when there is one, otherwise from the smallest mean.
            var bestValue = new Value(double.MinValue, 0.0);
            if (finalFunction.HasState(state))
            {
                bestValue = finalFunction[state];
            }

            var layersByAction = GetAllowedActionsDependencies(policy, state);

            foreach (var candidate in policy.GetAllowedActions(state).Where(a => a != null))
            {
                var layers = layersByAction[candidate];
                for (var step = 1; step <= lastIndex; step++)
                {
                    var stepFunction = ValueFunctions[policy][step];
                    var layer        = layers[lastIndex - step];
                    if (layer.Count > 0)
                    {
                        // The filter is intentionally left lazy, exactly as handed over before.
                        ComputeValueFunction(step, policy, layer.Where(s => !stepFunction.HasState(s)));
                    }
                }

                // Re-read the last value function after the computations above.
                finalFunction = ValueFunctions[policy][lastIndex];
                var expected = GetExpectation(candidate, state, finalFunction);

                // Negated "<=" keeps the outcome for unordered means (e.g. NaN, if Mean
                // is a double) unchanged.
                if (!(expected.Mean <= bestValue.Mean))
                {
                    bestValue  = new Value(expected.Mean, expected.Variance);
                    bestAction = candidate;
                }
            }

            return bestAction;
        }
        /// <summary>
        /// Produces a copy of <paramref name="policy"/> in which the action of every state
        /// visited from <c>_initialState</c> is replaced by the result of
        /// <c>GetOptimalActionEx</c> (when that result is not null).
        /// </summary>
        /// <remarks>
        /// The first step expands the initial state through all of its non-null allowed
        /// actions. Later steps follow only the action stored in the new policy. For every
        /// state the largest probability found so far of reaching it is tracked, and expansion
        /// stops once the largest probability in the current frontier is no greater than
        /// <c>MinProbability</c>. Progress is logged when <c>LogProgress</c> is set.
        /// </remarks>
        /// <param name="policy">Policy to improve; it is cloned, not modified directly here.</param>
        /// <returns>The cloned policy with updated actions.</returns>
        private IDeterministicPolicy <TState> PartialPolicyIteration(IDeterministicPolicy <TState> policy)
        {
            var nextPolicy = (IDeterministicPolicy <TState>)policy.Clone();
            var dict       = new Dictionary <TState, double> {
                [_initialState] = 1.0
            };
            var allStates = new HashSet <TState> {
                _initialState
            };

            // Keeps the largest probability seen so far for a state. Used for every expansion
            // step so that a state reached through several actions (or a self-loop on the
            // initial state) never has its probability lowered by a later, less likely path.
            void RecordProbability(TState target, double probability)
            {
                dict[target] = dict.TryGetValue(target, out var current)
                    ? Math.Max(current, probability)
                    : probability;
            }

            // {0} -> A{0}
            var nextStates = new HashSet <TState>();

            nextPolicy[_initialState] = GetOptimalActionEx(policy, _initialState);
            foreach (var action in policy.GetAllowedActions(_initialState).Where(a => a != null))
            {
                var states = action[_initialState].ToImmutableHashSet();
                foreach (var state in states)
                {
                    RecordProbability(state, action[_initialState, state]);

                    var optimalAction = GetOptimalActionEx(policy, state);
                    if (optimalAction != null)
                    {
                        nextPolicy[state] = optimalAction;
                    }
                }

                nextStates.UnionWith(states);
                allStates.UnionWith(states);
            }

            var tol      = MinProbability;
            var maxProba = 1.0;
            var logStep  = 0.3;
            var scale    = -1 / Math.Log(tol, 10);

            if (LogProgress)
            {
                Log?.Info("Calculating optimal actions...");
            }
            var prevMaxProba = 1.0;

            while (maxProba > tol)
            {
                maxProba = 0.0;
                var nextNextStates = new HashSet <TState>();
                foreach (var state in nextStates)
                {
                    var optimalAction = GetOptimalActionEx(policy, state);
                    if (optimalAction != null)
                    {
                        nextPolicy[state] = optimalAction;
                    }

                    var stateProba = dict[state];
                    if (stateProba > maxProba)
                    {
                        maxProba = stateProba;
                    }

                    // Only the action chosen by the new policy is followed from here on.
                    var action = nextPolicy[state];
                    if (action == null)
                    {
                        continue;
                    }

                    var actionStates = action[state].ToImmutableHashSet();
                    foreach (var nextState in actionStates)
                    {
                        RecordProbability(nextState, stateProba * action[state, nextState]);
                    }

                    nextNextStates.UnionWith(actionStates);
                }

                // Report progress each time the frontier probability drops by more than
                // logStep orders of magnitude since the last report.
                if (Math.Log(prevMaxProba, 10) - Math.Log(maxProba, 10) > logStep && LogProgress)
                {
                    Log?.Info(
                        $"{Math.Truncate(Math.Min(-Math.Log(maxProba, 10) * scale, 1) * 10000) / 100}%");
                    prevMaxProba = maxProba;
                }

                // The next frontier consists only of states that were never seen before.
                nextStates.Clear();
                foreach (var state in nextNextStates)
                {
                    if (allStates.Add(state))
                    {
                        nextStates.Add(state);
                    }
                }
            }

            return nextPolicy;
        }
        /// <summary>
        /// Explores the states reachable from <paramref name="initialState"/> through any
        /// non-null action allowed by <paramref name="policy"/> and records, for each state,
        /// the largest probability found of reaching it.
        /// </summary>
        /// <remarks>
        /// Exploration proceeds frontier by frontier. States whose recorded probability is
        /// below <c>MinProbability</c> are not expanded. The loop ends when the frontier is
        /// empty or the largest probability in the last processed frontier is no greater than
        /// <c>MinProbability</c>.
        /// </remarks>
        /// <param name="policy">Policy supplying the allowed actions of each state.</param>
        /// <param name="initialState">State the exploration starts from (probability 1.0).</param>
        /// <returns>A dictionary mapping every discovered state to its recorded probability.</returns>
        private Dictionary <TState, double> GetReachableStateSpace(IDeterministicPolicy <TState> policy, TState initialState)
        {
            var dict      = new Dictionary <TState, double>();
            var tol       = MinProbability;
            var maxProba  = 1.0;
            // NOTE(review): initialState is not seeded into allStates, so it can be queued
            // once more if it is reachable from one of its successors — confirm this is intended.
            var allStates = new HashSet <TState>();
            var states    = new HashSet <TState> {
                initialState
            };

            dict[initialState] = 1.0;

            while (maxProba > tol && states.Count > 0)
            {
                maxProba = 0.0;
                var allowedStates = new HashSet <TState>();
                foreach (var state in states)
                {
                    var allowedActions = policy.GetAllowedActions(state).Where(a => a != null).ToArray();
                    var stateProba     = dict[state];
                    if (stateProba > maxProba)
                    {
                        maxProba = stateProba;
                    }
                    if (stateProba < tol)
                    {
                        // Too unlikely to be worth expanding further.
                        continue;
                    }

                    foreach (var action in allowedActions)
                    {
                        var nextStates = action[state].ToImmutableHashSet();
                        foreach (var nextState in nextStates)
                        {
                            var p = action[state, nextState];

                            // Keep the most probable path found so far to each successor.
                            if (dict.TryGetValue(nextState, out var current))
                            {
                                dict[nextState] = Math.Max(current, stateProba * p);
                            }
                            else
                            {
                                dict.Add(nextState, stateProba * p);
                            }
                        }

                        allowedStates.UnionWith(nextStates);
                    }
                }

                // The next frontier consists only of states that were never queued before.
                states.Clear();
                foreach (var allowedState in allowedStates)
                {
                    if (allStates.Add(allowedState))
                    {
                        states.Add(allowedState);
                    }
                }
            }

            return dict;
        }