Пример #1
0
        /// <summary>
        /// Adds a value for each of the given states to the value function of
        /// <paramref name="iteration"/>, using the value function of the preceding iteration.
        /// </summary>
        /// <param name="iteration">Index of the value function to populate.</param>
        /// <param name="policy">Policy whose stored value functions are read and updated.</param>
        /// <param name="states">States to evaluate.</param>
        /// <exception cref="InvalidOperationException">
        /// No value function is stored for <c>iteration - 1</c>.
        /// </exception>
        private void ComputeValueFunction(int iteration, IDeterministicPolicy <TState> policy, IEnumerable <TState> states)
        {
            if (!ValueFunctions[policy].TryGetValue(iteration - 1, out var previousFunction))
            {
                throw new InvalidOperationException("Previous function not available");
            }

            // Create the target function on first use for this iteration.
            if (!ValueFunctions[policy].TryGetValue(iteration, out var function))
            {
                function = NextFunction();
                ValueFunctions[policy][iteration] = function;
            }

            foreach (var state in states)
            {
                var action = policy[state];

                // A state without an action contributes a zero value.
                var expectation = action != null
                    ? GetExpectation(action, state, previousFunction)
                    : new Value(0.0, 0.0);

                function.Add(state, expectation);
            }
        }
Пример #2
0
        /// <summary>
        /// Repeatedly calls <c>Iterate</c> for <paramref name="policy"/> and returns the value
        /// obtained for the first output state once the loop stops.
        /// </summary>
        /// <param name="policy">Policy to evaluate.</param>
        /// <param name="tolerance">
        /// The loop keeps running while the norm of the difference between successive
        /// value vectors is greater than this threshold.
        /// </param>
        /// <param name="ignoreZeroChange">
        /// When <c>false</c>, a change of exactly zero also keeps the loop running.
        /// </param>
        /// <param name="outputStates">
        /// States whose values are tracked; defaults to the initial state only.
        /// Must contain at least one element because the first entry is returned.
        /// </param>
        /// <returns>The last computed value of <c>outputStates[0]</c>.</returns>
        private Value GetOptimalValue(IDeterministicPolicy <TState> policy, double tolerance = 1e-4, bool ignoreZeroChange = false, TState[] outputStates = null)
        {
            if (outputStates == null)
            {
                outputStates = new[] { _initialState };
            }

            Initialize(policy, outputStates);
            var values = new Value[outputStates.Length];
            var diff   = new Value[outputStates.Length];
            var change = double.MaxValue;

            // Keep iterating while not converged (or while the first value's mean is still zero),
            // bounded by the iteration limit and by the current state-space layer being non-empty.
            while ((change > tolerance || (change == 0 && !ignoreZeroChange) || values[0].Mean == 0) && _iteration < _maxIterations && StateSpace[_iteration].Count > 0)
            {
                var nextValues = Iterate(policy);

                for (var i = 0; i < outputStates.Length; ++i)
                {
                    diff[i] = nextValues[i] + ((-1) * values[i]);
                }
                change = Norm(diff);

                if (LogIterations)
                {
                    Log?.Info($"Iteration {_iteration} : {values[0].Mean} -> {nextValues[0].Mean} \t {change}");
                }

                values = nextValues;
            }

            return(values[0]);
        }
Пример #3
0
        /// <summary>
        /// Produces the next policy by querying <c>GetOptimalActionEx</c> for every state in
        /// <c>ReachableStates</c> (computed on first use). Delegates to
        /// <c>PartialPolicyIteration</c> when <c>UseReachableStateSpace</c> is off.
        /// </summary>
        /// <param name="policy">Policy to improve; it is cloned, not modified directly here.</param>
        /// <returns>A clone of <paramref name="policy"/> with the selected actions applied.</returns>
        private IDeterministicPolicy <TState> PolicyIteration(IDeterministicPolicy <TState> policy)
        {
            if (!UseReachableStateSpace)
            {
                return PartialPolicyIteration(policy);
            }

            if (ReachableStates == null)
            {
                ReachableStates = GetReachableStateSpace(policy, _initialState);
            }

            var improved = (IDeterministicPolicy <TState>)policy.Clone();

            // Progress bookkeeping: a message is emitted roughly every 1% of processed states.
            const double logStep = 0.01;
            var increment = 1.0 / ReachableStates.Count;
            var done      = 0.0;

            Log?.Info("Calculating optimal actions...");
            foreach (var state in ReachableStates.Keys)
            {
                var best = GetOptimalActionEx(policy, state);
                if (best != null)
                {
                    improved[state] = best;
                }

                var scaled   = done / logStep;
                var fraction = scaled - Math.Truncate(scaled);
                if (fraction < increment / logStep)
                {
                    Log?.Info($"{Math.Truncate(done * 10000) / 100}%");
                }

                done += increment;
            }

            return improved;
        }
Пример #4
0
 /// <summary>
 /// Writes one "state: action" line to the console for every state returned by
 /// <c>world.AllStates()</c>.
 /// </summary>
 /// <param name="world">World that enumerates the states.</param>
 /// <param name="policy">Policy queried for the action of each state.</param>
 private static void PrintPolicyActions(
     GamblersWorld world,
     IDeterministicPolicy <GamblersWorldState, GamblersWorldAction> policy)
 {
     foreach (var worldState in world.AllStates())
     {
         var line = $"{worldState}: {policy.Action(worldState)}";
         Console.WriteLine(line);
     }
 }
Пример #5
0
        /// <summary>
        /// Policy iteration for MDP: evaluates a clone of the base policy, improves it via
        /// <c>PolicyIteration</c>, and repeats evaluation/improvement while the policy reports
        /// modifications and the value keeps improving beyond the tolerances.
        /// </summary>
        /// <param name="basePolicy">Starting policy; it is cloned and not assigned to directly here</param>
        /// <param name="optimalPolicy">
        /// Resulting optimal policy: a fresh clone of <paramref name="basePolicy"/> whose actions
        /// for every state in <c>AllStateSpace</c> are copied from the final policy
        /// </param>
        /// <param name="tolerance">Convergence tolerance passed to the policy evaluation</param>
        /// <returns>Mean of the value computed for the final policy</returns>
        public double GetOptimalValueViaPolicyIteration(
            IDeterministicPolicy <TState> basePolicy,
            out IDeterministicPolicy <TState> optimalPolicy,
            double tolerance = 0.0001)
        {
            var policy = (IDeterministicPolicy <TState>)basePolicy.Clone();
            // NOTE(review): the policy list records basePolicy itself while the clone is the one
            // being evaluated - confirm this is intended.
            var node   = Policies.AddFirst(basePolicy);
            var value  = GetOptimalValue(policy, tolerance);

            // First improvement step.
            optimalPolicy = policy = PolicyIteration(policy);
            var outputStates = new List <TState> {
                _initialState
            };

            // Also track every state yielded by the allowed (non-null) actions of the initial state.
            outputStates.AddRange(policy.GetAllowedActions(_initialState).Where(a => a != null)
                                  .SelectMany(a => a[_initialState]));
            while (policy.IsModified)
            {
                node = Policies.AddAfter(node, policy);
                var nextValue = GetOptimalValue(policy, tolerance, false, outputStates.Distinct().ToArray());
                if (LogProgress)
                {
                    // NOTE(review): the logged spread is sqrt(Variance - Mean^2); presumably
                    // Value.Variance holds the second moment - confirm.
                    Log?.Info($"{value.Mean}->{nextValue.Mean} at variance {Math.Sqrt(value.Variance - (value.Mean * value.Mean))}->{Math.Sqrt(nextValue.Variance - (nextValue.Mean * nextValue.Mean))} with {policy.Modifications.Length} modifications");
                }
                // Ratio of new to old mean for a positive old mean, otherwise old to new.
                var ratio = value.Mean > 0 ? nextValue.Mean / value.Mean : value.Mean / nextValue.Mean;
                // Stop when the new mean does not exceed the old one by at least 100 * tolerance,
                // or when the absolute ratio is below 1 + RelativeOptimalTolerance.
                if ((nextValue.Mean - (tolerance * 100) < value.Mean) || Math.Abs(ratio) < 1 + RelativeOptimalTolerance)
                {
                    value = nextValue;
                    break;
                }

                value  = nextValue;
                policy = PolicyIteration(policy);
                if (node.Previous == null)
                {
                    continue;
                }

                // Discard the value functions and list entry of the policy preceding the current node.
                ValueFunctions[node.Previous.Value].Clear();
                ValueFunctions.Remove(node.Previous.Value);
                Policies.Remove(node.Previous);
            }

            // Re-evaluate the final policy for the initial state only and copy its actions
            // into a fresh clone of the base policy.
            value         = GetOptimalValue(policy, tolerance);
            optimalPolicy = (IDeterministicPolicy <TState>)basePolicy.Clone();
            foreach (var state in AllStateSpace)
            {
                optimalPolicy[state] = policy[state];
            }

            // Drop all cached value functions and recorded policies.
            ValueFunctions.Clear();
            Policies.Clear();

            return(value.Mean);
        }
Пример #6
0
        /// <summary>
        /// Populates the graph by following the policy layer by layer from
        /// <paramref name="initialState"/>: each processed state gets a vertex, and every state
        /// yielded by its policy action gets a vertex plus a connecting edge.
        /// </summary>
        /// <param name="policy">Policy that supplies the action of each state.</param>
        /// <param name="initialState">State the expansion starts from.</param>
        /// <param name="iterations">Maximum number of expansion rounds.</param>
        public EnhancementGraph(IDeterministicPolicy <EnhancementState> policy, EnhancementState initialState, int iterations)
        {
            var frontier = new HashSet <EnhancementState> {
                initialState
            };
            var visited = new HashSet <EnhancementState>();

            for (var round = iterations; round > 0; --round)
            {
                var discovered = new HashSet <EnhancementState>();
                foreach (var state in frontier)
                {
                    // Vertices are identified by the string form of their state.
                    var source = _graph.Vertices.SingleOrDefault(v => v.Label == state.ToString());
                    if (source == null)
                    {
                        source = new StateVertex(state);
                        _graph.AddVertex(source);
                    }

                    var action = policy[state];
                    if (action != null)
                    {
                        var successors = action[state].ToArray();
                        foreach (var successor in successors)
                        {
                            var target = _graph.Vertices.SingleOrDefault(v => v.Label == successor.ToString());
                            if (target == null)
                            {
                                target = new StateVertex(successor);
                                _graph.AddVertex(target);
                            }

                            var edge = new ActionEdge(action)
                            {
                                Source = source,
                                Target = target,
                            };
                            _graph.AddEdge(edge);
                        }

                        discovered.UnionWith(successors);
                    }
                }

                // Only states not seen in an earlier round are expanded next.
                frontier.Clear();
                frontier.UnionWith(discovered.Where(s => visited.Add(s)));
            }
        }
Пример #7
0
        /// <summary>
        /// Picks, among the allowed actions of <paramref name="state"/>, the one with the highest
        /// expected mean under the last stored value function of <paramref name="policy"/>,
        /// after computing the missing value-function entries reported by
        /// <c>GetGreedyDependencies</c>.
        /// </summary>
        /// <param name="policy">Policy whose value functions are used and extended.</param>
        /// <param name="state">State to select an action for.</param>
        /// <returns>
        /// The best action found, or the policy's current action when no allowed action has a
        /// mean above the state's existing value.
        /// </returns>
        private IMarkovAction <TState> GetOptimalActionEx(IDeterministicPolicy <TState> policy, TState state)
        {
            var lastIndex      = ValueFunctions[policy].Count - 1;
            var finalFunction  = ValueFunctions[policy][lastIndex];
            var best           = policy[state];
            var bestValue      = new Value(double.MinValue, 0.0);
            var referenceValue = bestValue;

            // Start from the state's known value when one exists.
            if (finalFunction.HasState(state))
            {
                bestValue      = finalFunction[state];
                referenceValue = bestValue;
            }

            var dependencies = GetGreedyDependencies(policy, state);

            // Fill the value functions from the first iteration up to the last one.
            for (var iteration = 1; iteration <= lastIndex; ++iteration)
            {
                var pending = dependencies[lastIndex - iteration];
                if (pending.Count != 0)
                {
                    ComputeValueFunction(iteration, policy, pending);
                }
            }

            finalFunction = ValueFunctions[policy][lastIndex];
            foreach (var candidate in policy.GetAllowedActions(state).Where(a => a != null))
            {
                var expectation = GetExpectation(candidate, state, finalFunction);
                if (expectation.Mean <= bestValue.Mean)
                {
                    continue;
                }

                bestValue = new Value(expectation.Mean, expectation.Variance);
                best      = candidate;
            }

            // Record the improvement only when the state had a value to begin with.
            var hadReference = referenceValue.Mean != double.MinValue;
            if (hadReference && bestValue.Mean != referenceValue.Mean)
            {
                Changes[state] = bestValue.Mean - referenceValue.Mean;
            }

            return best;
        }
Пример #8
0
        /// <summary>
        /// Collects, level by level, the states whose values are still missing from the stored
        /// value functions of <paramref name="policy"/>. Entry 0 holds the states yielded by any
        /// allowed action of <paramref name="state"/> that the last value function lacks; each
        /// further entry holds the states yielded by the policy actions of the previous entry
        /// that the next-lower value function lacks.
        /// </summary>
        /// <param name="policy">
        /// Policy to follow. When <c>UseReachableStateSpace</c> is off, states without an optimal
        /// action get one assigned from the previous policy in <c>Policies</c>, if there is one.
        /// </param>
        /// <param name="state">State the dependency search starts from.</param>
        /// <returns>One set of states per stored value function.</returns>
        private List <HashSet <TState> > GetGreedyDependencies(IDeterministicPolicy <TState> policy, TState state)
        {
            var layers         = new List <HashSet <TState> >();
            var lastIndex      = ValueFunctions[policy].Count - 1;
            var lastFunction   = ValueFunctions[policy][lastIndex];
            var candidates     = policy.GetAllowedActions(state).Where(a => a != null);
            var previousPolicy = Policies.Find(policy)?.Previous?.Value;

            var frontier = new HashSet <TState>(
                candidates.SelectMany(a => a[state]).Where(s => !lastFunction.HasState(s)));

            layers.Add(frontier);

            for (var level = lastIndex - 1; level >= 0; --level)
            {
                var next          = new HashSet <TState>();
                var levelFunction = ValueFunctions[policy][level];

                foreach (var pending in frontier)
                {
                    var needsOptimal = !UseReachableStateSpace && !policy.HasOptimal(pending) && previousPolicy != null;
                    if (needsOptimal)
                    {
                        policy[pending] = GetOptimalActionEx(previousPolicy, pending);
                    }

                    var action = policy[pending];
                    if (action != null)
                    {
                        next.UnionWith(action[pending].Where(s => !levelFunction.HasState(s)));
                    }
                }

                layers.Add(next);
                frontier = next;
            }

            return layers;
        }
Пример #9
0
        /// <summary>
        /// Builds the layers of states reached by following <paramref name="policy"/> from
        /// <paramref name="state"/>.
        /// </summary>
        /// <param name="policy">Policy that supplies the action for every state after the first step.</param>
        /// <param name="state">State the walk starts from.</param>
        /// <param name="depth">
        /// Index of the value function consulted for the first step and the number of steps to walk.
        /// </param>
        /// <param name="nextAction">
        /// Optional action used instead of the policy action for the first step. When given, entry 0
        /// holds the states it yields for <paramref name="state"/> that the value function at
        /// <paramref name="depth"/> lacks, and one step fewer is walked afterwards.
        /// </param>
        /// <returns>
        /// A list whose entry 0 is either <c>{state}</c> or the filtered first-step states, and
        /// whose every further entry holds the states yielded by the policy actions of the
        /// previous entry.
        /// </returns>
        private List <HashSet <TState> > GetValueDependencies(IDeterministicPolicy <TState> policy, TState state, int depth, IMarkovAction <TState> nextAction = null)
        {
            var deps          = new List <HashSet <TState> >(depth + 1);
            var states        = new HashSet <TState>();
            var valueFunction = ValueFunctions[policy][depth];

            // replace the policy action with next action
            if (nextAction != null)
            {
                lock (nextAction)
                {
                    states.UnionWith(nextAction[state].Where(s => !valueFunction.HasState(s)));
                }

                depth--;
            }
            else
            {
                states.Add(state);
            }

            deps.Add(states);

            while (depth-- > 0)
            {
                var nextStates = new HashSet <TState>();
                foreach (var s in states)
                {
                    var action = policy[s];
                    if (action == null)
                    {
                        continue;
                    }

                    // Enumerate while holding the lock: action[s] may be a lazy sequence, in which
                    // case enumerating it after the lock is released would defeat the lock.
                    lock (action)
                    {
                        nextStates.UnionWith(action[s]);
                    }
                }

                deps.Add(nextStates);
                states = nextStates;
            }

            return(deps);
        }
Пример #10
0
        /// <summary>
        /// Picks, among the allowed actions of <paramref name="state"/>, the one with the highest
        /// expected mean under the last stored value function of <paramref name="policy"/>.
        /// Before an action is scored, the value functions are extended with the states listed
        /// in that action's dependency layers.
        /// </summary>
        /// <param name="policy">Policy whose value functions are used and extended.</param>
        /// <param name="state">State to select an action for.</param>
        /// <returns>
        /// The best action found, or the policy's current action when no allowed action has a
        /// mean above the state's existing value.
        /// </returns>
        private IMarkovAction <TState> GetOptimalAction(IDeterministicPolicy <TState> policy, TState state)
        {
            var lastIndex     = ValueFunctions[policy].Count - 1;
            var finalFunction = ValueFunctions[policy][lastIndex];
            var best          = policy[state];
            var bestValue     = finalFunction.HasState(state)
                ? finalFunction[state]
                : new Value(double.MinValue, 0.0);

            var layersByAction = GetAllowedActionsDependencies(policy, state);

            foreach (var candidate in policy.GetAllowedActions(state).Where(a => a != null))
            {
                var layers = layersByAction[candidate];
                for (var iteration = 1; iteration <= lastIndex; ++iteration)
                {
                    var function = ValueFunctions[policy][iteration];
                    var layer    = layers[lastIndex - iteration];
                    if (layer.Count != 0)
                    {
                        // Only states the function does not hold yet are computed.
                        ComputeValueFunction(iteration, policy, layer.Where(s => !function.HasState(s)));
                    }
                }

                finalFunction = ValueFunctions[policy][lastIndex];
                var expectation = GetExpectation(candidate, state, finalFunction);
                if (expectation.Mean <= bestValue.Mean)
                {
                    continue;
                }

                bestValue = new Value(expectation.Mean, expectation.Variance);
                best      = candidate;
            }

            return best;
        }
Пример #11
0
        /// <summary>
        /// Builds the state-space layer of the current iteration from the states yielded by the
        /// policy actions of the previous layer. Does nothing when the layer already exists.
        /// </summary>
        /// <param name="policy">Policy that supplies the action of each state in the previous layer.</param>
        private void ExtendStateSpace(IDeterministicPolicy <TState> policy)
        {
            var previousLayer = StateSpace[_iteration - 1];
            if (StateSpace.ContainsKey(_iteration))
            {
                return;
            }

            // From the third iteration on, the set stored two iterations back is emptied and reused.
            HashSet <TState> layer;
            if (_iteration > 2)
            {
                layer = StateSpace[_iteration - 2];
                layer.Clear();
            }
            else
            {
                layer = new HashSet <TState>();
            }

            var successors = new HashSet <TState>();
            foreach (var state in previousLayer)
            {
                var action = policy[state];
                if (action != null)
                {
                    successors.UnionWith(action[state]);
                }
            }

            // States never seen in any earlier layer are additionally tracked separately.
            var firstSeen = new List <TState>();
            foreach (var successor in successors)
            {
                if (AllStateSpace.Add(successor))
                {
                    firstSeen.Add(successor);
                }

                layer.Add(successor);
            }

            StateSpace[_iteration]         = layer;
            DistinctStateSpace[_iteration] = firstSeen;
        }
Пример #12
0
        /// <summary>
        /// Produces the next policy by selecting actions via <c>GetOptimalActionEx</c> for the
        /// initial state and for the states discovered round by round from it, until the largest
        /// probability weight among the states of a round is no longer above <c>MinProbability</c>.
        /// </summary>
        /// <param name="policy">Policy to improve; the selected actions are written to a clone of it.</param>
        /// <returns>A clone of <paramref name="policy"/> with the selected actions applied.</returns>
        private IDeterministicPolicy <TState> PartialPolicyIteration(IDeterministicPolicy <TState> policy)
        {
            var nextPolicy = (IDeterministicPolicy <TState>)policy.Clone();
            // Weight per discovered state; the initial state starts at 1.0.
            var dict       = new Dictionary <TState, double> {
                [_initialState] = 1.0
            };
            var allStates = new HashSet <TState> {
                _initialState
            };

            // {0} -> A{0}
            // First round: every allowed action of the initial state is expanded, not only the policy's one.
            var nextStates = new HashSet <TState>();

            nextPolicy[_initialState] = GetOptimalActionEx(policy, _initialState);
            foreach (var action in policy.GetAllowedActions(_initialState).Where(a => a != null))
            {
                var states = action[_initialState].ToImmutableHashSet();
                foreach (var state in states)
                {
                    // presumably action[from, to] is the transition probability - confirm
                    dict[state] = action[_initialState, state];
                    var optimalAction = GetOptimalActionEx(policy, state);
                    if (optimalAction == null)
                    {
                        continue;
                    }
                    nextPolicy[state] = optimalAction;
                }

                nextStates.UnionWith(states);
                allStates.UnionWith(states);
            }

            var tol      = MinProbability;
            var maxProba = 1.0;
            // Progress is reported on a log10 scale, whenever the maximum weight dropped by more than logStep decades.
            var logStep  = 0.3;
            var scale    = -1 / Math.Log(tol, 10);

            if (LogProgress)
            {
                Log?.Info("Calculating optimal actions...");
            }
            var prevMaxProba = 1.0;

            while (maxProba > tol)
            {
                maxProba = 0.0;
                var nextNextStates = new HashSet <TState>();
                foreach (var state in nextStates)
                {
                    var optimalAction = GetOptimalActionEx(policy, state);
                    if (optimalAction != null)
                    {
                        nextPolicy[state] = optimalAction;
                    }

                    // Track the largest weight seen in this round; it drives the loop condition.
                    var stateProba = dict[state];
                    if (stateProba > maxProba)
                    {
                        maxProba = stateProba;
                    }

                    // Expansion follows the action of the new policy, not of the input policy.
                    var action = nextPolicy[state];
                    if (action == null)
                    {
                        continue;
                    }

                    var actionStates = action[state].ToImmutableHashSet();
                    foreach (var nextState in actionStates)
                    {
                        var p = action[state, nextState];

                        // Keep the largest weight found for a state over all ways of reaching it.
                        if (dict.TryGetValue(nextState, out var current))
                        {
                            dict[nextState] = Math.Max(current, stateProba * p);
                        }
                        else
                        {
                            dict.Add(nextState, stateProba * p);
                        }
                    }

                    nextNextStates.UnionWith(actionStates);
                }

                if (Math.Log(prevMaxProba, 10) - Math.Log(maxProba, 10) > logStep && LogProgress)
                {
                    Log?.Info(
                        $"{Math.Truncate(Math.Min(-Math.Log(maxProba, 10) * scale, 1) * 10000) / 100}%");
                    prevMaxProba = maxProba;
                }

                // Only states not seen in an earlier round are processed next.
                nextStates.Clear();
                foreach (var state in nextNextStates)
                {
                    if (allStates.Add(state))
                    {
                        nextStates.Add(state);
                    }
                }
            }

            return(nextPolicy);
        }
Пример #13
0
        /// <summary>
        /// Computes, in parallel, the dependency layers returned by <c>GetValueDependencies</c>
        /// for every allowed (non-null) action of <paramref name="state"/>.
        /// </summary>
        /// <param name="policy">Policy whose allowed actions and value functions are used.</param>
        /// <param name="state">State whose allowed actions are examined.</param>
        /// <returns>The dependency layers keyed by action.</returns>
        private Dictionary <IMarkovAction <TState>, List <HashSet <TState> > > GetAllowedActionsDependencies(IDeterministicPolicy <TState> policy, TState state)
        {
            var depth  = ValueFunctions[policy].Count - 1;
            var result = new Dictionary <IMarkovAction <TState>, List <HashSet <TState> > >();

            var actions = policy.GetAllowedActions(state).Where(a => a != null);
            var options = new ParallelOptions {
                MaxDegreeOfParallelism = Configuration.ThreadsPerInstance
            };

            Parallel.ForEach(actions, options, action =>
            {
                var layers = GetValueDependencies(policy, state, depth, action);

                // The dictionary is shared between workers, so writes are serialized.
                lock (result)
                {
                    result[action] = layers;
                }
            });

            return result;
        }
Пример #14
0
        /// <summary>
        /// Explores the states reachable from <paramref name="initialState"/> through any allowed
        /// action, round by round, and assigns each discovered state a weight: 1.0 for the
        /// initial state and, for every other state, the largest product of the source state's
        /// weight and <c>action[source, target]</c> found so far.
        /// </summary>
        /// <param name="policy">Policy that supplies the allowed actions of each state.</param>
        /// <param name="initialState">State the exploration starts from.</param>
        /// <returns>
        /// The weight of every discovered state. States whose weight is below
        /// <c>MinProbability</c> are recorded but not expanded further.
        /// </returns>
        private Dictionary <TState, double> GetReachableStateSpace(IDeterministicPolicy <TState> policy, TState initialState)
        {
            var dict = new Dictionary <TState, double> {
                [initialState] = 1.0
            };
            var tol       = MinProbability;
            var maxProba  = 1.0;
            var allStates = new HashSet <TState>();
            var states    = new HashSet <TState> {
                initialState
            };

            // Stop once a round contains no state weighted above the cut-off, or no new state at all.
            while (maxProba > tol && states.Count > 0)
            {
                maxProba = 0.0;
                var allowedStates = new HashSet <TState>();
                foreach (var state in states)
                {
                    var stateProba = dict[state];
                    if (stateProba > maxProba)
                    {
                        maxProba = stateProba;
                    }
                    if (stateProba < tol)
                    {
                        continue;
                    }

                    // The allowed actions are only looked up for states that are actually expanded.
                    var allowedActions = policy.GetAllowedActions(state).Where(a => a != null).ToArray();
                    foreach (var action in allowedActions)
                    {
                        var nextStates = action[state].ToImmutableHashSet();
                        foreach (var nextState in nextStates)
                        {
                            var p = action[state, nextState];

                            // Keep the largest weight found for a state over all ways of reaching it.
                            if (dict.TryGetValue(nextState, out var current))
                            {
                                dict[nextState] = Math.Max(current, stateProba * p);
                            }
                            else
                            {
                                dict.Add(nextState, stateProba * p);
                            }
                        }

                        allowedStates.UnionWith(nextStates);
                    }
                }

                // Only states not seen in an earlier round are processed next.
                states.Clear();
                foreach (var allowedState in allowedStates)
                {
                    if (allStates.Add(allowedState))
                    {
                        states.Add(allowedState);
                    }
                }
            }

            return(dict);
        }
Пример #15
0
 /// <summary>
 /// Checks whether <paramref name="otherPolicy"/> selects an equal action for every state
 /// stored in this policy. States known only to the other policy are not examined.
 /// </summary>
 /// <param name="otherPolicy">Policy to compare against.</param>
 /// <returns>
 /// <c>true</c> when all actions of this policy equal the other policy's actions for the
 /// same states; otherwise <c>false</c>.
 /// </returns>
 public bool HasSameActionsAs(IDeterministicPolicy <TState, TAction> otherPolicy)
 {
     foreach (var state in _actions.Keys)
     {
         var ownAction   = _actions[state];
         var otherAction = otherPolicy.Action(state);
         if (!ownAction.Equals(otherAction))
         {
             return false;
         }
     }

     return true;
 }