/// <summary>
/// Chooses the action for <paramref name="state"/> whose expectation has the highest mean
/// under the most recent value function stored for <paramref name="policy"/>.
/// </summary>
/// <remarks>
/// Before comparing actions, the states returned by <c>GetGreedyDependencies</c> are fed to
/// <c>ComputeValueFunction</c>, one value-function iteration at a time. If the state already
/// had a value and the best mean differs from it, the difference is stored in <c>Changes</c>.
/// </remarks>
/// <param name="policy">Policy whose value functions and allowed actions are consulted.</param>
/// <param name="state">State for which an action is selected.</param>
/// <returns>
/// The best action found, or the policy's current action for the state when no allowed
/// action beats the state's current value.
/// </returns>
private IMarkovAction<TState> GetOptimalActionEx(IDeterministicPolicy<TState> policy, TState state)
{
    var lastIndex = ValueFunctions[policy].Count - 1;
    var latestValues = ValueFunctions[policy][lastIndex];

    // Start from the policy's current choice and the state's current value (if it has one).
    var bestAction = policy[state];
    var bestValue = new Value(double.MinValue, 0.0);
    var initialValue = bestValue;
    if (latestValues.HasState(state))
    {
        bestValue = latestValues[state];
        initialValue = bestValue;
    }

    // dependencies[depth] holds the states required at iteration (lastIndex - depth);
    // walk from the deepest level up so iterations are computed in ascending order.
    var dependencies = GetGreedyDependencies(policy, state);
    for (var depth = lastIndex - 1; depth >= 0; --depth)
    {
        var pending = dependencies[depth];
        if (pending.Count > 0)
        {
            ComputeValueFunction(lastIndex - depth, policy, pending);
        }
    }

    // Re-read the latest value function after the computations above.
    latestValues = ValueFunctions[policy][lastIndex];
    foreach (var candidate in policy.GetAllowedActions(state))
    {
        if (candidate == null)
        {
            continue;
        }

        var expected = GetExpectation(candidate, state, latestValues);
        if (expected.Mean <= bestValue.Mean)
        {
            continue;
        }

        bestValue = new Value(expected.Mean, expected.Variance);
        bestAction = candidate;
    }

    // Only record a change when the state had a value to begin with.
    if (initialValue.Mean != double.MinValue && bestValue.Mean != initialValue.Mean)
    {
        Changes[state] = bestValue.Mean - initialValue.Mean;
    }

    return bestAction;
}
/// <summary>
/// Builds, level by level, the sets of states whose values are still missing from the value
/// functions of <paramref name="policy"/> when looking ahead from <paramref name="state"/>.
/// </summary>
/// <remarks>
/// Level 0 contains the successors of <paramref name="state"/> under every non-null allowed
/// action that are absent from the latest value function. Each further level follows the
/// policy's own action from the states of the previous level and filters against the value
/// function one iteration earlier. When <c>UseReachableStateSpace</c> is false, a state
/// without an optimal action gets one assigned from the previous policy (if there is one),
/// which mutates <paramref name="policy"/>.
/// </remarks>
/// <param name="policy">Policy whose value functions and actions drive the expansion.</param>
/// <param name="state">State from which the look-ahead starts.</param>
/// <returns>One set per level; the list has as many entries as the policy has value functions.</returns>
private List<HashSet<TState>> GetGreedyDependencies(IDeterministicPolicy<TState> policy, TState state)
{
    var levels = new List<HashSet<TState>>();
    var lastIndex = ValueFunctions[policy].Count - 1;
    var knownValues = ValueFunctions[policy][lastIndex];
    var candidates = policy.GetAllowedActions(state);
    var previousPolicy = Policies.Find(policy)?.Previous?.Value;

    // Level 0: successors of the start state under every allowed action.
    var frontier = new HashSet<TState>();
    foreach (var candidate in candidates)
    {
        if (candidate == null)
        {
            continue;
        }

        foreach (var successor in candidate[state])
        {
            if (!knownValues.HasState(successor))
            {
                frontier.Add(successor);
            }
        }
    }

    levels.Add(frontier);

    // Deeper levels: follow the policy's chosen action, checking against earlier iterations.
    for (var level = lastIndex - 1; level >= 0; --level)
    {
        knownValues = ValueFunctions[policy][level];
        var expanded = new HashSet<TState>();

        foreach (var current in frontier)
        {
            if (!UseReachableStateSpace && !policy.HasOptimal(current) && previousPolicy != null)
            {
                policy[current] = GetOptimalActionEx(previousPolicy, current);
            }

            var chosen = policy[current];
            if (chosen == null)
            {
                continue;
            }

            foreach (var successor in chosen[current])
            {
                if (!knownValues.HasState(successor))
                {
                    expanded.Add(successor);
                }
            }
        }

        levels.Add(expanded);
        frontier = expanded;
    }

    return levels;
}
/// <summary>
/// Computes, in parallel, the value dependencies of every non-null action allowed in
/// <paramref name="state"/> and returns them keyed by action.
/// </summary>
/// <param name="policy">Policy supplying the allowed actions and the value-function count.</param>
/// <param name="state">State whose allowed actions are analysed.</param>
/// <returns>A dictionary mapping each non-null allowed action to the result of <c>GetValueDependencies</c>.</returns>
private Dictionary<IMarkovAction<TState>, List<HashSet<TState>>> GetAllowedActionsDependencies(IDeterministicPolicy<TState> policy, TState state)
{
    var lastIndex = ValueFunctions[policy].Count - 1;
    var dependenciesByAction = new Dictionary<IMarkovAction<TState>, List<HashSet<TState>>>();

    var candidates = policy.GetAllowedActions(state).Where(candidate => candidate != null);
    var options = new ParallelOptions { MaxDegreeOfParallelism = Configuration.ThreadsPerInstance };

    Parallel.ForEach(candidates, options, candidate =>
    {
        var dependencies = GetValueDependencies(policy, state, lastIndex, candidate);

        // The dictionary is shared by all workers; serialise writes to it.
        lock (dependenciesByAction)
        {
            dependenciesByAction[candidate] = dependencies;
        }
    });

    return dependenciesByAction;
}
/// <summary>
/// Chooses the action for <paramref name="state"/> whose expectation has the highest mean,
/// computing the value-function entries each action depends on before evaluating it.
/// </summary>
/// <remarks>
/// Dependencies come from <c>GetAllowedActionsDependencies</c>. For each action and each
/// value-function iteration, only the states not yet present in that iteration's value
/// function are handed to <c>ComputeValueFunction</c>.
/// </remarks>
/// <param name="policy">Policy whose value functions and allowed actions are consulted.</param>
/// <param name="state">State for which an action is selected.</param>
/// <returns>
/// The best action found, or the policy's current action for the state when no allowed
/// action beats the state's current value.
/// </returns>
private IMarkovAction<TState> GetOptimalAction(IDeterministicPolicy<TState> policy, TState state)
{
    var lastIndex = ValueFunctions[policy].Count - 1;
    var latestValues = ValueFunctions[policy][lastIndex];

    var bestAction = policy[state];
    var bestValue = new Value(double.MinValue, 0.0);
    if (latestValues.HasState(state))
    {
        bestValue = latestValues[state];
    }

    var dependenciesByAction = GetAllowedActionsDependencies(policy, state);
    foreach (var candidate in policy.GetAllowedActions(state))
    {
        if (candidate == null)
        {
            continue;
        }

        // dependencies[depth] belongs to iteration (lastIndex - depth); process the
        // iterations in ascending order.
        var dependencies = dependenciesByAction[candidate];
        for (var depth = lastIndex - 1; depth >= 0; --depth)
        {
            var iteration = lastIndex - depth;
            var iterationValues = ValueFunctions[policy][iteration];
            var required = dependencies[depth];
            if (required.Count == 0)
            {
                continue;
            }

            // Deliberately left lazy: the filter is evaluated as the callee enumerates it.
            var missing = required.Where(s => !iterationValues.HasState(s));
            ComputeValueFunction(iteration, policy, missing);
        }

        latestValues = ValueFunctions[policy][lastIndex];
        var expected = GetExpectation(candidate, state, latestValues);
        if (expected.Mean <= bestValue.Mean)
        {
            continue;
        }

        bestValue = new Value(expected.Mean, expected.Variance);
        bestAction = candidate;
    }

    return bestAction;
}
/// <summary>
/// Produces an improved copy of <paramref name="policy"/> by assigning optimal actions to the
/// states reached from <c>_initialState</c>, expanding the frontier until the largest
/// reach value seen in a sweep drops to <c>MinProbability</c> or below.
/// </summary>
/// <remarks>
/// The first phase expands the initial state through every non-null allowed action of the
/// original policy. Later sweeps follow the action stored in the new policy for each frontier
/// state. Progress is logged when <c>LogProgress</c> is set.
/// </remarks>
/// <param name="policy">Policy to improve; it is cloned, and the clone is returned.</param>
/// <returns>The cloned policy with updated actions.</returns>
private IDeterministicPolicy<TState> PartialPolicyIteration(IDeterministicPolicy<TState> policy)
{
    var improved = (IDeterministicPolicy<TState>)policy.Clone();
    var reachByState = new Dictionary<TState, double> { [_initialState] = 1.0 };
    var visited = new HashSet<TState> { _initialState };
    var frontier = new HashSet<TState>();

    // Seed phase: initial state -> successors under every allowed action.
    improved[_initialState] = GetOptimalActionEx(policy, _initialState);
    foreach (var seedAction in policy.GetAllowedActions(_initialState))
    {
        if (seedAction == null)
        {
            continue;
        }

        var successors = seedAction[_initialState].ToImmutableHashSet();
        foreach (var successor in successors)
        {
            // NOTE(review): this overwrites any earlier entry, whereas the sweep below keeps
            // the maximum - confirm that is intended.
            reachByState[successor] = seedAction[_initialState, successor];

            var best = GetOptimalActionEx(policy, successor);
            if (best != null)
            {
                improved[successor] = best;
            }
        }

        frontier.UnionWith(successors);
        visited.UnionWith(successors);
    }

    var tol = MinProbability;
    var logStep = 0.3;
    var scale = -1 / Math.Log(tol, 10);
    if (LogProgress)
    {
        Log?.Info("Calculating optimal actions...");
    }

    var maxProba = 1.0;
    var lastLoggedProba = 1.0;
    while (maxProba > tol)
    {
        maxProba = 0.0;
        var discovered = new HashSet<TState>();

        foreach (var current in frontier)
        {
            var best = GetOptimalActionEx(policy, current);
            if (best != null)
            {
                improved[current] = best;
            }

            var currentReach = reachByState[current];
            if (currentReach > maxProba)
            {
                maxProba = currentReach;
            }

            var chosen = improved[current];
            if (chosen == null)
            {
                continue;
            }

            var successors = chosen[current].ToImmutableHashSet();
            foreach (var successor in successors)
            {
                var reach = currentReach * chosen[current, successor];
                reachByState[successor] = reachByState.TryGetValue(successor, out var known)
                    ? Math.Max(known, reach)
                    : reach;
            }

            discovered.UnionWith(successors);
        }

        // Log each time the largest reach value has dropped by more than logStep decades.
        if (Math.Log(lastLoggedProba, 10) - Math.Log(maxProba, 10) > logStep && LogProgress)
        {
            Log?.Info(
                $"{Math.Truncate(Math.Min(-Math.Log(maxProba, 10) * scale, 1) * 10000) / 100}%");
            lastLoggedProba = maxProba;
        }

        // The next frontier is whatever was discovered for the first time in this sweep.
        frontier.Clear();
        foreach (var candidate in discovered)
        {
            if (visited.Add(candidate))
            {
                frontier.Add(candidate);
            }
        }
    }

    return improved;
}
/// <summary>
/// Explores the states reachable from <paramref name="initialState"/> through every non-null
/// action <paramref name="policy"/> allows, and records for each discovered state the largest
/// product of transition weights found along any explored path (presumably a reach
/// probability - confirm against the action indexer).
/// </summary>
/// <remarks>
/// Exploration proceeds frontier by frontier. States whose recorded value is below
/// <c>MinProbability</c> are kept in the result but not expanded. The loop stops when the
/// frontier is empty or the largest value in the last frontier is at or below that tolerance.
/// </remarks>
/// <param name="policy">Policy supplying the allowed actions for each state.</param>
/// <param name="initialState">State the exploration starts from; it is recorded with value 1.0.</param>
/// <returns>A dictionary from each discovered state to its recorded value.</returns>
private Dictionary<TState, double> GetReachableStateSpace(IDeterministicPolicy<TState> policy, TState initialState)
{
    var tol = MinProbability;
    var dict = new Dictionary<TState, double> { [initialState] = 1.0 };

    // The initial state is not pre-marked as visited, so it joins a later frontier once
    // if some action leads back to it.
    var allStates = new HashSet<TState>();
    var states = new HashSet<TState> { initialState };
    var maxProba = 1.0;

    while (maxProba > tol && states.Count > 0)
    {
        maxProba = 0.0;
        var allowedStates = new HashSet<TState>();

        foreach (var state in states)
        {
            var stateProba = dict[state];
            if (stateProba > maxProba)
            {
                maxProba = stateProba;
            }

            // Prune before looking up actions so pruned states cost nothing more.
            if (stateProba < tol)
            {
                continue;
            }

            foreach (var action in policy.GetAllowedActions(state))
            {
                if (action == null)
                {
                    continue;
                }

                var nextStates = action[state].ToImmutableHashSet();
                foreach (var nextState in nextStates)
                {
                    var reach = stateProba * action[state, nextState];
                    dict[nextState] = dict.TryGetValue(nextState, out var current)
                        ? Math.Max(current, reach)
                        : reach;
                }

                allowedStates.UnionWith(nextStates);
            }
        }

        // Only states seen for the first time form the next frontier.
        states.Clear();
        foreach (var allowedState in allowedStates)
        {
            if (allStates.Add(allowedState))
            {
                states.Add(allowedState);
            }
        }
    }

    return dict;
}