/// <summary>
/// Uses the local Multi-Armed-Bandits to explore the action space and uses the global Multi-Armed-Bandit to exploit the best performing actions.
/// </summary>
/// <param name="context">The current search context.</param>
/// <param name="state">The game state for the node.</param>
/// <param name="gMAB">The global Multi-Armed-Bandit.</param>
/// <returns>An <see cref="A"/> that was selected from the global Multi-Armed-Bandit.</returns>
private A NaïveSampling(SearchContext<D, P, A, S, Sol> context, P state, IDictionary<long, Dictionary<int, LocalArm>> gMAB) {
    var apply = context.Application;
    var stateClone = context.Cloner.Clone(state);
    var stateHash = stateClone.HashMethod();
    if (!gMAB.ContainsKey(stateHash))
        gMAB.Add(stateHash, new Dictionary<int, LocalArm>());

    // Use policy p_0 to determine whether to explore or exploit.
    // If explore was selected:
    //      x_1...x_n is sampled by using the local policy p_l to select a value for each X_i in X independently.
    //      As a side effect, the resulting value combination is added to the global MAB.
    // If exploit was selected:
    //      x_1...x_n is sampled by using the global policy p_g to select a value combination using MAB_g.

    // Can only exploit if there is anything to exploit in the first place.
    if (gMAB[stateHash].IsNullOrEmpty() || ExplorationStrategy.Policy(context, 0)) {
        // Explore

        // Create an action according to the local policy p_l.
        var action = SamplingStrategy.Sample(stateClone);
        var actionHash = action.GetHashCode();

        // Evaluate the sampled action by playing it out from the resulting state.
        var endState = PlayoutStrategy.Playout(context, apply.Apply(context, stateClone, action));
        var tempNode = new TreeSearchNode<P, A> { Payload = action };
        var reward = EvaluationStrategy.Evaluate(context, tempNode, endState);

        // Add the action to the global MAB, or update its arm if it is already present.
        if (gMAB[stateHash].ContainsKey(actionHash))
            gMAB[stateHash][actionHash].Visit(reward);
        else {
            var newArm = new LocalArm(action);
            newArm.Visit(reward);
            gMAB[stateHash].Add(actionHash, newArm);
        }

        return action;
    }

    // Exploit; epsilon-greedy by returning the action with the highest expected reward with probability 1-e, otherwise returning a random action.
    return _rng.NextDouble() <= 1 - PolicyGlobal
        ? gMAB[stateHash].Values.OrderByDescending(i => i.ExpectedReward).First().Action
        : gMAB[stateHash].RandomElementOrDefault().Value.Action;
}
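
// Illustrative only: NaïveSampling above assumes that each LocalArm tracks a visit count and a running
// average of rewards, exposed through Visit(reward) and ExpectedReward. The nested sketch below shows a
// minimal, hypothetical version of that contract (reward assumed to be a double); the actual LocalArm
// implementation used by this search lives elsewhere in the library and may differ.
private class LocalArmSketch {
    public A Action { get; }
    public int Visits { get; private set; }
    public double TotalReward { get; private set; }

    // Mean reward observed so far; 0 when the arm has never been visited.
    public double ExpectedReward => Visits > 0 ? TotalReward / Visits : 0;

    public LocalArmSketch(A action) {
        Action = action;
    }

    // Record the result of one playout for this arm.
    public void Visit(double reward) {
        Visits++;
        TotalReward += reward;
    }
}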
/// <summary>
/// Generates the interesting subset of actions C* from C.
///
/// 1) Generate a weight function R^ from PartialActions (adopting the linear side information assumption).
/// 2) Stochastically generate a probability distribution D_R^ over the CombinedAction space C, biased "towards" R^.
/// 3) Sample a number of CombinedActions C* from D_R^.
/// </summary>
/// <param name="context">The current search context.</param>
/// <returns>List of <see cref="A"/>.</returns>
private List<A> Generate(SearchContext<D, P, A, S, A> context) {
    // Create the side information using the allowed number of generation samples.
    SideInformation = SideInformationStrategy.Create(context, GenerationSamples);

    // Create combined-actions by sampling the side information.
    var sampledActions = new List<A>();
    for (var i = 0; i < EvaluationSamples; i++) {
        sampledActions.Add(SamplingStrategy.Sample(context.Source, SideInformation));
    }

    return sampledActions;
}
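
// Illustrative only: the SamplingStrategy call in Generate is expected to draw combined-actions from a
// distribution biased towards the side-information weights R^. The hypothetical helper below shows one
// common way to realise such a biased draw (roulette-wheel selection over a weight table); the library's
// actual SamplingStrategy implementation may work differently. Assumes System.Linq is in scope, as it
// already is for the LINQ calls used above.
private static T SampleWeighted<T>(IReadOnlyList<T> items, IReadOnlyList<double> weights, Random rng) {
    // Draw a point uniformly in [0, total weight) and return the item whose cumulative weight covers it.
    var total = weights.Sum();
    var roll = rng.NextDouble() * total;
    var cumulative = 0d;
    for (var i = 0; i < items.Count; i++) {
        cumulative += weights[i];
        if (roll < cumulative) return items[i];
    }
    // Fallback for floating-point rounding at the upper edge of the range.
    return items[items.Count - 1];
}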