Example #1
        /// <summary>
        /// Uses the local Multi-Armed-Bandits to explore the action space and the global Multi-Armed-Bandit to exploit the best-performing actions.
        /// </summary>
        /// <param name="context">The current search context.</param>
        /// <param name="state">The game state for the node.</param>
        /// <param name="gMAB">The global Multi-Armed-Bandit.</param>
        /// <returns>An <see cref="A"/> that was selected from the global Multi-Armed-Bandit.</returns>
        private A NaïveSampling(SearchContext<D, P, A, S, Sol> context, P state, IDictionary<long, Dictionary<int, LocalArm>> gMAB)
        {
            var apply      = context.Application;
            var stateClone = context.Cloner.Clone(state);
            var stateHash  = stateClone.HashMethod();

            if (!gMAB.ContainsKey(stateHash))
            {
                gMAB.Add(stateHash, new Dictionary<int, LocalArm>());
            }

            // Use a policy p_0 to determine whether to explore or exploit
            // If explore was selected
            //      x_1...x_n is sampled by using a policy p_l to select a value for each X_i in X independently.
            //      As a side effect, the resulting value combination is added to the global MAB.
            // If exploit was selected
            //      x_1...x_n is sampled by using a policy p_g to select a value combination using MAB_g.

            // Can only exploit if there is anything to exploit in the first place
            if (gMAB[stateHash].IsNullOrEmpty() || ExplorationStrategy.Policy(context, 0))
            {
                // Explore

                // Create an action according to the local policy p_l
                var action     = SamplingStrategy.Sample(stateClone);
                var actionHash = action.GetHashCode();
                // Evaluate the sampled action
                var endState = PlayoutStrategy.Playout(context, apply.Apply(context, stateClone, action));
                var tempNode = new TreeSearchNode<P, A> {
                    Payload = action
                };
                var reward = EvaluationStrategy.Evaluate(context, tempNode, endState);
                // Add the action to the global MAB
                if (gMAB[stateHash].ContainsKey(actionHash))
                {
                    gMAB[stateHash][actionHash].Visit(reward);
                }
                else
                {
                    var newArm = new LocalArm(action);
                    newArm.Visit(reward);
                    gMAB[stateHash].Add(actionHash, newArm);
                }

                return action;
            }

            // Exploit: epsilon-greedy selection. With probability 1-e (where e = PolicyGlobal) return the
            // action with the highest expected reward; otherwise return the action of a random arm.
            return _rng.NextDouble() <= 1 - PolicyGlobal
                       ? gMAB[stateHash].Values.OrderByDescending(i => i.ExpectedReward).First().Action
                       : gMAB[stateHash].RandomElementOrDefault().Value.Action;
        }
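
The LocalArm type used above is not shown on this page. Below is a minimal sketch of what such an arm could look like, assuming it only needs to remember its action, a visit count, and a running mean of the observed rewards; the constructor, Visit, ExpectedReward and Action members are inferred from the call sites above, and the framework's real class may differ.

// Hypothetical sketch of a Multi-Armed-Bandit arm, inferred from Example #1's usage.
public class LocalArm<A>
{
    // The sampled action this arm represents.
    public A Action { get; }
    // Number of times this arm has been played.
    public int Visits { get; private set; }
    // Running mean of all rewards observed for this arm.
    public double ExpectedReward { get; private set; }

    public LocalArm(A action) => Action = action;

    // Record one playout result and update the mean incrementally.
    public void Visit(double reward)
    {
        Visits++;
        ExpectedReward += (reward - ExpectedReward) / Visits;
    }
}

The incremental-mean update keeps ExpectedReward exact without storing the individual rewards, which matters when an arm is visited many times during search.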
Example #2
 public RomanNumeral(int number, EvaluationStrategy strategy)
 {
     Number = number;
     Text   = strategy.Evaluate(number);
 }
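
The EvaluationStrategy contract is not shown in Example #2. Here is a minimal sketch under the assumption that it is an abstract base class exposing a single string Evaluate(int) method and that RomanNumeral (from Example #2) has public Number and Text properties; SubtractiveRomanStrategy and Demo are hypothetical names introduced here for illustration only.

using System.Text;

// Assumed contract: the real type may be an interface or have a different shape.
public abstract class EvaluationStrategy
{
    public abstract string Evaluate(int number);
}

// Hypothetical strategy: converts an integer to a Roman numeral using subtractive notation.
public class SubtractiveRomanStrategy : EvaluationStrategy
{
    private static readonly (int Value, string Symbol)[] Map =
    {
        (1000, "M"), (900, "CM"), (500, "D"), (400, "CD"),
        (100, "C"), (90, "XC"), (50, "L"), (40, "XL"),
        (10, "X"), (9, "IX"), (5, "V"), (4, "IV"), (1, "I")
    };

    public override string Evaluate(int number)
    {
        var sb = new StringBuilder();
        // Greedily consume the largest value that still fits; subtractive pairs
        // such as "CM" and "IV" are handled by listing them in the table.
        foreach (var (value, symbol) in Map)
        {
            while (number >= value)
            {
                sb.Append(symbol);
                number -= value;
            }
        }
        return sb.ToString();
    }
}

public static class Demo
{
    public static void Main()
    {
        var numeral = new RomanNumeral(1987, new SubtractiveRomanStrategy());
        System.Console.WriteLine(numeral.Text); // prints "MCMLXXXVII"
    }
}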