Example #1
        public override ExplorerDecision <int> MapContext(PRG random, float[] weights, int numActions)
        {
            int numWeights = weights.Length;

            if (numActions != int.MaxValue && numWeights != numActions)
            {
                throw new ArgumentException("The number of weights returned by the scorer must equal number of actions");
            }

            // Create a discrete_distribution based on the returned weights. This class handles the
            // case where the sum of the weights is < or > 1, by normalizing against the sum.
            float total = 0f;

            for (int i = 0; i < numWeights; i++)
            {
                if (weights[i] < 0)
                {
                    throw new ArgumentException("Scores must be non-negative.");
                }

                total += weights[i];
            }

            if (total == 0)
            {
                throw new ArgumentException("At least one score must be positive.");
            }

            float draw = random.UniformUnitInterval();

            float sum = 0f;
            float actionProbability = 0f;
            int   actionIndex       = numWeights - 1;

            for (int i = 0; i < numWeights; i++)
            {
                weights[i] = weights[i] / total;
                sum       += weights[i];
                // This needs to be >=, not >, in case the random draw = 1.0, since sum would never
                // be > 1.0 and the loop would exit without assigning the right action probability.
                if (sum >= draw)
                {
                    actionIndex       = i;
                    actionProbability = weights[i];
                    break;
                }
            }

            actionIndex++;

            // action id is one-based
            return(ExplorerDecision.Create(
                       actionIndex,
                       new GenericExplorerState { Probability = actionProbability },
                       true));
        }
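
For reference, the roulette-wheel (inverse-CDF) sampling used in Example #1 can be sketched in isolation. This is a minimal sketch, not library code: System.Random stands in for the library's PRG, and the WeightedSampler name is illustrative.

using System;

static class WeightedSampler
{
    // Draws a zero-based index with probability proportional to the (non-negative) weights.
    public static int Sample(Random random, float[] weights)
    {
        float total = 0f;
        foreach (float w in weights) total += w;

        // Scale the draw by the total instead of normalizing every weight up front.
        float draw = (float)random.NextDouble() * total;

        float cumulative = 0f;
        for (int i = 0; i < weights.Length; i++)
        {
            cumulative += weights[i];
            // >= mirrors the comparison above and covers the edge case draw == cumulative.
            if (cumulative >= draw) return i;
        }

        return weights.Length - 1; // guard against floating-point rounding
    }
}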
Example #2
 public ExplorerDecision <int> Explore(PRG random, int numActions)
 {
     return(ExplorerDecision.Create(
                random.UniformInt(1, numActions),
                new GenericExplorerState { Probability = 1f },
                shouldRecord: true));
 }
Example #3
        public override ExplorerDecision <int[]> MapContext(PRG prg, int[] ranking, int numActions)
        {
            if (ranking == null || ranking.Length < 1)
            {
                throw new ArgumentException("Actions chosen by default policy must not be empty.");
            }

            var decision = this.explorer.MapContext(prg, ranking[0], ranking.Length);

            MultiActionHelper.PutActionToList(decision.Value, ranking);

            return(ExplorerDecision.Create(ranking, decision.ExplorerState, decision.ShouldRecord));
        }
Example #4
        public ExplorerDecision <TAction> MapContext(PRG random, IReadOnlyCollection <TAction> policyActions, int numActions)
        {
            // Invoke the default policy function to get the action
            TAction chosenDecision    = default(TAction);
            float   actionProbability = 0f;

            if (this.explore)
            {
                // Select bag
                int chosenBag = random.UniformInt(0, policyActions.Count - 1);

                int[] actionsSelected = Enumerable.Repeat <int>(0, numActions).ToArray();

                int currentBag = 0;
                foreach (var policyAction in policyActions)
                {
                    var actionFromBag = this.GetTopAction(policyAction);

                    if (actionFromBag == 0 || actionFromBag > numActions)
                    {
                        throw new ArgumentException("Action chosen by default policy is not within valid range.");
                    }

                    // Note: this indexing assumes action ids are contiguous and one-based (1..numActions)
                    actionsSelected[actionFromBag - 1]++; // action id is one-based

                    if (currentBag == chosenBag)
                    {
                        chosenDecision = policyAction;
                    }

                    currentBag++;
                }

                actionProbability = (float)actionsSelected[this.GetTopAction(chosenDecision) - 1] / policyActions.Count; // action id is one-based
            }
            else
            {
                chosenDecision    = policyActions.First();
                actionProbability = 1f;
            }

            GenericExplorerState explorerState = new GenericExplorerState
            {
                Probability = actionProbability
            };

            return(ExplorerDecision.Create(chosenDecision, explorerState, true));
        }
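
The probability this bagging explorer reports is the fraction of bags whose top action matches the chosen one. A minimal standalone sketch of that bookkeeping, with illustrative names and no library types:

using System.Linq;

static class BaggingProbability
{
    // topActionPerBag holds each bag's top (one-based) action id;
    // chosenAction is the top action of the randomly selected bag.
    public static float Of(int[] topActionPerBag, int chosenAction)
    {
        int votes = topActionPerBag.Count(a => a == chosenAction);
        return (float)votes / topActionPerBag.Length;
    }
}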
Example #5
        public override ExplorerDecision <int[]> MapContext(PRG random, float[] weights, int numActions)
        {
            var decision = this.explorer.MapContext(random, weights, numActions);

            float actionProbability = 0f;

            int[] chosenActions = MultiActionHelper.SampleWithoutReplacement(weights, weights.Length, random, ref actionProbability);

            // action id is one-based
            return(ExplorerDecision.Create(chosenActions,
                                           new GenericExplorerState { Probability = actionProbability },
                                           true));
        }
Example #6
        public override ExplorerDecision <int> MapContext(PRG random, int policyAction, int numActionsVariable)
        {
            if (policyAction == 0 || policyAction > numActionsVariable)
            {
                throw new ArgumentException("Action chosen by default policy is not within valid range.");
            }

            float actionProbability;
            bool  isExplore;

            float epsilon         = explore ? this.defaultEpsilon : 0f;
            float baseProbability = epsilon / numActionsVariable; // uniform probability

            if (random.UniformUnitInterval() < 1f - epsilon)
            {
                actionProbability = 1f - epsilon + baseProbability;
                isExplore         = false;
            }
            else
            {
                // Get uniform random 1-based action ID
                int actionId = random.UniformInt(1, numActionsVariable);

                if (actionId == policyAction)
                {
                    // If it matches the one chosen by the default policy
                    // then increase the probability
                    actionProbability = 1f - epsilon + baseProbability;
                }
                else
                {
                    // Otherwise it's just the uniform probability
                    actionProbability = baseProbability;
                }
                policyAction = actionId;
                isExplore    = true;
            }

            EpsilonGreedyState explorerState = new EpsilonGreedyState
            {
                Epsilon     = epsilon,
                IsExplore   = isExplore,
                Probability = actionProbability
            };

            return(ExplorerDecision.Create(policyAction, explorerState, true));
        }
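
The probability bookkeeping in this epsilon-greedy explorer boils down to: the policy action keeps mass 1 - epsilon plus its share epsilon / N of the uniform mass, and every other action gets epsilon / N. A minimal standalone sketch under that assumption, using System.Random in place of PRG and one-based action ids (names are illustrative):

using System;

static class EpsilonGreedySketch
{
    public static (int Action, float Probability, bool IsExplore) Choose(
        Random random, int policyAction, int numActions, float epsilon)
    {
        float uniform = epsilon / numActions; // exploration mass per action

        if (random.NextDouble() < 1f - epsilon)
        {
            // Exploit: the policy action also receives its share of the uniform mass.
            return (policyAction, 1f - epsilon + uniform, false);
        }

        // Explore: draw a uniform one-based action; it may coincide with the policy action.
        int action = random.Next(1, numActions + 1);
        float probability = action == policyAction ? 1f - epsilon + uniform : uniform;
        return (action, probability, true);
    }
}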
Example #7
        public ExplorerDecision <int[]> Explore(PRG random, int numActionsVariable)
        {
            var ranking = Enumerable.Range(1, numActionsVariable).ToArray();

            for (int i = 0; i < ranking.Length - 1 && i < maxPermutations; i++)
            {
                int swapIndex = random.UniformInt(i, ranking.Length - 1);

                int temp = ranking[swapIndex];
                ranking[swapIndex] = ranking[i];
                ranking[i]         = temp;
            }

            return(ExplorerDecision.Create(ranking,
                                           new GenericExplorerState { Probability = 1f },
                                           true));
        }
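
The permutation loop above is a Fisher-Yates shuffle, optionally truncated by maxPermutations. A minimal sketch of the full shuffle over one-based action ids, with System.Random standing in for PRG (names are illustrative):

using System;

static class UniformRanking
{
    // Returns a uniformly random permutation of the action ids 1..n.
    public static int[] Shuffle(Random random, int n)
    {
        int[] ranking = new int[n];
        for (int i = 0; i < n; i++) ranking[i] = i + 1;

        for (int i = 0; i < n - 1; i++)
        {
            // Pick uniformly from the not-yet-fixed suffix [i, n - 1].
            int swapIndex = random.Next(i, n);

            int temp = ranking[swapIndex];
            ranking[swapIndex] = ranking[i];
            ranking[i] = temp;
        }

        return ranking;
    }
}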
Example #8
        public override ExplorerDecision <int> MapContext(PRG random, int policyAction, int numActionsVariable)
        {
            if (policyAction == 0 || policyAction > numActionsVariable)
            {
                throw new ArgumentException("Action chosen by default policy is not within valid range.");
            }

            int   chosenAction         = 0;
            float actionProbability    = 0f;
            bool  shouldRecordDecision = true;
            bool  isExplore            = true;
            int   tau = this.tau;

            lock (this.lockObject)
            {
                if (this.tau > 0 && this.explore)
                {
                    this.tau--;

                    chosenAction      = random.UniformInt(1, numActionsVariable);
                    actionProbability = 1f / numActionsVariable;
                    isExplore         = true;
                }
                else
                {
                    chosenAction = policyAction;

                    actionProbability = 1f;
                    isExplore         = false;
                }
            }

            TauFirstState explorerState = new TauFirstState
            {
                IsExplore   = isExplore,
                Probability = actionProbability,
                Tau         = tau
            };

            return(ExplorerDecision.Create(chosenAction, explorerState, shouldRecordDecision));
        }
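
Tau-first exploration reduces to: choose uniformly (probability 1 / N) for the first tau decisions, then follow the policy with probability 1. A minimal, non-thread-safe sketch under that assumption, with System.Random in place of PRG (names are illustrative):

using System;

sealed class TauFirstSketch
{
    private int tau;

    public TauFirstSketch(int tau)
    {
        this.tau = tau;
    }

    public (int Action, float Probability, bool IsExplore) Choose(Random random, int policyAction, int numActions)
    {
        if (tau > 0)
        {
            tau--;
            // Uniform one-based action during the initial exploration phase.
            return (random.Next(1, numActions + 1), 1f / numActions, true);
        }

        // After tau decisions, follow the policy deterministically.
        return (policyAction, 1f, false);
    }
}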
Example #9
        public ExplorerDecision <int[]> MapContext(PRG random, int[] policyAction, int numActions)
        {
            MultiActionHelper.ValidateActionList(policyAction);

            float epsilon = this.explore ? this.defaultEpsilon : 0f;

            int[] chosenAction;
            bool  isExplore;

            if (random.UniformUnitInterval() < epsilon)
            {
                // 1 ... n
                chosenAction = Enumerable.Range(1, policyAction.Length).ToArray();

                // 0 ... n - 2
                for (int i = 0; i < policyAction.Length - 1; i++)
                {
                    int swapIndex = random.UniformInt(i, policyAction.Length - 1);

                    int temp = chosenAction[swapIndex];
                    chosenAction[swapIndex] = chosenAction[i];
                    chosenAction[i]         = temp;
                }

                isExplore = true;
            }
            else
            {
                chosenAction = policyAction;
                isExplore    = false;
            }

            EpsilonGreedySlateState explorerState = new EpsilonGreedySlateState
            {
                Epsilon   = this.defaultEpsilon,
                IsExplore = isExplore,
                Ranking   = policyAction
            };

            return(ExplorerDecision.Create(chosenAction, explorerState, true));
        }
Example #10
        public override ExplorerDecision <int[]> MapContext(PRG random, float[] scores, int numActions)
        {
            if (scores == null || scores.Length < 1)
            {
                throw new ArgumentException("Scores returned by default policy must not be empty.");
            }

            var decision = this.explorer.MapContext(random, scores, numActions);

            int numActionsVariable = scores.Length;

            int[] chosenActions;
            // Note: there might be a way using out generic parameters and explicit interface implementation to avoid the cast
            float actionProbability = ((GenericExplorerState)decision.ExplorerState).Probability;

            if (this.explore)
            {
                chosenActions = MultiActionHelper.SampleWithoutReplacement(scores, numActionsVariable, random, ref actionProbability);
            }
            else
            {
                // avoid LINQ to optimize perf; fill with one-based action ids 1..numActionsVariable
                chosenActions = new int[numActionsVariable];
                for (int i = 1; i <= numActionsVariable; i++)
                {
                    chosenActions[i - 1] = i;
                }

                // swap the max-score action (decision.Value is one-based) into the first slot
                int topIndex    = decision.Value - 1;
                int firstAction = chosenActions[0];
                chosenActions[0]        = chosenActions[topIndex];
                chosenActions[topIndex] = firstAction;
            }

            return(ExplorerDecision.Create(chosenActions,
                                           decision.ExplorerState,
                                           decision.ShouldRecord));
        }
Example #11
        public override ExplorerDecision <int[]> MapContext(PRG prg, ActionProbability[] actionProbs, int numActions)
        {
            // Create a discrete_distribution based on the returned actionProbs. This class handles the
            // case where the sum of the actionProbs is < or > 1, by normalizing against the sum.
            float total = 0f;

            foreach (var ap in actionProbs)
            {
                if (ap.Probability < 0)
                {
                    throw new ArgumentException("Probabilities must be non-negative.");
                }

                total += ap.Probability;
            }

            if (total == 0)
            {
                throw new ArgumentException("At least one probability must be positive.");
            }

            if (Math.Abs(total - 1f) > 1e-4)
            {
                throw new ArgumentException($"Probabilities must sum to one, but {Math.Abs(total - 1f)} was received.");
            }

            float draw = prg.UniformUnitInterval();

            float sum          = 0f;
            var   actionChosen = actionProbs.Last();

            foreach (var ap in actionProbs)
            {
                sum += ap.Probability;
                if (sum > draw)
                {
                    actionChosen = ap;
                    break;
                }
            }

            // top slot explorer
            var action          = actionChosen.Action;
            var probability     = actionChosen.Probability;
            var actionList      = new int[actionProbs.Length];
            var probabilityList = new float[actionProbs.Length];

            for (int i = 0; i < actionList.Length; i++)
            {
                actionList[i]      = actionProbs[i].Action;
                probabilityList[i] = actionProbs[i].Probability;

                if (action == actionList[i])
                {
                    // swap both
                    actionList[i] = actionList[0];
                    actionList[0] = action;

                    probabilityList[i] = probabilityList[0];
                    probabilityList[0] = probability;
                }
            }

            // action id is 1-based
            return(ExplorerDecision.Create(
                       actionList,
                       new GenericTopSlotExplorerState { Probabilities = probabilityList },
                       true));
        }
Example #12
        public override ExplorerDecision <int> MapContext(PRG random, float[] scores, int numActions)
        {
            int numScores = scores.Length;

            if (numActions != int.MaxValue && numScores != numActions)
            {
                throw new ArgumentException("The number of scores returned by the scorer must equal number of actions");
            }

            int   i        = 0;
            float maxScore = scores.Max();

            float actionProbability = 0f;
            int   actionIndex       = 0;

            if (this.explore)
            {
                // Create a normalized exponential distribution based on the returned scores
                for (i = 0; i < numScores; i++)
                {
                    scores[i] = (float)Math.Exp(this.lambda * (scores[i] - maxScore));
                }

                // Create a discrete_distribution based on the exponentiated scores. This class handles the
                // case where the sum of the scores is < or > 1, by normalizing against the sum.
                float total = scores.Sum();

                float draw = random.UniformUnitInterval();

                float sum = 0f;
                actionProbability = 0f;
                actionIndex       = numScores - 1;
                for (i = 0; i < numScores; i++)
                {
                    scores[i] = scores[i] / total;
                    sum      += scores[i];
                    if (sum >= draw)
                    {
                        actionIndex       = i;
                        actionProbability = scores[i];
                        break;
                    }
                }
            }
            else
            {
                // Pick the highest-scoring action; seeding with scores[0] also handles all-negative scores.
                maxScore    = scores[0];
                actionIndex = 0;
                for (i = 1; i < numScores; i++)
                {
                    if (maxScore < scores[i])
                    {
                        maxScore    = scores[i];
                        actionIndex = i;
                    }
                }
                actionProbability = 1f; // Set to 1 since we always pick the highest one.
            }

            actionIndex++;

            // action id is one-based
            return(ExplorerDecision.Create(actionIndex,
                                           new GenericExplorerState { Probability = actionProbability },
                                           true));
        }
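
The explore branch above builds a softmax (Boltzmann) distribution: subtracting the maximum score before exponentiating keeps Math.Exp from overflowing, and dividing by the total normalizes the weights. A minimal sketch of just that transformation, with illustrative names and no library types:

using System;
using System.Linq;

static class SoftmaxWeights
{
    // Converts raw scores into a probability distribution; lambda controls how
    // strongly higher scores are favored.
    public static float[] Probabilities(float[] scores, float lambda)
    {
        float maxScore = scores.Max();

        // exp(lambda * (s - max)) is at most 1 for lambda >= 0, avoiding overflow for large scores.
        float[] probabilities = scores
            .Select(s => (float)Math.Exp(lambda * (s - maxScore)))
            .ToArray();

        float total = probabilities.Sum();
        for (int i = 0; i < probabilities.Length; i++)
        {
            probabilities[i] /= total;
        }

        return probabilities;
    }
}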