public override ExplorerDecision<int> MapContext(PRG random, float[] weights, int numActions)
{
    int numWeights = weights.Length;
    if (numActions != int.MaxValue && numWeights != numActions)
    {
        throw new ArgumentException("The number of weights returned by the scorer must equal the number of actions.");
    }

    // Create a discrete distribution based on the returned weights. This handles the
    // case where the sum of the weights is < or > 1 by normalizing against the sum.
    float total = 0f;
    for (int i = 0; i < numWeights; i++)
    {
        if (weights[i] < 0)
        {
            throw new ArgumentException("Scores must be non-negative.");
        }
        total += weights[i];
    }

    if (total == 0)
    {
        throw new ArgumentException("At least one score must be positive.");
    }

    float draw = random.UniformUnitInterval();

    float sum = 0f;
    float actionProbability = 0f;
    int actionIndex = numWeights - 1;
    for (int i = 0; i < numWeights; i++)
    {
        weights[i] = weights[i] / total;
        sum += weights[i];

        // This needs to be >=, not >, in case the random draw is exactly 1.0: sum would
        // never exceed 1.0 and the loop would exit without assigning the action probability.
        if (sum >= draw)
        {
            actionIndex = i;
            actionProbability = weights[i];
            break;
        }
    }

    actionIndex++; // action id is one-based

    return ExplorerDecision.Create(
        actionIndex,
        new GenericExplorerState { Probability = actionProbability },
        true);
}
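// A minimal standalone sketch of the same inverse-CDF sampling technique used above,
// with System.Random standing in for the library's PRG. The class and method names
// below (WeightedSamplingSketch, SampleFromWeights) are illustrative, not library API.
using System;

internal static class WeightedSamplingSketch
{
    // Returns a zero-based index sampled in proportion to the (unnormalized) weights,
    // along with the normalized probability of that index.
    internal static (int Index, float Probability) SampleFromWeights(float[] weights, Random random)
    {
        float total = 0f;
        foreach (float w in weights)
        {
            if (w < 0) throw new ArgumentException("Weights must be non-negative.");
            total += w;
        }
        if (total == 0) throw new ArgumentException("At least one weight must be positive.");

        float draw = (float)random.NextDouble();
        float sum = 0f;
        for (int i = 0; i < weights.Length; i++)
        {
            float p = weights[i] / total;
            sum += p;
            if (sum >= draw) // >= so a draw of exactly 1.0 still lands in a bucket
                return (i, p);
        }

        // Floating-point round-off can leave sum slightly below 1; fall back to the last index.
        return (weights.Length - 1, weights[weights.Length - 1] / total);
    }
}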
internal static int[] SampleWithoutReplacement(float[] probabilities, int size, PRG randomGenerator, ref float topActionProbability)
{
    for (int i = 0; i < size; i++)
    {
        if (probabilities[i] == 1f)
        {
            throw new ArgumentException("The resulting probability distribution is deterministic and thus cannot generate a list of unique actions.");
        }
    }

    int[] actions = Enumerable.Repeat(0, size).ToArray();
    bool[] exists = new bool[actions.Length + 1]; // plus 1 since action index is 1-based

    // sample without replacement
    int runningIndex = 0;
    int runningAction = 0;
    float draw, sum;
    while (runningIndex < size)
    {
        draw = randomGenerator.UniformUnitInterval();
        sum = 0;

        for (int i = 0; i < size; i++)
        {
            sum += probabilities[i];
            if (sum > draw)
            {
                runningAction = i + 1;

                // Already sampled: keep scanning. The cumulative sum stays above the
                // draw, so the next action index is tried; if none is left, the outer
                // loop redraws.
                if (exists[runningAction])
                {
                    continue;
                }

                // store newly sampled action
                if (runningIndex == 0)
                {
                    topActionProbability = probabilities[i];
                }
                actions[runningIndex++] = runningAction;
                exists[runningAction] = true;

                break;
            }
        }
    }

    return actions;
}
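// A hedged usage sketch for SampleWithoutReplacement, assuming a PRG instance named
// randomGenerator is in scope; the distribution values are made up for illustration.
// Note that the helper redraws from the full distribution until it hits an action it
// has not returned yet, so it can loop many times when one probability is close to 1.
float[] probabilities = { 0.5f, 0.3f, 0.2f };
float topActionProbability = 0f;
int[] ranking = SampleWithoutReplacement(probabilities, probabilities.Length, randomGenerator, ref topActionProbability);
// ranking now holds all 1-based action ids in sampled order, e.g. { 1, 3, 2 }, and
// topActionProbability holds the probability of the action sampled into the first slot.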
public override ExplorerDecision<int> MapContext(PRG random, int policyAction, int numActionsVariable)
{
    if (policyAction < 1 || policyAction > numActionsVariable)
    {
        throw new ArgumentException("Action chosen by default policy is not within valid range.");
    }

    float actionProbability;
    bool isExplore;

    float epsilon = explore ? this.defaultEpsilon : 0f;
    float baseProbability = epsilon / numActionsVariable; // uniform probability

    if (random.UniformUnitInterval() < 1f - epsilon)
    {
        // Exploit: keep the action chosen by the default policy.
        actionProbability = 1f - epsilon + baseProbability;
        isExplore = false;
    }
    else
    {
        // Explore: get a uniform random 1-based action ID.
        int actionId = random.UniformInt(1, numActionsVariable);

        if (actionId == policyAction)
        {
            // If it matches the one chosen by the default policy,
            // then increase the probability.
            actionProbability = 1f - epsilon + baseProbability;
        }
        else
        {
            // Otherwise it's just the uniform probability.
            actionProbability = baseProbability;
        }

        policyAction = actionId;
        isExplore = true;
    }

    EpsilonGreedyState explorerState = new EpsilonGreedyState
    {
        Epsilon = epsilon,
        IsExplore = isExplore,
        Probability = actionProbability
    };

    return ExplorerDecision.Create(policyAction, explorerState, true);
}
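// A worked example of the epsilon-greedy probabilities above, as a self-contained
// sketch with illustrative numbers: with epsilon = 0.1 and 4 actions, the policy
// action is chosen with probability 1 - 0.1 + 0.1/4 = 0.925 (either by exploitation
// or by the exploration draw happening to match it), and each other action is chosen
// with probability 0.1/4 = 0.025; the four probabilities sum to 1.
using System;

internal static class EpsilonGreedyProbabilitySketch
{
    internal static void Main()
    {
        float epsilon = 0.1f;
        int numActions = 4;

        float baseProbability = epsilon / numActions;                   // 0.025
        float policyActionProbability = 1f - epsilon + baseProbability; // 0.925

        // 0.925 + 3 * 0.025 == 1.0
        Console.WriteLine($"policy action: {policyActionProbability}, each other action: {baseProbability}");
    }
}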
public ExplorerDecision<int[]> MapContext(PRG random, int[] policyAction, int numActions)
{
    MultiActionHelper.ValidateActionList(policyAction);

    float epsilon = this.explore ? this.defaultEpsilon : 0f;

    int[] chosenAction;
    bool isExplore;

    if (random.UniformUnitInterval() < epsilon)
    {
        // Explore: produce a uniformly random ranking using a Fisher-Yates shuffle.
        // 1 ... n
        chosenAction = Enumerable.Range(1, policyAction.Length).ToArray();

        // 0 ... n - 2
        for (int i = 0; i < policyAction.Length - 1; i++)
        {
            int swapIndex = random.UniformInt(i, policyAction.Length - 1);

            int temp = chosenAction[swapIndex];
            chosenAction[swapIndex] = chosenAction[i];
            chosenAction[i] = temp;
        }

        isExplore = true;
    }
    else
    {
        chosenAction = policyAction;
        isExplore = false;
    }

    EpsilonGreedySlateState explorerState = new EpsilonGreedySlateState
    {
        Epsilon = epsilon, // record the epsilon actually used (0 when exploration is off)
        IsExplore = isExplore,
        Ranking = policyAction // the ranking proposed by the default policy
    };

    return ExplorerDecision.Create(chosenAction, explorerState, true);
}
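// A minimal standalone sketch of the uniform slate shuffle used above (a Fisher-Yates
// shuffle over 1-based action ids), with System.Random standing in for PRG; the class
// and method names are illustrative only.
using System;
using System.Linq;

internal static class SlateShuffleSketch
{
    internal static int[] UniformRanking(int numActions, Random random)
    {
        int[] ranking = Enumerable.Range(1, numActions).ToArray();

        // After step i, ranking[i] is a uniform draw from the actions not yet placed,
        // so every permutation of the slate ends up equally likely.
        for (int i = 0; i < numActions - 1; i++)
        {
            int swapIndex = random.Next(i, numActions); // upper bound exclusive
            (ranking[i], ranking[swapIndex]) = (ranking[swapIndex], ranking[i]);
        }

        return ranking;
    }
}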
public override ExplorerDecision<int[]> MapContext(PRG prg, ActionProbability[] actionProbs, int numActions)
{
    // Validate the returned actionProbs: probabilities must be non-negative
    // and sum to one (within a small tolerance).
    float total = 0f;
    foreach (var ap in actionProbs)
    {
        if (ap.Probability < 0)
        {
            throw new ArgumentException("Probabilities must be non-negative.");
        }
        total += ap.Probability;
    }

    if (total == 0)
    {
        throw new ArgumentException("At least one probability must be positive.");
    }

    if (Math.Abs(total - 1f) > 1e-4)
    {
        throw new ArgumentException($"Probabilities must sum to one, but their sum is {total}.");
    }

    float draw = prg.UniformUnitInterval();

    float sum = 0f;
    var actionChosen = actionProbs.Last();
    foreach (var ap in actionProbs)
    {
        sum += ap.Probability;
        if (sum > draw)
        {
            actionChosen = ap;
            break;
        }
    }

    // Top-slot exploration: move the sampled action into the first slot and keep the
    // remaining actions in the policy's original order.
    var action = actionChosen.Action;
    var probability = actionChosen.Probability;

    var actionList = new int[actionProbs.Length];
    var probabilityList = new float[actionProbs.Length];

    for (int i = 0; i < actionList.Length; i++)
    {
        actionList[i] = actionProbs[i].Action;
        probabilityList[i] = actionProbs[i].Probability;

        if (action == actionList[i])
        {
            // Swap both the action and its probability into slot 0.
            actionList[i] = actionList[0];
            actionList[0] = action;

            probabilityList[i] = probabilityList[0];
            probabilityList[0] = probability;
        }
    }

    // action id is 1-based
    return ExplorerDecision.Create(
        actionList,
        new GenericTopSlotExplorerState { Probabilities = probabilityList },
        true);
}
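// A minimal standalone sketch of the top-slot swap above: after sampling an index from
// the distribution, the sampled entry trades places with slot 0, leaving all other
// slots in the policy's original order. Names are illustrative; for example, with
// actions { 10, 20, 30 }, probabilities { 0.2, 0.5, 0.3 }, and sampledIndex = 1, the
// arrays become { 20, 10, 30 } and { 0.5, 0.2, 0.3 }.
internal static class TopSlotSwapSketch
{
    internal static void MoveToTopSlot(int[] actions, float[] probabilities, int sampledIndex)
    {
        (actions[0], actions[sampledIndex]) = (actions[sampledIndex], actions[0]);
        (probabilities[0], probabilities[sampledIndex]) = (probabilities[sampledIndex], probabilities[0]);
    }
}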
public override ExplorerDecision<int> MapContext(PRG random, float[] scores, int numActions)
{
    int numScores = scores.Length;
    if (numActions != int.MaxValue && numScores != numActions)
    {
        throw new ArgumentException("The number of scores returned by the scorer must equal the number of actions.");
    }

    float maxScore = scores.Max();

    float actionProbability = 0f;
    int actionIndex = 0;
    if (this.explore)
    {
        // Create a normalized exponential distribution based on the returned scores.
        // Subtracting the max score keeps Math.Exp from overflowing.
        for (int i = 0; i < numScores; i++)
        {
            scores[i] = (float)Math.Exp(this.lambda * (scores[i] - maxScore));
        }

        // Create a discrete distribution based on the returned weights. This handles the
        // case where the sum of the weights is < or > 1 by normalizing against the sum.
        float total = scores.Sum();

        float draw = random.UniformUnitInterval();

        float sum = 0f;
        actionProbability = 0f;
        actionIndex = numScores - 1;

        for (int i = 0; i < numScores; i++)
        {
            scores[i] = scores[i] / total;
            sum += scores[i];
            if (sum >= draw)
            {
                actionIndex = i;
                actionProbability = scores[i];
                break;
            }
        }
    }
    else
    {
        // Greedy: pick the highest-scoring action. Initialize from the first score
        // so that all-negative scores are handled correctly.
        maxScore = scores[0];
        actionIndex = 0;
        for (int i = 1; i < numScores; i++)
        {
            if (maxScore < scores[i])
            {
                maxScore = scores[i];
                actionIndex = i;
            }
        }
        actionProbability = 1f; // Set to 1 since we always pick the highest one.
    }

    actionIndex++; // action id is one-based

    return ExplorerDecision.Create(actionIndex, new GenericExplorerState { Probability = actionProbability }, true);
}
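// A minimal standalone sketch of the softmax weighting used above. Subtracting the max
// score before exponentiating leaves the distribution unchanged (the constant factor
// cancels in the normalization) but prevents Math.Exp from overflowing for large
// lambda * score values. System.Linq's Max/Sum stand in for the explicit loops above;
// the class and method names are illustrative only.
using System;
using System.Linq;

internal static class SoftmaxSketch
{
    internal static float[] Probabilities(float[] scores, float lambda)
    {
        float maxScore = scores.Max();
        float[] weights = scores.Select(s => (float)Math.Exp(lambda * (s - maxScore))).ToArray();
        float total = weights.Sum();
        return weights.Select(w => w / total).ToArray();
    }
}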