/// <summary>
/// Samples a single one-based action id from the discrete distribution induced by
/// the scorer's weights, normalizing against their sum on the fly.
/// </summary>
public override ExplorerDecision<int> MapContext(PRG random, float[] weights, int numActions)
{
    // A scorer must emit exactly one weight per action (unless the action count is unbounded).
    int numWeights = weights.Length;
    if (numActions != int.MaxValue && numWeights != numActions)
    {
        throw new ArgumentException("The number of weights returned by the scorer must equal number of actions");
    }

    // Accumulate the total mass so sampling works even when the weights do not sum to one.
    float total = 0f;
    for (int w = 0; w < numWeights; w++)
    {
        if (weights[w] < 0)
        {
            throw new ArgumentException("Scores must be non-negative.");
        }

        total += weights[w];
    }

    if (total == 0)
    {
        throw new ArgumentException("At least one score must be positive.");
    }

    float draw = random.UniformUnitInterval();

    // Inverse-CDF walk. The weights are normalized lazily IN PLACE; only the prefix
    // visited before sampling stops gets normalized (matching the original contract).
    float cumulative = 0f;
    float chosenProbability = 0f;
    int chosenIndex = numWeights - 1;

    int index = 0;
    while (index < numWeights)
    {
        weights[index] /= total;
        cumulative += weights[index];

        // >= (not >) so a draw of exactly 1.0 still selects an action; the cumulative
        // sum never exceeds 1.0, so a strict comparison could fall through unset.
        if (cumulative >= draw)
        {
            chosenIndex = index;
            chosenProbability = weights[index];
            break;
        }

        index++;
    }

    // Action ids are one-based.
    return ExplorerDecision.Create(
        chosenIndex + 1,
        new GenericExplorerState { Probability = chosenProbability },
        true);
}
/// <summary>
/// Picks a one-based action id uniformly at random from [1, numActions].
/// </summary>
public ExplorerDecision<int> Explore(PRG random, int numActions)
{
    int chosenAction = random.UniformInt(1, numActions);

    // NOTE(review): the recorded probability is the constant 1f (not 1/numActions);
    // preserved as-is — confirm downstream consumers expect this convention.
    var state = new GenericExplorerState { Probability = 1f };

    return ExplorerDecision.Create(chosenAction, state, shouldRecord: true);
}
/// <summary>
/// Wraps a single-action explorer over a policy-supplied ranking: the inner explorer
/// chooses one action, which is then repositioned within the ranking in place.
/// </summary>
public override ExplorerDecision<int[]> MapContext(PRG prg, int[] ranking, int numActions)
{
    // The default policy must propose at least one action.
    if (ranking == null || ranking.Length < 1)
    {
        throw new ArgumentException("Actions chosen by default policy must not be empty.");
    }

    // Seed the inner explorer with the top-ranked action; the ranking length bounds its choice.
    var innerDecision = this.explorer.MapContext(prg, ranking[0], ranking.Length);

    // NOTE(review): PutActionToList presumably promotes the chosen action within the
    // ranking (mutating it in place) — confirm the helper's semantics.
    MultiActionHelper.PutActionToList(innerDecision.Value, ranking);

    return ExplorerDecision.Create(ranking, innerDecision.ExplorerState, innerDecision.ShouldRecord);
}
/// <summary>
/// Bag (bootstrap) exploration: picks one policy uniformly from the bag, returns its
/// top action, and records the fraction of bag members that voted for that action.
/// </summary>
/// <exception cref="ArgumentException">
/// Thrown when the bag is empty, or a policy proposes an out-of-range action.
/// </exception>
public ExplorerDecision<TAction> MapContext(PRG random, IReadOnlyCollection<TAction> policyActions, int numActions)
{
    // Guard: an empty bag would otherwise surface as a NullReferenceException /
    // InvalidOperationException below; fail fast with a descriptive error instead.
    if (policyActions == null || policyActions.Count == 0)
    {
        throw new ArgumentException("Actions chosen by default policy must not be empty.");
    }

    TAction chosenDecision = default(TAction);
    float actionProbability = 0f;

    if (this.explore)
    {
        // Select one bag member uniformly; its top action becomes the chosen action.
        int chosenBag = random.UniformInt(0, policyActions.Count - 1);

        // Histogram of votes per (one-based) action id.
        int[] actionsSelected = new int[numActions];

        int currentBag = 0;
        foreach (var policyAction in policyActions)
        {
            var actionFromBag = this.GetTopAction(policyAction);
            if (actionFromBag == 0 || actionFromBag > numActions)
            {
                throw new ArgumentException("Action chosen by default policy is not within valid range.");
            }

            // This assumes action ids are contiguous in [1, numActions].
            actionsSelected[actionFromBag - 1]++; // action id is one-based

            if (currentBag == chosenBag)
            {
                chosenDecision = policyAction;
            }

            currentBag++;
        }

        // Probability of the chosen action = fraction of bag members that voted for it.
        actionProbability = (float)actionsSelected[this.GetTopAction(chosenDecision) - 1] / policyActions.Count; // action id is one-based
    }
    else
    {
        // Exploitation: defer to the first policy in the bag.
        chosenDecision = policyActions.First();
        actionProbability = 1f;
    }

    GenericExplorerState explorerState = new GenericExplorerState { Probability = actionProbability };
    return ExplorerDecision.Create(chosenDecision, explorerState, true);
}
/// <summary>
/// Samples a full ranking without replacement from the scorer's weights, returning the
/// probability of the action placed in the top slot.
/// </summary>
public override ExplorerDecision<int[]> MapContext(PRG random, float[] weights, int numActions)
{
    // NOTE(review): the inner explorer's decision is discarded, but the call consumes
    // randomness from the PRG and may mutate the weights in place, so it must be kept
    // to preserve the sampling stream — confirm the discard is intentional.
    var innerDecision = this.explorer.MapContext(random, weights, numActions);

    float topActionProbability = 0f;
    int[] chosenActions = MultiActionHelper.SampleWithoutReplacement(
        weights, weights.Length, random, ref topActionProbability);

    // Action ids are one-based.
    return ExplorerDecision.Create(
        chosenActions,
        new GenericExplorerState { Probability = topActionProbability },
        true);
}
/// <summary>
/// Epsilon-greedy exploration: with probability (1 - epsilon) keep the policy's action;
/// otherwise draw a uniform random action. Records the exact probability of the action
/// actually returned.
/// </summary>
/// <exception cref="ArgumentException">Thrown when the policy action is out of range.</exception>
public override ExplorerDecision<int> MapContext(PRG random, int policyAction, int numActionsVariable)
{
    // Reject out-of-range ids, including negatives (the original check only caught 0).
    if (policyAction < 1 || policyAction > numActionsVariable)
    {
        throw new ArgumentException("Action chosen by default policy is not within valid range.");
    }

    float actionProbability;
    bool isExplore;

    float epsilon = this.explore ? this.defaultEpsilon : 0f;
    float baseProbability = epsilon / numActionsVariable; // uniform share of the exploration mass

    if (random.UniformUnitInterval() < 1f - epsilon)
    {
        // Exploit: keep the policy's action. Its total probability is the exploitation
        // mass plus its own share of the uniform exploration mass.
        actionProbability = 1f - epsilon + baseProbability;
        isExplore = false;
    }
    else
    {
        // Explore: draw a uniform one-based action id.
        int actionId = random.UniformInt(1, numActionsVariable);

        if (actionId == policyAction)
        {
            // The uniform draw coincides with the policy action, which could also have
            // been reached via exploitation, so it carries the combined mass.
            actionProbability = 1f - epsilon + baseProbability;
        }
        else
        {
            // Otherwise it's just the uniform probability.
            actionProbability = baseProbability;
        }

        policyAction = actionId;
        isExplore = true;
    }

    EpsilonGreedyState explorerState = new EpsilonGreedyState
    {
        Epsilon = epsilon,
        IsExplore = isExplore,
        Probability = actionProbability
    };

    return ExplorerDecision.Create(policyAction, explorerState, true);
}
/// <summary>
/// Produces a randomized ranking of the actions 1..numActionsVariable using a partial
/// Fisher-Yates shuffle capped at maxPermutations positions.
/// </summary>
public ExplorerDecision<int[]> Explore(PRG random, int numActionsVariable)
{
    // Start from the identity ranking 1..numActionsVariable.
    int[] ranking = Enumerable.Range(1, numActionsVariable).ToArray();

    // Only the first min(length - 1, maxPermutations) slots are randomized; each slot
    // receives a uniform pick from the remaining suffix.
    for (int position = 0; position < ranking.Length - 1 && position < maxPermutations; position++)
    {
        int swapIndex = random.UniformInt(position, ranking.Length - 1);
        int swapped = ranking[swapIndex];
        ranking[swapIndex] = ranking[position];
        ranking[position] = swapped;
    }

    return ExplorerDecision.Create(
        ranking,
        new GenericExplorerState { Probability = 1f },
        true);
}
/// <summary>
/// Tau-first exploration: the first tau decisions are uniform random; afterwards the
/// policy's action is returned deterministically. The shared tau counter is decremented
/// under a lock.
/// </summary>
/// <exception cref="ArgumentException">Thrown when the policy action is out of range.</exception>
public override ExplorerDecision<int> MapContext(PRG random, int policyAction, int numActionsVariable)
{
    // Reject out-of-range ids, including negatives (the original check only caught 0).
    if (policyAction < 1 || policyAction > numActionsVariable)
    {
        throw new ArgumentException("Action chosen by default policy is not within valid range.");
    }

    int chosenAction;
    float actionProbability;
    bool isExplore;

    // Snapshot of the budget recorded in the state. NOTE(review): this read happens
    // outside the lock (as in the original), so it is a pre-decrement value that may
    // be slightly stale under contention — confirm that is acceptable for logging.
    int tau = this.tau;

    lock (this.lockObject)
    {
        if (this.tau > 0 && this.explore)
        {
            // Still within the tau-first budget: explore uniformly and spend one unit.
            this.tau--;
            chosenAction = random.UniformInt(1, numActionsVariable);
            actionProbability = 1f / numActionsVariable;
            isExplore = true;
        }
        else
        {
            // Budget exhausted (or exploration disabled): exploit the policy action.
            chosenAction = policyAction;
            actionProbability = 1f;
            isExplore = false;
        }
    }

    TauFirstState explorerState = new TauFirstState
    {
        IsExplore = isExplore,
        Probability = actionProbability,
        Tau = tau
    };

    // Decisions are always recorded (the original's shouldRecordDecision was a constant).
    return ExplorerDecision.Create(chosenAction, explorerState, true);
}
/// <summary>
/// Epsilon-greedy slate exploration: with probability epsilon returns a uniformly random
/// permutation of 1..n, otherwise the policy's ranking unchanged.
/// </summary>
public ExplorerDecision<int[]> MapContext(PRG random, int[] policyAction, int numActions)
{
    MultiActionHelper.ValidateActionList(policyAction);

    // Effective epsilon: zero when exploration is disabled.
    float epsilon = this.explore ? this.defaultEpsilon : 0f;

    int[] chosenAction;
    bool isExplore;

    if (random.UniformUnitInterval() < epsilon)
    {
        // Explore: Fisher-Yates shuffle of the identity ranking 1..n.
        chosenAction = Enumerable.Range(1, policyAction.Length).ToArray();

        // Positions 0..n-2 each draw uniformly from the remaining suffix.
        for (int i = 0; i < policyAction.Length - 1; i++)
        {
            int swapIndex = random.UniformInt(i, policyAction.Length - 1);
            int temp = chosenAction[swapIndex];
            chosenAction[swapIndex] = chosenAction[i];
            chosenAction[i] = temp;
        }

        isExplore = true;
    }
    else
    {
        // Exploit: keep the policy's ranking.
        chosenAction = policyAction;
        isExplore = false;
    }

    EpsilonGreedySlateState explorerState = new EpsilonGreedySlateState
    {
        // CONSISTENCY FIX: record the effective epsilon (0 when exploration is off),
        // matching the single-action epsilon-greedy explorer, instead of always
        // logging the configured default.
        Epsilon = epsilon,
        IsExplore = isExplore,
        Ranking = policyAction
    };

    return ExplorerDecision.Create(chosenAction, explorerState, true);
}
/// <summary>
/// Ranker wrapper over a single-action score explorer: when exploring, samples a full
/// ranking without replacement; otherwise returns the identity ranking with the chosen
/// action swapped into the top slot.
/// </summary>
/// <exception cref="ArgumentException">Thrown when the score list is null or empty.</exception>
public override ExplorerDecision<int[]> MapContext(PRG random, float[] scores, int numActions)
{
    if (scores == null || scores.Length < 1)
    {
        throw new ArgumentException("Scores returned by default policy must not be empty.");
    }

    // The inner explorer picks a single (one-based) action from the scores.
    var decision = this.explorer.MapContext(random, scores, numActions);

    int numActionsVariable = scores.Length;
    int[] chosenActions;

    // Note: there might be a way using out generic parameters and explicit interface
    // implementation to avoid the cast.
    float actionProbability = ((GenericExplorerState)decision.ExplorerState).Probability;

    if (this.explore)
    {
        chosenActions = MultiActionHelper.SampleWithoutReplacement(scores, numActionsVariable, random, ref actionProbability);
    }
    else
    {
        // Build the identity ranking 1..n (avoid LINQ to optimize perf).
        // BUG FIX: the original filled indices 1..n (off-by-one), leaving slot 0 unset
        // and throwing IndexOutOfRangeException at i == n.
        chosenActions = new int[numActionsVariable];
        for (int i = 0; i < numActionsVariable; i++)
        {
            chosenActions[i] = i + 1;
        }

        // Swap the chosen action into the first slot.
        // BUG FIX: decision.Value is one-based, so the array index is decision.Value - 1;
        // the original indexed with decision.Value directly, swapping the wrong element
        // and overflowing when the last action was chosen.
        int chosenIndex = decision.Value - 1;
        int firstAction = chosenActions[0];
        chosenActions[0] = chosenActions[chosenIndex];
        chosenActions[chosenIndex] = firstAction;
    }

    return ExplorerDecision.Create(chosenActions, decision.ExplorerState, decision.ShouldRecord);
}
/// <summary>
/// Samples one action from an explicit action/probability list and returns the full
/// action list reordered so the sampled action occupies the first (top) slot, along
/// with the correspondingly reordered probabilities.
/// </summary>
/// <exception cref="ArgumentException">
/// Thrown when any probability is negative, all are zero, or they do not sum to one
/// within tolerance.
/// </exception>
public override ExplorerDecision<int[]> MapContext(PRG prg, ActionProbability[] actionProbs, int numActions)
{
    // Create a discrete_distribution based on the returned actionProbs. This class handles the
    // case where the sum of the actionProbs is < or > 1, by normalizing against the sum.
    float total = 0f;
    foreach (var ap in actionProbs)
    {
        if (ap.Probability < 0)
        {
            throw new ArgumentException("Probabilities must be non-negative.");
        }
        total += ap.Probability;
    }
    if (total == 0)
    {
        throw new ArgumentException("At least one probability must be positive.");
    }
    // Despite the normalization comment above, the sum is required to be within 1e-4 of one.
    if (Math.Abs(total - 1f) > 1e-4)
    {
        throw new ArgumentException($"Probabilities must sum to one, but {Math.Abs(total - 1f)} was received.");
    }

    // Inverse-CDF sampling: walk the cumulative mass until it exceeds the draw.
    // Defaulting to the last entry covers the case where float rounding keeps the
    // cumulative sum just below the draw for the whole walk (e.g. draw == 1.0).
    float draw = prg.UniformUnitInterval();
    float sum = 0f;
    var actionChosen = actionProbs.Last();
    foreach (var ap in actionProbs)
    {
        sum += ap.Probability;
        if (sum > draw)
        {
            actionChosen = ap;
            break;
        }
    }

    // top slot explorer: copy actions/probabilities in input order; when the sampled
    // action is encountered, swap it (and its probability) into slot 0.
    // NOTE(review): assumes action ids are unique — a duplicate id would trigger a
    // second swap and corrupt the ordering; confirm upstream guarantees uniqueness.
    var action = actionChosen.Action;
    var probability = actionChosen.Probability;
    var actionList = new int[actionProbs.Length];
    var probabilityList = new float[actionProbs.Length];
    for (int i = 0; i < actionList.Length; i++)
    {
        actionList[i] = actionProbs[i].Action;
        probabilityList[i] = actionProbs[i].Probability;
        if (action == actionList[i])
        {
            // swap both the action id and its probability with slot 0
            actionList[i] = actionList[0];
            actionList[0] = action;
            probabilityList[i] = probabilityList[0];
            probabilityList[0] = probability;
        }
    }
    // action id is 1-based
    return (ExplorerDecision.Create(
        actionList,
        new GenericTopSlotExplorerState { Probabilities = probabilityList },
        true));
}
/// <summary>
/// Softmax exploration: when exploring, samples an action from the Boltzmann
/// distribution exp(lambda * (score - max)) normalized over all actions; otherwise
/// returns the argmax action deterministically.
/// </summary>
/// <exception cref="ArgumentException">
/// Thrown when the score count does not match the action count.
/// </exception>
public override ExplorerDecision<int> MapContext(PRG random, float[] scores, int numActions)
{
    int numScores = scores.Length;
    if (numActions != int.MaxValue && numScores != numActions)
    {
        throw new ArgumentException("The number of scores returned by the scorer must equal number of actions");
    }

    float maxScore = scores.Max();
    float actionProbability = 0f;
    int actionIndex = 0;

    if (this.explore)
    {
        // Exponentiate shifted scores (subtracting the max for numerical stability).
        // Note: scores are transformed and normalized IN PLACE, as in the original.
        for (int i = 0; i < numScores; i++)
        {
            scores[i] = (float)Math.Exp(this.lambda * (scores[i] - maxScore));
        }

        // Inverse-CDF sampling over the normalized exponentials.
        float total = scores.Sum();
        float draw = random.UniformUnitInterval();
        float sum = 0f;
        actionIndex = numScores - 1;

        for (int i = 0; i < numScores; i++)
        {
            scores[i] = scores[i] / total;
            sum += scores[i];

            // >= (not >) so a draw of exactly 1.0 still selects an action.
            if (sum >= draw)
            {
                actionIndex = i;
                actionProbability = scores[i];
                break;
            }
        }
    }
    else
    {
        // Exploit: argmax over the raw scores (first occurrence wins ties).
        // BUG FIX: the original seeded the running max with 0f, which silently
        // returned the first action whenever every score was negative.
        actionIndex = 0;
        for (int i = 1; i < numScores; i++)
        {
            if (scores[i] > scores[actionIndex])
            {
                actionIndex = i;
            }
        }

        actionProbability = 1f; // Set to 1 since we always pick the highest one.
    }

    actionIndex++; // action id is one-based
    return ExplorerDecision.Create(actionIndex, new GenericExplorerState { Probability = actionProbability }, true);
}