/// <summary>
/// Go over all actions and return the one with the highest Q value.
/// </summary>
/// <param name="control">The game control to evaluate actions against.</param>
/// <param name="state">The state of the control to use.</param>
/// <param name="isLegal">If true, the returned action has to be legal in the control.</param>
/// <returns>The action with the highest Q value.</returns>
protected override Actione getMaxAction(GameControlBase control, State state, bool isLegal)
{
    int maxID = 0;
    double max = 0;
    int lastLayer = NeuralNet.Activations.Count - 1;

    if (IsMultidimensionalOutput)
    {
        // Feed the state to the neural network once; the output layer holds one Q value per action
        NeuralNet.Feed(CreateInputArray(state.Board));
        for (int id = 0; id < control.ActionNum; id++)
        {
            if (NeuralNet.Activations[lastLayer][id] > max)
            {
                // If the action has to be legal, skip actions the control considers illegal
                if (!isLegal || control.IsLegalAction(new Actione(id)))
                {
                    maxID = id;
                    max = NeuralNet.Activations[lastLayer][id];
                }
            }
        }
    }
    else
    {
        for (int id = 0; id < control.ActionNum; id++)
        {
            // Feed the state and the action to the neural network; the single output is that action's Q value
            NeuralNet.Feed(CreateInputArray(state.Board, id));
            if (NeuralNet.Activations[lastLayer][0] > max)
            {
                if (!isLegal || control.IsLegalAction(new Actione(id)))
                {
                    maxID = id;
                    max = NeuralNet.Activations[lastLayer][0];
                }
            }
        }
    }

    return new Actione(maxID); // Return the best action
}
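// Learn below selects moves through TakeEpsilonGreedyAction, which is not shown in this
// section. The following is a hypothetical sketch of what such a helper could look like,
// assuming GameControlBase exposes ActionNum and IsLegalAction as used above; the real
// method's signature and responsibilities (e.g. whether it also applies the move to the
// control) may differ.
protected Actione TakeEpsilonGreedyActionSketch(double epsilon, State state, Random rand)
{
    // With probability epsilon, explore: pick a random legal action
    if (rand.NextDouble() < epsilon)
    {
        int id;
        do
        {
            id = rand.Next(Control.ActionNum); // Random action index
        } while (!Control.IsLegalAction(new Actione(id))); // Resample until legal
        return new Actione(id);
    }
    // Otherwise, exploit: take the greedy (highest Q value) legal action
    return getMaxAction(Control, state, true);
}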
/// <summary>
/// Reinforcement learning via Q-learning: updates the network with gradient descent and keeps a replay memory.
/// </summary>
/// <param name="EpocheNumber">Number of epochs to train for.</param>
/// <param name="EpsilonLimit">Lower bound for epsilon.</param>
/// <param name="EpsilonDecrease">Decrease of epsilon every iteration.</param>
/// <param name="LearningRate">Step size for gradient descent.</param>
/// <param name="DiscountRate">Discount factor for future rewards.</param>
/// <param name="against">Optional; if not given, the opponent is random.</param>
public override void Learn(int EpocheNumber, double EpsilonLimit, double EpsilonDecrease, double LearningRate, double DiscountRate, Bot against = null)
{
    IsLearning = true;
    int current_epoche = 0;
    int last_epoche = 0;
    int SampleSize = 100; // Size of the gradient descent mini-batch
    int iterations = 0;
    bool AlreadyReported = false;
    List<Transition> miniBatch = null;
    List<Tuple<double[], double[]>> Q_Targets = new List<Tuple<double[], double[]>>();
    List<List<double[]>> Gradients = new List<List<double[]>>();

    // Initialize the Gradients matrix
    for (int layer = 1; layer < Dimensions.Count; layer++)
    {
        Gradients.Add(new List<double[]>());
        for (int neuron = 0; neuron < Dimensions[layer]; neuron++)
        {
            Gradients[Gradients.Count - 1].Add(CreateInitArray(Dimensions[layer - 1], 0));
        }
    }

    // Initialize variables for tracking the progress
    games = 0;
    wins = 0;
    losses = 0;
    draws = 0;
    int testGames = 0;

    // Observe the state and declare variables for learning
    State state = Control.GetState();
    State newState;
    double reward = 0;
    Actione action;
    double loss = 0;
    OldNeuralNet = (NetworkVectors)NeuralNet.Clone();
    double[] Target = new double[NeuralNet.WeightedSums[NeuralNet.Activations.Count - 1].Length];
    double[] Result = new double[NeuralNet.WeightedSums[NeuralNet.Activations.Count - 1].Length];
    double[] target;

    // Is the output the Q value of just one action, or of all actions?
    if (!IsMultidimensionalOutput)
    {
        target = new double[1];
    }
    else
    {
        target = new double[NeuralNet.WeightedSums[NeuralNet.Activations.Count - 1].Length];
    }
    double addLoss = 0;

    while (current_epoche < EpocheNumber && IsLearning)
    {
        if (BotTurn != Control.CurrTurn) // If it's not this learning bot's turn in the control
        {
            BotMove(against);
        }

        state = Control.GetState();                              // Observe the state
        action = TakeEpsilonGreedyAction(Epsilon, state, rand);  // Take an action
        if (!Control.IsTerminalState())
        {
            BotMove(against);
        }

        int temp = games;
        Track(); // Update the tracking variables according to the current state of the game
        testGames += games - temp;

        // Get the reward and observe the new state
        reward = Control.GetReward(BotTurn);
        newState = Control.GetState();

        // Store the transition in the replay memory
        if (ReplayMem.Count >= ReplayMemSize)
        {
            ReplayMem.RemoveAt(0);
        }
        ReplayMem.Add(new Transition(state, action, reward, newState));

        // Every 100 iterations, synchronize the old (target) network with the current one
        if (iterations % 100 == 0 && iterations != 0)
        {
            NeuralNet.Copy(OldNeuralNet);
        }
        iterations++;

        // Sample a random mini-batch of transitions from the replay memory
        miniBatch = RandomSample(ReplayMem, SampleSize);

        // Compute the Q-learning targets
        Q_Targets.Clear();
        Zero(Gradients);
        addLoss = 0;
        foreach (Transition transition in miniBatch)
        {
            int maxID;
            if (IsMultidimensionalOutput)
            {
                OldNeuralNet.Feed(CreateInputArray(transition.s1.Board)); // Compute the Q values of all the actions in the new state
            }
            else
            {
                maxID = getMaxAction(Control, transition.s1, false).ID;          // Get the best action of the new state
                OldNeuralNet.Feed(CreateInputArray(transition.s1.Board, maxID)); // Compute its Q value
            }

            // Bellman target: Q(s, a) = R + DiscountRate * max over a' of Q(s', a'),
            // or just R if s' is terminal
            if (!IsMultidimensionalOutput) // If the output is the value of just one action
            {
                double t = 0;
                if (Control.IsTerminalState(transition.s1))
                {
                    t = transition.Reward;
                }
                else
                {
                    t = transition.Reward + DiscountRate * OldNeuralNet.WeightedSums[OldNeuralNet.Activations.Count - 1][0];
                }
                target[0] = t;
                Q_Targets.Add(new Tuple<double[], double[]>(CreateInputArray(transition.s.Board, transition.a.ID), ApplyFunction(target, Activation_Functions.Sigmoid.Function)));

                // Loss for tracking
                OldNeuralNet.Feed(CreateInputArray(transition.s.Board, transition.a.ID));
                addLoss += 0.5 * Math.Pow(ApplyFunction(target, Activation_Functions.Sigmoid.Function)[0] - OldNeuralNet.Activations[NeuralNet.Activations.Count - 1][0], 2);
            }
            else // If the output is the value of all the actions
            {
                double t = 0;
                if (Control.IsTerminalState(transition.s1))
                {
                    t = transition.Reward;
                }
                else
                {
                    t = transition.Reward + DiscountRate * Max(OldNeuralNet.WeightedSums[OldNeuralNet.Activations.Count - 1]);
                }

                // Use the network's own outputs as targets for the actions that weren't taken,
                // so only the taken action's Q value is pushed toward the Bellman target
                OldNeuralNet.Feed(CreateInputArray(transition.s.Board));
                for (int i = 0; i < OldNeuralNet.WeightedSums[OldNeuralNet.Activations.Count - 1].Length; i++)
                {
                    target[i] = OldNeuralNet.WeightedSums[OldNeuralNet.Activations.Count - 1][i];
                }
                target[transition.a.ID] = t;
                Q_Targets.Add(new Tuple<double[], double[]>(CreateInputArray(transition.s.Board), ApplyFunction(target, Activation_Functions.Sigmoid.Function)));

                // Loss for tracking
                addLoss += 0.5 * Math.Pow(ApplyFunction(target, Activation_Functions.Sigmoid.Function)[transition.a.ID] - OldNeuralNet.Activations[NeuralNet.Activations.Count - 1][transition.a.ID], 2);
            }
        }
        addLoss /= miniBatch.Count;
        loss += addLoss;

        // Stochastic gradient descent on the mini-batch targets
        NeuralNet.SGD(Q_Targets, LearningRate, 1);

        // Adjust the learning variables
        if (Epsilon > EpsilonLimit)
        {
            Epsilon -= EpsilonDecrease;
        }

        // Every 20 games, report the win rate against a random bot
        if (testGames % 20 != 0)
        {
            AlreadyReported = false;
        }
        if (testGames % 20 == 0 && !AlreadyReported)
        {
            Console.WriteLine(Test(300) + " total games: " + testGames);
            AlreadyReported = true;
        }

        // Report the progress every 200 epochs
        if (current_epoche % 200 == 0 && current_epoche != last_epoche)
        {
            last_epoche = current_epoche;
            Console.WriteLine("Learning percentage: {0}%, win rate: {1}%, loss rate: {3}%, draw rate: {4}%, avg cost: {2}, games: {5}",
                current_epoche / (double)EpocheNumber * 100, (double)wins * 100 / games, loss / 200,
                (double)losses * 100 / games, (double)draws * 100 / games, testGames);
            wins = 0;
            draws = 0;
            losses = 0;
            loss = 0;
            games = 0;
        }

        current_epoche++;
        Control.Clean(); // Make the control ready for another move
    }
    IsLearning = false;
}
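// The replay memory above relies on a Transition record and a RandomSample helper that are
// referenced but not defined in this section. The following is a minimal sketch of what they
// could look like, assuming only the member names actually used above (s, a, Reward, s1) and
// the usual System / System.Collections.Generic usings; the real definitions elsewhere in
// this codebase may differ.
public class TransitionSketch
{
    public State s;        // State the action was taken in
    public Actione a;      // Action taken
    public double Reward;  // Immediate reward received
    public State s1;       // Resulting (new) state

    public TransitionSketch(State s, Actione a, double reward, State s1)
    {
        this.s = s;
        this.a = a;
        this.Reward = reward;
        this.s1 = s1;
    }
}

// Draw up to sampleSize transitions uniformly at random (with replacement) from the memory.
private static List<TransitionSketch> RandomSampleSketch(List<TransitionSketch> memory, int sampleSize, Random rand)
{
    var batch = new List<TransitionSketch>();
    int count = Math.Min(sampleSize, memory.Count);
    for (int i = 0; i < count; i++)
    {
        batch.Add(memory[rand.Next(memory.Count)]);
    }
    return batch;
}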