// Q-Learning thread: trains the agent for the configured number of iterations,
// annealing exploration and learning rates linearly down to zero.
private void QLearningThread()
{
    // exploration policy (tabu wrapper around epsilon-greedy)
    TabuSearchExploration tabuPolicy = (TabuSearchExploration)qLearning.ExplorationPolicy;
    EpsilonGreedyExploration explorationPolicy = (EpsilonGreedyExploration)tabuPolicy.BasePolicy;

    for (int iteration = 0; (!needToStop) && (iteration < learningIterations); )
    {
        // linear decay of both rates over the run
        double progress = (double)iteration / learningIterations;
        explorationPolicy.Epsilon = explorationRate - progress * explorationRate;
        qLearning.LearningRate = learningRate - progress * learningRate;
        tabuPolicy.ResetTabuList();

        // place the agent back at its starting cell for a fresh episode
        int posX = agentStartX;
        int posY = agentStartY;

        // steps performed by the agent to reach the goal this episode
        int steps = 0;
        while ((!needToStop) && ((posX != agentStopX) || (posY != agentStopY)))
        {
            steps++;
            // observe state, pick an action, act, observe successor state
            int currentState = GetStateNumber(posX, posY);
            int action = qLearning.GetAction(currentState);
            double reward = UpdateAgentPosition(ref posX, ref posY, action);
            int nextState = GetStateNumber(posX, posY);
            // one-step Q-learning backup
            qLearning.UpdateState(currentState, action, reward, nextState);
            // forbid the immediate reverse move for one step
            tabuPolicy.SetTabuAction((action + 2) % 4, 1);
        }

        System.Diagnostics.Debug.WriteLine(steps);
        iteration++;
        // show current iteration in the UI
        SetText(iterationBox, iteration.ToString());
    }

    // enable settings controls
    EnableControls(true);
}
// Q-Learning thread (Unity variant): trains the agent, logging per-iteration
// step counts via Debug.Log and signalling completion through _enableControls.
private void QLearningThread()
{
    _currentIteration = 0;
    // exploration policy (tabu wrapper around epsilon-greedy)
    TabuSearchExploration tabuPolicy = (TabuSearchExploration)_qLearning.ExplorationPolicy;
    EpsilonGreedyExploration explorationPolicy = (EpsilonGreedyExploration)tabuPolicy.BasePolicy;
    // loop
    while ((!_needToStop) && (_currentIteration < learningIterations))
    {
        // set exploration rate for this iteration (linear decay to zero)
        explorationPolicy.Epsilon = explorationRate - ((double)_currentIteration / learningIterations) * explorationRate;
        // set learning rate for this iteration (linear decay to zero)
        _qLearning.LearningRate = learningRate - ((double)_currentIteration / learningIterations) * learningRate;
        // clear tabu list
        tabuPolicy.ResetTabuList();
        // reset agent's coordinates to the starting position
        _agentCurrX = _agentStartX;
        _agentCurrY = _agentStartY;
        // steps performed by agent to get to the goal
        int steps = 0;
        while ((!_needToStop) && ((_agentCurrX != _agentStopX) || (_agentCurrY != _agentStopY)))
        {
            steps++;
            // get agent's current state
            int currentState = GetStateNumber(_agentCurrX, _agentCurrY);
            // get the action for this state
            int action = _qLearning.GetAction(currentState);
            // update agent's current position and get his reward
            double reward = UpdateAgentPosition(action);
            // get agent's next state
            int nextState = GetStateNumber(_agentCurrX, _agentCurrY);
            // do learning of the agent - update his Q-function
            _qLearning.UpdateState(currentState, action, reward, nextState);
            // set tabu action: forbid the immediate reverse move for one step
            tabuPolicy.SetTabuAction((action + 2) % 4, 1);
        }
        _currentIteration++;
        Debug.Log(string.Format("{0} steps needed for iteration {1}.", steps, _currentIteration));
    }
    // signal FixedUpdate to re-enable the UI controls
    _enableControls = true;
    Debug.Log("QLearning training finished. Try to execute the solution.");
}
// Sarsa thread: on-policy training. The exact ordering of the
// state/action/reward updates below is the SARSA algorithm itself,
// so the statement sequence must not be rearranged.
private void SarsaThread()
{
    int iteration = 0;
    // exploration policy (tabu wrapper around epsilon-greedy)
    TabuSearchExploration tabuPolicy = (TabuSearchExploration)sarsa.ExplorationPolicy;
    EpsilonGreedyExploration explorationPolicy = (EpsilonGreedyExploration)tabuPolicy.BasePolicy;
    while ((!needToStop) && (iteration < learningIterations))
    {
        // linear decay of exploration and learning rates to zero
        explorationPolicy.Epsilon = explorationRate - ((double)iteration / learningIterations) * explorationRate;
        sarsa.LearningRate = learningRate - ((double)iteration / learningIterations) * learningRate;
        tabuPolicy.ResetTabuList();
        // reset agent's coordinates to the starting position
        var agentCurrentX = agentStartX;
        var agentCurrentY = agentStartY;
        // starts at 1 because the first move is taken before the loop
        int steps = 1;
        // previous state and action, plus the reward for the first move
        int previousState = GetStateNumber(agentCurrentX, agentCurrentY);
        int previousAction = sarsa.GetAction(previousState);
        double reward = UpdateAgentPosition(ref agentCurrentX, ref agentCurrentY, previousAction);
        while ((!needToStop) && ((agentCurrentX != agentStopX) || (agentCurrentY != agentStopY)))
        {
            steps++;
            // forbid the immediate reverse move for one step
            tabuPolicy.SetTabuAction((previousAction + 2) % 4, 1);
            // observe next state and choose next action on-policy
            int nextState = GetStateNumber(agentCurrentX, agentCurrentY);
            int nextAction = sarsa.GetAction(nextState);
            // SARSA backup: Q(s,a) updated toward r + gamma * Q(s',a')
            sarsa.UpdateState(previousState, previousAction, reward, nextState, nextAction);
            reward = UpdateAgentPosition(ref agentCurrentX, ref agentCurrentY, nextAction);
            previousState = nextState;
            previousAction = nextAction;
        }
        if (!needToStop)
        {
            // terminal update (no successor action) once the goal is reached
            sarsa.UpdateState(previousState, previousAction, reward);
        }
        System.Diagnostics.Debug.WriteLine(steps);
        iteration++;
        SetText(iterationBox, iteration.ToString());
    }
    // enable settings controls
    EnableControls(true);
}
// Q-Learning thread: runs training episodes until the iteration budget is
// exhausted or a stop is requested, decaying epsilon and the learning rate.
private void QLearningThread()
{
    TabuSearchExploration tabuPolicy = (TabuSearchExploration)qLearning.ExplorationPolicy;
    EpsilonGreedyExploration explorationPolicy = (EpsilonGreedyExploration)tabuPolicy.BasePolicy;

    int epoch = 0;
    while (!needToStop && epoch < learningIterations)
    {
        // linear decay of both rates toward zero over the run
        double fraction = (double)epoch / learningIterations;
        explorationPolicy.Epsilon = explorationRate - fraction * explorationRate;
        qLearning.LearningRate = learningRate - fraction * learningRate;
        tabuPolicy.ResetTabuList();

        // every episode restarts from the configured starting cell
        var x = agentStartX;
        var y = agentStartY;
        int stepCount = 0;

        while (!needToStop && (x != agentStopX || y != agentStopY))
        {
            stepCount++;
            int state = GetStateNumber(x, y);
            int chosenAction = qLearning.GetAction(state);
            double reward = UpdateAgentPosition(ref x, ref y, chosenAction);
            // one-step Q-learning backup, then ban the reverse move for a step
            qLearning.UpdateState(state, chosenAction, reward, GetStateNumber(x, y));
            tabuPolicy.SetTabuAction((chosenAction + 2) % 4, 1);
        }

        System.Diagnostics.Debug.WriteLine(stepCount);
        epoch++;
        SetText(iterationBox, epoch.ToString());
    }

    EnableControls(true);
}
/// <summary>
/// Trains a SARSA agent to pick attack types in Pokémon battles.
/// Fixes over the original: removed the unused <c>moveTypes</c> local in the
/// state mapping, and replaced per-call <c>new Random()</c> (time-seeded, so
/// rapid calls can repeat values) with one shared instance.
/// </summary>
private static void Main(string[] args)
{
    /**Region for setting up SARSA function (and possibly parameters)**/
    #region SARSA Setup
    //Set up SARSA object
    var explorationPolicy = new EpsilonGreedyExploration(ExplorationRate);
    // state = both Pokémon's (up to two) types, each type in [0, 15)
    var numberOfStates = 15 * 15 * 15 * 15;
    var numberOfActions = Enum.GetValues(typeof(Type)).Length;
    var sarsa = new SARSA(numberOfStates, numberOfActions, explorationPolicy);

    //Prepare the state mapping: encodes the four type slots as a base-15 number;
    //a single-typed Pokémon contributes its one type twice.
    Func<Pokémon, Pokémon, long> getState = (pokémon1, pokémon2) =>
    {
        return 15 * 15 * 15 * (long)pokémon1.Types[0] +
               15 * 15 * (long)(pokémon1.Types.Count > 1 ? pokémon1.Types[1] : pokémon1.Types[0]) +
               15 * (long)pokémon2.Types[0] +
               1 * (long)(pokémon2.Types.Count > 1 ? pokémon2.Types[1] : pokémon2.Types[0]);
    };
    #endregion SARSA Setup

    // truncate the legacy output file
    using (var sw = new StreamWriter("PineappleExpress.txt"))
    {
        sw.Write("");
    }

    // one shared RNG for the opponent's random move choice
    var random = new Random();

    /**Region for setting up the battle itself**/
    #region Battle Execution
    //For the specified number of battles, perform battles and update the policy
    for (var battleNumber = 0; battleNumber < NumberOfBattles; battleNumber++)
    {
        // set exploration rate for this iteration (linear decay to zero)
        explorationPolicy.ExplorationRate = ExplorationRate - (double)battleNumber / NumberOfBattles * ExplorationRate;
        // set learning rate for this iteration (linear decay to zero)
        sarsa.LearningRate = LearningRate - (double)battleNumber / NumberOfBattles * LearningRate;

        //Prepare the Pokémon
        Pokémon pokemon1 = RentalPokémon.RentalPorygon; //A pre-made Porygon
        Pokémon pokemon2 = RentalPokémon.RentalVenusaur; //A pre-made opponent

        long previousState = -1;
        var previousAction = -1;
        long currentState = -1;
        var nextAction = -1;
        var reward = 0.0;
        var firstTurn = true;
        double percentFinished = 0;

        //Battle loop
        while (!(pokemon1.IsFainted || pokemon2.IsFainted))
        {
            //Shift states
            currentState = getState(pokemon1, pokemon2);
            var validTypes = pokemon1.Moves.Select(m => (int)m.AttackType).Distinct().ToList();
            nextAction = sarsa.GetAction(currentState, validTypes);

            //update SARSA (skipped on the first turn: no completed transition yet)
            if (!firstTurn)
            {
                sarsa.UpdateState(previousState, previousAction, reward, currentState, nextAction);
            }
            else
            {
                firstTurn = false;
            }

            //Determine who moves first
            var firstMover = pokemon1.Stats[Stat.Speed] > pokemon2.Stats[Stat.Speed] ? pokemon1 : pokemon2;

            //Perform actions
            if (pokemon1 == firstMover)
            {
                reward = pokemon1.UseMoveOfType((Type)nextAction, pokemon2);
                Console.WriteLine("{0} (Pokémon 1) used a move of type {1}", pokemon1.Species.Name, Enum.GetName(typeof(Type), (Type)nextAction));
                Console.WriteLine("Did {0} damage. {1} (Pokémon 2) now has {2} health remaining)", reward, pokemon2.Species.Name, pokemon2.RemainingHealth);
                Console.WriteLine(((Type)nextAction).MultiplierOn(pokemon2.Types.ToArray()));
                if (!pokemon2.IsFainted)
                {
                    pokemon2.Use(random.Next(4), pokemon1);
                }
                else
                {
                    reward += 20; // knockout bonus
                }
            }
            else
            {
                pokemon2.Use(random.Next(4), pokemon1);
                if (!pokemon1.IsFainted)
                {
                    reward = pokemon1.UseMoveOfType((Type)nextAction, pokemon2);
                    Console.WriteLine("{0} (Pokémon 1) used a move of type {1}", pokemon1.Species.Name, Enum.GetName(typeof(Type), (Type)nextAction));
                    Console.WriteLine("Did {0} damage. {1} (Pokémon 2) now has {2} health remaining)", reward, pokemon2.Species.Name, pokemon2.RemainingHealth);
                    Console.WriteLine(((Type)nextAction).MultiplierOn(pokemon2.Types.ToArray()));
                }
            }

            previousState = currentState;
            previousAction = nextAction;
            // fraction of the opponent's HP removed so far (score for graphing)
            percentFinished = ((double)pokemon2.Stats[Stat.HP] - pokemon2.RemainingHealth) / pokemon2.Stats[Stat.HP];
            Console.WriteLine($"{reward}");
        }

        // final update with the terminal transition
        sarsa.UpdateState(previousState, previousAction, reward, currentState, nextAction);

        if (pokemon1.IsFainted)
        {
            Console.WriteLine("{0} (Pokémon 1) Fainted", pokemon1.Species.Name);
        }
        else
        {
            Console.WriteLine("{0} (Pokémon 2) Fainted", pokemon2.Species.Name);
        }

        //Print score for graphing
        using (var sw = new StreamWriter($"PineappleExpress({ExplorationRate}_{LearningRate}).txt", true))
        {
            sw.WriteLine("{0}, {1}", battleNumber, percentFinished);
        }
    }
    #endregion Battle Execution
}
/// <summary>
/// Main constructor, currently only calls the AForge QLearning class constructor
/// (with random Q-value initialisation disabled via the final <c>false</c> argument).
/// </summary>
/// <param name="states">number of states</param>
/// <param name="actions">number of actions</param>
/// <param name="exploration">epsilon-greedy exploration policy used by the learner</param>
public Q(int states, int actions, EpsilonGreedyExploration exploration)
{
    qLearning = new QLearning(states, actions, exploration, false);
}
// Show solution thread: replays the learned policy greedily (epsilon = 0),
// animating the agent on the displayed map until a stop is requested.
// Fixes over the original: a specific InvalidOperationException instead of a
// bare Exception, removal of the unused reward local, and comment typos.
private void ShowSolutionThread()
{
    // set exploration rate to 0, so the agent only uses what it has learnt
    TabuSearchExploration tabuPolicy;
    if (qLearning != null)
    {
        tabuPolicy = (TabuSearchExploration)qLearning.ExplorationPolicy;
    }
    else if (sarsa != null)
    {
        tabuPolicy = (TabuSearchExploration)sarsa.ExplorationPolicy;
    }
    else
    {
        throw new InvalidOperationException("Neither Q-Learning nor Sarsa has been initialized.");
    }
    EpsilonGreedyExploration explorationPolicy = (EpsilonGreedyExploration)tabuPolicy.BasePolicy;
    explorationPolicy.Epsilon = 0;
    tabuPolicy.ResetTabuList();

    // current coordinates of the agent
    int agentCurrentX = agentStartX, agentCurrentY = agentStartY;

    // prepare the map to display (2 = agent/start, 3 = goal)
    Array.Copy(map, mapToDisplay, mapWidth * mapHeight);
    mapToDisplay[agentStartY, agentStartX] = 2;
    mapToDisplay[agentStopY, agentStopX] = 3;

    while (!needToStop)
    {
        // display the map
        cellWorld.Map = mapToDisplay;
        // sleep for a while so the animation is visible
        Thread.Sleep(200);

        // check if we have reached the end point
        if ((agentCurrentX == agentStopX) && (agentCurrentY == agentStopY))
        {
            // restore the map and restart the walk from the starting cell
            mapToDisplay[agentStartY, agentStartX] = 2;
            mapToDisplay[agentStopY, agentStopX] = 3;
            agentCurrentX = agentStartX;
            agentCurrentY = agentStartY;
            cellWorld.Map = mapToDisplay;
            Thread.Sleep(200);
        }

        // remove agent from current position
        mapToDisplay[agentCurrentY, agentCurrentX] = 0;

        // get agent's current state and the learned (greedy) action for it
        int currentState = GetStateNumber(agentCurrentX, agentCurrentY);
        int action = (qLearning != null) ? qLearning.GetAction(currentState) : sarsa.GetAction(currentState);

        // update agent's current position (the reward is irrelevant during playback)
        UpdateAgentPosition(ref agentCurrentX, ref agentCurrentY, action);

        // put agent to the new position
        mapToDisplay[agentCurrentY, agentCurrentX] = 2;
    }

    // enable settings controls
    EnableControls(true);
}
/// <summary>
/// End-to-end QLearning maze test: trains an agent on a 9x9 grid and asserts
/// it ends at the goal cell (7, 4). Fix over the original: in <c>doAction</c>
/// the map cell was indexed before the bounds checks, so a move off the grid
/// would have thrown IndexOutOfRangeException before the bounds tests ran;
/// the bounds checks now come first.
/// </summary>
public void learn_test()
{
    #region doc_main
    // Fix the random number generator
    Accord.Math.Random.Generator.Seed = 0;

    // In this example, we will be using the QLearning algorithm
    // to make a robot learn how to navigate a map. The map is
    // shown below, where a 1 denotes a wall and 0 denotes areas
    // where the robot can navigate:
    //
    int[,] map =
    {
        { 1, 1, 1, 1, 1, 1, 1, 1, 1 },
        { 1, 1, 0, 0, 0, 0, 0, 0, 1 },
        { 1, 1, 0, 0, 0, 1, 1, 0, 1 },
        { 1, 0, 0, 1, 0, 0, 0, 0, 1 },
        { 1, 0, 0, 1, 1, 1, 1, 0, 1 },
        { 1, 0, 0, 1, 1, 0, 0, 0, 1 },
        { 1, 1, 0, 1, 0, 0, 0, 0, 1 },
        { 1, 1, 0, 1, 0, 1, 1, 0, 1 },
        { 1, 1, 1, 1, 1, 1, 1, 1, 1 },
    };

    // Now, we define the initial and target points from which the
    // robot will be spawn and where it should go, respectively:
    int agentStartX = 1;
    int agentStartY = 4;
    int agentStopX = 7;
    int agentStopY = 4;

    // The robot is able to sense the environment though 8 sensors
    // that capture whether the robot is near a wall or not. Based
    // on the robot's current location, the sensors will return an
    // integer number representing which sensors have detected walls
    Func<int, int, int> getState = (int x, int y) =>
    {
        int c1 = (map[y - 1, x - 1] != 0) ? 1 : 0;
        int c2 = (map[y - 1, x + 0] != 0) ? 1 : 0;
        int c3 = (map[y - 1, x + 1] != 0) ? 1 : 0;
        int c4 = (map[y + 0, x + 1] != 0) ? 1 : 0;
        int c5 = (map[y + 1, x + 1] != 0) ? 1 : 0;
        int c6 = (map[y + 1, x + 0] != 0) ? 1 : 0;
        int c7 = (map[y + 1, x - 1] != 0) ? 1 : 0;
        int c8 = (map[y + 0, x - 1] != 0) ? 1 : 0;
        return c1 | (c2 << 1) | (c3 << 2) | (c4 << 3) | (c5 << 4) | (c6 << 5) | (c7 << 6) | (c8 << 7);
    };

    // The actions are the possible directions the robot can go:
    //
    // - case 0: go to north (up)
    // - case 1: go to east (right)
    // - case 2: go to south (down)
    // - case 3: go to west (left)
    //
    int learningIterations = 1000;
    double explorationRate = 0.5;
    double learningRate = 0.5;
    double moveReward = 0;
    double wallReward = -1;
    double goalReward = 1;

    // The function below specifies how the robot should perform an action given its
    // current position and an action number. This will cause the robot to update its
    // current X and Y locations given the direction (above) it was instructed to go:
    Func<int, int, int, Tuple<double, int, int>> doAction = (int currentX, int currentY, int action) =>
    {
        // default reward is equal to moving reward
        double reward = moveReward;

        // moving direction
        int dx = 0, dy = 0;
        switch (action)
        {
            case 0: // go to north (up)
                dy = -1;
                break;
            case 1: // go to east (right)
                dx = 1;
                break;
            case 2: // go to south (down)
                dy = 1;
                break;
            case 3: // go to west (left)
                dx = -1;
                break;
        }

        int newX = currentX + dx;
        int newY = currentY + dy;

        // check new agent's coordinates: bounds FIRST, then the wall lookup,
        // so the map is never indexed outside its extents
        if ((newX < 0) || (newX >= map.Columns()) ||
            (newY < 0) || (newY >= map.Rows()) ||
            (map[newY, newX] != 0))
        {
            // we found a wall or got outside of the world
            reward = wallReward;
        }
        else
        {
            currentX = newX;
            currentY = newY;

            // check if we found the goal
            if ((currentX == agentStopX) && (currentY == agentStopY))
            {
                reward = goalReward;
            }
        }

        return Tuple.Create(reward, currentX, currentY);
    };

    // After defining all those functions, we create a new Sarsa algorithm:
    var explorationPolicy = new EpsilonGreedyExploration(explorationRate);
    var tabuPolicy = new TabuSearchExploration(4, explorationPolicy);
    var qLearning = new QLearning(256, 4, tabuPolicy);

    // curent coordinates of the agent
    int agentCurrentX = -1;
    int agentCurrentY = -1;

    bool needToStop = false;
    int iteration = 0;

    // NOTE(review): the outer loop performs one Sarsa-style "warm-up" step and
    // then hands over to the inner loop, which shares the iteration counter;
    // structure kept as-is to preserve the test's learning trajectory.
    while ((!needToStop) && (iteration < learningIterations))
    {
        // set exploration rate for this iteration
        explorationPolicy.Epsilon = explorationRate - ((double)iteration / learningIterations) * explorationRate;
        // set learning rate for this iteration
        qLearning.LearningRate = learningRate - ((double)iteration / learningIterations) * learningRate;
        // clear tabu list
        tabuPolicy.ResetTabuList();

        // reset agent's coordinates to the starting position
        agentCurrentX = agentStartX;
        agentCurrentY = agentStartY;

        // previous state and action
        int previousState = getState(agentCurrentX, agentCurrentY);
        int previousAction = qLearning.GetAction(previousState);

        // update agent's current position and get his reward
        var r = doAction(agentCurrentX, agentCurrentY, previousAction);
        double reward = r.Item1;
        agentCurrentX = r.Item2;
        agentCurrentY = r.Item3;

        // loop
        while ((!needToStop) && (iteration < learningIterations))
        {
            // set exploration rate for this iteration
            explorationPolicy.Epsilon = explorationRate - ((double)iteration / learningIterations) * explorationRate;
            // set learning rate for this iteration
            qLearning.LearningRate = learningRate - ((double)iteration / learningIterations) * learningRate;
            // clear tabu list
            tabuPolicy.ResetTabuList();

            // reset agent's coordinates to the starting position
            agentCurrentX = agentStartX;
            agentCurrentY = agentStartY;

            // steps performed by agent to get to the goal
            int steps = 0;
            while ((!needToStop) && ((agentCurrentX != agentStopX) || (agentCurrentY != agentStopY)))
            {
                steps++;
                // get agent's current state
                int currentState = getState(agentCurrentX, agentCurrentY);
                // get the action for this state
                int action = qLearning.GetAction(currentState);
                // update agent's current position and get his reward
                r = doAction(agentCurrentX, agentCurrentY, action);
                reward = r.Item1;
                agentCurrentX = r.Item2;
                agentCurrentY = r.Item3;
                // get agent's next state
                int nextState = getState(agentCurrentX, agentCurrentY);
                // do learning of the agent - update his Q-function
                qLearning.UpdateState(currentState, action, reward, nextState);
                // set tabu action
                tabuPolicy.SetTabuAction((action + 2) % 4, 1);
            }

            System.Diagnostics.Debug.WriteLine(steps);
            iteration++;
        }
    }

    // The end position for the robot will be (7, 4):
    int finalPosX = agentCurrentX; // 7
    int finalPosY = agentCurrentY; // 4;
    #endregion

    Assert.AreEqual(7, finalPosX);
    Assert.AreEqual(4, finalPosY);
}
/// <summary>
/// Trains a SOM + Q-learning traffic-signal controller against a Vissim
/// simulation: loads a user-chosen network and SOM training set, then runs
/// 1000 episodes, reading queue-counter results, applying a signal action,
/// and updating the Q-table from the observed change in average delay.
/// </summary>
private void button13_Click(object sender, EventArgs e)
{
    vissim = new VissimConnection();
    OpenFileDialog openFileDialog1 = new OpenFileDialog();
    if (openFileDialog1.ShowDialog() == System.Windows.Forms.DialogResult.OK)
    {
        COM.LoadVissimNetwork(vissim.GetVissimInstance(), openFileDialog1.FileName);
    }
    sim = new Simulation(vissim);
    sim.SetSimulationResolution(1);

    // 25-neuron (5x5) SOM over 18-dimensional queue-length vectors
    // -- TODO confirm the semantics of the remaining constructor arguments
    BLL.Neural.SOM som = new BLL.Neural.SOM(25, 5, 5, 0.5, 1, 18, 100);
    OpenFileDialog openFileDialog2 = new OpenFileDialog();
    if (openFileDialog2.ShowDialog() == System.Windows.Forms.DialogResult.OK)
    {
        double[][] importedData = BLL.HelpMethods.LoadTrainingSet(openFileDialog2.FileName, ',');
        if (importedData != null)
        {
            som.TrainSOM(importedData, 1000);
        }
    }

    // start fully exploratory (epsilon = 1); epsilon is decayed per episode below
    EpsilonGreedyExploration greedyExploration = new EpsilonGreedyExploration(1);
    double beta = 0;
    BLL.Neural.Q q = new BLL.Neural.Q(25, 7, greedyExploration);
    q.qLearning.DiscountFactor = 0.1;
    BLL.Thesis.Actions actions = new BLL.Thesis.Actions(25, 22);
    double simulationPeriod = COM.getSimulationPeriod(sim.currentSimulation);
    ISignalController SignalController = vissim.GetVissimInstance().Net.SignalControllers.get_ItemByKey(3);

    // NOTE(review): writer is not wrapped in using/try-finally; an exception
    // mid-run leaks the file handle -- consider a using block
    StreamWriter writer = new StreamWriter("Results1000.csv");
    for (int j = 0; j < 1000; j++)
    {
        sim.RunContinuos(299);
        // step through the simulation period in 300 s control intervals
        for (int i = 599; i <= simulationPeriod - 1; i = i + 300)
        {
            double[] qLenMax = sim.queueCounterResultsMax;
            double[] qLenAvg = sim.queueCounterResultsAvg;
            // concatenate average + maximum queue lengths into one 18-element vector
            double[] qLenAll = new double[18];
            Array.Copy(qLenAvg, qLenAll, qLenAvg.Length);
            Array.Copy(qLenMax, 0, qLenAll, qLenAvg.Length, qLenMax.Length);
            double delayAvgBefore = vissim.GetVissimInstance().Net.VehicleNetworkPerformanceMeasurement.get_AttValue("DelayAvg(Current, Current, All)");
            // the SOM's winning neuron index serves as the discrete state id
            int state1 = som.GetWinningNeuronNumber(qLenAll);
            int action = q.GetAction(state1);
            actions.PerformAction(state1, action, SignalController);
            sim.RunContinuos(i);
            qLenMax = sim.queueCounterResultsMax;
            qLenAvg = sim.queueCounterResultsAvg;
            Array.Copy(qLenAvg, qLenAll, qLenAvg.Length);
            Array.Copy(qLenMax, 0, qLenAll, qLenAvg.Length, qLenMax.Length);
            int state2 = som.GetWinningNeuronNumber(qLenAll);
            double delayAvgAfter = vissim.GetVissimInstance().Net.VehicleNetworkPerformanceMeasurement.get_AttValue("DelayAvg(Current, Current, All)");
            // reward is derived from the queue lengths and the delay delta
            double reward = BLL.Neural.Q.CalculateReward(qLenAvg, delayAvgBefore, delayAvgAfter, beta);
            q.UpdateQTable(state1, action, reward, state2);
        }
        double resultTTS = vissim.GetVissimInstance().Net.VehicleNetworkPerformanceMeasurement.get_AttValue("TravTmTot(Current, Total, All)");
        // int seed = sim.currentSimulation.get_AttValue("RandSeed");
        writer.WriteLine(j + "," + resultTTS);
        // exponential epsilon decay: 0.99 * 0.995^(j+1) + 0.01
        q.qLearning.ExplorationPolicy = new EpsilonGreedyExploration((Math.Pow(0.995, (j + 1)) * 0.99) + 0.01);
        // beta = (j + 1) * 0.001;
        writer.Flush();
        sim.currentSimulation.RunSingleStep();
    }
    writer.Close();
}
// Sarsa thread (Unity variant): on-policy training. The ordering of the
// state/action/reward statements below IS the SARSA algorithm, so the
// sequence must not be rearranged.
private void SarsaThread()
{
    int iteration = 0;
    // exploration policy (tabu wrapper around epsilon-greedy)
    TabuSearchExploration tabuPolicy = (TabuSearchExploration)_sarsa.ExplorationPolicy;
    EpsilonGreedyExploration explorationPolicy = (EpsilonGreedyExploration)tabuPolicy.BasePolicy;
    // loop
    while ((!_needToStop) && (iteration < learningIterations))
    {
        // set exploration rate for this iteration (linear decay to zero)
        explorationPolicy.Epsilon = explorationRate - ((double)iteration / learningIterations) * explorationRate;
        // set learning rate for this iteration (linear decay to zero)
        _sarsa.LearningRate = learningRate - ((double)iteration / learningIterations) * learningRate;
        // clear tabu list
        tabuPolicy.ResetTabuList();
        // reset agent's coordinates to the starting position
        _agentCurrX = _agentStartX;
        _agentCurrY = _agentStartY;
        // steps performed by agent to get to the goal
        // (starts at 1 because the first move is taken before the inner loop)
        int steps = 1;
        // previous state and action
        int previousState = GetStateNumber(_agentCurrX, _agentCurrY);
        int previousAction = _sarsa.GetAction(previousState);
        // update agent's current position and get his reward
        double reward = UpdateAgentPosition(previousAction);
        while ((!_needToStop) && ((_agentCurrX != _agentStopX) || (_agentCurrY != _agentStopY)))
        {
            steps++;
            // set tabu action: forbid the immediate reverse move for one step
            tabuPolicy.SetTabuAction((previousAction + 2) % 4, 1);
            // get agent's next state
            int nextState = GetStateNumber(_agentCurrX, _agentCurrY);
            // get agent's next action (chosen on-policy)
            int nextAction = _sarsa.GetAction(nextState);
            // do learning of the agent - update his Q-function
            _sarsa.UpdateState(previousState, previousAction, reward, nextState, nextAction);
            // update agent's new position and get his reward
            reward = UpdateAgentPosition(nextAction);
            previousState = nextState;
            previousAction = nextAction;
        }
        if (!_needToStop)
        {
            // update Q-function if terminal state was reached
            _sarsa.UpdateState(previousState, previousAction, reward);
        }
        iteration++;
        Debug.Log(string.Format("{0} steps needed for iteration {1}.", steps, iteration));
    }
    // signal FixedUpdate to re-enable the UI controls
    _enableControls = true;
    Debug.Log("SARSA training finished. Try to execute the solution.");
}
/// <summary>
/// Unity physics-tick callback. While a solution is being shown, replays the
/// learned greedy policy (epsilon forced to 0) by moving the player object one
/// cell every 0.25 s; otherwise mirrors training progress into the UI.
/// </summary>
void FixedUpdate()
{
    if (_showSolution)
    {
        // move every 0.25 seconds (matches the 0.25f threshold below)
        if ((_timeStep + 0.25f) < Time.time)
        {
            _timeStep = Time.time;
            if ((_agentCurrX == _agentStopX) && (_agentCurrY == _agentStopY))
            {
                // goal reached: teleport the player back to the start cell
                PlayerObject.localPosition = new Vector3(_agentStartX, 0, _agentStartY);
                _agentCurrX = _agentStartX;
                _agentCurrY = _agentStartY;
            }
            else
            {
                if (_initShowSolution)
                {
                    // one-time setup on the first playback frame
                    _initShowSolution = false;
                    // set exploration rate to 0, so agent uses only what he learnt
                    TabuSearchExploration tabuPolicy = null;
                    EpsilonGreedyExploration exploratioPolicy = null;
                    if (_qLearning != null)
                    {
                        tabuPolicy = (TabuSearchExploration)_qLearning.ExplorationPolicy;
                    }
                    else if (_sarsa != null)
                    {
                        tabuPolicy = (TabuSearchExploration)_sarsa.ExplorationPolicy;
                    }
                    else
                    {
                        tabuPolicy = (TabuSearchExploration)_qLearning_FDGS.ExplorationPolicy;
                    }
                    exploratioPolicy = (EpsilonGreedyExploration)tabuPolicy.BasePolicy;
                    exploratioPolicy.Epsilon = 0;
                    tabuPolicy.ResetTabuList();
                    PlayerObject.localPosition = new Vector3(_agentStartX, 0, _agentStartY);
                    // current coordinates of the agent
                    _agentCurrX = (int)PlayerObject.localPosition.x;
                    _agentCurrY = (int)PlayerObject.localPosition.z;
                }
                if ((_qLearning != null) || (_sarsa != null))
                {
                    // get agent's current state
                    int currentState = GetStateNumber(_agentCurrX, _agentCurrY);
                    // get the action for this state
                    int action = (_qLearning != null) ? _qLearning.GetAction(currentState) : _sarsa.GetAction(currentState);
                    // update agent's current position and get his reward
                    UpdateAgentPosition(action);
                }
                else
                {
                    // FDGS learner path: get agent's current state
                    int currentState = _qLearning_FDGS.GetStateFromCoordinates(_agentCurrX, _agentCurrY);
                    // get the action for this state
                    int action = _qLearning_FDGS.GetLearnedAction(currentState);
                    // update agent's current position
                    UpdateAgentPosition(currentState, action);
                }
                // set player object position
                PlayerObject.localPosition = new Vector3(_agentCurrX, 0, _agentCurrY);
            }
        }
    }
    else
    {
        if (!_needToStop)
        {
            // show current iteration
            References.CurrentIteration.text = _currentIteration.ToString();
        }
        if (_enableControls)
        {
            _enableControls = false;
            // enable settings controls
            References.EnableControls(true);
        }
    }
}
// Sarsa thread: on-policy training loop (comments translated from Chinese).
// The statement ordering below is the SARSA algorithm itself.
private void SarsaThread()
{
    int iteration = 0;
    // current coordinates of the agent
    int agentCurrentX, agentCurrentY;
    // exploration policy
    TabuSearchExploration tabuPolicy = (TabuSearchExploration)sarsa.ExplorationPolicy;
    EpsilonGreedyExploration explorationPolicy = (EpsilonGreedyExploration)tabuPolicy.BasePolicy;
    // loop
    while ((!needToStop) && (iteration < learningIterations))
    {
        // set exploration rate for this iteration (linear decay to zero)
        explorationPolicy.Epsilon = explorationRate - ((double)iteration / learningIterations) * explorationRate;
        // set learning rate for this iteration (linear decay to zero)
        sarsa.LearningRate = learningRate - ((double)iteration / learningIterations) * learningRate;
        // clear tabu list
        tabuPolicy.ResetTabuList();
        // reset agent's coordinates to the starting position
        agentCurrentX = _agentStartX;
        agentCurrentY = _agentStartY;
        // steps performed by agent to get to the goal
        // (starts at 1 because the first move is taken before the inner loop)
        int steps = 1;
        // previous state and action
        int previousState = GetStateNumber(agentCurrentX, agentCurrentY);
        int previousAction = sarsa.GetAction(previousState);
        // update agent's current position and get his reward
        double reward = UpdateAgentPosition(ref agentCurrentX, ref agentCurrentY, previousAction);
        while ((!needToStop) && ((agentCurrentX != _agentStopX) || (agentCurrentY != _agentStopY)))
        {
            steps++;
            // set tabu action: forbid the immediate reverse move for one step
            tabuPolicy.SetTabuAction((previousAction + 2) % 4, 1);
            // get agent's next state
            int nextState = GetStateNumber(agentCurrentX, agentCurrentY);
            // get agent's next action (chosen on-policy)
            int nextAction = sarsa.GetAction(nextState);
            // do learning of the agent - update his Q-function
            sarsa.UpdateState(previousState, previousAction, reward, nextState, nextAction);
            // update agent's new position and get his reward
            reward = UpdateAgentPosition(ref agentCurrentX, ref agentCurrentY, nextAction);
            previousState = nextState;
            previousAction = nextAction;
        }
        if (!needToStop)
        {
            // update Q-function if terminal state was reached
            sarsa.UpdateState(previousState, previousAction, reward);
        }
        System.Diagnostics.Debug.WriteLine(steps);
        iteration++;
        // show current iteration
        SetText(iterationBox, iteration.ToString());
    }
    // enable settings controls
    EnableControls(true);
}
// Double Q-Learning thread (comments translated from Chinese). Besides the
// standard per-step update, each episode's transitions are recorded and
// replayed in reverse; the shortest episode seen so far is remembered in
// MiniOldAction and reinforced whenever a longer episode occurs.
private void DoubleQLearningThread()
{
    // best (fewest-steps) episode length seen so far
    miniSteps = int.MaxValue;
    MiniOldAction.Clear();
    int iteration = 0;
    // current coordinates of the agent
    int agentCurrentX, agentCurrentY;
    // exploration policy
    TabuSearchExploration tabuPolicy = (TabuSearchExploration)doubleQLearning.ExplorationPolicy;
    EpsilonGreedyExploration explorationPolicy = (EpsilonGreedyExploration)tabuPolicy.BasePolicy;
    // loop
    while ((!needToStop) && (iteration < learningIterations))
    {
        // transitions recorded during this episode
        OldAction.Clear();
        // set exploration rate for this iteration (linear decay to zero)
        explorationPolicy.Epsilon = explorationRate - ((double)iteration / learningIterations) * explorationRate;
        // set learning rate for this iteration (linear decay to zero)
        doubleQLearning.LearningRate = learningRate - ((double)iteration / learningIterations) * learningRate;
        // clear tabu list
        tabuPolicy.ResetTabuList();
        // reset agent's coordinates to the starting position
        agentCurrentX = _agentStartX;
        agentCurrentY = _agentStartY;
        // steps performed by agent to get to the goal
        int steps = 0;
        while ((!needToStop) && ((agentCurrentX != _agentStopX) || (agentCurrentY != _agentStopY)))
        {
            steps++;
            // get agent's current state
            int currentState = GetStateNumber(agentCurrentX, agentCurrentY);
            // get the action for this state
            int action = doubleQLearning.GetAction(currentState);
            tabuPolicy.ResetTabuList();
            // update agent's current position and get his reward
            double reward = UpdateAgentPosition(ref agentCurrentX, ref agentCurrentY, action);
            // get agent's next state
            int nextState = GetStateNumber(agentCurrentX, agentCurrentY);
            // do learning of the agent - update his Q-function
            doubleQLearning.UpdateState(currentState, action, reward, nextState);
            // remember each distinct transition for the replay passes below
            var tup = Tuple.Create(currentState, action, reward, nextState);
            if (OldAction.Contains(tup) == false)
            {
                OldAction.Add(tup);
            }
            // set tabu action: forbid the immediate reverse move for one step
            tabuPolicy.SetTabuAction((action + 2) % 4, 1);
        }
        // replay this episode's transitions in reverse order
        for (int i = OldAction.Count - 1; i >= 0; i--)
        {
            var a = OldAction[i];
            doubleQLearning.UpdateState(a.Item1, a.Item2, a.Item3, a.Item4);
        }
        if (steps < miniSteps)
        {
            // new best episode: remember its transitions
            miniSteps = steps;
            MiniOldAction.Clear();
            for (int i = 0; i < OldAction.Count; i++)
            {
                MiniOldAction.Add(OldAction[i]);
            }
        }
        else
        {
            // otherwise reinforce the best episode seen so far (reverse replay)
            for (int i = MiniOldAction.Count - 1; i >= 0; i--)
            {
                var a = MiniOldAction[i];
                doubleQLearning.UpdateState(a.Item1, a.Item2, a.Item3, a.Item4);
            }
        }
        System.Diagnostics.Debug.WriteLine(steps);
        iteration++;
        // show current iteration
        SetText(iterationBox, iteration.ToString());
    }
    // enable settings controls
    EnableControls(true);
}
// Show solution thread (comments translated from Chinese): replays the
// learned policy greedily (epsilon = 0), animating the agent on the map
// until a stop is requested.
private void ShowSolutionThread()
{
    // set exploration rate to 0, so agent uses only what he learnt
    TabuSearchExploration tabuPolicy = null;
    EpsilonGreedyExploration exploratioPolicy = null;
    if (qLearning != null)
    {
        tabuPolicy = (TabuSearchExploration)qLearning.ExplorationPolicy;
    }
    else if (sarsa != null)
    {
        tabuPolicy = (TabuSearchExploration)sarsa.ExplorationPolicy;
    }
    else
    {
        tabuPolicy = (TabuSearchExploration)doubleQLearning.ExplorationPolicy;
    }
    exploratioPolicy = (EpsilonGreedyExploration)tabuPolicy.BasePolicy;
    exploratioPolicy.Epsilon = 0;
    tabuPolicy.ResetTabuList();
    // current coordinates of the agent
    int agentCurrentX = _agentStartX, agentCurrentY = _agentStartY;
    // prepare the map to display (2 = agent/start, 3 = goal)
    // NOTE(review): indexing here is [x, y] while sibling implementations use
    // [y, x]; this is only safe if the map is square or consistently
    // transposed throughout this class -- confirm
    Array.Copy(map, mapToDisplay, map.GetLength(0) * map.GetLength(1));
    mapToDisplay[_agentStartX, _agentStartY] = 2;
    mapToDisplay[_agentStopX, _agentStopY] = 3;
    while (!needToStop)
    {
        // display the map
        this.cellWorld1.Map = mapToDisplay;
        // sleep for a while so the animation is visible
        Thread.Sleep(200);
        // check if we have reached the end point
        if ((agentCurrentX == _agentStopX) && (agentCurrentY == _agentStopY))
        {
            // restore the map and restart the walk from the starting cell
            mapToDisplay[_agentStartX, _agentStartY] = 2;
            mapToDisplay[_agentStopX, _agentStopY] = 3;
            agentCurrentX = _agentStartX;
            agentCurrentY = _agentStartY;
            this.cellWorld1.Map = mapToDisplay;
            Thread.Sleep(200);
        }
        // remove agent from current position
        mapToDisplay[agentCurrentX, agentCurrentY] = 0;
        // get agent's current state
        int currentState = GetStateNumber(agentCurrentX, agentCurrentY);
        // get the action for this state (dispatches to whichever learner is active)
        int action = GetAction(currentState);
        // update agent's current position and get his reward
        double reward = UpdateAgentPosition(ref agentCurrentX, ref agentCurrentY, action);
        // put agent to the new position
        mapToDisplay[agentCurrentX, agentCurrentY] = 2;
    }
    // enable settings controls
    EnableControls(true);
}