/// <summary>
/// Sarsa thread routine: runs learning episodes until either
/// <c>needToStop</c> is set or <c>learningIterations</c> episodes have been
/// completed, then re-enables the settings controls.
/// </summary>
private void SarsaThread()
{
    // The configured exploration policy is a tabu-search policy wrapping an
    // epsilon-greedy base policy; both are adjusted during learning.
    var tabu = (TabuSearchExploration)sarsa.ExplorationPolicy;
    var greedy = (EpsilonGreedyExploration)tabu.BasePolicy;

    for (int episode = 0; !needToStop && episode < learningIterations; episode++)
    {
        // Both rates shrink linearly from their configured value towards
        // zero as the episodes progress.
        double progress = (double)episode / learningIterations;
        greedy.Epsilon = explorationRate - progress * explorationRate;
        sarsa.LearningRate = learningRate - progress * learningRate;

        tabu.ResetTabuList();

        // Every episode starts from the configured start cell.
        var x = agentStartX;
        var y = agentStartY;
        int moves = 1;

        // Pick and perform the very first action of the episode.
        int state = GetStateNumber(x, y);
        int action = sarsa.GetAction(state);
        double reward = UpdateAgentPosition(ref x, ref y, action);

        // Keep moving until the stop cell is reached or a stop is requested.
        while (!needToStop && !(x == agentStopX && y == agentStopY))
        {
            moves++;

            // NOTE(review): (action + 2) % 4 is presumably the reverse of the
            // move just made - confirm against the action numbering.
            tabu.SetTabuAction((action + 2) % 4, 1);

            int followingState = GetStateNumber(x, y);
            int followingAction = sarsa.GetAction(followingState);

            // Learn from the transition just completed.
            sarsa.UpdateState(state, action, reward, followingState, followingAction);

            reward = UpdateAgentPosition(ref x, ref y, followingAction);

            state = followingState;
            action = followingAction;
        }

        if (!needToStop)
        {
            // The episode ended on the stop cell: apply the update form that
            // takes no successor state/action.
            sarsa.UpdateState(state, action, reward);
        }

        System.Diagnostics.Debug.WriteLine(moves);

        // Report the number of episodes completed so far.
        SetText(iterationBox, (episode + 1).ToString());
    }

    // enable settings controls
    EnableControls(true);
}
/// <summary>
/// Sarsa thread routine. Repeats learning episodes while no stop has been
/// requested and fewer than <c>learningIterations</c> episodes have run,
/// showing the episode count after each one, and finally re-enables the
/// settings controls.
/// </summary>
private void SarsaThread()
{
    int iteration = 0;

    // Exploration is done by a tabu-search policy layered over epsilon-greedy.
    TabuSearchExploration tabuSearch = (TabuSearchExploration)sarsa.ExplorationPolicy;
    EpsilonGreedyExploration epsilonGreedy = (EpsilonGreedyExploration)tabuSearch.BasePolicy;

    while (!needToStop && iteration < learningIterations)
    {
        // Fraction of the learning run already completed; both rates are
        // reduced linearly by this fraction.
        double completed = (double)iteration / learningIterations;
        epsilonGreedy.Epsilon = explorationRate - completed * explorationRate;
        sarsa.LearningRate = learningRate - completed * learningRate;

        // Start the episode with an empty tabu list.
        tabuSearch.ResetTabuList();

        // Current coordinates of the agent, reset to the starting position.
        int currentX = agentStartX;
        int currentY = agentStartY;

        // Number of steps the agent needed in this episode.
        int steps = 1;

        // First state/action pair of the episode.
        int lastState = GetStateNumber(currentX, currentY);
        int lastAction = sarsa.GetAction(lastState);

        // Move the agent and collect the reward for that move.
        double reward = UpdateAgentPosition(ref currentX, ref currentY, lastAction);

        while (true)
        {
            if (needToStop)
            {
                break;
            }

            bool atGoal = (currentX == agentStopX) && (currentY == agentStopY);
            if (atGoal)
            {
                break;
            }

            steps++;

            // NOTE(review): (lastAction + 2) % 4 presumably denotes the move
            // opposite to the last one - confirm against the action numbering.
            tabuSearch.SetTabuAction((lastAction + 2) % 4, 1);

            // State the agent is in now, and the action chosen for it.
            int state = GetStateNumber(currentX, currentY);
            int action = sarsa.GetAction(state);

            // Update the Q-function with the completed transition.
            sarsa.UpdateState(lastState, lastAction, reward, state, action);

            // Perform the chosen action and collect its reward.
            reward = UpdateAgentPosition(ref currentX, ref currentY, action);

            lastState = state;
            lastAction = action;
        }

        if (!needToStop)
        {
            // The goal was reached: update the Q-function for the terminal move.
            sarsa.UpdateState(lastState, lastAction, reward);
        }

        System.Diagnostics.Debug.WriteLine(steps);

        iteration++;

        // Show the current iteration.
        SetText(iterationBox, iteration.ToString());
    }

    // Enable settings controls.
    EnableControls(true);
}
/// <summary>
/// Documentation sample and regression test: trains a Sarsa agent to walk
/// through a small walled map and checks that the agent's position after the
/// last learning iteration is the target cell (7, 4).
/// </summary>
public void learn_test()
{
    #region doc_main
    // Fix the random number generator
    Accord.Math.Random.Generator.Seed = 0;

    // In this example, we will be using the Sarsa algorithm
    // to make a robot learn how to navigate a map. The map
    // is shown below, where a 1 denotes a wall and 0 denotes
    // areas where the robot can navigate:
    //
    int[,] map =
    {
        { 1, 1, 1, 1, 1, 1, 1, 1, 1 },
        { 1, 1, 0, 0, 0, 0, 0, 0, 1 },
        { 1, 1, 0, 0, 0, 1, 1, 0, 1 },
        { 1, 0, 0, 1, 0, 0, 0, 0, 1 },
        { 1, 0, 0, 1, 1, 1, 1, 0, 1 },
        { 1, 0, 0, 1, 1, 0, 0, 0, 1 },
        { 1, 1, 0, 1, 0, 0, 0, 0, 1 },
        { 1, 1, 0, 1, 0, 1, 1, 0, 1 },
        { 1, 1, 1, 1, 1, 1, 1, 1, 1 },
    };

    // Now, we define the initial and target points from which the
    // robot will be spawned and where it should go, respectively:
    int agentStartX = 1;
    int agentStartY = 4;

    int agentStopX = 7;
    int agentStopY = 4;

    // The robot is able to sense the environment through 8 sensors
    // that capture whether the robot is near a wall or not. Based
    // on the robot's current location, the sensors will return an
    // integer number representing which sensors have detected walls
    Func<int, int, int> getState = (int x, int y) =>
    {
        int c1 = (map[y - 1, x - 1] != 0) ? 1 : 0;
        int c2 = (map[y - 1, x + 0] != 0) ? 1 : 0;
        int c3 = (map[y - 1, x + 1] != 0) ? 1 : 0;
        int c4 = (map[y + 0, x + 1] != 0) ? 1 : 0;
        int c5 = (map[y + 1, x + 1] != 0) ? 1 : 0;
        int c6 = (map[y + 1, x + 0] != 0) ? 1 : 0;
        int c7 = (map[y + 1, x - 1] != 0) ? 1 : 0;
        int c8 = (map[y + 0, x - 1] != 0) ? 1 : 0;

        // One bit per sensor, giving a state number in the range 0..255
        return (c1 | (c2 << 1) | (c3 << 2) | (c4 << 3) | (c5 << 4) | (c6 << 5) | (c7 << 6) | (c8 << 7));
    };

    // The actions are the possible directions the robot can go:
    //
    //   - case 0: go to north (up)
    //   - case 1: go to east (right)
    //   - case 2: go to south (down)
    //   - case 3: go to west (left)
    //

    int learningIterations = 1000;
    double explorationRate = 0.5;
    double learningRate = 0.5;

    double moveReward = 0;
    double wallReward = -1;
    double goalReward = 1;

    // The function below specifies how the robot should perform an action given its
    // current position and an action number. It returns the reward for the action
    // together with the robot's resulting X and Y locations, which are unchanged
    // when the move was blocked:
    Func<int, int, int, Tuple<double, int, int>> doAction = (int currentX, int currentY, int action) =>
    {
        // default reward is equal to moving reward
        double reward = moveReward;

        // moving direction
        int dx = 0, dy = 0;

        switch (action)
        {
            case 0: // go to north (up)
                dy = -1;
                break;
            case 1: // go to east (right)
                dx = 1;
                break;
            case 2: // go to south (down)
                dy = 1;
                break;
            case 3: // go to west (left)
                dx = -1;
                break;
        }

        int newX = currentX + dx;
        int newY = currentY + dy;

        // Check the new agent's coordinates. The bounds are tested first: the ||
        // operator short-circuits, so the map is only indexed once the new
        // coordinates are known to lie inside it.
        if ((newX < 0) || (newX >= map.Columns()) || (newY < 0) || (newY >= map.Rows()) || (map[newY, newX] != 0))
        {
            // we found a wall or got outside of the world
            reward = wallReward;
        }
        else
        {
            currentX = newX;
            currentY = newY;

            // check if we found the goal
            if ((currentX == agentStopX) && (currentY == agentStopY))
            {
                reward = goalReward;
            }
        }

        return Tuple.Create(reward, currentX, currentY);
    };

    // After defining all those functions, we create a new Sarsa algorithm:
    var explorationPolicy = new EpsilonGreedyExploration(explorationRate);
    var tabuPolicy = new TabuSearchExploration(4, explorationPolicy);
    var sarsa = new Sarsa(256, 4, tabuPolicy);

    // current coordinates of the agent
    int agentCurrentX = -1;
    int agentCurrentY = -1;

    bool needToStop = false;
    int iteration = 0;

    // loop
    while ((!needToStop) && (iteration < learningIterations))
    {
        // set exploration rate for this iteration
        explorationPolicy.Epsilon = explorationRate - ((double)iteration / learningIterations) * explorationRate;

        // set learning rate for this iteration
        sarsa.LearningRate = learningRate - ((double)iteration / learningIterations) * learningRate;

        // clear tabu list
        tabuPolicy.ResetTabuList();

        // reset agent's coordinates to the starting position
        agentCurrentX = agentStartX;
        agentCurrentY = agentStartY;

        // steps performed by agent to get to the goal
        int steps = 1;

        // previous state and action
        int previousState = getState(agentCurrentX, agentCurrentY);
        int previousAction = sarsa.GetAction(previousState);

        // update agent's current position and get his reward
        var r = doAction(agentCurrentX, agentCurrentY, previousAction);
        double reward = r.Item1;
        agentCurrentX = r.Item2;
        agentCurrentY = r.Item3;

        while ((!needToStop) && ((agentCurrentX != agentStopX) || (agentCurrentY != agentStopY)))
        {
            steps++;

            // set tabu action: with the numbering above, (action + 2) % 4
            // is the direction opposite to the move just made
            tabuPolicy.SetTabuAction((previousAction + 2) % 4, 1);

            // get agent's next state
            int nextState = getState(agentCurrentX, agentCurrentY);

            // get agent's next action
            int nextAction = sarsa.GetAction(nextState);

            // do learning of the agent - update his Q-function
            sarsa.UpdateState(previousState, previousAction, reward, nextState, nextAction);

            // update agent's new position and get his reward
            r = doAction(agentCurrentX, agentCurrentY, nextAction);
            reward = r.Item1;
            agentCurrentX = r.Item2;
            agentCurrentY = r.Item3;

            previousState = nextState;
            previousAction = nextAction;
        }

        if (!needToStop)
        {
            // update Q-function if terminal state was reached
            sarsa.UpdateState(previousState, previousAction, reward);
        }

        iteration++;
    }

    // The end position for the robot will be (7, 4):
    int finalPosX = agentCurrentX; // 7
    int finalPosY = agentCurrentY; // 4;
    #endregion

    Assert.AreEqual(7, finalPosX);
    Assert.AreEqual(4, finalPosY);
}
/// <summary>
/// Sarsa thread routine. Runs learning episodes until <c>_needToStop</c> is
/// set or <c>learningIterations</c> episodes are done, logging the step count
/// of every episode, then sets <c>_enableControls</c> and logs a completion
/// message.
/// </summary>
private void SarsaThread()
{
    // Exploration is done by a tabu-search policy layered over epsilon-greedy.
    var tabu = (TabuSearchExploration)_sarsa.ExplorationPolicy;
    var greedy = (EpsilonGreedyExploration)tabu.BasePolicy;

    for (int episode = 0; !_needToStop && episode < learningIterations; episode++)
    {
        // Exploration and learning rates both decrease linearly with the
        // fraction of episodes already completed.
        double fraction = (double)episode / learningIterations;
        greedy.Epsilon = explorationRate - fraction * explorationRate;
        _sarsa.LearningRate = learningRate - fraction * learningRate;

        // Each episode begins with an empty tabu list ...
        tabu.ResetTabuList();

        // ... and with the agent back on its starting position.
        _agentCurrX = _agentStartX;
        _agentCurrY = _agentStartY;

        // Steps taken by the agent in this episode.
        int stepCount = 1;

        // Choose and perform the first action of the episode.
        int state = GetStateNumber(_agentCurrX, _agentCurrY);
        int action = _sarsa.GetAction(state);
        double reward = UpdateAgentPosition(action);

        // Continue until the stop position is reached or a stop is requested.
        while (!_needToStop && !(_agentCurrX == _agentStopX && _agentCurrY == _agentStopY))
        {
            stepCount++;

            // NOTE(review): (action + 2) % 4 is presumably the reverse of the
            // previous move - confirm against the action numbering.
            tabu.SetTabuAction((action + 2) % 4, 1);

            int upcomingState = GetStateNumber(_agentCurrX, _agentCurrY);
            int upcomingAction = _sarsa.GetAction(upcomingState);

            // Update the Q-function with the transition just completed.
            _sarsa.UpdateState(state, action, reward, upcomingState, upcomingAction);

            // Perform the chosen action and collect its reward.
            reward = UpdateAgentPosition(upcomingAction);

            state = upcomingState;
            action = upcomingAction;
        }

        if (!_needToStop)
        {
            // Terminal position reached: update the Q-function for the last move.
            _sarsa.UpdateState(state, action, reward);
        }

        // The logged iteration number is 1-based.
        Debug.Log(string.Format("{0} steps needed for iteration {1}.", stepCount, episode + 1));
    }

    _enableControls = true;
    Debug.Log("SARSA training finished. Try to execute the solution.");
}
/// <summary>
/// Sarsa thread routine. Runs learning episodes until <c>needToStop</c> is
/// set or <c>learningIterations</c> episodes are done, displaying the episode
/// count after each one, and re-enables the settings controls at the end.
/// </summary>
private void SarsaThread()
{
    // Exploration policy: tabu search on top of an epsilon-greedy base policy.
    TabuSearchExploration tabuExploration = (TabuSearchExploration)sarsa.ExplorationPolicy;
    EpsilonGreedyExploration baseExploration = (EpsilonGreedyExploration)tabuExploration.BasePolicy;

    for (int round = 0; !needToStop && round < learningIterations; round++)
    {
        // Share of the learning run that is already behind us; the
        // exploration and learning rates are lowered linearly by it.
        double done = (double)round / learningIterations;
        baseExploration.Epsilon = explorationRate - done * explorationRate;
        sarsa.LearningRate = learningRate - done * learningRate;

        // Clear the tabu list.
        tabuExploration.ResetTabuList();

        // Current coordinates of the agent, reset to the starting position.
        int posX = _agentStartX;
        int posY = _agentStartY;

        // Steps performed by the agent to get to the goal.
        int steps = 1;

        // Initial state and the action chosen for it.
        int prevState = GetStateNumber(posX, posY);
        int prevAction = sarsa.GetAction(prevState);

        // Move the agent and obtain the reward for that move.
        double reward = UpdateAgentPosition(ref posX, ref posY, prevAction);

        while (!needToStop && !(posX == _agentStopX && posY == _agentStopY))
        {
            steps++;

            // Set the tabu action.
            // NOTE(review): (prevAction + 2) % 4 is presumably the opposite
            // of the previous move - confirm against the action numbering.
            tabuExploration.SetTabuAction((prevAction + 2) % 4, 1);

            // Agent's next state and next action.
            int newState = GetStateNumber(posX, posY);
            int newAction = sarsa.GetAction(newState);

            // Train the agent: update its Q-function.
            sarsa.UpdateState(prevState, prevAction, reward, newState, newAction);

            // Move the agent again and obtain the new reward.
            reward = UpdateAgentPosition(ref posX, ref posY, newAction);

            prevState = newState;
            prevAction = newAction;
        }

        if (!needToStop)
        {
            // Update the Q-function if the terminal state was reached.
            sarsa.UpdateState(prevState, prevAction, reward);
        }

        System.Diagnostics.Debug.WriteLine(steps);

        // Show the current (1-based) iteration.
        SetText(iterationBox, (round + 1).ToString());
    }

    // Enable settings controls.
    EnableControls(true);
}