// Q-Learning thread
private void QLearningThread()
{
    int iteration = 0;
    // current coordinates of the agent
    int agentCurrentX, agentCurrentY;

    // exploration policy
    TabuSearchExploration tabuPolicy = (TabuSearchExploration)qLearning.ExplorationPolicy;
    EpsilonGreedyExploration explorationPolicy = (EpsilonGreedyExploration)tabuPolicy.BasePolicy;

    // loop
    while ((!needToStop) && (iteration < learningIterations))
    {
        // set exploration rate for this iteration
        explorationPolicy.Epsilon = explorationRate - ((double)iteration / learningIterations) * explorationRate;
        // set learning rate for this iteration
        qLearning.LearningRate = learningRate - ((double)iteration / learningIterations) * learningRate;
        // clear tabu list
        tabuPolicy.ResetTabuList();

        // reset agent's coordinates to the starting position
        agentCurrentX = agentStartX;
        agentCurrentY = agentStartY;

        // steps performed by agent to get to the goal
        int steps = 0;

        while ((!needToStop) && ((agentCurrentX != agentStopX) || (agentCurrentY != agentStopY)))
        {
            steps++;

            // get agent's current state
            int currentState = GetStateNumber(agentCurrentX, agentCurrentY);
            // get the action for this state
            int action = qLearning.GetAction(currentState);
            // update agent's current position and get his reward
            double reward = UpdateAgentPosition(ref agentCurrentX, ref agentCurrentY, action);
            // get agent's next state
            int nextState = GetStateNumber(agentCurrentX, agentCurrentY);
            // do learning of the agent - update his Q-function
            qLearning.UpdateState(currentState, action, reward, nextState);
            // set tabu action
            tabuPolicy.SetTabuAction((action + 2) % 4, 1);
        }

        System.Diagnostics.Debug.WriteLine(steps);

        iteration++;
        // show current iteration
        SetText(iterationBox, iteration.ToString());
    }

    // enable settings controls
    EnableControls(true);
}
// Q-Learning thread
private void QLearningThread()
{
    _currentIteration = 0;

    // exploration policy
    TabuSearchExploration tabuPolicy = (TabuSearchExploration)_qLearning.ExplorationPolicy;
    EpsilonGreedyExploration explorationPolicy = (EpsilonGreedyExploration)tabuPolicy.BasePolicy;

    // loop
    while ((!_needToStop) && (_currentIteration < learningIterations))
    {
        // set exploration rate for this iteration
        explorationPolicy.Epsilon = explorationRate - ((double)_currentIteration / learningIterations) * explorationRate;
        // set learning rate for this iteration
        _qLearning.LearningRate = learningRate - ((double)_currentIteration / learningIterations) * learningRate;
        // clear tabu list
        tabuPolicy.ResetTabuList();

        // reset agent's coordinates to the starting position
        _agentCurrX = _agentStartX;
        _agentCurrY = _agentStartY;

        // steps performed by agent to get to the goal
        int steps = 0;

        while ((!_needToStop) && ((_agentCurrX != _agentStopX) || (_agentCurrY != _agentStopY)))
        {
            steps++;

            // get agent's current state
            int currentState = GetStateNumber(_agentCurrX, _agentCurrY);
            // get the action for this state
            int action = _qLearning.GetAction(currentState);
            // update agent's current position and get his reward
            double reward = UpdateAgentPosition(action);
            // get agent's next state
            int nextState = GetStateNumber(_agentCurrX, _agentCurrY);
            // do learning of the agent - update his Q-function
            _qLearning.UpdateState(currentState, action, reward, nextState);
            // set tabu action
            tabuPolicy.SetTabuAction((action + 2) % 4, 1);
        }

        _currentIteration++;
        Debug.Log(string.Format("{0} steps needed for iteration {1}.", steps, _currentIteration));
    }

    _enableControls = true;
    Debug.Log("QLearning training finished. Try to execute the solution.");
}
// Q-Learning thread
private void QLearningThread()
{
    int iteration = 0;

    TabuSearchExploration tabuPolicy = (TabuSearchExploration)qLearning.ExplorationPolicy;
    EpsilonGreedyExploration explorationPolicy = (EpsilonGreedyExploration)tabuPolicy.BasePolicy;

    while ((!needToStop) && (iteration < learningIterations))
    {
        explorationPolicy.Epsilon = explorationRate - ((double)iteration / learningIterations) * explorationRate;
        qLearning.LearningRate = learningRate - ((double)iteration / learningIterations) * learningRate;
        tabuPolicy.ResetTabuList();

        var agentCurrentX = agentStartX;
        var agentCurrentY = agentStartY;

        int steps = 0;
        while ((!needToStop) && ((agentCurrentX != agentStopX) || (agentCurrentY != agentStopY)))
        {
            steps++;

            int currentState = GetStateNumber(agentCurrentX, agentCurrentY);
            int action = qLearning.GetAction(currentState);
            double reward = UpdateAgentPosition(ref agentCurrentX, ref agentCurrentY, action);
            int nextState = GetStateNumber(agentCurrentX, agentCurrentY);

            // do learning of the agent - update his Q-function, set Tabu action
            qLearning.UpdateState(currentState, action, reward, nextState);
            tabuPolicy.SetTabuAction((action + 2) % 4, 1);
        }

        System.Diagnostics.Debug.WriteLine(steps);

        iteration++;
        SetText(iterationBox, iteration.ToString());
    }

    EnableControls(true);
}
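// The three QLearningThread variants above all rely on an UpdateAgentPosition helper
// that is not shown on this page. The following is only a minimal sketch of what such
// a helper could look like; the map field, the agentStopX/agentStopY goal coordinates
// and the moveReward/wallReward/goalReward constants are assumptions borrowed from the
// learn_test example further down.
private double UpdateAgentPosition(ref int currentX, ref int currentY, int action)
{
    // default reward is the cost of a normal move
    double reward = moveReward;

    // translate the action number into a movement direction
    int dx = 0, dy = 0;
    switch (action)
    {
        case 0: dy = -1; break; // north (up)
        case 1: dx = 1;  break; // east (right)
        case 2: dy = 1;  break; // south (down)
        case 3: dx = -1; break; // west (left)
    }

    int newX = currentX + dx;
    int newY = currentY + dy;

    // punish the agent for leaving the map or walking into a wall,
    // otherwise move it and check whether the goal was reached
    if ((newX < 0) || (newX >= map.GetLength(1)) ||
        (newY < 0) || (newY >= map.GetLength(0)) ||
        (map[newY, newX] != 0))
    {
        reward = wallReward;
    }
    else
    {
        currentX = newX;
        currentY = newY;

        if ((currentX == agentStopX) && (currentY == agentStopY))
            reward = goalReward;
    }

    return reward;
}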
/// <summary>
/// Updates Q table based on the calculated reward
/// </summary>
/// <param name="previousState">state number of previous state</param>
/// <param name="action">number of action that was taken</param>
/// <param name="reward">reward given</param>
/// <param name="nextState">number of the next state</param>
public void UpdateQTable(int previousState, int action, double reward, int nextState)
{
    qLearning.UpdateState(previousState, action, reward, nextState);
}
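// A hypothetical caller for the wrapper above: one simulated step of the agent,
// reusing GetStateNumber and UpdateAgentPosition from the other snippets on this
// page. The method name PerformLearningStep is an assumption, not part of any API.
public void PerformLearningStep(ref int agentX, ref int agentY)
{
    // observe the current state and pick an action from the exploration policy
    int previousState = GetStateNumber(agentX, agentY);
    int action = qLearning.GetAction(previousState);

    // apply the action, collect the reward and observe the resulting state
    double reward = UpdateAgentPosition(ref agentX, ref agentY, action);
    int nextState = GetStateNumber(agentX, agentY);

    // feed the transition into the Q-table
    UpdateQTable(previousState, action, reward, nextState);
}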
public void learn_test()
{
    #region doc_main
    // Fix the random number generator
    Accord.Math.Random.Generator.Seed = 0;

    // In this example, we will be using the QLearning algorithm
    // to make a robot learn how to navigate a map. The map is
    // shown below, where a 1 denotes a wall and 0 denotes areas
    // where the robot can navigate:
    //
    int[,] map =
    {
        { 1, 1, 1, 1, 1, 1, 1, 1, 1 },
        { 1, 1, 0, 0, 0, 0, 0, 0, 1 },
        { 1, 1, 0, 0, 0, 1, 1, 0, 1 },
        { 1, 0, 0, 1, 0, 0, 0, 0, 1 },
        { 1, 0, 0, 1, 1, 1, 1, 0, 1 },
        { 1, 0, 0, 1, 1, 0, 0, 0, 1 },
        { 1, 1, 0, 1, 0, 0, 0, 0, 1 },
        { 1, 1, 0, 1, 0, 1, 1, 0, 1 },
        { 1, 1, 1, 1, 1, 1, 1, 1, 1 },
    };

    // Now, we define the initial and target points from which the
    // robot will be spawned and where it should go, respectively:
    int agentStartX = 1;
    int agentStartY = 4;

    int agentStopX = 7;
    int agentStopY = 4;

    // The robot is able to sense the environment through 8 sensors
    // that capture whether the robot is near a wall or not. Based
    // on the robot's current location, the sensors will return an
    // integer number representing which sensors have detected walls:
    Func<int, int, int> getState = (int x, int y) =>
    {
        int c1 = (map[y - 1, x - 1] != 0) ? 1 : 0;
        int c2 = (map[y - 1, x + 0] != 0) ? 1 : 0;
        int c3 = (map[y - 1, x + 1] != 0) ? 1 : 0;
        int c4 = (map[y + 0, x + 1] != 0) ? 1 : 0;
        int c5 = (map[y + 1, x + 1] != 0) ? 1 : 0;
        int c6 = (map[y + 1, x + 0] != 0) ? 1 : 0;
        int c7 = (map[y + 1, x - 1] != 0) ? 1 : 0;
        int c8 = (map[y + 0, x - 1] != 0) ? 1 : 0;

        return c1 | (c2 << 1) | (c3 << 2) | (c4 << 3)
                  | (c5 << 4) | (c6 << 5) | (c7 << 6) | (c8 << 7);
    };

    // The actions are the possible directions the robot can go:
    //
    //   - case 0: go to north (up)
    //   - case 1: go to east (right)
    //   - case 2: go to south (down)
    //   - case 3: go to west (left)
    //
    int learningIterations = 1000;
    double explorationRate = 0.5;
    double learningRate = 0.5;

    double moveReward = 0;
    double wallReward = -1;
    double goalReward = 1;

    // The function below specifies how the robot should perform an action given its
    // current position and an action number.
    // This will cause the robot to update its current X and Y locations
    // given the direction (above) it was instructed to go:
    Func<int, int, int, Tuple<double, int, int>> doAction = (int currentX, int currentY, int action) =>
    {
        // default reward is equal to moving reward
        double reward = moveReward;

        // moving direction
        int dx = 0, dy = 0;

        switch (action)
        {
            case 0: // go to north (up)
                dy = -1;
                break;
            case 1: // go to east (right)
                dx = 1;
                break;
            case 2: // go to south (down)
                dy = 1;
                break;
            case 3: // go to west (left)
                dx = -1;
                break;
        }

        int newX = currentX + dx;
        int newY = currentY + dy;

        // check new agent's coordinates
        if ((newX < 0) || (newX >= map.Columns()) ||
            (newY < 0) || (newY >= map.Rows()) ||
            (map[newY, newX] != 0))
        {
            // we found a wall or got outside of the world
            reward = wallReward;
        }
        else
        {
            currentX = newX;
            currentY = newY;

            // check if we found the goal
            if ((currentX == agentStopX) && (currentY == agentStopY))
                reward = goalReward;
        }

        return Tuple.Create(reward, currentX, currentY);
    };

    // After defining all those functions, we create a new Q-learning algorithm:
    var explorationPolicy = new EpsilonGreedyExploration(explorationRate);
    var tabuPolicy = new TabuSearchExploration(4, explorationPolicy);
    var qLearning = new QLearning(256, 4, tabuPolicy);

    // current coordinates of the agent
    int agentCurrentX = -1;
    int agentCurrentY = -1;

    bool needToStop = false;
    int iteration = 0;

    // loop
    while ((!needToStop) && (iteration < learningIterations))
    {
        // set exploration rate for this iteration
        explorationPolicy.Epsilon = explorationRate - ((double)iteration / learningIterations) * explorationRate;
        // set learning rate for this iteration
        qLearning.LearningRate = learningRate - ((double)iteration / learningIterations) * learningRate;
        // clear tabu list
        tabuPolicy.ResetTabuList();

        // reset agent's coordinates to the starting position
        agentCurrentX = agentStartX;
        agentCurrentY = agentStartY;

        // steps performed by agent to get to the goal
        int steps = 0;

        while ((!needToStop) && ((agentCurrentX != agentStopX) || (agentCurrentY != agentStopY)))
        {
            steps++;

            // get agent's current state
            int currentState = getState(agentCurrentX, agentCurrentY);
            // get the action for this state
            int action = qLearning.GetAction(currentState);
            // update agent's current position and get his reward
            var r = doAction(agentCurrentX, agentCurrentY, action);
            double reward = r.Item1;
            agentCurrentX = r.Item2;
            agentCurrentY = r.Item3;

            // get agent's next state
            int nextState = getState(agentCurrentX, agentCurrentY);
            // do learning of the agent - update his Q-function
            qLearning.UpdateState(currentState, action, reward, nextState);
            // set tabu action
            tabuPolicy.SetTabuAction((action + 2) % 4, 1);
        }

        System.Diagnostics.Debug.WriteLine(steps);
        iteration++;
    }

    // The end position for the robot will be (7, 4):
    int finalPosX = agentCurrentX; // 7
    int finalPosY = agentCurrentY; // 4
    #endregion

    Assert.AreEqual(7, finalPosX);
    Assert.AreEqual(4, finalPosY);
}
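// Worked example of the sensor encoding used by getState above: at the start
// cell (x = 1, y = 4) of the map literal, only the north-west (c1), south-west (c7)
// and west (c8) neighbours are walls, so
//
//     getState(1, 4) == 1 | (1 << 6) | (1 << 7) == 193
//
// which is one of the 256 possible sensor states the QLearning instance was sized for.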
private void QLearningThread()
{
    int iteration = 0;
    // current coordinates of the agent
    int agentCurrentX, agentCurrentY;

    // exploration policy
    TabuSearchExploration tabuPolicy = (TabuSearchExploration)qLearning.ExplorationPolicy;
    EpsilonGreedyExploration explorationPolicy = (EpsilonGreedyExploration)tabuPolicy.BasePolicy;

    // loop
    while ((!needToStop) && (iteration < learningIterations))
    {
        OldAction.Clear();

        // set exploration rate for this iteration
        explorationPolicy.Epsilon = explorationRate - ((double)iteration / learningIterations) * explorationRate;
        // set learning rate for this iteration
        qLearning.LearningRate = learningRate - ((double)iteration / learningIterations) * learningRate;
        // clear tabu list
        tabuPolicy.ResetTabuList();

        // reset agent's coordinates to the starting position
        agentCurrentX = _agentStartX;
        agentCurrentY = _agentStartY;

        // steps performed by agent to get to the goal
        int steps = 0;

        while ((!needToStop) && ((agentCurrentX != _agentStopX) || (agentCurrentY != _agentStopY)))
        {
            steps++;

            // get agent's current state
            int currentState = GetStateNumber(agentCurrentX, agentCurrentY);
            // get the action for this state
            int action = qLearning.GetAction(currentState);
            // clear any previously set tabu action before marking a new one below
            tabuPolicy.ResetTabuList();

            // update agent's current position and get his reward
            double reward = UpdateAgentPosition(ref agentCurrentX, ref agentCurrentY, action);
            // get agent's next state
            int nextState = GetStateNumber(agentCurrentX, agentCurrentY);
            // do learning of the agent - update his Q-function
            qLearning.UpdateState(currentState, action, reward, nextState);
            // remember the transition so it can be replayed after the episode
            OldAction.Add(Tuple.Create(currentState, action, reward, nextState));

            // set tabu action
            tabuPolicy.SetTabuAction((action + 2) % 4, 1);
        }

        // replay the recorded transitions in reverse order, so the reward of the
        // final (goal) step propagates back through the Q-table more quickly
        for (int i = OldAction.Count - 1; i >= 0; i--)
        {
            var a = OldAction[i];
            qLearning.UpdateState(a.Item1, a.Item2, a.Item3, a.Item4);
        }

        System.Diagnostics.Debug.WriteLine(steps);

        iteration++;
        // show current iteration
        SetText(iterationBox, iteration.ToString());
    }

    // enable settings controls
    EnableControls(true);
}
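// The variant above additionally records every (state, action, reward, nextState)
// transition in an OldAction buffer and replays it in reverse once the goal is
// reached. The field itself is not shown in the snippet; a minimal declaration
// matching the Tuple.Create call above would be (requires System and
// System.Collections.Generic):
private readonly List<Tuple<int, int, double, int>> OldAction =
    new List<Tuple<int, int, double, int>>();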