/// <summary>
/// Looks up the Q-table entry of every candidate move for the given pair of
/// offsets and returns the move whose entry is largest.
/// </summary>
/// <param name="fpos">First position component of the Q-table key (Start passes Food - Agent).</param>
/// <param name="epos">Second position component of the Q-table key (Start passes Enemy - Agent).</param>
/// <returns>
/// The first candidate whose Q-value is strictly greater than every earlier
/// candidate's value; <c>default</c> if no value exceeds <c>double.MinValue</c>.
/// </returns>
private Move MoveOfMaxQValue(Position fpos, Position epos)
{
    Move bestMove = default;
    double bestValue = double.MinValue;

    // Four candidates, visited as (0,0), (0,1), (1,0), (1,1): the horizontal
    // component is the slow index and the vertical component the fast one.
    // NOTE(review): assumes the underlying values 0 and 1 are the declared
    // members of HMove and VMove - confirm against the enum declarations.
    for (int index = 0; index < 4; index++)
    {
        Move candidate = new Move((HMove)(index / 2), (VMove)(index % 2));
        var candidateValue = QTable[new QTKey(fpos, epos, candidate)];

        // Strict comparison: on a tie the earlier candidate is kept.
        if (candidateValue > bestValue)
        {
            bestValue = candidateValue;
            bestMove = candidate;
        }
    }

    return bestMove;
}
/// <summary>
/// Fills the Q-table with zeros and then trains it for <c>Episodes</c> episodes.
/// Each step picks a random move with probability <c>Epsilon</c> and otherwise
/// the move returned by <c>MoveOfMaxQValue</c>, applies it to the agent, and
/// updates the Q-value of the (food offset, enemy offset, move) key that was acted on.
/// </summary>
/// <param name="pr">
/// Progress sink that receives the percentage of completed episodes (0-100)
/// once every 100 episodes. May be null, in which case nothing is reported.
/// </param>
/// <remarks>
/// Side effects visible in this method: writes <c>QTable</c>, stores each
/// episode's summed reward in <c>Rewards</c>, and multiplies <c>Epsilon</c>
/// by <c>EpsilonDecay</c> after every episode.
/// </remarks>
public void Start(IProgress<double> pr)
{
    // Default (system-chosen) seed. Seeding with DateTime.Now.Millisecond
    // allowed only 1000 distinct random sequences.
    Random rnd = new Random();

    HMove[] horizontalMoves = { HMove.Left, HMove.Right };
    VMove[] verticalMoves = { VMove.Top, VMove.Bottom };

    // Zero the Q-value of every (food offset, enemy offset, move) key, with each
    // offset coordinate ranging over [-Position.SIZE, Position.SIZE].
    for (int foodX = -Position.SIZE; foodX <= Position.SIZE; foodX++)
    {
        for (int foodY = -Position.SIZE; foodY <= Position.SIZE; foodY++)
        {
            Position fpos = new Position(foodX, foodY);
            for (int enemyX = -Position.SIZE; enemyX <= Position.SIZE; enemyX++)
            {
                for (int enemyY = -Position.SIZE; enemyY <= Position.SIZE; enemyY++)
                {
                    Position epos = new Position(enemyX, enemyY);
                    foreach (HMove horizontal in horizontalMoves)
                    {
                        foreach (VMove vertical in verticalMoves)
                        {
                            QTable[new QTKey(fpos, epos, new Move(horizontal, vertical))] = 0;
                        }
                    }
                }
            }
        }
    }

    for (int episode = 0; episode < Episodes; episode++)
    {
        Position agent;
        Position food;
        Position enemy;

        // Resample until the three start positions are pairwise different.
        do
        {
            agent = Position.RandPos(rnd);
            food = Position.RandPos(rnd);
            enemy = Position.RandPos(rnd);
        }
        while (food == agent || food == enemy || enemy == agent);

        double episodeReward = 0;

        for (int step = 0; step < MaxStepPerEpisode; step++)
        {
            // State before the move: food and enemy positions relative to the agent.
            var foodOffset = food - agent;
            var enemyOffset = enemy - agent;

            Move action;
            if (rnd.NextDouble() < Epsilon)
            {
                // Explore: take a random move.
                action = Move.GetRandomMove(rnd);
            }
            else
            {
                // Exploit: take the move with the best known Q-value.
                action = MoveOfMaxQValue(foodOffset, enemyOffset);
            }

            var key = new QTKey(foodOffset, enemyOffset, action);
            agent.Move((int)action.H, (int)action.V);

            // Track what happened as explicit flags instead of comparing the
            // reward against the configured constants afterwards; that
            // comparison misfires whenever two of the constants are equal.
            bool reachedFood = agent == food;
            bool hitEnemy = !reachedFood && agent == enemy;

            double reward;
            if (reachedFood)
            {
                reward = FoodReward;
            }
            else if (hitEnemy)
            {
                reward = EnemyPenalty;
            }
            else
            {
                reward = NothingPenalty;
            }

            episodeReward += reward;

            // State after the move, used for the best-future-value estimate.
            var nextFoodOffset = food - agent;
            var nextEnemyOffset = enemy - agent;
            var bestNextAction = MoveOfMaxQValue(nextFoodOffset, nextEnemyOffset);
            var maxFutureQ = QTable[new QTKey(nextFoodOffset, nextEnemyOffset, bestNextAction)];

            // Reaching the food pins the entry to FoodReward; every other step
            // blends the old value with reward + discounted best future value.
            QTable[key] = reachedFood
                ? FoodReward
                : (1 - LearningRate) * QTable[key] + LearningRate * (reward + FutureDiscount * maxFutureQ);

            // Food and enemy both end the episode.
            if (reachedFood || hitEnemy)
            {
                break;
            }
        }

        Rewards[episode] = episodeReward;
        Epsilon *= EpsilonDecay;

        if (episode % 100 == 0)
        {
            pr?.Report(episode * 100.0 / Episodes);
        }
    }
}