Beispiel #1
0
        /// <summary>
        /// Looks up the Q-table entry of every candidate move for the given pair of
        /// positions and returns the move with the largest entry.
        /// </summary>
        /// <param name="fpos">First position component of the Q-table key (callers in this file pass the food position relative to the agent).</param>
        /// <param name="epos">Second position component of the Q-table key (callers in this file pass the enemy position relative to the agent).</param>
        /// <returns>
        /// The candidate whose entry is strictly greater than all entries seen before it, so the
        /// earliest candidate wins ties. <c>default</c> is returned when no entry exceeds
        /// <c>-double.MaxValue</c>.
        /// </returns>
        private Move MoveOfMaxQValue(Position fpos, Position epos)
        {
            const int ChoicesPerAxis = 2;

            Move   bestMove  = default;
            double bestValue = -double.MaxValue;

            // One flat pass over the 2x2 grid of (horizontal, vertical) choices.
            // The horizontal index changes slowest, so candidates are visited as
            // (0,0), (0,1), (1,0), (1,1).
            for (int combo = 0; combo < ChoicesPerAxis * ChoicesPerAxis; combo++)
            {
                HMove horizontal = (HMove)(combo / ChoicesPerAxis);
                VMove vertical   = (VMove)(combo % ChoicesPerAxis);

                Move candidate      = new Move(horizontal, vertical);
                var  candidateValue = QTable[new QTKey(fpos, epos, candidate)];

                if (candidateValue > bestValue)
                {
                    bestValue = candidateValue;
                    bestMove  = candidate;
                }
            }

            return bestMove;
        }
Beispiel #2
0
        /// <summary>
        /// Trains the Q-table. Every entry for relative positions in
        /// [-Position.SIZE, Position.SIZE] is first reset to zero, then <c>Episodes</c>
        /// episodes of epsilon-greedy Q-learning are played.
        /// </summary>
        /// <remarks>
        /// Side effects: overwrites <c>QTable</c>, stores each episode's accumulated reward in
        /// <c>Rewards[i]</c>, and multiplies <c>Epsilon</c> by <c>EpsilonDecay</c> after every episode.
        /// </remarks>
        /// <param name="pr">
        /// Receives the completion percentage once every 100 episodes. May be <c>null</c>, in which
        /// case no progress is reported.
        /// </param>
        public void Start(IProgress<double> pr)
        {
            // Default seeding. The previous DateTime.Now.Millisecond seed could only take
            // 1000 distinct values, which capped the number of distinct training runs.
            Random rnd = new Random();

            // Reset the table: one entry per (food offset, enemy offset, move) combination.
            for (int i = -Position.SIZE; i <= Position.SIZE; i++)
            {
                for (int j = -Position.SIZE; j <= Position.SIZE; j++)
                {
                    Position fpos = new Position(i, j);
                    for (int k = -Position.SIZE; k <= Position.SIZE; k++)
                    {
                        for (int l = -Position.SIZE; l <= Position.SIZE; l++)
                        {
                            Position epos = new Position(k, l);
                            QTable[new QTKey(fpos, epos, new Move(HMove.Left, VMove.Top))]    = 0;
                            QTable[new QTKey(fpos, epos, new Move(HMove.Left, VMove.Bottom))] = 0;

                            QTable[new QTKey(fpos, epos, new Move(HMove.Right, VMove.Top))]    = 0;
                            QTable[new QTKey(fpos, epos, new Move(HMove.Right, VMove.Bottom))] = 0;
                        }
                    }
                }
            }

            for (int i = 0; i < Episodes; i++)
            {
                Position agent;
                Position food;
                Position enemy;

                // Redraw all three positions until no two of them coincide.
                do
                {
                    agent = Position.RandPos(rnd);
                    food  = Position.RandPos(rnd);
                    enemy = Position.RandPos(rnd);
                }
                while (food == agent || food == enemy || enemy == agent);

                double ep_reward = 0;

                for (int j = 0; j < MaxStepPerEpisode; j++)
                {
                    // The state is expressed relative to the agent.
                    var  relative_posf = food - agent;
                    var  relative_pose = enemy - agent;
                    Move action;

                    if (rnd.NextDouble() < Epsilon) // explore
                    {
                        action = Move.GetRandomMove(rnd);
                    }
                    else // exploit
                    {
                        action = MoveOfMaxQValue(relative_posf, relative_pose);
                    }

                    // Key of the (state, action) pair being updated; built before the agent moves.
                    var key = new QTKey(relative_posf, relative_pose, action);
                    agent.Move((int)action.H, (int)action.V);

                    // Track the outcome explicitly instead of recovering it later by comparing
                    // the reward value: floating-point equality on rewards misclassifies the
                    // step whenever two reward constants happen to share a value.
                    bool reachedFood = agent == food;
                    bool hitEnemy    = !reachedFood && agent == enemy;

                    double reward;
                    if (reachedFood)
                    {
                        reward = FoodReward;
                    }
                    else if (hitEnemy)
                    {
                        reward = EnemyPenalty;
                    }
                    else
                    {
                        reward = NothingPenalty;
                    }

                    ep_reward += reward;

                    if (reachedFood)
                    {
                        // Reaching the food pins the entry to the food reward.
                        QTable[key] = FoodReward;
                    }
                    else
                    {
                        // Blend the old estimate with the reward plus the discounted best
                        // value available from the state reached after the move.
                        relative_posf = food - agent;
                        relative_pose = enemy - agent;

                        var max_q_action = MoveOfMaxQValue(relative_posf, relative_pose);
                        var max_future_q = QTable[new QTKey(relative_posf, relative_pose, max_q_action)];

                        QTable[key] = (1 - LearningRate) * QTable[key] + LearningRate * (reward + FutureDiscount * max_future_q);
                    }

                    // The episode ends as soon as the agent lands on the food or the enemy.
                    if (reachedFood || hitEnemy)
                    {
                        break;
                    }
                }

                Rewards[i] = ep_reward;
                Epsilon   *= EpsilonDecay;
                if (i % 100 == 0)
                {
                    pr?.Report(i * 100.0 / Episodes);
                }
            }
        }