Example #1
        //Update eligibility traces -- Q-learning (Peng's Q(λ))
        private bool updateQTraces(Observation obs, Monopoly.RLClasses.Action a, double reward)
        {
            bool found = false;

            //Since the state space is huge we'll use a similarity function to decide whether two states are similar enough
            for (int i = 0; i < traces.Count; i++)
            {
                if (checkStateSimilarity(obs, traces[i].observation) && (!a.action.Equals(traces[i].action.action)))
                {
                    //Similar state but different action: reset and drop this trace
                    traces[i].value = 0;
                    traces.RemoveAt(i);
                    i--;
                }
                else if (checkStateSimilarity(obs, traces[i].observation) && (a.action.Equals(traces[i].action.action)))
                {
                    found = true;

                    //Similar state and same action: refresh the trace (replacing traces)
                    traces[i].value = 1;

                    //Q[t] (s,a)
                    double qT = network.Run(createInput(traces[i].observation, traces[i].action.action))[0];

                    //maxQ[t] (s[t+1],a)
                    int    act   = findMaxValues(calculateQValues(obs));
                    double maxQt = network.Run(createInput(obs, act))[0];

                    //maxQ[t] (s[t],a)
                    act = findMaxValues(calculateQValues(lastState));
                    double maxQ = network.Run(createInput(lastState, act))[0];

                    //Q[t+1] (s,a) = Q[t] (s,a) + alpha * ( traces[i].value ) * ( reward + gamma * maxQ[t] (s[t+1],a) - maxQ[t] (s[t],a) )
                    double qVal = qT + alpha * (traces[i].value) * (reward + gamma * maxQt - maxQ);

                    trainNeural(createInput(traces[i].observation, traces[i].action.action), qVal);
                }
                else
                {
                    //Otherwise decay the trace by gamma * lambda
                    traces[i].value = gamma * lamda * traces[i].value;

                    //Q[t] (s,a)
                    double qT = network.Run(createInput(traces[i].observation, traces[i].action.action))[0];

                    //maxQ[t] (s[t+1],a)
                    int    act   = findMaxValues(calculateQValues(obs));
                    double maxQt = network.Run(createInput(obs, act))[0];

                    //maxQ[t] (s[t],a)
                    act = findMaxValues(calculateQValues(lastState));
                    double maxQ = network.Run(createInput(lastState, act))[0];

                    //Q[t+1] (s,a) = Q[t] (s,a) + alpha * ( traces[i].value ) * ( reward + gamma * maxQ[t] (s[t+1],a) - maxQ[t] (s[t],a) )
                    double qVal = qT + alpha * (traces[i].value) * (reward + gamma * maxQt - maxQ);

                    trainNeural(createInput(traces[i].observation, traces[i].action.action), qVal);
                }
            }

            return(found);
        }
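
For reference, the target computed inside the loop above is a Peng-style Q(λ) update. Writing e_i for traces[i].value, it is a restatement of the code, not additional logic:

    Q_{t+1}(s_i, a_i) = Q_t(s_i, a_i) + \alpha \, e_i \big( r_{t+1} + \gamma \max_{a'} Q_t(s_{t+1}, a') - \max_{a'} Q_t(s_t, a') \big)

where the trace e_i is reset to 1 for the matching state-action pair and decayed as e_i \leftarrow \gamma \lambda \, e_i otherwise.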
Example #2
        //Sarsa algorithm
        private double Sarsa(Observation lastState, Monopoly.RLClasses.Action lastAction, Observation newState, Monopoly.RLClasses.Action newAction, double reward)
        {
            //run network for last state and last action
            double QValue = network.Run(createInput(lastState, lastAction.action)).First();
            double previousQ = QValue;

            //run network for new state and the action actually selected next (SARSA is on-policy)
            double newQ = network.Run(createInput(newState, newAction.action)).First();

            QValue += alpha * (reward + gamma * newQ - previousQ);

            return(QValue);
        }
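
The body of Sarsa above is the standard one-step SARSA update, restated here with the same symbols as the code (alpha, gamma):

    Q(s_t, a_t) \leftarrow Q(s_t, a_t) + \alpha \big( r_{t+1} + \gamma Q(s_{t+1}, a_{t+1}) - Q(s_t, a_t) \big)

Unlike the Q(λ) update in Example #1, the bootstrap term uses the action a_{t+1} that was actually selected, not the greedy maximum.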
Example #3
 //Update eligibility traces -- SARSA (stub: always returns false)
 private bool updateSTraces(Observation obs, Monopoly.RLClasses.Action a)
 {
     return(false);
 }
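
updateSTraces is left as a stub in this example. Below is a minimal sketch of what a SARSA(λ) trace update could look like, assuming the same fields and helpers as Example #1 (traces, alpha, gamma, lamda, network, checkStateSimilarity, createInput, trainNeural); the extra nextObs/nextAction parameters and the name updateSTracesSketch are hypothetical and not part of the original code.

 //Hypothetical SARSA(λ) trace update, mirroring the structure of updateQTraces in Example #1
 private bool updateSTracesSketch(Observation obs, Monopoly.RLClasses.Action a, double reward,
                                  Observation nextObs, Monopoly.RLClasses.Action nextAction)
 {
     bool found = false;

     //On-policy TD error: bootstrap on the action actually selected next, not the greedy one
     double qCurrent = network.Run(createInput(obs, a.action))[0];
     double qNext    = network.Run(createInput(nextObs, nextAction.action))[0];
     double tdError  = reward + gamma * qNext - qCurrent;

     for (int i = 0; i < traces.Count; i++)
     {
         if (checkStateSimilarity(obs, traces[i].observation) && a.action.Equals(traces[i].action.action))
         {
             found = true;
             traces[i].value = 1;                                //replacing traces
         }
         else
         {
             traces[i].value = gamma * lamda * traces[i].value;  //decay
         }

         //Move every traced state-action pair toward its trace-weighted TD target
         double qOld = network.Run(createInput(traces[i].observation, traces[i].action.action))[0];
         trainNeural(createInput(traces[i].observation, traces[i].action.action),
                     qOld + alpha * traces[i].value * tdError);
     }

     return (found);
 }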