Example #1
        /**
         * Passive reinforcement learning based on adaptive dynamic programming.
         *
         * @param percept
         *            a percept indicating the current state s' and reward signal
         *            r'.
         * @return an action
         */

        public override A execute(IPerceptStateReward <S> percept)
        {
            // if s' is new then U[s'] <- r'; R[s'] <- r'
            S      sDelta = percept.state();
            double rDelta = percept.reward();

            if (!U.ContainsKey(sDelta))
            {
                U.Put(sDelta, rDelta);
                R.Put(sDelta, rDelta);
            }
            // if s is not null then
            if (null != s)
            {
                // increment Nsa[s,a] and Ns'|sa[s',s,a]
                Pair <S, A> sa = new Pair <S, A>(s, a);
                Nsa.incrementFor(sa);
                NsDelta_sa.incrementFor(new Pair <S, Pair <S, A> >(sDelta, sa));
                // for each t such that Ns'|sa[t,s,a] is nonzero do
                foreach (S t in mdp.states())
                {
                    Pair <S, Pair <S, A> > t_sa = new Pair <S, Pair <S, A> >(t, sa);
                    if (0 != NsDelta_sa.getCount(t_sa))
                    {
                        // P(t|s,a) <- Ns'|sa[t,s,a] / Nsa[s,a]
                        P.Put(t_sa, (double)NsDelta_sa.getCount(t_sa)
                              / (double)Nsa.getCount(sa));
                    }
                }
            }
            // U <- POLICY-EVALUATION(π, U, mdp)
            U = policyEvaluation.evaluate(pi, U, mdp);
            // if s'.TERMINAL? then s,a <- null else s,a <- s', π[s']
            if (isTerminal(sDelta))
            {
                s = default(S);
                a = default(A);
            }
            else
            {
                s = sDelta;
                a = pi.Get(sDelta);
            }
            // return a
            return a;
        }
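
For reference, the transition-model step above is a maximum-likelihood estimate: each probability P(t|s,a) is simply the fraction of times action a taken in state s was observed to lead to t. Below is a minimal standalone sketch of that bookkeeping, using plain dictionaries and string states/actions instead of the library's Pair and frequency-counter types; all names in it are illustrative, not part of the AIMA code.

    using System.Collections.Generic;

    // Illustrative only: maximum-likelihood transition estimate from visit counts,
    // mirroring P(t|s,a) <- Ns'|sa[t,s,a] / Nsa[s,a] in the method above.
    static class TransitionModelSketch
    {
        // Counts of (s,a) visits and of observed transitions (s,a) -> t.
        static readonly Dictionary<(string s, string a), int> Nsa =
            new Dictionary<(string s, string a), int>();
        static readonly Dictionary<(string t, string s, string a), int> NtSa =
            new Dictionary<(string t, string s, string a), int>();

        // Record one observed step: taking a in s led to t.
        public static void Record(string s, string a, string t)
        {
            Nsa.TryGetValue((s, a), out int nSa);
            Nsa[(s, a)] = nSa + 1;
            NtSa.TryGetValue((t, s, a), out int nTsa);
            NtSa[(t, s, a)] = nTsa + 1;
        }

        // Estimated P(t | s, a): the fraction of (s,a) visits that reached t.
        public static double P(string t, string s, string a)
        {
            Nsa.TryGetValue((s, a), out int nSa);
            if (nSa == 0) return 0.0;
            NtSa.TryGetValue((t, s, a), out int nTsa);
            return (double)nTsa / nSa;
        }
    }

For example, after Record("s1", "up", "s2") twice and Record("s1", "up", "s3") once, P("s2", "s1", "up") returns 2/3.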
Example #2
        /**
         * An exploratory Q-learning agent. It is an active learner that learns the
         * value Q(s,a) of each action in each situation. It uses the same
         * exploration function f as the exploratory ADP agent, but avoids having to
         * learn the transition model because the Q-value of a state can be related
         * directly to those of its neighbors.
         *
         * @param percept
         *            a percept indicating the current state s' and reward signal
         *            r'.
         * @return an action
         */

        public override A execute(IPerceptStateReward <S> percept)
        {
            S      sPrime = percept.state();
            double rPrime = percept.reward();

            // if TERMINAL?(s') then Q[s',None] <- r'
            if (isTerminal(sPrime))
            {
                Q.Put(new Pair <S, A>(sPrime, noneAction), rPrime);
            }

            // if s is not null then
            if (null != s)
            {
                // increment Nsa[s,a]
                Pair <S, A> sa = new Pair <S, A>(s, a);
                Nsa.incrementFor(sa);
                // Q[s,a] <- Q[s,a] + α(Nsa[s,a])(r + γ max_a' Q[s',a'] - Q[s,a])
                double Q_sa = 0D;
                if (Q.ContainsKey(sa))
                {
                    Q_sa = Q.Get(sa);
                }
                Q.Put(sa, Q_sa + alpha(Nsa, s, a) * (r.Value + gamma * maxAPrime(sPrime) - Q_sa));
            }
            // if s'.TERMINAL? then s,a,r <- null else
            // s,a,r <- s', argmax_a' f(Q[s',a'], Nsa[s',a']), r'
            if (isTerminal(sPrime))
            {
                s = default(S);
                a = default(A);
                r = null;
            }
            else
            {
                s = sPrime;
                a = argmaxAPrime(sPrime);
                r = rPrime;
            }

            // return a
            return a;
        }
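
The heart of this agent is the single tabular backup Q[s,a] <- Q[s,a] + α(Nsa[s,a])(r + γ max_a' Q[s',a'] - Q[s,a]). Below is a minimal standalone sketch of that update, assuming a fixed learning rate in place of the count-based α and frequency counter used above; the class and parameter names are illustrative, not part of the AIMA code.

    using System.Collections.Generic;
    using System.Linq;

    // Illustrative only: one tabular Q-learning backup with a fixed learning rate.
    static class QLearningSketch
    {
        static readonly Dictionary<(string s, string a), double> Q =
            new Dictionary<(string s, string a), double>();

        // Apply one backup for the observed transition: taking a in s yielded reward r and state sPrime.
        public static void Update(string s, string a, double r, string sPrime,
                                  IEnumerable<string> actions,
                                  double alpha = 0.1, double gamma = 0.9)
        {
            Q.TryGetValue((s, a), out double qSa);
            // max over a' of Q[s',a']; unseen entries default to 0.
            double maxNext = actions
                .Select(aPrime => { Q.TryGetValue((sPrime, aPrime), out double q); return q; })
                .DefaultIfEmpty(0.0)
                .Max();
            Q[(s, a)] = qSa + alpha * (r + gamma * maxNext - qSa);
        }
    }

Because the update bootstraps from max_a' Q[s',a'], no transition model P(t|s,a) needs to be learned, which is exactly the point made in the doc comment above.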
Example #3
        /**
         * Passive reinforcement learning that learns utility estimates using
         * temporal differences.
         *
         * @param percept
         *            a percept indicating the current state s' and reward signal
         *            r'.
         * @return an action
         */

        public override A execute(IPerceptStateReward <S> percept)
        {
            // if s' is new then U[s'] <- r'
            S      sDelta = percept.state();
            double rDelta = percept.reward();

            if (!U.ContainsKey(sDelta))
            {
                U.Put(sDelta, rDelta);
            }
            // if s is not null then
            if (null != s)
            {
                // increment Ns[s]
                Ns.incrementFor(s);
                // U[s] <- U[s] + α(Ns[s])(r + γ U[s'] - U[s])
                double U_s = U.Get(s);
                U.Put(s, U_s + alpha(Ns, s) * (r.Value + gamma * U.Get(sDelta) - U_s));
            }
            // if s'.TERMINAL? then s,a,r <- null else s,a,r <- s', π[s'], r'
            if (isTerminal(sDelta))
            {
                s = default(S);
                a = default(A);
                r = null;
            }
            else
            {
                s = sDelta;
                a = pi.Get(sDelta);
                r = rDelta;
            }

            // return a
            return a;
        }
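
The update performed above is the TD(0) rule U[s] <- U[s] + α(Ns[s])(r + γ U[s'] - U[s]), which nudges the utility of the previous state toward the observed reward plus the discounted utility of the successor. Below is a minimal standalone sketch using a simple 1/N(s) learning-rate schedule; the helper names are illustrative, not part of the AIMA code.

    using System.Collections.Generic;

    // Illustrative only: one TD(0) utility backup with alpha = 1 / N(s).
    static class TdUtilitySketch
    {
        static readonly Dictionary<string, double> U = new Dictionary<string, double>();
        static readonly Dictionary<string, int> Ns = new Dictionary<string, int>();

        // Apply one backup for the observed step: leaving s earned reward r and reached sPrime.
        public static void Update(string s, double r, string sPrime, double gamma = 0.9)
        {
            Ns.TryGetValue(s, out int n);
            Ns[s] = ++n;
            double alpha = 1.0 / n;                    // learning rate decays with visit count
            U.TryGetValue(s, out double uS);
            U.TryGetValue(sPrime, out double uSPrime); // unseen states default to 0
            U[s] = uS + alpha * (r + gamma * uSPrime - uS);
        }
    }
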
Example #4
 /**
  * Map the given percept to an Agent action.
  *
  * @param percept
  *            a percept indicating the current state s' and reward signal r'
  * @return the action to take.
  */
 public abstract A execute(IPerceptStateReward <S> percept);
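
Callers drive any of these agents with a perceive-act loop: each step the environment supplies a percept carrying s' and r', and execute returns the next action (or the default/null value once a terminal state is reached, as in the examples above). Below is a hedged sketch of such a loop; IStepEnvironment, AgentDriverSketch, and RunEpisode are hypothetical stand-ins, and only IPerceptStateReward<S> comes from the examples above (its namespace import is assumed).

    using System;

    // Hypothetical environment abstraction, not an AIMA interface.
    public interface IStepEnvironment<S, A>
    {
        IPerceptStateReward<S> CurrentPercept();   // supplies the current s' and r'
        void Apply(A action);                      // advances the environment by one step
        bool IsDone();
    }

    public static class AgentDriverSketch
    {
        // executeAgent would typically be the agent's execute method.
        public static void RunEpisode<S, A>(IStepEnvironment<S, A> env,
                                            Func<IPerceptStateReward<S>, A> executeAgent)
        {
            while (!env.IsDone())
            {
                // The agent sees the latest state/reward percept and replies with an action.
                A action = executeAgent(env.CurrentPercept());
                if (action == null)                // terminal: the agent returned no action
                {
                    break;
                }
                env.Apply(action);
            }
        }
    }
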