Beispiel #1
        /// <summary>
        /// Passive reinforcement learning based on adaptive dynamic programming.
        /// </summary>
        /// <param name="percept">
        /// a percept indicating the current state s' and reward signal r'.
        /// </param>
        /// <returns>an action</returns>

        public override A execute(IPerceptStateReward <S> percept)
        {
            // One step of passive ADP learning: fold the percept (s', r') into the
            // utility, reward and transition estimates, re-evaluate U under the fixed
            // policy pi, then remember and return the action pi prescribes for s'
            // (default(A) once s' is terminal).

            // if s' is new then U[s'] <- r'; R[s'] <- r'
            S      sDelta = percept.state();
            double rDelta = percept.reward();

            if (!U.ContainsKey(sDelta))
            {
                U.Put(sDelta, rDelta);
                R.Put(sDelta, rDelta);
            }

            // if s is not null then
            if (null != s)
            {
                // increment N_sa[s,a] and N_s'|sa[s',s,a]
                Pair <S, A> sa = new Pair <S, A>(s, a);
                // N_sa[s,a] is the denominator of P(t|s,a) below, so it has to be
                // counted here together with the outcome count.
                Nsa.incrementFor(sa);
                NsDelta_sa.incrementFor(new Pair <S, Pair <S, A> >(sDelta, sa));

                // for each t such that N_s'|sa[t,s,a] is nonzero do
                foreach (S t in mdp.states())
                {
                    Pair <S, Pair <S, A> > t_sa = new Pair <S, Pair <S, A> >(t, sa);
                    if (0 != NsDelta_sa.getCount(t_sa))
                    {
                        // P(t|s,a) <- N_s'|sa[t,s,a] / N_sa[s,a]
                        P.Put(t_sa, (double)NsDelta_sa.getCount(t_sa)
                              / (double)Nsa.getCount(sa));
                    }
                }
            }

            // U <- POLICY-EVALUATION(pi, U, mdp)
            U = policyEvaluation.evaluate(pi, U, mdp);

            // if s'.TERMINAL? then s,a <- null else s,a <- s',pi[s']
            if (isTerminal(sDelta))
            {
                s = default(S);
                a = default(A);
            }
            else
            {
                s = sDelta;
                a = pi.Get(sDelta);
            }

            // return a
            return a;
        }
Beispiel #2
        /// <summary>
        /// Computes argmax over a' of f(Q[s',a'], N_sa[s',a']): the action available
        /// in <paramref name="sPrime"/> whose exploration value is largest.
        /// </summary>
        /// <param name="sPrime">the state s' whose available actions are compared.</param>
        /// <returns>
        /// the first action whose exploration value is strictly greater than every
        /// value seen before it; default(A) when no action's value exceeds negative
        /// infinity (for example when there are no actions for s').
        /// </returns>
        private A argmaxAPrime(S sPrime)
        {
            A      a   = default(A);
            double max = double.NegativeInfinity;

            foreach (A aPrime in actionsFunction.actions(sPrime))
            {
                Pair <S, A> sPrimeAPrime     = new Pair <S, A>(sPrime, aPrime);
                double      explorationValue = f(Q.Get(sPrimeAPrime), Nsa.getCount(sPrimeAPrime));
                // Strict comparison: on ties the earlier action is kept.
                if (explorationValue > max)
                {
                    max = explorationValue;
                    a   = aPrime;
                }
            }

            return a;
        }