/**
 * Passive reinforcement learning based on adaptive dynamic programming.
 *
 * @param percept
 *            a percept indicating the current state s' and reward signal r'.
 * @return an action
 */
public override A execute(IPerceptStateReward<S> percept)
{
    // if s' is new then U[s'] <- r'; R[s'] <- r'
    S sDelta = percept.state();
    double rDelta = percept.reward();
    if (!U.ContainsKey(sDelta))
    {
        U.Put(sDelta, rDelta);
        R.Put(sDelta, rDelta);
    }
    // if s is not null then
    if (null != s)
    {
        // increment N<sub>sa</sub>[s,a] and N<sub>s'|sa</sub>[s',s,a]
        Pair<S, A> sa = new Pair<S, A>(s, a);
        Nsa.incrementFor(sa);
        NsDelta_sa.incrementFor(new Pair<S, Pair<S, A>>(sDelta, sa));
        // for each t such that N<sub>s'|sa</sub>[t,s,a] is nonzero do
        foreach (S t in mdp.states())
        {
            Pair<S, Pair<S, A>> t_sa = new Pair<S, Pair<S, A>>(t, sa);
            if (0 != NsDelta_sa.getCount(t_sa))
            {
                // P(t|s,a) <- N<sub>s'|sa</sub>[t,s,a] / N<sub>sa</sub>[s,a]
                P.Put(t_sa, (double) NsDelta_sa.getCount(t_sa)
                            / (double) Nsa.getCount(sa));
            }
        }
    }
    // U <- POLICY-EVALUATION(π, U, mdp)
    U = policyEvaluation.evaluate(pi, U, mdp);
    // if s'.TERMINAL? then s,a <- null else s,a <- s',π[s']
    if (isTerminal(sDelta))
    {
        s = default(S);
        a = default(A);
    }
    else
    {
        s = sDelta;
        a = pi.Get(sDelta);
    }
    // return a
    return a;
}
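
For reference, the transition-model update performed in the loop above is the maximum-likelihood estimate P(t|s,a) = N<sub>s'|sa</sub>[t,s,a] / N<sub>sa</sub>[s,a]. The following is a minimal, self-contained sketch of that bookkeeping, independent of the library's FrequencyCounter and Pair types; the class and member names here (TransitionModelSketch, Observe, Probability) are illustrative assumptions, not part of the code base above.

using System;
using System.Collections.Generic;

// Illustrative sketch only: maximum-likelihood transition counts,
// mirroring the roles of Nsa, NsDelta_sa and P in the agent above.
public class TransitionModelSketch
{
    // N_sa[s,a]: how often action a was executed in state s.
    private readonly Dictionary<(string s, string a), int> nsa
        = new Dictionary<(string s, string a), int>();

    // N_s'|sa[t,s,a]: how often (s,a) was observed to lead to outcome state t.
    private readonly Dictionary<(string t, string s, string a), int> nOutcome
        = new Dictionary<(string t, string s, string a), int>();

    // Record one observed transition s --a--> t.
    public void Observe(string s, string a, string t)
    {
        nsa[(s, a)] = Count(nsa, (s, a)) + 1;
        nOutcome[(t, s, a)] = Count(nOutcome, (t, s, a)) + 1;
    }

    // P(t|s,a) <- N_s'|sa[t,s,a] / N_sa[s,a]
    public double Probability(string t, string s, string a)
    {
        int denominator = Count(nsa, (s, a));
        return denominator == 0
            ? 0.0
            : (double) Count(nOutcome, (t, s, a)) / denominator;
    }

    private static int Count<K>(Dictionary<K, int> counts, K key)
    {
        int c;
        return counts.TryGetValue(key, out c) ? c : 0;
    }
}

Once these counts have been refreshed, the agent hands the learned model to policy evaluation with the fixed policy π, which repeatedly applies the simplified Bellman update U(s) <- R(s) + γ Σ<sub>t</sub> P(t|s,π(s)) U(t) until the utilities settle.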