private void incrementStateActionCount(STATE_TYPE state, ACTION_TYPE action)
{
    // Record one more observation of this (state, action) pair.
    Pair<STATE_TYPE, ACTION_TYPE> stateActionPair =
        new Pair<STATE_TYPE, ACTION_TYPE>(state, action);
    stateActionCount.incrementFor(stateActionPair);
}
public override ACTION_TYPE decideAction(MDPPerception<STATE_TYPE> perception)
{
    // If the perceived state is new, seed its utility and reward
    // with the observed reward signal.
    if (!utilityFunction.hasUtilityFor(perception.getState()))
    {
        utilityFunction.setUtility(perception.getState(), perception.getReward());
        mdp.setReward(perception.getState(), perception.getReward());
    }
    if (previousState != null)
    {
        stateCount.incrementFor(previousState);
        utilityFunction = updateUtilityFunction(1.0);
    }
    if (mdp.isTerminalState(currentState))
    {
        // End of trial: clear the previous step so the next trial starts fresh.
        previousState = default(STATE_TYPE);
        previousAction = default(ACTION_TYPE);
        previousReward = double.MinValue;
    }
    else
    {
        previousState = currentState;
        previousAction = policy.getAction(currentState);
        previousReward = currentReward;
    }
    return previousAction;
}
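// A minimal driver sketch showing how decideAction is meant to be called
// once per percept until a terminal state ends the trial. Everything here
// except decideAction itself is assumed plumbing: runSingleTrial,
// getInitialPerception and executeAction are hypothetical names used for
// illustration, not this repo's API.
public void runSingleTrial(PassiveADPAgent<CellWorldPosition, String> agent,
                           MDP<CellWorldPosition, String> mdp)
{
    MDPPerception<CellWorldPosition> perception = mdp.getInitialPerception(); // hypothetical
    while (!mdp.isTerminalState(perception.getState()))
    {
        String action = agent.decideAction(perception);
        perception = mdp.executeAction(perception.getState(), action); // hypothetical
    }
    agent.decideAction(perception); // lets the agent absorb the terminal reward
}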
/**
 * Passive reinforcement learning based on adaptive dynamic programming.
 *
 * @param percept
 *            a percept indicating the current state s' and reward signal r'.
 * @return an action
 */
public override A execute(IPerceptStateReward<S> percept)
{
    // if s' is new then U[s'] <- r'; R[s'] <- r'
    S sDelta = percept.state();
    double rDelta = percept.reward();
    if (!U.ContainsKey(sDelta))
    {
        U.Put(sDelta, rDelta);
        R.Put(sDelta, rDelta);
    }
    // if s is not null then
    if (s != null)
    {
        // increment N_sa[s,a] and N_s'|sa[s',s,a]
        Pair<S, A> sa = new Pair<S, A>(s, a);
        Nsa.incrementFor(sa);
        NsDelta_sa.incrementFor(new Pair<S, Pair<S, A>>(sDelta, sa));
        // for each t such that N_s'|sa[t,s,a] is nonzero do
        foreach (S t in mdp.states())
        {
            Pair<S, Pair<S, A>> t_sa = new Pair<S, Pair<S, A>>(t, sa);
            if (0 != NsDelta_sa.getCount(t_sa))
            {
                // P(t|s,a) <- N_s'|sa[t,s,a] / N_sa[s,a]
                P.Put(t_sa, (double)NsDelta_sa.getCount(t_sa)
                        / (double)Nsa.getCount(sa));
            }
        }
    }
    // U <- POLICY-EVALUATION(π, U, mdp)
    U = policyEvaluation.evaluate(pi, U, mdp);
    // if s'.TERMINAL? then s,a <- null else s,a <- s',π[s']
    if (isTerminal(sDelta))
    {
        s = default(S);
        a = default(A);
    }
    else
    {
        s = sDelta;
        a = pi.Get(sDelta);
    }
    // return a
    return a;
}
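// Why the counting loop above works: it maintains a maximum-likelihood
// estimate of the transition model, P(t|s,a) = N_s'|sa[t,s,a] / N_sa[s,a].
// A self-contained sketch of the same bookkeeping with plain dictionaries
// (all names here are illustrative; this is not the repo's FrequencyCounter
// API): after seeing action Right from cell A lead to B twice and stay at A
// once, the estimates come out as 2/3 and 1/3.
using System;
using System.Collections.Generic;

class TransitionEstimateDemo
{
    static void Main()
    {
        var nSa  = new Dictionary<string, int>(); // N_sa[s,a]
        var nTsa = new Dictionary<string, int>(); // N_s'|sa[t,s,a]

        void Observe(string s, string a, string t)
        {
            string sa = s + "|" + a;
            nSa[sa] = nSa.TryGetValue(sa, out int c) ? c + 1 : 1;
            string tsa = t + "|" + sa;
            nTsa[tsa] = nTsa.TryGetValue(tsa, out int d) ? d + 1 : 1;
        }

        Observe("A", "Right", "B");
        Observe("A", "Right", "B");
        Observe("A", "Right", "A"); // the move slipped

        Console.WriteLine((double)nTsa["B|A|Right"] / nSa["A|Right"]); // 0.666...
        Console.WriteLine((double)nTsa["A|A|Right"] / nSa["A|Right"]); // 0.333...
    }
}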
/**
 * An exploratory Q-learning agent. It is an active learner that learns the
 * value Q(s,a) of each action in each situation. It uses the same
 * exploration function f as the exploratory ADP agent, but avoids having to
 * learn the transition model because the Q-value of a state can be related
 * directly to those of its neighbors.
 *
 * @param percept
 *            a percept indicating the current state s' and reward signal r'.
 * @return an action
 */
public override A execute(IPerceptStateReward<S> percept)
{
    S sPrime = percept.state();
    double rPrime = percept.reward();
    // if TERMINAL?(s') then Q[s',None] <- r'
    if (isTerminal(sPrime))
    {
        Q.Put(new Pair<S, A>(sPrime, noneAction), rPrime);
    }
    // if s is not null then
    if (s != null)
    {
        // increment N_sa[s,a]
        Pair<S, A> sa = new Pair<S, A>(s, a);
        Nsa.incrementFor(sa);
        // Q[s,a] <- Q[s,a] + α(N_sa[s,a])(r + γ max_a' Q[s',a'] - Q[s,a])
        double Q_sa = 0D;
        if (Q.ContainsKey(sa))
        {
            Q_sa = Q.Get(sa);
        }
        Q.Put(sa, Q_sa + alpha(Nsa, s, a)
                * (r.Value + gamma * maxAPrime(sPrime) - Q_sa));
    }
    // if s'.TERMINAL? then s,a,r <- null else
    // s,a,r <- s', argmax_a' f(Q[s',a'], N_sa[s',a']), r'
    if (isTerminal(sPrime))
    {
        s = default(S);
        a = default(A);
        r = null;
    }
    else
    {
        s = sPrime;
        a = argmaxAPrime(sPrime);
        r = rPrime;
    }
    // return a
    return a;
}
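// The method above leans on two helpers defined elsewhere in the class: a
// learning-rate schedule alpha(Nsa, s, a), and the exploration function f
// that argmaxAPrime applies to each candidate action. A sketch of both,
// following the textbook definitions; R_PLUS and N_E are assumed tuning
// constants, and these exact member names are illustrative, not the repo's:
private const double R_PLUS = 2.0; // assumed optimistic estimate of the best reachable reward
private const int N_E = 5;         // assumed minimum number of tries per (s,a)

private double f(double u, int n)
{
    // Optimism under uncertainty: an action tried fewer than N_E times is
    // treated as if it were worth R_PLUS, which drives systematic exploration.
    return (n < N_E) ? R_PLUS : u;
}

private double alphaSketch(int visitCount)
{
    // A decaying step size that satisfies the usual convergence conditions;
    // the book's grid-world example uses 60/(59+n).
    return 60.0 / (59.0 + visitCount);
}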
/**
 * Passive reinforcement learning that learns utility estimates using
 * temporal differences.
 *
 * @param percept
 *            a percept indicating the current state s' and reward signal r'.
 * @return an action
 */
public override A execute(IPerceptStateReward<S> percept)
{
    // if s' is new then U[s'] <- r'
    S sDelta = percept.state();
    double rDelta = percept.reward();
    if (!U.ContainsKey(sDelta))
    {
        U.Put(sDelta, rDelta);
    }
    // if s is not null then
    if (s != null)
    {
        // increment N_s[s]
        Ns.incrementFor(s);
        // U[s] <- U[s] + α(N_s[s])(r + γU[s'] - U[s])
        double U_s = U.Get(s);
        U.Put(s, U_s + alpha(Ns, s)
                * (r.Value + gamma * U.Get(sDelta) - U_s));
    }
    // if s'.TERMINAL? then s,a,r <- null else s,a,r <- s',π[s'],r'
    if (isTerminal(sDelta))
    {
        s = default(S);
        a = default(A);
        r = null;
    }
    else
    {
        s = sDelta;
        a = pi.Get(sDelta);
        r = rDelta;
    }
    // return a
    return a;
}
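// One concrete TD step with illustrative numbers (nothing here comes from
// the repo): take gamma = 1.0, a fixed alpha of 0.1, U[s] = 0.5,
// U[s'] = 0.7 and step reward r = -0.04. The update
// U[s] <- U[s] + alpha * (r + gamma * U[s'] - U[s]) yields
// 0.5 + 0.1 * (-0.04 + 0.7 - 0.5) = 0.516. A self-contained check:
using System;

class TdStepDemo
{
    static void Main()
    {
        double gamma = 1.0, alpha = 0.1;
        double U_s = 0.5, U_sPrime = 0.7, reward = -0.04;
        double updated = U_s + alpha * (reward + gamma * U_sPrime - U_s);
        Console.WriteLine(updated); // 0.516
    }
}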