public void testValueIterationInCellWorld() {
    // Run value iteration until the maximum utility growth falls below the
    // error margin, then check the resulting utilities against AIMA2e Fig 17.3.
    MDPUtilityFunction<CellWorldPosition> uf = fourByThreeMDP
            .valueIterationTillMAximumUtilityGrowthFallsBelowErrorMargin(1, 0.00001);
    Assert.AreEqual(0.705, uf.getUtility(new CellWorldPosition(1, 1)), 0.001);
    Assert.AreEqual(0.655, uf.getUtility(new CellWorldPosition(1, 2)), 0.001);
    Assert.AreEqual(0.611, uf.getUtility(new CellWorldPosition(1, 3)), 0.001);
    Assert.AreEqual(0.388, uf.getUtility(new CellWorldPosition(1, 4)), 0.001);
    Assert.AreEqual(0.762, uf.getUtility(new CellWorldPosition(2, 1)), 0.001);
    Assert.AreEqual(0.660, uf.getUtility(new CellWorldPosition(2, 3)), 0.001);
    Assert.AreEqual(-1.0, uf.getUtility(new CellWorldPosition(2, 4)), 0.001);
    Assert.AreEqual(0.812, uf.getUtility(new CellWorldPosition(3, 1)), 0.001);
    Assert.AreEqual(0.868, uf.getUtility(new CellWorldPosition(3, 2)), 0.001);
    Assert.AreEqual(0.918, uf.getUtility(new CellWorldPosition(3, 3)), 0.001);
    Assert.AreEqual(1.0, uf.getUtility(new CellWorldPosition(3, 4)), 0.001);
    // NOTE: the original repeated the (3, 2) assertion here; the duplicate
    // was redundant and has been removed.
}
public override TAction DecideAction(MDPPerception<TState> perception) {
    // First sighting of this state: seed both the utility function and the
    // reward model with the perceived reward.
    TState observed = perception.GetState();
    if (!utilityFunction.HasUtilityFor(observed)) {
        utilityFunction.SetUtility(observed, perception.GetReward());
        MDP.SetReward(observed, perception.GetReward());
    }

    // The temporal-difference update only applies once a predecessor exists.
    if (PreviousState != null) {
        stateCount.IncrementFor(PreviousState);
        utilityFunction = this.UpdateUtilityFunction(1.0);
    }

    if (MDP.IsTerminalState(CurrentState)) {
        // End of trial: clear the trailing state/action pair.
        PreviousState = null;
        PreviousAction = null;
        //TODO: make sure that 0 is appropriate value for what used to be null in java
        previousReward = 0;
    } else {
        PreviousState = CurrentState;
        PreviousAction = policy.GetAction(CurrentState);
        previousReward = CurrentReward;
    }
    return PreviousAction;
}
public PassiveTDAgent(MDP<STATE_TYPE, ACTION_TYPE> mdp,
        MDPPolicy<STATE_TYPE, ACTION_TYPE> policy)
    : base(mdp.emptyMdp()) {
    // The agent follows the fixed policy; its utility estimates and
    // per-state visit counts start out empty.
    this.policy = policy;
    this.utilityFunction = new MDPUtilityFunction<STATE_TYPE>();
    this.stateCount = new FrequencyCounter<STATE_TYPE>();
}
public PassiveTDAgent(MDP<TState, TAction> mdp,
        MDPPolicy<TState, TAction> policy)
    : base(mdp.EmptyMdp()) {
    // The agent follows the fixed policy; its utility estimates and
    // per-state visit counts start out empty.
    this.policy = policy;
    this.utilityFunction = new MDPUtilityFunction<TState>();
    this.stateCount = new FrequencyCounter<TState>();
}
public override ACTION_TYPE decideAction(MDPPerception<STATE_TYPE> perception) {
    STATE_TYPE perceivedState = perception.getState();
    double perceivedReward = perception.getReward();

    // Unseen state: initialise its utility and reward from the percept.
    if (!utilityFunction.hasUtilityFor(perceivedState)) {
        utilityFunction.setUtility(perceivedState, perceivedReward);
        mdp.setReward(perceivedState, perceivedReward);
    }

    // Apply the temporal-difference update once there is a predecessor.
    if (previousState != null) {
        stateCount.incrementFor(previousState);
        utilityFunction = updateUtilityFunction(1.0);
    }

    if (mdp.isTerminalState(currentState)) {
        // Trial is over; reset the trailing (s, a, r) triple.
        previousState = default(STATE_TYPE);
        previousAction = default(ACTION_TYPE);
        previousReward = double.MinValue; // sentinel for "no previous reward"
    } else {
        previousState = currentState;
        previousAction = policy.getAction(currentState);
        previousReward = currentReward;
    }
    return previousAction;
}
public PassiveADPAgent(MDP<STATE_TYPE, ACTION_TYPE> mdp,
        MDPPolicy<STATE_TYPE, ACTION_TYPE> policy)
    : base(mdp.emptyMdp()) {
    // nsa counts visits to each (state, action) pair; nsasdash counts each
    // observed (state, action, successor) transition.
    this.policy = policy;
    this.utilityFunction = new MDPUtilityFunction<STATE_TYPE>();
    this.nsa = new Dictionary<Pair<STATE_TYPE, ACTION_TYPE>, Double>();
    this.nsasdash = new Dictionary<MDPTransition<STATE_TYPE, ACTION_TYPE>, Double>();
}
public PassiveADPAgent(MDP<TState, TAction> mdp,
        MDPPolicy<TState, TAction> policy)
    : base(mdp.EmptyMdp()) {
    // nsa counts visits to each (state, action) pair; nsasdash counts each
    // observed (state, action, successor) transition.
    this.policy = policy;
    this.utilityFunction = new MDPUtilityFunction<TState>();
    this.nsa = new Dictionary<Pair<TState, TAction>, double>();
    this.nsasdash = new Dictionary<MDPTransition<TState, TAction>, double>();
}
public override TAction DecideAction(MDPPerception<TState> perception) {
    // First sighting of this state: seed the utility function and the
    // reward model with the perceived reward.
    if (!utilityFunction.HasUtilityFor(perception.GetState())) {
        utilityFunction.SetUtility(perception.GetState(), perception.GetReward());
        MDP.SetReward(perception.GetState(), perception.GetReward());
    }
    if (PreviousState != null) {
        // Build each dictionary key once instead of re-constructing the same
        // Pair/MDPTransition for every lookup (matches the STATE_TYPE variant
        // of this method elsewhere in the file).
        Pair<TState, TAction> stateAction =
            new Pair<TState, TAction>(PreviousState, PreviousAction);
        if (nsa.ContainsKey(stateAction)) {
            nsa[stateAction] += 1;
        } else {
            nsa[stateAction] = 1.0;
        }
        MDPTransition<TState, TAction> observedTransition =
            new MDPTransition<TState, TAction>(PreviousState, PreviousAction, CurrentState);
        if (nsasdash.ContainsKey(observedTransition)) {
            nsasdash[observedTransition] += 1;
        } else {
            nsasdash[observedTransition] = 1.0;
        }
        // Re-estimate every observed transition probability from frequencies:
        // P(s'|s,a) = N(s,a,s') / N(s,a).
        foreach (MDPTransition<TState, TAction> transition in nsasdash.Keys) {
            if (nsasdash[transition] != 0.0) {
                double newValue = nsasdash[transition]
                        / nsa[new Pair<TState, TAction>(
                                transition.GetInitialState(), transition.GetAction())];
                MDP.SetTransitionProbability(transition, newValue);
            }
        }
        // Re-derive the utility of the previous state under the fixed policy.
        IList<MDPTransition<TState, TAction>> validTransitions = MDP
                .GetTransitionsWith(PreviousState, policy.GetAction(PreviousState));
        utilityFunction = this.ValueDetermination(validTransitions, 1);
    }
    if (MDP.IsTerminalState(CurrentState)) {
        PreviousState = null;
        PreviousAction = null;
    } else {
        PreviousState = CurrentState;
        PreviousAction = policy.GetAction(CurrentState);
    }
    return PreviousAction;
}
public void testPolicyEvaluation() {
    // A few rounds of simplified policy evaluation must actually move the
    // utilities away from their initial values.
    MDPPolicy<CellWorldPosition, String> randomPolicy = fourByThreeMDP.randomPolicy();
    MDPUtilityFunction<CellWorldPosition> before = fourByThreeMDP.initialUtilityFunction();
    MDPUtilityFunction<CellWorldPosition> after =
            fourByThreeMDP.policyEvaluation(randomPolicy, before, 1, 3);
    Assert.IsFalse(before.Equals(after));
}
//
// PRIVATE METHODS
//
// TD update: U(s) <- U(s) + alpha * (r + gamma * U(s') - U(s)), where the
// learning rate alpha is taken from stateCount.probabilityOf(previousState).
private MDPUtilityFunction<STATE_TYPE> updateUtilityFunction(double gamma) {
    MDPUtilityFunction<STATE_TYPE> updated = utilityFunction.copy();
    double previousUtility = utilityFunction.getUtility(previousState);
    double discountedDifference =
            gamma * utilityFunction.getUtility(currentState) - previousUtility;
    double correction = stateCount.probabilityOf(previousState)
            * (previousReward + discountedDifference);
    updated.setUtility(previousState, previousUtility + correction);
    return updated;
}
// TD update: U(s) <- U(s) + alpha * (r + gamma * U(s') - U(s)), where the
// learning rate alpha is taken from stateCount.ProbabilityOf(PreviousState).
private MDPUtilityFunction<TState> UpdateUtilityFunction(double gamma) {
    MDPUtilityFunction<TState> updated = utilityFunction.Copy();
    double previousUtility = utilityFunction.GetUtility(PreviousState);
    double discountedDifference =
            gamma * utilityFunction.GetUtility(CurrentState) - previousUtility;
    double correction = stateCount.ProbabilityOf(PreviousState)
            * (previousReward + discountedDifference);
    updated.SetUtility(PreviousState, previousUtility + correction);
    return updated;
}
public void testPassiveTDAgent() {
    PassiveTDAgent<CellWorldPosition, String> agent =
            new PassiveTDAgent<CellWorldPosition, String>(fourByThree, policy);
    // Deterministic "random" sequence so trial outcomes are repeatable.
    // Randomizer r = new JavaRandomizer();
    Randomizer r = new MockRandomizer(new double[] { 0.1, 0.9, 0.2, 0.8,
            0.3, 0.7, 0.4, 0.6, 0.5 });
    MDPUtilityFunction<CellWorldPosition> uf = null;
    for (int trial = 0; trial < 200; trial++) {
        agent.executeTrial(r);
        uf = agent.getUtilityFunction();
        // System.Console.WriteLine(uf);
    }
    // Expected utilities after 200 trials. (2,4) is omitted: the
    // pseudo-random generator never reaches that square.
    int[] xs = { 1, 1, 1, 1, 2, 2, 3, 3, 3 };
    int[] ys = { 1, 2, 3, 4, 1, 3, 1, 3, 4 };
    double[] expected = { 0.662, 0.610, 0.553, 0.496, 0.735, 0.835,
            0.789, 0.889, 1.0 };
    for (int i = 0; i < expected.Length; i++) {
        Assert.AreEqual(expected[i],
                uf.getUtility(new CellWorldPosition(xs[i], ys[i])), 0.001);
    }
}
public void testPassiveADPAgent() {
    PassiveADPAgent<CellWorldPosition, String> agent =
            new PassiveADPAgent<CellWorldPosition, String>(fourByThree, policy);
    // Deterministic "random" sequence so trial outcomes are repeatable.
    // Randomizer r = new JavaRandomizer();
    Randomizer r = new MockRandomizer(new double[] { 0.1, 0.9, 0.2, 0.8,
            0.3, 0.7, 0.4, 0.6, 0.5 });
    MDPUtilityFunction<CellWorldPosition> uf = null;
    for (int trial = 0; trial < 100; trial++) {
        agent.executeTrial(r);
        uf = agent.getUtilityFunction();
    }
    // Expected utilities after 100 trials. (2,4) is omitted: the
    // pseudo-random generator never reaches that square.
    int[] xs = { 1, 1, 1, 1, 2, 2, 3, 3, 3 };
    int[] ys = { 1, 2, 3, 4, 1, 3, 1, 3, 4 };
    double[] expected = { 0.676, 0.626, 0.573, 0.519, 0.746, 0.865,
            0.796, 0.906, 1.0 };
    for (int i = 0; i < expected.Length; i++) {
        Assert.AreEqual(expected[i],
                uf.getUtility(new CellWorldPosition(xs[i], ys[i])), 0.001);
    }
}
// Value determination for the single source state of validTransitions:
// U(s) = R(s) + gamma * sum over s' of P(s'|s,a) * U(s').
private MDPUtilityFunction<TState> ValueDetermination(
        IList<MDPTransition<TState, TAction>> validTransitions, double gamma) {
    MDPUtilityFunction<TState> updated = utilityFunction.Copy();
    if (validTransitions.Count > 0) {
        TState sourceState = validTransitions[0].GetInitialState();
        double expectedUtility = 0.0;
        foreach (MDPTransition<TState, TAction> transition in validTransitions) {
            expectedUtility += MDP.GetTransitionProbability(transition)
                    * utilityFunction.GetUtility(transition.GetDestinationState());
        }
        updated.SetUtility(sourceState,
                MDP.GetRewardFor(sourceState) + (gamma * expectedUtility));
    }
    return updated;
}
//
// PRIVATE METHODS
//
// Value determination for the single source state of validTransitions:
// U(s) = R(s) + gamma * sum over s' of P(s'|s,a) * U(s').
private MDPUtilityFunction<STATE_TYPE> valueDetermination(
        List<MDPTransition<STATE_TYPE, ACTION_TYPE>> validTransitions,
        double gamma) {
    MDPUtilityFunction<STATE_TYPE> updated = utilityFunction.copy();
    if (validTransitions.Count != 0) {
        STATE_TYPE sourceState = validTransitions[0].getInitialState();
        double expectedUtility = 0.0;
        for (int i = 0; i < validTransitions.Count; i++) {
            MDPTransition<STATE_TYPE, ACTION_TYPE> t = validTransitions[i];
            expectedUtility += mdp.getTransitionProbability(t)
                    * utilityFunction.getUtility(t.getDestinationState());
        }
        updated.setUtility(sourceState,
                mdp.getRewardFor(sourceState) + (gamma * expectedUtility));
    }
    return updated;
}
public override ACTION_TYPE decideAction(MDPPerception<STATE_TYPE> perception) {
    STATE_TYPE perceivedState = perception.getState();
    // Unseen state: record its utility and reward straight from the percept.
    if (!utilityFunction.hasUtilityFor(perceivedState)) {
        utilityFunction.setUtility(perceivedState, perception.getReward());
        mdp.setReward(perceivedState, perception.getReward());
    }
    if (previousState != null) {
        // Count the (s, a) visit and the observed (s, a, s') transition.
        Pair<STATE_TYPE, ACTION_TYPE> stateAction =
                new Pair<STATE_TYPE, ACTION_TYPE>(previousState, previousAction);
        if (nsa.ContainsKey(stateAction)) {
            nsa[stateAction]++;
        } else {
            nsa.Add(stateAction, 1.0);
        }
        MDPTransition<STATE_TYPE, ACTION_TYPE> observedTransition =
                new MDPTransition<STATE_TYPE, ACTION_TYPE>(
                        previousState, previousAction, currentState);
        if (nsasdash.ContainsKey(observedTransition)) {
            nsasdash[observedTransition]++;
        } else {
            nsasdash.Add(observedTransition, 1.0);
        }
        // Maximum-likelihood re-estimate of every observed transition:
        // P(s'|s,a) = N(s,a,s') / N(s,a).
        foreach (MDPTransition<STATE_TYPE, ACTION_TYPE> t in nsasdash.Keys) {
            if (nsasdash[t] != 0.0) {
                double estimate = nsasdash[t]
                        / nsa[new Pair<STATE_TYPE, ACTION_TYPE>(
                                t.getInitialState(), t.getAction())];
                mdp.setTransitionProbability(t, estimate);
            }
        }
        // Re-derive the utility of the previous state under the fixed policy.
        List<MDPTransition<STATE_TYPE, ACTION_TYPE>> validTransitions = mdp
                .getTransitionsWith(previousState, policy.getAction(previousState));
        utilityFunction = valueDetermination(validTransitions, 1);
    }
    if (mdp.isTerminalState(currentState)) {
        previousState = default(STATE_TYPE);
        previousAction = default(ACTION_TYPE);
    } else {
        previousState = currentState;
        previousAction = policy.getAction(currentState);
    }
    return previousAction;
}
//
// PRIVATE METHODS
//
// Policy extraction check for a single cell: under the given utility
// function, the action with maximum expected utility from (x, y) must be
// actionExpected (null means no action, e.g. a terminal square).
private void assertPolicyReccomends(CellWorld cw,
        MDPUtilityFunction<CellWorldPosition> uf, int x, int y,
        String actionExpected) {
    Pair<String, Double> p = cw.getTransitionModel()
            .getTransitionWithMaximumExpectedUtility(
                    new CellWorldPosition(x, y), uf);
    // Fixed: was JUnit-style Assert.assertEquals; the rest of this file
    // uses NUnit-style Assert.AreEqual.
    Assert.AreEqual(actionExpected, p.getFirst());
}
public void testMaximumTransitionDetection() {
    // aka policy extraction given a utility function: build the utility
    // function depicted in AIMA2e Fig 17.3, then check the recommended
    // action for every cell (null for the two terminal squares).
    MDPUtilityFunction<CellWorldPosition> uf =
            new MDPUtilityFunction<CellWorldPosition>();
    int[] xs = { 1, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3 };
    int[] ys = { 1, 2, 3, 4, 1, 3, 4, 1, 2, 3, 4 };
    double[] utilities = { 0.705, 0.655, 0.611, 0.388, 0.762, 0.660, -1.0,
            0.812, 0.868, 0.918, 1.0 };
    for (int i = 0; i < utilities.Length; i++) {
        uf.setUtility(new CellWorldPosition(xs[i], ys[i]), utilities[i]);
    }
    String[] expectedActions = { CellWorld.UP, CellWorld.LEFT, CellWorld.LEFT,
            CellWorld.LEFT, CellWorld.UP, CellWorld.UP, null,
            CellWorld.RIGHT, CellWorld.RIGHT, CellWorld.RIGHT, null };
    for (int i = 0; i < expectedActions.Length; i++) {
        assertPolicyReccomends(cw, uf, xs[i], ys[i], expectedActions[i]);
    }
}