// Start is called before the first frame update.
// Configures the grid UI layout, builds a GridWorld of size width x height,
// runs the RL algorithm selected by `methods`, and instantiates one UI cell
// per state showing the computed value (or Q-values for MCES).
void Start()
{
    gridLayout.constraint = GridLayoutGroup.Constraint.FixedColumnCount;
    gridLayout.constraintCount = width;

    GridWorld gridWorld = new GridWorld(width, height);

    switch (methods)
    {
        case Methods.IPE:
        {
            float[,] Pi = BuildActionOnePolicy(gridWorld.S.Length, gridWorld.A.Length);
            float[] V = Algorithms.iterative_policy_evaluation(
                gridWorld.S, gridWorld.A, gridWorld.T, gridWorld.P, gridWorld.R, Pi);
            DisplayStateValues(V);
            break;
        }
        case Methods.PI:
        {
            PairVPi VPi = Algorithms.policy_iteration(
                gridWorld.S, gridWorld.A, gridWorld.T, gridWorld.P, gridWorld.R);
            DisplayStateValues(VPi.V);
            break;
        }
        case Methods.MCES:
        {
            float[,] Pi;
            float[,] Q = Algorithms.monte_carlo_control_with_exploring_starts(
                gridWorld.S, gridWorld.A, gridWorld.T,
                gridWorld.step,
                gridWorld.step_until_the_end_of_episode_and_return_transitions,
                out Pi, 0.99f, 5000
            );
            for (int s = 0; s < gridWorld.S.Length; ++s)
            {
                GameObject caseGO = GameObject.Instantiate(prefabCase, gridLayout.transform);
                // Hoisted: the original re-ran GetComponentInChildren<Text>()
                // on every inner-loop iteration while appending to .text.
                Text label = caseGO.GetComponentInChildren<Text>();
                string text = "";
                for (int a = 0; a < gridWorld.A.Length; ++a)
                {
                    text += "S:" + s + " A:" + a + " Q:" + Q[s, a] + "\n";
                }
                label.text = text;
            }
            break;
        }
        case Methods.VI:
        {
            float[,] Pi = BuildActionOnePolicy(gridWorld.S.Length, gridWorld.A.Length);
            float[] V = Algorithms.value_iteration(
                gridWorld.S, gridWorld.A, gridWorld.T, gridWorld.P, gridWorld.R, Pi);
            DisplayStateValues(V);
            break;
        }
    }
}

// Builds a deterministic policy putting probability 1.0 on action index 1 for
// every state (presumably "go right" — confirm against GridWorld's action order).
// C# arrays are zero-initialized on allocation, so only the chosen action needs
// setting; the original's explicit zeroing loops were redundant.
private static float[,] BuildActionOnePolicy(int stateCount, int actionCount)
{
    float[,] Pi = new float[stateCount, actionCount];
    for (int s = 0; s < stateCount; ++s)
    {
        Pi[s, 1] = 1.0f;
    }
    return Pi;
}

// Instantiates one UI cell per state under the grid layout and labels it
// with that state's value. Shared by the IPE / PI / VI display paths.
private void DisplayStateValues(float[] V)
{
    foreach (float v in V)
    {
        GameObject caseGO = GameObject.Instantiate(prefabCase, gridLayout.transform);
        caseGO.GetComponentInChildren<Text>().text = "V=" + v;
    }
}
// Cached value function of the most recently evaluated policy (one entry per state).
private float[] V; // value per state

// Start is called before the first frame update.
// Demo driver: evaluates several policies on a LineWorld and then a GridWorld,
// pushing each resulting value function to the UI (UIDisplayator) and logging
// policies / Q-values to the console.
void Start()
{
    lineworld = new LineWorld();

    // Random uniform policy over LineWorld's state/action spaces.
    Pi = Algorithms.create_random_uniform_policy(lineworld.S.Length, lineworld.A.Length);

    // NOTE(review): the evaluation below is commented out, so V is still null
    // when it is handed to AddVF — the "random uniform" panel does not show
    // what its title claims. Confirm whether disabling this was intentional.
    // V = Algorithms.iterative_policy_evaluation(lineworld.S, lineworld.A, lineworld.T, lineworld.P, lineworld.R, Pi);
    UIDisplayator.AddVF("Value Function de la stratégie \"random uniform\"", ref V);

    ////////////////////////////////////////////////////////////////////////////////////////

    // Overwrite Pi with a deterministic policy: probability 1 on action index 1
    // for every state (presumably "go right" — confirm against LineWorld's actions).
    for (int i = 0; i < lineworld.S.Length; ++i)
    {
        for (int y = 0; y < lineworld.A.Length; ++y)
        {
            Pi[i, y] = 0.0f;
        }
    }
    for (int i = 0; i < lineworld.S.Length; ++i)
    {
        Pi[i, 1] = 1.0f;
    }

    // Evaluate the deterministic policy and display its value function.
    V = Algorithms.iterative_policy_evaluation(lineworld.S, lineworld.A, lineworld.T, lineworld.P, lineworld.R, Pi);
    UIDisplayator.AddVF("Value Function de la stratégie \"tout a droite\"", ref V);

    // Policy iteration: returns the optimal value function and optimal policy.
    PairVPi VPi = Algorithms.policy_iteration(lineworld.S, lineworld.A, lineworld.T, lineworld.P, lineworld.R);
    UIDisplayator.AddVF("La value function optimale", ref VPi.V);

    // Dump the optimal policy, one log line per (state, action) probability.
    Debug.Log("La policy optimale : ");
    for (int s = 0; s < lineworld.S.Length; ++s)
    {
        for (int a = 0; a < lineworld.A.Length; ++a)
        {
            Debug.Log("S=" + s + " A=" + a + " =" + VPi.Pi[s, a]);
        }
    }

    /////////////////////////////////////////////////////////////////////////////////////

    // Same sequence, now on a GridWorld.
    gridworld = new GridWorld();
    Pi = Algorithms.create_random_uniform_policy(gridworld.S.Length, gridworld.A.Length);

    // NOTE(review): same issue as above — evaluation disabled, so this AddVF
    // shows the previous (LineWorld) V under a GridWorld title.
    // V = Algorithms.iterative_policy_evaluation(gridworld.S, gridworld.A, gridworld.T, gridworld.P, gridworld.R, Pi);
    UIDisplayator.AddVF("Value Function de la stratégie \"random uniform\"", ref V);

    ////////////////////////////////////////////////////////////////////////////////////////

    // Deterministic "action index 1 everywhere" policy for the grid.
    for (int i = 0; i < gridworld.S.Length; ++i)
    {
        for (int y = 0; y < gridworld.A.Length; ++y)
        {
            Pi[i, y] = 0.0f;
        }
    }
    for (int i = 0; i < gridworld.S.Length; ++i)
    {
        Pi[i, 1] = 1.0f;
    }
    V = Algorithms.iterative_policy_evaluation(gridworld.S, gridworld.A, gridworld.T, gridworld.P, gridworld.R, Pi);
    UIDisplayator.AddVF("Value Function de la stratégie \"tout a droite\"", ref V);

    VPi = Algorithms.policy_iteration(gridworld.S, gridworld.A, gridworld.T, gridworld.P, gridworld.R);
    UIDisplayator.AddVF("La value function optimale", ref VPi.V);

    Debug.Log("La policy optimale : ");
    for (int s = 0; s < gridworld.S.Length; ++s)
    {
        for (int a = 0; a < gridworld.A.Length; ++a)
        {
            Debug.Log("S=" + s + " A=" + a + " =" + VPi.Pi[s, a]);
        }
    }

    //////////////////////////////////////////////////

    // Monte-Carlo control with exploring starts, back on the LineWorld.
    // The two trailing arguments are presumably gamma = 0.99 and 5000
    // episodes — confirm against the Algorithms signature. Overwrites Pi
    // (out parameter) with the resulting greedy policy.
    float[,] Q = Algorithms.monte_carlo_control_with_exploring_starts(
        lineworld.S, lineworld.A, lineworld.T,
        lineworld.step,
        lineworld.step_until_the_end_of_episode_and_return_transitions,
        out Pi, 0.99f, 5000
    );

    Debug.Log("L'action value optimale :");
    for (int s = 0; s < lineworld.S.Length; ++s)
    {
        for (int a = 0; a < lineworld.A.Length; ++a)
        {
            Debug.Log("S:" + s + " A:" + a + " Q:" + Q[s, a]);
        }
    }

    Debug.Log("La policy \"optimale\" :");
    for (int s = 0; s < lineworld.S.Length; ++s)
    {
        for (int a = 0; a < lineworld.A.Length; ++a)
        {
            Debug.Log("S:" + s + " A:" + a + " Pi:" + Pi[s, a]);
        }
    }
}