// Example #1
    // Start is called before the first frame update.
    // Configures the grid UI to a fixed column count, builds a GridWorld,
    // runs the RL algorithm selected by the `methods` field, and renders
    // the result into one instantiated prefab cell per state.
    void Start()
    {
        gridLayout.constraint      = GridLayoutGroup.Constraint.FixedColumnCount;
        gridLayout.constraintCount = width;

        GridWorld gridWorld = new GridWorld(width, height);

        float[,] Pi;
        switch (methods)
        {
        case Methods.IPE:
            // Evaluate the deterministic "always take action 1" policy.
            Pi = CreateAlwaysActionOnePolicy(gridWorld.S.Length, gridWorld.A.Length);
            float[] V = Algorithms.iterative_policy_evaluation(gridWorld.S, gridWorld.A, gridWorld.T, gridWorld.P, gridWorld.R, Pi);
            DisplayStateValues(V);
            break;

        case Methods.PI:
            PairVPi VPi = Algorithms.policy_iteration(gridWorld.S, gridWorld.A, gridWorld.T, gridWorld.P, gridWorld.R);
            DisplayStateValues(VPi.V);
            break;

        case Methods.MCES:
            float[,] Q = Algorithms.monte_carlo_control_with_exploring_starts(
                gridWorld.S, gridWorld.A, gridWorld.T, gridWorld.step, gridWorld.step_until_the_end_of_episode_and_return_transitions, out Pi, 0.99f, 5000
                );

            // One cell per state, listing Q(s, a) for every action.
            for (int s = 0; s < gridWorld.S.Length; ++s)
            {
                GameObject caseGO = GameObject.Instantiate(prefabCase, gridLayout.transform);
                // Cache the Text component instead of re-searching children every iteration.
                Text label = caseGO.GetComponentInChildren<Text>();
                label.text = "";
                for (int a = 0; a < gridWorld.A.Length; ++a)
                {
                    label.text += "S:" + s + " A:" + a + " Q:" + Q[s, a] + "\n";
                }
            }
            break;

        case Methods.VI:
            // Same seed policy as the IPE case.
            Pi = CreateAlwaysActionOnePolicy(gridWorld.S.Length, gridWorld.A.Length);
            V = Algorithms.value_iteration(gridWorld.S, gridWorld.A, gridWorld.T, gridWorld.P, gridWorld.R, Pi);
            DisplayStateValues(V);
            break;
        }
    }

    // Builds a deterministic policy giving probability 1 to action index 1 in
    // every state. C# arrays are zero-initialized, so only column 1 needs writing.
    private static float[,] CreateAlwaysActionOnePolicy(int stateCount, int actionCount)
    {
        float[,] pi = new float[stateCount, actionCount];
        for (int s = 0; s < stateCount; ++s)
        {
            pi[s, 1] = 1.0f;
        }
        return pi;
    }

    // Instantiates one UI cell per state and shows its state value.
    private void DisplayStateValues(float[] values)
    {
        foreach (float v in values)
        {
            GameObject caseGO = GameObject.Instantiate(prefabCase, gridLayout.transform);
            caseGO.GetComponentInChildren<Text>().text = "V=" + v;
        }
    }
    private float[] V; // value per state

    // Start is called before the first frame update.
    // Demo driver: evaluates several policies on a LineWorld and a GridWorld
    // (random uniform, deterministic "go right", policy iteration, Monte-Carlo
    // control with exploring starts) and pushes each value function to the UI.
    void Start()
    {
        // --- LineWorld: random uniform policy -------------------------------
        lineworld = new LineWorld();
        Pi        = Algorithms.create_random_uniform_policy(lineworld.S.Length, lineworld.A.Length); //
        V         = Algorithms.iterative_policy_evaluation(lineworld.S, lineworld.A, lineworld.T, lineworld.P, lineworld.R, Pi);

        UIDisplayator.AddVF("Value Function de la stratégie \"random uniform\"", ref V);

        ////////////////////////////////////////////////////////////////////////////////////////

        // Overwrite Pi in place with the deterministic policy that always
        // picks action 1 ("go right"): zero everywhere except column 1.
        for (int s = 0; s < lineworld.S.Length; ++s)
        {
            for (int a = 0; a < lineworld.A.Length; ++a)
            {
                Pi[s, a] = (a == 1) ? 1.0f : 0.0f;
            }
        }
        V = Algorithms.iterative_policy_evaluation(lineworld.S, lineworld.A, lineworld.T, lineworld.P, lineworld.R, Pi);

        UIDisplayator.AddVF("Value Function de la stratégie \"tout a droite\"", ref V);

        // Optimal policy/value pair via policy iteration.
        PairVPi VPi = Algorithms.policy_iteration(lineworld.S, lineworld.A, lineworld.T, lineworld.P, lineworld.R);

        UIDisplayator.AddVF("La value function optimale", ref VPi.V);
        Debug.Log("La policy optimale : ");
        for (int s = 0; s < lineworld.S.Length; ++s)
        {
            for (int a = 0; a < lineworld.A.Length; ++a)
            {
                Debug.Log($"S={s} A={a} ={VPi.Pi[s, a]}");
            }
        }


        /////////////////////////////////////////////////////////////////////////////////////

        // --- GridWorld: same sequence of experiments ------------------------
        gridworld = new GridWorld();
        Pi        = Algorithms.create_random_uniform_policy(gridworld.S.Length, gridworld.A.Length); //
        V         = Algorithms.iterative_policy_evaluation(gridworld.S, gridworld.A, gridworld.T, gridworld.P, gridworld.R, Pi);

        UIDisplayator.AddVF("Value Function de la stratégie \"random uniform\"", ref V);

        ////////////////////////////////////////////////////////////////////////////////////////

        // Deterministic "always action 1" policy again, this time on the grid.
        for (int s = 0; s < gridworld.S.Length; ++s)
        {
            for (int a = 0; a < gridworld.A.Length; ++a)
            {
                Pi[s, a] = (a == 1) ? 1.0f : 0.0f;
            }
        }
        V = Algorithms.iterative_policy_evaluation(gridworld.S, gridworld.A, gridworld.T, gridworld.P, gridworld.R, Pi);

        UIDisplayator.AddVF("Value Function de la stratégie \"tout a droite\"", ref V);

        VPi = Algorithms.policy_iteration(gridworld.S, gridworld.A, gridworld.T, gridworld.P, gridworld.R);

        UIDisplayator.AddVF("La value function optimale", ref VPi.V);
        Debug.Log("La policy optimale : ");
        for (int s = 0; s < gridworld.S.Length; ++s)
        {
            for (int a = 0; a < gridworld.A.Length; ++a)
            {
                Debug.Log($"S={s} A={a} ={VPi.Pi[s, a]}");
            }
        }

        //////////////////////////////////////////////////

        // --- LineWorld: Monte-Carlo control with exploring starts -----------
        float[,] Q = Algorithms.monte_carlo_control_with_exploring_starts(
            lineworld.S, lineworld.A, lineworld.T, lineworld.step, lineworld.step_until_the_end_of_episode_and_return_transitions, out Pi, 0.99f, 5000
            );

        Debug.Log("L'action value optimale :");
        for (int s = 0; s < lineworld.S.Length; ++s)
        {
            for (int a = 0; a < lineworld.A.Length; ++a)
            {
                Debug.Log($"S:{s} A:{a} Q:{Q[s, a]}");
            }
        }
        Debug.Log("La policy \"optimale\" :");
        for (int s = 0; s < lineworld.S.Length; ++s)
        {
            for (int a = 0; a < lineworld.A.Length; ++a)
            {
                Debug.Log($"S:{s} A:{a} Pi:{Pi[s, a]}");
            }
        }
    }