コード例 #1
0
    /// <summary>
    /// Rolls out one episode starting from state <paramref name="s"/>, sampling each action
    /// from the policy table <paramref name="Pi"/>, and returns the recorded transitions.
    /// </summary>
    /// <remarks>
    /// The rollout stops as soon as <c>isTermined</c> is true or once
    /// <c>S.Length * 10</c> steps have been recorded, whichever happens first.
    /// <c>Reset()</c> is called before returning.
    /// </remarks>
    /// <param name="s">State to start from; used directly as the row index into <paramref name="Pi"/>.</param>
    /// <param name="Pi">Policy table indexed as [state, action]; each row is read as action probabilities.</param>
    /// <returns>
    /// Four parallel lists with one entry per step: the state, the action taken, the reward
    /// returned by <c>step</c>, and the next state returned by <c>step</c>.
    /// </returns>
    public PairSARSP step_until_the_end_of_episode_and_return_transitions(int s, float[,] Pi)
    {
        List <int>   s_list   = new List <int>();
        List <int>   a_list   = new List <int>();
        List <float> r_list   = new List <float>();
        List <int>   s_p_list = new List <int>();

        // Hard cap on the episode length so a policy that never reaches termination
        // cannot loop forever.
        int max_steps = S.Length * 10;

        while (!isTermined /*Algorithms.arrayContain(T, s)*/ && s_list.Count < max_steps)
        {
            // Sample an action by walking the cumulative distribution of row s.
            int   a             = 0;
            int   last_positive = -1;
            bool  picked        = false;
            float rand          = Random.value;
            float cumulative    = 0;
            for (int i = 0; i < A.Length; ++i)
            {
                float p = Pi[s, i];
                cumulative += p;
                if (p <= 0)
                {
                    // Never select an action the policy gives no probability to
                    // (matters when the draw is exactly 0).
                    continue;
                }
                last_positive = i;
                if (rand <= cumulative)
                {
                    a      = i;
                    picked = true;
                    break;
                }
            }
            if (!picked && last_positive >= 0)
            {
                // Float rounding can leave the row sum slightly below the draw; fall back
                // to the last action with positive probability instead of action 0.
                // If the whole row is zero, a stays 0.
                a = last_positive;
            }

            PairSR pairSR = step(s, a);
            s_list.Add(s);
            a_list.Add(a);
            r_list.Add(pairSR.R);
            s_p_list.Add(pairSR.S);
            s = pairSR.S;
        }
        Reset();
        PairSARSP pairSARSP;

        pairSARSP.S  = s_list;
        pairSARSP.A  = a_list;
        pairSARSP.R  = r_list;
        pairSARSP.SP = s_p_list;
        return(pairSARSP);
    }
コード例 #2
0
    /// <summary>
    /// First-visit Monte Carlo control with exploring starts: estimates the action-value
    /// table Q from sampled episodes and makes the policy greedy with respect to Q.
    /// </summary>
    /// <remarks>
    /// Each iteration draws a random start state and start action, takes that step with
    /// <paramref name="step_func"/>, then continues the episode under the current policy with
    /// <paramref name="step_until_the_end_and_return_transitions_func"/>. The episode is then
    /// walked backwards, accumulating the discounted return G. For every (state, action) pair
    /// at its first occurrence in the episode, the running average return is stored in Q and
    /// the policy row for that state is set to the greedy action.
    /// State and action values are used directly as indices into Q and Pi.
    /// An iteration whose random start state is in <paramref name="T"/> is skipped but still
    /// counts towards <paramref name="nb_iter"/>.
    /// </remarks>
    /// <param name="S">All states.</param>
    /// <param name="A">All actions.</param>
    /// <param name="T">Terminal states; their Q values are fixed to 0 at initialisation.</param>
    /// <param name="step_func">Takes (state, action) and returns the next state and reward.</param>
    /// <param name="step_until_the_end_and_return_transitions_func">
    /// Takes (state, policy) and returns the transitions of the rest of the episode.
    /// </param>
    /// <param name="Pi">Receives the learned policy table, indexed as [state, action].</param>
    /// <param name="gamma">Discount factor applied to future rewards.</param>
    /// <param name="nb_iter">Number of iterations (episode attempts).</param>
    /// <returns>The estimated action-value table Q, indexed as [state, action].</returns>
    public static float[,] monte_carlo_control_with_exploring_starts(
        int[] S,
        int[] A,
        int[] T,
        System.Func <int, int, PairSR> step_func,
        System.Func <int, float[, ], PairSARSP> step_until_the_end_and_return_transitions_func,
        out float[,] Pi,
        float gamma = 0.99f,
        int nb_iter = 1000
        )
    {
        Pi         = create_random_uniform_policy(S.Length, A.Length);
        float[,] Q = new float[S.Length, A.Length];
        // Arbitrary initial action values ...
        for (int s = 0; s < S.Length; ++s)
        {
            for (int a = 0; a < A.Length; ++a)
            {
                Q[s, a] = Random.value;
            }
        }
        // ... except in terminal states, where they are 0.
        foreach (int t in T)
        {
            for (int a = 0; a < A.Length; ++a)
            {
                Q[t, a] = 0.0f;
            }
        }
        float[,] ReturnsSum   = new float[S.Length, A.Length];
        float[,] ReturnsCount = new float[S.Length, A.Length];

        for (int _ = 0; _ < nb_iter; ++_)
        {
            // Exploring start: random state and random action.
            int s0 = S[Random.Range(0, S.Length)];

            if (arrayContain(T, s0))
            {
                continue;
            }

            int a0 = A[Random.Range(0, A.Length)];

            PairSR    pairSR    = step_func.Invoke(s0, a0);
            PairSARSP pairSARSP = step_until_the_end_and_return_transitions_func.Invoke(pairSR.S, Pi);
            float     G         = 0;
            // Put the exploring-start transition at the front so index 0 is (s0, a0, r0).
            pairSARSP.S.Insert(0, s0);
            pairSARSP.A.Insert(0, a0);
            pairSARSP.R.Insert(0, pairSR.R);

            // Walk the episode backwards down to and including index 0, so the
            // exploring-start pair itself is evaluated.
            for (int t = pairSARSP.S.Count - 1; t >= 0; --t)
            {
                G = pairSARSP.R[t] + gamma * G;
                int st = pairSARSP.S[t];
                int at = pairSARSP.A[t];

                // First-visit check: skip this step if the same (state, action) pair
                // already occurred at a strictly earlier step of the episode.
                bool visited_earlier = false;
                for (int i = 0; i < t; ++i)
                {
                    if (pairSARSP.S[i] == st && pairSARSP.A[i] == at)
                    {
                        visited_earlier = true;
                        break;
                    }
                }
                if (visited_earlier)
                {
                    continue;
                }

                // Running average of the returns observed for (st, at).
                ReturnsSum[st, at]   += G;
                ReturnsCount[st, at] += 1;
                Q[st, at]             = ReturnsSum[st, at] / ReturnsCount[st, at];

                // Policy improvement: make the row for st greedy with respect to Q.
                // NOTE(review): presumably this sets every action entry of row st to 0 - confirm.
                setPiArrayValueAtState(ref Pi, st, 0.0f, A.Length);
                float max    = float.MinValue;
                int   argmax = 0;
                for (int a = 0; a < A.Length; ++a)
                {
                    if (Q[st, a] > max)
                    {
                        max    = Q[st, a];
                        argmax = a;
                    }
                }
                Pi[st, argmax] = 1.0f;
            }
        }
        return(Q);
    }