/// <summary>
/// Rolls out one episode from state <paramref name="s"/>, sampling every action from
/// the stochastic policy <paramref name="Pi"/>, and returns the recorded transitions.
/// </summary>
/// <param name="s">State the rollout starts from.</param>
/// <param name="Pi">Policy table indexed as [state, action]; each row is read as a
/// probability distribution over the actions.</param>
/// <returns>
/// Four parallel lists (visited state, chosen action, reward, next state), one entry
/// per step taken. The lists are empty when the loop condition is already false on entry.
/// </returns>
public PairSARSP step_until_the_end_of_episode_and_return_transitions(int s, float[,] Pi)
{
    var visitedStates = new List<int>();
    var chosenActions = new List<int>();
    var rewards = new List<float>();
    var nextStates = new List<int>();

    while (true)
    {
        // Stop on termination, or once the episode reaches 10 steps per state
        // (a cap that keeps a rollout from running forever).
        if (isTermined || visitedStates.Count >= S.Length * 10)
        {
            break;
        }

        // Inverse-CDF sampling: walk the cumulative probabilities of row s until
        // they reach the random draw. Action 0 is kept when no prefix reaches it.
        float draw = Random.value;
        float cumulative = 0;
        int action = 0;
        int candidate = 0;
        while (candidate < A.Length)
        {
            cumulative += Pi[s, candidate];
            if (draw <= cumulative)
            {
                action = candidate;
                break;
            }
            ++candidate;
        }

        PairSR outcome = step(s, action);

        visitedStates.Add(s);
        chosenActions.Add(action);
        rewards.Add(outcome.R);
        nextStates.Add(outcome.S);

        // Continue the rollout from the state the step led to.
        s = outcome.S;
    }

    // Reset() is invoked once the rollout is over, before the transitions are returned.
    Reset();

    return new PairSARSP
    {
        S = visitedStates,
        A = chosenActions,
        R = rewards,
        SP = nextStates
    };
}
/// <summary>
/// First-visit Monte Carlo control with exploring starts.
///
/// Each iteration draws a random (state, action) start, takes that action through
/// <paramref name="step_func"/>, follows the current policy to the end of the episode
/// through <paramref name="step_until_the_end_and_return_transitions_func"/>, then walks
/// the episode backwards: for every first visit of a (state, action) pair it averages
/// the observed return into Q and makes the policy greedy for that state.
/// </summary>
/// <param name="S">State identifiers; their values are used directly as row indices of Q and Pi.</param>
/// <param name="A">Action identifiers; their values are used directly as column indices of Q and Pi.</param>
/// <param name="T">Terminal states. Their Q-values are forced to 0 and they are never used as a start state.</param>
/// <param name="step_func">Performs one step (state, action) and returns the reward and next state.</param>
/// <param name="step_until_the_end_and_return_transitions_func">
/// Rolls out the remainder of an episode from a state under a policy and returns its transitions.
/// </param>
/// <param name="Pi">Receives the learned policy table, indexed as [state, action].</param>
/// <param name="gamma">Discount factor applied to future rewards.</param>
/// <param name="nb_iter">Number of start draws; draws that land on a terminal state are skipped and still counted.</param>
/// <returns>The estimated action-value table Q, indexed as [state, action].</returns>
public static float[,] monte_carlo_control_with_exploring_starts(
    int[] S,
    int[] A,
    int[] T,
    System.Func<int, int, PairSR> step_func,
    System.Func<int, float[,], PairSARSP> step_until_the_end_and_return_transitions_func,
    out float[,] Pi,
    float gamma = 0.99f,
    int nb_iter = 1000)
{
    // NOTE(review): presumably returns an S.Length x A.Length table with uniform rows - confirm in the helper.
    Pi = create_random_uniform_policy(S.Length, A.Length);

    // Arbitrary initial action values for every pair...
    float[,] Q = new float[S.Length, A.Length];
    for (int s = 0; s < S.Length; ++s)
    {
        for (int a = 0; a < A.Length; ++a)
        {
            Q[s, a] = Random.value;
        }
    }

    // ...except terminal states, whose value is 0.
    foreach (int t in T)
    {
        for (int a = 0; a < A.Length; ++a)
        {
            Q[t, a] = 0.0f;
        }
    }

    float[,] ReturnsSum = new float[S.Length, A.Length];
    float[,] ReturnsCount = new float[S.Length, A.Length];

    for (int iter = 0; iter < nb_iter; ++iter)
    {
        // Exploring start: random non-terminal state and random first action.
        int s0 = S[Random.Range(0, S.Length)];
        if (arrayContain(T, s0))
        {
            continue;
        }
        int a0 = A[Random.Range(0, A.Length)];

        PairSR pairSR = step_func.Invoke(s0, a0);
        PairSARSP pairSARSP = step_until_the_end_and_return_transitions_func.Invoke(pairSR.S, Pi);

        // Prepend the exploring-start transition so index 0 is (s0, a0, r0).
        pairSARSP.S.Insert(0, s0);
        pairSARSP.A.Insert(0, a0);
        pairSARSP.R.Insert(0, pairSR.R);

        float G = 0;

        // Walk the episode backwards, INCLUDING t == 0: the exploring-start pair is
        // the one the algorithm exists to evaluate (the previous `t > 0` bound skipped it).
        for (int t = pairSARSP.S.Count - 1; t >= 0; --t)
        {
            G = pairSARSP.R[t] + gamma * G;
            int st = pairSARSP.S[t];
            int at = pairSARSP.A[t];

            // First-visit check: skip this step if the same (state, action) PAIR occurs
            // at a strictly earlier time step. The previous scans ran up to and including
            // t and tested state and action independently, so they always matched the
            // current step itself and no update was ever applied.
            bool visitedEarlier = false;
            for (int i = 0; i < t; ++i)
            {
                if (pairSARSP.S[i] == st && pairSARSP.A[i] == at)
                {
                    visitedEarlier = true;
                    break;
                }
            }
            if (visitedEarlier)
            {
                continue;
            }

            // Incremental sample average of the returns observed for (st, at).
            ReturnsSum[st, at] += G;
            ReturnsCount[st, at] += 1;
            Q[st, at] = ReturnsSum[st, at] / ReturnsCount[st, at];

            // Policy improvement: make Pi greedy w.r.t. Q in state st.
            // NOTE(review): presumably sets every action probability of state st to 0 - confirm in the helper.
            setPiArrayValueAtState(ref Pi, st, 0.0f, A.Length);

            float max = float.MinValue;
            int argmax = 0;
            for (int a = 0; a < A.Length; ++a)
            {
                // Strict comparison: ties resolve to the lowest action index.
                if (Q[st, a] > max)
                {
                    max = Q[st, a];
                    argmax = a;
                }
            }
            Pi[st, argmax] = 1.0f;
        }
    }

    return Q;
}