// Test run: restores the trained model and executes the policy deterministically.
// usings assumed: System, System.Collections.Generic, System.Linq, NumSharp,
// Tensorflow, Gym.Environments.Envs.Classic, Gym.Rendering.WinForm, static Tensorflow.Binding
static void _Main(string[] args)
{
    List<NDArray> observations = new List<NDArray>();
    List<int> actions = new List<int>();
    List<double> v_preds = new List<double>();
    List<double> rewards = new List<double>();
    List<double> v_preds_next = new List<double>();

    const int ITERATION = 3000000; // 3 * 10e5
    const double GAMMA = 0.95;

    CartPoleEnv env = new CartPoleEnv(WinFormEnvViewer.Factory); // or AvaloniaEnvViewer.Factory
    env.Seed(0);
    Space ob_space = env.ObservationSpace;
    Policy_net Policy = new Policy_net("policy", env);
    Policy_net Old_Policy = new Policy_net("old_policy", env);
    PPOTrain PPO = new PPOTrain(Policy, Old_Policy, gamma: GAMMA);
    Saver saver = tf.train.Saver();

    using (var sess = tf.Session())
    {
        FileWriter writer = tf.summary.FileWriter("./log/test", sess.graph);
        sess.run(tf.global_variables_initializer());
        saver.restore(sess, "model/model.ckpt"); // load the trained model
        NDArray obs = env.Reset();
        double reward = 0;
        int success_num = 0;

        for (int iteration = 0; iteration < ITERATION; iteration++) // episode loop
        {
            int run_policy_steps = 0;
            env.Render();
            while (true) // run the policy; each episode is far shorter than ITERATION
            {
                run_policy_steps += 1;
                obs = np.stack(new[] { obs }).astype(np.float32); // prepare to feed placeholder Policy.obs
                (NDArray _act, NDArray _v_pred) = Policy.act(obs: obs, stochastic: false); // deterministic actions at test time
                int act = np.asscalar<int>(_act);
                double v_pred = np.asscalar<double>(_v_pred);

                observations.Add(obs);
                actions.Add(act);
                v_preds.Add(v_pred);
                rewards.Add(reward);

                var (next_obs, _reward, done, _) = env.Step(act);
                reward = _reward;
                if (done)
                {
                    // next state of the terminal state has 0 state value:
                    // the C# equivalent of Python's v_preds[1:] + [0]
                    v_preds_next = v_preds.Skip(1).ToList();
                    v_preds_next.Add(0);
                    obs = env.Reset();
                    reward = -1;
                    break;
                }
                else
                {
                    obs = next_obs;
                }
            }

            // success bookkeeping, mirroring the training loop below
            if (rewards.Sum() >= 195)
            {
                success_num += 1;
                if (success_num >= 100)
                {
                    Console.WriteLine("Iteration: " + iteration);
                    break;
                }
            }
            else
            {
                success_num = 0;
            }

            // reset the rollout buffers for the next episode
            observations.Clear();
            actions.Clear();
            v_preds.Clear();
            rewards.Clear();
        }
    }
}
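// Policy_net.act is not shown in this section; the sketch below only illustrates
// what its `stochastic` flag typically switches between: sampling from the softmax
// policy during training versus acting greedily at test time. The helper name and
// the np.random.choice usage are assumptions, not the repository's actual code.
static int ChooseAction(NDArray actionProbs, bool stochastic)
{
    double[] probs = actionProbs.astype(np.float64).ToArray<double>();
    if (stochastic)
        // training: sample an action index from the probability distribution
        return (int)np.random.choice(probs.Length, probabilities: probs);
    // testing: act greedily by taking the most probable action
    int best = 0;
    for (int i = 1; i < probs.Length; i++)
        if (probs[i] > probs[best]) best = i;
    return best;
}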
// usings assumed: System, System.Collections.Generic, System.Linq, NumSharp,
// Tensorflow, Tensorflow.Keras.Engine, Tensorflow.Keras.Layers,
// static Tensorflow.Binding, static Tensorflow.KerasApi
static void Main(string[] args)
{
    // Configuration parameters for the whole setup
    var seed = 42;
    var gamma = 0.99; // Discount factor for past rewards
    var max_steps_per_episode = 10000;

    // var env = gym.make("CartPole-v0");
    CartPoleEnv env = new CartPoleEnv(WinFormEnvViewer.Factory); // Create the environment
    env.Seed(seed);

    // var eps = np.finfo(np.float32).eps.item(); // Smallest number such that 1.0 + eps != 1.0
    var eps = 1e-5; // stand-in for float machine epsilon, used to avoid division by zero

    /*
     * Implement the Actor-Critic network
     *
     * This network learns two functions:
     *
     * 1. Actor: takes the state of our environment as input and returns a
     *    probability value for each action in its action space.
     * 2. Critic: takes the state of our environment as input and returns
     *    an estimate of total rewards in the future.
     *
     * In our implementation, they share the initial layer.
     */
    var num_inputs = 4;
    var num_actions = 2;
    var num_hidden = 128;

    LayersApi layers = new LayersApi();
    var inputs = layers.Input(shape: num_inputs);
    var common = layers.Dense(num_hidden, activation: "relu").Apply(inputs);
    var action = layers.Dense(num_actions, activation: "softmax").Apply(common);
    var critic = layers.Dense(1).Apply(common);
    Model model = keras.Model(inputs: inputs, outputs: (action, critic));

    /*
     * Train
     */
    var optimizer = keras.optimizers.Adam(learning_rate: 0.01f);
    var huber_loss = keras.losses.Huber();
    var action_probs_history = new List<Tensor>();
    var critic_value_history = new List<Tensor>();
    var rewards_history = new List<double>();
    double running_reward = 0;
    var episode_count = 0;

    while (true) // Run until solved
    {
        Program.state = env.Reset(); // Program.state is a static field holding the current observation
        double episode_reward = 0;
        using (var tape = tf.GradientTape())
        {
            for (int timestep = 1; timestep < max_steps_per_episode; timestep++)
            {
                // env.Render(); // Adding this line would show the attempts
                // of the agent in a pop-up window.
                Program.state = tf.convert_to_tensor(Program.state);
                Program.state = tf.expand_dims(Program.state, 0);

                // Predict action probabilities and estimated future rewards
                // from the environment state
                // var (action_probs, critic_value) = model.Apply(Program.state);
                var pred_result = model.Apply(tf.cast(Program.state, tf.float32));
                var action_probs = pred_result[0][0];    // softmax over the actions
                var critic_value = pred_result[1][0][0]; // scalar state-value estimate
                critic_value_history.Add(critic_value);

                // Sample an action from the action probability distribution
                NDArray probabilities = np.squeeze(action_probs.numpy());
                // Console.WriteLine(probabilities); // debug: inspect the current policy
                int chosen_action = (int)np.random.choice(num_actions, probabilities: probabilities.astype(np.float64).ToArray<double>());
                action_probs_history.Add(tf.math.log(action_probs[chosen_action]));

                // Apply the sampled action in our environment
                var (state, reward, done, _) = env.Step(chosen_action);
                Program.state = state; // carry the new observation into the next timestep
                rewards_history.Add(reward);
                episode_reward += reward;

                if (done)
                {
                    break;
                }
            }

            // Update running reward to check the condition for solving
            running_reward = 0.05 * episode_reward + (1 - 0.05) * running_reward;

            // Calculate expected value from rewards
            // - At each timestep, what was the total reward received after that timestep
            // - Rewards in the past are discounted by multiplying them with gamma
            // - These are the labels for our critic
            var returns = new List<double>();
            double discounted_sum = 0;
            foreach (double r in Enumerable.Reverse(rewards_history)) // reversed view; rewards_history itself is left intact
            {
                discounted_sum = r + gamma * discounted_sum;
                returns.Insert(0, discounted_sum);
            }

            // Normalize
            var returns_nd = np.array(returns.ToArray());
            returns_nd = (returns_nd - np.mean(returns_nd)) / (np.std(returns_nd) + eps);
            var normalized_returns = returns_nd.ToArray<double>().ToList();

            // Calculate loss values to update our network
            // (a three-sequence zip helper is sketched after this listing)
            var history = zip(action_probs_history, critic_value_history, normalized_returns);
            var actor_losses = new List<Tensor>();
            var critic_losses = new List<Tensor>();
            foreach (var (log_prob, value, ret) in history)
            {
                // At this point in history, the critic estimated that we would get a
                // total reward = `value` in the future. We took an action with log
                // probability `log_prob` and ended up receiving a total reward = `ret`.
                // The actor must be updated so that it predicts an action that leads to
                // high rewards (compared to the critic's estimate) with high probability.
                var diff = (float)ret - value;
                actor_losses.Add(-log_prob * diff); // actor loss

                // The critic must be updated so that it predicts a better estimate of
                // the future rewards.
                critic_losses.Add(
                    huber_loss.Call(tf.expand_dims(value, 0),
                                    tf.expand_dims(tf.constant((float)ret), 0))
                    );
            }

            // Backpropagation: keep the losses as tensors so the tape can differentiate them
            var loss_value = tf.add_n(actor_losses.ToArray()) + tf.add_n(critic_losses.ToArray());
            var grads = tape.gradient(loss_value, model.trainable_variables);
            optimizer.apply_gradients(zip(grads, model.trainable_variables));

            // Clear the loss and reward history
            action_probs_history.Clear();
            critic_value_history.Clear();
            rewards_history.Clear();
        }

        // Log details
        episode_count += 1;
        if (episode_count % 10 == 0)
        {
            Console.WriteLine(String.Format("running reward: {0} at episode {1}", running_reward, episode_count));
        }

        if (running_reward > 195) // Condition to consider the task solved
        {
            Console.WriteLine(String.Format("Solved at episode {0}!", episode_count));
            break;
        }
    }

    /*
     * Visualizations
     *
     * In early stages of training:
     * ![Imgur](https://i.imgur.com/5gCs5kH.gif)
     *
     * In later stages of training:
     * ![Imgur](https://i.imgur.com/5ziiZUD.gif)
     */
}
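// The training loop above zips three sequences together. Tensorflow.Binding
// provides the two-sequence zip used in apply_gradients, but if your version
// lacks a three-sequence overload, a minimal sketch (an assumption, not part
// of the original listing) is:
static IEnumerable<(T1, T2, T3)> zip<T1, T2, T3>(IEnumerable<T1> first, IEnumerable<T2> second, IEnumerable<T3> third)
{
    using var e1 = first.GetEnumerator();
    using var e2 = second.GetEnumerator();
    using var e3 = third.GetEnumerator();
    // stop at the shortest sequence, matching Python's zip semantics
    while (e1.MoveNext() && e2.MoveNext() && e3.MoveNext())
        yield return (e1.Current, e2.Current, e3.Current);
}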
// usings assumed: System, System.Collections.Generic, System.Linq, NumSharp,
// Tensorflow, Gym.Environments.Envs.Classic, Gym.Rendering.WinForm, static Tensorflow.Binding
static void Main(string[] args)
{
    List<NDArray> observations = new List<NDArray>();
    List<int> actions = new List<int>();
    List<double> v_preds = new List<double>();
    List<double> rewards = new List<double>();
    List<double> v_preds_next = new List<double>();
    List<NDArray> gaes = new List<NDArray>();

    const int EPISODES = 100000; // 1e5
    const double GAMMA = 0.95;

    CartPoleEnv env = new CartPoleEnv(WinFormEnvViewer.Factory); // or AvaloniaEnvViewer.Factory; instantiates the CartPole environment
    env.Seed(0);
    // Space ob_space = env.ObservationSpace; // describes the shape of valid observations
    Policy_net Policy = new Policy_net("policy", env);         // create the policy network
    Policy_net Old_Policy = new Policy_net("old_policy", env); // create the old policy network
    PPOTrain PPO = new PPOTrain(Policy, Old_Policy, gamma: GAMMA);
    Saver saver = tf.train.Saver();

    using (var sess = tf.Session()) // session block
    {
        FileWriter writer = tf.summary.FileWriter("./log/train", sess.graph); // log directory
        sess.run(tf.global_variables_initializer()); // initialize the networks
        NDArray obs = env.Reset(); // reset the environment and take the first observation
        double reward = 0;         // holds the current reward
        int success_num = 0;       // success counter

        for (int episode = 0; episode < EPISODES; episode++) // episode loop
        {
            // clear the rollout buffers left over from the previous episode
            observations.Clear();
            actions.Clear();
            v_preds.Clear();
            rewards.Clear();

            int run_policy_steps = 0; // step counter within the episode
            env.Render(); // render the environment
            while (true)  // run the policy; each episode is far shorter than EPISODES
            {
                run_policy_steps += 1; // increment the per-episode step counter
                obs = np.stack(new[] { obs }).astype(np.float32); // prepare to feed placeholder Policy.obs
                (NDArray _act, NDArray _v_pred) = Policy.act(obs: obs, stochastic: true); // run the network; get an action and the predicted value
                int act = np.asscalar<int>(_act);             // unwrap the NumPy arrays
                double v_pred = np.asscalar<double>(_v_pred); // into plain scalars

                observations.Add(obs); // observation buffer
                actions.Add(act);      // action buffer
                v_preds.Add(v_pred);   // value-prediction buffer
                rewards.Add(reward);   // reward buffer

                // send the action to the environment; receive the next observation,
                // the reward, and whether the episode has ended
                var (next_obs, _reward, done, _) = env.Step(act);
                reward = _reward;
                if (done) // if the episode ended...
                {
                    // the state following the terminal state has 0 state value:
                    // the C# equivalent of Python's v_preds[1:] + [0]
                    v_preds_next = v_preds.Skip(1).ToList();
                    v_preds_next.Add(0);
                    obs = env.Reset(); // reset the environment
                    reward = -1;       // mark the terminal step with reward -1
                    break;             // leave the while loop
                }
                else // otherwise...
                {
                    obs = next_obs; // keep the next observation for the next step
                }
            }

            // log episode statistics for viewing in TensorBoard
            //writer.add_summary(tf.Summary(value:[tf.Summary.Value(tag:"episode_length", simple_value:run_policy_steps)]), episode);
            //writer.add_summary(tf.Summary(value:[tf.Summary.Value(tag:"episode_reward", simple_value:rewards.Sum())]), episode);

            // stopping condition for training
            if (rewards.Sum() >= 195)   // if the episode's total reward is at least 195
            {
                success_num += 1;       // count a success
                if (success_num >= 100) // after 100 consecutive successes
                {
                    saver.save(sess, "./model/model.ckpt"); // save the session
                    Console.WriteLine("Clear!! Model saved.");
                    break;              // stop training
                }
            }
            else
            {
                success_num = 0;        // otherwise reset the success counter
            }

            Console.WriteLine($"EP: {episode} Rw: {rewards.Sum()}"); // print the episode number and total reward

            // compute generalized advantage estimates (a GAE sketch follows this listing)
            gaes = PPO.get_gaes(rewards: rewards, v_preds: v_preds, v_preds_next: v_preds_next);

            // convert the lists to NDArrays to feed the tf.placeholders;
            // before, each row of observations was an independent array; after the
            // reshape, observations is a single array with one row per timestep
            var _observations = np.stack(observations.ToArray()).reshape(-1, 4);
            var _actions = np.array(actions.ToArray(), dtype: np.int32);
            var _rewards = np.array(rewards.ToArray(), dtype: np.float32);
            var _v_preds_next = np.array(v_preds_next.ToArray(), dtype: np.float32);
            var __gaes = np.array(gaes.ToArray(), dtype: np.float32);
            var _gaes = (__gaes - __gaes.mean()) / __gaes.std(); // normalize the advantages (zero mean, unit variance)

            PPO.assign_policy_parameters(); // copy the current policy into old_policy

            // five training arrays: observations, actions, rewards, next-state value predictions, advantages
            NDArray[] inp = new[] { _observations, _actions, _rewards, _v_preds_next, _gaes };

            // train: 4 epochs of PPO updates on minibatches of 64 sampled timesteps
            for (int epoch = 0; epoch < 4; epoch++)
            {
                NDArray sample_indices = np.random.randint(low: 0, high: observations.Count, size: new Shape(64)); // indices are drawn in [low, high)
                var sampled_inp = new List<NDArray>();
                foreach (NDArray arr in inp)
                {
                    sampled_inp.Add(arr[sample_indices]); // take the sampled rows (axis 0) of each training array
                }
                PPO.train(obs: sampled_inp[0],
                          actions: sampled_inp[1],
                          rewards: sampled_inp[2],
                          v_preds_next: sampled_inp[3],
                          gaes: sampled_inp[4]);
            }

            var summary = PPO.get_summary(obs: inp[0], actions: inp[1], rewards: inp[2],
                                          v_preds_next: inp[3], gaes: inp[4])[0];
            //writer.add_summary(summary, episode);
        } // end of the episode loop
        //writer.close();
    }
}
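// PPOTrain.get_gaes is not shown in this section. Under the standard
// GAE(gamma, lambda) formulation it computes the TD residuals
// delta_t = r_t + gamma * V(s_{t+1}) - V(s_t) and accumulates them backwards;
// the sketch below assumes that formulation, and the lambda value and the
// scalar-list signature are assumptions rather than the repository's code.
static List<double> GetGaes(List<double> rewards, List<double> vPreds, List<double> vPredsNext,
                            double gamma = 0.95, double lambda = 0.95)
{
    // TD residual for each step: delta_t = r_t + gamma * V(s_{t+1}) - V(s_t)
    var gaes = new List<double>();
    for (int t = 0; t < rewards.Count; t++)
        gaes.Add(rewards[t] + gamma * vPredsNext[t] - vPreds[t]);

    // accumulate from the end: gae_t = delta_t + gamma * lambda * gae_{t+1}
    for (int t = gaes.Count - 2; t >= 0; t--)
        gaes[t] += gamma * lambda * gaes[t + 1];
    return gaes;
}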