/// <summary>
/// Demo loop for <c>CartPoleEnv</c>: runs 100,000 iterations, resetting the
/// environment whenever the previous step reported the episode as done and
/// otherwise stepping with an action that alternates between 0 and 1.
/// Every iteration renders a frame and then sleeps briefly.
/// </summary>
private static void CartPoleEnv()
{
    const int totalIterations = 100_000;
    const int frameDelayMs = 10;

    CartPoleEnv env = new CartPoleEnv(); // or pass AvaloniaEnvViewer.Factory
    bool needsReset = true;

    for (int tick = 0; tick < totalIterations; tick++)
    {
        if (!needsReset)
        {
            // Switch between moving left and right on alternating iterations.
            int action = tick % 2;
            var (observation, reward, finished, information) = env.Step(action);
            needsReset = finished;

            // Hook: do something with the reward and observation here
            // (including any end-of-episode handling when `finished` is true).
        }
        else
        {
            NDArray observation = env.Reset();
            needsReset = false;
        }

        // var view = new Ebby.Gym.Rendering.Viewer(100, 100, "viewer");
        var frame = env.Render(); // returns the image that was rendered

        // Throttle the loop so it does not finish instantly.
        Thread.Sleep(frameDelayMs);
    }
}
/// <summary>
/// Runs <c>maxEpisodes</c> training episodes against <c>_env</c>.
/// Each step asks the actor for a policy distribution, picks an action with
/// <c>SelectActionIndex</c>, steps the environment, computes an advantage via
/// <c>CalculateAdvantage</c>, back-propagates through actor and critic and
/// records both networks' gradients. When the episode ends (environment done,
/// or the last allowed step) the summed reward is stored in <c>_allRewards</c>
/// and logged; afterwards <c>UpdateWeights</c> is called with the gradients
/// collected during the episode.
/// </summary>
public void SolveCartpole()
{
    const int logEveryNEpisodes = 1;
    const int frameDelayMs = 5;

    for (int episode = 0; episode < maxEpisodes; episode++)
    {
        var rewards = new List<double>();
        var episodeActorGradients = new List<List<Matrix>>();
        var episodeCriticGradients = new List<List<Matrix>>();

        var state = _env.Reset().ToArray<double>();
        var value = _critic.Forward(state)[0];

        int steps = 0;
        while (steps < maxSteps)
        {
            // Actor: choose an action from the current policy distribution.
            var policyDistribution = _actor.Forward(state);
            var actionIndex = SelectActionIndex(policyDistribution);
            var chosenProbability = policyDistribution[actionIndex];

            // Environment transition.
            var (nextStateRaw, reward, done, information) = _env.Step(actionIndex);
            var nextState = nextStateRaw.ToArray<double>();
            rewards.Add(reward);

            // Critic: evaluate the new state and derive the advantage.
            var nextValue = _critic.Forward(nextState)[0];
            var advantage = CalculateAdvantage(nextValue, value, reward);

            _actor.Backward(policyDistribution, actionIndex, advantage);
            _critic.Backward(advantage);

            episodeActorGradients.Add(_actor.GetGradients());
            episodeCriticGradients.Add(_critic.GetGradients());

            bool episodeFinished = done || steps == (maxSteps - 1);
            if (episodeFinished)
            {
                _allRewards.Add(rewards.Sum());
                if (episode % logEveryNEpisodes == 0)
                {
                    Console.WriteLine("Episode: {0}, Rewards: {1}", episode, _allRewards.Last());
                }

                break;
            }

            // Carry the new state/value into the next step, then draw a frame.
            value = nextValue;
            state = nextState;
            _env.Render();
            Thread.Sleep(frameDelayMs);

            steps++;
        }

        UpdateWeights(episodeActorGradients, episodeCriticGradients);
    }
}
/// <summary>
/// Drives a <c>CartPoleEnv</c> for 100,000 iterations inside a
/// <c>StopwatchMeasurer</c> scope. The environment is reset whenever the
/// previous step reported the episode as done; otherwise it is stepped with an
/// action alternating between 0 and 1. Each iteration renders and then sleeps.
/// </summary>
/// <remarks>
/// The measured scope includes the per-iteration <c>Thread.Sleep</c>, so the
/// measurement covers rendering and the deliberate delay as well as stepping.
/// </remarks>
public void Run()
{
    const int totalIterations = 100_000;
    const int frameDelayMs = 15;

    var env = new CartPoleEnv();
    var needsReset = true;

    // Explicit braces: the measurer's scope is exactly the loop below.
    using (new StopwatchMeasurer("time it took to run all steps in ms"))
    {
        for (int i = 0; i < totalIterations; i++)
        {
            if (needsReset)
            {
                env.Reset();
                needsReset = false;
            }
            else
            {
                // Only the "done" flag is needed here; the rest is discarded.
                var (_, _, finished, _) = env.Step(i % 2);
                needsReset = finished;
            }

            env.Render();

            // Throttle the loop so it does not finish instantly.
            Thread.Sleep(frameDelayMs);
        }
    }
}
/// <summary>
/// Actor-critic training loop on <c>CartPoleEnv</c>, ported from a Python/Keras
/// sample. Builds a network with a shared hidden layer, an actor head (softmax
/// over actions) and a critic head (single value), then runs episodes until the
/// exponentially smoothed episode reward exceeds 195.
/// </summary>
/// <param name="args">Command-line arguments (unused).</param>
static void Main(string[] args)
{
    // Configuration parameters for the whole setup
    var seed = 42;
    var gamma = 0.99; // Discount factor for past rewards
    var max_steps_per_episode = 10000;

    // var env = gym.make("CartPole-v0"); // Create the environment
    CartPoleEnv env = new CartPoleEnv(WinFormEnvViewer.Factory); // Create the environment
    env.Seed(seed);

    // var eps = np.finfo(np.float32).eps.item();
    // Small constant added to the standard deviation below so the return
    // normalization never divides by zero. (This is a fixed 1e-5, not the
    // float32 machine epsilon used by the commented-out line above.)
    var eps = 1e-5;

    /*
     * Implement Actor Critic network
     *
     * This network learns two functions:
     *
     * 1. Actor: This takes as input the state of our environment and returns a
     *    probability value for each action in its action space.
     * 2. Critic: This takes as input the state of our environment and returns
     *    an estimate of total rewards in the future.
     *
     * In our implementation, they share the initial layer.
     */
    var num_inputs = 4;
    NDArray num_actions = 2;
    var num_hidden = 128;

    LayersApi layers = new LayersApi();
    var inputs = layers.Input(shape: (num_inputs));
    var common = layers.Dense(num_hidden, activation: "relu").Apply(inputs);
    var action = layers.Dense(num_actions, activation: "softmax").Apply(common);
    var critic = layers.Dense(1).Apply(common);

    Model model = keras.Model(inputs: inputs, outputs: (action, critic));

    /*
     * Train
     */
    var optimizer = keras.optimizers.Adam(learning_rate: (float)0.01);
    var huber_loss = keras.losses.Huber();
    // NOTE(review): the values added to this list below come from tf.math.log(...);
    // confirm the element type (a tensor type looks more appropriate than double).
    var action_probs_history = new List<double>();
    var critic_value_history = new List<dynamic>();
    var rewards_history = new List<double>();
    double running_reward = 0;
    var episode_count = 0;

    while (true) // Run until solved
    {
        Program.state = env.Reset();
        double episode_reward = 0;

        using (var tape = tf.GradientTape())
        {
            for (int timestep = 1; timestep < max_steps_per_episode; timestep++)
            {
                // env.Render(); // Adding this line would show the attempts
                // of the agent in a pop up window.

                Program.state = tf.convert_to_tensor(Program.state);
                Program.state = tf.expand_dims(Program.state, 0);

                // Predict action probabilities and estimated future rewards
                // from environment state
                // var (action_probs, critic_value) = model.Apply(Program.state);
                var pred_result = model.Apply(tf.cast(Program.state, tf.float32));
                var action_probs = pred_result[0][0];
                var critic_value = pred_result[1][0][0];
                critic_value_history.Add(critic_value);

                Tensor probabilities = np.squeeze(action_probs);
                Console.WriteLine(probabilities);

                // Sample action from action probability distribution.
                // The original had a dangling member access here ("probabilities.")
                // which is a syntax error.
                // NOTE(review): np.random.choice may need the probabilities as a
                // plain array rather than a Tensor — confirm the required conversion.
                NDArray chosen_action = np.random.choice(num_actions, probabilities: probabilities);
                action_probs_history.Add(tf.math.log(action_probs[0, chosen_action]));

                // Apply the sampled action in our environment
                var (next_state, reward, done, _) = env.Step(chosen_action);

                // Feed the new observation into the next iteration. Previously the
                // step result was dropped, so every iteration re-wrapped the
                // already-expanded initial state instead of the current observation.
                Program.state = next_state;

                rewards_history.Add(reward);
                episode_reward += reward;

                if (done)
                {
                    break;
                }
            }

            // Update running reward to check condition for solving
            running_reward = 0.05 * episode_reward + (1 - 0.05) * running_reward;

            // Calculate expected value from rewards
            // - At each timestep what was the total reward received after that timestep
            // - Rewards in the past are discounted by multiplying them with gamma
            // - These are the labels for our critic
            dynamic returns = new List<double>();
            double discounted_sum = 0;

            // Walk the rewards from last to first by index; this avoids reversing
            // rewards_history in place through an alias.
            for (int i = rewards_history.Count - 1; i >= 0; i--)
            {
                discounted_sum = rewards_history[i] + gamma * discounted_sum;
                returns.Insert(0, discounted_sum);
            }

            // Normalize
            returns = np.array(returns.ToArray());
            returns = (returns - np.mean(returns)) / (np.std(returns) + eps);
            returns = returns.ToList();

            // Calculating loss values to update our network
            var history = zip(action_probs_history, critic_value_history, returns);
            var actor_losses = new List<double>();
            var critic_losses = new List<double>();

            // NOTE(review): iterating the zipped history as double[] and storing the
            // losses as double looks inconsistent with the tensor values produced
            // above — confirm the element types.
            foreach (double[] item in history)
            {
                var log_prob = item[0];
                dynamic value = item[1];
                dynamic ret = item[2];

                // At this point in history, the critic estimated that we would get a
                // total reward = `value` in the future. We took an action with log probability
                // of `log_prob` and ended up receiving a total reward = `ret`.
                // The actor must be updated so that it predicts an action that leads to
                // high rewards (compared to critic's estimate) with high probability.
                var diff = ret - value;
                actor_losses.Add(-log_prob * diff); // actor loss

                // The critic must be updated so that it predicts a better estimate of
                // the future rewards.
                critic_losses.Add(
                    huber_loss.Call(tf.expand_dims(value, 0), tf.expand_dims(ret, 0))
                );
            }

            // Backpropagation
            // NOTE(review): summing through Convert.ToDouble yields a plain number;
            // verify that tape.gradient can still differentiate this value.
            dynamic loss_value = actor_losses.Sum(x => Convert.ToDouble(x)) + critic_losses.Sum(x => Convert.ToDouble(x));
            var grads = tape.gradient(loss_value, model.trainable_variables);
            optimizer.apply_gradients(zip(grads, model.trainable_variables));

            // Clear the loss and reward history
            action_probs_history.Clear();
            critic_value_history.Clear();
            rewards_history.Clear();
        }

        // Log details
        episode_count += 1;
        if (episode_count % 10 == 0)
        {
            var template = String.Format("running reward: {0} at episode {1}", running_reward, episode_count);
            Console.WriteLine(template);
        }

        if (running_reward > 195) // Condition to consider the task solved
        {
            Console.WriteLine(String.Format("Solved at episode {0}!", episode_count));
            break;
        }
    }

    /*
     * Visualizations
     * In early stages of training:
     * ![Imgur](https://i.imgur.com/5gCs5kH.gif)
     *
     * In later stages of training:
     * ![Imgur](https://i.imgur.com/5ziiZUD.gif)
     */
}
/// <summary>
/// PPO training loop on the CartPole environment, ported from a Python sample.
/// Each outer step collects <c>BATCH_SIZE</c> transitions, computes discounted
/// rewards and advantages via <c>get_advantages</c>, runs
/// <c>NUM_TRAIN_EPOCHS</c> calls to <c>train_model</c> on the batch and writes
/// the mean losses to the summary writer.
/// </summary>
/// <param name="args">Command-line arguments (unused).</param>
static void Main(string[] args)
{
    // NOTE(review): optimizer and writer are declared as Tensor in the port;
    // confirm the actual return types of these factory calls.
    Tensor optimizer = keras.optimizers.Adam(learning_rate: LR);

    // The original kept a Python f-string with strftime here, which is not C#.
    // "ddMMyyyyHHmm" is the .NET equivalent of Python's "%d%m%Y%H%M".
    string run_stamp = DateTime.Now.ToString("ddMMyyyyHHmm", System.Globalization.CultureInfo.InvariantCulture);
    Tensor train_writer = tf.summary.create_file_writer(STORE_PATH + "/PPO-CartPole_" + run_stamp);

    int num_steps = 10000000;
    Double episode_reward_sum = 0;
    Tensor state = env.Reset();
    int episode = 1;

    // Loss accumulators from the most recent training pass. They are declared
    // here, before the loop, because the per-episode log line inside the batch
    // loop reads the previous step's values; null means "no training pass yet".
    NDArray total_loss = null;
    NDArray c_loss = null;
    NDArray act_loss = null;
    NDArray ent_loss = null;

    for (int step = 0; step < num_steps; step++)
    {
        // NOTE(review): these buffers are declared List<double> in the port but
        // receive actions, values, states, flags and logits — confirm the
        // element types against get_advantages / train_model.
        var rewards = new List<double>();
        var actions = new List<double>();
        var values = new List<double>();
        var states = new List<double>();
        var dones = new List<double>();
        var probs = new List<double>();

        for (int i = 0; i < BATCH_SIZE; i++)
        {
            var (_, policy_logits) = model(state.reshape(1, -1));
            var (action, value) = model.action_value(state.reshape(1, -1));
            var (new_state, reward, done, _) = env.Step(action.numpy()[0]);

            actions.Add(action);
            values.Add(value[0]);
            states.Add(state);
            dones.Add(done);
            probs.Add(policy_logits);
            episode_reward_sum += reward;

            state = new_state;

            if (done)
            {
                rewards.Add(0.0);
                state = env.Reset();

                if (!(total_loss is null))
                {
                    // Interpolated string: the original passed Python-style
                    // "{name}" placeholders to Console.WriteLine as a composite
                    // format string, which throws FormatException at run time.
                    Console.WriteLine(
                        $"Episode: {episode}, latest episode reward: {episode_reward_sum}, " +
                        $"total loss: {np.mean(total_loss)}, critic loss: {np.mean(c_loss)}, " +
                        $"actor loss: {np.mean(act_loss)}, entropy loss {np.mean(ent_loss)}");
                }

                using (train_writer.as_default())
                {
                    tf.summary.scalar("rewards", episode_reward_sum, episode);
                }

                episode_reward_sum = 0;
                episode += 1;
            }
            else
            {
                rewards.Add(reward);
            }
        }

        var (_, next_value) = model.action_value(state.reshape(1, -1));
        var (discounted_rewards, advantages) = get_advantages(rewards, dones, values, next_value[0]);

        // NOTE(review): reassigning the list variables to stacked tensors is a
        // leftover from the Python source; they probably need separate locals.
        actions = tf.squeeze(tf.stack(actions));
        probs = tf.nn.softmax(tf.squeeze(tf.stack(probs)));
        Tensor action_inds = tf.stack((tf.range(0, actions.shape[0]), tf.cast(actions, tf.int32)), axis: 1);

        total_loss = np.zeros(NUM_TRAIN_EPOCHS);
        act_loss = np.zeros(NUM_TRAIN_EPOCHS);
        c_loss = np.zeros(NUM_TRAIN_EPOCHS);
        ent_loss = np.zeros(NUM_TRAIN_EPOCHS);

        for (int epoch = 0; epoch < NUM_TRAIN_EPOCHS; epoch++)
        {
            Tensor loss_tuple = train_model(action_inds, tf.gather(probs, action_inds), states, advantages, discounted_rewards, optimizer, ent_discount_val);
            total_loss[epoch] = loss_tuple[0];
            c_loss[epoch] = loss_tuple[1];
            act_loss[epoch] = loss_tuple[2];
            ent_loss[epoch] = loss_tuple[3];
        }

        // Apply the discount rate to the entropy scale after each training step.
        // The original wrote this as a declaration ("Tensor ent_discount_val *= ..."),
        // which is not valid C#; the variable is already used above, so it must be
        // declared outside this method.
        ent_discount_val *= ENT_DISCOUNT_RATE;

        using (train_writer.as_default())
        {
            tf.summary.scalar("tot_loss", np.mean(total_loss), step);
            tf.summary.scalar("critic_loss", np.mean(c_loss), step);
            tf.summary.scalar("actor_loss", np.mean(act_loss), step);
            tf.summary.scalar("entropy_loss", np.mean(ent_loss), step);
        }
    }
}
/// <summary>
/// PPO training loop on <c>CartPoleEnv</c> using a policy network and an
/// "old policy" network. Each episode runs the policy until the environment
/// reports done, builds the generalized advantage estimates through
/// <c>PPO.get_gaes</c>, and trains for four epochs on random mini-batches of 64
/// samples. Training stops and the model is saved after 100 consecutive
/// episodes whose summed reward is at least 195.
/// </summary>
/// <param name="args">Command-line arguments (unused).</param>
static void Main(string[] args)
{
    // Per-episode buffers (cleared at the start of every episode).
    List<NDArray> observations = new List<NDArray>();
    List<NDArray> actions = new List<NDArray>();
    List<NDArray> v_preds = new List<NDArray>();
    List<double> rewards = new List<double>();
    List<NDArray> v_preds_next = new List<NDArray>();
    List<NDArray> gaes = new List<NDArray>();

    double EPISODES = 1e5;
    double GAMMA = 0.95;

    // Instantiate the CartPole environment (or use AvaloniaEnvViewer.Factory).
    CartPoleEnv env = new CartPoleEnv(WinFormEnvViewer.Factory);
    env.Seed(0);
    // Space ob_space = env.ObservationSpace; // describes the format of valid observations

    Policy_net Policy = new Policy_net("policy", env);         // policy network
    Policy_net Old_Policy = new Policy_net("old_policy", env); // old-policy network
    PPOTrain PPO = new PPOTrain(Policy, Old_Policy, gamma: GAMMA);
    Saver saver = tf.train.Saver();

    // Session scope: every sess.* call below depends on this variable.
    using (var sess = tf.Session())
    {
        FileWriter writer = tf.summary.FileWriter("./log/train", sess.graph); // log directory
        sess.run(tf.global_variables_initializer());                          // initialize the networks

        NDArray obs = env.Reset(); // reset the environment and get the first observation
        double reward = 0;         // reward received for the previous action
        int success_num = 0;       // consecutive successful episodes

        for (int episode = 0; episode < EPISODES; episode++)
        {
            // Start every episode with empty buffers. Without this the buffers
            // grow across episodes, so rewards.Sum() becomes cumulative and the
            // success check below stops reflecting a single episode.
            observations.Clear();
            actions.Clear();
            v_preds.Clear();
            rewards.Clear();

            int run_policy_steps = 0; // steps taken in this episode
            env.Render();

            while (true) // run the policy until the episode ends
            {
                run_policy_steps += 1;

                // Prepare the observation to feed the Policy.obs placeholder.
                obs = np.stack(new[] { obs }).astype(dtype: np.float32);

                // Run the network to get an action and the predicted value.
                (NDArray _act, NDArray _v_pred) = Policy.act(obs: obs, stochastic: true);
                int act = np.asscalar<int>(_act.ToArray<int>());
                NDArray v_pred = np.asscalar<double[]>(_v_pred.ToArray<double>());
                // var v_pred = _v_pred.Item();

                observations.Add(obs);
                actions.Add(act);
                v_preds.Add(v_pred);
                rewards.Add(reward);

                // Send the action to the environment; receive the next observation,
                // the reward and whether the episode has finished.
                var (next_obs, _reward, done, _) = env.Step(act);
                reward = _reward;

                if (done)
                {
                    // v_preds shifted by one step with a trailing 0: the state
                    // after the terminal state has value 0. This must be a COPY;
                    // assigning v_preds directly would make RemoveAt/Add modify
                    // v_preds itself.
                    v_preds_next = v_preds.Skip(1).ToList();
                    v_preds_next.Add(0);

                    obs = env.Reset(); // reset the environment
                    reward = -1;       // the reward carried into the next episode is -1
                    break;
                }

                obs = next_obs;
            }

            // Logging for TensorBoard (currently disabled):
            // writer.add_summary(tf.Summary(value:[tf.Summary.Value(tag:"episode_length", simple_value:run_policy_steps)]), episode);
            // writer.add_summary(tf.Summary(value:[tf.Summary.Value(tag:"episode_reward", simple_value:rewards.Sum())]), episode);

            double episode_reward = rewards.Sum();

            // Stop condition: 100 successive episodes with summed reward >= 195.
            if (episode_reward >= 195)
            {
                success_num += 1;
                if (success_num >= 100)
                {
                    saver.save(sess, "./model/model.ckpt");
                    Console.WriteLine("Clear!! Model saved.");
                    break;
                }
            }
            else
            {
                success_num = 0;
            }

            // Interpolated string: the original call passed the values as format
            // arguments to a format string with no placeholders, so only "EP: "
            // was ever printed.
            Console.WriteLine($"EP: {episode} Rw: {episode_reward}");

            gaes = PPO.get_gaes(rewards: rewards, v_preds: v_preds, v_preds_next: v_preds_next);

            // Convert the lists to arrays to feed the placeholders.
            // int[] newShape = ((-1), (list(ob_space.Shape))); // would build [-1, 4]
            // After the reshape the observations form one array with one row per step.
            var _observations = np.reshape(observations.ToArray(), shape: ((-1), (4)));
            var _actions = np.array(actions.ToArray(), dtype: np.int32);
            var _rewards = np.array(rewards.ToArray(), dtype: np.float32);
            var _v_preds_next = np.array(v_preds_next.ToArray(), dtype: np.float32);
            var __gaes = np.array(gaes.ToArray(), dtype: np.float32);

            // Normalize: subtract the mean and divide by the standard deviation.
            var _gaes = (__gaes - __gaes.mean()) / __gaes.std();

            PPO.assign_policy_parameters();

            // The five training inputs: observations, actions, rewards,
            // next-state value predictions and advantage estimates.
            NDArray[] inp = new[] { _observations, _actions, _rewards, _v_preds_next, _gaes };

            // Train
            for (int epoch = 0; epoch < 4; epoch++)
            {
                // Random sample indices in [0, number of collected steps).
                NDArray sample_indices = np.random.randint(low: 0, high: observations.Count, size: new Shape(64));
                int[] picks = sample_indices.ToArray<int>();

                // Gather the sampled rows of EACH input into one batch, which is
                // what the commented-out np.Take call below describes. The previous
                // nested loop appended every sampled element of every input to one
                // flat list, so sampled_inp[0..4] were five single elements taken
                // from the first input rather than the five input batches.
                // Assumes axis 0 of every input is the sample axis — TODO confirm.
                var sampled_inp = new List<NDArray>();
                foreach (NDArray arr in inp)
                {
                    sampled_inp.Add(np.stack(picks.Select(idx => arr[idx]).ToArray()));
                    // sampled_inp.Add(np.Take(a: arr, indices: sample_indices, axis: 0));
                }

                PPO.train(obs: sampled_inp[0], actions: sampled_inp[1], rewards: sampled_inp[2], v_preds_next: sampled_inp[3], gaes: sampled_inp[4]);
            }

            var summary = PPO.get_summary(obs: inp[0], actions: inp[1], rewards: inp[2], v_preds_next: inp[3], gaes: inp[4])[0];
            // writer.add_summary(summary, episode);
        }

        // writer.close();
    }
}