Code example #1
File: CartPole.cs Project: Aangbaeck/XGBoost
        private static void CartPoleEnv()
        {
            CartPoleEnv cp   = new CartPoleEnv(); //or AvaloniaEnvViewer.Factory
            bool        done = true;

            for (int i = 0; i < 100_000; i++)
            {
                if (done)
                {
                    NDArray observation = cp.Reset();
                    done = false;
                }
                else
                {
                    var (observation, reward, _done, information) = cp.Step(i % 2);  // we alternate between moving left and right
                    done = _done;

                    // do something with the reward and observation.
                }

                //var view = new Ebby.Gym.Rendering.Viewer(100, 100, "viewer");
                var img = cp.Render(); //returns the image that was rendered.

                Thread.Sleep(10);      //this is to prevent it from finishing instantly !
            }
        }
Code example #2
        public void SolveCartpole()
        {
            for (int episode = 0; episode < maxEpisodes; episode++)
            {
                List <double>         rewards = new List <double>();
                List <List <Matrix> > episodeActorGradients  = new List <List <Matrix> >();
                List <List <Matrix> > episodeCriticGradients = new List <List <Matrix> >();

                var state = _env.Reset().ToArray <double>();
                var value = _critic.Forward(state)[0];

                for (int steps = 0; steps < maxSteps; steps++)
                {
                    var policyDistribution = (_actor.Forward(state));
                    var actionIndex        = SelectActionIndex(policyDistribution);
                    var prob = policyDistribution[actionIndex];

                    var(newStateRaw, reward, done, information) = _env.Step(actionIndex);
                    var stateNew = newStateRaw.ToArray <double>();

                    rewards.Add(reward);

                    var valueNew  = _critic.Forward(stateNew)[0];
                    var advantage = CalculateAdvantage(valueNew, value, reward);

                    _actor.Backward(policyDistribution, actionIndex, advantage);
                    _critic.Backward(advantage);
                    episodeActorGradients.Add(_actor.GetGradients());
                    episodeCriticGradients.Add(_critic.GetGradients());

                    if (done || steps == (maxSteps - 1))
                    {
                        _allRewards.Add(rewards.Sum());

                        if (episode % 1 == 0) // log every episode; increase the modulus to log less often
                        {
                            Console.WriteLine("Episode: {0}, Rewards: {1}", episode, _allRewards.Last());
                        }

                        break;
                    }
                    value = valueNew;
                    state = stateNew;
                    _env.Render();
                    Thread.Sleep(5);
                }

                UpdateWeights(episodeActorGradients, episodeCriticGradients);
            }
        }
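Note: `CalculateAdvantage` is a project helper whose implementation is not part of this listing. A minimal sketch of what it could compute, assuming a one-step TD error with a hypothetical discount factor `gamma` (not taken from the project):

        // Hypothetical sketch of the advantage helper used above: one-step TD error.
        private double CalculateAdvantage(double valueNew, double value, double reward)
        {
            const double gamma = 0.99;                // assumed discount factor, not from the original project
            return reward + gamma * valueNew - value; // TD error: r + gamma * V(s') - V(s)
        }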
Code example #3
        public void Run() {
            var cp = new CartPoleEnv();
            var rnd = new Random();
            var done = true;
            using (new StopwatchMeasurer("time it took to run all steps in ms"))
                for (int i = 0; i < 100_000; i++) {
                    if (done) {
                        cp.Reset();
                        done = false;
                    } else {
                        var (observation, reward, _done, information) = cp.Step((i % 2));
                        done = _done;
                    }

                    cp.Render();
                    Thread.Sleep(15); //this is to prevent it from finishing instantly !
                }
        }
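Note: `StopwatchMeasurer` is a project-specific helper that is not included in this listing. A minimal sketch, assuming it is an `IDisposable` wrapper around `System.Diagnostics.Stopwatch` that prints the elapsed milliseconds when disposed:

        // Hypothetical sketch of the timing helper used above.
        public sealed class StopwatchMeasurer : IDisposable
        {
            private readonly string _label;
            private readonly System.Diagnostics.Stopwatch _sw = System.Diagnostics.Stopwatch.StartNew();

            public StopwatchMeasurer(string label) => _label = label;

            public void Dispose()
            {
                _sw.Stop();
                Console.WriteLine("{0}: {1}", _label, _sw.ElapsedMilliseconds); // e.g. "time it took to run all steps in ms: 1234"
            }
        }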
Code example #4
        static void Main(string[] args)
        {
            // Configuration parameters for the whole setup
            var seed  = 42;
            var gamma = 0.99;  // Discount factor for past rewards
            var max_steps_per_episode = 10000;
            // var env = gym.make("CartPole-v0");  // Create the environment
            CartPoleEnv env = new CartPoleEnv(WinFormEnvViewer.Factory);  // Create the environment

            env.Seed(seed);
            // var eps = np.finfo(np.float32).eps.item();  // Smallest number such that 1.0 + eps != 1.0
            var eps = 1e-5;  // small constant that keeps the normalization denominator non-zero (stand-in for float32 machine epsilon)

            /*
             * //// Implement Actor Critic network
             *
             * This network learns two functions:
             *
             * 1. Actor: This takes as input the state of our environment and returns a
             * probability value for each action in its action space.
             * 2. Critic: This takes as input the state of our environment and returns
             * an estimate of total rewards in the future.
             *
             * In our implementation, they share the initial layer.
             */

            var num_inputs  = 4;
            var num_actions = 2;
            var num_hidden  = 128;

            LayersApi layers = new LayersApi();
            var       inputs = layers.Input(shape: (num_inputs));
            var       common = layers.Dense(num_hidden, activation: "relu").Apply(inputs);
            var       action = layers.Dense(num_actions, activation: "softmax").Apply(common);
            var       critic = layers.Dense(1).Apply(common);

            Model model = keras.Model(inputs: inputs, outputs: (action, critic));

            /*
             * //// Train
             */

            var    optimizer            = keras.optimizers.Adam(learning_rate: (float)0.01);
            var    huber_loss           = keras.losses.Huber();
            var    action_probs_history = new List <double>();
            var    critic_value_history = new List <dynamic>();
            var    rewards_history      = new List <double>();
            double running_reward       = 0;
            var    episode_count        = 0;

            while (true)  // Run until solved
            {
                Program.state = env.Reset();
                double episode_reward = 0;
                using (var tape = tf.GradientTape())
                {
                    for (int timestep = 1; timestep < max_steps_per_episode; timestep++)
                    {
                        //env.Render(); // Adding this line would show the attempts
                        // of the agent in a pop up window.

                        Program.state = tf.convert_to_tensor(Program.state);
                        Program.state = tf.expand_dims(Program.state, 0);

                        // Predict action probabilities and estimated future rewards
                        // from environment state
                        // var (action_probs, critic_value) = model.Apply(Program.state);
                        var pred_result = model.Apply(tf.cast(Program.state, tf.float32));

                        var action_probs = pred_result[0][0];
                        var critic_value = pred_result[1][0][0];

                        critic_value_history.Add(critic_value);

                        NDArray probabilities = np.squeeze(action_probs.numpy());
                        Console.WriteLine(probabilities);
                        // Sample action from action probability distribution
                        NDArray chosen_action = np.random.choice(num_actions, probabilities: probabilities.astype(np.float64).ToArray <double>());
                        action_probs_history.Add(tf.math.log(action_probs[0, chosen_action]));

                        // Apply the sampled action in our environment
                        var (state, reward, done, _) = env.Step(chosen_action);
                        Program.state = state;  // carry the new observation into the next timestep
                        rewards_history.Add(reward);
                        episode_reward += reward;

                        if (done)
                        {
                            break;
                        }
                    }
                    // Update running reward to check condition for solving
                    running_reward = 0.05 * episode_reward + (1 - 0.05) * running_reward;

                    // Calculate expected value from rewards
                    // - At each timestep what was the total reward received after that timestep
                    // - Rewards in the past are discounted by multiplying them with gamma
                    // - These are the labels for our critic
                    dynamic returns        = new List <double>();
                    double  discounted_sum = 0;

                    var reverse_rewards_history = rewards_history;
                    reverse_rewards_history.Reverse();  // note: Reverse() reverses rewards_history in place; harmless here because the history is cleared at the end of the episode
                    foreach (double r in reverse_rewards_history)
                    {
                        discounted_sum = r + gamma * discounted_sum;
                        returns.Insert(0, discounted_sum);
                    }

                    // Normalize
                    returns = np.array(returns.ToArray());
                    returns = (returns - np.mean(returns)) / (np.std(returns) + eps);
                    returns = returns.ToList();

                    // Calculating loss values to update our network
                    var history       = zip(action_probs_history, critic_value_history, returns);
                    var actor_losses  = new List <double>();
                    var critic_losses = new List <double>();
                    foreach (double[] item in history)
                    {
                        var     log_prob = item[0];
                        dynamic value    = item[1];
                        dynamic ret      = item[2];
                        // At this point in history, the critic estimated that we would get a
                        // total reward = `value` in the future. We took an action with log probability
                        // of `log_prob` and ended up receiving a total reward = `ret`.
                        // The actor must be updated so that it predicts an action that leads to
                        // high rewards (compared to critic's estimate) with high probability.
                        var diff = ret - value;
                        actor_losses.Add(-log_prob * diff);  // actor loss

                        // The critic must be updated so that it predicts a better estimate of
                        // the future rewards.
                        critic_losses.Add(
                            huber_loss.Call(tf.expand_dims(value, 0), tf.expand_dims(ret, 0))
                            );
                    }

                    // Backpropagation
                    dynamic loss_value = actor_losses.Sum(x => Convert.ToDouble(x)) + critic_losses.Sum(x => Convert.ToDouble(x));
                    var     grads      = tape.gradient(loss_value, model.trainable_variables);
                    optimizer.apply_gradients(zip(grads, model.trainable_variables));

                    // Clear the loss and reward history
                    action_probs_history.Clear();
                    critic_value_history.Clear();
                    rewards_history.Clear();
                }
                // Log details
                episode_count += 1;
                if (episode_count % 10 == 0)
                {
                    var template = String.Format("running reward: {0} at episode {1}", running_reward, episode_count);
                    Console.WriteLine(template);
                }

                if (running_reward > 195)  // Condition to consider the task solved
                {
                    Console.WriteLine(String.Format("Solved at episode {0}!", episode_count));
                    break;
                }
            }

            /*
             * //// Visualizations
             * In early stages of training:
             * ![Imgur](https://i.imgur.com/5gCs5kH.gif)
             *
             * In later stages of training:
             * ![Imgur](https://i.imgur.com/5ziiZUD.gif)
             */
        }
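Note: the discounted-return block above (walk the reward history backwards, then normalize) can be read in isolation. A minimal sketch of the same idea as a standalone helper; the name `ComputeReturns` and its defaults are illustrative, not from the original, and `using System.Linq` is assumed (the listing already relies on it for `.Sum()`):

        // Hypothetical helper mirroring the return calculation in the listing above.
        static List<double> ComputeReturns(IList<double> rewards, double gamma, double eps = 1e-5)
        {
            // Walk the rewards backwards so each entry becomes the discounted sum of all later rewards.
            var returns = new List<double>();
            double discountedSum = 0;
            for (int t = rewards.Count - 1; t >= 0; t--)
            {
                discountedSum = rewards[t] + gamma * discountedSum;
                returns.Insert(0, discountedSum);
            }

            // Normalize to zero mean and unit variance; these act as targets for the critic.
            double mean = returns.Average();
            double std  = Math.Sqrt(returns.Sum(r => (r - mean) * (r - mean)) / returns.Count);
            for (int t = 0; t < returns.Count; t++)
            {
                returns[t] = (returns[t] - mean) / (std + eps);
            }
            return returns;
        }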
Code example #5
        static void Main(string[] args)
        {
            var optimizer = keras.optimizers.Adam(learning_rate: LR);

            var train_writer = tf.summary.create_file_writer(STORE_PATH + $"/PPO-CartPole_{DateTime.Now:ddMMyyyyHHmm}");

            int    num_steps          = 10000000;
            Double episode_reward_sum = 0;
            NDArray state     = env.Reset();
            int    episode    = 1;
            Double total_loss = 0;

            for (int step = 0; step < num_steps; step++)
            {
                var rewards = new List <double>();
                var actions = new List <Tensor>();
                var values  = new List <Tensor>();
                var states  = new List <NDArray>();
                var dones   = new List <bool>();
                var probs   = new List <Tensor>();
                for (int i = 0; i < BATCH_SIZE; i++)
                {
                    var(_, policy_logits) = model(state.reshape(1, -1));

                    var(action, value) = model.action_value(state.reshape(1, -1));
                    var(new_state, reward, done, _) = env.Step(action.numpy()[0]);

                    actions.Add(action);
                    values.Add(value[0]);
                    states.Add(state);
                    dones.Add(done);
                    probs.Add(policy_logits);
                    episode_reward_sum += reward;

                    state = new_state;

                    if (done)
                    {
                        rewards.Add(0.0);
                        state = env.Reset();
                        if (total_loss != 0)
                        {
                            Console.WriteLine("Episode: {episode}, latest episode reward: {episode_reward_sum}, ",
                                              "total loss: {np.mean(total_loss)}, critic loss: {np.mean(c_loss)}, ",
                                              "actor loss: {np.mean(act_loss)}, entropy loss {np.mean(ent_loss)}");
                        }
                        using (train_writer.as_default())
                        {
                            tf.summary.scalar("rewards", episode_reward_sum, episode);
                        }
                        episode_reward_sum = 0;

                        episode += 1;
                    }
                    else
                    {
                        rewards.Add(reward);
                    }
                }
                var(_, next_value) = model.action_value(state.reshape(1, -1));
                var(discounted_rewards, advantages) = get_advantages(rewards, dones, values, next_value[0]);

                var actions_tensor = tf.squeeze(tf.stack(actions));
                var probs_tensor   = tf.nn.softmax(tf.squeeze(tf.stack(probs)));
                Tensor action_inds = tf.stack((tf.range(0, actions_tensor.shape[0]), tf.cast(actions_tensor, tf.int32)), axis: 1);

                total_loss = np.zeros((NUM_TRAIN_EPOCHS));
                Tensor act_loss = np.zeros((NUM_TRAIN_EPOCHS));
                Tensor c_loss   = np.zeros(((NUM_TRAIN_EPOCHS)));
                Tensor ent_loss = np.zeros((NUM_TRAIN_EPOCHS));
                for (int epoch = 0; epoch < NUM_TRAIN_EPOCHS; epoch++)
                {
                    Tensor loss_tuple = train_model(action_inds, tf.gather(probs_tensor, action_inds),
                                                    states, advantages, discounted_rewards, optimizer,
                                                    ent_discount_val);
                    total_loss[epoch] = loss_tuple[0];
                    c_loss[epoch]     = loss_tuple[1];
                    act_loss[epoch]   = loss_tuple[2];
                    ent_loss[epoch]   = loss_tuple[3];
                }
                ent_discount_val *= ENT_DISCOUNT_RATE;

                using (train_writer.as_default())
                {
                    tf.summary.scalar("tot_loss", np.mean(total_loss), step);
                    tf.summary.scalar("critic_loss", np.mean(c_loss), step);
                    tf.summary.scalar("actor_loss", np.mean(act_loss), step);
                    tf.summary.scalar("entropy_loss", np.mean(ent_loss), step);
                }
            }
        }
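Note: `get_advantages` is referenced above but not defined in this listing. A minimal sketch of one common formulation, using plain double lists for clarity (the listing stores Tensors and bools, so conversions would be needed): discounted returns bootstrapped from `next_value`, with advantages defined as returns minus the value estimates. The signature and the `gamma` default are illustrative.

        // Hypothetical sketch of the advantage helper used above, on plain doubles.
        static (double[] discountedRewards, double[] advantages) get_advantages(
            IList<double> rewards, IList<double> dones, IList<double> values, double nextValue, double gamma = 0.99)
        {
            var discounted = new double[rewards.Count];
            double running = nextValue;
            for (int t = rewards.Count - 1; t >= 0; t--)
            {
                running = rewards[t] + gamma * running * (1 - dones[t]); // a terminal step (done == 1) cuts the bootstrap off
                discounted[t] = running;
            }

            var advantages = new double[rewards.Count];
            for (int t = 0; t < rewards.Count; t++)
            {
                advantages[t] = discounted[t] - values[t]; // advantage = return - value estimate
            }
            return (discounted, advantages);
        }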
Code example #6
        static void Main(string[] args)
        {
            List <NDArray> observations = new List <NDArray>();
            List <NDArray> actions      = new List <NDArray>();
            List <NDArray> v_preds      = new List <NDArray>();
            List <double>  rewards      = new List <double>();
            List <NDArray> v_preds_next = new List <NDArray>();
            List <NDArray> gaes         = new List <NDArray>();

            double EPISODES = 1e5;
            double GAMMA    = 0.95;

            CartPoleEnv env = new CartPoleEnv(WinFormEnvViewer.Factory);                          //or AvaloniaEnvViewer.Factory  // Instantiate the CartPole environment

            env.Seed(0);                                                                          // Seed the environment for reproducibility
            Space      ob_space   = env.ObservationSpace;                                         // Describes the format of valid observations in the space
            Policy_net Policy     = new Policy_net("policy", env);                                // Create the policy network
            Policy_net Old_Policy = new Policy_net("old_policy", env);                            // Create the old policy network
            PPOTrain   PPO        = new PPOTrain(Policy, Old_Policy, gamma: GAMMA);
            Saver      saver      = tf.train.Saver();                                             // Saver for model checkpoints

            using (var sess = tf.Session())                                                       // Bloco da sessão
            {
                FileWriter writer = tf.summary.FileWriter("./log/train", sess.graph);             // Set the log directory
                sess.run(tf.global_variables_initializer());                                      // Initialize the networks

                NDArray obs         = env.Reset();                                                // Reset the environment and get the first observation
                double  reward      = 0;                                                          // Stores the reward
                int     success_num = 0;                                                          // Success counter

                for (int episode = 0; episode < EPISODES; episode++)                              // Episode loop
                {
                    int run_policy_steps = 0;                                                     // Step counter for each episode
                    env.Render();                                                                 // Render the environment

                    while (true)                                                                  // Run the policy for RUN_POLICY_STEPS, which is much shorter than the episode length
                    {
                        run_policy_steps += 1;                                                    // Increment the per-episode step counter
                        obs = np.stack(new[] { obs }).astype(dtype: np.float32);                  // prepare to feed placeholder Policy.obs
                        (NDArray _act, NDArray _v_pred) = Policy.act(obs: obs, stochastic: true); // Run the neural network and get an action and the predicted value
                        int     act    = np.asscalar <int>(_act.ToArray <int>());                 // Convert a NumPy array
                        NDArray v_pred = np.asscalar <double[]>(_v_pred.ToArray <double>());      // into a scalar object
                        //var v_pred = _v_pred.Item();  // into a scalar object

                        observations.Add(obs);                           // Append the observation to the observation buffer
                        actions.Add(act);                                // Append the action to the action buffer
                        v_preds.Add(v_pred);                             // Append v_pred to the v_pred buffer
                        rewards.Add(reward);                             // Append the reward to the reward buffer

                        var(next_obs, _reward, done, _) = env.Step(act); // Send the action to the environment and receive the next observation, the reward, and whether the step is done
                        reward = _reward;

                        if (done)
                        {                                                  // If done is true...
                            v_preds_next = new List <NDArray>(v_preds);    // copy, so the original v_preds buffer is not mutated; mirrors Python's v_preds[1:] + [0]
                            v_preds_next.RemoveAt(0);                      // [1:] drops the first element of the list
                            v_preds_next.Add(0);                           // + [0] appends a zero at the end of the list
                            // next state of terminate state has 0 state value
                            obs    = env.Reset(); //   Reset the environment
                            reward = -1;          //   set the reward to -1 (?)
                            break;                //   Exit the while loop
                        }
                        else
                        {                   // Otherwise...
                            obs = next_obs; //   Store the next observation in obs
                        }
                        }
                    }
                    // Log for visualization in TensorBoard
                    //writer.add_summary(tf.Summary(value:[tf.Summary.Value(tag:"episode_length", simple_value:run_policy_steps)]), episode);
                    //writer.add_summary(tf.Summary(value:[tf.Summary.Value(tag:"episode_reward", simple_value:rewards.Sum())]),     episode);

                    // Condition to end the test
                    if (rewards.Sum() >= 195)
                    {                                                  // If the sum of rewards is greater than or equal to 195
                        success_num += 1;                              //   increment the success counter
                        if (success_num >= 100)
                        {                                              //   If there have been 100 successes
                            saver.save(sess, "./model/model.ckpt");    //       save the session
                            Console.WriteLine("Clear!! Model saved."); //       print to the console
                            break;                                     //       exit the loop
                        }
                    }
                    else
                    {                                                                                    // otherwise,
                        success_num = 0;                                                                 //   reset the success counter
                    }
                    Console.WriteLine("EP: {0} Rw: {1}", episode, rewards.Sum());                        // Print the episode number and the reward

                    gaes = PPO.get_gaes(rewards: rewards, v_preds: v_preds, v_preds_next: v_preds_next); // compute the generalized advantage estimates

                    // Convert lists to NDArrays to feed the tf.placeholder
                    //int[] newShape = ((-1), (list(ob_space.Shape)));// creates a [-1, 4] array
                    var _observations = np.reshape(observations.ToArray(), shape: ((-1), (4)));// before, each row of observations was an independent array; after the reshape, observations is a single array with several rows.
                    var _actions      = np.array(actions.ToArray(), dtype: np.int32);
                    var _rewards      = np.array(rewards.ToArray(), dtype: np.float32);
                    var _v_preds_next = np.array(v_preds_next.ToArray(), dtype: np.float32);
                    var __gaes        = np.array(gaes.ToArray(), dtype: np.float32);
                    var _gaes         = (__gaes - __gaes.mean()) / __gaes.std(); // subtract the mean of gaes from every item and divide by the standard deviation of gaes

                    PPO.assign_policy_parameters();

                    NDArray[] inp = new[] { _observations, _actions, _rewards, _v_preds_next, _gaes };  // Build an array with five entries: observations, actions, rewards, next value predictions, and GAEs

                    // Train
                    for (int epoch = 0; epoch < 4; epoch++)
                    {
                        NDArray sample_indices = np.random.randint(low: 0, high: observations.ToArray().GetLength(0), size: new Shape(64));// indices lie in [low, high)
                        var     sampled_inp    = new List <NDArray>();
                        foreach (NDArray arr in inp)
                        {
                            var rows = new List <NDArray>();
                            foreach (int index in sample_indices.ToArray <int>())
                            {
                                rows.Add(arr[index]);                                               // pick the sampled row from this buffer
                            }
                            sampled_inp.Add(np.stack(rows.ToArray()));                              // one NDArray of sampled training data per buffer
                            //sampled_inp.Add(np.Take(a: arr, indices: sample_indices, axis: 0));   // sample of training data
                        }
                        PPO.train(obs: sampled_inp[0], actions: sampled_inp[1], rewards: sampled_inp[2], v_preds_next: sampled_inp[3], gaes: sampled_inp[4]);
                    }
                    var summary = PPO.get_summary(obs: inp[0], actions: inp[1], rewards: inp[2], v_preds_next: inp[3], gaes: inp[4])[0];

                    //writer.add_summary(summary, episode);
                }
                //writer.close(); // Final do episódio
            }
        }
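Note: `PPO.get_gaes` is not shown in this listing. A minimal sketch of generalized advantage estimation in its usual form, using plain doubles for clarity (the listing stores NDArrays); the `gamma` and `lambda` defaults are illustrative, not taken from the project:

        // Hypothetical sketch of generalized advantage estimation (GAE).
        static List<double> get_gaes(IList<double> rewards, IList<double> v_preds, IList<double> v_preds_next,
                                     double gamma = 0.95, double lambda = 0.95)
        {
            // delta_t = r_t + gamma * V(s_{t+1}) - V(s_t)
            var gaes = new List<double>();
            for (int t = 0; t < rewards.Count; t++)
            {
                gaes.Add(rewards[t] + gamma * v_preds_next[t] - v_preds[t]);
            }

            // GAE_t = delta_t + gamma * lambda * GAE_{t+1}, accumulated backwards over the episode.
            for (int t = gaes.Count - 2; t >= 0; t--)
            {
                gaes[t] = gaes[t] + gamma * lambda * gaes[t + 1];
            }
            return gaes;
        }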