Code example #1
File: CartPole.cs Project: Aangbaeck/XGBoost
        private static void CartPoleEnv()
        {
            CartPoleEnv cp   = new CartPoleEnv(); //or AvaloniaEnvViewer.Factory
            bool        done = true;

            for (int i = 0; i < 100_000; i++)
            {
                if (done)
                {
                    NDArray observation = cp.Reset();
                    done = false;
                }
                else
                {
                    var(observation, reward, _done, information) = cp.Step((i % 2));  //we switch between moving left and right
                    done = _done;

                    // do something with the reward and observation here.
                }

                //var view = new Ebby.Gym.Rendering.Viewer(100, 100, "viewer");
                var img = cp.Render(); //returns the image that was rendered.

                Thread.Sleep(10);      //this is to prevent it from finishing instantly !
            }
        }
Code example #2
File: CartPole.cs Project: uzbekdev1/DeepRL
        static void Main(string[] args)
        {
            Env env = new CartPoleEnv();
            //https://github.com/rlcode/reinforcement-learning/blob/master/2-cartpole/1-dqn/cartpole_dqn.py

            var valueFunc = new DQN(env.ObservationSpace.Shape, env.ActionSpace.NumberOfValues(), new[] { 24, 24 }, 0.001f, 0.99f, 32, new ExperienceReplay(2000))
            {
                TargetModelUpdateInterval = 1
            };

            Agent agent = new AgentQL("dqn_cartpole", env, valueFunc)
            {
                Verbose          = true,
                RewardOnDone     = -100,
                EpsilonDecayMode = EEpsilonDecayMode.EveryStep,
                EpsilonDecay     = 0.999f,
                WarmupSteps      = 1000
            };

            agent.Train(300, 500);
            Console.WriteLine($"Average reward {agent.Test(50, 300, 0)}");

            //while (!env.Step((int)env.ActionSpace.Sample()[0], out var nextState, out var reward))
            //{
            //    env.Render();
            //    Thread.Sleep(50);
            //}

            return;
        }
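
A note on the exploration settings used above: with EpsilonDecayMode.EveryStep and EpsilonDecay = 0.999f, the exploration rate presumably decays geometrically on every environment step once the warmup phase is over. The stand-alone helper below only illustrates that assumed schedule; EpsilonSchedule, its parameters and the floor value are hypothetical and not part of the DeepRL project.

using System;

static class EpsilonSchedule
{
    // Illustrative only: geometric per-step decay, as assumed for EpsilonDecayMode.EveryStep.
    public static double After(int steps, double start = 1.0, double decay = 0.999, double min = 0.01)
    {
        double eps = start;
        for (int i = 0; i < steps; i++)
            eps = Math.Max(min, eps * decay);   // one multiplication per environment step, clamped at the floor
        return eps;
    }
}

// EpsilonSchedule.After(1000) ≈ 0.37; by roughly 5000 steps the schedule sits at the 0.01 floor.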
Code example #3
File: test_policy.cs Project: samuelcaldas/PPO.NETv2
        static void _Main(string[] args)
        {
            int success_num;

            double      ITERATION = (3 * 10e5);
            double      GAMMA     = 0.95;
            CartPoleEnv env       = new CartPoleEnv(WinFormEnvViewer.Factory); //or AvaloniaEnvViewer.Factory

            env.Seed(0);
            Space      ob_space   = env.ObservationSpace;
            Policy_net Policy     = new Policy_net("policy", env);
            Policy_net Old_Policy = new Policy_net("old_policy", env);
            PPOTrain   PPO        = new PPOTrain(Policy, Old_Policy, gamma: GAMMA);
            Saver      saver      = tf.train.Saver();

            using (var sess = tf.Session())
            {
                writer = tf.summary.FileWriter("./log/test", sess.graph);
                sess.run(tf.global_variables_initializer());
                saver.restore(sess, "model/model.ckpt");
                obs         = env.Reset();
                reward      = 0;
                success_num = 0;

                for (double iteration = 0; iteration < ITERATION; iteration++)
                {  // episode
                    run_policy_steps = 0;
                    env.Render();
                    while (true)
                    {                                                            // run policy RUN_POLICY_STEPS which is much less than episode length
                        run_policy_steps += 1;
                        obs = np.stack(new[] { obs }).astype(dtype: np.float32); // prepare to feed placeholder Policy.obs
                        var(act, v_pred) = Policy.act(obs: obs, stochastic: false);

                        act    = act.item();
                        v_pred = v_pred.item();

                        observations.Add(obs);
                        actions.Add(act);
                        v_preds.Add(v_pred);
                        rewards.Add(reward);

                        var(next_obs, reward, done, info) = env.Step(act);

                        if (done)
                        {
                            // next state of the terminal state has 0 state value ([1:] + [0] in the Python original)
                            v_preds_next = new List<NDArray>(v_preds);
                            v_preds_next.RemoveAt(0);
                            v_preds_next.Add(0);
Code example #4
        public void Run() {
            var cp = new CartPoleEnv();
            var rnd = new Random();
            var done = true;
            using (new StopwatchMeasurer("time it took to run all steps in ms"))
                for (int i = 0; i < 100_000; i++) {
                    if (done) {
                        cp.Reset();
                        done = false;
                    } else {
                        var (observation, reward, _done, information) = cp.Step((i % 2));
                        done = _done;
                    }

                    cp.Render();
                    Thread.Sleep(15); //this is to prevent it from finishing instantly !
                }
        }
Code example #5
        public A2C()
        {
            _env   = new CartPoleEnv();
            _actor = new Model(new List <Layers.Layer> {
                new Layers.Input(_env.ObservationSpace.Shape[0]),
                new Layers.Dense(256, new ActivationFunctions.Relu()),
                new Layers.Dense(_env.ActionSpace.Shape[0], new ActivationFunctions.Softmax())
            });
            _critic = new Model(new List <Layers.Layer> {
                new Layers.Input(_env.ObservationSpace.Shape[0]),
                new Layers.Dense(256, new ActivationFunctions.Relu()),
                new Layers.Dense(1, null)
            });

            _actor.Compile(new Optimizers.Adam(lr), new LossFunctions.ActorLoss());
            _critic.Compile(new Optimizers.Adam(lr), new LossFunctions.CriticLoss());

            _allLengths     = new List <int>();
            _averageLengths = new List <double>();
            _allRewards     = new List <double>();
        }
Code example #6
        public Policy_net(string name, CartPoleEnv env, double temp = 0.1)
        {
            /*
             * :param name: string
             * :param env: gym env
             * :param temp: temperature of boltzmann distribution
             */

            var ob_space  = env.ObservationSpace;
            var act_space = env.ActionSpace;

            using (tf.variable_scope(name))
            {
                this.obs = tf.placeholder(dtype: tf.float32, shape: new TensorShape((Unknown), (4)), name: "obs");

                using (tf.variable_scope("policy_net"))
                {
                    var layer_1 = keras.layers.dense(inputs: this.obs, units: 20, activation: tf.nn.tanh());
                    var layer_2 = keras.layers.dense(inputs: layer_1, units: 20, activation: tf.nn.tanh());
                    var layer_3 = keras.layers.dense(inputs: layer_2, units: 4, activation: tf.nn.tanh());
                    this.act_probs = keras.layers.dense(inputs: tf.divide(layer_3, new Tensor(temp)), units: 4, activation: tf.nn.softmax());
                }
                using (tf.variable_scope("value_net"))
                {
                    var layer_1 = keras.layers.dense(inputs:   this.obs, units: 20, activation: tf.nn.tanh());
                    var layer_2 = keras.layers.dense(inputs:   layer_1, units: 20, activation: tf.nn.tanh());
                    this.v_preds = keras.layers.dense(inputs:  layer_2, units: 1, activation: null);
                }
                this.act_stochastic = tf.random.categorical(tf.log(this.act_probs), num_samples: 1);
                this.act_stochastic = tf.reshape(this.act_stochastic, shape: (-1));

                this.act_deterministic = tf.argmax(this.act_probs, axis: 1);

                this.scope = tf.get_variable_scope().name;
            }
        }
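
The tf.divide(layer_3, new Tensor(temp)) step above is the Boltzmann part of the policy: the logits are scaled by 1/temp before the softmax, so a low temperature sharpens the action distribution and a high one flattens it. A minimal plain-C# sketch of that transformation, using hypothetical logit values and no TensorFlow dependency:

using System;
using System.Linq;

static class BoltzmannDemo
{
    // softmax(z / T): the lower the temperature T, the more probability mass lands on the largest logit.
    static double[] Softmax(double[] logits, double temp)
    {
        var scaled = logits.Select(z => z / temp).ToArray();
        double max = scaled.Max();                                    // subtract the max for numerical stability
        var exps   = scaled.Select(z => Math.Exp(z - max)).ToArray();
        double sum = exps.Sum();
        return exps.Select(e => e / sum).ToArray();
    }

    static void Main()
    {
        var logits = new[] { 1.0, 0.5, 0.1, -0.2 };                   // hypothetical layer_3 outputs
        Console.WriteLine(string.Join(", ", Softmax(logits, 1.0)));   // temp = 1.0: fairly flat distribution
        Console.WriteLine(string.Join(", ", Softmax(logits, 0.1)));   // temp = 0.1: close to one-hot
    }
}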
Code example #7
        public Policy_net(string name, CartPoleEnv env, double temp = 0.1)
        {
            /*
             * :param name: string
             * :param env: gym env
             * :param temp: temperature of boltzmann distribution
             */

            var ob_space  = env.ObservationSpace;
            var act_space = env.ActionSpace;

            using (tf.variable_scope(name))
            {
                var obs = tf.placeholder(dtype: tf.float32, shape: new TensorShape(-1, ob_space.Shape[0]), name: "obs");  // -1 leaves the batch dimension unknown

                using (tf.variable_scope("policy_net"))
                {
                    var layer_1 = tf.layers.dense(inputs: obs, units: 20, activation: tf.tanh);
                    var layer_2 = tf.layers.dense(inputs: layer_1, units: 20, activation: tf.tanh);
                    var layer_3 = tf.layers.dense(inputs: layer_2, units: act_space.n, activation: tf.tanh);
                    act_probs = tf.layers.dense(inputs: tf.divide(layer_3, temp), units: act_space.n, activation: tf.nn.softmax);
                }
                using (tf.variable_scope("value_net"))
                {
                    var layer_1 = tf.layers.dense(inputs: obs, units: 20, activation: tf.tanh);
                    var layer_2 = tf.layers.dense(inputs: layer_1, units: 20, activation: tf.tanh);
                    v_preds = tf.layers.dense(inputs: layer_2, units: 1, activation: null);
                }
                act_stochastic = tf.multinomial(tf.log(act_probs), num_samples: 1);
                act_stochastic = tf.reshape(act_stochastic, shape: (-1));

                act_deterministic = tf.argmax(act_probs, axis: 1);

                scope = tf.get_variable_scope().name;
            }
        }
Code example #8
        static void Main(string[] args)
        {
            // Configuration parameters for the whole setup
            var seed  = 42;
            var gamma = 0.99;  // Discount factor for past rewards
            var max_steps_per_episode = 10000;
            // var env = gym.make("CartPole-v0");  // Create the environment
            CartPoleEnv env = new CartPoleEnv(WinFormEnvViewer.Factory);  // Create the environment

            env.Seed(seed);
            // var eps = np.finfo(np.float32).eps.item();  // Smallest number such that 1.0 + eps != 1.0
            var eps = 1e-5;  // small constant used when normalizing the returns (stand-in for np.finfo(np.float32).eps in the original)

            /*
             * Implement Actor Critic network
             *
             * This network learns two functions:
             *
             * 1. Actor: This takes as input the state of our environment and returns a
             * probability value for each action in its action space.
             * 2. Critic: This takes as input the state of our environment and returns
             * an estimate of total rewards in the future.
             *
             * In our implementation, they share the initial layer.
             */

            var     num_inputs  = 4;
            NDArray num_actions = 2;
            var     num_hidden  = 128;

            LayersApi layers = new LayersApi();
            var       inputs = layers.Input(shape: (num_inputs));
            var       common = layers.Dense(num_hidden, activation: "relu").Apply(inputs);
            var       action = layers.Dense(num_actions, activation: "softmax").Apply(common);
            var       critic = layers.Dense(1).Apply(common);

            Model model = keras.Model(inputs: inputs, outputs: (action, critic));

            /*
             * Train
             */

            var    optimizer            = keras.optimizers.Adam(learning_rate: (float)0.01);
            var    huber_loss           = keras.losses.Huber();
            var    action_probs_history = new List <double>();
            var    critic_value_history = new List <dynamic>();
            var    rewards_history      = new List <double>();
            double running_reward       = 0;
            var    episode_count        = 0;

            while (true)  // Run until solved
            {
                Program.state = env.Reset();
                double episode_reward = 0;
                using (var tape = tf.GradientTape())
                {
                    for (int timestep = 1; timestep < max_steps_per_episode; timestep++)
                    {
                        //env.Render(); // Adding this line would show the attempts
                        // of the agent in a pop up window.

                        Program.state = tf.convert_to_tensor(Program.state);
                        Program.state = tf.expand_dims(Program.state, 0);

                        // Predict action probabilities and estimated future rewards
                        // from environment state
                        // var (action_probs, critic_value) = model.Apply(Program.state);
                        var pred_result = model.Apply(tf.cast(Program.state, tf.float32));

                        var action_probs = pred_result[0][0];
                        var critic_value = pred_result[1][0][0];

                        critic_value_history.Add(critic_value);

                        NDArray probabilities = np.squeeze(action_probs.numpy());
                        Console.WriteLine(probabilities);
                        // Sample action from action probability distribution
                        // NOTE: converting the probabilities to double[] here is an assumption
                        NDArray chosen_action = np.random.choice(num_actions, probabilities: probabilities.astype(np.float64).ToArray<double>());
                        action_probs_history.Add(tf.math.log(action_probs[0, chosen_action]));

                        // Apply the sampled action in our environment
                        var (state, reward, done, _) = env.Step(chosen_action);
                        Program.state = state;  // carry the new observation into the next timestep
                        rewards_history.Add(reward);
                        episode_reward += reward;

                        if (done)
                        {
                            break;
                        }
                    }
                    // Update running reward to check condition for solving
                    running_reward = 0.05 * episode_reward + (1 - 0.05) * running_reward;

                    // Calculate expected value from rewards
                    // - At each timestep what was the total reward received after that timestep
                    // - Rewards in the past are discounted by multiplying them with gamma
                    // - These are the labels for our critic
                    dynamic returns        = new List <double>();
                    double  discounted_sum = 0;

                    var reverse_rewards_history = rewards_history;
                    reverse_rewards_history.Reverse();
                    foreach (double r in reverse_rewards_history)
                    {
                        discounted_sum = r + gamma * discounted_sum;
                        returns.Insert(0, discounted_sum);
                    }

                    // Normalize
                    returns = np.array(returns.ToArray());
                    returns = (returns - np.mean(returns)) / (np.std(returns) + eps);
                    returns = returns.ToList();

                    // Calculating loss values to update our network
                    var history       = zip(action_probs_history, critic_value_history, returns);
                    var actor_losses  = new List <double>();
                    var critic_losses = new List <double>();
                    foreach (double[] item in history)
                    {
                        var     log_prob = item[0];
                        dynamic value    = item[1];
                        dynamic ret      = item[2];
                        // At this point in history, the critic estimated that we would get a
                        // total reward = `value` in the future. We took an action with log probability
                        // of `log_prob` and ended up recieving a total reward = `ret`.
                        // The actor must be updated so that it predicts an action that leads to
                        // high rewards (compared to critic's estimate) with high probability.
                        var diff = ret - value;
                        actor_losses.Add(-log_prob * diff);  // actor loss

                        // The critic must be updated so that it predicts a better estimate of
                        // the future rewards.
                        critic_losses.Add(
                            huber_loss.Call(tf.expand_dims(value, 0), tf.expand_dims(ret, 0))
                            );
                    }

                    // Backpropagation
                    dynamic loss_value = actor_losses.Sum(x => Convert.ToDouble(x)) + critic_losses.Sum(x => Convert.ToDouble(x));
                    var     grads      = tape.gradient(loss_value, model.trainable_variables);
                    optimizer.apply_gradients(zip(grads, model.trainable_variables));

                    // Clear the loss and reward history
                    action_probs_history.Clear();
                    critic_value_history.Clear();
                    rewards_history.Clear();
                }
                // Log details
                episode_count += 1;
                if (episode_count % 10 == 0)
                {
                    var template = String.Format("running reward: {0} at episode {1}", running_reward, episode_count);
                    Console.WriteLine(template);
                }

                if (running_reward > 195)  // Condition to consider the task solved
                {
                    Console.WriteLine(String.Format("Solved at episode {0}!", episode_count));
                    break;
                }
            }

            /*
             * Visualizations
             *
             * In early stages of training:
             * ![Imgur](https://i.imgur.com/5gCs5kH.gif)
             *
             * In later stages of training:
             * ![Imgur](https://i.imgur.com/5ziiZUD.gif)
             */
        }
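
The "Calculate expected value from rewards" block above walks the reward history in reverse and applies discounted_sum = r + gamma * discounted_sum at every step. The snippet below runs the same recurrence on a few hypothetical rewards so the expected output can be checked by hand; it is only a sketch of that one step, not part of the example.

using System;
using System.Collections.Generic;

static class DiscountedReturnsDemo
{
    static void Main()
    {
        double gamma = 0.99;
        var rewards = new List<double> { 1, 1, 1 };       // hypothetical per-step rewards
        var returns = new List<double>();
        double discountedSum = 0;

        // Same recurrence as the training loop: iterate the rewards from last to first.
        for (int t = rewards.Count - 1; t >= 0; t--)
        {
            discountedSum = rewards[t] + gamma * discountedSum;
            returns.Insert(0, discountedSum);
        }

        // Expected: [1 + 0.99 + 0.99^2, 1 + 0.99, 1] = [2.9701, 1.99, 1]
        Console.WriteLine(string.Join(", ", returns));
    }
}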
Code example #9
        static void Main(string[] args)
        {
            List <NDArray> observations = new List <NDArray>();
            List <NDArray> actions      = new List <NDArray>();
            List <NDArray> v_preds      = new List <NDArray>();
            List <double>  rewards      = new List <double>();
            List <NDArray> v_preds_next = new List <NDArray>();
            List <NDArray> gaes         = new List <NDArray>();

            double EPISODES = 1e5;
            double GAMMA    = 0.95;

            CartPoleEnv env = new CartPoleEnv(WinFormEnvViewer.Factory);                          //or AvaloniaEnvViewer.Factory  // Instantiate the CartPole environment

            env.Seed(0);
            Space      ob_space   = env.ObservationSpace;                                         // Describes the shape of valid observations of the space
            Policy_net Policy     = new Policy_net("policy", env);                                // Create the policy network
            Policy_net Old_Policy = new Policy_net("old_policy", env);                            // Create the old policy network
            PPOTrain   PPO        = new PPOTrain(Policy, Old_Policy, gamma: GAMMA);
            Saver      saver      = tf.train.Saver();

            using (var sess = tf.Session())                                                       // Session block
            {
                FileWriter writer = tf.summary.FileWriter("./log/train", sess.graph);             // Set the log directory
                sess.run(tf.global_variables_initializer());                                      // Initialize the networks

                NDArray obs         = env.Reset();                                                // Reset the environment and get the first observation
                double  reward      = 0;                                                          // Stores the rewards
                int     success_num = 0;                                                          // Success counter

                for (int episode = 0; episode < EPISODES; episode++)                              // Episode loop
                {
                    int run_policy_steps = 0;                                                     // Step counter for each episode
                    env.Render();                                                                 // Render the environment

                    while (true)                                                                  // Run the policy for RUN_POLICY_STEPS, which is much less than the episode length
                    {
                        run_policy_steps += 1;                                                    // Increment the per-episode step counter
                        obs = np.stack(new[] { obs }).astype(dtype: np.float32);                  // prepare to feed placeholder Policy.obs
                        (NDArray _act, NDArray _v_pred) = Policy.act(obs: obs, stochastic: true); // Run the neural network and get an action and the predicted value (V)
                        int     act    = np.asscalar <int>(_act.ToArray <int>());                 // Convert a numpy array
                        NDArray v_pred = np.asscalar <double[]>(_v_pred.ToArray <double>());      // into a scalar object
                        //var v_pred = _v_pred.Item();  // into a scalar object

                        observations.Add(obs);                           // Add the observation to the observation buffer
                        actions.Add(act);                                // Add the action to the action buffer
                        v_preds.Add(v_pred);                             // Add v_pred to the v_pred buffer
                        rewards.Add(reward);                             // Add the reward to the reward buffer

                        var (next_obs, _reward, done, _) = env.Step(act); // send the action to the environment and get back the next observation, the reward and whether the step ended
                        reward = _reward;

                        if (done)
                        {                                                  // If done is true...
                            // copy v_preds, drop its first element and append 0 ([1:] + [0] in the Python original);
                            // the next state of the terminal state has 0 state value
                            v_preds_next = new List<NDArray>(v_preds);
                            v_preds_next.RemoveAt(0);
                            v_preds_next.Add(0);
                            obs    = env.Reset(); //   Reset the environment
                            reward = -1;          //   set the reward to -1 (?)
                            break;                //   Exit the while loop
                        }
                        else
                        {                   // Otherwise...
                            obs = next_obs; //   Store the next observation in obs
                        }
                    }
                    // Log to TensorBoard for visualization
                    //writer.add_summary(tf.Summary(value:[tf.Summary.Value(tag:"episode_length", simple_value:run_policy_steps)]), episode);
                    //writer.add_summary(tf.Summary(value:[tf.Summary.Value(tag:"episode_reward", simple_value:rewards.Sum())]),     episode);

                    // Condition for ending the test
                    if (rewards.Sum() >= 195)
                    {                                                  // If the sum of the rewards is greater than or equal to 195
                        success_num += 1;                              //   Increment the success counter
                        if (success_num >= 100)
                        {                                              //   If there have been 100 successes
                            saver.save(sess, "./model/model.ckpt");    //       Save the session
                            Console.WriteLine("Clear!! Model saved."); //       Print to the console
                            break;                                     //       Exit the loop
                        }
                    }
                    else
                    {                                                                                    // otherwise,
                        success_num = 0;                                                                 //   reset the success counter
                    }
                    Console.WriteLine($"EP: {episode} Rw: {rewards.Sum()}");                             // Print the episode number and the total reward

                    gaes = PPO.get_gaes(rewards: rewards, v_preds: v_preds, v_preds_next: v_preds_next); // compute the advantage estimates (see the sketch after this example)

                    // Convert the lists into NDArrays to feed the tf.placeholder
                    //int[] newShape = ((-1), (list(ob_space.Shape)));// creates a [-1, 4] array
                    var _observations = np.reshape(observations.ToArray(), shape: ((-1), (4))); // before, each row of observations was an independent array; after the reshape it is a single array with several rows
                    var _actions      = np.array(actions.ToArray(), dtype: np.int32);
                    var _rewards      = np.array(rewards.ToArray(), dtype: np.float32);
                    var _v_preds_next = np.array(v_preds_next.ToArray(), dtype: np.float32);
                    var __gaes        = np.array(gaes.ToArray(), dtype: np.float32);
                    var _gaes         = (__gaes - __gaes.mean()) / __gaes.std(); // subtract the mean of gaes from every item and divide by the standard deviation of gaes

                    PPO.assign_policy_parameters();

                    NDArray[] inp = new[] { _observations, _actions, _rewards, _v_preds_next, _gaes };  // Array with 5 entries: observations, actions, rewards, next-state value predictions and advantages

                    // Train
                    for (int epoch = 0; epoch < 4; epoch++)
                    {
                        NDArray sample_indices = np.random.randint(low: 0, high: observations.ToArray().GetLength(0), size: new Shape(64));// indices are in [low, high)
                        var     sampled_inp    = new List <NDArray>();
                        foreach (NDArray arr in inp)
                        {
                            foreach (int indice in sample_indices.ToArray <int>())
                            {
                                sampled_inp.Add(arr[0][indice]);
                            }
                            //sampled_inp.Add(np.Take(a: arr, indices: sample_indices, axis: 0));   // sample of the training data
                        }
                        PPO.train(obs: sampled_inp[0], actions: sampled_inp[1], rewards: sampled_inp[2], v_preds_next: sampled_inp[3], gaes: sampled_inp[4]);
                    }
                    var summary = PPO.get_summary(obs: inp[0], actions: inp[1], rewards: inp[2], v_preds_next: inp[3], gaes: inp[4])[0];

                    //writer.add_summary(summary, episode);
                }
                //writer.close(); // End of the episode
            }
        }
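
PPO.get_gaes is not shown in this listing. Presumably it computes generalized advantage estimates from the TD residuals delta_t = r_t + gamma * V(s_{t+1}) - V(s_t), accumulated backwards with the discount factor; that would also explain why a terminal step writes 0 into v_preds_next. The sketch below is an assumption about that computation, not the PPO.NETv2 implementation; the class name, the example buffers and the folding of lambda into gamma are all illustrative.

using System;
using System.Collections.Generic;
using System.Linq;

static class GaeSketch
{
    // Assumed form of get_gaes: TD residuals accumulated backwards with gamma
    // (the simple PPO variant that folds lambda into the discount factor).
    static List<double> GetGaes(IList<double> rewards, IList<double> vPreds, IList<double> vPredsNext, double gamma)
    {
        var gaes = rewards
            .Select((r, t) => r + gamma * vPredsNext[t] - vPreds[t])   // delta_t = r_t + gamma * V(s_{t+1}) - V(s_t)
            .ToList();

        for (int t = gaes.Count - 2; t >= 0; t--)
            gaes[t] = gaes[t] + gamma * gaes[t + 1];                   // discounted running sum of the residuals
        return gaes;
    }

    static void Main()
    {
        // Hypothetical one-episode buffers, shaped like the training loop above.
        var rewards    = new List<double> { 0, 0, 1 };
        var vPreds     = new List<double> { 0.5, 0.6, 0.7 };
        var vPredsNext = new List<double> { 0.6, 0.7, 0.0 };           // the terminal state gets value 0
        Console.WriteLine(string.Join(", ", GetGaes(rewards, vPreds, vPredsNext, 0.95)));
    }
}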