// Test run: restores the trained model and executes the policy deterministically.
// usings assumed: System, System.Collections.Generic, System.Linq, NumSharp,
// Tensorflow, Gym.Environments.Envs.Classic, Gym.Rendering.WinForm, static Tensorflow.Binding
static void _Main(string[] args)
{
    List<NDArray> observations = new List<NDArray>();
    List<int> actions = new List<int>();
    List<double> v_preds = new List<double>();
    List<double> rewards = new List<double>();
    List<double> v_preds_next = new List<double>();

    const int ITERATION = 3000000; // 3 * 10e5
    const double GAMMA = 0.95;

    CartPoleEnv env = new CartPoleEnv(WinFormEnvViewer.Factory); // or AvaloniaEnvViewer.Factory
    env.Seed(0);
    Space ob_space = env.ObservationSpace;
    Policy_net Policy = new Policy_net("policy", env);
    Policy_net Old_Policy = new Policy_net("old_policy", env);
    PPOTrain PPO = new PPOTrain(Policy, Old_Policy, gamma: GAMMA);
    Saver saver = tf.train.Saver();

    using (var sess = tf.Session())
    {
        FileWriter writer = tf.summary.FileWriter("./log/test", sess.graph);
        sess.run(tf.global_variables_initializer());
        saver.restore(sess, "model/model.ckpt"); // load the trained model
        NDArray obs = env.Reset();
        double reward = 0;
        int success_num = 0;

        for (int iteration = 0; iteration < ITERATION; iteration++) // episode loop
        {
            int run_policy_steps = 0;
            env.Render();
            while (true) // run the policy; each episode is far shorter than ITERATION
            {
                run_policy_steps += 1;
                obs = np.stack(new[] { obs }).astype(np.float32); // prepare to feed placeholder Policy.obs
                (NDArray _act, NDArray _v_pred) = Policy.act(obs: obs, stochastic: false); // deterministic actions at test time
                int act = np.asscalar<int>(_act);
                double v_pred = np.asscalar<double>(_v_pred);

                observations.Add(obs);
                actions.Add(act);
                v_preds.Add(v_pred);
                rewards.Add(reward);

                var (next_obs, _reward, done, _) = env.Step(act);
                reward = _reward;
                if (done)
                {
                    // next state of the terminal state has 0 state value:
                    // the C# equivalent of Python's v_preds[1:] + [0]
                    v_preds_next = v_preds.Skip(1).ToList();
                    v_preds_next.Add(0);
                    obs = env.Reset();
                    reward = -1;
                    break;
                }
                else
                {
                    obs = next_obs;
                }
            }

            // success bookkeeping, mirroring the training loop below
            if (rewards.Sum() >= 195)
            {
                success_num += 1;
                if (success_num >= 100)
                {
                    Console.WriteLine("Iteration: " + iteration);
                    break;
                }
            }
            else
            {
                success_num = 0;
            }

            // reset the rollout buffers for the next episode
            observations.Clear();
            actions.Clear();
            v_preds.Clear();
            rewards.Clear();
        }
    }
}
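// Policy_net.act is not shown in this section; the sketch below only illustrates
// what its `stochastic` flag typically switches between: sampling from the softmax
// policy during training versus acting greedily at test time. The helper name and
// the np.random.choice usage are assumptions, not the repository's actual code.
static int ChooseAction(NDArray actionProbs, bool stochastic)
{
    double[] probs = actionProbs.astype(np.float64).ToArray<double>();
    if (stochastic)
        // training: sample an action index from the probability distribution
        return (int)np.random.choice(probs.Length, probabilities: probs);
    // testing: act greedily by taking the most probable action
    int best = 0;
    for (int i = 1; i < probs.Length; i++)
        if (probs[i] > probs[best]) best = i;
    return best;
}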
// usings assumed: System, System.Collections.Generic, System.Linq, NumSharp,
// Tensorflow, Tensorflow.Keras.Engine, Tensorflow.Keras.Layers,
// static Tensorflow.Binding, static Tensorflow.KerasApi
static void Main(string[] args)
{
    // Configuration parameters for the whole setup
    var seed = 42;
    var gamma = 0.99; // Discount factor for past rewards
    var max_steps_per_episode = 10000;

    // var env = gym.make("CartPole-v0");
    CartPoleEnv env = new CartPoleEnv(WinFormEnvViewer.Factory); // Create the environment
    env.Seed(seed);

    // var eps = np.finfo(np.float32).eps.item(); // Smallest number such that 1.0 + eps != 1.0
    var eps = 1e-5; // stand-in for float machine epsilon, used to avoid division by zero

    /*
     * Implement the Actor-Critic network
     *
     * This network learns two functions:
     *
     * 1. Actor: takes the state of our environment as input and returns a
     *    probability value for each action in its action space.
     * 2. Critic: takes the state of our environment as input and returns
     *    an estimate of total rewards in the future.
     *
     * In our implementation, they share the initial layer.
     */
    var num_inputs = 4;
    var num_actions = 2;
    var num_hidden = 128;

    LayersApi layers = new LayersApi();
    var inputs = layers.Input(shape: num_inputs);
    var common = layers.Dense(num_hidden, activation: "relu").Apply(inputs);
    var action = layers.Dense(num_actions, activation: "softmax").Apply(common);
    var critic = layers.Dense(1).Apply(common);
    Model model = keras.Model(inputs: inputs, outputs: (action, critic));

    /*
     * Train
     */
    var optimizer = keras.optimizers.Adam(learning_rate: 0.01f);
    var huber_loss = keras.losses.Huber();
    var action_probs_history = new List<Tensor>();
    var critic_value_history = new List<Tensor>();
    var rewards_history = new List<double>();
    double running_reward = 0;
    var episode_count = 0;

    while (true) // Run until solved
    {
        Program.state = env.Reset(); // Program.state is a static field holding the current observation
        double episode_reward = 0;
        using (var tape = tf.GradientTape())
        {
            for (int timestep = 1; timestep < max_steps_per_episode; timestep++)
            {
                // env.Render(); // Adding this line would show the attempts
                // of the agent in a pop-up window.
                Program.state = tf.convert_to_tensor(Program.state);
                Program.state = tf.expand_dims(Program.state, 0);

                // Predict action probabilities and estimated future rewards
                // from the environment state
                // var (action_probs, critic_value) = model.Apply(Program.state);
                var pred_result = model.Apply(tf.cast(Program.state, tf.float32));
                var action_probs = pred_result[0][0];    // softmax over the actions
                var critic_value = pred_result[1][0][0]; // scalar state-value estimate
                critic_value_history.Add(critic_value);

                // Sample an action from the action probability distribution
                NDArray probabilities = np.squeeze(action_probs.numpy());
                // Console.WriteLine(probabilities); // debug: inspect the current policy
                int chosen_action = (int)np.random.choice(num_actions, probabilities: probabilities.astype(np.float64).ToArray<double>());
                action_probs_history.Add(tf.math.log(action_probs[chosen_action]));

                // Apply the sampled action in our environment
                var (state, reward, done, _) = env.Step(chosen_action);
                Program.state = state; // carry the new observation into the next timestep
                rewards_history.Add(reward);
                episode_reward += reward;

                if (done)
                {
                    break;
                }
            }

            // Update running reward to check the condition for solving
            running_reward = 0.05 * episode_reward + (1 - 0.05) * running_reward;

            // Calculate expected value from rewards
            // - At each timestep, what was the total reward received after that timestep
            // - Rewards in the past are discounted by multiplying them with gamma
            // - These are the labels for our critic
            var returns = new List<double>();
            double discounted_sum = 0;
            foreach (double r in Enumerable.Reverse(rewards_history)) // reversed view; rewards_history itself is left intact
            {
                discounted_sum = r + gamma * discounted_sum;
                returns.Insert(0, discounted_sum);
            }

            // Normalize
            var returns_nd = np.array(returns.ToArray());
            returns_nd = (returns_nd - np.mean(returns_nd)) / (np.std(returns_nd) + eps);
            var normalized_returns = returns_nd.ToArray<double>().ToList();

            // Calculate loss values to update our network
            // (a three-sequence zip helper is sketched after this listing)
            var history = zip(action_probs_history, critic_value_history, normalized_returns);
            var actor_losses = new List<Tensor>();
            var critic_losses = new List<Tensor>();
            foreach (var (log_prob, value, ret) in history)
            {
                // At this point in history, the critic estimated that we would get a
                // total reward = `value` in the future. We took an action with log
                // probability `log_prob` and ended up receiving a total reward = `ret`.
                // The actor must be updated so that it predicts an action that leads to
                // high rewards (compared to the critic's estimate) with high probability.
                var diff = (float)ret - value;
                actor_losses.Add(-log_prob * diff); // actor loss

                // The critic must be updated so that it predicts a better estimate of
                // the future rewards.
                critic_losses.Add(
                    huber_loss.Call(tf.expand_dims(value, 0),
                                    tf.expand_dims(tf.constant((float)ret), 0))
                    );
            }

            // Backpropagation: keep the losses as tensors so the tape can differentiate them
            var loss_value = tf.add_n(actor_losses.ToArray()) + tf.add_n(critic_losses.ToArray());
            var grads = tape.gradient(loss_value, model.trainable_variables);
            optimizer.apply_gradients(zip(grads, model.trainable_variables));

            // Clear the loss and reward history
            action_probs_history.Clear();
            critic_value_history.Clear();
            rewards_history.Clear();
        }

        // Log details
        episode_count += 1;
        if (episode_count % 10 == 0)
        {
            Console.WriteLine(String.Format("running reward: {0} at episode {1}", running_reward, episode_count));
        }

        if (running_reward > 195) // Condition to consider the task solved
        {
            Console.WriteLine(String.Format("Solved at episode {0}!", episode_count));
            break;
        }
    }

    /*
     * Visualizations
     *
     * In early stages of training:
     * ![Imgur](https://i.imgur.com/5gCs5kH.gif)
     *
     * In later stages of training:
     * ![Imgur](https://i.imgur.com/5ziiZUD.gif)
     */
}
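// The training loop above zips three sequences together. Tensorflow.Binding
// provides the two-sequence zip used in apply_gradients, but if your version
// lacks a three-sequence overload, a minimal sketch (an assumption, not part
// of the original listing) is:
static IEnumerable<(T1, T2, T3)> zip<T1, T2, T3>(IEnumerable<T1> first, IEnumerable<T2> second, IEnumerable<T3> third)
{
    using var e1 = first.GetEnumerator();
    using var e2 = second.GetEnumerator();
    using var e3 = third.GetEnumerator();
    // stop at the shortest sequence, matching Python's zip semantics
    while (e1.MoveNext() && e2.MoveNext() && e3.MoveNext())
        yield return (e1.Current, e2.Current, e3.Current);
}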
// usings assumed: System, System.Collections.Generic, System.Linq, NumSharp,
// Tensorflow, Gym.Environments.Envs.Classic, Gym.Rendering.WinForm, static Tensorflow.Binding
static void Main(string[] args)
{
    List<NDArray> observations = new List<NDArray>();
    List<int> actions = new List<int>();
    List<double> v_preds = new List<double>();
    List<double> rewards = new List<double>();
    List<double> v_preds_next = new List<double>();
    List<NDArray> gaes = new List<NDArray>();

    const int EPISODES = 100000; // 1e5
    const double GAMMA = 0.95;

    CartPoleEnv env = new CartPoleEnv(WinFormEnvViewer.Factory); // or AvaloniaEnvViewer.Factory; instantiates the CartPole environment
    env.Seed(0);
    // Space ob_space = env.ObservationSpace; // describes the shape of valid observations
    Policy_net Policy = new Policy_net("policy", env);         // create the policy network
    Policy_net Old_Policy = new Policy_net("old_policy", env); // create the old policy network
    PPOTrain PPO = new PPOTrain(Policy, Old_Policy, gamma: GAMMA);
    Saver saver = tf.train.Saver();

    using (var sess = tf.Session()) // session block
    {
        FileWriter writer = tf.summary.FileWriter("./log/train", sess.graph); // log directory
        sess.run(tf.global_variables_initializer()); // initialize the networks
        NDArray obs = env.Reset(); // reset the environment and take the first observation
        double reward = 0;         // holds the current reward
        int success_num = 0;       // success counter

        for (int episode = 0; episode < EPISODES; episode++) // episode loop
        {
            // clear the rollout buffers left over from the previous episode
            observations.Clear();
            actions.Clear();
            v_preds.Clear();
            rewards.Clear();

            int run_policy_steps = 0; // step counter within the episode
            env.Render(); // render the environment
            while (true)  // run the policy; each episode is far shorter than EPISODES
            {
                run_policy_steps += 1; // increment the per-episode step counter
                obs = np.stack(new[] { obs }).astype(np.float32); // prepare to feed placeholder Policy.obs
                (NDArray _act, NDArray _v_pred) = Policy.act(obs: obs, stochastic: true); // run the network; get an action and the predicted value
                int act = np.asscalar<int>(_act);             // unwrap the NumPy arrays
                double v_pred = np.asscalar<double>(_v_pred); // into plain scalars

                observations.Add(obs); // observation buffer
                actions.Add(act);      // action buffer
                v_preds.Add(v_pred);   // value-prediction buffer
                rewards.Add(reward);   // reward buffer

                // send the action to the environment; receive the next observation,
                // the reward, and whether the episode has ended
                var (next_obs, _reward, done, _) = env.Step(act);
                reward = _reward;
                if (done) // if the episode ended...
                {
                    // the state following the terminal state has 0 state value:
                    // the C# equivalent of Python's v_preds[1:] + [0]
                    v_preds_next = v_preds.Skip(1).ToList();
                    v_preds_next.Add(0);
                    obs = env.Reset(); // reset the environment
                    reward = -1;       // mark the terminal step with reward -1
                    break;             // leave the while loop
                }
                else // otherwise...
                {
                    obs = next_obs; // keep the next observation for the next step
                }
            }

            // log episode statistics for viewing in TensorBoard
            //writer.add_summary(tf.Summary(value:[tf.Summary.Value(tag:"episode_length", simple_value:run_policy_steps)]), episode);
            //writer.add_summary(tf.Summary(value:[tf.Summary.Value(tag:"episode_reward", simple_value:rewards.Sum())]), episode);

            // stopping condition for training
            if (rewards.Sum() >= 195)   // if the episode's total reward is at least 195
            {
                success_num += 1;       // count a success
                if (success_num >= 100) // after 100 consecutive successes
                {
                    saver.save(sess, "./model/model.ckpt"); // save the session
                    Console.WriteLine("Clear!! Model saved.");
                    break;              // stop training
                }
            }
            else
            {
                success_num = 0;        // otherwise reset the success counter
            }

            Console.WriteLine($"EP: {episode} Rw: {rewards.Sum()}"); // print the episode number and total reward

            // compute generalized advantage estimates (a GAE sketch follows this listing)
            gaes = PPO.get_gaes(rewards: rewards, v_preds: v_preds, v_preds_next: v_preds_next);

            // convert the lists to NDArrays to feed the tf.placeholders;
            // before, each row of observations was an independent array; after the
            // reshape, observations is a single array with one row per timestep
            var _observations = np.stack(observations.ToArray()).reshape(-1, 4);
            var _actions = np.array(actions.ToArray(), dtype: np.int32);
            var _rewards = np.array(rewards.ToArray(), dtype: np.float32);
            var _v_preds_next = np.array(v_preds_next.ToArray(), dtype: np.float32);
            var __gaes = np.array(gaes.ToArray(), dtype: np.float32);
            var _gaes = (__gaes - __gaes.mean()) / __gaes.std(); // normalize the advantages (zero mean, unit variance)

            PPO.assign_policy_parameters(); // copy the current policy into old_policy

            // five training arrays: observations, actions, rewards, next-state value predictions, advantages
            NDArray[] inp = new[] { _observations, _actions, _rewards, _v_preds_next, _gaes };

            // train: 4 epochs of PPO updates on minibatches of 64 sampled timesteps
            for (int epoch = 0; epoch < 4; epoch++)
            {
                NDArray sample_indices = np.random.randint(low: 0, high: observations.Count, size: new Shape(64)); // indices are drawn in [low, high)
                var sampled_inp = new List<NDArray>();
                foreach (NDArray arr in inp)
                {
                    sampled_inp.Add(arr[sample_indices]); // take the sampled rows (axis 0) of each training array
                }
                PPO.train(obs: sampled_inp[0],
                          actions: sampled_inp[1],
                          rewards: sampled_inp[2],
                          v_preds_next: sampled_inp[3],
                          gaes: sampled_inp[4]);
            }

            var summary = PPO.get_summary(obs: inp[0], actions: inp[1], rewards: inp[2],
                                          v_preds_next: inp[3], gaes: inp[4])[0];
            //writer.add_summary(summary, episode);
        } // end of the episode loop
        //writer.close();
    }
}
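// PPOTrain.get_gaes is not shown in this section. Under the standard
// GAE(gamma, lambda) formulation it computes the TD residuals
// delta_t = r_t + gamma * V(s_{t+1}) - V(s_t) and accumulates them backwards;
// the sketch below assumes that formulation, and the lambda value and the
// scalar-list signature are assumptions rather than the repository's code.
static List<double> GetGaes(List<double> rewards, List<double> vPreds, List<double> vPredsNext,
                            double gamma = 0.95, double lambda = 0.95)
{
    // TD residual for each step: delta_t = r_t + gamma * V(s_{t+1}) - V(s_t)
    var gaes = new List<double>();
    for (int t = 0; t < rewards.Count; t++)
        gaes.Add(rewards[t] + gamma * vPredsNext[t] - vPreds[t]);

    // accumulate from the end: gae_t = delta_t + gamma * lambda * gae_{t+1}
    for (int t = gaes.Count - 2; t >= 0; t--)
        gaes[t] += gamma * lambda * gaes[t + 1];
    return gaes;
}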