Code Example #1
        static void Main(string[] args)
        {
            // Configuration parameters for the whole setup
            var seed  = 42;
            var gamma = 0.99;  // Discount factor for past rewards
            var max_steps_per_episode = 10000;
            // var env = gym.make("CartPole-v0");  // Create the environment
            CartPoleEnv env = new CartPoleEnv(WinFormEnvViewer.Factory);  // Create the environment

            env.Seed(seed);
            // var eps = np.finfo(np.float32).eps.item();  // Smallest number such that 1.0 + eps != 1.0
            var eps = 1e-5;  // Small constant used in place of np.finfo(np.float32).eps to avoid division by zero

            /*
             * Implement the Actor-Critic network
             *
             * This network learns two functions:
             *
             * 1. Actor: This takes as input the state of our environment and returns a
             *    probability value for each action in its action space.
             * 2. Critic: This takes as input the state of our environment and returns
             *    an estimate of total rewards in the future.
             *
             * In our implementation, they share the initial layer.
             */

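            // CartPole observations have 4 components (cart position, cart velocity,
            // pole angle, pole angular velocity) and there are 2 discrete actions
            // (push the cart left or right).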
            var     num_inputs  = 4;
            NDArray num_actions = 2;
            var     num_hidden  = 128;

            LayersApi layers = new LayersApi();
            var       inputs = layers.Input(shape: (num_inputs));
            var       common = layers.Dense(num_hidden, activation: "relu").Apply(inputs);
            var       action = layers.Dense(num_actions, activation: "softmax").Apply(common);
            var       critic = layers.Dense(1).Apply(common);

            Model model = keras.Model(inputs: inputs, outputs: (action, critic));
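            // A single forward pass returns both heads: action probabilities of shape
            // (batch, num_actions) from the actor and a value estimate of shape (batch, 1)
            // from the critic.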

            /*
             * Train
             */

            var    optimizer            = keras.optimizers.Adam(learning_rate: (float)0.01);
            var    huber_loss           = keras.losses.Huber();
            var    action_probs_history = new List<Tensor>();  // log-probabilities of the actions taken
            var    critic_value_history = new List<Tensor>();  // critic value estimates for each step
            var    rewards_history      = new List<double>();
            double running_reward       = 0;
            var    episode_count        = 0;

            while (true)  // Run until solved
            {
                Program.state = env.Reset();
                double episode_reward = 0;
                using (var tape = tf.GradientTape())
                {
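                    // Operations executed while the tape is active are recorded so that
                    // tape.gradient() can differentiate the loss at the end of the episode.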
                    for (int timestep = 1; timestep < max_steps_per_episode; timestep++)
                    {
                        //env.Render(); // Adding this line would show the attempts
                        // of the agent in a pop up window.

                        Program.state = tf.convert_to_tensor(Program.state);
                        Program.state = tf.expand_dims(Program.state, 0);
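                        // Add a batch dimension: the model expects input of shape (1, num_inputs)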

                        // Predict action probabilities and estimated future rewards
                        // from environment state
                        // var (action_probs, critic_value) = model.Apply(Program.state);
                        var pred_result = model.Apply(tf.cast(Program.state, tf.float32));

                        var action_probs = pred_result[0][0];     // action probabilities, shape (num_actions,) after dropping the batch dim
                        var critic_value = pred_result[1][0][0];  // scalar estimate of future rewards

                        critic_value_history.Add(critic_value);

                        // Flatten to a plain probability vector
                        NDArray probabilities = np.squeeze(action_probs.numpy());
                        Console.WriteLine(probabilities);

                        // Sample action from action probability distribution
                        double[] action_distribution = probabilities.ToArray<float>().Select(p => (double)p).ToArray();
                        NDArray  chosen_action       = np.random.choice(num_actions, probabilities: action_distribution);
                        int      action              = (int)chosen_action;
                        action_probs_history.Add(tf.math.log(action_probs[action]));

                        // Apply the sampled action in our environment
                        var (observation, reward, done, _) = env.Step(action);
                        Program.state   = observation;  // the next timestep starts from the new observation
                        rewards_history.Add(reward);
                        episode_reward += reward;

                        if (done)
                        {
                            break;
                        }
                    }
                    // Update running reward to check condition for solving
                    running_reward = 0.05 * episode_reward + (1 - 0.05) * running_reward;
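                    // (an exponential moving average: each new episode contributes 5% of the running value)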

                    // Calculate expected value from rewards
                    // - At each timestep what was the total reward received after that timestep
                    // - Rewards in the past are discounted by multiplying them with gamma
                    // - These are the labels for our critic
                    var    returns        = new List<double>();
                    double discounted_sum = 0;

                    // Iterate the rewards from the end of the episode backwards and
                    // prepend each discounted sum so `returns` stays in timestep order.
                    for (int i = rewards_history.Count - 1; i >= 0; i--)
                    {
                        discounted_sum = rewards_history[i] + gamma * discounted_sum;
                        returns.Insert(0, discounted_sum);
                    }

                    // Normalize to zero mean and unit variance; eps guards against division by zero
                    NDArray returns_array = np.array(returns.ToArray());
                    returns_array = (returns_array - np.mean(returns_array)) / (np.std(returns_array) + eps);
                    returns = returns_array.ToArray<double>().ToList();

                    // Calculating loss values to update our network
                    var actor_losses  = new List<Tensor>();
                    var critic_losses = new List<Tensor>();
                    for (int i = 0; i < action_probs_history.Count; i++)
                    {
                        var log_prob = action_probs_history[i];
                        var value    = critic_value_history[i];
                        var ret      = tf.constant((float)returns[i]);  // cast to float32 to match the model's dtype
                        // At this point in history, the critic estimated that we would get a
                        // total reward = `value` in the future. We took an action with log probability
                        // of `log_prob` and ended up receiving a total reward = `ret`.
                        // The actor must be updated so that it predicts an action that leads to
                        // high rewards (compared to the critic's estimate) with high probability.
                        var diff = ret - value;
                        actor_losses.Add(-log_prob * diff);  // actor loss

                        // The critic must be updated so that it predicts a better estimate of
                        // the future rewards.
                        critic_losses.Add(
                            huber_loss.Call(tf.expand_dims(value, 0), tf.expand_dims(ret, 0))
                            );
                    }

                    // Backpropagation: the combined loss must stay a Tensor so the tape can differentiate it
                    var loss_value = actor_losses.Aggregate((a, b) => a + b)
                                     + critic_losses.Aggregate((a, b) => a + b);
                    var     grads      = tape.gradient(loss_value, model.trainable_variables);
                    optimizer.apply_gradients(zip(grads, model.trainable_variables));
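                    // Note: this single optimizer step updates both heads, since actor and
                    // critic share the `common` layer and feed the same combined loss.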

                    // Clear the loss and reward history
                    action_probs_history.Clear();
                    critic_value_history.Clear();
                    rewards_history.Clear();
                }
                // Log details
                episode_count += 1;
                if (episode_count % 10 == 0)
                {
                    var template = String.Format("running reward: {0} at episode {1}", running_reward, episode_count);
                    Console.WriteLine(template);
                }

                if (running_reward > 195)  // CartPole-v0 counts as solved at an average reward of 195
                {
                    Console.WriteLine(String.Format("Solved at episode {0}!", episode_count));
                    break;
                }
            }

            /*
             * Visualizations
             *
             * In early stages of training:
             * ![Imgur](https://i.imgur.com/5gCs5kH.gif)
             *
             * In later stages of training:
             * ![Imgur](https://i.imgur.com/5ziiZUD.gif)
             */
        }
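
The listing refers to a few declarations that sit outside Main: the using directives and the shared Program.state field. Below is a minimal sketch of that scaffolding, assuming the TensorFlow.NET and Gym.NET packages whose APIs appear above; the exact namespaces and the type of state may differ between package versions.

    using System;
    using System.Collections.Generic;
    using System.Linq;
    using Tensorflow;                      // Tensor, GradientTape
    using Tensorflow.Keras.Engine;         // Model
    using Tensorflow.Keras.Layers;         // LayersApi
    using Tensorflow.NumPy;                // NDArray, np (NumSharp in older Gym.NET setups)
    using Gym.Environments.Envs.Classic;   // CartPoleEnv (assumed namespace)
    using Gym.Rendering.WinForm;           // WinFormEnvViewer (assumed namespace)
    using static Tensorflow.Binding;       // tf, zip(...)
    using static Tensorflow.KerasApi;      // keras

    class Program
    {
        // Shared environment state: holds the NDArray returned by env.Reset()/env.Step()
        // and, within a timestep, the expanded Tensor fed to the model, hence `dynamic`.
        static dynamic state;

        // static void Main(string[] args) { ... }  // the listing above
    }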