static void Main(string[] args)
{
    // Configuration parameters for the whole setup
    var seed = 42;
    var gamma = 0.99;                   // Discount factor for past rewards
    var max_steps_per_episode = 10000;

    // var env = gym.make("CartPole-v0");
    CartPoleEnv env = new CartPoleEnv(WinFormEnvViewer.Factory); // Create the environment
    env.Seed(seed);

    // var eps = np.finfo(np.float32).eps.item(); // Smallest number such that 1.0 + eps != 1.0
    var eps = 1e-5; // Small constant standing in for the float32 machine epsilon;
                    // it keeps the return normalization below from dividing by zero.

    /*
     * Implement the Actor-Critic network
     *
     * This network learns two functions:
     *
     * 1. Actor: takes the state of our environment as input and returns a
     *    probability value for each action in the action space.
     * 2. Critic: takes the state of our environment as input and returns
     *    an estimate of the total future reward.
     *
     * In our implementation, they share the initial layer.
     */
    var num_inputs = 4;
    var num_actions = 2;
    var num_hidden = 128;

    LayersApi layers = new LayersApi();
    var inputs = layers.Input(shape: num_inputs);
    var common = layers.Dense(num_hidden, activation: "relu").Apply(inputs);
    var action = layers.Dense(num_actions, activation: "softmax").Apply(common);
    var critic = layers.Dense(1).Apply(common);
    Model model = keras.Model(inputs: inputs, outputs: (action, critic));

    /*
     * Train
     */
    var optimizer = keras.optimizers.Adam(learning_rate: 0.01f);
    var huber_loss = keras.losses.Huber();
    // Log-probabilities and critic values are kept as tensors so that the
    // gradient tape can differentiate through them during backpropagation.
    var action_probs_history = new List<Tensor>();
    var critic_value_history = new List<Tensor>();
    var rewards_history = new List<double>();
    double running_reward = 0;
    var episode_count = 0;

    while (true) // Run until solved
    {
        Program.state = env.Reset();
        double episode_reward = 0;
        using (var tape = tf.GradientTape())
        {
            for (int timestep = 1; timestep < max_steps_per_episode; timestep++)
            {
                //env.Render(); // Adding this line would show the attempts
                                // of the agent in a pop up window.
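                // Each timestep: build a batched tensor from the current observation,
                // run the shared network to get action probabilities and a critic
                // value, sample an action from those probabilities, step the
                // environment, and record the reward for the return calculation below.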
                // Convert the current observation into a batched tensor for the model.
                var state_tensor = tf.expand_dims(tf.convert_to_tensor(Program.state), 0);

                // Predict action probabilities and estimated future rewards
                // from environment state
                // var (action_probs, critic_value) = model.Apply(Program.state);
                var pred_result = model.Apply(tf.cast(state_tensor, tf.float32));
                var action_probs = pred_result[0][0];
                var critic_value = pred_result[1][0][0];
                critic_value_history.Add(critic_value);

                // Sample action from action probability distribution.
                // NOTE: the original call was cut off after `probabilities.`; the
                // conversion to a double[] below is an assumption about what was intended.
                var probabilities = np.squeeze(action_probs.numpy());
                //Console.WriteLine(probabilities); // Uncomment to inspect the distribution.
                NDArray chosen_action = np.random.choice(
                    num_actions, probabilities: probabilities.astype(np.float64).ToArray<double>());
                action_probs_history.Add(tf.math.log(action_probs[0, chosen_action]));

                // Apply the sampled action in our environment and carry the new
                // observation into the next timestep (the original kept it only in a
                // local variable, so the agent never saw the updated state).
                var (next_state, reward, done, _) = env.Step(chosen_action);
                Program.state = next_state;
                rewards_history.Add(reward);
                episode_reward += reward;

                if (done)
                {
                    break;
                }
            }

            // Update running reward to check condition for solving
            running_reward = 0.05 * episode_reward + (1 - 0.05) * running_reward;

            // Calculate expected value from rewards
            // - At each timestep what was the total reward received after that timestep
            // - Rewards in the past are discounted by multiplying them with gamma
            // - These are the labels for our critic
            var returns = new List<double>();
            double discounted_sum = 0;
            // Reversing in place is fine here because rewards_history is cleared below.
            rewards_history.Reverse();
            foreach (double r in rewards_history)
            {
                discounted_sum = r + gamma * discounted_sum;
                returns.Insert(0, discounted_sum);
            }

            // Normalize the returns to zero mean and unit variance (eps avoids division by zero).
            var mean = returns.Average();
            var std = Math.Sqrt(returns.Select(r => (r - mean) * (r - mean)).Average());
            returns = returns.Select(r => (r - mean) / (std + eps)).ToList();

            // Calculating loss values to update our network
            var actor_losses = new List<Tensor>();
            var critic_losses = new List<Tensor>();
            for (int i = 0; i < action_probs_history.Count; i++)
            {
                var log_prob = action_probs_history[i];
                var value = critic_value_history[i];
                var ret = tf.constant((float)returns[i]);

                // At this point in history, the critic estimated that we would get a
                // total reward = `value` in the future. We took an action with log probability
                // of `log_prob` and ended up receiving a total reward = `ret`.
                // The actor must be updated so that it predicts an action that leads to
                // high rewards (compared to critic's estimate) with high probability.
                var diff = ret - value;
                actor_losses.Add(-log_prob * diff); // actor loss

                // The critic must be updated so that it predicts a better estimate of
                // the future rewards.
                critic_losses.Add(
                    huber_loss.Call(tf.expand_dims(value, 0), tf.expand_dims(ret, 0))
                );
            }

            // Backpropagation. The per-step losses are summed as tensors (rather than
            // being converted to doubles) so the tape can compute gradients through them.
            var loss_value = actor_losses.Aggregate((a, b) => a + b)
                           + critic_losses.Aggregate((a, b) => a + b);
            var grads = tape.gradient(loss_value, model.TrainableVariables);
            optimizer.apply_gradients(zip(grads,
                model.TrainableVariables.Select(x => x as ResourceVariable)));

            // Clear the loss and reward history
            action_probs_history.Clear();
            critic_value_history.Clear();
            rewards_history.Clear();
        }

        // Log details
        episode_count += 1;
        if (episode_count % 10 == 0)
        {
            var template = String.Format("running reward: {0} at episode {1}",
                running_reward, episode_count);
            Console.WriteLine(template);
        }

        if (running_reward > 195) // Condition to consider the task solved
        {
            Console.WriteLine(String.Format("Solved at episode {0}!", episode_count));
            break;
        }
    }

    /*
     * Visualizations
     *
     * In early stages of training:
     * ![Imgur](https://i.imgur.com/5gCs5kH.gif)
     *
     * In later stages of training:
     * ![Imgur](https://i.imgur.com/5ziiZUD.gif)
     */
}
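
/*
 * The listing above references a shared `Program.state` field and several using
 * directives that are not part of this section. The sketch below is a minimal,
 * assumed scaffolding based on the APIs used above (Gym.NET, TensorFlow.NET/Keras,
 * NumSharp); the field type and namespace names are best guesses, not the original
 * author's declarations.
 *
 * Assumed using directives (these belong at the top of the file):
 *   using System;
 *   using System.Collections.Generic;
 *   using System.Linq;
 *   using Gym.Environments.Envs.Classic;  // CartPoleEnv
 *   using Gym.Rendering.WinForm;          // WinFormEnvViewer
 *   using NumSharp;                       // np, NDArray
 *   using Tensorflow;
 *   using Tensorflow.Keras.Engine;        // Model
 *   using Tensorflow.Keras.Layers;        // LayersApi
 *   using static Tensorflow.Binding;      // tf, zip
 *   using static Tensorflow.KerasApi;     // keras
 */

// Shared environment state: written by env.Reset()/env.Step() and read at the top
// of every timestep in the training loop above. (Assumed declaration.)
static NDArray state;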