Example #1
        static void UpdateLearningRate(IOptimizer optimizer, Variable step, LearningRateSchedule learningRateSchedule)
        {
            Tensor learningRate          = learningRateSchedule.Get(step: step);
            var    optimizerLearningRate = optimizer.DynamicGet<Variable>("lr");

            optimizerLearningRate.assign(learningRate);
        }
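
For context, a minimal sketch of how a helper like UpdateLearningRate might be called once per training iteration; the surrounding loop, step counter, and schedule instance are assumptions made for illustration, not part of the example above.

        // Illustrative training loop (assumed, not from the original source):
        // keeps the optimizer's "lr" variable in sync with the schedule
        // before each update step.
        static void RunTraining(IOptimizer optimizer, Variable step, LearningRateSchedule schedule, int totalSteps)
        {
            for (int i = 0; i < totalSteps; i++)
            {
                UpdateLearningRate(optimizer, step, schedule);
                // ... compute gradients and apply the optimizer update here ...
            }
        }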
 public Tensor create_learning_rate(LearningRateSchedule lr_schedule, float lr, RefVariable global_step,
                                    float max_step)
 {
      // A CONSTANT schedule would simply wrap lr in a tf.Variable (left
      // unimplemented here); only the LINEAR schedule is supported.
      if (lr_schedule == LearningRateSchedule.LINEAR)
     {
         return(tf.train.polynomial_decay(
                    lr, global_step, max_step, 1e-10f, power: 1.0f));
     }
     throw new NotImplementedException("create_learning_rate");
 }
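
For reference, polynomial decay with power 1.0 is plain linear interpolation from lr down to the end value (1e-10 above). A standalone sketch of that arithmetic, assuming the step is clamped to max_step as tf.train.polynomial_decay does; this is only an illustration, since the actual schedule runs as an op inside the graph:

 // Standalone illustration of linear (power = 1.0) polynomial decay.
 // The real schedule is built by tf.train.polynomial_decay inside the graph.
 static float LinearDecay(float lr, float step, float maxStep, float endLr = 1e-10f)
 {
     float progress = System.Math.Min(step, maxStep) / maxStep; // clamp progress to [0, 1]
     return (lr - endLr) * (1f - progress) + endLr;             // straight line from lr down to endLr
 }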
        /// <summary>
        /// Takes a Unity environment and model-specific hyper-parameters and returns the
        /// appropriate PPO agent model for the environment.
        /// </summary>
        /// <param name="brain">BrainInfo used to generate specific network graph.</param>
        /// <param name="lr">Learning rate.</param>
        /// <param name="lr_schedule">Learning rate decay schedule.</param>
        /// <param name="h_size">Size of hidden layers</param>
        /// <param name="epsilon">Value for policy-divergence threshold.</param>
        /// <param name="beta">Strength of entropy regularization.</param>
        /// <param name="max_step">Total number of training steps.</param>
        /// <param name="normalize">Whether to normalize vector observation input.</param>
        /// <param name="use_recurrent">Whether to use an LSTM layer in the network.</param>
        /// <param name="num_layers">Number of hidden layers between encoded input and policy & value layers</param>
        /// <param name="m_size">Size of brain memory.</param>
        /// <param name="seed">Seed to use for initialization of model.</param>
        /// <param name="stream_names">List of names of value streams. Usually, a list of the Reward Signals being used.</param>
        /// <param name="vis_encode_type"></param>
        public PPOModel(BrainParameters brain,
                        float lr = 0.0001f,
                        LearningRateSchedule lr_schedule = LearningRateSchedule.LINEAR,
                        int h_size                  = 128,
                        float epsilon               = 0.2f,
                        float beta                  = 0.001f,
                        float max_step              = 5e6f,
                        bool normalize              = false,
                        bool use_recurrent          = false,
                        int num_layers              = 2,
                        int? m_size                 = null,
                        int seed                    = 0,
                        List<string> stream_names   = null,
                        EncoderType vis_encode_type = EncoderType.SIMPLE) : base(m_size: m_size,
                                                                                 normalize: normalize,
                                                                                 use_recurrent: use_recurrent,
                                                                                 brain: brain,
                                                                                 seed: seed,
                                                                                 stream_names: stream_names)
        {
            // optimizer: Optional[tf.train.AdamOptimizer] = null;
            // update_batch: Optional[tf.Operation] = null;

            if (num_layers < 1)
            {
                num_layers = 1;
            }
            if (brain.vector_action_space_type == "continuous")
            {
                throw new NotImplementedException("brain.vector_action_space_type");
                // create_cc_actor_critic(h_size, num_layers, vis_encode_type);
                // entropy = tf.ones_like(tf.reshape(value, [-1])) * entropy;
            }
            else
            {
                create_dc_actor_critic(h_size, num_layers, vis_encode_type);
            }
            learning_rate = create_learning_rate(lr_schedule, lr, global_step, max_step);
            create_losses(
                log_probs,
                old_log_probs,
                value_heads,
                entropy,
                beta,
                epsilon,
                lr,
                max_step);
        }
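
Finally, a hedged construction sketch; the BrainParameters fields shown and the reward-stream name are assumptions about the calling code, and any TensorFlow graph or session setup required by the base model is omitted:

        // Illustrative usage only: the BrainParameters contents and the stream
        // name below are assumptions, not taken from the snippet above.
        var brain = new BrainParameters { vector_action_space_type = "discrete" };

        var model = new PPOModel(brain,
                                 lr: 3e-4f,
                                 lr_schedule: LearningRateSchedule.LINEAR,
                                 h_size: 256,
                                 max_step: 1e6f,
                                 stream_names: new List<string> { "extrinsic" });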