Example #1
        public PPOTrain(Policy_net policy, Policy_net old_policy, double gamma = 0.95, double clip_value = 0.2, double c_1 = 1, double c_2 = 0.01)
        {
            /*
             * :param policy:     current policy network
             * :param old_policy: policy network that holds the pre-update (old) parameters
             * :param gamma:      discount factor
             * :param clip_value: clipping range (epsilon) of the surrogate objective
             * :param c_1:        coefficient of the value function loss
             * :param c_2:        coefficient of the entropy bonus
             */
            Policy     = policy;
            Old_Policy = old_policy;
            this.gamma = gamma;
            List <IVariableV1> pi_trainable     = Policy.get_trainable_variables() as List <IVariableV1>;
            List <IVariableV1> old_pi_trainable = Old_Policy.get_trainable_variables();

            // assign ops that copy the current policy's parameter values to the old policy's parameters
            using (tf.variable_scope("assign_op"))
            {
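                // Copy each trainable variable of the current policy into the old policy network.
                // Running these assign ops before an update freezes pi_old, so the probability
                // ratio pi(a|s) / pi_old(a|s) below is computed against the pre-update parameters.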
                foreach ((RefVariable v_old, Tensor v) in zip(old_pi_trainable, pi_trainable))
                {
                    this.assign_ops.Add(tf.assign(v_old, v));
                }
            }
            // inputs for train_op
            using (tf.variable_scope("train_inp"))
            {
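                // Placeholders fed at training time:
                //   actions      - actions actually taken during the rollout
                //   rewards      - rewards observed at those steps
                //   v_preds_next - value predictions V(s_{t+1}) used for the TD target
                //   gaes         - generalized advantage estimates (GAE)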
                this.actions      = tf.placeholder(dtype: tf.int32, shape: (Unknown), name: "actions");
                this.rewards      = tf.placeholder(dtype: tf.float32, shape: (Unknown), name: "rewards");
                this.v_preds_next = tf.placeholder(dtype: tf.float32, shape: (Unknown), name: "v_preds_next");
                this.gaes         = tf.placeholder(dtype: tf.float32, shape: (Unknown), name: "gaes");
            }
            Tensor act_probs     = Policy.act_probs;
            Tensor act_probs_old = Old_Policy.act_probs;

            // probabilities of the actions the agent took under the current policy
            act_probs = act_probs * tf.one_hot(indices: this.actions, depth: (int)act_probs.shape[1]);
            act_probs = tf.reduce_sum(act_probs, axis: 1);

            // probabilities of the actions the agent took under the old policy
            act_probs_old = act_probs_old * tf.one_hot(indices: this.actions, depth: (int)act_probs_old.shape[1]);
            act_probs_old = tf.reduce_sum(act_probs_old, axis: 1);

            using (tf.variable_scope("loss/clip"))
            {
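                // PPO clipped surrogate objective:
                //   L_CLIP = E[ min(r_t * A_t, clip(r_t, 1 - eps, 1 + eps) * A_t) ]
                // with r_t = pi(a_t|s_t) / pi_old(a_t|s_t) and A_t the GAE advantages (this.gaes).
                // The ratio is computed as exp(log pi - log pi_old) instead of a direct division.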
                // ratios = tf.divide(act_probs, act_probs_old)
                Tensor ratios         = tf.exp(tf.log(act_probs) - tf.log(act_probs_old));
                Tensor clipped_ratios = tf.clip_by_value(ratios, clip_value_min: new Tensor(1 - clip_value), clip_value_max: new Tensor(1 + clip_value));
                loss_clip = tf.minimum(tf.multiply(this.gaes, ratios), tf.multiply(this.gaes, clipped_ratios));
                loss_clip = tf.reduce_mean(loss_clip);
                tf.summary.scalar("loss_clip", loss_clip);
            }
            // build the computation graph for the value function loss
            using (tf.variable_scope("loss/vf"))
            {
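                // Squared error between the TD target r_t + gamma * V(s_{t+1})
                // and the value predicted by the current policy, V(s_t).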
                Tensor v_preds = Policy.v_preds;
                loss_vf = tf.squared_difference(this.rewards + (gamma * this.v_preds_next), v_preds);
                loss_vf = tf.reduce_mean(loss_vf);
                tf.summary.scalar("loss_vf", loss_vf);
            }
            // build the computation graph for the entropy bonus
            using (tf.variable_scope("loss/entropy"))
            {
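                // Entropy H = -sum_a pi(a|s) * log pi(a|s); clipping the probabilities avoids log(0).
                // Adding this bonus to the objective encourages exploration.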
                entropy = -tf.reduce_sum(Policy.act_probs * tf.log(tf.clip_by_value(Policy.act_probs, new Tensor(1e-10), new Tensor(1.0))), axis: 1);
                entropy = tf.reduce_mean(entropy, axis: new[] { 0 });   // mean entropy of pi(obs)
                tf.summary.scalar("entropy", entropy);
            }
            using (tf.variable_scope("loss"))
            {
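                // Combined PPO objective: L = L_CLIP - c_1 * L_VF + c_2 * H, which is maximized.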
                loss = loss_clip - c_1 * loss_vf + c_2 * entropy;
                loss = -loss;  // minimize -loss == maximize loss
                tf.summary.scalar("loss", loss);
            }
            this.merged = tf.summary.merge_all();
            Optimizer optimizer = tf.train.AdamOptimizer(learning_rate: (float)1e-4, epsilon: (float)1e-5);

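            // Only the current policy's variables are optimized; the old policy is refreshed via assign_ops.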
            this.train_op = optimizer.minimize(loss, var_list: pi_trainable);
        }
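
A minimal usage sketch for context (not part of the example above): it assumes two already constructed Policy_net instances, here called policyNet and oldPolicyNet; those names and their construction are hypothetical and not shown in this example.

        // Hypothetical wiring: policyNet and oldPolicyNet are assumed to exist already.
        var ppo = new PPOTrain(policyNet, oldPolicyNet,
                               gamma: 0.95, clip_value: 0.2, c_1: 1, c_2: 0.01);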