public PPOTrain(Policy_net policy, Policy_net old_policy, double gamma = 0.95, double clip_value = 0.2, double c_1 = 1, double c_2 = 0.01)
{
    /*
     * :param policy: current policy network (Policy_net)
     * :param old_policy: copy of the policy used to compute the probability ratio
     * :param gamma: discount factor
     * :param clip_value: PPO clipping range (epsilon)
     * :param c_1: coefficient for the value-function loss
     * :param c_2: coefficient for the entropy bonus
     */
    Policy = policy;
    Old_Policy = old_policy;
    this.gamma = gamma;

    List<IVariableV1> pi_trainable = Policy.get_trainable_variables() as List<IVariableV1>;
    List<IVariableV1> old_pi_trainable = Old_Policy.get_trainable_variables();

    // assign operations that copy the current policy parameters into the old policy parameters
    using (tf.variable_scope("assign_op"))
    {
        foreach ((RefVariable v_old, Tensor v) in zip(old_pi_trainable, pi_trainable))
        {
            this.assign_ops.Add(tf.assign(v_old, v));
        }
    }

    // inputs for train_op
    using (tf.variable_scope("train_inp"))
    {
        this.actions = tf.placeholder(dtype: tf.int32, shape: (Unknown), name: "actions");
        this.rewards = tf.placeholder(dtype: tf.float32, shape: (Unknown), name: "rewards");
        this.v_preds_next = tf.placeholder(dtype: tf.float32, shape: (Unknown), name: "v_preds_next");
        this.gaes = tf.placeholder(dtype: tf.float32, shape: (Unknown), name: "gaes");
    }

    Tensor act_probs = Policy.act_probs;
    Tensor act_probs_old = Old_Policy.act_probs;

    // probabilities of the actions the agent took under the current policy
    // (the one-hot depth is the size of the action dimension)
    act_probs = act_probs * tf.one_hot(indices: this.actions, depth: act_probs.ToArray<double>().GetLength(1));
    act_probs = tf.reduce_sum(act_probs, axis: 1);

    // probabilities of the actions the agent took under the old policy
    act_probs_old = act_probs_old * tf.one_hot(indices: this.actions, depth: act_probs_old.ToArray<double>().GetLength(1));
    act_probs_old = tf.reduce_sum(act_probs_old, axis: 1);

    // clipped surrogate objective
    using (tf.variable_scope("loss/clip"))
    {
        // ratios = tf.divide(act_probs, act_probs_old), computed in log space for numerical stability
        Tensor ratios = tf.exp(tf.log(act_probs) - tf.log(act_probs_old));
        Tensor clipped_ratios = tf.clip_by_value(ratios, clip_value_min: new Tensor(1 - clip_value), clip_value_max: new Tensor(1 + clip_value));
        loss_clip = tf.minimum(tf.multiply(this.gaes, ratios), tf.multiply(this.gaes, clipped_ratios));
        loss_clip = tf.reduce_mean(loss_clip);
        tf.summary.scalar("loss_clip", loss_clip);
    }

    // construct computation graph for the value-function loss
    using (tf.variable_scope("loss/vf"))
    {
        Tensor v_preds = Policy.v_preds;
        loss_vf = tf.squared_difference(this.rewards + (gamma * this.v_preds_next), v_preds);
        loss_vf = tf.reduce_mean(loss_vf);
        tf.summary.scalar("loss_vf", loss_vf);
    }

    // construct computation graph for the entropy bonus
    using (tf.variable_scope("loss/entropy"))
    {
        entropy = -tf.reduce_sum(Policy.act_probs * tf.log(tf.clip_by_value(Policy.act_probs, new Tensor(1e-10), new Tensor(1.0))), axis: 1);
        entropy = tf.reduce_mean(entropy, axis: new[] { 0 }); // mean entropy of pi(obs)
        tf.summary.scalar("entropy", entropy);
    }

    // total objective: L_CLIP - c_1 * L_VF + c_2 * entropy
    using (tf.variable_scope("loss"))
    {
        loss = loss_clip - c_1 * loss_vf + c_2 * entropy;
        loss = -loss; // minimizing -loss == maximizing loss
        tf.summary.scalar("loss", loss);
    }

    this.merged = tf.summary.merge_all();
    Optimizer optimizer = tf.train.AdamOptimizer(learning_rate: (float)1e-4, epsilon: (float)1e-5);
    this.train_op = optimizer.minimize(loss, var_list: pi_trainable);
}
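/*
 * Usage sketch (not part of the original listing): a PPO update typically
 * (1) copies the current policy parameters into the old policy by running
 * assign_ops, then (2) runs train_op one or more times, feeding the four
 * placeholders defined under "train_inp" together with the observation
 * input of Policy_net. Only assign_ops, train_op and those placeholders
 * come from the constructor above; the Session field `sess`, the
 * `Policy.obs` placeholder and the method names below are assumptions
 * added for illustration (NDArray comes from NumSharp).
 */
public void AssignPolicyParameters()
{
    // sync old policy <- current policy before computing the clipped ratio
    foreach (Tensor op in assign_ops)
        sess.run(op);
}

public void Train(NDArray obs, NDArray actions, NDArray rewards, NDArray v_preds_next, NDArray gaes)
{
    // one gradient step on the clipped PPO objective
    sess.run(train_op,
        new FeedItem(Policy.obs, obs), // hypothetical observation placeholder on Policy_net
        new FeedItem(this.actions, actions),
        new FeedItem(this.rewards, rewards),
        new FeedItem(this.v_preds_next, v_preds_next),
        new FeedItem(this.gaes, gaes));
}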