public void test_ppo_model_dc_vector()
{
    // Builds a discrete-control (dc) PPO model on vector observations only,
    // then runs one forward pass through the graph and prints the outputs.
    tf.reset_default_graph();
    tf_with(tf.variable_scope("FakeGraphScope"), delegate
    {
        model = new PPOModel(make_brain_parameters(discrete_action: true, visual_inputs: 0));
        init = tf.global_variables_initializer();
    });

    using (var sess = tf.Session())
    {
        sess.run(init);
        var results = sess.run
        (
            // run list
            (model.output, model.all_log_probs, model.value, model.entropy, model.learning_rate),
            // feed dict: batch of 2, sequence length 1, 6-wide stacked vector
            // observations, and an all-ones action mask (no actions disallowed)
            (model.batch_size, 2),
            (model.sequence_length, 1),
            (model.vector_in, np.array(new int[,] { { 1, 2, 3, 1, 2, 3 }, { 3, 4, 5, 3, 4, 5 } })),
            (model.action_masks, np.ones((2, 2)))
        );
        print(results);
    }
}
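The test above calls a make_brain_parameters helper that is not shown. A minimal sketch of what it might look like, assuming ML-Agents-style BrainParameters fields; the field names, the 3-wide observation stacked twice, and the single discrete branch of size 2 are all assumptions inferred from the (2, 6) vector_in and (2, 2) action_masks feeds:

public static BrainParameters make_brain_parameters(bool discrete_action = false, int visual_inputs = 0)
{
    // Hypothetical helper; field names follow ML-Agents' BrainParameters
    // and are not confirmed by the code above.
    return new BrainParameters
    {
        vector_observation_space_size = 3,       // assumed: 3 observations...
        num_stacked_vector_observations = 2,     // ...stacked twice -> vector_in width 6
        vector_action_space_size = new[] { 2 },  // assumed: one discrete branch of 2 actions -> (2, 2) mask
        vector_action_space_type = discrete_action ? SpaceType.discrete : SpaceType.continuous
        // visual_inputs camera resolutions omitted for brevity
    };
}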
// Use this for initialization
void Start()
{
    PPONetworkContinuousSimple network;
    if (environment.is3D)
    {
        // The 3D branch differs only in the first constructor argument (8 vs. 5)
        // and in using a tighter PPO clipping threshold.
        network = new PPONetworkContinuousSimple(8, 2, 2, 32, DeviceDescriptor.CPUDevice, 0.01f);
        model = new PPOModel(network);
        trainer = new TrainerPPOSimple(model, LearnerDefs.AdamLearner(learningRate), 1, 10000, 200);
        trainer.ClipEpsilon = 0.1f;
    }
    else
    {
        network = new PPONetworkContinuousSimple(5, 2, 2, 32, DeviceDescriptor.CPUDevice, 0.01f);
        model = new PPOModel(network);
        trainer = new TrainerPPOSimple(model, LearnerDefs.AdamLearner(learningRate), 1, 10000, 200);
    }

    //test
    //trainer.RewardDiscountFactor = 0.5f;

    loss = new AutoAverage(iterationForEachTrain);
    episodePointAve = new AutoAverage(episodeToRunForEachTrain);
}
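The only PPO-specific knob touched above is ClipEpsilon. As a reference, here is a minimal standalone sketch of the clipped surrogate objective that threshold controls; this is illustrative, not the TrainerPPOSimple internals, and PpoClipSketch/ClippedSurrogate are hypothetical names:

using System;

static class PpoClipSketch
{
    // ratio = pi_new(a|s) / pi_old(a|s); advantage = estimated advantage A(s, a).
    public static double ClippedSurrogate(double ratio, double advantage, double clipEpsilon)
    {
        double unclipped = ratio * advantage;
        double clippedRatio = Math.Max(1.0 - clipEpsilon, Math.Min(1.0 + clipEpsilon, ratio));
        // PPO maximizes the minimum of the two terms, so moving the policy
        // further than +/- clipEpsilon in probability ratio gains nothing.
        return Math.Min(unclipped, clippedRatio * advantage);
    }

    public static void Main()
    {
        // With ClipEpsilon = 0.1 as above, a ratio of 1.5 on a positive advantage
        // is credited as if it were 1.1: 1.1 * 2.0 = 2.2 rather than 3.0.
        Console.WriteLine(ClippedSurrogate(1.5, 2.0, 0.1));
    }
}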
// Use this for initialization
void Start()
{
    var network = new PPONetworkDiscreteSimple(6, 3, 4, 64, DeviceDescriptor.CPUDevice, 0.01f);
    model = new PPOModel(network);
    trainer = new TrainerPPOSimple(model, LearnerDefs.AdamLearner(learningRate), 1, 50000, 2000);

    //test
    //trainer.RewardDiscountFactor = 0.5f;

    loss = new AutoAverage(iterationForEachTrain);
    episodePointAve = new AutoAverage(episodeToRunForEachTrain);
}
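Both Start() variants keep a commented-out RewardDiscountFactor test line. As a reference for what that factor does, a minimal sketch of computing discounted returns over an episode; DiscountedReturns is a hypothetical name, not the trainer's actual implementation:

// G_t = r_t + gamma * G_{t+1}, computed backwards over one episode's rewards.
public static float[] DiscountedReturns(float[] rewards, float gamma)
{
    var returns = new float[rewards.Length];
    float running = 0f;
    for (int t = rewards.Length - 1; t >= 0; t--)
    {
        running = rewards[t] + gamma * running;
        returns[t] = running;
    }
    return returns;
}

With the commented-out factor of 0.5f, rewards { 0f, 0f, 1f } yield returns { 0.25f, 0.5f, 1f }.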