public void test_ppo_model_dc_vector()
{
    tf.reset_default_graph();

    // Build a discrete-action PPO model with no visual observations inside a throwaway scope.
    tf_with(tf.variable_scope("FakeGraphScope"), delegate
    {
        model = new PPOModel(make_brain_parameters(discrete_action: true, visual_inputs: 0));
        init  = tf.global_variables_initializer();
    });

    using (var sess = tf.Session())
    {
        sess.run(init);

        // Run a single forward pass on a batch of two vector observations.
        var results = sess.run(
            // run list
            (model.output,
             model.all_log_probs,
             model.value,
             model.entropy,
             model.learning_rate),
            // feed dict
            (model.batch_size, 2),
            (model.sequence_length, 1),
            (model.vector_in, np.array(new int[,] {
                { 1, 2, 3, 1, 2, 3 },
                { 3, 4, 5, 3, 4, 5 }
            })),
            (model.action_masks, np.ones((2, 2))));  // all-ones mask: no actions masked out

        print(results);
    }
}
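A quick aside on the tf_with idiom used in the test above: in TensorFlow.NET it stands in for Python's with-statement, entering the variable scope before the delegate body runs and exiting it afterwards. A minimal, self-contained sketch (the imports and the wrapper class are assumptions; the tf / tf_with calls are the ones already used above):

using Tensorflow;
using static Tensorflow.Binding;

class VariableScopeSketch
{
    static void Main()
    {
        tf.reset_default_graph();

        // tf_with plays the role of Python's `with` statement: the delegate body
        // runs inside "FakeGraphScope", and the scope is exited when it returns.
        tf_with(tf.variable_scope("FakeGraphScope"), delegate
        {
            var init = tf.global_variables_initializer();
        });
    }
}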
Example #2
    // Use this for initialization
    void Start()
    {
        PPONetworkContinuousSimple network;

        if (environment.is3D)
        {
            network             = new PPONetworkContinuousSimple(8, 2, 2, 32, DeviceDescriptor.CPUDevice, 0.01f);
            model               = new PPOModel(network);
            trainer             = new TrainerPPOSimple(model, LearnerDefs.AdamLearner(learningRate), 1, 10000, 200);
            trainer.ClipEpsilon = 0.1f;
        }
        else
        {
            network = new PPONetworkContinuousSimple(5, 2, 2, 32, DeviceDescriptor.CPUDevice, 0.01f);
            model   = new PPOModel(network);
            trainer = new TrainerPPOSimple(model, LearnerDefs.AdamLearner(learningRate), 1, 10000, 200);
        }

        //test
        //trainer.RewardDiscountFactor = 0.5f;

        loss            = new AutoAverage(iterationForEachTrain);
        episodePointAve = new AutoAverage(episodeToRunForEachTrain);
    }
Example #3
    // Use this for initialization
    void Start()
    {
        var network = new PPONetworkDiscreteSimple(6, 3, 4, 64, DeviceDescriptor.CPUDevice, 0.01f);

        model   = new PPOModel(network);
        trainer = new TrainerPPOSimple(model, LearnerDefs.AdamLearner(learningRate), 1, 50000, 2000);

        //test
        //trainer.RewardDiscountFactor = 0.5f;

        loss            = new AutoAverage(iterationForEachTrain);
        episodePointAve = new AutoAverage(episodeToRunForEachTrain);
    }
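Both Unity examples above pin the CNTK network to the CPU via DeviceDescriptor.CPUDevice. As a hedged aside, CNTK's C# API also provides DeviceDescriptor.GPUDevice(int), so the device could be selected once at startup; the helper below and its useGpu flag are hypothetical, not part of the original scripts:

using CNTK;

static class DeviceSelection
{
    // DeviceDescriptor.CPUDevice and DeviceDescriptor.GPUDevice(int) are standard CNTK C# API;
    // the useGpu flag is only for illustration.
    public static DeviceDescriptor Select(bool useGpu)
    {
        return useGpu ? DeviceDescriptor.GPUDevice(0) : DeviceDescriptor.CPUDevice;
    }
}

The network constructors above would then take DeviceSelection.Select(useGpu) in place of the hardcoded DeviceDescriptor.CPUDevice.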