static void AddPPOCMAGO()
    {
        var obj1 = new GameObject("LearningModel_PPOCMA");

        obj1.AddComponent <RLModelPPOCMA>();
        var obj2 = new GameObject("Trainer_PPOCMA");

        obj2.AddComponent <TrainerPPOCMA>();

        var obj3 = new GameObject("PPOCMA_Learning");

        obj1.transform.parent = obj3.transform;
        obj2.transform.parent = obj3.transform;

        //try to create parameter assets
        RLNetworkACSeperateVar network      = null;
        TrainerParamsPPO       trainerParam = null;

        CreateAssets <TrainerParamsPPO, RLNetworkACSeperateVar>("TrainerParamPPOCMA_" + obj1.scene.name + ".asset",
                                                                "NetworkPPOCMA_" + obj1.scene.name + ".asset",
                                                                out trainerParam, out network);
        network.actorHiddenLayers = new List <UnityNetwork.SimpleDenseLayerDef>();
        network.actorHiddenLayers.Add(new UnityNetwork.SimpleDenseLayerDef());
        network.criticHiddenLayers = new List <UnityNetwork.SimpleDenseLayerDef>();
        network.criticHiddenLayers.Add(new UnityNetwork.SimpleDenseLayerDef());

        var trainer = obj2.GetComponent <TrainerPPOCMA>();

        trainer.modelRef           = obj1.GetComponent <RLModelPPOCMA>();
        trainer.parameters         = trainerParam;
        trainer.checkpointPath     = checkpointPath;
        trainer.checkpointFileName = "Checkpoint_" + obj1.scene.name + ".bytes";

        ((RLModelPPOCMA)trainer.modelRef).network = network;
    }
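A minimal sketch of how such a setup helper is typically exposed in the Unity editor. The menu path and wrapper class below are illustrative assumptions, not part of the original project; the hierarchy comment reflects the parenting done in AddPPOCMAGO above.

    using UnityEditor;

    public static class LearningSetupMenu
    {
        // Hypothetical menu hookup; assumes AddPPOCMAGO is accessible from here.
        [MenuItem("GameObject/Reinforcement Learning/Add PPOCMA Setup")]
        public static void AddPPOCMAMenuItem()
        {
            AddPPOCMAGO();
        }
    }

    // Resulting scene hierarchy:
    // PPOCMA_Learning
    //     LearningModel_PPOCMA  (RLModelPPOCMA component)
    //     Trainer_PPOCMA        (TrainerPPOCMA component)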
Example #2
    protected void CreatePPOOptimizer(TrainerParamsPPO trainingParams, Tensor entropy, Tensor actionLogProb, Tensor outputValueFromNetwork, List <Tensor> extraInputTensors, List <Tensor> weightsToUpdate)
    {
        ClipEpsilon       = trainingParams.clipEpsilon;
        ValueLossWeight   = trainingParams.valueLossWeight;
        EntropyLossWeight = trainingParams.entropyLossWeight;
        ClipValueLoss     = trainingParams.clipValueLoss;


        var inputOldLogProb  = UnityTFUtils.Input(new int?[] { ActionSpace == SpaceType.continuous ? ActionSizes[0] : ActionSizes.Length }, name: "InputOldLogProb")[0];
        var inputAdvantage   = UnityTFUtils.Input(new int?[] { 1 }, name: "InputAdvantage")[0];
        var inputTargetValue = UnityTFUtils.Input(new int?[] { 1 }, name: "InputTargetValue")[0];
        var inputOldValue    = UnityTFUtils.Input(new int?[] { 1 }, name: "InputOldValue")[0];

        var inputClipEpsilon       = UnityTFUtils.Input(batch_shape: new int?[] { }, name: "ClipEpsilon", dtype: DataType.Float)[0];
        var inputClipValueLoss     = UnityTFUtils.Input(batch_shape: new int?[] { }, name: "ClipValueLoss", dtype: DataType.Float)[0];
        var inputValuelossWeight   = UnityTFUtils.Input(batch_shape: new int?[] { }, name: "ValueLossWeight", dtype: DataType.Float)[0];
        var inputEntropyLossWeight = UnityTFUtils.Input(batch_shape: new int?[] { }, name: "EntropyLossWeight", dtype: DataType.Float)[0];


        // value loss
        Tensor outputValueLoss = null;

        using (K.name_scope("ValueLoss"))
        {
            var clippedValueEstimate = inputOldValue + K.clip(outputValueFromNetwork - inputOldValue, 0.0f - inputClipValueLoss, inputClipValueLoss);
            var valueLoss1           = new MeanSquareError().Call(outputValueFromNetwork, inputTargetValue);
            var valueLoss2           = new MeanSquareError().Call(clippedValueEstimate, inputTargetValue);
            outputValueLoss = K.mean(K.maximum(valueLoss1, valueLoss2));
        }
        //var outputValueLoss = K.mean(valueLoss1);

        // Clipped Surrogate loss
        Tensor outputPolicyLoss;

        using (K.name_scope("ClippedCurreogateLoss"))
        {
            //Debug.LogWarning("testnew");
            //var probStopGradient = K.stop_gradient(actionProb);
            var probRatio = K.exp(actionLogProb - inputOldLogProb);
            var p_opt_a   = probRatio * inputAdvantage;
            var p_opt_b   = K.clip(probRatio, 1.0f - inputClipEpsilon, 1.0f + inputClipEpsilon) * inputAdvantage;

            outputPolicyLoss = (-1f) * K.mean(K.mean(K.minimun(p_opt_a, p_opt_b)), name: "ClippedCurreogateLoss");
        }
        //final weighted loss
        var outputLoss = outputPolicyLoss + inputValuelossWeight * outputValueLoss;

        outputLoss = outputLoss - inputEntropyLossWeight * entropy;
        outputLoss = K.identity(outputLoss, "OutputLoss");

        //add inputs, outputs and parameters to the list
        List <Tensor> allInputs = new List <Tensor>();

        allInputs.Add(inputOldLogProb);
        allInputs.Add(inputTargetValue);
        allInputs.Add(inputOldValue);
        allInputs.Add(inputAdvantage);
        allInputs.Add(inputClipEpsilon);
        allInputs.Add(inputClipValueLoss);
        allInputs.Add(inputValuelossWeight);
        allInputs.Add(inputEntropyLossWeight);

        allInputs.AddRange(extraInputTensors);

        //create optimizer and create necessary functions
        var updates = AddOptimizer(weightsToUpdate, outputLoss, optimizer);

        UpdatePPOFunction = K.function(allInputs, new List <Tensor> {
            outputLoss, outputValueLoss, outputPolicyLoss, entropy
        }, updates, "UpdateFunction");
    }
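To make the clipped surrogate concrete, here is a standalone sketch of the same objective on plain floats for a single sample (illustrative names; the graph code above computes this over batches of tensors):

    using System;

    static class ClippedSurrogateDemo
    {
        // ratio = exp(logProb - oldLogProb), exactly as probRatio above.
        static float SurrogateLoss(float logProb, float oldLogProb,
                                   float advantage, float clipEpsilon)
        {
            float ratio     = (float)Math.Exp(logProb - oldLogProb);
            float unclipped = ratio * advantage;                                                 // p_opt_a
            float clipped   = Math.Clamp(ratio, 1f - clipEpsilon, 1f + clipEpsilon) * advantage; // p_opt_b
            // Negated because the trainer minimizes a loss while PPO maximizes
            // the surrogate objective.
            return -Math.Min(unclipped, clipped);
        }
    }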
Example #3
    protected void InitializePPOStructureContinuousAction(Tensor vectorObs, Tensor normalizedVectorObs, List <Tensor> visualObs, TrainerParams trainerParams)
    {
        //all inputs list
        List <Tensor> allObservationInputs = new List <Tensor>();

        if (HasVectorObservation)
        {
            allObservationInputs.Add(vectorObs);
        }
        if (HasVisualObservation)
        {
            allObservationInputs.AddRange(visualObs);
        }

        //build the network
        Tensor outputValue = null; Tensor outputActionMean = null; Tensor outputLogVariance = null;

        network.BuildNetworkForContinuousActionSapce(normalizedVectorObs, visualObs, null, null, ActionSizes[0], out outputActionMean, out outputValue, out outputLogVariance);

        //value function
        ValueFunction = K.function(allObservationInputs, new List <Tensor> {
            outputValue
        }, null, "ValueFunction");

        Tensor outputActualAction = null, actionLogProb = null, outputVariance = null;

        //build action sampling
        outputVariance = K.exp(outputLogVariance);
        using (K.name_scope("SampleAction"))
        {
            outputActualAction = K.standard_normal(K.shape(outputActionMean), DataType.Float) * K.sqrt(outputVariance) + outputActionMean;
        }
        using (K.name_scope("ActionProbs"))
        {
            actionLogProb = K.log_normal_probability(K.stop_gradient(outputActualAction), outputActionMean, outputVariance, outputLogVariance);
        }
        //action function
        //ActionFunction = K.function(allObservationInputs, new List<Tensor> { outputActualAction, actionLogProb, outputActionMean }, null, "ActionFunction");
        ActionFunction = K.function(allObservationInputs, new List <Tensor> {
            outputActualAction, actionLogProb
        }, null, "ActionFunction");

        var probInputs = new List <Tensor>(); probInputs.AddRange(allObservationInputs); probInputs.Add(outputActualAction);

        //probability function
        ActionProbabilityFunction = K.function(probInputs, new List <Tensor> {
            actionLogProb
        }, null, "ActionProbabilityFunction");

        //training related
        TrainerParamsPPO trainingParams = trainerParams as TrainerParamsPPO;

        if (trainingParams != null)
        {
            Tensor outputEntropy;
            using (K.name_scope("Entropy"))
            {
                var temp = 0.5f * (Mathf.Log(2 * Mathf.PI * 2.7182818285f, 2.7182818285f) + outputLogVariance); // 0.5 * (ln(2*pi*e) + log(var)): per-dimension Gaussian entropy
                if (outputLogVariance.shape.Length == 2)
                {
                    outputEntropy = K.mean(K.mean(temp, 0, false), name: "OutputEntropy");
                }
                else
                {
                    outputEntropy = K.mean(temp, 0, false, name: "OutputEntropy");
                }
            }

            List <Tensor> extraInputs = new List <Tensor>();
            extraInputs.AddRange(allObservationInputs);
            extraInputs.Add(outputActualAction);
            CreatePPOOptimizer(trainingParams, outputEntropy, actionLogProb, outputValue, extraInputs, network.GetWeights());
        }
    }
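For reference, the per-dimension Gaussian quantities used above, as a plain-C# sketch. This assumes K.log_normal_probability follows the standard diagonal-Gaussian log-density; the "Entropy" name scope above computes the matching closed form.

    using System;

    static class GaussianReference
    {
        // log N(a | mean, var) for one action dimension: the standard
        // diagonal-Gaussian log-density assumed for K.log_normal_probability.
        static double LogNormalProbability(double a, double mean, double variance)
        {
            return -0.5 * ((a - mean) * (a - mean) / variance
                           + Math.Log(variance)
                           + Math.Log(2.0 * Math.PI));
        }

        // Differential entropy of the same Gaussian,
        // 0.5 * (log(2 * pi * e) + log(var)), as in the "Entropy" scope above.
        static double Entropy(double variance)
        {
            return 0.5 * (Math.Log(2.0 * Math.PI * Math.E) + Math.Log(variance));
        }
    }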
Example #4
    protected void InitializePPOStructureDiscreteAction(Tensor vectorObs, Tensor normalizedVectorObs, List <Tensor> visualObs, TrainerParams trainerParams)
    {
        //all inputs list
        List <Tensor> allObservationInputs = new List <Tensor>();

        if (HasVectorObservation)
        {
            allObservationInputs.Add(vectorObs);
        }
        if (HasVisualObservation)
        {
            allObservationInputs.AddRange(visualObs);
        }

        Tensor[] outputActionsLogits = null; Tensor outputValue = null;
        network.BuildNetworkForDiscreteActionSpace(normalizedVectorObs, visualObs, null, null, ActionSizes, out outputActionsLogits, out outputValue);

        ValueFunction = K.function(allObservationInputs, new List <Tensor> {
            outputValue
        }, null, "ValueFunction");

        //the action masks input placeholders
        List <Tensor> actionMasksInputs = new List <Tensor>();

        for (int i = 0; i < ActionSizes.Length; ++i)
        {
            actionMasksInputs.Add(UnityTFUtils.Input(new int?[] { ActionSizes[i] }, name: "ActionMask" + i)[0]);
        }

        Tensor[] outputActions, outputNormalizedLogits;
        CreateDiscreteActionMaskingLayer(outputActionsLogits, actionMasksInputs.ToArray(), out outputActions, out outputNormalizedLogits);

        //output tensors for discrete actions: the selected action index for each branch plus the normalized logits of all branches
        var outputDiscreteActions = new List <Tensor>();

        outputDiscreteActions.Add(K.identity(K.cast(ActionSizes.Length == 1 ? outputActions[0] : K.concat(outputActions.ToList(), 1), DataType.Float), "OutputAction"));
        outputDiscreteActions.AddRange(outputNormalizedLogits);
        var actionFunctionInputs = new List <Tensor>();

        actionFunctionInputs.AddRange(allObservationInputs); actionFunctionInputs.AddRange(actionMasksInputs);
        ActionFunction = K.function(actionFunctionInputs, outputDiscreteActions, null, "ActionFunction");


        TrainerParamsPPO trainingParams = trainerParams as TrainerParamsPPO;

        if (trainingParams != null)
        {
            // action probability from input action
            Tensor        outputEntropy;
            List <Tensor> inputActionsDiscreteSeperated = null, onehotInputActions = null;    //for discrete action space

            Tensor inputAction = UnityTFUtils.Input(new int?[] { ActionSizes.Length }, name: "InputActions", dtype: DataType.Int32)[0];

            //split the input for each discrete branch
            var splits = new int[ActionSizes.Length];
            for (int i = 0; i < splits.Length; ++i)
            {
                splits[i] = 1;
            }
            inputActionsDiscreteSeperated = K.split(inputAction, K.constant(splits, dtype: DataType.Int32), K.constant(1, dtype: DataType.Int32), ActionSizes.Length);

            Tensor actionLogProb = null;
            using (K.name_scope("ActionProbAndEntropy"))
            {
                onehotInputActions = inputActionsDiscreteSeperated.Select((x, i) => K.reshape(K.one_hot(x, K.constant <int>(ActionSizes[i], dtype: DataType.Int32), K.constant(1.0f), K.constant(0.0f)), new int[] { -1, ActionSizes[i] })).ToList();

                //entropy
                var entropies = outputActionsLogits.Select((t) => { return(K.mean((-1.0f) * K.sum(K.softmax(t) * K.log(K.softmax(t) + 0.00000001f), axis: 1), 0)); });
                outputEntropy = entropies.Aggregate((x, y) => { return(x + y); });

                //probabilities
                var actionProbsArray = ActionSizes.Select((x, i) => { return(K.sum(outputNormalizedLogits[i] * onehotInputActions[i], 1, true)); }).ToList();
                //actionLogProb = K.reshape(K.sum(K.log(outputActionFromNetwork) * onehotInputAction, 1), new int[] { -1, 1 });
                actionLogProb = ActionSizes.Length == 1 ? actionProbsArray[0] : K.concat(actionProbsArray, 1);
            }

            List <Tensor> extraInputs = new List <Tensor>();
            extraInputs.AddRange(actionFunctionInputs);
            extraInputs.Add(inputAction);

            CreatePPOOptimizer(trainingParams, outputEntropy, actionLogProb, outputValue, extraInputs, network.GetWeights());
        }
    }
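The one-hot bookkeeping above reduces to a per-branch lookup; a plain-C# sketch with illustrative names:

    using System.Linq;

    static class DiscreteLogProbDemo
    {
        // The one-hot multiply-and-sum above just selects the normalized
        // log-probability of the chosen action in one branch.
        static float BranchLogProb(float[] normalizedLogits, int chosenAction)
        {
            return normalizedLogits[chosenAction];
        }

        // Per-branch results are then concatenated, mirroring
        // K.concat(actionProbsArray, 1) above.
        static float[] AllBranchLogProbs(float[][] logitsPerBranch, int[] chosenActions)
        {
            return logitsPerBranch
                   .Select((logits, i) => BranchLogProb(logits, chosenActions[i]))
                   .ToArray();
        }
    }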
Example #5
    public override void Initialize()
    {
        iModelPPO = modelRef as IRLModelPPO;
        Debug.Assert(iModelPPO != null, "Please assign a model that implements the IRLModelPPO interface to modelRef");
        parametersPPO = parameters as TrainerParamsPPO;
        Debug.Assert(parametersPPO != null, "Please specify PPO trainer parameters");
        Debug.Assert(BrainToTrain != null, "Brain cannot be null");


        //initialize all data buffers
        statesEpisodeHistory      = new Dictionary <Agent, List <float> >();
        rewardsEpisodeHistory     = new Dictionary <Agent, List <float> >();
        actionsEpisodeHistory     = new Dictionary <Agent, List <float> >();
        actionprobsEpisodeHistory = new Dictionary <Agent, List <float> >();
        valuesEpisodeHistory      = new Dictionary <Agent, List <float> >();
        visualEpisodeHistory      = new Dictionary <Agent, List <List <float[, , ]> > >();
        actionMasksEpisodeHistory = new Dictionary <Agent, List <List <float> > >();

        accumulatedRewards = new Dictionary <Agent, float>();
        episodeSteps       = new Dictionary <Agent, int>();


        var brainParameters = BrainToTrain.brainParameters;

        Debug.Assert(brainParameters.vectorActionSize.Length > 0, "Action size cannot be zero. Please set it in the brain");
        List <DataBuffer.DataInfo> allBufferData = new List <DataBuffer.DataInfo>()
        {
            new DataBuffer.DataInfo("Action", typeof(float), new int[] { brainParameters.vectorActionSpaceType == SpaceType.continuous ? brainParameters.vectorActionSize[0] : brainParameters.vectorActionSize.Length }),
            new DataBuffer.DataInfo("ActionProb", typeof(float), new int[] { brainParameters.vectorActionSpaceType == SpaceType.continuous ? brainParameters.vectorActionSize[0] : brainParameters.vectorActionSize.Length }),
            new DataBuffer.DataInfo("TargetValue", typeof(float), new int[] { 1 }),
            new DataBuffer.DataInfo("OldValue", typeof(float), new int[] { 1 }),
            new DataBuffer.DataInfo("Advantage", typeof(float), new int[] { 1 })
        };

        if (brainParameters.vectorObservationSize > 0)
        {
            allBufferData.Add(new DataBuffer.DataInfo("VectorObservation", typeof(float), new int[] { brainParameters.vectorObservationSize * brainParameters.numStackedVectorObservations }));
        }

        for (int i = 0; i < brainParameters.cameraResolutions.Length; ++i)
        {
            int width  = brainParameters.cameraResolutions[i].width;
            int height = brainParameters.cameraResolutions[i].height;
            int channels;
            if (brainParameters.cameraResolutions[i].blackAndWhite)
            {
                channels = 1;
            }
            else
            {
                channels = 3;
            }

            allBufferData.Add(new DataBuffer.DataInfo("VisualObservation" + i, typeof(float), new int[] { height, width, channels }));
        }

        if (brainParameters.vectorActionSpaceType == SpaceType.discrete)
        {
            for (int i = 0; i < brainParameters.vectorActionSize.Length; ++i)
            {
                allBufferData.Add(new DataBuffer.DataInfo("ActionMask" + i, typeof(float), new int[] { brainParameters.vectorActionSize[i] }));
            }
        }

        dataBuffer = new DataBuffer(allBufferData.ToArray());

        //initialize loggers and the neural network model
        stats = new StatsLogger();

        modelRef.Initialize(BrainToTrain.brainParameters, isTraining, parameters);
        if (continueFromCheckpoint)
        {
            LoadModel();
        }
    }
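As a concrete instance of the buffer layout above, consider a hypothetical discrete brain with vectorActionSize = { 3, 2 } and no visual observations; the entries then take these shapes:

    // "Action"      float[2]  - one chosen index per branch
    // "ActionProb"  float[2]  - one probability entry per branch
    // "TargetValue" float[1]
    // "OldValue"    float[1]
    // "Advantage"   float[1]
    // "ActionMask0" float[3]  - mask over the 3 options of branch 0
    // "ActionMask1" float[2]  - mask over the 2 options of branch 1
    // plus "VectorObservation" float[vectorObservationSize * numStackedVectorObservations]
    // whenever vectorObservationSize > 0.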
Example #6
    /// <summary>
    /// Initialize the model for PPO
    /// </summary>
    /// <param name="trainerParams"></param>
    /// <param name="stateTensor"></param>
    /// <param name="inputVisualTensors"></param>
    /// <param name="outputValueFromNetwork"></param>
    /// <param name="outputActionFromNetwork"></param>
    /// <param name="outputVarianceFromNetwork"></param>
    /// <param name="weightsToUpdate"></param>
    protected void InitializePPOStructures(TrainerParams trainerParams, Tensor stateTensor, List <Tensor> inputVisualTensors, Tensor outputValueFromNetwork, Tensor outputActionFromNetwork, Tensor outputVarianceFromNetwork, List <Tensor> weightsToUpdate)
    {
        List <Tensor> allobservationInputs = new List <Tensor>();

        if (HasVectorObservation)
        {
            allobservationInputs.Add(stateTensor);
        }
        if (HasVisualObservation)
        {
            allobservationInputs.AddRange(inputVisualTensors);
        }

        ValueFunction = K.function(allobservationInputs, new List <Tensor> {
            outputValueFromNetwork
        }, null, "ValueFunction");

        Tensor outputActualAction = null; Tensor actionProb = null;

        if (ActionSpace == SpaceType.continuous)
        {
            using (K.name_scope("SampleAction"))
            {
                outputActualAction = K.standard_normal(K.shape(outputActionFromNetwork), DataType.Float) * K.sqrt(outputVarianceFromNetwork) + outputActionFromNetwork;
            }
            using (K.name_scope("ActionProbs"))
            {
                actionProb = K.normal_probability(K.stop_gradient(outputActualAction), outputActionFromNetwork, outputVarianceFromNetwork);
            }
            ActionFunction = K.function(allobservationInputs, new List <Tensor> {
                outputActualAction, actionProb, outputActionFromNetwork, outputVarianceFromNetwork
            }, null, "ActionFunction");

            var probInputs = new List <Tensor>(); probInputs.AddRange(allobservationInputs); probInputs.Add(outputActualAction);
            ActionProbabilityFunction = K.function(probInputs, new List <Tensor> {
                actionProb
            }, null, "ActionProbabilityFunction");
        }
        else
        {
            ActionFunction = K.function(allobservationInputs, new List <Tensor> {
                outputActionFromNetwork
            }, null, "ActionFunction");
        }

        TrainerParamsPPO trainingParams = trainerParams as TrainerParamsPPO;

        if (trainingParams != null)
        {
            //training needed inputs

            var inputOldProb     = UnityTFUtils.Input(new int?[] { ActionSpace == SpaceType.continuous ? ActionSize : 1 }, name: "InputOldProb")[0];
            var inputAdvantage   = UnityTFUtils.Input(new int?[] { 1 }, name: "InputAdvantage")[0];
            var inputTargetValue = UnityTFUtils.Input(new int?[] { 1 }, name: "InputTargetValue")[0];
            var inputOldValue    = UnityTFUtils.Input(new int?[] { 1 }, name: "InputOldValue")[0];

            ClipEpsilon       = trainingParams.clipEpsilon;
            ValueLossWeight   = trainingParams.valueLossWeight;
            EntropyLossWeight = trainingParams.entropyLossWeight;

            var inputClipEpsilon       = UnityTFUtils.Input(batch_shape: new int?[] { }, name: "ClipEpsilon", dtype: DataType.Float)[0];
            var inputValuelossWeight   = UnityTFUtils.Input(batch_shape: new int?[] { }, name: "ValueLossWeight", dtype: DataType.Float)[0];
            var inputEntropyLossWeight = UnityTFUtils.Input(batch_shape: new int?[] { }, name: "EntropyLossWeight", dtype: DataType.Float)[0];

            // action probability from input action
            Tensor outputEntropy;
            Tensor inputActionDiscrete = null, onehotInputAction = null;    //for discrete action space

            if (ActionSpace == SpaceType.continuous)
            {
                using (K.name_scope("Entropy"))
                {
                    var temp = K.mul(outputVarianceFromNetwork, 2 * Mathf.PI * 2.7182818285); // 2 * pi * e * variance
                    temp = K.mul(K.log(temp), 0.5);                                           // 0.5 * log(2*pi*e*var): per-dimension Gaussian entropy
                    if (outputVarianceFromNetwork.shape.Length == 2)
                    {
                        outputEntropy = K.mean(K.mean(temp, 0, false), name: "OutputEntropy");
                    }
                    else
                    {
                        outputEntropy = K.mean(temp, 0, false, name: "OutputEntropy");
                    }
                }
            }
            else
            {
                using (K.name_scope("ActionProbAndEntropy"))
                {
                    inputActionDiscrete = UnityTFUtils.Input(new int?[] { 1 }, name: "InputAction", dtype: DataType.Int32)[0];
                    onehotInputAction   = K.one_hot(inputActionDiscrete, K.constant <int>(ActionSize, dtype: DataType.Int32), K.constant(1.0f), K.constant(0.0f));
                    onehotInputAction   = K.reshape(onehotInputAction, new int[] { -1, ActionSize });
                    outputEntropy       = K.mean((-1.0f) * K.sum(outputActionFromNetwork * K.log(outputActionFromNetwork + 0.00000001f), axis: 1), 0);
                    actionProb          = K.reshape(K.sum(outputActionFromNetwork * onehotInputAction, 1), new int[] { -1, 1 });
                }
            }

            // value loss
            Tensor outputValueLoss = null;
            using (K.name_scope("ValueLoss"))
            {
                var clippedValueEstimate = inputOldValue + K.clip(outputValueFromNetwork - inputOldValue, 0.0f - inputClipEpsilon, inputClipEpsilon);
                var valueLoss1           = new MeanSquareError().Call(outputValueFromNetwork, inputTargetValue);
                var valueLoss2           = new MeanSquareError().Call(clippedValueEstimate, inputTargetValue);
                outputValueLoss = K.mean(K.maximum(valueLoss1, valueLoss2));
            }
            //var outputValueLoss = K.mean(valueLoss1);

            // Clipped Surrogate loss
            Tensor outputPolicyLoss;
            using (K.name_scope("ClippedCurreogateLoss"))
            {
                //Debug.LogWarning("testnew");
                //var probStopGradient = K.stop_gradient(actionProb);
                var probRatio = actionProb / (inputOldProb + 0.0000000001f);
                var p_opt_a   = probRatio * inputAdvantage;
                var p_opt_b   = K.clip(probRatio, 1.0f - inputClipEpsilon, 1.0f + inputClipEpsilon) * inputAdvantage;

                outputPolicyLoss = (-1f) * K.mean(K.mean(K.minimun(p_opt_a, p_opt_b)), name: "ClippedCurreogateLoss");
            }
            //final weighted loss
            var outputLoss = outputPolicyLoss + inputValuelossWeight * outputValueLoss;
            outputLoss = outputLoss - inputEntropyLossWeight * outputEntropy;
            outputLoss = K.identity(outputLoss, "OutputLoss");

            //add inputs, outputs and parameters to the list
            List <Tensor> allInputs = new List <Tensor>();
            if (HasVectorObservation)
            {
                allInputs.Add(stateTensor);
            }
            if (HasVisualObservation)
            {
                allInputs.AddRange(inputVisualTensors);
            }
            if (ActionSpace == SpaceType.continuous)
            {
                allInputs.Add(outputActualAction);
            }
            else
            {
                allInputs.Add(inputActionDiscrete);
            }

            allInputs.Add(inputOldProb);
            allInputs.Add(inputTargetValue);
            allInputs.Add(inputOldValue);
            allInputs.Add(inputAdvantage);
            allInputs.Add(inputClipEpsilon);
            allInputs.Add(inputValuelossWeight);
            allInputs.Add(inputEntropyLossWeight);

            //create optimizer and create necessary functions
            var updates = AddOptimizer(weightsToUpdate, outputLoss, optimizer);
            UpdatePPOFunction = K.function(allInputs, new List <Tensor> {
                outputLoss, outputValueLoss, outputPolicyLoss, outputEntropy, actionProb
            }, updates, "UpdateFunction");
        }
    }
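Note that this variant forms the probability ratio by direct division with a small epsilon for stability, while CreatePPOOptimizer in Example #2 uses exp(logProb - oldLogProb); for positive probabilities the two agree. A quick standalone check:

    using System;

    static class RatioEquivalenceCheck
    {
        static void Main()
        {
            double prob = 0.42, oldProb = 0.35;

            double byDivision = prob / (oldProb + 1e-10);                     // this variant
            double byLogs     = Math.Exp(Math.Log(prob) - Math.Log(oldProb)); // Example #2

            Console.WriteLine($"{byDivision:F6} vs {byLogs:F6}");   // both print 1.200000
        }
    }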
Example #7
    /// <summary>
    /// Initialize the model for PPO-CMA
    /// </summary>
    /// <param name="trainerParams"></param>
    /// <param name="stateTensor"></param>
    /// <param name="inputVisualTensors"></param>
    /// <param name="outputValueFromNetwork"></param>
    /// <param name="outputActionMeanFromNetwork"></param>
    /// <param name="outActionLogVarianceFromNetwork"></param>
    /// <param name="valueWeights"></param>
    /// <param name="meanWeights"></param>
    /// <param name="varweights"></param>
    protected void InitializePPOCMAStructures(TrainerParams trainerParams, Tensor stateTensor, List <Tensor> inputVisualTensors, Tensor outputValueFromNetwork, Tensor outputActionMeanFromNetwork, Tensor outActionLogVarianceFromNetwork, List <Tensor> valueWeights, List <Tensor> meanWeights, List <Tensor> varweights)
    {
        List <Tensor> allobservationInputs = new List <Tensor>();

        if (HasVectorObservation)
        {
            allobservationInputs.Add(stateTensor);
        }
        if (HasVisualObservation)
        {
            allobservationInputs.AddRange(inputVisualTensors);
        }

        ValueFunction = K.function(allobservationInputs, new List <Tensor> {
            outputValueFromNetwork
        }, null, "ValueFunction");

        Tensor outputActualAction = null;
        Tensor outputVariance     = K.exp(outActionLogVarianceFromNetwork);

        using (K.name_scope("SampleAction"))
        {
            outputActualAction = K.standard_normal(K.shape(outputActionMeanFromNetwork), DataType.Float) * K.sqrt(outputVariance) + outputActionMeanFromNetwork;
        }

        ActionFunction = K.function(allobservationInputs, new List <Tensor> {
            outputActualAction, outputActionMeanFromNetwork, outputVariance
        }, null, "ActionFunction");

        TrainerParamsPPO trainingParams = trainerParams as TrainerParamsPPO;

        if (trainingParams != null)
        {
            //training needed inputs
            var inputOldAction   = UnityTFUtils.Input(new int?[] { ActionSizes[0] }, name: "InputOldAction")[0];
            var inputAdvantage   = UnityTFUtils.Input(new int?[] { 1 }, name: "InputAdvantage")[0];
            var inputTargetValue = UnityTFUtils.Input(new int?[] { 1 }, name: "InputTargetValue")[0];
            var inputOldValue    = UnityTFUtils.Input(new int?[] { 1 }, name: "InputOldValue")[0];

            //var inputClipEpsilon = UnityTFUtils.Input(batch_shape: new int?[] { }, name: "ClipEpsilon", dtype: DataType.Float)[0];

            var inputClipEpsilonValue = UnityTFUtils.Input(batch_shape: new int?[] { }, name: "ClipEpsilonValue", dtype: DataType.Float)[0];
            // value loss
            Tensor outputValueLoss = null;
            using (K.name_scope("ValueLoss"))
            {
                var clippedValueEstimate = inputOldValue + K.clip(outputValueFromNetwork - inputOldValue, 0.0f - inputClipEpsilonValue, inputClipEpsilonValue);
                var valueLoss1           = new MeanSquareError().Call(outputValueFromNetwork, inputTargetValue);
                var valueLoss2           = new MeanSquareError().Call(clippedValueEstimate, inputTargetValue);
                outputValueLoss = K.mean(K.maximum(valueLoss1, valueLoss2));
                outputValueLoss = K.mean(valueLoss1); //note: this overrides the clipped loss above, so the plain MSE value loss is what is actually used
            }

            var           valueUpdates = AddOptimizer(valueWeights, outputValueLoss, optimizerValue);
            List <Tensor> valueInputs  = new List <Tensor>();
            if (HasVectorObservation)
            {
                valueInputs.Add(stateTensor);
            }
            if (HasVisualObservation)
            {
                valueInputs.AddRange(inputVisualTensors);
            }
            valueInputs.Add(inputOldValue);
            valueInputs.Add(inputTargetValue);
            valueInputs.Add(inputClipEpsilonValue);
            TrainValueFunction = K.function(valueInputs, new List <Tensor> {
                outputValueLoss
            }, valueUpdates, "TrainValueFunction");

            // actor losses
            Tensor meanLoss, varLoss;
            using (K.name_scope("ActorLosses"))
            {
                Tensor posAdvantage;
                if (usePositiveAdvOnly)
                {
                    posAdvantage = K.identity(K.relu(K.mean(inputAdvantage)), "ClippedPositiveAdv");
                }
                else
                {
                    posAdvantage = K.identity(K.mean(inputAdvantage), "Adv");
                }
                var meanNoGrad   = K.stop_gradient(outputActionMeanFromNetwork, "MeanNoGrad");
                var varNoGrad    = K.stop_gradient(outputVariance, "VarNoGrad");
                var logVar       = outActionLogVarianceFromNetwork;
                var logVarNoGrad = K.stop_gradient(logVar, "LogVarNoGrad");
                using (K.name_scope("VarLoss"))
                {
                    var logpNoMeanGrad = -1.0f * K.sum(0.5f * K.square(inputOldAction - meanNoGrad) / outputVariance + 0.5f * logVar, 1);
                    varLoss = K.identity(-1.0f * K.mean(posAdvantage * logpNoMeanGrad), "VarLoss");
                }
                using (K.name_scope("MeanLoss"))
                {
                    var logpNoVarGrad = -1.0f * K.sum(0.5f * K.square(inputOldAction - outputActionMeanFromNetwork) / varNoGrad + 0.5f * logVarNoGrad, 1);
                    meanLoss = K.identity(-1.0f * K.mean(posAdvantage * logpNoVarGrad), "MeanLoss");
                }
            }

            //add inputs, outputs and parameters to the list
            List <Tensor> allInputs = new List <Tensor>();
            if (HasVectorObservation)
            {
                allInputs.Add(stateTensor);
            }
            if (HasVisualObservation)
            {
                allInputs.AddRange(inputVisualTensors);
            }
            allInputs.Add(inputOldAction);
            allInputs.Add(inputAdvantage);


            //create optimizer and create necessary functions
            var updatesMean = AddOptimizer(meanWeights, meanLoss, optimizerMean);
            var updatesVar  = AddOptimizer(varweights, varLoss, optimizerVariance);

            TrainMeanFunction = K.function(allInputs, new List <Tensor> {
                meanLoss
            }, updatesMean, "UpdateMeanFunction");
            TrainVarianceFunction = K.function(allInputs, new List <Tensor> {
                varLoss
            }, updatesVar, "UpdateVarianceFunction");

            //pretraining for output mean and var
            var inputInitialStd  = UnityTFUtils.Input(new int?[] { ActionSizes[0] }, name: "InputInitialStd")[0];
            var inputInitialMean = UnityTFUtils.Input(new int?[] { ActionSizes[0] }, name: "InputInitialMean")[0];
            var policyInitLoss   = K.mean(K.mean(K.square(inputInitialMean - outputActionMeanFromNetwork)));
            policyInitLoss += K.mean(K.mean(K.square(inputInitialStd - K.sqrt(outputVariance))));

            var updatesPretrain = AddOptimizer(network.GetActorWeights(), policyInitLoss, optimizerPretrain);
            var pretrainInputs  = new List <Tensor>();
            pretrainInputs.Add(stateTensor);
            pretrainInputs.Add(inputInitialMean);
            pretrainInputs.Add(inputInitialStd);
            PretrainFunction = K.function(pretrainInputs, new List <Tensor> {
                policyInitLoss
            }, updatesPretrain, "PretrainFunction");
        }
    }
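To see why the two stop_gradient variants above decouple the updates, here is a scalar sketch of the two log-likelihood terms (plain C#, illustrative names). The variance loss treats the mean as a constant and the mean loss treats the variance as a constant, so each optimizer moves only its own weights:

    using System;

    static class PPOCMALossSketch
    {
        // Term behind VarLoss: the mean is a constant (stop_gradient above),
        // matching -sum(0.5 * (a - meanNoGrad)^2 / var + 0.5 * logVar).
        static double LogpForVarianceUpdate(double oldAction, double meanConst, double variance)
        {
            return -(0.5 * Math.Pow(oldAction - meanConst, 2) / variance
                     + 0.5 * Math.Log(variance));
        }

        // Term behind MeanLoss: the variance is a constant instead,
        // matching -sum(0.5 * (a - mean)^2 / varNoGrad + 0.5 * logVarNoGrad).
        static double LogpForMeanUpdate(double oldAction, double mean, double varianceConst)
        {
            return -(0.5 * Math.Pow(oldAction - mean, 2) / varianceConst
                     + 0.5 * Math.Log(varianceConst));
        }

        // Each actor loss is then -mean(advantage * logp), as in the
        // VarLoss / MeanLoss scopes above.
    }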