static void AddPPOCMAGO()
{
    var obj1 = new GameObject("LearningModel_PPOCMA");
    obj1.AddComponent<RLModelPPOCMA>();
    var obj2 = new GameObject("Trainer_PPOCMA");
    obj2.AddComponent<TrainerPPOCMA>();
    var obj3 = new GameObject("PPOCMA_Learning");
    obj1.transform.parent = obj3.transform;
    obj2.transform.parent = obj3.transform;

    //try to create parameter assets
    RLNetworkACSeperateVar network = null;
    TrainerParamsPPO trainerParam = null;
    CreateAssets<TrainerParamsPPO, RLNetworkACSeperateVar>(
        "TrainerParamPPOCMA_" + obj1.scene.name + ".asset",
        "NetworkPPOCMA_" + obj1.scene.name + ".asset",
        out trainerParam, out network);

    network.actorHiddenLayers = new List<UnityNetwork.SimpleDenseLayerDef>();
    network.actorHiddenLayers.Add(new UnityNetwork.SimpleDenseLayerDef());
    network.criticHiddenLayers = new List<UnityNetwork.SimpleDenseLayerDef>();
    network.criticHiddenLayers.Add(new UnityNetwork.SimpleDenseLayerDef());

    var trainer = obj2.GetComponent<TrainerPPOCMA>();
    trainer.modelRef = obj1.GetComponent<RLModelPPOCMA>();
    trainer.parameters = trainerParam;
    trainer.checkpointPath = checkpointPath;
    trainer.checkpointFileName = "Checkpoint_" + obj1.scene.name + ".bytes";
    ((RLModelPPOCMA)trainer.modelRef).network = network;
}
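// Hedged usage sketch: a setup helper like AddPPOCMAGO is normally triggered from a
// Unity editor menu. The wrapper class, menu path, and the assumption that
// AddPPOCMAGO is reachable from this scope are all illustrative, not the
// repository's actual registration.
public static class PPOCMAMenuSketch
{
    [UnityEditor.MenuItem("GameObject/UnityTensorflow/PPOCMA Learning Setup")]
    public static void CreatePPOCMASetup()
    {
        // builds the model/trainer hierarchy and creates the parameter assets
        AddPPOCMAGO();
    }
}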
protected void CreatePPOOptimizer(TrainerParamsPPO trainingParams, Tensor entropy, Tensor actionLogProb,
    Tensor outputValueFromNetwork, List<Tensor> extraInputTensors, List<Tensor> weightsToUpdate)
{
    ClipEpsilon = trainingParams.clipEpsilon;
    ValueLossWeight = trainingParams.valueLossWeight;
    EntropyLossWeight = trainingParams.entropyLossWeight;
    ClipValueLoss = trainingParams.clipValueLoss;

    var inputOldLogProb = UnityTFUtils.Input(new int?[] { ActionSpace == SpaceType.continuous ? ActionSizes[0] : ActionSizes.Length }, name: "InputOldLogProb")[0];
    var inputAdvantage = UnityTFUtils.Input(new int?[] { 1 }, name: "InputAdvantage")[0];
    var inputTargetValue = UnityTFUtils.Input(new int?[] { 1 }, name: "InputTargetValue")[0];
    var inputOldValue = UnityTFUtils.Input(new int?[] { 1 }, name: "InputOldValue")[0];
    var inputClipEpsilon = UnityTFUtils.Input(batch_shape: new int?[] { }, name: "ClipEpsilon", dtype: DataType.Float)[0];
    var inputClipValueLoss = UnityTFUtils.Input(batch_shape: new int?[] { }, name: "ClipValueLoss", dtype: DataType.Float)[0];
    var inputValuelossWeight = UnityTFUtils.Input(batch_shape: new int?[] { }, name: "ValueLossWeight", dtype: DataType.Float)[0];
    var inputEntropyLossWeight = UnityTFUtils.Input(batch_shape: new int?[] { }, name: "EntropyLossWeight", dtype: DataType.Float)[0];

    // value loss: clipped value update, penalized with the worse of the clipped and unclipped MSE
    Tensor outputValueLoss = null;
    using (K.name_scope("ValueLoss"))
    {
        var clippedValueEstimate = inputOldValue + K.clip(outputValueFromNetwork - inputOldValue, 0.0f - inputClipValueLoss, inputClipValueLoss);
        var valueLoss1 = new MeanSquareError().Call(outputValueFromNetwork, inputTargetValue);
        var valueLoss2 = new MeanSquareError().Call(clippedValueEstimate, inputTargetValue);
        outputValueLoss = K.mean(K.maximum(valueLoss1, valueLoss2));
    }

    // clipped surrogate policy loss, on the probability ratio exp(logp - logpOld)
    Tensor outputPolicyLoss;
    using (K.name_scope("ClippedSurrogateLoss"))
    {
        var probRatio = K.exp(actionLogProb - inputOldLogProb);
        var p_opt_a = probRatio * inputAdvantage;
        var p_opt_b = K.clip(probRatio, 1.0f - inputClipEpsilon, 1.0f + inputClipEpsilon) * inputAdvantage;
        outputPolicyLoss = (-1f) * K.mean(K.mean(K.minimun(p_opt_a, p_opt_b)), name: "ClippedSurrogateLoss");
    }

    // final weighted loss
    var outputLoss = outputPolicyLoss + inputValuelossWeight * outputValueLoss;
    outputLoss = outputLoss - inputEntropyLossWeight * entropy;
    outputLoss = K.identity(outputLoss, "OutputLoss");

    // add inputs, outputs and parameters to the list
    List<Tensor> allInputs = new List<Tensor>();
    allInputs.Add(inputOldLogProb);
    allInputs.Add(inputTargetValue);
    allInputs.Add(inputOldValue);
    allInputs.Add(inputAdvantage);
    allInputs.Add(inputClipEpsilon);
    allInputs.Add(inputClipValueLoss);
    allInputs.Add(inputValuelossWeight);
    allInputs.Add(inputEntropyLossWeight);
    allInputs.AddRange(extraInputTensors);

    // create the optimizer and the update function
    var updates = AddOptimizer(weightsToUpdate, outputLoss, optimizer);
    UpdatePPOFunction = K.function(allInputs, new List<Tensor> { outputLoss, outputValueLoss, outputPolicyLoss, entropy }, updates, "UpdateFunction");
}
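// A minimal numeric sketch, for a single sample, of the objective the graph above
// builds symbolically. Plain C#, not part of the TensorFlow graph; the class and
// method names are illustrative, not from this repository.
static class ClippedSurrogateSketch
{
    // L_clip = -min(r * A, clip(r, 1 - eps, 1 + eps) * A), with r = exp(logp - logpOld)
    public static float Loss(float logProb, float oldLogProb, float advantage, float eps)
    {
        float ratio = (float)System.Math.Exp(logProb - oldLogProb);
        float clipped = System.Math.Min(System.Math.Max(ratio, 1f - eps), 1f + eps);
        return -System.Math.Min(ratio * advantage, clipped * advantage);
    }
}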
protected void InitializePPOStructureContinuousAction(Tensor vectorObs, Tensor normalizedVectorObs, List<Tensor> visualObs, TrainerParams trainerParams)
{
    //all inputs list
    List<Tensor> allObservationInputs = new List<Tensor>();
    if (HasVectorObservation)
    {
        allObservationInputs.Add(vectorObs);
    }
    if (HasVisualObservation)
    {
        allObservationInputs.AddRange(visualObs);
    }

    //build the network
    Tensor outputValue = null;
    Tensor outputActionMean = null;
    Tensor outputLogVariance = null;
    network.BuildNetworkForContinuousActionSapce(normalizedVectorObs, visualObs, null, null, ActionSizes[0], out outputActionMean, out outputValue, out outputLogVariance);

    //value function
    ValueFunction = K.function(allObservationInputs, new List<Tensor> { outputValue }, null, "ValueFunction");

    Tensor outputActualAction = null, actionLogProb = null, outputVariance = null;

    //build action sampling: a ~ N(mean, var) via the reparameterization mean + sqrt(var) * N(0, 1)
    outputVariance = K.exp(outputLogVariance);
    using (K.name_scope("SampleAction"))
    {
        outputActualAction = K.standard_normal(K.shape(outputActionMean), DataType.Float) * K.sqrt(outputVariance) + outputActionMean;
    }
    using (K.name_scope("ActionProbs"))
    {
        actionLogProb = K.log_normal_probability(K.stop_gradient(outputActualAction), outputActionMean, outputVariance, outputLogVariance);
    }

    //action function
    ActionFunction = K.function(allObservationInputs, new List<Tensor> { outputActualAction, actionLogProb }, null, "ActionFunction");

    var probInputs = new List<Tensor>();
    probInputs.AddRange(allObservationInputs);
    probInputs.Add(outputActualAction);

    //probability function
    ActionProbabilityFunction = K.function(probInputs, new List<Tensor> { actionLogProb }, null, "ActionProbabilityFunction");

    //training related
    TrainerParamsPPO trainingParams = trainerParams as TrainerParamsPPO;
    if (trainingParams != null)
    {
        Tensor outputEntropy;
        using (K.name_scope("Entropy"))
        {
            //Gaussian entropy per dimension: 0.5 * (ln(2 * pi * e) + log(variance))
            var temp = 0.5f * (Mathf.Log(2 * Mathf.PI * 2.7182818285f, 2.7182818285f) + outputLogVariance);
            if (outputLogVariance.shape.Length == 2)
            {
                outputEntropy = K.mean(K.mean(temp, 0, false), name: "OutputEntropy");
            }
            else
            {
                outputEntropy = K.mean(temp, 0, false, name: "OutputEntropy");
            }
        }

        List<Tensor> extraInputs = new List<Tensor>();
        extraInputs.AddRange(allObservationInputs);
        extraInputs.Add(outputActualAction);
        CreatePPOOptimizer(trainingParams, outputEntropy, actionLogProb, outputValue, extraInputs, network.GetWeights());
    }
}
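// A minimal numeric sketch of the diagonal-Gaussian log density that
// K.log_normal_probability is assumed to compute per dimension above.
// Plain C#, illustrative only.
static class GaussianLogProbSketch
{
    // log N(x | mean, var) = -0.5 * ((x - mean)^2 / var + log(var) + log(2 * pi))
    public static double LogProb(double x, double mean, double variance)
    {
        double d = x - mean;
        return -0.5 * (d * d / variance + System.Math.Log(variance) + System.Math.Log(2.0 * System.Math.PI));
    }
}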
protected void InitializePPOStructureDiscreteAction(Tensor vectorObs, Tensor normalizedVectorObs, List<Tensor> visualObs, TrainerParams trainerParams)
{
    //all inputs list
    List<Tensor> allObservationInputs = new List<Tensor>();
    if (HasVectorObservation)
    {
        allObservationInputs.Add(vectorObs);
    }
    if (HasVisualObservation)
    {
        allObservationInputs.AddRange(visualObs);
    }

    Tensor[] outputActionsLogits = null;
    Tensor outputValue = null;
    network.BuildNetworkForDiscreteActionSpace(normalizedVectorObs, visualObs, null, null, ActionSizes, out outputActionsLogits, out outputValue);

    ValueFunction = K.function(allObservationInputs, new List<Tensor> { outputValue }, null, "ValueFunction");

    //the action mask input placeholders
    List<Tensor> actionMasksInputs = new List<Tensor>();
    for (int i = 0; i < ActionSizes.Length; ++i)
    {
        actionMasksInputs.Add(UnityTFUtils.Input(new int?[] { ActionSizes[i] }, name: "ActionMask" + i)[0]);
    }

    Tensor[] outputActions, outputNormalizedLogits;
    CreateDiscreteActionMaskingLayer(outputActionsLogits, actionMasksInputs.ToArray(), out outputActions, out outputNormalizedLogits);

    //output tensors for discrete actions: the selected action of each branch and the normalized logits of all branches
    var outputDiscreteActions = new List<Tensor>();
    outputDiscreteActions.Add(K.identity(K.cast(ActionSizes.Length == 1 ? outputActions[0] : K.concat(outputActions.ToList(), 1), DataType.Float), "OutputAction"));
    outputDiscreteActions.AddRange(outputNormalizedLogits);

    var actionFunctionInputs = new List<Tensor>();
    actionFunctionInputs.AddRange(allObservationInputs);
    actionFunctionInputs.AddRange(actionMasksInputs);
    ActionFunction = K.function(actionFunctionInputs, outputDiscreteActions, null, "ActionFunction");

    TrainerParamsPPO trainingParams = trainerParams as TrainerParamsPPO;
    if (trainingParams != null)
    {
        //action probability from the input action
        Tensor outputEntropy;
        List<Tensor> inputActionsDiscreteSeperated = null, onehotInputActions = null;    //for discrete action space
        Tensor inputAction = UnityTFUtils.Input(new int?[] { ActionSizes.Length }, name: "InputActions", dtype: DataType.Int32)[0];

        //split the input into one column per discrete action branch
        var splits = new int[ActionSizes.Length];
        for (int i = 0; i < splits.Length; ++i)
        {
            splits[i] = 1;
        }
        inputActionsDiscreteSeperated = K.split(inputAction, K.constant(splits, dtype: DataType.Int32), K.constant(1, dtype: DataType.Int32), ActionSizes.Length);

        Tensor actionLogProb = null;
        using (K.name_scope("ActionProbAndEntropy"))
        {
            onehotInputActions = inputActionsDiscreteSeperated.Select((x, i) => K.reshape(K.one_hot(x, K.constant<int>(ActionSizes[i], dtype: DataType.Int32), K.constant(1.0f), K.constant(0.0f)), new int[] { -1, ActionSizes[i] })).ToList();

            //entropy, summed over branches: H = -sum(p * log(p))
            var entropies = outputActionsLogits.Select((t) => { return K.mean((-1.0f) * K.sum(K.softmax(t) * K.log(K.softmax(t) + 0.00000001f), axis: 1), 0); });
            outputEntropy = entropies.Aggregate((x, y) => { return x + y; });

            //log probability of the chosen action in each branch, selected via the one-hot mask
            var actionProbsArray = ActionSizes.Select((x, i) => { return K.sum(outputNormalizedLogits[i] * onehotInputActions[i], 1, true); }).ToList();
            actionLogProb = ActionSizes.Length == 1 ? actionProbsArray[0] : K.concat(actionProbsArray, 1);
        }

        List<Tensor> extraInputs = new List<Tensor>();
        extraInputs.AddRange(actionFunctionInputs);
        extraInputs.Add(inputAction);
        CreatePPOOptimizer(trainingParams, outputEntropy, actionLogProb, outputValue, extraInputs, network.GetWeights());
    }
}
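// A minimal numeric sketch of the per-branch entropy built above:
// H = -sum_i p_i * log(p_i) over a softmax distribution, with the same epsilon
// idea as the graph. Plain C#, illustrative only.
static class DiscreteEntropySketch
{
    public static double Entropy(double[] probs)
    {
        double h = 0.0;
        foreach (var p in probs)
        {
            h -= p * System.Math.Log(p + 1e-8);
        }
        return h;
    }
}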
public override void Initialize()
{
    iModelPPO = modelRef as IRLModelPPO;
    Debug.Assert(iModelPPO != null, "Please assign a model that implements the IRLModelPPO interface to modelRef");
    parametersPPO = parameters as TrainerParamsPPO;
    Debug.Assert(parametersPPO != null, "Please specify PPO trainer parameters");
    Debug.Assert(BrainToTrain != null, "Brain cannot be null");

    //initialize all data buffers
    statesEpisodeHistory = new Dictionary<Agent, List<float>>();
    rewardsEpisodeHistory = new Dictionary<Agent, List<float>>();
    actionsEpisodeHistory = new Dictionary<Agent, List<float>>();
    actionprobsEpisodeHistory = new Dictionary<Agent, List<float>>();
    valuesEpisodeHistory = new Dictionary<Agent, List<float>>();
    visualEpisodeHistory = new Dictionary<Agent, List<List<float[,,]>>>();
    actionMasksEpisodeHistory = new Dictionary<Agent, List<List<float>>>();
    accumulatedRewards = new Dictionary<Agent, float>();
    episodeSteps = new Dictionary<Agent, int>();

    var brainParameters = BrainToTrain.brainParameters;
    Debug.Assert(brainParameters.vectorActionSize.Length > 0, "Action size cannot be zero. Please set it in the brain");

    List<DataBuffer.DataInfo> allBufferData = new List<DataBuffer.DataInfo>()
    {
        new DataBuffer.DataInfo("Action", typeof(float), new int[] { brainParameters.vectorActionSpaceType == SpaceType.continuous ? brainParameters.vectorActionSize[0] : brainParameters.vectorActionSize.Length }),
        new DataBuffer.DataInfo("ActionProb", typeof(float), new int[] { brainParameters.vectorActionSpaceType == SpaceType.continuous ? brainParameters.vectorActionSize[0] : brainParameters.vectorActionSize.Length }),
        new DataBuffer.DataInfo("TargetValue", typeof(float), new int[] { 1 }),
        new DataBuffer.DataInfo("OldValue", typeof(float), new int[] { 1 }),
        new DataBuffer.DataInfo("Advantage", typeof(float), new int[] { 1 })
    };

    if (brainParameters.vectorObservationSize > 0)
    {
        allBufferData.Add(new DataBuffer.DataInfo("VectorObservation", typeof(float), new int[] { brainParameters.vectorObservationSize * brainParameters.numStackedVectorObservations }));
    }

    for (int i = 0; i < brainParameters.cameraResolutions.Length; ++i)
    {
        int width = brainParameters.cameraResolutions[i].width;
        int height = brainParameters.cameraResolutions[i].height;
        int channels = brainParameters.cameraResolutions[i].blackAndWhite ? 1 : 3;
        allBufferData.Add(new DataBuffer.DataInfo("VisualObservation" + i, typeof(float), new int[] { height, width, channels }));
    }

    if (brainParameters.vectorActionSpaceType == SpaceType.discrete)
    {
        for (int i = 0; i < brainParameters.vectorActionSize.Length; ++i)
        {
            allBufferData.Add(new DataBuffer.DataInfo("ActionMask" + i, typeof(float), new int[] { brainParameters.vectorActionSize[i] }));
        }
    }
    dataBuffer = new DataBuffer(allBufferData.ToArray());

    //initialize the logger and the neural network model
    stats = new StatsLogger();
    modelRef.Initialize(BrainToTrain.brainParameters, isTraining, parameters);
    if (continueFromCheckpoint)
    {
        LoadModel();
    }
}
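// Hedged sketch of how the "Advantage" and "TargetValue" buffer entries above are
// typically filled for PPO, using generalized advantage estimation (GAE). This is
// illustrative plain C#, not the repository's actual implementation.
static class GAESketch
{
    // returns per-step advantages; the value targets are advantages[t] + values[t]
    public static float[] ComputeAdvantages(float[] rewards, float[] values, float bootstrapValue, float gamma, float lambda)
    {
        var advantages = new float[rewards.Length];
        float gae = 0f;
        for (int t = rewards.Length - 1; t >= 0; --t)
        {
            float nextValue = (t == rewards.Length - 1) ? bootstrapValue : values[t + 1];
            float delta = rewards[t] + gamma * nextValue - values[t];    // one-step TD error
            gae = delta + gamma * lambda * gae;                          // discounted sum of TD errors
            advantages[t] = gae;
        }
        return advantages;
    }
}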
/// <summary>
/// Initialize the model for PPO
/// </summary>
/// <param name="trainerParams"></param>
/// <param name="stateTensor"></param>
/// <param name="inputVisualTensors"></param>
/// <param name="outputValueFromNetwork"></param>
/// <param name="outputActionFromNetwork"></param>
/// <param name="outputVarianceFromNetwork"></param>
/// <param name="weightsToUpdate"></param>
protected void InitializePPOStructures(TrainerParams trainerParams, Tensor stateTensor, List<Tensor> inputVisualTensors, Tensor outputValueFromNetwork, Tensor outputActionFromNetwork, Tensor outputVarianceFromNetwork, List<Tensor> weightsToUpdate)
{
    List<Tensor> allobservationInputs = new List<Tensor>();
    if (HasVectorObservation)
    {
        allobservationInputs.Add(stateTensor);
    }
    if (HasVisualObservation)
    {
        allobservationInputs.AddRange(inputVisualTensors);
    }

    ValueFunction = K.function(allobservationInputs, new List<Tensor> { outputValueFromNetwork }, null, "ValueFunction");

    Tensor outputActualAction = null;
    Tensor actionProb = null;
    if (ActionSpace == SpaceType.continuous)
    {
        using (K.name_scope("SampleAction"))
        {
            outputActualAction = K.standard_normal(K.shape(outputActionFromNetwork), DataType.Float) * K.sqrt(outputVarianceFromNetwork) + outputActionFromNetwork;
        }
        using (K.name_scope("ActionProbs"))
        {
            actionProb = K.normal_probability(K.stop_gradient(outputActualAction), outputActionFromNetwork, outputVarianceFromNetwork);
        }
        ActionFunction = K.function(allobservationInputs, new List<Tensor> { outputActualAction, actionProb, outputActionFromNetwork, outputVarianceFromNetwork }, null, "ActionFunction");

        var probInputs = new List<Tensor>();
        probInputs.AddRange(allobservationInputs);
        probInputs.Add(outputActualAction);
        ActionProbabilityFunction = K.function(probInputs, new List<Tensor> { actionProb }, null, "ActionProbabilityFunction");
    }
    else
    {
        ActionFunction = K.function(allobservationInputs, new List<Tensor> { outputActionFromNetwork }, null, "ActionFunction");
    }

    TrainerParamsPPO trainingParams = trainerParams as TrainerParamsPPO;
    if (trainingParams != null)
    {
        //inputs needed for training
        var inputOldProb = UnityTFUtils.Input(new int?[] { ActionSpace == SpaceType.continuous ? ActionSize : 1 }, name: "InputOldProb")[0];
        var inputAdvantage = UnityTFUtils.Input(new int?[] { 1 }, name: "InputAdvantage")[0];
        var inputTargetValue = UnityTFUtils.Input(new int?[] { 1 }, name: "InputTargetValue")[0];
        var inputOldValue = UnityTFUtils.Input(new int?[] { 1 }, name: "InputOldValue")[0];

        ClipEpsilon = trainingParams.clipEpsilon;
        ValueLossWeight = trainingParams.valueLossWeight;
        EntropyLossWeight = trainingParams.entropyLossWeight;

        var inputClipEpsilon = UnityTFUtils.Input(batch_shape: new int?[] { }, name: "ClipEpsilon", dtype: DataType.Float)[0];
        var inputValuelossWeight = UnityTFUtils.Input(batch_shape: new int?[] { }, name: "ValueLossWeight", dtype: DataType.Float)[0];
        var inputEntropyLossWeight = UnityTFUtils.Input(batch_shape: new int?[] { }, name: "EntropyLossWeight", dtype: DataType.Float)[0];

        //action probability from the input action
        Tensor outputEntropy;
        Tensor inputActionDiscrete = null, onehotInputAction = null;    //for discrete action space
        if (ActionSpace == SpaceType.continuous)
        {
            using (K.name_scope("Entropy"))
            {
                //Gaussian entropy per dimension: 0.5 * log(2 * pi * e * variance)
                var temp = K.mul(outputVarianceFromNetwork, 2 * Mathf.PI * 2.7182818285);
                temp = K.mul(K.log(temp), 0.5);
                if (outputVarianceFromNetwork.shape.Length == 2)
                {
                    outputEntropy = K.mean(K.mean(temp, 0, false), name: "OutputEntropy");
                }
                else
                {
                    outputEntropy = K.mean(temp, 0, false, name: "OutputEntropy");
                }
            }
        }
        else
        {
            using (K.name_scope("ActionProbAndEntropy"))
            {
                inputActionDiscrete = UnityTFUtils.Input(new int?[] { 1 }, name: "InputAction", dtype: DataType.Int32)[0];
                onehotInputAction = K.one_hot(inputActionDiscrete, K.constant<int>(ActionSize, dtype: DataType.Int32), K.constant(1.0f), K.constant(0.0f));
                onehotInputAction = K.reshape(onehotInputAction, new int[] { -1, ActionSize });
                outputEntropy = K.mean((-1.0f) * K.sum(outputActionFromNetwork * K.log(outputActionFromNetwork + 0.00000001f), axis: 1), 0);
                actionProb = K.reshape(K.sum(outputActionFromNetwork * onehotInputAction, 1), new int[] { -1, 1 });
            }
        }

        //value loss; note this version reuses clipEpsilon as the value-clipping range
        Tensor outputValueLoss = null;
        using (K.name_scope("ValueLoss"))
        {
            var clippedValueEstimate = inputOldValue + K.clip(outputValueFromNetwork - inputOldValue, 0.0f - inputClipEpsilon, inputClipEpsilon);
            var valueLoss1 = new MeanSquareError().Call(outputValueFromNetwork, inputTargetValue);
            var valueLoss2 = new MeanSquareError().Call(clippedValueEstimate, inputTargetValue);
            outputValueLoss = K.mean(K.maximum(valueLoss1, valueLoss2));
        }

        //clipped surrogate policy loss; this version works on probability ratios directly rather than log probabilities
        Tensor outputPolicyLoss;
        using (K.name_scope("ClippedSurrogateLoss"))
        {
            var probRatio = actionProb / (inputOldProb + 0.0000000001f);
            var p_opt_a = probRatio * inputAdvantage;
            var p_opt_b = K.clip(probRatio, 1.0f - inputClipEpsilon, 1.0f + inputClipEpsilon) * inputAdvantage;
            outputPolicyLoss = (-1f) * K.mean(K.mean(K.minimun(p_opt_a, p_opt_b)), name: "ClippedSurrogateLoss");
        }

        //final weighted loss
        var outputLoss = outputPolicyLoss + inputValuelossWeight * outputValueLoss;
        outputLoss = outputLoss - inputEntropyLossWeight * outputEntropy;
        outputLoss = K.identity(outputLoss, "OutputLoss");

        //add inputs, outputs and parameters to the list
        List<Tensor> allInputs = new List<Tensor>();
        if (HasVectorObservation)
        {
            allInputs.Add(stateTensor);
        }
        if (HasVisualObservation)
        {
            allInputs.AddRange(inputVisualTensors);
        }
        if (ActionSpace == SpaceType.continuous)
        {
            allInputs.Add(outputActualAction);
        }
        else
        {
            allInputs.Add(inputActionDiscrete);
        }
        allInputs.Add(inputOldProb);
        allInputs.Add(inputTargetValue);
        allInputs.Add(inputOldValue);
        allInputs.Add(inputAdvantage);
        allInputs.Add(inputClipEpsilon);
        allInputs.Add(inputValuelossWeight);
        allInputs.Add(inputEntropyLossWeight);

        //create the optimizer and the update function
        var updates = AddOptimizer(weightsToUpdate, outputLoss, optimizer);
        UpdatePPOFunction = K.function(allInputs, new List<Tensor> { outputLoss, outputValueLoss, outputPolicyLoss, outputEntropy, actionProb }, updates, "UpdateFunction");
    }
}
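// A minimal numeric sketch of the clipped value loss used in both PPO variants above:
// the update is penalized with the worse of the clipped and unclipped squared errors,
// which limits how far the value head can move per update. Plain C#, illustrative only.
static class ClippedValueLossSketch
{
    public static float Loss(float newValue, float oldValue, float targetValue, float clip)
    {
        float clippedEstimate = oldValue + System.Math.Min(System.Math.Max(newValue - oldValue, -clip), clip);
        float unclippedError = (newValue - targetValue) * (newValue - targetValue);
        float clippedError = (clippedEstimate - targetValue) * (clippedEstimate - targetValue);
        return System.Math.Max(unclippedError, clippedError);
    }
}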
/// <summary>
/// Initialize the model for PPO-CMA
/// </summary>
/// <param name="trainerParams"></param>
/// <param name="stateTensor"></param>
/// <param name="inputVisualTensors"></param>
/// <param name="outputValueFromNetwork"></param>
/// <param name="outputActionMeanFromNetwork"></param>
/// <param name="outActionLogVarianceFromNetwork"></param>
/// <param name="valueWeights"></param>
/// <param name="meanWeights"></param>
/// <param name="varweights"></param>
protected void InitializePPOCMAStructures(TrainerParams trainerParams, Tensor stateTensor, List<Tensor> inputVisualTensors, Tensor outputValueFromNetwork, Tensor outputActionMeanFromNetwork, Tensor outActionLogVarianceFromNetwork, List<Tensor> valueWeights, List<Tensor> meanWeights, List<Tensor> varweights)
{
    List<Tensor> allobservationInputs = new List<Tensor>();
    if (HasVectorObservation)
    {
        allobservationInputs.Add(stateTensor);
    }
    if (HasVisualObservation)
    {
        allobservationInputs.AddRange(inputVisualTensors);
    }

    ValueFunction = K.function(allobservationInputs, new List<Tensor> { outputValueFromNetwork }, null, "ValueFunction");

    Tensor outputActualAction = null;
    Tensor outputVariance = K.exp(outActionLogVarianceFromNetwork);
    using (K.name_scope("SampleAction"))
    {
        outputActualAction = K.standard_normal(K.shape(outputActionMeanFromNetwork), DataType.Float) * K.sqrt(outputVariance) + outputActionMeanFromNetwork;
    }
    ActionFunction = K.function(allobservationInputs, new List<Tensor> { outputActualAction, outputActionMeanFromNetwork, outputVariance }, null, "ActionFunction");

    TrainerParamsPPO trainingParams = trainerParams as TrainerParamsPPO;
    if (trainingParams != null)
    {
        //inputs needed for training
        var inputOldAction = UnityTFUtils.Input(new int?[] { ActionSizes[0] }, name: "InputOldAction")[0];
        var inputAdvantage = UnityTFUtils.Input(new int?[] { 1 }, name: "InputAdvantage")[0];
        var inputTargetValue = UnityTFUtils.Input(new int?[] { 1 }, name: "InputTargetValue")[0];
        var inputOldValue = UnityTFUtils.Input(new int?[] { 1 }, name: "InputOldValue")[0];
        var inputClipEpsilonValue = UnityTFUtils.Input(batch_shape: new int?[] { }, name: "ClipEpsilonValue", dtype: DataType.Float)[0];

        //value loss
        Tensor outputValueLoss = null;
        using (K.name_scope("ValueLoss"))
        {
            var clippedValueEstimate = inputOldValue + K.clip(outputValueFromNetwork - inputOldValue, 0.0f - inputClipEpsilonValue, inputClipEpsilonValue);
            var valueLoss1 = new MeanSquareError().Call(outputValueFromNetwork, inputTargetValue);
            var valueLoss2 = new MeanSquareError().Call(clippedValueEstimate, inputTargetValue);
            outputValueLoss = K.mean(K.maximum(valueLoss1, valueLoss2));
            //note: the next line overrides the clipped loss, so only the plain MSE is actually used
            outputValueLoss = K.mean(valueLoss1);
        }

        var valueUpdates = AddOptimizer(valueWeights, outputValueLoss, optimizerValue);
        List<Tensor> valueInputs = new List<Tensor>();
        if (HasVectorObservation)
        {
            valueInputs.Add(stateTensor);
        }
        if (HasVisualObservation)
        {
            valueInputs.AddRange(inputVisualTensors);
        }
        valueInputs.Add(inputOldValue);
        valueInputs.Add(inputTargetValue);
        valueInputs.Add(inputClipEpsilonValue);
        TrainValueFunction = K.function(valueInputs, new List<Tensor> { outputValueLoss }, valueUpdates, "TrainValueFunction");

        //actor losses: PPO-CMA trains the mean and variance heads separately,
        //each seeing the other's output as a constant via stop_gradient
        Tensor meanLoss, varLoss;
        using (K.name_scope("ActorLosses"))
        {
            Tensor posAdvantage;
            if (usePositiveAdvOnly)
            {
                posAdvantage = K.identity(K.relu(K.mean(inputAdvantage)), "ClipedPositiveAdv");
            }
            else
            {
                posAdvantage = K.identity(K.mean(inputAdvantage), "Adv");
            }
            var meanNoGrad = K.stop_gradient(outputActionMeanFromNetwork, "MeanNoGrad");
            var varNoGrad = K.stop_gradient(outputVariance, "VarNoGrad");
            var logVar = outActionLogVarianceFromNetwork;
            var logVarNoGrad = K.stop_gradient(logVar, "LogVarNoGrad");
            using (K.name_scope("VarLoss"))
            {
                var logpNoMeanGrad = -1.0f * K.sum(0.5f * K.square(inputOldAction - meanNoGrad) / outputVariance + 0.5f * logVar, 1);
                varLoss = K.identity(-1.0f * K.mean(posAdvantage * logpNoMeanGrad), "VarLoss");
            }
            using (K.name_scope("MeanLoss"))
            {
                var logpNoVarGrad = -1.0f * K.sum(0.5f * K.square(inputOldAction - outputActionMeanFromNetwork) / varNoGrad + 0.5f * logVarNoGrad, 1);
                meanLoss = K.identity(-1.0f * K.mean(posAdvantage * logpNoVarGrad), "MeanLoss");
            }
        }

        //add inputs, outputs and parameters to the list
        List<Tensor> allInputs = new List<Tensor>();
        if (HasVectorObservation)
        {
            allInputs.Add(stateTensor);
        }
        if (HasVisualObservation)
        {
            allInputs.AddRange(inputVisualTensors);
        }
        allInputs.Add(inputOldAction);
        allInputs.Add(inputAdvantage);

        //create the optimizers and the update functions
        var updatesMean = AddOptimizer(meanWeights, meanLoss, optimizerMean);
        var updatesVar = AddOptimizer(varweights, varLoss, optimizerVariance);
        TrainMeanFunction = K.function(allInputs, new List<Tensor> { meanLoss }, updatesMean, "UpdateMeanFunction");
        TrainVarianceFunction = K.function(allInputs, new List<Tensor> { varLoss }, updatesVar, "UpdateVarianceFunction");

        //pretraining for the output mean and variance
        var inputInitialStd = UnityTFUtils.Input(new int?[] { ActionSizes[0] }, name: "InputInitialStd")[0];
        var inputInitialMean = UnityTFUtils.Input(new int?[] { ActionSizes[0] }, name: "InputInitialMean")[0];
        var policyInitLoss = K.mean(K.mean(K.square(inputInitialMean - outputActionMeanFromNetwork)));
        policyInitLoss += K.mean(K.mean(K.square(inputInitialStd - K.sqrt(outputVariance))));
        var updatesPretrain = AddOptimizer(network.GetActorWeights(), policyInitLoss, optimizerPretrain);

        var pretrainInputs = new List<Tensor>();
        pretrainInputs.Add(stateTensor);
        pretrainInputs.Add(inputInitialMean);
        pretrainInputs.Add(inputInitialStd);
        PretrainFunction = K.function(pretrainInputs, new List<Tensor> { policyInitLoss }, updatesPretrain, "PretrainFunction");
    }
}