/// <summary>
/// Initialize the model structures for supervised learning (behavior cloning) with a
/// continuous action space: builds the actor/critic network, the action evaluation
/// function, and the supervised-learning update function.
/// </summary>
/// <param name="trainerParams">Trainer parameters (not used directly in this method).</param>
/// <param name="vectorObsTensor">Raw vector observation placeholder (fed to the runtime functions).</param>
/// <param name="normalizedVectorObs">Vector observation after optional normalization (fed to the network).</param>
/// <param name="inputVisualTensors">Visual observation input placeholders.</param>
protected void InitializeSLStructures(TrainerParams trainerParams, Tensor vectorObsTensor, Tensor normalizedVectorObs, List <Tensor> inputVisualTensors)
{
    Tensor outActionMean, outActionLogVariance, outValue;
    // NOTE(review): "Sapce" is a pre-existing typo in the externally defined network API name.
    network.BuildNetworkForContinuousActionSapce(normalizedVectorObs, inputVisualTensors, null, null, ActionSizes[0], out outActionMean, out outValue, out outActionLogVariance);

    // Gather every observation placeholder the runtime functions are fed with.
    List <Tensor> allobservationInputs = new List <Tensor>();
    if (HasVectorObservation)
    {
        allobservationInputs.Add(vectorObsTensor);
    }
    if (HasVisualObservation)
    {
        allobservationInputs.AddRange(inputVisualTensors);
    }

    // Convert log-variance to variance; the action function returns mean and variance.
    Tensor outputVariance = null;
    outputVariance = K.exp(outActionLogVariance);
    ActionFunction = K.function(allobservationInputs, new List <Tensor> { outActionMean, outputVariance }, null, "ActionFunction");

    // Supervised-learning loss ("Learing" is a pre-existing typo kept in the identifier).
    Tensor supervisedLearingLoss = null;
    // Label placeholder: continuous labels are float action vectors; discrete would be int indices.
    var inputActionLabel = UnityTFUtils.Input(new int?[] { ActionSpace == SpaceType.continuous ? ActionSizes[0] : 1 }, name: "InputAction", dtype: ActionSpace == SpaceType.continuous ?
                                              DataType.Float : DataType.Int32)[0];
    // Gaussian negative log-likelihood of the labeled action under the predicted
    // mean/variance (up to an additive constant).
    supervisedLearingLoss = K.mean(K.mean(0.5 * K.square(inputActionLabel - outActionMean) / outputVariance + 0.5 * outActionLogVariance));

    var updates = AddOptimizer(network.GetActorWeights(), supervisedLearingLoss, optimizer);
    var slInputs = new List <Tensor>();
    slInputs.AddRange(allobservationInputs);
    slInputs.Add(inputActionLabel);
    UpdateSLFunction = K.function(slInputs, new List <Tensor>() { supervisedLearingLoss }, updates, "UpdateSLFunction");
}
/// <summary>
/// Initialize the model structures for supervised learning (behavior cloning) from an
/// already-built network: creates the action evaluation function and the supervised
/// update function for either action space.
/// </summary>
/// <param name="trainerParams">Trainer parameters (not used directly in this method).</param>
/// <param name="stateTensor">Vector observation input placeholder.</param>
/// <param name="inputVisualTensors">Visual observation input placeholders.</param>
/// <param name="outputActionFromNetwork">Network output: action mean (continuous) or action probabilities (discrete).</param>
/// <param name="outputVarianceFromNetwork">Network output: action variance (continuous only).</param>
/// <param name="weightsToUpdate">Weights the supervised-learning optimizer will update.</param>
protected void InitializeSLStructures(TrainerParams trainerParams, Tensor stateTensor, List <Tensor> inputVisualTensors, Tensor outputActionFromNetwork, Tensor outputVarianceFromNetwork, List <Tensor> weightsToUpdate)
{
    // Gather every observation placeholder the runtime functions are fed with.
    List <Tensor> allobservationInputs = new List <Tensor>();
    if (HasVectorObservation)
    {
        allobservationInputs.Add(stateTensor);
    }
    if (HasVisualObservation)
    {
        allobservationInputs.AddRange(inputVisualTensors);
    }

    // Continuous actions also expose the variance so callers can sample.
    if (ActionSpace == SpaceType.continuous)
    {
        ActionFunction = K.function(allobservationInputs, new List <Tensor> { outputActionFromNetwork, outputVarianceFromNetwork }, null, "ActionFunction");
    }
    else
    {
        ActionFunction = K.function(allobservationInputs, new List <Tensor> { outputActionFromNetwork }, null, "ActionFunction");
    }

    // Supervised-learning loss ("Learing" is a pre-existing typo kept in the identifier).
    Tensor supervisedLearingLoss = null;
    // Label placeholder: continuous labels are float action vectors; discrete labels are int indices.
    var inputActionLabel = UnityTFUtils.Input(new int?[] { ActionSpace == SpaceType.continuous ? ActionSize : 1 }, name: "InputAction", dtype: ActionSpace == SpaceType.continuous ?
                                              DataType.Float : DataType.Int32)[0];
    if (ActionSpace == SpaceType.discrete)
    {
        // Cross entropy between the one-hot label and the network's action probabilities.
        var onehotInputAction = K.one_hot(inputActionLabel, K.constant <int>(ActionSize, dtype: DataType.Int32), K.constant(1.0f), K.constant(0.0f));
        onehotInputAction = K.reshape(onehotInputAction, new int[] { -1, ActionSize });
        supervisedLearingLoss = K.mean(K.categorical_crossentropy(onehotInputAction, outputActionFromNetwork, false));
    }
    else
    {
        // Gaussian negative log-likelihood of the label under predicted mean/variance (up to a constant).
        supervisedLearingLoss = K.mean(K.mean(0.5 * K.square(inputActionLabel - outputActionFromNetwork) / outputVarianceFromNetwork + 0.5 * K.log(outputVarianceFromNetwork)));
    }

    var updates = AddOptimizer(weightsToUpdate, supervisedLearingLoss, optimizer);
    var slInputs = new List <Tensor>();
    slInputs.AddRange(allobservationInputs);
    slInputs.Add(inputActionLabel);
    UpdateSLFunction = K.function(slInputs, new List <Tensor>() { supervisedLearingLoss }, updates, "UpdateSLFunction");
}
/// <summary>
/// Build the PPO graph for a continuous action space: network heads, the
/// value/action/probability evaluation functions, and (when PPO trainer params are
/// provided) the entropy tensor and optimizer.
/// </summary>
/// <param name="vectorObs">Raw vector observation placeholder (fed to the runtime functions).</param>
/// <param name="normalizedVectorObs">Normalized vector observation (fed to the network).</param>
/// <param name="visualObs">Visual observation placeholders.</param>
/// <param name="trainerParams">When this is a TrainerParamsPPO, training structures are also built.</param>
protected void InitializePPOStructureContinuousAction(Tensor vectorObs, Tensor normalizedVectorObs, List <Tensor> visualObs, TrainerParams trainerParams)
{
    //all inputs list
    List <Tensor> allObservationInputs = new List <Tensor>();
    if (HasVectorObservation)
    {
        allObservationInputs.Add(vectorObs);
    }
    if (HasVisualObservation)
    {
        allObservationInputs.AddRange(visualObs);
    }

    //build the network ("Sapce" is a pre-existing typo in the external API name)
    Tensor outputValue = null;
    Tensor outputActionMean = null;
    Tensor outputLogVariance = null;
    network.BuildNetworkForContinuousActionSapce(normalizedVectorObs, visualObs, null, null, ActionSizes[0], out outputActionMean, out outputValue, out outputLogVariance);

    //value function
    ValueFunction = K.function(allObservationInputs, new List <Tensor> { outputValue }, null, "ValueFunction");

    Tensor outputActualAction = null, actionLogProb = null, outputVariance = null;
    //build action sampling: action = mean + stddev * unit normal noise
    outputVariance = K.exp(outputLogVariance);
    using (K.name_scope("SampleAction"))
    {
        outputActualAction = K.standard_normal(K.shape(outputActionMean), DataType.Float) * K.sqrt(outputVariance) + outputActionMean;
    }
    // Log-probability of the sampled action; stop_gradient keeps the sampling noise
    // out of backpropagation.
    using (K.name_scope("ActionProbs"))
    {
        actionLogProb = K.log_normal_probability(K.stop_gradient(outputActualAction), outputActionMean, outputVariance, outputLogVariance);
    }

    //action function: returns the sampled action and its log probability
    ActionFunction = K.function(allObservationInputs, new List <Tensor> { outputActualAction, actionLogProb }, null, "ActionFunction");

    //probability function: log-prob of an externally supplied action
    var probInputs = new List <Tensor>();
    probInputs.AddRange(allObservationInputs);
    probInputs.Add(outputActualAction);
    ActionProbabilityFunction = K.function(probInputs, new List <Tensor> { actionLogProb }, null, "ActionProbabilityFunction");

    //training related structures, only when PPO trainer params are provided
    TrainerParamsPPO trainingParams = trainerParams as TrainerParamsPPO;
    if (trainingParams != null)
    {
        Tensor outputEntropy;
        using (K.name_scope("Entropy"))
        {
            // Gaussian entropy per dimension: 0.5 * (ln(2*pi*e) + log variance);
            // Mathf.Log(x, e) computes the natural log of 2*pi*e here.
            var temp = 0.5f * (Mathf.Log(2 * Mathf.PI * 2.7182818285f, 2.7182818285f) + outputLogVariance);
            if (outputLogVariance.shape.Length == 2)
            {
                outputEntropy = K.mean(K.mean(temp, 0, false), name: "OutputEntropy");
            }
            else
            {
                outputEntropy = K.mean(temp, 0, false, name: "OutputEntropy");
            }
        }
        // Optimizer inputs are the observations plus the action actually taken.
        List <Tensor> extraInputs = new List <Tensor>();
        extraInputs.AddRange(allObservationInputs);
        extraInputs.Add(outputActualAction);
        CreatePPOOptimizer(trainingParams, outputEntropy, actionLogProb, outputValue, extraInputs, network.GetWeights());
    }
}
/// <summary>
/// Build the PPO graph for a (possibly multi-branch) discrete action space: network heads,
/// the value/action functions with action masking, and (when PPO trainer params are
/// provided) the entropy/log-probability tensors and the optimizer.
/// </summary>
/// <param name="vectorObs">Raw vector observation placeholder (fed to the runtime functions).</param>
/// <param name="normalizedVectorObs">Normalized vector observation (fed to the network).</param>
/// <param name="visualObs">Visual observation placeholders.</param>
/// <param name="trainerParams">When this is a TrainerParamsPPO, training structures are also built.</param>
protected void InitializePPOStructureDiscreteAction(Tensor vectorObs, Tensor normalizedVectorObs, List <Tensor> visualObs, TrainerParams trainerParams)
{
    //all inputs list
    List <Tensor> allObservationInputs = new List <Tensor>();
    if (HasVectorObservation)
    {
        allObservationInputs.Add(vectorObs);
    }
    if (HasVisualObservation)
    {
        allObservationInputs.AddRange(visualObs);
    }

    // One logits tensor per action branch, plus the value head.
    Tensor[] outputActionsLogits = null;
    Tensor outputValue = null;
    network.BuildNetworkForDiscreteActionSpace(normalizedVectorObs, visualObs, null, null, ActionSizes, out outputActionsLogits, out outputValue);

    ValueFunction = K.function(allObservationInputs, new List <Tensor> { outputValue }, null, "ValueFunction");

    //the action masks input placeholders
    // NOTE(review): "AcionMask" is a pre-existing typo, but it is a runtime tensor name —
    // renaming it would break consumers of the saved graph, so it is kept.
    List <Tensor> actionMasksInputs = new List <Tensor>();
    for (int i = 0; i < ActionSizes.Length; ++i)
    {
        actionMasksInputs.Add(UnityTFUtils.Input(new int?[] { ActionSizes[i] }, name: "AcionMask" + i)[0]);
    }
    // Apply the masks and select one action per branch.
    Tensor[] outputActions, outputNormalizedLogits;
    CreateDiscreteActionMaskingLayer(outputActionsLogits, actionMasksInputs.ToArray(), out outputActions, out outputNormalizedLogits);

    //output tensors for discrete actions. Includes all selected actions and the normalized logits of all branches
    var outputDiscreteActions = new List <Tensor>();
    outputDiscreteActions.Add(K.identity(K.cast(ActionSizes.Length == 1?
                                                outputActions[0]: K.concat(outputActions.ToList(), 1), DataType.Float), "OutputAction"));
    outputDiscreteActions.AddRange(outputNormalizedLogits);
    var actionFunctionInputs = new List <Tensor>();
    actionFunctionInputs.AddRange(allObservationInputs);
    actionFunctionInputs.AddRange(actionMasksInputs);
    ActionFunction = K.function(actionFunctionInputs, outputDiscreteActions, null, "ActionFunction");

    TrainerParamsPPO trainingParams = trainerParams as TrainerParamsPPO;
    if (trainingParams != null)
    {
        // action probability from input action
        Tensor outputEntropy;
        List <Tensor> inputActionsDiscreteSeperated = null, onehotInputActions = null; //for discrete action space
        Tensor inputAction = UnityTFUtils.Input(new int?[] { ActionSizes.Length }, name: "InputActions", dtype: DataType.Int32)[0];

        //split the input into one column per discrete branch
        var splits = new int[ActionSizes.Length];
        for (int i = 0; i < splits.Length; ++i)
        {
            splits[i] = 1;
        }
        inputActionsDiscreteSeperated = K.split(inputAction, K.constant(splits, dtype: DataType.Int32), K.constant(1, dtype: DataType.Int32), ActionSizes.Length);

        Tensor actionLogProb = null;
        using (K.name_scope("ActionProbAndEntropy"))
        {
            // One-hot encode each branch's chosen action.
            onehotInputActions = inputActionsDiscreteSeperated.Select((x, i) => K.reshape(K.one_hot(x, K.constant <int>(ActionSizes[i], dtype: DataType.Int32), K.constant(1.0f), K.constant(0.0f)), new int[] { -1, ActionSizes[i] })).ToList();

            //entropy: summed over branches; the small epsilon guards log(0)
            var entropies = outputActionsLogits.Select((t) => { return(K.mean((-1.0f) * K.sum(K.softmax(t) * K.log(K.softmax(t) + 0.00000001f), axis: 1), 0)); });
            outputEntropy = entropies.Aggregate((x, y) => { return(x + y); });

            //probabilities: pick each branch's normalized logit at the chosen action index
            var actionProbsArray = ActionSizes.Select((x, i) => { return(K.sum(outputNormalizedLogits[i] * onehotInputActions[i], 1, true)); }).ToList();
            actionLogProb = ActionSizes.Length == 1 ?
                            actionProbsArray[0]:K.concat(actionProbsArray, 1);
        }

        List <Tensor> extraInputs = new List <Tensor>();
        extraInputs.AddRange(actionFunctionInputs);
        extraInputs.Add(inputAction);
        CreatePPOOptimizer(trainingParams, outputEntropy, actionLogProb, outputValue, extraInputs, network.GetWeights());
    }
}
/// <summary>
/// Build the inference graph and, when trainer parameters are supplied, the training
/// graph, dispatching on the trainer type (PPO vs. supervised learning) and action space.
/// </summary>
/// <param name="brainParameters">Brain parameters of the ML-Agents brain.</param>
/// <param name="vecotrObsTensor">Vector observation input placeholder (may be null).</param>
/// <param name="visualTensors">Visual observation input placeholders (may be null).</param>
/// <param name="trainerParams">Trainer parameters; null when training is disabled.</param>
public override void InitializeInner(BrainParameters brainParameters, Tensor vecotrObsTensor, List <Tensor> visualTensors, TrainerParams trainerParams)
{
    // Optionally route the vector observation through a running normalizer first.
    Tensor normalizedVectorObs;
    if (useInputNormalization && HasVectorObservation)
    {
        normalizedVectorObs = CreateRunninngNormalizer(vecotrObsTensor, StateSize);
    }
    else
    {
        if (useInputNormalization)
        {
            Debug.LogWarning("useInputNormalization is turned off because it is not supported in this case");
            useInputNormalization = false;
        }
        normalizedVectorObs = vecotrObsTensor;
    }

    // Decide which structures to build; PPO takes precedence over supervised learning.
    if (trainerParams is TrainerParamsPPO || mode == Mode.PPO)
    {
        mode = Mode.PPO;
        if (ActionSpace == SpaceType.continuous)
        {
            InitializePPOStructureContinuousAction(vecotrObsTensor, normalizedVectorObs, visualTensors, trainerParams);
        }
        else if (ActionSpace == SpaceType.discrete)
        {
            InitializePPOStructureDiscreteAction(vecotrObsTensor, normalizedVectorObs, visualTensors, trainerParams);
        }
    }
    else if (mode == Mode.SupervisedLearning || trainerParams is TrainerParamsMimic)
    {
        mode = Mode.SupervisedLearning;
        if (ActionSpace == SpaceType.continuous)
        {
            InitializeSLStructureContinuousAction(vecotrObsTensor, normalizedVectorObs, visualTensors, trainerParams);
        }
        else
        {
            InitializeSLStructureDiscreteAction(vecotrObsTensor, normalizedVectorObs, visualTensors, trainerParams);
        }
    }
}
/// <summary>
/// Initialize the GAN model for ML-Agents: reports unsupported configurations,
/// records the generator input/output shapes, then runs the GAN initialization.
/// </summary>
/// <param name="brainParameters">Brain parameters of the ML-Agents brain.</param>
/// <param name="stateTensor">Vector observation input placeholder (unused by the GAN build itself).</param>
/// <param name="visualTensors">Visual observation placeholders (unsupported).</param>
/// <param name="trainerParams">Trainer parameters; must be a TrainerParamsGAN when non-null.</param>
public override void InitializeInner(BrainParameters brainParameters, Tensor stateTensor, List <Tensor> visualTensors, TrainerParams trainerParams)
{
    // Visual observations are not supported; log an error but continue, matching prior behavior.
    bool hasCameraInputs = brainParameters.cameraResolutions != null && brainParameters.cameraResolutions.Length != 0;
    if (hasCameraInputs)
    {
        Debug.LogError("GAN for ML agent does not support visual input yet");
    }
    Debug.Assert(brainParameters.vectorActionSpaceType == SpaceType.continuous, "GAN for ML agent does not support discrete action space.");

    // A non-null trainerParams must be of the GAN-specific type; the failed cast leaves
    // TrainerParams null and is reported here.
    TrainerParams = trainerParams as TrainerParamsGAN;
    if (trainerParams != null && TrainerParams == null)
    {
        Debug.LogError("Trainer params for GAN needs to be a TrainerParamsGAN type");
    }

    // Generator emits actions conditioned on the state observation.
    outputShape = new int[] { ActionSize };
    inputConditionShape = new int[] { StateSize };
    Initialize(trainerParams != null);
}
/// <summary>
/// Initialize the PPO-CMA model: normalize the input if requested, build the
/// continuous-action network heads, and create the PPO-CMA structures.
/// </summary>
/// <param name="brainParameters">Brain parameters of the ML-Agents brain.</param>
/// <param name="stateTensor">Vector observation input placeholder.</param>
/// <param name="visualTensors">Visual observation input placeholders.</param>
/// <param name="trainerParams">Trainer parameters; null when training is disabled.</param>
public override void InitializeInner(BrainParameters brainParameters, Tensor stateTensor, List <Tensor> visualTensors, TrainerParams trainerParams)
{
    Debug.Assert(ActionSpace == SpaceType.continuous, "RLModelPPOCMA only support continuous action space.");

    // Feed the network a normalized observation when normalization is enabled.
    Tensor networkStateInput = stateTensor;
    if (useInputNormalization && HasVectorObservation)
    {
        networkStateInput = CreateRunninngNormalizer(networkStateInput, StateSize);
    }

    // Build the actor (mean + log variance) and critic heads.
    Tensor actionMean, stateValue, actionLogVariance;
    network.BuildNetworkForContinuousActionSapce(networkStateInput, visualTensors, null, null, ActionSizes[0], out actionMean, out stateValue, out actionLogVariance);

    // Wire the heads into the PPO-CMA training/inference structures, passing the
    // critic / actor-mean / actor-variance weight groups separately.
    InitializePPOCMAStructures(trainerParams, stateTensor, visualTensors, stateValue, actionMean, actionLogVariance, network.GetCriticWeights(), network.GetActorMeanWeights(), network.GetActorVarianceWeights());
}
/// <summary>
/// Runs mini-batch Adam training over the training set for the configured number of epochs,
/// optionally evaluating validation loss and accuracies after each epoch.
/// </summary>
/// <param name="trainParams">Must be an AdamParams instance describing the run.</param>
/// <returns>
/// One entry per epoch: [training loss, validation loss (0 when no validation set),
/// training accuracy, validation accuracy] (accuracies are 0 when no trueThreshold is set).
/// </returns>
public override List <double[]> Train(TrainerParams trainParams)
{
    AdamParams passedParams = (AdamParams)trainParams;
    List <double[]> learningCurve = new List <double[]>();

    List <int> trainingSetIndices = Enumerable.Range(0, passedParams.trainingSet.Labels.RowCount).ToList();

    // Copy the validation rows into a working DataSet (row order preserved).
    List <int> testSetIndices = null;
    DataSet test = new DataSet(null, null);
    bool hasValidationSet = passedParams.validationSet != null;
    if (hasValidationSet)
    {
        testSetIndices = Enumerable.Range(0, passedParams.validationSet.Labels.RowCount).ToList();
        test.Inputs = CreateMatrix.Dense(testSetIndices.Count, passedParams.validationSet.Inputs.ColumnCount, 0.0);
        test.Labels = CreateMatrix.Dense(testSetIndices.Count, passedParams.validationSet.Labels.ColumnCount, 0.0);
        for (int i = 0; i < testSetIndices.Count; i++)
        {
            test.Inputs.SetRow(i, passedParams.validationSet.Inputs.Row(testSetIndices[i]));
            test.Labels.SetRow(i, passedParams.validationSet.Labels.Row(testSetIndices[i]));
        }
    }

    if (passedParams.shuffle)
    {
        trainingSetIndices.Shuffle();
    }

    // Batch boundaries: shape (numberOfBatches, 2); each row holds [batchStart, batchEnd]
    // (inclusive). The layout depends only on fixed parameters, so build it once instead of
    // once per epoch as the original code did.
    Matrix <double> batchesIndices;
    if (passedParams.batchSize != null)
    {
        var numberOfBatches = (int)Math.Ceiling(passedParams.trainingSet.Labels.RowCount / (double)passedParams.batchSize);
        batchesIndices = CreateMatrix.Dense(numberOfBatches, 2, 0.0);
        for (int j = 0; j < numberOfBatches; j++)
        {
            batchesIndices.SetRow(j, new double[]
            {
                j * (double)passedParams.batchSize,
                Math.Min(passedParams.trainingSet.Inputs.RowCount - 1, (j + 1) * (double)passedParams.batchSize - 1)
            });
        }
    }
    else
    {
        // No batch size given: the whole training set is a single batch.
        batchesIndices = CreateMatrix.Dense(1, 2, 0.0);
        batchesIndices.SetRow(0, new double[] { 0, passedParams.trainingSet.Inputs.RowCount - 1 });
    }

    int adamUpdateStep = 1;
    for (int epoch = 1; epoch <= passedParams.numberOfEpochs; epoch++)
    {
        // Average of the per-batch average losses for this epoch.
        double iterationLoss = 0;

        // Adam state (first/second moments and their bias-corrected estimates), keyed by layer.
        // NOTE(review): these are re-created every epoch, so moment history does not carry
        // across epochs — preserved from the original implementation; confirm intent.
        Dictionary <int, Matrix <double> > firstMoment = new Dictionary <int, Matrix <double> >();
        Dictionary <int, Matrix <double> > secondMoment = new Dictionary <int, Matrix <double> >();
        Dictionary <int, Matrix <double> > mhat = new Dictionary <int, Matrix <double> >();
        Dictionary <int, Matrix <double> > vhat = new Dictionary <int, Matrix <double> >();
        Dictionary <int, Matrix <double> > prevFirstMoment = new Dictionary <int, Matrix <double> >();
        Dictionary <int, Matrix <double> > prevSecondMoment = new Dictionary <int, Matrix <double> >();

        for (int batchIndex = 0; batchIndex < batchesIndices.RowCount; batchIndex++)
        {
            // The returned previous-weights-update dictionary was never read, so it is discarded.
            if (passedParams.parallelize)
            {
                Parallel_PerformBatchComputations(passedParams, batchesIndices, adamUpdateStep, ref iterationLoss, firstMoment, secondMoment, mhat, vhat, prevFirstMoment, prevSecondMoment, batchIndex);
            }
            else
            {
                PerformBatchComputations(passedParams, batchesIndices, adamUpdateStep, ref iterationLoss, firstMoment, secondMoment, mhat, vhat, prevFirstMoment, prevSecondMoment, batchIndex);
            }
        }
        adamUpdateStep++;

        iterationLoss /= batchesIndices.RowCount;

        // Validation loss — only when a validation set was supplied. (The original code
        // evaluated it unconditionally, dereferencing the null matrices of the empty
        // DataSet when no validation set was given.)
        double validationError = 0;
        if (hasValidationSet)
        {
            validationError = passedParams.parallelize ?
                              Parallel_ComputeValidationLoss(passedParams, testSetIndices, test) :
                              ComputeValidationLoss(passedParams, testSetIndices, test);
        }

        double trainingAccuracy = 0, validationSetAccuracy = 0;
        if (passedParams.trueThreshold != null)
        {
            trainingAccuracy = Utilities.Tools.ComputeAccuracy(passedParams.network, passedParams.trainingSet, passedParams.trueThreshold);
            // Guard: validation accuracy is only computable when a validation set exists.
            if (hasValidationSet)
            {
                validationSetAccuracy = Utilities.Tools.ComputeAccuracy(passedParams.network, passedParams.validationSet, passedParams.trueThreshold);
            }
        }

        learningCurve.Add(new double[] { iterationLoss, validationError, trainingAccuracy, validationSetAccuracy });

        if (passedParams.PrintLoss)
        {
            Console.ForegroundColor = ConsoleColor.Green;
            Console.WriteLine("Epoch:{0} train loss:{1} - validation loss:{2}", epoch, iterationLoss, validationError);
            Console.ResetColor();
        }
    }
    return learningCurve;
}
/// <summary>
/// Entry point called by trainers to initialize the model; sets up the shared input
/// placeholders and delegates the model-specific graph construction to InitializeInner().
/// </summary>
/// <param name="brainParameters">Brain parameter of the ML-Agents brain.</param>
/// <param name="enableTraining">Whether training should be enabled.</param>
/// <param name="trainerParams">Trainer parameters; withheld from InitializeInner when training is disabled.</param>
public virtual void Initialize(BrainParameters brainParameters, bool enableTraining, TrainerParams trainerParams = null)
{
    Debug.Assert(Initialized == false, "Model already Initalized");

    // Wrap graph construction in a name scope when the model has a name.
    NameScope modelScope = string.IsNullOrEmpty(modelName) ? null : Current.K.name_scope(modelName);

    // Cache brain dimensions.
    ActionSizes = brainParameters.vectorActionSize;
    StateSize = brainParameters.vectorObservationSize * brainParameters.numStackedVectorObservations;
    ActionSpace = brainParameters.vectorActionSpaceType;
    Debug.Assert(ActionSizes[0] > 0, "Action size can not be zero");

    // Create the basic input placeholders.
    Tensor vectorObsInput = null;
    if (StateSize > 0)
    {
        vectorObsInput = UnityTFUtils.Input(new int?[] { StateSize }, name: "InputStates")[0];
    }
    HasVectorObservation = vectorObsInput != null;
    var visualObsInputs = CreateVisualInputs(brainParameters);
    HasVisualObservation = visualObsInputs != null;

    // Model-specific graph construction.
    InitializeInner(brainParameters, vectorObsInput, visualObsInputs, enableTraining ? trainerParams : null);

    Current.K.try_initialize_variables(true);

    if (modelScope != null)
    {
        modelScope.Dispose();
    }

    if (checkpointToLoad != null)
    {
        RestoreCheckpoint(checkpointToLoad.bytes, true);
    }
    Initialized = true;
    TrainingEnabled = enableTraining;
}
/// <summary>
/// Initialize the model for PPO: creates the value/action/probability evaluation functions
/// and, when PPO trainer parameters are supplied, the clipped-surrogate loss, value loss,
/// entropy bonus, and the update function.
/// </summary>
/// <param name="trainerParams">When this is a TrainerParamsPPO, training structures are also built.</param>
/// <param name="stateTensor">Vector observation input placeholder.</param>
/// <param name="inputVisualTensors">Visual observation input placeholders.</param>
/// <param name="outputValueFromNetwork">Network value head.</param>
/// <param name="outputActionFromNetwork">Network action head: mean (continuous) or probabilities (discrete).</param>
/// <param name="outputVarianceFromNetwork">Network variance head (continuous only).</param>
/// <param name="weightsToUpdate">Weights the PPO optimizer will update.</param>
protected void InitializePPOStructures(TrainerParams trainerParams, Tensor stateTensor, List <Tensor> inputVisualTensors, Tensor outputValueFromNetwork, Tensor outputActionFromNetwork, Tensor outputVarianceFromNetwork, List <Tensor> weightsToUpdate)
{
    // Gather every observation placeholder the runtime functions are fed with.
    List <Tensor> allobservationInputs = new List <Tensor>();
    if (HasVectorObservation)
    {
        allobservationInputs.Add(stateTensor);
    }
    if (HasVisualObservation)
    {
        allobservationInputs.AddRange(inputVisualTensors);
    }

    ValueFunction = K.function(allobservationInputs, new List <Tensor> { outputValueFromNetwork }, null, "ValueFunction");

    Tensor outputActualAction = null;
    Tensor actionProb = null;
    if (ActionSpace == SpaceType.continuous)
    {
        // Sample: action = mean + stddev * unit normal noise.
        using (K.name_scope("SampleAction"))
        {
            outputActualAction = K.standard_normal(K.shape(outputActionFromNetwork), DataType.Float) * K.sqrt(outputVarianceFromNetwork) + outputActionFromNetwork;
        }
        // Probability density of the sampled action; stop_gradient keeps the noise out of backprop.
        using (K.name_scope("ActionProbs"))
        {
            actionProb = K.normal_probability(K.stop_gradient(outputActualAction), outputActionFromNetwork, outputVarianceFromNetwork);
        }
        ActionFunction = K.function(allobservationInputs, new List <Tensor> { outputActualAction, actionProb, outputActionFromNetwork, outputVarianceFromNetwork }, null, "ActionFunction");
        var probInputs = new List <Tensor>();
        probInputs.AddRange(allobservationInputs);
        probInputs.Add(outputActualAction);
        ActionProbabilityFunction = K.function(probInputs, new List <Tensor> { actionProb }, null, "ActionProbabilityFunction");
    }
    else
    {
        // Discrete: the network directly outputs action probabilities.
        ActionFunction = K.function(allobservationInputs, new List <Tensor> { outputActionFromNetwork }, null, "ActionFunction");
    }

    TrainerParamsPPO trainingParams = trainerParams as TrainerParamsPPO;
    if (trainingParams != null)
    {
        //training needed inputs
        var inputOldProb = UnityTFUtils.Input(new int?[] { ActionSpace == SpaceType.continuous ? ActionSize : 1 }, name: "InputOldProb")[0];
        var inputAdvantage = UnityTFUtils.Input(new int?[] { 1 }, name: "InputAdvantage")[0];
        var inputTargetValue = UnityTFUtils.Input(new int?[] { 1 }, name: "InputTargetValue")[0];
        var inputOldValue = UnityTFUtils.Input(new int?[] { 1 }, name: "InputOldValue")[0];

        // Cache the configured hyperparameters; the graph reads them from the scalar
        // placeholders below so they can also be varied per update call.
        ClipEpsilon = trainingParams.clipEpsilon;
        ValueLossWeight = trainingParams.valueLossWeight;
        EntropyLossWeight = trainingParams.entropyLossWeight;

        var inputClipEpsilon = UnityTFUtils.Input(batch_shape: new int?[] { }, name: "ClipEpsilon", dtype: DataType.Float)[0];
        var inputValuelossWeight = UnityTFUtils.Input(batch_shape: new int?[] { }, name: "ValueLossWeight", dtype: DataType.Float)[0];
        var inputEntropyLossWeight = UnityTFUtils.Input(batch_shape: new int?[] { }, name: "EntropyLossWeight", dtype: DataType.Float)[0];

        // action probability from input action
        Tensor outputEntropy;
        Tensor inputActionDiscrete = null, onehotInputAction = null; //for discrete action space
        if (ActionSpace == SpaceType.continuous)
        {
            using (K.name_scope("Entropy"))
            {
                // Gaussian entropy: 0.5 * ln(2*pi*e*variance), averaged over batch (and dims).
                var temp = K.mul(outputVarianceFromNetwork, 2 * Mathf.PI * 2.7182818285);
                temp = K.mul(K.log(temp), 0.5);
                if (outputVarianceFromNetwork.shape.Length == 2)
                {
                    outputEntropy = K.mean(K.mean(temp, 0, false), name: "OutputEntropy");
                }
                else
                {
                    outputEntropy = K.mean(temp, 0, false, name: "OutputEntropy");
                }
            }
        }
        else
        {
            using (K.name_scope("ActionProbAndEntropy"))
            {
                inputActionDiscrete = UnityTFUtils.Input(new int?[] { 1 }, name: "InputAction", dtype: DataType.Int32)[0];
                onehotInputAction = K.one_hot(inputActionDiscrete, K.constant <int>(ActionSize, dtype: DataType.Int32), K.constant(1.0f), K.constant(0.0f));
                onehotInputAction = K.reshape(onehotInputAction, new int[] { -1, ActionSize });
                // Categorical entropy; the small epsilon guards log(0).
                outputEntropy = K.mean((-1.0f) *
                                       K.sum(outputActionFromNetwork * K.log(outputActionFromNetwork + 0.00000001f), axis: 1), 0);
                // Probability the network assigns to the labeled action.
                actionProb = K.reshape(K.sum(outputActionFromNetwork * onehotInputAction, 1), new int[] { -1, 1 });
            }
        }

        // value loss: clipped value update, analogous to the clipped policy objective
        Tensor outputValueLoss = null;
        using (K.name_scope("ValueLoss"))
        {
            var clippedValueEstimate = inputOldValue + K.clip(outputValueFromNetwork - inputOldValue, 0.0f - inputClipEpsilon, inputClipEpsilon);
            var valueLoss1 = new MeanSquareError().Call(outputValueFromNetwork, inputTargetValue);
            var valueLoss2 = new MeanSquareError().Call(clippedValueEstimate, inputTargetValue);
            outputValueLoss = K.mean(K.maximum(valueLoss1, valueLoss2));
        }

        // Clipped Surrogate loss
        // NOTE(review): "Curreogate" is a pre-existing typo, but it is also a runtime graph
        // name, so it must be kept to preserve saved-graph compatibility.
        Tensor outputPolicyLoss;
        using (K.name_scope("ClippedCurreogateLoss"))
        {
            // Ratio of new to old action probability; the epsilon guards division by zero.
            var probRatio = actionProb / (inputOldProb + 0.0000000001f);
            var p_opt_a = probRatio * inputAdvantage;
            var p_opt_b = K.clip(probRatio, 1.0f - inputClipEpsilon, 1.0f + inputClipEpsilon) * inputAdvantage;
            // NOTE(review): "minimun" appears to be the backend's (misspelled) API name — confirm against K.
            outputPolicyLoss = (-1f) * K.mean(K.mean(K.minimun(p_opt_a, p_opt_b)), name: "ClippedCurreogateLoss");
        }

        //final weighted loss: policy loss + weighted value loss - weighted entropy bonus
        var outputLoss = outputPolicyLoss + inputValuelossWeight * outputValueLoss;
        outputLoss = outputLoss - inputEntropyLossWeight * outputEntropy;
        outputLoss = K.identity(outputLoss, "OutputLoss");

        //add inputs, outputs and parameters to the list
        List <Tensor> allInputs = new List <Tensor>();
        if (HasVectorObservation)
        {
            allInputs.Add(stateTensor);
        }
        if (HasVisualObservation)
        {
            allInputs.AddRange(inputVisualTensors);
        }
        if (ActionSpace == SpaceType.continuous)
        {
            allInputs.Add(outputActualAction);
        }
        else
        {
            allInputs.Add(inputActionDiscrete);
        }
        allInputs.Add(inputOldProb);
        allInputs.Add(inputTargetValue);
        allInputs.Add(inputOldValue);
        allInputs.Add(inputAdvantage);
        allInputs.Add(inputClipEpsilon);
        allInputs.Add(inputValuelossWeight);
        allInputs.Add(inputEntropyLossWeight);

        //create optimizer and create necessary functions
        var updates = AddOptimizer(weightsToUpdate, outputLoss, optimizer);
        UpdatePPOFunction = K.function(allInputs, new List <Tensor> { outputLoss, outputValueLoss, outputPolicyLoss, outputEntropy, actionProb }, updates, "UpdateFunction");
    }
}
/// <summary>
/// Build the shared actor/critic network, then initialize either the PPO or the
/// supervised-learning structures depending on the trainer parameters / current mode.
/// </summary>
/// <param name="brainParameters">Brain parameters of the ML-Agents brain.</param>
/// <param name="stateTensor">Vector observation input placeholder.</param>
/// <param name="visualTensors">Visual observation input placeholders.</param>
/// <param name="trainerParams">Trainer parameters; null when training is disabled.</param>
public override void InitializeInner(BrainParameters brainParameters, Tensor stateTensor, List <Tensor> visualTensors, TrainerParams trainerParams)
{
    // Feed the network a normalized copy of the vector observation when requested.
    Tensor networkStateInput = stateTensor;
    if (useInputNormalization && HasVectorObservation)
    {
        networkStateInput = CreateRunninngNormalizer(networkStateInput, StateSize);
    }

    // Build the network heads.
    Tensor outputAction, outputValue, outputVariance;
    network.BuildNetwork(networkStateInput, visualTensors, null, null, ActionSize, ActionSpace, out outputAction, out outputValue, out outputVariance);

    // PPO takes precedence over supervised learning.
    if (trainerParams is TrainerParamsPPO || mode == Mode.PPO)
    {
        mode = Mode.PPO;
        InitializePPOStructures(trainerParams, stateTensor, visualTensors, outputValue, outputAction, outputVariance, network.GetWeights());
    }
    else if (mode == Mode.SupervisedLearning || trainerParams is TrainerParamsMimic)
    {
        mode = Mode.SupervisedLearning;
        InitializeSLStructures(trainerParams, stateTensor, visualTensors, outputAction, outputVariance, network.GetActorWeights());
    }
}
/// <summary>
/// Initialize the hierarchical PPO model: optionally split the vector observation into
/// low-level and high-level parts, build the hierarchy network, and wire up the standard
/// PPO structures over the high-level weights.
/// </summary>
/// <param name="brainParameters">Brain parameters of the ML-Agents brain.</param>
/// <param name="stateTensor">Stacked vector observation input placeholder.</param>
/// <param name="visualTensors">Visual observation placeholders (unsupported; must be null).</param>
/// <param name="trainerParams">Trainer parameters; null when training is disabled.</param>
public override void InitializeInner(BrainParameters brainParameters, Tensor stateTensor, List <Tensor> visualTensors, TrainerParams trainerParams)
{
    Debug.Assert(visualTensors == null, "RLModelPPOHierarchy does not support visual input yet");

    // Split the observation along axis 1 into [low-level, high-level] parts when a
    // high-level observation is configured; otherwise everything is low-level.
    if (highLevelObservationSize > 0)
    {
        var parts = K.split(stateTensor, K.constant(new int[] { lowLevelObservationSize, highLevelObservationSize }, dtype: DataType.Int32), K.constant(1, dtype: DataType.Int32), 2);
        inputLowLevelTensor = parts[0];
        inputHighLevelTensor = parts[1];
    }
    else
    {
        inputLowLevelTensor = stateTensor;
    }

    List <Tensor> inputVisualTensors = visualTensors;
    if (useInputNormalization && HasVectorObservation)
    {
        // NOTE(review): the normalizer is sized with StateSize although only the low-level
        // part flows through it — confirm this matches lowLevelObservationSize upstream.
        inputLowLevelTensor = CreateRunninngNormalizer(inputLowLevelTensor, StateSize);
    }

    // Build the hierarchy network and the standard PPO structures.
    Tensor outputAction, outputValue, outputVariance;
    networkHierarchy.BuildNetwork(inputLowLevelTensor, inputHighLevelTensor, ActionSize, ActionSpace, out outputAction, out outputValue, out outputVariance);
    InitializePPOStructures(trainerParams, stateTensor, inputVisualTensors, outputValue, outputAction, outputVariance, networkHierarchy.GetHighLevelWeights());
}
/// <summary>
/// Build inference/training structures, dispatching on trainer type and action space.
/// Supervised learning currently supports only a single continuous action branch.
/// </summary>
/// <param name="brainParameters">Brain parameters of the ML-Agents brain.</param>
/// <param name="vecotrObsTensor">Vector observation input placeholder (may be null).</param>
/// <param name="visualTensors">Visual observation input placeholders (may be null).</param>
/// <param name="trainerParams">Trainer parameters; null when training is disabled.</param>
public override void InitializeInner(BrainParameters brainParameters, Tensor vecotrObsTensor, List <Tensor> visualTensors, TrainerParams trainerParams)
{
    // Optional vector observation normalization.
    Tensor normalizedVectorObs = vecotrObsTensor;
    if (useInputNormalization && HasVectorObservation)
    {
        normalizedVectorObs = CreateRunninngNormalizer(normalizedVectorObs, StateSize);
    }

    // Decide which structures to build; PPO takes precedence over supervised learning.
    bool usePPO = trainerParams is TrainerParamsPPO || mode == Mode.PPO;
    if (usePPO)
    {
        mode = Mode.PPO;
        if (ActionSpace == SpaceType.continuous)
        {
            InitializePPOStructureContinuousAction(vecotrObsTensor, normalizedVectorObs, visualTensors, trainerParams);
        }
        else if (ActionSpace == SpaceType.discrete)
        {
            InitializePPOStructureDiscreteAction(vecotrObsTensor, normalizedVectorObs, visualTensors, trainerParams);
        }
    }
    else if (mode == Mode.SupervisedLearning || trainerParams is TrainerParamsMimic)
    {
        Debug.Assert(ActionSizes.Length <= 1, "Supervised learning not support branching yet");
        mode = Mode.SupervisedLearning;
        if (ActionSpace == SpaceType.continuous)
        {
            InitializeSLStructures(trainerParams, vecotrObsTensor, normalizedVectorObs, visualTensors);
        }
        else
        {
            Debug.LogError("Discrete action is not support for supervised learning yet");
        }
    }
}
/// <summary>
/// Runs the training loop: forward propagation, loss computation and backpropagation to
/// update the network weights. For the accuracy computations a single-output classification
/// problem is assumed (cross entropy is not implemented; MSE is used instead).
/// All hyper-parameters — learning rate, number of epochs, shuffling, batch size, L2
/// regularization, momentum, resilient-update acceleration/slow-down rates, validation set,
/// accuracy threshold, MEE vs. MSE reporting and the learning-rate reduction schedule — are
/// carried by the <paramref name="trainParams"/> argument (see GradientDescentParams for the
/// concrete fields; the legacy per-parameter signature is kept as a comment below).
/// </summary>
/// <param name="trainParams">bundle of all training hyper-parameters and datasets</param>
/// <returns>
/// The learning curve: one double[4] per epoch —
/// [0] training loss (MSE), [1] validation loss (MSE),
/// [2] training-set accuracy, [3] validation-set accuracy.
/// </returns>
// Legacy signature kept for reference:
//Network network, DataSet trainingSet, double learningRate, int numberOfEpochs, bool shuffle = false, int? batchSize = null, bool debug = false, double regularizationRate = 0, Regularizations regularization = Regularizations.None, double momentum = 0, bool resilient = false, double resilientUpdateAccelerationRate = 1, double resilientUpdateSlowDownRate = 1, DataSet validationSet = null, double? trueThreshold = 0.5, bool MEE = false, bool reduceLearningRate = false, double learningRateReduction = 0.5, int learningRateReductionAfterEpochs = 1000, int numberOfReductions = 2, bool nestrov = false
public abstract List <double[]> Train(TrainerParams trainParams);
/// <summary>
/// Build the supervised-learning (behavior cloning) graph for a discrete action space:
/// actor network with action masking, the action evaluation function, and — when mimic
/// trainer params are provided — the cross-entropy loss and update function.
/// </summary>
/// <param name="vectorObs">raw vector observation input placeholder</param>
/// <param name="normalizedVectorObs">(possibly normalized) vector observation fed to the network</param>
/// <param name="visualObs">visual observation input placeholders</param>
/// <param name="trainerParams">expected to be a TrainerParamsMimic; null disables training setup</param>
protected void InitializeSLStructureDiscreteAction(Tensor vectorObs, Tensor normalizedVectorObs, List <Tensor> visualObs, TrainerParams trainerParams)
{
    //all inputs list
    List <Tensor> allObservationInputs = new List <Tensor>();
    if (HasVectorObservation)
    {
        allObservationInputs.Add(vectorObs);
    }
    if (HasVisualObservation)
    {
        allObservationInputs.AddRange(visualObs);
    }

    //build basic network
    Tensor[] outputActionsLogits = null;
    Tensor outputValue = null;
    network.BuildNetworkForDiscreteActionSpace(normalizedVectorObs, visualObs, null, null, ActionSizes, out outputActionsLogits, out outputValue);

    // One action-mask input placeholder per discrete branch.
    // NOTE(review): the placeholder name "AcionMask" is misspelled; it is a runtime tensor
    // name that callers may look up by string, so it is left unchanged here — confirm
    // against the code that feeds these masks before renaming.
    List <Tensor> actionMasksInputs = new List <Tensor>();
    for (int i = 0; i < ActionSizes.Length; ++i)
    {
        actionMasksInputs.Add(UnityTFUtils.Input(new int?[] { ActionSizes[i] }, name: "AcionMask" + i)[0]);
    }

    //masking and normalized and get the final action tensor
    Tensor[] outputActions, outputNormalizedLogits;
    CreateDiscreteActionMaskingLayer(outputActionsLogits, actionMasksInputs.ToArray(), out outputActions, out outputNormalizedLogits);

    // Output tensor for the selected actions: single branch passes through, multiple
    // branches are concatenated along axis 1; cast to float for the unified output format.
    var outputDiscreteActions = new List <Tensor>();
    outputDiscreteActions.Add(K.identity(K.cast(ActionSizes.Length == 1 ? outputActions[0] : K.concat(outputActions.ToList(), 1), DataType.Float), "OutputAction"));

    var actionFunctionInputs = new List <Tensor>();
    actionFunctionInputs.AddRange(allObservationInputs);
    actionFunctionInputs.AddRange(actionMasksInputs);
    ActionFunction = K.function(actionFunctionInputs, outputDiscreteActions, null, "ActionFunction");

    //build the parts for training
    TrainerParamsMimic trainingParams = trainerParams as TrainerParamsMimic;
    if (trainerParams != null && trainingParams == null)
    {
        Debug.LogError("Trainer params for Supervised learning mode needs to be a TrainerParamsMimic type");
    }
    if (trainingParams != null)
    {
        //training inputs: one int label per discrete branch
        var inputActionLabels = UnityTFUtils.Input(new int?[] { ActionSizes.Length }, name: "InputAction", dtype: DataType.Int32)[0];

        //split the input for each discrete branch
        List <Tensor> inputActionsDiscreteSeperated = null, onehotInputActions = null; //for discrete action space
        var splits = new int[ActionSizes.Length];
        for (int i = 0; i < splits.Length; ++i)
        {
            splits[i] = 1;
        }
        inputActionsDiscreteSeperated = K.split(inputActionLabels, K.constant(splits, dtype: DataType.Int32), K.constant(1, dtype: DataType.Int32), ActionSizes.Length);

        // Loss: one-hot encode each branch label, take categorical cross-entropy against
        // that branch's masked/normalized logits, and sum the per-branch means.
        onehotInputActions = inputActionsDiscreteSeperated.Select((x, i) => K.reshape(K.one_hot(x, K.constant <int>(ActionSizes[i], dtype: DataType.Int32), K.constant(1.0f), K.constant(0.0f)), new int[] { -1, ActionSizes[i] })).ToList();
        var losses = onehotInputActions.Select((x, i) => K.mean(K.categorical_crossentropy(x, outputNormalizedLogits[i], true))).ToList();
        Tensor loss = losses.Aggregate((x, s) => x + s);

        //add inputs, outputs and parameters to the list
        List <Tensor> updateParameters = network.GetActorWeights();
        List <Tensor> allInputs = new List <Tensor>();
        allInputs.AddRange(actionFunctionInputs);
        allInputs.Add(inputActionLabels);

        //create optimizer and create necessary functions
        // NOTE(review): the backend function is named "UpdateFunction" here although the
        // field is UpdateSLFunction (other SL builders use "UpdateSLFunction") — confirm
        // whether anything matches on the name before unifying.
        var updates = AddOptimizer(updateParameters, loss, optimizer);
        UpdateSLFunction = K.function(allInputs, new List <Tensor> { loss }, updates, "UpdateFunction");
    }
}
// Legacy signature kept for reference:
//Network network, DataSet trainingSet, double learningRate, int numberOfEpochs, bool shuffle = false, int? batchSize = null, bool debug = false, double regularizationRate = 0, Regularizations regularization = Regularizations.None, double momentum = 0, bool resilient = false, double resilientUpdateAccelerationRate = 1, double resilientUpdateSlowDownRate = 1, DataSet validationSet = null, double? trueThreshold = 0.5, bool MEE = false, bool reduceLearningRate = false, double learningRateReduction = 0.5, int learningRateReductionAfterEpochs = 1000, int numberOfReductions = 2, bool nestrov = false
/// <summary>
/// Gradient-descent training loop. Per epoch: build the batch index table, run
/// PerformBatchComputations over every batch, average the batch losses, compute validation
/// loss (optionally in parallel) and accuracies, record the learning-curve entry, and apply
/// the scheduled learning-rate reduction.
/// </summary>
/// <param name="trainParams">must be a GradientDescentParams (cast unconditionally)</param>
/// <returns>one double[4] per epoch: [epochLoss, validationError, trainingAccuracy, validationAccuracy]</returns>
public override List <double[]> Train(TrainerParams trainParams)
{
    GradientDescentParams passedParams = (GradientDescentParams)trainParams;
    /* if (passedParams.resilient)
     * {
     * // passedParams.learningRate = 1;
     *
     * }*/
    //int valSplitSize = 0;
    List <double[]> learningCurve = new List <double[]>();
    List <int> trainingSetIndices = Enumerable.Range(0, passedParams.trainingSet.Labels.RowCount).ToList();
    List <int> testSetIndices = null;
    // Local copy of the validation set, rebuilt row by row from the index list.
    DataSet test = new DataSet(null, null);
    if (passedParams.validationSet != null)
    {
        testSetIndices = Enumerable.Range(0, passedParams.validationSet.Labels.RowCount).ToList();
        /* if (shuffle)
         * {
         * testSetIndices.Shuffle();
         * } */
        test.Inputs = CreateMatrix.Dense(testSetIndices.Count, passedParams.validationSet.Inputs.ColumnCount, 0.0);
        test.Labels = CreateMatrix.Dense(testSetIndices.Count, passedParams.validationSet.Labels.ColumnCount, 0.0);
        for (int i = 0; i < testSetIndices.Count; i++)
        {
            test.Inputs.SetRow(i, passedParams.validationSet.Inputs.Row(testSetIndices[i]));
            test.Labels.SetRow(i, passedParams.validationSet.Labels.Row(testSetIndices[i]));
        }
    }
    if (passedParams.shuffle)
    {
        // Shuffle once before training; batches then cover the shuffled order each epoch.
        trainingSetIndices.Shuffle();
    }
    // 2-column matrix of shape (numberOfBatches, 2); each row is [batchStart, batchEnd] (inclusive).
    Matrix <double> batchesIndices = null;
    Dictionary <int, Matrix <double> > previousWeightsUpdate = null; //for the momentum updates
    // For resilient backpropagation: if the update sign flips we slow down by the
    // slow-down ratio, otherwise we accelerate by the acceleration ratio.
    Dictionary <int, Matrix <double> > PreviousUpdateSigns = new Dictionary <int, Matrix <double> >();
    for (int epoch = 0; epoch < passedParams.numberOfEpochs; epoch++)
    {
        // NOTE(review): batchesIndices is identical every epoch (depends only on the
        // dataset size and batchSize) — this rebuild could be hoisted out of the loop.
        if (passedParams.batchSize != null) //build the batch start/end table
        {
            var numberOfBatches = (int)Math.Ceiling(((passedParams.trainingSet.Labels.RowCount / (double)(passedParams.batchSize))));
            batchesIndices = CreateMatrix.Dense(numberOfBatches, 2, 0.0);
            for (int j = 0; j < numberOfBatches; j++)
            {
                // Last batch is clamped to the final row index.
                batchesIndices.SetRow(j, new double[] { j *(double)passedParams.batchSize, Math.Min(passedParams.trainingSet.Inputs.RowCount - 1, (j + 1) * (double)passedParams.batchSize - 1) });
            }
        }
        else //put all of the dataset in one batch
        {
            batchesIndices = CreateMatrix.Dense(1, 2, 0.0);
            batchesIndices.SetRow(0, new double[] { 0, passedParams.trainingSet.Inputs.RowCount - 1 });
        }

        // Average of the batches' average losses: each batch contributes batchLoss/batchSize.
        double epochLoss = 0;
        for (int batchIdx = 0; batchIdx < batchesIndices.RowCount; batchIdx++) //for each batch
        {
            PerformBatchComputations(passedParams, batchesIndices, ref previousWeightsUpdate, PreviousUpdateSigns, epoch, ref epochLoss, batchIdx);
        }
        epochLoss /= batchesIndices.RowCount;

        double validationError = passedParams.parallelize ? Parallel_ComputeValidationLoss(passedParams, testSetIndices, test) : ComputeValidationLoss(passedParams, testSetIndices, test);

        double trainingAccuracy = 0, validationSetAccuracy = 0;
        if (passedParams.trueThreshold != null)
        {
            // NOTE(review): validationSet may be null here while trueThreshold is set —
            // confirm ComputeAccuracy tolerates a null dataset.
            trainingAccuracy = Utilities.Tools.ComputeAccuracy(passedParams.network, passedParams.trainingSet, passedParams.trueThreshold);
            validationSetAccuracy = Utilities.Tools.ComputeAccuracy(passedParams.network, passedParams.validationSet, passedParams.trueThreshold);
        }

        learningCurve.Add(new double[] { epochLoss, passedParams.validationSet != null ? validationError : 0, passedParams.trueThreshold != null ? trainingAccuracy : 0, passedParams.trueThreshold != null ? validationSetAccuracy : 0 });

        if (passedParams.PrintLoss)
        {
            Console.ForegroundColor = ConsoleColor.Green;
            Console.WriteLine("Epoch:{0} train loss:{1} - validation loss:{2}", epoch, epochLoss, validationError);
        }

        // Scheduled learning-rate decay: every learningRateReductionAfterEpochs epochs,
        // at most numberOfReductions times.
        if (passedParams.reduceLearningRate && epoch > 0 && passedParams.numberOfReductions > 0 && epoch % passedParams.learningRateReductionAfterEpochs == 0)
        {
            passedParams.learningRate *= passedParams.learningRateReduction;
            passedParams.numberOfReductions--;
            Console.ForegroundColor = ConsoleColor.Red;
            Console.WriteLine("Learning Rate Reduced, now: {0}", passedParams.learningRate);
        }
        Console.ResetColor();
    }
    return(learningCurve);
}
/// <summary>
/// Build the supervised-learning (behavior cloning) graph for a continuous action space:
/// actor network, the action evaluation function, and — when mimic trainer params are
/// provided — the imitation loss and update function. When the network exposes a
/// log-variance head, a Gaussian negative-log-likelihood loss is used; otherwise plain MSE.
/// </summary>
/// <param name="vectorObs">raw vector observation input placeholder</param>
/// <param name="normalizedVectorObs">(possibly normalized) vector observation fed to the network</param>
/// <param name="visualObs">visual observation input placeholders</param>
/// <param name="trainerParams">expected to be a TrainerParamsMimic; null disables training setup</param>
protected void InitializeSLStructureContinuousAction(Tensor vectorObs, Tensor normalizedVectorObs, List <Tensor> visualObs, TrainerParams trainerParams)
{
    //build the network
    Tensor outputValue = null;
    Tensor outputActionMean = null;
    Tensor outputLogVariance = null;
    network.BuildNetworkForContinuousActionSapce(normalizedVectorObs, visualObs, null, null, ActionSizes[0], out outputActionMean, out outputValue, out outputLogVariance);
    Tensor outputAction = outputActionMean;

    // BUGFIX: decide whether the network produced a log-variance head BEFORE exponentiating
    // it. The original code called K.exp(outputLogVariance) unconditionally and only then
    // set SLHasVar, so a network without a variance output would pass null into K.exp even
    // though every later use is guarded by SLHasVar.
    SLHasVar = outputLogVariance != null;
    Tensor outputVar = SLHasVar ? K.exp(outputLogVariance) : null;

    List <Tensor> observationInputs = new List <Tensor>();
    if (HasVectorObservation)
    {
        observationInputs.Add(vectorObs);
    }
    if (HasVisualObservation)
    {
        observationInputs.AddRange(visualObs);
    }

    // Action function outputs the variance alongside the mean only when it exists.
    if (SLHasVar)
    {
        ActionFunction = K.function(observationInputs, new List <Tensor> { outputAction, outputVar }, null, "ActionFunction");
    }
    else
    {
        ActionFunction = K.function(observationInputs, new List <Tensor> { outputAction }, null, "ActionFunction");
    }

    //build the parts for training
    TrainerParamsMimic trainingParams = trainerParams as TrainerParamsMimic;
    if (trainerParams != null && trainingParams == null)
    {
        Debug.LogError("Trainer params for Supervised learning mode needs to be a TrainerParamsMimic type");
    }
    if (trainingParams != null)
    {
        //training inputs
        var inputActionLabel = UnityTFUtils.Input(new int?[] { ActionSizes[0] }, name: "InputAction", dtype: DataType.Float)[0];

        // Loss: Gaussian NLL (up to a constant) when a variance head exists, otherwise MSE.
        Tensor loss = null;
        if (SLHasVar)
        {
            loss = K.mean(K.mean(0.5 * K.square(inputActionLabel - outputAction) / outputVar + 0.5 * outputLogVariance));
        }
        else
        {
            loss = K.mean(new MeanSquareError().Call(inputActionLabel, outputAction));
        }

        //add inputs, outputs and parameters to the list
        List <Tensor> updateParameters = network.GetActorWeights();
        List <Tensor> allInputs = new List <Tensor>();
        allInputs.AddRange(observationInputs);
        allInputs.Add(inputActionLabel);

        //create optimizer and create necessary functions
        var updates = AddOptimizer(updateParameters, loss, optimizer);
        UpdateSLFunction = K.function(allInputs, new List <Tensor> { loss }, updates, "UpdateFunction");
    }
}
/// <summary>
/// Dispatch graph construction to the continuous- or discrete-action supervised-learning
/// builder depending on the brain's action space.
/// </summary>
/// <param name="brainParameters">brain parameters from the ML-Agents brain (not read directly here)</param>
/// <param name="inputStateTensor">input placeholder for the vector observation</param>
/// <param name="inputVisualTensors">input placeholders for visual observations</param>
/// <param name="trainerParams">trainer parameters, forwarded to the chosen builder</param>
public override void InitializeInner(BrainParameters brainParameters, Tensor inputStateTensor, List <Tensor> inputVisualTensors, TrainerParams trainerParams)
{
    //build the network
    switch (ActionSpace)
    {
        case SpaceType.continuous:
            InitializeSLStructureContinuousAction(inputStateTensor, inputVisualTensors, trainerParams);
            break;
        case SpaceType.discrete:
            InitializeSLStructureDiscreteAction(inputStateTensor, inputVisualTensors, trainerParams);
            break;
    }
}
/// <summary>
/// Implement this method in your learning model for use with ML-Agents. It is called by a
/// Trainer. Create everything here, including the neural network and — if
/// <paramref name="trainerParams"/> is not null — the optimizer, using the input tensors.
/// If <paramref name="trainerParams"/> is null, training is not enabled and the optimizing
/// parts do not need to be built.
/// </summary>
/// <param name="brainParameters">brain parameters of the ML-Agents brain</param>
/// <param name="stateTensor">input tensor of the vector observation</param>
/// <param name="visualTensors">input tensors of the visual observations</param>
/// <param name="trainerParams">trainer parameters passed by the trainer; may be null (see summary)</param>
public abstract void InitializeInner(BrainParameters brainParameters, Tensor stateTensor, List <Tensor> visualTensors, TrainerParams trainerParams);
/// <summary>
/// Unity lifecycle hook: cache the sibling Trainer component and its parameters.
/// </summary>
private void Awake()
{
    var attachedTrainer = GetComponent <Trainer>();
    trainer = attachedTrainer;
    parameters = attachedTrainer.parameters;
}
/// <summary>
/// Build the supervised-learning (mimic) model graph: actor network, the action evaluation
/// function, and — when mimic trainer params are provided — the imitation loss and update
/// function. Continuous actions use Gaussian NLL when a variance head exists (MSE
/// otherwise); discrete actions use categorical cross-entropy on one-hot labels.
/// </summary>
/// <param name="brainParameters">brain parameters; vectorActionSpaceType gates the variance output</param>
/// <param name="inputStateTensor">input placeholder for the vector observation</param>
/// <param name="inputVisualTensors">input placeholders for visual observations</param>
/// <param name="trainerParams">expected to be a TrainerParamsMimic; null disables training setup</param>
public override void InitializeInner(BrainParameters brainParameters, Tensor inputStateTensor, List <Tensor> inputVisualTensors, TrainerParams trainerParams)
{
    //build the network
    var networkOutputs = network.BuildNetwork(inputStateTensor, inputVisualTensors, null, ActionSize, ActionSpace);
    Tensor outputAction = networkOutputs.Item1;
    Tensor outputVar = networkOutputs.Item2;

    // Variance output is only meaningful for continuous action spaces.
    hasVariance = outputVar != null && brainParameters.vectorActionSpaceType == SpaceType.continuous;

    List <Tensor> observationInputs = new List <Tensor>();
    if (HasVectorObservation)
    {
        observationInputs.Add(inputStateTensor);
    }
    if (HasVisualObservation)
    {
        observationInputs.AddRange(inputVisualTensors);
    }
    if (hasVariance)
    {
        ActionFunction = K.function(observationInputs, new List <Tensor> { outputAction, outputVar }, null, "ActionFunction");
    }
    else
    {
        ActionFunction = K.function(observationInputs, new List <Tensor> { outputAction }, null, "ActionFunction");
    }

    //build the parts for training
    TrainerParamsMimic trainingParams = trainerParams as TrainerParamsMimic;
    if (trainerParams != null && trainingParams == null)
    {
        Debug.LogError("Trainer params for Supervised learning mode needs to be a TrainerParamsMimic type");
    }
    if (trainingParams != null)
    {
        //training inputs: float action vector for continuous, single int index for discrete
        var inputActionLabel = UnityTFUtils.Input(new int?[] { ActionSpace == SpaceType.continuous ? ActionSize : 1 }, name: "InputAction", dtype: ActionSpace == SpaceType.continuous ? DataType.Float : DataType.Int32)[0];

        //creat the loss
        Tensor loss = null;
        if (ActionSpace == SpaceType.discrete)
        {
            Tensor actionOnehot = K.one_hot(inputActionLabel, K.constant(ActionSize, dtype: DataType.Int32), K.constant(1.0f), K.constant(0.0f));
            Tensor reshapedOnehot = K.reshape(actionOnehot, new int[] { -1, ActionSize });
            loss = K.mean(K.categorical_crossentropy(reshapedOnehot, outputAction, false));
        }
        else
        {
            if (hasVariance)
            {
                // Gaussian negative log-likelihood (up to an additive constant).
                loss = K.mean(K.mean(0.5 * K.square(inputActionLabel - outputAction) / outputVar + 0.5 * K.log(outputVar)));
            }
            else
            {
                loss = K.mean(new MeanSquareError().Call(inputActionLabel, outputAction));
            }
        }

        //add inputs, outputs and parameters to the list
        List <Tensor> updateParameters = network.GetWeights();
        List <Tensor> allInputs = new List <Tensor>();
        // BUGFIX: the original re-added inputStateTensor / inputVisualTensors to
        // observationInputs here, duplicating the entries added above. allInputs is now
        // built from the already-populated observationInputs (same content and order),
        // and observationInputs is no longer mutated.
        allInputs.AddRange(observationInputs);
        allInputs.Add(inputActionLabel);

        //create optimizer and create necessary functions
        var updates = AddOptimizer(updateParameters, loss, optimizer);
        UpdateFunction = K.function(allInputs, new List <Tensor> { loss }, updates, "UpdateFunction");
    }
}
/// <summary>
/// Initialize the model structures for PPO-CMA: value function, Gaussian action sampling,
/// the (separately optimized) value / mean / variance training functions, and a pretraining
/// function that fits the policy mean and standard deviation to supplied targets.
/// </summary>
/// <param name="trainerParams">expected to be a TrainerParamsPPO; null skips all training setup</param>
/// <param name="stateTensor">input placeholder for the vector observation</param>
/// <param name="inputVisualTensors">input placeholders for visual observations</param>
/// <param name="outputValueFromNetwork">critic value output of the network</param>
/// <param name="outputActionMeanFromNetwork">policy mean output of the network</param>
/// <param name="outActionLogVarianceFromNetwork">policy log-variance output of the network</param>
/// <param name="valueWeights">weights updated by the value loss</param>
/// <param name="meanWeights">weights updated by the mean loss</param>
/// <param name="varweights">weights updated by the variance loss</param>
protected void InitializePPOCMAStructures(TrainerParams trainerParams, Tensor stateTensor, List <Tensor> inputVisualTensors, Tensor outputValueFromNetwork, Tensor outputActionMeanFromNetwork, Tensor outActionLogVarianceFromNetwork, List <Tensor> valueWeights, List <Tensor> meanWeights, List <Tensor> varweights)
{
    List <Tensor> allobservationInputs = new List <Tensor>();
    if (HasVectorObservation)
    {
        allobservationInputs.Add(stateTensor);
    }
    if (HasVisualObservation)
    {
        allobservationInputs.AddRange(inputVisualTensors);
    }

    ValueFunction = K.function(allobservationInputs, new List <Tensor> { outputValueFromNetwork }, null, "ValueFunction");

    // Sample an action from the Gaussian policy: mean + sqrt(var) * standard normal noise.
    Tensor outputActualAction = null;
    Tensor outputVariance = K.exp(outActionLogVarianceFromNetwork);
    using (K.name_scope("SampleAction"))
    {
        outputActualAction = K.standard_normal(K.shape(outputActionMeanFromNetwork), DataType.Float) * K.sqrt(outputVariance) + outputActionMeanFromNetwork;
    }
    ActionFunction = K.function(allobservationInputs, new List <Tensor> { outputActualAction, outputActionMeanFromNetwork, outputVariance }, null, "ActionFunction");

    TrainerParamsPPO trainingParams = trainerParams as TrainerParamsPPO;
    if (trainingParams != null)
    {
        //training needed inputs
        var inputOldAction = UnityTFUtils.Input(new int?[] { ActionSizes[0] }, name: "InputOldAction")[0];
        var inputAdvantage = UnityTFUtils.Input(new int?[] { 1 }, name: "InputAdvantage")[0];
        var inputTargetValue = UnityTFUtils.Input(new int?[] { 1 }, name: "InputTargetValue")[0];
        var inputOldValue = UnityTFUtils.Input(new int?[] { 1 }, name: "InputOldValue")[0];
        //var inputClipEpsilon = UnityTFUtils.Input(batch_shape: new int?[] { }, name: "ClipEpsilon", dtype: DataType.Float)[0];
        var inputClipEpsilonValue = UnityTFUtils.Input(batch_shape: new int?[] { }, name: "ClipEpsilonValue", dtype: DataType.Float)[0];

        // value loss
        Tensor outputValueLoss = null;
        using (K.name_scope("ValueLoss"))
        {
            // PPO-style clipped value estimate. NOTE: the clipped loss is built but
            // intentionally NOT used — the original code assigned
            // K.mean(K.maximum(valueLoss1, valueLoss2)) and then immediately overwrote it
            // with K.mean(valueLoss1). The dead overwrite is removed here; the effective
            // behavior (plain MSE against the target value) is unchanged. Re-enable the
            // clipped form by using K.mean(K.maximum(valueLoss1, valueLoss2)) instead.
            var clippedValueEstimate = inputOldValue + K.clip(outputValueFromNetwork - inputOldValue, 0.0f - inputClipEpsilonValue, inputClipEpsilonValue);
            var valueLoss1 = new MeanSquareError().Call(outputValueFromNetwork, inputTargetValue);
            var valueLoss2 = new MeanSquareError().Call(clippedValueEstimate, inputTargetValue);
            outputValueLoss = K.mean(valueLoss1);
        }
        var valueUpdates = AddOptimizer(valueWeights, outputValueLoss, optimizerValue);

        List <Tensor> valueInputs = new List <Tensor>();
        if (HasVectorObservation)
        {
            valueInputs.Add(stateTensor);
        }
        if (HasVisualObservation)
        {
            valueInputs.AddRange(inputVisualTensors);
        }
        valueInputs.Add(inputOldValue);
        valueInputs.Add(inputTargetValue);
        valueInputs.Add(inputClipEpsilonValue);
        TrainValueFunction = K.function(valueInputs, new List <Tensor> { outputValueLoss }, valueUpdates, "TrainValueFunction");

        // actor losses: PPO-CMA trains the mean and the variance with separate losses,
        // each stopping the gradient through the other's parameters.
        Tensor meanLoss, varLoss;
        using (K.name_scope("ActorLosses"))
        {
            Tensor posAdvantage;
            if (usePositiveAdvOnly)
            {
                // Only positive advantages drive the policy update (PPO-CMA trick).
                posAdvantage = K.identity(K.relu(K.mean(inputAdvantage)), "ClipedPositiveAdv");
            }
            else
            {
                posAdvantage = K.identity(K.mean(inputAdvantage), "Adv");
            }
            var meanNoGrad = K.stop_gradient(outputActionMeanFromNetwork, "MeanNoGrad");
            var varNoGrad = K.stop_gradient(outputVariance, "VarNoGrad");
            var logVar = outActionLogVarianceFromNetwork;
            var logVarNoGrad = K.stop_gradient(logVar, "LogVarNoGrad");
            using (K.name_scope("VarLoss"))
            {
                // Log-likelihood with the mean treated as a constant: gradients flow to the variance only.
                var logpNoMeanGrad = -1.0f * K.sum(0.5f * K.square(inputOldAction - meanNoGrad) / outputVariance + 0.5f * logVar, 1);
                varLoss = K.identity(-1.0f * K.mean(posAdvantage * logpNoMeanGrad), "VarLoss");
            }
            using (K.name_scope("MeanLoss"))
            {
                // Log-likelihood with the variance treated as a constant: gradients flow to the mean only.
                var logpNoVarGrad = -1.0f * K.sum(0.5f * K.square(inputOldAction - outputActionMeanFromNetwork) / varNoGrad + 0.5f * logVarNoGrad, 1);
                meanLoss = K.identity(-1.0f * K.mean(posAdvantage * logpNoVarGrad), "MeanLoss");
            }
        }

        //add inputs, outputs and parameters to the list
        List <Tensor> allInputs = new List <Tensor>();
        if (HasVectorObservation)
        {
            allInputs.Add(stateTensor);
        }
        if (HasVisualObservation)
        {
            allInputs.AddRange(inputVisualTensors);
        }
        allInputs.Add(inputOldAction);
        allInputs.Add(inputAdvantage);

        //create optimizer and create necessary functions
        var updatesMean = AddOptimizer(meanWeights, meanLoss, optimizerMean);
        var updatesVar = AddOptimizer(varweights, varLoss, optimizerVariance);
        TrainMeanFunction = K.function(allInputs, new List <Tensor> { meanLoss }, updatesMean, "UpdateMeanFunction");
        // BUGFIX: the variance-training function was named "UpdateMeanFunction" (copy-paste);
        // it now carries its own name.
        TrainVarianceFunction = K.function(allInputs, new List <Tensor> { varLoss }, updatesVar, "UpdateVarianceFunction");

        // Pretraining: fit the policy mean and standard deviation to supplied initial targets.
        var inputInitialStd = UnityTFUtils.Input(new int?[] { ActionSizes[0] }, name: "InputInitialStd")[0];
        var inputInitialMean = UnityTFUtils.Input(new int?[] { ActionSizes[0] }, name: "InputInitialMean")[0];
        var policyInitLoss = K.mean(K.mean(K.square(inputInitialMean - outputActionMeanFromNetwork)));
        policyInitLoss += K.mean(K.mean(K.square(inputInitialStd - K.sqrt(outputVariance))));
        var updatesPretrain = AddOptimizer(network.GetActorWeights(), policyInitLoss, optimizerPretrain);
        var pretrainInputs = new List <Tensor>();
        pretrainInputs.Add(stateTensor);
        pretrainInputs.Add(inputInitialMean);
        pretrainInputs.Add(inputInitialStd);
        PretrainFunction = K.function(pretrainInputs, new List <Tensor> { policyInitLoss }, updatesPretrain, "PretrainFunction");
    }
}