Code Example #1
    /// <summary>
    /// Initialize the model for supervised learning
    /// </summary>
    /// <param name="trainerParams"></param>
    /// <param name="vectorObsTensor"></param>
    /// <param name="normalizedVectorObs"></param>
    /// <param name="inputVisualTensors"></param>
    protected void InitializeSLStructures(TrainerParams trainerParams, Tensor vectorObsTensor, Tensor normalizedVectorObs, List <Tensor> inputVisualTensors)
    {
        Tensor outActionMean, outActionLogVariance, outValue;

        network.BuildNetworkForContinuousActionSapce(normalizedVectorObs, inputVisualTensors, null, null, ActionSizes[0], out outActionMean, out outValue, out outActionLogVariance);

        List <Tensor> allobservationInputs = new List <Tensor>();

        if (HasVectorObservation)
        {
            allobservationInputs.Add(vectorObsTensor);
        }
        if (HasVisualObservation)
        {
            allobservationInputs.AddRange(inputVisualTensors);
        }

        // continuous action space: convert the log variance to a variance and
        // expose (mean, variance) through the action function
        Tensor outputVariance = K.exp(outActionLogVariance);

        ActionFunction = K.function(allobservationInputs, new List <Tensor> {
            outActionMean, outputVariance
        }, null, "ActionFunction");



        // create the loss for the supervised learning part
        Tensor supervisedLearningLoss = null;
        var    inputActionLabel       = UnityTFUtils.Input(new int?[] { ActionSpace == SpaceType.continuous ? ActionSizes[0] : 1 }, name: "InputAction", dtype: ActionSpace == SpaceType.continuous ? DataType.Float : DataType.Int32)[0];

        // Gaussian negative log-likelihood loss (up to a constant):
        // 0.5 * (label - mean)^2 / variance + 0.5 * log(variance)
        supervisedLearningLoss = K.mean(K.mean(0.5 * K.square(inputActionLabel - outActionMean) / outputVariance + 0.5 * outActionLogVariance));

        var updates  = AddOptimizer(network.GetActorWeights(), supervisedLearningLoss, optimizer);
        var slInputs = new List <Tensor>();

        slInputs.AddRange(allobservationInputs);
        slInputs.Add(inputActionLabel);
        UpdateSLFunction = K.function(slInputs, new List <Tensor>()
        {
            supervisedLearningLoss
        }, updates, "UpdateSLFunction");
    }
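
The supervised loss built above is the negative log-likelihood of the action labels under a Gaussian policy (up to the constant 0.5*log(2*pi)). As a sanity check, the following is a minimal standalone C# sketch of the same math on plain arrays; it is independent of the Unity/TensorFlow backend, and all names in it are illustrative rather than part of the library.

    using System;

    static class GaussianNllDemo
    {
        // mean over samples and action dimensions of
        // 0.5 * (label - mean)^2 / variance + 0.5 * log(variance),
        // mirroring the supervisedLearningLoss tensor above
        static double GaussianNll(double[][] labels, double[][] means, double[][] variances)
        {
            double total = 0; int count = 0;
            for (int i = 0; i < labels.Length; i++)
            {
                for (int j = 0; j < labels[i].Length; j++)
                {
                    double diff = labels[i][j] - means[i][j];
                    total += 0.5 * diff * diff / variances[i][j] + 0.5 * Math.Log(variances[i][j]);
                    count++;
                }
            }
            return total / count;
        }

        static void Main()
        {
            var labels    = new[] { new[] { 1.0, -0.5 } };
            var means     = new[] { new[] { 0.8, -0.3 } };
            var variances = new[] { new[] { 0.25, 0.25 } };
            Console.WriteLine(GaussianNll(labels, means, variances)); // shrinks as the means approach the labels
        }
    }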
Code Example #2
    /// <summary>
    /// Initialize the model for supervised learning
    /// </summary>
    /// <param name="trainerParams"></param>
    /// <param name="stateTensor"></param>
    /// <param name="inputVisualTensors"></param>
    /// <param name="outputActionFromNetwork"></param>
    /// <param name="outputVarianceFromNetwork"></param>
    /// <param name="weightsToUpdate"></param>
    protected void InitializeSLStructures(TrainerParams trainerParams, Tensor stateTensor, List <Tensor> inputVisualTensors, Tensor outputActionFromNetwork, Tensor outputVarianceFromNetwork, List <Tensor> weightsToUpdate)
    {
        List <Tensor> allobservationInputs = new List <Tensor>();

        if (HasVectorObservation)
        {
            allobservationInputs.Add(stateTensor);
        }
        if (HasVisualObservation)
        {
            allobservationInputs.AddRange(inputVisualTensors);
        }

        if (ActionSpace == SpaceType.continuous)
        {
            ActionFunction = K.function(allobservationInputs, new List <Tensor> {
                outputActionFromNetwork, outputVarianceFromNetwork
            }, null, "ActionFunction");
        }
        else
        {
            ActionFunction = K.function(allobservationInputs, new List <Tensor> {
                outputActionFromNetwork
            }, null, "ActionFunction");
        }



        // create the loss for the supervised learning part
        Tensor supervisedLearningLoss = null;
        var    inputActionLabel       = UnityTFUtils.Input(new int?[] { ActionSpace == SpaceType.continuous ? ActionSize : 1 }, name: "InputAction", dtype: ActionSpace == SpaceType.continuous ? DataType.Float : DataType.Int32)[0];

        if (ActionSpace == SpaceType.discrete)
        {
            var onehotInputAction = K.one_hot(inputActionLabel, K.constant <int>(ActionSize, dtype: DataType.Int32), K.constant(1.0f), K.constant(0.0f));
            onehotInputAction     = K.reshape(onehotInputAction, new int[] { -1, ActionSize });
            supervisedLearningLoss = K.mean(K.categorical_crossentropy(onehotInputAction, outputActionFromNetwork, false));
        }
        else
        {
            supervisedLearningLoss = K.mean(K.mean(0.5 * K.square(inputActionLabel - outputActionFromNetwork) / outputVarianceFromNetwork + 0.5 * K.log(outputVarianceFromNetwork)));
        }

        var updates  = AddOptimizer(weightsToUpdate, supervisedLearningLoss, optimizer);
        var slInputs = new List <Tensor>();

        slInputs.AddRange(allobservationInputs);
        slInputs.Add(inputActionLabel);
        UpdateSLFunction = K.function(slInputs, new List <Tensor>()
        {
            supervisedLearningLoss
        }, updates, "UpdateSLFunction");
    }
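
For the discrete branch above, the loss is the categorical cross-entropy between the one-hot encoded action labels and the network's output probabilities. The following standalone sketch shows the per-sample computation that K.one_hot plus K.categorical_crossentropy express on tensors; the names and the epsilon are illustrative.

    using System;

    static class CrossEntropyDemo
    {
        // -sum(onehot(label) * log(p)): the one-hot vector selects a single
        // probability, so the sum collapses to one term
        static double CategoricalCrossEntropy(int label, double[] probs)
        {
            return -Math.Log(probs[label] + 1e-8); // epsilon guards against log(0)
        }

        static void Main()
        {
            var probs = new[] { 0.7, 0.2, 0.1 }; // network output for ActionSize == 3
            Console.WriteLine(CategoricalCrossEntropy(0, probs)); // small loss: correct class is likely
            Console.WriteLine(CategoricalCrossEntropy(2, probs)); // large loss: correct class is unlikely
        }
    }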
Code Example #3
    protected void InitializePPOStructureContinuousAction(Tensor vectorObs, Tensor normalizedVectorObs, List <Tensor> visualObs, TrainerParams trainerParams)
    {
        //all inputs list
        List <Tensor> allObservationInputs = new List <Tensor>();

        if (HasVectorObservation)
        {
            allObservationInputs.Add(vectorObs);
        }
        if (HasVisualObservation)
        {
            allObservationInputs.AddRange(visualObs);
        }

        //build the network
        Tensor outputValue = null; Tensor outputActionMean = null; Tensor outputLogVariance = null;

        network.BuildNetworkForContinuousActionSapce(normalizedVectorObs, visualObs, null, null, ActionSizes[0], out outputActionMean, out outputValue, out outputLogVariance);

        //value function
        ValueFunction = K.function(allObservationInputs, new List <Tensor> {
            outputValue
        }, null, "ValueFunction");

        Tensor outputActualAction = null, actionLogProb = null, outputVariance = null;

        //build action sampling
        outputVariance = K.exp(outputLogVariance);
        using (K.name_scope("SampleAction"))
        {
            outputActualAction = K.standard_normal(K.shape(outputActionMean), DataType.Float) * K.sqrt(outputVariance) + outputActionMean;
        }
        using (K.name_scope("ActionProbs"))
        {
            actionLogProb = K.log_normal_probability(K.stop_gradient(outputActualAction), outputActionMean, outputVariance, outputLogVariance);
        }
        //action function
        //ActionFunction = K.function(allObservationInputs, new List<Tensor> { outputActualAction, actionLogProb, outputActionMean }, null, "ActionFunction");
        ActionFunction = K.function(allObservationInputs, new List <Tensor> {
            outputActualAction, actionLogProb
        }, null, "ActionFunction");

        var probInputs = new List <Tensor>();
        probInputs.AddRange(allObservationInputs);
        probInputs.Add(outputActualAction);

        //probability function
        ActionProbabilityFunction = K.function(probInputs, new List <Tensor> {
            actionLogProb
        }, null, "ActionProbabilityFunction");

        //training related
        TrainerParamsPPO trainingParams = trainerParams as TrainerParamsPPO;

        if (trainingParams != null)
        {
            Tensor outputEntropy;
            using (K.name_scope("Entropy"))
            {
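                // entropy of a diagonal Gaussian: 0.5 * (log(2*pi*e) + log(variance)) per dimension;
                // Mathf.Log(x, e) is the natural log, so temp equals 0.5 * log(2*pi*e*variance)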
                var temp = 0.5f * (Mathf.Log(2 * Mathf.PI * 2.7182818285f, 2.7182818285f) + outputLogVariance);
                if (outputLogVariance.shape.Length == 2)
                {
                    outputEntropy = K.mean(K.mean(temp, 0, false), name: "OutputEntropy");
                }
                else
                {
                    outputEntropy = K.mean(temp, 0, false, name: "OutputEntropy");
                }
            }

            List <Tensor> extraInputs = new List <Tensor>();
            extraInputs.AddRange(allObservationInputs);
            extraInputs.Add(outputActualAction);
            CreatePPOOptimizer(trainingParams, outputEntropy, actionLogProb, outputValue, extraInputs, network.GetWeights());
        }
    }
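
The SampleAction scope above uses the reparameterization trick (a standard normal sample scaled by the standard deviation and shifted by the mean), and the ActionProbs scope evaluates the Gaussian log-density of the sampled action. Below is a standalone sketch of both for a single action dimension, under the assumption that K.log_normal_probability computes the diagonal-Gaussian log-density; all names here are illustrative.

    using System;

    static class GaussianSampleDemo
    {
        static readonly Random Rng = new Random(0);

        // Box-Muller transform: one standard normal sample, like K.standard_normal
        static double StandardNormal()
        {
            double u1 = 1.0 - Rng.NextDouble(), u2 = Rng.NextDouble();
            return Math.Sqrt(-2.0 * Math.Log(u1)) * Math.Cos(2.0 * Math.PI * u2);
        }

        // action = eps * sqrt(variance) + mean (reparameterization trick)
        static double SampleAction(double mean, double variance)
            => StandardNormal() * Math.Sqrt(variance) + mean;

        // log N(x | mean, variance) = -0.5*(x-mean)^2/variance - 0.5*log(variance) - 0.5*log(2*pi)
        static double LogNormalProbability(double x, double mean, double variance)
            => -0.5 * (x - mean) * (x - mean) / variance
               - 0.5 * Math.Log(variance)
               - 0.5 * Math.Log(2.0 * Math.PI);

        static void Main()
        {
            double mean = 0.3, variance = 0.04;
            double action = SampleAction(mean, variance);
            Console.WriteLine($"action={action} logProb={LogNormalProbability(action, mean, variance)}");
        }
    }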
Code Example #4
    protected void InitializePPOStructureDiscreteAction(Tensor vectorObs, Tensor normalizedVectorObs, List <Tensor> visualObs, TrainerParams trainerParams)
    {
        //all inputs list
        List <Tensor> allObservationInputs = new List <Tensor>();

        if (HasVectorObservation)
        {
            allObservationInputs.Add(vectorObs);
        }
        if (HasVisualObservation)
        {
            allObservationInputs.AddRange(visualObs);
        }

        Tensor[] outputActionsLogits = null; Tensor outputValue = null;
        network.BuildNetworkForDiscreteActionSpace(normalizedVectorObs, visualObs, null, null, ActionSizes, out outputActionsLogits, out outputValue);

        ValueFunction = K.function(allObservationInputs, new List <Tensor> {
            outputValue
        }, null, "ValueFunction");

        //the action masks input placeholders
        List <Tensor> actionMasksInputs = new List <Tensor>();

        for (int i = 0; i < ActionSizes.Length; ++i)
        {
            actionMasksInputs.Add(UnityTFUtils.Input(new int?[] { ActionSizes[i] }, name: "AcionMask" + i)[0]);
        }

        Tensor[] outputActions, outputNormalizedLogits;
        CreateDiscreteActionMaskingLayer(outputActionsLogits, actionMasksInputs.ToArray(), out outputActions, out outputNormalizedLogits);

        //output tensors for discrete actions: the selected actions (concatenated across branches) and the normalized logits of each branch
        var outputDiscreteActions = new List <Tensor>();

        outputDiscreteActions.Add(K.identity(K.cast(ActionSizes.Length == 1? outputActions[0]: K.concat(outputActions.ToList(), 1), DataType.Float), "OutputAction"));
        outputDiscreteActions.AddRange(outputNormalizedLogits);
        var actionFunctionInputs = new List <Tensor>();

        actionFunctionInputs.AddRange(allObservationInputs);
        actionFunctionInputs.AddRange(actionMasksInputs);
        ActionFunction = K.function(actionFunctionInputs, outputDiscreteActions, null, "ActionFunction");


        TrainerParamsPPO trainingParams = trainerParams as TrainerParamsPPO;

        if (trainingParams != null)
        {
            // action probability from input action
            Tensor        outputEntropy;
            List <Tensor> inputActionsDiscreteSeparated = null, onehotInputActions = null;    //for discrete action space

            Tensor inputAction = UnityTFUtils.Input(new int?[] { ActionSizes.Length }, name: "InputActions", dtype: DataType.Int32)[0];

            //split the input for each discrete branch
            var splits = new int[ActionSizes.Length];
            for (int i = 0; i < splits.Length; ++i)
            {
                splits[i] = 1;
            }
            inputActionsDiscreteSeparated = K.split(inputAction, K.constant(splits, dtype: DataType.Int32), K.constant(1, dtype: DataType.Int32), ActionSizes.Length);

            Tensor actionLogProb = null;
            using (K.name_scope("ActionProbAndEntropy"))
            {
                onehotInputActions = inputActionsDiscreteSeparated.Select((x, i) => K.reshape(K.one_hot(x, K.constant <int>(ActionSizes[i], dtype: DataType.Int32), K.constant(1.0f), K.constant(0.0f)), new int[] { -1, ActionSizes[i] })).ToList();

                //entropy
                var entropies = outputActionsLogits.Select((t) => { return(K.mean((-1.0f) * K.sum(K.softmax(t) * K.log(K.softmax(t) + 0.00000001f), axis: 1), 0)); });
                outputEntropy = entropies.Aggregate((x, y) => { return(x + y); });

                //probabilities
                var actionProbsArray = ActionSizes.Select((x, i) => { return(K.sum(outputNormalizedLogits[i] * onehotInputActions[i], 1, true)); }).ToList();
                //actionLogProb = K.reshape(K.sum(K.log(outputActionFromNetwork) * onehotInputAction, 1), new int[] { -1, 1 });
                actionLogProb = ActionSizes.Length == 1 ? actionProbsArray[0]:K.concat(actionProbsArray, 1);
            }

            List <Tensor> extraInputs = new List <Tensor>();
            extraInputs.AddRange(actionFunctionInputs);
            extraInputs.Add(inputAction);

            CreatePPOOptimizer(trainingParams, outputEntropy, actionLogProb, outputValue, extraInputs, network.GetWeights());
        }
    }
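
The entropy term above is the softmax entropy of each branch's logits, summed across branches, with a small epsilon keeping the log finite. Here is a standalone sketch of one branch's entropy (illustrative names):

    using System;
    using System.Linq;

    static class SoftmaxEntropyDemo
    {
        static double[] Softmax(double[] logits)
        {
            double max = logits.Max(); // subtract the max for numerical stability
            var exps = logits.Select(l => Math.Exp(l - max)).ToArray();
            double sum = exps.Sum();
            return exps.Select(e => e / sum).ToArray();
        }

        // entropy = -sum(p * log(p + eps)), matching the expression in the example above
        static double Entropy(double[] logits)
        {
            var p = Softmax(logits);
            return -p.Sum(x => x * Math.Log(x + 1e-8));
        }

        static void Main()
        {
            Console.WriteLine(Entropy(new[] { 0.0, 0.0, 0.0 }));  // uniform: maximal entropy, log(3)
            Console.WriteLine(Entropy(new[] { 10.0, 0.0, 0.0 })); // peaked: near-zero entropy
        }
    }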
Code Example #5
    /// <summary>
    /// Initialize the model; the training parts are only built when trainerParams is not null
    /// </summary>
    /// <param name="brainParameters"></param>
    public override void InitializeInner(BrainParameters brainParameters, Tensor vectorObsTensor, List <Tensor> visualTensors, TrainerParams trainerParams)
    {
        //vector observation normalization
        Tensor normalizedVectorObs = vectorObsTensor;

        if (useInputNormalization && HasVectorObservation)
        {
            normalizedVectorObs = CreateRunninngNormalizer(normalizedVectorObs, StateSize);
        }
        else if (useInputNormalization)
        {
            Debug.LogWarning("useInputNormalization is turned off because it is not supported in this case");
            useInputNormalization = false;
        }



        //build all stuff
        if (trainerParams is TrainerParamsPPO || mode == Mode.PPO)
        {
            mode = Mode.PPO;
            if (ActionSpace == SpaceType.continuous)
            {
                InitializePPOStructureContinuousAction(vectorObsTensor, normalizedVectorObs, visualTensors, trainerParams);
            }
            else if (ActionSpace == SpaceType.discrete)
            {
                InitializePPOStructureDiscreteAction(vectorObsTensor, normalizedVectorObs, visualTensors, trainerParams);
            }
        }
        else if (mode == Mode.SupervisedLearning || trainerParams is TrainerParamsMimic)
        {
            mode = Mode.SupervisedLearning;
            if (ActionSpace == SpaceType.continuous)
            {
                InitializeSLStructureContinuousAction(vectorObsTensor, normalizedVectorObs, visualTensors, trainerParams);
            }
            else
            {
                InitializeSLStructureDiscreteAction(vectorObsTensor, normalizedVectorObs, visualTensors, trainerParams);
            }
        }
    }
Code Example #6
    public override void InitializeInner(BrainParameters brainParameters, Tensor stateTensor, List <Tensor> visualTensors, TrainerParams trainerParams)
    {
        if (brainParameters.cameraResolutions != null && brainParameters.cameraResolutions.Length != 0)
        {
            Debug.LogError("GAN for ML agent does not support visual input yet");
        }
        Debug.Assert(brainParameters.vectorActionSpaceType == SpaceType.continuous, "GAN for ML agent does not support discrete action space.");

        TrainerParams = trainerParams as TrainerParamsGAN;
        if (trainerParams != null && TrainerParams == null)
        {
            Debug.LogError("Trainer params for GAN needs to be a TrainerParamsGAN type");
        }

        outputShape         = new int[] { ActionSize };
        inputConditionShape = new int[] { StateSize };

        Initialize(trainerParams != null);
    }
Code Example #7
    /// <summary>
    /// Initialize the model; the training parts are only built when trainerParams is not null
    /// </summary>
    /// <param name="brainParameters"></param>
    public override void InitializeInner(BrainParameters brainParameters, Tensor stateTensor, List <Tensor> visualTensors, TrainerParams trainerParams)
    {
        Debug.Assert(ActionSpace == SpaceType.continuous, "RLModelPPOCMA only support continuous action space.");
        Tensor inputStateTensorToNetwork = stateTensor;

        if (useInputNormalization && HasVectorObservation)
        {
            inputStateTensorToNetwork = CreateRunninngNormalizer(inputStateTensorToNetwork, StateSize);
        }


        //build the network
        Tensor outputValue = null; Tensor outputAction = null; Tensor outActionLogVariance = null;

        network.BuildNetworkForContinuousActionSapce(inputStateTensorToNetwork, visualTensors, null, null, ActionSizes[0], out outputAction, out outputValue, out outActionLogVariance);


        InitializePPOCMAStructures(trainerParams, stateTensor, visualTensors, outputValue, outputAction, outActionLogVariance, network.GetCriticWeights(), network.GetActorMeanWeights(), network.GetActorVarianceWeights());
    }
Code Example #8
        public override List <double[]> Train(TrainerParams trainParams)
        {
            AdamParams passedParams = (AdamParams)trainParams;
            //int valSplitSize = 0;
            List <double[]> learningCurve      = new List <double[]>();
            List <int>      trainingSetIndices = Enumerable.Range(0, passedParams.trainingSet.Labels.RowCount).ToList();
            List <int>      testSetIndices     = null;
            DataSet         test = new DataSet(null, null);

            if (passedParams.validationSet != null)
            {
                testSetIndices = Enumerable.Range(0, passedParams.validationSet.Labels.RowCount).ToList();

                /*  if (shuffle)
                 * {
                 *    testSetIndices.Shuffle();
                 * }
                 */
                test.Inputs = CreateMatrix.Dense(testSetIndices.Count, passedParams.validationSet.Inputs.ColumnCount, 0.0);
                test.Labels = CreateMatrix.Dense(testSetIndices.Count, passedParams.validationSet.Labels.ColumnCount, 0.0);
                for (int i = 0; i < testSetIndices.Count; i++)
                {
                    test.Inputs.SetRow(i, passedParams.validationSet.Inputs.Row(testSetIndices[i])); //, 1, 0, Dataset.Inputs.ColumnCount));
                    test.Labels.SetRow(i, passedParams.validationSet.Labels.Row(testSetIndices[i])); //.SubMatrix(trainingSetIndices[batchIndex], 1, 0, Dataset.Labels.ColumnCount));
                }
            }
            if (passedParams.shuffle)
            {
                trainingSetIndices.Shuffle();
            }



            Matrix <double> batchesIndices = null;                           //a 2d matrix of shape (numberOfBatches, 2); each row is a batch: row[0] = batch start, row[1] = batch end
            Dictionary <int, Matrix <double> > previousWeightsUpdate = null; //for the momentum updates

            int adamUpdateStep = 1;

            for (int epoch = 1; epoch <= passedParams.numberOfEpochs; epoch++)
            {
                if (passedParams.batchSize != null)//build a matrix "batchesIndices" whose rows hold the start and end index of each batch
                {
                    var numberOfBatches = (int)Math.Ceiling(((passedParams.trainingSet.Labels.RowCount / (double)(passedParams.batchSize))));
                    batchesIndices = CreateMatrix.Dense(numberOfBatches, 2, 0.0);
                    for (int j = 0; j < numberOfBatches; j++)
                    {
                        batchesIndices.SetRow(j, new double[] { j *(double)passedParams.batchSize, Math.Min(passedParams.trainingSet.Inputs.RowCount - 1, (j + 1) * (double)passedParams.batchSize - 1) });
                    }
                }
                else//put all of the dataset in one batch
                {
                    batchesIndices = CreateMatrix.Dense(1, 2, 0.0);
                    batchesIndices.SetRow(0, new double[] { 0, passedParams.trainingSet.Inputs.RowCount - 1 });
                }

                double iterationLoss = 0;//holds the average of the batch average losses; each batch contributes batchLoss / batchSize
                Dictionary <int, Matrix <double> > firstMoment  = new Dictionary <int, Matrix <double> >();
                Dictionary <int, Matrix <double> > secondMoment = new Dictionary <int, Matrix <double> >();
                Dictionary <int, Matrix <double> > mhat         = new Dictionary <int, Matrix <double> >();
                Dictionary <int, Matrix <double> > vhat         = new Dictionary <int, Matrix <double> >();

                Dictionary <int, Matrix <double> > prevFirstMoment  = new Dictionary <int, Matrix <double> >();
                Dictionary <int, Matrix <double> > prevSecondMoment = new Dictionary <int, Matrix <double> >();
                for (int batchIndex = 0; batchIndex < batchesIndices.RowCount; batchIndex++) //for each batch
                {
                    previousWeightsUpdate = passedParams.parallelize
                        ? Parallel_PerformBatchComputations(passedParams, batchesIndices, adamUpdateStep, ref iterationLoss, firstMoment, secondMoment, mhat, vhat, prevFirstMoment, prevSecondMoment, batchIndex)
                        : PerformBatchComputations(passedParams, batchesIndices, adamUpdateStep, ref iterationLoss, firstMoment, secondMoment, mhat, vhat, prevFirstMoment, prevSecondMoment, batchIndex);
                }
                adamUpdateStep++;


                iterationLoss /= batchesIndices.RowCount;



                // computing the test loss:
                double validationError = 0;
                validationError = passedParams.parallelize ? Parallel_ComputeValidationLoss(passedParams, testSetIndices, test) : ComputeValidationLoss(passedParams, testSetIndices, test);
                double trainingAccuracy = 0, validationSetAccuracy = 0;

                if (passedParams.trueThreshold != null)
                {
                    trainingAccuracy      = Utilities.Tools.ComputeAccuracy(passedParams.network, passedParams.trainingSet, passedParams.trueThreshold);
                    validationSetAccuracy = Utilities.Tools.ComputeAccuracy(passedParams.network, passedParams.validationSet, passedParams.trueThreshold);
                }


                learningCurve.Add(new double[] { iterationLoss, passedParams.validationSet != null ? validationError : 0, passedParams.trueThreshold != null ? trainingAccuracy : 0, passedParams.trueThreshold != null ? validationSetAccuracy : 0 });
                if (passedParams.PrintLoss)
                {
                    Console.ForegroundColor = ConsoleColor.Green;
                    Console.WriteLine("Epoch:{0} train loss:{1} - validation loss:{2}", epoch, iterationLoss, validationError);
                }


                Console.ResetColor();
            }
            return(learningCurve);
        }
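
PerformBatchComputations is not shown here, but the dictionaries threaded through it (firstMoment, secondMoment, mhat, vhat) are the standard Adam optimizer state. Note that adamUpdateStep above advances once per epoch rather than once per batch, which differs from the usual per-update formulation. For reference, a minimal standalone sketch of one Adam step on a flattened weight array; the hyperparameter names follow the Adam paper, not this code base.

    using System;

    static class AdamDemo
    {
        static void AdamStep(double[] w, double[] grad, double[] m, double[] v, int t,
                             double lr = 0.001, double beta1 = 0.9, double beta2 = 0.999, double eps = 1e-8)
        {
            for (int i = 0; i < w.Length; i++)
            {
                m[i] = beta1 * m[i] + (1 - beta1) * grad[i];           // first moment estimate
                v[i] = beta2 * v[i] + (1 - beta2) * grad[i] * grad[i]; // second moment estimate
                double mhat = m[i] / (1 - Math.Pow(beta1, t));         // bias correction
                double vhat = v[i] / (1 - Math.Pow(beta2, t));
                w[i] -= lr * mhat / (Math.Sqrt(vhat) + eps);
            }
        }

        static void Main()
        {
            var w = new[] { 1.0 }; var m = new double[1]; var v = new double[1];
            for (int t = 1; t <= 1000; t++)
                AdamStep(w, new[] { 2.0 * w[0] }, m, v, t); // gradient of f(w) = w^2
            Console.WriteLine(w[0]); // approaches the minimum at 0
        }
    }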
Code Example #9
    /// <summary>
    /// Trainers will call this method to initialize the model. This method will call the InitializeInner()
    /// </summary>
    /// <param name="brainParameters">brain parameter of the MLagent brain</param>
    /// <param name="enableTraining">whether enable training</param>
    /// <param name="trainerParams">trainer parameters passed by the trainer. Training will not be enabled </param>
    public virtual void Initialize(BrainParameters brainParameters, bool enableTraining, TrainerParams trainerParams = null)
    {
        Debug.Assert(Initialized == false, "Model already Initialized");

        NameScope ns = null;

        if (!string.IsNullOrEmpty(modelName))
        {
            ns = Current.K.name_scope(modelName);
        }

        ActionSizes = brainParameters.vectorActionSize;
        StateSize   = brainParameters.vectorObservationSize * brainParameters.numStackedVectorObservations;
        ActionSpace = brainParameters.vectorActionSpaceType;

        Debug.Assert(ActionSizes[0] > 0, "Action size can not be zero");

        //create basic inputs
        var inputStateTensor = StateSize > 0 ? UnityTFUtils.Input(new int?[] { StateSize }, name: "InputStates")[0] : null;

        HasVectorObservation = inputStateTensor != null;
        var inputVisualTensors = CreateVisualInputs(brainParameters);

        HasVisualObservation = inputVisualTensors != null;

        //call the inner initialization
        InitializeInner(brainParameters, inputStateTensor, inputVisualTensors, enableTraining ? trainerParams : null);

        //test
        //Debug.LogWarning("Tensorflow Graph is saved for test purpose at: SavedGraph/" + name + ".pb");
        //((UnityTFBackend)Current.K).ExportGraphDef("SavedGraph/" + name + ".pb");

        Current.K.try_initialize_variables(true);

        if (ns != null)
        {
            ns.Dispose();
        }

        if (checkpointToLoad != null)
        {
            RestoreCheckpoint(checkpointToLoad.bytes, true);
        }
        Initialized     = true;
        TrainingEnabled = enableTraining;
    }
Code Example #10
    /// <summary>
    /// Initialize the model for PPO
    /// </summary>
    /// <param name="trainerParams"></param>
    /// <param name="stateTensor"></param>
    /// <param name="inputVisualTensors"></param>
    /// <param name="outputValueFromNetwork"></param>
    /// <param name="outputActionFromNetwork"></param>
    /// <param name="outputVarianceFromNetwork"></param>
    /// <param name="weightsToUpdate"></param>
    protected void InitializePPOStructures(TrainerParams trainerParams, Tensor stateTensor, List <Tensor> inputVisualTensors, Tensor outputValueFromNetwork, Tensor outputActionFromNetwork, Tensor outputVarianceFromNetwork, List <Tensor> weightsToUpdate)
    {
        List <Tensor> allobservationInputs = new List <Tensor>();

        if (HasVectorObservation)
        {
            allobservationInputs.Add(stateTensor);
        }
        if (HasVisualObservation)
        {
            allobservationInputs.AddRange(inputVisualTensors);
        }

        ValueFunction = K.function(allobservationInputs, new List <Tensor> {
            outputValueFromNetwork
        }, null, "ValueFunction");

        Tensor outputActualAction = null; Tensor actionProb = null;

        if (ActionSpace == SpaceType.continuous)
        {
            using (K.name_scope("SampleAction"))
            {
                outputActualAction = K.standard_normal(K.shape(outputActionFromNetwork), DataType.Float) * K.sqrt(outputVarianceFromNetwork) + outputActionFromNetwork;
            }
            using (K.name_scope("ActionProbs"))
            {
                actionProb = K.normal_probability(K.stop_gradient(outputActualAction), outputActionFromNetwork, outputVarianceFromNetwork);
            }
            ActionFunction = K.function(allobservationInputs, new List <Tensor> {
                outputActualAction, actionProb, outputActionFromNetwork, outputVarianceFromNetwork
            }, null, "ActionFunction");

            var probInputs = new List <Tensor>();
            probInputs.AddRange(allobservationInputs);
            probInputs.Add(outputActualAction);
            ActionProbabilityFunction = K.function(probInputs, new List <Tensor> {
                actionProb
            }, null, "ActionProbabilityFunction");
        }
        else
        {
            ActionFunction = K.function(allobservationInputs, new List <Tensor> {
                outputActionFromNetwork
            }, null, "ActionFunction");
        }

        TrainerParamsPPO trainingParams = trainerParams as TrainerParamsPPO;

        if (trainingParams != null)
        {
            //inputs needed for training

            var inputOldProb     = UnityTFUtils.Input(new int?[] { ActionSpace == SpaceType.continuous ? ActionSize : 1 }, name: "InputOldProb")[0];
            var inputAdvantage   = UnityTFUtils.Input(new int?[] { 1 }, name: "InputAdvantage")[0];
            var inputTargetValue = UnityTFUtils.Input(new int?[] { 1 }, name: "InputTargetValue")[0];
            var inputOldValue    = UnityTFUtils.Input(new int?[] { 1 }, name: "InputOldValue")[0];

            ClipEpsilon       = trainingParams.clipEpsilon;
            ValueLossWeight   = trainingParams.valueLossWeight;
            EntropyLossWeight = trainingParams.entropyLossWeight;

            var inputClipEpsilon       = UnityTFUtils.Input(batch_shape: new int?[] { }, name: "ClipEpsilon", dtype: DataType.Float)[0];
            var inputValuelossWeight   = UnityTFUtils.Input(batch_shape: new int?[] { }, name: "ValueLossWeight", dtype: DataType.Float)[0];
            var inputEntropyLossWeight = UnityTFUtils.Input(batch_shape: new int?[] { }, name: "EntropyLossWeight", dtype: DataType.Float)[0];

            // action probability from input action
            Tensor outputEntropy;
            Tensor inputActionDiscrete = null, onehotInputAction = null;    //for discrete action space

            if (ActionSpace == SpaceType.continuous)
            {
                using (K.name_scope("Entropy"))
                {
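                    // entropy of a diagonal Gaussian: 0.5 * log(2*pi*e*variance)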
                    var temp = K.mul(outputVarianceFromNetwork, 2 * Mathf.PI * 2.7182818285);
                    temp = K.mul(K.log(temp), 0.5);
                    if (outputVarianceFromNetwork.shape.Length == 2)
                    {
                        outputEntropy = K.mean(K.mean(temp, 0, false), name: "OutputEntropy");
                    }
                    else
                    {
                        outputEntropy = K.mean(temp, 0, false, name: "OutputEntropy");
                    }
                }
            }
            else
            {
                using (K.name_scope("ActionProbAndEntropy"))
                {
                    inputActionDiscrete = UnityTFUtils.Input(new int?[] { 1 }, name: "InputAction", dtype: DataType.Int32)[0];
                    onehotInputAction   = K.one_hot(inputActionDiscrete, K.constant <int>(ActionSize, dtype: DataType.Int32), K.constant(1.0f), K.constant(0.0f));
                    onehotInputAction   = K.reshape(onehotInputAction, new int[] { -1, ActionSize });
                    outputEntropy       = K.mean((-1.0f) * K.sum(outputActionFromNetwork * K.log(outputActionFromNetwork + 0.00000001f), axis: 1), 0);
                    actionProb          = K.reshape(K.sum(outputActionFromNetwork * onehotInputAction, 1), new int[] { -1, 1 });
                }
            }

            // value loss
            Tensor outputValueLoss = null;
            using (K.name_scope("ValueLoss"))
            {
                var clippedValueEstimate = inputOldValue + K.clip(outputValueFromNetwork - inputOldValue, 0.0f - inputClipEpsilon, inputClipEpsilon);
                var valueLoss1           = new MeanSquareError().Call(outputValueFromNetwork, inputTargetValue);
                var valueLoss2           = new MeanSquareError().Call(clippedValueEstimate, inputTargetValue);
                outputValueLoss = K.mean(K.maximum(valueLoss1, valueLoss2));
            }
            //var outputValueLoss = K.mean(valueLoss1);

            // Clipped Surrogate loss
            Tensor outputPolicyLoss;
            using (K.name_scope("ClippedCurreogateLoss"))
            {
                //Debug.LogWarning("testnew");
                //var probStopGradient = K.stop_gradient(actionProb);
                var probRatio = actionProb / (inputOldProb + 0.0000000001f);
                var p_opt_a   = probRatio * inputAdvantage;
                var p_opt_b   = K.clip(probRatio, 1.0f - inputClipEpsilon, 1.0f + inputClipEpsilon) * inputAdvantage;

                outputPolicyLoss = (-1f) * K.mean(K.mean(K.minimun(p_opt_a, p_opt_b)), name: "ClippedCurreogateLoss");
            }
            //final weighted loss
            var outputLoss = outputPolicyLoss + inputValuelossWeight * outputValueLoss;
            outputLoss = outputLoss - inputEntropyLossWeight * outputEntropy;
            outputLoss = K.identity(outputLoss, "OutputLoss");

            //add inputs, outputs and parameters to the list
            List <Tensor> allInputs = new List <Tensor>();
            if (HasVectorObservation)
            {
                allInputs.Add(stateTensor);
            }
            if (HasVisualObservation)
            {
                allInputs.AddRange(inputVisualTensors);
            }
            if (ActionSpace == SpaceType.continuous)
            {
                allInputs.Add(outputActualAction);
            }
            else
            {
                allInputs.Add(inputActionDiscrete);
            }

            allInputs.Add(inputOldProb);
            allInputs.Add(inputTargetValue);
            allInputs.Add(inputOldValue);
            allInputs.Add(inputAdvantage);
            allInputs.Add(inputClipEpsilon);
            allInputs.Add(inputValuelossWeight);
            allInputs.Add(inputEntropyLossWeight);

            //create optimizer and create necessary functions
            var updates = AddOptimizer(weightsToUpdate, outputLoss, optimizer);
            UpdatePPOFunction = K.function(allInputs, new List <Tensor> {
                outputLoss, outputValueLoss, outputPolicyLoss, outputEntropy, actionProb
            }, updates, "UpdateFunction");
        }
    }
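
The ClippedCurreogateLoss scope above implements PPO's clipped surrogate objective, L = -mean(min(r*A, clip(r, 1-eps, 1+eps)*A)) with probability ratio r = p_new / p_old. A standalone sketch of that objective over a batch (illustrative names):

    using System;

    static class PpoClipDemo
    {
        // PPO clipped surrogate loss: -mean(min(r*A, clip(r, 1-eps, 1+eps)*A))
        static double ClippedSurrogateLoss(double[] newProb, double[] oldProb,
                                           double[] advantage, double clipEpsilon)
        {
            double sum = 0;
            for (int i = 0; i < newProb.Length; i++)
            {
                double ratio   = newProb[i] / (oldProb[i] + 1e-10); // epsilon avoids division by zero
                double clipped = Math.Max(1.0 - clipEpsilon, Math.Min(1.0 + clipEpsilon, ratio));
                sum += Math.Min(ratio * advantage[i], clipped * advantage[i]);
            }
            return -sum / newProb.Length;
        }

        static void Main()
        {
            var newP = new[] { 0.6, 0.1 };
            var oldP = new[] { 0.5, 0.2 };
            var adv  = new[] { 1.0, -1.0 };
            Console.WriteLine(ClippedSurrogateLoss(newP, oldP, adv, 0.2)); // clipping bounds the policy update
        }
    }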
Code Example #11
    /// <summary>
    /// Initialize the model; the training parts are only built when trainerParams is not null
    /// </summary>
    /// <param name="brainParameters"></param>
    public override void InitializeInner(BrainParameters brainParameters, Tensor stateTensor, List <Tensor> visualTensors, TrainerParams trainerParams)
    {
        Tensor inputStateTensorToNetwork = stateTensor;

        if (useInputNormalization && HasVectorObservation)
        {
            inputStateTensorToNetwork = CreateRunninngNormalizer(inputStateTensorToNetwork, StateSize);
        }


        //build the network
        Tensor outputValue = null; Tensor outputAction = null; Tensor outputVariance = null;

        network.BuildNetwork(inputStateTensorToNetwork, visualTensors, null, null, ActionSize, ActionSpace, out outputAction, out outputValue, out outputVariance);

        if (trainerParams is TrainerParamsPPO || mode == Mode.PPO)
        {
            mode = Mode.PPO;
            InitializePPOStructures(trainerParams, stateTensor, visualTensors, outputValue, outputAction, outputVariance, network.GetWeights());
        }
        else if (mode == Mode.SupervisedLearning || trainerParams is TrainerParamsMimic)
        {
            mode = Mode.SupervisedLearning;
            InitializeSLStructures(trainerParams, stateTensor, visualTensors, outputAction, outputVariance, network.GetActorWeights());
        }
    }
Code Example #12
    /// <summary>
    /// Initialize the model; the training parts are only built when trainerParams is not null
    /// </summary>
    /// <param name="brainParameters"></param>
    public override void InitializeInner(BrainParameters brainParameters, Tensor stateTensor, List <Tensor> visualTensors, TrainerParams trainerParams)
    {
        Debug.Assert(visualTensors == null, "RLModelPPOHierarchy does not support visual input yet");


        if (highLevelObservationSize > 0)
        {
            var splitTensors = K.split(stateTensor, K.constant(new int[] { lowLevelObservationSize, highLevelObservationSize }, dtype: DataType.Int32), K.constant(1, dtype: DataType.Int32), 2);
            inputLowLevelTensor  = splitTensors[0];
            inputHighLevelTensor = splitTensors[1];
        }
        else
        {
            inputLowLevelTensor = stateTensor;
        }

        List <Tensor> inputVisualTensors = visualTensors;

        if (useInputNormalization && HasVectorObservation)
        {
            inputLowLevelTensor = CreateRunninngNormalizer(inputLowLevelTensor, StateSize);
        }



        Tensor outputValue = null; Tensor outputAction = null; Tensor outputVariance = null;

        //build the network
        networkHierarchy.BuildNetwork(inputLowLevelTensor, inputHighLevelTensor, ActionSize, ActionSpace, out outputAction, out outputValue, out outputVariance);

        InitializePPOStructures(trainerParams, stateTensor, inputVisualTensors, outputValue, outputAction, outputVariance, networkHierarchy.GetHighLevelWeights());
    }
Code Example #13
    /// <summary>
    /// Initialize the model; the training parts are only built when trainerParams is not null
    /// </summary>
    /// <param name="brainParameters"></param>
    public override void InitializeInner(BrainParameters brainParameters, Tensor vectorObsTensor, List <Tensor> visualTensors, TrainerParams trainerParams)
    {
        //vector observation normalization
        Tensor normalizedVectorObs = vectorObsTensor;

        if (useInputNormalization && HasVectorObservation)
        {
            normalizedVectorObs = CreateRunninngNormalizer(normalizedVectorObs, StateSize);
        }



        //build all stuff
        if (trainerParams is TrainerParamsPPO || mode == Mode.PPO)
        {
            mode = Mode.PPO;
            if (ActionSpace == SpaceType.continuous)
            {
                InitializePPOStructureContinuousAction(vectorObsTensor, normalizedVectorObs, visualTensors, trainerParams);
            }
            else if (ActionSpace == SpaceType.discrete)
            {
                InitializePPOStructureDiscreteAction(vectorObsTensor, normalizedVectorObs, visualTensors, trainerParams);
            }
        }
        else if (mode == Mode.SupervisedLearning || trainerParams is TrainerParamsMimic)
        {
            Debug.Assert(ActionSizes.Length <= 1, "Supervised learning does not support branching yet");
            mode = Mode.SupervisedLearning;
            if (ActionSpace == SpaceType.continuous)
            {
                InitializeSLStructures(trainerParams, vectorObsTensor, normalizedVectorObs, visualTensors);
            }
            else
            {
                Debug.LogError("Discrete action is not support for supervised learning yet");
            }
        }
    }
Code Example #14
        /// <summary>
        /// Given a network architecture, the desired dataset parameters, and datasets for training and testing,
        /// this function runs a forward propagation iteration, calculates the errors, and backpropagates them to update the network weights.
        /// For the accuracy computations we assume a single-output classification problem; otherwise we would need to implement cross entropy instead of MSE (perhaps later :) )
        /// </summary>
        /// <param name="network">a network architecture</param>
        /// <param name="Dataset"> the training dataset</param>
        /// <param name="learningRate"> the initial learning rate</param>
        /// <param name="numberOfEpochs">number of Dataset epochs</param>
        /// <param name="shuffle">set to true, will shuffle the trainingdataWithBias</param>
        /// <param name="batchSize">the Dataset trainingdataWithBias batch size</param>
        /// <param name="debug">set to true, will print verbose messages to the screen</param>
        /// <param name="regularizationRate">the L2 regularization rate used</param>
        /// <param name="regularization">Regularization method used, only L2 is implemented for now</param>
        /// <param name="momentum">Momentum rate</param>
        /// <param name="resilient">set to true, will enable the resilient property where the learning rate is multiplied by resilientUpdateAccelerationRate in case previous update was same sign as current update and resilientUpdateSlowDownRate otherwise</param>
        /// <param name="resilientUpdateAccelerationRate"> if resilient is set to true, the learning rate will be multiplied by this value in case the sign of the previous weights updates was the same as the current new one</param>
        /// <param name="resilientUpdateSlowDownRate">if resilient is set to true, the learning rate will be multiplied by this value in case the sign of the previous weights updates was NOT the same as the current new one</param>
        /// <param name="validationSet"> the validation dataset</param>
        /// <param name="trueThreshold"> between 0 to 1, if present accuracy of the Dataset and validation trainingdataWithBias will be computed at each epoch and reported in the returned learning curve list of doubles </param>
        /// <param name="MEE">set to true, will report the Mean Euclidean Error instead of Mean Square Error</param>
        /// <param name="reduceLearningRate">set to true, will enable reducing the learning rate during training</param>
        /// <param name="learningRateReduction">will be multiplied by the learning rate during training</param>
        /// <param name="learningRateReductionAfterEpochs">After how many epoch shall the learningRateReduction be multiplied by the learning rate</param>
        /// <param name="numberOfReductions">number of time a reduction shall happen</param>
        /// <returns>a list of double arrays; each element is a 4-element array "a": a[0] = training loss (MSE), a[1] = validation error (MSE), a[2] = training set accuracy, a[3] = validation set accuracy</returns>

        //Network network, DataSet trainingSet, double learningRate, int numberOfEpochs, bool shuffle = false, int? batchSize = null, bool debug = false, double regularizationRate = 0, Regularizations regularization = Regularizations.None, double momentum = 0, bool resilient = false, double resilientUpdateAccelerationRate = 1, double resilientUpdateSlowDownRate = 1, DataSet validationSet = null, double? trueThreshold = 0.5, bool MEE = false, bool reduceLearningRate = false, double learningRateReduction = 0.5, int learningRateReductionAfterEpochs = 1000, int numberOfReductions = 2, bool nestrov = false
        public abstract List <double[]> Train(TrainerParams trainParams);
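
The resilient option documented above adapts the step size from the sign of consecutive weight updates: the learning rate is multiplied by resilientUpdateAccelerationRate when the sign is unchanged and by resilientUpdateSlowDownRate when it flips. A minimal standalone sketch of that rule for a single weight; this is an illustration of the documented behavior, not the actual implementation.

    using System;

    static class ResilientRateDemo
    {
        // adapt a per-weight learning rate from the signs of the previous and current update
        static double AdaptRate(double rate, double prevUpdate, double currentUpdate,
                                double accelerationRate, double slowDownRate)
        {
            if (prevUpdate * currentUpdate > 0) return rate * accelerationRate; // same sign: speed up
            if (prevUpdate * currentUpdate < 0) return rate * slowDownRate;     // sign flipped: slow down
            return rate;                                                        // a zero update: keep the rate
        }

        static void Main()
        {
            double rate = 0.1;
            rate = AdaptRate(rate, -0.5, -0.2, 1.2, 0.5); // same sign -> 0.12
            rate = AdaptRate(rate, -0.2, +0.3, 1.2, 0.5); // sign flip -> 0.06
            Console.WriteLine(rate);
        }
    }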
Code Example #15
    protected void InitializeSLStructureDiscreteAction(Tensor vectorObs, Tensor normalizedVectorObs, List <Tensor> visualObs, TrainerParams trainerParams)
    {
        //all inputs list
        List <Tensor> allObservationInputs = new List <Tensor>();

        if (HasVectorObservation)
        {
            allObservationInputs.Add(vectorObs);
        }
        if (HasVisualObservation)
        {
            allObservationInputs.AddRange(visualObs);
        }

        //build basic network
        Tensor[] outputActionsLogits = null;
        Tensor   outputValue         = null;

        network.BuildNetworkForDiscreteActionSpace(normalizedVectorObs, visualObs, null, null, ActionSizes, out outputActionsLogits, out outputValue);

        //the action masks input placeholders
        List <Tensor> actionMasksInputs = new List <Tensor>();

        for (int i = 0; i < ActionSizes.Length; ++i)
        {
            actionMasksInputs.Add(UnityTFUtils.Input(new int?[] { ActionSizes[i] }, name: "AcionMask" + i)[0]);
        }
        //apply masking and normalization to get the final action tensor
        Tensor[] outputActions, outputNormalizedLogits;
        CreateDiscreteActionMaskingLayer(outputActionsLogits, actionMasksInputs.ToArray(), out outputActions, out outputNormalizedLogits);

        //output tensors for discrete actions: the selected actions of all branches
        var outputDiscreteActions = new List <Tensor>();

        outputDiscreteActions.Add(K.identity(K.cast(ActionSizes.Length == 1 ? outputActions[0] : K.concat(outputActions.ToList(), 1), DataType.Float), "OutputAction"));
        var actionFunctionInputs = new List <Tensor>();

        actionFunctionInputs.AddRange(allObservationInputs);
        actionFunctionInputs.AddRange(actionMasksInputs);
        ActionFunction = K.function(actionFunctionInputs, outputDiscreteActions, null, "ActionFunction");


        //build the parts for training
        TrainerParamsMimic trainingParams = trainerParams as TrainerParamsMimic;

        if (trainerParams != null && trainingParams == null)
        {
            Debug.LogError("Trainer params for Supervised learning mode needs to be a TrainerParamsMimic type");
        }
        if (trainingParams != null)
        {
            //training inputs
            var inputActionLabels = UnityTFUtils.Input(new int?[] { ActionSizes.Length }, name: "InputAction", dtype: DataType.Int32)[0];
            //split the input for each discrete branch
            List <Tensor> inputActionsDiscreteSeparated = null, onehotInputActions = null;    //for discrete action space
            var           splits = new int[ActionSizes.Length];
            for (int i = 0; i < splits.Length; ++i)
            {
                splits[i] = 1;
            }
            inputActionsDiscreteSeparated = K.split(inputActionLabels, K.constant(splits, dtype: DataType.Int32), K.constant(1, dtype: DataType.Int32), ActionSizes.Length);

            //create the loss
            onehotInputActions = inputActionsDiscreteSeparated.Select((x, i) => K.reshape(K.one_hot(x, K.constant <int>(ActionSizes[i], dtype: DataType.Int32), K.constant(1.0f), K.constant(0.0f)), new int[] { -1, ActionSizes[i] })).ToList();

            var    losses = onehotInputActions.Select((x, i) => K.mean(K.categorical_crossentropy(x, outputNormalizedLogits[i], true))).ToList();
            Tensor loss = losses.Aggregate((x, s) => x + s);

            //add inputs, outputs and parameters to the list
            List <Tensor> updateParameters = network.GetActorWeights();
            List <Tensor> allInputs = new List <Tensor>();
            allInputs.AddRange(actionFunctionInputs);
            allInputs.Add(inputActionLabels);

            //create optimizer and create necessary functions
            var updates = AddOptimizer(updateParameters, loss, optimizer);
            UpdateSLFunction = K.function(allInputs, new List <Tensor> {
                loss
            }, updates, "UpdateFunction");
        }
    }
Code Example #16
File: Gradientdescent.cs Project: lilanpei/Monks
        //Network network, DataSet trainingSet, double learningRate, int numberOfEpochs, bool shuffle = false, int? batchSize = null, bool debug = false, double regularizationRate = 0, Regularizations regularization = Regularizations.None, double momentum = 0, bool resilient = false, double resilientUpdateAccelerationRate = 1, double resilientUpdateSlowDownRate = 1, DataSet validationSet = null, double? trueThreshold = 0.5, bool MEE = false, bool reduceLearningRate = false, double learningRateReduction = 0.5, int learningRateReductionAfterEpochs = 1000, int numberOfReductions = 2, bool nestrov = false
        public override List <double[]> Train(TrainerParams trainParams)
        {
            GradientDescentParams passedParams = (GradientDescentParams)trainParams;

            //int valSplitSize = 0;
            List <double[]> learningCurve      = new List <double[]>();
            List <int>      trainingSetIndices = Enumerable.Range(0, passedParams.trainingSet.Labels.RowCount).ToList();
            List <int>      testSetIndices     = null;
            DataSet         test = new DataSet(null, null);

            if (passedParams.validationSet != null)
            {
                testSetIndices = Enumerable.Range(0, passedParams.validationSet.Labels.RowCount).ToList();

                /*  if (shuffle)
                 * {
                 *    testSetIndices.Shuffle();
                 * }
                 */
                test.Inputs = CreateMatrix.Dense(testSetIndices.Count, passedParams.validationSet.Inputs.ColumnCount, 0.0);
                test.Labels = CreateMatrix.Dense(testSetIndices.Count, passedParams.validationSet.Labels.ColumnCount, 0.0);
                for (int i = 0; i < testSetIndices.Count; i++)
                {
                    test.Inputs.SetRow(i, passedParams.validationSet.Inputs.Row(testSetIndices[i])); //, 1, 0, Dataset.Inputs.ColumnCount));
                    test.Labels.SetRow(i, passedParams.validationSet.Labels.Row(testSetIndices[i])); //.SubMatrix(trainingSetIndices[batchIndex], 1, 0, Dataset.Labels.ColumnCount));
                }
            }
            if (passedParams.shuffle)
            {
                trainingSetIndices.Shuffle();
            }



            Matrix <double> batchesIndices = null;                                                               //a 2d matrix of shape (numberOfBatches, 2); each row is a batch: row[0] = batch start, row[1] = batch end
            Dictionary <int, Matrix <double> > previousWeightsUpdate = null;                                     //for the momentum updates
            Dictionary <int, Matrix <double> > PreviousUpdateSigns   = new Dictionary <int, Matrix <double> >(); //for resilient backpropagation: if the sign changes we slow down by the slow-down ratio; if it stays the same we accelerate by the acceleration ratio


            for (int epoch = 0; epoch < passedParams.numberOfEpochs; epoch++)
            {
                if (passedParams.batchSize != null)//build a matrix "batchesIndices" whose rows hold the start and end index of each batch
                {
                    var numberOfBatches = (int)Math.Ceiling(((passedParams.trainingSet.Labels.RowCount / (double)(passedParams.batchSize))));
                    batchesIndices = CreateMatrix.Dense(numberOfBatches, 2, 0.0);
                    for (int j = 0; j < numberOfBatches; j++)
                    {
                        batchesIndices.SetRow(j, new double[] { j *(double)passedParams.batchSize, Math.Min(passedParams.trainingSet.Inputs.RowCount - 1, (j + 1) * (double)passedParams.batchSize - 1) });
                    }
                }
                else//put all of the dataset in one batch
                {
                    batchesIndices = CreateMatrix.Dense(1, 2, 0.0);
                    batchesIndices.SetRow(0, new double[] { 0, passedParams.trainingSet.Inputs.RowCount - 1 });
                }

                double epochLoss = 0;//holds the average of the batch average losses; each batch contributes batchLoss / batchSize


                for (int batchIdx = 0; batchIdx < batchesIndices.RowCount; batchIdx++)//for each batch
                {
                    PerformBatchComputations(passedParams, batchesIndices, ref previousWeightsUpdate, PreviousUpdateSigns, epoch, ref epochLoss, batchIdx);
                }
                epochLoss /= batchesIndices.RowCount;

                double validationError = passedParams.parallelize ? Parallel_ComputeValidationLoss(passedParams, testSetIndices, test) : ComputeValidationLoss(passedParams, testSetIndices, test);
                double trainingAccuracy = 0, validationSetAccuracy = 0;

                if (passedParams.trueThreshold != null)
                {
                    trainingAccuracy      = Utilities.Tools.ComputeAccuracy(passedParams.network, passedParams.trainingSet, passedParams.trueThreshold);
                    validationSetAccuracy = Utilities.Tools.ComputeAccuracy(passedParams.network, passedParams.validationSet, passedParams.trueThreshold);
                }


                learningCurve.Add(new double[] { epochLoss, passedParams.validationSet != null ? validationError : 0, passedParams.trueThreshold != null ? trainingAccuracy : 0, passedParams.trueThreshold != null ? validationSetAccuracy : 0 });
                if (passedParams.PrintLoss)
                {
                    Console.ForegroundColor = ConsoleColor.Green;
                    Console.WriteLine("Epoch:{0} train loss:{1} - validation loss:{2}", epoch, epochLoss, validationError);
                }
                if (passedParams.reduceLearningRate && epoch > 0 && passedParams.numberOfReductions > 0 && epoch % passedParams.learningRateReductionAfterEpochs == 0)
                {
                    passedParams.learningRate *= passedParams.learningRateReduction;
                    passedParams.numberOfReductions--;
                    Console.ForegroundColor = ConsoleColor.Red;
                    Console.WriteLine("Learning Rate Reduced, now: {0}", passedParams.learningRate);
                }

                Console.ResetColor();
            }
            return(learningCurve);
        }
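
Both Train methods build batchesIndices the same way: row j holds the inclusive start and end row of batch j, with the last batch clamped to the dataset size. A standalone sketch of that index construction using plain arrays instead of MathNet matrices (illustrative names):

    using System;

    static class BatchIndicesDemo
    {
        // one (start, end) pair per batch; end is inclusive and clamped to rowCount - 1
        static (int start, int end)[] BuildBatchIndices(int rowCount, int batchSize)
        {
            int numberOfBatches = (int)Math.Ceiling(rowCount / (double)batchSize);
            var batches = new (int, int)[numberOfBatches];
            for (int j = 0; j < numberOfBatches; j++)
                batches[j] = (j * batchSize, Math.Min(rowCount - 1, (j + 1) * batchSize - 1));
            return batches;
        }

        static void Main()
        {
            foreach (var (start, end) in BuildBatchIndices(10, 4))
                Console.WriteLine($"{start}..{end}"); // 0..3, 4..7, 8..9
        }
    }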
Code Example #17
    protected void InitializeSLStructureContinuousAction(Tensor vectorObs, Tensor normalizedVectorObs, List <Tensor> visualObs, TrainerParams trainerParams)
    {
        //build the network
        Tensor outputValue = null; Tensor outputActionMean = null; Tensor outputLogVariance = null;

        network.BuildNetworkForContinuousActionSapce(normalizedVectorObs, visualObs, null, null, ActionSizes[0], out outputActionMean, out outputValue, out outputLogVariance);
        Tensor outputAction = outputActionMean;

        SLHasVar = outputLogVariance != null;
        // only take the exponent when the network actually outputs a log variance
        Tensor outputVar = SLHasVar ? K.exp(outputLogVariance) : null;

        List <Tensor> observationInputs = new List <Tensor>();

        if (HasVectorObservation)
        {
            observationInputs.Add(vectorObs);
        }
        if (HasVisualObservation)
        {
            observationInputs.AddRange(visualObs);
        }
        if (SLHasVar)
        {
            ActionFunction = K.function(observationInputs, new List <Tensor> {
                outputAction, outputVar
            }, null, "ActionFunction");
        }
        else
        {
            ActionFunction = K.function(observationInputs, new List <Tensor> {
                outputAction
            }, null, "ActionFunction");
        }

        //build the parts for training
        TrainerParamsMimic trainingParams = trainerParams as TrainerParamsMimic;

        if (trainerParams != null && trainingParams == null)
        {
            Debug.LogError("Trainer params for Supervised learning mode needs to be a TrainerParamsMimic type");
        }
        if (trainingParams != null)
        {
            //training inputs
            var inputActionLabel = UnityTFUtils.Input(new int?[] { ActionSizes[0] }, name: "InputAction", dtype: DataType.Float)[0];
            //create the loss
            Tensor loss = null;
            if (SLHasVar)
            {
                loss = K.mean(K.mean(0.5 * K.square(inputActionLabel - outputAction) / outputVar + 0.5 * outputLogVariance));
            }
            else
            {
                loss = K.mean(new MeanSquareError().Call(inputActionLabel, outputAction));
            }

            //add inputs, outputs and parameters to the list
            List <Tensor> updateParameters = network.GetActorWeights();
            List <Tensor> allInputs        = new List <Tensor>();
            allInputs.AddRange(observationInputs);
            allInputs.Add(inputActionLabel);

            //create optimizer and create necessary functions
            var updates = AddOptimizer(updateParameters, loss, optimizer);
            UpdateSLFunction = K.function(allInputs, new List <Tensor> {
                loss
            }, updates, "UpdateFunction");
        }
    }
Code Example #18
 public override void InitializeInner(BrainParameters brainParameters, Tensor inputStateTensor, List <Tensor> inputVisualTensors, TrainerParams trainerParams)
 {
     //build the network
     if (ActionSpace == SpaceType.continuous)
     {
         InitializeSLStructureContinuousAction(inputStateTensor, inputVisualTensors, trainerParams);
     }
     else if (ActionSpace == SpaceType.discrete)
     {
         InitializeSLStructureDiscreteAction(inputStateTensor, inputVisualTensors, trainerParams);
     }
 }
Code Example #19
 /// <summary>
 /// Implement this method for your learning model for use with ML-Agents. It is called by a Trainer. You should create everything including the neural network and the optimizer (if trainerParams is not null),
 /// using the input tensors.
 /// </summary>
 /// <param name="brainParameters">brain parameters of the ML-Agents brain</param>
 /// <param name="stateTensor">the input tensor of the vector observation</param>
 /// <param name="visualTensors">input tensors of visual observations</param>
 /// <param name="trainerParams">trainer parameters passed by the trainer. If null, training is not enabled and you do not have to implement the optimizing parts.</param>
 public abstract void InitializeInner(BrainParameters brainParameters, Tensor stateTensor, List <Tensor> visualTensors, TrainerParams trainerParams);
Code Example #20
    private void Awake()
    {
        trainer    = GetComponent <Trainer>();
        parameters = trainer.parameters;
    }
Code example #21
    public override void InitializeInner(BrainParameters brainParameters, Tensor inputStateTensor, List <Tensor> inputVisualTensors, TrainerParams trainerParams)
    {
        //build the network
        var    networkOutputs = network.BuildNetwork(inputStateTensor, inputVisualTensors, null, ActionSize, ActionSpace);
        Tensor outputAction   = networkOutputs.Item1;
        Tensor outputVar      = networkOutputs.Item2;

        hasVariance = outputVar != null && brainParameters.vectorActionSpaceType == SpaceType.continuous;

        List <Tensor> observationInputs = new List <Tensor>();

        if (HasVectorObservation)
        {
            observationInputs.Add(inputStateTensor);
        }
        if (HasVisualObservation)
        {
            observationInputs.AddRange(inputVisualTensors);
        }
        if (hasVariance)
        {
            ActionFunction = K.function(observationInputs, new List <Tensor> {
                outputAction, outputVar
            }, null, "ActionFunction");
        }
        else
        {
            ActionFunction = K.function(observationInputs, new List <Tensor> {
                outputAction
            }, null, "ActionFunction");
        }

        //build the parts for training
        TrainerParamsMimic trainingParams = trainerParams as TrainerParamsMimic;

        if (trainerParams != null && trainingParams == null)
        {
            Debug.LogError("Trainer params for Supervised learning mode needs to be a TrainerParamsMimic type");
        }
        if (trainingParams != null)
        {
            //training inputs
            var inputActionLabel = UnityTFUtils.Input(new int?[] { ActionSpace == SpaceType.continuous ? ActionSize : 1 }, name: "InputAction", dtype: ActionSpace == SpaceType.continuous ? DataType.Float : DataType.Int32)[0];
            //create the loss
            Tensor loss = null;
            if (ActionSpace == SpaceType.discrete)
            {
                Tensor actionOnehot   = K.one_hot(inputActionLabel, K.constant(ActionSize, dtype: DataType.Int32), K.constant(1.0f), K.constant(0.0f));
                Tensor reshapedOnehot = K.reshape(actionOnehot, new int[] { -1, ActionSize });
                loss = K.mean(K.categorical_crossentropy(reshapedOnehot, outputAction, false));
            }
            else
            {
                if (hasVariance)
                {
                    loss = K.mean(K.mean(0.5 * K.square(inputActionLabel - outputAction) / outputVar + 0.5 * K.log(outputVar)));
                }
                else
                {
                    loss = K.mean(new MeanSquareError().Call(inputActionLabel, outputAction));
                }
            }
            //add inputs, outputs and parameters to the list
            List <Tensor> updateParameters = network.GetWeights();
            List <Tensor> allInputs        = new List <Tensor>();
            //observationInputs was already populated above; reuse it here rather
            //than re-adding the tensors, which would duplicate them in that list
            allInputs.AddRange(observationInputs);
            allInputs.Add(inputActionLabel);

            //create optimizer and create necessary functions
            var updates = AddOptimizer(updateParameters, loss, optimizer);
            UpdateFunction = K.function(allInputs, new List <Tensor> {
                loss
            }, updates, "UpdateFunction");
        }
    }
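
In the discrete branch above, the integer action label is one-hot encoded and scored with categorical cross-entropy against the network's action probabilities. A scalar plain-C# sketch of that computation, assuming an already-normalized probability vector and using hypothetical names:

    using System;
    using System.Linq;

    static class CrossEntropySketch
    {
        // Categorical cross-entropy between a one-hot label and a probability
        // vector: -sum_i onehot[i] * log(p[i]), which reduces to -log(p[label]).
        static double CategoricalCrossEntropy(int labelIndex, double[] probs)
        {
            double[] onehot = new double[probs.Length];
            onehot[labelIndex] = 1.0;
            return -Enumerable.Range(0, probs.Length)
                   .Sum(i => onehot[i] * Math.Log(probs[i] + 1e-10));
        }

        static void Main()
        {
            double[] probs = { 0.1, 0.7, 0.2 }; // assumed network output
            Console.WriteLine(CategoricalCrossEntropy(1, probs)); // ~0.36, likely action
            Console.WriteLine(CategoricalCrossEntropy(0, probs)); // ~2.30, unlikely action
        }
    }
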
Code example #22
    /// <summary>
    /// Initialize the model for PPO-CMA
    /// </summary>
    /// <param name="trainerParams"></param>
    /// <param name="stateTensor"></param>
    /// <param name="inputVisualTensors"></param>
    /// <param name="outputValueFromNetwork"></param>
    /// <param name="outputActionMeanFromNetwork"></param>
    /// <param name="outActionLogVarianceFromNetwork"></param>
    /// <param name="valueWeights"></param>
    /// <param name="meanWeights"></param>
    /// <param name="varWeights"></param>
    protected void InitializePPOCMAStructures(TrainerParams trainerParams, Tensor stateTensor, List <Tensor> inputVisualTensors, Tensor outputValueFromNetwork, Tensor outputActionMeanFromNetwork, Tensor outActionLogVarianceFromNetwork, List <Tensor> valueWeights, List <Tensor> meanWeights, List <Tensor> varWeights)
    {
        List <Tensor> allobservationInputs = new List <Tensor>();

        if (HasVectorObservation)
        {
            allobservationInputs.Add(stateTensor);
        }
        if (HasVisualObservation)
        {
            allobservationInputs.AddRange(inputVisualTensors);
        }

        ValueFunction = K.function(allobservationInputs, new List <Tensor> {
            outputValueFromNetwork
        }, null, "ValueFunction");

        Tensor outputActualAction = null;
        Tensor outputVariance     = K.exp(outActionLogVarianceFromNetwork);

        using (K.name_scope("SampleAction"))
        {
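            //reparameterization trick: draw eps ~ N(0, I) once and transform it,
            //so that action = mean + sqrt(variance) * eps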
            outputActualAction = K.standard_normal(K.shape(outputActionMeanFromNetwork), DataType.Float) * K.sqrt(outputVariance) + outputActionMeanFromNetwork;
        }

        ActionFunction = K.function(allobservationInputs, new List <Tensor> {
            outputActualAction, outputActionMeanFromNetwork, outputVariance
        }, null, "ActionFunction");

        TrainerParamsPPO trainingParams = trainerParams as TrainerParamsPPO;

        if (trainingParams != null)
        {
            //training needed inputs
            var inputOldAction   = UnityTFUtils.Input(new int?[] { ActionSizes[0] }, name: "InputOldAction")[0];
            var inputAdvantage   = UnityTFUtils.Input(new int?[] { 1 }, name: "InputAdvantage")[0];
            var inputTargetValue = UnityTFUtils.Input(new int?[] { 1 }, name: "InputTargetValue")[0];
            var inputOldValue    = UnityTFUtils.Input(new int?[] { 1 }, name: "InputOldValue")[0];

            //var inputClipEpsilon = UnityTFUtils.Input(batch_shape: new int?[] { }, name: "ClipEpsilon", dtype: DataType.Float)[0];

            var inputClipEpsilonValue = UnityTFUtils.Input(batch_shape: new int?[] { }, name: "ClipEpsilonValue", dtype: DataType.Float)[0];
            // value loss
            Tensor outputValueLoss = null;
            using (K.name_scope("ValueLoss"))
            {
                var clippedValueEstimate = inputOldValue + K.clip(outputValueFromNetwork - inputOldValue, 0.0f - inputClipEpsilonValue, inputClipEpsilonValue);
                var valueLoss1           = new MeanSquareError().Call(outputValueFromNetwork, inputTargetValue);
                var valueLoss2           = new MeanSquareError().Call(clippedValueEstimate, inputTargetValue);
                outputValueLoss = K.mean(K.maximum(valueLoss1, valueLoss2));
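                //NOTE: the next line overwrites the clipped loss above with the
                //plain unclipped MSE, so value clipping is effectively disabled;
                //remove it to train with the clipped value loss instead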
                outputValueLoss = K.mean(valueLoss1);
            }

            var           valueUpdates = AddOptimizer(valueWeights, outputValueLoss, optimizerValue);
            List <Tensor> valueInputs  = new List <Tensor>();
            if (HasVectorObservation)
            {
                valueInputs.Add(stateTensor);
            }
            if (HasVisualObservation)
            {
                valueInputs.AddRange(inputVisualTensors);
            }
            valueInputs.Add(inputOldValue);
            valueInputs.Add(inputTargetValue);
            valueInputs.Add(inputClipEpsilonValue);
            TrainValueFunction = K.function(valueInputs, new List <Tensor> {
                outputValueLoss
            }, valueUpdates, "TrainValueFunction");

            // actor losses
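            //PPO-CMA-style decoupled updates: the mean loss sees the variance with
            //gradients stopped and the variance loss sees the mean with gradients
            //stopped, so each optimizer only moves its own set of weights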
            Tensor meanLoss, varLoss;
            using (K.name_scope("ActorLosses"))
            {
                Tensor posAdvantage;
                if (usePositiveAdvOnly)
                {
                    posAdvantage = K.identity(K.relu(K.mean(inputAdvantage)), "ClippedPositiveAdv");
                }
                else
                {
                    posAdvantage = K.identity(K.mean(inputAdvantage), "Adv");
                }
                var meanNoGrad   = K.stop_gradient(outputActionMeanFromNetwork, "MeanNoGrad");
                var varNoGrad    = K.stop_gradient(outputVariance, "VarNoGrad");
                var logVar       = outActionLogVarianceFromNetwork;
                var logVarNoGrad = K.stop_gradient(logVar, "LogVarNoGrad");
                using (K.name_scope("VarLoss"))
                {
                    var logpNoMeanGrad = -1.0f * K.sum(0.5f * K.square(inputOldAction - meanNoGrad) / outputVariance + 0.5f * logVar, 1);
                    varLoss = K.identity(-1.0f * K.mean(posAdvantage * logpNoMeanGrad), "VarLoss");
                }
                using (K.name_scope("MeanLoss"))
                {
                    var logpNoVarGrad = -1.0f * K.sum(0.5f * K.square(inputOldAction - outputActionMeanFromNetwork) / varNoGrad + 0.5f * logVarNoGrad, 1);
                    meanLoss = K.identity(-1.0f * K.mean(posAdvantage * logpNoVarGrad), "MeanLoss");
                }
            }

            //add inputs, outputs and parameters to the list
            List <Tensor> allInputs = new List <Tensor>();
            if (HasVectorObservation)
            {
                allInputs.Add(stateTensor);
            }
            if (HasVisualObservation)
            {
                allInputs.AddRange(inputVisualTensors);
            }
            allInputs.Add(inputOldAction);
            allInputs.Add(inputAdvantage);


            //create optimizer and create necessary functions
            var updatesMean = AddOptimizer(meanWeights, meanLoss, optimizerMean);
            var updatesVar  = AddOptimizer(varWeights, varLoss, optimizerVariance);

            TrainMeanFunction = K.function(allInputs, new List <Tensor> {
                meanLoss
            }, updatesMean, "UpdateMeanFunction");
            TrainVarianceFunction = K.function(allInputs, new List <Tensor> {
                varLoss
            }, updatesVar, "UpdateVarianceFunction");

            //pretraining for output mean and var
            var inputInitialStd  = UnityTFUtils.Input(new int?[] { ActionSizes[0] }, name: "InputInitialStd")[0];
            var inputInitialMean = UnityTFUtils.Input(new int?[] { ActionSizes[0] }, name: "InputInitialMean")[0];
            var policyInitLoss   = K.mean(K.mean(K.square(inputInitialMean - outputActionMeanFromNetwork)));
            policyInitLoss += K.mean(K.mean(K.square(inputInitialStd - K.sqrt(outputVariance))));

            var updatesPretrain = AddOptimizer(network.GetActorWeights(), policyInitLoss, optimizerPretrain);
            var pretrainInputs  = new List <Tensor>();
            pretrainInputs.Add(stateTensor);
            pretrainInputs.Add(inputInitialMean);
            pretrainInputs.Add(inputInitialStd);
            PretrainFunction = K.function(pretrainInputs, new List <Tensor> {
                policyInitLoss
            }, updatesPretrain, "PretrainFunction");
        }
    }
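
The SampleAction scope above implements reparameterized sampling: a standard normal draw is scaled by the standard deviation and shifted by the mean. Outside the graph, the same transform can be sketched in plain C# with a Box-Muller draw; all names here are hypothetical:

    using System;

    static class GaussianSampleSketch
    {
        static readonly Random Rng = new Random();

        // Standard normal sample via the Box-Muller transform.
        static double StandardNormal()
        {
            double u1 = 1.0 - Rng.NextDouble(); // in (0, 1], avoids log(0)
            double u2 = Rng.NextDouble();
            return Math.Sqrt(-2.0 * Math.Log(u1)) * Math.Sin(2.0 * Math.PI * u2);
        }

        // Reparameterized Gaussian sample: mean + sqrt(variance) * eps.
        static double SampleAction(double mean, double variance)
        {
            return mean + Math.Sqrt(variance) * StandardNormal();
        }

        static void Main()
        {
            // Draws concentrate around the mean as the variance shrinks.
            Console.WriteLine(SampleAction(0.5, 0.01));
            Console.WriteLine(SampleAction(0.5, 1.0));
        }
    }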