/// <summary>
/// Builds a PPO network pair for a discrete action space: an actor whose output
/// layer uses <c>SoftmaxDef</c> and produces <c>actionSize</c> values, and a critic
/// with a single output that is fed from the same <see cref="InputState"/> variable.
/// </summary>
/// <param name="stateSize">Size passed to the actor's dense input layer; stored in <c>StateSize</c>.</param>
/// <param name="actionSize">Number of outputs of the actor's output layer; stored in <c>ActionSize</c>.</param>
/// <param name="numLayers">Forwarded as the first argument of <c>LayerDefineHelper.DenseLayers</c> (presumably the hidden layer count).</param>
/// <param name="hiddenSize">Forwarded as the second argument of <c>LayerDefineHelper.DenseLayers</c> (presumably the hidden layer width).</param>
/// <param name="device">Device handed to both sequential networks; stored in <c>Device</c>.</param>
/// <param name="initialWeightScale">Initial weight scale applied to the hidden layer definitions and to both output layers.</param>
public PPONetworkDiscreteSimple(int stateSize, int actionSize, int numLayers, int hiddenSize, DeviceDescriptor device, float initialWeightScale = 0.01f)
{
    Device = device;
    StateSize = stateSize;
    ActionSize = actionSize;

    // ----- Actor: state -> tanh hidden layers -> softmax over actions -----
    var actorInput = new InputLayerDense(stateSize);
    var actorOutput = new OutputLayerDense(actionSize, new SoftmaxDef(), OutputLayerDense.LossFunction.None)
    {
        InitialWeightScale = initialWeightScale
    };
    var actorHiddenLayers = LayerDefineHelper.DenseLayers(
        numLayers, hiddenSize, true, NormalizationMethod.None, 0, initialWeightScale, new TanhDef());

    // NOTE(review): the actor network is stored in the field named valueNetwork (and the
    // critic below in policyNetwork). The names look swapped; kept as-is because other
    // members may rely on the current assignment - confirm before renaming.
    valueNetwork = new SequentialNetworkDense(actorInput, actorHiddenLayers, actorOutput, device);

    InputState = actorInput.InputVariable;

    // Mean/variance outputs are not produced by the discrete variant.
    OutputMean = null;
    OutputVariance = null;

    // Action probabilities are only available for discrete actions.
    OutputProbabilities = actorOutput.GetOutputVariable();
    PolicyFunction = OutputProbabilities.ToFunction();

    // ----- Critic: reuses the actor's input variable, single linear output -----
    var criticInput = new InputLayerCNTKVar(InputState);
    var criticOutput = new OutputLayerDense(1, null, OutputLayerDense.LossFunction.None)
    {
        InitialWeightScale = initialWeightScale
    };
    var criticHiddenLayers = LayerDefineHelper.DenseLayers(
        numLayers, hiddenSize, true, NormalizationMethod.None, 0, initialWeightScale, new TanhDef());
    policyNetwork = new SequentialNetworkDense(criticInput, criticHiddenLayers, criticOutput, device);

    OutputValue = criticOutput.GetOutputVariable();
    ValueFunction = OutputValue.ToFunction();
}
/// <summary>
/// Builds a PPO network pair for a continuous action space: an actor whose output
/// layer (no activation) yields the action mean, a separate learnable parameter whose
/// exponential is exposed as the variance, and a critic with a single output that is
/// fed from the same <see cref="InputState"/> variable.
/// </summary>
/// <param name="stateSize">Size passed to the actor's dense input layer; stored in <c>StateSize</c>.</param>
/// <param name="actionSize">Number of outputs of the actor's output layer and length of the variance parameter; stored in <c>ActionSize</c>.</param>
/// <param name="numLayers">Forwarded as the first argument of <c>LayerDefineHelper.DenseLayers</c> (presumably the hidden layer count).</param>
/// <param name="hiddenSize">Forwarded as the second argument of <c>LayerDefineHelper.DenseLayers</c> (presumably the hidden layer width).</param>
/// <param name="device">Device handed to both sequential networks and to the variance parameter; stored in <c>Device</c>.</param>
/// <param name="initialWeightScale">Initial weight scale applied to the hidden layer definitions and to both output layers.</param>
public PPONetworkContinuousSimple(int stateSize, int actionSize, int numLayers, int hiddenSize, DeviceDescriptor device, float initialWeightScale = 0.01f)
{
    Device = device;
    StateSize = stateSize;
    ActionSize = actionSize;

    // ----- Actor: state -> tanh hidden layers -> action mean -----
    var actorInput = new InputLayerDense(stateSize);
    var actorOutput = new OutputLayerDense(actionSize, null, OutputLayerDense.LossFunction.None)
    {
        InitialWeightScale = initialWeightScale
    };
    var actorHiddenLayers = LayerDefineHelper.DenseLayers(
        numLayers, hiddenSize, true, NormalizationMethod.None, 0, initialWeightScale, new TanhDef());

    // NOTE(review): the actor network is stored in the field named valueNetwork (and the
    // critic below in policyNetwork). The names look swapped; kept as-is because other
    // members may rely on the current assignment - confirm before renaming.
    valueNetwork = new SequentialNetworkDense(actorInput, actorHiddenLayers, actorOutput, device);

    InputState = actorInput.InputVariable;
    OutputMean = actorOutput.GetOutputVariable();

    // Action probabilities are only produced by the discrete variant.
    OutputProbabilities = null;

    // The variance does not depend on the state: it comes from a standalone
    // parameter (zero-initialised, one entry per action) passed through Exp,
    // following Unity's implementation.
    var logSigmaSquared = new Parameter(
        new int[] { actionSize },
        DataType.Float,
        CNTKLib.ConstantInitializer(0),
        device,
        "PPO.log_sigma_square");
    OutputVariance = CNTKLib.Exp(logSigmaSquared);

    PolicyFunction = Function.Combine(new Variable[] { OutputMean, OutputVariance });

    // ----- Critic: reuses the actor's input variable, single linear output -----
    var criticInput = new InputLayerCNTKVar(InputState);
    var criticOutput = new OutputLayerDense(1, null, OutputLayerDense.LossFunction.None)
    {
        InitialWeightScale = initialWeightScale
    };
    var criticHiddenLayers = LayerDefineHelper.DenseLayers(
        numLayers, hiddenSize, true, NormalizationMethod.None, 0, initialWeightScale, new TanhDef());
    policyNetwork = new SequentialNetworkDense(criticInput, criticHiddenLayers, criticOutput, device);

    OutputValue = criticOutput.GetOutputVariable();
    ValueFunction = OutputValue.ToFunction();

    // NOTE(review): the log-sigma parameter is not added to any parameter collection here
    // (a PolicyParameters.Add call was previously left disabled); presumably it is reached
    // through PolicyFunction - confirm the trainer actually updates it.
}