public void Clip(Executor executor, Variable var)
{
    if (var.DataType == typeof(float))
    {
        var variable = (Variable<float>)var;
        var gradient = executor.GetGradient(variable);
        var l2NormSquare = executor.Context.Eval(AleaTK.Library.ReduceSum(gradient * gradient)).ToScalar();
        var l2Norm = Math.Sqrt(l2NormSquare);
        if (l2Norm > ClipNorm)
        {
            executor.Context.Assign(gradient, gradient * ClipNorm.AsScalar<float>() / l2Norm.AsScalar<float>());
        }
        return;
    }

    if (var.DataType == typeof(double))
    {
        var variable = (Variable<double>)var;
        var gradient = executor.GetGradient(variable);
        var l2NormSquare = executor.Context.Eval(AleaTK.Library.ReduceSum(gradient * gradient)).ToScalar();
        var l2Norm = Math.Sqrt(l2NormSquare);
        if (l2Norm > ClipNorm)
        {
            executor.Context.Assign(gradient, gradient * ClipNorm.AsScalar<double>() / l2Norm.AsScalar<double>());
        }
        return;
    }

    throw new InvalidOperationException($"Unsupported gradient type {var.DataType}.");
}
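This is standard per-tensor norm clipping: whenever the gradient's L2 norm exceeds ClipNorm, the whole gradient is rescaled by ClipNorm / ‖g‖₂, which shrinks its magnitude without changing its direction. A minimal CPU-side sketch of the same rule, independent of AleaTK (the ClipByNorm helper is illustrative, not part of the library):

using System;
using System.Linq;

static class NormClipSketch
{
    // Rescale g in place so that its L2 norm never exceeds clipNorm;
    // a no-op when the norm is already small enough.
    public static void ClipByNorm(double[] g, double clipNorm)
    {
        var l2Norm = Math.Sqrt(g.Sum(x => x * x));
        if (l2Norm <= clipNorm) return;
        var scale = clipNorm / l2Norm;
        for (var i = 0; i < g.Length; i++) g[i] *= scale;
    }

    static void Main()
    {
        var g = new[] { 3.0, 4.0 };  // L2 norm is 5
        ClipByNorm(g, 1.0);          // rescaled to unit norm
        Console.WriteLine(string.Join(", ", g));  // prints 0.6, 0.8
    }
}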
public override void Clip(Executor executor)
{
    var l2NormSquare = 0.0;

    // accumulate the squared L2 norms of all parameter gradients
    foreach (var data in executor.Data)
    {
        if (data.Variable.Type != VariableType.Parameter) continue;

        if (data.Variable.DataType == typeof(float))
        {
            var variable = (Variable<float>)data.Variable;
            var gradient = executor.GetGradient(variable);
            l2NormSquare += executor.Context.Eval(AleaTK.Library.ReduceSum(gradient * gradient)).ToScalar();
        }
        else if (data.Variable.DataType == typeof(double))
        {
            var variable = (Variable<double>)data.Variable;
            var gradient = executor.GetGradient(variable);
            l2NormSquare += executor.Context.Eval(AleaTK.Library.ReduceSum(gradient * gradient)).ToScalar();
        }
        else
        {
            throw new InvalidOperationException($"Unsupported gradient type {data.Variable.DataType}.");
        }
    }

    var l2Norm = Math.Sqrt(l2NormSquare);
    if (l2Norm <= ClipNorm) return;

    // rescale every parameter gradient by the common factor ClipNorm / l2Norm
    foreach (var data in executor.Data)
    {
        if (data.Variable.Type != VariableType.Parameter) continue;

        if (data.Variable.DataType == typeof(float))
        {
            var variable = (Variable<float>)data.Variable;
            var gradient = executor.GetGradient(variable);
            executor.Context.Assign(gradient, gradient * ClipNorm.AsScalar<float>() / l2Norm.AsScalar<float>());
        }
        else if (data.Variable.DataType == typeof(double))
        {
            var variable = (Variable<double>)data.Variable;
            var gradient = executor.GetGradient(variable);
            executor.Context.Assign(gradient, gradient * ClipNorm.AsScalar<double>() / l2Norm.AsScalar<double>());
        }
        else
        {
            throw new InvalidOperationException($"Unsupported gradient type {data.Variable.DataType}.");
        }
    }
}
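The override above clips by the global norm instead: it first accumulates the squared norms of every parameter gradient, then rescales them all by the same factor, so the relative magnitudes between parameters are preserved (the same idea as TensorFlow's clip_by_global_norm). A hedged CPU sketch over plain arrays, again with an illustrative helper name:

using System;
using System.Collections.Generic;
using System.Linq;

static class GlobalNormClipSketch
{
    // Rescale every gradient by clipNorm / globalNorm when the global
    // L2 norm (taken over all gradients together) exceeds clipNorm.
    public static void ClipByGlobalNorm(IReadOnlyList<double[]> gradients, double clipNorm)
    {
        var globalNorm = Math.Sqrt(gradients.Sum(g => g.Sum(x => x * x)));
        if (globalNorm <= clipNorm) return;
        var scale = clipNorm / globalNorm;
        foreach (var g in gradients)
            for (var i = 0; i < g.Length; i++) g[i] *= scale;
    }
}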
public static void TestAttentionReduce()
{
    var n = 3;
    var b = 4;
    var d = 5;

    var statesData = new double[n, b, d];
    UniformRandomArray(statesData);
    var softmaxData = new double[n, b];
    UniformRandomArray(softmaxData);

    var softmax = Variable<double>(PartialShape.Create(-1, b));
    var states = Variable<double>(PartialShape.Create(-1, b, d));
    var reduce = new AttentionReduce<double>(softmax, states);

    var ctx = Context.GpuContext(0);
    var exe = new Executor(ctx, reduce.Output) { AssignAllGradient = true };
    exe.Initalize();

    var dOutputData = new double[b, d];
    UniformRandomArray(dOutputData);

    exe.AssignTensor(softmax, softmaxData.AsTensor());
    exe.AssignTensor(states, statesData.AsTensor());
    exe.Forward();
    exe.AssignGradient(reduce.Output, dOutputData.AsTensor(), replace: true);
    exe.Backward();

    var dSoftmax = exe.GetGradient(reduce.Softmax);
    var dStates = exe.GetGradient(reduce.States);

    var bump = 1e-6;

    var dSoftmaxFd = GradientChecker.FiniteDifferenceGradient(exe, softmax, bump: bump);
    AreClose(dSoftmaxFd.ToArray2D(), dSoftmax.ToArray2D(), 1e-7);

    var dStatesFd = GradientChecker.FiniteDifferenceGradient(exe, states, bump: bump);
    AreClose(dStatesFd.ToArray3D(), dStates.ToArray3D(), 1e-7);
}
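The AreClose checks compare the backpropagated gradients against numerical ones: GradientChecker.FiniteDifferenceGradient perturbs the assigned input tensor element by element by bump and re-evaluates the graph. The underlying idea is the central-difference estimate sketched below for a scalar-valued function (a standalone illustration, not the AleaTK signature):

using System;

static class FiniteDifferenceSketch
{
    // Central difference: df/dx_i ≈ (f(x + h·e_i) - f(x - h·e_i)) / (2h),
    // where e_i is the i-th unit vector and h is the bump size.
    public static double[] Gradient(Func<double[], double> f, double[] x, double bump = 1e-6)
    {
        var grad = new double[x.Length];
        for (var i = 0; i < x.Length; i++)
        {
            var saved = x[i];
            x[i] = saved + bump;
            var fPlus = f(x);
            x[i] = saved - bump;
            var fMinus = f(x);
            x[i] = saved;  // restore the original value
            grad[i] = (fPlus - fMinus) / (2.0 * bump);
        }
        return grad;
    }
}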
public override void Backward(Executor executor)
{
    var states = executor.GetTensor(States);
    var softmax = executor.GetTensor(Softmax);
    var dOutput = executor.GetGradient(Output);
    var n = states.Shape[0];
    var b = states.Shape[1];
    var d = states.Shape[2];

    // dStates (n,b,d): broadcast softmax (n,b,1) against dOutput (b,d)
    executor.AssignGradient(States, softmax.Reshape(n, b, 1) * dOutput);

    // dSoftmax (n,b): sum states (n,b,d) * dOutput (b,d) over the d axis
    executor.AssignGradient(Softmax, ReduceSum((states * dOutput).Reshape(n * b, d), 1).Reshape(n, b));
}
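In index notation, the forward reduction and the two gradients implemented above are

$$o_{b,d} = \sum_{n} s_{n,b}\, h_{n,b,d}, \qquad \frac{\partial L}{\partial h_{n,b,d}} = s_{n,b}\, \frac{\partial L}{\partial o_{b,d}}, \qquad \frac{\partial L}{\partial s_{n,b}} = \sum_{d} h_{n,b,d}\, \frac{\partial L}{\partial o_{b,d}}$$

with softmax weights $s$, states $h$, and output $o$: the first gradient is the broadcasted product, the second is the ReduceSum over the last axis.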
public static void LoopStyle()
{
    var inputVar = Variable<double>();
    var statesVar = Variable<double>();
    var weightVar = Variable<double>();
    var loop = new LoopDemo(inputVar, statesVar, weightVar);
    var outputVar = loop.Output;

    // create executor
    var ctx = Context.GpuContext(0);
    var exe = new Executor(ctx, outputVar) { AssignAllGradient = true };
    exe.Initalize();

    // fake forward data
    const int steps = 4;
    const int n = 5;
    var input = new double[n, n];
    var states = new double[steps, n, n];
    var weight = new double[n, n];
    var rng = new Random(42);
    UniformRandomArray(input, rng);
    UniformRandomArray(states, rng);
    UniformRandomArray(weight, rng);
    exe.AssignTensor(inputVar, input.AsTensor());
    exe.AssignTensor(statesVar, states.AsTensor());
    exe.AssignTensor(weightVar, weight.AsTensor());

    // run forward
    exe.Forward();
    var outputTensor = exe.GetTensor(outputVar);
    outputTensor.Print();

    // fake backward data
    var dOutput = new double[n, n];
    UniformRandomArray(dOutput, rng);
    exe.AssignGradient(outputVar, dOutput.AsTensor(), replace: true);

    // run backward
    exe.Backward();

    // verify gradients
    var bump = 1e-7;

    var dInputTensor = exe.GetGradient(inputVar);
    var dInputTensor_FD = GradientChecker.FiniteDifferenceGradient(exe, inputVar, bump: bump);
    //dInputTensor.Print();
    //dInputTensor_FD.Print();
    AreClose(dInputTensor_FD.ToArray2D(), dInputTensor.ToArray2D(), 1e-7);

    var dStatesTensor = exe.GetGradient(statesVar);
    var dStatesTensor_FD = GradientChecker.FiniteDifferenceGradient(exe, statesVar, bump: bump);
    //dStatesTensor.Reshape(steps, -1).Print();
    //dStatesTensor_FD.Reshape(steps, -1).Print();
    AreClose(dStatesTensor_FD.ToArray3D(), dStatesTensor.ToArray3D(), 1e-7);

    var dWeightTensor = exe.GetGradient(weightVar);
    var dWeightTensor_FD = GradientChecker.FiniteDifferenceGradient(exe, weightVar, bump: bump);
    //dWeightTensor.Print();
    //dWeightTensor_FD.Print();
    AreClose(dWeightTensor_FD.ToArray2D(), dWeightTensor.ToArray2D(), 1e-3);
}
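The LoopDemo operator runs a single shared step graph `steps` times, feeding each step's output back in as the next step's input. With step function $g$, per-step states $s^{(i)}$, and the shared weight $W$ (structure read off the Backward implementation below):

$$y^{(0)} = g\big(x,\, s^{(0)},\, W\big), \qquad y^{(i)} = g\big(y^{(i-1)},\, s^{(i)},\, W\big), \quad i = 1, \dots, \text{steps}-1$$

The intermediate outputs $y^{(0)}, \dots, y^{(\text{steps}-2)}$ are what the Intermediate variable stores; only the final $y^{(\text{steps}-1)}$ is the loop's Output.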
public override void Backward(Executor executor)
{
    var input = executor.GetTensor(Input);
    var states = executor.GetTensor(States);
    var weight = executor.GetTensor(Weight);

    Util.EnsureTrue(input.Shape.Rank == 2);
    Util.EnsureTrue(states.Shape.Rank == 3, "states shape: (steps, n, n)");
    Util.EnsureTrue(states.Shape[1] == states.Shape[2], "states shape: (steps, n, n)");

    var steps = (int)states.Shape[0];
    var n = states.Shape[1];

    var intermediate = executor.GetTensor(Intermediate);
    var output = executor.GetTensor(Output);
    var dOutput = executor.GetGradient(Output);
    var dIntermediate = executor.GetGradient(Intermediate, intermediate.Shape);
    var dStates = executor.GetGradient(States, states.Shape);
    var dWeight = executor.GetGradient(Weight, weight.Shape);
    var dInput = executor.GetGradient(Input, input.Shape);

    var counterInput = executor.GetGradientAggregationCounter(Input);
    var counterWeight = executor.GetGradientAggregationCounter(Weight);
    var counterStates = executor.GetGradientAggregationCounter(States);
    var counterIntermediate = executor.GetGradientAggregationCounter(Intermediate);

    var subExecutor = (Executor)executor.Objects[SubExecutor];

    for (var i = steps - 1; i >= 0; --i)
    {
        // set both the input and output tensors of this step and their gradients
        var input_i = i == 0 ? input : intermediate.Slice(i - 1).Reshape(n, n);
        var state_i = states.Slice(i).Reshape(n, n);
        var output_i = i == steps - 1 ? output : intermediate.Slice(i).Reshape(n, n);
        subExecutor.SetTensor(SubInput, input_i);
        subExecutor.SetTensor(SubWeight, weight);
        subExecutor.SetTensor(SubState, state_i);
        subExecutor.SetTensor(SubOutput, output_i);

        var dInput_i = i == 0 ? dInput : dIntermediate.Slice(i - 1).Reshape(n, n);
        var dState_i = dStates.Slice(i).Reshape(n, n);
        var dOutput_i = i == steps - 1 ? dOutput : dIntermediate.Slice(i).Reshape(n, n);

        // The weight is shared across all steps, so its gradient aggregation counter
        // must be maintained by hand. Setting a counter to 0 only points the gradient
        // memory at another tensor; it holds no value to aggregate yet. The shared
        // weight's gradient, however, has already received (steps - 1 - i) contributions
        // from the later steps, so its counter must reflect that.
        subExecutor.ClearGradientAggregationCounters();
        subExecutor.SetGradient(SubInput, dInput_i, counter: i == 0 ? counterInput : counterIntermediate);
        subExecutor.SetGradient(SubWeight, dWeight, counter: counterWeight + steps - 1 - i);
        subExecutor.SetGradient(SubState, dState_i, counter: counterStates);
        subExecutor.SetGradient(SubOutput, dOutput_i);

        // run backward without clearing the counters, since we manage them ourselves
        subExecutor.Backward(clearGradientAggretionCounter: false);
    }

    executor.IncreaseGradientAggregationCounter(Input);
    executor.IncreaseGradientAggregationCounter(Weight);
    executor.IncreaseGradientAggregationCounter(States);
    executor.IncreaseGradientAggregationCounter(Intermediate);
}
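The counter bookkeeping encodes the usual backpropagation-through-time rule for a shared parameter: the total weight gradient is the sum of the per-step contributions,

$$\frac{\partial L}{\partial W} = \sum_{i=0}^{\text{steps}-1} \frac{\partial L}{\partial W^{(i)}}$$

so by the time step $i$ runs backward (the loop walks from the last step to the first), the later steps have already added $\text{steps}-1-i$ terms, which is exactly the counter passed to SetGradient for SubWeight.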
public static void UnrollingStyle()
{
    // create unrolling graph
    const int steps = 4;
    var inputVar = Variable<double>();
    var stateVars = Enumerable.Range(0, steps).Select(_ => Variable<double>()).ToArray();
    var weightVar = Variable<double>();
    var outputVar = CreateUnrollingGraph(inputVar, stateVars, weightVar);

    // create executor
    var ctx = Context.GpuContext(0);
    var exe = new Executor(ctx, outputVar) { AssignAllGradient = true };
    exe.Initalize();

    // fake forward data
    const int n = 5;
    var input = new double[n, n];
    var states = Enumerable.Range(0, steps).Select(_ => new double[n, n]).ToArray();
    var weight = new double[n, n];
    var rng = new Random(42);
    UniformRandomArray(input, rng);
    foreach (var state in states)
    {
        UniformRandomArray(state, rng);
    }
    UniformRandomArray(weight, rng);
    exe.AssignTensor(inputVar, input.AsTensor());
    for (var i = 0; i < steps; ++i)
    {
        exe.AssignTensor(stateVars[i], states[i].AsTensor());
    }
    exe.AssignTensor(weightVar, weight.AsTensor());

    // run forward
    exe.Forward();
    var outputTensor = exe.GetTensor(outputVar);
    outputTensor.Print();

    // fake backward data
    var dOutput = new double[n, n];
    UniformRandomArray(dOutput, rng);
    exe.AssignGradient(outputVar, dOutput.AsTensor(), replace: true);

    // run backward
    exe.Backward();

    // verify gradients
    var bump = 1e-7;

    var dInputTensor = exe.GetGradient(inputVar);
    var dInputTensor_FD = GradientChecker.FiniteDifferenceGradient(exe, inputVar, bump: bump);
    //dInputTensor.Print();
    //dInputTensor_FD.Print();
    AreClose(dInputTensor_FD.ToArray2D(), dInputTensor.ToArray2D(), 1e-7);

    for (var i = 0; i < steps; ++i)
    {
        var stateVar = stateVars[i];
        var dStateTensor = exe.GetGradient(stateVar);
        var dStateTensor_FD = GradientChecker.FiniteDifferenceGradient(exe, stateVar, bump: bump);
        //dStateTensor.Print();
        //dStateTensor_FD.Print();
        AreClose(dStateTensor_FD.ToArray2D(), dStateTensor.ToArray2D(), 1e-7);
    }

    var dWeightTensor = exe.GetGradient(weightVar);
    var dWeightTensor_FD = GradientChecker.FiniteDifferenceGradient(exe, weightVar, bump: bump);
    //dWeightTensor.Print();
    //dWeightTensor_FD.Print();
    AreClose(dWeightTensor_FD.ToArray2D(), dWeightTensor.ToArray2D(), 1e-3);
}
public static void TestAttention()
{
    // smaller sizes, handy for debugging:
    //var batch = 4;
    //var encoderHiddenSize = 5;
    //var decoderHiddenSize = 4;
    //var attentionDim = 3;
    var batch = 10;
    var encoderHiddenSize = 20;
    var decoderHiddenSize = 25;
    var attentionDim = 30;

    // (encoderSeqLength, batch, encoderHiddenSize)
    var encoderHiddenStates = Variable<double>(PartialShape.Create(-1, batch, encoderHiddenSize));
    var decoderHiddenStates = Variable<double>(PartialShape.Create(batch, decoderHiddenSize));
    var attention = new Attention<double>(encoderHiddenStates, decoderHiddenStates, attentionDim);

    var ctx = Context.GpuContext(0);
    var exe = new Executor(ctx, attention.Output) { AssignAllGradient = true };
    exe.Initalize();

    // encoderSeqLength is flexible at runtime
    var encoderSeqLength = 3;
    var dataEncoderHiddenStates = new double[encoderSeqLength, batch, encoderHiddenSize];
    UniformRandomArray(dataEncoderHiddenStates);
    var dataDecoderHiddenStates = new double[batch, decoderHiddenSize];
    UniformRandomArray(dataDecoderHiddenStates);

    exe.AssignTensor(encoderHiddenStates, dataEncoderHiddenStates.AsTensor());
    exe.AssignTensor(decoderHiddenStates, dataDecoderHiddenStates.AsTensor());
    exe.Forward();
    var tensorOutput = exe.GetTensor(attention.Output);
    //Console.WriteLine(tensorOutput.Shape);
    //tensorOutput.Print();

    var dataDOutput = new double[batch, encoderHiddenSize];
    UniformRandomArray(dataDOutput);
    exe.AssignGradient(attention.Output, dataDOutput.AsTensor(), replace: true);
    exe.Backward();

    var tensorDWh = exe.GetGradient(attention.Wh);
    //tensorDWh.Print();
    var tensorDWd = exe.GetGradient(attention.Wd);
    //tensorDWd.Print();
    var tensorDH = exe.GetGradient(attention.EncoderHiddenStates);
    //Console.WriteLine(tensorDH.Shape);
    //tensorDH.Reshape(-1, encoderHiddenSize).Print();
    var tensorDD = exe.GetGradient(attention.DecoderHiddenStates);
    //Console.WriteLine(tensorDD.Shape);
    //tensorDD.Print();

    var bump = 1e-7;

    var tensorDWh_fd = GradientChecker.FiniteDifferenceGradient(exe, attention.Wh, bump: bump);
    //tensorDWh.Print();
    //tensorDWh_fd.Print();
    AreClose(tensorDWh.ToArray2D(), tensorDWh_fd.ToArray2D(), 1e-7);

    var tensorDWd_fd = GradientChecker.FiniteDifferenceGradient(exe, attention.Wd, bump: bump);
    //tensorDWd.Print();
    //tensorDWd_fd.Print();
    AreClose(tensorDWd.ToArray2D(), tensorDWd_fd.ToArray2D(), 1e-7);

    var tensorDH_fd = GradientChecker.FiniteDifferenceGradient(exe, attention.EncoderHiddenStates, bump: bump);
    //tensorDH.Reshape(-1, encoderHiddenSize).Print();
    //tensorDH_fd.Reshape(-1, encoderHiddenSize).Print();
    AreClose(tensorDH.ToArray3D(), tensorDH_fd.ToArray3D(), 1e-7);

    var tensorDD_fd = GradientChecker.FiniteDifferenceGradient(exe, attention.DecoderHiddenStates, bump: bump);
    //tensorDD.Print();
    //tensorDD_fd.Print();
    AreClose(tensorDD.ToArray2D(), tensorDD_fd.ToArray2D(), 1e-7);
}
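For reference, the layer under test follows the familiar additive-attention pattern: the encoder states $H$ (shape (encoderSeqLength, batch, encoderHiddenSize)) and the decoder state $D$ are projected by the learned matrices $W_h$ and $W_d$, combined through a tanh, reduced to a scalar score per time step, and normalized with a softmax; the output is the weighted sum of encoder states computed by the AttentionReduce operator shown earlier. Sketched from the shapes involved (the exact scalar reduction inside Attention<T> is an assumption here):

$$e_{t,b} = \mathrm{score}\big(\tanh(H_{t,b,:} W_h + D_{b,:} W_d)\big), \qquad a_{:,b} = \mathrm{softmax}(e_{:,b}), \qquad c_{b,:} = \sum_{t} a_{t,b}\, H_{t,b,:}$$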