public Attention(Variable<T> encoderHiddenStates, Variable<T> decoderHiddenState, int attentionDim)
{
    AttentionDim = attentionDim;
    EncoderHiddenStates = encoderHiddenStates;
    DecoderHiddenState = decoderHiddenState;

    // encoder states layout: (seqLength, batch, encoderHiddenSize)
    Util.EnsureEqual(3, EncoderHiddenStates.Shape.Rank, "Input layout: (seqLength, batch, encoderHiddenSize)");
    Util.EnsureTrue(EncoderHiddenStates.Shape[0] >= 0, "Input layout: (seqLength, batch, encoderHiddenSize)");
    Util.EnsureTrue(EncoderHiddenStates.Shape[1] >= 0, "Input layout: (seqLength, batch, encoderHiddenSize)");
    Util.EnsureTrue(EncoderHiddenStates.Shape[2] >= 0, "Input layout: (seqLength, batch, encoderHiddenSize)");
    SeqLength = (int)EncoderHiddenStates.Shape[0];
    Batch = (int)EncoderHiddenStates.Shape[1];
    EncoderHiddenSize = (int)EncoderHiddenStates.Shape[2];

    // decoder state layout: (batch, decoderHiddenSize); batch must match the encoder's
    Util.EnsureEqual(2, DecoderHiddenState.Shape.Rank, "Input layout: (batch, decoderHiddenSize)");
    Util.EnsureTrue(DecoderHiddenState.Shape[0] >= 0, "Input layout: (batch, decoderHiddenSize)");
    Util.EnsureTrue(DecoderHiddenState.Shape[1] >= 0, "Input layout: (batch, decoderHiddenSize)");
    Util.EnsureTrue(DecoderHiddenState.Shape[0] == EncoderHiddenStates.Shape[1], "Batch sizes must match.");
    DecoderHiddenSize = (int)DecoderHiddenState.Shape[1];

    // uniform initialization scaled so that Var(w) = 1/(fanIn + fanOut)
    var scale = Sqrt(12.0.AsScalar<T>() / ((double)(AttentionDim + EncoderHiddenSize)).AsScalar<T>());
    Wh = Parameter(scale * (RandomUniform<T>(Shape.Create(EncoderHiddenSize, AttentionDim), 0UL, 0UL) - 0.5.AsScalar<T>()));
    scale = Sqrt(12.0.AsScalar<T>() / ((double)(AttentionDim + DecoderHiddenSize)).AsScalar<T>());
    Wd = Parameter(scale * (RandomUniform<T>(Shape.Create(DecoderHiddenSize, AttentionDim), 0UL, 0UL) - 0.5.AsScalar<T>()));
    scale = Sqrt(12.0.AsScalar<T>() / ((double)AttentionDim).AsScalar<T>());
    V = Parameter(scale * (RandomUniform<T>(Shape.Create(AttentionDim), 0UL, 0UL) - 0.5.AsScalar<T>()));

    Softmax = Variable<T>();
    AttentionState = Variable<T>(PartialShape.Create(Batch, EncoderHiddenSize));
}
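// The parameter names (Wh, Wd, V) and the softmax-over-sequence reduction suggest
// standard additive (Bahdanau-style) attention; under that reading, for batch
// element b and encoder step t the layer computes:
//
//   e[t] = V . tanh(Wh * h[t] + Wd * d)    -- alignment scores, one per encoder step
//   a    = softmax(e)                      -- attention weights over seqLength
//   c    = sum_t a[t] * h[t]               -- context, shape (batch, encoderHiddenSize)
//
// The weighted sum over the sequence dimension is what AttentionReduce below implements.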
public static Model ConvolutionalNeuralNetworkModel()
{
    var images = Variable<float>();
    var labels = Variable<float>();

    // (batch, 784) -> (batch, 1, 28, 28)
    ILayer<float> net = new Reshape<float>(images, PartialShape.Create(-1, 1, 28, 28));
    net = new Convolution2D<float>(net.Output, 5, 5, 16);                 // -> (batch, 16, 24, 24)
    net = new ActivationReLU<float>(net.Output);
    net = new Pooling2D<float>(net.Output, PoolingMode.MAX, 2, 2, 2, 2);  // -> (batch, 16, 12, 12)
    net = new Convolution2D<float>(net.Output, 5, 5, 32);                 // -> (batch, 32, 8, 8)
    net = new ActivationTanh<float>(net.Output);
    net = new Pooling2D<float>(net.Output, PoolingMode.MAX, 2, 2, 2, 2);  // -> (batch, 32, 4, 4)
    // flatten all feature dimensions for the fully connected layers
    net = new Reshape<float>(net.Output, PartialShape.Create(-1, net.Output.Shape.Skip(1).Aggregate(ScalarOps.Mul)));
    net = new FullyConnected<float>(net.Output, 50);
    net = new ActivationTanh<float>(net.Output);
    net = new FullyConnected<float>(net.Output, 10);

    return new Model { Loss = new SoftmaxCrossEntropy<float>(net.Output, labels), Images = images, Labels = labels };
}
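// A minimal single-step training sketch for this model, assuming the optimizer API
// used by the PTB language model later in this section (GradientDescentOptimizer,
// AssignTensor, Forward/Backward). The MNIST batch arrays and the Optimize() update
// call are assumptions, not shown in this section.
var model = ConvolutionalNeuralNetworkModel();
var ctx = Context.GpuContext(0);
var opt = new GradientDescentOptimizer(ctx, model.Loss.Loss, 0.01, new GlobalNormGradientClipper(3.0));
opt.Initalize();
opt.AssignTensor(model.Images, imagesBatch.AsTensor());   // imagesBatch: float[batch, 784] (hypothetical)
opt.AssignTensor(model.Labels, labelsBatch.AsTensor());   // labelsBatch: one-hot float[batch, 10] (hypothetical)
opt.Forward();
opt.Backward();
opt.Optimize();                                           // assumed parameter-update entry point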
protected Variable(Type dataType, VariableType type, PartialShape shape)
{
    DataType = dataType;
    Type = type;
    _shape = shape;
    _initializer = null;
    _owner = null;
}

protected Variable(Type dataType, VariableType type, Expr initializer)
{
    DataType = dataType;
    Type = type;
    // when constructed from an initializer expression, take the shape from it
    _shape = initializer != null ? new PartialShape(initializer.Shape.AsArray) : null;
    _initializer = initializer;
    _owner = null;
}
public Reshape(Variable<T> input, PartialShape shape)
{
    // parameters cannot be reshaped in place
    Util.EnsureTrue(input.Type != VariableType.Parameter);
    Shape = input.HasShape ? PartialShape.Reshape(input.Shape, shape) : shape;
    Input = input;
    Output = Variable<T>(Shape);
    AddInput(Input);
    AddOutput(Output);
}
public Embedding(Variable<int> indices, int embedSize, int embedDim, double initScale = 0.5)
{
    Indices = indices;
    // weights drawn uniformly from [-initScale, initScale)
    Weights = Library.Parameter((initScale * 2.0).AsScalar<T>() * RandomUniform<T>(Shape.Create(embedSize, embedDim)) - initScale.AsScalar<T>());
    // output shape = indices shape with embedDim appended
    Output = Library.Variable<T>(PartialShape.Create(Indices.Shape.Concat(new long[] { embedDim }).ToArray()));
    EmbedSize = embedSize;
    EmbedDim = embedDim;
    AddInput(Indices);
    AddInput(Weights);
    AddOutput(Output);
}
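// Sketch: looking up embeddings for a (seqLength, batch) tensor of word ids, the same
// wiring the sequence-to-sequence and PTB models below use; sizes here are illustrative.
var wordIds = Variable<int>(PartialShape.Create(35, 20));          // (seqLength, batch)
var embedding = new Embedding<float>(wordIds, embedSize: 10000, embedDim: 128);
// embedding.Output : (35, 20, 128), ready to feed an Rnn<float> layer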
public static void TestAttentionReduce()
{
    var n = 3;
    var b = 4;
    var d = 5;
    var statesData = new double[n, b, d];
    UniformRandomArray(statesData);
    var softmaxData = new double[n, b];
    UniformRandomArray(softmaxData);

    var softmax = Variable<double>(PartialShape.Create(-1, b));
    var states = Variable<double>(PartialShape.Create(-1, b, d));
    var reduce = new AttentionReduce<double>(softmax, states);

    var ctx = Context.GpuContext(0);
    var exe = new Executor(ctx, reduce.Output) { AssignAllGradient = true };
    exe.Initalize();

    var dOutputData = new double[b, d];
    UniformRandomArray(dOutputData);

    exe.AssignTensor(softmax, softmaxData.AsTensor());
    exe.AssignTensor(states, statesData.AsTensor());
    exe.Forward();
    exe.AssignGradient(reduce.Output, dOutputData.AsTensor(), replace: true);
    exe.Backward();

    var dSoftmax = exe.GetGradient(reduce.Softmax);
    var dStates = exe.GetGradient(reduce.States);

    // compare backprop gradients against finite differences
    var bump = 1e-6;
    var dSoftmaxFd = GradientChecker.FiniteDifferenceGradient(exe, softmax, bump: bump);
    AreClose(dSoftmaxFd.ToArray2D(), dSoftmax.ToArray2D(), 1e-7);
    var dStatesFd = GradientChecker.FiniteDifferenceGradient(exe, states, bump: bump);
    AreClose(dStatesFd.ToArray3D(), dStates.ToArray3D(), 1e-7);
}
public Lstm(Variable<T> x, int hiddenSize, Variable<T> cx = null, Variable<T> hx = null, double forgetBiasInit = 0.0)
{
    // X shape: (seqLength, batch, inputSize)
    Util.EnsureEqual(3, x.Shape.Rank, "Input layout: (seqLength, batch, inputSize)");
    Util.EnsureTrue(x.Shape[0] > 0, "SeqLength must be determined.");
    Util.EnsureTrue(x.Shape[2] > 0, "InputSize must be determined.");
    X = x;
    SeqLength = (int)X.Shape[0];
    InputSize = (int)X.Shape[2];
    HiddenSize = hiddenSize;
    ForgetBiasInit = forgetBiasInit;

    // Y shape: (seqLength, batch, hiddenSize)
    Y = Variable<T>(PartialShape.Create(SeqLength, -1, HiddenSize));

    // W: (1 + inputSize + hiddenSize, 4 * hiddenSize), rows packed as B -> W -> U,
    // gate columns in IFOA order
    W = Parameter(RandomNormal<T>(Shape.Create(InputSize + HiddenSize + 1, 4 * HiddenSize)) / Math.Sqrt(InputSize + hiddenSize).AsScalar<T>());

    // input and output states
    CX = cx ?? Variable<T>(PartialShape.Create(-1, HiddenSize));
    HX = hx ?? Variable<T>(PartialShape.Create(-1, HiddenSize));
    CY = Variable<T>(PartialShape.Create(-1, HiddenSize));
    HY = Variable<T>(PartialShape.Create(-1, HiddenSize));

    // build the graph
    AddInput(X);
    AddOutput(Y);
    AddInput(W);
    AddInput(CX);
    AddInput(HX);
    AddOutput(CY);
    AddOutput(HY);

    // auxiliary variables
    Hin = AuxVariable<T>();
    Hout = AuxVariable<T>();
    IFOA1 = AuxVariable<T>();
    IFOA2 = AuxVariable<T>();
    C = AuxVariable<T>();
    Temp1 = AuxVariable<T>();
    Temp2 = AuxVariable<T>();
    AddAuxVar(Hin);
    AddAuxVar(Hout);
    AddAuxVar(IFOA1);
    AddAuxVar(IFOA2);
    AddAuxVar(C);
    AddAuxVar(Temp1);
    AddAuxVar(Temp2);
}
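// Sketch: addressing the packed weight matrix, following the slicing used by the
// cuDNN comparison test later in this section; assumes an Executor exe holding an
// Lstm<float> lstm as in those tests. Row 0 holds the biases, the next inputSize
// rows the input weights W, the last hiddenSize rows the recurrent weights U;
// columns are grouped by gate in IFOA order.
var w = exe.GetTensor(lstm.W);                                                  // (1 + inputSize + hiddenSize, 4 * hiddenSize)
var biasF = w.Slice(0, Range(hiddenSize, 2 * hiddenSize));                      // forget-gate bias
var wF = w.Slice(Range(1, inputSize + 1), Range(hiddenSize, 2 * hiddenSize));   // forget-gate input weights
var uF = w.Slice(Range(inputSize + 1, -1), Range(hiddenSize, 2 * hiddenSize));  // forget-gate recurrent weights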
public Convolution2D(Variable<T> data, int kernelH, int kernelW, int numFilter)
{
    // data layout: NCHW with known C, H, W
    Util.EnsureTrue(data.Shape.Rank == 4);
    Util.EnsureTrue(data.Shape[1] > 0);
    Util.EnsureTrue(data.Shape[2] > 0);
    Util.EnsureTrue(data.Shape[3] > 0);

    var numInputFilter = data.Shape[1];
    var numOutputFilter = numFilter;
    var height = data.Shape[2];
    var width = data.Shape[3];

    // fixed padding and stride for now
    ConvolutionDesc = new ConvolutionDescriptor();
    ConvolutionDesc.Set2D(0, 0, 1, 1, 1, 1, ConvolutionMode.CROSS_CORRELATION);

    using (var dataDesc = new TensorDescriptor())
    using (var weightDesc = new FilterDescriptor())
    {
        var dataType = Dnn.DataTypeOf<T>();
        var tempN = 100; // temporary minibatch size, only used to query output dimensions
        dataDesc.Set4D(dataType, TensorFormat.CUDNN_TENSOR_NCHW, tempN, (int)numInputFilter, (int)height, (int)width);
        weightDesc.Set4D(dataType, TensorFormat.CUDNN_TENSOR_NCHW, numOutputFilter, (int)numInputFilter, kernelH, kernelW);

        // get output dimensions
        int n, c, h, w;
        ConvolutionDesc.Get2DForwardOutputDim(dataDesc, weightDesc, out n, out c, out h, out w);

        // create variables; weights drawn uniformly from +/- sqrt(3 / fanIn)
        var scale = Sqrt(3.0.AsScalar<T>() / ((double)(numInputFilter * kernelH * kernelW)).AsScalar<T>());
        Data = data;
        Weight = Parameter(scale * (2.0.AsScalar<T>() * RandomUniform<T>(Shape.Create(numOutputFilter, numInputFilter, kernelH, kernelW), 0UL, 0UL) - 1.0.AsScalar<T>()));
        Bias = Parameter(Fill(Shape.Create(c), ScalarOps.Conv<T>(0.1)));
        Output = Variable<T>(PartialShape.Create(-1, c, h, w));
        Workspace1 = AuxVariable<byte>();
        Workspace2 = AuxVariable<byte>();
        AddInput(Data);
        AddInput(Weight);
        AddInput(Bias);
        AddOutput(Output);
        AddAuxVar(Workspace1);
        AddAuxVar(Workspace2);
    }
}
public IteratedRnnCell(RnnType rnnRnnType, Variable<T> input, int numLayers, int hiddenSize, bool isTraining, double dropoutProbability, ulong dropoutSeed = 1337UL)
{
    RnnType = rnnRnnType;
    IsTraining = isTraining;
    NumLayers = numLayers;
    HiddenSize = hiddenSize;
    DropoutProbability = isTraining ? dropoutProbability : 0.0;
    DropoutSeed = dropoutSeed;

    // input layout: (seqLength, batch, inputSize); batch and inputSize must be known
    Util.EnsureEqual(3, input.Shape.Rank, "Input layout: (seqLength, batch, inputSize)");
    Util.EnsureTrue(input.Shape[1] >= 0, "Input layout: (seqLength, batch, inputSize)");
    Util.EnsureTrue(input.Shape[2] >= 0, "Input layout: (seqLength, batch, inputSize)");
    Input = input;
    BatchSize = (int)input.Shape[1];
    InputSize = (int)input.Shape[2];

    // output shape: (seqLength, batchSize, hiddenSize)
    Output = Variable<T>(PartialShape.Create(-1, BatchSize, HiddenSize));

    // W shape will be determined during initialization
    W = Parameter<T>();

    // variables for input/output hidden and cell state
    HX = Variable<T>(PartialShape.Create(NumLayers, BatchSize, HiddenSize));
    CX = Variable<T>(PartialShape.Create(NumLayers, BatchSize, HiddenSize));
    HY = Variable<T>(PartialShape.Create(NumLayers, BatchSize, HiddenSize));
    CY = Variable<T>(PartialShape.Create(NumLayers, BatchSize, HiddenSize));

    // state variables H and C: (n - 1, layer, b, d), where n is unknown until runtime
    var shape = PartialShape.Create(-1, NumLayers, BatchSize, HiddenSize);
    H = Library.Variable<T>(shape);
    C = Library.Variable<T>(shape);
    ReserveSpace = Library.Variable<byte>();

    // construct the graph
    AddInput(Input);
    AddInput(W);
    AddOutput(Output);
    AddAuxVar(HX);
    AddAuxVar(CX);
    AddAuxVar(HY);
    AddAuxVar(CY);
    AddAuxVar(H);
    AddAuxVar(C);
    AddAuxVar(ReserveSpace);
}
public Model(Context ctx, int numInputSteps, Config cfg, bool isTraining = true)
{
    var addDropout = isTraining && cfg.DropoutProbability > 0.0;

    EncoderInputs = Library.Variable<int>(PartialShape.Create(numInputSteps, cfg.BatchSize));
    Embedding = new Embedding<float>(EncoderInputs, cfg.VocabularySize, cfg.HiddenSize, initScale: cfg.InitScale);
    EmbeddingOutput = addDropout ? new Dropout<float>(Embedding.Output, cfg.DropoutProbability).Output : Embedding.Output;

    var rnnType = new LstmRnnType();
    EncoderRnn = new Rnn<float>(rnnType, EmbeddingOutput, cfg.NumLayers, cfg.HiddenSize, isTraining: isTraining, dropout: addDropout ? cfg.DropoutProbability : 0.0);
    EncoderRnnOutput = addDropout ? new Dropout<float>(EncoderRnn.Y, cfg.DropoutProbability).Output : EncoderRnn.Y;

    // attention model
}
public static Model MultiLayerPerceptronModel()
{
    var images = Variable<float>(PartialShape.Create(-1, 28 * 28));
    ILayer<float> net = new FullyConnected<float>(images, 128);
    net = new ActivationReLU<float>(net.Output);
    net = new FullyConnected<float>(net.Output, 64);
    net = new ActivationReLU<float>(net.Output);
    net = new FullyConnected<float>(net.Output, 10);
    var labels = Variable<float>(PartialShape.Create(-1, 10));
    return new Model { Loss = new SoftmaxCrossEntropy<float>(net.Output, labels), Images = images, Labels = labels };
}
public RnnDynamic(RnnType rnnRnnType, Variable<T> x, int numLayers, int hiddenSize, bool isTraining = true, double dropout = 0.0, ulong dropoutSeed = 1337UL)
{
    RnnType = rnnRnnType;
    IsTraining = isTraining;
    NumLayers = numLayers;
    HiddenSize = hiddenSize;
    Dropout = isTraining ? dropout : 0.0;
    DropoutSeed = dropoutSeed;

    // X shape: (seqLength, batch, inputSize); only inputSize must be known
    X = x;
    Util.EnsureEqual(3, X.Shape.Rank, "Input layout: (seqLength, batch, inputSize)");
    Util.EnsureTrue(X.Shape[2] >= 0, "Input layout: (seqLength, batch, inputSize)");
    InputSize = (int)X.Shape[2];

    // Y shape: (maxSeqLength, batch, hiddenSize), first two dims unknown until runtime
    Y = Variable<T>(PartialShape.Create(-1, -1, HiddenSize));

    // W shape will be determined during initialization
    W = Parameter<T>();

    // state variables
    var shape = PartialShape.Create(NumLayers, -1, HiddenSize);
    HX = Variable<T>(shape);
    CX = Variable<T>(shape);
    HY = Variable<T>(shape);
    CY = Variable<T>(shape);

    // construct the graph
    AddInput(X);
    AddInput(W);
    AddOutput(Y);
    AddAuxVar(HX);
    AddAuxVar(CX);
    AddAuxVar(HY);
    AddAuxVar(CY);
    AddAuxVar(DropoutStates);
    AddAuxVar(Workspace);
    AddAuxVar(ReserveSpace);
}
public FullyConnected(Variable<T> data, long numHidden)
{
    Util.EnsureTrue(data.HasShape);
    Util.EnsureEqual(2, data.Shape.Rank, "Input must be matrix.");
    Util.EnsureTrue(data.Shape[1] > 0L);
    Data = data;
    var numInput = data.Shape[1];

    // weights drawn uniformly with Var(w) = 1/(numInput + numHidden); see the note below
    var scale = Sqrt(12.0.AsScalar<T>() / ((double)(numInput + numHidden)).AsScalar<T>());
    Weights = Parameter(scale * (RandomUniform<T>(Shape.Create(numInput, numHidden), 0UL, 0UL) - 0.5.AsScalar<T>()));
    Bias = Parameter(Fill(Shape.Create(numHidden), ScalarOps.Conv<T>(0.0)));
    Output = Variable<T>(PartialShape.Create(data.Shape[0], numHidden));

    AddInput(Data);
    AddInput(Weights);
    AddInput(Bias);
    AddOutput(Output);
}
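// Why sqrt(12/(fanIn + fanOut)): RandomUniform - 0.5 is uniform on [-1/2, 1/2) with
// variance 1/12, so after scaling
//
//   Var(w) = (12 / (numInput + numHidden)) * (1/12) = 1 / (numInput + numHidden),
//
// a Glorot-style fan-average initialization (at half the variance of the classic
// Glorot-uniform bound sqrt(6/(fanIn + fanOut))). The Attention weights above use
// the same constant.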
public AttentionReduce(Variable<T> softmax, Variable<T> states)
{
    Softmax = softmax;
    States = states;
    Util.EnsureTrue(softmax.Shape.Rank == 2, "Softmax: (n,b)");
    Util.EnsureTrue(states.Shape.Rank == 3, "States: (n,b,d)");
    Util.EnsureTrue(softmax.Shape[1] > 0, "Softmax: b needed.");
    Util.EnsureTrue(states.Shape[1] > 0, "States: b needed.");
    Util.EnsureTrue(states.Shape[2] > 0, "States: d needed.");
    Util.EnsureTrue(softmax.Shape[1] == states.Shape[1], "b should match.");
    BatchSize = softmax.Shape[1];
    StatesSize = states.Shape[2];
    Output = Variable<T>(PartialShape.Create(BatchSize, StatesSize));
    AddInput(Softmax);
    AddInput(States);
    AddOutput(Output);
}
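// The name and shapes indicate a weighted sum over the sequence dimension: given
// weights (n, b) and states (n, b, d),
//
//   output[b, d] = sum over t of softmax[t, b] * states[t, b, d]
//
// collapsing the sequence to a (b, d) context vector, consistent with the
// (BatchSize, StatesSize) output shape declared above.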
public Pooling2D(Variable<T> data, PoolingMode mode, int kernelH, int kernelW, int strideH, int strideW)
{
    Descriptor = new PoolingDescriptor();
    Descriptor.Set2D(mode, NanPropagation.NOT_PROPAGATE_NAN, kernelH, kernelW, 0, 0, strideH, strideW);

    var dataType = Dnn.DataTypeOf<T>();
    using (var dataDesc = new TensorDescriptor())
    {
        // temporary minibatch size of 10, only used to query output dimensions
        dataDesc.Set4D(dataType, TensorFormat.CUDNN_TENSOR_NCHW, 10, (int)data.Shape[1], (int)data.Shape[2], (int)data.Shape[3]);
        int n, c, h, w;
        Descriptor.Get2dForwardOutputDim(dataDesc, out n, out c, out h, out w);
        Data = data;
        Output = Variable<T>(PartialShape.Create(-1, c, h, w));
        AddInput(Data);
        AddOutput(Output);
    }
}
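// For the zero-padding case used here, cuDNN's 2D pooling output dimensions reduce
// to floor((size - kernel) / stride) + 1; a quick sanity check under that assumption:
static int PoolOutDim(int size, int kernel, int stride)
{
    return (size - kernel) / stride + 1;
}
// e.g. in the CNN model above: PoolOutDim(24, 2, 2) == 12 and PoolOutDim(8, 2, 2) == 4.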
public static Variable<T> Variable<T>(PartialShape shape)
{
    return new Variable<T>(VariableType.Common, shape);
}
public override void Initialize(Executor executor)
{
    var context = executor.Context.ToGpuContext();
    var dnn = context.Dnn;

    // dropout
    var dropoutDesc = executor.DropoutDescDict[DropoutDesc];
    IntPtr dropoutStatesSize;
    dnn.DropoutGetStatesSize(out dropoutStatesSize);
    var dropoutStates = executor.GetTensor(DropoutStates, Shape.Create(dropoutStatesSize.ToInt64()));
    dropoutDesc.Set(dnn, (float)Dropout, dropoutStates.Buffer.Ptr, dropoutStatesSize, DropoutSeed);

    // rnn descriptor
    var rnnDesc = executor.RnnDescDict[RnnDesc];
    var mode = RnnType.Mode;
    rnnDesc.Set(HiddenSize, NumLayers, dropoutDesc, RNNInputMode.LINEAR_INPUT, DirectionMode.UNIDIRECTIONAL, mode, Dnn.DataTypeOf<T>());

    // initialize weights, once only, using minibatch size 1; the first dimension does
    // not affect the weight shape and size (TODO: test all RNN types, tested only for LSTM)
    var shape = PartialShape.Create(1, InputSize, 1);
    var strides = Strides.Create(shape[1] * shape[2], shape[2], 1);
    var xDesc = new TensorDescriptor();
    xDesc.SetND(Dnn.DataTypeOf<T>(), shape.AsInt32Array, strides.AsInt32Array);
    var wDesc = executor.FilterDescDict[WDesc];
    IntPtr weightsSize;
    dnn.GetRNNParamsSize(rnnDesc, xDesc, out weightsSize, Dnn.DataTypeOf<T>());
    Util.EnsureTrue(weightsSize.ToInt64() % Gpu.SizeOf<T>() == 0);
    var shapeW = Shape.Create(weightsSize.ToInt64() / Alea.Gpu.SizeOf<T>());
    wDesc.SetND(Dnn.DataTypeOf<T>(), TensorFormat.CUDNN_TENSOR_NCHW, new[] { (int)shapeW[0], 1, 1 });

    // since we are using cuDNN, we'd better make sure these variables are allocated
    executor.GetTensor(W, shapeW);
    if (IsTraining)
    {
        executor.GetGradient(W, shapeW);
    }

    // init weights: for each layer and linear layer id, fill the matrix part with
    // scaled random normals and let the RnnType initialize the bias part
    var numLinearLayers = RnnType.NumLinLayers;
    using (var filterDesc = new FilterDescriptor())
    {
        var w = executor.GetTensor(W);
        var filterDimA = new int[3];
        for (var layer = 0; layer < NumLayers; ++layer)
        {
            for (var linLayerId = 0; linLayerId < numLinearLayers; ++linLayerId)
            {
                int nbDims;
                DataType dataType;
                TensorFormat format;

                deviceptr<T> linLayerMat;
                dnn.GetRNNLinLayerMatrixParams(rnnDesc, layer, xDesc, wDesc, w.Buffer.Ptr, linLayerId, filterDesc, out linLayerMat);
                filterDesc.GetND(out dataType, out format, out nbDims, filterDimA);
                var length = filterDimA.Aggregate(ScalarOps.Mul);
                var linLayerMatBuffer = new Buffer<T>(context.Device, w.Memory, new Layout(Shape.Create(length)), linLayerMat);
                var linLayerMatTensor = new Tensor<T>(linLayerMatBuffer);
                context.Assign(linLayerMatTensor, RandomNormal<T>(Shape.Create(length)) / (Math.Sqrt(HiddenSize + InputSize).AsScalar<T>()));

                deviceptr<T> linLayerBias;
                dnn.GetRNNLinLayerBiasParams(rnnDesc, layer, xDesc, wDesc, w.Buffer.Ptr, linLayerId, filterDesc, out linLayerBias);
                filterDesc.GetND(out dataType, out format, out nbDims, filterDimA);
                length = filterDimA.Aggregate(ScalarOps.Mul);
                var linLayerBiasBuffer = new Buffer<T>(context.Device, w.Memory, new Layout(Shape.Create(length)), linLayerBias);
                var linLayerBiasTensor = new Tensor<T>(linLayerBiasBuffer);
                RnnType.InitBias(context, layer, linLayerId, linLayerBiasTensor);
            }
        }
    }

    base.Initialize(executor);
}
public static void TestAttention()
{
    var batch = 10;
    var encoderHiddenSize = 20;
    var decoderHiddenSize = 25;
    var attentionDim = 30;

    // encoder states: (encoderSeqLength, batch, encoderHiddenSize), seqLength deferred
    var encoderHiddenStates = Variable<double>(PartialShape.Create(-1, batch, encoderHiddenSize));
    var decoderHiddenStates = Variable<double>(PartialShape.Create(batch, decoderHiddenSize));
    var attention = new Attention<double>(encoderHiddenStates, decoderHiddenStates, attentionDim);

    var ctx = Context.GpuContext(0);
    var exe = new Executor(ctx, attention.Output) { AssignAllGradient = true };
    exe.Initalize();

    // encoderSeqLength is flexible at runtime
    var encoderSeqLength = 3;
    var dataEncoderHiddenStates = new double[encoderSeqLength, batch, encoderHiddenSize];
    UniformRandomArray(dataEncoderHiddenStates);
    var dataDecoderHiddenStates = new double[batch, decoderHiddenSize];
    UniformRandomArray(dataDecoderHiddenStates);

    exe.AssignTensor(encoderHiddenStates, dataEncoderHiddenStates.AsTensor());
    exe.AssignTensor(decoderHiddenStates, dataDecoderHiddenStates.AsTensor());
    exe.Forward();
    var tensorOutput = exe.GetTensor(attention.Output);

    var dataDOutput = new double[batch, encoderHiddenSize];
    UniformRandomArray(dataDOutput);
    exe.AssignGradient(attention.Output, dataDOutput.AsTensor(), replace: true);
    exe.Backward();

    var tensorDWh = exe.GetGradient(attention.Wh);
    var tensorDWd = exe.GetGradient(attention.Wd);
    var tensorDH = exe.GetGradient(attention.EncoderHiddenStates);
    var tensorDD = exe.GetGradient(attention.DecoderHiddenState);

    // compare backprop gradients against finite differences
    var bump = 1e-7;
    var tensorDWh_fd = GradientChecker.FiniteDifferenceGradient(exe, attention.Wh, bump: bump);
    AreClose(tensorDWh.ToArray2D(), tensorDWh_fd.ToArray2D(), 1e-7);
    var tensorDWd_fd = GradientChecker.FiniteDifferenceGradient(exe, attention.Wd, bump: bump);
    AreClose(tensorDWd.ToArray2D(), tensorDWd_fd.ToArray2D(), 1e-7);
    var tensorDH_fd = GradientChecker.FiniteDifferenceGradient(exe, attention.EncoderHiddenStates, bump: bump);
    AreClose(tensorDH.ToArray3D(), tensorDH_fd.ToArray3D(), 1e-7);
    var tensorDD_fd = GradientChecker.FiniteDifferenceGradient(exe, attention.DecoderHiddenState, bump: bump);
    AreClose(tensorDD.ToArray2D(), tensorDD_fd.ToArray2D(), 1e-7);
}
public Model(Context ctx, Config cfg, bool isTraining = true, bool usingCuDnn = true)
{
    Config = cfg;
    IsTraining = isTraining;
    UsingCuDnn = usingCuDnn;

    Inputs = Variable<int>(PartialShape.Create(cfg.NumSteps, cfg.BatchSize));
    Targets = Variable<int>(PartialShape.Create(cfg.NumSteps, cfg.BatchSize));

    // embedding
    Embedding = new Embedding<float>(Inputs, cfg.VocabSize, cfg.HiddenSize, initScale: cfg.InitScale);

    // add dropout
    EmbeddedOutput = Embedding.Output;
    if (isTraining && cfg.KeepProb < 1.0)
    {
        var dropout = new Dropout<float>(EmbeddedOutput, dropoutProb: 1.0 - cfg.KeepProb);
        EmbeddedOutput = dropout.Output;
    }

    // rnn layer, dropout for intermediate lstm layers and for output
    if (usingCuDnn)
    {
        RnnAccelerated = new Rnn<float>(new LstmRnnType(forgetBiasInit: 0.0), EmbeddedOutput, cfg.NumLayers, cfg.HiddenSize, isTraining: isTraining, dropout: isTraining && cfg.KeepProb < 1.0 ? 1.0 - Config.KeepProb : 0.0);
        RnnOutput = RnnAccelerated.Y;
        if (isTraining && cfg.KeepProb < 1.0)
        {
            var dropout = new Dropout<float>(RnnOutput, dropoutProb: 1.0 - cfg.KeepProb);
            RnnOutput = dropout.Output;
        }
    }
    else
    {
        RnnDirect = new Lstm<float>[cfg.NumLayers];
        for (var i = 0; i < cfg.NumLayers; ++i)
        {
            var lstm = new Lstm<float>(i == 0 ? EmbeddedOutput : RnnOutput, cfg.HiddenSize, forgetBiasInit: 0.0);
            RnnDirect[i] = lstm;
            RnnOutput = lstm.Y;
            if (isTraining && cfg.KeepProb < 1.0)
            {
                var dropout = new Dropout<float>(RnnOutput, dropoutProb: 1.0 - cfg.KeepProb);
                RnnOutput = dropout.Output;
            }
        }
    }

    FC = new FullyConnected<float>(RnnOutput.Reshape(RnnOutput.Shape[0] * RnnOutput.Shape[1], RnnOutput.Shape[2]), cfg.VocabSize);
    Loss = new SoftmaxCrossEntropySparse<float>(FC.Output, Targets.Reshape(Targets.Shape[0] * Targets.Shape[1]));
    Optimizer = new GradientDescentOptimizer(ctx, Loss.Loss, cfg.LearningRate, new GlobalNormGradientClipper(cfg.MaxGradNorm));

    // warm up to force JIT compilation, so later timings exclude JIT overhead
    Optimizer.Initalize();
    ResetStates();
    Optimizer.AssignTensor(Inputs, Fill(Shape.Create(Inputs.Shape.AsArray), 0));
    Optimizer.AssignTensor(Targets, Fill(Shape.Create(Targets.Shape.AsArray), 0));
    Optimizer.Forward();
    if (isTraining)
    {
        Optimizer.Backward();
    }

    // now reset states
    Optimizer.Initalize();
    ResetStates();
}
public static Variable<T> Reshape<T>(this Variable<T> input, params long[] shape)
{
    return new Reshape<T>(input, PartialShape.Create(shape)).Output;
}
public static void TestLstmAgainstReferenceResults()
{
    var mfr = new MatFileReader(@"lstm_small.mat");

    var inputSize = mfr.GetInt("InputSize");
    var seqLength = mfr.GetInt("SeqLength");
    var hiddenSize = mfr.GetInt("HiddenSize");
    var batchSize = mfr.GetInt("BatchSize");

    var x = Variable<float>(PartialShape.Create(seqLength, batchSize, inputSize));
    var lstm = new Lstm<float>(x, hiddenSize);

    var ctx = Context.GpuContext(0);
    var exe = new Executor(ctx, lstm.Y);
    exe.Initalize();

    var h0 = mfr.GetDoubleArray("h0").Select(n => (float)n).ToArray();
    var c0 = mfr.GetDoubleArray("c0").Select(n => (float)n).ToArray();
    exe.AssignTensor(lstm.CX, c0.AsTensor(Shape.Create(batchSize, hiddenSize)));
    exe.AssignTensor(lstm.HX, h0.AsTensor(Shape.Create(batchSize, hiddenSize)));

    var input = mfr.GetDoubleArray("X").Select(n => (float)n).ToArray();
    exe.AssignTensor(x, input.AsTensor(Shape.Create(seqLength, batchSize, inputSize)));

    var w = mfr.GetDoubleArray("W").Select(n => (float)n).ToArray();
    w.AsTensor(Shape.Create(inputSize + hiddenSize + 1, 4 * hiddenSize)).Print();
    exe.AssignTensor(lstm.W, w.AsTensor(Shape.Create(inputSize + hiddenSize + 1, 4 * hiddenSize)));

    exe.Forward();

    // compare forward results against the reference values from the .mat file
    var H = mfr.GetDoubleArray("H").Select(n => (float)n).ToArray();
    H.AsTensor(Shape.Create(seqLength * batchSize, hiddenSize)).Print();
    var myH = exe.GetTensor(lstm.Y).ToArray();
    myH.AsTensor(Shape.Create(seqLength * batchSize, hiddenSize)).Print();
    AreClose(H, myH, 1e-6);

    var CN = mfr.GetDoubleArray("cn").Select(n => (float)n).ToArray();
    CN.AsTensor(Shape.Create(batchSize, hiddenSize)).Print();
    var myCN = exe.GetTensor(lstm.CY).ToArray();
    myCN.AsTensor(Shape.Create(batchSize, hiddenSize)).Print();
    AreClose(CN, myCN, 1e-6);

    var HN = mfr.GetDoubleArray("hn").Select(n => (float)n).ToArray();
    HN.AsTensor(Shape.Create(batchSize, hiddenSize)).Print();
    var myHN = exe.GetTensor(lstm.HY).ToArray();
    myHN.AsTensor(Shape.Create(batchSize, hiddenSize)).Print();
    AreClose(HN, myHN, 1e-6);

    // backward pass against reference gradients
    var dH = mfr.GetDoubleArray("dH").Select(n => (float)n).ToArray();
    exe.AssignGradient(lstm.Y, dH.AsTensor(Shape.Create(seqLength, batchSize, hiddenSize)), replace: true);
    exe.Backward();

    var dX = mfr.GetDoubleArray("dX").Select(n => (float)n).ToArray();
    dX.AsTensor(Shape.Create(seqLength * batchSize, inputSize)).Print();
    var dXmy = exe.GetGradient(lstm.X).ToArray();
    dXmy.AsTensor(Shape.Create(seqLength * batchSize, inputSize)).Print();
    AreClose(dX, dXmy, 1e-6);

    var dW = mfr.GetDoubleArray("dW").Select(n => (float)n).ToArray();
    dW.AsTensor(Shape.Create(inputSize + hiddenSize + 1, 4 * hiddenSize)).Print();
    var dWmy = exe.GetGradient(lstm.W).ToArray();
    dWmy.AsTensor(Shape.Create(lstm.W.Shape.AsArray)).Print();
    AreClose(dW, dWmy, 1e-6);

    var dc0 = mfr.GetDoubleArray("dc0").Select(n => (float)n).ToArray();
    dc0.AsTensor(Shape.Create(batchSize, hiddenSize)).Print();
    var dc0my = exe.GetGradient(lstm.CX).ToArray();
    dc0my.AsTensor(Shape.Create(batchSize, hiddenSize)).Print();
    AreClose(dc0, dc0my, 1e-6);

    var dh0 = mfr.GetDoubleArray("dh0").Select(n => (float)n).ToArray();
    dh0.AsTensor(Shape.Create(batchSize, hiddenSize)).Print();
    var dh0my = exe.GetGradient(lstm.HX).ToArray();
    dh0my.AsTensor(Shape.Create(batchSize, hiddenSize)).Print();
    AreClose(dh0, dh0my, 1e-6);

    ctx.ToGpuContext().Stream.Synchronize();
}
public static void TestLstmAgainstCuDnnVersion()
{
    var ctx = Context.GpuContext(0);
    var inputSize = 5;
    var seqLength = 3;
    var batchSize = 2;
    var hiddenSize = 4;
    var error = 1e-5;

    var data = Context.CpuContext.Eval((2.0f.AsScalar() * RandomUniform<float>(Shape.Create(seqLength, batchSize, inputSize)) - 1.0f.AsScalar())).ToArray3D();
    var h0 = Context.CpuContext.Eval(RandomNormal<float>(Shape.Create(batchSize, hiddenSize))).ToArray2D();
    var c0 = Context.CpuContext.Eval(RandomNormal<float>(Shape.Create(batchSize, hiddenSize))).ToArray2D();
    var dy = Context.CpuContext.Eval((2.0f.AsScalar() * RandomUniform<float>(Shape.Create(seqLength, batchSize, hiddenSize)) - 1.0f.AsScalar())).ToArray3D();

    // shared gate constants: W* input weights, U* recurrent weights, B* biases
    var wi = 0.5f; var wf = 0.4f; var wo = 0.3f; var wa = 0.2f;
    var ui = 0.5f; var uf = 0.4f; var uo = 0.3f; var ua = 0.1f;
    var bi = 0.5f; var bf = 0.4f; var bo = 0.3f; var ba = 0.2f;

    float[,,] y1, y2, dx1, dx2;
    float[,] cy1, cy2, hy1, hy2;
    float[,] dcx1, dcx2, dhx1, dhx2;
    float[,] dw1, dw2;

    {
        // calc with cuDNN
        var x = Variable<float>(PartialShape.Create(seqLength, batchSize, inputSize));
        var lstm = new Rnn<float>(new LstmRnnType(), x, 1, hiddenSize, dropout: 0.0);
        var exe = new Executor(ctx, lstm.Y);
        exe.Initalize();

        // set input
        exe.AssignTensor(lstm.X, data.AsTensor());

        // set states
        exe.AssignTensor(lstm.CX, c0.AsTensor(Shape.Create(1, batchSize, hiddenSize)));
        exe.AssignTensor(lstm.HX, h0.AsTensor(Shape.Create(1, batchSize, hiddenSize)));

        // set weights; cuDNN matrix order: IFAO
        var w = exe.GetTensor(lstm.W).Reshape(inputSize * 4 + hiddenSize * 4 + 2 * 4, hiddenSize);
        var offset = 0;
        // Wi
        ctx.Assign(w.Slice(Range(offset, offset + inputSize)), Fill(Shape.Create(inputSize, hiddenSize), wi));
        offset += inputSize;
        // Wf
        ctx.Assign(w.Slice(Range(offset, offset + inputSize)), Fill(Shape.Create(inputSize, hiddenSize), wf));
        offset += inputSize;
        // Wa
        ctx.Assign(w.Slice(Range(offset, offset + inputSize)), Fill(Shape.Create(inputSize, hiddenSize), wa));
        offset += inputSize;
        // Wo
        ctx.Assign(w.Slice(Range(offset, offset + inputSize)), Fill(Shape.Create(inputSize, hiddenSize), wo));
        offset += inputSize;
        // Ui
        ctx.Assign(w.Slice(Range(offset, offset + hiddenSize)), Fill(Shape.Create(hiddenSize, hiddenSize), ui));
        offset += hiddenSize;
        // Uf
        ctx.Assign(w.Slice(Range(offset, offset + hiddenSize)), Fill(Shape.Create(hiddenSize, hiddenSize), uf));
        offset += hiddenSize;
        // Ua
        ctx.Assign(w.Slice(Range(offset, offset + hiddenSize)), Fill(Shape.Create(hiddenSize, hiddenSize), ua));
        offset += hiddenSize;
        // Uo
        ctx.Assign(w.Slice(Range(offset, offset + hiddenSize)), Fill(Shape.Create(hiddenSize, hiddenSize), uo));
        offset += hiddenSize;
        // Bi
        ctx.Assign(w.Slice(offset), Fill(Shape.Create(1, hiddenSize), bi));
        offset++;
        // Bf
        ctx.Assign(w.Slice(offset), Fill(Shape.Create(1, hiddenSize), bf));
        offset++;
        // Ba
        ctx.Assign(w.Slice(offset), Fill(Shape.Create(1, hiddenSize), ba));
        offset++;
        // Bo
        ctx.Assign(w.Slice(offset), Fill(Shape.Create(1, hiddenSize), bo));

        exe.Forward();

        y1 = exe.GetTensor(lstm.Y).ToArray3D();
        cy1 = exe.GetTensor(lstm.CY).Reshape(batchSize, hiddenSize).ToArray2D();
        hy1 = exe.GetTensor(lstm.HY).Reshape(batchSize, hiddenSize).ToArray2D();

        exe.AssignGradient(lstm.Y, dy.AsTensor(), replace: true);
        exe.Backward();

        dx1 = exe.GetGradient(lstm.X).ToArray3D();
        dcx1 = exe.GetGradient(lstm.CX).Reshape(batchSize, hiddenSize).ToArray2D();
        dhx1 = exe.GetGradient(lstm.HX).Reshape(batchSize, hiddenSize).ToArray2D();

        // rearrange dw into the direct implementation's layout (1 + inputSize + hiddenSize, 4*hiddenSize);
        // each slice must be transposed because cuDNN uses Fortran storage order
        var dwCUDNN = exe.GetGradient(lstm.W).ToArray().AsTensor();
        dw1 = new float[1 + inputSize + hiddenSize, 4 * hiddenSize];
        var dw1Tensor = Reference<float>(dw1);
        var cpu = Context.CpuContext;
        offset = 0;

        // cuDNN order: IFAO
        // Wi
        cpu.Assign(dw1Tensor.Slice(Range(1, inputSize + 1), Range(0, hiddenSize)), dwCUDNN.Slice(Range(offset, offset + inputSize * hiddenSize)).Reshape(hiddenSize, inputSize).T);
        offset += inputSize * hiddenSize;
        // Wf
        cpu.Assign(dw1Tensor.Slice(Range(1, inputSize + 1), Range(hiddenSize, 2 * hiddenSize)), dwCUDNN.Slice(Range(offset, offset + inputSize * hiddenSize)).Reshape(hiddenSize, inputSize).T);
        offset += inputSize * hiddenSize;
        // Wa
        cpu.Assign(dw1Tensor.Slice(Range(1, inputSize + 1), Range(3 * hiddenSize, 4 * hiddenSize)), dwCUDNN.Slice(Range(offset, offset + inputSize * hiddenSize)).Reshape(hiddenSize, inputSize).T);
        offset += inputSize * hiddenSize;
        // Wo
        cpu.Assign(dw1Tensor.Slice(Range(1, inputSize + 1), Range(2 * hiddenSize, 3 * hiddenSize)), dwCUDNN.Slice(Range(offset, offset + inputSize * hiddenSize)).Reshape(hiddenSize, inputSize).T);
        offset += inputSize * hiddenSize;
        // Ui
        cpu.Assign(dw1Tensor.Slice(Range(inputSize + 1, -1), Range(0, hiddenSize)), dwCUDNN.Slice(Range(offset, offset + hiddenSize * hiddenSize)).Reshape(hiddenSize, hiddenSize).T);
        offset += hiddenSize * hiddenSize;
        // Uf
        cpu.Assign(dw1Tensor.Slice(Range(inputSize + 1, -1), Range(hiddenSize, 2 * hiddenSize)), dwCUDNN.Slice(Range(offset, offset + hiddenSize * hiddenSize)).Reshape(hiddenSize, hiddenSize).T);
        offset += hiddenSize * hiddenSize;
        // Ua
        cpu.Assign(dw1Tensor.Slice(Range(inputSize + 1, -1), Range(3 * hiddenSize, 4 * hiddenSize)), dwCUDNN.Slice(Range(offset, offset + hiddenSize * hiddenSize)).Reshape(hiddenSize, hiddenSize).T);
        offset += hiddenSize * hiddenSize;
        // Uo
        cpu.Assign(dw1Tensor.Slice(Range(inputSize + 1, -1), Range(2 * hiddenSize, 3 * hiddenSize)), dwCUDNN.Slice(Range(offset, offset + hiddenSize * hiddenSize)).Reshape(hiddenSize, hiddenSize).T);
        offset += hiddenSize * hiddenSize;
        // Bi
        cpu.Assign(dw1Tensor.Slice(0, Range(0, hiddenSize)), dwCUDNN.Slice(Range(offset, offset + hiddenSize)).Reshape(hiddenSize, 1).T);
        offset += hiddenSize;
        // Bf
        cpu.Assign(dw1Tensor.Slice(0, Range(hiddenSize, 2 * hiddenSize)), dwCUDNN.Slice(Range(offset, offset + hiddenSize)).Reshape(hiddenSize, 1).T);
        offset += hiddenSize;
        // Ba
        cpu.Assign(dw1Tensor.Slice(0, Range(3 * hiddenSize, 4 * hiddenSize)), dwCUDNN.Slice(Range(offset, offset + hiddenSize)).Reshape(hiddenSize, 1).T);
        offset += hiddenSize;
        // Bo
        cpu.Assign(dw1Tensor.Slice(0, Range(2 * hiddenSize, 3 * hiddenSize)), dwCUDNN.Slice(Range(offset, offset + hiddenSize)).Reshape(hiddenSize, 1).T);
    }

    {
        // calc with direct LSTM implementation
        var x = Variable<float>(PartialShape.Create(seqLength, batchSize, inputSize));
        var lstm = new Lstm<float>(x, hiddenSize, forgetBiasInit: 0.0);
        var exe = new Executor(ctx, lstm.Y);
        exe.Initalize();

        // set input
        exe.AssignTensor(lstm.X, data.AsTensor());

        // set states
        exe.AssignTensor(lstm.CX, c0.AsTensor());
        exe.AssignTensor(lstm.HX, h0.AsTensor());

        // set weights; direct layout is (1 + inputSize + hiddenSize, 4*hiddenSize), IFOA columns
        var w = exe.GetTensor(lstm.W);
        // Wi
        ctx.Assign(w.Slice(Range(1, inputSize + 1), Range(0, hiddenSize)), Fill(Shape.Create(inputSize, hiddenSize), wi));
        // Wf
        ctx.Assign(w.Slice(Range(1, inputSize + 1), Range(hiddenSize, 2 * hiddenSize)), Fill(Shape.Create(inputSize, hiddenSize), wf));
        // Wo
        ctx.Assign(w.Slice(Range(1, inputSize + 1), Range(2 * hiddenSize, 3 * hiddenSize)), Fill(Shape.Create(inputSize, hiddenSize), wo));
        // Wa
        ctx.Assign(w.Slice(Range(1, inputSize + 1), Range(3 * hiddenSize, 4 * hiddenSize)), Fill(Shape.Create(inputSize, hiddenSize), wa));
        // Ui
        ctx.Assign(w.Slice(Range(inputSize + 1, -1), Range(0, hiddenSize)), Fill(Shape.Create(hiddenSize, hiddenSize), ui));
        // Uf
        ctx.Assign(w.Slice(Range(inputSize + 1, -1), Range(hiddenSize, 2 * hiddenSize)), Fill(Shape.Create(hiddenSize, hiddenSize), uf));
        // Uo
        ctx.Assign(w.Slice(Range(inputSize + 1, -1), Range(2 * hiddenSize, 3 * hiddenSize)), Fill(Shape.Create(hiddenSize, hiddenSize), uo));
        // Ua
        ctx.Assign(w.Slice(Range(inputSize + 1, -1), Range(3 * hiddenSize, 4 * hiddenSize)), Fill(Shape.Create(hiddenSize, hiddenSize), ua));
        // Bi
        ctx.Assign(w.Slice(0, Range(0, hiddenSize)), Fill(Shape.Create(1, hiddenSize), bi));
        // Bf
        ctx.Assign(w.Slice(0, Range(hiddenSize, 2 * hiddenSize)), Fill(Shape.Create(1, hiddenSize), bf));
        // Bo
        ctx.Assign(w.Slice(0, Range(2 * hiddenSize, 3 * hiddenSize)), Fill(Shape.Create(1, hiddenSize), bo));
        // Ba
        ctx.Assign(w.Slice(0, Range(3 * hiddenSize, 4 * hiddenSize)), Fill(Shape.Create(1, hiddenSize), ba));

        exe.Forward();

        y2 = exe.GetTensor(lstm.Y).ToArray3D();
        cy2 = exe.GetTensor(lstm.CY).ToArray2D();
        hy2 = exe.GetTensor(lstm.HY).ToArray2D();

        exe.AssignGradient(lstm.Y, dy.AsTensor(), replace: true);
        exe.Backward();

        dx2 = exe.GetGradient(lstm.X).ToArray3D();
        dcx2 = exe.GetGradient(lstm.CX).Reshape(batchSize, hiddenSize).ToArray2D();
        dhx2 = exe.GetGradient(lstm.HX).Reshape(batchSize, hiddenSize).ToArray2D();
        dw2 = exe.GetGradient(lstm.W).ToArray2D();
    }

    AreClose(y1, y2, error);
    AreClose(cy1, cy2, error);
    AreClose(hy1, hy2, error);
    AreClose(dx1, dx2, error);
    AreClose(dcx1, dcx2, error);
    AreClose(dhx1, dhx2, error);
    AreClose(dw1, dw2, error);
}
public static void RnnAgainstRnnDynamic()
{
    var ctx = Context.GpuContext(0);
    var inputSize = 5;
    var seqLength = 3;
    var batchSize = 2;
    var hiddenSize = 4;
    var error = 1e-5;

    var data = Context.CpuContext.Eval(RandomUniform<float>(-1, 1, Shape.Create(seqLength, batchSize, inputSize))).ToArray3D();
    data.AsTensor(Shape.Create(seqLength * batchSize, inputSize)).Print();
    var h0 = Context.CpuContext.Eval(RandomNormal<float>(Shape.Create(batchSize, hiddenSize))).ToArray2D();
    var c0 = Context.CpuContext.Eval(RandomNormal<float>(Shape.Create(batchSize, hiddenSize))).ToArray2D();
    var dy = Context.CpuContext.Eval(RandomUniform<float>(-1, 1, Shape.Create(seqLength, batchSize, hiddenSize))).ToArray3D();

    float[,,] y1, y2, dx1, dx2;
    float[,] cy1, cy2, hy1, hy2;
    float[,] dcx1, dcx2, dhx1, dhx2;
    float[] dw1, dw2;

    {
        // Rnn with static shapes
        var x = Variable<float>(PartialShape.Create(seqLength, batchSize, inputSize));
        var lstm = new Rnn<float>(new LstmRnnType(), x, 1, hiddenSize, dropout: 0.0);
        var exe = new Executor(ctx, lstm.Y);
        exe.Initalize();

        // set input
        exe.AssignTensor(lstm.X, data.AsTensor());

        // set states
        exe.AssignTensor(lstm.CX, c0.AsTensor(Shape.Create(1, batchSize, hiddenSize)));
        exe.AssignTensor(lstm.HX, h0.AsTensor(Shape.Create(1, batchSize, hiddenSize)));

        // set weights, cuDNN matrix order: IFAO
        var w = exe.GetTensor(lstm.W).Reshape(inputSize * 4 + hiddenSize * 4 + 2 * 4, hiddenSize);
        SetWeights(ctx, w, inputSize, hiddenSize);

        exe.Forward();

        y1 = exe.GetTensor(lstm.Y).ToArray3D();
        cy1 = exe.GetTensor(lstm.CY).Reshape(batchSize, hiddenSize).ToArray2D();
        hy1 = exe.GetTensor(lstm.HY).Reshape(batchSize, hiddenSize).ToArray2D();

        exe.AssignGradient(lstm.Y, dy.AsTensor(), replace: true);
        exe.Backward();

        dx1 = exe.GetGradient(lstm.X).ToArray3D();
        dcx1 = exe.GetGradient(lstm.CX).Reshape(batchSize, hiddenSize).ToArray2D();
        dhx1 = exe.GetGradient(lstm.HX).Reshape(batchSize, hiddenSize).ToArray2D();
        dw1 = exe.GetGradient(lstm.W).ToArray(); // cuDNN weight is a 1D linear blob
    }

    {
        // RnnDynamic: seqLength and batch deferred to runtime
        var x = Variable<float>(PartialShape.Create(-1, -1, inputSize));
        var lstm = new RnnDynamic<float>(new LstmRnnType(), x, 1, hiddenSize, dropout: 0.0);
        var exe = new Executor(ctx, lstm.Y);
        exe.Initalize();

        // set input
        exe.AssignTensor(lstm.X, data.AsTensor());

        // set states
        exe.AssignTensor(lstm.CX, c0.AsTensor(Shape.Create(1, batchSize, hiddenSize)));
        exe.AssignTensor(lstm.HX, h0.AsTensor(Shape.Create(1, batchSize, hiddenSize)));

        // set weights, cuDNN matrix order: IFAO
        var w = exe.GetTensor(lstm.W).Reshape(inputSize * 4 + hiddenSize * 4 + 2 * 4, hiddenSize);
        SetWeights(ctx, w, inputSize, hiddenSize);

        exe.Forward();

        y2 = exe.GetTensor(lstm.Y).ToArray3D();
        cy2 = exe.GetTensor(lstm.CY).Reshape(batchSize, hiddenSize).ToArray2D();
        hy2 = exe.GetTensor(lstm.HY).Reshape(batchSize, hiddenSize).ToArray2D();

        exe.AssignGradient(lstm.Y, dy.AsTensor(), replace: true);
        exe.Backward();

        dx2 = exe.GetGradient(lstm.X).ToArray3D();
        dcx2 = exe.GetGradient(lstm.CX).Reshape(batchSize, hiddenSize).ToArray2D();
        dhx2 = exe.GetGradient(lstm.HX).Reshape(batchSize, hiddenSize).ToArray2D();
        dw2 = exe.GetGradient(lstm.W).ToArray();
    }

    AreClose(y1, y2, error);
    AreClose(cy1, cy2, error);
    AreClose(hy1, hy2, error);
    AreClose(dx1, dx2, error);
    AreClose(dcx1, dcx2, error);
    AreClose(dhx1, dhx2, error);
    AreClose(dw1, dw2, error);
}
public SequenceDecoderWithAttention(int encoderOutputSize)
{
    // encoder output shape: (maxSeqLength, batch, encoderOutputSize), first two dims deferred
    EncoderOutput = Variable<T>(PartialShape.Create(-1, -1, encoderOutputSize));
}
public Rnn(RnnType ty, Variable<T> x, int numLayers, int hiddenSize, bool isTraining = true, double dropout = 0.0, ulong dropoutSeed = 1337UL)
{
    Type = ty;
    IsTraining = isTraining;
    NumLayers = numLayers;
    HiddenSize = hiddenSize;
    Dropout = isTraining ? dropout : 0.0;
    DropoutSeed = dropoutSeed;

    // X shape: (seqLength, batch, inputSize); all dimensions must be known
    X = x;
    Util.EnsureEqual(3, X.Shape.Rank, "Input layout: (seqLength, batch, inputSize)");
    Util.EnsureTrue(X.Shape[0] >= 0, "Input layout: (seqLength, batch, inputSize)");
    Util.EnsureTrue(X.Shape[1] >= 0, "Input layout: (seqLength, batch, inputSize)");
    Util.EnsureTrue(X.Shape[2] >= 0, "Input layout: (seqLength, batch, inputSize)");
    SeqLength = (int)X.Shape[0];
    MiniBatch = (int)X.Shape[1];
    InputSize = (int)X.Shape[2];

    // Y shape: (seqLength, batch, hiddenSize)
    Y = Variable<T>(PartialShape.Create(SeqLength, MiniBatch, HiddenSize));

    // W shape will be determined during initialization
    W = Parameter<T>();

    // state variables
    var shape = PartialShape.Create(NumLayers, MiniBatch, HiddenSize);
    var strides = Strides.Create(shape[1] * shape[2], shape[2], 1); // innermost dimension changes fastest
    HX = Variable<T>(shape);
    CX = Variable<T>(shape);
    HY = Variable<T>(shape);
    CY = Variable<T>(shape);
    StateDesc = new TensorDescriptor();
    StateDesc.SetND(Dnn.DataTypeOf<T>(), shape.AsInt32Array, strides.AsInt32Array);

    // xDesc is an array with one descriptor per step
    shape = PartialShape.Create(MiniBatch, InputSize, 1);
    strides = Strides.Create(shape[1] * shape[2], shape[2], 1);
    var xDesc = new TensorDescriptor();
    xDesc.SetND(Dnn.DataTypeOf<T>(), shape.AsInt32Array, strides.AsInt32Array);
    XDesc = Enumerable.Repeat(xDesc, SeqLength).ToArray();

    // yDesc is an array with one descriptor per step
    shape = PartialShape.Create(MiniBatch, HiddenSize, 1);
    strides = Strides.Create(shape[1] * shape[2], shape[2], 1);
    var yDesc = new TensorDescriptor();
    yDesc.SetND(Dnn.DataTypeOf<T>(), shape.AsInt32Array, strides.AsInt32Array);
    YDesc = Enumerable.Repeat(yDesc, SeqLength).ToArray();

    // construct the graph
    AddInput(X);
    AddInput(W);
    AddOutput(Y);
    AddAuxVar(HX);
    AddAuxVar(CX);
    AddAuxVar(HY);
    AddAuxVar(CY);
    AddAuxVar(DropoutStates);
    AddAuxVar(Workspace);
    AddAuxVar(ReserveSpace);
}