public static Model ConvolutionalNeuralNetworkModel()
{
    var images = Variable<float>();
    var labels = Variable<float>();

    // Reshape the flat input into (batch, channels, height, width) for 28x28 MNIST images.
    ILayer<float> net = new Reshape<float>(images, PartialShape.Create(-1, 1, 28, 28));

    // First convolution block: 5x5 kernels, 16 feature maps, ReLU, 2x2 max pooling with stride 2.
    net = new Convolution2D<float>(net.Output, 5, 5, 16);
    net = new ActivationReLU<float>(net.Output);
    net = new Pooling2D<float>(net.Output, PoolingMode.MAX, 2, 2, 2, 2);

    // Second convolution block: 5x5 kernels, 32 feature maps, tanh, 2x2 max pooling with stride 2.
    net = new Convolution2D<float>(net.Output, 5, 5, 32);
    net = new ActivationTanh<float>(net.Output);
    net = new Pooling2D<float>(net.Output, PoolingMode.MAX, 2, 2, 2, 2);

    // Flatten all non-batch dimensions into one feature dimension for the dense layers.
    net = new Reshape<float>(net.Output, PartialShape.Create(-1, net.Output.Shape.Skip(1).Aggregate(ScalarOps.Mul)));
    net = new FullyConnected<float>(net.Output, 50);
    net = new ActivationTanh<float>(net.Output);
    net = new FullyConnected<float>(net.Output, 10);

    return new Model
    {
        Loss = new SoftmaxCrossEntropy<float>(net.Output, labels),
        Images = images,
        Labels = labels
    };
}
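// A quick sanity check of the flatten size above. This is a standalone sketch, not part of
// the model code, and it assumes the Convolution2D layers use "valid" (no-padding)
// convolution while each 2x2/stride-2 max pooling halves the spatial dimensions: for a
// 28x28 input that gives 24x24 -> 12x12 -> 8x8 -> 4x4, so the Reshape flattens
// 32 * 4 * 4 = 512 features per image.
public static void PrintFlattenedFeatureCount()
{
    int h = 28, w = 28;
    (h, w) = (h - 5 + 1, w - 5 + 1);  // conv 5x5, valid: 24x24
    (h, w) = (h / 2, w / 2);          // max pool 2x2, stride 2: 12x12
    (h, w) = (h - 5 + 1, w - 5 + 1);  // conv 5x5, valid: 8x8
    (h, w) = (h / 2, w / 2);          // max pool 2x2, stride 2: 4x4
    System.Console.WriteLine($"flattened features = {32 * h * w}");  // 512
}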
public Attention(Variable<T> encoderHiddenStates, Variable<T> decoderHiddenStates, long attentionDim)
{
    AttentionDim = attentionDim;
    EncoderHiddenStates = encoderHiddenStates;
    DecoderHiddenStates = decoderHiddenStates;

    // One goal is to keep batchSize and encoderSeqLength unknown at the symbolic layer,
    // so that in the LSTM outer op we can create one graph and one sub-executor and apply
    // them to different encoderSeqLength and batchSize values.
    Util.EnsureEqual(3, EncoderHiddenStates.Shape.Rank, "states layout: (encoderSeqLength, batch, encoderHiddenSize)");
    Util.EnsureTrue(EncoderHiddenStates.Shape[2] > 0, "states should be determined.");
    EncoderHiddenSize = EncoderHiddenStates.Shape[2];

    Util.EnsureEqual(2, DecoderHiddenStates.Shape.Rank, "DecoderHiddenStates layout: (batch, decoderHiddenSize)");
    Util.EnsureTrue(DecoderHiddenStates.Shape[1] > 0, "DecoderHiddenStates should be determined.");
    DecoderHiddenSize = DecoderHiddenStates.Shape[1];

    // Uniform initialization scaled by sqrt(12 / (fanIn + fanOut)), i.e. Glorot-style bounds.
    var scaleWh = Sqrt(12.0.AsScalar<T>() / ((double)(AttentionDim + EncoderHiddenSize)).AsScalar<T>());
    Wh = Parameter(scaleWh * (RandomUniform<T>(Shape.Create(EncoderHiddenSize, AttentionDim), 0UL, 0UL) - 0.5.AsScalar<T>()));

    var scaleWd = Sqrt(12.0.AsScalar<T>() / ((double)(AttentionDim + DecoderHiddenSize)).AsScalar<T>());
    Wd = Parameter(scaleWd * (RandomUniform<T>(Shape.Create(DecoderHiddenSize, AttentionDim), 0UL, 0UL) - 0.5.AsScalar<T>()));

    var scaleV = Sqrt(12.0.AsScalar<T>() / ((double)AttentionDim).AsScalar<T>());
    V = Parameter(scaleV * (RandomUniform<T>(Shape.Create(AttentionDim, 1), 0UL, 0UL) - 0.5.AsScalar<T>()));

    // Build the graph.
    var h = EncoderHiddenStates;                          // (n, b, He), He = encoder hidden size
    var d = DecoderHiddenStates;                          // (b, Hd), Hd = decoder hidden size
    var whh = Dot(h.Reshape(-1, EncoderHiddenSize), Wh);  // (n*b, K), K = attentionDim
    var wdd = Dot(d, Wd);                                 // (b, K)

    // To add whh and wdd we need broadcasting, and for that we need to know at least n or b.
    // The decision here is to make b known at the symbolic layer, because that leaves
    // flexibility on n (encoderSeqLength), which makes bucketing easier.
    // Another issue is that our backward of add has trouble with broadcast 3d arrays,
    // so we reshape into 2d tensors here:
    //   initial shape:             (n*b, K) + (b, K)
    //   reshape for the broadcast: (n, b*K) + (b*K)    ((b*K) broadcasts to (1, b*K))
    //   then:                      (n, b*K) + (b*K) = (n, b*K)
    //   finally reshape back to:   (n*b, K)
    BatchSize = EncoderHiddenStates.Shape[1];
    Util.EnsureTrue(BatchSize > 0, "Batch size needs to be determined.");
    Util.EnsureTrue(BatchSize == DecoderHiddenStates.Shape[0]);
    var sum = (whh.Reshape(-1, BatchSize * AttentionDim) + wdd.Reshape(-1)).Reshape(-1, AttentionDim);

    // tanh, shape unchanged: (n*b, K)
    var whd = new ActivationTanh<T>(sum);

    // (n*b, K) dot (K, 1) = (n*b, 1), then reshape to (n, b)
    var u = Dot(whd.Output, V).Reshape(-1, BatchSize);

    // softmax over the sequence dimension, same shape (n, b)
    var softmax = new Softmax<T>(u);

    // weighted sum over n: (n, b) scores against (n, b, He) states, reduced to (b, He)
    var reduce = new AttentionReduce<T>(softmax.Output.Reshape(-1, BatchSize), h);
    Output = reduce.Output;
}
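// The reshape-for-broadcast trick above can be checked with plain arrays. This is a
// standalone sketch, independent of the library: adding whh of shape (n*b, K) and wdd of
// shape (b, K) row-wise is the same as viewing whh as (n, b*K), adding wdd as one flat
// (b*K) row broadcast across all n rows, and reading the result back as (n*b, K).
public static float[] AddWithBroadcastTrick(float[] whh, float[] wdd, int n, int b, int k)
{
    // whh holds n*b*k elements laid out as (n, b, k); wdd holds b*k elements laid out as (b, k).
    var sum = new float[n * b * k];
    for (var i = 0; i < n; i++)          // rows of the (n, b*K) view, one per encoder step
        for (var j = 0; j < b * k; j++)  // the flat (b*K) row added to every row
            sum[i * b * k + j] = whh[i * b * k + j] + wdd[j];
    return sum;                          // interpret as (n*b, K)
}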