public void SumProductWithSharedHasCorrectGrad()
{
    // sequence of input
    var xs = T.Matrix<float>("xs");
    // accumulator
    var z = T.Vector<float>("z");
    var b = T.Vector<float>("b");

    // sum xs in the accumulator
    Func<Tensor<float>, Tensor<float>, IList<Tensor<float>>> rec =
        (x, a) => new List<Tensor<float>>() { x + a, x * a + b };
    var loop = T.Scan(rec, xs, new[] { z, null });

    // get the last value
    var prod = loop[1][-1];
    var cost = T.Sum(prod);

    //var dz = T.Grad(cost, z);
    var db = T.Grad(cost, b);

    var reshape = db as Reshaping<float>;
    var sum = reshape.x as Sum<float>;
    Assert.AreEqual(0, sum.Axis);
    var dfor = sum.x as Tensor<float>.For;
    var backLoop = dfor.Loop;

    Assert.AreEqual(3, backLoop.Sequences.Count);
    Assert.AreEqual(4, backLoop.Fors.Count);
    Assert.AreEqual(2, dfor.Index);

    // TODO: check why a recursive was expected
    //var db_ = dfor.RecursiveVariable;
    //Assert.AreEqual("db_", db_.Name);

    var variables = backLoop.Variables.Cast<Tensor<float>>().ToList();
    var x_ = variables[0];
    Assert.AreEqual("x_", x_.Name);
    var a_ = variables[1];
    Assert.AreEqual("a_", a_.Name);
    var d_f1_ = variables[2];
    Assert.AreEqual("delta_f1_", d_f1_.Name);
    var da_ = variables[4];
    Assert.AreEqual("da_", da_.Name);

    var dx = (Tensor<float>)backLoop.Fors[0].Expression;
    var da = (Tensor<float>)backLoop.Fors[1].Expression;

    Assert.IsTrue((d_f1_ * a_ + da_).StructuralEquality(dx));
    Assert.IsTrue((d_f1_ * x_ + da_).StructuralEquality(da));
    Assert.IsTrue(d_f1_.StructuralEquality(dfor.Expression));
}
public void SumProductHasCorrectGrad()
{
    // sequence of input
    var xs = T.Matrix<float>("xs");
    // accumulator
    var z = T.Vector<float>("z");

    // sum xs in the accumulator
    Func<Tensor<float>, Tensor<float>, IList<Tensor<float>>> rec =
        (x, a) => new List<Tensor<float>>() { x + a, x * a };
    var loop = T.Scan(rec, xs, new[] { z, null });

    // get the last value
    var prod = loop[1][-1];
    var cost = T.Sum(prod);
    var dz = T.Grad(cost, z);

    var slicing = dz as Slicing<float>;
    Assert.AreEqual(1, slicing.Slices.Count);
    Assert.IsTrue(slicing.Slices[0].IsSingleton);
    Assert.AreEqual(-1, ((Scalar<int>.Const)slicing.Slices[0].Start).Value);

    var dfor = slicing.x as Tensor<float>.For;
    var backLoop = dfor.Loop;

    Assert.AreEqual(3, backLoop.Sequences.Count);
    Assert.AreEqual(3, backLoop.Fors.Count);
    Assert.AreEqual(1, dfor.Index);

    var variables = backLoop.Variables.Cast<Tensor<float>>().ToList();
    var x_ = variables[0];
    Assert.AreEqual("x_", x_.Name);
    var a_ = variables[1];
    Assert.AreEqual("a_", a_.Name);
    var d_f1_ = variables[2];
    Assert.AreEqual("delta_f1_", d_f1_.Name);
    var da_ = variables[4];
    Assert.AreEqual("da_", da_.Name);

    var dx = (Tensor<float>)backLoop.Fors[0].Expression;
    var da = (Tensor<float>)backLoop.Fors[1].Expression;

    Assert.IsTrue((d_f1_ * a_ + da_).StructuralEquality(dx));
    Assert.IsTrue((d_f1_ * x_ + da_).StructuralEquality(da));
}
public void SumHasCorrectGrad()
{
    // sequence of input
    var xs = T.Matrix<float>("xs");
    // accumulator
    var z = T.Vector<float>("z");

    // sum xs in the accumulator
    var partialSums = T.Scan((x, a) => x + a, xs, z);
    // get the last value
    var sum = partialSums[-1];
    var cost = T.Sum(sum * sum);
    var dz = T.Grad(cost, z);

    var slicing = dz as Slicing<float>;
    Assert.AreEqual(1, slicing.Slices.Count);
    Assert.IsTrue(slicing.Slices[0].IsSingleton);
    Assert.AreEqual(-1, ((Scalar<int>.Const)slicing.Slices[0].Start).Value);

    var dfor = slicing.x as Tensor<float>.For;
    var backLoop = dfor.Loop;

    Assert.AreEqual(3, backLoop.Sequences.Count);
    Assert.AreEqual(3, backLoop.Fors.Count);
    Assert.AreEqual(1, dfor.Index);

    var variables = backLoop.Variables.Cast<Tensor<float>>().ToList();
    var x_ = variables[0];
    Assert.AreEqual("x_", x_.Name);
    var a_ = variables[1];
    Assert.AreEqual("a_", a_.Name);
    var delta_a_ = variables[2];
    Assert.AreEqual("delta_a_", delta_a_.Name);
    var dx_ = variables[3];
    Assert.AreEqual("dx_", dx_.Name);
    var da_ = variables[4];
    Assert.AreEqual("da_", da_.Name);

    var dx = (Tensor<float>)backLoop.Fors[0].Expression;
    var da = (Tensor<float>)backLoop.Fors[1].Expression;

    Assert.IsTrue((delta_a_ + da_).StructuralEquality(dx));
    Assert.IsTrue((delta_a_ + da_).StructuralEquality(da));
}
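// A minimal sketch (not part of the original tests) of what the forward pass above computes,
// using the T.Function overloads seen in SumProductWithSharedCanTrain below.
// T.Scan((x, a) => x + a, xs, z) produces the running sums of the rows of xs starting from z,
// so with the illustrative values xs = [[1, 2], [3, 4]] and z = [0, 0] the last partial sum is
// [4, 6] and T.Sum(sum * sum) evaluates to 16 + 36 = 52.
public void SumForwardPassSketch()
{
    var xs = T.Matrix<float>("xs");
    var z = T.Vector<float>("z");
    var partialSums = T.Scan((x, a) => x + a, xs, z);
    var sum = partialSums[-1];
    var cost = T.Sum(sum * sum);

    var costFunction = T.Function(input: (xs, z), output: cost);
    var xs_ = NN.Array(new float[,] { { 1, 2 }, { 3, 4 } }); // hypothetical values for illustration
    var z_ = NN.Zeros(2);
    var result = costFunction(xs_, z_); // expected: 52
}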
public void SumProductWithSharedCanTrain()
{
    var n = 2;
    // sequence of input
    var xs = T.Matrix<float>("xs");
    // accumulator
    var z = T.Vector<float>("z");
    var b = T.Shared(NN.Ones(n), "b");

    // sum xs in the accumulator
    Func<Tensor<float>, Tensor<float>, IList<Tensor<float>>> rec =
        (x, a) => new List<Tensor<float>>() { x + a, x * a + b };
    var loop = T.Scan(rec, xs, new[] { z, null });

    // get the last value
    var prod = loop[1][-1];

    // compute the cost and the gradient for the shared b.
    var cost = T.Sum(prod);
    var db = T.Grad(cost, b);

    var costFunction = T.Function(input: (xs, z), output: cost);
    var xs_ = NN.Array(new float[,] { { 1, -1 }, { 0, -2 } });
    var z_ = NN.Zeros(n);
    var cost_xs_z = costFunction(xs_, z_);
    Assert.AreEqual(4, cost_xs_z);

    var updates = new OrderedDictionary { { b, b - 0.05f * db } };
    var train = T.Function(input: (xs, z), output: cost, updates: updates);
    var cost_xs_z2 = train(xs_, z_);
    AssertArray.AreAlmostEqual(NN.Array(new[] { 0.95f, 0.95f }), b.Value);
}
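// Why the asserted values above hold (worked through by hand; not part of the original test):
// the accumulator only feeds the first output, so loop[0][t] is the running sum of the rows of xs
// and loop[1][t] = xs[t] * loop[0][t - 1] + b. With xs = [[1, -1], [0, -2]] and z = [0, 0],
// the last product is [0, -2] * [1, -1] + [1, 1] = [1, 3], hence cost = 4.
// The gradient of the cost w.r.t. b is [1, 1], so one update step b - 0.05f * db
// moves b from [1, 1] to [0.95, 0.95], which is what AssertArray.AreAlmostEqual checks.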
public void RnnXorHasCorrectGradient()
{
    NN.Random.Seed(12345);
    int nh = 10; // hidden layer

    var Wbit = T.Shared(0.2f * NN.Random.Uniform(-1.0f, 1.0f, nh, 1).As<float>(), "Wbit");
    var Wstate = T.Shared(NN.Eye<float>(nh), "Wstate");
    var Wout = T.Shared(0.2f * NN.Random.Uniform(-1.0f, 1.0f, 1, nh).As<float>(), "Wout");
    var b = T.Shared(0.2f * NN.Random.Uniform(-1.0f, 1.0f, nh, 1).As<float>(), "b");
    var state0 = T.Shared(NN.Zeros<float>(nh, 1), "state0");

    var bits = T.Tensor3<float>("bits");        // n x 1
    var expected = T.Matrix<float>("expected"); // 1 x 1

    Func<Tensor<float>, Tensor<float>, Tensor<float>> recurrence =
        (bit, oldState) => T.Tanh(T.Dot(Wbit, bit) + T.Dot(Wstate, oldState) + b);

    var states = T.Scan(fn: recurrence, sequence: bits, outputsInfo: state0);
    var output = T.Tanh(T.Dot(Wout, states[(Slice)(-1)]));
    var error = 0.5f * T.Norm2(output - expected);

    var classify = T.Function(bits, output);

    var gradients = T.Grad(error);
    var gradWstate = gradients[Wstate];
    Assert.IsNotNull(gradWstate);
    var gradWstateIsReshape = gradWstate as Reshaping<float>;
    Assert.IsNotNull(gradWstateIsReshape);
    var gradWstateIsSum = gradWstateIsReshape.x as Sum<float>;
    Assert.IsNotNull(gradWstateIsSum);
    var dfor = gradWstateIsSum.x as Tensor<float>.For;
    var backLoop = dfor.Loop;

    Assert.AreEqual(3, backLoop.Sequences.Count); // bit, states, delta
    Assert.AreEqual(6, backLoop.Fors.Count);      // dbit, dstate, dWstate, db, dWbit, dstate_p1
    Assert.AreEqual(3, dfor.Index);

    // TODO: check why a recursive was expected
    //var dWstate_ = dfor.RecursiveVariable;
    //Assert.AreEqual("dWstate_", dWstate_.Name);

    var variables = backLoop.Variables.Cast<Tensor<float>>().ToList();
    var bit_ = variables[0];
    Assert.AreEqual("bit_", bit_.Name);
    var oldState_ = variables[1];
    Assert.AreEqual("oldState_", oldState_.Name);
    var delta_oldState_ = variables[2];
    Assert.AreEqual("delta_oldState_", delta_oldState_.Name);
    var dbit_ = variables[3];
    Assert.AreEqual("dbit_", dbit_.Name);
    var doldState_ = variables[4];
    Assert.AreEqual("doldState_", doldState_.Name);
    var oldState_tp1_ = variables[5];
    Assert.AreEqual("oldState_tp1", oldState_tp1_.Name);

    var d = T.Sum((delta_oldState_ + doldState_) * (1f - T.Square(oldState_tp1_)), axis: 1, keepDims: true);

    var doldState = (Tensor<float>)backLoop.Fors[1].Expression;
    T.Dot(Wstate, d, transposeX: true).AssertEqual(doldState);

    var dWstate = (Tensor<float>)backLoop.Fors[3].Expression;
    var dWstateExp = T.Dot(d, oldState_, transposeY: true);
    dWstateExp.AssertEqual(dWstate);

    var dbit = (Tensor<float>)backLoop.Fors[0].Expression;
    T.Dot(Wbit, d, transposeX: true).StructuralEquality(dbit);

    var oldState_tp1 = (Tensor<float>)backLoop.Fors[5].Expression;
    oldState_tp1.AssertEqual(oldState_);
}
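// The expressions checked above follow from backpropagation through the tanh recurrence
// (sketched here for reference; not part of the original test). For
//   s_t = tanh(Wbit . bit_t + Wstate . s_{t-1} + b)
// the incoming gradient on s_t, i.e. (delta_oldState_ + doldState_), is first pushed through tanh:
//   d = (delta_oldState_ + doldState_) * (1 - s_t^2)
// (the T.Sum over axis 1 in the test keeps d as an nh x 1 column), and is then distributed
// to the inputs of the affine map:
//   doldState = Wstate^T . d,   dWstate = d . s_{t-1}^T,   dbit = Wbit^T . d,
// which are exactly the expressions the backLoop.Fors entries are compared against.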
/// <summary></summary>
/// <param name="inputDim">dimension of the input vectors</param>
/// <param name="hiddenDim">dimension of the hidden layer</param>
/// <param name="outputDim">dimension of the output vector</param>
/// <param name="scale">scaling factor to initialize weights</param>
public GRU(int inputDim, int hiddenDim, int outputDim, float scale = 0.2f)
{
    // initial hidden state
    h0 = T.Shared(NN.Zeros<float>(hiddenDim), "h0");

    // reset gate layers
    Wr = T.Shared(NN.Random.Uniform(-scale, scale, inputDim, hiddenDim), "Wr");
    Ur = T.Shared(NN.Eye<float>(hiddenDim), "Ur");
    br = T.Shared(NN.Zeros<float>(/*1,*/ hiddenDim), "br");

    // update gate layers
    Wz = T.Shared(NN.Random.Uniform(-scale, scale, inputDim, hiddenDim), "Wz");
    Uz = T.Shared(NN.Eye<float>(hiddenDim), "Uz");
    bz = T.Shared(NN.Zeros<float>(/*1,*/ hiddenDim), "bz");

    // layers
    W = T.Shared(NN.Random.Uniform(-scale, scale, inputDim, hiddenDim), "W");
    U = T.Shared(NN.Eye<float>(hiddenDim), "U");
    b = T.Shared(NN.Zeros<float>(/*1,*/ hiddenDim), "b");

    // prediction layer
    S = T.Shared(NN.Random.Uniform(-scale, scale, hiddenDim, outputDim), "S");
    Sb = T.Shared(NN.Zeros<float>(/*1,*/ outputDim), "Sb");

    // bundle
    this.@params = new[] { h0, Wr, Ur, br, Wz, Uz, bz, W, U, b, S, Sb };

    // Adagrad shared variables
    this.grads = new Dictionary<string, Tensor<float>.Shared>();
    foreach (var param in @params)
    {
        var name = param.Name + "Grad";
        this.grads[name] = T.Shared(NN.Zeros<float>(param.Value.Shape), name);
    }
    this.hists = new Dictionary<string, Tensor<float>.Shared>();
    foreach (var param in @params)
    {
        var name = param.Name + "Hist";
        this.hists[name] = T.Shared(NN.Zeros<float>(param.Value.Shape), name);
    }

    // Adadelta shared variables
    var hists2 = new Dictionary<string, Tensor<float>.Shared>();
    foreach (var param in @params)
    {
        var name = param.Name + "Hist2";
        hists2[name] = T.Shared(NN.Zeros<float>(param.Value.Shape), name);
    }

    var x = T.Matrix<float>("x"); // [sentence, inputDim]
    var expected = T.Vector<float>("expected");

    Func<Tensor<float>, Tensor<float>, Tensor<float>[]> recurrence = (x_t, h_tm1) =>
    {
        // reset gate
        var r_t = T.Sigmoid(T.Dot(x_t, Wr) + T.Dot(h_tm1, Ur) + br);
        // update gate
        var z_t = T.Sigmoid(T.Dot(x_t, Wz) + T.Dot(h_tm1, Uz) + bz);
        // proposed hidden state
        var _h_t = T.Tanh(T.Dot(x_t, W) + T.Dot(r_t * h_tm1, U) + b);
        // actual hidden state
        var h_t = z_t * h_tm1 + (1 - z_t) * _h_t;
        // return all the intermediate variables because they may be reused by T.Grad
        // to optimize gradient computation
        return new[] { h_t, r_t, z_t, _h_t };
    };

    var h = T.Scan(recurrence, x, new[] { h0, null, null, null })[0][-1];

    // cost and gradients
    var output = T.Dot(h, S) + Sb;
    var error = 0.5f * T.Norm2(output - expected);
    var gradients = T.Grad(error);

    var updatesTrain = new OrderedDictionary();
    foreach (var param in @params)
    {
        var grad = gradients[param];
        //var grad = T.Clip(update.Item2, -10, 10);

        // Adagrad
        //const float eps = 1e-5f;
        var g = grads[param.Name + "Grad"];
        updatesTrain[g] = g + grad;
        //updates[param] = param - lr * grad / T.Sqrt(hist + eps);

        // Adadelta
        //const float rho = 0.95f;
        //const float eps = 1e-5f;
        //var hist = hists[param.Name + "Hist"];
        //var hist2 = hists2[param.Name + "Hist2"];
        //var newHist = rho * hist + (1 - rho) * (grad * grad);
        //updates[hist] = newHist;
        //var newGrad = grad * T.Sqrt((hist2 + eps) / (newHist + eps));
        //updates[param] = param - newGrad;
        //updates[hist2] = rho * hist2 + (1 - rho) * (newGrad * newGrad);

        // Regular
        //updates[param] = param - lr * grad;
    }

    var batchSize = T.Scalar<float>("batchSize");
    var lr = T.Scalar<float>("lr");
    const float eps = 1e-5f;

    var updates = new OrderedDictionary();
    foreach (var param in this.@params)
    {
        var grad = this.grads[param.Name + "Grad"];
        var meanGrad = grad / batchSize;
        var hist = this.hists[param.Name + "Hist"];
        updates[hist] = hist + meanGrad * meanGrad;
        updates[param] = param - lr * meanGrad / T.Sqrt(hist + eps);
        updates[grad] = T.ZerosLike(grad);
    }

    // theano functions
    this.Classify = T.Function(input: x, output: output);
    this.Train = T.Function(input: (x, expected), output: error, updates: updatesTrain);
    this.Update = T.Function(input: (lr, batchSize), updates: updates);
}
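// A minimal sketch (not from the original source) of how the compiled functions above might be
// driven: Train is called once per sample and only accumulates gradients into the "*Grad" shared
// variables; Update then applies a single Adagrad step scaled by the batch size and resets the
// accumulators. The public visibility of Train/Update and the NN.Array input types are assumptions.
public void TrainMinibatchSketch(GRU gru, float[][,] inputs, float[][] targets, float lr)
{
    for (int i = 0; i < inputs.Length; i++)
    {
        var error = gru.Train(NN.Array(inputs[i]), NN.Array(targets[i])); // accumulate gradients
    }
    gru.Update(lr, (float)inputs.Length); // one Adagrad step on the averaged gradients, then reset
}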