Example 1
        public void TestLoop1()
        {
            // Computing tanh(x(t).dot(W) + b) elementwise
            //http://deeplearning.net/software/theano/tutorial/loop.html

            // defining the tensor variables
            var X     = T.Matrix <float>("x");
            var W     = T.Matrix <float>("W");
            var b_sym = T.Vector <float>("b_sym");

            var results             = T.Scan(v => T.Tanh(T.Dot(v, W) + b_sym), sequence: X);
            var compute_elementwise = T.Function(inputs: new[] { X, W, b_sym }, output: results);

            // test values
            var x = NN.Eye <float>(2);
            var w = NN.Ones <float>(2, 2);
            var b = NN.Ones <float>(2);

            b.Item[1] = 2;

            var result   = compute_elementwise(new[] { x, w, b });
            var expected = NN.Tanh(x.Dot(w) + b);

            AssertArray.AreAlmostEqual(expected[0], result[0]);
        }
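
For reference, with the test values above (x = I₂, w a 2×2 matrix of ones, b = (1, 2)) the scan applies the same map to every row of x, so the expected output can be checked by hand:

    y_t = \tanh(x_t W + b), \qquad
    y_0 = y_1 = \tanh([1, 1] + [1, 2]) = \tanh([2, 3]) \approx [0.964,\ 0.995]
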
Example 2
        /// <summary>
        /// Elman recurrent network for sequence labeling.
        /// </summary>
        /// <param name="nh">dimension of the hidden layer</param>
        /// <param name="nc">number of classes</param>
        /// <param name="de">dimension of the word embeddings</param>
        /// <param name="cs">word window context size</param>
        public Elman3(int nh, int nc, int de, int cs)
        {
            // parameters of the model
            var scale = 0.2f;

            this.Wx = T.Shared(scale * NN.Random.Uniform(-1.0f, 1.0f, de * cs, nh), "Wx");
            //this.Wh = T.Shared(scale * NN.Random.Uniform(-1.0f, 1.0f, nh, nh), "Wh");
            this.Wh = T.Shared(NN.Eye <float>(nh), "Wh");
            this.W  = T.Shared(scale * NN.Random.Uniform(-1.0f, 1.0f, nh, nc), "W");
            this.bh = T.Shared(NN.Zeros <float>(nh), "bh");
            this.b  = T.Shared(NN.Zeros <float>(nc), "b");
            this.h0 = T.Shared(NN.Zeros <float>(nh), "h0");

            // bundle
            this.@params = new[] { this.Wx, this.Wh, this.W, this.bh, this.b, this.h0 };

            var x = T.Matrix <float>("x"); // [sentence, de * cs]
            var y = T.Scalar <int>("y");   // label

            Func <Tensor <float>, Tensor <float>, Tensor <float>[]> recurrence = (x_t, h_tm1) =>
            {
                var h_t = T.Sigmoid(T.Dot(x_t, this.Wx) + T.Dot(h_tm1, this.Wh) + this.bh);
                var s_t = T.Softmax(T.Dot(h_t, this.W) + this.b);
                return(new[] { h_t, s_t });
            };

            var result = T.Scan(
                fn: recurrence,
                sequences: x,
                outputsInfo: new[] { this.h0, null }
                /*, n_steps: x.Shape[0]*/);
            var h = result[0];
            var s = result[1];

            var p_y_given_x_lastword = s[-1, /*0,*/ XSlicer._];              // 0 because of Theano's Softmax ?
            var p_y_given_x_sentence = s[XSlicer._, /*0,*/ XSlicer._];
            var y_pred = T.Argmax(p_y_given_x_sentence, axis: 1);

            // cost and gradients and learning rate
            var lr = T.Scalar <float>("lr");

            nll = -T.Mean(T.Log(p_y_given_x_lastword)[y]);
            var gradients = T.Grad(nll);
            var updates   = new OrderedDictionary();

            foreach (var W in @params)
            {
                updates[W] = W - lr * gradients[W];
            }

            // theano functions

            this.classify = T.Function(input: x, output: y_pred);

            this.train = T.Function(input: (x, y, lr),
                                    output: nll,
                                    updates: updates);
        }
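
Written out, the recurrence and the plain SGD update built by this constructor are (σ is the logistic sigmoid):

    h_t = \sigma(x_t W_x + h_{t-1} W_h + b_h), \qquad
    s_t = \mathrm{softmax}(h_t W + b)

    nll = -\log s_T[y], \qquad
    \theta \leftarrow \theta - lr \cdot \frac{\partial\, nll}{\partial \theta}
    \quad \text{for } \theta \in \{W_x, W_h, W, b_h, b, h_0\}
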
Example 3
        public void TestScan2()
        {
            var x = Op.Matrix <float>("x");
            var f = Op.Function(input: x, output: Op.Scan(v => 2f * v, sequence: x));

            var input  = NN.Eye <float>(2);
            var result = f(input);

            AssertArray.AreEqual(2 * input, result);
        }
Example 4
        public void TestScan()
        {
            var x = Matrix <float>("x");
            var f = Function(input: x, output: Scan(v => v, sequence: x));

            var input  = NN.Eye <float>(2);
            var result = f(input);

            AssertArray.AreEqual(input, result);
        }
Example 5
        public void TestScan3()
        {
            var x = Op.Matrix <float>("x");
            var y = Op.Matrix <float>("y");
            var f = Op.Function(input: (x, y), output: Op.Scan((v1, v2) => v1 + v2, sequences: new[] { x, y }));

            var input1 = NN.Eye <float>(2);
            var input2 = 2 * NN.Eye <float>(2);
            var result = f(input1, input2);

            AssertArray.AreEqual(input1 + input2, result);
        }
Example 6
        public void TestScan4()
        {
            var X    = Op.Matrix <float>("X");
            var acc0 = Op.Shared(NN.Zeros <float>(5), "acc0");

            var loop = Op.Scan(fn: (x, acc) => acc + x, sequence: X, outputsInfo: acc0);
            var f    = Op.Function(input: X, output: loop[-1]);

            var input1 = NN.Eye <float>(5);
            var result = f(input1);

            AssertArray.AreEqual(new float[] { 1, 1, 1, 1, 1 }, result);
        }
Example 7
        public void TestDotWithIdentity()
        {
            var a = NN.Ones <float>(4, 5);

            a[_, Upto(-1)] = NN.Eye <float>(4);
            var b = NN.Random.Uniform(-1, 1, 4).As <float>();
            var c = NN.Ones <float>(5);

            c[Upto(4)] = b;

            var ac = a.Dot(c);
            var ab = a.DotWithBias(b);

            AssertArray.AreEqual(ac, ab);
        }
Example 8
        public static Array <float> PowerMethod(Array <float> a)
        {
            // https://en.wikipedia.org/wiki/Moore%E2%80%93Penrose_pseudoinverse

            // init: A(0) = (A^T A + d I)^-1 A^T
            var d      = 1e-6f;
            var result = a.T.Dot(a) + d * NN.Eye(a.Shape[1]);

            Lapack.Inverse(result.Values, result.Shape[0]);
            result = result.Dot(a.T);

            // iterate: A(i+1) = 2A(i) - A(i).A.A(i)
            for (int i = 0; i < 2; i++)
            {
                result = 2 * result - result.Dot(a).Dot(result);
            }
            return(result);
        }
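
The loop above is the Newton–Schulz (hyper-power) iteration for the Moore–Penrose pseudoinverse, written out:

    A_0 = (A^\top A + \delta I)^{-1} A^\top, \qquad
    A_{k+1} = 2 A_k - A_k A A_k

The small ridge δ = 10⁻⁶ keeps the initial inversion well conditioned; for a good enough initial estimate the iteration converges towards A⁺ (only two iterations are run here).
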
Example 9
        public void TestDotWithBias()
        {
            var a  = NN.Zeros <float>(3, 4);
            var id = NN.Eye <float>(3);

            a[_, Upto(-1)] = id;

            var expected = NN.Array <float>(new float[, ] {
                { 1, 0, 0, 0 },
                { 0, 1, 0, 0 },
                { 0, 0, 1, 0 }
            });

            AssertArray.AreAlmostEqual(expected, a);
            //var x = Tensor.Ones(3);
            //var y = Tensor.Ones(3).Scale(2);
            //Assert.AreEqual(x, y);
        }
Example 10
        public void TestPseudoInverse()
        {
            var path  = @"C:\Users\joc\AppData\Local\ProtoStudio\Banque\embeddings.bin";
            var words = Word2Vec.LoadBinary(path, normalize: true).Vectors /*[_, Until(100)]*/;
            //var pseudoInv = PseudoInv(words);
            var pseudoInv = PowerMethod(words);

            // when embeddings have linearly independent dimensions
            AssertArray.AreAlmostEqual(NN.Eye(words.Shape[1]), pseudoInv.Dot(words), 1e-6f, 1e-6f);

            // unlikely to hold: the words are NOT linearly independent
            //AssertArray.AreAlmostEqual(NN.Eye(words.Shape[0]), words.Dot(pseudoInv), 1e-3f, 1e-5f);

            if (words.Shape[0] <= 1000) // skipped for large vocabularies, too slow otherwise
            {
                AssertArray.AreAlmostEqual(words, words.Dot(pseudoInv).Dot(words), 1e-6f, 1e-6f);
            }
            AssertArray.AreAlmostEqual(pseudoInv, pseudoInv.Dot(words).Dot(pseudoInv), 1e-6f, 1e-6f);
        }
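
The assertions correspond to the defining Moore–Penrose identities, the first of which only holds when the embedding dimensions are linearly independent (full column rank):

    A^{+} A = I, \qquad
    A\, A^{+} A = A, \qquad
    A^{+} A\, A^{+} = A^{+}
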
Example 11
        public Tsne(Array <float> X_, int dims, float perplexity)
        {
            X_.AssertOfDim(2);
            int n = X_.Shape[0];

            X = T.Shared(X_, "X");
            Y = T.Shared(NN.Random.Uniform(-1f, 1f, n, dims), "Y");

            YMomentum = T.Shared(NN.Zeros(n, dims), "YMomentum");
            dYLast    = T.Shared(NN.Zeros(n, dims), "dYLast");

            // ones everywhere, zero on the diag
            mask = T.Shared(NN.Ones(n, n) - NN.Eye(n), "mask");

            // Compute pairwise affinities
            var sum_Y = T.Sum(Y * Y, 1, keepDims: true);

            var num = 1 / (1 - T.DimShuffle((2 * T.Dot(Y, Y, transposeY: true) + sum_Y), 1, 0) + sum_Y);

            // set the diag to zero
            num *= mask;

            var Q = num / T.Sum(num);
            //Q = T.Max(Q, 1e-12f);

            var P_ = x2p(X_, 1e-5f, perplexity);

            P_ = P_ * 4f; // early exaggeration
            P_ = NN.Apply(P_, x => Math.Max(x, 1e-12f));
            P  = T.Shared(P_, "P");

            KL_Loss = T.Sum(P * T.Log(P / Q));

            dY   = T.Function(output: T.Grad(KL_Loss, Y));
            Loss = T.Function(output: KL_Loss);

            var updates = MomentumUpdate(Y, YMomentum, dYLast, T.Grad(KL_Loss, Y), 500);

            Train = T.Function(updates);
        }
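
The KL_Loss expression is the standard t-SNE objective, the Kullback–Leibler divergence between the high-dimensional affinities P and the low-dimensional affinities Q. In the usual notation, which the sum_Y/num/Q lines above are meant to implement (with the diagonal masked out):

    \mathrm{KL}(P \,\|\, Q) = \sum_{i \neq j} P_{ij} \log \frac{P_{ij}}{Q_{ij}}, \qquad
    Q_{ij} = \frac{(1 + \lVert y_i - y_j \rVert^2)^{-1}}{\sum_{k \neq l} (1 + \lVert y_k - y_l \rVert^2)^{-1}}
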
Example 12
        public static void TestLook1()
        {
            // defining the tensor variables
            var X     = T.Matrix <float>("x");
            var W     = T.Matrix <float>("W");
            var b_sym = T.Matrix <float>("b_sym");

            var results             = T.Scan(v => T.Tanh(T.Dot(v, W) + b_sym), sequence: X);
            var compute_elementwise = T.Function(inputs: new[] { X, W, b_sym }, output: results);

            // test values
            var x = NN.Eye <float>(2);
            var w = NN.Ones <float>(2, 2);
            var b = NN.Ones <float>(2);

            b.Item[1] = 2;

            Console.WriteLine(compute_elementwise(new[] { x, w, b }).Item[0]);

            // comparison with tensors
            Console.WriteLine(NN.Tanh(x.Dot(w) + b));
        }
Example 13
        /// <summary>Gated recurrent unit (GRU) network with a softmax classification layer.</summary>
        /// <param name="inputDim">dimension of the input vectors</param>
        /// <param name="hiddenDim">dimension of the hidden layer</param>
        /// <param name="nClasses">dimension of the output vector</param>
        /// <param name="scale">scaling factor to initialize weights</param>
        public GRU2(int inputDim, int hiddenDim, int nClasses, float scale = 0.2f)
        {
            // /!\ softmax requires Dot(v, M) products

            // initial hidden state
            h0 = T.Shared(NN.Zeros <float>(hiddenDim), "h0");

            // reset gate layers
            Wr = T.Shared(NN.Random.Uniform(-scale, scale, inputDim, hiddenDim), "Wr");
            Ur = T.Shared(NN.Eye <float>(hiddenDim), "Ur");
            br = T.Shared(NN.Zeros <float>(/*1,*/ hiddenDim), "br");

            // update gate layers
            Wz = T.Shared(NN.Random.Uniform(-scale, scale, inputDim, hiddenDim), "Wz");
            Uz = T.Shared(NN.Eye <float>(hiddenDim), "Uz");
            bz = T.Shared(NN.Zeros <float>(/*1,*/ hiddenDim), "bz");

            // layers
            W = T.Shared(NN.Random.Uniform(-scale, scale, inputDim, hiddenDim), "W");
            U = T.Shared(NN.Eye <float>(hiddenDim), "U");
            b = T.Shared(NN.Zeros <float>(/*1,*/ hiddenDim), "b");

            // prediction layer
            S  = T.Shared(NN.Random.Uniform(-scale, scale, hiddenDim, nClasses), "S");
            Sb = T.Shared(NN.Zeros <float>(/*1,*/ nClasses), "Sb");

            // bundle
            this.@params = new[] { h0, Wr, Ur, br, Wz, Uz, bz, W, U, b, S, Sb };

            // Adagrad shared variables
            var hists = new Dictionary <string, Tensor <float> .Shared>();

            foreach (var param in @params)
            {
                var name = param.Name + "Hist";
                hists[name] = T.Shared(NN.Zeros <float>(param.Value.Shape), name);
            }

            // Adadelta shared variables
            var hists2 = new Dictionary <string, Tensor <float> .Shared>();

            foreach (var param in @params)
            {
                var name = param.Name + "Hist2";
                hists2[name] = T.Shared(NN.Zeros <float>(param.Value.Shape), name);
            }

            var x = T.Matrix <float>("x");  // [sentence, inputDim]
            var y = T.Scalar <int>("y");

            Func <Tensor <float>, Tensor <float>, Tensor <float>[]> recurrence = (x_t, h_tm1) =>
            {
                // reset gate
                var r_t = T.Sigmoid(T.Dot(x_t, Wr) + T.Dot(h_tm1, Ur) + br);
                // update gate
                var z_t = T.Sigmoid(T.Dot(x_t, Wz) + T.Dot(h_tm1, Uz) + bz);
                // proposed hidden state
                var _h_t = T.Tanh(T.Dot(x_t, W) + T.Dot(r_t * h_tm1, U) + b);
                // actual hidden state
                var h_t = z_t * h_tm1 + (1 - z_t) * _h_t;
                // return all the intermediate variables because they may be reused by T.Grad to optimize gradient computation
                return(new[] { h_t, r_t, z_t, _h_t });
            };

            var h      = T.Scan(recurrence, x, new[] { h0, null, null, null })[0][-1];
            var pred   = T.Softmax(T.Dot(h, S) + Sb);
            var y_pred = T.Argmax(pred, axis: 0);

            // cost and gradients and learning rate
            var lr        = T.Scalar <float>("lr");
            var nll       = -T.Mean(T.Log(pred)[y]);
            var gradients = T.Grad(nll);

            var updates = new OrderedDictionary();

            foreach (var param in @params)
            {
                var grad = gradients[param];
                //var grad = T.Clip(update.Item2, -10, 10);

                // Adagrad
                const float eps  = 1e-5f;
                var         hist = hists[param.Name + "Hist"];
                updates[hist]  = hist + grad * grad;
                updates[param] = param - lr * grad / T.Sqrt(hist + eps);

                // Adadelta
                //const float rho = 0.95f;
                //const float eps = 1e-5f;
                //var hist = hists[param.Name + "Hist"];
                //var hist2 = hists2[param.Name + "Hist2"];
                //var newHist = rho * hist + (1 - rho) * (grad * grad);
                //updates[hist] = newHist;
                //var newGrad = grad * T.Sqrt((hist2 + eps) / (newHist + eps));
                //updates[param] = param - newGrad;
                //updates[hist2] = rho * hist2 + (1 - rho) * (newGrad * newGrad);

                // Regular
                //updates[param] = param - lr * grad;
            }

            // theano functions
            this.classify = T.Function(input: x, output: y_pred);

            this.train = T.Function(input: (x, y, lr),
                                    output: nll,
                                    updates: updates);
        }
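
In equation form, the recurrence and the Adagrad step assembled above are (σ is the logistic sigmoid, ⊙ the elementwise product):

    r_t = \sigma(x_t W_r + h_{t-1} U_r + b_r), \qquad
    z_t = \sigma(x_t W_z + h_{t-1} U_z + b_z)

    \tilde{h}_t = \tanh(x_t W + (r_t \odot h_{t-1}) U + b), \qquad
    h_t = z_t \odot h_{t-1} + (1 - z_t) \odot \tilde{h}_t

    G_\theta \leftarrow G_\theta + g_\theta^2, \qquad
    \theta \leftarrow \theta - lr \cdot g_\theta / \sqrt{G_\theta + \varepsilon}
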
Example 14
        public void RnnXorHasCorrectGradient()
        {
            NN.Random.Seed(12345);
            int nh = 10; // hidden layer

            var Wbit   = T.Shared(0.2f * NN.Random.Uniform(-1.0f, 1.0f, nh, 1).As <float>(), "Wbit");
            var Wstate = T.Shared(NN.Eye <float>(nh), "Wstate");
            var Wout   = T.Shared(0.2f * NN.Random.Uniform(-1.0f, 1.0f, 1, nh).As <float>(), "Wout");
            var b      = T.Shared(0.2f * NN.Random.Uniform(-1.0f, 1.0f, nh, 1).As <float>(), "b");

            var state0 = T.Shared(NN.Zeros <float>(nh, 1), "state0");

            var bits     = T.Tensor3 <float>("bits");          // n x 1
            var expected = T.Matrix <float>("expected");       // 1 x 1

            Func <Tensor <float>, Tensor <float>, Tensor <float> > recurrence = (bit, oldState) =>
            {
                return(T.Tanh(T.Dot(Wbit, bit) + T.Dot(Wstate, oldState) + b));
            };

            var states = T.Scan(fn: recurrence, sequence: bits, outputsInfo: state0);

            var output    = T.Tanh(T.Dot(Wout, states[(Slice)(-1)]));
            var error     = 0.5f * T.Norm2(output - expected);
            var classify  = T.Function(bits, output);
            var gradients = T.Grad(error);

            var gradWstate = gradients[Wstate];

            Assert.IsNotNull(gradWstate);
            var gradWstateIsReshape = gradWstate as Reshaping <float>;

            Assert.IsNotNull(gradWstateIsReshape);
            var gradWstateIsSum = gradWstateIsReshape.x as Sum <float>;

            Assert.IsNotNull(gradWstateIsSum);
            var dfor     = gradWstateIsSum.x as Tensor <float> .For;
            var backLoop = dfor.Loop;

            Assert.AreEqual(3, backLoop.Sequences.Count); // bit, states, delta
            Assert.AreEqual(6, backLoop.Fors.Count);      // dbit, dstate, dWstate, db, dWbit, dstate_p1
            Assert.AreEqual(3, dfor.Index);

            // TODO: check why a recursive variable was expected
            //var dWstate_ = dfor.RecursiveVariable;
            //Assert.AreEqual("dWstate_", dWstate_.Name);

            var variables = backLoop.Variables.Cast <Tensor <float> >().ToList();
            var bit_      = variables[0];

            Assert.AreEqual("bit_", bit_.Name);
            var oldState_ = variables[1];

            Assert.AreEqual("oldState_", oldState_.Name);
            var delta_oldState_ = variables[2];

            Assert.AreEqual("delta_oldState_", delta_oldState_.Name);
            var dbit_ = variables[3];

            Assert.AreEqual("dbit_", dbit_.Name);
            var doldState_ = variables[4];

            Assert.AreEqual("doldState_", doldState_.Name);
            var oldState_tp1_ = variables[5];

            Assert.AreEqual("oldState_tp1", oldState_tp1_.Name);

            var d = T.Sum((delta_oldState_ + doldState_) * (1f - T.Square(oldState_tp1_)), axis: 1, keepDims: true);

            var doldState = (Tensor <float>)backLoop.Fors[1].Expression;

            (T.Dot(Wstate, d, transposeX: true)).AssertEqual(doldState);

            var dWstate    = (Tensor <float>)backLoop.Fors[3].Expression;
            var dWstateExp = T.Dot(d, oldState_, transposeY: true);

            dWstateExp.AssertEqual(dWstate);

            var dbit = (Tensor <float>)backLoop.Fors[0].Expression;

            (T.Dot(Wbit, d, transposeX: true)).StructuralEquality(dbit);

            var oldState_tp1 = (Tensor <float>)backLoop.Fors[5].Expression;

            oldState_tp1.AssertEqual(oldState_);
        }
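
The expressions asserted against the backward loop are the usual backpropagation-through-time formulas for this tanh recurrence, where h_t is the state produced at step t, δ_t the gradient injected by the loss at that step, and d̄_{t+1} the gradient passed back from the following step:

    d_t = (\delta_t + \bar{d}_{t+1}) \odot (1 - h_t^2)

    \bar{d}_t = W_{state}^\top d_t, \qquad
    \frac{\partial E}{\partial W_{state}} \mathrel{+}= d_t\, h_{t-1}^\top, \qquad
    \frac{\partial E}{\partial bit_t} = W_{bit}^\top d_t
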
Example 15
        /// <summary>Gated recurrent unit (GRU) network with a linear output layer, trained with accumulated Adagrad updates.</summary>
        /// <param name="inputDim">dimension of the input vectors</param>
        /// <param name="hiddenDim">dimension of the hidden layer</param>
        /// <param name="outputDim">dimension of the output vector</param>
        /// <param name="scale">scaling factor to initialize weights</param>
        public GRU(int inputDim, int hiddenDim, int outputDim, float scale = 0.2f)
        {
            // initial hidden state
            h0 = T.Shared(NN.Zeros <float>(hiddenDim), "h0");

            // reset gate layers
            Wr = T.Shared(NN.Random.Uniform(-scale, scale, inputDim, hiddenDim), "Wr");
            Ur = T.Shared(NN.Eye <float>(hiddenDim), "Ur");
            br = T.Shared(NN.Zeros <float>(/*1,*/ hiddenDim), "br");

            // update gate layers
            Wz = T.Shared(NN.Random.Uniform(-scale, scale, inputDim, hiddenDim), "Wz");
            Uz = T.Shared(NN.Eye <float>(hiddenDim), "Uz");
            bz = T.Shared(NN.Zeros <float>(/*1,*/ hiddenDim), "bz");

            // layers
            W = T.Shared(NN.Random.Uniform(-scale, scale, inputDim, hiddenDim), "W");
            U = T.Shared(NN.Eye <float>(hiddenDim), "U");
            b = T.Shared(NN.Zeros <float>(/*1,*/ hiddenDim), "b");

            // prediction layer
            S  = T.Shared(NN.Random.Uniform(-scale, scale, hiddenDim, outputDim), "S");
            Sb = T.Shared(NN.Zeros <float>(/*1,*/ outputDim), "Sb");

            // bundle
            this.@params = new[] { h0, Wr, Ur, br, Wz, Uz, bz, W, U, b, S, Sb };

            // Adagrad shared variables
            this.grads = new Dictionary <string, Tensor <float> .Shared>();
            foreach (var param in @params)
            {
                var name = param.Name + "Grad";
                this.grads[name] = T.Shared(NN.Zeros <float>(param.Value.Shape), name);
            }

            this.hists = new Dictionary <string, Tensor <float> .Shared>();
            foreach (var param in @params)
            {
                var name = param.Name + "Hist";
                this.hists[name] = T.Shared(NN.Zeros <float>(param.Value.Shape), name);
            }

            // Adadelta shared variables
            var hists2 = new Dictionary <string, Tensor <float> .Shared>();

            foreach (var param in @params)
            {
                var name = param.Name + "Hist2";
                hists2[name] = T.Shared(NN.Zeros <float>(param.Value.Shape), name);
            }

            var x        = T.Matrix <float>("x"); // [sentence, inputDim]
            var expected = T.Vector <float>("expected");

            Func <Tensor <float>, Tensor <float>, Tensor <float>[]> recurrence = (x_t, h_tm1) =>
            {
                // reset gate
                var r_t = T.Sigmoid(T.Dot(x_t, Wr) + T.Dot(h_tm1, Ur) + br);
                // update gate
                var z_t = T.Sigmoid(T.Dot(x_t, Wz) + T.Dot(h_tm1, Uz) + bz);
                // proposed hidden state
                var _h_t = T.Tanh(T.Dot(x_t, W) + T.Dot(r_t * h_tm1, U) + b);
                // actual hidden state
                var h_t = z_t * h_tm1 + (1 - z_t) * _h_t;
                // return all the intermediate variables because they may be reused by T.Grad to optimize gradient computation
                return(new[] { h_t, r_t, z_t, _h_t });
            };

            var h = T.Scan(recurrence, x, new[] { h0, null, null, null })[0][-1];

            // cost and gradients
            var output    = T.Dot(h, S) + Sb;
            var error     = 0.5f * T.Norm2(output - expected);
            var gradients = T.Grad(error);

            var updatesTrain = new OrderedDictionary();

            foreach (var param in @params)
            {
                var grad = gradients[param];
                //var grad = T.Clip(update.Item2, -10, 10);

                // Adagrad
                //const float eps = 1e-5f;
                var g = grads[param.Name + "Grad"];
                updatesTrain[g] = g + grad;
                //updates[param] = param - lr * grad / T.Sqrt(hist + eps);

                // Adadelta
                //const float rho = 0.95f;
                //const float eps = 1e-5f;
                //var hist = hists[param.Name + "Hist"];
                //var hist2 = hists2[param.Name + "Hist2"];
                //var newHist = rho * hist + (1 - rho) * (grad * grad);
                //updates[hist] = newHist;
                //var newGrad = grad * T.Sqrt((hist2 + eps) / (newHist + eps));
                //updates[param] = param - newGrad;
                //updates[hist2] = rho * hist2 + (1 - rho) * (newGrad * newGrad);

                // Regular
                //updates[param] = param - lr * grad;
            }

            var         batchSize = T.Scalar <float>("batchSize");
            var         lr        = T.Scalar <float>("lr");
            const float eps       = 1e-5f;

            var updates = new OrderedDictionary();

            foreach (var param in this.@params)
            {
                var grad     = this.grads[param.Name + "Grad"];
                var meanGrad = grad / batchSize;

                var hist = this.hists[param.Name + "Hist"];
                updates[hist]  = hist + meanGrad * meanGrad;
                updates[param] = param - lr * meanGrad / T.Sqrt(hist + eps);
                updates[grad]  = T.ZerosLike(grad);
            }

            // theano functions
            this.Classify = T.Function(input: x, output: output);

            this.Train = T.Function(input: (x, expected),
                                    output: error,
                                    updates: updatesTrain);

            this.Update = T.Function(input: (lr, batchSize), updates: updates);
        }
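
Unlike GRU2, Train only accumulates gradients into the *Grad shared variables; Update then applies one Adagrad step with the batch-averaged gradient and clears the accumulators. A minimal usage sketch under that assumption (the dimensions and the `samples` collection are illustrative, not part of the source):

            // hypothetical driver code: accumulate over one minibatch, then apply a single update
            var gru = new GRU(inputDim: 50, hiddenDim: 100, outputDim: 10);
            foreach (var (x, target) in samples)          // one Train call per sample adds its gradient
                gru.Train(x, target);
            gru.Update(0.05f, (float)samples.Count);      // Adagrad step on the averaged gradient, accumulators reset
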
Example 16
        public void TestMethod1()
        {
            // https://github.com/Theano/Theano/issues/3162
            // When using unbounded activation functions (e.g. ReLU) the softmax function can saturate.
            // This can lead to NaN gradients when paired with a categorical cross-entropy cost.
            // If the softmax function is replaced with a numerically stable version of log-softmax and
            // this is used directly in the cost function, then the gradients don't blow up.
            // It seems that this could be implemented as a pattern to recognize (softmax paired with
            // categorical cross-entropy).
            // The code below illustrates the problem with the regular softmax versus the same thing done
            // with the numerically stable log-softmax: the former gives NaNs in the gradient, the latter
            // does not blow up. Interestingly, the experiment indicates that for the regular softmax case
            // the cross-entropy loss comes out numerically stable but the gradient does not.

            Binding.Compiler.Debug = true;

            var x = T.Matrix <real>("x");
            var y = T.Matrix <real>("y");

            // regular softmax and crossentropy
            var sm  = T.Softmax(x);
            var cm1 = CategoricalCrossentropy(sm, y);
            var g1  = T.Grad(T.Mean(cm1), x);

            // numerically stable log-softmax with crossentropy
            var xdev = x - T.Max(x, axis: 1, keepDims: true);
            var lsm  = xdev - T.Log(T.Sum(T.Exp(xdev), axis: 1, keepDims: true));
            //var lsm2 = xdev - T.LogSumExp(xdev, axis: 1, keepDims: true);
            var sm2 = T.Exp(lsm); // just used to show equivalence with sm
            var cm2 = -T.Sum(y * lsm, axis: 1);
            var g2  = T.Grad(T.Mean(cm2), x);

            // create some inputs into a softmax that are large and labels
            var large = 1f; // 10f
            var a     = NN.Exp(NN.Random.Uniform <float>(0, large, 5, 10));
            // create some one-hot coded labels
            var b = NN.Zeros <float>(5, 10);

            b[Range(0, 5), Range(0, 5)] = NN.Eye <float>(5);

            // show equivalence of softmax and exponentiated numerically stable log-softmax
            var f1   = T.Function(input: x, output: (sm, sm2));
            var sm_  = f1(a);
            var sm_1 = sm_.Item1;       // classical softmax
            var sm_2 = sm_.Item2;       // log(sum(exp)) softmax

            AssertArray.AreAlmostEqual(sm_1, sm_2);

            // now show that the two versions result in the same crossentropy cost
            // this indicates that the forward function does provide some numerical stability
            var f2  = T.Function(input: (x, y), output: (cm1, cm2));
            var c_  = f2(a, b);
            var c_1 = c_.Item1;
            var c_2 = c_.Item2;

            AssertArray.AreAlmostEqual(c_1, c_2);

            // now, show that in the standard softmax case the gradients blow up
            // while in the log-softmax case they don't
            var f3  = T.Function(input: (x, y), output: (g1, g2));
            var g_  = f3(a, b);
            var g_1 = g_.Item1;
            var g_2 = g_.Item2;

            Assert.IsTrue(float.IsNaN(g_1.Sum()));
            Assert.IsFalse(float.IsNaN(g_2.Sum()));
        }
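
The stable branch of the test uses the shifted log-sum-exp form of log-softmax, which the xdev/lsm lines implement, together with the cross-entropy expressed directly on log-probabilities:

    \log \mathrm{softmax}(x)_i = (x_i - m) - \log \sum_j e^{x_j - m},
    \quad m = \max_j x_j, \qquad
    \mathrm{CE}(y, x) = -\sum_i y_i \, \log \mathrm{softmax}(x)_i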