Example #1
        /// <summary>
        /// Train a neural network
        /// </summary>
        /// <param name="X">The feature set Matrix.</param>
        /// <param name="y">The result set Matrix.</param>
        /// <param name="input_layer_size">The size of the input layer</param>
        /// <param name="hidden_layer_size">The size of the hidden layer</param>
        /// <param name="labels">A list of classification labels.</param>
        /// <param name="lambda">The regularization parameter which helps reduce overfitting.
        /// <param name="maxIterations">The maximum number of iterations to run the minimization function.</param>
        /// <returns>The trained weight matrices: the first maps the input layer to the hidden layer,
        /// the second maps the hidden layer to the output layer.</returns>
        public static Matrix[] Train(Matrix X, Matrix y, int input_layer_size, int hidden_layer_size, double[] labels, double lambda, int maxIterations = 50)
        {
            int num_labels = labels.Length;

            Matrix initial_Theta1 = RandInitializeWeights(input_layer_size, hidden_layer_size);
            Matrix initial_Theta2 = RandInitializeWeights(hidden_layer_size, num_labels);

            Matrix initial_nn_params = Matrix.Join(initial_Theta1.Unrolled, initial_Theta2.Unrolled, MatrixDimensions.Rows);

            MinimizeOptions options = new MinimizeOptions();

            options.InputLayerSize          = input_layer_size;
            options.HiddenLayerSize         = hidden_layer_size;
            options.Labels                  = labels;
            options.RegularizationParameter = lambda;
            options.MaxIterations           = maxIterations;

            int    i         = 0;
            Matrix new_theta = LogisticRegression.Minimize(NNCostFunction, X, y, initial_nn_params, options, out i);

            Matrix[] thetas = new Matrix[2];
            thetas[0] = Matrix.Reshape(new_theta, 0, hidden_layer_size, input_layer_size + 1);
            thetas[1] = Matrix.Reshape(new_theta, (hidden_layer_size * (input_layer_size + 1)), num_labels, hidden_layer_size + 1);

            return thetas;
        }
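
A minimal usage sketch for Train (not part of the original source): it assumes Train lives on the NeuralNetwork class, like the other calls in the demos below, and reuses the Matrix(double[,]) constructor shown there; the data is illustrative only.

        // Hypothetical usage of Train; class placement and data are illustrative only.
        static void TrainSketch()
        {
            Matrix X = new Matrix(new double[, ] {
                { 0.5, -0.4 },
                { -0.9, 0.6 },
                { 0.3, 0.9 }
            });
            Matrix y = new Matrix(new double[, ] {
                { 1 }, { 2 }, { 1 }
            });
            double[] labels = new double[] { 1, 2 };

            // Two input features, four hidden units, lambda = 1.0, default iteration cap.
            Matrix[] thetas = NeuralNetwork.Train(X, y, 2, 4, labels, 1.0);

            // thetas[0] is hidden_layer_size x (input_layer_size + 1);
            // thetas[1] is num_labels x (hidden_layer_size + 1).
        }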
        /// <summary>
        /// Create multiple logistic regression classifiers, one for each class we're
        /// trying to categorize.
        /// </summary>
        /// <param name="X">The features Matrix (n x m).</param>
        /// <param name="y">The results Matrix (n x 1).</param>
        /// <param name="labels">The labels to classify against.</param>
        /// <param name="lambda">The regularization parameter.</param>
        /// <returns>A Matrix where each row is a learned set of parameters for that
        /// particular class.</returns>
        public static Matrix OneVsAll(Matrix X, Matrix y, double[] labels, double lambda, int maxIterations = 50)
        {
            int m = X.Rows;
            int n = X.Columns;

            X = Matrix.Join(Matrix.Ones(m, 1), X, MatrixDimensions.Columns);
            int numberOfLabels = labels.Length;

            Matrix          all_theta = new Matrix(numberOfLabels, n + 1);
            MinimizeOptions options   = new MinimizeOptions();

            options.RegularizationParameter = lambda;
            options.MaxIterations           = maxIterations;

            for (int c = 0; c < numberOfLabels; c++)
            {
                Matrix initial_theta = new Matrix(n + 1, 1);
                int    i             = 0;
                Matrix new_theta     = Minimize(CostFunction, X, y == labels[c], initial_theta, options, out i);
                all_theta.SetRow(c, new_theta.Transpose);
            }

            return all_theta;
        }
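
A hedged sketch of the OneVsAll round trip, mirroring the demo further down; the data is illustrative, and the element-wise behavior of y == labels[c] (one 0/1 truth column per class) is inferred from the loop above.

        // Hypothetical usage of OneVsAll followed by PredictOneVsAll.
        static void OneVsAllSketch()
        {
            Matrix X = new Matrix(new double[, ] {
                { 8.0, 1.0, 6.0 },
                { 3.0, 5.0, 7.0 },
                { 4.0, 9.0, 2.0 }
            });
            Matrix y = new Matrix(new double[, ] {
                { 1.0 }, { 2.0 }, { 2.0 }
            });
            double[] labels = new double[] { 1.0, 2.0 };

            // Each row of all_theta holds the learned parameters (bias first) for one label.
            Matrix all_theta = LogisticRegression.OneVsAll(X, y, labels, 0.1);
            Matrix prediction = LogisticRegression.PredictOneVsAll(all_theta, X);
        }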
Example #3
        static void NeuralNetworkDemo()
        {
            WriteH1("Neural Network Regression");

            #region PredictNN
            WriteH2("Predict Neural Network");
            Matrix Theta1 = new Matrix(new double[, ] {
                { 0.00000, 0.90930, -0.75680 },
                { 0.47943, 0.59847, -0.97753 },
                { 0.84147, 0.14112, -0.95892 },
                { 0.99749, -0.35078, -0.70554 }
            });
            Matrix Theta2 = new Matrix(new double[, ] {
                { 0.00000, 0.93204, 0.67546, -0.44252, -0.99616 },
                { 0.29552, 0.99749, 0.42738, -0.68777, -0.92581 },
                { 0.56464, 0.97385, 0.14112, -0.87158, -0.77276 },
                { 0.78333, 0.86321, -0.15775, -0.97753, -0.55069 }
            });

            Matrix X = new Matrix(new double[, ] {
                { 0.84147, 0.41212 },
                { 0.90930, -0.54402 },
                { 0.14112, -0.99999 },
                { -0.75680, -0.53657 },
                { -0.95892, 0.42017 },
                { -0.27942, 0.99061 },
                { 0.65699, 0.65029 },
                { 0.98936, -0.28790 }
            });
            Matrix prediction = NeuralNetwork.Predict(Theta1, Theta2, X);
            Console.WriteLine("Target: 3.00 ; 0.00 ; 0.00 ; 3.00 ; 3.00 ; 3.00 ; 3.00 ; 1.00 ;\nActual: {0}", prediction.ToString().Replace("\n", "; "));
            #endregion

            #region Sigmoid Gradient
            Matrix a = new Matrix(new double[, ] {
                { -1, -2, -3 }
            });
            X = Matrix.Join(a, Matrix.Magic(3), MatrixDimensions.Rows);
            Matrix sg = NeuralNetwork.SigmoidGradient(X);

            WriteH2("Sigmoid Gradient");
            Console.WriteLine(sg);
            #endregion

            #region Neural Network Cost Function
            Matrix nn = new Matrix(new double[, ] {
                { 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0, 1.1, 1.2, 1.3, 1.4, 1.5, 1.6, 1.7, 1.8 }
            });
            int      il     = 2;
            int      hl     = 2;
            double[] labels = new double[] { 1, 2, 3, 4 };
            X = new Matrix(new double[, ] {
                { 0.5403, -0.41615 },
                { -0.98999, -0.65364 },
                { 0.28366, 0.96017 }
            });
            Matrix y = new Matrix(new double[, ] {
                { 4 },
                { 2 },
                { 3 }
            });

            MinimizeOptions options = new MinimizeOptions();
            options.InputLayerSize          = il;
            options.HiddenLayerSize         = hl;
            options.Labels                  = labels;
            options.RegularizationParameter = 4;
            Tuple<double, Matrix> result = NeuralNetwork.NNCostFunction(X, y, nn, options);

            WriteH2("Neural Network Cost Function");
            Console.WriteLine($"J: {result.Item1} (Expected Result: 19.474)");
            Console.WriteLine(result.Item2);

            #endregion
        }
Example #4
        static void LogisticRegressionDemo()
        {
            WriteH1("Logistic Regression");

            #region Sigmoid Function
            WriteH2("Sigmoid Function");

            Matrix m1 = new Matrix(new double[, ] {
                { 1200000 }
            });
            Matrix sigmoid1 = LogisticRegression.Sigmoid(m1);
            Console.Write("Target: 1.0   Actual: {0}", sigmoid1);

            m1[0, 0] = -25000;
            sigmoid1 = LogisticRegression.Sigmoid(m1);
            Console.Write("Target: 0.0   Actual: {0}", sigmoid1);

            m1[0, 0] = 0;
            sigmoid1 = LogisticRegression.Sigmoid(m1);
            Console.Write("Target: 0.5   Actual: {0}", sigmoid1);

            m1 = new Matrix(new double[, ] {
                { 4, 5, 6 }
            });
            sigmoid1 = LogisticRegression.Sigmoid(m1);
            Console.Write("Target: 0.98 0.99 0.997   Actual: {0}", sigmoid1);
            #endregion

            #region Predict
            WriteH2("Prediction");

            m1 = new Matrix(new double[, ] {
                { 1, 1 }, { 1, 2.5 }, { 1, 3 }, { 1, 4 }
            });
            Matrix theta = new Matrix(new double[, ] {
                { -3.5 }, { 1.3 }
            });
            Matrix prediction = LogisticRegression.Predict(m1, theta);
            Console.WriteLine("Target: 0.0 ; 0.0 ; 1.0 ; 1.0 ;  Actual: {0}", prediction.ToString().Replace("\n", "; "));

            m1    = Matrix.Magic(3);
            theta = new Matrix(new double[, ] {
                { 4 }, { 3 }, { -8 }
            });
            prediction = LogisticRegression.Predict(m1, theta);
            Console.WriteLine("Target: 0.0 ; 0.0 ; 1.0 ;        Actual: {0}", prediction.ToString().Replace("\n", "; "));
            #endregion

            #region Cost Function
            WriteH2("Cost Function");
            Matrix X = Matrix.AddIdentityColumn(Matrix.Magic(3));
            Matrix y = new Matrix(new double[, ] {
                { 1 }, { 0 }, { 1 }
            });
            theta = new Matrix(new double[, ] {
                { -2 }, { -1 }, { 1 }, { 2 }
            });
            Tuple<double, Matrix> cost = LogisticRegression.CostFunction(X, y, theta);

            Console.WriteLine("Target: 4.6832 ;  Actual: {0}", cost.Item1);

            #endregion

            #region Regularized Cost Function
            WriteH2("Regularized Cost Function");
            MinimizeOptions options = new MinimizeOptions();
            options.RegularizationParameter = 3;
            cost = LogisticRegression.CostFunction(X, y, theta, options);
            Console.WriteLine("Target: 7.6832 ;  Actual: {0}", cost.Item1);

            X = new Matrix(new double[, ] {
                { 1.0, 0.1, 0.6, 1.1 },
                { 1.0, 0.2, 0.7, 1.2 },
                { 1.0, 0.3, 0.8, 1.3 },
                { 1.0, 0.4, 0.9, 1.4 },
                { 1.0, 0.5, 1.0, 1.5 }
            });
            y = new Matrix(new double[, ] {
                { 1.0 },
                { 0.0 },
                { 1.0 },
                { 0.0 },
                { 1.0 }
            });
            theta = new Matrix(new double[, ] {
                { -2 }, { -1 }, { 1 }, { 2 }
            });
            cost = LogisticRegression.CostFunction(X, y, theta, options);
            Console.WriteLine("Target: 2.5348 ;  Actual: {0}", cost.Item1);

            #endregion

            #region OneVsAll
            WriteH2("One vs All");
            X = new Matrix(new double[, ] {
                { 8.0, 1.0, 6.0 },
                { 3.0, 5.0, 7.0 },
                { 4.0, 9.0, 2.0 },
                { 0.84147, 0.90930, 0.14112 },
                { 0.54030, -0.41615, -0.98999 }
            });
            y = new Matrix(new double[, ] {
                { 1.0 },
                { 2.0 },
                { 2.0 },
                { 1.0 },
                { 3.0 }
            });
            //Matrix testTheta = new Matrix(4, 1);
            //Matrix X0 = Matrix.Join(Matrix.Ones(5, 1), X, MatrixDimensions.Columns);
            //cost = LogisticRegression.CostFunction(X0, y==1, testTheta, 0.1);
            //Console.WriteLine(cost.Item1);
            //Console.WriteLine(cost.Item2);

            double[] labels    = new double[] { 1.0, 2.0, 3.0 };
            Matrix   all_theta = LogisticRegression.OneVsAll(X, y, labels, 0.1);

            Console.WriteLine(all_theta);
            #endregion

            #region PredictOneVsAll
            WriteH2("Predict One vs All");
            X = new Matrix(new double[, ] {
                { 1.0, 7.0 },
                { 4.0, 5.0 },
                { 7.0, 8.0 },
                { 1.0, 4.0 }
            });
            all_theta = new Matrix(new double[, ] {
                { 1.0, -6.0, 3.0 },
                { -2.0, 4.0, -3.0 }
            });
            prediction = LogisticRegression.PredictOneVsAll(all_theta, X);
            Console.WriteLine("Target: 0; 1; 1; 0;    Actual: {0}", prediction.ToString().Replace("\n", "; "));
            #endregion
        }
        /// <summary>
        /// Calculate the regularized cost function for logistic regression.
        /// </summary>
        /// <param name="X">The original features Matrix.</param>
        /// <param name="y">The results Matrix.</param>
        /// <param name="theta">The theta values to apply to each feature.</param>
        /// <param name="lambda">The regularization parameter which helps reduce overfitting.
        /// Note that using values that are too high will lead to underfitting.</param>
        /// <returns>The cost of using the given value of theta, and the gradient of
        /// the cost (useful for iterative minimization functions)</returns>
        public static Tuple<double, Matrix> CostFunction(Matrix X, Matrix y, Matrix theta, MinimizeOptions options)
        {
            double lambda = options.RegularizationParameter;
            double m      = (double)X.Rows;
            Matrix t      = new Matrix(theta);
            Matrix h      = Sigmoid(X * t); // Hypothesis
            Matrix ev     = h - y;          // Error Vector
            double part1  = (-y.Transpose * Matrix.ElementLog(h)).SumAllElements;
            double part2  = ((1 - y).Transpose * Matrix.ElementLog(1 - h)).SumAllElements;

            double J = (1.0 / m) * (part1 - part2);

            t[0, 0] = 0;
            double theta_sq = (t.Transpose * t).SumAllElements;

            J += ((lambda / (2.0 * m)) * theta_sq);
            Matrix grad = ((1 / m) * (X.Transpose * (h - y))) + ((lambda / m) * t);

            return Tuple.Create(J, grad);
        }
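
The analytic gradient returned above can be sanity-checked against a finite-difference approximation; a rough sketch, assuming the element indexer and copy constructor used elsewhere in this file (the helper below is hypothetical, not part of the original source):

        // Hypothetical finite-difference check: approximates dJ/dtheta[j] and
        // should roughly match grad[j, 0] from CostFunction.
        static double NumericalGradient(Matrix X, Matrix y, Matrix theta, MinimizeOptions options, int j)
        {
            double eps   = 1e-4;
            Matrix plus  = new Matrix(theta);
            Matrix minus = new Matrix(theta);

            plus[j, 0]  += eps;
            minus[j, 0] -= eps;

            double jPlus  = CostFunction(X, y, plus, options).Item1;
            double jMinus = CostFunction(X, y, minus, options).Item1;

            return (jPlus - jMinus) / (2.0 * eps);
        }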
        /// <summary>
        /// Minimize the cost function for an initial set of values.
        /// </summary>
        /// <param name="f">A function that calculates the cost, and a vector
        /// of partial derivatives.</param>
        /// <param name="Features">The features being assessed for each label.</param>
        /// <param name="y">A truth table containing '1' if it matches the label
        /// being investigated, or '0' if it's not a match.</param>
        /// <param name="theta">Initial values for theta.</param>
        /// <param name="lambda">The regularization parameter.</param>
        /// <param name="maxIterations">The maximum number of iterations to perform
        /// before stopping.</param>
        /// <param name="i">The final number of iterations used to find a result.</param>
        /// <returns>The solution for theta for a given set of labels.</returns>
        public static Matrix Minimize(MinimizeFunction f, Matrix Features, Matrix y, Matrix theta, MinimizeOptions options, out int i)
        {
            int    maxIterations = options.MaxIterations;
            double lambda        = options.RegularizationParameter;
            int    length        = maxIterations > 0 ? maxIterations : 100;

            // Most of the below is adapted from fmincg.m by Carl Edward Rasmussen.
            // Original Copyright notice:
            // -------------------------------------------------------------------------
            // Copyright(C) 2001 and 2002 by Carl Edward Rasmussen. Date 2002 - 02 - 13
            //
            // (C)Copyright 1999, 2000 & 2001, Carl Edward Rasmussen
            //
            // Permission is granted for anyone to copy, use, or modify these
            // programs and accompanying documents for purposes of research or
            // education, provided this copyright notice is retained, and note is
            // made of any changes that have been made.
            //
            // These programs and documents are distributed without any warranty,
            // express or implied. As the programs were written for research
            // purposes only, they have not been tested to the degree that would be
            // advisable in any important application. All use of these programs is
            // entirely at the user's own risk.
            // -------------------------------------------------------------------------
            // NOTE: Original code was written in Octave, while here it's obviously been
            // re-written in C#. There are likely a few differences and errors with this
            // implementation, which will hopefully be ironed out in time. These are
            // entirely my own fault, and not the original author's.

            double RHO   = 0.01; // a bunch of constants for line searches
            double SIG   = 0.5;  // RHO and SIG are the constants in the Wolfe-Powell conditions
            double INT   = 0.1;  // don't reevaluate within 0.1 of the limit of the current bracket
            double EXT   = 3.0;  // extrapolate maximum 3 times the current bracket
            double MAX   = 20;   // max 20 function evaluations per line search
            double RATIO = 100;  // maximum allowed slope ratio

            double red = 1;

            i = 0;                                            // zero the run length counter
            bool ls_failed = false;                           // no previous line search has failed
            // fX = [];

            //[f1 df1] = eval(argstr);                      // get function value and gradient
            Tuple<double, Matrix> cost1 = f(Features, y, theta, options);
            double f1  = cost1.Item1;
            Matrix df1 = cost1.Item2;

            i = i + (length < 0 ? 1 : 0);                           // count epochs?!
            Matrix s = -df1;                                        // search direction is steepest
            double d1 = (-Matrix.MultiplyTransposeBy(s))[0, 0];     // this is the slope
            double z1 = red / (1.0 - d1);                           // initial step is red/(|s|+1)
            double z2, A, B;

            while (i < Math.Abs(length))                                                         // while not finished
            {
                i = i + (length > 0 ? 1 : 0);                                                    // count iterations?!

                Matrix theta0 = new Matrix(theta); double f0 = f1; Matrix df0 = new Matrix(df1); // make a copy of current values
                theta = theta + z1 * s;                                                          // begin line search
                Tuple<double, Matrix> cost2 = f(Features, y, theta, options);
                double f2  = cost2.Item1;
                Matrix df2 = new Matrix(cost2.Item2);

                //[f2 df2] = eval(argstr);
                i = i + (length < 0 ? 1 : 0);                                // count epochs?!
                double d2 = MultiplyFirstRowTransposeBy(df2, s);
                double f3 = f1; double d3 = d1; double z3 = -z1;             // initialize point 3 equal to point 1
                double M = 0;

                M = (length > 0) ? MAX : Math.Min(MAX, -length - i);

                bool success = false; double limit = -1;                     // initialize quantities
                while (true)
                {
                    while (((f2 > f1 + z1 * RHO * d1) || (d2 > -SIG * d1)) && (M > 0))
                    {
                        limit = z1;                                         // tighten the bracket
                        z2    = 0;
                        if (f2 > f1)
                        {
                            z2 = z3 - (0.5 * d3 * z3 * z3) / (d3 * z3 + f2 - f3);                 // quadratic fit
                        }
                        else
                        {
                            A  = 6 * (f2 - f3) / z3 + 3 * (d2 + d3);                  // cubic fit
                            B  = 3 * (f3 - f2) - z3 * (d3 + 2 * d2);
                            z2 = (Math.Sqrt(B * B - A * d2 * z3 * z3) - B) / A;       // numerical error possible -ok!
                        }

                        if ((double.IsNaN(z2)) || (double.IsInfinity(z2)))
                        {
                            z2 = z3 / 2;                                          // if we had a numerical problem then bisect
                        }
                        z2    = Math.Max(Math.Min(z2, INT * z3), (1 - INT) * z3); // don't accept too close to limits
                        z1    = z1 + z2;                                          // update the step
                        theta = theta + z2 * s;
                        // [f2 df2] = eval(argstr);
                        cost2 = f(Features, y, theta, options);
                        f2    = cost2.Item1;
                        df2   = new Matrix(cost2.Item2);

                        M = M - 1; i = i + (length < 0 ? 1 : 0);  // count epochs?!
                        //d2 = (df2.Transpose * s)[0, 0];
                        d2 = MultiplyFirstRowTransposeBy(df2, s); // (Matrix.MultiplyTransposeBy(df2, s))[0, 0];
                        z3 = z3 - z2;                             // z3 is now relative to the location of z2
                    }
                    if ((f2 > (f1 + z1 * RHO * d1)) || (d2 > (-SIG * d1)))
                    {
                        break;                                                // this is a failure
                    }
                    else if (d2 > (SIG * d1))
                    {
                        success = true;
                        break;                                             // success
                    }
                    else if (M == 0)
                    {
                        break;                                                      // failure
                    }
                    A  = 6 * (f2 - f3) / z3 + 3 * (d2 + d3);                        // make cubic extrapolation
                    B  = 3 * (f3 - f2) - z3 * (d3 + 2 * d2);
                    z2 = -d2 * z3 * z3 / (B + Math.Sqrt(B * B - A * d2 * z3 * z3)); // num.error possible - ok!
                    //if ~isreal(z2) || isnan(z2) || isinf(z2) || z2 < 0        // num prob or wrong sign?
                    if (double.IsNaN(z2) || double.IsInfinity(z2) || z2 < 0)        // num prob or wrong sign?
                    {
                        if (limit < -0.5)                                           // if we have no upper limit
                        {
                            z2 = z1 * (EXT - 1);                                    // then extrapolate the maximum amount
                        }
                        else
                        {
                            z2 = (limit - z1) / 2;                                   // otherwise bisect
                        }
                    }
                    else if ((limit > -0.5) && (z2 + z1 > limit))        // extrapolation beyond max?
                    {
                        z2 = (limit - z1) / 2;                           // bisect
                    }
                    else if ((limit < -0.5) && (z2 + z1 > z1 * EXT))     // extrapolation beyond limit
                    {
                        z2 = z1 * (EXT - 1.0);                           // set to extrapolation limit
                    }
                    else if (z2 < -z3 * INT)
                    {
                        z2 = -z3 * INT;
                    }
                    else if ((limit > -0.5) && (z2 < (limit - z1) * (1.0 - INT)))  // too close to limit?
                    {
                        z2 = (limit - z1) * (1.0 - INT);
                    }
                    f3 = f2; d3 = d2; z3 = -z2;                  // set point 3 equal to point 2
                    z1 = z1 + z2; theta = theta + z2 * s;        // update current estimates
                    //[f2 df2] = eval(argstr);
                    cost2 = f(Features, y, theta, options);
                    f2    = cost2.Item1;
                    df2   = new Matrix(cost2.Item2);

                    M  = M - 1; i = i + (length < 0 ? 1 : 0);       // count epochs?!
                    d2 = MultiplyFirstRowTransposeBy(df2, s);
                }                                                   // end of line search



                if (success)                                                                                       // if line search succeeded
                {
                    f1 = f2;                                                                                       //fX = [fX' f1]';
                    //fprintf('%s %4i | Cost: %4.6e\r', S, i, f1);
                    s = (df2.Transpose * df2 - df1.Transpose * df2)[0, 0] / (df1.Transpose * df1)[0, 0] * s - df2; // Polack-Ribiere direction
                    Matrix tmp = new Matrix(df1); df1 = df2; df2 = tmp;                                            // swap derivatives
                    //d2 = (df1.Transpose * s)[0, 0];
                    d2 = (Matrix.MultiplyTransposeBy(df1, s))[0, 0];
                    if (d2 > 0)                                    // new slope must be negative
                    {
                        s = -df1;                                  // otherwise use steepest direction
                        //d2 = (-s.Transpose * s)[0, 0];
                        d2 = MultiplyFirstRowTransposeBy(s, true); // d2 = (-Matrix.MultiplyTransposeBy(s))[0, 0];
                    }
                    z1        = z1 * Math.Min(RATIO, d1 / (d2 - double.Epsilon)); // slope ratio but max RATIO
                    d1        = d2;
                    ls_failed = false;                                            // this line search did not fail
                }
                else
                {
                    theta = new Matrix(theta0); f1 = f0; df1 = df0;     // restore point from before failed line search
                    if (ls_failed || i > Math.Abs(length))              // line search failed twice in a row
                    {
                        break;                                          // or we ran out of time, so we give up
                    }
                    Matrix tmp = new Matrix(df1); df1 = df2; df2 = tmp; // swap derivatives
                    s         = -df1;                                   // try steepest
                    d1        = MultiplyFirstRowTransposeBy(s, true);
                    z1        = 1 / (1 - d1);
                    ls_failed = true;                                    //this line search failed
                }
            }

            return theta;
        }
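
Minimize can also be driven directly with the logistic CostFunction rather than through OneVsAll; a small sketch under that assumption, reusing Matrix.Magic and Matrix.AddIdentityColumn from the demos (the data is illustrative only):

        // Hypothetical direct use of Minimize from inside LogisticRegression.
        static void MinimizeSketch()
        {
            Matrix X = Matrix.AddIdentityColumn(Matrix.Magic(3));
            Matrix y = new Matrix(new double[, ] {
                { 1 }, { 0 }, { 1 }
            });
            Matrix initial_theta = new Matrix(4, 1); // zeros, one row per column of X

            MinimizeOptions options = new MinimizeOptions();
            options.RegularizationParameter = 0.1;
            options.MaxIterations           = 50;

            int iterations;
            Matrix theta = Minimize(CostFunction, X, y, initial_theta, options, out iterations);
        }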
Example #7
        /// <summary>
        /// The Neural Network cost function for a two layer classification Neural Network.
        /// </summary>
        /// <param name="nn_parameters">The unrolled parameter vector that contains all the weights.</param>
        /// <param name="input_layer_size">The number of nodes in the input layer.</param>
        /// <param name="hidden_layer_size">The number of nodes in the hidden layer.</param>
        /// <param name="labels">A list of classification labels.</param>
        /// <param name="X">The feature set Matrix.</param>
        /// <param name="y">The result set Matrix.</param>
        /// <param name="lambda">The regularization parameter which helps reduce overfitting.
        /// Note that using values that are too high will lead to underfitting.</param>
        /// <returns>The cost of using the given value of theta, and the gradient of
        /// the cost (useful for iterative minimization functions)</returns>
        public static Tuple<double, Matrix> NNCostFunction(Matrix X, Matrix y, Matrix nn_parameters, MinimizeOptions options)
        {
            double lambda            = options.RegularizationParameter;
            int    input_layer_size  = options.InputLayerSize;
            int    hidden_layer_size = options.HiddenLayerSize;

            double[] labels = options.Labels;

            double costFunction = 0;
            int    num_labels   = labels.Length;

            Matrix Theta1 = Matrix.Reshape(nn_parameters, 0, hidden_layer_size, input_layer_size + 1);
            Matrix Theta2 = Matrix.Reshape(nn_parameters, (hidden_layer_size * (input_layer_size + 1)), num_labels, hidden_layer_size + 1);

            // y_matrix has the following attributes:
            // Rows: same as the number of rows in y -- one for each example result.
            // Columns: one for each label.
            // Values: Each row consists of zeros, except for one, which matches the
            // value of y in that row to the index of the label. For example, if there
            // are three labels (3, 6, 8), and y contains 2 rows (8, 3), then y_matrix
            // would be:
            // 0 0 1
            // 1 0 0
            Matrix y_matrix = AssignLabels(y, labels);

            // Add ones to the X Matrix
            Matrix a1 = Matrix.AddIdentityColumn(X);

            Matrix z2 = a1 * Theta1.Transpose;
            Matrix a2 = LogisticRegression.Sigmoid(z2);

            a2 = Matrix.AddIdentityColumn(a2);

            Matrix z3 = a2 * Theta2.Transpose;
            Matrix a3 = LogisticRegression.Sigmoid(z3);

            Matrix log1 = Matrix.ElementLog(a3);
            Matrix log2 = Matrix.ElementLog(1 - a3);

            Matrix part1 = Matrix.ElementMultiply(-y_matrix, log1);
            Matrix part2 = Matrix.ElementMultiply((1 - y_matrix), log2);

            Matrix t0 = Theta1.RemoveColumn(0);
            Matrix t1 = Theta2.RemoveColumn(0);

            // Calculate regularization component
            double multiplier = lambda / (2 * X.Rows);
            double reg1       = Matrix.ElementPower(t0, 2).SumAllElements;
            double reg2       = Matrix.ElementPower(t1, 2).SumAllElements;
            double r          = multiplier * (reg1 + reg2);

            // Calculate cost
            costFunction = (1.0 / X.Rows) * (part1 - part2).SumAllElements + r;


            // Back Propagation
            Matrix d3 = a3 - y_matrix;
            Matrix d2 = Matrix.ElementMultiply(
                (t1.Transpose * d3.Transpose).Transpose,
                SigmoidGradient(z2)
                );

            Matrix Delta1 = d2.Transpose * a1;
            Matrix Delta2 = d3.Transpose * a2;

            Theta1 = Matrix.Join(new Matrix(t0.Rows, 1), t0, MatrixDimensions.Columns);
            Theta2 = Matrix.Join(new Matrix(t1.Rows, 1), t1, MatrixDimensions.Columns);

            double scale_value   = lambda / X.Rows;
            Matrix Theta1_scaled = Theta1 * scale_value;
            Matrix Theta2_scaled = Theta2 * scale_value;

            Matrix Theta1_grad = ((Delta1 / X.Rows) + Theta1_scaled).Unrolled;
            Matrix Theta2_grad = ((Delta2 / X.Rows) + Theta2_scaled).Unrolled;

            return new Tuple<double, Matrix>(costFunction, Matrix.Join(Theta1_grad, Theta2_grad, MatrixDimensions.Rows));
        }
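
When calling NNCostFunction directly, the weights must be unrolled in the same order that the Reshape calls above expect; a brief sketch of that round trip, using only the Join/Reshape/Unrolled calls already shown (the sizes are hypothetical):

        // Sketch of the unroll/reshape convention shared by Train and NNCostFunction.
        static void ReshapeSketch()
        {
            int input_layer_size = 2, hidden_layer_size = 2, num_labels = 4;

            Matrix Theta1 = new Matrix(hidden_layer_size, input_layer_size + 1);
            Matrix Theta2 = new Matrix(num_labels, hidden_layer_size + 1);

            // Unroll both weight matrices into a single parameter vector...
            Matrix nn_params = Matrix.Join(Theta1.Unrolled, Theta2.Unrolled, MatrixDimensions.Rows);

            // ...and recover them with the same offsets NNCostFunction uses.
            Matrix T1 = Matrix.Reshape(nn_params, 0, hidden_layer_size, input_layer_size + 1);
            Matrix T2 = Matrix.Reshape(nn_params, hidden_layer_size * (input_layer_size + 1), num_labels, hidden_layer_size + 1);
        }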