/// <summary>
/// This method calculates all the derivatives for each layer in the neural network.
/// It starts by calculating dC/dAL.
/// After this it will go through each layer, starting from the last,
/// calculating the derivative of the cost function with respect to
/// W, b, and A of the previous layer.
///
/// dW and db will be used for updating the parameters.
///
/// dAPrev is passed to the next step of the back prop, as it is used to calculate dZ at that step.
/// </summary>
/// <param name="Y">The true labels of the input data.</param>
/// <param name="AL">The final predictions of the network.</param>
/// <param name="linearCache">A linear cache obtained from forward prop.</param>
/// <param name="zCache">A cache containing every computed linear function Z.</param>
/// <param name="lambda">The L2 regularization hyper-parameter.</param>
/// <returns>The derivatives to run gradient descent with.</returns>
public Dictionary<string, MatrixVectors> BackwardPropagation(MatrixVectors Y, MatrixVectors AL, List<LinearCache> linearCache, List<MatrixVectors> zCache, float lambda)
{
    Dictionary<string, MatrixVectors> gradients = new Dictionary<string, MatrixVectors>();
    List<LinearCache> linearCaches = linearCache;
    List<MatrixVectors> Zs = zCache;

    int layersCount = linearCaches.Count;

    // dAL = -(Y / AL - (1 - Y) / (1 - AL))
    MatrixVectors YDividedByAL = Y.MatrixElementWise(AL, Operation.Divide);
    MatrixVectors OneMinusY = Y.BroadcastScalar(1, Operation.Subtract, true);
    MatrixVectors OneMinusAL = AL.BroadcastScalar(1, Operation.Subtract, true);
    MatrixVectors OneMinusYDividedByOneMinusAL = OneMinusY.MatrixElementWise(OneMinusAL, Operation.Divide);
    MatrixVectors dAL_P1 = YDividedByAL.MatrixElementWise(OneMinusYDividedByOneMinusAL, Operation.Subtract);
    MatrixVectors dAL = dAL_P1.BroadcastScalar(-1, Operation.Multiply);

    // The output layer used the sigmoid activation during forward prop.
    Tuple<MatrixVectors, MatrixVectors, MatrixVectors> derivatives = ActivationsBackward(dAL, Zs[layersCount - 1], linearCaches[layersCount - 1], Activation.Sigmoid, lambda);
    MatrixVectors dWL = derivatives.Item1;
    MatrixVectors dbL = derivatives.Item2;
    MatrixVectors dAPrev = derivatives.Item3;
    gradients.Add("dW" + layersCount, dWL);
    gradients.Add("db" + layersCount, dbL);

    // The hidden layers used ReLu.
    for (int l = layersCount - 1; l > 0; l--)
    {
        Tuple<MatrixVectors, MatrixVectors, MatrixVectors> deriv = ActivationsBackward(dAPrev, Zs[l - 1], linearCaches[l - 1], Activation.ReLu, lambda);
        MatrixVectors dW = deriv.Item1;
        MatrixVectors db = deriv.Item2;
        dAPrev = deriv.Item3;
        gradients.Add("dW" + l, dW);
        gradients.Add("db" + l, db);
    }

    return gradients;
}
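For reference, the dAL built at the top of this method is the derivative of the natural-log cross entropy loss used by ComputeCost; a quick sketch:

// C     = -(y * ln(a) + (1 - y) * ln(1 - a))
// dC/da = -(y / a - (1 - y) / (1 - a))
// which is exactly what dAL holds before it is handed to ActivationsBackward.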
/// <summary>
/// This method runs the linear function z = MatrixMultiplication(w, A_prev) + b.
/// </summary>
/// <param name="previousLayersActivations">A vector containing the previous layer's activations.</param>
/// <param name="weights">A matrix containing the weights.</param>
/// <param name="bias">A vector containing the biases.</param>
/// <returns>
/// The linear cache, which holds the weights, bias and the previous layer's activations. Also returns Z.
/// </returns>
private Tuple<LinearCache, MatrixVectors> LinearForward(MatrixVectors previousLayersActivations, MatrixVectors weights, MatrixVectors bias)
{
    MatrixVectors z = weights.Dot(previousLayersActivations).MatrixElementWise(bias, Operation.Add);
    LinearCache linearCache = new LinearCache(weights, bias, previousLayersActivations);

    return new Tuple<LinearCache, MatrixVectors>(linearCache, z);
}
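Shape-wise, with n_l denoting the neuron count of layer l, this computes:

// weights: n_l x n_(l-1),  previousLayersActivations: n_(l-1) x 1,  bias: n_l x 1
// z = W . A_prev + b  =>  shape n_l x 1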
public static MatrixVectors Dot(this MatrixVectors matrix_1, MatrixVectors matrix_2)
{
    // The number of columns in the first matrix must equal the number of rows in the second.
    if (matrix_1.columns != matrix_2.rows)
    {
        Console.WriteLine("Error in Matrix multiplication");
        return null;
    }

    // The output matrix takes its rows from the first matrix and its columns from the second.
    MatrixVectors outputMatrix = new MatrixVectors(matrix_1.rows, matrix_2.columns);

    for (int c = 0; c < matrix_2.columns; c++)
    {
        for (int y = 0; y < matrix_1.rows; y++)
        {
            float value = 0;
            for (int x = 0; x < matrix_1.columns; x++)
            {
                value += matrix_1.MatrixVector[x, y] * matrix_2.MatrixVector[c, x];
            }
            outputMatrix.MatrixVector[c, y] = value;
        }
    }

    return outputMatrix;
}
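A quick usage sketch (illustrative values only); note that MatrixVector is indexed as [column, row] throughout this code base:

// Multiply a 2x3 matrix by a 3x1 column vector of ones.
MatrixVectors m = new MatrixVectors(2, 3);   // 2 rows, 3 columns
MatrixVectors v = new MatrixVectors(3, 1);   // 3 rows, 1 column
m.MatrixVector[0, 0] = 1; m.MatrixVector[1, 0] = 2; m.MatrixVector[2, 0] = 3;
m.MatrixVector[0, 1] = 4; m.MatrixVector[1, 1] = 5; m.MatrixVector[2, 1] = 6;
v.MatrixVector[0, 0] = 1; v.MatrixVector[0, 1] = 1; v.MatrixVector[0, 2] = 1;
MatrixVectors product = m.Dot(v);            // 2x1 vector holding [6, 15]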
/// <summary>
/// This method uses the cross entropy cost function to calculate the losses.
/// </summary>
/// <param name="AL">The final prediction of the network ranging from 0 to 1.</param>
/// <param name="_y">The true label, 0 or 1.</param>
/// <param name="lambda">The L2 regularization hyper-parameter.</param>
/// <param name="theta">Dictionary containing the weights and biases.</param>
/// <param name="dims">Number of neurons in each layer of the network.</param>
/// <returns>A float value which is the calculated loss.</returns>
public float ComputeCost(MatrixVectors AL, MatrixVectors _y, float lambda, Dictionary<string, MatrixVectors> theta, int[] dims)
{
    if (AL.columns > 1 || _y.columns > 1 || !AL.CompareShape(_y))
    {
        Console.WriteLine("Invalid YShape");
        return 0f;
    }

    float crossEntropyCost = 0;
    float regularizedCost = 0;

    // L2 regularization term: (lambda / 2) * sum of the squares of all weights.
    for (int l = 1; l < dims.Length; l++)
    {
        regularizedCost += MatrixCalculations.MatrixSummation(MatrixCalculations.Square(theta["W" + l]));
    }
    regularizedCost *= lambda / 2;

    for (int y = 0; y < _y.rows; y++)
    {
        float currentAL = AL.MatrixVector[0, y];
        float currentY = _y.MatrixVector[0, y];
        // Natural log keeps the cost consistent with the dAL derivative
        // computed in BackwardPropagation.
        float currentCost = (float)-(currentY * Math.Log(currentAL) + (1 - currentY) * Math.Log(1 - currentAL));
        crossEntropyCost += currentCost;
    }

    float totalCost = crossEntropyCost + regularizedCost;

    return totalCost;
}
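A quick sanity check of the cross entropy term with illustrative numbers:

// For a single label y = 1 and prediction a = 0.8:
//   cost = -(1 * ln(0.8) + 0 * ln(0.2)) = -ln(0.8) ≈ 0.223
// A confident correct prediction (a close to 1) drives the cost toward 0,
// while a confident wrong one (a close to 0) blows it up.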
public static MatrixVectors Maximum(MatrixVectors matrix_1, float scalar = 0, MatrixVectors matrix_2 = null)
{
    if (matrix_2 != null)
    {
        if (!matrix_1.CompareShape(matrix_2))
        {
            Console.WriteLine("Matrix shapes do not align");
            return null;
        }
    }

    MatrixVectors maximizedMatrix = new MatrixVectors(matrix_1.rows, matrix_1.columns);

    // Element-wise max against either the second matrix or the scalar.
    for (int y = 0; y < maximizedMatrix.rows; y++)
    {
        for (int x = 0; x < maximizedMatrix.columns; x++)
        {
            if (matrix_2 != null)
            {
                maximizedMatrix.MatrixVector[x, y] = Math.Max(matrix_1.MatrixVector[x, y], matrix_2.MatrixVector[x, y]);
            }
            else
            {
                maximizedMatrix.MatrixVector[x, y] = Math.Max(scalar, matrix_1.MatrixVector[x, y]);
            }
        }
    }

    return maximizedMatrix;
}
public static MatrixVectors ListToVector(this List<float> list, int axis)
{
    MatrixVectors vector;
    if (axis == 0)
    {
        // Row vector.
        vector = new MatrixVectors(1, list.Count);
        for (int x = 0; x < list.Count; x++)
        {
            vector.MatrixVector[x, 0] = list[x];
        }
    }
    else if (axis == 1)
    {
        // Column vector.
        vector = new MatrixVectors(list.Count, 1);
        for (int y = 0; y < list.Count; y++)
        {
            vector.MatrixVector[0, y] = list[y];
        }
    }
    else
    {
        return null;
    }

    return vector;
}
/// <summary>
/// This method does the sigmoid calculation, equivalent to 1 / (1 + np.exp(-z)) in Python.
/// </summary>
/// <param name="Z">The linear function of the weights, biases and previous layer's activations.</param>
/// <returns>A vector containing the non-linear sigmoid activations of the linear function z.</returns>
private MatrixVectors Sigmoid(MatrixVectors Z)
{
    // exp(-Z), then 1 + exp(-Z), then 1 / (1 + exp(-Z)).
    MatrixVectors activationsVector = Z.BroadcastScalar(-1, Operation.Multiply).Exp();
    activationsVector = activationsVector.BroadcastScalar(1, Operation.Add);
    activationsVector = activationsVector.BroadcastScalar(1, Operation.Divide, true);

    return activationsVector;
}
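A couple of spot checks on the sigmoid (illustrative values):

// Sigmoid maps any real z into (0, 1):
//   z = 0  -> 0.5
//   z = 5  -> ~0.993
//   z = -5 -> ~0.007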
/// <summary>
/// Calculates the derivative of the cross entropy cost function with respect to Z,
/// assuming A of the same layer as Z was calculated using the sigmoid function.
/// </summary>
/// <param name="dA">Derivative of the cost function with respect to the activation.</param>
/// <param name="Z">The linear function of the weights, biases and previous layer's activations.</param>
/// <returns>The derivative of the cost function with respect to Z.</returns>
private MatrixVectors SigmoidPrime(MatrixVectors dA, MatrixVectors Z)
{
    // dZ = dA * A * (1 - A), since sigmoid'(z) = sigmoid(z) * (1 - sigmoid(z)).
    MatrixVectors A = Sigmoid(Z);
    MatrixVectors OneMinusA = A.BroadcastScalar(1, Operation.Subtract, true);
    MatrixVectors AMultipliedByOneMinusA = A.MatrixElementWise(OneMinusA, Operation.Multiply);
    MatrixVectors dZ = dA.MatrixElementWise(AMultipliedByOneMinusA, Operation.Multiply);

    return dZ;
}
/// <summary>
/// Uses gradient descent to update the weights and biases.
/// </summary>
/// <param name="theta">Dictionary containing the weights and biases of the network.</param>
/// <param name="gradients">The derivatives used to run gradient descent.</param>
/// <param name="dims">Number of neurons in each layer of the network.</param>
/// <param name="alpha">The learning rate.</param>
/// <returns>The updated parameters theta.</returns>
public Dictionary<string, MatrixVectors> UpdateParameters(Dictionary<string, MatrixVectors> theta, Dictionary<string, MatrixVectors> gradients, int[] dims, float alpha)
{
    for (int l = 1; l < dims.Length; l++)
    {
        // W[l] -= alpha * dW[l];  b[l] -= alpha * db[l]
        MatrixVectors dWxLearningRate = gradients["dW" + l].BroadcastScalar(alpha, Operation.Multiply);
        MatrixVectors dbxLearningRate = gradients["db" + l].BroadcastScalar(alpha, Operation.Multiply);
        theta["W" + l] = theta["W" + l].MatrixElementWise(dWxLearningRate, Operation.Subtract);
        theta["b" + l] = theta["b" + l].MatrixElementWise(dbxLearningRate, Operation.Subtract);
    }

    return theta;
}
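Taken together, the public methods form a standard training loop. A minimal sketch, assuming they live on a single class; the class name NeuralNetwork and every hyper-parameter value below are illustrative, not taken from the original:

// Sketch only: `nn`, the epoch count, lambda (0.01f) and alpha (0.0075f) are assumed values.
NeuralNetwork nn = new NeuralNetwork();
int[] dims = { 4, 5, 1 };                          // 4 iris features -> 5 hidden -> 1 output
Dictionary<string, MatrixVectors> theta = nn.InitalizeParameters(dims);
Tuple<List<MatrixVectors>, List<MatrixVectors>> data = nn.LoadIrisData();

for (int epoch = 0; epoch < 100; epoch++)
{
    for (int i = 0; i < data.Item1.Count; i++)     // one training example at a time
    {
        var caches = nn.ForwardPropagation(data.Item1[i], theta, dims);
        float cost = nn.ComputeCost(caches.Item3, data.Item2[i], 0.01f, theta, dims);
        var gradients = nn.BackwardPropagation(data.Item2[i], caches.Item3, caches.Item1, caches.Item2, 0.01f);
        theta = nn.UpdateParameters(theta, gradients, dims, 0.0075f);
    }
}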
public static float MatrixSummation(MatrixVectors matrix)
{
    float sum = 0;
    for (int y = 0; y < matrix.rows; y++)
    {
        for (int x = 0; x < matrix.columns; x++)
        {
            sum += matrix.MatrixVector[x, y];
        }
    }

    return sum;
}
public static MatrixVectors Transpose(this MatrixVectors matrix)
{
    MatrixVectors matrixTranspose = new MatrixVectors(matrix.columns, matrix.rows);
    for (int y = 0; y < matrixTranspose.rows; y++)
    {
        for (int x = 0; x < matrixTranspose.columns; x++)
        {
            matrixTranspose.MatrixVector[x, y] = matrix.MatrixVector[y, x];
        }
    }

    return matrixTranspose;
}
public static MatrixVectors Sqrt(this MatrixVectors matrix)
{
    MatrixVectors outputMatrix = new MatrixVectors(matrix.rows, matrix.columns);
    for (int y = 0; y < matrix.rows; y++)
    {
        for (int x = 0; x < matrix.columns; x++)
        {
            outputMatrix.MatrixVector[x, y] = (float)Math.Sqrt(matrix.MatrixVector[x, y]);
        }
    }

    return outputMatrix;
}
/// <summary>
/// Initializes the weights and biases and returns them as a dictionary;
/// the string key represents the name of the parameter as "W[l]" or "b[l]".
/// </summary>
/// <param name="dims">Number of neurons in each layer of the network.</param>
/// <returns>Dictionary containing the weights and biases.</returns>
public Dictionary<string, MatrixVectors> InitalizeParameters(int[] dims)
{
    Dictionary<string, MatrixVectors> theta = new Dictionary<string, MatrixVectors>();
    for (int l = 1; l < dims.Length; l++)
    {
        MatrixVectors weights = new MatrixVectors(dims[l], dims[l - 1]);
        MatrixVectors bias = new MatrixVectors(dims[l], 1);

        // Randomize first, then scale by sqrt(1 / n) of the previous layer's size.
        // Note 1.0 forces floating-point division; 1 / dims[l - 1] would
        // truncate to 0 in integer arithmetic and zero out every weight.
        weights.InitializeRandom();
        weights = weights.BroadcastScalar((float)Math.Sqrt(1.0 / dims[l - 1]), Operation.Multiply);

        theta.Add("W" + l, weights);
        theta.Add("b" + l, bias);
    }

    return theta;
}
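As a concrete example of the shapes this produces:

// For dims = { 4, 5, 1 }:
//   W1 is 5x4 and b1 is 5x1 (hidden layer),
//   W2 is 1x5 and b2 is 1x1 (output layer).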
public static MatrixVectors BroadcastScalar(this MatrixVectors matrix, float scalar, Operation operation, bool reverse = false)
{
    MatrixVectors outputMatrix = new MatrixVectors(matrix.rows, matrix.columns);
    for (int y = 0; y < matrix.rows; y++)
    {
        for (int x = 0; x < matrix.columns; x++)
        {
            switch (operation)
            {
                case Operation.Add:
                    outputMatrix.MatrixVector[x, y] = matrix.MatrixVector[x, y] + scalar;
                    break;
                case Operation.Subtract:
                    // reverse swaps the operands: scalar - element instead of element - scalar.
                    if (reverse)
                    {
                        outputMatrix.MatrixVector[x, y] = scalar - matrix.MatrixVector[x, y];
                    }
                    else
                    {
                        outputMatrix.MatrixVector[x, y] = matrix.MatrixVector[x, y] - scalar;
                    }
                    break;
                case Operation.Multiply:
                    outputMatrix.MatrixVector[x, y] = matrix.MatrixVector[x, y] * scalar;
                    break;
                case Operation.Divide:
                    if (reverse)
                    {
                        outputMatrix.MatrixVector[x, y] = scalar / matrix.MatrixVector[x, y];
                    }
                    else
                    {
                        outputMatrix.MatrixVector[x, y] = matrix.MatrixVector[x, y] / scalar;
                    }
                    break;
            }
        }
    }

    return outputMatrix;
}
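The reverse flag only matters for the non-commutative operations; a quick sketch, where A stands for any MatrixVectors instance:

MatrixVectors oneMinusA  = A.BroadcastScalar(1, Operation.Subtract, true); // 1 - a for every element
MatrixVectors reciprocal = A.BroadcastScalar(1, Operation.Divide, true);   // 1 / a for every element
MatrixVectors aMinusOne  = A.BroadcastScalar(1, Operation.Subtract);       // a - 1 for every element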
/// <summary>
/// Flattens the matrix into a vector.
/// axis = 0 returns a row vector.
/// axis = 1 returns a column vector.
/// </summary>
public static MatrixVectors Flatten(this MatrixVectors matrix, int axis)
{
    MatrixVectors flattenedMatrix;
    int index = 0;
    if (axis == 0)
    {
        flattenedMatrix = new MatrixVectors(1, matrix.rows * matrix.columns);
        for (int y = 0; y < matrix.rows; y++)
        {
            for (int x = 0; x < matrix.columns; x++)
            {
                flattenedMatrix.MatrixVector[index, 0] = matrix.MatrixVector[x, y];
                index++;
            }
        }
    }
    else if (axis == 1)
    {
        flattenedMatrix = new MatrixVectors(matrix.rows * matrix.columns, 1);
        for (int y = 0; y < matrix.rows; y++)
        {
            for (int x = 0; x < matrix.columns; x++)
            {
                flattenedMatrix.MatrixVector[0, index] = matrix.MatrixVector[x, y];
                index++;
            }
        }
    }
    else
    {
        return null;
    }

    return flattenedMatrix;
}
public Tuple<List<MatrixVectors>, List<MatrixVectors>> LoadIrisData(string path = "C:\\Users/Admin/source/repos/ANN/ANN/Data/IrisData/IrisTraining.csv")
{
    List<MatrixVectors> dataVector = new List<MatrixVectors>();
    List<MatrixVectors> labelVector = new List<MatrixVectors>();

    using (StreamReader labeledReader = new StreamReader(path))
    {
        int k = 0;
        while (!labeledReader.EndOfStream)
        {
            k++;
            string line = labeledReader.ReadLine();
            if (k != 1) // skip the header row
            {
                string[] values = line.Split(',');
                List<float> currentData = new List<float>();
                MatrixVectors currentLabel = new MatrixVectors(1, 1);
                for (int i = 1; i < values.Length; i++) // skip the Id column
                {
                    if (i == 5) // the species column becomes the binary label
                    {
                        if (values[i] == "Iris-setosa")
                        {
                            currentLabel.MatrixVector[0, 0] = 0;
                        }
                        else if (values[i] == "Iris-versicolor")
                        {
                            currentLabel.MatrixVector[0, 0] = 1;
                        }
                    }
                    else
                    {
                        currentData.Add(float.Parse(values[i]));
                    }
                }
                dataVector.Add(currentData.ListToVector(1));
                labelVector.Add(currentLabel);
            }
        }
    }

    return new Tuple<List<MatrixVectors>, List<MatrixVectors>>(dataVector, labelVector);
}
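The column indices above imply a Kaggle-style Iris CSV with a header row and an Id column, roughly as follows (inferred from the code, not confirmed by the source):

// Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
// 1,5.1,3.5,1.4,0.2,Iris-setosa
// 51,7.0,3.2,4.7,1.4,Iris-versicolor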
/// <summary>
/// This method's job is to calculate the activations of each layer.
/// It uses the input layer as the first layer's previous activations
/// and uses theta to calculate the linear function for the activations.
///
/// This method gathers the linear and z caches of every layer.
/// It will then generate a prediction (AL) as the final layer's activations.
/// </summary>
/// <param name="xInput">The input layer of the network.</param>
/// <param name="theta">The weights and biases of the network.</param>
/// <param name="dims">Number of neurons in each layer of the network.</param>
/// <returns>A tuple containing the linear and z caches along with the prediction.</returns>
public Tuple<List<LinearCache>, List<MatrixVectors>, MatrixVectors> ForwardPropagation(MatrixVectors xInput, Dictionary<string, MatrixVectors> theta, int[] dims)
{
    List<LinearCache> linearCaches = new List<LinearCache>();
    List<MatrixVectors> z_cache = new List<MatrixVectors>();

    MatrixVectors previousLayersActivations = xInput;

    // Hidden layers use ReLu...
    for (int l = 1; l < dims.Length - 1; l++)
    {
        MatrixVectors weights = theta["W" + l];
        MatrixVectors bias = theta["b" + l];

        Tuple<LinearCache, MatrixVectors, MatrixVectors> cacheAndActivation = ActivationsForward(previousLayersActivations, weights, bias, Activation.ReLu);

        LinearCache linearCache = cacheAndActivation.Item1;
        MatrixVectors z = cacheAndActivation.Item2;

        linearCaches.Add(linearCache);
        z_cache.Add(z);

        previousLayersActivations = cacheAndActivation.Item3;
    }

    // ...while the output layer uses sigmoid.
    MatrixVectors finalWeights = theta["W" + (dims.Length - 1)];
    MatrixVectors finalBias = theta["b" + (dims.Length - 1)];
    Tuple<LinearCache, MatrixVectors, MatrixVectors> finalLinearCacheAndActivation = ActivationsForward(previousLayersActivations, finalWeights, finalBias, Activation.Sigmoid);

    LinearCache finalLinearCache = finalLinearCacheAndActivation.Item1;
    MatrixVectors finalZ = finalLinearCacheAndActivation.Item2;
    MatrixVectors finalActivation = finalLinearCacheAndActivation.Item3;

    linearCaches.Add(finalLinearCache);
    z_cache.Add(finalZ);

    return new Tuple<List<LinearCache>, List<MatrixVectors>, MatrixVectors>(linearCaches, z_cache, finalActivation);
}
/// <summary>
/// This method calculates the derivatives of the parameters and the
/// derivative of the previous layer's activations, all with respect to the
/// cross entropy cost function.
/// </summary>
/// <param name="dZ">The derivative of the cost function with respect to Z.</param>
/// <param name="linearCache">A linear cache obtained from forward prop.</param>
/// <param name="lambda">The L2 regularization hyper-parameter.</param>
/// <returns>
/// The derivatives for gradient descent.
/// </returns>
private Tuple<MatrixVectors, MatrixVectors, MatrixVectors> LinearBackward(MatrixVectors dZ, LinearCache linearCache, float lambda)
{
    // dW     = dZ . A_prev^T + lambda * W   (the second term is the L2 penalty's gradient)
    // db     = dZ summed across its columns
    // dAPrev = W^T . dZ
    MatrixVectors regularizedWeight = linearCache.weights.BroadcastScalar(lambda, Operation.Multiply);
    MatrixVectors dW = dZ.Dot(linearCache.previousLayersActivations.Transpose());
    MatrixVectors dWRegularized = dW.MatrixElementWise(regularizedWeight, Operation.Add);
    MatrixVectors db = dZ.MatrixAxisSummation(1);
    MatrixVectors dAPrev = linearCache.weights.Transpose().Dot(dZ);

    if (!dW.CompareShape(linearCache.weights))
    {
        Console.WriteLine("Does not have the right shape for dW");
    }
    if (!db.CompareShape(linearCache.bias))
    {
        Console.WriteLine("Does not have the right shape for db");
    }
    if (!dAPrev.CompareShape(linearCache.previousLayersActivations))
    {
        Console.WriteLine("Does not have the right shape for dAPrev");
    }

    return new Tuple<MatrixVectors, MatrixVectors, MatrixVectors>(dWRegularized, db, dAPrev);
}
/// <summary>
/// Calculates the derivative of the cross entropy
/// cost function with respect to Z, assuming A of the same layer as Z was
/// calculated using the ReLu function.
/// </summary>
/// <param name="dA">Derivative of the cost function with respect to the activation.</param>
/// <param name="Z">The linear function of the weights, biases and previous layer's activations.</param>
/// <returns>The derivative of the cost function with respect to Z.</returns>
private MatrixVectors ReLuPrime(MatrixVectors dA, MatrixVectors Z)
{
    if (!dA.CompareShape(Z))
    {
        Console.WriteLine("Error");
        return null;
    }

    // ReLu'(z) is 1 for z > 0 and 0 otherwise, so dZ is dA with entries
    // zeroed wherever Z <= 0. A new matrix is filled so the caller's dA
    // is left untouched.
    MatrixVectors dZ = new MatrixVectors(dA.rows, dA.columns);
    for (int y = 0; y < dZ.rows; y++)
    {
        for (int x = 0; x < dZ.columns; x++)
        {
            dZ.MatrixVector[x, y] = Z.MatrixVector[x, y] <= 0 ? 0 : dA.MatrixVector[x, y];
        }
    }

    return dZ;
}
/// <summary>
/// This method runs the linear function and the specified activation function
/// to calculate the Z and A of the current layer.
/// </summary>
/// <param name="previousLayersActivations">Vector of the previous layer's activations.</param>
/// <param name="weights">Matrix of the current layer's weights.</param>
/// <param name="bias">Vector of the current layer's biases.</param>
/// <param name="activation">The type of activation function to use.</param>
/// <returns>
/// A tuple with the linear cache as the first item, Z as the second item
/// and the final activations as the third item.
/// </returns>
private Tuple<LinearCache, MatrixVectors, MatrixVectors> ActivationsForward(MatrixVectors previousLayersActivations, MatrixVectors weights, MatrixVectors bias, Activation activation)
{
    Tuple<LinearCache, MatrixVectors> cache = LinearForward(previousLayersActivations, weights, bias);
    MatrixVectors z = cache.Item2;
    MatrixVectors activationsVector;
    switch (activation)
    {
        case Activation.Sigmoid:
            activationsVector = Sigmoid(z);
            break;
        case Activation.ReLu:
            activationsVector = Relu(z);
            break;
        default:
            throw new ArgumentOutOfRangeException();
    }
    LinearCache linearCache = cache.Item1;

    return new Tuple<LinearCache, MatrixVectors, MatrixVectors>(linearCache, z, activationsVector);
}
/// <summary>
/// axis = 0 returns a row vector in which each entry is a column summed down.
/// axis = 1 returns a column vector in which each entry is a row summed across.
/// In numpy this is equivalent to np.sum with keepdims=True.
/// </summary>
public static MatrixVectors MatrixAxisSummation(this MatrixVectors matrix, int axis)
{
    if (axis == 0)
    {
        MatrixVectors summedOverColumns = new MatrixVectors(1, matrix.columns);
        for (int x = 0; x < matrix.columns; x++)
        {
            for (int y = 0; y < matrix.rows; y++)
            {
                summedOverColumns.MatrixVector[x, 0] += matrix.MatrixVector[x, y];
            }
        }
        return summedOverColumns;
    }
    else if (axis == 1)
    {
        MatrixVectors summedOverRows = new MatrixVectors(matrix.rows, 1);
        for (int y = 0; y < matrix.rows; y++)
        {
            for (int x = 0; x < matrix.columns; x++)
            {
                summedOverRows.MatrixVector[0, y] += matrix.MatrixVector[x, y];
            }
        }
        return summedOverRows;
    }
    else
    {
        throw new ArgumentOutOfRangeException();
    }
}
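For example, for a 2x3 matrix:

// Given
//   [1 2 3]
//   [4 5 6]
// axis = 0 yields the 1x3 row vector [5 7 9] (columns summed down),
// axis = 1 yields the 2x1 column vector [6, 15] (rows summed across).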
public static MatrixVectors MatrixElementWise(this MatrixVectors matrix_1, MatrixVectors matrix_2, Operation operation)
{
    if (matrix_1.columns != matrix_2.columns || matrix_1.rows != matrix_2.rows)
    {
        Console.WriteLine("Cannot do element-wise operations on matrices of different shapes");
        return null;
    }

    MatrixVectors outputMatrix = new MatrixVectors(matrix_1.rows, matrix_1.columns);

    for (int y = 0; y < outputMatrix.rows; y++)
    {
        for (int x = 0; x < outputMatrix.columns; x++)
        {
            switch (operation)
            {
                case Operation.Add:
                    outputMatrix.MatrixVector[x, y] = matrix_1.MatrixVector[x, y] + matrix_2.MatrixVector[x, y];
                    break;
                case Operation.Subtract:
                    outputMatrix.MatrixVector[x, y] = matrix_1.MatrixVector[x, y] - matrix_2.MatrixVector[x, y];
                    break;
                case Operation.Multiply:
                    outputMatrix.MatrixVector[x, y] = matrix_1.MatrixVector[x, y] * matrix_2.MatrixVector[x, y];
                    break;
                case Operation.Divide:
                    outputMatrix.MatrixVector[x, y] = matrix_1.MatrixVector[x, y] / matrix_2.MatrixVector[x, y];
                    break;
            }
        }
    }

    return outputMatrix;
}
public bool CompareShape(MatrixVectors matrix)
{
    return matrix.columns == columns && matrix.rows == rows;
}
/// <summary>
/// This method calculates dC with respect to Z using the derivative of the
/// specified activation, then uses this dC/dZ to calculate the other derivatives.
/// </summary>
/// <param name="dA">The derivative of the cost function with respect to the activations.</param>
/// <param name="Z">The linear function of the weights, biases and previous layer's activations.</param>
/// <param name="linearCache">A linear cache obtained from forward prop.</param>
/// <param name="activation">The type of activation to use. Corresponds with the activation that was used for this layer during forward prop.</param>
/// <param name="lambda">The L2 regularization hyper-parameter.</param>
/// <returns>The derivatives provided by the <see cref="LinearBackward"/> function.</returns>
private Tuple<MatrixVectors, MatrixVectors, MatrixVectors> ActivationsBackward(MatrixVectors dA, MatrixVectors Z, LinearCache linearCache, Activation activation, float lambda)
{
    MatrixVectors dZ;
    switch (activation)
    {
        case Activation.Sigmoid:
            dZ = SigmoidPrime(dA, Z);
            break;
        case Activation.ReLu:
            dZ = ReLuPrime(dA, Z);
            break;
        default:
            throw new ArgumentOutOfRangeException();
    }

    return LinearBackward(dZ, linearCache, lambda);
}
/// <summary>
/// Executes the non-linear ReLu activation function on some linear function Z.
/// </summary>
/// <param name="Z">The linear function of the weights, biases and previous layer's activations.</param>
/// <returns>A vector containing the non-linear ReLu activations of the linear function z.</returns>
private MatrixVectors Relu(MatrixVectors Z)
{
    // ReLu(z) = max(0, z), applied element-wise.
    MatrixVectors activationsVector = MatrixCalculations.Maximum(Z, 0);

    return activationsVector;
}