public static WeightsUpdater RMSProp([NotNull] RMSPropInfo info, [NotNull] NeuralNetworkBase network)
{
    // Setup
    float eta = info.Eta, rho = info.Rho, lambda = info.Lambda, epsilon = info.Epsilon;
    float[][]
        mW = new float[network.WeightedLayersIndexes.Length][],
        mB = new float[network.WeightedLayersIndexes.Length][];
    for (int i = 0; i < network.WeightedLayersIndexes.Length; i++)
    {
        WeightedLayerBase layer = network.Layers[network.WeightedLayersIndexes[i]].To<INetworkLayer, WeightedLayerBase>();
        mW[i] = new float[layer.Weights.Length];
        mB[i] = new float[layer.Biases.Length];
    }

    // Closure
    unsafe void Minimize(int i, in Tensor dJdw, in Tensor dJdb, int samples, WeightedLayerBase layer)
    {
        // Tweak the weights
        float
            alpha = eta / samples,
            l2Factor = eta * lambda / samples;
        fixed (float* pw = layer.Weights, pmw = mW[i])
        {
            float* pdj = dJdw;
            int w = layer.Weights.Length;
            for (int x = 0; x < w; x++)
            {
                float pdJi = pdj[x];
                pmw[x] = rho * pmw[x] + (1 - rho) * pdJi * pdJi;
                pw[x] -= l2Factor * pw[x] + alpha * pdJi / ((float)Math.Sqrt(pmw[x]) + epsilon);
            }
        }

        // Tweak the biases of the lth layer
        fixed (float* pb = layer.Biases, pmb = mB[i])
        {
            float* pdj = dJdb;
            int w = layer.Biases.Length;
            for (int b = 0; b < w; b++)
            {
                float pdJi = pdj[b];
                pmb[b] = rho * pmb[b] + (1 - rho) * pdJi * pdJi;
                pb[b] -= alpha * pdJi / ((float)Math.Sqrt(pmb[b]) + epsilon);
            }
        }
    }

    return Minimize;
}
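For reference, the closure above implements the standard RMSProp step, with the L2 penalty folded into the per-sample learning rate. With minibatch size $m$ and gradient $g_t$, the weights update is

$$v_t = \rho\,v_{t-1} + (1 - \rho)\,g_t^2, \qquad w_t = w_{t-1} - \frac{\eta\lambda}{m}\,w_{t-1} - \frac{\eta}{m} \cdot \frac{g_t}{\sqrt{v_t} + \epsilon}$$

and the bias update is identical except that it skips the weight-decay term.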
public static WeightsUpdater Momentum([NotNull] MomentumInfo info, [NotNull] NeuralNetworkBase network)
{
    // Setup
    float eta = info.Eta, lambda = info.Lambda, momentum = info.Momentum;
    float[][]
        mW = new float[network.WeightedLayersIndexes.Length][],
        mB = new float[network.WeightedLayersIndexes.Length][];
    for (int i = 0; i < network.WeightedLayersIndexes.Length; i++)
    {
        WeightedLayerBase layer = network.Layers[network.WeightedLayersIndexes[i]].To<INetworkLayer, WeightedLayerBase>();
        mW[i] = new float[layer.Weights.Length];
        mB[i] = new float[layer.Biases.Length];
    }

    // Closure
    unsafe void Minimize(int i, in Tensor dJdw, in Tensor dJdb, int samples, WeightedLayerBase layer)
    {
        // Tweak the weights
        float
            alpha = eta / samples,
            l2Factor = eta * lambda / samples;
        fixed (float* pw = layer.Weights, pmw = mW[i])
        {
            float* pdj = dJdw;
            int w = layer.Weights.Length;
            for (int x = 0; x < w; x++)
            {
                pmw[x] = momentum * pmw[x] + pdj[x];
                pw[x] -= l2Factor * pw[x] + alpha * pmw[x];
            }
        }

        // Tweak the biases of the lth layer
        fixed (float* pb = layer.Biases, pmb = mB[i])
        {
            float* pdj = dJdb;
            int w = layer.Biases.Length;
            for (int b = 0; b < w; b++)
            {
                pmb[b] = momentum * pmb[b] + pdj[b];
                pb[b] -= alpha * pmb[b]; // Step along the accumulated velocity, matching the weights branch
            }
        }
    }

    return Minimize;
}
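The momentum closure accumulates a velocity $v_t = \mu\,v_{t-1} + g_t$ for every parameter and steps along it, again applying L2 decay to the weights only:

$$w_t = w_{t-1} - \frac{\eta\lambda}{m}\,w_{t-1} - \frac{\eta}{m}\,v_t$$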
private static unsafe void TestGradient(WeightedLayerBase cpu, WeightedLayerBase gpu, float[,] x, float[,] delta)
{
    fixed (float* px = x, pdelta = delta)
    {
        Tensor.Reshape(px, x.GetLength(0), x.GetLength(1), out Tensor xt);
        Tensor.Reshape(pdelta, delta.GetLength(0), delta.GetLength(1), out Tensor deltat);
        cpu.ComputeGradient(xt, deltat, out Tensor dJdw_cpu, out Tensor dJdb_cpu);
        gpu.ComputeGradient(xt, deltat, out Tensor dJdw_gpu, out Tensor dJdb_gpu);
        Assert.IsTrue(dJdw_cpu.ContentEquals(dJdw_gpu));
        Assert.IsTrue(dJdb_cpu.ContentEquals(dJdb_gpu, 1e-4f, 1e-5f)); // The cuDNN ConvolutionBackwardBias is not always as precise as the CPU version
        dJdw_cpu.Free();
        dJdw_gpu.Free();
        dJdb_cpu.Free();
        dJdb_gpu.Free();
    }
}
private static void TestBackward(WeightedLayerBase cpu, WeightedLayerBase gpu, int samples)
{
    SetBackpropagationProperty(true);
    Tensor
        x = CreateRandomTensor(samples, cpu.InputInfo.Size),
        dy = CreateRandomTensor(samples, cpu.OutputInfo.Size);
    Tensor.Like(x, out Tensor dx1);
    Tensor.Like(x, out Tensor dx2);
    cpu.Forward(x, out Tensor z_cpu, out Tensor a_cpu);
    gpu.Forward(x, out Tensor z_gpu, out Tensor a_gpu);
    cpu.Backpropagate(x, z_cpu, dy, dx1, out Tensor dJdw_cpu, out Tensor dJdb_cpu);
    gpu.Backpropagate(x, z_gpu, dy, dx2, out Tensor dJdw_gpu, out Tensor dJdb_gpu);
    Assert.IsTrue(dx1.ContentEquals(dx2, 1e-5f, 1e-5f));
    Assert.IsTrue(dJdw_cpu.ContentEquals(dJdw_gpu, 1e-4f, 1e-5f));
    Assert.IsTrue(dJdb_cpu.ContentEquals(dJdb_gpu, 1e-4f, 1e-5f)); // The cuDNN ConvolutionBackwardBias is not always as precise as the CPU version
    Tensor.Free(x, dy, dx1, dx2, z_cpu, a_cpu, z_gpu, a_gpu, dJdw_cpu, dJdb_cpu, dJdw_gpu, dJdb_gpu);
    SetBackpropagationProperty(false);
}
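TestBackward depends on a CreateRandomTensor factory that is not part of this section. A minimal sketch of what it could look like, assuming a Tensor.New(n, chw, out Tensor) allocator alongside the implicit float* conversion already used above; the fixed seed is a hypothetical choice to keep the CPU/GPU comparisons reproducible:

private static unsafe Tensor CreateRandomTensor(int samples, int size)
{
    Random random = new Random(42); // Hypothetical fixed seed for reproducible test runs
    Tensor.New(samples, size, out Tensor tensor);
    float* p = tensor;
    for (int i = 0; i < samples * size; i++)
        p[i] = (float)random.NextDouble() - 0.5f; // Small values centered on zero
    return tensor;
}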
public static WeightsUpdater AdaMax([NotNull] AdaMaxInfo info, [NotNull] NeuralNetworkBase network)
{
    // Initialize the AdaMax parameters
    float eta = info.Eta, beta1 = info.Beta1, beta2 = info.Beta2;
    float[][]
        mW = new float[network.WeightedLayersIndexes.Length][],
        uW = new float[network.WeightedLayersIndexes.Length][],
        mB = new float[network.WeightedLayersIndexes.Length][],
        uB = new float[network.WeightedLayersIndexes.Length][];
    float[] beta1t = new float[network.WeightedLayersIndexes.Length];
    for (int i = 0; i < network.WeightedLayersIndexes.Length; i++)
    {
        WeightedLayerBase layer = network.Layers[network.WeightedLayersIndexes[i]].To<INetworkLayer, WeightedLayerBase>();
        mW[i] = new float[layer.Weights.Length];
        uW[i] = new float[layer.Weights.Length];
        mB[i] = new float[layer.Biases.Length];
        uB[i] = new float[layer.Biases.Length];
        beta1t[i] = beta1;
    }

    // AdaMax update for weights and biases
    unsafe void Minimize(int i, in Tensor dJdw, in Tensor dJdb, int samples, WeightedLayerBase layer)
    {
        // Beta1 decay factor at timestep t
        float b1t = beta1t[i];
        beta1t[i] *= beta1;

        // Weights
        fixed (float* pw = layer.Weights, pm = mW[i], pu = uW[i])
        {
            float* pdJ = dJdw;
            int w = layer.Weights.Length;
            for (int x = 0; x < w; x++)
            {
                float pdJi = pdJ[x];
                pm[x] = beta1 * pm[x] + (1 - beta1) * pdJi;
                pu[x] = (beta2 * pu[x]).Max(pdJi.Abs());
                pw[x] -= eta / (1 - b1t) * pm[x] / pu[x];
            }
        }

        // Biases
        fixed (float* pb = layer.Biases, pm = mB[i], pu = uB[i])
        {
            float* pdJ = dJdb;
            int w = layer.Biases.Length;
            for (int b = 0; b < w; b++)
            {
                float pdJi = pdJ[b];
                pm[b] = beta1 * pm[b] + (1 - beta1) * pdJi;
                pu[b] = (beta2 * pu[b]).Max(pdJi.Abs());
                pb[b] -= eta / (1 - b1t) * pm[b] / pu[b];
            }
        }
    }

    return Minimize;
}

#endregion
}
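AdaMax, the infinity-norm variant of Adam proposed in the same paper (Kingma & Ba, 2015), replaces the second-moment estimate with a running max, which removes the need for an epsilon term:

$$m_t = \beta_1 m_{t-1} + (1 - \beta_1)\,g_t, \qquad u_t = \max(\beta_2 u_{t-1}, |g_t|), \qquad w_t = w_{t-1} - \frac{\eta}{1 - \beta_1^t} \cdot \frac{m_t}{u_t}$$

Note that, unlike the SGD-style updaters above, this closure does not divide $\eta$ by the sample count.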
public static WeightsUpdater Adam([NotNull] AdamInfo info, [NotNull] NeuralNetworkBase network)
{
    // Initialize the Adam parameters
    float eta = info.Eta, beta1 = info.Beta1, beta2 = info.Beta2, epsilon = info.Epsilon;
    float[][]
        mW = new float[network.WeightedLayersIndexes.Length][],
        vW = new float[network.WeightedLayersIndexes.Length][],
        mB = new float[network.WeightedLayersIndexes.Length][],
        vB = new float[network.WeightedLayersIndexes.Length][];
    float[]
        beta1t = new float[network.WeightedLayersIndexes.Length],
        beta2t = new float[network.WeightedLayersIndexes.Length];
    for (int i = 0; i < network.WeightedLayersIndexes.Length; i++)
    {
        WeightedLayerBase layer = network.Layers[network.WeightedLayersIndexes[i]].To<INetworkLayer, WeightedLayerBase>();
        mW[i] = new float[layer.Weights.Length];
        vW[i] = new float[layer.Weights.Length];
        mB[i] = new float[layer.Biases.Length];
        vB[i] = new float[layer.Biases.Length];
        beta1t[i] = beta1;
        beta2t[i] = beta2;
    }

    // Adam update for weights and biases
    unsafe void Minimize(int i, in Tensor dJdw, in Tensor dJdb, int samples, WeightedLayerBase layer)
    {
        // Alpha at timestep t (bias correction folded into the learning rate)
        float alphat = eta * (float)Math.Sqrt(1 - beta2t[i]) / (1 - beta1t[i]);
        beta1t[i] *= beta1;
        beta2t[i] *= beta2;

        // Weights
        fixed (float* pw = layer.Weights, pm = mW[i], pv = vW[i])
        {
            float* pdJ = dJdw;
            int w = layer.Weights.Length;
            for (int x = 0; x < w; x++)
            {
                float pdJi = pdJ[x];
                pm[x] = pm[x] * beta1 + (1 - beta1) * pdJi;
                pv[x] = pv[x] * beta2 + (1 - beta2) * pdJi * pdJi;
                pw[x] -= alphat * pm[x] / ((float)Math.Sqrt(pv[x]) + epsilon);
            }
        }

        // Biases
        fixed (float* pb = layer.Biases, pm = mB[i], pv = vB[i])
        {
            float* pdJ = dJdb;
            int w = layer.Biases.Length;
            for (int b = 0; b < w; b++)
            {
                float pdJi = pdJ[b];
                pm[b] = pm[b] * beta1 + (1 - beta1) * pdJi;
                pv[b] = pv[b] * beta2 + (1 - beta2) * pdJi * pdJi;
                pb[b] -= alphat * pm[b] / ((float)Math.Sqrt(pv[b]) + epsilon);
            }
        }
    }

    return Minimize;
}
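The alphat precomputation folds both bias corrections into the step size, a standard efficient reformulation of Adam:

$$m_t = \beta_1 m_{t-1} + (1 - \beta_1)\,g_t, \qquad v_t = \beta_2 v_{t-1} + (1 - \beta_2)\,g_t^2, \qquad w_t = w_{t-1} - \eta\,\frac{\sqrt{1 - \beta_2^t}}{1 - \beta_1^t} \cdot \frac{m_t}{\sqrt{v_t} + \epsilon}$$

This is equivalent to the textbook form with $\hat m_t = m_t / (1 - \beta_1^t)$ and $\hat v_t = v_t / (1 - \beta_2^t)$, up to where $\epsilon$ enters the denominator.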
public static WeightsUpdater AdaDelta([NotNull] AdaDeltaInfo info, [NotNull] NeuralNetworkBase network)
{
    // Initialize the AdaDelta parameters
    float rho = info.Rho, epsilon = info.Epsilon, l2 = info.L2;
    float[][]
        egSquaredW = new float[network.WeightedLayersIndexes.Length][],
        eDeltaxSquaredW = new float[network.WeightedLayersIndexes.Length][],
        egSquaredB = new float[network.WeightedLayersIndexes.Length][],
        eDeltaxSquaredB = new float[network.WeightedLayersIndexes.Length][];
    for (int i = 0; i < network.WeightedLayersIndexes.Length; i++)
    {
        WeightedLayerBase layer = network.Layers[network.WeightedLayersIndexes[i]].To<INetworkLayer, WeightedLayerBase>();
        egSquaredW[i] = new float[layer.Weights.Length];
        eDeltaxSquaredW[i] = new float[layer.Weights.Length];
        egSquaredB[i] = new float[layer.Biases.Length];
        eDeltaxSquaredB[i] = new float[layer.Biases.Length];
    }

    // AdaDelta update for weights and biases
    unsafe void Minimize(int i, in Tensor dJdw, in Tensor dJdb, int samples, WeightedLayerBase layer)
    {
        fixed (float* pw = layer.Weights, egSqrt = egSquaredW[i], eDSqrtx = eDeltaxSquaredW[i])
        {
            float* pdj = dJdw;
            int w = layer.Weights.Length;
            for (int x = 0; x < w; x++)
            {
                float gt = pdj[x];
                egSqrt[x] = rho * egSqrt[x] + (1 - rho) * gt * gt;
                float
                    rmsDx_1 = (float)Math.Sqrt(eDSqrtx[x] + epsilon),
                    rmsGt = (float)Math.Sqrt(egSqrt[x] + epsilon),
                    dx = -(rmsDx_1 / rmsGt) * gt;
                eDSqrtx[x] = rho * eDSqrtx[x] + (1 - rho) * dx * dx;
                pw[x] += dx - l2 * pw[x];
            }
        }

        // Tweak the biases of the lth layer
        fixed (float* pb = layer.Biases, egSqrt = egSquaredB[i], eDSqrtb = eDeltaxSquaredB[i])
        {
            float* pdj = dJdb;
            int w = layer.Biases.Length;
            for (int b = 0; b < w; b++)
            {
                float gt = pdj[b];
                egSqrt[b] = rho * egSqrt[b] + (1 - rho) * gt * gt;
                float
                    rmsDx_1 = (float)Math.Sqrt(eDSqrtb[b] + epsilon),
                    rmsGt = (float)Math.Sqrt(egSqrt[b] + epsilon),
                    db = -(rmsDx_1 / rmsGt) * gt;
                eDSqrtb[b] = rho * eDSqrtb[b] + (1 - rho) * db * db;
                pb[b] += db - l2 * pb[b];
            }
        }
    }

    return Minimize;
}
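AdaDelta (Zeiler, 2012) requires no learning rate: each step is scaled by the ratio of two running RMS estimates, one over past updates and one over past gradients, with the L2 term applied directly to the parameters:

$$E[g^2]_t = \rho\,E[g^2]_{t-1} + (1 - \rho)\,g_t^2, \qquad \Delta w_t = -\frac{\sqrt{E[\Delta w^2]_{t-1} + \epsilon}}{\sqrt{E[g^2]_t + \epsilon}}\,g_t$$
$$E[\Delta w^2]_t = \rho\,E[\Delta w^2]_{t-1} + (1 - \rho)\,\Delta w_t^2, \qquad w_t = w_{t-1} + \Delta w_t - \lambda_{L2}\,w_{t-1}$$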