public override void Execute(MyAbstractWeightLayer layer)
{
    if (layer.Connection == ConnectionType.FULLY_CONNECTED)
    {
        m_RMSPropUpdateKernel.SetupExecution(layer.Neurons);
        m_RMSPropUpdateKernel.Run(
            layer.Input,
            layer.Delta,
            layer.Weights,
            layer.PreviousWeightDelta,
            layer.Bias,
            layer.PreviousBiasDelta,
            Owner.RMS.TrainingRate,
            Owner.RMS.Momentum,
            Owner.L1,
            Owner.L2,
            layer.DropoutMask,
            layer.Input.Count,
            layer.Neurons,
            layer.MeanSquareWeight,
            layer.MeanSquareBias,
            Owner.RMS.SmoothingFactor
        );
    }
    else
    {
        MyLog.ERROR.WriteLine("No method provided to RMS propagate a " + layer.Connection +
                              " connected MyAbstractWeightLayer in " + Owner);
    }
}
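// Illustrative host-side sketch (not the actual CUDA kernel source, which is not shown here):
// a per-weight RMSProp step with momentum and L1/L2 regularization, using the parameters passed
// to m_RMSPropUpdateKernel above. The helper name RmsPropStep, the epsilon constant, and the exact
// update order are assumptions for illustration only. Assumes `using System;`.
static float RmsPropStep(ref float meanSquare, ref float previousDelta, float weight, float gradient,
                         float trainingRate, float momentum, float smoothingFactor, float l1, float l2)
{
    // add the regularization terms to the raw gradient
    gradient += l1 * Math.Sign(weight) + l2 * weight;

    // exponential moving average of the squared gradient
    meanSquare = smoothingFactor * meanSquare + (1.0f - smoothingFactor) * gradient * gradient;

    // scale the step by the RMS of recent gradients, then apply momentum
    float delta = -trainingRate * gradient / ((float)Math.Sqrt(meanSquare) + 1e-8f)
                  + momentum * previousDelta;
    previousDelta = delta;
    return delta; // the kernel would add this to the weight
}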
public override void Execute(MyAbstractWeightLayer layer)
{
    if (layer.Connection == ConnectionType.FULLY_CONNECTED)
    {
        ComputeWeightGradientSum(layer);

        m_adadeltaUpdateKernel.SetupExecution(layer.Weights.Count);
        m_adadeltaUpdateKernel.Run(
            layer.Input,
            layer.Delta,
            layer.WeightGradient,
            layer.BiasGradient,
            layer.Weights,
            layer.Bias,
            Owner.L1,
            Owner.L2,
            layer.DropoutMask,
            layer.Neurons,
            Owner.BatchSize,
            layer.Weights.Count,
            layer.MeanSquareWeight,
            layer.PreviousWeightDelta,
            layer.MeanSquareBias,
            layer.PreviousBiasDelta,
            Owner.Adadelta.Ro,
            Owner.Adadelta.Epsilon
        );
    }
    else if (layer.Connection == ConnectionType.GAUSSIAN)
    {
        // Gaussian hidden layer just propagates delta, no weight updates
    }
    else if (layer.Connection == ConnectionType.CONVOLUTION && layer is MyConvolutionLayer)
    {
        MyConvolutionLayer convLayer = (MyConvolutionLayer)layer;

        m_convAdadeltaUpdateKernel.SetupExecution(convLayer.Weights.Count);
        m_convAdadeltaUpdateKernel.Run(
            convLayer.Weights,
            convLayer.Bias,
            convLayer.Delta,
            convLayer.PaddedImage,
            convLayer.InputWidth + convLayer.ZeroPadding + convLayer.ZeroPadding,
            (convLayer.InputWidth + convLayer.ZeroPadding + convLayer.ZeroPadding) *
            (convLayer.InputHeight + convLayer.ZeroPadding + convLayer.ZeroPadding),
            convLayer.FilterWidth,
            convLayer.FilterWidth * convLayer.FilterHeight,
            convLayer.FilterWidth * convLayer.FilterHeight * convLayer.InputDepth,
            convLayer.OutputWidth,
            convLayer.OutputHeight,
            convLayer.OutputWidth * convLayer.OutputHeight,
            convLayer.HorizontalStride,
            convLayer.VerticalStride,
            convLayer.L1Term,
            convLayer.L2Term,
            convLayer.MeanSquareWeight,
            convLayer.PreviousWeightDelta,
            convLayer.MeanSquareBias,
            convLayer.PreviousBiasDelta,
            Owner.Adadelta.Ro,
            Owner.Adadelta.Epsilon,
            Owner.BatchSize,
            convLayer.Weights.Count // should be equal to FilterWidth * FilterHeight * FilterCount * InputDepth
        );
    }
    else
    {
        MyLog.ERROR.WriteLine("No method provided to Adadelta propagate a " + layer.Connection +
                              " connected MyAbstractWeightLayer in " + Owner);
    }
}
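// Illustrative host-side sketch (not the actual CUDA kernel source): the standard Adadelta rule
// the update kernels above are expected to follow, using the Ro and Epsilon parameters they receive.
// The helper name AdadeltaStep is hypothetical. Assumes `using System;`.
static float AdadeltaStep(ref float meanSquareGrad, ref float meanSquareDelta, float gradient,
                          float ro, float epsilon)
{
    // decaying average of squared gradients
    meanSquareGrad = ro * meanSquareGrad + (1.0f - ro) * gradient * gradient;

    // scale the step by the ratio of the RMS of past updates to the RMS of past gradients
    float delta = -(float)Math.Sqrt((meanSquareDelta + epsilon) / (meanSquareGrad + epsilon)) * gradient;

    // decaying average of squared updates
    meanSquareDelta = ro * meanSquareDelta + (1.0f - ro) * delta * delta;

    return delta; // the kernel would add this to the weight (and analogously to the bias)
}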
public override void Execute()
{
    Owner.Delta.Fill(0.0f);

    // the number of neurons of the ensemble is the same as for each input
    m_deltaKernel.SetConstantVariable<float>("Lambda", Lambda);

    int inputLayerCount = Owner.InputConnections.Count(x => x.From is MyAbstractWeightLayer);

    foreach (MyConnection connection in Owner.InputConnections)
    {
        if (connection.From is MyAbstractLayer)
        {
            MyAbstractLayer prevLayer = connection.From as MyAbstractLayer;

            if (prevLayer is MyAbstractWeightLayer)
            {
                MyAbstractWeightLayer prevWeightLayer = prevLayer as MyAbstractWeightLayer;

                m_deltaKernel.Run(
                    (int)prevLayer.ActivationFunction,
                    prevWeightLayer.NeuronInput,
                    prevLayer.Output,
                    Owner.Output,
                    Owner.Neurons,
                    prevLayer.Delta,
                    Owner.Delta,
                    inputLayerCount
                );
            }

            prevLayer.Delta.SafeCopyToHost();
            Owner.Delta.SafeCopyToHost();
        }
    }

    Owner.Delta.SafeCopyToHost();
}
public void ComputeWeightGradientSum(MyAbstractWeightLayer layer)
{
    if (Owner.BatchSize == 1)
        return; // cuBLAS tends to be slower when BatchSize is 1; the gradient is then computed inside the weight-update kernels

    // WeightGradient = Delta x Transpose(Input)
    MyCublasFactory.Instance.Gemm(Operation.NonTranspose, Operation.Transpose,
        layer.Neurons, layer.Input.Count / Owner.BatchSize, Owner.BatchSize,
        1.0f,
        layer.Delta.GetDevice(layer), layer.Neurons,
        layer.Input.GetDevice(layer), layer.Input.Count / Owner.BatchSize,
        0.0f,
        layer.WeightGradient.GetDevice(layer), layer.Neurons
    );

    // BiasGradient = Delta x Transpose(BiasInput). BiasInput is a vector of ones
    MyCublasFactory.Instance.Gemm(Operation.NonTranspose, Operation.Transpose,
        layer.Neurons, 1, Owner.BatchSize,
        1.0f,
        layer.Delta.GetDevice(layer), layer.Neurons,
        layer.BiasInput.GetDevice(layer), 1,
        0.0f,
        layer.BiasGradient.GetDevice(layer), layer.Neurons
    );
}
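// Reference sketch of what the first Gemm call computes, written out on the CPU. The memory layouts
// are assumptions for illustration: Delta as [Neurons x BatchSize] and Input as
// [InputsPerSample x BatchSize], both column-major, with WeightGradient as [Neurons x InputsPerSample].
// The weight gradient is the sum over the batch of the outer products delta_b * input_b^T.
// The helper name is hypothetical. Assumes `using System;`.
static void WeightGradientReference(float[] delta, float[] input, float[] weightGradient,
                                    int neurons, int inputsPerSample, int batchSize)
{
    Array.Clear(weightGradient, 0, weightGradient.Length);
    for (int b = 0; b < batchSize; b++)
        for (int i = 0; i < inputsPerSample; i++)
            for (int n = 0; n < neurons; n++)
                weightGradient[i * neurons + n] += delta[b * neurons + n] * input[b * inputsPerSample + i];
}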
public override void Init(int nGPU) { } // Kernel initialization

public override void Execute()
{
    float maxRelDiff = 0.0f;
    float maxAbsDiff = 0.0f;
    int maxDiffLayer = 0;
    int maxDiffWeight = 0;
    float maxDiffWeightValue = 0.0f;
    float maxDiffStepSize = 0.0f;
    float maxDiffAnalyticalGrad = 0.0f;
    float maxDiffNumericalGrad = 0.0f;

    float sampleProbability = 1.0f / Owner.TotalWeights;

    for (int s = 0; s < SamplesPerTimestep; s++)
    {
        // dice roll
        float diceRoll = (float)Rand.NextDouble();

        // convert the dice roll to the index of the parameter to sample
        int w = (int)System.Math.Floor(diceRoll / sampleProbability);
        if (w >= Owner.TotalWeights)
        {
            if (w > Owner.TotalWeights)
                MyLog.ERROR.Write("w > Owner.TotalWeights"); // just for testing, this should never hit
            w = Owner.TotalWeights - 1; // clamp to a valid index; this should never hit
        }

        // loop through the layers
        MyAbstractLayer layer = Owner.FirstTopologicalLayer;
        while (layer != null)
        {
            // check for weights
            if (layer is MyAbstractWeightLayer)
            {
                MyAbstractWeightLayer weightLayer = (layer as MyAbstractWeightLayer);
                if (weightLayer.Weights.Count <= w)
                {
                    w -= weightLayer.Weights.Count;
                }
                else
                {
                    weightLayer.Weights.SafeCopyToHost(w, 1); // copy this weight to host
                    float originalWeight = weightLayer.Weights.Host[w]; // save weight
                    float stepSize = System.Math.Abs(originalWeight) * RelativeStepSize; // set stepSize

                    // get errorPlus
                    weightLayer.Weights.Host[w] = originalWeight + stepSize; // increase weight
                    weightLayer.Weights.SafeCopyToDevice(w, 1); // back to device
                    Owner.FeedForward(); // forward the network
                    float errorPlus = Owner.GetError();

                    // get errorMinus
                    weightLayer.Weights.Host[w] = originalWeight - stepSize; // decrease weight
                    weightLayer.Weights.SafeCopyToDevice(w, 1); // back to device
                    Owner.FeedForward(); // forward the network
                    float errorMinus = Owner.GetError();

                    // reset to original
                    weightLayer.Weights.Host[w] = originalWeight; // back to where we started
                    weightLayer.Weights.SafeCopyToDevice(w, 1); // back to device
                    Owner.FeedForward(); // forward the network
                    Owner.GetError(); // this sets the original error

                    // numerical gradient
                    float numericalGradient = (errorPlus - errorMinus) / (2 * stepSize);
                    if (numericalGradient == 0)
                    {
                        MyLog.DEBUG.WriteLine("t: " + SimulationStep + " id: " + weightLayer.Id + " w" + w + ": " +
                                              weightLayer.Weights.Host[w] + " step: " + stepSize + " numerical gradient is 0.");
                        break; // continue to next sample
                    }

                    // analytical gradient
                    int n = w % weightLayer.Neurons;
                    int i = (w - n) / weightLayer.Neurons;
                    weightLayer.Delta.SafeCopyToHost(n, 1); // copy delta to host
                    weightLayer.Input.SafeCopyToHost(i, 1); // copy input to host
                    weightLayer.DropoutMask.SafeCopyToHost(n, 1); // copy dropout mask to host
                    //weightLayer.Weights.SafeCopyToHost(w, 1); // already present on host after resetting to original
                    if (weightLayer.DropoutMask.Host[n] > 0)
                        break;
                    float analyticalGradient = weightLayer.Delta.Host[n] * weightLayer.Input.Host[i]
                                               + Owner.L1 * (weightLayer.Weights.Host[w] < 0.0f ? -1.0f : 1.0f)
                                               + Owner.L2 * weightLayer.Weights.Host[w];

                    float relativeDiff = 0.0f;
                    float absoluteDiff = 0.0f;
                    if (analyticalGradient == 0)
                    {
                        MyLog.DEBUG.WriteLine("t: " + SimulationStep + " id: " + weightLayer.Id + " w" + w + ": " +
                                              weightLayer.Weights.Host[w] + " step: " + stepSize + " analytical gradient is 0.");
                        break; // continue to next sample
                    }

                    absoluteDiff = System.Math.Abs(numericalGradient - analyticalGradient);
                    relativeDiff = absoluteDiff / (System.Math.Abs(numericalGradient) + System.Math.Abs(analyticalGradient));
                    if (relativeDiff > maxRelDiff && absoluteDiff > ThresholdAbsolute)
                    {
                        maxAbsDiff = absoluteDiff;
                        maxRelDiff = relativeDiff;
                        maxDiffLayer = weightLayer.Id;
                        maxDiffWeight = w;
                        maxDiffWeightValue = weightLayer.Weights.Host[w];
                        maxDiffStepSize = stepSize;
                        maxDiffAnalyticalGrad = analyticalGradient;
                        maxDiffNumericalGrad = numericalGradient;
                    }

                    MyLog.DEBUG.WriteLine("t: " + SimulationStep + " id: " + weightLayer.Id + " w" + w + ": " +
                                          weightLayer.Weights.Host[w] + " step: " + stepSize + " AG: " + analyticalGradient +
                                          " NG: " + numericalGradient + " diff: " + relativeDiff);
                    break; // continue to next sample
                }
            }

            layer = layer.NextTopologicalLayer;

            // catch unmatched dice rolls
            if (layer == null)
                MyLog.ERROR.Write("GradientCheck task: Weight w " + w + " not found within " + Owner.TotalWeights +
                                  " total weights"); // just for testing, this should never hit
        }
    }

    // handle the largest relativeDiff we just found
    if (maxRelDiff > ThresholdRelative && maxAbsDiff > ThresholdAbsolute)
    {
        MyLog.INFO.WriteLine("Gradient threshold exceeded on SimulationStep: " + SimulationStep);
        MyLog.INFO.WriteLine("Max analytical vs numerical relative gradient difference found in layer id " + maxDiffLayer +
                             " for weight " + maxDiffWeight + ": " + maxDiffWeightValue + " with Step size: " + maxDiffStepSize);
        MyLog.INFO.WriteLine("Analytical gradient: " + maxDiffAnalyticalGrad + " Numerical gradient: " + maxDiffNumericalGrad +
                             " Relative difference: " + maxRelDiff);
        MyLog.INFO.WriteLine();
    }
}
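// Minimal sketch of the check performed above, assuming a scalar error function E(w): the
// central-difference estimate (E(w + h) - E(w - h)) / (2h) should agree with the analytical
// gradient dE/dw = delta_n * input_i + L1 * sign(w) + L2 * w computed by backprop. The helper
// below only restates the difference measure used in Execute(); its name is hypothetical.
// Assumes `using System;`.
static float RelativeDifference(float numericalGradient, float analyticalGradient)
{
    float absoluteDiff = Math.Abs(numericalGradient - analyticalGradient);
    // normalize by the combined magnitude so the measure is scale-free
    return absoluteDiff / (Math.Abs(numericalGradient) + Math.Abs(analyticalGradient));
}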
public override void Execute(MyAbstractWeightLayer layer)
{
    if (layer.Connection == ConnectionType.FULLY_CONNECTED)
    {
        ComputeWeightGradientSum(layer);

        m_SGDupdateKernel.SetupExecution(layer.Weights.Count);
        m_SGDupdateKernel.Run(
            layer.Input,
            layer.Delta,
            layer.WeightGradient,
            layer.BiasGradient,
            layer.Weights,
            layer.PreviousWeightDelta,
            layer.Bias,
            layer.PreviousBiasDelta,
            Owner.SGD.TrainingRate,
            Owner.SGD.Momentum,
            Owner.L1,
            Owner.L2,
            layer.DropoutMask,
            layer.Neurons,
            Owner.BatchSize,
            layer.Weights.Count
        );
    }
    else if (layer.Connection == ConnectionType.GAUSSIAN)
    {
        // Gaussian hidden layer just propagates delta, no weight updates
    }
    else if (layer.Connection == ConnectionType.PARTIAL_UPDATE && layer is IPartialUpdateLayer)
    {
        // Update some but not all of the weights
        IPartialUpdateLayer partialUpdateLayer = layer as IPartialUpdateLayer;

        m_partialSGDupdateKernel.SetupExecution(layer.Weights.Count);
        m_partialSGDupdateKernel.Run(
            layer.Input,
            layer.Delta,
            layer.Weights,
            layer.PreviousWeightDelta,
            layer.Bias,
            layer.PreviousBiasDelta,
            Owner.SGD.TrainingRate,
            Owner.SGD.Momentum,
            Owner.L1,
            Owner.L2,
            layer.DropoutMask,
            layer.Neurons,
            layer.Weights.Count,
            partialUpdateLayer.SuppressUpdatesAt(),
            partialUpdateLayer.SuppressUpdatesCount()
        );
    }
    else if (layer.Connection == ConnectionType.CONVOLUTION && layer is MyConvolutionLayer)
    {
        MyConvolutionLayer convLayer = (MyConvolutionLayer)layer;

        m_convSGDupdateKernel.SetupExecution(convLayer.Weights.Count);
        m_convSGDupdateKernel.Run(
            Owner.SGD.TrainingRate,
            Owner.SGD.Momentum,
            convLayer.Weights,
            convLayer.Bias,
            convLayer.PreviousBiasDelta,
            convLayer.Delta,
            convLayer.PreviousWeightDelta,
            convLayer.PaddedImage,
            convLayer.InputWidth + convLayer.ZeroPadding + convLayer.ZeroPadding,
            (convLayer.InputWidth + convLayer.ZeroPadding + convLayer.ZeroPadding) *
            (convLayer.InputHeight + convLayer.ZeroPadding + convLayer.ZeroPadding),
            convLayer.FilterWidth,
            convLayer.FilterWidth * convLayer.FilterHeight,
            convLayer.FilterWidth * convLayer.FilterHeight * convLayer.InputDepth,
            convLayer.OutputWidth,
            convLayer.OutputHeight,
            convLayer.OutputWidth * convLayer.OutputHeight,
            convLayer.HorizontalStride,
            convLayer.VerticalStride,
            convLayer.L1Term,
            convLayer.L2Term,
            Owner.BatchSize,
            convLayer.Weights.Count // should be equal to FilterWidth * FilterHeight * FilterCount * InputDepth
        );
    }
    else
    {
        MyLog.ERROR.WriteLine("No method provided to SGD propagate a " + layer.Connection +
                              " connected MyAbstractWeightLayer in " + Owner);
    }
}
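// Illustrative host-side sketch (not the actual CUDA kernel source): one SGD-with-momentum step
// per weight, with L1/L2 regularization, matching the parameters handed to m_SGDupdateKernel.
// The helper name SgdStep is hypothetical. Assumes `using System;`.
static float SgdStep(ref float previousDelta, float weight, float gradient,
                     float trainingRate, float momentum, float l1, float l2)
{
    // regularized gradient: L1 pushes weights toward zero, L2 decays them proportionally
    gradient += l1 * Math.Sign(weight) + l2 * weight;

    // momentum smooths the step with a fraction of the previous update
    float delta = -trainingRate * gradient + momentum * previousDelta;
    previousDelta = delta;
    return delta; // the kernel would add this to the weight
}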
public virtual void Execute(MyAbstractWeightLayer layer)
{
    MyLog.ERROR.WriteLine("No method provided to backpropagate MyAbstractWeightLayer " + layer + " in " + Owner);
}
public override void Execute()
{
    MyNode node = Owner.Input.Owner;

    if (node is MyAbstractLayer)
    {
        MyAbstractLayer previousLayer = node as MyAbstractLayer;

        // Reset delta
        previousLayer.Delta.Fill(0);

        // Disable backprop when in generative mode
        if (!Owner.Generate.IsIncomingRised())
        {
            // Set locations for mean deltas
            CUdeviceptr meanDeltas = previousLayer.Delta.GetDevicePtr(Owner, 0);

            // Set locations for sigma deltas
            CUdeviceptr sigmaDeltas = previousLayer.Delta.GetDevicePtr(Owner, previousLayer.Delta.Count / 2);

            // Determine input to previous layer
            CUdeviceptr prevInputPtr = MyAbstractLayer.DetermineInput(previousLayer);

            // Set locations for sigmas (previous layer or constants)
            CUdeviceptr sigmas;
            if (Owner.UseSigmaConstant)
                sigmas = Owner.SigmaConstants.GetDevicePtr(Owner);
            else
                sigmas = Owner.Input.GetDevicePtr(Owner, Owner.Input.Count / 2);

            m_samplingDeltaKernel.Run(
                Convert.ToInt32(Owner.UseSigmaConstant),
                (int)previousLayer.ActivationFunction,
                prevInputPtr,
                sigmas,
                meanDeltas,
                sigmaDeltas,
                Owner.Delta,
                Owner.RandomNormal,
                Owner.Neurons
            );

            // Regularization needs weights to compute gradients
            if (Regularize && previousLayer is MyAbstractWeightLayer)
            {
                MyAbstractWeightLayer previousWeightLayer = previousLayer as MyAbstractWeightLayer;

                // Try to regularize the loss: mean^2 + sigma^2 - log(sigma^2)
                // In other words, regularize means towards 0 and sigmas towards 1
                int weightCount = previousWeightLayer.Weights.Count;
                m_regularizationDeltaKernel.SetConstantVariable<float>("RegularizationCoefficient", RegularizationCoefficient);
                m_regularizationDeltaKernel.SetupExecution(weightCount);
                m_regularizationDeltaKernel.Run(
                    Convert.ToInt32(Owner.UseSigmaConstant),
                    (int)previousLayer.ActivationFunction,
                    prevInputPtr,
                    previousLayer.Input,
                    previousWeightLayer.Weights,
                    previousLayer.Output.Count,
                    meanDeltas,
                    sigmaDeltas
                );
            }
        }
    }
}
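// Illustrative sketch of the regularization term mentioned in the comment above: per latent unit
// the penalty is RegularizationCoefficient * (mean^2 + sigma^2 - log(sigma^2)), which pulls means
// towards 0 and sigmas towards 1. Its gradients are d/dmean = 2 * mean and d/dsigma = 2 * sigma - 2 / sigma.
// The kernel source is not shown here, so this only restates the formula; the helper name is hypothetical.
static void RegularizationGradients(float mean, float sigma, float coefficient,
                                    out float meanDelta, out float sigmaDelta)
{
    meanDelta = coefficient * 2.0f * mean;
    sigmaDelta = coefficient * (2.0f * sigma - 2.0f / sigma);
}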