public override unsafe float[] CalculateLayer(float[,] weightMx, float[] bias, float[] prevActivations, IActivationFunction sigmoidFunction)
{
    int matrixRows = weightMx.GetLength(0);
    float[] output = new float[matrixRows];

    int[] configParams = new int[] {
        /*rows:*/ weightMx.GetLength(0),
        /*cols:*/ weightMx.GetLength(1),
        /*activation function id:*/ sigmoidFunction.GetOpenCLFunctionId()
    };

    fixed (int* configPtr = configParams)
    {
        fixed (float* weightArrayPtr = weightMx, biasPtr = bias, prevActivationPtr = prevActivations)
        {
            //Upload the inputs as read-only buffers; the output buffer is write-only and read back below.
            MemoryAllocation mem_param_weightMx = computeFramework.GetMemoryFor(weightMx.Length * sizeof(float), MemoryFlag.ReadOnly | MemoryFlag.CopyHostPointer, new IntPtr(weightArrayPtr));
            MemoryAllocation mem_param_bias = computeFramework.GetMemoryFor(bias.Length * sizeof(float), MemoryFlag.ReadOnly | MemoryFlag.CopyHostPointer, new IntPtr(biasPtr));
            MemoryAllocation mem_param_prevActivation = computeFramework.GetMemoryFor(prevActivations.Length * sizeof(float), MemoryFlag.ReadOnly | MemoryFlag.CopyHostPointer, new IntPtr(prevActivationPtr));
            MemoryAllocation mem_param_config = computeFramework.GetMemoryFor(configParams.Length * sizeof(int), MemoryFlag.ReadOnly | MemoryFlag.CopyHostPointer, new IntPtr(configPtr));
            MemoryAllocation mem_param_output = computeFramework.GetMemoryFor(matrixRows * sizeof(float), MemoryFlag.WriteOnly, IntPtr.Zero);

            computeFramework.SetKernelArg(calcLayerKernel, 0, mem_param_weightMx);
            computeFramework.SetKernelArg(calcLayerKernel, 1, mem_param_bias);
            computeFramework.SetKernelArg(calcLayerKernel, 2, mem_param_prevActivation);
            computeFramework.SetKernelArg(calcLayerKernel, 3, mem_param_config);
            computeFramework.SetKernelArg(calcLayerKernel, 4, mem_param_output);

            //Round the global work size up to a multiple of the local workgroup size, as OpenCL requires.
            int localWorkgroupSize = 32;
            int globalWorkSize = ExtendGlobalWorkSize(matrixRows, localWorkgroupSize);
            computeFramework.EnqueueKernel(calcLayerKernel, new IntPtr[] { new IntPtr(globalWorkSize) }, new IntPtr[] { new IntPtr(localWorkgroupSize) });

            //Blocking read: waits for the kernel to finish, then copies the result back to the host.
            fixed (float* outputPtr = output)
            {
                computeFramework.ReadBuffer(mem_param_output, true, UIntPtr.Zero, new UIntPtr((uint)matrixRows * sizeof(float)), new IntPtr(outputPtr));
            }
        }
    }

    computeFramework.UnuseMemoryAllocations();
    return output;
}
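//Both CalculateLayer above and the minibatch path below rely on ExtendGlobalWorkSize to keep the
//enqueued global work size a multiple of the local workgroup size, which OpenCL requires.
//The helper is not shown in this section; the following is a minimal sketch of the assumed
//rounding-up behavior (the body is an assumption for illustration, not the verified implementation):
private static int ExtendGlobalWorkSize(int desiredCount, int localWorkgroupSize)
{
    //Round desiredCount up to the next multiple of localWorkgroupSize.
    //The kernel is then expected to bounds-check its global id against the real element count.
    int remainder = desiredCount % localWorkgroupSize;
    return remainder == 0 ? desiredCount : desiredCount + (localWorkgroupSize - remainder);
}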
public override unsafe List<List<NeuronData>> CalculateAccumulatedGradientForMinibatch(Network network, TrainingSuite suite, int trainingDataBegin, int trainingDataEnd)
{
    int trainingSamples = trainingDataEnd - trainingDataBegin;
    var ret = Utils.CreateGradientVector(network);

    int[] networkConfigParams = null;
    int totalWeightAndBiasCount = 0;
    int delta_k_vectorSize = 0;
    int totalActivationCount = 0;
    int inputActivationCount = network.layers.First().GetWeightsPerNeuron();

    {
        foreach (var item in network.layers)
        {
            totalActivationCount += item.GetNeuronCount();
        }

        List<int> networkConfigParamsList = new List<int>();
        //0
        networkConfigParamsList.Add(0); //Layer index to be processed
        //1
        networkConfigParamsList.Add(network.layers[0].activationFunction.GetOpenCLFunctionId()); //Activation function
        //2
        networkConfigParamsList.Add(network.layers.Count); //Layer count
        //3
        networkConfigParamsList.Add(trainingSamples); //numTrainingSamples
        //4
        networkConfigParamsList.Add(suite.config.costFunction.GetOpenCLFunctionID()); //Cost function
        //5
        networkConfigParamsList.Add(totalActivationCount); //totalActivationCount
        //6
        networkConfigParamsList.Add(0); //totalWeightsAndBiases (filled in below)
        //7
        networkConfigParamsList.Add(0); //widestLayerNeuronCount (filled in below)
        //8
        networkConfigParamsList.Add(network.layers.First().GetWeightsPerNeuron()); //Input count

        for (int i = 0; i < network.layers.Count; i++)
        {
            networkConfigParamsList.Add(network.layers[i].GetNeuronCount()); //Layer neuron count
            totalWeightAndBiasCount += network.layers[i].biases.Length;
            totalWeightAndBiasCount += network.layers[i].weightMx.Length;
            if (i > 0) //The first layer does not write the delta_k vector, so it shouldn't contribute to its size.
            {
                delta_k_vectorSize = Math.Max(network.layers[i].GetNeuronCount(), delta_k_vectorSize);
            }
        }

        networkConfigParamsList[6] = totalWeightAndBiasCount;
        networkConfigParamsList[7] = delta_k_vectorSize;
        networkConfigParams = networkConfigParamsList.ToArray();
    }

    float[] desiredOutputs = new float[network.layers.Last().GetNeuronCount() * trainingSamples];
    //Memory layout is: [weights, biases for layer0][weights, biases for layer1] ... accumulated over all training samples in the minibatch
    float[] outputGradient = new float[totalWeightAndBiasCount];
    float[] inputParameters = new float[trainingSamples * inputActivationCount];
    float[] weightsAndBiases = new float[totalWeightAndBiasCount];

    fixed (int* networkConfigParamsPtr = networkConfigParams)
    {
        fixed (float* outputGradientPtr = outputGradient, desiredOutputsPtr = desiredOutputs, inputParametersPtr = inputParameters, weightsAndBiasesPtr = weightsAndBiases)
        {
            MemoryAllocation mem_NetworkConfigParams = computeFramework.GetMemoryFor(networkConfigParams.Length * sizeof(int), MemoryFlag.ReadOnly | MemoryFlag.CopyHostPointer, new IntPtr(networkConfigParamsPtr));

            for (int i = 0; i < trainingSamples; ++i)
            {
                Buffer.BlockCopy(suite.trainingData[trainingDataBegin + i].input, 0, inputParameters, i * inputActivationCount * sizeof(float), inputActivationCount * sizeof(float));
            }
            MemoryAllocation mem_InputActivations = computeFramework.GetMemoryFor(inputParameters.Length * sizeof(float), MemoryFlag.ReadOnly | MemoryFlag.CopyHostPointer, new IntPtr(inputParametersPtr));

            //Contains the whole network's activation values and Z values for each training sample.
            //Memory layout for one training sample is: [first layer's activations][second layer's activations]...[last layer's activations][first layer's z values][second layer's z values]...[last layer's z values]
            //After that, the next training sample's values follow.
            MemoryAllocation mem_activationsAndZValues = computeFramework.GetMemoryFor(totalActivationCount * trainingSamples * 2 * sizeof(float), MemoryFlag.ReadWrite, IntPtr.Zero);

            {
                int offset = 0;
                foreach (var layer in network.layers)
                {
                    Buffer.BlockCopy(layer.weightMx, 0, weightsAndBiases, offset, layer.weightMx.Length * sizeof(float));
                    offset += layer.weightMx.Length * sizeof(float);
                    Buffer.BlockCopy(layer.biases, 0, weightsAndBiases, offset, layer.biases.Length * sizeof(float));
                    offset += layer.biases.Length * sizeof(float);
                }
            }
            MemoryAllocation mem_weightsAndBiases = computeFramework.GetMemoryFor(weightsAndBiases.Length * sizeof(float), MemoryFlag.ReadOnly | MemoryFlag.CopyHostPointer, new IntPtr(weightsAndBiasesPtr));

            //delta_k_vector is double buffered (hence the * 2). In a pass, the previous delta_k values are read, and the next ones are written.
            //Memory layout is: [delta_k buffer1 of trainingSample0][delta_k buffer2 of trainingSample0][delta_k buffer1 of trainingSample1][delta_k buffer2 of trainingSample1] ...
            MemoryAllocation mem_delta_k_vector = computeFramework.GetMemoryFor(Math.Max(1, delta_k_vectorSize * trainingSamples * 2 * sizeof(float)), MemoryFlag.ReadWrite, IntPtr.Zero);

            computeFramework.SetKernelArg(forwardPass, 0, mem_NetworkConfigParams);
            computeFramework.SetKernelArg(forwardPass, 1, mem_activationsAndZValues);
            computeFramework.SetKernelArg(forwardPass, 2, mem_InputActivations);
            computeFramework.SetKernelArg(forwardPass, 3, mem_weightsAndBiases);

            //Per-layer values that are patched into the start of the config buffer before each pass.
            int[] dynamic_kernel_arguments = new int[network.layers.Count * 2];
            for (int i = 0; i < network.layers.Count; ++i)
            {
                dynamic_kernel_arguments[i * 2] = i; //Layer index
                dynamic_kernel_arguments[i * 2 + 1] = network.layers[i].activationFunction.GetOpenCLFunctionId(); //Layer's activation function
            }
            int num_dynamic_kernel_args = dynamic_kernel_arguments.Length / network.layers.Count;

            var localWorkGroupSize = new IntPtr[] { new IntPtr(deviceConfig.idealWorkgroupSizeX), new IntPtr(deviceConfig.idealWorkgroupSizeY) };
            var globalWorkSize = new IntPtr[] { new IntPtr(0), new IntPtr(ExtendGlobalWorkSize(trainingSamples, localWorkGroupSize[1].ToInt32())) };

            #region Forward pass
            for (int i = 0; i < network.layers.Count; ++i)
            {
                if (i > 0) //The config buffer already holds layer 0's arguments.
                {
                    computeFramework.UploadToMemory(mem_NetworkConfigParams, 0, i * num_dynamic_kernel_args, dynamic_kernel_arguments, false, num_dynamic_kernel_args);
                }
                globalWorkSize[0] = new IntPtr(ExtendGlobalWorkSize(network.layers[i].GetNeuronCount(), localWorkGroupSize[0].ToInt32()));
                computeFramework.EnqueueKernel(forwardPass, globalWorkSize, localWorkGroupSize);
            }
            #endregion

            #region Backward pass
            int desiredOutputByteSizePerTrainingSample = network.layers.Last().GetNeuronCount() * sizeof(float);
            for (int i = 0; i < trainingSamples; ++i)
            {
                Buffer.BlockCopy(suite.trainingData[trainingDataBegin + i].desiredOutput, 0, desiredOutputs, i * desiredOutputByteSizePerTrainingSample, desiredOutputByteSizePerTrainingSample);
            }
            var mem_desired_outputs = computeFramework.GetMemoryFor(desiredOutputs.Length * sizeof(float), MemoryFlag.ReadOnly | MemoryFlag.CopyHostPointer, new IntPtr(desiredOutputsPtr));
            var mem_param_gradient = computeFramework.GetMemoryFor(outputGradient.Length * sizeof(float), MemoryFlag.ReadWrite | MemoryFlag.CopyHostPointer, new IntPtr(outputGradientPtr));

            computeFramework.SetKernelArg(backwardPassKernel, 0, mem_NetworkConfigParams);
            computeFramework.SetKernelArg(backwardPassKernel, 1, mem_activationsAndZValues);
            computeFramework.SetKernelArg(backwardPassKernel, 2, mem_delta_k_vector);
            computeFramework.SetKernelArg(backwardPassKernel, 3, mem_param_gradient);
            computeFramework.SetKernelArg(backwardPassKernel, 4, mem_desired_outputs);
            computeFramework.SetKernelArg(backwardPassKernel, 5, mem_InputActivations);
            computeFramework.SetKernelArg(backwardPassKernel, 6, mem_weightsAndBiases);

            //Run the backward pass from the output layer towards the input layer.
            for (int i = network.layers.Count - 1; i >= 0; --i)
            {
                globalWorkSize[0] = new IntPtr(ExtendGlobalWorkSize(network.layers[i].GetNeuronCount(), localWorkGroupSize[0].ToInt32()));
                if (i != network.layers.Count - 1) //The forward pass left the last layer's arguments in the config buffer.
                {
                    computeFramework.UploadToMemory(mem_NetworkConfigParams, 0, i * num_dynamic_kernel_args, dynamic_kernel_arguments, false, num_dynamic_kernel_args);
                }
                computeFramework.EnqueueKernel(backwardPassKernel, globalWorkSize, localWorkGroupSize);
            }
            #endregion

            computeFramework.FlushCommandBuffer();
            //Blocking read: waits for all enqueued kernels to finish, then copies the accumulated gradient back to the host.
            computeFramework.ReadBuffer(mem_param_gradient, true, UIntPtr.Zero, new UIntPtr(mem_param_gradient.bufferSizeInBytes), new IntPtr(outputGradientPtr));
        }
    }

    computeFramework.UnuseMemoryAllocations();

    //Unpack the flat gradient buffer into the per-neuron gradient vector.
    int gradIdx = 0;
    foreach (var layer in ret)
    {
        foreach (var neuron in layer)
        {
            Buffer.BlockCopy(outputGradient, gradIdx * sizeof(float), neuron.weights, 0, neuron.weights.Length * sizeof(float));
            gradIdx += neuron.weights.Length;
            neuron.bias = outputGradient[gradIdx];
            ++gradIdx;
        }
    }
    return ret;
}
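//A minimal usage sketch for the method above: compute the accumulated gradient for one minibatch
//and apply an averaged gradient-descent step on the host. The fields used here (network.layers,
//layer.weightMx, layer.biases, NeuronData.weights, NeuronData.bias) follow the members referenced
//above; the method name, the learningRate parameter, and the averaging convention are assumptions
//for illustration, not a verified part of this codebase.
public void ApplyMinibatchSketch(Network network, TrainingSuite suite, int begin, int end, float learningRate)
{
    var gradient = CalculateAccumulatedGradientForMinibatch(network, suite, begin, end);
    float scale = learningRate / (end - begin); //average the per-sample gradients accumulated on the GPU

    for (int layerIdx = 0; layerIdx < network.layers.Count; ++layerIdx)
    {
        var layer = network.layers[layerIdx];
        var layerGradient = gradient[layerIdx];
        for (int n = 0; n < layerGradient.Count; ++n)
        {
            var neuronGrad = layerGradient[n];
            for (int w = 0; w < neuronGrad.weights.Length; ++w)
            {
                layer.weightMx[n, w] -= scale * neuronGrad.weights[w]; //descend along the weight gradient
            }
            layer.biases[n] -= scale * neuronGrad.bias; //descend along the bias gradient
        }
    }
}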