/// <summary>
/// Forward pass for inference. For every output channel the matching plane of
/// <c>_z</c> is cleared, each input channel is filtered with its own kernel taken
/// from <c>_weights</c> and accumulated into that plane, and the per-channel bias
/// from <c>bHost</c> is added. The activation selected by <c>_activation</c> then
/// writes <c>_y</c>, which is handed to the next layer.
/// </summary>
/// <param name="input">
/// Device buffer with the input channels; channel <c>i</c> is read starting at
/// <c>i * InWidth * InHeight</c> floats. The reference is also stored in <c>_input</c>.
/// </param>
/// <returns>The value returned by <c>_nextLayer.Inference(_y)</c>.</returns>
public override float Inference(CudaDeviceVariable<float> input)
{
    _input = input;

    // Row pitches and per-channel plane sizes, all in bytes.
    int inPitch = InWidth * sizeof(float);
    int outPitch = OutWidth * sizeof(float);
    int inPlaneBytes = InWidth * InHeight * sizeof(float);
    int outPlaneBytes = OutWidth * OutHeight * sizeof(float);

    // One kernel holds _filterX * _filterY floats.
    int kernelElements = _filterX * _filterY;
    int kernelBytes = kernelElements * sizeof(float);

    // Kernel geometry and the ROI that leaves a border of half a kernel;
    // presumably so the filter never reads outside the input image.
    int anchorX = _filterX / 2;
    int anchorY = _filterY / 2;
    int roiWidth = InWidth - _filterX + 1;
    int roiHeight = InHeight - _filterY + 1;
    NppiSize kernelSize = new NppiSize(_filterX, _filterY);
    NppiPoint kernelAnchor = new NppiPoint(anchorX, anchorY);

    // Scratch image (non-owning view on _tempConvolution) receiving each single-channel filter result.
    NPPImage_32fC1 scratch = new NPPImage_32fC1(_tempConvolution.DevicePointer, InWidth, InHeight, inPitch);

    // NOTE(review): no batch offset is applied in these loops, while the activation
    // kernels below are given _batch — confirm inference is meant for a single sample.
    for (int outChannel = 0; outChannel < OutChannels; outChannel++)
    {
        // View on the output plane of this channel inside _z.
        SizeT outOffset = outChannel * outPlaneBytes;
        CUdeviceptr outPtr = _z.DevicePointer + outOffset;
        NPPImage_32fC1 outPlane = new NPPImage_32fC1(outPtr, OutWidth, OutHeight, outPitch);
        outPlane.Set(0);

        for (int inChannel = 0; inChannel < InChannels; inChannel++)
        {
            // View on the current input plane, restricted to the ROI computed above.
            SizeT inOffset = inChannel * inPlaneBytes;
            CUdeviceptr inPtr = _input.DevicePointer + inOffset;
            NPPImage_32fC1 inPlane = new NPPImage_32fC1(inPtr, InWidth, InHeight, inPitch);
            inPlane.SetRoi(anchorX, anchorY, roiWidth, roiHeight);

            // Kernels are addressed as [outChannel][inChannel][_filterX * _filterY] in _weights.
            SizeT kernelOffset = (outChannel * InChannels + inChannel) * kernelElements * sizeof(float);
            CUdeviceptr kernelPtr = _weights.DevicePointer + kernelOffset;
            CudaDeviceVariable<float> kernel = new CudaDeviceVariable<float>(kernelPtr, false, kernelBytes);

            inPlane.Filter(scratch, kernel, kernelSize, kernelAnchor);
            outPlane.Add(scratch);
        }

        // Bias of this output channel.
        outPlane.Add(bHost[outChannel]);
    }

    switch (_activation)
    {
        case Activation.None:
            // No activation: y is a plain copy of z.
            _y.CopyToDevice(_z);
            break;

        // All three variants run the same PReLU kernel; for plain Relu the
        // original code notes that _aRelu is set to 0.
        case Activation.Relu:
        case Activation.PRelu:
        case Activation.LeakyRelu:
            _KernelPReluForward.RunSafe(_z, _aRelu, _y, _outWidth * _outHeight, _outChannels, _batch);
            break;

        default:
            // Any other activation value leaves _y untouched.
            break;
    }

    return _nextLayer.Inference(_y);
}