public void AddBias(CUDNNTensorDescriptor dstTensorDesc, Layer layer, int c, DeviceMemory<float> data)
{
    _biasTensorDesc.Set4D(TensorFormat, DataType, 1, c, 1, 1);
    const float alpha = 1.0f;
    const float beta = 1.0f;
    _cudnn.AddTensor(CUDNNInterop.cudnnAddMode_t.CUDNN_ADD_SAME_C, alpha, _biasTensorDesc, layer.BiasD.Ptr, beta, dstTensorDesc, data.Ptr);
}
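
// For reference, a minimal CPU sketch of what the CUDNN_ADD_SAME_C mode above does: the
// 1 x c x 1 x 1 bias tensor is broadcast so that bias[ci] is added to every element of
// channel ci. This helper is illustrative only (it is not part of the sample) and assumes
// NCHW layout with the data already gathered to the host.
private static void AddBiasCpuSketch(float[] dst, float[] bias, int n, int c, int h, int w)
{
    for (var ni = 0; ni < n; ni++)
        for (var ci = 0; ci < c; ci++)
            for (var hi = 0; hi < h; hi++)
                for (var wi = 0; wi < w; wi++)
                    dst[((ni * c + ci) * h + hi) * w + wi] += bias[ci];
}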

//[CudnnMnistFCF]
public void FullyConnectedForward(Layer ip, nchw_t nchw, DeviceMemory<float> srcData, ref DeviceMemory<float> dstData)
{
    if (nchw.N != 1) throw new Exception("Not Implemented");

    var dimX = nchw.C * nchw.H * nchw.W;
    var dimY = ip.Outputs;
    Resize(ref dstData, dimY);

    const float alpha = 1.0f;
    const float beta = 1.0f;

    // This cuMemcpyDtoD is a raw CUDA API call, so it should be guarded with worker.Eval.
    var output = dstData;
    _worker.EvalAction(() => CUDAInterop.cuMemcpyDtoD(output.Ptr.Handle, ip.BiasD.Handle, (IntPtr)(dimY * sizeof(float))));

    // This cuBLAS call doesn't need an explicit worker.Eval because the cublas wrapper is a thin
    // layer over the raw API that already performs worker.Eval internally.
    _cublas.Sgemv(CUBLASInterop.cublasOperation_t.CUBLAS_OP_T, dimX, dimY, alpha, ip.DataD.Ptr, dimX, srcData.Ptr, 1, beta, dstData.Ptr, 1);

    nchw.H = 1;
    nchw.W = 1;
    nchw.C = dimY;
}
//[/CudnnMnistFCF]
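
// For reference, a minimal CPU sketch of the fully connected step above. With CUBLAS_OP_T the
// dimX x dimY weight matrix (column-major, lda = dimX) is transposed, so each output j is the dot
// product of the input vector with weight column j, and beta = 1 keeps the bias that was copied
// into dst beforehand. Illustrative only; weights, bias, src and dst are assumed host copies.
private static void FullyConnectedForwardCpuSketch(float[] weights, float[] bias, float[] src, float[] dst, int dimX, int dimY)
{
    for (var j = 0; j < dimY; j++)
    {
        var acc = bias[j];
        for (var i = 0; i < dimX; i++)
            acc += weights[j * dimX + i] * src[i];
        dst[j] = acc;
    }
}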

//[/CudnnMnistAF]

//[CudnnMnistClassify]
public int ClassifyExample(string fname, Layer conv1, Layer conv2, Layer ip1, Layer ip2)
{
    var nchw = new nchw_t() { N = 1, C = 1, H = Data.ImageH, W = Data.ImageW };

    // Load the image and normalize the pixel values to [0, 1].
    var imgDataH = new float[Data.ImageH * Data.ImageW];
    var oHostSrc = Data.LoadImage(fname).Select(x => (int)x).ToArray();
    for (var i = 0; i < Data.ImageH; i++)
    {
        for (var j = 0; j < Data.ImageW; j++)
        {
            var idx = Data.ImageH * i + j;
            imgDataH[idx] = oHostSrc[idx] / 255.0f;
        }
    }

    using (var srcData = _worker.Malloc(imgDataH))
    using (var dstData = _worker.Malloc<float>(0))
    {
        Console.WriteLine("Performing forward propagation...");

        var src = srcData;
        var dst = dstData;

        ConvoluteForward(conv1, nchw, src, ref dst);
        PoolForward(nchw, dst, ref src);

        ConvoluteForward(conv2, nchw, src, ref dst);
        PoolForward(nchw, dst, ref src);

        FullyConnectedForward(ip1, nchw, src, ref dst);
        ActivationForward(nchw, dst, ref src);

        FullyConnectedForward(ip2, nchw, src, ref dst);
        SoftmaxForward(nchw, dst, ref src);

        Console.WriteLine("Finished forward propagation.");

        // Copy the 10 class scores back to the host and pick the most likely digit.
        const int maxDigits = 10;
        var hsrc = src.Gather();
        var result = hsrc.Take(maxDigits).ToArray();
        var id = 0;
        for (var i = 1; i < maxDigits; i++)
            if (result[id] < result[i]) id = i;

        Console.WriteLine("Classification Complete.\n");
        return id;
    }
}
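
// Illustrative usage sketch only: the image path and method name below are assumptions, not part
// of the sample. It simply shows how ClassifyExample plugs together with layers that have been
// built elsewhere (e.g. from the pretrained MNIST weights the sample loads).
private void ClassifyDemoSketch(Layer conv1, Layer conv2, Layer ip1, Layer ip2)
{
    // Hypothetical path to a 28x28 grayscale digit image.
    const string imagePath = @"data\one_28x28.pgm";
    var predicted = ClassifyExample(imagePath, conv1, conv2, ip1, ip2);
    Console.WriteLine("Predicted digit for {0}: {1}", imagePath, predicted);
}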

//[CudnnMnistCF]
public void ConvoluteForward(Layer conv, nchw_t nchw, DeviceMemory<float> srcData, ref DeviceMemory<float> dstData)
{
    _srcTensorDesc.Set4D(TensorFormat, DataType, nchw.N, nchw.C, nchw.H, nchw.W);
    _filterDesc.Set4D(DataType, conv.Outputs, conv.Inputs, conv.KernelDim, conv.KernelDim);
    _convDesc.Set2D(0, 0, 1, 1, 1, 1, CUDNNInterop.cudnnConvolutionMode_t.CUDNN_CROSS_CORRELATION);

    // Find the dimensions of the convolution output:
    // outputDim = 1 + (inputDim + 2*pad - filterDim) / convolutionStride
    int n, c, h, w;
    _convDesc.Get2DForwardOutputDim(_srcTensorDesc, _filterDesc, out n, out c, out h, out w);
    nchw.N = n;
    nchw.C = c;
    nchw.H = h;
    nchw.W = w;

    _dstTensorDesc.Set4D(TensorFormat, DataType, nchw.N, nchw.C, nchw.H, nchw.W);

    // Let cuDNN pick the fastest forward algorithm, then allocate the workspace it needs.
    var algo = _cudnn.GetConvolutionForwardAlgorithm(_srcTensorDesc, _filterDesc, _convDesc, _dstTensorDesc, CUDNNInterop.cudnnConvolutionFwdPreference_t.CUDNN_CONVOLUTION_FWD_PREFER_FASTEST, (IntPtr)0);
    Resize(ref dstData, nchw.N * nchw.C * nchw.H * nchw.W);
    var sizeInBytes = _cudnn.GetConvolutionForwardWorkspaceSize(_srcTensorDesc, _filterDesc, _convDesc, _dstTensorDesc, algo);

    using (var workSpace = _worker.Malloc<byte>(sizeInBytes.ToInt32()))
    {
        const float alpha = 1.0f;
        const float beta = 0.0f;
        _cudnn.ConvolutionForward(alpha, _srcTensorDesc, srcData.Ptr, _filterDesc, conv.DataD.Ptr, _convDesc, algo, workSpace.Ptr, sizeInBytes, beta, _dstTensorDesc, dstData.Ptr);
        AddBias(_dstTensorDesc, conv, c, dstData);
    }
}
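
// A small sketch of the output-dimension formula quoted in ConvoluteForward. The MNIST numbers in
// the comment are an assumed example (a 28x28 input with a 5x5 kernel, no padding and stride 1
// gives 1 + (28 + 0 - 5) / 1 = 24); the helper itself is illustrative and not part of the sample.
private static int ConvOutputDimSketch(int inputDim, int pad, int filterDim, int stride)
{
    // outputDim = 1 + (inputDim + 2*pad - filterDim) / convolutionStride
    return 1 + (inputDim + 2 * pad - filterDim) / stride;
}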