/// <summary>
/// Creates a cuDNN-backed 2-D convolution layer. After the base constructor has run, the
/// parameter tensors are re-viewed in the 4-D layout cuDNN expects and a single scratch
/// buffer is allocated that is large enough for the forward, backward-filter and
/// backward-data workspace queries.
/// </summary>
/// <param name="allocator">Used for the workspace-size queries and to allocate the workspace itself.</param>
/// <param name="seedSource">Forwarded unchanged to the base constructor.</param>
/// <param name="elementType">Element type of the input shape descriptor; also forwarded to the base constructor.</param>
/// <param name="batchSize">First dimension of the input shape descriptor.</param>
/// <param name="inputWidth">Last dimension of the input shape descriptor.</param>
/// <param name="inputHeight">Third dimension of the input shape descriptor.</param>
/// <param name="nInputPlane">Second dimension of the input descriptor and of the reshaped weight.</param>
/// <param name="nOutputPlane">First dimension of the reshaped weight; second dimension of the reshaped bias.</param>
/// <param name="cd">Convolution descriptor; its <c>kH</c> and <c>kW</c> supply the last two weight dimensions.</param>
public Conv2Cudnn(IAllocator allocator, SeedSource seedSource, DType elementType, int batchSize, int inputWidth, int inputHeight, int nInputPlane, int nOutputPlane, ConvolutionDesc2d cd)
    : base(allocator, seedSource, elementType, batchSize, inputWidth, inputHeight, nInputPlane, nOutputPlane, cd)
{
    // CuDNN expects weight and bias dimensions to be structured slightly differently from
    // what the base class produced, so swap each tensor for a 4-D view. The gradient tensors
    // simply mirror the shape of the parameter they belong to.
    this.weight = ViewReplace(this.weight, nOutputPlane, nInputPlane, cd.kH, cd.kW);
    this.bias = ViewReplace(this.bias, 1, nOutputPlane, 1, 1);
    this.gradWeight = ViewReplace(this.gradWeight, this.weight.Shape);
    this.gradBias = ViewReplace(this.gradBias, this.bias.Shape);

    // Every query below receives its own freshly built input descriptor, laid out as
    // (batch, input planes, height, width).
    Func<TensorShape> describeInput = () =>
        new TensorShape(elementType, new long[] { batchSize, nInputPlane, inputHeight, inputWidth });

    var forwardBytes = DNN.GetConvolutionForwardWorkspaceSize(
        allocator,
        fwdAlgo,
        cd,
        describeInput(),
        new TensorShape(weight),
        new TensorShape(activation));

    var filterGradBytes = DNN.GetConvolutionBackwardFilterWorkspaceSize(
        allocator,
        bwdFilterAlgo,
        cd,
        describeInput(),
        new TensorShape(activation),
        new TensorShape(weight));

    var dataGradBytes = DNN.GetConvolutionBackwardDataWorkspaceSize(
        allocator,
        bwdDataAlgo,
        cd,
        new TensorShape(weight),
        new TensorShape(activation),
        describeInput());

    // One byte buffer, sized to the largest of the three requirements, serves all passes.
    this.workspace = (CudaStorage)allocator.Allocate(
        DType.UInt8,
        Math.Max(Math.Max(forwardBytes, filterGradBytes), dataGradBytes));
}
/// <summary>
/// Copies <paramref name="totalElements"/> elements from <paramref name="src"/> into
/// <paramref name="result"/>, both of which must be backed by <c>CudaStorage</c>.
/// All work is issued on the null (default) stream.
/// </summary>
/// <remarks>
/// When the tensors live on different devices, an event is recorded under the destination
/// context and the source device's null stream is made to wait on it before the copy is issued.
/// The copy itself is a raw <c>cuMemcpyAsync</c> when <c>CanMemcpy</c> allows it; otherwise it
/// is delegated to the type-converting, direct (peer-accessible) or indirect copy helper.
/// </remarks>
/// <param name="result">Destination tensor.</param>
/// <param name="src">Source tensor; its element size determines the byte count of a raw memcpy.</param>
/// <param name="totalElements">Number of elements to copy.</param>
/// <exception cref="CudaException">A CUDA driver call returned a status other than <c>Success</c>.</exception>
public void CopyGpu(Tensor result, Tensor src, long totalElements)
{
    // We assume here that we are using the default stream for both devices.
    TSCudaContext context = CudaHelpers.TSContextForTensor(src);

    CudaStorage resultStorage = (CudaStorage)result.Storage;
    CudaContext resultContext = context.CudaContextForTensor(result);
    CUdeviceptr resultPtr = resultStorage.DevicePtrAtElement(result.StorageOffset);

    CudaStorage srcStorage = (CudaStorage)src.Storage;
    CudaContext srcContext = context.CudaContextForTensor(src);
    CUdeviceptr srcPtr = srcStorage.DevicePtrAtElement(src.StorageOffset);

    if (CudaHelpers.GetDeviceId(result) != CudaHelpers.GetDeviceId(src))
    {
        // Cross-device copy: the source device's default stream must not start reading/writing
        // until everything already queued on the destination device has been reached.
        // NOTE(review): the original comment called this a "two-way barrier", but only the
        // source-waits-on-destination half is visible in this method - confirm whether a
        // reverse wait after the copy is required.
        resultContext.SetCurrent();
        CudaEvent dstReady = new CudaEvent(CUEventFlags.DisableTiming);
        try
        {
            dstReady.Record();

            srcContext.SetCurrent();
            CUResult waitResult = DriverAPINativeMethods.Streams.cuStreamWaitEvent(CUstream.NullStream, dstReady.Event, 0);
            if (waitResult != CUResult.Success)
            {
                throw new CudaException(waitResult);
            }
        }
        finally
        {
            // Release the event on every path; previously a failed Record/SetCurrent/wait
            // threw before Dispose was reached and leaked the event handle.
            dstReady.Dispose();
        }
    }
    else
    {
        srcContext.SetCurrent();
    }

    if (CanMemcpy(result, src, totalElements))
    {
        // Fast path: a single raw device-to-device copy of the whole element range.
        CUResult copyResult = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpyAsync(
            resultPtr, srcPtr, totalElements * src.ElementType.Size(), CUstream.NullStream);
        if (copyResult != CUResult.Success)
        {
            throw new CudaException(copyResult);
        }

        return;
    }

    // Slow paths, tried in the same order as before: differing element types first,
    // then a direct copy when the devices can reach each other, otherwise the indirect copy.
    if (result.ElementType != src.ElementType)
    {
        CopyGpuConvertTypes(result, src, totalElements);
    }
    else if (context.CanAccessPeer(CudaHelpers.GetDeviceId(src), CudaHelpers.GetDeviceId(result)))
    {
        CopyGpuDirect(result, src, srcContext);
    }
    else
    {
        CopyGpuIndirect(result, src, totalElements);
    }
}