/// <summary>
/// Launches a reduce-all kernel that reduces every element of <paramref name="src"/>
/// down to a single value stored in the returned 1-element write target.
/// Chooses between a single-pass kernel and a two-pass kernel (pass A reduces input
/// slices into per-block partials in device scratch space; pass B reduces those
/// partials into the output) based on the total element count.
/// </summary>
/// <param name="reduceAllKernels">Provider of the compiled PTX for the reduce-all kernels.</param>
/// <param name="init">Initial value for the reduction accumulator.</param>
/// <param name="initType">How <paramref name="init"/> is converted for the tensor's element type.</param>
/// <param name="kernelName">Base kernel name; specialization mangling and pass prefixes are appended.</param>
/// <param name="result">Optional destination tensor; when null a new write target is allocated.</param>
/// <param name="src">Source tensor to reduce.</param>
/// <param name="extraArg">Optional extra kernel argument (passed through only when non-null).</param>
/// <returns>The write target holding the reduction result.</returns>
/// <exception cref="InvalidOperationException">
/// If <paramref name="src"/> has more than <see cref="TSCudaContext.MaxDims"/> dimensions.
/// </exception>
public static Tensor Invoke(CudaReduceAllKernels reduceAllKernels, float init, ReduceInitType initType, string kernelName, Tensor result, Tensor src, object extraArg = null)
{
    int deviceId = CudaHelpers.GetDeviceId(src);
    TSCudaContext context = CudaHelpers.TSContextForTensor(src);
    CudaContext cudaContext = context.CudaContextForDevice(deviceId);

    if (src.DimensionCount > TSCudaContext.MaxDims)
    {
        throw new InvalidOperationException("Tensors with dimension count > " + TSCudaContext.MaxDims + " are not supported");
    }

    Tensor writeTarget = TensorResultBuilder.GetWriteTarget(result, src, false, 1);

    if (src.DimensionCount == 0)
    {
        // BUG FIX: the original returned `result` here, which is null whenever the
        // caller did not supply a destination tensor, even though a write target was
        // just allocated above. Return the write target instead; when `result` is
        // non-null, writeTarget is that same tensor, so behavior is unchanged.
        return writeTarget;
    }

    long totalElements = src.ElementCount();
    ApplySpecialization config = new ApplySpecialization(src);

    // The kernels are specialized for 32- vs 64-bit index arithmetic, so the element
    // count must be boxed as the matching unsigned integer width.
    object totalElementsTyped = config.Use32BitIndices ? (uint)totalElements : (ulong)totalElements;
    object initValueTyped = ReduceInitConverter.GetInitValue(init, initType, src.ElementType);

    dim3 grid;
    dim3 block;
    byte[] ptx = reduceAllKernels.GetPtx(context.Compiler);
    string fullKernelName = PermutationGenerator.GetMangledName(kernelName, config);
    ManagedCuda.BasicTypes.CUdeviceptr outputDevicePtr = CudaHelpers.GetBufferStart(writeTarget);

    if (isTwoPassReductionSize(totalElements))
    {
        // Pass A: each block reduces a portion of the input into a per-block partial
        // result written to the device scratch buffer.
        getPass1ReduceBlockGrid(context, deviceId, totalElements, out grid, out block);
        uint smemSize = block.x * sizeof(float); // one float of shared memory per thread
        ManagedCuda.BasicTypes.CUdeviceptr scratchSpace = context.ScratchSpaceForDevice(deviceId).buffer;

        if (extraArg == null)
        {
            InvokeReduceAll(context, cudaContext, ptx, "twoPassA_" + fullKernelName, grid, block, smemSize, config, src, totalElementsTyped, initValueTyped, scratchSpace);
        }
        else
        {
            InvokeReduceAll(context, cudaContext, ptx, "twoPassA_" + fullKernelName, grid, block, smemSize, config, src, totalElementsTyped, initValueTyped, scratchSpace, extraArg);
        }

        // Pass B: reduce the pass-A partials (one per pass-A block) into the output.
        uint numPass1Blocks = grid.x;
        getPass2ReduceBlockGrid(context, deviceId, totalElements, out grid, out block);
        smemSize = block.x * sizeof(float);

        InvokeReduceAllPass2(context, cudaContext, ptx, "twoPassB_" + fullKernelName, grid, block, smemSize, config.Use32BitIndices, numPass1Blocks, initValueTyped, scratchSpace, outputDevicePtr);
    }
    else
    {
        // Small input: a single launch reduces everything directly into the output.
        getSinglePassReduceBlockGrid(totalElements, out grid, out block);
        uint smemSize = block.x * sizeof(float);

        if (extraArg == null)
        {
            InvokeReduceAll(context, cudaContext, ptx, "onePass_" + fullKernelName, grid, block, smemSize, config, src, totalElementsTyped, initValueTyped, outputDevicePtr);
        }
        else
        {
            InvokeReduceAll(context, cudaContext, ptx, "onePass_" + fullKernelName, grid, block, smemSize, config, src, totalElementsTyped, initValueTyped, outputDevicePtr, extraArg);
        }
    }

    return writeTarget;
}
/// <summary>
/// Copies <paramref name="totalElements"/> elements from one GPU tensor to another,
/// handling both the same-device and the cross-device case. Assumes the default
/// CUDA stream is in use on both devices; for a cross-device copy it inserts a
/// stream-level wait so the source device's work does not start before the
/// destination device's pending work has been recorded.
/// </summary>
/// <param name="result">Destination tensor (already allocated, possibly on a different device).</param>
/// <param name="src">Source tensor.</param>
/// <param name="totalElements">Number of elements to copy.</param>
/// <exception cref="CudaException">Thrown when a CUDA driver API call fails.</exception>
public void CopyGpu(Tensor result, Tensor src, long totalElements)
{
    // We assume here that we are using the default stream for both devices.
    var context = CudaHelpers.TSContextForTensor(src);

    var resultStorage = (CudaStorage)result.Storage;
    var resultContext = context.CudaContextForTensor(result);
    var resultPtr = resultStorage.DevicePtrAtElement(result.StorageOffset);

    var srcStorage = (CudaStorage)src.Storage;
    var srcContext = context.CudaContextForTensor(src);
    var srcPtr = srcStorage.DevicePtrAtElement(src.StorageOffset);

    if (CudaHelpers.GetDeviceId(result) != CudaHelpers.GetDeviceId(src))
    {
        // Cross-device copy. Perform two-way barrier between both devices' default streams.
        // Record an event on the destination's stream, then make the source's stream
        // wait on it, so the copy cannot overtake work already queued on the destination.
        resultContext.SetCurrent();
        var dstReady = new CudaEvent(CUEventFlags.DisableTiming);
        dstReady.Record();
        srcContext.SetCurrent();
        var res = DriverAPINativeMethods.Streams.cuStreamWaitEvent(CUstream.NullStream, dstReady.Event, 0);
        if (res != CUResult.Success) { throw new CudaException(res); }
        dstReady.Dispose();
    }
    else
    {
        // Same device: just ensure the source tensor's context is current for the copy.
        srcContext.SetCurrent();
    }

    var canMemcpy = CanMemcpy(result, src, totalElements);

    if (canMemcpy)
    {
        // Layouts are compatible: a single async device-to-device memcpy suffices.
        var res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpyAsync(
            resultPtr, srcPtr, totalElements * src.ElementType.Size(), CUstream.NullStream);
        if (res != CUResult.Success) { throw new CudaException(res); }
    }
    else
    {
        // Fall back to element-wise copy paths, chosen by capability:
        // type conversion first, then peer-access direct copy, else staged indirect copy.
        if (result.ElementType != src.ElementType)
        {
            CopyGpuConvertTypes(result, src, totalElements);
        }
        else if (context.CanAccessPeer(CudaHelpers.GetDeviceId(src), CudaHelpers.GetDeviceId(result)))
        {
            CopyGpuDirect(result, src, srcContext);
        }
        else
        {
            CopyGpuIndirect(result, src, totalElements);
        }
    }
}