/// <summary>
/// Launches a full-tensor ("reduce all") CUDA reduction, collapsing every element of
/// <paramref name="src"/> into a single value written to <paramref name="result"/>
/// (or a newly allocated 1-element tensor when <paramref name="result"/> is null).
/// </summary>
/// <param name="reduceAllKernels">Compiled kernel set providing the PTX for the reduction.</param>
/// <param name="init">Seed value for the reduction accumulator.</param>
/// <param name="initType">How <paramref name="init"/> is interpreted/converted for the element type.</param>
/// <param name="kernelName">Base (unmangled) kernel name; specialization suffixes are appended here.</param>
/// <param name="result">Optional pre-allocated output tensor; may be null.</param>
/// <param name="src">Input tensor to reduce.</param>
/// <param name="extraArg">Optional extra kernel argument, appended to the launch when non-null.</param>
/// <returns>The tensor holding the reduction result (the resolved write target).</returns>
/// <exception cref="InvalidOperationException">If <paramref name="src"/> has more than <see cref="TSCudaContext.MaxDims"/> dimensions.</exception>
public static Tensor Invoke(CudaReduceAllKernels reduceAllKernels, float init, ReduceInitType initType, string kernelName, Tensor result, Tensor src, object extraArg = null)
{
    // Nothing to reduce for a zero-dimensional tensor. Check this BEFORE resolving the
    // write target — the previous ordering allocated a fresh output tensor and then
    // discarded it. This also matches the dim-wise Invoke overload, which guards first.
    if (src.DimensionCount == 0)
    {
        return result;
    }

    int deviceId = CudaHelpers.GetDeviceId(src);
    TSCudaContext context = CudaHelpers.TSContextForTensor(src);
    CudaContext cudaContext = context.CudaContextForDevice(deviceId);

    if (src.DimensionCount > TSCudaContext.MaxDims)
    {
        throw new InvalidOperationException("Tensors with dimension count > " + TSCudaContext.MaxDims + " are not supported");
    }

    Tensor writeTarget = TensorResultBuilder.GetWriteTarget(result, src, false, 1);

    long totalElements = src.ElementCount();
    ApplySpecialization config = new ApplySpecialization(src);

    // Kernels are specialized for 32- vs 64-bit index arithmetic; box the count with the matching width.
    object totalElementsTyped = config.Use32BitIndices ? (uint)totalElements : (ulong)totalElements;
    object initValueTyped = ReduceInitConverter.GetInitValue(init, initType, src.ElementType);

    dim3 grid;
    dim3 block;
    byte[] ptx = reduceAllKernels.GetPtx(context.Compiler);
    string fullKernelName = PermutationGenerator.GetMangledName(kernelName, config);
    ManagedCuda.BasicTypes.CUdeviceptr outputDevicePtr = CudaHelpers.GetBufferStart(writeTarget);

    if (isTwoPassReductionSize(totalElements))
    {
        // Pass 1 ("twoPassA_"): reduce chunks of the input into per-block partial results held
        // in the per-device scratch buffer.
        getPass1ReduceBlockGrid(context, deviceId, totalElements, out grid, out block);
        // NOTE(review): smem is sized as block.x * sizeof(float) here, while the dim-wise
        // overload uses src.ElementType.Size(); confirm this is intentional for non-float types.
        uint smemSize = block.x * sizeof(float);
        ManagedCuda.BasicTypes.CUdeviceptr scratchSpace = context.ScratchSpaceForDevice(deviceId).buffer;

        if (extraArg == null)
        {
            InvokeReduceAll(context, cudaContext, ptx, "twoPassA_" + fullKernelName, grid, block, smemSize, config, src, totalElementsTyped, initValueTyped, scratchSpace);
        }
        else
        {
            InvokeReduceAll(context, cudaContext, ptx, "twoPassA_" + fullKernelName, grid, block, smemSize, config, src, totalElementsTyped, initValueTyped, scratchSpace, extraArg);
        }

        // Pass 2 ("twoPassB_"): reduce the pass-1 partials (one per block launched above)
        // into the final output location.
        uint numPass1Blocks = grid.x;
        getPass2ReduceBlockGrid(context, deviceId, totalElements, out grid, out block);
        smemSize = block.x * sizeof(float);
        InvokeReduceAllPass2(context, cudaContext, ptx, "twoPassB_" + fullKernelName, grid, block, smemSize, config.Use32BitIndices, numPass1Blocks, initValueTyped, scratchSpace, outputDevicePtr);
    }
    else
    {
        // Small input ("onePass_"): a single launch reduces everything directly into the output.
        getSinglePassReduceBlockGrid(totalElements, out grid, out block);
        uint smemSize = block.x * sizeof(float);

        if (extraArg == null)
        {
            InvokeReduceAll(context, cudaContext, ptx, "onePass_" + fullKernelName, grid, block, smemSize, config, src, totalElementsTyped, initValueTyped, outputDevicePtr);
        }
        else
        {
            InvokeReduceAll(context, cudaContext, ptx, "onePass_" + fullKernelName, grid, block, smemSize, config, src, totalElementsTyped, initValueTyped, outputDevicePtr, extraArg);
        }
    }

    return writeTarget;
}
/// <summary>
/// Launches a dim-wise CUDA reduction: collapses dimension <paramref name="dim"/> of
/// <paramref name="src"/> to size 1, writing the result to <paramref name="result"/>
/// (or a newly allocated tensor of the reduced shape when <paramref name="result"/> is null).
/// </summary>
/// <param name="reduceKernels">Compiled kernel set providing the PTX for the reduction.</param>
/// <param name="kernelName">Base (unmangled) kernel name; specialization prefixes/suffixes are applied here.</param>
/// <param name="init">Seed value for the reduction accumulator.</param>
/// <param name="initType">How <paramref name="init"/> is interpreted/converted for the element type.</param>
/// <param name="result">Optional pre-allocated output tensor; may be null.</param>
/// <param name="src">Input tensor to reduce.</param>
/// <param name="dim">Index of the dimension to reduce over.</param>
/// <param name="extraArg">Optional extra kernel argument, appended to the launch when non-null.</param>
/// <returns>The tensor holding the reduction result (the resolved write target).</returns>
public static Tensor Invoke(CudaReduceKernels reduceKernels, string kernelName, float init, ReduceInitType initType, Tensor result, Tensor src, int dim, object extraArg = null)
{
    // Zero-dimensional input: nothing to reduce.
    if (src.DimensionCount == 0)
    {
        return result;
    }

    var context = CudaHelpers.TSContextForTensor(src);
    var cudaContext = context.CudaContextForTensor(src);

    // The output has the same shape as the input, except the reduced dimension is 1.
    var outputSizes = (long[])src.Sizes.Clone();
    outputSizes[dim] = 1;
    var writeTarget = TensorResultBuilder.GetWriteTarget(result, src, false, outputSizes);
    ThrowIfAnyTensorInvalid(writeTarget, src);

    var elementCount = src.ElementCount();
    var reduceLen = src.Sizes[dim];
    var reduceStride = src.Strides[dim];
    var sliceCount = elementCount / reduceLen;
    var reduceIsContiguous = reduceStride == 1;

    // We must make sure that when the tensor is passed to the kernel, src.Sizes[dim] is set to 1.
    // This includes for the purposes of determining which tensor specializations to use (changing
    // the dimension size to 1 may make the tensor non-contiguous).
    var slimSizes = (long[])src.Sizes.Clone();
    slimSizes[dim] = 1;
    var srcSlim = new Tensor(slimSizes, src.Strides, src.Storage, src.StorageOffset);

    var config = new ApplySpecialization(writeTarget, srcSlim);

    // Box index-like arguments with the width the specialized kernel expects.
    object totalSlices = config.Use32BitIndices ? (uint)sliceCount : (ulong)sliceCount;
    object reductionSizeTyped = config.Use32BitIndices ? (uint)reduceLen : (ulong)reduceLen;
    object reductionStrideTyped = config.Use32BitIndices ? (uint)reduceStride : (ulong)reduceStride;
    var initValueTyped = ReduceInitConverter.GetInitValue(init, initType, src.ElementType);
    var ptx = reduceKernels.GetPtx(context.Compiler);

    if (reduceIsContiguous)
    {
        // Unit-stride ("contig_") path: shared memory is sized per-block from the element size.
        var block = GetContigReduceBlock(cudaContext, sliceCount, reduceLen);
        var grid = GetContigReduceGrid(sliceCount);
        var smemSize = (uint)src.ElementType.Size() * block.x;
        var fullName = "contig_" + PermutationGenerator.GetMangledName(kernelName, config);

        if (extraArg == null)
        {
            InvokeReduce(context, cudaContext, ptx, fullName, grid, block, smemSize, config, writeTarget, srcSlim, reductionSizeTyped, totalSlices, initValueTyped);
        }
        else
        {
            InvokeReduce(context, cudaContext, ptx, fullName, grid, block, smemSize, config, writeTarget, srcSlim, reductionSizeTyped, totalSlices, initValueTyped, extraArg);
        }
    }
    else
    {
        // Strided ("noncontig_") path: launch geometry comes from device properties and no
        // shared memory is requested (smemSize = 0). Note this path also passes the stride.
        var deviceProps = context.DeviceInfoForContext(cudaContext);
        var block = GetNonContigReduceBlock(deviceProps);
        var grid = GetNoncontigReduceGrid(deviceProps, sliceCount);
        uint smemSize = 0;
        var fullName = "noncontig_" + PermutationGenerator.GetMangledName(kernelName, config);

        if (extraArg == null)
        {
            InvokeReduce(context, cudaContext, ptx, fullName, grid, block, smemSize, config, writeTarget, srcSlim, reductionStrideTyped, reductionSizeTyped, totalSlices, initValueTyped);
        }
        else
        {
            InvokeReduce(context, cudaContext, ptx, fullName, grid, block, smemSize, config, writeTarget, srcSlim, reductionStrideTyped, reductionSizeTyped, totalSlices, initValueTyped, extraArg);
        }
    }

    return writeTarget;
}