/// <summary>
/// Launches the named reduce-all CUDA kernel over <paramref name="src"/>, writing into a
/// single-element write target. Depending on the element count, either a single-pass kernel
/// or a two-pass (pass A into scratch space, pass B into the output) sequence is used.
/// </summary>
/// <param name="reduceAllKernels">Kernel collection that supplies the PTX for the launch.</param>
/// <param name="init">Initial value for the reduction, converted according to <paramref name="initType"/>.</param>
/// <param name="initType">Controls how <paramref name="init"/> is converted for the source element type.</param>
/// <param name="kernelName">Base kernel name; it is mangled with the specialization and prefixed with "onePass_", "twoPassA_" or "twoPassB_".</param>
/// <param name="result">Optional tensor to write into; forwarded to <c>TensorResultBuilder.GetWriteTarget</c>.</param>
/// <param name="src">The tensor to reduce.</param>
/// <param name="extraArg">Optional extra argument; when not null it is passed as the final argument of the single-pass / first-pass launch.</param>
/// <returns>
/// The write target that receives the reduction output, or <paramref name="result"/> unchanged
/// (possibly null) when <paramref name="src"/> has zero dimensions.
/// </returns>
/// <exception cref="InvalidOperationException">
/// Thrown when <paramref name="src"/> has more than <see cref="TSCudaContext.MaxDims"/> dimensions.
/// </exception>
public static NDArray Invoke(CudaReduceAllKernels reduceAllKernels, float init, ReduceInitType initType, string kernelName, NDArray result, NDArray src, object extraArg = null)
{
    var deviceId = CudaHelpers.GetDeviceId(src);
    var context = CudaHelpers.TSContextForTensor(src);
    var cudaContext = context.CudaContextForDevice(deviceId);

    if (src.DimensionCount > TSCudaContext.MaxDims)
    {
        throw new InvalidOperationException("Tensors with dimension count > " + TSCudaContext.MaxDims + " are not supported");
    }

    // Nothing to reduce: hand the caller's tensor back untouched. This is checked before a
    // write target is requested so that no output tensor is obtained and then abandoned on
    // this path (the per-dimension Invoke overload performs the same check up front).
    if (src.DimensionCount == 0)
    {
        return result;
    }

    var writeTarget = TensorResultBuilder.GetWriteTarget(result, src, false, 1);

    var totalElements = src.ElementCount();
    var config = new ApplySpecialization(src);

    // Box the element count with the index width selected by the specialization.
    // Each branch is cast to object on purpose: a bare `flag ? (uint)x : (ulong)x` has the
    // natural type ulong, so the 32-bit branch would be widened and boxed as ulong.
    object totalElementsTyped = config.Use32BitIndices
        ? (object)(uint)totalElements
        : (object)(ulong)totalElements;
    object initValueTyped = ReduceInitConverter.GetInitValue(init, initType, src.ElementType);

    dim3 grid;
    dim3 block;

    var ptx = reduceAllKernels.GetPtx(context.Compiler);
    var fullKernelName = PermutationGenerator.GetMangledName(kernelName, config);
    var outputDevicePtr = CudaHelpers.GetBufferStart(writeTarget);

    if (isTwoPassReductionSize(totalElements))
    {
        // Pass A: each block writes its partial result into the per-device scratch buffer.
        getPass1ReduceBlockGrid(context, deviceId, totalElements, out grid, out block);
        uint smemSize = block.x * sizeof(float); // one float per thread in the block

        var scratchSpace = context.ScratchSpaceForDevice(deviceId).buffer;

        if (extraArg == null)
        {
            InvokeReduceAll(context, cudaContext, ptx, "twoPassA_" + fullKernelName, grid, block, smemSize, config, src, totalElementsTyped, initValueTyped, scratchSpace);
        }
        else
        {
            InvokeReduceAll(context, cudaContext, ptx, "twoPassA_" + fullKernelName, grid, block, smemSize, config, src, totalElementsTyped, initValueTyped, scratchSpace, extraArg);
        }

        // Remember how many blocks pass A used before grid is overwritten for pass B.
        uint numPass1Blocks = grid.x;

        // Pass B: combines the pass-A partials from scratch space into the output buffer.
        getPass2ReduceBlockGrid(context, deviceId, totalElements, out grid, out block);
        smemSize = block.x * sizeof(float);

        InvokeReduceAllPass2(context, cudaContext, ptx, "twoPassB_" + fullKernelName, grid, block, smemSize, config.Use32BitIndices, numPass1Blocks, initValueTyped, scratchSpace, outputDevicePtr);
    }
    else
    {
        // Small input: a single launch writes straight into the output buffer.
        getSinglePassReduceBlockGrid(totalElements, out grid, out block);
        uint smemSize = block.x * sizeof(float); // one float per thread in the block

        if (extraArg == null)
        {
            InvokeReduceAll(context, cudaContext, ptx, "onePass_" + fullKernelName, grid, block, smemSize, config, src, totalElementsTyped, initValueTyped, outputDevicePtr);
        }
        else
        {
            InvokeReduceAll(context, cudaContext, ptx, "onePass_" + fullKernelName, grid, block, smemSize, config, src, totalElementsTyped, initValueTyped, outputDevicePtr, extraArg);
        }
    }

    return writeTarget;
}
/// <summary>
/// Launches the named CUDA reduce kernel over dimension <paramref name="dim"/> of
/// <paramref name="src"/>. A "contig_" kernel is used when the stride of the reduced
/// dimension is 1, otherwise a "noncontig_" kernel.
/// </summary>
/// <param name="reduceKernels">Kernel collection that supplies the PTX for the launch.</param>
/// <param name="kernelName">Base kernel name; it is mangled with the specialization and prefixed with "contig_" or "noncontig_".</param>
/// <param name="init">Initial value for the reduction, converted according to <paramref name="initType"/>.</param>
/// <param name="initType">Controls how <paramref name="init"/> is converted for the source element type.</param>
/// <param name="result">Optional tensor to write into; forwarded to <c>TensorResultBuilder.GetWriteTarget</c>.</param>
/// <param name="src">The tensor to reduce.</param>
/// <param name="dim">Index of the dimension to reduce; the required output shape is the source shape with this dimension set to 1.</param>
/// <param name="extraArg">Optional extra argument; when not null it is passed as the final argument of the launch.</param>
/// <returns>
/// The write target that receives the reduction output, or <paramref name="result"/> unchanged
/// (possibly null) when <paramref name="src"/> has zero dimensions.
/// </returns>
public static NDArray Invoke(CudaReduceKernels reduceKernels, string kernelName, float init, ReduceInitType initType, NDArray result, NDArray src, int dim, object extraArg = null)
{
    // Nothing to reduce: hand the caller's tensor back untouched.
    if (src.DimensionCount == 0)
    {
        return result;
    }

    var context = CudaHelpers.TSContextForTensor(src);
    var cudaContext = context.CudaContextForTensor(src);

    // Required output shape: the source shape with the reduced dimension collapsed to 1.
    var requiredOutputSize = (long[])src.Shape.Clone();
    requiredOutputSize[dim] = 1;
    var writeTarget = TensorResultBuilder.GetWriteTarget(result, src, false, requiredOutputSize);

    ThrowIfAnyTensorInvalid(writeTarget, src);

    var inElements = src.ElementCount();
    var reductionSize = src.Shape[dim];
    var reductionStride = src.Strides[dim];
    var outElements = inElements / reductionSize;
    var contigReduction = reductionStride == 1;

    // The tensor handed to the kernel must have Shape[dim] == 1. The same slimmed view is also
    // used to choose the tensor specialization, because changing the dimension size to 1 may
    // make the tensor non-contiguous. A separate shape array is cloned for this view.
    var newSizes = (long[])src.Shape.Clone();
    newSizes[dim] = 1;
    var srcSlim = new NDArray(newSizes, src.Strides, src.Storage, src.StorageOffset);
    var config = new ApplySpecialization(writeTarget, srcSlim);

    // Box the index-typed kernel arguments with the index width selected by the specialization.
    // This is written as if/else rather than `flag ? (uint)x : (ulong)x`: that conditional has
    // the natural type ulong, so the 32-bit branch would be widened and boxed as ulong.
    object totalSlices;
    object reductionSizeTyped;
    object reductionStrideTyped;
    if (config.Use32BitIndices)
    {
        totalSlices = (uint)outElements;
        reductionSizeTyped = (uint)reductionSize;
        reductionStrideTyped = (uint)reductionStride;
    }
    else
    {
        totalSlices = (ulong)outElements;
        reductionSizeTyped = (ulong)reductionSize;
        reductionStrideTyped = (ulong)reductionStride;
    }

    object initValueTyped = ReduceInitConverter.GetInitValue(init, initType, src.ElementType);

    var ptx = reduceKernels.GetPtx(context.Compiler);

    if (contigReduction)
    {
        var block = GetContigReduceBlock(cudaContext, outElements, reductionSize);
        var grid = GetContigReduceGrid(outElements);
        // One source-element-sized slot per thread in the block.
        uint smemSize = (uint)src.ElementType.Size() * block.x;

        var fullName = "contig_" + PermutationGenerator.GetMangledName(kernelName, config);

        if (extraArg == null)
        {
            InvokeReduce(context, cudaContext, ptx, fullName, grid, block, smemSize, config, writeTarget, srcSlim, reductionSizeTyped, totalSlices, initValueTyped);
        }
        else
        {
            InvokeReduce(context, cudaContext, ptx, fullName, grid, block, smemSize, config, writeTarget, srcSlim, reductionSizeTyped, totalSlices, initValueTyped, extraArg);
        }
    }
    else
    {
        var deviceProps = context.DeviceInfoForContext(cudaContext);
        var block = GetNonContigReduceBlock(deviceProps);
        var grid = GetNoncontigReduceGrid(deviceProps, outElements);
        // This launch path requests no dynamic shared memory.
        uint smemSize = 0;

        var fullName = "noncontig_" + PermutationGenerator.GetMangledName(kernelName, config);

        // Unlike the contiguous path, the reduction stride is passed as an extra leading argument.
        if (extraArg == null)
        {
            InvokeReduce(context, cudaContext, ptx, fullName, grid, block, smemSize, config, writeTarget, srcSlim, reductionStrideTyped, reductionSizeTyped, totalSlices, initValueTyped);
        }
        else
        {
            InvokeReduce(context, cudaContext, ptx, fullName, grid, block, smemSize, config, writeTarget, srcSlim, reductionStrideTyped, reductionSizeTyped, totalSlices, initValueTyped, extraArg);
        }
    }

    return writeTarget;
}