Example #1
0
        /// <summary>
        /// Reduces all elements of <paramref name="src"/> with the named reduce-all kernel.
        /// Depending on <c>isTwoPassReductionSize</c>, either a two-pass launch that goes through the
        /// device scratch buffer or a single launch that writes straight to the output buffer is used.
        /// </summary>
        /// <param name="reduceAllKernels">Kernel set from which the PTX is obtained.</param>
        /// <param name="init">Initial value, converted via <c>ReduceInitConverter.GetInitValue</c>.</param>
        /// <param name="initType">Passed together with <paramref name="init"/> to <c>ReduceInitConverter.GetInitValue</c>.</param>
        /// <param name="kernelName">Base kernel name; it is mangled with the specialization and prefixed with <c>twoPassA_</c>, <c>twoPassB_</c> or <c>onePass_</c>.</param>
        /// <param name="result">Tensor handed to <c>TensorResultBuilder.GetWriteTarget</c> as the requested destination.</param>
        /// <param name="src">The tensor to reduce.</param>
        /// <param name="extraArg">Optional extra kernel argument; it is appended to the first-pass / single-pass launch only when it is not null.</param>
        /// <returns>
        /// The write target obtained from <c>TensorResultBuilder.GetWriteTarget</c>, or
        /// <paramref name="result"/> itself (without launching anything) when <paramref name="src"/> has zero dimensions.
        /// </returns>
        /// <exception cref="InvalidOperationException">Thrown when <paramref name="src"/> has more than <c>TSCudaContext.MaxDims</c> dimensions.</exception>
        public static NDArray Invoke(CudaReduceAllKernels reduceAllKernels, float init, ReduceInitType initType, string kernelName, NDArray result, NDArray src, object extraArg = null)
        {
            var deviceId    = CudaHelpers.GetDeviceId(src);
            var context     = CudaHelpers.TSContextForTensor(src);
            var cudaContext = context.CudaContextForDevice(deviceId);

            if (src.DimensionCount > TSCudaContext.MaxDims)
            {
                throw new InvalidOperationException("Tensors with dimension count > " + TSCudaContext.MaxDims + " are not supported");
            }

            var writeTarget = TensorResultBuilder.GetWriteTarget(result, src, false, 1);

            // Nothing to reduce: the caller-supplied result is returned as-is (not writeTarget).
            if (src.DimensionCount == 0)
            {
                return result;
            }

            var totalElements = src.ElementCount();
            var config        = new ApplySpecialization(src);

            // The (object) cast on the first arm is required. Without it the conditional's natural
            // type is ulong (uint converts implicitly to ulong), so a ulong would be boxed even when
            // 32-bit indexing is selected.
            object totalElementsTyped = config.Use32BitIndices ? (object)(uint)totalElements : (ulong)totalElements;
            object initValueTyped     = ReduceInitConverter.GetInitValue(init, initType, src.ElementType);

            dim3 grid;
            dim3 block;

            var ptx            = reduceAllKernels.GetPtx(context.Compiler);
            var fullKernelName = PermutationGenerator.GetMangledName(kernelName, config);

            var outputDevicePtr = CudaHelpers.GetBufferStart(writeTarget);

            if (isTwoPassReductionSize(totalElements))
            {
                // Pass 1: launch "twoPassA_" with the scratch buffer as its destination.
                getPass1ReduceBlockGrid(context, deviceId, totalElements, out grid, out block);
                uint smemSize = block.x * sizeof(float);

                var scratchSpace = context.ScratchSpaceForDevice(deviceId).buffer;

                // extraArg is only forwarded when present, so no null entry is added to the launch arguments.
                if (extraArg == null)
                {
                    InvokeReduceAll(context, cudaContext, ptx, "twoPassA_" + fullKernelName, grid, block, smemSize, config, src, totalElementsTyped, initValueTyped, scratchSpace);
                }
                else
                {
                    InvokeReduceAll(context, cudaContext, ptx, "twoPassA_" + fullKernelName, grid, block, smemSize, config, src, totalElementsTyped, initValueTyped, scratchSpace, extraArg);
                }

                // Pass 2: launch "twoPassB_" with the pass-1 block count, the scratch buffer and the output buffer.
                uint numPass1Blocks = grid.x;
                getPass2ReduceBlockGrid(context, deviceId, totalElements, out grid, out block);
                smemSize = block.x * sizeof(float);

                InvokeReduceAllPass2(context, cudaContext, ptx, "twoPassB_" + fullKernelName, grid, block, smemSize, config.Use32BitIndices, numPass1Blocks, initValueTyped, scratchSpace, outputDevicePtr);
            }
            else
            {
                // Single pass: the kernel is given the output buffer directly.
                getSinglePassReduceBlockGrid(totalElements, out grid, out block);
                uint smemSize = block.x * sizeof(float);

                if (extraArg == null)
                {
                    InvokeReduceAll(context, cudaContext, ptx, "onePass_" + fullKernelName, grid, block, smemSize, config, src, totalElementsTyped, initValueTyped, outputDevicePtr);
                }
                else
                {
                    InvokeReduceAll(context, cudaContext, ptx, "onePass_" + fullKernelName, grid, block, smemSize, config, src, totalElementsTyped, initValueTyped, outputDevicePtr, extraArg);
                }
            }

            return writeTarget;
        }
Example #2
0
        /// <summary>
        /// Reduces <paramref name="src"/> along dimension <paramref name="dim"/> with the named reduce kernel.
        /// The <c>contig_</c> kernel variant is launched when the stride of the reduced dimension is 1,
        /// otherwise the <c>noncontig_</c> variant is launched.
        /// </summary>
        /// <param name="reduceKernels">Kernel set from which the PTX is obtained.</param>
        /// <param name="kernelName">Base kernel name; it is mangled with the specialization and prefixed with <c>contig_</c> or <c>noncontig_</c>.</param>
        /// <param name="init">Initial value, converted via <c>ReduceInitConverter.GetInitValue</c>.</param>
        /// <param name="initType">Passed together with <paramref name="init"/> to <c>ReduceInitConverter.GetInitValue</c>.</param>
        /// <param name="result">Tensor handed to <c>TensorResultBuilder.GetWriteTarget</c> as the requested destination.</param>
        /// <param name="src">The tensor to reduce.</param>
        /// <param name="dim">Index into <c>src.Shape</c> / <c>src.Strides</c> of the dimension to reduce.</param>
        /// <param name="extraArg">Optional extra kernel argument; it is appended to the launch only when it is not null.</param>
        /// <returns>
        /// The write target obtained from <c>TensorResultBuilder.GetWriteTarget</c> for the shape of
        /// <paramref name="src"/> with entry <paramref name="dim"/> set to 1, or <paramref name="result"/>
        /// itself (without launching anything) when <paramref name="src"/> has zero dimensions.
        /// </returns>
        public static NDArray Invoke(CudaReduceKernels reduceKernels, string kernelName, float init, ReduceInitType initType, NDArray result, NDArray src, int dim, object extraArg = null)
        {
            if (src.DimensionCount == 0)
            {
                return result;
            }

            var context     = CudaHelpers.TSContextForTensor(src);
            var cudaContext = context.CudaContextForTensor(src);

            // Requested output shape: same as src, with the reduced dimension set to 1.
            var requiredOutputSize = (long[])src.Shape.Clone();

            requiredOutputSize[dim] = 1;
            var writeTarget = TensorResultBuilder.GetWriteTarget(result, src, false, requiredOutputSize);

            ThrowIfAnyTensorInvalid(writeTarget, src);

            var inElements      = src.ElementCount();
            var reductionSize   = src.Shape[dim];
            var reductionStride = src.Strides[dim];
            var outElements     = inElements / reductionSize;
            var contigReduction = reductionStride == 1;

            // We must make sure that when the tensor is passed to the kernel, src.Sizes[dim] is set to 1.
            // This includes for the purposes of determining which tensor specializations to use (changing
            // the dimension size to 1 may make the tensor non-contiguous).
            var newSizes = (long[])src.Shape.Clone();

            newSizes[dim] = 1;
            var srcSlim = new NDArray(newSizes, src.Strides, src.Storage, src.StorageOffset);

            var config = new ApplySpecialization(writeTarget, srcSlim);

            // The (object) cast on the first arm of each conditional is required. Without it the
            // conditional's natural type is ulong (uint converts implicitly to ulong), so a ulong would
            // be boxed even when 32-bit indexing is selected.
            object totalSlices          = config.Use32BitIndices ? (object)(uint)outElements : (ulong)outElements;
            object reductionSizeTyped   = config.Use32BitIndices ? (object)(uint)reductionSize : (ulong)reductionSize;
            object reductionStrideTyped = config.Use32BitIndices ? (object)(uint)reductionStride : (ulong)reductionStride;
            object initValueTyped       = ReduceInitConverter.GetInitValue(init, initType, src.ElementType);

            var ptx = reduceKernels.GetPtx(context.Compiler);

            if (contigReduction)
            {
                var  block    = GetContigReduceBlock(cudaContext, outElements, reductionSize);
                var  grid     = GetContigReduceGrid(outElements);
                uint smemSize = (uint)src.ElementType.Size() * block.x;

                var fullName = "contig_" + PermutationGenerator.GetMangledName(kernelName, config);

                // extraArg is only forwarded when present, so no null entry is added to the launch arguments.
                if (extraArg == null)
                {
                    InvokeReduce(context, cudaContext, ptx, fullName, grid, block, smemSize, config, writeTarget, srcSlim, reductionSizeTyped, totalSlices, initValueTyped);
                }
                else
                {
                    InvokeReduce(context, cudaContext, ptx, fullName, grid, block, smemSize, config, writeTarget, srcSlim, reductionSizeTyped, totalSlices, initValueTyped, extraArg);
                }
            }
            else
            {
                var  deviceProps = context.DeviceInfoForContext(cudaContext);
                var  block       = GetNonContigReduceBlock(deviceProps);
                var  grid        = GetNoncontigReduceGrid(deviceProps, outElements);
                uint smemSize    = 0;

                // Unlike the contiguous variant, this launch additionally receives the reduction stride.
                var fullName = "noncontig_" + PermutationGenerator.GetMangledName(kernelName, config);
                if (extraArg == null)
                {
                    InvokeReduce(context, cudaContext, ptx, fullName, grid, block, smemSize, config, writeTarget, srcSlim, reductionStrideTyped, reductionSizeTyped, totalSlices, initValueTyped);
                }
                else
                {
                    InvokeReduce(context, cudaContext, ptx, fullName, grid, block, smemSize, config, writeTarget, srcSlim, reductionStrideTyped, reductionSizeTyped, totalSlices, initValueTyped, extraArg);
                }
            }

            return writeTarget;
        }