/// <summary>
        /// Validates the tensor arguments, selects the kernel specialization that matches them,
        /// and launches that kernel asynchronously on the null stream.
        /// </summary>
        /// <param name="context">Supplies device properties and the kernel cache.</param>
        /// <param name="cudaContext">CUDA context that is made current and used for the launch.</param>
        /// <param name="ptx">PTX image handed to the kernel cache.</param>
        /// <param name="baseName">Unmangled kernel name; the specialization is mangled into it.</param>
        /// <param name="args">Kernel arguments; must contain at least one <c>Tensor</c>.</param>
        /// <exception cref="InvalidOperationException">No <c>Tensor</c> is present in <paramref name="args"/>.</exception>
        public static void Invoke(TSCudaContext context, CudaContext cudaContext, byte[] ptx, string baseName, params object[] args)
        {
            ThrowIfAnyTensorInvalid(args);

            cudaContext.SetCurrent();

            var properties = context.DeviceInfoForContext(cudaContext);

            // Snapshot the tensors before the argument array is handed to ConvertTensorArgs.
            Tensor[] tensors = args.OfType<Tensor>().ToArray();

            // The first tensor's element count drives the launch grid size.
            long totalElements = tensors.First().ElementCount();
            var specialization = new ApplySpecialization(tensors);

            ConvertTensorArgs.Convert(cudaContext, specialization.Use32BitIndices, args);

            var launchBlock = ApplyUtils.GetApplyBlock();
            var launchGrid  = ApplyUtils.GetApplyGrid(properties, totalElements);

            string mangledName = PermutationGenerator.GetMangledName(baseName, specialization);
            var    compiled    = context.KernelCache.Get(cudaContext, ptx, mangledName);

            compiled.GridDimensions  = launchGrid;
            compiled.BlockDimensions = launchBlock;
            compiled.RunAsync(CUstream.NullStream, args);
        }
        /// <summary>
        /// Validates the operands and launches the scatter kernel over <paramref name="result"/>,
        /// <paramref name="src"/> and <paramref name="indices"/> along dimension <paramref name="dim"/>.
        /// A 32-bit-index kernel variant is chosen when all three tensors allow it.
        /// </summary>
        /// <param name="result">Tensor the kernel writes into; must not be null.</param>
        /// <param name="src">Source tensor; must have the same number of dimensions as <paramref name="result"/>.</param>
        /// <param name="dim">Dimension to scatter along; must be in <c>[0, result.DimensionCount)</c>.</param>
        /// <param name="indices">Index tensor; must be the same size as <paramref name="src"/>.</param>
        /// <returns><paramref name="result"/>.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="result"/> is null.</exception>
        /// <exception cref="ArgumentOutOfRangeException"><paramref name="dim"/> is outside the valid range.</exception>
        /// <exception cref="InvalidOperationException">The tensor shapes are incompatible.</exception>
        public Tensor Scatter(Tensor result, Tensor src, int dim, Tensor indices)
        {
            TSCudaContext context     = CudaHelpers.TSContextForTensor(src);
            CudaContext   cudaContext = context.CudaContextForTensor(src);

            if (result == null)
            {
                throw new ArgumentNullException("result");
            }

            if (result.DimensionCount != src.DimensionCount)
            {
                throw new InvalidOperationException("result and src must have same number of dimensions");
            }

            // The bounds must be OR-ed: no value is both negative and >= DimensionCount,
            // so an AND here would never reject anything.
            if (dim < 0 || dim >= result.DimensionCount)
            {
                throw new ArgumentOutOfRangeException("dim");
            }

            if (indices.DimensionCount != src.DimensionCount)
            {
                throw new InvalidOperationException("src and indices must have same number of dimensions");
            }

            if (!src.IsSameSizeAs(indices))
            {
                throw new InvalidOperationException("src and indices must be the same size");
            }

            if (!TensorResultBuilder.ArrayEqualExcept(src.Sizes, result.Sizes, dim))
            {
                throw new InvalidOperationException("result and src must be the same size except in dimension dim");
            }

            Tensor writeTarget = result;

            // One kernel work item per index element.
            long nElement = indices.ElementCount();
            dim3 block    = ApplyUtils.GetApplyBlock();
            dim3 grid     = ApplyUtils.GetApplyGrid(context.DeviceInfoForContext(cudaContext), nElement);

            if (ApplyUtils.CanUse32BitIndexMath(writeTarget) &&
                ApplyUtils.CanUse32BitIndexMath(src) &&
                ApplyUtils.CanUse32BitIndexMath(indices))
            {
                // Up to 3 dimensions get a dimension-specific kernel name; anything larger uses -1.
                int    dims       = indices.DimensionCount <= 3 ? indices.DimensionCount : -1;
                string kernelName = MakeKernelName(ScatterBaseName, true, dims);
                Invoke(context, cudaContext, kernelName, grid, block, 0, CUstream.NullStream, true,
                       writeTarget, src, indices, dim, (int)nElement);
            }
            else
            {
                string kernelName = MakeKernelName(ScatterBaseName, false, -1);
                Invoke(context, cudaContext, kernelName, grid, block, 0, CUstream.NullStream, false,
                       writeTarget, src, indices, dim, nElement);
            }

            return(writeTarget);
        }
Beispiel #3
0
        /// <summary>
        /// Validates the operands and launches <c>scatter_kernel</c> over <paramref name="result"/>,
        /// <paramref name="src"/> and <paramref name="indices"/> along dimension <paramref name="dim"/>.
        /// Any exception is logged and then rethrown unchanged.
        /// </summary>
        /// <param name="result">Tensor the kernel writes into; must not be null.</param>
        /// <param name="src">Source tensor; must have the same number of dimensions as <paramref name="result"/>.</param>
        /// <param name="dim">Dimension to scatter along; must be in <c>[0, result.DimensionCount)</c>.</param>
        /// <param name="indices">Index tensor; must be the same size as <paramref name="src"/>.</param>
        /// <returns><paramref name="result"/>.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="result"/> is null.</exception>
        /// <exception cref="ArgumentOutOfRangeException"><paramref name="dim"/> is outside the valid range.</exception>
        /// <exception cref="InvalidOperationException">The tensor shapes are incompatible.</exception>
        public Tensor Scatter(Tensor result, Tensor src, int dim, Tensor indices)
        {
            try
            {
                TSCudaContext context     = CudaHelpers.TSContextForTensor(src);
                CudaContext   cudaContext = context.CudaContextForTensor(src);

                if (result == null)
                {
                    throw new ArgumentNullException("result");
                }

                if (result.DimensionCount != src.DimensionCount)
                {
                    throw new InvalidOperationException($"result and src must have same number of dimensions. result dim count = '{result.DimensionCount}', source dim count = '{src.DimensionCount}'");
                }

                // The bounds must be OR-ed: no value is both negative and >= DimensionCount,
                // so an AND here would never reject anything.
                if (dim < 0 || dim >= result.DimensionCount)
                {
                    throw new ArgumentOutOfRangeException("dim");
                }

                if (indices.DimensionCount != src.DimensionCount)
                {
                    throw new InvalidOperationException("src and indices must have same number of dimensions");
                }

                if (!src.IsSameSizeAs(indices))
                {
                    throw new InvalidOperationException("src and indices must be the same size");
                }

                if (!TensorResultBuilder.ArrayEqualExcept(src.Sizes, result.Sizes, dim))
                {
                    throw new InvalidOperationException("result and src must be the same size except in dimension dim");
                }

                Tensor writeTarget = result;

                // One kernel work item per index element.
                long nElement = indices.ElementCount();
                dim3 block    = ApplyUtils.GetApplyBlock();
                dim3 grid     = ApplyUtils.GetApplyGrid(context.DeviceInfoForContext(cudaContext), nElement);

                Invoke(context, cudaContext, "scatter_kernel", grid, block, 0, CUstream.NullStream, false, writeTarget, src, indices, dim, nElement);

                return(writeTarget);
            }
            catch (Exception err)
            {
                // Log for diagnostics, then rethrow with the original stack trace intact.
                Logger.WriteLine($"Error = '{err.Message}', Call stack = '{err.StackTrace}'");
                throw;
            }
        }
Beispiel #4
0
        /// <summary>
        /// Validates the operands and launches <c>gather_kernel</c> over the write target,
        /// <paramref name="src"/> and <paramref name="indices"/> along dimension <paramref name="dim"/>.
        /// </summary>
        /// <param name="result">
        /// Optional destination. When non-null it must have the same number of dimensions as
        /// <paramref name="src"/> and the same size as <paramref name="indices"/>. It is passed to
        /// <c>TensorResultBuilder.GetWriteTarget</c> together with the sizes of <paramref name="indices"/>.
        /// </param>
        /// <param name="src">Source tensor.</param>
        /// <param name="dim">
        /// Dimension to gather along. It is range-checked against <paramref name="result"/> only when
        /// <paramref name="result"/> is non-null.
        /// </param>
        /// <param name="indices">Index tensor; must have the same number of dimensions as <paramref name="src"/>.</param>
        /// <returns>The write target obtained from <c>TensorResultBuilder.GetWriteTarget</c>.</returns>
        /// <exception cref="ArgumentOutOfRangeException"><paramref name="dim"/> is outside <c>[0, result.DimensionCount)</c>.</exception>
        /// <exception cref="InvalidOperationException">The tensor shapes are incompatible.</exception>
        public Tensor Gather(Tensor result, Tensor src, int dim, Tensor indices)
        {
            TSCudaContext context     = CudaHelpers.TSContextForTensor(src);
            CudaContext   cudaContext = context.CudaContextForTensor(src);

            if (result != null && result.DimensionCount != src.DimensionCount)
            {
                throw new InvalidOperationException("result and src must have same number of dimensions");
            }

            // The bounds must be OR-ed (and grouped): no value is both negative and
            // >= DimensionCount, so an AND here would never reject anything.
            if (result != null && (dim < 0 || dim >= result.DimensionCount))
            {
                throw new ArgumentOutOfRangeException("dim");
            }

            if (indices.DimensionCount != src.DimensionCount)
            {
                throw new InvalidOperationException("src and indices must have same number of dimensions");
            }

            if (result != null && !result.IsSameSizeAs(indices))
            {
                throw new InvalidOperationException("result and indices must be the same size");
            }

            if (result != null && !TensorResultBuilder.ArrayEqualExcept(src.Sizes, result.Sizes, dim))
            {
                throw new InvalidOperationException("result and src must be the same size except in dimension dim");
            }

            Tensor writeTarget = TensorResultBuilder.GetWriteTarget(result, indices.Allocator, src.ElementType, false, indices.Sizes);

            // One kernel work item per index element.
            long nElement = indices.ElementCount();
            dim3 block    = ApplyUtils.GetApplyBlock();
            dim3 grid     = ApplyUtils.GetApplyGrid(context.DeviceInfoForContext(cudaContext), nElement);

            Invoke(context, cudaContext, "gather_kernel", grid, block, 0, CUstream.NullStream, false, writeTarget, src, indices, dim, nElement);

            return(writeTarget);
        }
Beispiel #5
0
        /// <summary>
        /// Validates the tensor arguments, selects the kernel specialization that matches them,
        /// and launches that kernel asynchronously on the null stream.
        /// </summary>
        /// <param name="context">Supplies device properties and the kernel cache.</param>
        /// <param name="cudaContext">CUDA context used for argument conversion and kernel lookup.</param>
        /// <param name="ptx">PTX image handed to the kernel cache.</param>
        /// <param name="baseName">Unmangled kernel name; the specialization is mangled into it.</param>
        /// <param name="args">Kernel arguments; must contain at least one <c>NDArray</c>.</param>
        /// <exception cref="InvalidOperationException">No <c>NDArray</c> is present in <paramref name="args"/>.</exception>
        public static void Invoke(TSCudaContext context, CudaContext cudaContext, byte[] ptx, string baseName, params object[] args)
        {
            ThrowIfAnyTensorInvalid(args);

            var properties = context.DeviceInfoForContext(cudaContext);

            // Snapshot the arrays before the argument list is handed to ConvertTensorArgs.
            var tensors = args.OfType<NDArray>().ToArray();

            // The first array's element count drives the launch grid size.
            var totalElements  = tensors.First().ElementCount();
            var specialization = new ApplySpecialization(tensors);

            ConvertTensorArgs.Convert(cudaContext, specialization.Use32BitIndices, args);

            var launchBlock = ApplyUtils.GetApplyBlock();
            var launchGrid  = ApplyUtils.GetApplyGrid(properties, totalElements);

            var mangledName = PermutationGenerator.GetMangledName(baseName, specialization);
            var compiled    = context.KernelCache.Get(cudaContext, ptx, mangledName);

            compiled.GridDimensions  = launchGrid;
            compiled.BlockDimensions = launchBlock;
            compiled.RunAsync(CUstream.NullStream, args);
        }
Beispiel #6
0
        /// <summary>
        /// Validates the operands and launches <c>scatterFill_kernel</c> over <paramref name="result"/>
        /// and <paramref name="indices"/> with the scalar <paramref name="value"/> along dimension
        /// <paramref name="dim"/>.
        /// </summary>
        /// <param name="result">Tensor the kernel writes into; must not be null.</param>
        /// <param name="value">Scalar passed to the kernel.</param>
        /// <param name="dim">Dimension to scatter along; must be in <c>[0, result.DimensionCount)</c>.</param>
        /// <param name="indices">
        /// Index tensor; must have the same number of dimensions as <paramref name="result"/> and the
        /// same size except in dimension <paramref name="dim"/>.
        /// </param>
        /// <returns><paramref name="result"/>.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="result"/> is null.</exception>
        /// <exception cref="ArgumentOutOfRangeException"><paramref name="dim"/> is outside the valid range.</exception>
        /// <exception cref="InvalidOperationException">The tensor shapes are incompatible.</exception>
        public Tensor ScatterFill(Tensor result, float value, int dim, Tensor indices)
        {
            TSCudaContext context     = CudaHelpers.TSContextForTensor(indices);
            CudaContext   cudaContext = context.CudaContextForTensor(indices);

            if (result == null)
            {
                throw new ArgumentNullException("result");
            }

            // The bounds must be OR-ed: no value is both negative and >= DimensionCount,
            // so an AND here would never reject anything.
            if (dim < 0 || dim >= result.DimensionCount)
            {
                throw new ArgumentOutOfRangeException("dim");
            }

            if (indices.DimensionCount != result.DimensionCount)
            {
                throw new InvalidOperationException("result and indices must have same number of dimensions");
            }

            if (!TensorResultBuilder.ArrayEqualExcept(indices.Sizes, result.Sizes, dim))
            {
                throw new InvalidOperationException("result and indices must be the same size except in dimension dim");
            }

            Tensor writeTarget = result;

            // One kernel work item per index element.
            long nElement = indices.ElementCount();
            dim3 block    = ApplyUtils.GetApplyBlock();
            dim3 grid     = ApplyUtils.GetApplyGrid(context.DeviceInfoForContext(cudaContext), nElement);

            Invoke(context, cudaContext, "scatterFill_kernel", grid, block, 0, CUstream.NullStream, false,
                   writeTarget, indices, value, dim, nElement);

            return(writeTarget);
        }
        /// <summary>
        /// Launches an index-select kernel that reads from <paramref name="src"/> along dimension
        /// <paramref name="dim"/> using <paramref name="indices"/> and writes into the write target.
        /// </summary>
        /// <param name="result">Passed to <c>TensorResultBuilder.GetWriteTarget</c> to obtain the write target.</param>
        /// <param name="src">Source tensor; its CUDA context is used for the launch.</param>
        /// <param name="dim">Dimension being indexed. It is not range-checked in this method.</param>
        /// <param name="indices">Index tensor; its element count is used as the number of indices.</param>
        /// <returns>The write target obtained from <c>TensorResultBuilder.GetWriteTarget</c>.</returns>
        public Tensor IndexSelect(Tensor result, Tensor src, int dim, Tensor indices)
        {
            TSCudaContext context     = CudaHelpers.TSContextForTensor(src);
            CudaContext   cudaContext = context.CudaContextForTensor(src);

            // The requested output shape is src's shape with dimension `dim` forced to 1.
            // NOTE(review): the size in `dim` does not depend on the number of indices - confirm
            // this is intended when `indices` holds more than one element.
            long[] requiredOutputSize = (long[])src.Sizes.Clone();
            requiredOutputSize[dim] = 1;
            Tensor writeTarget = TensorResultBuilder.GetWriteTarget(result, src, true, requiredOutputSize);


            // The `src` is partitioned into two parts:
            // -the size of each slice we are indexing, which is the
            // total size of the tensor ignoring dimension `dim`;
            // -the number of indices we are choosing, which is the total size
            // of the tensor `indices`.
            long numIndices       = indices.ElementCount();
            long dstTotalSize     = writeTarget.ElementCount();
            long srcSelectDimSize = src.Sizes[dim];
            // Integer division; throws DivideByZeroException when `indices` has no elements.
            long sliceSize        = dstTotalSize / numIndices;

            // Two launch configurations are prepared: one sized by the slice, one by the whole
            // destination. Blocks are capped at 128 threads and grids at 8 per multiprocessor.
            int  mpc             = context.DeviceInfoForContext(cudaContext).MultiProcessorCount;
            dim3 smallIndexGrid  = new dim3((uint)Math.Min(ApplyUtils.CeilDiv(sliceSize, 128), (mpc * 8)));
            dim3 smallIndexBlock = new dim3((uint)Math.Min(sliceSize, 128));

            dim3 largeIndexGrid  = new dim3((uint)Math.Min(ApplyUtils.CeilDiv(dstTotalSize, 128), (mpc * 8)));
            dim3 largeIndexBlock = new dim3((uint)Math.Min(dstTotalSize, 128));


            // NOTE(review): resultFlat and srcFlat are constructed but never read afterwards in
            // this method; the kernels below receive writeTarget and src directly. Whether the
            // Tensor constructor has side effects on the shared Storage is not visible here -
            // confirm before removing them.
            long[] newResultSize = (long[])writeTarget.Sizes.Clone();
            newResultSize[dim] = 1;
            Tensor resultFlat = new Tensor(newResultSize, writeTarget.Strides, writeTarget.Storage, writeTarget.StorageOffset);

            long[] newSrcSize = (long[])src.Sizes.Clone();
            newSrcSize[dim] = 1;
            Tensor srcFlat = new Tensor(newSrcSize, src.Strides, src.Storage, src.StorageOffset);


            if (ApplyUtils.CanUse32BitIndexMath(writeTarget) &&
                ApplyUtils.CanUse32BitIndexMath(src) &&
                ApplyUtils.CanUse32BitIndexMath(indices))
            {
                // Threshold for small kernel
                bool   smallKernel = numIndices <= 16;
                string kernelName  = "";
                bool   indContig   = indices.IsContiguous();

                // A dimension-specific kernel name is used only when both tensors have the same
                // rank, that rank is at most 3, and the indices are contiguous; otherwise the
                // generic (-1, -1, -1) name is used.
                if (writeTarget.DimensionCount == src.DimensionCount &&
                    writeTarget.DimensionCount <= 3 &&
                    indContig)
                {
                    kernelName = MakeKernelName(smallKernel, true, writeTarget.DimensionCount, src.DimensionCount, -2);
                }
                else
                {
                    kernelName = MakeKernelName(smallKernel, true, -1, -1, -1);
                }

                dim3 grid  = smallKernel ? smallIndexGrid : largeIndexGrid;
                dim3 block = smallKernel ? smallIndexBlock : largeIndexBlock;
                // `dim` is passed twice - presumably as the destination and source select
                // dimensions; verify against the kernel signature.
                // NOTE(review): this path does not pass dstTotalSize, unlike the 64-bit path
                // below, even when the large-index launch configuration is selected - confirm
                // against the kernel signatures.
                Invoke(context, cudaContext, kernelName, grid, block, 0, CUstream.NullStream, true,
                       writeTarget, src, indices, dim, dim, sliceSize, srcSelectDimSize);
            }
            else
            {
                // Fallback when 32-bit index math is not possible: always the large-index launch
                // configuration, with dstTotalSize passed as an extra argument.
                string kernelName = MakeKernelName(false, false, -1, -1, -1);

                Invoke(context, cudaContext, kernelName, largeIndexGrid, largeIndexBlock, 0, CUstream.NullStream, false,
                       writeTarget, src, indices, dim, dim, dstTotalSize, sliceSize, srcSelectDimSize);
            }



            return(writeTarget);
        }
Beispiel #8
0
        /// <summary>
        /// Launches a reduction kernel that collapses dimension <paramref name="dim"/> of
        /// <paramref name="src"/> to size 1 and writes the outcome into the write target.
        /// A "contig_" kernel is used when the stride of <paramref name="dim"/> is 1, otherwise a
        /// "noncontig_" kernel.
        /// </summary>
        /// <param name="reduceKernels">Provides the PTX containing the reduction kernels.</param>
        /// <param name="kernelName">Base kernel name; the specialization is mangled into it.</param>
        /// <param name="init">Initial value, converted to the element type together with <paramref name="initType"/>.</param>
        /// <param name="initType">How <paramref name="init"/> is interpreted by <c>ReduceInitConverter.GetInitValue</c>.</param>
        /// <param name="result">Passed to <c>TensorResultBuilder.GetWriteTarget</c> to obtain the write target.</param>
        /// <param name="src">Tensor being reduced.</param>
        /// <param name="dim">Dimension to reduce. It is not range-checked in this method.</param>
        /// <param name="extraArg">Optional trailing kernel argument; appended only when non-null.</param>
        /// <returns>
        /// The write target, or <paramref name="result"/> unchanged when <paramref name="src"/> has
        /// zero dimensions.
        /// </returns>
        public static Tensor Invoke(CudaReduceKernels reduceKernels, string kernelName, float init, ReduceInitType initType, Tensor result, Tensor src, int dim, object extraArg = null)
        {
            if (src.DimensionCount == 0)
            {
                return(result);
            }

            TSCudaContext context     = CudaHelpers.TSContextForTensor(src);
            CudaContext   cudaContext = context.CudaContextForTensor(src);

            long[] requiredOutputSize = (long[])src.Sizes.Clone();
            requiredOutputSize[dim] = 1;
            Tensor writeTarget = TensorResultBuilder.GetWriteTarget(result, src, false, requiredOutputSize);

            ThrowIfAnyTensorInvalid(writeTarget, src);

            long inElements      = src.ElementCount();
            long reductionSize   = src.Sizes[dim];
            long reductionStride = src.Strides[dim];
            long outElements     = inElements / reductionSize;
            bool contigReduction = reductionStride == 1;


            // We must make sure that when the tensor is passed to the kernel, src.Sizes[dim] is set to 1.
            // This includes for the purposes of determining which tensor specializations to use (changing
            // the dimension size to 1 may make the tensor non-contiguous).
            long[] newSizes = (long[])src.Sizes.Clone();
            newSizes[dim] = 1;
            Tensor srcSlim = new Tensor(newSizes, src.Strides, src.Storage, src.StorageOffset);

            ApplySpecialization config  = new ApplySpecialization(writeTarget, srcSlim);

            // Each branch is boxed separately. A bare `cond ? (uint)x : (ulong)x` has the natural
            // type ulong (uint converts implicitly to ulong), so it would box a ulong on the
            // 32-bit path as well.
            object totalSlices          = config.Use32BitIndices ? (object)(uint)outElements : (object)(ulong)outElements;
            object reductionSizeTyped   = config.Use32BitIndices ? (object)(uint)reductionSize : (object)(ulong)reductionSize;
            object reductionStrideTyped = config.Use32BitIndices ? (object)(uint)reductionStride : (object)(ulong)reductionStride;
            object initValueTyped       = ReduceInitConverter.GetInitValue(init, initType, src.ElementType);

            byte[] ptx = reduceKernels.GetPtx(context.Compiler);

            if (contigReduction)
            {
                dim3 block    = GetContigReduceBlock(cudaContext, outElements, reductionSize);
                dim3 grid     = GetContigReduceGrid(outElements);
                // Shared memory request: one element of the source type per thread in the block.
                uint smemSize = (uint)src.ElementType.Size() * block.x;

                string fullName = "contig_" + PermutationGenerator.GetMangledName(kernelName, config);
                if (extraArg == null)
                {
                    InvokeReduce(context, cudaContext, ptx, fullName, grid, block, smemSize, config, writeTarget, srcSlim, reductionSizeTyped, totalSlices, initValueTyped);
                }
                else
                {
                    InvokeReduce(context, cudaContext, ptx, fullName, grid, block, smemSize, config, writeTarget, srcSlim, reductionSizeTyped, totalSlices, initValueTyped, extraArg);
                }
            }
            else
            {
                CudaDeviceProperties deviceProps = context.DeviceInfoForContext(cudaContext);
                dim3 block    = GetNonContigReduceBlock(deviceProps);
                dim3 grid     = GetNoncontigReduceGrid(deviceProps, outElements);
                uint smemSize = 0;

                // The non-contiguous kernels additionally receive the reduction stride.
                string fullName = "noncontig_" + PermutationGenerator.GetMangledName(kernelName, config);
                if (extraArg == null)
                {
                    InvokeReduce(context, cudaContext, ptx, fullName, grid, block, smemSize, config, writeTarget, srcSlim, reductionStrideTyped, reductionSizeTyped, totalSlices, initValueTyped);
                }
                else
                {
                    InvokeReduce(context, cudaContext, ptx, fullName, grid, block, smemSize, config, writeTarget, srcSlim, reductionStrideTyped, reductionSizeTyped, totalSlices, initValueTyped, extraArg);
                }
            }

            return(writeTarget);
        }