/// <summary>
/// Launches the apply kernel derived from <paramref name="baseName"/> on the null stream,
/// sizing the grid from the element count of the first <c>Tensor</c> found in <paramref name="args"/>.
/// </summary>
/// <param name="context">Provides the device properties and the kernel cache.</param>
/// <param name="cudaContext">CUDA context that is made current before the launch.</param>
/// <param name="ptx">PTX image handed to the kernel cache lookup.</param>
/// <param name="baseName">Base kernel name; it is mangled with the specialization computed from the tensor arguments.</param>
/// <param name="args">Kernel arguments. Must contain at least one <c>Tensor</c>.</param>
public static void Invoke(TSCudaContext context, CudaContext cudaContext, byte[] ptx, string baseName, params object[] args)
{
    ThrowIfAnyTensorInvalid(args);

    cudaContext.SetCurrent();
    var deviceProps = context.DeviceInfoForContext(cudaContext);

    // Collect the tensor arguments once; the first one decides how many elements are processed.
    Tensor[] tensors = args.OfType<Tensor>().ToArray();
    long count = tensors.First().ElementCount();

    // The specialization must be computed before the arguments are converted for the device.
    var specialization = new ApplySpecialization(tensors);
    ConvertTensorArgs.Convert(cudaContext, specialization.Use32BitIndices, args);

    var blockDim = ApplyUtils.GetApplyBlock();
    var gridDim = ApplyUtils.GetApplyGrid(deviceProps, count);

    string mangledName = PermutationGenerator.GetMangledName(baseName, specialization);
    var launcher = context.KernelCache.Get(cudaContext, ptx, mangledName);
    launcher.GridDimensions = gridDim;
    launcher.BlockDimensions = blockDim;
    launcher.RunAsync(CUstream.NullStream, args);
}
/// <summary>
/// Validates the arguments and launches the scatter kernel that writes <paramref name="src"/>
/// into <paramref name="result"/> along <paramref name="dim"/> using <paramref name="indices"/>.
/// </summary>
/// <param name="result">Destination tensor; required, and returned to the caller.</param>
/// <param name="src">Source tensor; must have the same size as <paramref name="indices"/>.</param>
/// <param name="dim">Dimension to scatter along; must be in <c>[0, result.DimensionCount)</c>.</param>
/// <param name="indices">Index tensor; must have the same number of dimensions and the same size as <paramref name="src"/>.</param>
/// <returns><paramref name="result"/>.</returns>
/// <exception cref="ArgumentNullException"><paramref name="result"/> is null.</exception>
/// <exception cref="ArgumentOutOfRangeException"><paramref name="dim"/> is outside the dimensions of <paramref name="result"/>.</exception>
/// <exception cref="InvalidOperationException">The tensor shapes are not compatible.</exception>
public Tensor Scatter(Tensor result, Tensor src, int dim, Tensor indices)
{
    TSCudaContext context = CudaHelpers.TSContextForTensor(src);
    CudaContext cudaContext = context.CudaContextForTensor(src);

    if (result == null)
    {
        throw new ArgumentNullException(nameof(result));
    }

    if (result.DimensionCount != src.DimensionCount)
    {
        throw new InvalidOperationException("result and src must have same number of dimensions");
    }

    // The two bounds are alternatives: a value cannot be both negative and >= DimensionCount,
    // so combining them with a logical AND would make this guard unreachable.
    if (dim < 0 || dim >= result.DimensionCount)
    {
        throw new ArgumentOutOfRangeException(nameof(dim));
    }

    if (indices.DimensionCount != src.DimensionCount)
    {
        throw new InvalidOperationException("src and indices must have same number of dimensions");
    }

    if (!src.IsSameSizeAs(indices))
    {
        throw new InvalidOperationException("src and indices must be the same size");
    }

    if (!TensorResultBuilder.ArrayEqualExcept(src.Sizes, result.Sizes, dim))
    {
        throw new InvalidOperationException("result and src must be the same size except in dimension dim");
    }

    Tensor writeTarget = result;

    // One unit of work per index element.
    long nElement = indices.ElementCount();
    dim3 block = ApplyUtils.GetApplyBlock();
    dim3 grid = ApplyUtils.GetApplyGrid(context.DeviceInfoForContext(cudaContext), nElement);

    bool use32BitIndices = ApplyUtils.CanUse32BitIndexMath(writeTarget)
        && ApplyUtils.CanUse32BitIndexMath(src)
        && ApplyUtils.CanUse32BitIndexMath(indices);

    if (use32BitIndices)
    {
        // Dimension counts up to 3 get a dedicated kernel name; -1 selects the generic one.
        int dims = indices.DimensionCount <= 3 ? indices.DimensionCount : -1;
        string kernelName = MakeKernelName(ScatterBaseName, true, dims);
        Invoke(context, cudaContext, kernelName, grid, block, 0, CUstream.NullStream, true,
            writeTarget, src, indices, dim, (int)nElement);
    }
    else
    {
        string kernelName = MakeKernelName(ScatterBaseName, false, -1);
        Invoke(context, cudaContext, kernelName, grid, block, 0, CUstream.NullStream, false,
            writeTarget, src, indices, dim, nElement);
    }

    return writeTarget;
}
/// <summary>
/// Validates the arguments and launches <c>scatter_kernel</c>, writing <paramref name="src"/>
/// into <paramref name="result"/> along <paramref name="dim"/> using <paramref name="indices"/>.
/// Any exception is logged and then rethrown unchanged.
/// </summary>
/// <param name="result">Destination tensor; required, and returned to the caller.</param>
/// <param name="src">Source tensor; must have the same size as <paramref name="indices"/>.</param>
/// <param name="dim">Dimension to scatter along; must be in <c>[0, result.DimensionCount)</c>.</param>
/// <param name="indices">Index tensor; must have the same number of dimensions and the same size as <paramref name="src"/>.</param>
/// <returns><paramref name="result"/>.</returns>
/// <exception cref="ArgumentNullException"><paramref name="result"/> is null.</exception>
/// <exception cref="ArgumentOutOfRangeException"><paramref name="dim"/> is outside the dimensions of <paramref name="result"/>.</exception>
/// <exception cref="InvalidOperationException">The tensor shapes are not compatible.</exception>
public Tensor Scatter(Tensor result, Tensor src, int dim, Tensor indices)
{
    try
    {
        TSCudaContext context = CudaHelpers.TSContextForTensor(src);
        CudaContext cudaContext = context.CudaContextForTensor(src);

        if (result == null)
        {
            throw new ArgumentNullException(nameof(result));
        }

        if (result.DimensionCount != src.DimensionCount)
        {
            throw new InvalidOperationException($"result and src must have same number of dimensions. result dim count = '{result.DimensionCount}', source dim count = '{src.DimensionCount}'");
        }

        // The two bounds are alternatives: a value cannot be both negative and >= DimensionCount,
        // so combining them with a logical AND would make this guard unreachable.
        if (dim < 0 || dim >= result.DimensionCount)
        {
            throw new ArgumentOutOfRangeException(nameof(dim));
        }

        if (indices.DimensionCount != src.DimensionCount)
        {
            throw new InvalidOperationException("src and indices must have same number of dimensions");
        }

        if (!src.IsSameSizeAs(indices))
        {
            throw new InvalidOperationException("src and indices must be the same size");
        }

        if (!TensorResultBuilder.ArrayEqualExcept(src.Sizes, result.Sizes, dim))
        {
            throw new InvalidOperationException("result and src must be the same size except in dimension dim");
        }

        Tensor writeTarget = result;

        // One unit of work per index element.
        long nElement = indices.ElementCount();
        dim3 block = ApplyUtils.GetApplyBlock();
        dim3 grid = ApplyUtils.GetApplyGrid(context.DeviceInfoForContext(cudaContext), nElement);

        Invoke(context, cudaContext, "scatter_kernel", grid, block, 0, CUstream.NullStream, false,
            writeTarget, src, indices, dim, nElement);

        return writeTarget;
    }
    catch (Exception err)
    {
        // Log for diagnostics, then rethrow with the original stack trace intact.
        Logger.WriteLine($"Error = '{err.Message}', Call stack = '{err.StackTrace}'");
        throw;
    }
}
/// <summary>
/// Validates the arguments and launches <c>gather_kernel</c>, reading from <paramref name="src"/>
/// along <paramref name="dim"/> at the positions given by <paramref name="indices"/>.
/// </summary>
/// <param name="result">Optional destination. When supplied it must match <paramref name="indices"/> in size.</param>
/// <param name="src">Source tensor.</param>
/// <param name="dim">Dimension to gather along; must be in <c>[0, src.DimensionCount)</c>.</param>
/// <param name="indices">Index tensor; must have the same number of dimensions as <paramref name="src"/>.</param>
/// <returns>The write target obtained from <c>TensorResultBuilder.GetWriteTarget</c> for <paramref name="result"/> and the shape of <paramref name="indices"/>.</returns>
/// <exception cref="ArgumentOutOfRangeException"><paramref name="dim"/> is not a valid dimension.</exception>
/// <exception cref="InvalidOperationException">The tensor shapes are not compatible.</exception>
public Tensor Gather(Tensor result, Tensor src, int dim, Tensor indices)
{
    TSCudaContext context = CudaHelpers.TSContextForTensor(src);
    CudaContext cudaContext = context.CudaContextForTensor(src);

    if (result != null && result.DimensionCount != src.DimensionCount)
    {
        throw new InvalidOperationException("result and src must have same number of dimensions");
    }

    // Validate against src rather than result: once the check above has passed the two
    // dimension counts are equal, and this way dim is also range-checked when result is null.
    // The bounds are alternatives (logical OR); a value can never violate both at once.
    if (dim < 0 || dim >= src.DimensionCount)
    {
        throw new ArgumentOutOfRangeException(nameof(dim));
    }

    if (indices.DimensionCount != src.DimensionCount)
    {
        throw new InvalidOperationException("src and indices must have same number of dimensions");
    }

    if (result != null && !result.IsSameSizeAs(indices))
    {
        throw new InvalidOperationException("result and indices must be the same size");
    }

    if (result != null && !TensorResultBuilder.ArrayEqualExcept(src.Sizes, result.Sizes, dim))
    {
        throw new InvalidOperationException("result and src must be the same size except in dimension dim");
    }

    // The output takes its shape and allocator from indices and its element type from src.
    Tensor writeTarget = TensorResultBuilder.GetWriteTarget(result, indices.Allocator, src.ElementType, false, indices.Sizes);

    // One unit of work per index element.
    long nElement = indices.ElementCount();
    dim3 block = ApplyUtils.GetApplyBlock();
    dim3 grid = ApplyUtils.GetApplyGrid(context.DeviceInfoForContext(cudaContext), nElement);

    Invoke(context, cudaContext, "gather_kernel", grid, block, 0, CUstream.NullStream, false,
        writeTarget, src, indices, dim, nElement);

    return writeTarget;
}
/// <summary>
/// Launches the apply kernel derived from <paramref name="baseName"/> on the null stream,
/// sizing the grid from the element count of the first <c>NDArray</c> found in <paramref name="args"/>.
/// </summary>
/// <param name="context">Provides the device properties and the kernel cache.</param>
/// <param name="cudaContext">CUDA context used for argument conversion and kernel lookup.</param>
/// <param name="ptx">PTX image handed to the kernel cache lookup.</param>
/// <param name="baseName">Base kernel name; it is mangled with the specialization computed from the array arguments.</param>
/// <param name="args">Kernel arguments. Must contain at least one <c>NDArray</c>.</param>
public static void Invoke(TSCudaContext context, CudaContext cudaContext, byte[] ptx, string baseName, params object[] args)
{
    ThrowIfAnyTensorInvalid(args);

    var deviceProps = context.DeviceInfoForContext(cudaContext);

    // Collect the array arguments once; the first one decides how many elements are processed.
    NDArray[] arrays = args.OfType<NDArray>().ToArray();
    var count = arrays.First().ElementCount();

    // The specialization must be computed before the arguments are converted for the device.
    var specialization = new ApplySpecialization(arrays);
    ConvertTensorArgs.Convert(cudaContext, specialization.Use32BitIndices, args);

    var blockDim = ApplyUtils.GetApplyBlock();
    var gridDim = ApplyUtils.GetApplyGrid(deviceProps, count);

    var mangledName = PermutationGenerator.GetMangledName(baseName, specialization);
    var launcher = context.KernelCache.Get(cudaContext, ptx, mangledName);
    launcher.GridDimensions = gridDim;
    launcher.BlockDimensions = blockDim;
    launcher.RunAsync(CUstream.NullStream, args);
}
/// <summary>
/// Validates the arguments and launches <c>scatterFill_kernel</c>, writing <paramref name="value"/>
/// into <paramref name="result"/> along <paramref name="dim"/> at the positions given by <paramref name="indices"/>.
/// </summary>
/// <param name="result">Destination tensor; required, and returned to the caller.</param>
/// <param name="value">Scalar passed to the kernel as the fill value.</param>
/// <param name="dim">Dimension to scatter along; must be in <c>[0, result.DimensionCount)</c>.</param>
/// <param name="indices">Index tensor; must match <paramref name="result"/> in dimension count and in every size except along <paramref name="dim"/>.</param>
/// <returns><paramref name="result"/>.</returns>
/// <exception cref="ArgumentNullException"><paramref name="result"/> is null.</exception>
/// <exception cref="ArgumentOutOfRangeException"><paramref name="dim"/> is outside the dimensions of <paramref name="result"/>.</exception>
/// <exception cref="InvalidOperationException">The tensor shapes are not compatible.</exception>
public Tensor ScatterFill(Tensor result, float value, int dim, Tensor indices)
{
    TSCudaContext context = CudaHelpers.TSContextForTensor(indices);
    CudaContext cudaContext = context.CudaContextForTensor(indices);

    if (result == null)
    {
        throw new ArgumentNullException(nameof(result));
    }

    // The two bounds are alternatives: a value cannot be both negative and >= DimensionCount,
    // so combining them with a logical AND would make this guard unreachable.
    if (dim < 0 || dim >= result.DimensionCount)
    {
        throw new ArgumentOutOfRangeException(nameof(dim));
    }

    if (indices.DimensionCount != result.DimensionCount)
    {
        throw new InvalidOperationException("result and indices must have same number of dimensions");
    }

    if (!TensorResultBuilder.ArrayEqualExcept(indices.Sizes, result.Sizes, dim))
    {
        throw new InvalidOperationException("result and indices must be the same size except in dimension dim");
    }

    Tensor writeTarget = result;

    // One unit of work per index element.
    long nElement = indices.ElementCount();
    dim3 block = ApplyUtils.GetApplyBlock();
    dim3 grid = ApplyUtils.GetApplyGrid(context.DeviceInfoForContext(cudaContext), nElement);

    Invoke(context, cudaContext, "scatterFill_kernel", grid, block, 0, CUstream.NullStream, false,
        writeTarget, indices, value, dim, nElement);

    return writeTarget;
}
/// <summary>
/// Launches an index-select kernel over <paramref name="src"/> along <paramref name="dim"/>
/// for the entries of <paramref name="indices"/>, choosing the kernel variant from the number
/// of indices and from whether 32-bit index math can be used.
/// </summary>
/// <param name="result">Optional destination handed to <c>TensorResultBuilder.GetWriteTarget</c>.</param>
/// <param name="src">Source tensor.</param>
/// <param name="dim">Dimension along which entries are selected; passed to the kernel as both destination and source dimension.</param>
/// <param name="indices">Tensor holding the indices to select.</param>
/// <returns>The write target obtained from <c>TensorResultBuilder.GetWriteTarget</c>.</returns>
public Tensor IndexSelect(Tensor result, Tensor src, int dim, Tensor indices)
{
    const int threadsPerBlock = 128;

    TSCudaContext context = CudaHelpers.TSContextForTensor(src);
    CudaContext cudaContext = context.CudaContextForTensor(src);

    // Requested output shape: the shape of src with dimension `dim` set to 1.
    long[] outputShape = (long[])src.Sizes.Clone();
    outputShape[dim] = 1;
    Tensor writeTarget = TensorResultBuilder.GetWriteTarget(result, src, true, outputShape);

    // Two quantities drive the launch configuration:
    //  - the slice size, i.e. the destination element count divided by the number of indices;
    //  - the number of indices, i.e. the element count of `indices`.
    long indexCount = indices.ElementCount();
    long totalOutput = writeTarget.ElementCount();
    long selectDimSize = src.Sizes[dim];
    long perSlice = totalOutput / indexCount;

    // The grid is capped at eight blocks per multiprocessor.
    int multiProcessors = context.DeviceInfoForContext(cudaContext).MultiProcessorCount;
    int maxBlocks = multiProcessors * 8;

    dim3 gridForFewIndices = new dim3((uint)Math.Min(ApplyUtils.CeilDiv(perSlice, threadsPerBlock), maxBlocks));
    dim3 blockForFewIndices = new dim3((uint)Math.Min(perSlice, threadsPerBlock));
    dim3 gridForManyIndices = new dim3((uint)Math.Min(ApplyUtils.CeilDiv(totalOutput, threadsPerBlock), maxBlocks));
    dim3 blockForManyIndices = new dim3((uint)Math.Min(totalOutput, threadsPerBlock));

    // NOTE(review): the two views below are constructed but never read in this method. They are
    // kept because the Tensor constructor may have side effects - confirm before removing.
    long[] flatResultShape = (long[])writeTarget.Sizes.Clone();
    flatResultShape[dim] = 1;
    Tensor resultFlat = new Tensor(flatResultShape, writeTarget.Strides, writeTarget.Storage, writeTarget.StorageOffset);

    long[] flatSrcShape = (long[])src.Sizes.Clone();
    flatSrcShape[dim] = 1;
    Tensor srcFlat = new Tensor(flatSrcShape, src.Strides, src.Storage, src.StorageOffset);

    bool use32BitIndices = ApplyUtils.CanUse32BitIndexMath(writeTarget)
        && ApplyUtils.CanUse32BitIndexMath(src)
        && ApplyUtils.CanUse32BitIndexMath(indices);

    if (!use32BitIndices)
    {
        // 64-bit indexing always uses the large-index launch configuration.
        string wideKernelName = MakeKernelName(false, false, -1, -1, -1);
        Invoke(context, cudaContext, wideKernelName, gridForManyIndices, blockForManyIndices, 0, CUstream.NullStream, false,
            writeTarget, src, indices, dim, dim, totalOutput, perSlice, selectDimSize);
        return writeTarget;
    }

    // Threshold for the small kernel.
    bool fewIndices = indexCount <= 16;
    bool indicesContiguous = indices.IsContiguous();

    bool canSpecialize = writeTarget.DimensionCount == src.DimensionCount
        && writeTarget.DimensionCount <= 3
        && indicesContiguous;

    string kernelName = canSpecialize
        ? MakeKernelName(fewIndices, true, writeTarget.DimensionCount, src.DimensionCount, -2)
        : MakeKernelName(fewIndices, true, -1, -1, -1);

    dim3 grid;
    dim3 block;
    if (fewIndices)
    {
        grid = gridForFewIndices;
        block = blockForFewIndices;
    }
    else
    {
        grid = gridForManyIndices;
        block = blockForManyIndices;
    }

    Invoke(context, cudaContext, kernelName, grid, block, 0, CUstream.NullStream, true,
        writeTarget, src, indices, dim, dim, perSlice, selectDimSize);

    return writeTarget;
}
/// <summary>
/// Launches a reduction kernel that collapses dimension <paramref name="dim"/> of <paramref name="src"/>.
/// A "contig_" kernel is used when the stride of the reduced dimension is 1, otherwise a "noncontig_" kernel.
/// </summary>
/// <param name="reduceKernels">Source of the PTX image for the reduction kernels.</param>
/// <param name="kernelName">Base kernel name; mangled with the specialization and prefixed with the contiguity variant.</param>
/// <param name="init">Initial value, converted via <c>ReduceInitConverter.GetInitValue</c> for the element type of <paramref name="src"/>.</param>
/// <param name="initType">Selects how <paramref name="init"/> is interpreted by the converter.</param>
/// <param name="result">Optional destination handed to <c>TensorResultBuilder.GetWriteTarget</c>.</param>
/// <param name="src">Tensor to reduce.</param>
/// <param name="dim">Dimension to reduce; its size is 1 in the requested output shape.</param>
/// <param name="extraArg">Optional trailing kernel argument; appended only when not null.</param>
/// <returns>
/// The write target of the reduction, or <paramref name="result"/> unchanged (possibly null)
/// when <paramref name="src"/> has zero dimensions.
/// </returns>
public static Tensor Invoke(CudaReduceKernels reduceKernels, string kernelName, float init, ReduceInitType initType, Tensor result, Tensor src, int dim, object extraArg = null)
{
    if (src.DimensionCount == 0)
    {
        return result;
    }

    TSCudaContext context = CudaHelpers.TSContextForTensor(src);
    CudaContext cudaContext = context.CudaContextForTensor(src);

    long[] requiredOutputSize = (long[])src.Sizes.Clone();
    requiredOutputSize[dim] = 1;
    Tensor writeTarget = TensorResultBuilder.GetWriteTarget(result, src, false, requiredOutputSize);

    ThrowIfAnyTensorInvalid(writeTarget, src);

    long inElements = src.ElementCount();
    long reductionSize = src.Sizes[dim];
    long reductionStride = src.Strides[dim];
    long outElements = inElements / reductionSize;
    bool contigReduction = reductionStride == 1;

    // The tensor handed to the kernel must have src.Sizes[dim] set to 1. This also applies when
    // choosing the tensor specialization, because changing the dimension size to 1 may make the
    // tensor non-contiguous.
    long[] newSizes = (long[])src.Sizes.Clone();
    newSizes[dim] = 1;
    Tensor srcSlim = new Tensor(newSizes, src.Strides, src.Storage, src.StorageOffset);

    ApplySpecialization config = new ApplySpecialization(writeTarget, srcSlim);
    bool use32BitIndices = config.Use32BitIndices;

    // `cond ? (uint)x : (ulong)x` has natural type ulong, so it would box a ulong in both cases.
    // Casting the first operand to object keeps the boxed type equal to the selected index width.
    object totalSlices = use32BitIndices ? (object)(uint)outElements : (ulong)outElements;
    object reductionSizeTyped = use32BitIndices ? (object)(uint)reductionSize : (ulong)reductionSize;
    object reductionStrideTyped = use32BitIndices ? (object)(uint)reductionStride : (ulong)reductionStride;
    object initValueTyped = ReduceInitConverter.GetInitValue(init, initType, src.ElementType);

    byte[] ptx = reduceKernels.GetPtx(context.Compiler);

    if (contigReduction)
    {
        dim3 block = GetContigReduceBlock(cudaContext, outElements, reductionSize);
        dim3 grid = GetContigReduceGrid(outElements);

        // Shared memory request: one element of the source type per thread in the block's x extent.
        uint smemSize = (uint)src.ElementType.Size() * block.x;

        string fullName = "contig_" + PermutationGenerator.GetMangledName(kernelName, config);

        if (extraArg == null)
        {
            InvokeReduce(context, cudaContext, ptx, fullName, grid, block, smemSize, config,
                writeTarget, srcSlim, reductionSizeTyped, totalSlices, initValueTyped);
        }
        else
        {
            InvokeReduce(context, cudaContext, ptx, fullName, grid, block, smemSize, config,
                writeTarget, srcSlim, reductionSizeTyped, totalSlices, initValueTyped, extraArg);
        }
    }
    else
    {
        CudaDeviceProperties deviceProps = context.DeviceInfoForContext(cudaContext);

        dim3 block = GetNonContigReduceBlock(deviceProps);
        dim3 grid = GetNoncontigReduceGrid(deviceProps, outElements);
        uint smemSize = 0;

        string fullName = "noncontig_" + PermutationGenerator.GetMangledName(kernelName, config);

        // The non-contiguous kernels additionally take the stride of the reduced dimension.
        if (extraArg == null)
        {
            InvokeReduce(context, cudaContext, ptx, fullName, grid, block, smemSize, config,
                writeTarget, srcSlim, reductionStrideTyped, reductionSizeTyped, totalSlices, initValueTyped);
        }
        else
        {
            InvokeReduce(context, cudaContext, ptx, fullName, grid, block, smemSize, config,
                writeTarget, srcSlim, reductionStrideTyped, reductionSizeTyped, totalSlices, initValueTyped, extraArg);
        }
    }

    return writeTarget;
}