/// <summary>
/// Emits, for every 4-tensor specialization, a CUDA kernel that applies a
/// caller-supplied element-wise operator to four float tensors in a grid-stride loop.
/// </summary>
/// <param name="kernelBaseName">Base name from which each specialized kernel name is mangled.</param>
/// <param name="operatorCode">CUDA C++ operator body; it receives float pointers a, b, c and d.</param>
public void AddApplyTTTT(string kernelBaseName, string operatorCode)
{
    foreach (ApplySpecialization specialization in ApplySpecialization.AllSpecializations(4))
    {
        string mangledName = GetMangledName(kernelBaseName, specialization);
        string idxType = specialization.Use32BitIndices ? ApplySpecialization.IndexType32 : ApplySpecialization.IndexType64;
        string dimA = specialization.TensorDims[0].ToString();
        string dimB = specialization.TensorDims[1].ToString();
        string dimC = specialization.TensorDims[2].ToString();
        string dimD = specialization.TensorDims[3].ToString();

        // Functor type wrapping the caller-provided operator body.
        sb.AppendLine($"struct ConcreteOp_{mangledName} {{ __device__ __forceinline__ void operator()(float* a, float *b, float *c, float *d) const {{ {operatorCode} }} }};");
        sb.AppendLine("extern \"C\" {");
        sb.AppendLine($"   __global__ void {mangledName}(TensorInfo<{idxType}> tensorA, TensorInfo<{idxType}> tensorB, TensorInfo<{idxType}> tensorC, TensorInfo<{idxType}> tensorD, __int64 totalElements)");
        sb.AppendLine("   {");
        sb.AppendLine($"      for ({idxType} linearIndex = blockIdx.x * blockDim.x + threadIdx.x;linearIndex < totalElements;linearIndex += gridDim.x * blockDim.x)");
        sb.AppendLine("      {");
        sb.AppendLine($"        const {idxType} aOffset = IndexToOffset < {idxType}, {dimA}>::get(linearIndex, tensorA);");
        sb.AppendLine($"        const {idxType} bOffset = IndexToOffset < {idxType}, {dimB}>::get(linearIndex, tensorB);");
        sb.AppendLine($"        const {idxType} cOffset = IndexToOffset < {idxType}, {dimC}>::get(linearIndex, tensorC);");
        sb.AppendLine($"        const {idxType} dOffset = IndexToOffset < {idxType}, {dimD}>::get(linearIndex, tensorD);");
        sb.AppendLine($"        ConcreteOp_{mangledName}()(&tensorA.data[aOffset], &tensorB.data[bOffset], &tensorC.data[cOffset], &tensorD.data[dOffset]);");
        sb.AppendLine("      }");
        sb.AppendLine("   }");
        sb.AppendLine("}");
    }
}
/// <summary>
/// Emits, for every 2-tensor specialization, a CUDA kernel that applies a
/// caller-supplied operator to two float tensors plus two scalar parameters (c, d).
/// </summary>
/// <param name="kernelBaseName">Base name from which each specialized kernel name is mangled.</param>
/// <param name="operatorCode">CUDA C++ operator body; it receives float pointers a and b and may read fields c and d.</param>
public void AddApplyTTSS(string kernelBaseName, string operatorCode)
{
    foreach (ApplySpecialization specialization in ApplySpecialization.AllSpecializations(2))
    {
        string mangledName = GetMangledName(kernelBaseName, specialization);
        string idxType = specialization.Use32BitIndices ? ApplySpecialization.IndexType32 : ApplySpecialization.IndexType64;
        string dimA = specialization.TensorDims[0].ToString();
        string dimB = specialization.TensorDims[1].ToString();

        // Functor holding the two scalar arguments as fields so the operator body can use them.
        sb.AppendLine($"struct ConcreteOp_{mangledName} {{");
        sb.AppendLine("float c;");
        sb.AppendLine("float d;");
        sb.AppendLine($"__device__ ConcreteOp_{mangledName}(float cVal, float dVal) {{ this->c = cVal; this->d = dVal; }}");
        sb.AppendLine($"__device__ __forceinline__ void operator()(float* a, float *b) const {{ {operatorCode} }} }};");
        sb.AppendLine("extern \"C\" {");
        sb.AppendLine($"   __global__ void {mangledName}(TensorInfo<{idxType}> tensorA, TensorInfo<{idxType}> tensorB, float c, float d, __int64 totalElements)");
        sb.AppendLine("   {");
        sb.AppendLine($"      for ({idxType} linearIndex = blockIdx.x * blockDim.x + threadIdx.x;linearIndex < totalElements;linearIndex += gridDim.x * blockDim.x)");
        sb.AppendLine("      {");
        sb.AppendLine($"        const {idxType} aOffset = IndexToOffset < {idxType}, {dimA}>::get(linearIndex, tensorA);");
        sb.AppendLine($"        const {idxType} bOffset = IndexToOffset < {idxType}, {dimB}>::get(linearIndex, tensorB);");
        sb.AppendLine($"        ConcreteOp_{mangledName} op = ConcreteOp_{mangledName}(c, d);");
        sb.AppendLine($"        op(&tensorA.data[aOffset], &tensorB.data[bOffset]);");
        sb.AppendLine("      }");
        sb.AppendLine("   }");
        sb.AppendLine("}");
    }
}
/// <summary>
/// Resolves the kernel specialization for the given tensor arguments, converts the
/// tensors to kernel-ready form, and launches the kernel asynchronously on the null stream.
/// </summary>
/// <param name="context">Owning TS CUDA context (kernel cache, device info).</param>
/// <param name="cudaContext">CUDA context to launch on; made current before launch.</param>
/// <param name="ptx">Compiled PTX module containing the kernel.</param>
/// <param name="baseName">Unmangled base kernel name.</param>
/// <param name="args">Kernel arguments; all Tensor instances among them drive specialization.</param>
public static void Invoke(TSCudaContext context, CudaContext cudaContext, byte[] ptx, string baseName, params object[] args)
{
    ThrowIfAnyTensorInvalid(args);

    cudaContext.SetCurrent();

    CudaDeviceProperties deviceInfo = context.DeviceInfoForContext(cudaContext);

    IEnumerable<Tensor> tensorArgs = args.OfType<Tensor>();
    Tensor first = tensorArgs.First();
    long elementCount = first.ElementCount();

    // Choose the 32/64-bit index specialization and rewrite tensor args in place.
    ApplySpecialization specialization = new ApplySpecialization(tensorArgs.ToArray());
    ConvertTensorArgs.Convert(cudaContext, specialization.Use32BitIndices, args);

    ManagedCuda.VectorTypes.dim3 block = ApplyUtils.GetApplyBlock();
    ManagedCuda.VectorTypes.dim3 grid = ApplyUtils.GetApplyGrid(deviceInfo, elementCount);

    string mangledName = PermutationGenerator.GetMangledName(baseName, specialization);
    CudaKernel kernel = context.KernelCache.Get(cudaContext, ptx, mangledName);
    kernel.GridDimensions = grid;
    kernel.BlockDimensions = block;

    kernel.RunAsync(CUstream.NullStream, args);
}
/// <summary>
/// Pre-compiles PTX for every apply specialization of the given tensor count,
/// so later kernel launches hit the template's compiled cache.
/// </summary>
/// <param name="compiler">CUDA compiler used to produce the PTX.</param>
/// <param name="template">Kernel template to compile configurations for.</param>
/// <param name="tensorCount">Number of tensor operands the kernel takes.</param>
public static void ApplyPrecompile(CudaCompiler compiler, DeviceKernelTemplate template, int tensorCount)
{
    foreach (ApplySpecialization specialization in ApplySpecialization.AllSpecializations(tensorCount))
    {
        template.PtxForConfig(compiler, specialization.GetConfig());
    }
}
/// <summary>
/// Emits the REDUCE_ALL_NORM_KERNELS macro instantiation for every single-tensor specialization.
/// </summary>
/// <param name="kernelBaseName">Name of the kernel base.</param>
public void AddReduceAllNorm(string kernelBaseName)
{
    foreach (var specialization in ApplySpecialization.AllSpecializations(1))
    {
        var mangledName = GetMangledName(kernelBaseName, specialization);
        var idxType = specialization.Use32BitIndices ? ApplySpecialization.IndexType32 : ApplySpecialization.IndexType64;
        var dimA = specialization.TensorDims[0].ToString();

        // Note: "\n" (not AppendLine) keeps the generated source's line endings stable.
        sb.Append($"REDUCE_ALL_NORM_KERNELS({idxType}, {dimA}, {mangledName})\n");
    }
}
/// <summary>
/// Emits the APPLY_TSS macro instantiation (one tensor, two scalars) for every
/// single-tensor specialization.
/// </summary>
/// <param name="kernelBaseName">Base name used to derive each mangled kernel name.</param>
/// <param name="operatorCode">CUDA C++ operator body passed through to the macro.</param>
public void AddApplyTSS(string kernelBaseName, string operatorCode)
{
    foreach (ApplySpecialization specialization in ApplySpecialization.AllSpecializations(1))
    {
        string mangledName = GetMangledName(kernelBaseName, specialization);
        string idxType = specialization.Use32BitIndices ? ApplySpecialization.IndexType32 : ApplySpecialization.IndexType64;
        string dimA = specialization.TensorDims[0].ToString();

        sb.Append($"APPLY_TSS({idxType}, {dimA}, {mangledName}, {operatorCode})\n");
    }
}
/// <summary>
/// Emits the REDUCE_ALL_SUB_SQUARE_KERNELS macro instantiation for every
/// single-tensor specialization.
/// </summary>
/// <param name="kernelBaseName">Base name used to derive each mangled kernel name.</param>
public void AddReduceAllSubSquare(string kernelBaseName)
{
    foreach (ApplySpecialization specialization in ApplySpecialization.AllSpecializations(1))
    {
        string mangledName = GetMangledName(kernelBaseName, specialization);
        string idxType = specialization.Use32BitIndices ? ApplySpecialization.IndexType32 : ApplySpecialization.IndexType64;
        string dimA = specialization.TensorDims[0].ToString();

        sb.Append($"REDUCE_ALL_SUB_SQUARE_KERNELS({idxType}, {dimA}, {mangledName})\n");
    }
}
/// <summary>
/// Emits the REDUCE_NORM_KERNELS macro instantiation for every two-tensor specialization.
/// </summary>
/// <param name="kernelBaseName">Base name used to derive each mangled kernel name.</param>
public void AddReduceNorm(string kernelBaseName)
{
    foreach (ApplySpecialization specialization in ApplySpecialization.AllSpecializations(2))
    {
        string mangledName = GetMangledName(kernelBaseName, specialization);
        string idxType = specialization.Use32BitIndices ? ApplySpecialization.IndexType32 : ApplySpecialization.IndexType64;
        string dimA = specialization.TensorDims[0].ToString();
        string dimB = specialization.TensorDims[1].ToString();

        sb.Append($"REDUCE_NORM_KERNELS({idxType}, {dimA}, {dimB}, {mangledName})\n");
    }
}
/// <summary>
/// Emits the REDUCE_KERNELS macro instantiation (modify op + reduce op) for every
/// two-tensor specialization.
/// </summary>
/// <param name="kernelBaseName">Name of the kernel base.</param>
/// <param name="modifyOpCode">The modify op code.</param>
/// <param name="reduceOpCode">The reduce op code.</param>
public void AddReduce(string kernelBaseName, string modifyOpCode, string reduceOpCode)
{
    foreach (var specialization in ApplySpecialization.AllSpecializations(2))
    {
        var mangledName = GetMangledName(kernelBaseName, specialization);
        var idxType = specialization.Use32BitIndices ? ApplySpecialization.IndexType32 : ApplySpecialization.IndexType64;
        var dimA = specialization.TensorDims[0].ToString();
        var dimB = specialization.TensorDims[1].ToString();

        sb.Append($"REDUCE_KERNELS({idxType}, {dimA}, {dimB}, {mangledName}, {modifyOpCode}, {reduceOpCode})\n");
    }
}
/// <summary>
/// Emits the APPLY_TT macro instantiation for every two-tensor specialization.
/// </summary>
/// <param name="kernelBaseName">Name of the kernel base.</param>
/// <param name="operatorCode">The operator code.</param>
public void AddApplyTT(string kernelBaseName, string operatorCode)
{
    foreach (var specialization in ApplySpecialization.AllSpecializations(2))
    {
        var mangledName = GetMangledName(kernelBaseName, specialization);
        var idxType = specialization.Use32BitIndices ? ApplySpecialization.IndexType32 : ApplySpecialization.IndexType64;
        var dimA = specialization.TensorDims[0].ToString();
        var dimB = specialization.TensorDims[1].ToString();

        sb.Append($"APPLY_TT({idxType}, {dimA}, {dimB}, {mangledName}, {operatorCode})\n");
    }
}
// TODO make member of ApplySpecialization
/// <summary>
/// Builds the mangled kernel name for a specialization: the base name, an index-type
/// tag ("__int32"/"__int64"), and one "_N" suffix per tensor dimension count, with
/// '-' replaced by 'M' so the result stays a valid C identifier.
/// </summary>
/// <param name="baseName">Unmangled base kernel name.</param>
/// <param name="spec">Specialization whose index width and tensor dims are encoded.</param>
/// <returns>The mangled kernel name used for PTX lookup.</returns>
public static string GetMangledName(string baseName, ApplySpecialization spec)
{
    StringBuilder sb = new StringBuilder();
    sb.Append(baseName);
    sb.Append(spec.Use32BitIndices ? "__int32" : "__int64");
    foreach (int dimSize in spec.TensorDims)
    {
        // Format with the invariant culture so the mangled name (and therefore the
        // compiled-kernel lookup) cannot vary with the thread's current culture.
        sb.Append("_").Append(dimSize.ToString(System.Globalization.CultureInfo.InvariantCulture).Replace('-', 'M'));
    }

    return sb.ToString();
}
/// <summary>
/// Launches the second (final) pass of a two-pass reduce-all, collapsing the
/// per-block partial results from pass 1 into a single output value.
/// </summary>
/// <param name="context">Owning TS CUDA context (kernel cache).</param>
/// <param name="cudaContext">CUDA context to launch on.</param>
/// <param name="ptx">Compiled PTX module containing the kernel.</param>
/// <param name="kernelName">Fully mangled pass-2 kernel name.</param>
/// <param name="grid">Grid dimensions for the launch.</param>
/// <param name="block">Block dimensions for the launch.</param>
/// <param name="smemSize">Dynamic shared memory size in bytes.</param>
/// <param name="index32">True when 32-bit indexing is in use; kept for signature symmetry with pass 1.</param>
/// <param name="args">Kernel arguments (raw device pointers / scalars; no tensor conversion needed).</param>
public static void InvokeReduceAllPass2(TSCudaContext context, CudaContext cudaContext, byte[] ptx, string kernelName, dim3 grid, dim3 block, uint smemSize, bool index32, params object[] args)
{
    // NOTE(review): the previous implementation built an ApplySpecialization(index32)
    // config here and never used it; that dead work has been removed.
    CudaKernel kernel = context.KernelCache.Get(cudaContext, ptx, kernelName);
    kernel.GridDimensions = grid;
    kernel.BlockDimensions = block;
    kernel.DynamicSharedMemory = smemSize;
    kernel.Run(args);
}
/// <summary>
/// Emits the APPLY_TTTTT macro instantiation for every five-tensor specialization.
/// </summary>
/// <param name="kernelBaseName">Base name used to derive each mangled kernel name.</param>
/// <param name="operatorCode">CUDA C++ operator body passed through to the macro.</param>
public void AddApplyTTTTT(string kernelBaseName, string operatorCode)
{
    foreach (ApplySpecialization specialization in ApplySpecialization.AllSpecializations(5))
    {
        string mangledName = GetMangledName(kernelBaseName, specialization);
        string idxType = specialization.Use32BitIndices ? ApplySpecialization.IndexType32 : ApplySpecialization.IndexType64;
        string dimA = specialization.TensorDims[0].ToString();
        string dimB = specialization.TensorDims[1].ToString();
        string dimC = specialization.TensorDims[2].ToString();
        string dimD = specialization.TensorDims[3].ToString();
        string dimE = specialization.TensorDims[4].ToString();

        sb.Append($"APPLY_TTTTT({idxType}, {dimA}, {dimB}, {dimC}, {dimD}, {dimE}, {mangledName}, {operatorCode})\n");
    }
}
/// <summary>
/// Emits, for every single-tensor specialization, a CUDA kernel that applies a
/// caller-supplied element-wise operator in place over one float tensor using a
/// grid-stride loop.
/// </summary>
/// <param name="kernelBaseName">Base name from which each specialized kernel name is mangled.</param>
/// <param name="operatorCode">CUDA C++ operator body; it receives a float pointer v.</param>
public void AddApplyT(string kernelBaseName, string operatorCode)
{
    foreach (ApplySpecialization specialization in ApplySpecialization.AllSpecializations(1))
    {
        string mangledName = GetMangledName(kernelBaseName, specialization);
        string idxType = specialization.Use32BitIndices ? ApplySpecialization.IndexType32 : ApplySpecialization.IndexType64;
        string dimA = specialization.TensorDims[0].ToString();

        // Functor type wrapping the caller-provided operator body.
        sb.AppendLine($"struct ConcreteOp_{mangledName} {{ __device__ __forceinline__ void operator()(float* v) const {{ {operatorCode} }} }};");
        sb.AppendLine("extern \"C\" {");
        sb.AppendLine($"   __global__ void {mangledName}(TensorInfo<{idxType}> src, __int64 totalElements)");
        sb.AppendLine("   {");
        sb.AppendLine($"      for ({idxType} linearIndex = blockIdx.x * blockDim.x + threadIdx.x;linearIndex < totalElements;linearIndex += gridDim.x * blockDim.x)");
        sb.AppendLine("      {");
        sb.AppendLine($"        const {idxType} aOffset = IndexToOffset < {idxType}, {dimA}>::get(linearIndex, src);");
        sb.AppendLine($"        ConcreteOp_{mangledName}()(&src.data[aOffset]);");
        sb.AppendLine("      }");
        sb.AppendLine("   }");
        sb.AppendLine("}");
    }
}
/// <summary>
/// Resolves the kernel specialization for the NDArray arguments, converts them to
/// kernel-ready form, and launches the kernel asynchronously on the null stream.
/// </summary>
/// <param name="context">The context.</param>
/// <param name="cudaContext">The cuda context.</param>
/// <param name="ptx">The PTX.</param>
/// <param name="baseName">Name of the base.</param>
/// <param name="args">The arguments.</param>
public static void Invoke(TSCudaContext context, CudaContext cudaContext, byte[] ptx, string baseName, params object[] args)
{
    ThrowIfAnyTensorInvalid(args);

    var deviceInfo = context.DeviceInfoForContext(cudaContext);

    var arrayArgs = args.OfType<NDArray>();
    var first = arrayArgs.First();
    var elementCount = first.ElementCount();

    // Choose the 32/64-bit index specialization and rewrite array args in place.
    var specialization = new ApplySpecialization(arrayArgs.ToArray());
    ConvertTensorArgs.Convert(cudaContext, specialization.Use32BitIndices, args);

    var block = ApplyUtils.GetApplyBlock();
    var grid = ApplyUtils.GetApplyGrid(deviceInfo, elementCount);

    var mangledName = PermutationGenerator.GetMangledName(baseName, specialization);
    var kernel = context.KernelCache.Get(cudaContext, ptx, mangledName);
    kernel.GridDimensions = grid;
    kernel.BlockDimensions = block;

    kernel.RunAsync(CUstream.NullStream, args);
}
/// <summary>
/// Runs a reduction over a single dimension of <paramref name="src"/>, writing the
/// reduced values into <paramref name="result"/> (or a freshly built write target).
/// Dispatches to a contiguous or non-contiguous kernel variant depending on the
/// stride of the reduced dimension.
/// </summary>
/// <param name="reduceKernels">Provider of the compiled reduce kernel PTX.</param>
/// <param name="kernelName">Base kernel name; "contig_"/"noncontig_" is prepended here.</param>
/// <param name="init">Seed value for the reduction accumulator.</param>
/// <param name="initType">How <paramref name="init"/> is interpreted/converted.</param>
/// <param name="result">Optional pre-allocated output tensor; may be null.</param>
/// <param name="src">Tensor to reduce.</param>
/// <param name="dim">Dimension index to reduce over.</param>
/// <param name="extraArg">Optional extra kernel argument appended to the launch.</param>
/// <returns>The tensor the reduction was written to.</returns>
public static Tensor Invoke(CudaReduceKernels reduceKernels, string kernelName, float init, ReduceInitType initType, Tensor result, Tensor src, int dim, object extraArg = null)
{
    // Zero-dimensional input: nothing to reduce, hand back the caller's result as-is.
    if (src.DimensionCount == 0) { return(result); }

    var context = CudaHelpers.TSContextForTensor(src);
    var cudaContext = context.CudaContextForTensor(src);

    // Output shape equals src's shape with the reduced dimension collapsed to 1.
    var requiredOutputSize = (long[])src.Sizes.Clone();
    requiredOutputSize[dim] = 1;
    var writeTarget = TensorResultBuilder.GetWriteTarget(result, src, false, requiredOutputSize);
    ThrowIfAnyTensorInvalid(writeTarget, src);

    var inElements = src.ElementCount();
    var reductionSize = src.Sizes[dim];
    var reductionStride = src.Strides[dim];
    var outElements = inElements / reductionSize;
    // Stride 1 along the reduced dimension allows the faster contiguous kernel.
    var contigReduction = reductionStride == 1;

    // We must make sure that when the tensor is passed to the kernel, src.Sizes[dim] is set to 1
    // This includes for the purposes of determining which tensor specializations to use (changing
    // the dimension size to 1 may make the tensor non-contiguous
    var newSizes = (long[])src.Sizes.Clone();
    newSizes[dim] = 1;
    var srcSlim = new Tensor(newSizes, src.Strides, src.Storage, src.StorageOffset);

    var config = new ApplySpecialization(writeTarget, srcSlim);
    // Scalar kernel arguments are boxed at the width matching the index specialization.
    object totalSlices = config.Use32BitIndices ? (uint)outElements : (ulong)outElements;
    object reductionSizeTyped = config.Use32BitIndices ? (uint)reductionSize : (ulong)reductionSize;
    object reductionStrideTyped = config.Use32BitIndices ? (uint)reductionStride : (ulong)reductionStride;
    var initValueTyped = ReduceInitConverter.GetInitValue(init, initType, src.ElementType);

    var ptx = reduceKernels.GetPtx(context.Compiler);
    if (contigReduction)
    {
        var block = GetContigReduceBlock(cudaContext, outElements, reductionSize);
        var grid = GetContigReduceGrid(outElements);
        // One shared-memory slot per thread for the in-block reduction.
        var smemSize = (uint)src.ElementType.Size() * block.x;

        var fullName = "contig_" + PermutationGenerator.GetMangledName(kernelName, config);
        if (extraArg == null)
        {
            InvokeReduce(context, cudaContext, ptx, fullName, grid, block, smemSize, config, writeTarget, srcSlim, reductionSizeTyped, totalSlices, initValueTyped);
        }
        else
        {
            InvokeReduce(context, cudaContext, ptx, fullName, grid, block, smemSize, config, writeTarget, srcSlim, reductionSizeTyped, totalSlices, initValueTyped, extraArg);
        }
    }
    else
    {
        var deviceProps = context.DeviceInfoForContext(cudaContext);
        var block = GetNonContigReduceBlock(deviceProps);
        var grid = GetNoncontigReduceGrid(deviceProps, outElements);
        // Non-contiguous variant does not use dynamic shared memory.
        uint smemSize = 0;

        // NOTE: the non-contiguous kernel additionally takes the reduction stride,
        // and its argument order differs from the contiguous variant.
        var fullName = "noncontig_" + PermutationGenerator.GetMangledName(kernelName, config);
        if (extraArg == null)
        {
            InvokeReduce(context, cudaContext, ptx, fullName, grid, block, smemSize, config, writeTarget, srcSlim, reductionStrideTyped, reductionSizeTyped, totalSlices, initValueTyped);
        }
        else
        {
            InvokeReduce(context, cudaContext, ptx, fullName, grid, block, smemSize, config, writeTarget, srcSlim, reductionStrideTyped, reductionSizeTyped, totalSlices, initValueTyped, extraArg);
        }
    }

    return(writeTarget);
}
/// <summary>
/// Reduces all elements of <paramref name="src"/> to a single value, choosing a
/// single-pass kernel for small inputs or a two-pass (partials + final) scheme for
/// large ones based on isTwoPassReductionSize.
/// </summary>
/// <param name="reduceAllKernels">Provider of the compiled reduce-all kernel PTX.</param>
/// <param name="init">Seed value for the reduction accumulator.</param>
/// <param name="initType">How <paramref name="init"/> is interpreted/converted.</param>
/// <param name="kernelName">Base kernel name; "onePass_"/"twoPassA_"/"twoPassB_" is prepended here.</param>
/// <param name="result">Optional pre-allocated output tensor; may be null.</param>
/// <param name="src">Tensor to reduce.</param>
/// <param name="extraArg">Optional extra kernel argument appended to the launch.</param>
/// <returns>The (single-element) tensor the reduction was written to.</returns>
/// <exception cref="InvalidOperationException">If src has more dimensions than supported.</exception>
public static Tensor Invoke(CudaReduceAllKernels reduceAllKernels, float init, ReduceInitType initType, string kernelName, Tensor result, Tensor src, object extraArg = null)
{
    int deviceId = CudaHelpers.GetDeviceId(src);
    TSCudaContext context = CudaHelpers.TSContextForTensor(src);
    CudaContext cudaContext = context.CudaContextForDevice(deviceId);

    if (src.DimensionCount > TSCudaContext.MaxDims)
    {
        throw new InvalidOperationException("Tensors with dimension count > " + TSCudaContext.MaxDims + " are not supported");
    }

    Tensor writeTarget = TensorResultBuilder.GetWriteTarget(result, src, false, 1);

    // Zero-dimensional input: nothing to reduce, hand back the caller's result as-is.
    if (src.DimensionCount == 0) { return(result); }

    long totalElements = src.ElementCount();
    ApplySpecialization config = new ApplySpecialization(src);
    // Element count is boxed at the width matching the index specialization.
    object totalElementsTyped = config.Use32BitIndices ? (uint)totalElements : (ulong)totalElements;
    object initValueTyped = ReduceInitConverter.GetInitValue(init, initType, src.ElementType);
    dim3 grid;
    dim3 block;

    byte[] ptx = reduceAllKernels.GetPtx(context.Compiler);
    string fullKernelName = PermutationGenerator.GetMangledName(kernelName, config);
    ManagedCuda.BasicTypes.CUdeviceptr outputDevicePtr = CudaHelpers.GetBufferStart(writeTarget);

    if (isTwoPassReductionSize(totalElements))
    {
        // Pass 1: each block reduces a chunk into per-block partials in scratch space.
        getPass1ReduceBlockGrid(context, deviceId, totalElements, out grid, out block);
        uint smemSize = block.x * sizeof(float);

        ManagedCuda.BasicTypes.CUdeviceptr scratchSpace = context.ScratchSpaceForDevice(deviceId).buffer;

        if (extraArg == null)
        {
            InvokeReduceAll(context, cudaContext, ptx, "twoPassA_" + fullKernelName, grid, block, smemSize, config, src, totalElementsTyped, initValueTyped, scratchSpace);
        }
        else
        {
            InvokeReduceAll(context, cudaContext, ptx, "twoPassA_" + fullKernelName, grid, block, smemSize, config, src, totalElementsTyped, initValueTyped, scratchSpace, extraArg);
        }

        // Pass 2: collapse the pass-1 partials (one per pass-1 block) into the output.
        uint numPass1Blocks = grid.x;
        getPass2ReduceBlockGrid(context, deviceId, totalElements, out grid, out block);
        smemSize = block.x * sizeof(float);

        InvokeReduceAllPass2(context, cudaContext, ptx, "twoPassB_" + fullKernelName, grid, block, smemSize, config.Use32BitIndices, numPass1Blocks, initValueTyped, scratchSpace, outputDevicePtr);
    }
    else
    {
        // Small input: a single kernel launch reduces straight into the output pointer.
        getSinglePassReduceBlockGrid(totalElements, out grid, out block);
        uint smemSize = block.x * sizeof(float);

        if (extraArg == null)
        {
            InvokeReduceAll(context, cudaContext, ptx, "onePass_" + fullKernelName, grid, block, smemSize, config, src, totalElementsTyped, initValueTyped, outputDevicePtr);
        }
        else
        {
            InvokeReduceAll(context, cudaContext, ptx, "onePass_" + fullKernelName, grid, block, smemSize, config, src, totalElementsTyped, initValueTyped, outputDevicePtr, extraArg);
        }
    }

    return(writeTarget);
}
/// <summary>
/// Converts the tensor arguments for the given specialization and launches a
/// reduce-all kernel synchronously with the supplied launch configuration.
/// </summary>
/// <param name="context">Owning TS CUDA context (kernel cache).</param>
/// <param name="cudaContext">CUDA context to launch on.</param>
/// <param name="ptx">Compiled PTX module containing the kernel.</param>
/// <param name="kernelName">Fully mangled kernel name.</param>
/// <param name="grid">Grid dimensions for the launch.</param>
/// <param name="block">Block dimensions for the launch.</param>
/// <param name="smemSize">Dynamic shared memory size in bytes.</param>
/// <param name="spec">Specialization deciding 32- vs 64-bit tensor index conversion.</param>
/// <param name="args">Kernel arguments; Tensor instances are rewritten in place.</param>
public static void InvokeReduceAll(TSCudaContext context, CudaContext cudaContext, byte[] ptx, string kernelName, dim3 grid, dim3 block, uint smemSize, ApplySpecialization spec, params object[] args)
{
    // Rewrite Tensor arguments into kernel-ready form at the specialization's index width.
    ConvertTensorArgs.Convert(cudaContext, spec.Use32BitIndices, args);

    CudaKernel reduceKernel = context.KernelCache.Get(cudaContext, ptx, kernelName);
    reduceKernel.BlockDimensions = block;
    reduceKernel.GridDimensions = grid;
    reduceKernel.DynamicSharedMemory = smemSize;
    reduceKernel.Run(args);
}