Example #1
        public void AddApplyTTTT(string kernelBaseName, string operatorCode)
        {
            foreach (ApplySpecialization spec in ApplySpecialization.AllSpecializations(4))
            {
                string kernelName = GetMangledName(kernelBaseName, spec);
                string indexType  = spec.Use32BitIndices ? ApplySpecialization.IndexType32 : ApplySpecialization.IndexType64;
                string dimsA      = spec.TensorDims[0].ToString();
                string dimsB      = spec.TensorDims[1].ToString();
                string dimsC      = spec.TensorDims[2].ToString();
                string dimsD      = spec.TensorDims[3].ToString();

                sb.AppendLine($"struct ConcreteOp_{kernelName} {{ __device__ __forceinline__ void operator()(float* a, float *b, float *c, float *d) const {{ {operatorCode} }} }};");
                sb.AppendLine("extern \"C\" {");
                sb.AppendLine($"   __global__ void {kernelName}(TensorInfo<{indexType}> tensorA, TensorInfo<{indexType}> tensorB, TensorInfo<{indexType}> tensorC, TensorInfo<{indexType}> tensorD, __int64 totalElements)");
                sb.AppendLine("   {");

                sb.AppendLine($"      for ({indexType} linearIndex = blockIdx.x * blockDim.x + threadIdx.x;linearIndex < totalElements;linearIndex += gridDim.x * blockDim.x)");
                sb.AppendLine("      {");
                sb.AppendLine($"         const {indexType} aOffset = IndexToOffset < {indexType}, {dimsA}>::get(linearIndex, tensorA);");
                sb.AppendLine($"         const {indexType} bOffset = IndexToOffset < {indexType}, {dimsB}>::get(linearIndex, tensorB);");
                sb.AppendLine($"         const {indexType} cOffset = IndexToOffset < {indexType}, {dimsC}>::get(linearIndex, tensorC);");
                sb.AppendLine($"         const {indexType} dOffset = IndexToOffset < {indexType}, {dimsD}>::get(linearIndex, tensorD);");
                sb.AppendLine($"         ConcreteOp_{kernelName}()(&tensorA.data[aOffset], &tensorB.data[bOffset], &tensorC.data[cOffset], &tensorD.data[dOffset]);");
                sb.AppendLine("      }");
                sb.AppendLine("   }");
                sb.AppendLine("}");
            }
        }
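Taken together, each loop iteration emits a self-contained functor plus an extern "C" kernel. A hedged usage sketch follows; the base name and operator body are invented, and the parameterless PermutationGenerator constructor and its public sb buffer are assumptions taken from the surrounding class:

        // Hypothetical: generate four-tensor element-wise addition kernels.
        var gen = new PermutationGenerator();            // assumed parameterless constructor
        gen.AddApplyTTTT("add4", "*a = *b + *c + *d;");  // operatorCode sees a, b, c, d as float*
        string cudaSource = gen.sb.ToString();           // assumed: sb accumulates the generated source

        // For a 32-bit specialization with all tensor dims == -1, GetMangledName
        // (Example #11) would yield "add4__int32_M1_M1_M1_M1".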
Example #2
        public void AddApplyTTSS(string kernelBaseName, string operatorCode)
        {
            foreach (ApplySpecialization spec in ApplySpecialization.AllSpecializations(2))
            {
                string kernelName = GetMangledName(kernelBaseName, spec);
                string indexType  = spec.Use32BitIndices ? ApplySpecialization.IndexType32 : ApplySpecialization.IndexType64;
                string dimsA      = spec.TensorDims[0].ToString();
                string dimsB      = spec.TensorDims[1].ToString();

                sb.AppendLine($"struct ConcreteOp_{kernelName} {{");
                sb.AppendLine("float c;");
                sb.AppendLine("float d;");
                sb.AppendLine($"__device__ ConcreteOp_{kernelName}(float cVal, float dVal) {{ this->c = cVal; this->d = dVal; }}");
                sb.AppendLine($"__device__ __forceinline__ void operator()(float* a, float *b) const {{ {operatorCode} }} }};");

                sb.AppendLine("extern \"C\" {");
                sb.AppendLine($"   __global__ void {kernelName}(TensorInfo<{indexType}> tensorA, TensorInfo<{indexType}> tensorB, float c, float d, __int64 totalElements)");
                sb.AppendLine("   {");
                sb.AppendLine($"      for ({indexType} linearIndex = blockIdx.x * blockDim.x + threadIdx.x;linearIndex < totalElements;linearIndex += gridDim.x * blockDim.x)");
                sb.AppendLine("      {");
                sb.AppendLine($"         const {indexType} aOffset = IndexToOffset < {indexType}, {dimsA}>::get(linearIndex, tensorA);");
                sb.AppendLine($"         const {indexType} bOffset = IndexToOffset < {indexType}, {dimsB}>::get(linearIndex, tensorB);");
                sb.AppendLine($"         ConcreteOp_{kernelName} op = ConcreteOp_{kernelName}(c, d);");
                sb.AppendLine($"         op(&tensorA.data[aOffset], &tensorB.data[bOffset]);");
                sb.AppendLine("      }");
                sb.AppendLine("   }");
                sb.AppendLine("}");
            }
        }
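The TTSS variant bakes the two scalars into the functor as the fields c and d, so operatorCode can reference them alongside the tensor pointers a and b. A minimal sketch, with the base name and operator body invented:

        // Hypothetical: scaled accumulate, a = a*c + b*d.
        var gen = new PermutationGenerator();
        gen.AddApplyTTSS("axpby", "*a = *a * c + *b * d;");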
Example #3
        public static void Invoke(TSCudaContext context, CudaContext cudaContext, byte[] ptx, string baseName, params object[] args)
        {
            ThrowIfAnyTensorInvalid(args);

            cudaContext.SetCurrent();

            CudaDeviceProperties deviceInfo = context.DeviceInfoForContext(cudaContext);

            IEnumerable<Tensor> allTensors = args.OfType<Tensor>();
            Tensor firstTensor       = allTensors.First();
            long   elementCount      = firstTensor.ElementCount();
            ApplySpecialization spec = new ApplySpecialization(allTensors.ToArray());

            ConvertTensorArgs.Convert(cudaContext, spec.Use32BitIndices, args);

            ManagedCuda.VectorTypes.dim3 block = ApplyUtils.GetApplyBlock();
            ManagedCuda.VectorTypes.dim3 grid  = ApplyUtils.GetApplyGrid(deviceInfo, elementCount);

            string     fullKernelName = PermutationGenerator.GetMangledName(baseName, spec);
            CudaKernel kernel         = context.KernelCache.Get(cudaContext, ptx, fullKernelName);

            kernel.GridDimensions  = grid;
            kernel.BlockDimensions = block;
            kernel.RunAsync(CUstream.NullStream, args);
        }
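A hedged call-site sketch for a kernel produced by AddApplyTTTT above; the context, PTX, and tensor variables are assumptions, and the trailing count must match the __int64 totalElements parameter of the generated kernel, since args is forwarded to RunAsync unchanged:

            // Hypothetical launch of the "add4" kernel sketched under Example #1.
            Invoke(context, cudaContext, ptx, "add4",
                   tensorA, tensorB, tensorC, tensorD, tensorA.ElementCount());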
Example #4
 public static void ApplyPrecompile(CudaCompiler compiler, DeviceKernelTemplate template, int tensorCount)
 {
     foreach (ApplySpecialization spec in ApplySpecialization.AllSpecializations(tensorCount))
     {
         template.PtxForConfig(compiler, spec.GetConfig());
     }
 }
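Because each call walks every specialization for the given tensor count, this is typically run once at startup to warm the PTX cache. A minimal sketch, where the template collection and its tensor counts are invented:

     // Hypothetical warm-up: compile every specialization of each known template.
     foreach ((DeviceKernelTemplate template, int tensorCount) in kernelTemplates)  // kernelTemplates is assumed
     {
         ApplyPrecompile(compiler, template, tensorCount);
     }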
Example #5
 /// <summary>
 /// Emits the reduce-all norm kernel source for every index/dimension specialization.
 /// </summary>
 /// <param name="kernelBaseName">The base name of the kernel.</param>
 public void AddReduceAllNorm(string kernelBaseName)
 {
     foreach (var spec in ApplySpecialization.AllSpecializations(1))
     {
         var kernelName = GetMangledName(kernelBaseName, spec);
         var indexType  = spec.Use32BitIndices ? ApplySpecialization.IndexType32 : ApplySpecialization.IndexType64;
         var dimsA      = spec.TensorDims[0].ToString();
         sb.AppendFormat("REDUCE_ALL_NORM_KERNELS({0}, {1}, {2})\n", indexType, dimsA, kernelName);
     }
 }
Example #6
 public void AddApplyTSS(string kernelBaseName, string operatorCode)
 {
     foreach (ApplySpecialization spec in ApplySpecialization.AllSpecializations(1))
     {
         string kernelName = GetMangledName(kernelBaseName, spec);
         string indexType  = spec.Use32BitIndices ? ApplySpecialization.IndexType32 : ApplySpecialization.IndexType64;
         string dimsA      = spec.TensorDims[0].ToString();
         sb.AppendFormat("APPLY_TSS({0}, {1}, {2}, {3})\n", indexType, dimsA, kernelName, operatorCode);
     }
 }
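A hedged usage sketch; the APPLY_TSS macro is defined in the library's CUDA header and is not shown here, so the names visible inside operatorCode (the element pointer a and scalars b and c) are assumptions:

     var gen = new PermutationGenerator();
     gen.AddApplyTSS("clampScalar", "*a = min(max(*a, b), c);");  // hypothetical clamp into [b, c]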
Example #7
 public void AddReduceAllSubSquare(string kernelBaseName)
 {
     foreach (ApplySpecialization spec in ApplySpecialization.AllSpecializations(1))
     {
         string kernelName = GetMangledName(kernelBaseName, spec);
         string indexType  = spec.Use32BitIndices ? ApplySpecialization.IndexType32 : ApplySpecialization.IndexType64;
         string dimsA      = spec.TensorDims[0].ToString();
         sb.AppendFormat("REDUCE_ALL_SUB_SQUARE_KERNELS({0}, {1}, {2})\n", indexType, dimsA, kernelName);
     }
 }
Example #8
 public void AddReduceNorm(string kernelBaseName)
 {
     foreach (ApplySpecialization spec in ApplySpecialization.AllSpecializations(2))
     {
         string kernelName = GetMangledName(kernelBaseName, spec);
         string indexType  = spec.Use32BitIndices ? ApplySpecialization.IndexType32 : ApplySpecialization.IndexType64;
         string dimsA      = spec.TensorDims[0].ToString();
         string dimsB      = spec.TensorDims[1].ToString();
         sb.AppendFormat("REDUCE_NORM_KERNELS({0}, {1}, {2}, {3})\n", indexType, dimsA, dimsB, kernelName);
     }
 }
Example #9
 /// <summary>
 /// Emits the reduce kernel source for every index/dimension specialization.
 /// </summary>
 /// <param name="kernelBaseName">The base name of the kernel.</param>
 /// <param name="modifyOpCode">The operator code applied to each element before reduction.</param>
 /// <param name="reduceOpCode">The operator code that combines two partial values.</param>
 public void AddReduce(string kernelBaseName, string modifyOpCode, string reduceOpCode)
 {
     foreach (var spec in ApplySpecialization.AllSpecializations(2))
     {
         var kernelName = GetMangledName(kernelBaseName, spec);
         var indexType  = spec.Use32BitIndices ? ApplySpecialization.IndexType32 : ApplySpecialization.IndexType64;
         var dimsA      = spec.TensorDims[0].ToString();
         var dimsB      = spec.TensorDims[1].ToString();
         sb.AppendFormat("REDUCE_KERNELS({0}, {1}, {2}, {3}, {4}, {5})\n", indexType, dimsA, dimsB, kernelName, modifyOpCode, reduceOpCode);
     }
 }
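A hedged usage sketch; the operator forms REDUCE_KERNELS expects are defined in the library's CUDA header, so the two bodies below (an identity modify step and a pairwise sum) are plausible placeholders only:

     var gen = new PermutationGenerator();
     gen.AddReduce("sum", "a", "a + b");  // hypothetical: modify = identity, reduce = addition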
Example #10
 /// <summary>
 /// Emits the two-tensor apply kernel source for every index/dimension specialization.
 /// </summary>
 /// <param name="kernelBaseName">The base name of the kernel.</param>
 /// <param name="operatorCode">The operator code applied to each element pair.</param>
 public void AddApplyTT(string kernelBaseName, string operatorCode)
 {
     foreach (var spec in ApplySpecialization.AllSpecializations(2))
     {
         var kernelName = GetMangledName(kernelBaseName, spec);
         var indexType  = spec.Use32BitIndices ? ApplySpecialization.IndexType32 : ApplySpecialization.IndexType64;
         var dimsA      = spec.TensorDims[0].ToString();
         var dimsB      = spec.TensorDims[1].ToString();
         sb.AppendFormat("APPLY_TT({0}, {1}, {2}, {3}, {4})\n", indexType, dimsA, dimsB, kernelName, operatorCode);
     }
 }
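A minimal sketch, assuming the APPLY_TT operator follows the same a/b pointer convention as the generated functors in Examples #1 and #2:

     var gen = new PermutationGenerator();
     gen.AddApplyTT("copy", "*a = *b;");  // hypothetical element-wise copy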
Example #11
        // TODO make member of ApplySpecialization
        public static string GetMangledName(string baseName, ApplySpecialization spec)
        {
            StringBuilder sb = new StringBuilder();

            sb.Append(baseName);
            sb.Append(spec.Use32BitIndices ? "__int32" : "__int64");
            foreach (int dimSize in spec.TensorDims)
            {
                sb.Append("_").Append(dimSize.ToString().Replace('-', 'M'));
            }
            return sb.ToString();
        }
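A mangled name is therefore the base name, an index-width tag, and one _dim suffix per tensor with any minus sign rewritten as M. A comment-only illustration with an invented specialization:

        // For a 2-tensor specialization using 32-bit indices with TensorDims == { 2, -1 }:
        //   base name        -> "add"
        //   index-width tag  -> "__int32"
        //   per-tensor parts -> "_2", "_M1"
        // GetMangledName("add", spec) == "add__int32_2_M1"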
Example #12
        public static void InvokeReduceAllPass2(TSCudaContext context, CudaContext cudaContext, byte[] ptx, string kernelName, dim3 grid, dim3 block, uint smemSize, bool index32, params object[] args)
        {
            KernelConfig config = new ApplySpecialization(index32).GetConfig();

            CudaKernel kernel = context.KernelCache.Get(cudaContext, ptx, kernelName);

            kernel.GridDimensions      = grid;
            kernel.BlockDimensions     = block;
            kernel.DynamicSharedMemory = smemSize;

            kernel.Run(args);
        }
Example #13
 public void AddApplyTTTTT(string kernelBaseName, string operatorCode)
 {
     foreach (ApplySpecialization spec in ApplySpecialization.AllSpecializations(5))
     {
         string kernelName = GetMangledName(kernelBaseName, spec);
         string indexType  = spec.Use32BitIndices ? ApplySpecialization.IndexType32 : ApplySpecialization.IndexType64;
         string dimsA      = spec.TensorDims[0].ToString();
         string dimsB      = spec.TensorDims[1].ToString();
         string dimsC      = spec.TensorDims[2].ToString();
         string dimsD      = spec.TensorDims[3].ToString();
         string dimsE      = spec.TensorDims[4].ToString();
         sb.AppendFormat("APPLY_TTTTT({0}, {1}, {2}, {3}, {4}, {5}, {6}, {7})\n", indexType, dimsA, dimsB, dimsC, dimsD, dimsE, kernelName, operatorCode);
     }
 }
Example #14
        public void AddApplyT(string kernelBaseName, string operatorCode)
        {
            foreach (ApplySpecialization spec in ApplySpecialization.AllSpecializations(1))
            {
                string kernelName = GetMangledName(kernelBaseName, spec);
                string indexType  = spec.Use32BitIndices ? ApplySpecialization.IndexType32 : ApplySpecialization.IndexType64;
                string dimsA      = spec.TensorDims[0].ToString();

                sb.AppendLine($"struct ConcreteOp_{kernelName} {{ __device__ __forceinline__ void operator()(float* v) const {{ {operatorCode} }} }};");
                sb.AppendLine("extern \"C\" {");
                sb.AppendLine($"   __global__ void {kernelName}(TensorInfo<{indexType}> src, __int64 totalElements)");
                sb.AppendLine("   {");

                sb.AppendLine($"      for ({indexType} linearIndex = blockIdx.x * blockDim.x + threadIdx.x;linearIndex < totalElements;linearIndex += gridDim.x * blockDim.x)");
                sb.AppendLine("      {");
                sb.AppendLine($"         const {indexType} aOffset = IndexToOffset < {indexType}, {dimsA}>::get(linearIndex, src);");
                sb.AppendLine($"         ConcreteOp_{kernelName}()(&src.data[aOffset]);");
                sb.AppendLine("      }");
                sb.AppendLine("   }");
                sb.AppendLine("}");
            }
        }
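A minimal usage sketch; per the functor emitted above, operatorCode receives each element as float* v, and the operator body here is an invented in-place sigmoid:

        // Hypothetical: in-place logistic sigmoid over every element.
        var gen = new PermutationGenerator();
        gen.AddApplyT("sigmoid", "*v = 1.0f / (1.0f + expf(-*v));");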
Example #15
        /// <summary>
        /// Looks up the kernel generated for baseName and launches it asynchronously.
        /// </summary>
        /// <param name="context">The owning TSCudaContext.</param>
        /// <param name="cudaContext">The target CUDA device context.</param>
        /// <param name="ptx">The compiled PTX image containing the kernel.</param>
        /// <param name="baseName">The base name of the kernel.</param>
        /// <param name="args">The kernel arguments, in the order declared by the generated kernel.</param>
        public static void Invoke(TSCudaContext context, CudaContext cudaContext, byte[] ptx, string baseName, params object[] args)
        {
            ThrowIfAnyTensorInvalid(args);

            var deviceInfo = context.DeviceInfoForContext(cudaContext);

            var allTensors   = args.OfType<NDArray>();
            var firstTensor  = allTensors.First();
            var elementCount = firstTensor.ElementCount();
            var spec         = new ApplySpecialization(allTensors.ToArray());

            ConvertTensorArgs.Convert(cudaContext, spec.Use32BitIndices, args);

            var block = ApplyUtils.GetApplyBlock();
            var grid  = ApplyUtils.GetApplyGrid(deviceInfo, elementCount);

            var fullKernelName = PermutationGenerator.GetMangledName(baseName, spec);
            var kernel         = context.KernelCache.Get(cudaContext, ptx, fullKernelName);

            kernel.GridDimensions  = grid;
            kernel.BlockDimensions = block;
            kernel.RunAsync(CUstream.NullStream, args);
        }
Example #16
        public static Tensor Invoke(CudaReduceKernels reduceKernels, string kernelName, float init, ReduceInitType initType, Tensor result, Tensor src, int dim, object extraArg = null)
        {
            if (src.DimensionCount == 0)
            {
                return result;
            }

            var context     = CudaHelpers.TSContextForTensor(src);
            var cudaContext = context.CudaContextForTensor(src);

            var requiredOutputSize = (long[])src.Sizes.Clone();

            requiredOutputSize[dim] = 1;
            var writeTarget = TensorResultBuilder.GetWriteTarget(result, src, false, requiredOutputSize);

            ThrowIfAnyTensorInvalid(writeTarget, src);

            var inElements      = src.ElementCount();
            var reductionSize   = src.Sizes[dim];
            var reductionStride = src.Strides[dim];
            var outElements     = inElements / reductionSize;
            var contigReduction = reductionStride == 1;

            // We must make sure that when the tensor is passed to the kernel, src.Sizes[dim] is set to 1.
            // This also applies when determining which tensor specialization to use: changing
            // the dimension size to 1 may make the tensor non-contiguous.
            var newSizes = (long[])src.Sizes.Clone();

            newSizes[dim] = 1;
            var srcSlim = new Tensor(newSizes, src.Strides, src.Storage, src.StorageOffset);

            var    config               = new ApplySpecialization(writeTarget, srcSlim);
            object totalSlices          = config.Use32BitIndices ? (uint)outElements : (ulong)outElements;
            object reductionSizeTyped   = config.Use32BitIndices ? (uint)reductionSize : (ulong)reductionSize;
            object reductionStrideTyped = config.Use32BitIndices ? (uint)reductionStride : (ulong)reductionStride;
            var    initValueTyped       = ReduceInitConverter.GetInitValue(init, initType, src.ElementType);

            var ptx = reduceKernels.GetPtx(context.Compiler);

            if (contigReduction)
            {
                var block    = GetContigReduceBlock(cudaContext, outElements, reductionSize);
                var grid     = GetContigReduceGrid(outElements);
                var smemSize = (uint)src.ElementType.Size() * block.x;

                var fullName = "contig_" + PermutationGenerator.GetMangledName(kernelName, config);
                if (extraArg == null)
                {
                    InvokeReduce(context, cudaContext, ptx, fullName, grid, block, smemSize, config, writeTarget, srcSlim, reductionSizeTyped, totalSlices, initValueTyped);
                }
                else
                {
                    InvokeReduce(context, cudaContext, ptx, fullName, grid, block, smemSize, config, writeTarget, srcSlim, reductionSizeTyped, totalSlices, initValueTyped, extraArg);
                }
            }
            else
            {
                var  deviceProps = context.DeviceInfoForContext(cudaContext);
                var  block       = GetNonContigReduceBlock(deviceProps);
                var  grid        = GetNoncontigReduceGrid(deviceProps, outElements);
                uint smemSize    = 0;

                var fullName = "noncontig_" + PermutationGenerator.GetMangledName(kernelName, config);
                if (extraArg == null)
                {
                    InvokeReduce(context, cudaContext, ptx, fullName, grid, block, smemSize, config, writeTarget, srcSlim, reductionStrideTyped, reductionSizeTyped, totalSlices, initValueTyped);
                }
                else
                {
                    InvokeReduce(context, cudaContext, ptx, fullName, grid, block, smemSize, config, writeTarget, srcSlim, reductionStrideTyped, reductionSizeTyped, totalSlices, initValueTyped, extraArg);
                }
            }

            return writeTarget;
        }
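To make the contiguity test concrete, a small sketch with invented sizes (a dense row-major [4, 5] tensor has strides [5, 1]):

            long[] strides = { 5, 1 };  // row-major layout of a [4, 5] tensor
            // Reducing dim 1: strides[1] == 1, so contigReduction is true  -> "contig_" kernel.
            // Reducing dim 0: strides[0] == 5, so contigReduction is false -> "noncontig_" kernel.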
Example #17
        public static Tensor Invoke(CudaReduceAllKernels reduceAllKernels, float init, ReduceInitType initType, string kernelName, Tensor result, Tensor src, object extraArg = null)
        {
            int           deviceId    = CudaHelpers.GetDeviceId(src);
            TSCudaContext context     = CudaHelpers.TSContextForTensor(src);
            CudaContext   cudaContext = context.CudaContextForDevice(deviceId);

            if (src.DimensionCount > TSCudaContext.MaxDims)
            {
                throw new InvalidOperationException("Tensors with dimension count > " + TSCudaContext.MaxDims + " are not supported");
            }

            Tensor writeTarget = TensorResultBuilder.GetWriteTarget(result, src, false, 1);

            if (src.DimensionCount == 0)
            {
                return result;
            }

            long totalElements         = src.ElementCount();
            ApplySpecialization config = new ApplySpecialization(src);
            object totalElementsTyped  = config.Use32BitIndices ? (uint)totalElements : (ulong)totalElements;
            object initValueTyped      = ReduceInitConverter.GetInitValue(init, initType, src.ElementType);

            dim3 grid;
            dim3 block;

            byte[] ptx            = reduceAllKernels.GetPtx(context.Compiler);
            string fullKernelName = PermutationGenerator.GetMangledName(kernelName, config);

            ManagedCuda.BasicTypes.CUdeviceptr outputDevicePtr = CudaHelpers.GetBufferStart(writeTarget);

            if (isTwoPassReductionSize(totalElements))
            {
                getPass1ReduceBlockGrid(context, deviceId, totalElements, out grid, out block);
                uint smemSize = block.x * sizeof(float);

                ManagedCuda.BasicTypes.CUdeviceptr scratchSpace = context.ScratchSpaceForDevice(deviceId).buffer;

                if (extraArg == null)
                {
                    InvokeReduceAll(context, cudaContext, ptx, "twoPassA_" + fullKernelName, grid, block, smemSize, config, src, totalElementsTyped, initValueTyped, scratchSpace);
                }
                else
                {
                    InvokeReduceAll(context, cudaContext, ptx, "twoPassA_" + fullKernelName, grid, block, smemSize, config, src, totalElementsTyped, initValueTyped, scratchSpace, extraArg);
                }

                uint numPass1Blocks = grid.x;
                getPass2ReduceBlockGrid(context, deviceId, totalElements, out grid, out block);
                smemSize = block.x * sizeof(float);

                InvokeReduceAllPass2(context, cudaContext, ptx, "twoPassB_" + fullKernelName, grid, block, smemSize, config.Use32BitIndices, numPass1Blocks, initValueTyped, scratchSpace, outputDevicePtr);
            }
            else
            {
                getSinglePassReduceBlockGrid(totalElements, out grid, out block);
                uint smemSize = block.x * sizeof(float);

                if (extraArg == null)
                {
                    InvokeReduceAll(context, cudaContext, ptx, "onePass_" + fullKernelName, grid, block, smemSize, config, src, totalElementsTyped, initValueTyped, outputDevicePtr);
                }
                else
                {
                    InvokeReduceAll(context, cudaContext, ptx, "onePass_" + fullKernelName, grid, block, smemSize, config, src, totalElementsTyped, initValueTyped, outputDevicePtr, extraArg);
                }
            }

            return writeTarget;
        }
Example #18
        public static void InvokeReduceAll(TSCudaContext context, CudaContext cudaContext, byte[] ptx, string kernelName, dim3 grid, dim3 block, uint smemSize, ApplySpecialization spec, params object[] args)
        {
            ConvertTensorArgs.Convert(cudaContext, spec.Use32BitIndices, args);

            CudaKernel kernel = context.KernelCache.Get(cudaContext, ptx, kernelName);

            kernel.GridDimensions      = grid;
            kernel.BlockDimensions     = block;
            kernel.DynamicSharedMemory = smemSize;

            kernel.Run(args);
        }