public static void Invoke(TSCudaContext context, CudaContext cudaContext, byte[] ptx, string baseName, params object[] args)
        {
            ThrowIfAnyTensorInvalid(args);

            cudaContext.SetCurrent();

            CudaDeviceProperties deviceInfo = context.DeviceInfoForContext(cudaContext);

            // Build the specialization from every tensor argument; it selects 32- vs 64-bit
            // indexing and feeds the kernel name mangling below.
            IEnumerable<Tensor> allTensors = args.OfType<Tensor>();
            Tensor firstTensor       = allTensors.First();
            long   elementCount      = firstTensor.ElementCount();
            ApplySpecialization spec = new ApplySpecialization(allTensors.ToArray());

            ConvertTensorArgs.Convert(cudaContext, spec.Use32BitIndices, args);

            ManagedCuda.VectorTypes.dim3 block = ApplyUtils.GetApplyBlock();
            ManagedCuda.VectorTypes.dim3 grid  = ApplyUtils.GetApplyGrid(deviceInfo, elementCount);

            string     fullKernelName = PermutationGenerator.GetMangledName(baseName, spec);
            CudaKernel kernel         = context.KernelCache.Get(cudaContext, ptx, fullKernelName);

            kernel.GridDimensions  = grid;
            kernel.BlockDimensions = block;
            kernel.RunAsync(CUstream.NullStream, args);
        }
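        // A minimal caller sketch, not part of the original snippet: the wrapper name
        // "InvokeAddSketch" and the "add" base name are assumptions for illustration only.
        // It shows the expected call shape: pre-compiled PTX, the unmangled kernel family name,
        // and the result tensor first so the specialization is derived from the output layout.
        public static void InvokeAddSketch(TSCudaContext context, CudaContext cudaContext, byte[] ptx, Tensor result, Tensor lhs, Tensor rhs)
        {
            // GetMangledName(baseName, spec) appends the specialization suffix to "add" internally.
            Invoke(context, cudaContext, ptx, "add", result, lhs, rhs);
        }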
Example #2
        public static void InvokeReduceAll(TSCudaContext context, CudaContext cudaContext, byte[] ptx, string kernelName, dim3 grid, dim3 block, uint smemSize, ApplySpecialization spec, params object[] args)
        {
            ConvertTensorArgs.Convert(cudaContext, spec.Use32BitIndices, args);

            CudaKernel kernel = context.KernelCache.Get(cudaContext, ptx, kernelName);

            kernel.GridDimensions      = grid;
            kernel.BlockDimensions     = block;
            kernel.DynamicSharedMemory = smemSize;

            kernel.Run(args);
        }
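        // A hedged caller sketch for the reduce-all path, not part of the original snippet:
        // the 256-thread block, the single-block grid, and the float accumulator size are
        // assumptions for illustration. A full reduction typically reserves one shared-memory
        // slot per thread, which is what DynamicSharedMemory is sized for here.
        public static void InvokeReduceAllSketch(TSCudaContext context, CudaContext cudaContext, byte[] ptx, string kernelName, ApplySpecialization spec, params object[] args)
        {
            dim3 block    = new dim3(256, 1, 1);               // assumed threads per block
            dim3 grid     = new dim3(1, 1, 1);                 // assumed single block for the final pass
            uint smemSize = (uint)(256 * sizeof(float));       // one accumulator slot per thread
            InvokeReduceAll(context, cudaContext, ptx, kernelName, grid, block, smemSize, spec, args);
        }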
        private void Invoke(TSCudaContext context, CudaContext cudaContext, string kernelName, dim3 grid, dim3 block, uint smemSize, CUstream stream, bool index32, params object[] args)
        {
            ConvertTensorArgs.Convert(cudaContext, index32, args);

            // Obtain the PTX for this op from the runtime compiler, then fetch the cached kernel.
            byte[]     ptx    = GetPtx(context.Compiler);
            CudaKernel kernel = context.KernelCache.Get(cudaContext, ptx, kernelName);

            kernel.GridDimensions      = grid;
            kernel.BlockDimensions     = block;
            kernel.DynamicSharedMemory = smemSize;
            kernel.RunAsync(stream, args);
        }
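        // A hedged usage sketch, not part of the original snippet: the dedicated CudaStream and
        // the zero bytes of dynamic shared memory are illustrative assumptions. A caller-created
        // stream lets this launch overlap with work queued on other streams; passing
        // CUstream.NullStream instead reproduces the default-stream behaviour used above.
        private void InvokeOnOwnStreamSketch(TSCudaContext context, CudaContext cudaContext, string kernelName, dim3 grid, dim3 block, bool index32, params object[] args)
        {
            using (CudaStream stream = new CudaStream())
            {
                Invoke(context, cudaContext, kernelName, grid, block, 0, stream.Stream, index32, args);
                stream.Synchronize(); // RunAsync returns immediately, so wait before disposing the stream
            }
        }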
Example #4
        /// <summary>
        /// Specializes the named apply kernel for the given NDArray arguments, builds its launch
        /// configuration, and runs it asynchronously on the default stream.
        /// </summary>
        /// <param name="context">The TSCudaContext providing device info and the kernel cache.</param>
        /// <param name="cudaContext">The CUDA context to launch on.</param>
        /// <param name="ptx">The compiled PTX image containing the kernel.</param>
        /// <param name="baseName">The unmangled base name of the kernel.</param>
        /// <param name="args">The kernel arguments; every NDArray among them drives the specialization.</param>
        public static void Invoke(TSCudaContext context, CudaContext cudaContext, byte[] ptx, string baseName, params object[] args)
        {
            ThrowIfAnyTensorInvalid(args);

            var deviceInfo = context.DeviceInfoForContext(cudaContext);

            var allTensors   = args.OfType<NDArray>();
            var firstTensor  = allTensors.First();
            var elementCount = firstTensor.ElementCount();
            var spec         = new ApplySpecialization(allTensors.ToArray());

            ConvertTensorArgs.Convert(cudaContext, spec.Use32BitIndices, args);

            var block = ApplyUtils.GetApplyBlock();
            var grid  = ApplyUtils.GetApplyGrid(deviceInfo, elementCount);

            var fullKernelName = PermutationGenerator.GetMangledName(baseName, spec);
            var kernel         = context.KernelCache.Get(cudaContext, ptx, fullKernelName);

            kernel.GridDimensions  = grid;
            kernel.BlockDimensions = block;
            kernel.RunAsync(CUstream.NullStream, args);
        }
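        // A hedged usage sketch mirroring the call above, not part of the original snippet:
        // the "fill" base name and the scalar operand are assumptions for illustration. The
        // output NDArray is passed first so the ApplySpecialization (index width, layout) is
        // derived from it before the kernel name is mangled.
        public static void InvokeFillSketch(TSCudaContext context, CudaContext cudaContext, byte[] ptx, NDArray result, float value)
        {
            Invoke(context, cudaContext, ptx, "fill", result, value);
        }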