Example 1
        private static void Run_M_V_float(TSCudaContext context, Tensor result, Tensor mat, Tensor vec)
        {
            // Require lhs to be row-major. This means we must tell BLAS to transpose it (BLAS expects column-major matrices)
            if (mat.Strides[1] != 1)
            {
                throw new ArgumentException("lhs must be contiguous in the last dimension");
            }

            using (Util.PooledObject<CudaBlas> blas = context.BlasForTensor(mat))
            {
                ManagedCuda.BasicTypes.CUdeviceptr yPtr = CudaHelpers.GetBufferStart(result);
                ManagedCuda.BasicTypes.CUdeviceptr aPtr = CudaHelpers.GetBufferStart(mat);
                ManagedCuda.BasicTypes.CUdeviceptr xPtr = CudaHelpers.GetBufferStart(vec);

                Operation trans = Operation.Transpose;
                int       m     = (int)mat.Sizes[1];      // BLAS m = columns of the row-major mat
                int       n     = (int)mat.Sizes[0];      // BLAS n = rows of the row-major mat
                int       incx  = (int)vec.Strides[0];
                int       lda   = (int)mat.Strides[0];    // leading dimension = row stride
                int       incy  = (int)result.Strides[0];
                float     alpha = 1;
                float     beta  = 0;

                CublasStatus status = CudaBlasNativeMethods.cublasSgemv_v2(blas.Value.CublasHandle, trans, m, n, ref alpha, aPtr, lda, xPtr, incx, ref beta, yPtr, incy);
                if (status != CublasStatus.Success)
                {
                    throw new CudaBlasException(status);
                }
            }
        }
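For reference, here is a minimal standalone sketch of the same row-major trick using ManagedCuda's managed CudaBlas wrapper rather than the raw native call. The matrix, vector, and sizes are illustrative, not taken from the original source:

        using ManagedCuda;
        using ManagedCuda.CudaBlas;

        // y = A * x for a row-major 2x3 matrix A. Read column-major with
        // lda = 3, the same buffer is A^T (3x2), so gemv with
        // Operation.Transpose and swapped dimensions computes A * x.
        using (CudaContext ctx = new CudaContext(0))
        using (CudaBlas blas = new CudaBlas())
        {
            float[] a = { 1, 2, 3,
                          4, 5, 6 };  // 2x3, row-major
            float[] x = { 1, 1, 1 };

            CudaDeviceVariable<float> dA = new CudaDeviceVariable<float>(6);
            CudaDeviceVariable<float> dX = new CudaDeviceVariable<float>(3);
            CudaDeviceVariable<float> dY = new CudaDeviceVariable<float>(2);
            dA.CopyToDevice(a);
            dX.CopyToDevice(x);

            // m = columns of A, n = rows of A, lda = row stride of A
            blas.Gemv(Operation.Transpose, 3, 2, 1f, dA, 3, dX, 1, 0f, dY, 1);

            float[] y = new float[2];
            dY.CopyToHost(y);  // y == { 6, 15 }
        }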
Example 2
        private static ScratchSpace AllocScratchSpace(CudaContext context, CudaDeviceProperties deviceProps)
        {
            // Total scratch size: a fixed amount per streaming multiprocessor
            int size = ScratchSpacePerSMStream * deviceProps.MultiProcessorCount;

            ManagedCuda.BasicTypes.CUdeviceptr buffer = context.AllocateMemory(size);
            return new ScratchSpace { size = size, buffer = buffer };
        }
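As a point of reference, a hedged sketch of sizing and allocating such a scratch buffer in isolation. ScratchSpacePerSMStream is a constant of the original class whose value is not shown in this excerpt, so the value below is an assumption:

        using ManagedCuda;

        const int ScratchSpacePerSMStream = 4 * sizeof(float) * 512;  // assumed value

        CudaContext ctx = new CudaContext(0);
        CudaDeviceProperties props = ctx.GetDeviceInfo();

        // One fixed-size scratch slot per streaming multiprocessor
        int size = ScratchSpacePerSMStream * props.MultiProcessorCount;
        ManagedCuda.BasicTypes.CUdeviceptr buffer = ctx.AllocateMemory(size);

        // ... use the buffer as reduction scratch space ...
        ctx.FreeMemory(buffer);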
Example 3
        public static Tensor Invoke(CudaReduceAllKernels reduceAllKernels, float init, ReduceInitType initType, string kernelName, Tensor result, Tensor src, object extraArg = null)
        {
            int           deviceId    = CudaHelpers.GetDeviceId(src);
            TSCudaContext context     = CudaHelpers.TSContextForTensor(src);
            CudaContext   cudaContext = context.CudaContextForDevice(deviceId);

            if (src.DimensionCount > TSCudaContext.MaxDims)
            {
                throw new InvalidOperationException("Tensors with dimension count > " + TSCudaContext.MaxDims + " are not supported");
            }

            Tensor writeTarget = TensorResultBuilder.GetWriteTarget(result, src, false, 1);

            if (src.DimensionCount == 0)
            {
                // Nothing to reduce; return the write target chosen above
                return writeTarget;
            }

            long totalElements         = src.ElementCount();
            ApplySpecialization config = new ApplySpecialization(src);
            object totalElementsTyped  = config.Use32BitIndices ? (uint)totalElements : (ulong)totalElements;
            object initValueTyped      = ReduceInitConverter.GetInitValue(init, initType, src.ElementType);

            dim3 grid;
            dim3 block;

            byte[] ptx            = reduceAllKernels.GetPtx(context.Compiler);
            string fullKernelName = PermutationGenerator.GetMangledName(kernelName, config);

            ManagedCuda.BasicTypes.CUdeviceptr outputDevicePtr = CudaHelpers.GetBufferStart(writeTarget);

            if (isTwoPassReductionSize(totalElements))
            {
                // Two-pass reduction: pass 1 writes one partial result per block
                // into scratch space; pass 2 reduces those partials to one value.
                getPass1ReduceBlockGrid(context, deviceId, totalElements, out grid, out block);
                uint smemSize = block.x * sizeof(float); // one float of shared memory per thread

                ManagedCuda.BasicTypes.CUdeviceptr scratchSpace = context.ScratchSpaceForDevice(deviceId).buffer;

                if (extraArg == null)
                {
                    InvokeReduceAll(context, cudaContext, ptx, "twoPassA_" + fullKernelName, grid, block, smemSize, config, src, totalElementsTyped, initValueTyped, scratchSpace);
                }
                else
                {
                    InvokeReduceAll(context, cudaContext, ptx, "twoPassA_" + fullKernelName, grid, block, smemSize, config, src, totalElementsTyped, initValueTyped, scratchSpace, extraArg);
                }

                uint numPass1Blocks = grid.x;
                getPass2ReduceBlockGrid(context, deviceId, totalElements, out grid, out block);
                smemSize = block.x * sizeof(float);

                InvokeReduceAllPass2(context, cudaContext, ptx, "twoPassB_" + fullKernelName, grid, block, smemSize, config.Use32BitIndices, numPass1Blocks, initValueTyped, scratchSpace, outputDevicePtr);
            }
            else
            {
                // Small inputs: reduce everything in a single kernel launch
                getSinglePassReduceBlockGrid(totalElements, out grid, out block);
                uint smemSize = block.x * sizeof(float); // one float of shared memory per thread

                if (extraArg == null)
                {
                    InvokeReduceAll(context, cudaContext, ptx, "onePass_" + fullKernelName, grid, block, smemSize, config, src, totalElementsTyped, initValueTyped, outputDevicePtr);
                }
                else
                {
                    InvokeReduceAll(context, cudaContext, ptx, "onePass_" + fullKernelName, grid, block, smemSize, config, src, totalElementsTyped, initValueTyped, outputDevicePtr, extraArg);
                }
            }

            return writeTarget;
        }
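The grid/block helpers and the two-pass size test are not part of this excerpt. The following is only a plausible sketch of the usual scheme; the names, constants, and cutoff are assumptions, not the library's actual values:

        // Typical reduce-all layout: pass 1 spreads the input over several
        // blocks, writing one partial result per block to scratch space;
        // pass 2 then reduces those partials with a single block.
        private const uint ReduceBlockSize = 512;   // assumed threads per block
        private const long TwoPassCutoff   = 2048;  // assumed single-pass limit

        private static bool isTwoPassReductionSize(long elements)
        {
            return elements > TwoPassCutoff;
        }

        private static void getSinglePassReduceBlockGrid(long elements, out dim3 grid, out dim3 block)
        {
            grid  = new dim3(1);  // a single block reduces the whole input
            block = new dim3(ReduceBlockSize);
        }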
Example 4
        public void EpsilonPointer(out ManagedCuda.BasicTypes.CUdeviceptr epsilonDevicePointer)
        {
            // Hand out the device pointer of the preallocated epsilon buffer
            epsilonDevicePointer = cudaEpsilon.DevicePointer;
        }
Example 5
        public void SigmaPointer(out ManagedCuda.BasicTypes.CUdeviceptr sigmaDevicePointer)
        {
            // Hand out the device pointer of the preallocated sigma buffer
            sigmaDevicePointer = cudaSigma.DevicePointer;
        }