Пример #1
0
        /// <summary>
        /// Determine the maximum number of CTAs that can be run simultaneously per SM.<para/>
        /// This is equivalent to the calculation done in the CUDA Occupancy Calculator
        /// spreadsheet
        /// </summary>
        /// <param name="properties">Device properties: warp size, per-SM and per-block resource limits and compute capability.</param>
        /// <param name="attributes">Kernel attributes: registers per thread (numRegs) and static shared memory size (sharedSizeBytes).</param>
        /// <param name="blockSize">Number of threads per block; must be greater than zero.</param>
        /// <param name="dynamic_smem_bytes">Dynamic shared memory per block in bytes; added to the kernel's static shared memory size.</param>
        /// <param name="state">Device state; its cacheConfig is used to determine the shared memory available per SM.</param>
        /// <returns>
        /// A new <c>cudaOccResult</c> holding the overall number of active blocks per SM, the individual
        /// limits (registers, shared memory, warps, blocks), the limiting factors, the allocated
        /// registers / shared memory per block and the derived warp, thread and occupancy figures.
        /// </returns>
        /// <exception cref="CudaOccupancyException">
        /// Thrown with <c>cudaOccError.ErrorInvalidInput</c> when <paramref name="properties"/>,
        /// <paramref name="attributes"/> or <paramref name="state"/> is null, or when
        /// <paramref name="blockSize"/> is not positive.
        /// </exception>
        public static cudaOccResult cudaOccMaxActiveBlocksPerMultiprocessor(
            cudaOccDeviceProp properties,
            cudaOccFuncAttributes attributes,
            int blockSize,
            SizeT dynamic_smem_bytes,
            cudaOccDeviceState state)
        {
            // state is dereferenced below (state.cacheConfig), so it is validated together with
            // the other inputs instead of failing later with a NullReferenceException.
            if (properties == null || attributes == null || state == null || blockSize <= 0)
            {
                throw new CudaOccupancyException(cudaOccError.ErrorInvalidInput);
            }

            cudaOccResult result = new cudaOccResult();

            //////////////////////////////////////////
            // Limits due to warps/SM or blocks/SM
            //////////////////////////////////////////
            // Each CheckZero call guards the value used as a divisor right after it
            // (CheckZero is defined elsewhere; presumably it throws on zero).
            CudaOccupancyException.CheckZero(properties.warpSize);
            int maxWarpsPerSm          = properties.maxThreadsPerMultiProcessor / properties.warpSize;
            int warpAllocationMultiple = cudaOccWarpAllocationMultiple(properties);

            CudaOccupancyException.CheckZero(warpAllocationMultiple);
            int warpsPerCTA = round_i(divide_ri(blockSize, properties.warpSize), warpAllocationMultiple);

            int maxBlocksPerSM = cudaOccMaxBlocksPerMultiprocessor(properties);

            // Calc limits: a block larger than the device allows cannot run at all.
            CudaOccupancyException.CheckZero(warpsPerCTA);
            int ctaLimitWarps  = (blockSize <= properties.maxThreadsPerBlock) ? maxWarpsPerSm / warpsPerCTA : 0;
            int ctaLimitBlocks = maxBlocksPerSM;

            //////////////////////////////////////////
            // Limits due to shared memory/SM
            //////////////////////////////////////////
            int smemAllocationUnit = cudaOccSMemAllocationUnit(properties);
            int smemBytes          = (int)(attributes.sharedSizeBytes + dynamic_smem_bytes);
            CudaOccupancyException.CheckZero(smemAllocationUnit);
            int smemPerCTA = round_i(smemBytes, smemAllocationUnit);

            // Calc limit
            int cacheConfigSMem = cudaOccSMemPerMultiprocessor(properties, state.cacheConfig);

            // The shared memory size selected by the cache configuration is used when it can hold
            // one block's shared memory; otherwise fall back to the device's shared memory per SM.
            int sharedMemPerMultiprocessor = (cacheConfigSMem >= smemPerCTA)
                                ? cacheConfigSMem
                                : (int)properties.sharedMemPerMultiprocessor;

            // The limit on launched blocks is calculated from the shared memory per SM, but the total
            // shared memory used by the function must also fit within the shared memory per block.
            int ctaLimitSMem = 0;
            if (properties.sharedMemPerBlock >= (SizeT)smemPerCTA)
            {
                ctaLimitSMem = smemPerCTA > 0 ? sharedMemPerMultiprocessor / smemPerCTA : maxBlocksPerSM;
            }

            //////////////////////////////////////////
            // Limits due to registers/SM
            //////////////////////////////////////////
            int regAllocationUnit = cudaOccRegAllocationUnit(properties, attributes.numRegs);
            CudaOccupancyException.CheckZero(regAllocationUnit);

            // Calc limit
            int ctaLimitRegs = 0;
            int regsPerCTA;
            if (properties.major <= 1)
            {
                // GPUs of compute capability 1.x allocate registers to CTAs
                // Number of regs per block is regs per thread times number of warps times warp size, rounded up to allocation unit
                regsPerCTA   = round_i(attributes.numRegs * properties.warpSize * warpsPerCTA, regAllocationUnit);
                ctaLimitRegs = regsPerCTA > 0 ? properties.regsPerMultiprocessor / regsPerCTA : maxBlocksPerSM;
            }
            else
            {
                // GPUs of compute capability 2.x and higher allocate registers to warps
                // Number of regs per warp is regs per thread times warp size, rounded up to allocation unit
                int regsPerWarp = round_i(attributes.numRegs * properties.warpSize, regAllocationUnit);
                regsPerCTA = regsPerWarp * warpsPerCTA;

                // A block needing more registers than a single block may use gets a limit of 0.
                if (properties.regsPerBlock >= regsPerCTA)
                {
                    int numSides = cudaOccSidesPerMultiprocessor(properties);
                    CudaOccupancyException.CheckZero(numSides);
                    int numRegsPerSide = properties.regsPerMultiprocessor / numSides;
                    ctaLimitRegs = regsPerWarp > 0 ? ((numRegsPerSide / regsPerWarp) * numSides) / warpsPerCTA : maxBlocksPerSM;
                }
            }

            //////////////////////////////////////////
            // Overall limit is min() of limits due to above reasons
            //////////////////////////////////////////
            int ctaLimit = min_(ctaLimitRegs, min_(ctaLimitSMem, min_(ctaLimitWarps, ctaLimitBlocks)));

            result.ActiveBlocksPerMultiProcessor = ctaLimit;

            // Determine occupancy limiting factors: every resource whose limit equals the overall
            // limit is flagged. Registers and shared memory only count when the kernel uses them.
            cudaOccLimitingFactors limitingFactors = 0;
            if (ctaLimit == ctaLimitWarps)
            {
                limitingFactors |= cudaOccLimitingFactors.Warps;
            }
            if (ctaLimit == ctaLimitRegs && regsPerCTA > 0)
            {
                limitingFactors |= cudaOccLimitingFactors.Registers;
            }
            if (ctaLimit == ctaLimitSMem && smemPerCTA > 0)
            {
                limitingFactors |= cudaOccLimitingFactors.SharedMemory;
            }
            if (ctaLimit == ctaLimitBlocks)
            {
                limitingFactors |= cudaOccLimitingFactors.Blocks;
            }
            result.LimitingFactors = limitingFactors;

            result.BlockLimitRegs      = ctaLimitRegs;
            result.BlockLimitSharedMem = ctaLimitSMem;
            result.BlockLimitWarps     = ctaLimitWarps;
            result.BlockLimitBlocks    = ctaLimitBlocks;

            result.BllocatedRegistersPerBlock = regsPerCTA;
            result.AllocatedSharedMemPerBlock = smemPerCTA;

            // Active warps use the unrounded warp count of a block (ceil(blockSize / warpSize)).
            result.ActiveWarpsPerMultiProcessor   = ctaLimit * ((int)Math.Ceiling(blockSize / (double)properties.warpSize));
            result.ActiceThreadsPerMultiProcessor = result.ActiveWarpsPerMultiProcessor * properties.warpSize;

            // Avoid 0/0 (NaN cast to int) when the SM cannot hold even a single warp.
            result.OccupancyOfEachMultiProcessor = maxWarpsPerSm > 0
                ? (int)Math.Round(result.ActiveWarpsPerMultiProcessor / (double)maxWarpsPerSm * 100)
                : 0;
            return(result);
        }
Пример #2
0
        ///////////////////////////////////
        //      API Implementations      //
        ///////////////////////////////////


        /// <summary>
        /// Determine the maximum number of CTAs that can be run simultaneously per SM.<para/>
        /// This is equivalent to the calculation done in the CUDA Occupancy Calculator
        /// spreadsheet
        /// </summary>
        /// <param name="result">Object that receives the per-resource block limits, the limiting factors,
        /// the partitioned global caching configuration and the final number of active blocks per SM.
        /// It is also handed to the register and shared memory limit helpers.</param>
        /// <param name="properties">Device properties; validated by cudaOccInputCheck.</param>
        /// <param name="attributes">Kernel attributes; validated by cudaOccInputCheck.</param>
        /// <param name="state">Device state; validated by cudaOccInputCheck.</param>
        /// <param name="blockSize">Block size forwarded to the register, warp and shared memory limit helpers.</param>
        /// <param name="dynamicSmemSize">Dynamic shared memory size forwarded to the shared memory limit helper.</param>
        public static void cudaOccMaxActiveBlocksPerMultiprocessor(
            cudaOccResult result,
            cudaOccDeviceProp properties,
            cudaOccFuncAttributes attributes,
            cudaOccDeviceState state,
            int blockSize,
            SizeT dynamicSmemSize)
        {
            // NOTE(review): unlike properties/attributes/state, neither result nor blockSize is
            // validated in this method - confirm that callers or the helpers below cover that.

            // Check user input
            cudaOccInputCheck(properties, attributes, state);

            // Initialization: partitioned global caching configuration expected for this kernel
            cudaOccPartitionedGCConfig partitionedGC = cudaOccPartitionedGCExpected(properties, attributes);

            // Compute the individual limits. The register limit runs first because it receives the
            // caching configuration by reference (it may turn partitioned global caching off), and
            // the warp limit is then computed with that possibly updated configuration.
            int blocksByRegisters = cudaOccMaxBlocksPerSMRegsLimit(ref partitionedGC, result, properties, attributes, blockSize);
            int blocksByWarps     = cudaOccMaxBlocksPerSMWarpsLimit(partitionedGC, properties, attributes, blockSize);
            int blocksByHardware  = cudaOccMaxBlocksPerMultiprocessor(properties);
            int blocksBySharedMem = cudaOccMaxBlocksPerSMSmemLimit(result, properties, attributes, state, blockSize, dynamicSmemSize);

            // Overall occupancy: the tightest of the four limits
            int activeBlocks = __occMin(blocksByRegisters, __occMin(blocksBySharedMem, __occMin(blocksByWarps, blocksByHardware)));

            // Every resource whose limit equals the overall limit is reported as a limiting factor
            cudaOccLimitingFactors factors = 0;
            if (blocksByWarps == activeBlocks)
            {
                factors |= cudaOccLimitingFactors.Warps;
            }
            if (blocksByRegisters == activeBlocks)
            {
                factors |= cudaOccLimitingFactors.Registers;
            }
            if (blocksBySharedMem == activeBlocks)
            {
                factors |= cudaOccLimitingFactors.SharedMemory;
            }
            if (blocksByHardware == activeBlocks)
            {
                factors |= cudaOccLimitingFactors.Blocks;
            }

            // Fill in the return values
            result.LimitingFactors     = factors;
            result.BlockLimitRegs      = blocksByRegisters;
            result.BlockLimitSharedMem = blocksBySharedMem;
            result.BlockLimitWarps     = blocksByWarps;
            result.BlockLimitBlocks    = blocksByHardware;
            result.partitionedGCConfig = partitionedGC;

            // Final occupancy
            result.ActiveBlocksPerMultiProcessor = activeBlocks;
        }