Example #1
0
        // Shared memory limit
        //
        // Computes how many CTAs fit on one SM given their (static + dynamic)
        // shared memory footprint, honoring the user cache-config preference.
        // Also records the rounded per-CTA allocation in the result.
        private static int cudaOccMaxBlocksPerSMSmemLimit(
            cudaOccResult result,
            cudaOccDeviceProp properties,
            cudaOccFuncAttributes attributes,
            cudaOccDeviceState state,
            int blockSize,
            SizeT dynamicSmemSize)
        {
            // Per-CTA usage: static kernel shared memory plus the dynamic
            // allocation requested at launch, rounded up to the HW granularity.
            int granularity = cudaOccSMemAllocationGranularity(properties);
            SizeT requestedPerCTA = attributes.sharedSizeBytes + dynamicSmemSize;
            SizeT allocatedPerCTA = __occRoundUp((int)requestedPerCTA, (int)granularity);

            // Obtain the user preferred shared memory size. This setting is
            // ignored if the kernel requests more shared memory than preferred.
            SizeT preference = cudaOccSMemPerMultiprocessor(properties, state.cacheConfig);

            int maxBlocks;
            if (allocatedPerCTA > properties.sharedMemPerBlock)
            {
                // A single CTA already exceeds the per-block limit: nothing launches.
                maxBlocks = 0;
            }
            else if (allocatedPerCTA > 0)
            {
                // The user-requested limit applies as long as it is large enough
                // for at least one CTA; otherwise fall back to the HW maximum.
                SizeT smemPerSM = (preference >= allocatedPerCTA)
                    ? preference
                    : properties.sharedMemPerMultiprocessor;
                maxBlocks = (int)(smemPerSM / allocatedPerCTA);
            }
            else
            {
                // No shared memory used: this limiter imposes no bound.
                maxBlocks = int.MaxValue;
            }

            result.AllocatedSharedMemPerBlock = allocatedPerCTA;

            return maxBlocks;
        }
Example #2
0
        /// <summary>
        /// Determine the potential block size that allows the maximum number of CTAs to run on a multiprocessor simultaneously.
        /// </summary>
        /// <param name="properties">Device properties of the target GPU</param>
        /// <param name="attributes">Kernel function attributes</param>
        /// <param name="state">Device state, e.g. the selected cache configuration</param>
        /// <param name="blockSizeToSMem">
        /// A function to convert from block size to dynamic shared memory size.<para/>
        /// e.g.:
        /// If no dynamic shared memory is used: x => 0<para/>
        /// If 4 bytes shared memory per thread is used: x = 4 * x</param>
        /// <returns>The block size with the highest thread occupancy found</returns>
        public static int cudaOccMaxPotentialOccupancyBlockSize(
            cudaOccDeviceProp properties,
            cudaOccFuncAttributes attributes,
            cudaOccDeviceState state,
            del_blockSizeToDynamicSMemSize blockSizeToSMem)
        {
            // Upper bound on resident threads per SM; used for the early exit.
            int occupancyCeiling = properties.maxThreadsPerMultiProcessor;
            int upperBlockSize   = min_(properties.maxThreadsPerBlock, attributes.maxThreadsPerBlock);
            int step             = properties.warpSize;

            int bestBlockSize = 0;
            int bestThreads   = 0;

            // Sweep candidate block sizes from the largest downward, one warp at
            // a time, keeping the size that yields the most resident threads.
            for (int candidate = upperBlockSize; candidate > 0; candidate -= step)
            {
                cudaOccResult res = cudaOccMaxActiveBlocksPerMultiprocessor(
                    properties, attributes, candidate, blockSizeToSMem(candidate), state);
                int threads = candidate * res.ActiveBlocksPerMultiProcessor;

                if (threads > bestThreads)
                {
                    bestBlockSize = candidate;
                    bestThreads   = threads;
                }

                // The SM is fully occupied; no candidate can do better.
                if (bestThreads == occupancyCeiling)
                {
                    break;
                }
            }

            return bestBlockSize;
        }
Example #3
0
        /// <summary>
        /// Determine the maximum number of CTAs that can be run simultaneously per SM.<para/>
        /// This is equivalent to the calculation done in the CUDA Occupancy Calculator
        /// spreadsheet
        /// </summary>
        /// <param name="properties">Device properties of the target GPU; must not be null</param>
        /// <param name="attributes">Kernel function attributes; must not be null</param>
        /// <param name="blockSize">Threads per block to evaluate; must be greater than 0</param>
        /// <param name="dynamic_smem_bytes">Dynamically allocated shared memory per block, in bytes</param>
        /// <param name="state">Device state, e.g. the selected cache configuration</param>
        /// <returns>Occupancy result: active blocks per SM, the limiting factors, and the per-limit block counts</returns>
        public static cudaOccResult cudaOccMaxActiveBlocksPerMultiprocessor(
            cudaOccDeviceProp properties,
            cudaOccFuncAttributes attributes,
            int blockSize,
            SizeT dynamic_smem_bytes,
            cudaOccDeviceState state)
        {
            int regAllocationUnit = 0, warpAllocationMultiple = 0, maxBlocksPerSM = 0;
            int ctaLimitWarps = 0, ctaLimitBlocks = 0, smemPerCTA = 0, smemBytes = 0, smemAllocationUnit = 0;
            int cacheConfigSMem = 0, sharedMemPerMultiprocessor = 0, ctaLimitRegs = 0, regsPerCTA = 0;
            int regsPerWarp = 0, numSides = 0, numRegsPerSide = 0, ctaLimit = 0;
            int maxWarpsPerSm = 0, warpsPerCTA = 0, ctaLimitSMem = 0;
            cudaOccLimitingFactors limitingFactors = 0;
            cudaOccResult          result          = new cudaOccResult();

            if (properties == null || attributes == null || blockSize <= 0)
            {
                throw new CudaOccupancyException(cudaOccError.ErrorInvalidInput);
            }

            //////////////////////////////////////////
            // Limits due to warps/SM or blocks/SM
            //////////////////////////////////////////
            CudaOccupancyException.CheckZero(properties.warpSize);
            maxWarpsPerSm          = properties.maxThreadsPerMultiProcessor / properties.warpSize;
            warpAllocationMultiple = cudaOccWarpAllocationMultiple(properties);

            // Warps per CTA: block size in warps, rounded up to the HW warp allocation multiple.
            CudaOccupancyException.CheckZero(warpAllocationMultiple);
            warpsPerCTA = round_i(divide_ri(blockSize, properties.warpSize), warpAllocationMultiple);

            maxBlocksPerSM = cudaOccMaxBlocksPerMultiprocessor(properties);

            // Calc limits; a block size above the per-block thread limit cannot launch at all.
            CudaOccupancyException.CheckZero(warpsPerCTA);
            ctaLimitWarps  = (blockSize <= properties.maxThreadsPerBlock) ? maxWarpsPerSm / warpsPerCTA : 0;
            ctaLimitBlocks = maxBlocksPerSM;

            //////////////////////////////////////////
            // Limits due to shared memory/SM
            //////////////////////////////////////////
            smemAllocationUnit = cudaOccSMemAllocationUnit(properties);
            // Static kernel shared memory plus the dynamic launch-time allocation.
            smemBytes          = (int)(attributes.sharedSizeBytes + dynamic_smem_bytes);
            CudaOccupancyException.CheckZero(smemAllocationUnit);
            smemPerCTA = round_i(smemBytes, smemAllocationUnit);

            // Calc limit
            cacheConfigSMem = cudaOccSMemPerMultiprocessor(properties, state.cacheConfig);

            // sharedMemoryPerMultiprocessor is by default limit set in hardware but user requested shared memory
            // limit is used instead if it is greater than total shared memory used by function .
            sharedMemPerMultiprocessor = (cacheConfigSMem >= smemPerCTA)
                                ? cacheConfigSMem
                                : (int)properties.sharedMemPerMultiprocessor;
            // Limit on blocks launched should be calculated with shared memory per SM but total shared memory
            // used by function should be limited by shared memory per block
            ctaLimitSMem = 0;
            if (properties.sharedMemPerBlock >= (SizeT)smemPerCTA)
            {
                // smemPerCTA == 0 means shared memory does not limit occupancy at all.
                ctaLimitSMem = smemPerCTA > 0 ? sharedMemPerMultiprocessor / smemPerCTA : maxBlocksPerSM;
            }

            //////////////////////////////////////////
            // Limits due to registers/SM
            //////////////////////////////////////////
            regAllocationUnit = cudaOccRegAllocationUnit(properties, attributes.numRegs);
            CudaOccupancyException.CheckZero(regAllocationUnit);

            // Calc limit
            ctaLimitRegs = 0;
            if (properties.major <= 1)
            {
                // GPUs of compute capability 1.x allocate registers to CTAs
                // Number of regs per block is regs per thread times number of warps times warp size, rounded up to allocation unit
                regsPerCTA   = round_i(attributes.numRegs * properties.warpSize * warpsPerCTA, regAllocationUnit);
                ctaLimitRegs = regsPerCTA > 0 ? properties.regsPerMultiprocessor / regsPerCTA : maxBlocksPerSM;
            }
            else
            {
                // GPUs of compute capability 2.x and higher allocate registers to warps
                // Number of regs per warp is regs per thread times number of warps times warp size, rounded up to allocation unit
                regsPerWarp = round_i(attributes.numRegs * properties.warpSize, regAllocationUnit);
                regsPerCTA  = regsPerWarp * warpsPerCTA;
                if (properties.regsPerBlock >= regsPerCTA)
                {
                    // Registers are spread over the SM "sides" (scheduler partitions);
                    // each side gets an equal share of the register file.
                    numSides = cudaOccSidesPerMultiprocessor(properties);
                    CudaOccupancyException.CheckZero(numSides);
                    numRegsPerSide = properties.regsPerMultiprocessor / numSides;
                    ctaLimitRegs   = regsPerWarp > 0 ? ((numRegsPerSide / regsPerWarp) * numSides) / warpsPerCTA : maxBlocksPerSM;
                }
            }

            //////////////////////////////////////////
            // Overall limit is min() of limits due to above reasons
            //////////////////////////////////////////
            ctaLimit = min_(ctaLimitRegs, min_(ctaLimitSMem, min_(ctaLimitWarps, ctaLimitBlocks)));
            // Determine occupancy limiting factors


            result.ActiveBlocksPerMultiProcessor = ctaLimit;

            // Every limit equal to the overall minimum is reported as binding.
            if (ctaLimit == ctaLimitWarps)
            {
                limitingFactors |= cudaOccLimitingFactors.Warps;
            }
            if (ctaLimit == ctaLimitRegs && regsPerCTA > 0)
            {
                limitingFactors |= cudaOccLimitingFactors.Registers;
            }
            if (ctaLimit == ctaLimitSMem && smemPerCTA > 0)
            {
                limitingFactors |= cudaOccLimitingFactors.SharedMemory;
            }
            if (ctaLimit == ctaLimitBlocks)
            {
                limitingFactors |= cudaOccLimitingFactors.Blocks;
            }
            result.LimitingFactors = limitingFactors;

            result.BlockLimitRegs      = ctaLimitRegs;
            result.BlockLimitSharedMem = ctaLimitSMem;
            result.BlockLimitWarps     = ctaLimitWarps;
            result.BlockLimitBlocks    = ctaLimitBlocks;

            // NOTE(review): "Bllocated"/"Actice" follow the property names as used
            // elsewhere in this file; presumably declared this way on cudaOccResult.
            result.BllocatedRegistersPerBlock = regsPerCTA;
            result.AllocatedSharedMemPerBlock = smemPerCTA;

            // Active warps: block count times the block size rounded up to whole warps.
            result.ActiveWarpsPerMultiProcessor   = ctaLimit * ((int)Math.Ceiling(blockSize / (double)properties.warpSize));
            result.ActiceThreadsPerMultiProcessor = result.ActiveWarpsPerMultiProcessor * properties.warpSize;
            result.OccupancyOfEachMultiProcessor  = (int)Math.Round(result.ActiveWarpsPerMultiProcessor / (double)maxWarpsPerSm * 100);
            return(result);
        }
Example #4
0
		/// <summary>
		/// Determine the maximum number of CTAs that can be run simultaneously per SM.<para/>
		/// This mirrors the computation performed by the CUDA Occupancy Calculator
		/// spreadsheet.
		/// </summary>
		/// <param name="properties">Device properties of the target GPU; must not be null</param>
		/// <param name="attributes">Kernel function attributes; must not be null</param>
		/// <param name="blockSize">Threads per block to evaluate; must be greater than 0</param>
		/// <param name="dynamic_smem_bytes">Dynamically allocated shared memory per block, in bytes</param>
		/// <param name="state">Device state, e.g. the selected cache configuration</param>
		/// <returns>Occupancy result with the per-limit block counts and limiting factors</returns>
		public static cudaOccResult cudaOccMaxActiveBlocksPerMultiprocessor(
			cudaOccDeviceProp properties,
			cudaOccFuncAttributes attributes,
			int blockSize,
			SizeT dynamic_smem_bytes,
			cudaOccDeviceState state)
		{
			if (properties == null || attributes == null || blockSize <= 0)
			{
				throw new CudaOccupancyException(cudaOccError.ErrorInvalidInput);
			}

			cudaOccResult outcome = new cudaOccResult();

			//////////////////////////////////////////
			// Limits due to warps/SM or blocks/SM
			//////////////////////////////////////////
			CudaOccupancyException.CheckZero(properties.warpSize);
			int warpsPerSmMax = properties.maxThreadsPerMultiProcessor / properties.warpSize;
			int warpMultiple  = cudaOccWarpAllocationMultiple(properties);

			// Warps per CTA, rounded up to the HW warp allocation multiple.
			CudaOccupancyException.CheckZero(warpMultiple);
			int ctaWarps = round_i(divide_ri(blockSize, properties.warpSize), warpMultiple);

			int smBlockCap = cudaOccMaxBlocksPerMultiprocessor(properties);

			// A block size above the per-block thread limit cannot launch at all.
			CudaOccupancyException.CheckZero(ctaWarps);
			int limitWarps  = (blockSize <= properties.maxThreadsPerBlock) ? warpsPerSmMax / ctaWarps : 0;
			int limitBlocks = smBlockCap;

			//////////////////////////////////////////
			// Limits due to shared memory/SM
			//////////////////////////////////////////
			int smemUnit  = cudaOccSMemAllocationUnit(properties);
			int smemBytes = (int)(attributes.sharedSizeBytes + dynamic_smem_bytes);
			CudaOccupancyException.CheckZero(smemUnit);
			int ctaSmem = round_i(smemBytes, smemUnit);

			int cacheSmem = cudaOccSMemPerMultiprocessor(properties, state.cacheConfig);

			// The hardware per-SM limit applies unless the user-requested (cache
			// config) amount already covers the per-CTA usage.
			int smSmem = (cacheSmem >= ctaSmem) ? cacheSmem : (int)properties.sharedMemPerMultiprocessor;

			// Block count is limited by shared memory per SM, while the per-CTA
			// usage is bounded by shared memory per block.
			int limitSmem = 0;
			if (properties.sharedMemPerBlock >= (SizeT)ctaSmem)
			{
				limitSmem = ctaSmem > 0 ? smSmem / ctaSmem : smBlockCap;
			}

			//////////////////////////////////////////
			// Limits due to registers/SM
			//////////////////////////////////////////
			int regUnit = cudaOccRegAllocationUnit(properties, attributes.numRegs);
			CudaOccupancyException.CheckZero(regUnit);

			int limitRegs = 0;
			int ctaRegs;
			if (properties.major <= 1)
			{
				// Compute capability 1.x allocates registers per CTA:
				// regs/thread x warp size x warps/CTA, rounded to the allocation unit.
				ctaRegs   = round_i(attributes.numRegs * properties.warpSize * ctaWarps, regUnit);
				limitRegs = ctaRegs > 0 ? properties.regsPerMultiprocessor / ctaRegs : smBlockCap;
			}
			else
			{
				// Compute capability 2.x+ allocates registers per warp, spread
				// over the SM "sides" (scheduler partitions).
				int warpRegs = round_i(attributes.numRegs * properties.warpSize, regUnit);
				ctaRegs = warpRegs * ctaWarps;
				if (properties.regsPerBlock >= ctaRegs)
				{
					int sides = cudaOccSidesPerMultiprocessor(properties);
					CudaOccupancyException.CheckZero(sides);
					int sideRegs = properties.regsPerMultiprocessor / sides;
					limitRegs = warpRegs > 0 ? ((sideRegs / warpRegs) * sides) / ctaWarps : smBlockCap;
				}
			}

			//////////////////////////////////////////
			// Overall limit is min() of the limits above
			//////////////////////////////////////////
			int ctaLimit = min_(limitRegs, min_(limitSmem, min_(limitWarps, limitBlocks)));

			outcome.ActiveBlocksPerMultiProcessor = ctaLimit;

			// Report every limit that is binding.
			cudaOccLimitingFactors factors = 0;
			if (ctaLimit == limitWarps)
			{
				factors |= cudaOccLimitingFactors.Warps;
			}
			if (ctaLimit == limitRegs && ctaRegs > 0)
			{
				factors |= cudaOccLimitingFactors.Registers;
			}
			if (ctaLimit == limitSmem && ctaSmem > 0)
			{
				factors |= cudaOccLimitingFactors.SharedMemory;
			}
			if (ctaLimit == limitBlocks)
			{
				factors |= cudaOccLimitingFactors.Blocks;
			}
			outcome.LimitingFactors = factors;

			outcome.BlockLimitRegs      = limitRegs;
			outcome.BlockLimitSharedMem = limitSmem;
			outcome.BlockLimitWarps     = limitWarps;
			outcome.BlockLimitBlocks    = limitBlocks;

			outcome.BllocatedRegistersPerBlock = ctaRegs;
			outcome.AllocatedSharedMemPerBlock = ctaSmem;

			// Active warps: block count times block size rounded up to whole warps.
			outcome.ActiveWarpsPerMultiProcessor   = ctaLimit * ((int)Math.Ceiling(blockSize / (double)properties.warpSize));
			outcome.ActiceThreadsPerMultiProcessor = outcome.ActiveWarpsPerMultiProcessor * properties.warpSize;
			outcome.OccupancyOfEachMultiProcessor  = (int)Math.Round(outcome.ActiveWarpsPerMultiProcessor / (double)warpsPerSmMax * 100);
			return outcome;
		}
Example #5
0
        /// <summary>
        /// Suggest the block size that maximizes multiprocessor occupancy, and the
        /// minimum grid size needed for a full-machine launch at that block size.
        /// </summary>
        /// <param name="minGridSize">Receives the minimum grid size achieving a full machine launch</param>
        /// <param name="blockSize">Receives the occupancy-maximizing block size</param>
        /// <param name="properties">Device properties of the target GPU</param>
        /// <param name="attributes">Kernel function attributes</param>
        /// <param name="state">Device state, e.g. the selected cache configuration</param>
        /// <param name="blockSizeToDynamicSMemSize">Optional map from block size to dynamic shared memory; when non-null it overrides dynamicSMemSize</param>
        /// <param name="dynamicSMemSize">Fixed dynamic shared memory per block, used when no mapping is given</param>
        public static void cudaOccMaxPotentialOccupancyBlockSize(
            ref int minGridSize,
            ref int blockSize,
            cudaOccDeviceProp properties,
            cudaOccFuncAttributes attributes,
            cudaOccDeviceState state,
            del_blockSizeToDynamicSMemSize blockSizeToDynamicSMemSize,
            SizeT dynamicSMemSize)
        {
            ///////////////////////////
            // Check user input
            ///////////////////////////
            cudaOccInputCheck(properties, attributes, state);

            cudaOccResult scratch = new cudaOccResult();

            /////////////////////////////////////////////////////////////////////////////////
            // Try each block size, and pick the block size with maximum occupancy
            /////////////////////////////////////////////////////////////////////////////////
            int threadLimitPerSM = properties.maxThreadsPerMultiProcessor;
            int warpStep         = properties.warpSize;

            int sizeLimit        = __occMin(properties.maxThreadsPerBlock, attributes.maxThreadsPerBlock);
            int sizeLimitAligned = __occRoundUp(sizeLimit, warpStep);

            // Best candidate found so far.
            int bestSize    = 0;
            int bestBlocks  = 0;
            int bestThreads = 0;

            // Walk warp-aligned candidates downward; clamp each to the true limit.
            for (int aligned = sizeLimitAligned; aligned > 0; aligned -= warpStep)
            {
                int trySize = __occMin(sizeLimit, aligned);

                // A user-supplied mapping takes precedence over the fixed size.
                if (blockSizeToDynamicSMemSize != null)
                {
                    dynamicSMemSize = blockSizeToDynamicSMemSize(trySize);
                }

                cudaOccMaxActiveBlocksPerMultiprocessor(
                    scratch,
                    properties,
                    attributes,
                    state,
                    trySize,
                    dynamicSMemSize);

                int blocks  = scratch.ActiveBlocksPerMultiProcessor;
                int threads = trySize * blocks;

                if (threads > bestThreads)
                {
                    bestSize    = trySize;
                    bestBlocks  = blocks;
                    bestThreads = threads;
                }

                // Full occupancy reached; no better candidate exists.
                if (threadLimitPerSM == bestThreads)
                {
                    break;
                }
            }

            ///////////////////////////
            // Return best available
            ///////////////////////////

            // Suggested min grid size to achieve a full machine launch.
            minGridSize = bestBlocks * properties.numSms;
            blockSize   = bestSize;
        }
Example #6
0
        ///////////////////////////////////
        //      API Implementations      //
        ///////////////////////////////////


        /// <summary>
        /// Determine the maximum number of CTAs that can be run simultaneously per SM.<para/>
        /// Equivalent to the calculation done in the CUDA Occupancy Calculator
        /// spreadsheet.
        /// </summary>
        /// <param name="result">Receives the computed occupancy values</param>
        /// <param name="properties">Device properties of the target GPU</param>
        /// <param name="attributes">Kernel function attributes</param>
        /// <param name="state">Device state, e.g. the selected cache configuration</param>
        /// <param name="blockSize">Threads per block to evaluate</param>
        /// <param name="dynamicSmemSize">Dynamic shared memory per block, in bytes</param>
        /// <returns></returns>
        public static void cudaOccMaxActiveBlocksPerMultiprocessor(
            cudaOccResult result,
            cudaOccDeviceProp properties,
            cudaOccFuncAttributes attributes,
            cudaOccDeviceState state,
            int blockSize,
            SizeT dynamicSmemSize)
        {
            ///////////////////////////
            // Check user input
            ///////////////////////////
            cudaOccInputCheck(properties, attributes, state);

            ///////////////////////////
            // Initialization
            ///////////////////////////

            // Expected partitioned global caching mode; the register-limit
            // computation below may turn it off when a CTA cannot fit with it on.
            cudaOccPartitionedGCConfig gcConfig = cudaOccPartitionedGCExpected(properties, attributes);

            ///////////////////////////
            // Compute occupancy
            ///////////////////////////

            // Limits due to registers/SM (may update gcConfig via ref),
            // then warps/SM, blocks/SM and shared memory/SM.
            int limitRegs   = cudaOccMaxBlocksPerSMRegsLimit(ref gcConfig, result, properties, attributes, blockSize);
            int limitWarps  = cudaOccMaxBlocksPerSMWarpsLimit(gcConfig, properties, attributes, blockSize);
            int limitBlocks = cudaOccMaxBlocksPerMultiprocessor(properties);
            int limitSMem   = cudaOccMaxBlocksPerSMSmemLimit(result, properties, attributes, state, blockSize, dynamicSmemSize);

            ///////////////////////////
            // Overall occupancy
            ///////////////////////////

            // The achievable block count is the tightest of the four limits.
            int ctaLimit = __occMin(limitRegs, __occMin(limitSMem, __occMin(limitWarps, limitBlocks)));

            // Record every limit that is binding.
            cudaOccLimitingFactors factors = 0;
            if (ctaLimit == limitWarps)
            {
                factors |= cudaOccLimitingFactors.Warps;
            }
            if (ctaLimit == limitRegs)
            {
                factors |= cudaOccLimitingFactors.Registers;
            }
            if (ctaLimit == limitSMem)
            {
                factors |= cudaOccLimitingFactors.SharedMemory;
            }
            if (ctaLimit == limitBlocks)
            {
                factors |= cudaOccLimitingFactors.Blocks;
            }

            // Fill in the return values.
            result.LimitingFactors     = factors;
            result.BlockLimitRegs      = limitRegs;
            result.BlockLimitSharedMem = limitSMem;
            result.BlockLimitWarps     = limitWarps;
            result.BlockLimitBlocks    = limitBlocks;
            result.partitionedGCConfig = gcConfig;

            // Final occupancy.
            result.ActiveBlocksPerMultiProcessor = ctaLimit;
        }
Example #7
0
        // Register limit
        //
        // Computes the per-SM CTA count permitted by register usage (compute
        // capability 2.x+ style: registers allocated per warp, per sub-partition),
        // possibly disabling partitioned global caching via the ref gcConfig when
        // the CTA cannot fit with it enabled. Records the per-CTA register
        // allocation in the result.
        private static int cudaOccMaxBlocksPerSMRegsLimit(
            ref cudaOccPartitionedGCConfig gcConfig,
            cudaOccResult result,
            cudaOccDeviceProp properties,
            cudaOccFuncAttributes attributes,
            int blockSize)
        {
            int allocationGranularity;
            int warpsAllocatedPerCTA;
            int regsAllocatedPerCTA;
            int regsAssumedPerCTA;
            int regsPerWarp;
            int regsAllocatedPerWarp;
            int numSubPartitions;
            int numRegsPerSubPartition;
            int numWarpsPerSubPartition;
            int numWarpsPerSM;
            int maxBlocks;

            allocationGranularity = cudaOccRegAllocationGranularity(
                properties,
                attributes.numRegs);                   // Fermi requires special handling of certain register usage

            numSubPartitions = cudaOccSubPartitionsPerMultiprocessor(properties);

            warpsAllocatedPerCTA = __occDivideRoundUp(blockSize, properties.warpSize);

            // GPUs of compute capability 2.x and higher allocate registers to warps
            //
            // Number of regs per warp is regs per thread x warp size, rounded up to
            // register allocation granularity
            //
            regsPerWarp          = attributes.numRegs * properties.warpSize;
            regsAllocatedPerWarp = __occRoundUp(regsPerWarp, allocationGranularity);
            regsAllocatedPerCTA  = regsAllocatedPerWarp * warpsAllocatedPerCTA;

            // Hardware verifies if a launch fits the per-CTA register limit. For
            // historical reasons, the verification logic assumes register
            // allocations are made to all partitions simultaneously. Therefore, to
            // simulate the hardware check, the warp allocation needs to be rounded
            // up to the number of partitions.
            //
            regsAssumedPerCTA = regsAllocatedPerWarp * __occRoundUp(warpsAllocatedPerCTA, numSubPartitions);

            if (properties.regsPerBlock < regsAssumedPerCTA ||               // Hardware check
                properties.regsPerBlock < regsAllocatedPerCTA)               // Software check
            {
                // The CTA exceeds the per-block register budget: nothing launches.
                maxBlocks = 0;
            }
            else
            {
                if (regsAllocatedPerWarp > 0)
                {
                    // Registers are allocated in each sub-partition. The max number
                    // of warps that can fit on an SM is equal to the max number of
                    // warps per sub-partition x number of sub-partitions.
                    //
                    numRegsPerSubPartition  = properties.regsPerMultiprocessor / numSubPartitions;
                    numWarpsPerSubPartition = numRegsPerSubPartition / regsAllocatedPerWarp;

                    maxBlocks = 0;

                    if (gcConfig != cudaOccPartitionedGCConfig.Off)
                    {
                        int numSubPartitionsPerSmPartition;
                        int numWarpsPerSmPartition;
                        int maxBlocksPerSmPartition;

                        // If partitioned global caching is on, then a CTA can only
                        // use a half SM, and thus a half of the registers available
                        // per SM
                        //
                        numSubPartitionsPerSmPartition = numSubPartitions / 2;
                        numWarpsPerSmPartition         = numWarpsPerSubPartition * numSubPartitionsPerSmPartition;
                        maxBlocksPerSmPartition        = numWarpsPerSmPartition / warpsAllocatedPerCTA;
                        maxBlocks = maxBlocksPerSmPartition * 2;
                    }

                    // Try again if partitioned global caching is not enabled, or if
                    // the CTA cannot fit on the SM with caching on. In the latter
                    // case, the device will automatically turn off caching, except
                    // if the device forces it. The user can also override this
                    // assumption with PARTITIONED_GC_ON_STRICT to calculate
                    // occupancy and launch configuration.
                    //
                    {
                        bool gcOff         = (gcConfig == cudaOccPartitionedGCConfig.Off);
                        bool zeroOccupancy = (maxBlocks == 0);
                        bool cachingForced = (gcConfig == cudaOccPartitionedGCConfig.OnStrict ||
                                              cudaOccPartitionedGCForced(properties));

                        if (gcOff || (zeroOccupancy && (!cachingForced)))
                        {
                            // Fall back to the full SM: report caching as off
                            // (observable via the ref parameter) and use all warps.
                            gcConfig      = cudaOccPartitionedGCConfig.Off;
                            numWarpsPerSM = numWarpsPerSubPartition * numSubPartitions;
                            maxBlocks     = numWarpsPerSM / warpsAllocatedPerCTA;
                        }
                    }
                }
                else
                {
                    // No registers used: this limiter imposes no bound.
                    maxBlocks = int.MaxValue;
                }
            }


            result.AllocatedRegistersPerBlock = regsAllocatedPerCTA;

            return(maxBlocks);
        }
Example #8
0
        // Shared memory limit
        //
        // Returns the per-SM CTA count permitted by shared memory usage and
        // records the rounded per-CTA allocation in the result.
        private static int cudaOccMaxBlocksPerSMSmemLimit(
            cudaOccResult result,
            cudaOccDeviceProp properties,
            cudaOccFuncAttributes attributes,
            cudaOccDeviceState state,
            int blockSize,
            SizeT dynamicSmemSize)
        {
            int granule = cudaOccSMemAllocationGranularity(properties);

            // User-preferred per-SM shared memory (from the cache config);
            // ignored when the kernel needs more than the preference provides.
            SizeT preferredPerSM = cudaOccSMemPerMultiprocessor(properties, state.cacheConfig);

            // Static + dynamic usage, rounded up to the allocation granularity.
            SizeT neededPerCTA  = attributes.sharedSizeBytes + dynamicSmemSize;
            SizeT roundedPerCTA = __occRoundUp((int)neededPerCTA, (int)granule);

            int maxBlocks;
            if (roundedPerCTA > properties.sharedMemPerBlock)
            {
                // Exceeds the per-block hardware limit: nothing can launch.
                maxBlocks = 0;
            }
            else
            {
                // Honor the user preference only when at least one CTA fits in it;
                // otherwise use the full hardware capacity.
                SizeT capacityPerSM = (preferredPerSM >= roundedPerCTA)
                                      ? preferredPerSM
                                      : properties.sharedMemPerMultiprocessor;

                maxBlocks = (roundedPerCTA > 0)
                            ? (int)(capacityPerSM / roundedPerCTA)
                            : int.MaxValue;
            }

            result.AllocatedSharedMemPerBlock = roundedPerCTA;

            return maxBlocks;
        }
Пример #9
0
        /// <summary>
        /// Registers-per-SM limit on resident CTAs: how many CTAs of
        /// <paramref name="blockSize"/> threads fit within the SM register file.
        /// May downgrade <paramref name="gcConfig"/> to Off when partitioned
        /// global caching produces zero occupancy and caching is not forced.
        /// </summary>
        /// <param name="gcConfig">Partitioned global caching mode; updated in place.</param>
        /// <param name="result">Receives AllocatedRegistersPerBlock.</param>
        /// <param name="properties">Device properties (register file size, warp size, partitions).</param>
        /// <param name="attributes">Kernel function attributes (per-thread register count).</param>
        /// <param name="blockSize">Threads per block.</param>
        /// <returns>Maximum CTAs per SM permitted by register usage.</returns>
        private static int cudaOccMaxBlocksPerSMRegsLimit(
			ref cudaOccPartitionedGCConfig  gcConfig,
			cudaOccResult         result,
			cudaOccDeviceProp     properties,
			cudaOccFuncAttributes attributes,
			int                   blockSize)
        {
            int allocationGranularity;
            int warpsAllocatedPerCTA;
            int regsAllocatedPerCTA;
            int regsAssumedPerCTA;
            int regsPerWarp;
            int regsAllocatedPerWarp;
            int numSubPartitions;
            int numRegsPerSubPartition;
            int numWarpsPerSubPartition;
            int numWarpsPerSM;
            int maxBlocks;

            allocationGranularity = cudaOccRegAllocationGranularity(
                properties,
                attributes.numRegs);   // Fermi requires special handling of certain register usage

            numSubPartitions = cudaOccSubPartitionsPerMultiprocessor(properties);

            // Partial warps still occupy a full warp's worth of resources.
            warpsAllocatedPerCTA = __occDivideRoundUp(blockSize, properties.warpSize);

            // GPUs of compute capability 2.x and higher allocate registers to warps
            //
            // Number of regs per warp is regs per thread x warp size, rounded up to
            // register allocation granularity
            //
            regsPerWarp          = attributes.numRegs * properties.warpSize;
            regsAllocatedPerWarp = __occRoundUp(regsPerWarp, allocationGranularity);
            regsAllocatedPerCTA  = regsAllocatedPerWarp * warpsAllocatedPerCTA;

            // Hardware verifies if a launch fits the per-CTA register limit. For
            // historical reasons, the verification logic assumes register
            // allocations are made to all partitions simultaneously. Therefore, to
            // simulate the hardware check, the warp allocation needs to be rounded
            // up to the number of partitions.
            //
            regsAssumedPerCTA = regsAllocatedPerWarp * __occRoundUp(warpsAllocatedPerCTA, numSubPartitions);

            if (properties.regsPerBlock < regsAssumedPerCTA ||   // Hardware check
                properties.regsPerBlock < regsAllocatedPerCTA) { // Software check
                maxBlocks = 0;
            }
            else {
                if (regsAllocatedPerWarp > 0) {
                    // Registers are allocated in each sub-partition. The max number
                    // of warps that can fit on an SM is equal to the max number of
                    // warps per sub-partition x number of sub-partitions.
                    //
                    numRegsPerSubPartition  = properties.regsPerMultiprocessor / numSubPartitions;
                    numWarpsPerSubPartition = numRegsPerSubPartition / regsAllocatedPerWarp;

                    // Assume zero occupancy until one of the branches below
                    // establishes a valid configuration.
                    maxBlocks = 0;

                    if (gcConfig != cudaOccPartitionedGCConfig.Off) {
                        int numSubPartitionsPerSmPartition;
                        int numWarpsPerSmPartition;
                        int maxBlocksPerSmPartition;

                        // If partitioned global caching is on, then a CTA can only
                        // use a half SM, and thus a half of the registers available
                        // per SM
                        //
                        numSubPartitionsPerSmPartition = numSubPartitions / 2;
                        numWarpsPerSmPartition         = numWarpsPerSubPartition * numSubPartitionsPerSmPartition;
                        maxBlocksPerSmPartition        = numWarpsPerSmPartition / warpsAllocatedPerCTA;
                        // Both halves of the SM contribute CTAs.
                        maxBlocks                      = maxBlocksPerSmPartition * 2;
                    }

                    // Try again if partitioned global caching is not enabled, or if
                    // the CTA cannot fit on the SM with caching on. In the latter
                    // case, the device will automatically turn off caching, except
                    // if the device forces it. The user can also override this
                    // assumption with PARTITIONED_GC_ON_STRICT to calculate
                    // occupancy and launch configuration.
                    //
                    {
                        bool gcOff = (gcConfig == cudaOccPartitionedGCConfig.Off);
                        bool zeroOccupancy = (maxBlocks == 0);
                        bool cachingForced = (gcConfig == cudaOccPartitionedGCConfig.OnStrict ||
                                             cudaOccPartitionedGCForced(properties));

                        if (gcOff || (zeroOccupancy && (!cachingForced))) {
                            // Recompute with caching off; the caller observes the
                            // updated gcConfig through the ref parameter.
                            gcConfig = cudaOccPartitionedGCConfig.Off;
                            numWarpsPerSM = numWarpsPerSubPartition * numSubPartitions;
                            maxBlocks     = numWarpsPerSM / warpsAllocatedPerCTA;
                        }
                    }
                }
                else {
                    // Kernel uses no registers: this resource imposes no limit.
                    maxBlocks = int.MaxValue;
                }
            }

            result.AllocatedRegistersPerBlock = regsAllocatedPerCTA;

            return maxBlocks;
        }
Пример #10
0
        /// <summary>
        /// Searches the candidate block sizes and reports the one yielding the
        /// highest per-SM occupancy (measured in threads), along with the
        /// minimum grid size needed to run that many blocks on every SM.
        /// </summary>
        /// <param name="minGridSize">Receives the suggested minimum grid size for a full-machine launch.</param>
        /// <param name="blockSize">Receives the block size with the best occupancy found.</param>
        /// <param name="properties">Device properties.</param>
        /// <param name="attributes">Kernel function attributes.</param>
        /// <param name="state">Device state (cache configuration).</param>
        /// <param name="blockSizeToDynamicSMemSize">Optional mapping from block size to dynamic shared memory; when non-null it overrides <paramref name="dynamicSMemSize"/>.</param>
        /// <param name="dynamicSMemSize">Fixed dynamic shared memory per block, used when no mapping is supplied.</param>
        public static void cudaOccMaxPotentialOccupancyBlockSize(
			ref int                         minGridSize,
			ref int                         blockSize,
			cudaOccDeviceProp     properties,
			cudaOccFuncAttributes attributes,
			cudaOccDeviceState    state,
			del_blockSizeToDynamicSMemSize blockSizeToDynamicSMemSize,
			SizeT                       dynamicSMemSize)
        {
            ///////////////////////////
            // Check user input
            ///////////////////////////

            cudaOccInputCheck(properties, attributes, state);

            cudaOccResult occ = new cudaOccResult();

            // Candidates run from the warp-aligned upper limit down to one
            // warp, stepping a warp at a time.
            int warpSize         = properties.warpSize;
            int occupancyCeiling = properties.maxThreadsPerMultiProcessor;
            int sizeLimit        = __occMin(properties.maxThreadsPerBlock, attributes.maxThreadsPerBlock);

            // Best configuration seen so far.
            int bestBlockSize = 0;
            int bestBlocks    = 0;
            int bestThreads   = 0;

            for (int aligned = __occRoundUp(sizeLimit, warpSize); aligned > 0; aligned -= warpSize)
            {
                // The largest aligned candidate may exceed the true limit;
                // clamp it back down.
                int candidate = __occMin(sizeLimit, aligned);

                // A user-supplied mapping takes precedence over the fixed
                // dynamic shared memory size.
                if (blockSizeToDynamicSMemSize != null)
                {
                    dynamicSMemSize = blockSizeToDynamicSMemSize(candidate);
                }

                cudaOccMaxActiveBlocksPerMultiprocessor(
                    occ,
                    properties,
                    attributes,
                    state,
                    candidate,
                    dynamicSMemSize);

                int blocksPerSM  = occ.ActiveBlocksPerMultiProcessor;
                int threadsPerSM = candidate * blocksPerSM;

                if (threadsPerSM > bestThreads)
                {
                    bestBlockSize = candidate;
                    bestBlocks    = blocksPerSM;
                    bestThreads   = threadsPerSM;
                }

                // Nothing can beat full occupancy; stop searching early.
                if (bestThreads == occupancyCeiling)
                {
                    break;
                }
            }

            ///////////////////////////
            // Return best available
            ///////////////////////////

            // Running bestBlocks blocks on each SM fills the whole device.
            minGridSize = bestBlocks * properties.numSms;
            blockSize   = bestBlockSize;
        }
Пример #11
0
        ///////////////////////////////////
        //      API Implementations      //
        ///////////////////////////////////
        /// <summary>
        /// Determine the maximum number of CTAs that can be run simultaneously per SM.<para/>
        /// This is equivalent to the calculation done in the CUDA Occupancy Calculator
        /// spreadsheet
        /// </summary>
        /// <param name="result">Receives all per-resource limits and the final occupancy.</param>
        /// <param name="properties">Device properties.</param>
        /// <param name="attributes">Kernel function attributes.</param>
        /// <param name="state">Device state (cache configuration).</param>
        /// <param name="blockSize">Threads per block.</param>
        /// <param name="dynamicSmemSize">Dynamic shared memory per block in bytes.</param>
        /// <returns></returns>
        public static void cudaOccMaxActiveBlocksPerMultiprocessor(
			cudaOccResult               result,
			cudaOccDeviceProp     properties,
			cudaOccFuncAttributes attributes,
			cudaOccDeviceState    state,
			int                   blockSize,
			SizeT                 dynamicSmemSize)
        {
            ///////////////////////////
            // Check user input
            ///////////////////////////

            cudaOccInputCheck(properties, attributes, state);

            ///////////////////////////
            // Initialization
            ///////////////////////////

            // Start from the expected partitioned global caching mode; the
            // register-limit computation below may turn it off.
            cudaOccPartitionedGCConfig gcConfig =
                cudaOccPartitionedGCExpected(properties, attributes);

            ///////////////////////////
            // Compute occupancy
            ///////////////////////////

            // Per-resource CTA ceilings. The register limit MUST run first:
            // it may rewrite gcConfig, which the warp limit then reads.
            int limitRegs = cudaOccMaxBlocksPerSMRegsLimit(ref gcConfig, result, properties, attributes, blockSize);

            // Limits due to warps/SM
            //
            int limitWarps = cudaOccMaxBlocksPerSMWarpsLimit(gcConfig, properties, attributes, blockSize);

            // Limits due to blocks/SM
            //
            int limitBlocks = cudaOccMaxBlocksPerMultiprocessor(properties);

            // Limits due to shared memory/SM
            //
            int limitSMem = cudaOccMaxBlocksPerSMSmemLimit(result, properties, attributes, state, blockSize, dynamicSmemSize);

            ///////////////////////////
            // Overall occupancy
            ///////////////////////////

            // The achievable CTA count is the tightest of the four ceilings.
            int ctaLimit = __occMin(limitRegs, __occMin(limitSMem, __occMin(limitWarps, limitBlocks)));

            // Record every resource whose ceiling coincides with the final
            // limit (there can be more than one).
            cudaOccLimitingFactors factors = 0;
            if (ctaLimit == limitWarps)
            {
                factors |= cudaOccLimitingFactors.Warps;
            }
            if (ctaLimit == limitRegs)
            {
                factors |= cudaOccLimitingFactors.Registers;
            }
            if (ctaLimit == limitSMem)
            {
                factors |= cudaOccLimitingFactors.SharedMemory;
            }
            if (ctaLimit == limitBlocks)
            {
                factors |= cudaOccLimitingFactors.Blocks;
            }

            // Fill in the return values.
            result.LimitingFactors     = factors;
            result.BlockLimitRegs      = limitRegs;
            result.BlockLimitSharedMem = limitSMem;
            result.BlockLimitWarps     = limitWarps;
            result.BlockLimitBlocks    = limitBlocks;
            result.partitionedGCConfig = gcConfig;

            // Final occupancy
            result.ActiveBlocksPerMultiProcessor = ctaLimit;
        }