Example No. 1
        // Shared memory limit
        //
        private static int cudaOccMaxBlocksPerSMSmemLimit(
            cudaOccResult result,
            cudaOccDeviceProp properties,
            cudaOccFuncAttributes attributes,
            cudaOccDeviceState state,
            int blockSize,
            SizeT dynamicSmemSize)
        {
            int   allocationGranularity;
            SizeT userSmemPreference;
            SizeT totalSmemUsagePerCTA;
            SizeT smemAllocatedPerCTA;
            SizeT sharedMemPerMultiprocessor;
            int   maxBlocks;

            allocationGranularity = cudaOccSMemAllocationGranularity(properties);


            // Obtain the user-preferred shared memory size. This setting is ignored if
            // the user requests more shared memory than preferred.
            //
            userSmemPreference = cudaOccSMemPerMultiprocessor(properties, state.cacheConfig);

            totalSmemUsagePerCTA = attributes.sharedSizeBytes + dynamicSmemSize;
            smemAllocatedPerCTA  = __occRoundUp((int)totalSmemUsagePerCTA, (int)allocationGranularity);

            if (smemAllocatedPerCTA > properties.sharedMemPerBlock)
            {
                maxBlocks = 0;
            }
            else
            {
                // The user-requested shared memory limit is used as long as it is at least
                // the total shared memory used per CTA, i.e. as long as at least one CTA
                // can be launched. Otherwise, the maximum shared memory limit is used
                // instead.
                //
                if (userSmemPreference >= smemAllocatedPerCTA)
                {
                    sharedMemPerMultiprocessor = userSmemPreference;
                }
                else
                {
                    sharedMemPerMultiprocessor = properties.sharedMemPerMultiprocessor;
                }

                if (smemAllocatedPerCTA > 0)
                {
                    maxBlocks = (int)(sharedMemPerMultiprocessor / smemAllocatedPerCTA);
                }
                else
                {
                    maxBlocks = int.MaxValue;
                }
            }

            result.AllocatedSharedMemPerBlock = smemAllocatedPerCTA;

            return(maxBlocks);
        }
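
The snippet above relies on helper functions that are not shown here (__occRoundUp, __occMin; other examples use round_i, divide_ri and min_ for the same purpose). As an assumption, mirroring the arithmetic implied by how they are called, minimal sketches of such helpers could look like this:

        // Hypothetical helper sketches (the project defines its own versions);
        // shown only to make the rounding and min operations explicit.
        private static int __occRoundUp(int x, int granularity)
        {
            // Round x up to the next multiple of granularity (granularity > 0).
            return ((x + granularity - 1) / granularity) * granularity;
        }

        private static int __occMin(int x, int y)
        {
            return x < y ? x : y;
        }
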
Example No. 2
        /// <summary>
        /// Determine the block size that allows the maximum number of CTAs to run on a multiprocessor simultaneously
        /// </summary>
        /// <param name="properties"></param>
        /// <param name="attributes"></param>
        /// <param name="state"></param>
        /// <param name="blockSizeToSMem">
        /// A function to convert from block size to dynamic shared memory size.<para/>
        /// e.g.:
        /// If no dynamic shared memory is used: x => 0<para/>
        /// If 4 bytes of shared memory per thread are used: x => 4 * x</param>
        /// <returns>maxBlockSize</returns>
        public static int cudaOccMaxPotentialOccupancyBlockSize(
            cudaOccDeviceProp properties,
            cudaOccFuncAttributes attributes,
            cudaOccDeviceState state,
            del_blockSizeToDynamicSMemSize blockSizeToSMem)
        {
            int maxOccupancy     = properties.maxThreadsPerMultiProcessor;
            int largestBlockSize = min_(properties.maxThreadsPerBlock, attributes.maxThreadsPerBlock);
            int granularity      = properties.warpSize;
            int maxBlockSize     = 0;
            int blockSize        = 0;
            int highestOccupancy = 0;

            for (blockSize = largestBlockSize; blockSize > 0; blockSize -= granularity)
            {
                cudaOccResult res       = cudaOccMaxActiveBlocksPerMultiprocessor(properties, attributes, blockSize, blockSizeToSMem(blockSize), state);
                int           occupancy = res.ActiveBlocksPerMultiProcessor;
                occupancy = blockSize * occupancy;

                if (occupancy > highestOccupancy)
                {
                    maxBlockSize     = blockSize;
                    highestOccupancy = occupancy;
                }

                // can not get higher occupancy
                if (highestOccupancy == maxOccupancy)
                {
                    break;
                }
            }

            return(maxBlockSize);
        }
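
A hedged usage sketch of the overload above: the delegate maps a candidate block size to its dynamic shared memory requirement. The variables deviceProperties and kernel are illustrative placeholders for an existing CudaDeviceProperties and a loaded CudaKernel, and the default construction of cudaOccDeviceState is an assumption.

        cudaOccDeviceProp     props      = new cudaOccDeviceProp(deviceProperties);
        cudaOccFuncAttributes attributes = new cudaOccFuncAttributes(kernel);
        cudaOccDeviceState    state      = new cudaOccDeviceState();   // assumed default-constructible

        // 4 bytes of dynamic shared memory per thread, as in the doc comment above;
        // qualify the call with the containing class name when calling from outside it.
        int bestBlockSize = cudaOccMaxPotentialOccupancyBlockSize(
            props, attributes, state, x => 4 * x);
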
Example No. 3
        private static cudaOccError cudaOccDeviceStateCheck(cudaOccDeviceState state)
        {
            // Placeholder
            //

            return(cudaOccError.None);
        }
Example No. 4
        private static void cudaOccInputCheck(
            cudaOccDeviceProp properties,
            cudaOccFuncAttributes attributes,
            cudaOccDeviceState state)
        {
            cudaOccError status = cudaOccError.None;

            status = cudaOccDevicePropCheck(properties);
            if (status != cudaOccError.None)
            {
                throw new CudaOccupancyException(status);
            }

            status = cudaOccFuncAttributesCheck(attributes);
            if (status != cudaOccError.None)
            {
                throw new CudaOccupancyException(status);
            }

            status = cudaOccDeviceStateCheck(state);
            if (status != cudaOccError.None)
            {
                throw new CudaOccupancyException(status);
            }
        }
Example No. 5
        ///////////////////////////////////////////////
        //    Occupancy calculation Functions        //
        ///////////////////////////////////////////////

        /// <summary>
        /// Determine the maximum number of CTAs that can be run simultaneously per SM.<para/>
        /// This is equivalent to the calculation done in the CUDA Occupancy Calculator
        /// spreadsheet
        /// </summary>
        /// <param name="properties"></param>
        /// <param name="kernel"></param>
        /// <param name="state"></param>
        /// <returns></returns>
        public static cudaOccResult cudaOccMaxActiveBlocksPerMultiprocessor(
            CudaDeviceProperties properties,
            CudaKernel kernel,
            cudaOccDeviceState state)
        {
            cudaOccDeviceProp     props      = new cudaOccDeviceProp(properties);
            cudaOccFuncAttributes attributes = new cudaOccFuncAttributes(kernel);

            return(cudaOccMaxActiveBlocksPerMultiprocessor(props, attributes, (int)kernel.BlockDimensions.x * (int)kernel.BlockDimensions.y * (int)kernel.BlockDimensions.z, kernel.DynamicSharedMemory, state));
        }
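
A hedged usage sketch of the wrapper above, assuming deviceProperties and kernel already exist and that cudaOccDeviceState is default-constructible; the result property names are taken from the snippets in this section.

        cudaOccDeviceState state  = new cudaOccDeviceState();   // assumed default-constructible
        cudaOccResult      result = cudaOccMaxActiveBlocksPerMultiprocessor(deviceProperties, kernel, state);

        Console.WriteLine("Active blocks per SM: " + result.ActiveBlocksPerMultiProcessor);
        Console.WriteLine("Occupancy per SM (%): " + result.OccupancyOfEachMultiProcessor);
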
Example No. 6
 /// <summary>
 /// Determine the block size that achieves maximum occupancy and the minimum grid size
 /// needed to fully load the device, for a fixed dynamic shared memory size per block.
 /// </summary>
 /// <param name="minGridSize"></param>
 /// <param name="blockSize"></param>
 /// <param name="properties"></param>
 /// <param name="attributes"></param>
 /// <param name="state"></param>
 /// <param name="dynamicSMemSize"></param>
 public static void cudaOccMaxPotentialOccupancyBlockSize(
     ref int minGridSize,
     ref int blockSize,
     cudaOccDeviceProp properties,
     cudaOccFuncAttributes attributes,
     cudaOccDeviceState state,
     SizeT dynamicSMemSize)
 {
     cudaOccMaxPotentialOccupancyBlockSize(ref minGridSize, ref blockSize, properties, attributes, state, null, dynamicSMemSize);
 }
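
A minimal hedged sketch of calling this convenience overload with a fixed dynamic shared memory size; props, attributes and state are illustrative placeholders built as in the other examples, and the int literal is assumed to convert implicitly to SizeT (as the snippets themselves rely on).

     int minGridSize = 0;
     int blockSize   = 0;
     // 1024 bytes of dynamic shared memory per block, independent of block size.
     cudaOccMaxPotentialOccupancyBlockSize(ref minGridSize, ref blockSize, props, attributes, state, 1024);
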
Example No. 7
        /// <summary>
        /// Determine the block size that allows the maximum number of CTAs to run on a multiprocessor simultaneously
        /// </summary>
        /// <param name="properties"></param>
        /// <param name="kernel"></param>
        /// <param name="state"></param>
        /// <param name="blockSizeToSMem">
        /// A function to convert from block size to dynamic shared memory size.<para/>
        /// e.g.:
        /// If no dynamic shared memory is used: x => 0<para/>
        /// If 4 bytes of shared memory per thread are used: x => 4 * x</param>
        /// <returns>maxBlockSize</returns>
        public static int cudaOccMaxPotentialOccupancyBlockSize(
            CudaDeviceProperties properties,
            CudaKernel kernel,
            cudaOccDeviceState state,
            del_blockSizeToDynamicSMemSize blockSizeToSMem)
        {
            cudaOccDeviceProp     props      = new cudaOccDeviceProp(properties);
            cudaOccFuncAttributes attributes = new cudaOccFuncAttributes(kernel);

            return(cudaOccMaxPotentialOccupancyBlockSize(props, attributes, state, blockSizeToSMem));
        }
Example No. 8
        /// <summary>
        /// Determine the maximum number of CTAs that can be run simultaneously per SM.<para/>
        /// This is equivalent to the calculation done in the CUDA Occupancy Calculator
        /// spreadsheet
        /// </summary>
        /// <param name="properties"></param>
        /// <param name="attributes"></param>
        /// <param name="blockSize"></param>
        /// <param name="dynamic_smem_bytes"></param>
        /// <param name="state"></param>
        /// <returns></returns>
        public static cudaOccResult cudaOccMaxActiveBlocksPerMultiprocessor(
            cudaOccDeviceProp properties,
            cudaOccFuncAttributes attributes,
            int blockSize,
            SizeT dynamic_smem_bytes,
            cudaOccDeviceState state)
        {
            int regAllocationUnit = 0, warpAllocationMultiple = 0, maxBlocksPerSM = 0;
            int ctaLimitWarps = 0, ctaLimitBlocks = 0, smemPerCTA = 0, smemBytes = 0, smemAllocationUnit = 0;
            int cacheConfigSMem = 0, sharedMemPerMultiprocessor = 0, ctaLimitRegs = 0, regsPerCTA = 0;
            int regsPerWarp = 0, numSides = 0, numRegsPerSide = 0, ctaLimit = 0;
            int maxWarpsPerSm = 0, warpsPerCTA = 0, ctaLimitSMem = 0;
            cudaOccLimitingFactors limitingFactors = 0;
            cudaOccResult          result          = new cudaOccResult();

            if (properties == null || attributes == null || blockSize <= 0)
            {
                throw new CudaOccupancyException(cudaOccError.ErrorInvalidInput);
            }

            //////////////////////////////////////////
            // Limits due to warps/SM or blocks/SM
            //////////////////////////////////////////
            CudaOccupancyException.CheckZero(properties.warpSize);
            maxWarpsPerSm          = properties.maxThreadsPerMultiProcessor / properties.warpSize;
            warpAllocationMultiple = cudaOccWarpAllocationMultiple(properties);

            CudaOccupancyException.CheckZero(warpAllocationMultiple);
            warpsPerCTA = round_i(divide_ri(blockSize, properties.warpSize), warpAllocationMultiple);

            maxBlocksPerSM = cudaOccMaxBlocksPerMultiprocessor(properties);

            // Calc limits
            CudaOccupancyException.CheckZero(warpsPerCTA);
            ctaLimitWarps  = (blockSize <= properties.maxThreadsPerBlock) ? maxWarpsPerSm / warpsPerCTA : 0;
            ctaLimitBlocks = maxBlocksPerSM;

            //////////////////////////////////////////
            // Limits due to shared memory/SM
            //////////////////////////////////////////
            smemAllocationUnit = cudaOccSMemAllocationUnit(properties);
            smemBytes          = (int)(attributes.sharedSizeBytes + dynamic_smem_bytes);
            CudaOccupancyException.CheckZero(smemAllocationUnit);
            smemPerCTA = round_i(smemBytes, smemAllocationUnit);

            // Calc limit
            cacheConfigSMem = cudaOccSMemPerMultiprocessor(properties, state.cacheConfig);

            // sharedMemPerMultiprocessor defaults to the limit set in hardware, but the user-requested
            // shared memory limit is used instead if it is at least the total shared memory used by the function.
            sharedMemPerMultiprocessor = (cacheConfigSMem >= smemPerCTA)
                                ? cacheConfigSMem
                                : (int)properties.sharedMemPerMultiprocessor;
            // Limit on blocks launched should be calculated with shared memory per SM but total shared memory
            // used by function should be limited by shared memory per block
            ctaLimitSMem = 0;
            if (properties.sharedMemPerBlock >= (SizeT)smemPerCTA)
            {
                ctaLimitSMem = smemPerCTA > 0 ? sharedMemPerMultiprocessor / smemPerCTA : maxBlocksPerSM;
            }

            //////////////////////////////////////////
            // Limits due to registers/SM
            //////////////////////////////////////////
            regAllocationUnit = cudaOccRegAllocationUnit(properties, attributes.numRegs);
            CudaOccupancyException.CheckZero(regAllocationUnit);

            // Calc limit
            ctaLimitRegs = 0;
            if (properties.major <= 1)
            {
                // GPUs of compute capability 1.x allocate registers to CTAs
                // Number of regs per block is regs per thread times number of warps times warp size, rounded up to allocation unit
                regsPerCTA   = round_i(attributes.numRegs * properties.warpSize * warpsPerCTA, regAllocationUnit);
                ctaLimitRegs = regsPerCTA > 0 ? properties.regsPerMultiprocessor / regsPerCTA : maxBlocksPerSM;
            }
            else
            {
                // GPUs of compute capability 2.x and higher allocate registers to warps
                // Number of regs per warp is regs per thread times warp size, rounded up to allocation unit
                regsPerWarp = round_i(attributes.numRegs * properties.warpSize, regAllocationUnit);
                regsPerCTA  = regsPerWarp * warpsPerCTA;
                if (properties.regsPerBlock >= regsPerCTA)
                {
                    numSides = cudaOccSidesPerMultiprocessor(properties);
                    CudaOccupancyException.CheckZero(numSides);
                    numRegsPerSide = properties.regsPerMultiprocessor / numSides;
                    ctaLimitRegs   = regsPerWarp > 0 ? ((numRegsPerSide / regsPerWarp) * numSides) / warpsPerCTA : maxBlocksPerSM;
                }
            }

            //////////////////////////////////////////
            // Overall limit is min() of limits due to above reasons
            //////////////////////////////////////////
            ctaLimit = min_(ctaLimitRegs, min_(ctaLimitSMem, min_(ctaLimitWarps, ctaLimitBlocks)));
            // Determine occupancy limiting factors


            result.ActiveBlocksPerMultiProcessor = ctaLimit;

            if (ctaLimit == ctaLimitWarps)
            {
                limitingFactors |= cudaOccLimitingFactors.Warps;
            }
            if (ctaLimit == ctaLimitRegs && regsPerCTA > 0)
            {
                limitingFactors |= cudaOccLimitingFactors.Registers;
            }
            if (ctaLimit == ctaLimitSMem && smemPerCTA > 0)
            {
                limitingFactors |= cudaOccLimitingFactors.SharedMemory;
            }
            if (ctaLimit == ctaLimitBlocks)
            {
                limitingFactors |= cudaOccLimitingFactors.Blocks;
            }
            result.LimitingFactors = limitingFactors;

            result.BlockLimitRegs      = ctaLimitRegs;
            result.BlockLimitSharedMem = ctaLimitSMem;
            result.BlockLimitWarps     = ctaLimitWarps;
            result.BlockLimitBlocks    = ctaLimitBlocks;

            result.BllocatedRegistersPerBlock = regsPerCTA;
            result.AllocatedSharedMemPerBlock = smemPerCTA;

            result.ActiveWarpsPerMultiProcessor   = ctaLimit * ((int)Math.Ceiling(blockSize / (double)properties.warpSize));
            result.ActiceThreadsPerMultiProcessor = result.ActiveWarpsPerMultiProcessor * properties.warpSize;
            result.OccupancyOfEachMultiProcessor  = (int)Math.Round(result.ActiveWarpsPerMultiProcessor / (double)maxWarpsPerSm * 100);
            return(result);
        }
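
To make the min-of-limits step concrete, here is a hedged worked example with purely illustrative numbers (not taken from any particular GPU); it reproduces the arithmetic of the method above using Math.Min in place of the snippet's min_ helper.

            int maxWarpsPerSm  = 2048 / 32;                   // 64 warps per SM (2048 threads, warp size 32)
            int warpsPerCTA    = 256 / 32;                    // 8 warps for a 256-thread block
            int ctaLimitWarps  = maxWarpsPerSm / warpsPerCTA; // 8
            int ctaLimitBlocks = 16;                          // assumed hardware blocks-per-SM limit
            int ctaLimitSMem   = (48 * 1024) / (12 * 1024);   // 4, for 12 KiB of shared memory per CTA
            int ctaLimitRegs   = 8;                           // assumed register limit

            int ctaLimit = Math.Min(ctaLimitRegs,
                           Math.Min(ctaLimitSMem,
                           Math.Min(ctaLimitWarps, ctaLimitBlocks)));                     // 4 -> shared memory limits occupancy

            int activeWarps = ctaLimit * warpsPerCTA;                                     // 32
            int occupancy   = (int)Math.Round(activeWarps / (double)maxWarpsPerSm * 100); // 50 (%)
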
Example No. 9
		/// <summary>
		/// Determine the block size that allows the maximum number of CTAs to run on a multiprocessor simultaneously
		/// </summary>
		/// <param name="properties"></param>
		/// <param name="attributes"></param>
		/// <param name="state"></param>
		/// <param name="blockSizeToSMem">
		/// A function to convert from block size to dynamic shared memory size.<para/>
		/// e.g.:
		/// If no dynamic shared memory is used: x => 0<para/>
		/// If 4 bytes of shared memory per thread are used: x => 4 * x</param>
		/// <returns>maxBlockSize</returns>
		public static int cudaOccMaxPotentialOccupancyBlockSize(
		    cudaOccDeviceProp properties,
		    cudaOccFuncAttributes attributes,
		    cudaOccDeviceState state,
		    del_blockSizeToDynamicSMemSize blockSizeToSMem)
		{
		    int maxOccupancy       = properties.maxThreadsPerMultiProcessor;
		    int largestBlockSize   = min_(properties.maxThreadsPerBlock, attributes.maxThreadsPerBlock);
		    int granularity        = properties.warpSize;
		    int maxBlockSize  = 0;
		    int blockSize     = 0;
		    int highestOccupancy   = 0;

		    for(blockSize = largestBlockSize; blockSize > 0; blockSize -= granularity)
		    {
				cudaOccResult res = cudaOccMaxActiveBlocksPerMultiprocessor(properties, attributes, blockSize, blockSizeToSMem(blockSize), state);
				int occupancy = res.ActiveBlocksPerMultiProcessor;
		        occupancy = blockSize*occupancy;

		        if(occupancy > highestOccupancy)
		        {
		            maxBlockSize = blockSize;
		            highestOccupancy = occupancy;
		        }

		        // can not get higher occupancy
		        if(highestOccupancy == maxOccupancy)
		            break;
		    }

		    return maxBlockSize;
		}
Example No. 10
		/// <summary>
		/// Determine the block size that allows the maximum number of CTAs to run on a multiprocessor simultaneously
		/// </summary>
		/// <param name="properties"></param>
		/// <param name="kernel"></param>
		/// <param name="state"></param>
		/// <param name="blockSizeToSMem">
		/// A function to convert from block size to dynamic shared memory size.<para/>
		/// e.g.:
		/// If no dynamic shared memory is used: x => 0<para/>
		/// If 4 bytes of shared memory per thread are used: x => 4 * x</param>
		/// <returns>maxBlockSize</returns>
		public static int cudaOccMaxPotentialOccupancyBlockSize(
			CudaDeviceProperties properties,
			CudaKernel kernel,
			cudaOccDeviceState state,
			del_blockSizeToDynamicSMemSize blockSizeToSMem)
		{
			cudaOccDeviceProp props = new cudaOccDeviceProp(properties);
			cudaOccFuncAttributes attributes = new cudaOccFuncAttributes(kernel);
			return cudaOccMaxPotentialOccupancyBlockSize(props, attributes, state, blockSizeToSMem);
		}
Example No. 11
		/// <summary>
		/// Determine the maximum number of CTAs that can be run simultaneously per SM.<para/>
		/// This is equivalent to the calculation done in the CUDA Occupancy Calculator
		/// spreadsheet
		/// </summary>
		/// <param name="properties"></param>
		/// <param name="attributes"></param>
		/// <param name="blockSize"></param>
		/// <param name="dynamic_smem_bytes"></param>
		/// <param name="state"></param>
		/// <returns></returns>
		public static cudaOccResult cudaOccMaxActiveBlocksPerMultiprocessor(
			cudaOccDeviceProp properties,
			cudaOccFuncAttributes attributes,
			int blockSize,
			SizeT dynamic_smem_bytes,
			cudaOccDeviceState state)
		{
			int regAllocationUnit = 0, warpAllocationMultiple = 0, maxBlocksPerSM=0;
			int ctaLimitWarps = 0, ctaLimitBlocks = 0, smemPerCTA = 0, smemBytes = 0, smemAllocationUnit = 0;
			int cacheConfigSMem = 0, sharedMemPerMultiprocessor = 0, ctaLimitRegs = 0, regsPerCTA=0;
			int regsPerWarp = 0, numSides = 0, numRegsPerSide = 0, ctaLimit=0;
			int maxWarpsPerSm = 0, warpsPerCTA = 0, ctaLimitSMem=0;
			cudaOccLimitingFactors limitingFactors = 0;
			cudaOccResult result = new cudaOccResult();

			if(properties == null || attributes == null || blockSize <= 0)
			{
				throw new CudaOccupancyException(cudaOccError.ErrorInvalidInput);
			}

			//////////////////////////////////////////
			// Limits due to warps/SM or blocks/SM
			//////////////////////////////////////////
			CudaOccupancyException.CheckZero(properties.warpSize);
			maxWarpsPerSm   = properties.maxThreadsPerMultiProcessor / properties.warpSize;
			warpAllocationMultiple = cudaOccWarpAllocationMultiple(properties);

			CudaOccupancyException.CheckZero(warpAllocationMultiple);
			warpsPerCTA = round_i(divide_ri(blockSize, properties.warpSize), warpAllocationMultiple);

			maxBlocksPerSM  = cudaOccMaxBlocksPerMultiprocessor(properties);

			// Calc limits
			CudaOccupancyException.CheckZero(warpsPerCTA);
			ctaLimitWarps  = (blockSize <= properties.maxThreadsPerBlock) ? maxWarpsPerSm / warpsPerCTA : 0;
			ctaLimitBlocks = maxBlocksPerSM;

			//////////////////////////////////////////
			// Limits due to shared memory/SM
			//////////////////////////////////////////
			smemAllocationUnit     = cudaOccSMemAllocationUnit(properties);
			smemBytes  = (int)(attributes.sharedSizeBytes + dynamic_smem_bytes);
			CudaOccupancyException.CheckZero(smemAllocationUnit);
			smemPerCTA = round_i(smemBytes, smemAllocationUnit);

			// Calc limit
			cacheConfigSMem = cudaOccSMemPerMultiprocessor(properties,state.cacheConfig);

			// sharedMemPerMultiprocessor defaults to the limit set in hardware, but the user-requested
			// shared memory limit is used instead if it is at least the total shared memory used by the function.
			sharedMemPerMultiprocessor = (cacheConfigSMem >= smemPerCTA)
				? cacheConfigSMem
				: (int)properties.sharedMemPerMultiprocessor;
			// Limit on blocks launched should be calculated with shared memory per SM but total shared memory
			// used by function should be limited by shared memory per block
			ctaLimitSMem = 0;
			if(properties.sharedMemPerBlock >= (SizeT)smemPerCTA)
			{
				ctaLimitSMem = smemPerCTA > 0 ? sharedMemPerMultiprocessor / smemPerCTA : maxBlocksPerSM;
			}

			//////////////////////////////////////////
			// Limits due to registers/SM
			//////////////////////////////////////////
			regAllocationUnit      = cudaOccRegAllocationUnit(properties, attributes.numRegs);
			CudaOccupancyException.CheckZero(regAllocationUnit);

			// Calc limit
			ctaLimitRegs = 0;
			if(properties.major <= 1)
			{
				// GPUs of compute capability 1.x allocate registers to CTAs
				// Number of regs per block is regs per thread times number of warps times warp size, rounded up to allocation unit
				regsPerCTA = round_i(attributes.numRegs * properties.warpSize * warpsPerCTA, regAllocationUnit);
				ctaLimitRegs = regsPerCTA > 0 ? properties.regsPerMultiprocessor / regsPerCTA : maxBlocksPerSM;
			}
			else
			{
				// GPUs of compute capability 2.x and higher allocate registers to warps
				// Number of regs per warp is regs per thread times warp size, rounded up to allocation unit
				regsPerWarp = round_i(attributes.numRegs * properties.warpSize, regAllocationUnit);
				regsPerCTA = regsPerWarp * warpsPerCTA;
				if(properties.regsPerBlock >= regsPerCTA)
				{
					numSides = cudaOccSidesPerMultiprocessor(properties);
					CudaOccupancyException.CheckZero(numSides);
					numRegsPerSide = properties.regsPerMultiprocessor / numSides;
					ctaLimitRegs = regsPerWarp > 0 ? ((numRegsPerSide / regsPerWarp) * numSides) / warpsPerCTA : maxBlocksPerSM;
				}
			}

			//////////////////////////////////////////
			// Overall limit is min() of limits due to above reasons
			//////////////////////////////////////////
			ctaLimit = min_(ctaLimitRegs, min_(ctaLimitSMem, min_(ctaLimitWarps, ctaLimitBlocks)));
			// Determine occupancy limiting factors
			
			
			result.ActiveBlocksPerMultiProcessor = ctaLimit;

			if(ctaLimit==ctaLimitWarps)
			{
				limitingFactors |= cudaOccLimitingFactors.Warps;
			}
			if(ctaLimit==ctaLimitRegs && regsPerCTA > 0)
			{
				limitingFactors |= cudaOccLimitingFactors.Registers;
			}
			if(ctaLimit==ctaLimitSMem && smemPerCTA > 0)
			{
				limitingFactors |= cudaOccLimitingFactors.SharedMemory;
			}
			if(ctaLimit==ctaLimitBlocks)
			{
				limitingFactors |= cudaOccLimitingFactors.Blocks;
			}
			result.LimitingFactors = limitingFactors;

			result.BlockLimitRegs = ctaLimitRegs;
			result.BlockLimitSharedMem = ctaLimitSMem;
			result.BlockLimitWarps = ctaLimitWarps;
			result.BlockLimitBlocks = ctaLimitBlocks;

			result.BllocatedRegistersPerBlock = regsPerCTA;
			result.AllocatedSharedMemPerBlock = smemPerCTA;

			result.ActiveWarpsPerMultiProcessor = ctaLimit * ((int)Math.Ceiling(blockSize / (double)properties.warpSize));
			result.ActiceThreadsPerMultiProcessor = result.ActiveWarpsPerMultiProcessor * properties.warpSize;
			result.OccupancyOfEachMultiProcessor = (int)Math.Round(result.ActiveWarpsPerMultiProcessor / (double)maxWarpsPerSm * 100);
			return result;
		}
Example No. 12
		///////////////////////////////////////////////
		//    Occupancy calculation Functions        //
		///////////////////////////////////////////////

		/// <summary>
		/// Determine the maximum number of CTAs that can be run simultaneously per SM.<para/>
		/// This is equivalent to the calculation done in the CUDA Occupancy Calculator
		/// spreadsheet
		/// </summary>
		/// <param name="properties"></param>
		/// <param name="kernel"></param>
		/// <param name="state"></param>
		/// <returns></returns>
		public static cudaOccResult cudaOccMaxActiveBlocksPerMultiprocessor(
			CudaDeviceProperties properties,
			CudaKernel kernel,
			cudaOccDeviceState state)
		{
			cudaOccDeviceProp props = new cudaOccDeviceProp(properties);
			cudaOccFuncAttributes attributes = new cudaOccFuncAttributes(kernel);

			return cudaOccMaxActiveBlocksPerMultiprocessor(props, attributes, (int)kernel.BlockDimensions.x * (int)kernel.BlockDimensions.y * (int)kernel.BlockDimensions.z, kernel.DynamicSharedMemory, state);			
		}
Example No. 13
        /// <summary>
        /// Determine the block size that achieves maximum occupancy and the minimum grid size
        /// needed to fully load the device. If blockSizeToDynamicSMemSize is provided, it
        /// overrides dynamicSMemSize for each candidate block size.
        /// </summary>
        /// <param name="minGridSize"></param>
        /// <param name="blockSize"></param>
        /// <param name="properties"></param>
        /// <param name="attributes"></param>
        /// <param name="state"></param>
        /// <param name="blockSizeToDynamicSMemSize"></param>
        /// <param name="dynamicSMemSize"></param>
        public static void cudaOccMaxPotentialOccupancyBlockSize(
            ref int minGridSize,
            ref int blockSize,
            cudaOccDeviceProp properties,
            cudaOccFuncAttributes attributes,
            cudaOccDeviceState state,
            del_blockSizeToDynamicSMemSize blockSizeToDynamicSMemSize,
            SizeT dynamicSMemSize)
        {
            cudaOccResult result = new cudaOccResult();

            // Limits
            int occupancyLimit;
            int granularity;
            int blockSizeLimit;

            // Recorded maximum
            int maxBlockSize = 0;
            int numBlocks    = 0;
            int maxOccupancy = 0;

            // Temporary
            int blockSizeToTryAligned;
            int blockSizeToTry;
            int blockSizeLimitAligned;
            int occupancyInBlocks;
            int occupancyInThreads;

            ///////////////////////////
            // Check user input
            ///////////////////////////

            //if (!minGridSize || !blockSize || !properties || !attributes || !state) {
            //	return CUDA_OCC_ERROR_INVALID_INPUT;
            //}

            cudaOccInputCheck(properties, attributes, state);

            /////////////////////////////////////////////////////////////////////////////////
            // Try each block size, and pick the block size with maximum occupancy
            /////////////////////////////////////////////////////////////////////////////////

            occupancyLimit = properties.maxThreadsPerMultiProcessor;
            granularity    = properties.warpSize;

            blockSizeLimit        = __occMin(properties.maxThreadsPerBlock, attributes.maxThreadsPerBlock);
            blockSizeLimitAligned = __occRoundUp(blockSizeLimit, granularity);

            for (blockSizeToTryAligned = blockSizeLimitAligned; blockSizeToTryAligned > 0; blockSizeToTryAligned -= granularity)
            {
                blockSizeToTry = __occMin(blockSizeLimit, blockSizeToTryAligned);

                // Ignore dynamicSMemSize if the user provides a mapping
                //
                if (blockSizeToDynamicSMemSize != null)
                {
                    dynamicSMemSize = blockSizeToDynamicSMemSize(blockSizeToTry);
                }

                cudaOccMaxActiveBlocksPerMultiprocessor(
                    result,
                    properties,
                    attributes,
                    state,
                    blockSizeToTry,
                    dynamicSMemSize);

                //if (status != CUDA_OCC_SUCCESS) {
                //	return status;
                //}

                occupancyInBlocks  = result.ActiveBlocksPerMultiProcessor;
                occupancyInThreads = blockSizeToTry * occupancyInBlocks;

                if (occupancyInThreads > maxOccupancy)
                {
                    maxBlockSize = blockSizeToTry;
                    numBlocks    = occupancyInBlocks;
                    maxOccupancy = occupancyInThreads;
                }

                // Early out if we have reached the maximum
                //
                if (occupancyLimit == maxOccupancy)
                {
                    break;
                }
            }

            ///////////////////////////
            // Return best available
            ///////////////////////////

            // Suggested min grid size to achieve a full machine launch
            //
            minGridSize = numBlocks * properties.numSms;
            blockSize   = maxBlockSize;
        }
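
A hedged usage sketch of the full overload above: the mapping delegate overrides the fixed dynamicSMemSize for every candidate block size, and the two ref parameters receive the suggestion. props, attributes and state are illustrative placeholders built as in the other examples.

            int minGridSize = 0;
            int blockSize   = 0;

            cudaOccMaxPotentialOccupancyBlockSize(
                ref minGridSize,
                ref blockSize,
                props,
                attributes,
                state,
                x => 4 * x,   // per-thread dynamic shared memory; takes precedence over the fixed size below
                0);           // fixed dynamicSMemSize, unused when a mapping is supplied

            // minGridSize is numBlocks * numSms, i.e. enough blocks for a full-machine launch;
            // blockSize is the candidate block size that achieved the highest occupancy.
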
Example No. 14
        /// <summary>
        /// Determine the block size that achieves maximum occupancy and the minimum grid size
        /// needed to fully load the device. If blockSizeToDynamicSMemSize is provided, it
        /// overrides dynamicSMemSize for each candidate block size.
        /// </summary>
        /// <param name="minGridSize"></param>
        /// <param name="blockSize"></param>
        /// <param name="properties"></param>
        /// <param name="attributes"></param>
        /// <param name="state"></param>
        /// <param name="blockSizeToDynamicSMemSize"></param>
        /// <param name="dynamicSMemSize"></param>
        public static void cudaOccMaxPotentialOccupancyBlockSize(
			ref int                         minGridSize,
			ref int                         blockSize,
			cudaOccDeviceProp     properties,
			cudaOccFuncAttributes attributes,
			cudaOccDeviceState    state,
			del_blockSizeToDynamicSMemSize blockSizeToDynamicSMemSize,
			SizeT                       dynamicSMemSize)
        {
            cudaOccResult result = new cudaOccResult();

            // Limits
            int occupancyLimit;
            int granularity;
            int blockSizeLimit;

            // Recorded maximum
            int maxBlockSize = 0;
            int numBlocks    = 0;
            int maxOccupancy = 0;

            // Temporary
            int blockSizeToTryAligned;
            int blockSizeToTry;
            int blockSizeLimitAligned;
            int occupancyInBlocks;
            int occupancyInThreads;

            ///////////////////////////
            // Check user input
            ///////////////////////////

            //if (!minGridSize || !blockSize || !properties || !attributes || !state) {
            //	return CUDA_OCC_ERROR_INVALID_INPUT;
            //}

            cudaOccInputCheck(properties, attributes, state);

            /////////////////////////////////////////////////////////////////////////////////
            // Try each block size, and pick the block size with maximum occupancy
            /////////////////////////////////////////////////////////////////////////////////

            occupancyLimit = properties.maxThreadsPerMultiProcessor;
            granularity    = properties.warpSize;

            blockSizeLimit        = __occMin(properties.maxThreadsPerBlock, attributes.maxThreadsPerBlock);
            blockSizeLimitAligned = __occRoundUp(blockSizeLimit, granularity);

            for (blockSizeToTryAligned = blockSizeLimitAligned; blockSizeToTryAligned > 0; blockSizeToTryAligned -= granularity) {
                blockSizeToTry = __occMin(blockSizeLimit, blockSizeToTryAligned);

                // Ignore dynamicSMemSize if the user provides a mapping
                //
                if (blockSizeToDynamicSMemSize != null) {
                    dynamicSMemSize = blockSizeToDynamicSMemSize(blockSizeToTry);
                }

                cudaOccMaxActiveBlocksPerMultiprocessor(
                    result,
                    properties,
                    attributes,
                    state,
                    blockSizeToTry,
                    dynamicSMemSize);

                //if (status != CUDA_OCC_SUCCESS) {
                //	return status;
                //}

                occupancyInBlocks = result.ActiveBlocksPerMultiProcessor;
                occupancyInThreads = blockSizeToTry * occupancyInBlocks;

                if (occupancyInThreads > maxOccupancy) {
                    maxBlockSize = blockSizeToTry;
                    numBlocks    = occupancyInBlocks;
                    maxOccupancy = occupancyInThreads;
                }

                // Early out if we have reached the maximum
                //
                if (occupancyLimit == maxOccupancy) {
                    break;
                }
            }

            ///////////////////////////
            // Return best available
            ///////////////////////////

            // Suggested min grid size to achieve a full machine launch
            //
            minGridSize = numBlocks * properties.numSms;
            blockSize = maxBlockSize;
        }
Example No. 15
        // Shared memory limit
        //
        private static int cudaOccMaxBlocksPerSMSmemLimit(
			cudaOccResult result,
			cudaOccDeviceProp     properties,
			cudaOccFuncAttributes attributes,
			cudaOccDeviceState    state,
			int                   blockSize,
			SizeT                 dynamicSmemSize)
        {
            int allocationGranularity;
            SizeT userSmemPreference;
            SizeT totalSmemUsagePerCTA;
            SizeT smemAllocatedPerCTA;
            SizeT sharedMemPerMultiprocessor;
            int maxBlocks;

            allocationGranularity = cudaOccSMemAllocationGranularity(properties);

            // Obtain the user-preferred shared memory size. This setting is ignored if
            // the user requests more shared memory than preferred.
            //
            userSmemPreference = cudaOccSMemPerMultiprocessor(properties, state.cacheConfig);

            totalSmemUsagePerCTA = attributes.sharedSizeBytes + dynamicSmemSize;
            smemAllocatedPerCTA = __occRoundUp((int)totalSmemUsagePerCTA, (int)allocationGranularity);

            if (smemAllocatedPerCTA > properties.sharedMemPerBlock) {
                maxBlocks = 0;
            }
            else {
                // The user-requested shared memory limit is used as long as it is at least
                // the total shared memory used per CTA, i.e. as long as at least one CTA
                // can be launched. Otherwise, the maximum shared memory limit is used
                // instead.
                //
                if (userSmemPreference >= smemAllocatedPerCTA) {
                    sharedMemPerMultiprocessor = userSmemPreference;
                }
                else{
                    sharedMemPerMultiprocessor = properties.sharedMemPerMultiprocessor;
                }

                if (smemAllocatedPerCTA > 0) {
                    maxBlocks = (int)(sharedMemPerMultiprocessor / smemAllocatedPerCTA);
                }
                else {
                    maxBlocks = int.MaxValue;
                }
            }

            result.AllocatedSharedMemPerBlock = smemAllocatedPerCTA;

            return maxBlocks;
        }
Example No. 16
        private static void cudaOccInputCheck(
			cudaOccDeviceProp     properties,
			cudaOccFuncAttributes attributes,
			cudaOccDeviceState    state)
        {
            cudaOccError status = cudaOccError.None;

            status = cudaOccDevicePropCheck(properties);
            if (status != cudaOccError.None)
            {
                throw new CudaOccupancyException(status);
            }

            status = cudaOccFuncAttributesCheck(attributes);
            if (status != cudaOccError.None)
            {
                throw new CudaOccupancyException(status);
            }

            status = cudaOccDeviceStateCheck(state);
            if (status != cudaOccError.None)
            {
                throw new CudaOccupancyException(status);
            }
        }
Example No. 17
        private static cudaOccError cudaOccDeviceStateCheck(cudaOccDeviceState state)
        {
            // Placeholder
            //

            return cudaOccError.None;
        }
Example No. 18
        /// <summary>
        /// Determine the block size that achieves maximum occupancy and the minimum grid size
        /// needed to fully load the device, for a fixed dynamic shared memory size per block.
        /// </summary>
        /// <param name="minGridSize"></param>
        /// <param name="blockSize"></param>
        /// <param name="properties"></param>
        /// <param name="attributes"></param>
        /// <param name="state"></param>
        /// <param name="dynamicSMemSize"></param>
        public static void cudaOccMaxPotentialOccupancyBlockSize(
			ref int minGridSize,
			ref int blockSize,
			cudaOccDeviceProp properties,
			cudaOccFuncAttributes attributes,
			cudaOccDeviceState state,
			SizeT dynamicSMemSize)
        {
            cudaOccMaxPotentialOccupancyBlockSize(ref minGridSize, ref blockSize, properties, attributes, state, null, dynamicSMemSize);
        }
Example No. 19
        ///////////////////////////////////
        //      API Implementations      //
        ///////////////////////////////////


        /// <summary>
        /// Determine the maximum number of CTAs that can be run simultaneously per SM.<para/>
        /// This is equivalent to the calculation done in the CUDA Occupancy Calculator
        /// spreadsheet
        /// </summary>
        /// <param name="result"></param>
        /// <param name="properties"></param>
        /// <param name="attributes"></param>
        /// <param name="state"></param>
        /// <param name="blockSize"></param>
        /// <param name="dynamicSmemSize"></param>
        /// <returns></returns>
        public static void cudaOccMaxActiveBlocksPerMultiprocessor(
            cudaOccResult result,
            cudaOccDeviceProp properties,
            cudaOccFuncAttributes attributes,
            cudaOccDeviceState state,
            int blockSize,
            SizeT dynamicSmemSize)
        {
            int ctaLimitWarps  = 0;
            int ctaLimitBlocks = 0;
            int ctaLimitSMem   = 0;
            int ctaLimitRegs   = 0;
            int ctaLimit       = 0;
            cudaOccLimitingFactors limitingFactors = 0;

            cudaOccPartitionedGCConfig gcConfig = cudaOccPartitionedGCConfig.Off;

            //if (!result || !properties || !attributes || !state || blockSize <= 0) {
            //	return CUDA_OCC_ERROR_INVALID_INPUT;
            //}

            ///////////////////////////
            // Check user input
            ///////////////////////////

            cudaOccInputCheck(properties, attributes, state);

            ///////////////////////////
            // Initialization
            ///////////////////////////

            gcConfig = cudaOccPartitionedGCExpected(properties, attributes);

            ///////////////////////////
            // Compute occupancy
            ///////////////////////////

            // Limits due to registers/SM
            // Also compute if partitioned global caching has to be turned off
            //
            ctaLimitRegs = cudaOccMaxBlocksPerSMRegsLimit(ref gcConfig, result, properties, attributes, blockSize);


            // Limits due to warps/SM
            //
            ctaLimitWarps = cudaOccMaxBlocksPerSMWarpsLimit(gcConfig, properties, attributes, blockSize);


            // Limits due to blocks/SM
            //
            ctaLimitBlocks = cudaOccMaxBlocksPerMultiprocessor(properties);

            // Limits due to shared memory/SM
            //
            ctaLimitSMem = cudaOccMaxBlocksPerSMSmemLimit(result, properties, attributes, state, blockSize, dynamicSmemSize);


            ///////////////////////////
            // Overall occupancy
            ///////////////////////////

            // Overall limit is min() of limits due to above reasons
            //
            ctaLimit = __occMin(ctaLimitRegs, __occMin(ctaLimitSMem, __occMin(ctaLimitWarps, ctaLimitBlocks)));

            // Fill in the return values
            //
            // Determine occupancy limiting factors
            //
            if (ctaLimit == ctaLimitWarps)
            {
                limitingFactors |= cudaOccLimitingFactors.Warps;
            }
            if (ctaLimit == ctaLimitRegs)
            {
                limitingFactors |= cudaOccLimitingFactors.Registers;
            }
            if (ctaLimit == ctaLimitSMem)
            {
                limitingFactors |= cudaOccLimitingFactors.SharedMemory;
            }
            if (ctaLimit == ctaLimitBlocks)
            {
                limitingFactors |= cudaOccLimitingFactors.Blocks;
            }
            result.LimitingFactors = limitingFactors;

            result.BlockLimitRegs      = ctaLimitRegs;
            result.BlockLimitSharedMem = ctaLimitSMem;
            result.BlockLimitWarps     = ctaLimitWarps;
            result.BlockLimitBlocks    = ctaLimitBlocks;
            result.partitionedGCConfig = gcConfig;

            // Final occupancy
            result.ActiveBlocksPerMultiProcessor = ctaLimit;
        }
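
A hedged usage sketch of the result-filling variant above, for a fixed launch configuration; props, attributes and state are illustrative placeholders built as in the other examples, and the cudaOccResult constructor call mirrors the one used inside the other snippets.

            cudaOccResult result = new cudaOccResult();

            cudaOccMaxActiveBlocksPerMultiprocessor(
                result,
                props,        // cudaOccDeviceProp
                attributes,   // cudaOccFuncAttributes
                state,        // cudaOccDeviceState
                128,          // blockSize
                0);           // dynamicSmemSize in bytes

            Console.WriteLine("Limiting factors:     " + result.LimitingFactors);
            Console.WriteLine("Active blocks per SM: " + result.ActiveBlocksPerMultiProcessor);
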
Example No. 20
        ///////////////////////////////////
        //      API Implementations      //
        ///////////////////////////////////
        /// <summary>
        /// Determine the maximum number of CTAs that can be run simultaneously per SM.<para/>
        /// This is equivalent to the calculation done in the CUDA Occupancy Calculator
        /// spreadsheet
        /// </summary>
        /// <param name="result"></param>
        /// <param name="properties"></param>
        /// <param name="attributes"></param>
        /// <param name="state"></param>
        /// <param name="blockSize"></param>
        /// <param name="dynamicSmemSize"></param>
        /// <returns></returns>
        public static void cudaOccMaxActiveBlocksPerMultiprocessor(
			cudaOccResult               result,
			cudaOccDeviceProp     properties,
			cudaOccFuncAttributes attributes,
			cudaOccDeviceState    state,
			int                   blockSize,
			SizeT                 dynamicSmemSize)
        {
            int          ctaLimitWarps   = 0;
            int          ctaLimitBlocks  = 0;
            int          ctaLimitSMem    = 0;
            int          ctaLimitRegs    = 0;
            int          ctaLimit        = 0;
            cudaOccLimitingFactors limitingFactors = 0;

            cudaOccPartitionedGCConfig gcConfig = cudaOccPartitionedGCConfig.Off;

            //if (!result || !properties || !attributes || !state || blockSize <= 0) {
            //	return CUDA_OCC_ERROR_INVALID_INPUT;
            //}

            ///////////////////////////
            // Check user input
            ///////////////////////////

            cudaOccInputCheck(properties, attributes, state);

            ///////////////////////////
            // Initialization
            ///////////////////////////

            gcConfig = cudaOccPartitionedGCExpected(properties, attributes);

            ///////////////////////////
            // Compute occupancy
            ///////////////////////////

            // Limits due to registers/SM
            // Also compute if partitioned global caching has to be turned off
            //
            ctaLimitRegs = cudaOccMaxBlocksPerSMRegsLimit(ref gcConfig, result, properties, attributes, blockSize);

            // Limits due to warps/SM
            //
            ctaLimitWarps = cudaOccMaxBlocksPerSMWarpsLimit(gcConfig, properties, attributes, blockSize);

            // Limits due to blocks/SM
            //
            ctaLimitBlocks = cudaOccMaxBlocksPerMultiprocessor(properties);

            // Limits due to shared memory/SM
            //
            ctaLimitSMem = cudaOccMaxBlocksPerSMSmemLimit(result, properties, attributes, state, blockSize, dynamicSmemSize);

            ///////////////////////////
            // Overall occupancy
            ///////////////////////////

            // Overall limit is min() of limits due to above reasons
            //
            ctaLimit = __occMin(ctaLimitRegs, __occMin(ctaLimitSMem, __occMin(ctaLimitWarps, ctaLimitBlocks)));

            // Fill in the return values
            //
            // Determine occupancy limiting factors
            //
            if (ctaLimit == ctaLimitWarps) {
                limitingFactors |= cudaOccLimitingFactors.Warps;
            }
            if (ctaLimit == ctaLimitRegs) {
                limitingFactors |= cudaOccLimitingFactors.Registers;
            }
            if (ctaLimit == ctaLimitSMem) {
                limitingFactors |= cudaOccLimitingFactors.SharedMemory;
            }
            if (ctaLimit == ctaLimitBlocks) {
                limitingFactors |= cudaOccLimitingFactors.Blocks;
            }
            result.LimitingFactors = limitingFactors;

            result.BlockLimitRegs      = ctaLimitRegs;
            result.BlockLimitSharedMem = ctaLimitSMem;
            result.BlockLimitWarps     = ctaLimitWarps;
            result.BlockLimitBlocks    = ctaLimitBlocks;
            result.partitionedGCConfig = gcConfig;

            // Final occupancy
            result.ActiveBlocksPerMultiProcessor = ctaLimit;
        }