示例#1
0
        /// <summary>
        /// Searches for the block size that keeps the largest number of threads resident on one
        /// multiprocessor, i.e. the block size that maximizes
        /// <c>blockSize * ActiveBlocksPerMultiProcessor</c>.
        /// </summary>
        /// <remarks>
        /// Candidates are tried from the largest permitted block size downwards. The first candidate is the
        /// block size limit itself; every following candidate is a multiple of the warp size. When several
        /// candidates reach the same occupancy, the largest of them is returned. The search stops early as
        /// soon as the occupancy equals <c>properties.maxThreadsPerMultiProcessor</c>.
        /// </remarks>
        /// <param name="properties">Device properties; supplies the warp size and the per-block and
        /// per-multiprocessor thread limits.</param>
        /// <param name="attributes">Kernel attributes; <c>maxThreadsPerBlock</c> caps the block size.</param>
        /// <param name="state">Device state, forwarded unchanged to the occupancy calculation.</param>
        /// <param name="blockSizeToSMem">
        /// A function to convert from block size to dynamic shared memory size.<para/>
        /// e.g.:
        /// If no dynamic shared memory is used: x => 0<para/>
        /// If 4 bytes shared memory per thread is used: x => 4 * x<para/>
        /// <c>null</c> is treated like <c>x => 0</c>.</param>
        /// <returns>The best block size found, or 0 if no candidate yields a non-zero occupancy.</returns>
        /// <exception cref="System.ArgumentOutOfRangeException">
        /// <c>properties.warpSize</c> is not positive (the search could not terminate).</exception>
        public static int cudaOccMaxPotentialOccupancyBlockSize(
            cudaOccDeviceProp properties,
            cudaOccFuncAttributes attributes,
            cudaOccDeviceState state,
            del_blockSizeToDynamicSMemSize blockSizeToSMem)
        {
            int maxOccupancy   = properties.maxThreadsPerMultiProcessor;
            int blockSizeLimit = min_(properties.maxThreadsPerBlock, attributes.maxThreadsPerBlock);
            int granularity    = properties.warpSize;

            // The loop below steps down by 'granularity'; a non-positive step would never reach zero.
            if (granularity <= 0)
            {
                throw new System.ArgumentOutOfRangeException("properties", "properties.warpSize must be greater than zero.");
            }

            // No mapping supplied: assume the kernel uses no dynamic shared memory.
            if (blockSizeToSMem == null)
            {
                blockSizeToSMem = x => 0;
            }

            int maxBlockSize     = 0;
            int highestOccupancy = 0;

            // Round the limit up to the next multiple of the warp size so that, after the first
            // (clamped) candidate, every candidate is warp aligned even if the limit itself is not.
            int blockSizeLimitAligned = ((blockSizeLimit + granularity - 1) / granularity) * granularity;

            for (int blockSizeAligned = blockSizeLimitAligned; blockSizeAligned > 0; blockSizeAligned -= granularity)
            {
                // Only the first candidate can exceed the limit; clamp it.
                int blockSize = min_(blockSizeLimit, blockSizeAligned);

                cudaOccResult res = cudaOccMaxActiveBlocksPerMultiprocessor(properties, attributes, blockSize, blockSizeToSMem(blockSize), state);

                // Occupancy measured in resident threads per multiprocessor.
                int occupancy = blockSize * res.ActiveBlocksPerMultiProcessor;

                // Strict comparison: on ties the earlier (larger) block size is kept.
                if (occupancy > highestOccupancy)
                {
                    maxBlockSize     = blockSize;
                    highestOccupancy = occupancy;
                }

                // can not get higher occupancy
                if (highestOccupancy == maxOccupancy)
                {
                    break;
                }
            }

            return maxBlockSize;
        }
示例#2
0
        /// <summary>
        /// Convenience overload: wraps <paramref name="properties"/> in a <c>cudaOccDeviceProp</c> and
        /// <paramref name="kernel"/> in a <c>cudaOccFuncAttributes</c>, then forwards to the overload
        /// that takes those types.
        /// </summary>
        /// <param name="properties">Device properties used to build the <c>cudaOccDeviceProp</c>.</param>
        /// <param name="kernel">Kernel used to build the <c>cudaOccFuncAttributes</c>.</param>
        /// <param name="state">Device state, passed through unchanged.</param>
        /// <param name="blockSizeToSMem">
        /// A function to convert from block size to dynamic shared memory size, passed through unchanged.<para/>
        /// e.g.:
        /// If no dynamic shared memory is used: x => 0<para/>
        /// If 4 bytes shared memory per thread is used: x => 4 * x</param>
        /// <returns>The block size returned by the forwarded call.</returns>
        public static int cudaOccMaxPotentialOccupancyBlockSize(
            CudaDeviceProperties properties,
            CudaKernel kernel,
            cudaOccDeviceState state,
            del_blockSizeToDynamicSMemSize blockSizeToSMem)
        {
            // Arguments are evaluated left to right, so the device wrapper is still built first.
            return cudaOccMaxPotentialOccupancyBlockSize(
                new cudaOccDeviceProp(properties),
                new cudaOccFuncAttributes(kernel),
                state,
                blockSizeToSMem);
        }
示例#3
0
		/// <summary>
		/// Searches for the block size that keeps the largest number of threads resident on one
		/// multiprocessor, i.e. the block size that maximizes
		/// <c>blockSize * ActiveBlocksPerMultiProcessor</c>.
		/// </summary>
		/// <remarks>
		/// Candidates are tried from the largest permitted block size downwards. The first candidate is the
		/// block size limit itself; every following candidate is a multiple of the warp size. When several
		/// candidates reach the same occupancy, the largest of them is returned. The search stops early as
		/// soon as the occupancy equals <c>properties.maxThreadsPerMultiProcessor</c>.
		/// </remarks>
		/// <param name="properties">Device properties; supplies the warp size and the per-block and
		/// per-multiprocessor thread limits.</param>
		/// <param name="attributes">Kernel attributes; <c>maxThreadsPerBlock</c> caps the block size.</param>
		/// <param name="state">Device state, forwarded unchanged to the occupancy calculation.</param>
		/// <param name="blockSizeToSMem">
		/// A function to convert from block size to dynamic shared memory size.<para/>
		/// e.g.:
		/// If no dynamic shared memory is used: x => 0<para/>
		/// If 4 bytes shared memory per thread is used: x => 4 * x<para/>
		/// <c>null</c> is treated like <c>x => 0</c>.</param>
		/// <returns>The best block size found, or 0 if no candidate yields a non-zero occupancy.</returns>
		/// <exception cref="System.ArgumentOutOfRangeException">
		/// <c>properties.warpSize</c> is not positive (the search could not terminate).</exception>
		public static int cudaOccMaxPotentialOccupancyBlockSize(
			cudaOccDeviceProp properties,
			cudaOccFuncAttributes attributes,
			cudaOccDeviceState state,
			del_blockSizeToDynamicSMemSize blockSizeToSMem)
		{
			int maxOccupancy = properties.maxThreadsPerMultiProcessor;
			int blockSizeLimit = min_(properties.maxThreadsPerBlock, attributes.maxThreadsPerBlock);
			int granularity = properties.warpSize;

			// The loop below steps down by 'granularity'; a non-positive step would never reach zero.
			if (granularity <= 0)
				throw new System.ArgumentOutOfRangeException("properties", "properties.warpSize must be greater than zero.");

			// No mapping supplied: assume the kernel uses no dynamic shared memory.
			if (blockSizeToSMem == null)
				blockSizeToSMem = x => 0;

			int maxBlockSize = 0;
			int highestOccupancy = 0;

			// Round the limit up to the next multiple of the warp size so that, after the first
			// (clamped) candidate, every candidate is warp aligned even if the limit itself is not.
			int blockSizeLimitAligned = ((blockSizeLimit + granularity - 1) / granularity) * granularity;

			for (int blockSizeAligned = blockSizeLimitAligned; blockSizeAligned > 0; blockSizeAligned -= granularity)
			{
				// Only the first candidate can exceed the limit; clamp it.
				int blockSize = min_(blockSizeLimit, blockSizeAligned);

				cudaOccResult res = cudaOccMaxActiveBlocksPerMultiprocessor(properties, attributes, blockSize, blockSizeToSMem(blockSize), state);

				// Occupancy measured in resident threads per multiprocessor.
				int occupancy = blockSize * res.ActiveBlocksPerMultiProcessor;

				// Strict comparison: on ties the earlier (larger) block size is kept.
				if (occupancy > highestOccupancy)
				{
					maxBlockSize = blockSize;
					highestOccupancy = occupancy;
				}

				// can not get higher occupancy
				if (highestOccupancy == maxOccupancy)
					break;
			}

			return maxBlockSize;
		}
示例#4
0
		/// <summary>
		/// Convenience overload: builds a <c>cudaOccDeviceProp</c> from <paramref name="properties"/> and a
		/// <c>cudaOccFuncAttributes</c> from <paramref name="kernel"/>, then forwards to the overload
		/// that takes those types.
		/// </summary>
		/// <param name="properties">Device properties used to build the <c>cudaOccDeviceProp</c>.</param>
		/// <param name="kernel">Kernel used to build the <c>cudaOccFuncAttributes</c>.</param>
		/// <param name="state">Device state, passed through unchanged.</param>
		/// <param name="blockSizeToSMem">
		/// A function to convert from block size to dynamic shared memory size, passed through unchanged.<para/>
		/// e.g.:
		/// If no dynamic shared memory is used: x => 0<para/>
		/// If 4 bytes shared memory per thread is used: x => 4 * x</param>
		/// <returns>The block size returned by the forwarded call.</returns>
		public static int cudaOccMaxPotentialOccupancyBlockSize(
			CudaDeviceProperties properties,
			CudaKernel kernel,
			cudaOccDeviceState state,
			del_blockSizeToDynamicSMemSize blockSizeToSMem)
		{
			// Arguments are evaluated left to right, so the device wrapper is still built first.
			return cudaOccMaxPotentialOccupancyBlockSize(
				new cudaOccDeviceProp(properties),
				new cudaOccFuncAttributes(kernel),
				state,
				blockSizeToSMem);
		}
示例#5
0
        /// <summary>
        /// Tries block sizes from the largest permitted one downwards and reports the one that gives the
        /// most resident threads per multiprocessor, together with a suggested minimum grid size.
        /// </summary>
        /// <remarks>
        /// The first candidate is the block size limit (the smaller of the device and kernel
        /// <c>maxThreadsPerBlock</c>); later candidates step down by the warp size from the limit rounded
        /// up to a warp multiple. On ties the larger block size is kept. The search ends early once the
        /// best thread count equals <c>properties.maxThreadsPerMultiProcessor</c>.
        /// </remarks>
        /// <param name="minGridSize">Receives the active blocks per multiprocessor of the chosen block size
        /// multiplied by <c>properties.numSms</c>. The incoming value is not read.</param>
        /// <param name="blockSize">Receives the chosen block size, or 0 if no candidate produced a non-zero
        /// occupancy. The incoming value is not read.</param>
        /// <param name="properties">Device properties.</param>
        /// <param name="attributes">Kernel attributes.</param>
        /// <param name="state">Device state, forwarded to the occupancy calculation.</param>
        /// <param name="blockSizeToDynamicSMemSize">Optional mapping from block size to dynamic shared memory
        /// size. When not <c>null</c> it replaces <paramref name="dynamicSMemSize"/> for every candidate.</param>
        /// <param name="dynamicSMemSize">Dynamic shared memory size used for every candidate when no mapping
        /// is supplied.</param>
        public static void cudaOccMaxPotentialOccupancyBlockSize(
            ref int minGridSize,
            ref int blockSize,
            cudaOccDeviceProp properties,
            cudaOccFuncAttributes attributes,
            cudaOccDeviceState state,
            del_blockSizeToDynamicSMemSize blockSizeToDynamicSMemSize,
            SizeT dynamicSMemSize)
        {
            // Reused for every candidate; the occupancy call fills it in.
            cudaOccResult occResult = new cudaOccResult();

            // Validate the inputs before searching.
            cudaOccInputCheck(properties, attributes, state);

            // Search bounds.
            int threadsPerSmCap  = properties.maxThreadsPerMultiProcessor;
            int warpStep         = properties.warpSize;
            int hardLimit        = __occMin(properties.maxThreadsPerBlock, attributes.maxThreadsPerBlock);
            int candidateAligned = __occRoundUp(hardLimit, warpStep);

            // Best configuration seen so far.
            int bestBlockSize    = 0;
            int bestBlocksPerSm  = 0;
            int bestThreadsPerSm = 0;

            while (candidateAligned > 0)
            {
                // The aligned value may overshoot the real limit on the first pass.
                int candidate = __occMin(hardLimit, candidateAligned);

                // A user-supplied mapping takes precedence over the fixed size.
                if (blockSizeToDynamicSMemSize != null)
                {
                    dynamicSMemSize = blockSizeToDynamicSMemSize(candidate);
                }

                cudaOccMaxActiveBlocksPerMultiprocessor(
                    occResult,
                    properties,
                    attributes,
                    state,
                    candidate,
                    dynamicSMemSize);

                int blocksPerSm  = occResult.ActiveBlocksPerMultiProcessor;
                int threadsPerSm = candidate * blocksPerSm;

                if (threadsPerSm > bestThreadsPerSm)
                {
                    bestBlockSize    = candidate;
                    bestBlocksPerSm  = blocksPerSm;
                    bestThreadsPerSm = threadsPerSm;
                }

                // Nothing can beat a fully occupied multiprocessor.
                if (bestThreadsPerSm == threadsPerSmCap)
                {
                    break;
                }

                candidateAligned -= warpStep;
            }

            // Grid size needed to fill every multiprocessor with the chosen configuration.
            minGridSize = bestBlocksPerSm * properties.numSms;
            blockSize   = bestBlockSize;
        }
示例#6
0
        /// <summary>
        /// Picks the block size with the highest occupancy, measured as block size times active blocks per
        /// multiprocessor, and derives a minimum grid size from it.
        /// </summary>
        /// <remarks>
        /// Candidates start at the block size limit (the smaller of the device and kernel
        /// <c>maxThreadsPerBlock</c>) and then walk down in warp-size steps from that limit rounded up to a
        /// warp multiple. A later candidate only replaces the current best if it is strictly better. The
        /// loop exits early when the best occupancy equals <c>properties.maxThreadsPerMultiProcessor</c>.
        /// </remarks>
        /// <param name="minGridSize">Set to the active blocks per multiprocessor of the selected block size
        /// times <c>properties.numSms</c>. Its incoming value is ignored.</param>
        /// <param name="blockSize">Set to the selected block size; 0 if every candidate had zero occupancy.
        /// Its incoming value is ignored.</param>
        /// <param name="properties">Device properties.</param>
        /// <param name="attributes">Kernel attributes.</param>
        /// <param name="state">Device state, forwarded to the occupancy calculation.</param>
        /// <param name="blockSizeToDynamicSMemSize">Optional mapping from block size to dynamic shared memory
        /// size. If it is not <c>null</c>, <paramref name="dynamicSMemSize"/> is overwritten per candidate.</param>
        /// <param name="dynamicSMemSize">Fixed dynamic shared memory size, used only when no mapping is
        /// given.</param>
        public static void cudaOccMaxPotentialOccupancyBlockSize(
            ref int minGridSize,
            ref int blockSize,
            cudaOccDeviceProp properties,
            cudaOccFuncAttributes attributes,
            cudaOccDeviceState state,
            del_blockSizeToDynamicSMemSize blockSizeToDynamicSMemSize,
            SizeT dynamicSMemSize)
        {
            // One result object, refilled by each occupancy query.
            cudaOccResult query = new cudaOccResult();

            // Validate the inputs before searching.
            cudaOccInputCheck(properties, attributes, state);

            int fullOccupancy = properties.maxThreadsPerMultiProcessor;
            int step          = properties.warpSize;
            int limit         = __occMin(properties.maxThreadsPerBlock, attributes.maxThreadsPerBlock);
            int limitAligned  = __occRoundUp(limit, step);

            // Running best.
            int chosenBlockSize = 0;
            int chosenBlocks    = 0;
            int chosenThreads   = 0;

            for (int aligned = limitAligned; aligned > 0; aligned -= step)
            {
                // Clamp: the rounded-up start value can be larger than the limit.
                int trial = __occMin(limit, aligned);

                // The mapping, when present, overrides the fixed shared memory size.
                if (blockSizeToDynamicSMemSize != null)
                {
                    dynamicSMemSize = blockSizeToDynamicSMemSize(trial);
                }

                cudaOccMaxActiveBlocksPerMultiprocessor(query, properties, attributes, state, trial, dynamicSMemSize);

                int activeBlocks  = query.ActiveBlocksPerMultiProcessor;
                int activeThreads = trial * activeBlocks;

                if (activeThreads > chosenThreads)
                {
                    chosenBlockSize = trial;
                    chosenBlocks    = activeBlocks;
                    chosenThreads   = activeThreads;
                }

                // The multiprocessor is full; smaller blocks cannot improve on this.
                if (chosenThreads == fullOccupancy)
                {
                    break;
                }
            }

            // Enough blocks to occupy every multiprocessor at the selected configuration.
            minGridSize = chosenBlocks * properties.numSms;
            blockSize   = chosenBlockSize;
        }