/// <summary>
/// Determine the block size that gives the highest occupancy, measured as
/// blockSize * active blocks per multiprocessor (i.e. resident threads per multiprocessor).<para/>
/// Candidates are tried from the block size limit downwards. Every candidate except possibly the
/// first one (the limit itself) is a multiple of the warp size. On ties the larger block size wins.
/// </summary>
/// <param name="properties">Device properties; maxThreadsPerMultiProcessor, maxThreadsPerBlock and warpSize are read here.</param>
/// <param name="attributes">Kernel attributes; maxThreadsPerBlock caps the candidate block sizes.</param>
/// <param name="state">Device state, forwarded unchanged to cudaOccMaxActiveBlocksPerMultiprocessor.</param>
/// <param name="blockSizeToSMem">
/// A function to convert from block size to dynamic shared memory size.<para/>
/// e.g.:
/// If no dynamic shared memory is used: x => 0<para/>
/// If 4 bytes shared memory per thread is used: x => 4 * x</param>
/// <returns>
/// The best block size found, or 0 if the block size limit is not positive or no candidate
/// achieves a non-zero occupancy.
/// </returns>
public static int cudaOccMaxPotentialOccupancyBlockSize(
    cudaOccDeviceProp properties,
    cudaOccFuncAttributes attributes,
    cudaOccDeviceState state,
    del_blockSizeToDynamicSMemSize blockSizeToSMem)
{
    int occupancyLimit = properties.maxThreadsPerMultiProcessor;
    int blockSizeLimit = min_(properties.maxThreadsPerBlock, attributes.maxThreadsPerBlock);
    int granularity = properties.warpSize;

    // Nothing to search: same result the loop below would produce for a non-positive limit.
    if (blockSizeLimit <= 0)
    {
        return 0;
    }

    // Round the limit up to a whole number of warps so that, after the first (clamped) candidate,
    // all further candidates are warp-aligned. Stepping down from an unaligned limit would
    // otherwise only ever probe partial-warp block sizes.
    int blockSizeLimitAligned = ((blockSizeLimit + granularity - 1) / granularity) * granularity;

    int maxBlockSize = 0;
    int highestOccupancy = 0;

    for (int alignedBlockSize = blockSizeLimitAligned; alignedBlockSize > 0; alignedBlockSize -= granularity)
    {
        // Clamp so the first candidate never exceeds the real limit.
        int blockSize = min_(blockSizeLimit, alignedBlockSize);

        cudaOccResult res = cudaOccMaxActiveBlocksPerMultiprocessor(
            properties, attributes, blockSize, blockSizeToSMem(blockSize), state);

        // Occupancy in threads: threads per block times resident blocks.
        int occupancy = blockSize * res.ActiveBlocksPerMultiProcessor;

        if (occupancy > highestOccupancy)
        {
            maxBlockSize = blockSize;
            highestOccupancy = occupancy;
        }

        // Can not get higher occupancy than the per-multiprocessor thread limit.
        if (highestOccupancy == occupancyLimit)
        {
            break;
        }
    }

    return maxBlockSize;
}
/// <summary>
/// Convenience overload: wraps the given device properties and kernel into the occupancy
/// calculator's own descriptor types (cudaOccDeviceProp and cudaOccFuncAttributes) and forwards
/// to the overload that takes those descriptors.
/// </summary>
/// <param name="properties">Device properties used to construct a cudaOccDeviceProp.</param>
/// <param name="kernel">Kernel used to construct a cudaOccFuncAttributes.</param>
/// <param name="state">Device state, passed through unchanged.</param>
/// <param name="blockSizeToSMem">
/// A function to convert from block size to dynamic shared memory size.<para/>
/// e.g.:
/// If no dynamic shared memory is used: x => 0<para/>
/// If 4 bytes shared memory per thread is used: x => 4 * x</param>
/// <returns>The block size returned by the descriptor-based overload.</returns>
public static int cudaOccMaxPotentialOccupancyBlockSize(
    CudaDeviceProperties properties,
    CudaKernel kernel,
    cudaOccDeviceState state,
    del_blockSizeToDynamicSMemSize blockSizeToSMem)
{
    // Arguments are evaluated left to right, so the device descriptor is built before the
    // kernel descriptor.
    return cudaOccMaxPotentialOccupancyBlockSize(
        new cudaOccDeviceProp(properties),
        new cudaOccFuncAttributes(kernel),
        state,
        blockSizeToSMem);
}
/// <summary>
/// Determine the block size that gives the highest occupancy, measured as
/// blockSize * active blocks per multiprocessor (i.e. resident threads per multiprocessor).<para/>
/// Candidates are tried from the block size limit downwards. Every candidate except possibly the
/// first one (the limit itself) is a multiple of the warp size. On ties the larger block size wins.
/// </summary>
/// <param name="properties">Device properties; maxThreadsPerMultiProcessor, maxThreadsPerBlock and warpSize are read here.</param>
/// <param name="attributes">Kernel attributes; maxThreadsPerBlock caps the candidate block sizes.</param>
/// <param name="state">Device state, forwarded unchanged to cudaOccMaxActiveBlocksPerMultiprocessor.</param>
/// <param name="blockSizeToSMem">
/// A function to convert from block size to dynamic shared memory size.<para/>
/// e.g.:
/// If no dynamic shared memory is used: x => 0<para/>
/// If 4 bytes shared memory per thread is used: x => 4 * x</param>
/// <returns>
/// The best block size found, or 0 if the block size limit is not positive or no candidate
/// achieves a non-zero occupancy.
/// </returns>
public static int cudaOccMaxPotentialOccupancyBlockSize(
    cudaOccDeviceProp properties,
    cudaOccFuncAttributes attributes,
    cudaOccDeviceState state,
    del_blockSizeToDynamicSMemSize blockSizeToSMem)
{
    int occupancyLimit = properties.maxThreadsPerMultiProcessor;
    int blockSizeLimit = min_(properties.maxThreadsPerBlock, attributes.maxThreadsPerBlock);
    int granularity = properties.warpSize;

    // Nothing to search: same result the loop below would produce for a non-positive limit.
    if (blockSizeLimit <= 0)
    {
        return 0;
    }

    // Round the limit up to a whole number of warps so that, after the first (clamped) candidate,
    // all further candidates are warp-aligned. Stepping down from an unaligned limit would
    // otherwise only ever probe partial-warp block sizes.
    int blockSizeLimitAligned = ((blockSizeLimit + granularity - 1) / granularity) * granularity;

    int maxBlockSize = 0;
    int highestOccupancy = 0;

    for (int alignedBlockSize = blockSizeLimitAligned; alignedBlockSize > 0; alignedBlockSize -= granularity)
    {
        // Clamp so the first candidate never exceeds the real limit.
        int blockSize = min_(blockSizeLimit, alignedBlockSize);

        cudaOccResult res = cudaOccMaxActiveBlocksPerMultiprocessor(
            properties, attributes, blockSize, blockSizeToSMem(blockSize), state);

        // Occupancy in threads: threads per block times resident blocks.
        int occupancy = blockSize * res.ActiveBlocksPerMultiProcessor;

        if (occupancy > highestOccupancy)
        {
            maxBlockSize = blockSize;
            highestOccupancy = occupancy;
        }

        // Can not get higher occupancy than the per-multiprocessor thread limit.
        if (highestOccupancy == occupancyLimit)
        {
            break;
        }
    }

    return maxBlockSize;
}
/// <summary>
/// Convenience overload: wraps the given device properties and kernel into the occupancy
/// calculator's own descriptor types (cudaOccDeviceProp and cudaOccFuncAttributes) and forwards
/// to the overload that takes those descriptors.
/// </summary>
/// <param name="properties">Device properties used to construct a cudaOccDeviceProp.</param>
/// <param name="kernel">Kernel used to construct a cudaOccFuncAttributes.</param>
/// <param name="state">Device state, passed through unchanged.</param>
/// <param name="blockSizeToSMem">
/// A function to convert from block size to dynamic shared memory size.<para/>
/// e.g.:
/// If no dynamic shared memory is used: x => 0<para/>
/// If 4 bytes shared memory per thread is used: x => 4 * x</param>
/// <returns>The block size returned by the descriptor-based overload.</returns>
public static int cudaOccMaxPotentialOccupancyBlockSize(
    CudaDeviceProperties properties,
    CudaKernel kernel,
    cudaOccDeviceState state,
    del_blockSizeToDynamicSMemSize blockSizeToSMem)
{
    // Arguments are evaluated left to right, so the device descriptor is built before the
    // kernel descriptor.
    return cudaOccMaxPotentialOccupancyBlockSize(
        new cudaOccDeviceProp(properties),
        new cudaOccFuncAttributes(kernel),
        state,
        blockSizeToSMem);
}
/// <summary>
/// Searches for the block size that maximizes occupancy in threads per multiprocessor
/// (block size * active blocks per multiprocessor) and reports it together with a grid size
/// derived from it.<para/>
/// Candidates start at the block size limit (the smaller of the device's and the kernel's
/// maxThreadsPerBlock) and continue downwards through multiples of the warp size. On ties the
/// larger block size is kept.
/// </summary>
/// <param name="minGridSize">
/// Receives (active blocks per multiprocessor at the chosen block size) * properties.numSms.
/// The incoming value is not read.</param>
/// <param name="blockSize">
/// Receives the chosen block size, or 0 if no candidate achieved a non-zero occupancy.
/// The incoming value is not read.</param>
/// <param name="properties">Device properties; checked by cudaOccInputCheck before use.</param>
/// <param name="attributes">Kernel attributes; checked by cudaOccInputCheck before use.</param>
/// <param name="state">Device state; checked by cudaOccInputCheck and forwarded to the occupancy query.</param>
/// <param name="blockSizeToDynamicSMemSize">
/// Optional mapping from block size to dynamic shared memory size. When not null it is evaluated
/// for every candidate and overrides <paramref name="dynamicSMemSize"/>.</param>
/// <param name="dynamicSMemSize">
/// Dynamic shared memory size used for every candidate when no mapping is supplied.</param>
public static void cudaOccMaxPotentialOccupancyBlockSize(
    ref int minGridSize,
    ref int blockSize,
    cudaOccDeviceProp properties,
    cudaOccFuncAttributes attributes,
    cudaOccDeviceState state,
    del_blockSizeToDynamicSMemSize blockSizeToDynamicSMemSize,
    SizeT dynamicSMemSize)
{
    // A single result object is reused for every occupancy query.
    cudaOccResult result = new cudaOccResult();

    // Validate user input before touching any of it.
    cudaOccInputCheck(properties, attributes, state);

    // Search limits.
    int threadsPerSmLimit = properties.maxThreadsPerMultiProcessor;
    int warpStep = properties.warpSize;
    int threadsPerBlockLimit = __occMin(properties.maxThreadsPerBlock, attributes.maxThreadsPerBlock);

    // Best candidate seen so far.
    int bestBlockSize = 0;
    int bestBlockCount = 0;
    int bestThreadCount = 0;

    // Walk the warp-aligned candidates from the rounded-up limit down to one warp.
    int alignedCandidate = __occRoundUp(threadsPerBlockLimit, warpStep);
    while (alignedCandidate > 0)
    {
        // The rounded-up first candidate may exceed the real limit; clamp it.
        int candidate = __occMin(threadsPerBlockLimit, alignedCandidate);

        // A user-provided mapping takes precedence over the fixed dynamicSMemSize.
        if (blockSizeToDynamicSMemSize != null)
        {
            dynamicSMemSize = blockSizeToDynamicSMemSize(candidate);
        }

        cudaOccMaxActiveBlocksPerMultiprocessor(
            result,
            properties,
            attributes,
            state,
            candidate,
            dynamicSMemSize);

        int activeBlocks = result.ActiveBlocksPerMultiProcessor;
        int activeThreads = candidate * activeBlocks;

        if (activeThreads > bestThreadCount)
        {
            bestBlockSize = candidate;
            bestBlockCount = activeBlocks;
            bestThreadCount = activeThreads;
        }

        // Early out: the multiprocessor's thread limit has been reached, nothing can beat it.
        if (threadsPerSmLimit == bestThreadCount)
        {
            break;
        }

        alignedCandidate -= warpStep;
    }

    // Suggested min grid size to achieve a full machine launch.
    minGridSize = bestBlockCount * properties.numSms;
    blockSize = bestBlockSize;
}