////////////////////////////////////////////////////////////////////////////////
// Potential occupancy calculator
//
// The potential occupancy is calculated according to the kernel and
// execution configuration the user desires. Occupancy is defined in
// terms of active blocks per multiprocessor, and the user can convert
// it to other metrics.
//
// This wrapper routine computes the occupancy of the kernel and reports
// it in terms of active warps / maximum warps per SM.
//
// Parameters:
//   blockSize   - block size of the launch configuration being evaluated.
//   dynamicSMem - dynamic shared memory amount forwarded unchanged to the
//                 occupancy calculator (presumably bytes per block; confirm
//                 against the CudaOccupancy API).
//
// Returns:
//   activeWarps / maxWarps as a fraction; callers multiply by 100 to get a
//   percentage.
////////////////////////////////////////////////////////////////////////////////
static double reportPotentialOccupancy(int blockSize, SizeT dynamicSMem)
{
    int numBlocks;
    int activeWarps;
    int maxWarps;
    double occupancy;

    // Device properties are built with argument 0 (presumably device ordinal 0;
    // confirm against the cudaOccDeviceProp constructor).
    CudaOccupancy.cudaOccDeviceProp prop = new CudaOccupancy.cudaOccDeviceProp(0);
    CudaOccupancy.cudaOccResult result = new CudaOccupancy.cudaOccResult();
    CudaOccupancy.cudaOccFuncAttributes attributes = new CudaOccupancy.cudaOccFuncAttributes(kernel);
    CudaOccupancy.cudaOccDeviceState state = new CudaOccupancy.cudaOccDeviceState();
    state.cacheConfig = CudaOccupancy.cudaOccCacheConfig.PreferNone;

    CudaOccupancy.cudaOccMaxActiveBlocksPerMultiprocessor(result, prop, attributes, state, blockSize, dynamicSMem);

    numBlocks = result.ActiveBlocksPerMultiProcessor;

    // Convert active blocks to active warps. The result is stored in an int,
    // so any fractional warp count is discarded.
    activeWarps = numBlocks * blockSize / prop.warpSize;
    maxWarps = prop.maxThreadsPerMultiProcessor / prop.warpSize;

    // Cast before dividing so the ratio is not truncated to an integer.
    occupancy = (double)activeWarps / maxWarps;

    return(occupancy);
}
////////////////////////////////////////////////////////////////////////////////
// Occupancy-based launch configurator
//
// The launch configurator, cudaOccupancyMaxPotentialBlockSize and
// cudaOccupancyMaxPotentialBlockSizeVariableSMem, suggests a block
// size that achieves the best theoretical occupancy. It also returns
// the minimum number of blocks needed to achieve the occupancy on the
// whole device.
//
// This launch configurator is purely occupancy-based. It doesn't
// translate directly to performance, but the suggestion should
// nevertheless be a good starting point for further optimizations.
//
// This function configures the launch based on the "automatic"
// argument, records the runtime, and reports occupancy and runtime.
//
// Parameters:
//   array      - device buffer whose DevicePointer is passed to the kernel.
//   arrayCount - element count passed to the kernel and used to size the grid.
//   automatic  - true: use the block size suggested by the occupancy API;
//                false: use manualBlockSize.
//
// Returns:
//   Always 0.
////////////////////////////////////////////////////////////////////////////////
static int launchConfig(CudaDeviceVariable <int> array, int arrayCount, bool automatic)
{
    int blockSize = 0;
    int minGridSize = 0;
    int gridSize;
    SizeT dynamicSMemUsage = 0;

    float elapsedTime;

    double potentialOccupancy;

    CudaOccupancy.cudaOccDeviceState state = new CudaOccupancy.cudaOccDeviceState();
    state.cacheConfig = CudaOccupancy.cudaOccCacheConfig.PreferNone;

    if (automatic)
    {
        CudaOccupancy.cudaOccMaxPotentialOccupancyBlockSize(ref minGridSize, ref blockSize, new CudaOccupancy.cudaOccDeviceProp(0), new CudaOccupancy.cudaOccFuncAttributes(kernel), state, dynamicSMemUsage);

        Console.WriteLine("Suggested block size: {0}", blockSize);
        Console.WriteLine("Minimum grid size for maximum occupancy: {0}", minGridSize);
    }
    else
    {
        // This block size is too small. Given limited number of
        // active blocks per multiprocessor, the number of active
        // threads will be limited, and thus unable to achieve maximum
        // occupancy.
        //
        blockSize = manualBlockSize;
    }

    // Round up so that gridSize * blockSize covers every element
    //
    gridSize = (arrayCount + blockSize - 1) / blockSize;

    // Launch and profile
    //
    kernel.GridDimensions = gridSize;
    kernel.BlockDimensions = blockSize;
    elapsedTime = kernel.Run(array.DevicePointer, arrayCount);

    // Calculate occupancy (a fraction, hence the scaling to percent below)
    //
    potentialOccupancy = reportPotentialOccupancy(blockSize, dynamicSMemUsage);

    Console.WriteLine("Potential occupancy: {0}%", potentialOccupancy * 100);

    // Report elapsed time. kernel.Run already reports milliseconds, so the
    // value is printed unscaled; multiplying by 100 (as the percentage line
    // does) would overstate the time a hundredfold.
    //
    Console.WriteLine("Elapsed time: {0}ms", elapsedTime);

    return(0);
}