///////////////////////////////////////////////
//    Occupancy calculation Functions        //
///////////////////////////////////////////////

/// <summary>
/// Determine the maximum number of CTAs that can be run simultaneously per SM.<para/>
/// This is equivalent to the calculation done in the CUDA Occupancy Calculator
/// spreadsheet.<para/>
/// Convenience overload: wraps <paramref name="properties"/> in a <c>cudaOccDeviceProp</c>
/// and <paramref name="kernel"/> in a <c>cudaOccFuncAttributes</c>, then forwards to the
/// overload that takes those types together with an explicit block size and dynamic
/// shared memory size.
/// </summary>
/// <param name="properties">Device properties used to construct the <c>cudaOccDeviceProp</c>.</param>
/// <param name="kernel">Kernel supplying the function attributes, the block dimensions and the dynamic shared memory size.</param>
/// <param name="state">Occupancy device state, forwarded unchanged.</param>
/// <returns>The <c>cudaOccResult</c> produced by the forwarded overload.</returns>
public static cudaOccResult cudaOccMaxActiveBlocksPerMultiprocessor(
    CudaDeviceProperties properties,
    CudaKernel kernel,
    cudaOccDeviceState state)
{
    cudaOccDeviceProp deviceProps = new cudaOccDeviceProp(properties);
    cudaOccFuncAttributes funcAttributes = new cudaOccFuncAttributes(kernel);

    // Block size passed on is the product of the x, y and z block dimensions.
    int blockSize = (int)kernel.BlockDimensions.x
                  * (int)kernel.BlockDimensions.y
                  * (int)kernel.BlockDimensions.z;

    return cudaOccMaxActiveBlocksPerMultiprocessor(
        deviceProps,
        funcAttributes,
        blockSize,
        kernel.DynamicSharedMemory,
        state);
}
/// <summary>
/// Determine the potential block size that allows the maximum number of CTAs to run
/// on a multiprocessor simultaneously.<para/>
/// Convenience overload: wraps <paramref name="properties"/> in a <c>cudaOccDeviceProp</c>
/// and <paramref name="kernel"/> in a <c>cudaOccFuncAttributes</c>, then forwards to the
/// overload that takes those types.
/// </summary>
/// <param name="properties">Device properties used to construct the <c>cudaOccDeviceProp</c>.</param>
/// <param name="kernel">Kernel used to construct the <c>cudaOccFuncAttributes</c>.</param>
/// <param name="state">Occupancy device state, forwarded unchanged.</param>
/// <param name="blockSizeToSMem">
/// A function to convert from block size to dynamic shared memory size.<para/>
/// e.g.:
/// If no dynamic shared memory is used: x => 0<para/>
/// If 4 bytes shared memory per thread is used: x => 4 * x</param>
/// <returns>maxBlockSize</returns>
public static int cudaOccMaxPotentialOccupancyBlockSize(
    CudaDeviceProperties properties,
    CudaKernel kernel,
    cudaOccDeviceState state,
    del_blockSizeToDynamicSMemSize blockSizeToSMem)
{
    cudaOccDeviceProp deviceProps = new cudaOccDeviceProp(properties);
    cudaOccFuncAttributes funcAttributes = new cudaOccFuncAttributes(kernel);

    int maxBlockSize = cudaOccMaxPotentialOccupancyBlockSize(
        deviceProps,
        funcAttributes,
        state,
        blockSizeToSMem);

    return maxBlockSize;
}
/// <summary>
/// Initializes the occupancy device description by copying the relevant values
/// from a <c>CudaDeviceProperties</c> instance.
/// </summary>
/// <param name="props">Device properties whose values are copied into this instance.</param>
public cudaOccDeviceProp(CudaDeviceProperties props)
{
    // Compute capability
    this.major = props.ComputeCapabilityMajor;
    this.minor = props.ComputeCapabilityMinor;

    // Thread limits
    this.maxThreadsPerBlock = props.MaxThreadsPerBlock;
    this.maxThreadsPerMultiProcessor = props.MaxThreadsPerMultiProcessor;

    // Register limits
    this.regsPerBlock = props.RegistersPerBlock;
    this.regsPerMultiprocessor = props.MaxRegistersPerMultiprocessor;

    // Warp size
    this.warpSize = props.WarpSize;

    // Shared memory limits
    this.sharedMemPerBlock = props.SharedMemoryPerBlock;
    this.sharedMemPerMultiprocessor = props.MaxSharedMemoryPerMultiprocessor;
}
/// <summary>
/// Determine the potential block size that allows the maximum number of CTAs to run
/// on a multiprocessor simultaneously.<para/>
/// This overload accepts the high-level <c>CudaDeviceProperties</c> and <c>CudaKernel</c>
/// objects, converts them to <c>cudaOccDeviceProp</c> and <c>cudaOccFuncAttributes</c>,
/// and delegates to the overload operating on those types.
/// </summary>
/// <param name="properties">Device properties; converted to a <c>cudaOccDeviceProp</c>.</param>
/// <param name="kernel">Kernel; converted to a <c>cudaOccFuncAttributes</c>.</param>
/// <param name="state">Occupancy device state, passed through as is.</param>
/// <param name="blockSizeToSMem">
/// A function to convert from block size to dynamic shared memory size.<para/>
/// e.g.:
/// If no dynamic shared memory is used: x => 0<para/>
/// If 4 bytes shared memory per thread is used: x => 4 * x</param>
/// <returns>maxBlockSize</returns>
public static int cudaOccMaxPotentialOccupancyBlockSize(
    CudaDeviceProperties properties,
    CudaKernel kernel,
    cudaOccDeviceState state,
    del_blockSizeToDynamicSMemSize blockSizeToSMem)
{
    // Arguments are evaluated left to right, so the device description is still
    // constructed before the function attributes.
    return cudaOccMaxPotentialOccupancyBlockSize(
        new cudaOccDeviceProp(properties),
        new cudaOccFuncAttributes(kernel),
        state,
        blockSizeToSMem);
}
///////////////////////////////////////////////
//    Occupancy calculation Functions        //
///////////////////////////////////////////////

/// <summary>
/// Determine the maximum number of CTAs that can be run simultaneously per SM.<para/>
/// This is equivalent to the calculation done in the CUDA Occupancy Calculator
/// spreadsheet.<para/>
/// This overload accepts the high-level <c>CudaDeviceProperties</c> and <c>CudaKernel</c>
/// objects, converts them to <c>cudaOccDeviceProp</c> and <c>cudaOccFuncAttributes</c>,
/// and delegates to the overload operating on those types.
/// </summary>
/// <param name="properties">Device properties; converted to a <c>cudaOccDeviceProp</c>.</param>
/// <param name="kernel">Kernel; provides the function attributes, block dimensions and dynamic shared memory size.</param>
/// <param name="state">Occupancy device state, passed through as is.</param>
/// <returns>The <c>cudaOccResult</c> returned by the delegated overload.</returns>
public static cudaOccResult cudaOccMaxActiveBlocksPerMultiprocessor(
    CudaDeviceProperties properties,
    CudaKernel kernel,
    cudaOccDeviceState state)
{
    cudaOccDeviceProp occDevice = new cudaOccDeviceProp(properties);
    cudaOccFuncAttributes occFunction = new cudaOccFuncAttributes(kernel);

    // Threads per block along each axis, narrowed to int before multiplying.
    int threadsX = (int)kernel.BlockDimensions.x;
    int threadsY = (int)kernel.BlockDimensions.y;
    int threadsZ = (int)kernel.BlockDimensions.z;
    int threadsPerBlock = threadsX * threadsY * threadsZ;

    return cudaOccMaxActiveBlocksPerMultiprocessor(
        occDevice, occFunction, threadsPerBlock, kernel.DynamicSharedMemory, state);
}