// Shared memory limit
//
// Computes the maximum number of thread blocks per SM permitted by shared
// memory usage, and records the rounded-up per-block allocation in result.
private static int cudaOccMaxBlocksPerSMSmemLimit(
    cudaOccResult result,
    cudaOccDeviceProp properties,
    cudaOccFuncAttributes attributes,
    cudaOccDeviceState state,
    int blockSize,
    SizeT dynamicSmemSize)
{
    // Granularity at which shared memory is allocated to a CTA.
    int granularity = cudaOccSMemAllocationGranularity(properties);

    // User preferred shared memory size (derived from the cache configuration).
    // This preference is ignored when the kernel needs more than it allows.
    SizeT preferredSmemPerSM = cudaOccSMemPerMultiprocessor(properties, state.cacheConfig);

    // Static + dynamic shared memory, rounded up to the allocation granularity.
    SizeT requestedPerCTA = attributes.sharedSizeBytes + dynamicSmemSize;
    SizeT allocatedPerCTA = __occRoundUp((int)requestedPerCTA, granularity);
    result.AllocatedSharedMemPerBlock = allocatedPerCTA;

    // A single CTA already exceeds the per-block shared memory limit: nothing fits.
    if (allocatedPerCTA > properties.sharedMemPerBlock)
    {
        return 0;
    }

    // The user preference applies only while at least one CTA still fits under
    // it; otherwise fall back to the full hardware limit per multiprocessor.
    SizeT smemPerSM;
    if (preferredSmemPerSM >= allocatedPerCTA)
    {
        smemPerSM = preferredSmemPerSM;
    }
    else
    {
        smemPerSM = properties.sharedMemPerMultiprocessor;
    }

    if (allocatedPerCTA > 0)
    {
        return (int)(smemPerSM / allocatedPerCTA);
    }

    // No shared memory used at all: shared memory imposes no block limit.
    return int.MaxValue;
}
/// <summary>
/// Determine the potential block size that allows maximum number of CTAs that can run on multiprocessor simultaneously
/// </summary>
/// <param name="properties">device properties</param>
/// <param name="attributes">kernel function attributes</param>
/// <param name="state">device state (cache configuration)</param>
/// <param name="blockSizeToSMem">
/// A function to convert from block size to dynamic shared memory size.<para/>
/// e.g.:
/// If no dynamic shared memory is used: x => 0<para/>
/// If 4 bytes shared memory per thread is used: x = 4 * x</param>
/// <returns>maxBlockSize</returns>
public static int cudaOccMaxPotentialOccupancyBlockSize(
    cudaOccDeviceProp properties,
    cudaOccFuncAttributes attributes,
    cudaOccDeviceState state,
    del_blockSizeToDynamicSMemSize blockSizeToSMem)
{
    int maxOccupancy = properties.maxThreadsPerMultiProcessor;
    int blockSizeLimit = min_(properties.maxThreadsPerBlock, attributes.maxThreadsPerBlock);
    int granularity = properties.warpSize;
    int maxBlockSize = 0;
    int highestOccupancy = 0;

    // Fix: iterate over warp-aligned candidates and clamp each one to the limit.
    // The previous version stepped down from blockSizeLimit itself, so whenever
    // the limit was not a multiple of the warp size every later candidate was
    // misaligned and strictly better aligned sizes (same warp count, more
    // threads, e.g. 224 instead of 218) were never examined.
    int blockSizeLimitAligned = __occRoundUp(blockSizeLimit, granularity);

    for (int blockSizeAligned = blockSizeLimitAligned; blockSizeAligned > 0; blockSizeAligned -= granularity)
    {
        int blockSize = min_(blockSizeLimit, blockSizeAligned);

        cudaOccResult res = cudaOccMaxActiveBlocksPerMultiprocessor(properties, attributes, blockSize,
            blockSizeToSMem(blockSize), state);

        // Occupancy measured in threads: blocks per SM times threads per block.
        int occupancy = blockSize * res.ActiveBlocksPerMultiProcessor;

        if (occupancy > highestOccupancy)
        {
            maxBlockSize = blockSize;
            highestOccupancy = occupancy;
        }

        // Cannot get higher occupancy than the SM's hardware thread capacity.
        if (highestOccupancy == maxOccupancy)
        {
            break;
        }
    }

    return maxBlockSize;
}
// Validates the user-supplied device state.
// Placeholder: no device-state constraints are enforced yet, so every
// state is reported as valid.
private static cudaOccError cudaOccDeviceStateCheck(cudaOccDeviceState state)
{
    cudaOccError status = cudaOccError.None;
    return status;
}
// Validates all user inputs (device properties, kernel attributes and device
// state) and throws a CudaOccupancyException carrying the first error found.
private static void cudaOccInputCheck(
    cudaOccDeviceProp properties,
    cudaOccFuncAttributes attributes,
    cudaOccDeviceState state)
{
    cudaOccError status;

    // Short-circuit: stop at the first check that fails and report its code.
    if ((status = cudaOccDevicePropCheck(properties)) != cudaOccError.None ||
        (status = cudaOccFuncAttributesCheck(attributes)) != cudaOccError.None ||
        (status = cudaOccDeviceStateCheck(state)) != cudaOccError.None)
    {
        throw new CudaOccupancyException(status);
    }
}
///////////////////////////////////////////////
//    Occupancy calculation Functions        //
///////////////////////////////////////////////
/// <summary>
/// Determine the maximum number of CTAs that can be run simultaneously per SM.<para/>
/// This is equivalent to the calculation done in the CUDA Occupancy Calculator
/// spreadsheet
/// </summary>
/// <param name="properties">managed device properties</param>
/// <param name="kernel">the kernel whose launch configuration is examined</param>
/// <param name="state">device state (cache configuration)</param>
/// <returns>occupancy result for the kernel's current block configuration</returns>
public static cudaOccResult cudaOccMaxActiveBlocksPerMultiprocessor(
    CudaDeviceProperties properties,
    CudaKernel kernel,
    cudaOccDeviceState state)
{
    // Flatten the kernel's 3D block configuration into a thread count.
    int threadsPerBlock = (int)kernel.BlockDimensions.x
        * (int)kernel.BlockDimensions.y
        * (int)kernel.BlockDimensions.z;

    // Wrap the managed descriptions in the occupancy calculator's structures
    // and delegate to the core implementation.
    cudaOccDeviceProp occProperties = new cudaOccDeviceProp(properties);
    cudaOccFuncAttributes occAttributes = new cudaOccFuncAttributes(kernel);

    return cudaOccMaxActiveBlocksPerMultiprocessor(occProperties, occAttributes,
        threadsPerBlock, kernel.DynamicSharedMemory, state);
}
/// <summary>
/// Suggests a block size / minimum grid size with maximum occupancy, using a
/// fixed amount of dynamic shared memory for every candidate block size.
/// </summary>
/// <param name="minGridSize">receives the minimum grid size for a full machine launch</param>
/// <param name="blockSize">receives the suggested block size</param>
/// <param name="properties">device properties</param>
/// <param name="attributes">kernel function attributes</param>
/// <param name="state">device state (cache configuration)</param>
/// <param name="dynamicSMemSize">dynamic shared memory per block in bytes</param>
public static void cudaOccMaxPotentialOccupancyBlockSize(
    ref int minGridSize,
    ref int blockSize,
    cudaOccDeviceProp properties,
    cudaOccFuncAttributes attributes,
    cudaOccDeviceState state,
    SizeT dynamicSMemSize)
{
    // No mapping delegate: the fixed dynamicSMemSize applies to all candidates.
    del_blockSizeToDynamicSMemSize noMapping = null;
    cudaOccMaxPotentialOccupancyBlockSize(ref minGridSize, ref blockSize, properties,
        attributes, state, noMapping, dynamicSMemSize);
}
/// <summary>
/// Determine the potential block size that allows maximum number of CTAs that can run on multiprocessor simultaneously
/// </summary>
/// <param name="properties">managed device properties</param>
/// <param name="kernel">the kernel to be launched</param>
/// <param name="state">device state (cache configuration)</param>
/// <param name="blockSizeToSMem">
/// A function to convert from block size to dynamic shared memory size.<para/>
/// e.g.:
/// If no dynamic shared memory is used: x => 0<para/>
/// If 4 bytes shared memory per thread is used: x = 4 * x</param>
/// <returns>maxBlockSize</returns>
public static int cudaOccMaxPotentialOccupancyBlockSize(
    CudaDeviceProperties properties,
    CudaKernel kernel,
    cudaOccDeviceState state,
    del_blockSizeToDynamicSMemSize blockSizeToSMem)
{
    // Convert the managed descriptions into the occupancy calculator's own
    // structures and delegate to the core implementation.
    return cudaOccMaxPotentialOccupancyBlockSize(
        new cudaOccDeviceProp(properties),
        new cudaOccFuncAttributes(kernel),
        state,
        blockSizeToSMem);
}
/// <summary>
/// Determine the maximum number of CTAs that can be run simultaneously per SM.<para/>
/// This is equivalent to the calculation done in the CUDA Occupancy Calculator
/// spreadsheet
/// </summary>
/// <param name="properties">device properties</param>
/// <param name="attributes">kernel function attributes</param>
/// <param name="blockSize">number of threads per block</param>
/// <param name="dynamic_smem_bytes">dynamic shared memory per block in bytes</param>
/// <param name="state">device state (cache configuration)</param>
/// <returns>occupancy result, including the individual limits and limiting factors</returns>
public static cudaOccResult cudaOccMaxActiveBlocksPerMultiprocessor(
    cudaOccDeviceProp properties,
    cudaOccFuncAttributes attributes,
    int blockSize,
    SizeT dynamic_smem_bytes,
    cudaOccDeviceState state)
{
    if (properties == null || attributes == null || blockSize <= 0)
    {
        throw new CudaOccupancyException(cudaOccError.ErrorInvalidInput);
    }

    cudaOccResult result = new cudaOccResult();

    //////////////////////////////////////////
    // Limits due to warps/SM or blocks/SM
    //////////////////////////////////////////
    CudaOccupancyException.CheckZero(properties.warpSize);
    int warpsPerSM = properties.maxThreadsPerMultiProcessor / properties.warpSize;
    int warpGranularity = cudaOccWarpAllocationMultiple(properties);
    CudaOccupancyException.CheckZero(warpGranularity);
    // Warps needed per block, rounded up to the hardware allocation multiple.
    int warpsPerBlock = round_i(divide_ri(blockSize, properties.warpSize), warpGranularity);
    int blocksPerSM = cudaOccMaxBlocksPerMultiprocessor(properties);

    CudaOccupancyException.CheckZero(warpsPerBlock);
    // A block larger than the hardware per-block thread limit can never launch.
    int limitWarps = (blockSize <= properties.maxThreadsPerBlock) ? warpsPerSM / warpsPerBlock : 0;
    int limitBlocks = blocksPerSM;

    //////////////////////////////////////////
    // Limits due to shared memory/SM
    //////////////////////////////////////////
    int smemUnit = cudaOccSMemAllocationUnit(properties);
    int smemNeeded = (int)(attributes.sharedSizeBytes + dynamic_smem_bytes);
    CudaOccupancyException.CheckZero(smemUnit);
    int smemPerBlock = round_i(smemNeeded, smemUnit);

    // sharedMemoryPerMultiprocessor is by default the limit set in hardware, but
    // the user requested shared memory limit is used instead if it is greater
    // than the total shared memory used by the function.
    int preferredSMem = cudaOccSMemPerMultiprocessor(properties, state.cacheConfig);
    int smemPerSM = (preferredSMem >= smemPerBlock) ? preferredSMem : (int)properties.sharedMemPerMultiprocessor;

    // The block count is limited by shared memory per SM, but the function's
    // total usage must still fit within the per-block limit.
    int limitSMem = 0;
    if (properties.sharedMemPerBlock >= (SizeT)smemPerBlock)
    {
        limitSMem = (smemPerBlock > 0) ? smemPerSM / smemPerBlock : blocksPerSM;
    }

    //////////////////////////////////////////
    // Limits due to registers/SM
    //////////////////////////////////////////
    int regUnit = cudaOccRegAllocationUnit(properties, attributes.numRegs);
    CudaOccupancyException.CheckZero(regUnit);

    int limitRegs = 0;
    int allocRegsPerBlock = 0;
    if (properties.major <= 1)
    {
        // GPUs of compute capability 1.x allocate registers to CTAs:
        // regs per thread * warp size * warps per block, rounded up to the allocation unit.
        allocRegsPerBlock = round_i(attributes.numRegs * properties.warpSize * warpsPerBlock, regUnit);
        limitRegs = (allocRegsPerBlock > 0) ? properties.regsPerMultiprocessor / allocRegsPerBlock : blocksPerSM;
    }
    else
    {
        // GPUs of compute capability 2.x and higher allocate registers to warps:
        // regs per thread * warp size, rounded up to the allocation unit.
        int regsPerWarp = round_i(attributes.numRegs * properties.warpSize, regUnit);
        allocRegsPerBlock = regsPerWarp * warpsPerBlock;
        if (properties.regsPerBlock >= allocRegsPerBlock)
        {
            int sides = cudaOccSidesPerMultiprocessor(properties);
            CudaOccupancyException.CheckZero(sides);
            int regsPerSide = properties.regsPerMultiprocessor / sides;
            limitRegs = (regsPerWarp > 0) ? ((regsPerSide / regsPerWarp) * sides) / warpsPerBlock : blocksPerSM;
        }
    }

    //////////////////////////////////////////
    // Overall limit is min() of limits due to above reasons
    //////////////////////////////////////////
    int ctaLimit = min_(limitRegs, min_(limitSMem, min_(limitWarps, limitBlocks)));
    result.ActiveBlocksPerMultiProcessor = ctaLimit;

    // Determine which resources are the occupancy limiting factors.
    cudaOccLimitingFactors limitingFactors = 0;
    if (ctaLimit == limitWarps)
    {
        limitingFactors |= cudaOccLimitingFactors.Warps;
    }
    if (ctaLimit == limitRegs && allocRegsPerBlock > 0)
    {
        limitingFactors |= cudaOccLimitingFactors.Registers;
    }
    if (ctaLimit == limitSMem && smemPerBlock > 0)
    {
        limitingFactors |= cudaOccLimitingFactors.SharedMemory;
    }
    if (ctaLimit == limitBlocks)
    {
        limitingFactors |= cudaOccLimitingFactors.Blocks;
    }
    result.LimitingFactors = limitingFactors;

    result.BlockLimitRegs = limitRegs;
    result.BlockLimitSharedMem = limitSMem;
    result.BlockLimitWarps = limitWarps;
    result.BlockLimitBlocks = limitBlocks;
    result.BllocatedRegistersPerBlock = allocRegsPerBlock;
    result.AllocatedSharedMemPerBlock = smemPerBlock;

    result.ActiveWarpsPerMultiProcessor = ctaLimit * ((int)Math.Ceiling(blockSize / (double)properties.warpSize));
    result.ActiceThreadsPerMultiProcessor = result.ActiveWarpsPerMultiProcessor * properties.warpSize;
    result.OccupancyOfEachMultiProcessor = (int)Math.Round(result.ActiveWarpsPerMultiProcessor / (double)warpsPerSM * 100);
    return result;
}
/// <summary>
/// Determine the potential block size that allows maximum number of CTAs that can run on multiprocessor simultaneously
/// </summary>
/// <param name="properties">device properties</param>
/// <param name="attributes">kernel function attributes</param>
/// <param name="state">device state (cache configuration)</param>
/// <param name="blockSizeToSMem">
/// A function to convert from block size to dynamic shared memory size.<para/>
/// e.g.:
/// If no dynamic shared memory is used: x => 0<para/>
/// If 4 bytes shared memory per thread is used: x = 4 * x</param>
/// <returns>maxBlockSize</returns>
public static int cudaOccMaxPotentialOccupancyBlockSize(
    cudaOccDeviceProp properties,
    cudaOccFuncAttributes attributes,
    cudaOccDeviceState state,
    del_blockSizeToDynamicSMemSize blockSizeToSMem)
{
    int maxOccupancy = properties.maxThreadsPerMultiProcessor;
    int largestBlockSize = min_(properties.maxThreadsPerBlock, attributes.maxThreadsPerBlock);
    int granularity = properties.warpSize;
    int maxBlockSize = 0;
    int highestOccupancy = 0;

    // Fix: step through warp-aligned candidates clamped to the block size limit.
    // Previously the loop counted down from largestBlockSize directly, so when
    // that limit was not a multiple of warpSize, all but the first candidate
    // were misaligned and better aligned block sizes were skipped entirely.
    int largestAligned = __occRoundUp(largestBlockSize, granularity);

    for (int aligned = largestAligned; aligned > 0; aligned -= granularity)
    {
        int blockSize = min_(largestBlockSize, aligned);

        cudaOccResult res = cudaOccMaxActiveBlocksPerMultiprocessor(properties, attributes, blockSize,
            blockSizeToSMem(blockSize), state);

        // Occupancy in threads = active blocks per SM * threads per block.
        int occupancy = blockSize * res.ActiveBlocksPerMultiProcessor;

        if (occupancy > highestOccupancy)
        {
            maxBlockSize = blockSize;
            highestOccupancy = occupancy;
        }

        // Can not get higher occupancy than the SM's thread capacity.
        if (highestOccupancy == maxOccupancy)
            break;
    }

    return maxBlockSize;
}
/// <summary>
/// Determine the potential block size that allows maximum number of CTAs that can run on multiprocessor simultaneously
/// </summary>
/// <param name="properties">managed device properties</param>
/// <param name="kernel">the kernel to be launched</param>
/// <param name="state">device state (cache configuration)</param>
/// <param name="blockSizeToSMem">
/// A function to convert from block size to dynamic shared memory size.<para/>
/// e.g.:
/// If no dynamic shared memory is used: x => 0<para/>
/// If 4 bytes shared memory per thread is used: x = 4 * x</param>
/// <returns>maxBlockSize</returns>
public static int cudaOccMaxPotentialOccupancyBlockSize(
    CudaDeviceProperties properties,
    CudaKernel kernel,
    cudaOccDeviceState state,
    del_blockSizeToDynamicSMemSize blockSizeToSMem)
{
    // Wrap the managed descriptions in the occupancy calculator's own
    // structures before calling the core implementation.
    cudaOccDeviceProp occProps = new cudaOccDeviceProp(properties);
    cudaOccFuncAttributes occAttrs = new cudaOccFuncAttributes(kernel);
    int bestBlockSize = cudaOccMaxPotentialOccupancyBlockSize(occProps, occAttrs, state, blockSizeToSMem);
    return bestBlockSize;
}
/// <summary>
/// Determine the maximum number of CTAs that can be run simultaneously per SM.<para/>
/// This is equivalent to the calculation done in the CUDA Occupancy Calculator
/// spreadsheet
/// </summary>
/// <param name="properties">device properties</param>
/// <param name="attributes">kernel function attributes</param>
/// <param name="blockSize">number of threads per block</param>
/// <param name="dynamic_smem_bytes">dynamic shared memory per block in bytes</param>
/// <param name="state">device state (cache configuration)</param>
/// <returns>occupancy result, including the individual limits and limiting factors</returns>
public static cudaOccResult cudaOccMaxActiveBlocksPerMultiprocessor(
    cudaOccDeviceProp properties,
    cudaOccFuncAttributes attributes,
    int blockSize,
    SizeT dynamic_smem_bytes,
    cudaOccDeviceState state)
{
    cudaOccLimitingFactors limitingFactors = 0;
    cudaOccResult result = new cudaOccResult();

    if (properties == null || attributes == null || blockSize <= 0)
    {
        throw new CudaOccupancyException(cudaOccError.ErrorInvalidInput);
    }

    //////////////////////////////////////////
    // Limits due to warps/SM or blocks/SM
    //////////////////////////////////////////
    CudaOccupancyException.CheckZero(properties.warpSize);
    int maxWarpsPerSm = properties.maxThreadsPerMultiProcessor / properties.warpSize;
    int warpAllocationMultiple = cudaOccWarpAllocationMultiple(properties);
    CudaOccupancyException.CheckZero(warpAllocationMultiple);
    // Warps needed per block, rounded up to the hardware allocation multiple.
    int warpsPerCTA = round_i(divide_ri(blockSize, properties.warpSize), warpAllocationMultiple);
    int maxBlocksPerSM = cudaOccMaxBlocksPerMultiprocessor(properties);

    CudaOccupancyException.CheckZero(warpsPerCTA);
    // A block exceeding the per-block thread limit can never be resident.
    int ctaLimitWarps = (blockSize <= properties.maxThreadsPerBlock)
        ? maxWarpsPerSm / warpsPerCTA
        : 0;
    int ctaLimitBlocks = maxBlocksPerSM;

    //////////////////////////////////////////
    // Limits due to shared memory/SM
    //////////////////////////////////////////
    int smemAllocationUnit = cudaOccSMemAllocationUnit(properties);
    int smemBytes = (int)(attributes.sharedSizeBytes + dynamic_smem_bytes);
    CudaOccupancyException.CheckZero(smemAllocationUnit);
    int smemPerCTA = round_i(smemBytes, smemAllocationUnit);

    // sharedMemoryPerMultiprocessor is by default the limit set in hardware, but
    // the user requested shared memory limit is used instead if it is greater
    // than the total shared memory used by the function.
    int cacheConfigSMem = cudaOccSMemPerMultiprocessor(properties, state.cacheConfig);
    int sharedMemPerMultiprocessor = (cacheConfigSMem >= smemPerCTA)
        ? cacheConfigSMem
        : (int)properties.sharedMemPerMultiprocessor;

    // The limit on launched blocks uses shared memory per SM, but the total
    // shared memory used by the function is bounded by shared memory per block.
    int ctaLimitSMem = 0;
    if (properties.sharedMemPerBlock >= (SizeT)smemPerCTA)
    {
        ctaLimitSMem = smemPerCTA > 0 ? sharedMemPerMultiprocessor / smemPerCTA : maxBlocksPerSM;
    }

    //////////////////////////////////////////
    // Limits due to registers/SM
    //////////////////////////////////////////
    int regAllocationUnit = cudaOccRegAllocationUnit(properties, attributes.numRegs);
    CudaOccupancyException.CheckZero(regAllocationUnit);

    int ctaLimitRegs = 0;
    int regsPerCTA = 0;
    if (properties.major <= 1)
    {
        // GPUs of compute capability 1.x allocate registers to CTAs:
        // regs per thread * warp size * warps per CTA, rounded up to the allocation unit.
        regsPerCTA = round_i(attributes.numRegs * properties.warpSize * warpsPerCTA, regAllocationUnit);
        ctaLimitRegs = regsPerCTA > 0 ? properties.regsPerMultiprocessor / regsPerCTA : maxBlocksPerSM;
    }
    else
    {
        // GPUs of compute capability 2.x and higher allocate registers to warps:
        // regs per thread * warp size, rounded up to the allocation unit.
        int regsPerWarp = round_i(attributes.numRegs * properties.warpSize, regAllocationUnit);
        regsPerCTA = regsPerWarp * warpsPerCTA;
        if (properties.regsPerBlock >= regsPerCTA)
        {
            int numSides = cudaOccSidesPerMultiprocessor(properties);
            CudaOccupancyException.CheckZero(numSides);
            int numRegsPerSide = properties.regsPerMultiprocessor / numSides;
            ctaLimitRegs = regsPerWarp > 0
                ? ((numRegsPerSide / regsPerWarp) * numSides) / warpsPerCTA
                : maxBlocksPerSM;
        }
    }

    //////////////////////////////////////////
    // Overall limit is min() of limits due to above reasons
    //////////////////////////////////////////
    int ctaLimit = min_(ctaLimitRegs, min_(ctaLimitSMem, min_(ctaLimitWarps, ctaLimitBlocks)));
    result.ActiveBlocksPerMultiProcessor = ctaLimit;

    // Determine which resources are the occupancy limiting factors.
    if (ctaLimit == ctaLimitWarps)
    {
        limitingFactors |= cudaOccLimitingFactors.Warps;
    }
    if (ctaLimit == ctaLimitRegs && regsPerCTA > 0)
    {
        limitingFactors |= cudaOccLimitingFactors.Registers;
    }
    if (ctaLimit == ctaLimitSMem && smemPerCTA > 0)
    {
        limitingFactors |= cudaOccLimitingFactors.SharedMemory;
    }
    if (ctaLimit == ctaLimitBlocks)
    {
        limitingFactors |= cudaOccLimitingFactors.Blocks;
    }
    result.LimitingFactors = limitingFactors;

    result.BlockLimitRegs = ctaLimitRegs;
    result.BlockLimitSharedMem = ctaLimitSMem;
    result.BlockLimitWarps = ctaLimitWarps;
    result.BlockLimitBlocks = ctaLimitBlocks;
    result.BllocatedRegistersPerBlock = regsPerCTA;
    result.AllocatedSharedMemPerBlock = smemPerCTA;

    result.ActiveWarpsPerMultiProcessor = ctaLimit * ((int)Math.Ceiling(blockSize / (double)properties.warpSize));
    result.ActiceThreadsPerMultiProcessor = result.ActiveWarpsPerMultiProcessor * properties.warpSize;
    result.OccupancyOfEachMultiProcessor = (int)Math.Round(result.ActiveWarpsPerMultiProcessor / (double)maxWarpsPerSm * 100);
    return result;
}
///////////////////////////////////////////////
//    Occupancy calculation Functions        //
///////////////////////////////////////////////
/// <summary>
/// Determine the maximum number of CTAs that can be run simultaneously per SM.<para/>
/// This is equivalent to the calculation done in the CUDA Occupancy Calculator
/// spreadsheet
/// </summary>
/// <param name="properties">managed device properties</param>
/// <param name="kernel">the kernel whose launch configuration is examined</param>
/// <param name="state">device state (cache configuration)</param>
/// <returns>occupancy result for the kernel's current block configuration</returns>
public static cudaOccResult cudaOccMaxActiveBlocksPerMultiprocessor(
    CudaDeviceProperties properties,
    CudaKernel kernel,
    cudaOccDeviceState state)
{
    cudaOccDeviceProp props = new cudaOccDeviceProp(properties);
    cudaOccFuncAttributes attributes = new cudaOccFuncAttributes(kernel);

    // Threads per block = product of the kernel's 3D block dimensions.
    int threads = (int)kernel.BlockDimensions.x
        * (int)kernel.BlockDimensions.y
        * (int)kernel.BlockDimensions.z;

    return cudaOccMaxActiveBlocksPerMultiprocessor(props, attributes, threads,
        kernel.DynamicSharedMemory, state);
}
/// <summary>
/// Suggests the block size with maximum occupancy, and the minimum grid size
/// needed to achieve a full machine launch with that block size.
/// </summary>
/// <param name="minGridSize">receives the suggested minimum grid size</param>
/// <param name="blockSize">receives the suggested block size</param>
/// <param name="properties">device properties</param>
/// <param name="attributes">kernel function attributes</param>
/// <param name="state">device state (cache configuration)</param>
/// <param name="blockSizeToDynamicSMemSize">optional mapping from block size to dynamic shared memory; overrides dynamicSMemSize when non-null</param>
/// <param name="dynamicSMemSize">fixed dynamic shared memory per block in bytes</param>
public static void cudaOccMaxPotentialOccupancyBlockSize(
    ref int minGridSize,
    ref int blockSize,
    cudaOccDeviceProp properties,
    cudaOccFuncAttributes attributes,
    cudaOccDeviceState state,
    del_blockSizeToDynamicSMemSize blockSizeToDynamicSMemSize,
    SizeT dynamicSMemSize)
{
    ///////////////////////////
    // Check user input
    ///////////////////////////
    cudaOccInputCheck(properties, attributes, state);

    cudaOccResult result = new cudaOccResult();

    // Hardware limits bounding the search.
    int occupancyLimit = properties.maxThreadsPerMultiProcessor;
    int granularity = properties.warpSize;
    int blockSizeLimit = __occMin(properties.maxThreadsPerBlock, attributes.maxThreadsPerBlock);
    int blockSizeLimitAligned = __occRoundUp(blockSizeLimit, granularity);

    // Best configuration found so far.
    int maxBlockSize = 0;
    int numBlocks = 0;
    int maxOccupancy = 0;

    /////////////////////////////////////////////////////////////////////////////////
    // Try each block size, and pick the block size with maximum occupancy
    /////////////////////////////////////////////////////////////////////////////////
    for (int alignedSize = blockSizeLimitAligned; alignedSize > 0; alignedSize -= granularity)
    {
        int blockSizeToTry = __occMin(blockSizeLimit, alignedSize);

        // Ignore dynamicSMemSize if the user provides a mapping.
        SizeT smemForTry = dynamicSMemSize;
        if (blockSizeToDynamicSMemSize != null)
        {
            smemForTry = blockSizeToDynamicSMemSize(blockSizeToTry);
        }

        cudaOccMaxActiveBlocksPerMultiprocessor(
            result,
            properties,
            attributes,
            state,
            blockSizeToTry,
            smemForTry);

        int occupancyInBlocks = result.ActiveBlocksPerMultiProcessor;
        int occupancyInThreads = blockSizeToTry * occupancyInBlocks;

        if (occupancyInThreads > maxOccupancy)
        {
            maxBlockSize = blockSizeToTry;
            numBlocks = occupancyInBlocks;
            maxOccupancy = occupancyInThreads;
        }

        // Early out once the SM's full thread capacity is reached.
        if (occupancyLimit == maxOccupancy)
        {
            break;
        }
    }

    ///////////////////////////
    // Return best available
    ///////////////////////////
    // Suggested min grid size to achieve a full machine launch.
    minGridSize = numBlocks * properties.numSms;
    blockSize = maxBlockSize;
}
// Shared memory limit
//
// Returns the maximum number of blocks per SM allowed by shared memory usage;
// also records the rounded-up per-block shared memory allocation in result.
private static int cudaOccMaxBlocksPerSMSmemLimit(
    cudaOccResult result,
    cudaOccDeviceProp properties,
    cudaOccFuncAttributes attributes,
    cudaOccDeviceState state,
    int blockSize,
    SizeT dynamicSmemSize)
{
    int maxBlocks;

    int allocationGranularity = cudaOccSMemAllocationGranularity(properties);

    // Obtain the user preferred shared memory size. This setting is ignored if
    // the kernel requests more shared memory than preferred.
    SizeT userSmemPreference = cudaOccSMemPerMultiprocessor(properties, state.cacheConfig);

    // Static + dynamic usage, rounded up to the allocation granularity.
    SizeT totalSmemUsagePerCTA = attributes.sharedSizeBytes + dynamicSmemSize;
    SizeT smemAllocatedPerCTA = __occRoundUp((int)totalSmemUsagePerCTA, allocationGranularity);

    if (smemAllocatedPerCTA > properties.sharedMemPerBlock)
    {
        // A single CTA already violates the per-block limit: nothing can launch.
        maxBlocks = 0;
    }
    else if (!(smemAllocatedPerCTA > 0))
    {
        // No shared memory used: shared memory imposes no limit on block count.
        maxBlocks = int.MaxValue;
    }
    else
    {
        // The user requested limit applies while at least one CTA fits under it;
        // otherwise the maximum hardware shared memory limit is used instead.
        SizeT sharedMemPerMultiprocessor =
            (userSmemPreference >= smemAllocatedPerCTA)
                ? userSmemPreference
                : properties.sharedMemPerMultiprocessor;
        maxBlocks = (int)(sharedMemPerMultiprocessor / smemAllocatedPerCTA);
    }

    result.AllocatedSharedMemPerBlock = smemAllocatedPerCTA;
    return maxBlocks;
}
// Validates the user-supplied device state.
// Placeholder: device-state validation is not implemented, so any state is
// reported as valid.
private static cudaOccError cudaOccDeviceStateCheck(cudaOccDeviceState state)
{
    cudaOccError validationResult = cudaOccError.None;
    return validationResult;
}
///////////////////////////////////
//    API Implementations        //
///////////////////////////////////
/// <summary>
/// Determine the maximum number of CTAs that can be run simultaneously per SM.<para/>
/// This is equivalent to the calculation done in the CUDA Occupancy Calculator
/// spreadsheet
/// </summary>
/// <param name="result">receives the occupancy result</param>
/// <param name="properties">device properties</param>
/// <param name="attributes">kernel function attributes</param>
/// <param name="state">device state (cache configuration)</param>
/// <param name="blockSize">number of threads per block</param>
/// <param name="dynamicSmemSize">dynamic shared memory per block in bytes</param>
/// <returns></returns>
public static void cudaOccMaxActiveBlocksPerMultiprocessor(
    cudaOccResult result,
    cudaOccDeviceProp properties,
    cudaOccFuncAttributes attributes,
    cudaOccDeviceState state,
    int blockSize,
    SizeT dynamicSmemSize)
{
    ///////////////////////////
    // Check user input
    ///////////////////////////
    cudaOccInputCheck(properties, attributes, state);

    ///////////////////////////
    // Initialization
    ///////////////////////////
    // Expected partitioned global caching mode; may be switched off by the
    // register limit calculation below (passed by reference there).
    cudaOccPartitionedGCConfig gcConfig = cudaOccPartitionedGCExpected(properties, attributes);

    ///////////////////////////
    // Compute occupancy
    ///////////////////////////
    // Limits due to registers/SM; also decides whether partitioned global
    // caching has to be turned off.
    int ctaLimitRegs = cudaOccMaxBlocksPerSMRegsLimit(ref gcConfig, result, properties, attributes, blockSize);
    // Limits due to warps/SM.
    int ctaLimitWarps = cudaOccMaxBlocksPerSMWarpsLimit(gcConfig, properties, attributes, blockSize);
    // Limits due to blocks/SM.
    int ctaLimitBlocks = cudaOccMaxBlocksPerMultiprocessor(properties);
    // Limits due to shared memory/SM.
    int ctaLimitSMem = cudaOccMaxBlocksPerSMSmemLimit(result, properties, attributes, state, blockSize, dynamicSmemSize);

    ///////////////////////////
    // Overall occupancy
    ///////////////////////////
    // Overall limit is min() of the limits due to the above reasons.
    int ctaLimit = __occMin(ctaLimitRegs, __occMin(ctaLimitSMem, __occMin(ctaLimitWarps, ctaLimitBlocks)));

    // Determine occupancy limiting factors.
    cudaOccLimitingFactors limitingFactors = 0;
    if (ctaLimit == ctaLimitWarps)
    {
        limitingFactors |= cudaOccLimitingFactors.Warps;
    }
    if (ctaLimit == ctaLimitRegs)
    {
        limitingFactors |= cudaOccLimitingFactors.Registers;
    }
    if (ctaLimit == ctaLimitSMem)
    {
        limitingFactors |= cudaOccLimitingFactors.SharedMemory;
    }
    if (ctaLimit == ctaLimitBlocks)
    {
        limitingFactors |= cudaOccLimitingFactors.Blocks;
    }

    // Fill in the return values.
    result.LimitingFactors = limitingFactors;
    result.BlockLimitRegs = ctaLimitRegs;
    result.BlockLimitSharedMem = ctaLimitSMem;
    result.BlockLimitWarps = ctaLimitWarps;
    result.BlockLimitBlocks = ctaLimitBlocks;
    result.partitionedGCConfig = gcConfig;

    // Final occupancy.
    result.ActiveBlocksPerMultiProcessor = ctaLimit;
}