/// <summary>
/// Determine the maximum number of CTAs that can be run simultaneously per SM.<para/>
/// This is equivalent to the calculation done in the CUDA Occupancy Calculator
/// spreadsheet
/// </summary>
/// <param name="properties">Device properties; must not be null. Reads warpSize,
/// maxThreadsPerMultiProcessor, maxThreadsPerBlock, sharedMemPerBlock,
/// sharedMemPerMultiprocessor, major, regsPerMultiprocessor and regsPerBlock.</param>
/// <param name="attributes">Function attributes; must not be null. Reads numRegs and sharedSizeBytes.</param>
/// <param name="blockSize">Number of threads per block; must be greater than zero.</param>
/// <param name="dynamic_smem_bytes">Dynamic shared memory per block in bytes; added to attributes.sharedSizeBytes.</param>
/// <param name="state">Device state; only cacheConfig is read. Not null-checked here.</param>
/// <returns>A new <c>cudaOccResult</c> holding the per-resource block limits, the overall
/// number of active blocks per multiprocessor, the limiting factors and derived
/// warp/thread/occupancy figures.</returns>
/// <exception cref="CudaOccupancyException">Thrown with ErrorInvalidInput when
/// <paramref name="properties"/> or <paramref name="attributes"/> is null or
/// <paramref name="blockSize"/> is not positive. The CheckZero guards may also raise
/// it for zero divisors (presumably - CheckZero is declared elsewhere).</exception>
public static cudaOccResult cudaOccMaxActiveBlocksPerMultiprocessor(
    cudaOccDeviceProp properties,
    cudaOccFuncAttributes attributes,
    int blockSize,
    SizeT dynamic_smem_bytes,
    cudaOccDeviceState state)
{
    cudaOccResult result = new cudaOccResult();

    if (properties == null || attributes == null || blockSize <= 0)
    {
        throw new CudaOccupancyException(cudaOccError.ErrorInvalidInput);
    }

    //////////////////////////////////////////
    // Limits due to warps/SM or blocks/SM
    //////////////////////////////////////////
    CudaOccupancyException.CheckZero(properties.warpSize);
    int maxWarpsPerSm = properties.maxThreadsPerMultiProcessor / properties.warpSize;

    int warpAllocationMultiple = cudaOccWarpAllocationMultiple(properties);
    CudaOccupancyException.CheckZero(warpAllocationMultiple);

    // Warps needed by one CTA, rounded to the warp allocation multiple.
    int warpsPerCTA = round_i(divide_ri(blockSize, properties.warpSize), warpAllocationMultiple);
    int maxBlocksPerSM = cudaOccMaxBlocksPerMultiprocessor(properties);

    // Calc limits: a block larger than maxThreadsPerBlock cannot run at all.
    CudaOccupancyException.CheckZero(warpsPerCTA);
    int ctaLimitWarps = (blockSize <= properties.maxThreadsPerBlock)
        ? maxWarpsPerSm / warpsPerCTA
        : 0;
    int ctaLimitBlocks = maxBlocksPerSM;

    //////////////////////////////////////////
    // Limits due to shared memory/SM
    //////////////////////////////////////////
    int smemAllocationUnit = cudaOccSMemAllocationUnit(properties);
    int smemBytes = (int)(attributes.sharedSizeBytes + dynamic_smem_bytes);
    CudaOccupancyException.CheckZero(smemAllocationUnit);
    int smemPerCTA = round_i(smemBytes, smemAllocationUnit);

    // Calc limit
    int cacheConfigSMem = cudaOccSMemPerMultiprocessor(properties, state.cacheConfig);

    // Use the shared memory size selected by the cache configuration when it can hold
    // one CTA's allocation; otherwise fall back to properties.sharedMemPerMultiprocessor.
    int sharedMemPerMultiprocessor = (cacheConfigSMem >= smemPerCTA)
        ? cacheConfigSMem
        : (int)properties.sharedMemPerMultiprocessor;

    // Limit on blocks launched should be calculated with shared memory per SM but total shared memory
    // used by function should be limited by shared memory per block.
    // The limit stays 0 when one CTA needs more than sharedMemPerBlock.
    int ctaLimitSMem = 0;
    if (properties.sharedMemPerBlock >= (SizeT)smemPerCTA)
    {
        ctaLimitSMem = smemPerCTA > 0
            ? sharedMemPerMultiprocessor / smemPerCTA
            : maxBlocksPerSM;
    }

    //////////////////////////////////////////
    // Limits due to registers/SM
    //////////////////////////////////////////
    int regAllocationUnit = cudaOccRegAllocationUnit(properties, attributes.numRegs);
    CudaOccupancyException.CheckZero(regAllocationUnit);

    // Calc limit
    int ctaLimitRegs = 0;
    int regsPerCTA;
    if (properties.major <= 1)
    {
        // GPUs of compute capability 1.x allocate registers to CTAs
        // Number of regs per block is regs per thread times number of warps times warp size, rounded up to allocation unit
        regsPerCTA = round_i(attributes.numRegs * properties.warpSize * warpsPerCTA, regAllocationUnit);
        ctaLimitRegs = regsPerCTA > 0
            ? properties.regsPerMultiprocessor / regsPerCTA
            : maxBlocksPerSM;
    }
    else
    {
        // GPUs of compute capability 2.x and higher allocate registers to warps
        // Number of regs per warp is regs per thread times warp size, rounded up to allocation unit
        int regsPerWarp = round_i(attributes.numRegs * properties.warpSize, regAllocationUnit);
        regsPerCTA = regsPerWarp * warpsPerCTA;

        // The limit stays 0 when one CTA needs more registers than regsPerBlock.
        if (properties.regsPerBlock >= regsPerCTA)
        {
            int numSides = cudaOccSidesPerMultiprocessor(properties);
            CudaOccupancyException.CheckZero(numSides);
            int numRegsPerSide = properties.regsPerMultiprocessor / numSides;
            ctaLimitRegs = regsPerWarp > 0
                ? ((numRegsPerSide / regsPerWarp) * numSides) / warpsPerCTA
                : maxBlocksPerSM;
        }
    }

    //////////////////////////////////////////
    // Overall limit is min() of limits due to above reasons
    //////////////////////////////////////////
    int ctaLimit = min_(ctaLimitRegs, min_(ctaLimitSMem, min_(ctaLimitWarps, ctaLimitBlocks)));

    // Determine occupancy limiting factors.
    // Registers / shared memory are only reported when the kernel actually uses them.
    cudaOccLimitingFactors limitingFactors = 0;
    if (ctaLimit == ctaLimitWarps)
    {
        limitingFactors |= cudaOccLimitingFactors.Warps;
    }
    if (ctaLimit == ctaLimitRegs && regsPerCTA > 0)
    {
        limitingFactors |= cudaOccLimitingFactors.Registers;
    }
    if (ctaLimit == ctaLimitSMem && smemPerCTA > 0)
    {
        limitingFactors |= cudaOccLimitingFactors.SharedMemory;
    }
    if (ctaLimit == ctaLimitBlocks)
    {
        limitingFactors |= cudaOccLimitingFactors.Blocks;
    }

    result.ActiveBlocksPerMultiProcessor = ctaLimit;
    result.LimitingFactors = limitingFactors;

    result.BlockLimitRegs = ctaLimitRegs;
    result.BlockLimitSharedMem = ctaLimitSMem;
    result.BlockLimitWarps = ctaLimitWarps;
    result.BlockLimitBlocks = ctaLimitBlocks;

    result.BllocatedRegistersPerBlock = regsPerCTA;
    result.AllocatedSharedMemPerBlock = smemPerCTA;

    result.ActiveWarpsPerMultiProcessor = ctaLimit * ((int)Math.Ceiling(blockSize / (double)properties.warpSize));
    result.ActiceThreadsPerMultiProcessor = result.ActiveWarpsPerMultiProcessor * properties.warpSize;

    // Guard the percentage: with maxWarpsPerSm == 0 the quotient would be NaN and the
    // cast of NaN to int is unspecified, so report 0 % occupancy instead.
    result.OccupancyOfEachMultiProcessor = maxWarpsPerSm != 0
        ? (int)Math.Round(result.ActiveWarpsPerMultiProcessor / (double)maxWarpsPerSm * 100)
        : 0;

    return result;
}
///////////////////////////////////
//      API Implementations      //
///////////////////////////////////

/// <summary>
/// Determine the maximum number of CTAs that can be run simultaneously per SM.<para/>
/// This is equivalent to the calculation done in the CUDA Occupancy Calculator
/// spreadsheet
/// </summary>
/// <param name="result">Receives the outcome; must not be null. This method sets LimitingFactors,
/// BlockLimitRegs, BlockLimitSharedMem, BlockLimitWarps, BlockLimitBlocks, partitionedGCConfig
/// and ActiveBlocksPerMultiProcessor. It is also handed to the register and shared memory
/// limit helpers, which may fill in further fields.</param>
/// <param name="properties">Device properties; must not be null.</param>
/// <param name="attributes">Function attributes; must not be null.</param>
/// <param name="state">Device state; forwarded to cudaOccInputCheck and the shared memory limit helper.</param>
/// <param name="blockSize">Block size forwarded to the limit helpers; must be greater than zero.</param>
/// <param name="dynamicSmemSize">Dynamic shared memory size forwarded to the shared memory limit helper.</param>
/// <exception cref="CudaOccupancyException">Thrown with ErrorInvalidInput when
/// <paramref name="result"/>, <paramref name="properties"/> or <paramref name="attributes"/>
/// is null, or <paramref name="blockSize"/> is not positive. Further validation is delegated
/// to cudaOccInputCheck.</exception>
public static void cudaOccMaxActiveBlocksPerMultiprocessor(
    cudaOccResult result,
    cudaOccDeviceProp properties,
    cudaOccFuncAttributes attributes,
    cudaOccDeviceState state,
    int blockSize,
    SizeT dynamicSmemSize)
{
    ///////////////////////////
    // Check user input
    ///////////////////////////

    // Reject unusable arguments up front, in the same way as the overload that returns
    // a cudaOccResult, instead of failing later inside a helper.
    if (result == null || properties == null || attributes == null || blockSize <= 0)
    {
        throw new CudaOccupancyException(cudaOccError.ErrorInvalidInput);
    }

    cudaOccInputCheck(properties, attributes, state);

    ///////////////////////////
    // Initialization
    ///////////////////////////

    cudaOccPartitionedGCConfig gcConfig = cudaOccPartitionedGCExpected(properties, attributes);

    ///////////////////////////
    // Compute occupancy
    ///////////////////////////

    // Limits due to registers/SM
    // Also compute if partitioned global caching has to be turned off
    // (gcConfig is passed by ref, so this call must precede the warp limit below).
    int ctaLimitRegs = cudaOccMaxBlocksPerSMRegsLimit(ref gcConfig, result, properties, attributes, blockSize);

    // Limits due to warps/SM
    int ctaLimitWarps = cudaOccMaxBlocksPerSMWarpsLimit(gcConfig, properties, attributes, blockSize);

    // Limits due to blocks/SM
    int ctaLimitBlocks = cudaOccMaxBlocksPerMultiprocessor(properties);

    // Limits due to shared memory/SM
    int ctaLimitSMem = cudaOccMaxBlocksPerSMSmemLimit(result, properties, attributes, state, blockSize, dynamicSmemSize);

    ///////////////////////////
    // Overall occupancy
    ///////////////////////////

    // Overall limit is min() of limits due to above reasons
    int ctaLimit = __occMin(ctaLimitRegs, __occMin(ctaLimitSMem, __occMin(ctaLimitWarps, ctaLimitBlocks)));

    // Determine occupancy limiting factors: every resource whose limit equals the
    // overall limit is reported.
    cudaOccLimitingFactors limitingFactors = 0;
    if (ctaLimit == ctaLimitWarps)
    {
        limitingFactors |= cudaOccLimitingFactors.Warps;
    }
    if (ctaLimit == ctaLimitRegs)
    {
        limitingFactors |= cudaOccLimitingFactors.Registers;
    }
    if (ctaLimit == ctaLimitSMem)
    {
        limitingFactors |= cudaOccLimitingFactors.SharedMemory;
    }
    if (ctaLimit == ctaLimitBlocks)
    {
        limitingFactors |= cudaOccLimitingFactors.Blocks;
    }

    // Fill in the return values
    result.LimitingFactors = limitingFactors;

    result.BlockLimitRegs = ctaLimitRegs;
    result.BlockLimitSharedMem = ctaLimitSMem;
    result.BlockLimitWarps = ctaLimitWarps;
    result.BlockLimitBlocks = ctaLimitBlocks;
    result.partitionedGCConfig = gcConfig;

    // Final occupancy
    result.ActiveBlocksPerMultiProcessor = ctaLimit;
}