// Shared memory limit
//
private static int cudaOccMaxBlocksPerSMSmemLimit(
    cudaOccResult result,
    cudaOccDeviceProp properties,
    cudaOccFuncAttributes attributes,
    cudaOccDeviceState state,
    int blockSize,
    SizeT dynamicSmemSize)
{
    int granularity = cudaOccSMemAllocationGranularity(properties);

    // Obtain the user preferred shared memory size. This setting is ignored if
    // the kernel requests more shared memory than preferred.
    //
    SizeT preferredSmemPerSM = cudaOccSMemPerMultiprocessor(properties, state.cacheConfig);

    // Static + dynamic usage, rounded up to the allocation granularity.
    SizeT requestedSmem = attributes.sharedSizeBytes + dynamicSmemSize;
    SizeT allocatedSmemPerCTA = __occRoundUp((int)requestedSmem, (int)granularity);

    int blockLimit;
    if (allocatedSmemPerCTA > properties.sharedMemPerBlock)
    {
        // A single CTA already exceeds the per-block shared memory cap.
        blockLimit = 0;
    }
    else
    {
        // The user requested limit is honored as long as it is greater than
        // the shared memory used per CTA, i.e. as long as at least one CTA
        // can still be launched; otherwise the hardware maximum is used.
        //
        SizeT smemPerSM = (preferredSmemPerSM >= allocatedSmemPerCTA)
            ? preferredSmemPerSM
            : properties.sharedMemPerMultiprocessor;

        blockLimit = (allocatedSmemPerCTA > 0)
            ? (int)(smemPerSM / allocatedSmemPerCTA)
            : int.MaxValue;
    }

    result.AllocatedSharedMemPerBlock = allocatedSmemPerCTA;
    return blockLimit;
}
/// <summary>
/// Determine the potential block size that allows maximum number of CTAs that can run on multiprocessor simultaneously
/// </summary>
/// <param name="properties">device properties</param>
/// <param name="attributes">kernel function attributes</param>
/// <param name="state">device state (cache configuration)</param>
/// <param name="blockSizeToSMem">
/// A function to convert from block size to dynamic shared memory size.<para/>
/// e.g.:
/// If no dynamic shared memory is used: x => 0<para/>
/// If 4 bytes shared memory per thread is used: x = 4 * x</param>
/// <returns>maxBlockSize</returns>
public static int cudaOccMaxPotentialOccupancyBlockSize(
    cudaOccDeviceProp properties,
    cudaOccFuncAttributes attributes,
    cudaOccDeviceState state,
    del_blockSizeToDynamicSMemSize blockSizeToSMem)
{
    // Occupancy is measured in threads and can never exceed the SM thread limit.
    int maxOccupancy = properties.maxThreadsPerMultiProcessor;
    int blockSizeLimit = min_(properties.maxThreadsPerBlock, attributes.maxThreadsPerBlock);
    int granularity = properties.warpSize;

    int maxBlockSize = 0;
    int highestOccupancy = 0;

    // Iterate warp-aligned candidates from the top down. Rounding the limit up
    // and clamping each candidate with min_() ensures that warp multiples are
    // evaluated even when blockSizeLimit is not itself a multiple of the warp
    // size (previously the loop only tried blockSizeLimit - k * warpSize and
    // could therefore skip every aligned block size). This mirrors the
    // ref-parameter overload of this method.
    int blockSizeLimitAligned = __occRoundUp(blockSizeLimit, granularity);

    for (int blockSizeAligned = blockSizeLimitAligned; blockSizeAligned > 0; blockSizeAligned -= granularity)
    {
        int blockSize = min_(blockSizeLimit, blockSizeAligned);

        cudaOccResult res = cudaOccMaxActiveBlocksPerMultiprocessor(properties, attributes, blockSize,
            blockSizeToSMem(blockSize), state);

        // Occupancy in threads = block size * resident blocks per SM.
        int occupancy = blockSize * res.ActiveBlocksPerMultiProcessor;

        if (occupancy > highestOccupancy)
        {
            maxBlockSize = blockSize;
            highestOccupancy = occupancy;
        }

        // Can not get higher occupancy than the hardware thread limit.
        if (highestOccupancy == maxOccupancy)
        {
            break;
        }
    }

    return maxBlockSize;
}
/// <summary>
/// Determine the maximum number of CTAs that can be run simultaneously per SM.<para/>
/// This is equivalent to the calculation done in the CUDA Occupancy Calculator
/// spreadsheet
/// </summary>
/// <param name="properties">device properties</param>
/// <param name="attributes">kernel function attributes</param>
/// <param name="blockSize">threads per block</param>
/// <param name="dynamic_smem_bytes">dynamic shared memory per block in bytes</param>
/// <param name="state">device state (cache configuration)</param>
/// <returns>occupancy result for the given block size</returns>
public static cudaOccResult cudaOccMaxActiveBlocksPerMultiprocessor(
    cudaOccDeviceProp properties,
    cudaOccFuncAttributes attributes,
    int blockSize,
    SizeT dynamic_smem_bytes,
    cudaOccDeviceState state)
{
    if (properties == null || attributes == null || blockSize <= 0)
    {
        throw new CudaOccupancyException(cudaOccError.ErrorInvalidInput);
    }

    cudaOccResult occOut = new cudaOccResult();

    //////////////////////////////////////////
    // Limits due to warps/SM or blocks/SM
    //////////////////////////////////////////
    CudaOccupancyException.CheckZero(properties.warpSize);
    int warpsPerSmLimit = properties.maxThreadsPerMultiProcessor / properties.warpSize;

    int warpGranularity = cudaOccWarpAllocationMultiple(properties);
    CudaOccupancyException.CheckZero(warpGranularity);
    int warpsPerBlock = round_i(divide_ri(blockSize, properties.warpSize), warpGranularity);

    int blocksPerSmLimit = cudaOccMaxBlocksPerMultiprocessor(properties);

    CudaOccupancyException.CheckZero(warpsPerBlock);
    int limitByWarps = (blockSize <= properties.maxThreadsPerBlock)
        ? warpsPerSmLimit / warpsPerBlock
        : 0;
    int limitByBlocks = blocksPerSmLimit;

    //////////////////////////////////////////
    // Limits due to shared memory/SM
    //////////////////////////////////////////
    int smemUnit = cudaOccSMemAllocationUnit(properties);
    int smemRaw = (int)(attributes.sharedSizeBytes + dynamic_smem_bytes);
    CudaOccupancyException.CheckZero(smemUnit);
    int smemPerBlock = round_i(smemRaw, smemUnit);

    // The hardware default is used unless the cache-config preference already
    // covers the shared memory used by the function.
    int smemByCacheConfig = cudaOccSMemPerMultiprocessor(properties, state.cacheConfig);
    int smemPerSm = (smemByCacheConfig >= smemPerBlock)
        ? smemByCacheConfig
        : (int)properties.sharedMemPerMultiprocessor;

    // Blocks launched are limited by shared memory per SM, but the total used
    // by the function must fit within the per-block shared memory limit.
    int limitBySmem = 0;
    if (properties.sharedMemPerBlock >= (SizeT)smemPerBlock)
    {
        limitBySmem = (smemPerBlock > 0) ? smemPerSm / smemPerBlock : blocksPerSmLimit;
    }

    //////////////////////////////////////////
    // Limits due to registers/SM
    //////////////////////////////////////////
    int regUnit = cudaOccRegAllocationUnit(properties, attributes.numRegs);
    CudaOccupancyException.CheckZero(regUnit);

    int limitByRegs = 0;
    int regsPerBlock = 0;
    if (properties.major <= 1)
    {
        // Compute capability 1.x allocates registers to CTAs: regs per thread
        // times warps per CTA times warp size, rounded up to the allocation unit.
        regsPerBlock = round_i(attributes.numRegs * properties.warpSize * warpsPerBlock, regUnit);
        limitByRegs = (regsPerBlock > 0)
            ? properties.regsPerMultiprocessor / regsPerBlock
            : blocksPerSmLimit;
    }
    else
    {
        // Compute capability 2.x+ allocates registers to warps: regs per thread
        // times warp size, rounded up to the allocation unit.
        int regsPerWarp = round_i(attributes.numRegs * properties.warpSize, regUnit);
        regsPerBlock = regsPerWarp * warpsPerBlock;
        if (properties.regsPerBlock >= regsPerBlock)
        {
            int sides = cudaOccSidesPerMultiprocessor(properties);
            CudaOccupancyException.CheckZero(sides);
            int regsPerSide = properties.regsPerMultiprocessor / sides;
            limitByRegs = (regsPerWarp > 0)
                ? ((regsPerSide / regsPerWarp) * sides) / warpsPerBlock
                : blocksPerSmLimit;
        }
    }

    //////////////////////////////////////////
    // Overall limit is min() of limits due to above reasons
    //////////////////////////////////////////
    int ctaLimit = min_(limitByRegs, min_(limitBySmem, min_(limitByWarps, limitByBlocks)));
    occOut.ActiveBlocksPerMultiProcessor = ctaLimit;

    // Determine occupancy limiting factors
    cudaOccLimitingFactors factors = 0;
    if (ctaLimit == limitByWarps)
    {
        factors |= cudaOccLimitingFactors.Warps;
    }
    if (ctaLimit == limitByRegs && regsPerBlock > 0)
    {
        factors |= cudaOccLimitingFactors.Registers;
    }
    if (ctaLimit == limitBySmem && smemPerBlock > 0)
    {
        factors |= cudaOccLimitingFactors.SharedMemory;
    }
    if (ctaLimit == limitByBlocks)
    {
        factors |= cudaOccLimitingFactors.Blocks;
    }

    occOut.LimitingFactors = factors;
    occOut.BlockLimitRegs = limitByRegs;
    occOut.BlockLimitSharedMem = limitBySmem;
    occOut.BlockLimitWarps = limitByWarps;
    occOut.BlockLimitBlocks = limitByBlocks;
    occOut.BllocatedRegistersPerBlock = regsPerBlock;
    occOut.AllocatedSharedMemPerBlock = smemPerBlock;
    occOut.ActiveWarpsPerMultiProcessor = ctaLimit * ((int)Math.Ceiling(blockSize / (double)properties.warpSize));
    occOut.ActiceThreadsPerMultiProcessor = occOut.ActiveWarpsPerMultiProcessor * properties.warpSize;
    occOut.OccupancyOfEachMultiProcessor = (int)Math.Round(occOut.ActiveWarpsPerMultiProcessor / (double)warpsPerSmLimit * 100);
    return occOut;
}
/// <summary>
/// Determine the maximum number of CTAs that can be run simultaneously per SM.<para/>
/// This is equivalent to the calculation done in the CUDA Occupancy Calculator
/// spreadsheet
/// </summary>
/// <param name="properties">device properties</param>
/// <param name="attributes">kernel function attributes</param>
/// <param name="blockSize">threads per block</param>
/// <param name="dynamic_smem_bytes">dynamic shared memory per block in bytes</param>
/// <param name="state">device state (cache configuration)</param>
/// <returns>occupancy result for the given block size</returns>
public static cudaOccResult cudaOccMaxActiveBlocksPerMultiprocessor(
    cudaOccDeviceProp properties,
    cudaOccFuncAttributes attributes,
    int blockSize,
    SizeT dynamic_smem_bytes,
    cudaOccDeviceState state)
{
    if (properties == null || attributes == null || blockSize <= 0)
    {
        throw new CudaOccupancyException(cudaOccError.ErrorInvalidInput);
    }

    cudaOccResult outcome = new cudaOccResult();

    // --- Limits due to warps/SM and blocks/SM ---
    CudaOccupancyException.CheckZero(properties.warpSize);
    int smWarpCapacity = properties.maxThreadsPerMultiProcessor / properties.warpSize;

    int warpMultiple = cudaOccWarpAllocationMultiple(properties);
    CudaOccupancyException.CheckZero(warpMultiple);
    int ctaWarps = round_i(divide_ri(blockSize, properties.warpSize), warpMultiple);

    int smBlockCapacity = cudaOccMaxBlocksPerMultiprocessor(properties);

    CudaOccupancyException.CheckZero(ctaWarps);
    int wLimit = (blockSize <= properties.maxThreadsPerBlock) ? smWarpCapacity / ctaWarps : 0;
    int bLimit = smBlockCapacity;

    // --- Limits due to shared memory/SM ---
    int smemAllocUnit = cudaOccSMemAllocationUnit(properties);
    int smemUsed = (int)(attributes.sharedSizeBytes + dynamic_smem_bytes);
    CudaOccupancyException.CheckZero(smemAllocUnit);
    int ctaSmem = round_i(smemUsed, smemAllocUnit);

    // Default hardware limit applies unless the cache-config preference is
    // already large enough for the function's shared memory usage.
    int cacheSmem = cudaOccSMemPerMultiprocessor(properties, state.cacheConfig);
    int smSmem = (cacheSmem >= ctaSmem) ? cacheSmem : (int)properties.sharedMemPerMultiprocessor;

    // Block launch count uses per-SM shared memory, but the function's total
    // usage must respect the per-block shared memory limit.
    int sLimit = 0;
    if (properties.sharedMemPerBlock >= (SizeT)ctaSmem)
    {
        sLimit = (ctaSmem > 0) ? smSmem / ctaSmem : smBlockCapacity;
    }

    // --- Limits due to registers/SM ---
    int regAllocUnit = cudaOccRegAllocationUnit(properties, attributes.numRegs);
    CudaOccupancyException.CheckZero(regAllocUnit);

    int rLimit = 0;
    int ctaRegs = 0;
    if (properties.major <= 1)
    {
        // Compute capability 1.x: registers are allocated per CTA.
        ctaRegs = round_i(attributes.numRegs * properties.warpSize * ctaWarps, regAllocUnit);
        rLimit = (ctaRegs > 0) ? properties.regsPerMultiprocessor / ctaRegs : smBlockCapacity;
    }
    else
    {
        // Compute capability 2.x and higher: registers are allocated per warp.
        int warpRegs = round_i(attributes.numRegs * properties.warpSize, regAllocUnit);
        ctaRegs = warpRegs * ctaWarps;
        if (properties.regsPerBlock >= ctaRegs)
        {
            int sideCount = cudaOccSidesPerMultiprocessor(properties);
            CudaOccupancyException.CheckZero(sideCount);
            int sideRegs = properties.regsPerMultiprocessor / sideCount;
            rLimit = (warpRegs > 0) ? ((sideRegs / warpRegs) * sideCount) / ctaWarps : smBlockCapacity;
        }
    }

    // --- Overall limit is the minimum of the individual limits ---
    int ctaLimit = min_(rLimit, min_(sLimit, min_(wLimit, bLimit)));
    outcome.ActiveBlocksPerMultiProcessor = ctaLimit;

    // Determine which resources are the limiting factors.
    cudaOccLimitingFactors reasons = 0;
    if (ctaLimit == wLimit) { reasons |= cudaOccLimitingFactors.Warps; }
    if (ctaLimit == rLimit && ctaRegs > 0) { reasons |= cudaOccLimitingFactors.Registers; }
    if (ctaLimit == sLimit && ctaSmem > 0) { reasons |= cudaOccLimitingFactors.SharedMemory; }
    if (ctaLimit == bLimit) { reasons |= cudaOccLimitingFactors.Blocks; }

    outcome.LimitingFactors = reasons;
    outcome.BlockLimitRegs = rLimit;
    outcome.BlockLimitSharedMem = sLimit;
    outcome.BlockLimitWarps = wLimit;
    outcome.BlockLimitBlocks = bLimit;
    outcome.BllocatedRegistersPerBlock = ctaRegs;
    outcome.AllocatedSharedMemPerBlock = ctaSmem;
    outcome.ActiveWarpsPerMultiProcessor = ctaLimit * ((int)Math.Ceiling(blockSize / (double)properties.warpSize));
    outcome.ActiceThreadsPerMultiProcessor = outcome.ActiveWarpsPerMultiProcessor * properties.warpSize;
    outcome.OccupancyOfEachMultiProcessor = (int)Math.Round(outcome.ActiveWarpsPerMultiProcessor / (double)smWarpCapacity * 100);
    return outcome;
}
/// <summary>
/// Determine the block size that yields the maximum occupancy, and the minimum
/// grid size needed to achieve a full machine launch with that block size.
/// </summary>
/// <param name="minGridSize">out: suggested minimum grid size for a full machine launch</param>
/// <param name="blockSize">out: block size with maximum occupancy</param>
/// <param name="properties">device properties</param>
/// <param name="attributes">kernel function attributes</param>
/// <param name="state">device state (cache configuration)</param>
/// <param name="blockSizeToDynamicSMemSize">optional mapping from block size to dynamic shared memory; overrides dynamicSMemSize when non-null</param>
/// <param name="dynamicSMemSize">fixed dynamic shared memory size, used when no mapping is supplied</param>
public static void cudaOccMaxPotentialOccupancyBlockSize(
    ref int minGridSize,
    ref int blockSize,
    cudaOccDeviceProp properties,
    cudaOccFuncAttributes attributes,
    cudaOccDeviceState state,
    del_blockSizeToDynamicSMemSize blockSizeToDynamicSMemSize,
    SizeT dynamicSMemSize)
{
    ///////////////////////////
    // Check user input
    ///////////////////////////
    cudaOccInputCheck(properties, attributes, state);

    cudaOccResult scratch = new cudaOccResult();

    // Upper bound on occupancy, measured in threads.
    int threadLimitPerSM = properties.maxThreadsPerMultiProcessor;
    int warpSize = properties.warpSize;
    int sizeLimit = __occMin(properties.maxThreadsPerBlock, attributes.maxThreadsPerBlock);
    int sizeLimitAligned = __occRoundUp(sizeLimit, warpSize);

    int bestBlockSize = 0;
    int bestBlocks = 0;
    int bestThreads = 0;

    // Walk warp-aligned candidates from largest to smallest and keep the one
    // with the highest occupancy.
    for (int candidateAligned = sizeLimitAligned; candidateAligned > 0; candidateAligned -= warpSize)
    {
        int candidate = __occMin(sizeLimit, candidateAligned);

        // Ignore dynamicSMemSize if the user provided a mapping.
        if (blockSizeToDynamicSMemSize != null)
        {
            dynamicSMemSize = blockSizeToDynamicSMemSize(candidate);
        }

        cudaOccMaxActiveBlocksPerMultiprocessor(scratch, properties, attributes, state, candidate, dynamicSMemSize);

        int blocks = scratch.ActiveBlocksPerMultiProcessor;
        int threads = candidate * blocks;

        if (threads > bestThreads)
        {
            bestBlockSize = candidate;
            bestBlocks = blocks;
            bestThreads = threads;
        }

        // Early out once the hardware thread limit is reached.
        if (threadLimitPerSM == bestThreads)
        {
            break;
        }
    }

    ///////////////////////////
    // Return best available
    ///////////////////////////
    // Suggested min grid size to achieve a full machine launch.
    minGridSize = bestBlocks * properties.numSms;
    blockSize = bestBlockSize;
}
///////////////////////////////////
//    API Implementations        //
///////////////////////////////////

/// <summary>
/// Determine the maximum number of CTAs that can be run simultaneously per SM.<para/>
/// This is equivalent to the calculation done in the CUDA Occupancy Calculator
/// spreadsheet
/// </summary>
/// <param name="result">receives per-limit details and the final occupancy</param>
/// <param name="properties">device properties</param>
/// <param name="attributes">kernel function attributes</param>
/// <param name="state">device state (cache configuration)</param>
/// <param name="blockSize">threads per block</param>
/// <param name="dynamicSmemSize">dynamic shared memory per block in bytes</param>
public static void cudaOccMaxActiveBlocksPerMultiprocessor(
    cudaOccResult result,
    cudaOccDeviceProp properties,
    cudaOccFuncAttributes attributes,
    cudaOccDeviceState state,
    int blockSize,
    SizeT dynamicSmemSize)
{
    ///////////////////////////
    // Check user input
    ///////////////////////////
    cudaOccInputCheck(properties, attributes, state);

    ///////////////////////////
    // Initialization
    ///////////////////////////
    // Expected partitioned global caching mode; the register-limit
    // computation may turn it off (it takes gcConfig by ref).
    cudaOccPartitionedGCConfig gcConfig = cudaOccPartitionedGCExpected(properties, attributes);

    ///////////////////////////
    // Compute occupancy
    ///////////////////////////
    // Register limit first, since the warp limit depends on the possibly
    // updated gcConfig.
    int limitRegs = cudaOccMaxBlocksPerSMRegsLimit(ref gcConfig, result, properties, attributes, blockSize);
    int limitWarps = cudaOccMaxBlocksPerSMWarpsLimit(gcConfig, properties, attributes, blockSize);
    int limitBlocks = cudaOccMaxBlocksPerMultiprocessor(properties);
    int limitSmem = cudaOccMaxBlocksPerSMSmemLimit(result, properties, attributes, state, blockSize, dynamicSmemSize);

    ///////////////////////////
    // Overall occupancy
    ///////////////////////////
    // Overall limit is min() of limits due to above reasons.
    int ctaLimit = __occMin(limitRegs, __occMin(limitSmem, __occMin(limitWarps, limitBlocks)));

    // Determine occupancy limiting factors.
    cudaOccLimitingFactors factors = 0;
    if (ctaLimit == limitWarps)
    {
        factors |= cudaOccLimitingFactors.Warps;
    }
    if (ctaLimit == limitRegs)
    {
        factors |= cudaOccLimitingFactors.Registers;
    }
    if (ctaLimit == limitSmem)
    {
        factors |= cudaOccLimitingFactors.SharedMemory;
    }
    if (ctaLimit == limitBlocks)
    {
        factors |= cudaOccLimitingFactors.Blocks;
    }

    // Fill in the return values.
    result.LimitingFactors = factors;
    result.BlockLimitRegs = limitRegs;
    result.BlockLimitSharedMem = limitSmem;
    result.BlockLimitWarps = limitWarps;
    result.BlockLimitBlocks = limitBlocks;
    result.partitionedGCConfig = gcConfig;

    // Final occupancy.
    result.ActiveBlocksPerMultiProcessor = ctaLimit;
}
private static int cudaOccMaxBlocksPerSMRegsLimit(
    ref cudaOccPartitionedGCConfig gcConfig,
    cudaOccResult result,
    cudaOccDeviceProp properties,
    cudaOccFuncAttributes attributes,
    int blockSize)
{
    // Fermi requires special handling of certain register usage.
    int granularity = cudaOccRegAllocationGranularity(properties, attributes.numRegs);
    int subPartitions = cudaOccSubPartitionsPerMultiprocessor(properties);
    int warpsPerCTA = __occDivideRoundUp(blockSize, properties.warpSize);

    // GPUs of compute capability 2.x and higher allocate registers to warps:
    // regs per thread x warp size, rounded up to the allocation granularity.
    int rawRegsPerWarp = attributes.numRegs * properties.warpSize;
    int regsPerWarp = __occRoundUp(rawRegsPerWarp, granularity);
    int regsPerCTA = regsPerWarp * warpsPerCTA;

    // Hardware verifies whether a launch fits the per-CTA register limit. For
    // historical reasons that check assumes registers are allocated to all
    // sub-partitions simultaneously, so round the warp count up accordingly.
    int regsAssumedPerCTA = regsPerWarp * __occRoundUp(warpsPerCTA, subPartitions);

    int maxBlocks;
    if (properties.regsPerBlock < regsAssumedPerCTA ||   // Hardware check
        properties.regsPerBlock < regsPerCTA)            // Software check
    {
        maxBlocks = 0;
    }
    else if (regsPerWarp > 0)
    {
        // Registers are allocated in each sub-partition: the max number of
        // warps per SM equals warps per sub-partition x sub-partition count.
        int regsPerSubPartition = properties.regsPerMultiprocessor / subPartitions;
        int warpsPerSubPartition = regsPerSubPartition / regsPerWarp;

        maxBlocks = 0;

        if (gcConfig != cudaOccPartitionedGCConfig.Off)
        {
            // With partitioned global caching on, a CTA can only use half an
            // SM, and thus only half of the registers available per SM.
            int subPartitionsPerHalf = subPartitions / 2;
            int warpsPerHalf = warpsPerSubPartition * subPartitionsPerHalf;
            maxBlocks = (warpsPerHalf / warpsPerCTA) * 2;
        }

        // Try again if partitioned global caching is not enabled, or if the
        // CTA cannot fit with caching on. In the latter case the device turns
        // caching off automatically, unless it is forced on (by the device or
        // via PARTITIONED_GC_ON_STRICT).
        bool gcOff = (gcConfig == cudaOccPartitionedGCConfig.Off);
        bool zeroOccupancy = (maxBlocks == 0);
        bool cachingForced = (gcConfig == cudaOccPartitionedGCConfig.OnStrict
                              || cudaOccPartitionedGCForced(properties));

        if (gcOff || (zeroOccupancy && !cachingForced))
        {
            gcConfig = cudaOccPartitionedGCConfig.Off;
            int warpsPerSM = warpsPerSubPartition * subPartitions;
            maxBlocks = warpsPerSM / warpsPerCTA;
        }
    }
    else
    {
        // Kernel uses no registers: register pressure does not limit blocks.
        maxBlocks = int.MaxValue;
    }

    result.AllocatedRegistersPerBlock = regsPerCTA;
    return maxBlocks;
}
// Shared memory limit
//
private static int cudaOccMaxBlocksPerSMSmemLimit(
    cudaOccResult result,
    cudaOccDeviceProp properties,
    cudaOccFuncAttributes attributes,
    cudaOccDeviceState state,
    int blockSize,
    SizeT dynamicSmemSize)
{
    int smemGranularity = cudaOccSMemAllocationGranularity(properties);

    // Obtain the user preferred shared memory size. This setting is ignored
    // if the kernel requests more shared memory than preferred.
    //
    SizeT smemPreference = cudaOccSMemPerMultiprocessor(properties, state.cacheConfig);

    SizeT smemNeeded = attributes.sharedSizeBytes + dynamicSmemSize;
    SizeT smemPerCTA = __occRoundUp((int)smemNeeded, (int)smemGranularity);

    // Record the rounded-up allocation regardless of the outcome.
    result.AllocatedSharedMemPerBlock = smemPerCTA;

    if (smemPerCTA > properties.sharedMemPerBlock)
    {
        // One CTA already exceeds the per-block shared memory limit.
        return 0;
    }

    // The user requested limit is honored as long as at least one CTA can
    // still be launched; otherwise the maximum hardware limit applies.
    //
    SizeT availablePerSM = (smemPreference >= smemPerCTA)
        ? smemPreference
        : properties.sharedMemPerMultiprocessor;

    if (smemPerCTA > 0)
    {
        return (int)(availablePerSM / smemPerCTA);
    }

    // Kernel uses no shared memory: not a limiting factor.
    return int.MaxValue;
}
private static int cudaOccMaxBlocksPerSMRegsLimit(
    ref cudaOccPartitionedGCConfig gcConfig,
    cudaOccResult result,
    cudaOccDeviceProp properties,
    cudaOccFuncAttributes attributes,
    int blockSize)
{
    // Fermi requires special handling of certain register usage.
    int allocUnit = cudaOccRegAllocationGranularity(properties, attributes.numRegs);
    int partitions = cudaOccSubPartitionsPerMultiprocessor(properties);
    int ctaWarps = __occDivideRoundUp(blockSize, properties.warpSize);

    // Compute capability 2.x+ allocates registers to warps: regs per thread
    // x warp size, rounded up to the register allocation granularity.
    int warpRegs = __occRoundUp(attributes.numRegs * properties.warpSize, allocUnit);
    int ctaRegs = warpRegs * ctaWarps;

    // The hardware launch check historically assumes allocations are made to
    // all partitions at once; simulate that by rounding the warp count up to
    // the partition count.
    int ctaRegsAssumed = warpRegs * __occRoundUp(ctaWarps, partitions);

    int blockLimit;
    bool fitsHardware = properties.regsPerBlock >= ctaRegsAssumed;  // Hardware check
    bool fitsSoftware = properties.regsPerBlock >= ctaRegs;         // Software check

    if (!fitsHardware || !fitsSoftware)
    {
        blockLimit = 0;
    }
    else
    {
        if (warpRegs > 0)
        {
            // Registers live in each sub-partition: max warps on the SM is
            // warps per sub-partition x number of sub-partitions.
            int partitionRegs = properties.regsPerMultiprocessor / partitions;
            int partitionWarps = partitionRegs / warpRegs;

            blockLimit = 0;

            if (gcConfig != cudaOccPartitionedGCConfig.Off)
            {
                // Partitioned global caching restricts a CTA to half an SM,
                // i.e. half of the SM's registers.
                int halfPartitions = partitions / 2;
                int halfWarps = partitionWarps * halfPartitions;
                int blocksPerHalf = halfWarps / ctaWarps;
                blockLimit = blocksPerHalf * 2;
            }

            // Recompute with caching off when it is disabled, or when nothing
            // fits with caching on and caching is not forced (by the device
            // or via PARTITIONED_GC_ON_STRICT).
            {
                bool cachingDisabled = (gcConfig == cudaOccPartitionedGCConfig.Off);
                bool nothingFits = (blockLimit == 0);
                bool forcedOn = (gcConfig == cudaOccPartitionedGCConfig.OnStrict
                                 || cudaOccPartitionedGCForced(properties));

                if (cachingDisabled || (nothingFits && (!forcedOn)))
                {
                    gcConfig = cudaOccPartitionedGCConfig.Off;
                    int smWarps = partitionWarps * partitions;
                    blockLimit = smWarps / ctaWarps;
                }
            }
        }
        else
        {
            // No register usage: registers do not limit the block count.
            blockLimit = int.MaxValue;
        }
    }

    result.AllocatedRegistersPerBlock = ctaRegs;
    return blockLimit;
}