/// <summary>
/// Creates a new cudaOccFuncAttributes instance from the given kernel launch attributes.
/// The values are copied verbatim into the corresponding fields; no validation is performed.
/// </summary>
/// <param name="aMaxThreadsPerBlock">Maximum threads per block for this kernel</param>
/// <param name="aNumRegs">Registers used per thread</param>
/// <param name="aSharedSizeBytes">Only the static part shared memory (without dynamic allocations)</param>
/// <param name="partitionedGC">Partitioned global caching configuration</param>
public cudaOccFuncAttributes(int aMaxThreadsPerBlock, int aNumRegs, SizeT aSharedSizeBytes, cudaOccPartitionedGCConfig partitionedGC)
{
    partitionedGCConfig = partitionedGC;
    sharedSizeBytes = aSharedSizeBytes;
    numRegs = aNumRegs;
    maxThreadsPerBlock = aMaxThreadsPerBlock;
}
// Warp limit
//
/// <summary>
/// Maximum number of CTAs per SM imposed by the warp-slot budget of the
/// multiprocessor. Returns 0 when the block size exceeds the device's
/// per-block thread limit.
/// </summary>
private static int cudaOccMaxBlocksPerSMWarpsLimit(
    cudaOccPartitionedGCConfig gcConfig,
    cudaOccDeviceProp   properties,
    cudaOccFuncAttributes attributes,
    int                 blockSize)
{
    // A block larger than the device allows cannot be launched at all.
    if (blockSize > properties.maxThreadsPerBlock)
    {
        return 0;
    }

    int warpSlotsPerSm = properties.maxThreadsPerMultiProcessor / properties.warpSize;
    int warpsPerCta    = __occDivideRoundUp(blockSize, properties.warpSize);

    if (gcConfig != cudaOccPartitionedGCConfig.Off)
    {
        // If partitioned global caching is on, then a CTA can only use a SM
        // partition (a half SM), and thus a half of the warp slots available
        // per SM; the per-partition count is doubled for the two partitions.
        int slotsPerPartition = warpSlotsPerSm / 2;
        return (slotsPerPartition / warpsPerCta) * 2;
    }

    // On hardware that supports partitioned global caching, each half SM is
    // guaranteed to support at least 32 warps (maximum number of warps of a
    // CTA), so caching will not cause 0 occupancy due to insufficient warp
    // allocation slots.
    return warpSlotsPerSm / warpsPerCta;
}
///////////////////////////////////
//    API Implementations        //
///////////////////////////////////

/// <summary>
/// Determine the maximum number of CTAs that can be run simultaneously per SM.<para/>
/// This is equivalent to the calculation done in the CUDA Occupancy Calculator
/// spreadsheet
/// </summary>
/// <param name="result">Receives the individual limits, the limiting factors and the final occupancy</param>
/// <param name="properties">Device properties used for the calculation</param>
/// <param name="attributes">Kernel function attributes</param>
/// <param name="state">Device state (e.g. cache configuration)</param>
/// <param name="blockSize">Threads per block of the intended launch</param>
/// <param name="dynamicSmemSize">Dynamically allocated shared memory per block</param>
/// <returns></returns>
public static void cudaOccMaxActiveBlocksPerMultiprocessor(
    cudaOccResult result,
    cudaOccDeviceProp properties,
    cudaOccFuncAttributes attributes,
    cudaOccDeviceState state,
    int blockSize,
    SizeT dynamicSmemSize)
{
    ///////////////////////////
    // Check user input
    ///////////////////////////
    // NOTE(review): invalid-argument handling is delegated to cudaOccInputCheck
    // (the original C null/size checks were removed in this port).
    cudaOccInputCheck(properties, attributes, state);

    ///////////////////////////
    // Initialization
    ///////////////////////////
    cudaOccPartitionedGCConfig gcConfig = cudaOccPartitionedGCExpected(properties, attributes);

    ///////////////////////////
    // Compute occupancy
    ///////////////////////////

    // The register limit must be computed first: it may also turn partitioned
    // global caching off (gcConfig is passed by ref), and the warp limit below
    // depends on the possibly updated gcConfig.
    int ctaLimitRegs = cudaOccMaxBlocksPerSMRegsLimit(ref gcConfig, result, properties, attributes, blockSize);

    // Limits due to warps/SM
    int ctaLimitWarps = cudaOccMaxBlocksPerSMWarpsLimit(gcConfig, properties, attributes, blockSize);

    // Limits due to blocks/SM
    int ctaLimitBlocks = cudaOccMaxBlocksPerMultiprocessor(properties);

    // Limits due to shared memory/SM
    int ctaLimitSMem = cudaOccMaxBlocksPerSMSmemLimit(result, properties, attributes, state, blockSize, dynamicSmemSize);

    ///////////////////////////
    // Overall occupancy
    ///////////////////////////

    // Overall limit is min() of limits due to above reasons
    int ctaLimit = __occMin(ctaLimitRegs, __occMin(ctaLimitSMem, __occMin(ctaLimitWarps, ctaLimitBlocks)));

    // Every resource whose individual limit equals the overall limit is a
    // limiting factor (several may apply simultaneously).
    cudaOccLimitingFactors limitingFactors = 0;
    if (ctaLimit == ctaLimitWarps)
    {
        limitingFactors |= cudaOccLimitingFactors.Warps;
    }
    if (ctaLimit == ctaLimitRegs)
    {
        limitingFactors |= cudaOccLimitingFactors.Registers;
    }
    if (ctaLimit == ctaLimitSMem)
    {
        limitingFactors |= cudaOccLimitingFactors.SharedMemory;
    }
    if (ctaLimit == ctaLimitBlocks)
    {
        limitingFactors |= cudaOccLimitingFactors.Blocks;
    }

    // Fill in the return values
    result.LimitingFactors = limitingFactors;
    result.BlockLimitRegs = ctaLimitRegs;
    result.BlockLimitSharedMem = ctaLimitSMem;
    result.BlockLimitWarps = ctaLimitWarps;
    result.BlockLimitBlocks = ctaLimitBlocks;
    result.partitionedGCConfig = gcConfig;

    // Final occupancy
    result.ActiveBlocksPerMultiProcessor = ctaLimit;
}
/// <summary>
/// Register limit: maximum number of CTAs per SM given the kernel's register
/// usage. May switch <paramref name="gcConfig"/> to Off when the CTA cannot
/// fit with partitioned global caching enabled (and caching is not forced).
/// Also stores the per-CTA register allocation into <paramref name="result"/>.
/// </summary>
private static int cudaOccMaxBlocksPerSMRegsLimit(
    ref cudaOccPartitionedGCConfig  gcConfig,
    cudaOccResult         result,
    cudaOccDeviceProp     properties,
    cudaOccFuncAttributes attributes,
    int                   blockSize)
{
    // Fermi requires special handling of certain register usage
    int granularity   = cudaOccRegAllocationGranularity(properties, attributes.numRegs);
    int subPartitions = cudaOccSubPartitionsPerMultiprocessor(properties);
    int ctaWarps      = __occDivideRoundUp(blockSize, properties.warpSize);

    // GPUs of compute capability 2.x and higher allocate registers to warps.
    // Number of regs per warp is regs per thread x warp size, rounded up to
    // register allocation granularity.
    int warpRegs = __occRoundUp(attributes.numRegs * properties.warpSize, granularity);
    int ctaRegs  = warpRegs * ctaWarps;

    // Hardware verifies if a launch fits the per-CTA register limit. For
    // historical reasons, the verification logic assumes register allocations
    // are made to all partitions simultaneously. Therefore, to simulate the
    // hardware check, the warp allocation needs to be rounded up to the
    // number of partitions.
    int ctaRegsAssumed = warpRegs * __occRoundUp(ctaWarps, subPartitions);

    int maxBlocks;
    if (properties.regsPerBlock < ctaRegsAssumed ||   // Hardware check
        properties.regsPerBlock < ctaRegs)            // Software check
    {
        maxBlocks = 0;
    }
    else if (warpRegs > 0)
    {
        // Registers are allocated in each sub-partition. The max number of
        // warps that can fit on an SM is equal to the max number of warps
        // per sub-partition x number of sub-partitions.
        int subPartitionRegs  = properties.regsPerMultiprocessor / subPartitions;
        int subPartitionWarps = subPartitionRegs / warpRegs;

        maxBlocks = 0;
        if (gcConfig != cudaOccPartitionedGCConfig.Off)
        {
            // If partitioned global caching is on, then a CTA can only use a
            // half SM, and thus a half of the registers available per SM.
            int halfSmWarps = subPartitionWarps * (subPartitions / 2);
            maxBlocks = (halfSmWarps / ctaWarps) * 2;
        }

        // Try again if partitioned global caching is not enabled, or if the
        // CTA cannot fit on the SM with caching on. In the latter case, the
        // device will automatically turn off caching, except if the device
        // forces it. The user can also override this assumption with
        // PARTITIONED_GC_ON_STRICT to calculate occupancy and launch
        // configuration.
        bool gcOff         = (gcConfig == cudaOccPartitionedGCConfig.Off);
        bool zeroOccupancy = (maxBlocks == 0);
        bool cachingForced = (gcConfig == cudaOccPartitionedGCConfig.OnStrict ||
                              cudaOccPartitionedGCForced(properties));

        if (gcOff || (zeroOccupancy && (!cachingForced)))
        {
            gcConfig = cudaOccPartitionedGCConfig.Off;
            maxBlocks = (subPartitionWarps * subPartitions) / ctaWarps;
        }
    }
    else
    {
        // A kernel that uses no registers is never register limited.
        maxBlocks = int.MaxValue;
    }

    result.AllocatedRegistersPerBlock = ctaRegs;
    return maxBlocks;
}
// Warp limit
//
/// <summary>
/// Computes the CTA-per-SM limit imposed by the available warp slots;
/// 0 when blockSize exceeds the device's per-block thread limit.
/// </summary>
private static int cudaOccMaxBlocksPerSMWarpsLimit(
    cudaOccPartitionedGCConfig gcConfig,
    cudaOccDeviceProp properties,
    cudaOccFuncAttributes attributes,
    int blockSize)
{
    int maxBlocks;

    if (blockSize > properties.maxThreadsPerBlock)
    {
        // The block cannot be launched on this device at all.
        maxBlocks = 0;
    }
    else
    {
        int smWarpSlots = properties.maxThreadsPerMultiProcessor / properties.warpSize;
        int ctaWarps    = __occDivideRoundUp(blockSize, properties.warpSize);

        if (gcConfig == cudaOccPartitionedGCConfig.Off)
        {
            // On hardware that supports partitioned global caching, each half
            // SM is guaranteed to support at least 32 warps (maximum number of
            // warps of a CTA), so caching will not cause 0 occupancy due to
            // insufficient warp allocation slots.
            maxBlocks = smWarpSlots / ctaWarps;
        }
        else
        {
            // If partitioned global caching is on, then a CTA can only use a
            // SM partition (a half SM), and thus a half of the warp slots
            // available per SM.
            maxBlocks = ((smWarpSlots / 2) / ctaWarps) * 2;
        }
    }

    return maxBlocks;
}
/// <summary>
/// Determines the per-SM CTA limit caused by register usage. As a side effect
/// it may disable partitioned global caching via <paramref name="gcConfig"/>
/// (passed by ref) and records the per-block register allocation in
/// <paramref name="result"/>.
/// </summary>
private static int cudaOccMaxBlocksPerSMRegsLimit(
    ref cudaOccPartitionedGCConfig  gcConfig,
    cudaOccResult         result,
    cudaOccDeviceProp     properties,
    cudaOccFuncAttributes attributes,
    int                   blockSize)
{
    // Fermi requires special handling of certain register usage
    int allocGranularity     = cudaOccRegAllocationGranularity(properties, attributes.numRegs);
    int smSubPartitions      = cudaOccSubPartitionsPerMultiprocessor(properties);
    int warpsNeededPerCta    = __occDivideRoundUp(blockSize, properties.warpSize);

    // GPUs of compute capability 2.x and higher allocate registers to warps.
    // Number of regs per warp is regs per thread x warp size, rounded up to
    // register allocation granularity.
    int regsNeededPerWarp    = attributes.numRegs * properties.warpSize;
    int regsGrantedPerWarp   = __occRoundUp(regsNeededPerWarp, allocGranularity);
    int regsGrantedPerCta    = regsGrantedPerWarp * warpsNeededPerCta;

    // Hardware verifies if a launch fits the per-CTA register limit. For
    // historical reasons, the verification logic assumes register allocations
    // are made to all partitions simultaneously; simulate that by rounding
    // the warp allocation up to the number of partitions.
    int regsAssumedPerCta    = regsGrantedPerWarp * __occRoundUp(warpsNeededPerCta, smSubPartitions);

    bool ctaTooLarge = properties.regsPerBlock < regsAssumedPerCta   // Hardware check
                    || properties.regsPerBlock < regsGrantedPerCta;  // Software check

    int blockLimit;
    if (ctaTooLarge)
    {
        blockLimit = 0;
    }
    else if (regsGrantedPerWarp == 0)
    {
        // No registers used: the register file never limits occupancy.
        blockLimit = int.MaxValue;
    }
    else
    {
        // Registers are allocated in each sub-partition. The max number of
        // warps that can fit on an SM is equal to the max number of warps
        // per sub-partition x number of sub-partitions.
        int regsPerSubPartition  = properties.regsPerMultiprocessor / smSubPartitions;
        int warpsPerSubPartition = regsPerSubPartition / regsGrantedPerWarp;

        blockLimit = 0;
        if (gcConfig != cudaOccPartitionedGCConfig.Off)
        {
            // If partitioned global caching is on, then a CTA can only use a
            // half SM, and thus a half of the registers available per SM.
            int subPartitionsPerHalfSm = smSubPartitions / 2;
            int warpsPerHalfSm         = warpsPerSubPartition * subPartitionsPerHalfSm;
            blockLimit = (warpsPerHalfSm / warpsNeededPerCta) * 2;
        }

        // Try again if partitioned global caching is not enabled, or if the
        // CTA cannot fit on the SM with caching on. In the latter case, the
        // device will automatically turn off caching, except if the device
        // forces it. The user can also override this assumption with
        // PARTITIONED_GC_ON_STRICT to calculate occupancy and launch
        // configuration.
        bool gcOff         = (gcConfig == cudaOccPartitionedGCConfig.Off);
        bool zeroOccupancy = (blockLimit == 0);
        bool cachingForced = (gcConfig == cudaOccPartitionedGCConfig.OnStrict ||
                              cudaOccPartitionedGCForced(properties));

        if (gcOff || (zeroOccupancy && !cachingForced))
        {
            gcConfig = cudaOccPartitionedGCConfig.Off;
            int warpsPerSm = warpsPerSubPartition * smSubPartitions;
            blockLimit = warpsPerSm / warpsNeededPerCta;
        }
    }

    result.AllocatedRegistersPerBlock = regsGrantedPerCta;
    return blockLimit;
}