コード例 #1
0
 /// <summary>
 /// Creates a cudaOccFuncAttributes instance from explicitly supplied values.
 /// Every argument is stored unchanged in the member of the corresponding name.
 /// </summary>
 /// <param name="aMaxThreadsPerBlock">Value stored in maxThreadsPerBlock.</param>
 /// <param name="aNumRegs">Value stored in numRegs (the occupancy calculation treats it as registers per thread).</param>
 /// <param name="aSharedSizeBytes">Only the static part shared memory (without dynamic allocations)</param>
 /// <param name="partitionedGC">Partitioned global caching configuration stored in partitionedGCConfig.</param>
 public cudaOccFuncAttributes(int aMaxThreadsPerBlock, int aNumRegs, SizeT aSharedSizeBytes, cudaOccPartitionedGCConfig partitionedGC)
 {
     this.maxThreadsPerBlock  = aMaxThreadsPerBlock;
     this.numRegs             = aNumRegs;
     this.sharedSizeBytes     = aSharedSizeBytes;
     this.partitionedGCConfig = partitionedGC;
 }
コード例 #2
0
        /// <summary>
        /// Warp limit: number of blocks (CTAs) per multiprocessor that the available
        /// warp slots allow for the given block size.
        /// </summary>
        /// <param name="gcConfig">Partitioned global caching configuration; anything other than Off halves the warp slots a CTA can use.</param>
        /// <param name="properties">Device properties (maxThreadsPerBlock, maxThreadsPerMultiProcessor and warpSize are read).</param>
        /// <param name="attributes">Not read by this method.</param>
        /// <param name="blockSize">Number of threads per block.</param>
        /// <returns>0 if blockSize exceeds properties.maxThreadsPerBlock, otherwise the warp-slot limited block count.</returns>
        private static int cudaOccMaxBlocksPerSMWarpsLimit(
            cudaOccPartitionedGCConfig gcConfig,
            cudaOccDeviceProp properties,
            cudaOccFuncAttributes attributes,
            int blockSize)
        {
            // A block larger than the device's per-block thread maximum cannot be placed at all.
            if (blockSize > properties.maxThreadsPerBlock)
            {
                return 0;
            }

            int warpSlotsPerSm = properties.maxThreadsPerMultiProcessor / properties.warpSize;
            int warpsPerBlock  = __occDivideRoundUp(blockSize, properties.warpSize);

            // On hardware that supports partitioned global caching, each half SM is
            // guaranteed to support at least 32 warps (maximum number of warps of a
            // CTA), so caching will not cause 0 occupancy due to insufficient warp
            // allocation slots.
            //
            if (gcConfig == cudaOccPartitionedGCConfig.Off)
            {
                return warpSlotsPerSm / warpsPerBlock;
            }

            // If partitioned global caching is on, then a CTA can only use a SM
            // partition (a half SM), and thus a half of the warp slots available
            // per SM. The per-partition count is computed first and then doubled.
            //
            int warpSlotsPerHalfSm = warpSlotsPerSm / 2;
            int blocksPerHalfSm    = warpSlotsPerHalfSm / warpsPerBlock;

            return blocksPerHalfSm * 2;
        }
コード例 #3
0
        ///////////////////////////////////
        //      API Implementations      //
        ///////////////////////////////////


        /// <summary>
        /// Determine the maximum number of CTAs that can be run simultaneously per SM.<para/>
        /// This is equivalent to the calculation done in the CUDA Occupancy Calculator
        /// spreadsheet.<para/>
        /// The individual limits (registers, warps, blocks, shared memory), the set of
        /// limiting factors, the partitioned global caching configuration that was finally
        /// used and the overall limit are written to <paramref name="result"/>.
        /// </summary>
        /// <param name="result">Receives the computed limits; it is also handed to the register and shared memory limit helpers.</param>
        /// <param name="properties">Device properties used by all limit computations.</param>
        /// <param name="attributes">Attributes of the function whose occupancy is computed.</param>
        /// <param name="state">Device state; validated here and forwarded to the shared memory limit computation.</param>
        /// <param name="blockSize">Number of threads per block. Must be greater than zero.</param>
        /// <param name="dynamicSmemSize">Dynamic shared memory size, forwarded to the shared memory limit computation.</param>
        /// <exception cref="System.ArgumentOutOfRangeException">
        /// <paramref name="blockSize"/> is zero or negative. Without this check a non-positive
        /// block size yields zero warps per CTA and fails later with a division by zero.
        /// </exception>
        public static void cudaOccMaxActiveBlocksPerMultiprocessor(
            cudaOccResult result,
            cudaOccDeviceProp properties,
            cudaOccFuncAttributes attributes,
            cudaOccDeviceState state,
            int blockSize,
            SizeT dynamicSmemSize)
        {
            ///////////////////////////
            // Check user input
            ///////////////////////////

            cudaOccInputCheck(properties, attributes, state);

            // The warp/register limit helpers divide by the number of warps per CTA,
            // which is zero (or negative) for a non-positive block size.
            // NOTE(review): if the project has a dedicated occupancy exception type for
            // invalid input, consider throwing that here instead - confirm.
            if (blockSize <= 0)
            {
                throw new System.ArgumentOutOfRangeException(
                    "blockSize", blockSize, "blockSize must be greater than zero.");
            }

            ///////////////////////////
            // Initialization
            ///////////////////////////

            cudaOccPartitionedGCConfig gcConfig = cudaOccPartitionedGCExpected(properties, attributes);

            ///////////////////////////
            // Compute occupancy
            ///////////////////////////

            // Limits due to registers/SM
            // Also compute if partitioned global caching has to be turned off
            // (gcConfig is passed by reference and may be changed by the call).
            //
            int ctaLimitRegs = cudaOccMaxBlocksPerSMRegsLimit(ref gcConfig, result, properties, attributes, blockSize);

            // Limits due to warps/SM (uses the possibly updated gcConfig)
            //
            int ctaLimitWarps = cudaOccMaxBlocksPerSMWarpsLimit(gcConfig, properties, attributes, blockSize);

            // Limits due to blocks/SM
            //
            int ctaLimitBlocks = cudaOccMaxBlocksPerMultiprocessor(properties);

            // Limits due to shared memory/SM
            //
            int ctaLimitSMem = cudaOccMaxBlocksPerSMSmemLimit(result, properties, attributes, state, blockSize, dynamicSmemSize);

            ///////////////////////////
            // Overall occupancy
            ///////////////////////////

            // Overall limit is min() of limits due to above reasons
            //
            int ctaLimit = __occMin(ctaLimitRegs, __occMin(ctaLimitSMem, __occMin(ctaLimitWarps, ctaLimitBlocks)));

            // Determine occupancy limiting factors: every resource whose limit equals
            // the overall limit is flagged, so several flags may be set at once.
            //
            cudaOccLimitingFactors limitingFactors = 0;

            if (ctaLimit == ctaLimitWarps)
            {
                limitingFactors |= cudaOccLimitingFactors.Warps;
            }
            if (ctaLimit == ctaLimitRegs)
            {
                limitingFactors |= cudaOccLimitingFactors.Registers;
            }
            if (ctaLimit == ctaLimitSMem)
            {
                limitingFactors |= cudaOccLimitingFactors.SharedMemory;
            }
            if (ctaLimit == ctaLimitBlocks)
            {
                limitingFactors |= cudaOccLimitingFactors.Blocks;
            }

            // Fill in the return values
            //
            result.LimitingFactors = limitingFactors;

            result.BlockLimitRegs      = ctaLimitRegs;
            result.BlockLimitSharedMem = ctaLimitSMem;
            result.BlockLimitWarps     = ctaLimitWarps;
            result.BlockLimitBlocks    = ctaLimitBlocks;
            result.partitionedGCConfig = gcConfig;

            // Final occupancy
            result.ActiveBlocksPerMultiProcessor = ctaLimit;
        }
コード例 #4
0
        /// <summary>
        /// Register limit: number of blocks (CTAs) per multiprocessor that the register
        /// file allows for the given block size. May switch <paramref name="gcConfig"/>
        /// to Off and stores the registers allocated per block in <paramref name="result"/>.
        /// </summary>
        /// <param name="gcConfig">Partitioned global caching configuration; set to Off when the non-partitioned calculation is used.</param>
        /// <param name="result">Its AllocatedRegistersPerBlock member is written before returning.</param>
        /// <param name="properties">Device properties (warpSize, regsPerBlock and regsPerMultiprocessor are read).</param>
        /// <param name="attributes">Function attributes (numRegs is read).</param>
        /// <param name="blockSize">Number of threads per block.</param>
        /// <returns>
        /// 0 if the block exceeds the per-block register limit, int.MaxValue if no
        /// registers are allocated per warp, otherwise the register limited block count.
        /// </returns>
        private static int cudaOccMaxBlocksPerSMRegsLimit(
            ref cudaOccPartitionedGCConfig gcConfig,
            cudaOccResult result,
            cudaOccDeviceProp properties,
            cudaOccFuncAttributes attributes,
            int blockSize)
        {
            // Fermi requires special handling of certain register usage
            int granularity   = cudaOccRegAllocationGranularity(properties, attributes.numRegs);
            int subPartitions = cudaOccSubPartitionsPerMultiprocessor(properties);
            int warpsPerBlock = __occDivideRoundUp(blockSize, properties.warpSize);

            // GPUs of compute capability 2.x and higher allocate registers to warps.
            // Number of regs per warp is regs per thread x warp size, rounded up to
            // register allocation granularity.
            //
            int rawRegsPerWarp    = attributes.numRegs * properties.warpSize;
            int allocatedPerWarp  = __occRoundUp(rawRegsPerWarp, granularity);
            int allocatedPerBlock = allocatedPerWarp * warpsPerBlock;

            // Hardware verifies if a launch fits the per-CTA register limit. For
            // historical reasons, the verification logic assumes register
            // allocations are made to all partitions simultaneously. Therefore, to
            // simulate the hardware check, the warp allocation needs to be rounded
            // up to the number of partitions.
            //
            int assumedPerBlock = allocatedPerWarp * __occRoundUp(warpsPerBlock, subPartitions);

            bool passesHardwareCheck = properties.regsPerBlock >= assumedPerBlock;
            bool passesSoftwareCheck = properties.regsPerBlock >= allocatedPerBlock;

            int blockLimit;

            if (!(passesHardwareCheck && passesSoftwareCheck))
            {
                blockLimit = 0;
            }
            else if (allocatedPerWarp <= 0)
            {
                // No registers are allocated per warp, so registers impose no limit.
                blockLimit = int.MaxValue;
            }
            else
            {
                // Registers are allocated in each sub-partition. The max number
                // of warps that can fit on an SM is equal to the max number of
                // warps per sub-partition x number of sub-partitions.
                //
                int regsPerSubPartition  = properties.regsPerMultiprocessor / subPartitions;
                int warpsPerSubPartition = regsPerSubPartition / allocatedPerWarp;

                blockLimit = 0;

                if (gcConfig != cudaOccPartitionedGCConfig.Off)
                {
                    // If partitioned global caching is on, then a CTA can only
                    // use a half SM, and thus a half of the registers available
                    // per SM.
                    //
                    int subPartitionsPerHalfSm = subPartitions / 2;
                    int warpsPerHalfSm         = warpsPerSubPartition * subPartitionsPerHalfSm;
                    int blocksPerHalfSm        = warpsPerHalfSm / warpsPerBlock;

                    blockLimit = blocksPerHalfSm * 2;
                }

                // Try again if partitioned global caching is not enabled, or if
                // the CTA cannot fit on the SM with caching on. In the latter
                // case, the device will automatically turn off caching, except
                // if the device forces it. The user can also override this
                // assumption with PARTITIONED_GC_ON_STRICT to calculate
                // occupancy and launch configuration.
                //
                bool cachingOff      = gcConfig == cudaOccPartitionedGCConfig.Off;
                bool nothingFits     = blockLimit == 0;
                bool cachingIsForced = gcConfig == cudaOccPartitionedGCConfig.OnStrict ||
                                       cudaOccPartitionedGCForced(properties);

                if (cachingOff || (nothingFits && !cachingIsForced))
                {
                    gcConfig = cudaOccPartitionedGCConfig.Off;

                    int warpsPerSm = warpsPerSubPartition * subPartitions;
                    blockLimit     = warpsPerSm / warpsPerBlock;
                }
            }

            // Written last, after the limit has been computed.
            result.AllocatedRegistersPerBlock = allocatedPerBlock;

            return blockLimit;
        }
コード例 #5
0
ファイル: CudaOccupancy.cs プロジェクト: kunzmi/managedCuda
        // Warp limit
        //
        // Returns how many blocks (CTAs) fit on one multiprocessor when only warp
        // slots are considered. The attributes argument is not read here.
        //
        private static int cudaOccMaxBlocksPerSMWarpsLimit(
            cudaOccPartitionedGCConfig gcConfig,
            cudaOccDeviceProp properties,
            cudaOccFuncAttributes attributes,
            int blockSize)
        {
            // Blocks above the device's per-block thread maximum get a limit of 0.
            if (blockSize > properties.maxThreadsPerBlock) {
                return 0;
            }

            int smWarpSlots = properties.maxThreadsPerMultiProcessor / properties.warpSize;
            int ctaWarps    = __occDivideRoundUp(blockSize, properties.warpSize);

            // If partitioned global caching is on, then a CTA can only use a SM
            // partition (a half SM), and thus a half of the warp slots available
            // per SM; the per-partition block count is then doubled. With caching
            // off the SM is treated as a single partition.
            //
            // On hardware that supports partitioned global caching, each half SM is
            // guaranteed to support at least 32 warps (maximum number of warps of a
            // CTA), so caching will not cause 0 occupancy due to insufficient warp
            // allocation slots.
            //
            int partitions = (gcConfig == cudaOccPartitionedGCConfig.Off) ? 1 : 2;

            int slotsPerPartition  = smWarpSlots / partitions;
            int blocksPerPartition = slotsPerPartition / ctaWarps;

            return blocksPerPartition * partitions;
        }
コード例 #6
0
ファイル: CudaOccupancy.cs プロジェクト: kunzmi/managedCuda
        // Register limit
        //
        // Returns how many blocks (CTAs) fit on one multiprocessor when only
        // registers are considered. gcConfig may be switched to Off by this
        // method, and result.AllocatedRegistersPerBlock is written before returning.
        //
        private static int cudaOccMaxBlocksPerSMRegsLimit(
            ref cudaOccPartitionedGCConfig gcConfig,
            cudaOccResult result,
            cudaOccDeviceProp properties,
            cudaOccFuncAttributes attributes,
            int blockSize)
        {
            // Fermi requires special handling of certain register usage
            int allocUnit     = cudaOccRegAllocationGranularity(properties, attributes.numRegs);
            int partitionCnt  = cudaOccSubPartitionsPerMultiprocessor(properties);
            int ctaWarps      = __occDivideRoundUp(blockSize, properties.warpSize);

            // GPUs of compute capability 2.x and higher allocate registers to warps
            //
            // Number of regs per warp is regs per thread x warp size, rounded up to
            // register allocation granularity
            //
            int warpRegsRaw = attributes.numRegs * properties.warpSize;
            int warpRegs    = __occRoundUp(warpRegsRaw, allocUnit);
            int ctaRegs     = warpRegs * ctaWarps;

            // Hardware verifies if a launch fits the per-CTA register limit. For
            // historical reasons, the verification logic assumes register
            // allocations are made to all partitions simultaneously. Therefore, to
            // simulate the hardware check, the warp allocation needs to be rounded
            // up to the number of partitions.
            //
            int ctaRegsAssumed = warpRegs * __occRoundUp(ctaWarps, partitionCnt);

            // Hardware check (assumed allocation) and software check (actual allocation)
            bool launchFits = !(properties.regsPerBlock < ctaRegsAssumed) &&
                              !(properties.regsPerBlock < ctaRegs);

            int ctaLimit;

            if (!launchFits) {
                ctaLimit = 0;
            }
            else if (warpRegs > 0) {
                // Registers are allocated in each sub-partition. The max number
                // of warps that can fit on an SM is equal to the max number of
                // warps per sub-partition x number of sub-partitions.
                //
                int partitionRegs  = properties.regsPerMultiprocessor / partitionCnt;
                int partitionWarps = partitionRegs / warpRegs;

                ctaLimit = 0;

                if (gcConfig != cudaOccPartitionedGCConfig.Off) {
                    // If partitioned global caching is on, then a CTA can only
                    // use a half SM, and thus a half of the registers available
                    // per SM
                    //
                    int halfSmPartitions = partitionCnt / 2;
                    int halfSmWarps      = partitionWarps * halfSmPartitions;
                    int halfSmCtas       = halfSmWarps / ctaWarps;

                    ctaLimit = halfSmCtas * 2;
                }

                // Try again if partitioned global caching is not enabled, or if
                // the CTA cannot fit on the SM with caching on. In the latter
                // case, the device will automatically turn off caching, except
                // if the device forces it. The user can also override this
                // assumption with PARTITIONED_GC_ON_STRICT to calculate
                // occupancy and launch configuration.
                //
                bool isOff    = (gcConfig == cudaOccPartitionedGCConfig.Off);
                bool isEmpty  = (ctaLimit == 0);
                bool isForced = (gcConfig == cudaOccPartitionedGCConfig.OnStrict ||
                                 cudaOccPartitionedGCForced(properties));

                if (isOff || (isEmpty && !isForced)) {
                    gcConfig = cudaOccPartitionedGCConfig.Off;

                    int smWarps = partitionWarps * partitionCnt;
                    ctaLimit    = smWarps / ctaWarps;
                }
            }
            else {
                // No registers are allocated per warp, so registers impose no limit.
                ctaLimit = int.MaxValue;
            }

            // Written last, after the limit has been computed.
            result.AllocatedRegistersPerBlock = ctaRegs;

            return ctaLimit;
        }
コード例 #7
0
ファイル: CudaOccupancy.cs プロジェクト: kunzmi/managedCuda
 /// <summary>
 /// Initializes a cudaOccFuncAttributes instance by copying each argument into
 /// the member of the corresponding name.
 /// </summary>
 /// <param name="aMaxThreadsPerBlock">Value assigned to maxThreadsPerBlock.</param>
 /// <param name="aNumRegs">Value assigned to numRegs.</param>
 /// <param name="aSharedSizeBytes">Only the static part shared memory (without dynamic allocations)</param>
 /// <param name="partitionedGC">Partitioned global caching configuration assigned to partitionedGCConfig.</param>
 public cudaOccFuncAttributes(int aMaxThreadsPerBlock, int aNumRegs, SizeT aSharedSizeBytes, cudaOccPartitionedGCConfig partitionedGC)
 {
     this.maxThreadsPerBlock  = aMaxThreadsPerBlock;
     this.numRegs             = aNumRegs;
     this.sharedSizeBytes     = aSharedSizeBytes;
     this.partitionedGCConfig = partitionedGC;
 }