Пример #1
0
        /// <summary>
        /// Validates every user-supplied input and throws a CudaOccupancyException
        /// carrying the first non-None status returned by the individual checks.
        /// </summary>
        /// <param name="properties">Device properties to validate.</param>
        /// <param name="attributes">Kernel function attributes to validate.</param>
        /// <param name="state">Device state to validate.</param>
        private static void cudaOccInputCheck(
            cudaOccDeviceProp properties,
            cudaOccFuncAttributes attributes,
            cudaOccDeviceState state)
        {
            cudaOccError status;

            if ((status = cudaOccDevicePropCheck(properties)) != cudaOccError.None)
            {
                throw new CudaOccupancyException(status);
            }

            if ((status = cudaOccFuncAttributesCheck(attributes)) != cudaOccError.None)
            {
                throw new CudaOccupancyException(status);
            }

            if ((status = cudaOccDeviceStateCheck(state)) != cudaOccError.None)
            {
                throw new CudaOccupancyException(status);
            }
        }
Пример #2
0
        /*!
         * Granularity of register allocation
         */
        private static int cudaOccRegAllocationUnit(cudaOccDeviceProp properties, int regsPerThread)
        {
            if (properties.major == 1)
            {
                // 1.0/1.1 devices use a 256-register unit; 1.2/1.3 use 512.
                return (properties.minor <= 1) ? 256 : 512;
            }

            if (properties.major == 2)
            {
                // On compute 2.x a specific set of per-thread register counts
                // maps to a 128-register unit; everything else uses 64.
                switch (regsPerThread)
                {
                case 21:
                case 22:
                case 29:
                case 30:
                case 37:
                case 38:
                case 45:
                case 46:
                    return 128;

                default:
                    return 64;
                }
            }

            if (properties.major == 3 || properties.major == 5)
            {
                return 256;
            }

            throw new CudaOccupancyException(cudaOccError.ErrorUnknownDevice);
        }
Пример #3
0
        // Shared memory limit
        //
        // Computes how many CTAs fit on one SM given their shared memory usage,
        // records the rounded-up per-CTA allocation in the result, and returns
        // the block count (int.MaxValue when no shared memory is used).
        private static int cudaOccMaxBlocksPerSMSmemLimit(
            cudaOccResult result,
            cudaOccDeviceProp properties,
            cudaOccFuncAttributes attributes,
            cudaOccDeviceState state,
            int blockSize,
            SizeT dynamicSmemSize)
        {
            int maxBlocks;

            int allocationGranularity = cudaOccSMemAllocationGranularity(properties);

            // Shared memory size preferred by the user for the current cache
            // configuration. It only applies while at least one CTA fits in it.
            SizeT userSmemPreference = cudaOccSMemPerMultiprocessor(properties, state.cacheConfig);

            // Static + dynamic usage, rounded up to the allocation granularity.
            SizeT totalSmemUsagePerCTA = attributes.sharedSizeBytes + dynamicSmemSize;
            SizeT smemAllocatedPerCTA = __occRoundUp((int)totalSmemUsagePerCTA, (int)allocationGranularity);

            if (smemAllocatedPerCTA > properties.sharedMemPerBlock)
            {
                // A single CTA already exceeds the per-block shared memory limit.
                maxBlocks = 0;
            }
            else if (smemAllocatedPerCTA > 0)
            {
                // Use the user preference as the per-SM pool while at least one
                // CTA fits in it; otherwise fall back to the hardware maximum.
                SizeT sharedMemPerMultiprocessor = (userSmemPreference >= smemAllocatedPerCTA)
                    ? userSmemPreference
                    : properties.sharedMemPerMultiprocessor;

                maxBlocks = (int)(sharedMemPerMultiprocessor / smemAllocatedPerCTA);
            }
            else
            {
                // No shared memory usage at all: it imposes no limit.
                maxBlocks = int.MaxValue;
            }

            result.AllocatedSharedMemPerBlock = smemAllocatedPerCTA;

            return maxBlocks;
        }
Пример #4
0
        /// <summary>
        /// Determine the potential block size that allows maximum number of CTAs that can run on multiprocessor simultaneously
        /// </summary>
        /// <param name="properties">Device properties.</param>
        /// <param name="attributes">Kernel function attributes.</param>
        /// <param name="state">Device state (cache configuration).</param>
        /// <param name="blockSizeToSMem">
        /// A function to convert from block size to dynamic shared memory size.<para/>
        /// e.g.:
        /// If no dynamic shared memory is used: x => 0<para/>
        /// If 4 bytes shared memory per thread is used: x = 4 * x</param>
        /// <returns>maxBlockSize</returns>
        public static int cudaOccMaxPotentialOccupancyBlockSize(
            cudaOccDeviceProp properties,
            cudaOccFuncAttributes attributes,
            cudaOccDeviceState state,
            del_blockSizeToDynamicSMemSize blockSizeToSMem)
        {
            int maxOccupancy     = properties.maxThreadsPerMultiProcessor;
            int largestBlockSize = min_(properties.maxThreadsPerBlock, attributes.maxThreadsPerBlock);
            int granularity      = properties.warpSize;
            int maxBlockSize     = 0;
            int highestOccupancy = 0;

            // FIX: previously the loop stepped down from largestBlockSize directly,
            // so whenever the limit was not a multiple of warpSize every candidate
            // after the first was unaligned. Iterate over warp-aligned candidates
            // and clamp each one back to the true limit instead, mirroring the
            // ref-parameter overload of this method.
            int largestBlockSizeAligned = __occRoundUp(largestBlockSize, granularity);

            for (int blockSizeAligned = largestBlockSizeAligned; blockSizeAligned > 0; blockSizeAligned -= granularity)
            {
                int blockSize = min_(largestBlockSize, blockSizeAligned);

                cudaOccResult res       = cudaOccMaxActiveBlocksPerMultiprocessor(properties, attributes, blockSize, blockSizeToSMem(blockSize), state);
                int           occupancy = blockSize * res.ActiveBlocksPerMultiProcessor;

                if (occupancy > highestOccupancy)
                {
                    maxBlockSize     = blockSize;
                    highestOccupancy = occupancy;
                }

                // Cannot get higher occupancy than a fully occupied multiprocessor.
                if (highestOccupancy == maxOccupancy)
                {
                    break;
                }
            }

            return maxBlockSize;
        }
Пример #5
0
        ///////////////////////////////////////////////
        //    Occupancy calculation Functions        //
        ///////////////////////////////////////////////

        /// <summary>
        /// Determine the maximum number of CTAs that can be run simultaneously per SM.<para/>
        /// This is equivalent to the calculation done in the CUDA Occupancy Calculator
        /// spreadsheet
        /// </summary>
        /// <param name="properties">Runtime device properties to convert.</param>
        /// <param name="kernel">Kernel whose block dimensions and dynamic shared memory are used.</param>
        /// <param name="state">Device state (cache configuration).</param>
        /// <returns>Occupancy result for the kernel's configured block size.</returns>
        public static cudaOccResult cudaOccMaxActiveBlocksPerMultiprocessor(
            CudaDeviceProperties properties,
            CudaKernel kernel,
            cudaOccDeviceState state)
        {
            // Flatten the 3D block configuration into a thread count.
            int threadsPerBlock = (int)kernel.BlockDimensions.x *
                                  (int)kernel.BlockDimensions.y *
                                  (int)kernel.BlockDimensions.z;

            // Translate the wrapper types and forward to the core implementation.
            return cudaOccMaxActiveBlocksPerMultiprocessor(
                new cudaOccDeviceProp(properties),
                new cudaOccFuncAttributes(kernel),
                threadsPerBlock,
                kernel.DynamicSharedMemory,
                state);
        }
Пример #6
0
 /// <summary>
 /// Convenience overload: uses a fixed dynamic shared memory size for every
 /// candidate block size (no block-size-to-shared-memory mapping delegate).
 /// </summary>
 /// <param name="minGridSize">Receives the suggested minimum grid size for a full-machine launch.</param>
 /// <param name="blockSize">Receives the block size yielding maximum occupancy.</param>
 /// <param name="properties">Device properties.</param>
 /// <param name="attributes">Kernel function attributes.</param>
 /// <param name="state">Device state (cache configuration).</param>
 /// <param name="dynamicSMemSize">Dynamic shared memory per block, in bytes.</param>
 public static void cudaOccMaxPotentialOccupancyBlockSize(
     ref int minGridSize,
     ref int blockSize,
     cudaOccDeviceProp properties,
     cudaOccFuncAttributes attributes,
     cudaOccDeviceState state,
     SizeT dynamicSMemSize)
 {
     // A null mapping tells the full overload to use dynamicSMemSize as-is.
     cudaOccMaxPotentialOccupancyBlockSize(
         ref minGridSize, ref blockSize,
         properties, attributes, state,
         null, dynamicSMemSize);
 }
Пример #7
0
        /// <summary>
        /// Determine the potential block size that allows maximum number of CTAs that can run on multiprocessor simultaneously
        /// </summary>
        /// <param name="properties">Runtime device properties to convert.</param>
        /// <param name="kernel">Kernel whose attributes are used.</param>
        /// <param name="state">Device state (cache configuration).</param>
        /// <param name="blockSizeToSMem">
        /// A function to convert from block size to dynamic shared memory size.<para/>
        /// e.g.:
        /// If no dynamic shared memory is used: x => 0<para/>
        /// If 4 bytes shared memory per thread is used: x = 4 * x</param>
        /// <returns>maxBlockSize</returns>
        public static int cudaOccMaxPotentialOccupancyBlockSize(
            CudaDeviceProperties properties,
            CudaKernel kernel,
            cudaOccDeviceState state,
            del_blockSizeToDynamicSMemSize blockSizeToSMem)
        {
            // Translate the runtime wrapper types into occupancy descriptors and
            // forward to the core implementation.
            return cudaOccMaxPotentialOccupancyBlockSize(
                new cudaOccDeviceProp(properties),
                new cudaOccFuncAttributes(kernel),
                state,
                blockSizeToSMem);
        }
Пример #8
0
        //////////////////////////////////////////
        //      Architectural Properties        //
        //////////////////////////////////////////

        /*!
         * Granularity of shared memory allocation
         */
        private static int cudaOccSMemAllocationGranularity(cudaOccDeviceProp properties)
        {
            // Compute 2.x allocates shared memory in 128-byte units,
            // compute 3.x/5.x/6.x in 256-byte units.
            switch (properties.computeMajor)
            {
            case 2:
                return 128;

            case 3:
            case 5:
            case 6:
                return 256;

            default:
                throw new CudaOccupancyException(cudaOccError.ErrorUnknownDevice);
            }
        }
Пример #9
0
        //////////////////////////////////////////
        //    Occupancy Helper Functions        //
        //////////////////////////////////////////

        /*!
         * Granularity of shared memory allocation
         */
        private static int cudaOccSMemAllocationUnit(cudaOccDeviceProp properties)
        {
            // Allocation unit per compute capability major version.
            if (properties.major == 1)
            {
                return 512;
            }
            if (properties.major == 2)
            {
                return 128;
            }
            if (properties.major == 3 || properties.major == 5)
            {
                return 256;
            }

            throw new CudaOccupancyException(cudaOccError.ErrorUnknownDevice);
        }
Пример #10
0
        // Warp limit
        //
        // Maximum resident CTAs per SM as constrained by warp slots only.
        private static int cudaOccMaxBlocksPerSMWarpsLimit(
            cudaOccPartitionedGCConfig gcConfig,
            cudaOccDeviceProp properties,
            cudaOccFuncAttributes attributes,
            int blockSize)
        {
            // A block larger than the device maximum cannot run at all.
            if (blockSize > properties.maxThreadsPerBlock)
            {
                return 0;
            }

            int maxWarpsPerSm        = properties.maxThreadsPerMultiProcessor / properties.warpSize;
            int warpsAllocatedPerCTA = __occDivideRoundUp(blockSize, properties.warpSize);

            if (gcConfig == cudaOccPartitionedGCConfig.Off)
            {
                // On hardware that supports partitioned global caching, each half SM
                // is guaranteed to support at least 32 warps (maximum number of warps
                // of a CTA), so caching will not cause 0 occupancy due to
                // insufficient warp allocation slots.
                return maxWarpsPerSm / warpsAllocatedPerCTA;
            }

            // With partitioned global caching on, a CTA can only use one SM
            // partition (a half SM), and thus only half of the warp slots
            // available per SM.
            int maxWarpsPerSmPartition  = maxWarpsPerSm / 2;
            int maxBlocksPerSmPartition = maxWarpsPerSmPartition / warpsAllocatedPerCTA;
            return maxBlocksPerSmPartition * 2;
        }
Пример #11
0
        /*!
         * Maximum blocks that can run simultaneously on a multiprocessor
         */
        private static int cudaOccMaxBlocksPerMultiprocessor(cudaOccDeviceProp properties)
        {
            switch (properties.major)
            {
            case 1:
            case 2:  return 8;

            case 3:  return 16;

            case 5:  return 32;

            default: throw new CudaOccupancyException(cudaOccError.ErrorUnknownDevice);
            }
        }
Пример #12
0
        ///*!
        // * Granularity of warp allocation
        // */
        //private static int cudaOccWarpAllocationMultiple(cudaOccDeviceProp properties)
        //{
        //	return (properties.major <= 1) ? 2 : 1;
        //}

        /*!
         * Number of "sides" into which the multiprocessor is partitioned
         */
        private static int cudaOccSubPartitionsPerMultiprocessor(cudaOccDeviceProp properties)
        {
            switch (properties.computeMajor)
            {
            case 2:
                return 2;

            case 3:
            case 5:
            case 6:
                return 4;

            default:
                throw new CudaOccupancyException(cudaOccError.ErrorUnknownDevice);
            }
        }
Пример #13
0
        ///////////////////////////////////////////////
        //            User Input Sanity              //
        ///////////////////////////////////////////////



        /// <summary>
        /// Verify device properties: each of these limits must be a positive
        /// number. Compute capability itself is checked during the occupancy
        /// calculation.
        /// </summary>
        /// <param name="properties">Device properties to validate.</param>
        /// <returns>None when all limits are positive, otherwise ErrorInvalidInput.</returns>
        private static cudaOccError cudaOccDevicePropCheck(cudaOccDeviceProp properties)
        {
            bool valid =
                properties.maxThreadsPerBlock > 0 &&
                properties.maxThreadsPerMultiProcessor > 0 &&
                properties.regsPerBlock > 0 &&
                properties.regsPerMultiprocessor > 0 &&
                properties.warpSize > 0 &&
                properties.sharedMemPerBlock > 0 &&
                properties.sharedMemPerMultiprocessor > 0 &&
                properties.numSms > 0;

            return valid ? cudaOccError.None : cudaOccError.ErrorInvalidInput;
        }
Пример #14
0
        ///*!
        // * Map int to cudaOccCacheConfig
        // */
        //private static cudaOccCacheConfig cudaOccGetCacheConfig(cudaOccDeviceState state)
        //{
        //    switch(state.cacheConfig)
        //    {
        //        case 0:  return cudaOccCacheConfig.PreferNone;
        //        case 1:  return cudaOccCacheConfig.PreferShared;
        //        case 2:  return cudaOccCacheConfig.PreferL1;
        //        case 3:  return cudaOccCacheConfig.PreferEqual;
        //        default: return cudaOccCacheConfig.PreferNone;
        //    }
        //}

        /*!
         * Shared memory based on config requested by User
         */
        private static int cudaOccSMemPerMultiprocessor(cudaOccDeviceProp properties, cudaOccCacheConfig cacheConfig)
        {
            int high = (int)properties.sharedMemPerMultiprocessor;
            // Compute 3.7 devices have a larger minimum shared memory pool
            // (constant named after GK210 — see MIN_SHARED_MEM_PER_SM_GK210).
            int low  = (properties.major == 3 && properties.minor == 7)
                ? MIN_SHARED_MEM_PER_SM_GK210
                : MIN_SHARED_MEM_PER_SM;

            if (properties.major == 1 || properties.major == 2)
            {
                // Only PreferL1 shrinks the shared memory pool on 1.x/2.x.
                return (cacheConfig == cudaOccCacheConfig.PreferL1) ? low : high;
            }

            if (properties.major == 3)
            {
                switch (cacheConfig)
                {
                case cudaOccCacheConfig.PreferL1:
                    return low;

                case cudaOccCacheConfig.PreferEqual:
                    return (high + low) / 2;

                default: // PreferNone / PreferShared and any other value
                    return high;
                }
            }

            // Compute 5.x and anything newer/unknown: full pool is available.
            return high;
        }
Пример #15
0
        /// <summary>
        /// Resolves the partitioned global caching config actually expected:
        /// the config requested by the function attributes, forced Off when the
        /// device lacks support, and forced On when the hardware mandates it.
        /// </summary>
        private static cudaOccPartitionedGCConfig cudaOccPartitionedGCExpected(
            cudaOccDeviceProp properties,
            cudaOccFuncAttributes attributes)
        {
            cudaOccPartitionedGCSupport gcSupport = cudaOccPartitionedGlobalCachingModeSupport(properties);

            // Start from the configuration requested by the kernel attributes.
            cudaOccPartitionedGCConfig gcConfig = attributes.partitionedGCConfig;

            if (gcSupport == cudaOccPartitionedGCSupport.NotSupported)
            {
                gcConfig = cudaOccPartitionedGCConfig.Off;
            }

            // A hardware-forced mode overrides both the request and the support check.
            if (cudaOccPartitionedGCForced(properties))
            {
                gcConfig = cudaOccPartitionedGCConfig.On;
            }

            return gcConfig;
        }
Пример #16
0
        /**
         * Partitioned global caching mode support
         *
         * Supported on compute capability 5.2, 5.3 and 6.x — except 6.0.
         */
        private static cudaOccPartitionedGCSupport cudaOccPartitionedGlobalCachingModeSupport(cudaOccDeviceProp properties)
        {
            bool cc52or53 = properties.computeMajor == 5 &&
                            (properties.computeMinor == 2 || properties.computeMinor == 3);
            bool cc6x     = properties.computeMajor == 6;
            bool cc60     = cc6x && properties.computeMinor == 0;

            return (cc52or53 || (cc6x && !cc60))
                ? cudaOccPartitionedGCSupport.Supported
                : cudaOccPartitionedGCSupport.NotSupported;
        }
Пример #17
0
		/*!
		 * Granularity of register allocation
		 */
		private static int cudaOccRegAllocationUnit(cudaOccDeviceProp properties, int regsPerThread)
		{
			if (properties.major == 1)
			{
				// 1.0/1.1 devices use a 256-register unit; 1.2/1.3 use 512.
				return (properties.minor <= 1) ? 256 : 512;
			}

			if (properties.major == 2)
			{
				// A specific set of per-thread register counts maps to a
				// 128-register unit on compute 2.x; all others use 64.
				bool largeUnit =
					regsPerThread == 21 || regsPerThread == 22 ||
					regsPerThread == 29 || regsPerThread == 30 ||
					regsPerThread == 37 || regsPerThread == 38 ||
					regsPerThread == 45 || regsPerThread == 46;
				return largeUnit ? 128 : 64;
			}

			if (properties.major == 3 || properties.major == 5)
			{
				return 256;
			}

			throw new CudaOccupancyException(cudaOccError.ErrorUnknownDevice);
		}
Пример #18
0
        /// <summary>
        /// Convenience overload that applies the same dynamic shared memory size
        /// to every candidate block size.
        /// </summary>
        /// <param name="minGridSize">Receives the suggested minimum grid size for a full-machine launch.</param>
        /// <param name="blockSize">Receives the block size yielding maximum occupancy.</param>
        /// <param name="properties">Device properties.</param>
        /// <param name="attributes">Kernel function attributes.</param>
        /// <param name="state">Device state (cache configuration).</param>
        /// <param name="dynamicSMemSize">Dynamic shared memory per block, in bytes.</param>
        public static void cudaOccMaxPotentialOccupancyBlockSize(
            ref int minGridSize,
            ref int blockSize,
            cudaOccDeviceProp properties,
            cudaOccFuncAttributes attributes,
            cudaOccDeviceState state,
            SizeT dynamicSMemSize)
        {
            // Forward to the delegate-based overload; a null mapping means the
            // fixed dynamicSMemSize is used for every candidate block size.
            cudaOccMaxPotentialOccupancyBlockSize(
                ref minGridSize, ref blockSize,
                properties, attributes, state,
                null, dynamicSMemSize);
        }
Пример #19
0
        /// <summary>
        /// Searches all warp-aligned block sizes and reports the one giving the
        /// highest multiprocessor occupancy, together with the minimum grid size
        /// needed to occupy the whole device at that occupancy.
        /// </summary>
        /// <param name="minGridSize">Receives the suggested minimum grid size for a full-machine launch.</param>
        /// <param name="blockSize">Receives the block size yielding maximum occupancy.</param>
        /// <param name="properties">Device properties.</param>
        /// <param name="attributes">Kernel function attributes.</param>
        /// <param name="state">Device state (cache configuration).</param>
        /// <param name="blockSizeToDynamicSMemSize">Optional mapping from block size to dynamic shared memory size; when non-null it overrides <paramref name="dynamicSMemSize"/>.</param>
        /// <param name="dynamicSMemSize">Fixed dynamic shared memory size, used when no mapping is given.</param>
        public static void cudaOccMaxPotentialOccupancyBlockSize(
            ref int minGridSize,
            ref int blockSize,
            cudaOccDeviceProp properties,
            cudaOccFuncAttributes attributes,
            cudaOccDeviceState state,
            del_blockSizeToDynamicSMemSize blockSizeToDynamicSMemSize,
            SizeT dynamicSMemSize)
        {
            cudaOccResult result = new cudaOccResult();

            // Validate user input first; throws CudaOccupancyException on bad values.
            cudaOccInputCheck(properties, attributes, state);

            // Best configuration recorded so far.
            int bestBlockSize = 0;
            int bestNumBlocks = 0;
            int bestThreads   = 0;

            // An SM can never host more resident threads than this, so the search
            // may stop early once that many threads are active.
            int occupancyLimit = properties.maxThreadsPerMultiProcessor;
            int granularity    = properties.warpSize;

            int blockSizeLimit        = __occMin(properties.maxThreadsPerBlock, attributes.maxThreadsPerBlock);
            int blockSizeLimitAligned = __occRoundUp(blockSizeLimit, granularity);

            // Walk the warp-aligned candidates from largest to smallest, clamping
            // each one back to the true (possibly unaligned) limit.
            for (int aligned = blockSizeLimitAligned; aligned > 0; aligned -= granularity)
            {
                int candidate = __occMin(blockSizeLimit, aligned);

                // A user-supplied mapping overrides the fixed dynamic smem size.
                if (blockSizeToDynamicSMemSize != null)
                {
                    dynamicSMemSize = blockSizeToDynamicSMemSize(candidate);
                }

                cudaOccMaxActiveBlocksPerMultiprocessor(
                    result,
                    properties,
                    attributes,
                    state,
                    candidate,
                    dynamicSMemSize);

                int blocksPerSm  = result.ActiveBlocksPerMultiProcessor;
                int threadsPerSm = candidate * blocksPerSm;

                if (threadsPerSm > bestThreads)
                {
                    bestBlockSize = candidate;
                    bestNumBlocks = blocksPerSm;
                    bestThreads   = threadsPerSm;
                }

                // Early out: occupancy cannot get any higher than a full SM.
                if (bestThreads == occupancyLimit)
                {
                    break;
                }
            }

            // Suggested minimum grid size to achieve a full machine launch.
            minGridSize = bestNumBlocks * properties.numSms;
            blockSize   = bestBlockSize;
        }
Пример #20
0
        /// <summary>
        /// Resolves the partitioned global caching config actually expected,
        /// starting from the kernel's request and applying device constraints.
        /// </summary>
        private static cudaOccPartitionedGCConfig cudaOccPartitionedGCExpected(
            cudaOccDeviceProp properties,
            cudaOccFuncAttributes attributes)
        {
            // A hardware-forced mode wins over both the kernel's request and
            // the device support check.
            if (cudaOccPartitionedGCForced(properties))
            {
                return cudaOccPartitionedGCConfig.On;
            }

            if (cudaOccPartitionedGlobalCachingModeSupport(properties) == cudaOccPartitionedGCSupport.NotSupported)
            {
                return cudaOccPartitionedGCConfig.Off;
            }

            // Otherwise honour the configuration requested by the attributes.
            return attributes.partitionedGCConfig;
        }
Пример #21
0
 //////////////////////////////////////////
 //      Architectural Properties        //
 //////////////////////////////////////////
 /*!
  * Granularity of shared memory allocation
  */
 private static int cudaOccSMemAllocationGranularity(cudaOccDeviceProp properties)
 {
     // 128-byte units on compute 2.x; 256-byte units on 3.x/5.x/6.x.
     if (properties.computeMajor == 2)
     {
         return 128;
     }

     if (properties.computeMajor == 3 ||
         properties.computeMajor == 5 ||
         properties.computeMajor == 6)
     {
         return 256;
     }

     throw new CudaOccupancyException(cudaOccError.ErrorUnknownDevice);
 }
Пример #22
0
		/// <summary>
		/// Determine the maximum number of CTAs that can be run simultaneously per SM.<para/>
		/// This is equivalent to the calculation done in the CUDA Occupancy Calculator
		/// spreadsheet
		/// </summary>
		/// <param name="properties">Device properties (per-SM and per-block resource limits).</param>
		/// <param name="attributes">Kernel function attributes (register and static shared memory usage).</param>
		/// <param name="blockSize">Number of threads per block; must be positive.</param>
		/// <param name="dynamic_smem_bytes">Dynamically allocated shared memory per block, in bytes.</param>
		/// <param name="state">Device state, i.e. the requested cache configuration.</param>
		/// <returns>Occupancy result including the active block count and the limiting factors.</returns>
		public static cudaOccResult cudaOccMaxActiveBlocksPerMultiprocessor(
			cudaOccDeviceProp properties,
			cudaOccFuncAttributes attributes,
			int blockSize,
			SizeT dynamic_smem_bytes,
			cudaOccDeviceState state)
		{
			int regAllocationUnit = 0, warpAllocationMultiple = 0, maxBlocksPerSM=0;
			int ctaLimitWarps = 0, ctaLimitBlocks = 0, smemPerCTA = 0, smemBytes = 0, smemAllocationUnit = 0;
			int cacheConfigSMem = 0, sharedMemPerMultiprocessor = 0, ctaLimitRegs = 0, regsPerCTA=0;
			int regsPerWarp = 0, numSides = 0, numRegsPerSide = 0, ctaLimit=0;
			int maxWarpsPerSm = 0, warpsPerCTA = 0, ctaLimitSMem=0;
			cudaOccLimitingFactors limitingFactors = 0;
			cudaOccResult result = new cudaOccResult();

			if(properties == null || attributes == null || blockSize <= 0)
			{
				throw new CudaOccupancyException(cudaOccError.ErrorInvalidInput);
			}

			//////////////////////////////////////////
			// Limits due to warps/SM or blocks/SM
			//////////////////////////////////////////
			CudaOccupancyException.CheckZero(properties.warpSize);
			maxWarpsPerSm   = properties.maxThreadsPerMultiProcessor / properties.warpSize;
			warpAllocationMultiple = cudaOccWarpAllocationMultiple(properties);

			CudaOccupancyException.CheckZero(warpAllocationMultiple);
			// Warps needed by one CTA, rounded up to the warp allocation multiple.
			warpsPerCTA = round_i(divide_ri(blockSize, properties.warpSize), warpAllocationMultiple);

			maxBlocksPerSM  = cudaOccMaxBlocksPerMultiprocessor(properties);

			// Calc limits
			CudaOccupancyException.CheckZero(warpsPerCTA);
			// A block larger than the device maximum cannot be launched at all (limit 0).
			ctaLimitWarps  = (blockSize <= properties.maxThreadsPerBlock) ? maxWarpsPerSm / warpsPerCTA : 0;
			ctaLimitBlocks = maxBlocksPerSM;

			//////////////////////////////////////////
			// Limits due to shared memory/SM
			//////////////////////////////////////////
			smemAllocationUnit     = cudaOccSMemAllocationUnit(properties);
			smemBytes  = (int)(attributes.sharedSizeBytes + dynamic_smem_bytes);
			CudaOccupancyException.CheckZero(smemAllocationUnit);
			// Static + dynamic shared memory, rounded up to the allocation unit.
			smemPerCTA = round_i(smemBytes, smemAllocationUnit);

			// Calc limit
			cacheConfigSMem = cudaOccSMemPerMultiprocessor(properties,state.cacheConfig);

			// sharedMemoryPerMultiprocessor is by default limit set in hardware but user requested shared memory
			// limit is used instead if it is greater than total shared memory used by function .
			sharedMemPerMultiprocessor = (cacheConfigSMem >= smemPerCTA)
				? cacheConfigSMem
				: (int)properties.sharedMemPerMultiprocessor;
			// Limit on blocks launched should be calculated with shared memory per SM but total shared memory
			// used by function should be limited by shared memory per block
			ctaLimitSMem = 0;
			if(properties.sharedMemPerBlock >= (SizeT)smemPerCTA)
			{
				// Zero shared memory usage imposes no limit beyond maxBlocksPerSM.
				ctaLimitSMem = smemPerCTA > 0 ? sharedMemPerMultiprocessor / smemPerCTA : maxBlocksPerSM;
			}

			//////////////////////////////////////////
			// Limits due to registers/SM
			//////////////////////////////////////////
			regAllocationUnit      = cudaOccRegAllocationUnit(properties, attributes.numRegs);
			CudaOccupancyException.CheckZero(regAllocationUnit);

			// Calc limit
			ctaLimitRegs = 0;
			if(properties.major <= 1)
			{
				// GPUs of compute capability 1.x allocate registers to CTAs
				// Number of regs per block is regs per thread times number of warps times warp size, rounded up to allocation unit
				regsPerCTA = round_i(attributes.numRegs * properties.warpSize * warpsPerCTA, regAllocationUnit);
				ctaLimitRegs = regsPerCTA > 0 ? properties.regsPerMultiprocessor / regsPerCTA : maxBlocksPerSM;
			}
			else
			{
				// GPUs of compute capability 2.x and higher allocate registers to warps
				// Number of regs per warp is regs per thread times number of warps times warp size, rounded up to allocation unit
				regsPerWarp = round_i(attributes.numRegs * properties.warpSize, regAllocationUnit);
				regsPerCTA = regsPerWarp * warpsPerCTA;
				if(properties.regsPerBlock >= regsPerCTA)
				{
					// Registers are split evenly across the SM's sub-partitions ("sides").
					numSides = cudaOccSidesPerMultiprocessor(properties);
					CudaOccupancyException.CheckZero(numSides);
					numRegsPerSide = properties.regsPerMultiprocessor / numSides;
					ctaLimitRegs = regsPerWarp > 0 ? ((numRegsPerSide / regsPerWarp) * numSides) / warpsPerCTA : maxBlocksPerSM;
				}
			}

			//////////////////////////////////////////
			// Overall limit is min() of limits due to above reasons
			//////////////////////////////////////////
			ctaLimit = min_(ctaLimitRegs, min_(ctaLimitSMem, min_(ctaLimitWarps, ctaLimitBlocks)));
			// Determine occupancy limiting factors
			
			
			result.ActiveBlocksPerMultiProcessor = ctaLimit;

			// Flag every resource whose individual limit equals the overall limit.
			if(ctaLimit==ctaLimitWarps)
			{
				limitingFactors |= cudaOccLimitingFactors.Warps;
			}
			if(ctaLimit==ctaLimitRegs && regsPerCTA > 0)
			{
				limitingFactors |= cudaOccLimitingFactors.Registers;
			}
			if(ctaLimit==ctaLimitSMem && smemPerCTA > 0)
			{
				limitingFactors |= cudaOccLimitingFactors.SharedMemory;
			}
			if(ctaLimit==ctaLimitBlocks)
			{
				limitingFactors |= cudaOccLimitingFactors.Blocks;
			}
			result.LimitingFactors = limitingFactors;

			result.BlockLimitRegs = ctaLimitRegs;
			result.BlockLimitSharedMem = ctaLimitSMem;
			result.BlockLimitWarps = ctaLimitWarps;
			result.BlockLimitBlocks = ctaLimitBlocks;

			// NOTE(review): "Bllocated..." appears to be a typo in the public property name; kept as-is.
			result.BllocatedRegistersPerBlock = regsPerCTA;
			result.AllocatedSharedMemPerBlock = smemPerCTA;

			result.ActiveWarpsPerMultiProcessor = ctaLimit * ((int)Math.Ceiling(blockSize / (double)properties.warpSize));
			result.ActiceThreadsPerMultiProcessor = result.ActiveWarpsPerMultiProcessor * properties.warpSize;
			result.OccupancyOfEachMultiProcessor = (int)Math.Round(result.ActiveWarpsPerMultiProcessor / (double)maxWarpsPerSm * 100);
			return result;
		}
Пример #23
0
		///*!
		// * Map int to cudaOccCacheConfig
		// */
		//private static cudaOccCacheConfig cudaOccGetCacheConfig(cudaOccDeviceState state)
		//{
		//    switch(state.cacheConfig)
		//    {
		//        case 0:  return cudaOccCacheConfig.PreferNone;
		//        case 1:  return cudaOccCacheConfig.PreferShared;
		//        case 2:  return cudaOccCacheConfig.PreferL1;
		//        case 3:  return cudaOccCacheConfig.PreferEqual;
		//        default: return cudaOccCacheConfig.PreferNone;
		//    }
		//}

		/*!
		 * Shared memory based on config requested by User
		 */
		private static int cudaOccSMemPerMultiprocessor(cudaOccDeviceProp properties, cudaOccCacheConfig cacheConfig)
		{
			int high = (int)properties.sharedMemPerMultiprocessor;
			// Compute 3.7 devices have a larger minimum shared memory pool
			// (constant named after GK210 — see MIN_SHARED_MEM_PER_SM_GK210).
			int low = (properties.major == 3 && properties.minor == 7)
				? MIN_SHARED_MEM_PER_SM_GK210
				: MIN_SHARED_MEM_PER_SM;

			switch (properties.major)
			{
				case 1:
				case 2:
					// Only PreferL1 shrinks the shared memory pool on 1.x/2.x.
					return (cacheConfig == cudaOccCacheConfig.PreferL1) ? low : high;

				case 3:
					if (cacheConfig == cudaOccCacheConfig.PreferL1)
						return low;
					if (cacheConfig == cudaOccCacheConfig.PreferEqual)
						return (high + low) / 2;
					// PreferNone / PreferShared and any other value.
					return high;

				case 5:
				default:
					// Compute 5.x and anything newer/unknown: full pool available.
					return high;
			}
		}
Пример #24
0
        // Maximum resident CTAs per SM as limited by shared memory usage.
        //
        // Rounds the per-CTA shared memory (static + dynamic) up to the hardware
        // allocation granularity and divides the per-SM budget by it; also
        // records the rounded per-block amount in result.AllocatedSharedMemPerBlock.
        private static int cudaOccMaxBlocksPerSMSmemLimit(
            cudaOccResult result,
            cudaOccDeviceProp properties,
            cudaOccFuncAttributes attributes,
            cudaOccDeviceState state,
            int blockSize,
            SizeT dynamicSmemSize)
        {
            int granularity = cudaOccSMemAllocationGranularity(properties);

            // The user-preferred per-SM shared memory size; ignored later if the
            // kernel needs more shared memory than the preference allows.
            SizeT userPreference = cudaOccSMemPerMultiprocessor(properties, state.cacheConfig);

            // Static + dynamic shared memory, rounded up to allocation granularity.
            SizeT smemRequested = attributes.sharedSizeBytes + dynamicSmemSize;
            SizeT smemPerCTA = __occRoundUp((int)smemRequested, granularity);

            int maxBlocks;
            if (smemPerCTA > properties.sharedMemPerBlock)
            {
                // The CTA exceeds the per-block limit and can never launch.
                maxBlocks = 0;
            }
            else if (smemPerCTA == 0)
            {
                // No shared memory used: this resource imposes no limit.
                maxBlocks = int.MaxValue;
            }
            else
            {
                // Honor the user preference only while at least one CTA still
                // fits; otherwise fall back to the full per-SM shared memory.
                SizeT smemPerSM = (userPreference >= smemPerCTA)
                    ? userPreference
                    : properties.sharedMemPerMultiprocessor;

                maxBlocks = (int)(smemPerSM / smemPerCTA);
            }

            result.AllocatedSharedMemPerBlock = smemPerCTA;

            return maxBlocks;
        }
Пример #25
0
        ///////////////////////////////////////////////
        //    Occupancy calculation Functions        //
        ///////////////////////////////////////////////


        // True when the device always runs with partitioned global caching enabled.
        private static bool cudaOccPartitionedGCForced(cudaOccDeviceProp properties)
        {
            return cudaOccPartitionedGlobalCachingModeSupport(properties) == cudaOccPartitionedGCSupport.AlwaysOn;
        }
Пример #26
0
        ///*!
        // * Map int to cudaOccCacheConfig
        // */
        //private static cudaOccCacheConfig cudaOccGetCacheConfig(cudaOccDeviceState state)
        //{
        //    switch(state.cacheConfig)
        //    {
        //        case 0:  return cudaOccCacheConfig.PreferNone;
        //        case 1:  return cudaOccCacheConfig.PreferShared;
        //        case 2:  return cudaOccCacheConfig.PreferL1;
        //        case 3:  return cudaOccCacheConfig.PreferEqual;
        //        default: return cudaOccCacheConfig.PreferNone;
        //    }
        //}
        /// <summary>
        /// Shared memory available per multiprocessor for the cache configuration
        /// requested by the user.
        /// </summary>
        /// <param name="properties">Device properties (compute capability and per-SM shared memory size).</param>
        /// <param name="cacheConfig">User-requested L1 / shared-memory preference.</param>
        /// <returns>Usable shared memory per SM, in bytes.</returns>
        private static SizeT cudaOccSMemPerMultiprocessor(cudaOccDeviceProp properties, cudaOccCacheConfig cacheConfig)
        {
            SizeT bytes = 0;
            // NOTE(review): the int cast narrows the SizeT value; harmless for real
            // per-SM sizes, but strictly smaller than the SizeT range.
            SizeT sharedMemPerMultiprocessorHigh = (int)properties.sharedMemPerMultiprocessor;
            // Fermi and Kepler has shared L1 cache / shared memory, and support cache
            // configuration to trade one for the other. These values are needed to
            // calculate the correct shared memory size for user requested cache
            // configuration.
            //
            SizeT minCacheSize = 16384;
            SizeT maxCacheSize = 49152;
            SizeT cacheAndSharedTotal = sharedMemPerMultiprocessorHigh + minCacheSize;
            // NOTE(review): this subtraction assumes cache+shared is at least
            // maxCacheSize (48KB); it would wrap for a (hypothetical) smaller device.
            SizeT sharedMemPerMultiprocessorLow = cacheAndSharedTotal - maxCacheSize;

            switch (properties.computeMajor)
            {
                case 2:
                    // Fermi supports 48KB / 16KB or 16KB / 48KB partitions for shared /
                    // L1.
                    //
                    switch (cacheConfig)
                    {
                        default:
                        case cudaOccCacheConfig.PreferNone:
                        case cudaOccCacheConfig.PreferShared:
                        case cudaOccCacheConfig.PreferEqual:
                            bytes = sharedMemPerMultiprocessorHigh;
                            break;
                        case cudaOccCacheConfig.PreferL1:
                            bytes = sharedMemPerMultiprocessorLow;
                            break;
                    }
                    break;
                case 3:
                    // Kepler supports 16KB, 32KB, or 48KB partitions for L1. The rest
                    // is shared memory.
                    //
                    switch (cacheConfig)
                    {
                        default:
                        case cudaOccCacheConfig.PreferNone:
                        case cudaOccCacheConfig.PreferShared:
                            bytes = sharedMemPerMultiprocessorHigh;
                            break;
                        case cudaOccCacheConfig.PreferL1:
                            bytes = sharedMemPerMultiprocessorLow;
                            break;
                        case cudaOccCacheConfig.PreferEqual:
                            // Equal is the mid-point between high and low. It should be
                            // equivalent to low + 16KB.
                            //
                            bytes = (sharedMemPerMultiprocessorHigh + sharedMemPerMultiprocessorLow) / 2;
                            break;
                    }
                    break;
                case 5:
                case 6:
                    // Maxwell and Pascal have dedicated shared memory.
                    //
                    bytes = sharedMemPerMultiprocessorHigh;
                    break;
                default: throw new CudaOccupancyException(cudaOccError.ErrorUnknownDevice);
            }

            return bytes;
        }
Пример #27
0
        /**
         * Partitioned global caching mode support.
         *
         * Supported on Maxwell GM20x (5.2 / 5.3) and Pascal, except GP100 (6.0).
         */
        private static cudaOccPartitionedGCSupport cudaOccPartitionedGlobalCachingModeSupport(cudaOccDeviceProp properties)
        {
            bool maxwellGM20x = properties.computeMajor == 5 &&
                                (properties.computeMinor == 2 || properties.computeMinor == 3);
            bool pascal = properties.computeMajor == 6;
            bool gp100  = properties.computeMajor == 6 && properties.computeMinor == 0;

            if ((maxwellGM20x || pascal) && !gp100)
            {
                return cudaOccPartitionedGCSupport.Supported;
            }

            return cudaOccPartitionedGCSupport.NotSupported;
        }
Пример #28
0
        ///*!
        // * Map int to cudaOccCacheConfig
        // */
        //private static cudaOccCacheConfig cudaOccGetCacheConfig(cudaOccDeviceState state)
        //{
        //    switch(state.cacheConfig)
        //    {
        //        case 0:  return cudaOccCacheConfig.PreferNone;
        //        case 1:  return cudaOccCacheConfig.PreferShared;
        //        case 2:  return cudaOccCacheConfig.PreferL1;
        //        case 3:  return cudaOccCacheConfig.PreferEqual;
        //        default: return cudaOccCacheConfig.PreferNone;
        //    }
        //}

        /// <summary>
        /// Shared memory available per multiprocessor for the cache configuration
        /// requested by the user.
        /// </summary>
        private static SizeT cudaOccSMemPerMultiprocessor(cudaOccDeviceProp properties, cudaOccCacheConfig cacheConfig)
        {
            SizeT smemHigh = (int)properties.sharedMemPerMultiprocessor;
            // Fermi and Kepler have a combined L1 cache / shared memory, and the
            // cache configuration trades one for the other. Derive the smallest
            // shared-memory size from the cache-size extremes.
            SizeT minCacheSize = 16384;
            SizeT maxCacheSize = 49152;
            SizeT smemLow = (smemHigh + minCacheSize) - maxCacheSize;

            int major = properties.computeMajor;

            if (major == 2)
            {
                // Fermi: 48KB / 16KB or 16KB / 48KB shared / L1 split.
                return (cacheConfig == cudaOccCacheConfig.PreferL1) ? smemLow : smemHigh;
            }

            if (major == 3)
            {
                // Kepler: 16KB, 32KB, or 48KB L1 partitions; the rest is shared memory.
                if (cacheConfig == cudaOccCacheConfig.PreferL1)
                    return smemLow;

                if (cacheConfig == cudaOccCacheConfig.PreferEqual)
                    // Mid-point between high and low (equivalent to low + 16KB).
                    return (smemHigh + smemLow) / 2;

                // PreferNone / PreferShared / anything else: full shared memory.
                return smemHigh;
            }

            if (major == 5 || major == 6)
            {
                // Maxwell and Pascal have dedicated shared memory.
                return smemHigh;
            }

            throw new CudaOccupancyException(cudaOccError.ErrorUnknownDevice);
        }
Пример #29
0
		/// <summary>
		/// Granularity of warp allocation: 2 for compute capability 1.x, otherwise 1.
		/// </summary>
		private static int cudaOccWarpAllocationMultiple(cudaOccDeviceProp properties)
		{
			if (properties.major <= 1)
				return 2;
			return 1;
		}
Пример #30
0
		/// <summary>
		/// Determine the potential block size that allows maximum number of CTAs that can run on multiprocessor simultaneously 
		/// </summary>
		/// <param name="properties"></param>
		/// <param name="attributes"></param>
		/// <param name="state"></param>
		/// <param name="blockSizeToSMem">
		/// A function to convert from block size to dynamic shared memory size.<para/>
		/// e.g.:
		/// If no dynamic shared memory is used: x => 0<para/>
		/// If 4 bytes shared memory per thread is used: x = 4 * x</param>
		/// <returns>maxBlockSize</returns>
		public static int cudaOccMaxPotentialOccupancyBlockSize(
		    cudaOccDeviceProp properties,
		    cudaOccFuncAttributes attributes,
		    cudaOccDeviceState state,
		    del_blockSizeToDynamicSMemSize blockSizeToSMem)
		{
			// Cannot do better than filling every thread slot on the SM.
			int occupancyCeiling = properties.maxThreadsPerMultiProcessor;
			int startSize = min_(properties.maxThreadsPerBlock, attributes.maxThreadsPerBlock);
			int step = properties.warpSize;

			int bestBlockSize = 0;
			int bestOccupancy = 0;

			// Scan candidate block sizes from the largest allowed downwards in
			// warp-size steps, keeping the size with the most active threads.
			for (int candidate = startSize; candidate > 0; candidate -= step)
			{
				cudaOccResult res = cudaOccMaxActiveBlocksPerMultiprocessor(properties, attributes, candidate, blockSizeToSMem(candidate), state);
				int activeThreads = candidate * res.ActiveBlocksPerMultiProcessor;

				if (activeThreads > bestOccupancy)
				{
					bestBlockSize = candidate;
					bestOccupancy = activeThreads;
				}

				// The SM is fully occupied; no candidate can do better.
				if (bestOccupancy == occupancyCeiling)
					break;
			}

			return bestBlockSize;
		}
Пример #31
0
		/// <summary>
		/// Maximum number of blocks that can run simultaneously on a multiprocessor,
		/// by compute capability major version.
		/// </summary>
		private static int cudaOccMaxBlocksPerMultiprocessor(cudaOccDeviceProp properties)
		{
			if (properties.major == 1 || properties.major == 2)
				return 8;
			if (properties.major == 3)
				return 16;
			if (properties.major == 5)
				return 32;
			throw new CudaOccupancyException(cudaOccError.ErrorUnknownDevice);
		}
Пример #32
0
        ///////////////////////////////////////////////
        //            User Input Sanity              //
        ///////////////////////////////////////////////

        // Validate the device properties used by the occupancy calculation.
        // Each limit must be strictly positive; compute capability itself is
        // checked later, during the occupancy calculation.
        private static cudaOccError cudaOccDevicePropCheck(cudaOccDeviceProp properties)
        {
            bool valid =
                properties.maxThreadsPerBlock          > 0 &&
                properties.maxThreadsPerMultiProcessor > 0 &&
                properties.regsPerBlock                > 0 &&
                properties.regsPerMultiprocessor       > 0 &&
                properties.warpSize                    > 0 &&
                properties.sharedMemPerBlock           > 0 &&
                properties.sharedMemPerMultiprocessor  > 0 &&
                properties.numSms                      > 0;

            return valid ? cudaOccError.None : cudaOccError.ErrorInvalidInput;
        }
Пример #33
0
		///////////////////////////////////////////////
		//    Occupancy calculation Functions        //
		///////////////////////////////////////////////

		/// <summary>
		/// Determine the maximum number of CTAs that can be run simultaneously per SM.<para/>
		/// This is equivalent to the calculation done in the CUDA Occupancy Calculator
		/// spreadsheet
		/// </summary>
		/// <param name="properties">Managed device description; converted to the internal occupancy structure.</param>
		/// <param name="kernel">Kernel whose block dimensions, dynamic shared memory and attributes are used.</param>
		/// <param name="state">Device state passed through to the core calculation.</param>
		/// <returns>Occupancy result for the kernel's current launch configuration.</returns>
		public static cudaOccResult cudaOccMaxActiveBlocksPerMultiprocessor(
			CudaDeviceProperties properties,
			CudaKernel kernel,
			cudaOccDeviceState state)
		{
			// Translate the managed descriptions into the internal occupancy structures.
			cudaOccDeviceProp occProps = new cudaOccDeviceProp(properties);
			cudaOccFuncAttributes occAttrs = new cudaOccFuncAttributes(kernel);

			// The launch block size is the product of the kernel's 3D block dimensions.
			int blockSize = (int)kernel.BlockDimensions.x
				* (int)kernel.BlockDimensions.y
				* (int)kernel.BlockDimensions.z;

			return cudaOccMaxActiveBlocksPerMultiprocessor(occProps, occAttrs, blockSize, kernel.DynamicSharedMemory, state);
		}
Пример #34
0
        // Maximum resident CTAs per SM as limited by warp slots.
        private static int cudaOccMaxBlocksPerSMWarpsLimit(
            cudaOccPartitionedGCConfig gcConfig,
            cudaOccDeviceProp properties,
            cudaOccFuncAttributes attributes,
            int blockSize)
        {
            // A block larger than the hardware limit can never launch.
            if (blockSize > properties.maxThreadsPerBlock)
            {
                return 0;
            }

            int warpSlotsPerSm = properties.maxThreadsPerMultiProcessor / properties.warpSize;
            int warpsPerCTA = __occDivideRoundUp(blockSize, properties.warpSize);

            if (gcConfig != cudaOccPartitionedGCConfig.Off)
            {
                // With partitioned global caching on, a CTA is confined to a single
                // SM partition (half an SM) and therefore to half of the warp slots.
                // Hardware supporting this mode guarantees each half SM at least 32
                // warps (a full CTA's worth), so caching alone cannot force zero
                // occupancy through insufficient warp slots.
                int blocksPerHalfSm = (warpSlotsPerSm / 2) / warpsPerCTA;
                return blocksPerHalfSm * 2;
            }

            return warpSlotsPerSm / warpsPerCTA;
        }
Пример #35
0
		/// <summary>
		/// Determine the potential block size that allows maximum number of CTAs that can run on multiprocessor simultaneously 
		/// </summary>
		/// <param name="properties">Managed device description; converted to the internal occupancy structure.</param>
		/// <param name="kernel">Kernel whose attributes are used for the calculation.</param>
		/// <param name="state">Device state passed through to the core calculation.</param>
		/// <param name="blockSizeToSMem">
		/// A function to convert from block size to dynamic shared memory size.<para/>
		/// e.g.:
		/// If no dynamic shared memory is used: x => 0<para/>
		/// If 4 bytes shared memory per thread is used: x = 4 * x</param>
		/// <returns>maxBlockSize</returns>
		public static int cudaOccMaxPotentialOccupancyBlockSize(
			CudaDeviceProperties properties,
			CudaKernel kernel,
			cudaOccDeviceState state,
			del_blockSizeToDynamicSMemSize blockSizeToSMem)
		{
			// Convert the managed descriptions, then defer to the core implementation.
			return cudaOccMaxPotentialOccupancyBlockSize(
				new cudaOccDeviceProp(properties),
				new cudaOccFuncAttributes(kernel),
				state,
				blockSizeToSMem);
		}
Пример #36
0
        ///////////////////////////////////
        //      API Implementations      //
        ///////////////////////////////////
        /// <summary>
        /// Determine the maximum number of CTAs that can be run simultaneously per SM.<para/>
        /// This is equivalent to the calculation done in the CUDA Occupancy Calculator
        /// spreadsheet
        /// </summary>
        /// <param name="result">Receives the per-resource block limits, the limiting factors, and the final active block count.</param>
        /// <param name="properties">Device properties used for the calculation.</param>
        /// <param name="attributes">Kernel function attributes (registers, shared memory, thread limits).</param>
        /// <param name="state">Device state (e.g. the selected cache configuration).</param>
        /// <param name="blockSize">Number of threads per block for the launch.</param>
        /// <param name="dynamicSmemSize">Dynamically allocated shared memory per block, in bytes.</param>
        public static void cudaOccMaxActiveBlocksPerMultiprocessor(
			cudaOccResult               result,
			cudaOccDeviceProp     properties,
			cudaOccFuncAttributes attributes,
			cudaOccDeviceState    state,
			int                   blockSize,
			SizeT                 dynamicSmemSize)
        {
            int          ctaLimitWarps   = 0;
            int          ctaLimitBlocks  = 0;
            int          ctaLimitSMem    = 0;
            int          ctaLimitRegs    = 0;
            int          ctaLimit        = 0;
            cudaOccLimitingFactors limitingFactors = 0;

            cudaOccPartitionedGCConfig gcConfig = cudaOccPartitionedGCConfig.Off;

            // Original C null/range checks; input validation is done via
            // cudaOccInputCheck (which throws) in this port instead.
            //if (!result || !properties || !attributes || !state || blockSize <= 0) {
            //	return CUDA_OCC_ERROR_INVALID_INPUT;
            //}

            ///////////////////////////
            // Check user input
            ///////////////////////////

            cudaOccInputCheck(properties, attributes, state);

            ///////////////////////////
            // Initialization
            ///////////////////////////

            gcConfig = cudaOccPartitionedGCExpected(properties, attributes);

            ///////////////////////////
            // Compute occupancy
            ///////////////////////////

            // Limits due to registers/SM
            // Also compute if partitioned global caching has to be turned off
            // (the regs-limit helper may flip gcConfig back to Off by ref).
            //
            ctaLimitRegs = cudaOccMaxBlocksPerSMRegsLimit(ref gcConfig, result, properties, attributes, blockSize);

            // Limits due to warps/SM
            //
            ctaLimitWarps = cudaOccMaxBlocksPerSMWarpsLimit(gcConfig, properties, attributes, blockSize);

            // Limits due to blocks/SM
            //
            ctaLimitBlocks = cudaOccMaxBlocksPerMultiprocessor(properties);

            // Limits due to shared memory/SM
            //
            ctaLimitSMem = cudaOccMaxBlocksPerSMSmemLimit(result, properties, attributes, state, blockSize, dynamicSmemSize);

            ///////////////////////////
            // Overall occupancy
            ///////////////////////////

            // Overall limit is min() of limits due to above reasons
            //
            ctaLimit = __occMin(ctaLimitRegs, __occMin(ctaLimitSMem, __occMin(ctaLimitWarps, ctaLimitBlocks)));

            // Fill in the return values
            //
            // Determine occupancy limiting factors
            // (several flags may be set when limits coincide)
            //
            if (ctaLimit == ctaLimitWarps) {
                limitingFactors |= cudaOccLimitingFactors.Warps;
            }
            if (ctaLimit == ctaLimitRegs) {
                limitingFactors |= cudaOccLimitingFactors.Registers;
            }
            if (ctaLimit == ctaLimitSMem) {
                limitingFactors |= cudaOccLimitingFactors.SharedMemory;
            }
            if (ctaLimit == ctaLimitBlocks) {
                limitingFactors |= cudaOccLimitingFactors.Blocks;
            }
            result.LimitingFactors = limitingFactors;

            result.BlockLimitRegs      = ctaLimitRegs;
            result.BlockLimitSharedMem = ctaLimitSMem;
            result.BlockLimitWarps     = ctaLimitWarps;
            result.BlockLimitBlocks    = ctaLimitBlocks;
            result.partitionedGCConfig = gcConfig;

            // Final occupancy
            result.ActiveBlocksPerMultiProcessor = ctaLimit;
        }
Пример #37
0
        // Validate all user-supplied inputs; throws CudaOccupancyException for
        // the first check that fails.
        private static void cudaOccInputCheck(
            cudaOccDeviceProp properties,
            cudaOccFuncAttributes attributes,
            cudaOccDeviceState state)
        {
            cudaOccError status;

            status = cudaOccDevicePropCheck(properties);
            if (status != cudaOccError.None)
                throw new CudaOccupancyException(status);

            status = cudaOccFuncAttributesCheck(attributes);
            if (status != cudaOccError.None)
                throw new CudaOccupancyException(status);

            status = cudaOccDeviceStateCheck(state);
            if (status != cudaOccError.None)
                throw new CudaOccupancyException(status);
        }
Пример #38
0
        // Maximum resident CTAs per SM as limited by register usage.
        //
        // May flip gcConfig (by ref) back to Off when partitioned global caching
        // would yield zero occupancy and caching is not forced; also records the
        // per-CTA register allocation in result.AllocatedRegistersPerBlock.
        private static int cudaOccMaxBlocksPerSMRegsLimit(
			ref cudaOccPartitionedGCConfig  gcConfig,
			cudaOccResult         result,
			cudaOccDeviceProp     properties,
			cudaOccFuncAttributes attributes,
			int                   blockSize)
        {
            int allocationGranularity;
            int warpsAllocatedPerCTA;
            int regsAllocatedPerCTA;
            int regsAssumedPerCTA;
            int regsPerWarp;
            int regsAllocatedPerWarp;
            int numSubPartitions;
            int numRegsPerSubPartition;
            int numWarpsPerSubPartition;
            int numWarpsPerSM;
            int maxBlocks;

            allocationGranularity = cudaOccRegAllocationGranularity(
                properties,
                attributes.numRegs);   // Fermi requires special handling of certain register usage

            numSubPartitions = cudaOccSubPartitionsPerMultiprocessor(properties);

            warpsAllocatedPerCTA = __occDivideRoundUp(blockSize, properties.warpSize);

            // GPUs of compute capability 2.x and higher allocate registers to warps
            //
            // Number of regs per warp is regs per thread x warp size, rounded up to
            // register allocation granularity
            //
            regsPerWarp          = attributes.numRegs * properties.warpSize;
            regsAllocatedPerWarp = __occRoundUp(regsPerWarp, allocationGranularity);
            regsAllocatedPerCTA  = regsAllocatedPerWarp * warpsAllocatedPerCTA;

            // Hardware verifies if a launch fits the per-CTA register limit. For
            // historical reasons, the verification logic assumes register
            // allocations are made to all partitions simultaneously. Therefore, to
            // simulate the hardware check, the warp allocation needs to be rounded
            // up to the number of partitions.
            //
            regsAssumedPerCTA = regsAllocatedPerWarp * __occRoundUp(warpsAllocatedPerCTA, numSubPartitions);

            if (properties.regsPerBlock < regsAssumedPerCTA ||   // Hardware check
                properties.regsPerBlock < regsAllocatedPerCTA) { // Software check
                maxBlocks = 0;
            }
            else {
                if (regsAllocatedPerWarp > 0) {
                    // Registers are allocated in each sub-partition. The max number
                    // of warps that can fit on an SM is equal to the max number of
                    // warps per sub-partition x number of sub-partitions.
                    //
                    numRegsPerSubPartition  = properties.regsPerMultiprocessor / numSubPartitions;
                    numWarpsPerSubPartition = numRegsPerSubPartition / regsAllocatedPerWarp;

                    maxBlocks = 0;

                    if (gcConfig != cudaOccPartitionedGCConfig.Off) {
                        int numSubPartitionsPerSmPartition;
                        int numWarpsPerSmPartition;
                        int maxBlocksPerSmPartition;

                        // If partitioned global caching is on, then a CTA can only
                        // use a half SM, and thus a half of the registers available
                        // per SM
                        //
                        numSubPartitionsPerSmPartition = numSubPartitions / 2;
                        numWarpsPerSmPartition         = numWarpsPerSubPartition * numSubPartitionsPerSmPartition;
                        maxBlocksPerSmPartition        = numWarpsPerSmPartition / warpsAllocatedPerCTA;
                        maxBlocks                      = maxBlocksPerSmPartition * 2;
                    }

                    // Try again if partitioned global caching is not enabled, or if
                    // the CTA cannot fit on the SM with caching on. In the latter
                    // case, the device will automatically turn off caching, except
                    // if the device forces it. The user can also override this
                    // assumption with PARTITIONED_GC_ON_STRICT to calculate
                    // occupancy and launch configuration.
                    //
                    {
                        bool gcOff = (gcConfig == cudaOccPartitionedGCConfig.Off);
                        bool zeroOccupancy = (maxBlocks == 0);
                        bool cachingForced = (gcConfig == cudaOccPartitionedGCConfig.OnStrict ||
                                             cudaOccPartitionedGCForced(properties));

                        if (gcOff || (zeroOccupancy && (!cachingForced))) {
                            gcConfig = cudaOccPartitionedGCConfig.Off;
                            numWarpsPerSM = numWarpsPerSubPartition * numSubPartitions;
                            maxBlocks     = numWarpsPerSM / warpsAllocatedPerCTA;
                        }
                    }
                }
                else {
                    // Kernel uses no registers: this resource imposes no limit.
                    maxBlocks = int.MaxValue;
                }
            }

            result.AllocatedRegistersPerBlock = regsAllocatedPerCTA;

            return maxBlocks;
        }
Пример #39
0
 /// <summary>
 /// Granularity of warp allocation: 2 for compute capability 1.x, otherwise 1.
 /// </summary>
 private static int cudaOccWarpAllocationMultiple(cudaOccDeviceProp properties)
 {
     if (properties.major <= 1)
         return 2;

     return 1;
 }
Пример #40
0
        private static int cudaOccMaxBlocksPerSMRegsLimit(
            ref cudaOccPartitionedGCConfig gcConfig,
            cudaOccResult result,
            cudaOccDeviceProp properties,
            cudaOccFuncAttributes attributes,
            int blockSize)
        {
            int allocationGranularity;
            int warpsAllocatedPerCTA;
            int regsAllocatedPerCTA;
            int regsAssumedPerCTA;
            int regsPerWarp;
            int regsAllocatedPerWarp;
            int numSubPartitions;
            int numRegsPerSubPartition;
            int numWarpsPerSubPartition;
            int numWarpsPerSM;
            int maxBlocks;

            allocationGranularity = cudaOccRegAllocationGranularity(
                properties,
                attributes.numRegs);                   // Fermi requires special handling of certain register usage

            numSubPartitions = cudaOccSubPartitionsPerMultiprocessor(properties);

            warpsAllocatedPerCTA = __occDivideRoundUp(blockSize, properties.warpSize);

            // GPUs of compute capability 2.x and higher allocate registers to warps
            //
            // Number of regs per warp is regs per thread x warp size, rounded up to
            // register allocation granularity
            //
            regsPerWarp          = attributes.numRegs * properties.warpSize;
            regsAllocatedPerWarp = __occRoundUp(regsPerWarp, allocationGranularity);
            regsAllocatedPerCTA  = regsAllocatedPerWarp * warpsAllocatedPerCTA;

            // Hardware verifies if a launch fits the per-CTA register limit. For
            // historical reasons, the verification logic assumes register
            // allocations are made to all partitions simultaneously. Therefore, to
            // simulate the hardware check, the warp allocation needs to be rounded
            // up to the number of partitions.
            //
            regsAssumedPerCTA = regsAllocatedPerWarp * __occRoundUp(warpsAllocatedPerCTA, numSubPartitions);

            if (properties.regsPerBlock < regsAssumedPerCTA ||               // Hardware check
                properties.regsPerBlock < regsAllocatedPerCTA)               // Software check
            {
                maxBlocks = 0;
            }
            else
            {
                if (regsAllocatedPerWarp > 0)
                {
                    // Registers are allocated in each sub-partition. The max number
                    // of warps that can fit on an SM is equal to the max number of
                    // warps per sub-partition x number of sub-partitions.
                    //
                    numRegsPerSubPartition  = properties.regsPerMultiprocessor / numSubPartitions;
                    numWarpsPerSubPartition = numRegsPerSubPartition / regsAllocatedPerWarp;

                    maxBlocks = 0;

                    if (gcConfig != cudaOccPartitionedGCConfig.Off)
                    {
                        int numSubPartitionsPerSmPartition;
                        int numWarpsPerSmPartition;
                        int maxBlocksPerSmPartition;

                        // If partitioned global caching is on, then a CTA can only
                        // use a half SM, and thus a half of the registers available
                        // per SM
                        //
                        numSubPartitionsPerSmPartition = numSubPartitions / 2;
                        numWarpsPerSmPartition         = numWarpsPerSubPartition * numSubPartitionsPerSmPartition;
                        maxBlocksPerSmPartition        = numWarpsPerSmPartition / warpsAllocatedPerCTA;
                        maxBlocks = maxBlocksPerSmPartition * 2;
                    }

                    // Try again if partitioned global caching is not enabled, or if
                    // the CTA cannot fit on the SM with caching on. In the latter
                    // case, the device will automatically turn off caching, except
                    // if the device forces it. The user can also override this
                    // assumption with PARTITIONED_GC_ON_STRICT to calculate
                    // occupancy and launch configuration.
                    //
                    {
                        bool gcOff         = (gcConfig == cudaOccPartitionedGCConfig.Off);
                        bool zeroOccupancy = (maxBlocks == 0);
                        bool cachingForced = (gcConfig == cudaOccPartitionedGCConfig.OnStrict ||
                                              cudaOccPartitionedGCForced(properties));

                        if (gcOff || (zeroOccupancy && (!cachingForced)))
                        {
                            gcConfig      = cudaOccPartitionedGCConfig.Off;
                            numWarpsPerSM = numWarpsPerSubPartition * numSubPartitions;
                            maxBlocks     = numWarpsPerSM / warpsAllocatedPerCTA;
                        }
                    }
                }
                else
                {
                    maxBlocks = int.MaxValue;
                }
            }


            result.AllocatedRegistersPerBlock = regsAllocatedPerCTA;

            return(maxBlocks);
        }
Пример #41
0
        ///////////////////////////////////
        //      API Implementations      //
        ///////////////////////////////////


        /// <summary>
        /// Determine the maximum number of CTAs that can be run simultaneously per SM.<para/>
        /// This is equivalent to the calculation done in the CUDA Occupancy Calculator
        /// spreadsheet. Fills <paramref name="result"/> with the per-resource block
        /// limits, the limiting factors, and the final active block count.
        /// </summary>
        /// <param name="result">Receives the computed occupancy figures</param>
        /// <param name="properties">Device properties of the target GPU</param>
        /// <param name="attributes">Attributes of the kernel function</param>
        /// <param name="state">Device state (e.g. selected cache configuration)</param>
        /// <param name="blockSize">Number of threads per block</param>
        /// <param name="dynamicSmemSize">Dynamically allocated shared memory per block in bytes</param>
        public static void cudaOccMaxActiveBlocksPerMultiprocessor(
            cudaOccResult result,
            cudaOccDeviceProp properties,
            cudaOccFuncAttributes attributes,
            cudaOccDeviceState state,
            int blockSize,
            SizeT dynamicSmemSize)
        {
            // Validate user input up front; throws CudaOccupancyException on bad data.
            cudaOccInputCheck(properties, attributes, state);

            // Expected partitioned global caching configuration for this
            // kernel/device combination.
            cudaOccPartitionedGCConfig gcConfig = cudaOccPartitionedGCExpected(properties, attributes);

            // Per-resource CTA limits. The register limit is computed first and
            // may switch partitioned global caching off (gcConfig is by-ref).
            int ctaLimitRegs   = cudaOccMaxBlocksPerSMRegsLimit(ref gcConfig, result, properties, attributes, blockSize);
            int ctaLimitWarps  = cudaOccMaxBlocksPerSMWarpsLimit(gcConfig, properties, attributes, blockSize);
            int ctaLimitBlocks = cudaOccMaxBlocksPerMultiprocessor(properties);
            int ctaLimitSMem   = cudaOccMaxBlocksPerSMSmemLimit(result, properties, attributes, state, blockSize, dynamicSmemSize);

            // The overall occupancy is the tightest of the individual limits.
            int ctaLimit = __occMin(ctaLimitRegs, __occMin(ctaLimitSMem, __occMin(ctaLimitWarps, ctaLimitBlocks)));

            // Record every resource whose limit coincides with the overall limit;
            // several factors can be limiting simultaneously.
            cudaOccLimitingFactors limitingFactors = 0;
            if (ctaLimit == ctaLimitWarps)
            {
                limitingFactors |= cudaOccLimitingFactors.Warps;
            }
            if (ctaLimit == ctaLimitRegs)
            {
                limitingFactors |= cudaOccLimitingFactors.Registers;
            }
            if (ctaLimit == ctaLimitSMem)
            {
                limitingFactors |= cudaOccLimitingFactors.SharedMemory;
            }
            if (ctaLimit == ctaLimitBlocks)
            {
                limitingFactors |= cudaOccLimitingFactors.Blocks;
            }

            // Publish all intermediate limits alongside the final figure.
            result.LimitingFactors     = limitingFactors;
            result.BlockLimitRegs      = ctaLimitRegs;
            result.BlockLimitSharedMem = ctaLimitSMem;
            result.BlockLimitWarps     = ctaLimitWarps;
            result.BlockLimitBlocks    = ctaLimitBlocks;
            result.partitionedGCConfig = gcConfig;

            // Final occupancy in blocks per multiprocessor.
            result.ActiveBlocksPerMultiProcessor = ctaLimit;
        }
Пример #42
0
        /// <summary>
        /// Tries every legal (warp-granular) block size and reports the one with
        /// the highest occupancy, together with the minimum grid size needed to
        /// achieve a full machine launch at that block size.
        /// </summary>
        /// <param name="minGridSize">Receives the smallest grid size for a full machine launch</param>
        /// <param name="blockSize">Receives the block size with maximum occupancy</param>
        /// <param name="properties">Device properties of the target GPU</param>
        /// <param name="attributes">Attributes of the kernel function</param>
        /// <param name="state">Device state (e.g. selected cache configuration)</param>
        /// <param name="blockSizeToDynamicSMemSize">Optional mapping from block size to dynamic shared memory; when non-null it overrides <paramref name="dynamicSMemSize"/></param>
        /// <param name="dynamicSMemSize">Fixed dynamic shared memory per block, used when no mapping is supplied</param>
        public static void cudaOccMaxPotentialOccupancyBlockSize(
            ref int minGridSize,
            ref int blockSize,
            cudaOccDeviceProp properties,
            cudaOccFuncAttributes attributes,
            cudaOccDeviceState state,
            del_blockSizeToDynamicSMemSize blockSizeToDynamicSMemSize,
            SizeT dynamicSMemSize)
        {
            // Validate user input; throws CudaOccupancyException on bad data.
            cudaOccInputCheck(properties, attributes, state);

            cudaOccResult result = new cudaOccResult();

            // Occupancy can never exceed the thread capacity of a single SM;
            // candidate block sizes are stepped at warp granularity.
            int occupancyLimit = properties.maxThreadsPerMultiProcessor;
            int granularity    = properties.warpSize;

            int blockSizeLimit        = __occMin(properties.maxThreadsPerBlock, attributes.maxThreadsPerBlock);
            int blockSizeLimitAligned = __occRoundUp(blockSizeLimit, granularity);

            // Best candidate recorded so far.
            int bestBlockSize = 0;
            int bestBlocks    = 0;
            int bestThreads   = 0;

            // Walk downwards from the largest warp-aligned candidate.
            for (int candidateAligned = blockSizeLimitAligned; candidateAligned > 0; candidateAligned -= granularity)
            {
                // The first (aligned-up) candidate may exceed the true limit; clamp it.
                int candidate = __occMin(blockSizeLimit, candidateAligned);

                // A user-supplied mapping takes precedence over the fixed size.
                if (blockSizeToDynamicSMemSize != null)
                {
                    dynamicSMemSize = blockSizeToDynamicSMemSize(candidate);
                }

                cudaOccMaxActiveBlocksPerMultiprocessor(
                    result,
                    properties,
                    attributes,
                    state,
                    candidate,
                    dynamicSMemSize);

                int blocksPerSM  = result.ActiveBlocksPerMultiProcessor;
                int threadsPerSM = candidate * blocksPerSM;

                // Keep the candidate with the highest occupancy in threads.
                if (threadsPerSM > bestThreads)
                {
                    bestBlockSize = candidate;
                    bestBlocks    = blocksPerSM;
                    bestThreads   = threadsPerSM;
                }

                // Nothing can beat a fully occupied SM; stop searching early.
                if (occupancyLimit == bestThreads)
                {
                    break;
                }
            }

            // Suggested minimum grid size to achieve a full machine launch.
            minGridSize = bestBlocks * properties.numSms;
            blockSize   = bestBlockSize;
        }
Пример #43
0
        /// <summary>
        /// Determine the maximum number of CTAs that can be run simultaneously per SM.<para/>
        /// This is equivalent to the calculation done in the CUDA Occupancy Calculator
        /// spreadsheet.<para/>
        /// Legacy overload: computes the warp, shared-memory, register and block
        /// limits inline (instead of delegating to the per-resource helpers) and
        /// returns a freshly allocated result object.
        /// </summary>
        /// <param name="properties">Device properties of the target GPU</param>
        /// <param name="attributes">Attributes of the kernel function</param>
        /// <param name="blockSize">Number of threads per block; must be positive</param>
        /// <param name="dynamic_smem_bytes">Dynamically allocated shared memory per block in bytes</param>
        /// <param name="state">Device state (the selected cache configuration)</param>
        /// <returns>A new <c>cudaOccResult</c> describing the occupancy limits</returns>
        public static cudaOccResult cudaOccMaxActiveBlocksPerMultiprocessor(
            cudaOccDeviceProp properties,
            cudaOccFuncAttributes attributes,
            int blockSize,
            SizeT dynamic_smem_bytes,
            cudaOccDeviceState state)
        {
            int regAllocationUnit = 0, warpAllocationMultiple = 0, maxBlocksPerSM = 0;
            int ctaLimitWarps = 0, ctaLimitBlocks = 0, smemPerCTA = 0, smemBytes = 0, smemAllocationUnit = 0;
            int cacheConfigSMem = 0, sharedMemPerMultiprocessor = 0, ctaLimitRegs = 0, regsPerCTA = 0;
            int regsPerWarp = 0, numSides = 0, numRegsPerSide = 0, ctaLimit = 0;
            int maxWarpsPerSm = 0, warpsPerCTA = 0, ctaLimitSMem = 0;
            cudaOccLimitingFactors limitingFactors = 0;
            cudaOccResult          result          = new cudaOccResult();

            if (properties == null || attributes == null || blockSize <= 0)
            {
                throw new CudaOccupancyException(cudaOccError.ErrorInvalidInput);
            }

            //////////////////////////////////////////
            // Limits due to warps/SM or blocks/SM
            //////////////////////////////////////////
            CudaOccupancyException.CheckZero(properties.warpSize);
            maxWarpsPerSm          = properties.maxThreadsPerMultiProcessor / properties.warpSize;
            warpAllocationMultiple = cudaOccWarpAllocationMultiple(properties);

            CudaOccupancyException.CheckZero(warpAllocationMultiple);
            // Warps per CTA, rounded up to the device's warp allocation granularity.
            warpsPerCTA = round_i(divide_ri(blockSize, properties.warpSize), warpAllocationMultiple);

            maxBlocksPerSM = cudaOccMaxBlocksPerMultiprocessor(properties);

            // Calc limits; a block size above the per-block thread limit cannot run at all.
            CudaOccupancyException.CheckZero(warpsPerCTA);
            ctaLimitWarps  = (blockSize <= properties.maxThreadsPerBlock) ? maxWarpsPerSm / warpsPerCTA : 0;
            ctaLimitBlocks = maxBlocksPerSM;

            //////////////////////////////////////////
            // Limits due to shared memory/SM
            //////////////////////////////////////////
            smemAllocationUnit = cudaOccSMemAllocationUnit(properties);
            // Total per-CTA shared memory = static (from the kernel) + dynamic (launch argument).
            smemBytes          = (int)(attributes.sharedSizeBytes + dynamic_smem_bytes);
            CudaOccupancyException.CheckZero(smemAllocationUnit);
            // Round up to the shared memory allocation granularity.
            smemPerCTA = round_i(smemBytes, smemAllocationUnit);

            // Calc limit
            cacheConfigSMem = cudaOccSMemPerMultiprocessor(properties, state.cacheConfig);

            // sharedMemoryPerMultiprocessor is by default limit set in hardware but user requested shared memory
            // limit is used instead if it is greater than total shared memory used by function .
            sharedMemPerMultiprocessor = (cacheConfigSMem >= smemPerCTA)
                                ? cacheConfigSMem
                                : (int)properties.sharedMemPerMultiprocessor;
            // Limit on blocks launched should be calculated with shared memory per SM but total shared memory
            // used by function should be limited by shared memory per block
            ctaLimitSMem = 0;
            if (properties.sharedMemPerBlock >= (SizeT)smemPerCTA)
            {
                // A CTA using no shared memory is not limited by it at all.
                ctaLimitSMem = smemPerCTA > 0 ? sharedMemPerMultiprocessor / smemPerCTA : maxBlocksPerSM;
            }

            //////////////////////////////////////////
            // Limits due to registers/SM
            //////////////////////////////////////////
            regAllocationUnit = cudaOccRegAllocationUnit(properties, attributes.numRegs);
            CudaOccupancyException.CheckZero(regAllocationUnit);

            // Calc limit
            ctaLimitRegs = 0;
            if (properties.major <= 1)
            {
                // GPUs of compute capability 1.x allocate registers to CTAs
                // Number of regs per block is regs per thread times number of warps times warp size, rounded up to allocation unit
                regsPerCTA   = round_i(attributes.numRegs * properties.warpSize * warpsPerCTA, regAllocationUnit);
                ctaLimitRegs = regsPerCTA > 0 ? properties.regsPerMultiprocessor / regsPerCTA : maxBlocksPerSM;
            }
            else
            {
                // GPUs of compute capability 2.x and higher allocate registers to warps
                // Number of regs per warp is regs per thread times number of warps times warp size, rounded up to allocation unit
                regsPerWarp = round_i(attributes.numRegs * properties.warpSize, regAllocationUnit);
                regsPerCTA  = regsPerWarp * warpsPerCTA;
                if (properties.regsPerBlock >= regsPerCTA)
                {
                    // Registers are banked per "side" of the SM; compute warps per
                    // side first, then blocks for the whole SM.
                    numSides = cudaOccSidesPerMultiprocessor(properties);
                    CudaOccupancyException.CheckZero(numSides);
                    numRegsPerSide = properties.regsPerMultiprocessor / numSides;
                    ctaLimitRegs   = regsPerWarp > 0 ? ((numRegsPerSide / regsPerWarp) * numSides) / warpsPerCTA : maxBlocksPerSM;
                }
                // else: the CTA exceeds the per-block register limit; ctaLimitRegs stays 0.
            }

            //////////////////////////////////////////
            // Overall limit is min() of limits due to above reasons
            //////////////////////////////////////////
            ctaLimit = min_(ctaLimitRegs, min_(ctaLimitSMem, min_(ctaLimitWarps, ctaLimitBlocks)));
            // Determine occupancy limiting factors


            result.ActiveBlocksPerMultiProcessor = ctaLimit;

            // A factor only counts as limiting when the resource is actually used
            // (regsPerCTA/smemPerCTA > 0), so zero-usage kernels are not flagged.
            if (ctaLimit == ctaLimitWarps)
            {
                limitingFactors |= cudaOccLimitingFactors.Warps;
            }
            if (ctaLimit == ctaLimitRegs && regsPerCTA > 0)
            {
                limitingFactors |= cudaOccLimitingFactors.Registers;
            }
            if (ctaLimit == ctaLimitSMem && smemPerCTA > 0)
            {
                limitingFactors |= cudaOccLimitingFactors.SharedMemory;
            }
            if (ctaLimit == ctaLimitBlocks)
            {
                limitingFactors |= cudaOccLimitingFactors.Blocks;
            }
            result.LimitingFactors = limitingFactors;

            result.BlockLimitRegs      = ctaLimitRegs;
            result.BlockLimitSharedMem = ctaLimitSMem;
            result.BlockLimitWarps     = ctaLimitWarps;
            result.BlockLimitBlocks    = ctaLimitBlocks;

            // NOTE(review): property name appears misspelled ("Bllocated...") —
            // it is declared elsewhere in the project; verify against cudaOccResult.
            result.BllocatedRegistersPerBlock = regsPerCTA;
            result.AllocatedSharedMemPerBlock = smemPerCTA;

            // Derived figures: warps/threads per SM and occupancy as a percentage
            // of the SM's maximum resident warps.
            result.ActiveWarpsPerMultiProcessor   = ctaLimit * ((int)Math.Ceiling(blockSize / (double)properties.warpSize));
            result.ActiceThreadsPerMultiProcessor = result.ActiveWarpsPerMultiProcessor * properties.warpSize;
            result.OccupancyOfEachMultiProcessor  = (int)Math.Round(result.ActiveWarpsPerMultiProcessor / (double)maxWarpsPerSm * 100);
            return(result);
        }
Пример #44
0
		//////////////////////////////////////////
		//    Occupancy Helper Functions        //
		//////////////////////////////////////////

		/*!
		 * Granularity of shared memory allocation, in bytes.
		 * Shared memory is reserved in fixed-size chunks whose size depends on
		 * the device generation (compute capability major version).
		 */
		private static int cudaOccSMemAllocationUnit(cudaOccDeviceProp properties)
		{
			if (properties.major == 1)
			{
				return 512;
			}
			if (properties.major == 2)
			{
				return 128;
			}
			if (properties.major == 3 || properties.major == 5)
			{
				return 256;
			}
			// Any other capability is not covered by this calculator version.
			throw new CudaOccupancyException(cudaOccError.ErrorUnknownDevice);
		}
Пример #45
0
 ///*!
 // * Granularity of warp allocation
 // */
 //private static int cudaOccWarpAllocationMultiple(cudaOccDeviceProp properties)
 //{
 //    return (properties.major <= 1) ? 2 : 1;
 //}
 /*!
  * Number of "sides" (sub-partitions) into which the multiprocessor is
  * partitioned, by compute capability major version.
  */
 private static int cudaOccSubPartitionsPerMultiprocessor(cudaOccDeviceProp properties)
 {
     if (properties.computeMajor == 2)
     {
         return 2;
     }
     if (properties.computeMajor == 3 ||
         properties.computeMajor == 5 ||
         properties.computeMajor == 6)
     {
         return 4;
     }
     // Any other capability is not covered by this calculator version.
     throw new CudaOccupancyException(cudaOccError.ErrorUnknownDevice);
 }
Пример #46
0
        ///////////////////////////////////////////////
        //    Occupancy calculation Functions        //
        ///////////////////////////////////////////////

        // True when the device's global caching mode support is AlwaysOn,
        // i.e. partitioned global caching is forced regardless of the
        // kernel's own preference.
        private static bool cudaOccPartitionedGCForced(cudaOccDeviceProp properties)
        {
            return cudaOccPartitionedGlobalCachingModeSupport(properties) == cudaOccPartitionedGCSupport.AlwaysOn;
        }