Example #1
0
        ////////////////////////////////////////////////////////////////////////////////
        // Potential occupancy calculator
        //
        // The potential occupancy is calculated according to the kernel and
        // execution configuration the user desires. Occupancy is defined in
        // terms of active blocks per multiprocessor, and the user can convert
        // it to other metrics.
        //
        // This wrapper routine computes the occupancy of the kernel and reports
        // it in terms of active warps / maximum warps per SM.
        ////////////////////////////////////////////////////////////////////////////////
        // Parameters:
        //   blockSize   - threads per block of the launch being evaluated.
        //   dynamicSMem - dynamic shared memory per block handed to the
        //                 occupancy calculator (presumably in bytes; confirm
        //                 against the CudaOccupancy API).
        // Returns:
        //   activeWarps / maxWarps as a fraction (multiply by 100 for percent).
        static double reportPotentialOccupancy(int blockSize, SizeT dynamicSMem)
        {
            // Inputs for the occupancy calculator. The device properties are
            // built with argument 0 (presumably device ordinal 0), and the
            // function attributes come from the class-level kernel.
            CudaOccupancy.cudaOccDeviceProp prop = new CudaOccupancy.cudaOccDeviceProp(0);

            CudaOccupancy.cudaOccResult         result     = new CudaOccupancy.cudaOccResult();
            CudaOccupancy.cudaOccFuncAttributes attributes = new CudaOccupancy.cudaOccFuncAttributes(kernel);
            CudaOccupancy.cudaOccDeviceState    state      = new CudaOccupancy.cudaOccDeviceState();
            state.cacheConfig = CudaOccupancy.cudaOccCacheConfig.PreferNone;

            // Fills 'result' with the number of blocks of this kernel that can
            // be resident on one multiprocessor for the given configuration.
            CudaOccupancy.cudaOccMaxActiveBlocksPerMultiprocessor(result, prop, attributes, state, blockSize, dynamicSMem);

            int numBlocks = result.ActiveBlocksPerMultiProcessor;

            // Convert resident blocks to resident warps (integer division on
            // purpose, as in the original computation), then normalise by the
            // maximum number of warps a multiprocessor can hold.
            int activeWarps = numBlocks * blockSize / prop.warpSize;
            int maxWarps    = prop.maxThreadsPerMultiProcessor / prop.warpSize;

            return (double)activeWarps / maxWarps;
        }
Example #2
0
        ////////////////////////////////////////////////////////////////////////////////
        // Occupancy-based launch configurator
        //
        // The launch configurator, cudaOccupancyMaxPotentialBlockSize and
        // cudaOccupancyMaxPotentialBlockSizeVariableSMem, suggests a block
        // size that achieves the best theoretical occupancy. It also returns
        // the minimum number of blocks needed to achieve the occupancy on the
        // whole device.
        //
        // This launch configurator is purely occupancy-based. It doesn't
        // translate directly to performance, but the suggestion should
        // nevertheless be a good starting point for further optimizations.
        //
        // This function configures the launch based on the "automatic"
        // argument, records the runtime, and reports occupancy and runtime.
        ////////////////////////////////////////////////////////////////////////////////
        // Parameters:
        //   array      - device buffer passed to the kernel as its first argument.
        //   arrayCount - element count; used to size the grid and passed to the
        //                kernel as its second argument.
        //   automatic  - true: ask the occupancy calculator for a block size;
        //                false: use the class-level manualBlockSize.
        // Returns:
        //   Always 0.
        static int launchConfig(CudaDeviceVariable <int> array, int arrayCount, bool automatic)
        {
            int   blockSize        = 0;
            int   minGridSize      = 0;
            SizeT dynamicSMemUsage = 0;

            CudaOccupancy.cudaOccDeviceState state = new CudaOccupancy.cudaOccDeviceState();
            state.cacheConfig = CudaOccupancy.cudaOccCacheConfig.PreferNone;

            if (automatic)
            {
                // Let the occupancy calculator suggest a block size (and the
                // grid size that goes with it) for this kernel.
                CudaOccupancy.cudaOccMaxPotentialOccupancyBlockSize(ref minGridSize, ref blockSize, new CudaOccupancy.cudaOccDeviceProp(0), new CudaOccupancy.cudaOccFuncAttributes(kernel), state, dynamicSMemUsage);

                Console.WriteLine("Suggested block size: {0}", blockSize);
                Console.WriteLine("Minimum grid size for maximum occupancy: {0}", minGridSize);
            }
            else
            {
                // This block size is too small. Given limited number of
                // active blocks per multiprocessor, the number of active
                // threads will be limited, and thus unable to achieve maximum
                // occupancy.
                //
                blockSize = manualBlockSize;
            }

            // Round up so that gridSize * blockSize covers all arrayCount elements.
            //
            int gridSize = (arrayCount + blockSize - 1) / blockSize;

            // Launch and profile
            //
            kernel.GridDimensions  = gridSize;
            kernel.BlockDimensions = blockSize;
            float elapsedTime      = kernel.Run(array.DevicePointer, arrayCount);

            // Calculate occupancy
            //
            double potentialOccupancy = reportPotentialOccupancy(blockSize, dynamicSMemUsage);

            // Occupancy is a fraction; scale to a percentage for display.
            Console.WriteLine("Potential occupancy: {0}%", potentialOccupancy * 100);

            // Report elapsed time
            //
            // The value returned by kernel.Run is printed unscaled: the label
            // is "ms", and the previous "* 100" (apparently copied from the
            // percentage line above) inflated the reported time 100-fold.
            Console.WriteLine("Elapsed time: {0}ms", elapsedTime);

            return(0);
        }