Example #1
0
        /// <summary>
        /// Sample kernel that feeds the constant 1 from every thread into the
        /// built-in warp-wide reduction and stores the outcome per thread.
        /// </summary>
        /// <param name="index">The current (grouped) thread index.</param>
        /// <param name="dataView">The view that receives one result element per thread.</param>
        static void ReduceKernel(
            GroupedIndex index,
            ArrayView<int> dataView)
        {
            // Resolve the flat position of this thread inside the data view.
            var targetIndex = index.ComputeGlobalIndex();

            // The building blocks of the reduction: how values travel between
            // lanes (shuffle down) and how two values are combined (addition).
            var shuffleLogic = new ShuffleDownInt32();
            var addLogic = new AddInt32();

            // Every lane contributes a 1. Only the first lane of each warp ends
            // up holding the fully reduced value; Warp.AllReduce<...> is the
            // variant to use when every lane needs the result.
            dataView[targetIndex] = Warp.Reduce(1, shuffleLogic, addLogic);
        }
        /// <summary>
        /// Implements a basic block-wide (group-wide) reduction algorithm in two stages:
        /// every warp first reduces its own values, lane 0 of each warp publishes the
        /// partial result to shared memory, and the first warp then reduces those partials.
        /// The algorithm is based on the one from https://devblogs.nvidia.com/parallelforall/faster-parallel-reductions-kepler/.
        /// </summary>
        /// <remarks>
        /// The number of participating warps is rounded up, so that the partial result
        /// of a trailing, partially filled warp (group size not a multiple of the warp
        /// size) is included instead of being silently dropped.
        /// NOTE(review): this presumes that <c>Warp.Reduce</c> yields a correct partial
        /// result in lane 0 of a partially filled warp - confirm for the target backend.
        /// The fully reduced value is only produced by the first warp; all other threads
        /// return either a warp-level partial or the neutral element.
        /// </remarks>
        /// <typeparam name="T">The element type.</typeparam>
        /// <typeparam name="TShuffleDown">The type of the shuffle logic.</typeparam>
        /// <typeparam name="TReduction">The type of the reduction logic.</typeparam>
        /// <param name="groupThreadIdx">The current group-thread index.</param>
        /// <param name="value">The current value.</param>
        /// <param name="shuffleDown">The shuffle logic.</param>
        /// <param name="reduction">The reduction logic.</param>
        /// <param name="sharedMemory">
        /// A view to a section of group-shared memory. It must provide at least one
        /// element per warp of the group (checked via a debug assertion).
        /// </param>
        /// <returns>The reduced value.</returns>
        public static T Reduce<T, TShuffleDown, TReduction>(
            Index groupThreadIdx,
            T value,
            TShuffleDown shuffleDown,
            TReduction reduction,
            ArrayView<T> sharedMemory)
            where T : struct
            where TShuffleDown : IShuffleDown<T>
            where TReduction : IReduction<T>
        {
            Debug.Assert(Warp.WarpSize > 1, "This algorithm can only be used on architectures with a warp size > 1");

            var warpIdx = Warp.ComputeWarpIdx(groupThreadIdx);
            var laneIdx = Warp.LaneIdx;

            // Stage 1: reduce the values within each warp.
            value = Warp.Reduce(value, shuffleDown, reduction);

            // The first lane of every warp publishes its warp's partial result.
            if (laneIdx == 0)
            {
                Debug.Assert(warpIdx < sharedMemory.Length, "Shared memory out of range");
                sharedMemory[warpIdx] = value;
            }

            // All partial results must be visible before they are read back.
            Group.Barrier();

            // Round up so that a trailing, partially filled warp is counted as well.
            // A truncating division would skip its partial result (and would yield
            // zero warps for groups that are smaller than a single warp).
            var numWarps = (Group.Dimension.X + Warp.WarpSize - 1) / Warp.WarpSize;

            // Threads that map to a stored partial result load it; all remaining
            // threads contribute the neutral element to the second stage.
            value = groupThreadIdx < numWarps
                ? sharedMemory[laneIdx]
                : reduction.NeutralElement;

            // Stage 2: the first warp combines the per-warp partial results.
            if (warpIdx == 0)
            {
                value = Warp.Reduce(value, shuffleDown, reduction);
            }

            return value;
        }