/// <summary>
/// Sample kernel that shows the built-in warp-level reduction.
/// Every thread feeds the constant 1 into an additive warp reduction and
/// stores the value it gets back at its own global position.
/// </summary>
/// <param name="index">The grouped index of the current thread (1D).</param>
/// <param name="dataView">The target memory view (1D); one element is written per thread.</param>
static void ReduceKernel(
    GroupedIndex index,
    ArrayView<int> dataView)
{
    // Flatten the grouped index into the element this thread writes to.
    var targetIndex = index.ComputeGlobalIndex();

    // Shuffle and reduction logic handed to the warp primitive.
    var shuffle = new ShuffleDownInt32();
    var addition = new AddInt32();

    // Warp.Reduce combines the per-lane inputs within a single warp.
    // Only the first lane of each warp ends up with the reduced value;
    // Warp.AllReduce<...> is the variant to use when every lane needs it.
    // Since each lane contributes 1, the reduced value is presumably the
    // number of participating lanes.
    var reduced = Warp.Reduce(1, shuffle, addition);

    dataView[targetIndex] = reduced;
}
/// <summary>
/// Implements a basic block-wide reduction algorithm.
/// The algorithm is based on the one from
/// https://devblogs.nvidia.com/parallelforall/faster-parallel-reductions-kepler/.
/// </summary>
/// <remarks>
/// The reduction runs in two stages: every warp reduces its own values and
/// its first lane stores the partial result in <paramref name="sharedMemory"/>;
/// after a group barrier the first warp reduces these partial results.
/// Preconditions (checked via debug assertions):
/// the warp size is greater than 1, the group size is a multiple of the warp
/// size, the number of warps does not exceed the warp size, and
/// <paramref name="sharedMemory"/> offers one element per warp.
/// </remarks>
/// <typeparam name="T">The element type.</typeparam>
/// <typeparam name="TShuffleDown">The type of the shuffle logic.</typeparam>
/// <typeparam name="TReduction">The type of the reduction logic.</typeparam>
/// <param name="groupThreadIdx">The current group-thread index.</param>
/// <param name="value">The current value.</param>
/// <param name="shuffleDown">The shuffle logic.</param>
/// <param name="reduction">The reduction logic.</param>
/// <param name="sharedMemory">
/// A view to a section of group-shared memory with at least one element per warp.
/// </param>
/// <returns>
/// The reduced value. Since a warp reduction delivers its result to the first
/// lane only, the group-wide result is only meaningful in the first thread of
/// the first warp; all other threads return intermediate values.
/// </returns>
public static T Reduce<T, TShuffleDown, TReduction>(
    Index groupThreadIdx,
    T value,
    TShuffleDown shuffleDown,
    TReduction reduction,
    ArrayView<T> sharedMemory)
    where T : struct
    where TShuffleDown : IShuffleDown<T>
    where TReduction : IReduction<T>
{
    Debug.Assert(Warp.WarpSize > 1, "This algorithm can only be used on architectures with a warp size > 1");

    // The warp count below is computed with a truncating division. A trailing
    // partial warp (or a group smaller than a single warp) would therefore be
    // dropped silently from the final result.
    Debug.Assert(
        Group.Dimension.X % Warp.WarpSize == 0,
        "The group size must be a multiple of the warp size");

    // The second stage is performed by the first warp only, which can combine
    // at most one partial result per lane.
    Debug.Assert(
        Group.Dimension.X / Warp.WarpSize <= Warp.WarpSize,
        "The number of warps in the group must not exceed the warp size");

    var warpIdx = Warp.ComputeWarpIdx(groupThreadIdx);
    var laneIdx = Warp.LaneIdx;

    // Stage 1: reduce within each warp and publish the partial result.
    value = Warp.Reduce(value, shuffleDown, reduction);

    if (laneIdx == 0)
    {
        Debug.Assert(warpIdx < sharedMemory.Length, "Shared memory out of range");
        sharedMemory[warpIdx] = value;
    }

    // Every warp has to publish its partial result before it is read back.
    Group.Barrier();

    // Stage 2: the first threads of the group pick up one partial result each;
    // all remaining threads contribute the neutral element.
    var numWarps = Group.Dimension.X / Warp.WarpSize;
    if (groupThreadIdx < numWarps)
    {
        value = sharedMemory[laneIdx];
    }
    else
    {
        value = reduction.NeutralElement;
    }

    // Only the first warp combines the partial results into the final value.
    if (warpIdx == 0)
    {
        value = Warp.Reduce(value, shuffleDown, reduction);
    }

    return value;
}