/// <summary>
/// Returns a reference to the RNG provider assigned to the calling warp.
/// Providers are laid out as one entry per warp over the whole grid:
/// grid-group index * warps-per-group + warp-in-group.
/// </summary>
private readonly ref TRandomProvider GetRandomProvider()
{
    // Identify the calling thread's warp within its group.
    int linearGroupIndex = Group.LinearIndex;
    int warpIndexInGroup = Warp.ComputeWarpIdx(linearGroupIndex);

    // Warps per group, rounded up so a partial warp still gets a slot.
    int warpsPerGroup = XMath.DivRoundUp(Group.Dimension.Size, Warp.WarpSize);

    // Flatten (group index, warp index) into the provider array.
    int globalWarpIndex = Grid.LinearIndex * warpsPerGroup + warpIndexInGroup;

    Trace.Assert(
        globalWarpIndex < randomProviders.Length,
        "Current warp does not have a valid RNG provider");
    return ref randomProviders[globalWarpIndex];
}
/// <summary>
/// Performs a group-wide all-reduce: every thread in the group receives
/// the reduction of all values contributed by the group's threads.
/// </summary>
/// <typeparam name="T">The element type.</typeparam>
/// <typeparam name="TReduction">The reduction logic.</typeparam>
/// <param name="value">This thread's contribution.</param>
/// <returns>The reduced value, identical on all threads of the group.</returns>
public static T AllReduce<T, TReduction>(T value)
    where T : unmanaged
    where TReduction : IScanReduceOperation<T>
{
    // Spread the atomic updates over several shared-memory slots to
    // lower contention on a single cell.
    const int NumMemoryBanks = 4;
    var banks = SharedMemory.Allocate<T>(NumMemoryBanks);

    var warpIndex = Warp.ComputeWarpIdx(Group.IdxX);
    var laneIndex = Warp.LaneIdx;
    TReduction reduction = default;

    // The first warp initializes every bank to the reduction identity.
    if (warpIndex == 0)
    {
        int bankIndex = laneIndex;
        while (bankIndex < NumMemoryBanks)
        {
            banks[bankIndex] = reduction.Identity;
            bankIndex += Warp.WarpSize;
        }
    }
    Group.Barrier();

    // Reduce within each warp; lane 0 then folds the warp's partial
    // result atomically into one of the banks.
    value = PTXWarpExtensions.Reduce<T, TReduction>(value);
    if (laneIndex == 0)
    {
        reduction.AtomicApply(ref banks[warpIndex % NumMemoryBanks], value);
    }
    Group.Barrier();

    // Combine all banks; explicitly unrolled for NumMemoryBanks == 4
    // (see the constant above).
    var result = reduction.Apply(banks[0], banks[1]);
    result = reduction.Apply(result, banks[2]);
    result = reduction.Apply(result, banks[3]);
    Group.Barrier();
    return result;
}
/// <summary>
/// Implements a basic block-wide reduction algorithm.
/// The algorithm is based on the one from
/// https://devblogs.nvidia.com/parallelforall/faster-parallel-reductions-kepler/.
/// </summary>
/// <typeparam name="T">The element type.</typeparam>
/// <typeparam name="TShuffleDown">The type of the shuffle logic.</typeparam>
/// <typeparam name="TReduction">The type of the reduction logic.</typeparam>
/// <param name="groupThreadIdx">The current group-thread index.</param>
/// <param name="value">The current value.</param>
/// <param name="shuffleDown">The shuffle logic.</param>
/// <param name="reduction">The reduction logic.</param>
/// <param name="sharedMemory">A view to a section of group-shared memory.</param>
/// <returns>The reduced value (valid on the threads of the first warp).</returns>
public static T Reduce <T, TShuffleDown, TReduction>(
    Index groupThreadIdx,
    T value,
    TShuffleDown shuffleDown,
    TReduction reduction,
    ArrayView <T> sharedMemory)
    where T : struct
    where TShuffleDown : IShuffleDown <T>
    where TReduction : IReduction <T>
{
    Debug.Assert(
        Warp.WarpSize > 1,
        "This algorithm can only be used on architectures with a warp size > 1");
    var warpIdx = Warp.ComputeWarpIdx(groupThreadIdx);
    var laneIdx = Warp.LaneIdx;

    // Stage 1: reduce within each warp; lane 0 publishes its warp's
    // partial result to shared memory.
    value = Warp.Reduce(value, shuffleDown, reduction);
    if (laneIdx == 0)
    {
        Debug.Assert(warpIdx < sharedMemory.Length, "Shared memory out of range");
        sharedMemory[warpIdx] = value;
    }
    Group.Barrier();

    // Number of warps, rounded UP so a partial last warp is counted.
    // FIX: the original used truncating division (Group.Dimension.X / WarpSize),
    // which silently dropped the last warp's partial result whenever the group
    // size was not a multiple of the warp size. For multiple-of-warp-size
    // groups the value is unchanged.
    var numWarps = (Group.Dimension.X + Warp.WarpSize - 1) / Warp.WarpSize;

    // Stage 2: the first numWarps threads (all within warp 0, assuming
    // group sizes <= WarpSize * WarpSize) load the warp partials; every
    // other thread contributes the neutral element.
    if (groupThreadIdx < numWarps)
    {
        value = sharedMemory[laneIdx];
    }
    else
    {
        value = reduction.NeutralElement;
    }

    if (warpIdx == 0)
    {
        value = Warp.Reduce(value, shuffleDown, reduction);
    }
    return value;
}