/// <summary>
/// Reduction kernel. Each thread folds the elements of <paramref name="input"/>
/// it visits (starting at its global index and advancing by
/// <c>GridExtensions.GridStrideLoopStride</c>) into a private accumulator, the
/// accumulators are combined with <c>GroupExtensions.Reduce</c>, and the result
/// is passed to a default-constructed <c>TReductionFinalizer</c>.
/// </summary>
/// <param name="index">The current grouped thread index.</param>
/// <param name="input">The view holding the values to reduce.</param>
/// <param name="output">The view that is handed to the finalizer.</param>
/// <param name="shuffleDown">The shuffle-down functionality forwarded to the group reduction.</param>
/// <param name="reduction">Supplies the neutral element and the reduce operation.</param>
/// <param name="sharedMemory">Shared-memory view (32 elements) forwarded to the group reduction.</param>
private static void Kernel(
    GroupedIndex index,
    ArrayView<T> input,
    ArrayView<T> output,
    TShuffleDown shuffleDown,
    TReduction reduction,
    [SharedMemory(32)]
    ArrayView<T> sharedMemory)
{
    var step = GridExtensions.GridStrideLoopStride;

    // Phase 1: thread-local accumulation, seeded with the neutral element
    var accumulator = reduction.NeutralElement;
    var position = index.ComputeGlobalIndex();
    while (position < input.Length)
    {
        accumulator = reduction.Reduce(accumulator, input[position]);
        position += step;
    }

    // Phase 2: combine the thread-local values within the group
    accumulator = GroupExtensions.Reduce(
        index.GroupIdx,
        accumulator,
        shuffleDown,
        reduction,
        sharedMemory);

    // Phase 3: let the finalizer deal with the reduced value
    TReductionFinalizer finalizer = default(TReductionFinalizer);
    finalizer.Finalize(index, output, accumulator, reduction);
}
/// <summary>
/// Demonstrates a shared-memory parameter that references multiple elements.
/// Every thread stores one input value (or 0 when its global index lies outside
/// <paramref name="dataView"/>) into the shared array, and after a group barrier
/// each thread sums the first <c>Group.Dimension.X</c> shared entries.
/// </summary>
/// <remarks>
/// The shared array is indexed by the thread's group index and read for every
/// i below <c>Group.Dimension.X</c>, so the group size must not exceed the 128
/// declared elements.
/// </remarks>
/// <param name="index">The current thread index.</param>
/// <param name="dataView">The view the input values are loaded from.</param>
/// <param name="outputView">The view the per-group sums are written to.</param>
/// <param name="sharedArray">Implicit shared-memory parameter that is handled by the runtime.</param>
static void SharedMemoryArrayKernel(
    GroupedIndex index,               // The grouped thread index (1D in this case)
    ArrayView<int> dataView,          // A view to a chunk of memory (1D in this case)
    ArrayView<int> outputView,        // A view to a chunk of memory (1D in this case)
    [SharedMemory(128)]               // Declares a shared-memory array with 128 elements of
    ArrayView<int> sharedArray)       // type int = 4 * 128 = 512 bytes shared memory per group
{
    // Global 1D position of this thread
    var globalIndex = index.ComputeGlobalIndex();

    // Stage this thread's element in shared memory; out-of-range threads contribute 0
    int loaded = 0;
    if (globalIndex < dataView.Length)
    {
        loaded = dataView[globalIndex];
    }
    sharedArray[index.GroupIdx] = loaded;

    // All stores must have finished before any thread starts reading
    Group.Barrier();

    // Accumulate the values of all threads in the group
    int total = 0;
    int groupSize = Group.Dimension.X;
    int slot = 0;
    while (slot < groupSize)
    {
        total += sharedArray[slot];
        ++slot;
    }

    // Publish the sum if this thread maps to a valid output element
    if (globalIndex < outputView.Length)
    {
        outputView[globalIndex] = total;
    }
}
/// <summary>
/// Explicitly grouped kernels receive an index type (first parameter) of type:
/// <see cref="GroupedIndex"/>, <see cref="GroupedIndex2"/> or <see cref="GroupedIndex3"/>.
/// Shared memory is only supported in explicitly-grouped kernel contexts.
/// Shared-memory parameters are automatically handled by the runtime and have to be
/// annotated with the SharedMemoryAttribute. Note that currently, the only supported
/// shared-memory parameters are VariableViews and ArrayViews.
/// </summary>
/// <remarks>
/// The shared variable is initialized to 0 before the atomic maximum is applied,
/// so the value written to <paramref name="outputView"/> is never below 0.
/// </remarks>
/// <param name="index">The current thread index.</param>
/// <param name="dataView">The view the input values are read from.</param>
/// <param name="outputView">The view the per-group maximum is written to.</param>
/// <param name="sharedVariable">Implicit shared-memory parameter that is handled by the runtime.</param>
static void SharedMemoryVariableKernel(
    GroupedIndex index,                   // The grouped thread index (1D in this case)
    ArrayView<int> dataView,              // A view to a chunk of memory (1D in this case)
    ArrayView<int> outputView,            // A view to a chunk of memory (1D in this case)
    [SharedMemory]                        // Declares a single variable of type int in
    VariableView<int> sharedVariable)     // shared memory (= 4 bytes)
{
    // Global 1D position of this thread
    var globalIndex = index.ComputeGlobalIndex();

    // Exactly one thread per group resets the shared variable
    var isFirstInGroup = index.GroupIdx.IsFirst;
    if (isFirstInGroup)
    {
        sharedVariable.Value = 0;
    }

    // Nobody may touch the shared variable before it has been reset
    Group.Barrier();

    // Threads that map to a valid input element take part in the maximum
    if (globalIndex < dataView.Length)
    {
        var candidate = dataView[globalIndex];
        Atomic.Max(sharedVariable, candidate);
    }

    // All atomic updates must have finished before the result is read
    Group.Barrier();

    // Write the group's maximum into the output view
    if (globalIndex < outputView.Length)
    {
        var groupMaximum = sharedVariable.Value;
        outputView[globalIndex] = groupMaximum;
    }
}
/// <summary> /// Explicitly grouped kernels receive an index type (first parameter) of type: /// <see cref="GroupedIndex"/>, <see cref="GroupedIndex2"/> or <see cref="GroupedIndex3"/>. /// Shared memory is only supported in explicitly-grouped kernel contexts and can be accesses /// via the static <see cref="ILGPU.SharedMemory"/> class. /// </summary> /// <param name="index">The current thread index.</param> /// <param name="dataView">The view pointing to our memory buffer.</param> static void SharedMemoryVariableKernel( GroupedIndex index, // The grouped thread index (1D in this case) ArrayView <int> dataView, // A view to a chunk of memory (1D in this case) ArrayView <int> outputView) // A view to a chunk of memory (1D in this case) { // Compute the global 1D index for accessing the data view var globalIndex = index.ComputeGlobalIndex(); // 'Allocate' a single shared memory variable of type int (= 4 bytes) ref int sharedVariable = ref ILGPU.SharedMemory.Allocate <int>();
/// <summary>
/// Stores each thread's global index into <paramref name="data"/> at that same
/// index and then issues <c>MemoryFence.GroupLevel()</c>.
/// </summary>
/// <remarks>
/// The store is not bounds-checked against the length of <paramref name="data"/>.
/// </remarks>
/// <param name="index">The current grouped thread index.</param>
/// <param name="data">The view that receives the global indices.</param>
internal static void MemoryFenceGroupLevelKernel(
    GroupedIndex index,
    ArrayView<int> data)
{
    var globalIndex = index.ComputeGlobalIndex();

    // Each thread writes its own global index to its own slot
    int storedValue = globalIndex;
    data[globalIndex] = storedValue;

    // Fence placed after the store
    MemoryFence.GroupLevel();
}
/// <summary>
/// Performs a grid-stride loop. This overload converts the grouped index into
/// its global index and forwards to the overload that accepts that start index.
/// </summary>
/// <typeparam name="TLoopBody">The type of the loop body.</typeparam>
/// <param name="index">The grouped index from which the global start index is computed.</param>
/// <param name="length">The global length.</param>
/// <param name="loopBody">The loop body (passed on by reference).</param>
public static void GridStrideLoop<TLoopBody>(
    GroupedIndex index,
    Index length,
    ref TLoopBody loopBody)
    where TLoopBody : struct, IGridStrideLoopBody
{
    var globalStart = index.ComputeGlobalIndex();
    GridStrideLoop(globalStart, length, ref loopBody);
}
/// <summary>
/// Performs a functional grid-stride loop. This overload converts the grouped
/// index into its global index and forwards to the overload that accepts that
/// start index.
/// </summary>
/// <typeparam name="T">The element type of the intermediate values.</typeparam>
/// <typeparam name="TLoopBody">The type of the loop body.</typeparam>
/// <param name="index">The grouped index from which the global start index is computed.</param>
/// <param name="length">The global length.</param>
/// <param name="input">The initial input value.</param>
/// <param name="loopBody">The loop body.</param>
/// <returns>The last intermediate value.</returns>
public static T GridStrideLoop<T, TLoopBody>(
    GroupedIndex index,
    Index length,
    T input,
    TLoopBody loopBody)
    where T : struct
    where TLoopBody : struct, IGridStrideLoopBody<T>
{
    var globalStart = index.ComputeGlobalIndex();
    return GridStrideLoop(globalStart, length, input, loopBody);
}
/// <summary>
/// Explicitly grouped kernels receive an index type (first parameter) of type:
/// <see cref="GroupedIndex"/>, <see cref="GroupedIndex2"/> or <see cref="GroupedIndex3"/>.
/// Note that you can use warp-shuffle functionality only within
/// explicitly-grouped kernels.
/// </summary>
/// <remarks>
/// Every thread takes part in the shuffle; only the final store is guarded so
/// that threads beyond the end of <paramref name="dataView"/> do not write
/// out of bounds.
/// </remarks>
/// <param name="index">The current thread index.</param>
/// <param name="dataView">The view that receives the shuffled values.</param>
static void ShuffleDownKernel(
    GroupedIndex index,          // The grouped thread index (1D in this case)
    ArrayView<int> dataView)     // A view to a chunk of memory (1D in this case)
{
    // Compute the global 1D index for accessing the data view
    var globalIndex = index.ComputeGlobalIndex();

    // Use native shuffle-down functionality to shuffle the
    // given value by a delta of 2 lanes. This is executed unconditionally
    // so that no thread skips the shuffle.
    int value = index.GroupIdx;
    value = Warp.ShuffleDown(value, 2);

    // Only threads that map to a valid element store their result
    if (globalIndex < dataView.Length)
    {
        dataView[globalIndex] = value;
    }
}
/// <summary>
/// Demonstrates the use of a group-wide barrier. After the barrier, each thread
/// writes 1 to <paramref name="outputView"/> if its input element is greater
/// than <paramref name="constant"/>, and 0 otherwise.
/// </summary>
/// <remarks>
/// The element is only processed when the global index is valid for both
/// <paramref name="dataView"/> (read) and <paramref name="outputView"/> (write).
/// </remarks>
/// <param name="index">The current thread index.</param>
/// <param name="dataView">The view the input values are read from.</param>
/// <param name="outputView">The view the comparison results are written to.</param>
/// <param name="constant">The uniform constant every input value is compared against.</param>
static void GroupedKernelBarrier(
    GroupedIndex index,          // The grouped thread index (1D in this case)
    ArrayView<int> dataView,     // A view to a chunk of memory (1D in this case)
    ArrayView<int> outputView,   // A view to a chunk of memory (1D in this case)
    int constant)                // A sample uniform constant
{
    var globalIndex = index.ComputeGlobalIndex();

    // Wait until all threads in the group reach this point
    Group.Barrier();

    // Guard the read and the write separately: the two views are independent
    // parameters and are not guaranteed to have the same length.
    if (globalIndex < dataView.Length && globalIndex < outputView.Length)
    {
        outputView[globalIndex] = dataView[globalIndex] > constant ? 1 : 0;
    }
}
/// <summary>
/// Explicitly-grouped kernels receive an index type (first parameter) of type:
/// <see cref="GroupedIndex"/>, <see cref="GroupedIndex2"/> or <see cref="GroupedIndex3"/>.
/// These kernel types expose the underlying blocking/grouping semantics of a GPU
/// and allow for highly efficient implementation of kernels for different GPUs.
/// The semantics of these kernels are equivalent to kernel implementations in CUDA.
///
/// This kernel writes <c>globalIndex + constant</c> into every element of
/// <paramref name="dataView"/> that is covered by a thread.
///
/// Note that you must not use warp-shuffle functionality within implicitly grouped
/// kernels since not all lanes of a warp are guaranteed to participate in the warp shuffle.
/// </summary>
/// <remarks>
/// NOTE(review): the previous version of this comment listed
/// LoadImplicitlyGroupedKernel and LoadAutoGroupedKernel as the loaders for
/// explicitly-grouped kernels. Those names suggest implicitly-grouped loading;
/// confirm the correct loader methods against the ILGPU API.
/// </remarks>
/// <param name="index">The current thread index.</param>
/// <param name="dataView">The view pointing to our memory buffer.</param>
/// <param name="constant">A uniform constant that is added to every global index.</param>
static void GroupedKernel(
    GroupedIndex index,          // The grouped thread index (1D in this case)
    ArrayView<int> dataView,     // A view to a chunk of memory (1D in this case)
    int constant)                // A sample uniform constant
{
    // Global 1D position of this thread
    var globalIndex = index.ComputeGlobalIndex();

    // Threads beyond the end of the view have nothing to do
    if (globalIndex < dataView.Length)
    {
        int shifted = globalIndex + constant;
        dataView[globalIndex] = shifted;
    }

    // Note: this explicitly grouped kernel implements the same functionality
    // as MyKernel in the ImplicitlyGroupedKernels sample.
}
/// <summary>
/// Explicitly grouped kernels receive an index type (first parameter) of type:
/// <see cref="GroupedIndex"/>, <see cref="GroupedIndex2"/> or <see cref="GroupedIndex3"/>.
/// Note that you can use warp-shuffle functionality only within
/// explicitly-grouped kernels.
/// </summary>
/// <remarks>
/// Every thread takes part in the shuffle; only the final store is guarded so
/// that threads beyond the end of <paramref name="dataView"/> do not write
/// out of bounds.
/// </remarks>
/// <typeparam name="TShuffleOperation">The type of the shuffle operation.</typeparam>
/// <param name="index">The current thread index.</param>
/// <param name="dataView">The view that receives the shuffled values.</param>
static void ShuffleDownKernel<TShuffleOperation>(
    GroupedIndex index,           // The grouped thread index (1D in this case)
    ArrayView<long> dataView)     // A view to a chunk of memory (1D in this case)
    where TShuffleOperation : struct, IShuffleDown<long>
{
    // Compute the global 1D index for accessing the data view
    var globalIndex = index.ComputeGlobalIndex();

    // Use custom shuffle-down functionality to shuffle the
    // given value by a delta of 2 lanes. This is executed unconditionally
    // so that no thread skips the shuffle.
    long value = index.GroupIdx;
    TShuffleOperation shuffleOperation = default;
    value = shuffleOperation.ShuffleDown(value, 2);

    // Only threads that map to a valid element store their result
    if (globalIndex < dataView.Length)
    {
        dataView[globalIndex] = value;
    }
}
/// <summary>
/// Demonstrates a pre-defined warp-reduction functionality: every thread
/// contributes the value 1 to an additive <c>Warp.Reduce</c> and stores the
/// value it gets back into <paramref name="dataView"/>.
/// </summary>
/// <remarks>
/// Every thread takes part in the reduction; only the final store is guarded so
/// that threads beyond the end of <paramref name="dataView"/> do not write
/// out of bounds.
/// </remarks>
/// <param name="index">The current thread index.</param>
/// <param name="dataView">The view that receives the per-thread results.</param>
static void ReduceKernel(
    GroupedIndex index,          // The grouped thread index (1D in this case)
    ArrayView<int> dataView)     // A view to a chunk of memory (1D in this case)
{
    // Compute the global 1D index for accessing the data view
    var globalIndex = index.ComputeGlobalIndex();

    // Use native warp-reduce functionality to reduce all given
    // values in the scope of a single warp. Note that only
    // the first lane of a warp will contain the reduced value.
    // If all lanes should receive the reduced value,
    // use the Warp.AllReduce<...> function.
    // The reduction is executed unconditionally so that no thread skips it.
    var value = Warp.Reduce(
        1,
        new ShuffleDownInt32(),
        new AddInt32());

    // Only threads that map to a valid element store their result
    if (globalIndex < dataView.Length)
    {
        dataView[globalIndex] = value;
    }
}
/// <summary>
/// Demonstrates the use of a group-wide and-barrier. Each thread tests whether
/// its value is greater than <paramref name="constant"/>; the combined result
/// of <c>Group.BarrierAnd</c> is written to <paramref name="outputView"/> as 1 or 0.
/// </summary>
/// <param name="index">The current thread index.</param>
/// <param name="dataView">The view the input values are read from.</param>
/// <param name="outputView">The view the per-group result (1 or 0) is written to.</param>
/// <param name="constant">The uniform constant every value is compared against.</param>
static void GroupedKernelAndBarrier(
    GroupedIndex index,          // The grouped thread index (1D in this case)
    ArrayView<int> dataView,     // A view to a chunk of memory (1D in this case)
    ArrayView<int> outputView,   // A view to a chunk of memory (1D in this case)
    int constant)                // A sample uniform constant
{
    // Global 1D position of this thread
    var globalIndex = index.ComputeGlobalIndex();

    // In-range threads use their input element; out-of-range threads fall back
    // to constant + 1 as a substitute value for the comparison below.
    int candidate;
    if (globalIndex < dataView.Length)
    {
        candidate = dataView[globalIndex];
    }
    else
    {
        candidate = constant + 1;
    }

    // Wait until all threads in the group reach this point. Moreover, BarrierAnd
    // evaluates the given predicate and returns true iff the predicate evaluates
    // to true for all threads in the group.
    var exceedsConstant = candidate > constant;
    var allExceed = Group.BarrierAnd(exceedsConstant);

    // Publish the group-wide result for every valid output element
    if (globalIndex < outputView.Length)
    {
        outputView[globalIndex] = allExceed ? 1 : 0;
    }
}