/// <summary>
/// Stages one input element per thread into group-shared memory and sums the
/// first <paramref name="x"/> staged entries. Every thread of a group computes
/// the same sum and stores it at output[gridIdx] (a benign same-value write).
/// NOTE(review): the loop reads tempData[0..x) — assumes x is no larger than
/// the group size, otherwise unstaged shared memory is read; confirm at call site.
/// </summary>
static void FibonacciSharedKernel(
    GroupedIndex <Index> index,
    int x,
    ArrayView <Data> input,
    ArrayView <int> output,
    [SharedMemory(GroupSize)]   // one int of shared memory per thread in the group
    ArrayView <int> tempData)
{
    var gridIdx = index.GridIdx;
    var groupIdx = index.GroupIdx;

    // Stage this thread's element into the group-shared buffer.
    tempData[groupIdx] = input[groupIdx].a16;

    // Wait until every thread in the group has finished staging.
    index.Barrier();

    int result = 0;
    for (int i = 0; i < x; ++i)
    {
        result += tempData[i];
    }
    // No trailing barrier required here: tempData is not written again
    // after this point in the kernel.
    output[gridIdx] = result;
}
/// <summary>
/// Demonstrates the use of a shared-memory variable referencing multiple elements:
/// each thread stages one element into shared memory, and after a group-wide
/// barrier every thread computes the sum over all staged values.
/// </summary>
/// <param name="index">The current thread index.</param>
/// <param name="dataView">The view pointing to our memory buffer.</param>
/// <param name="outputView">The view pointing to our memory buffer.</param>
/// <param name="sharedArray">Implicit shared-memory parameter that is handled by the runtime.</param>
static void SharedMemoryArrayKernel(
    GroupedIndex index,                 // The grouped thread index (1D in this case)
    ArrayView <int> dataView,           // A view to a chunk of memory (1D in this case)
    ArrayView <int> outputView,         // A view to a chunk of memory (1D in this case)
    [SharedMemory(128)]                 // Declares a shared-memory array with 128 elements of
    ArrayView <int> sharedArray)        // type int = 4 * 128 = 512 bytes shared memory per group
{
    // Flatten the grouped index into a global 1D position.
    var globalIndex = index.ComputeGlobalIndex();

    // Stage this thread's element (or 0 when out of range) into its slot.
    sharedArray[index.GroupIdx] = globalIndex < dataView.Length
        ? dataView[globalIndex]
        : 0;

    // All staging writes must be visible before any thread starts summing.
    Group.Barrier();

    // Accumulate all of the group's staged values.
    int groupSum = 0;
    for (int slot = 0, end = Group.Dimension.X; slot < end; ++slot)
    {
        groupSum += sharedArray[slot];
    }

    // Publish the group-wide sum for this thread's output slot.
    if (globalIndex < outputView.Length)
    {
        outputView[globalIndex] = groupSum;
    }
}
/// <summary>
/// Generic reduction kernel. Every thread folds input elements into a private
/// accumulator using a grid-stride loop, the group then combines the per-thread
/// partials (via the shuffle-down operation and the given shared memory), and
/// the reduction finalizer commits the group result to the output view.
/// </summary>
private static void Kernel(
    GroupedIndex index,
    ArrayView <T> input,
    ArrayView <T> output,
    TShuffleDown shuffleDown,
    TReduction reduction,
    [SharedMemory(32)]      // scratch space used by the group-level reduction
    ArrayView <T> sharedMemory)
{
    // Stride for the grid-stride loop (see GridExtensions.GridStrideLoopStride).
    var stride = GridExtensions.GridStrideLoopStride;

    // Start from the reduction's neutral element so empty ranges are handled.
    var reduced = reduction.NeutralElement;
    for (var idx = index.ComputeGlobalIndex(); idx < input.Length; idx += stride)
    {
        reduced = reduction.Reduce(reduced, input[idx]);
    }

    // Combine the per-thread partial values into a single group value.
    reduced = GroupExtensions.Reduce(
        index.GroupIdx,
        reduced,
        shuffleDown,
        reduction,
        sharedMemory);

    // The finalizer decides how the group result reaches the output
    // (e.g. atomic combine vs. per-group store).
    var finalizer = default(TReductionFinalizer);
    finalizer.Finalize(index, output, reduced, reduction);
}
/// <summary>
/// Writes each thread's linearized index (grid * stride + group) into the output view.
/// </summary>
internal static void GroupedIndex1EntryPointKernel(
    GroupedIndex index,
    ArrayView <int> output,
    int stride)
{
    // Flatten (grid, group) into a single linear position.
    var linearIndex = index.GroupIdx.X + index.GridIdx.X * stride;
    output[linearIndex] = linearIndex;
}
/// <summary>
/// Explicitly grouped kernels receive an index type (first parameter) of type:
/// <see cref="GroupedIndex"/>, <see cref="GroupedIndex2"/> or <see cref="GroupedIndex3"/>.
/// Shared memory is only supported in explicitly-grouped kernel contexts.
/// Shared-memory parameters are automatically handled by the runtime and have to be
/// annotated with the SharedMemoryAttribute. Note that currently, the only supported
/// shared-memory parameters are VariableViews and ArrayViews.
/// </summary>
/// <param name="index">The current thread index.</param>
/// <param name="dataView">The view pointing to our memory buffer.</param>
/// <param name="outputView">The view receiving the group-wide maximum.</param>
/// <param name="sharedVariable">Implicit shared-memory parameter that is handled by the runtime.</param>
static void SharedMemoryVariableKernel(
    GroupedIndex index,                 // The grouped thread index (1D in this case)
    ArrayView <int> dataView,           // A view to a chunk of memory (1D in this case)
    ArrayView <int> outputView,         // A view to a chunk of memory (1D in this case)
    [SharedMemory]                      // Declares a single variable of type int in
    VariableView <int> sharedVariable)  // shared memory (= 4 bytes)
{
    // Compute the global 1D index for accessing the data view
    var globalIndex = index.ComputeGlobalIndex();

    // Initialize shared memory; only the first thread in the group writes.
    if (index.GroupIdx.IsFirst)
    {
        sharedVariable.Value = 0;
    }

    // Wait for the initialization to complete
    Group.Barrier();

    // Atomically fold every in-range element into the group-wide maximum.
    if (globalIndex < dataView.Length)
    {
        Atomic.Max(sharedVariable, dataView[globalIndex]);
    }

    // Wait for all threads to complete the maximum computation process
    Group.Barrier();

    // Write the maximum of all values into the data view
    if (globalIndex < outputView.Length)
    {
        outputView[globalIndex] = sharedVariable.Value;
    }
}
/// <summary>
/// Broadcasts the group index of the last warp lane to every lane and stores it.
/// </summary>
internal static void WarpBroadcastKernel(
    GroupedIndex index,
    ArrayView <int> data)
{
    // Linearize the grouped index into a global position.
    var globalIdx = index.GroupIdx + index.GridIdx * Group.DimensionX;
    // Every lane receives the value held by lane (WarpSize - 1).
    data[globalIdx] = Warp.Broadcast(index.GroupIdx.X, Warp.WarpSize - 1);
}
/// <summary>
/// Scan kernel skeleton — the actual scan implementation is still missing.
/// The signature mirrors the reduction kernels: a shuffle-down operation plus
/// a scan operation are passed as value-type strategy parameters.
/// </summary>
private static void Kernel(
    GroupedIndex index,
    ArrayView <T> input,
    ArrayView <T> output,
    TShuffleDown shuffleDown,
    TScanOperation scanOperation)
{
    // TODO: add final scan implementation
}
/// <summary>
/// Stores 1 when at least one thread of the group has a group index below
/// <paramref name="bound"/> (group-wide or-barrier), otherwise 0.
/// </summary>
internal static void GroupBarrierOrKernel(
    GroupedIndex index,
    ArrayView <int> data,
    Index bound)
{
    // Linearize the grouped index into a global position.
    var globalIdx = index.GroupIdx + index.GridIdx * Group.DimensionX;
    // BarrierOr is true iff the predicate holds on any thread of the group.
    var anyBelowBound = Group.BarrierOr(index.GroupIdx < bound);
    data[globalIdx] = anyBelowBound ? 1 : 0;
}
/// <summary>
/// Synchronizes the warp and then stores each thread's linear index.
/// </summary>
internal static void WarpBarrierKernel(
    GroupedIndex index,
    ArrayView <int> data)
{
    // Linearize the grouped index into a global position.
    var globalIdx = index.GroupIdx + index.GridIdx * Group.DimensionX;
    // All warp lanes rendezvous here before writing.
    Warp.Barrier();
    data[globalIdx] = globalIdx;
}
/// <summary>
/// Explicitly grouped kernels receive an index type (first parameter) of type:
/// <see cref="GroupedIndex"/>, <see cref="GroupedIndex2"/> or <see cref="GroupedIndex3"/>.
/// Shared memory is only supported in explicitly-grouped kernel contexts and can be accessed
/// via the static <see cref="ILGPU.SharedMemory"/> class.
/// </summary>
/// <param name="index">The current thread index.</param>
/// <param name="dataView">The view pointing to our memory buffer.</param>
static void SharedMemoryVariableKernel(
    GroupedIndex index,                 // The grouped thread index (1D in this case)
    ArrayView <int> dataView,           // A view to a chunk of memory (1D in this case)
    ArrayView <int> outputView)         // A view to a chunk of memory (1D in this case)
{
    // Compute the global 1D index for accessing the data view
    var globalIndex = index.ComputeGlobalIndex();

    // 'Allocate' a single shared memory variable of type int (= 4 bytes)
    ref int sharedVariable = ref ILGPU.SharedMemory.Allocate <int>();
    // NOTE(review): this method body appears truncated in this chunk — the
    // remainder of the implementation is not visible here.
/// <summary>
/// Stores each thread's global index, then issues a group-level memory fence.
/// </summary>
internal static void MemoryFenceGroupLevelKernel(
    GroupedIndex index,
    ArrayView <int> data)
{
    var globalIdx = index.ComputeGlobalIndex();
    data[globalIdx] = globalIdx;
    // Order the preceding store with respect to the rest of the group.
    MemoryFence.GroupLevel();
}
/// <summary>
/// Verifies that launching with a group size one beyond the accelerator's
/// maximum fails: either the driver rejects the launch (CudaException) or
/// the backend reports it as unsupported (NotSupportedException).
/// </summary>
public void ExceedGroupSize()
{
    // One thread more than the per-group hardware limit.
    var groupSize = Accelerator.MaxNumThreadsPerGroup + 1;
    var extent = new GroupedIndex(2, groupSize);
    Action act = () => Execute(extent, 0);
    // The concrete exception type depends on the accelerator backend, so we
    // inspect the base exception instead of asserting a single type.
    act.Should().Throw <Exception>()
        .Which.GetBaseException()
        .Should().Match(x => x is CudaException || x is NotSupportedException);
}
/// <summary>
/// Performs a grid-stride loop.
/// </summary>
/// <typeparam name="TLoopBody">The type of the loop body.</typeparam>
/// <param name="index">The global start index.</param>
/// <param name="length">The global length.</param>
/// <param name="loopBody">The loop body.</param>
public static void GridStrideLoop <TLoopBody>(
    GroupedIndex index,
    Index length,
    ref TLoopBody loopBody)
    where TLoopBody : struct, IGridStrideLoopBody
{
    // Delegate to the linear-index overload using the flattened global index.
    var globalIndex = index.ComputeGlobalIndex();
    GridStrideLoop(globalIndex, length, ref loopBody);
}
/// <summary cref="IReductionFinalizer{T, TReduction}.Finalize(GroupedIndex, ArrayView{T}, T, TReduction)"/>
public void Finalize(
    GroupedIndex index,
    ArrayView <T> output,
    T reducedValue,
    TReduction reduction)
{
    // Only the first thread of each group commits its partial result.
    if (!index.GroupIdx.IsFirst)
        return;
    reduction.AtomicReduce(output.GetVariableView(), reducedValue);
}
/// <summary cref="IReductionFinalizer{T, TReduction}.Finalize(GroupedIndex, ArrayView{T}, T, TReduction)"/>
public void Finalize(
    GroupedIndex index,
    ArrayView <T> output,
    T reducedValue,
    TReduction reduction)
{
    // Only the first thread of each group stores its group's result.
    if (!index.GroupIdx.IsFirst)
        return;
    output[index.GridIdx] = reducedValue;
}
/// <summary>
/// Runs the group-broadcast kernel over power-of-two group sizes and checks
/// that every element carries the value broadcast from the last group member.
/// </summary>
public void GroupBroadcast(int length)
{
    // Sweep power-of-two group sizes, starting at 2.
    for (int groupSize = 2; groupSize < Accelerator.MaxNumThreadsPerGroup; groupSize <<= 1)
    {
        using var buffer = Accelerator.Allocate <int>(length * groupSize);
        var extent = new GroupedIndex(length, groupSize);
        Execute(extent, buffer.View);

        // Every slot should equal the last group member's index.
        var expected = Enumerable.Repeat(groupSize - 1, buffer.Length).ToArray();
        Verify(buffer, expected);
    }
}
/// <summary>
/// Runs the group-level memory-fence kernel over power-of-two group sizes and
/// checks that each element receives its own linear index.
/// </summary>
public void MemoryFenceGroupLevel()
{
    // Sweep power-of-two group sizes, starting at 1.
    for (int groupSize = 1; groupSize < Accelerator.MaxNumThreadsPerGroup; groupSize <<= 1)
    {
        var extent = new GroupedIndex(Length, groupSize);
        using var buffer = Accelerator.Allocate <int>(extent.Size);
        Execute(extent, buffer.View);

        var expected = Enumerable.Range(0, extent.Size).ToArray();
        Verify(buffer, expected);
    }
}
/// <summary>
/// Launches a simple 1D kernel using warp intrinsics.
/// </summary>
static void Main()
{
    // Create main context
    using (var context = new Context())
    {
        // For each available accelerator...
        foreach (var acceleratorId in Accelerator.Accelerators)
        {
            // Create default accelerator for the given accelerator id
            using (var accelerator = Accelerator.Create(context, acceleratorId))
            {
                Console.WriteLine($"Performing operations on {accelerator}");

                // Launch a single group whose size equals the warp size.
                var dimension = new GroupedIndex(1, accelerator.WarpSize);

                using (var dataTarget = accelerator.Allocate <int>(accelerator.WarpSize))
                {
                    // Load the explicitly grouped kernel
                    var shuffleDownKernel = accelerator.LoadStreamKernel <GroupedIndex, ArrayView <int> >(ShuffleDownKernel);
                    dataTarget.MemSetToZero();

                    shuffleDownKernel(dimension, dataTarget.View);
                    accelerator.Synchronize();

                    // Dump the per-lane results.
                    Console.WriteLine("Shuffle-down kernel");
                    var target = dataTarget.GetAsArray();
                    for (int i = 0, e = target.Length; i < e; ++i)
                    {
                        Console.WriteLine($"Data[{i}] = {target[i]}");
                    }
                }

                using (var dataTarget = accelerator.Allocate <long>(accelerator.WarpSize))
                {
                    // Load the explicitly grouped kernel (generic shuffle-down variant).
                    var reduceKernel = accelerator.LoadStreamKernel <GroupedIndex, ArrayView <long> >(
                        ShuffleDownKernel <ShuffleDownInt64>);
                    dataTarget.MemSetToZero();

                    reduceKernel(dimension, dataTarget.View);
                    accelerator.Synchronize();

                    // Dump the per-lane results.
                    Console.WriteLine("Generic shuffle-down kernel");
                    var target = dataTarget.GetAsArray();
                    for (int i = 0, e = target.Length; i < e; ++i)
                    {
                        Console.WriteLine($"Data[{i}] = {target[i]}");
                    }
                }
            }
        }
    }
}
/// <summary>
/// Performs a functional grid-stride loop.
/// </summary>
/// <typeparam name="T">The element type of the intermediate values.</typeparam>
/// <typeparam name="TLoopBody">The type of the loop body.</typeparam>
/// <param name="index">The global start index.</param>
/// <param name="length">The global length.</param>
/// <param name="input">The initial input value.</param>
/// <param name="loopBody">The loop body.</param>
/// <returns>The last intermediate value.</returns>
public static T GridStrideLoop <T, TLoopBody>(
    GroupedIndex index,
    Index length,
    T input,
    TLoopBody loopBody)
    where T : struct
    where TLoopBody : struct, IGridStrideLoopBody <T>
{
    // Delegate to the linear-index overload using the flattened global index.
    var globalIndex = index.ComputeGlobalIndex();
    return GridStrideLoop(globalIndex, length, input, loopBody);
}
/// <summary>
/// Explicitly grouped kernels receive an index type (first parameter) of type:
/// <see cref="GroupedIndex"/>, <see cref="GroupedIndex2"/> or <see cref="GroupedIndex3"/>.
/// Note that you can use warp-shuffle functionality only within
/// explicitly-grouped kernels.
/// </summary>
/// <param name="index">The current thread index.</param>
/// <param name="dataView">The view pointing to our memory buffer.</param>
static void ShuffleDownKernel(
    GroupedIndex index,                 // The grouped thread index (1D in this case)
    ArrayView <int> dataView)           // A view to a chunk of memory (1D in this case)
{
    // Flatten the grouped index into a global 1D position.
    var globalIndex = index.ComputeGlobalIndex();

    // Each lane contributes its group-relative index, shifted down by 2 lanes
    // via the native warp-shuffle intrinsic.
    int laneValue = index.GroupIdx;
    dataView[globalIndex] = Warp.ShuffleDown(laneValue, 2);
}
/// <summary>
/// Runs the warp-broadcast kernel and checks that every element receives
/// the value broadcast from the last warp lane.
/// </summary>
public void WarpBroadcast(int length)
{
    var warpSize = Accelerator.WarpSize;
    using var buffer = Accelerator.Allocate <int>(length * warpSize);

    var extent = new GroupedIndex(
        length,
        warpSize);
    Execute(extent, buffer.View);

    // Every slot should hold the last lane's index.
    var expected = Enumerable.Repeat(warpSize - 1, length * warpSize).ToArray();
    Verify(buffer, expected);
}
/// <summary>
/// Runs the warp-barrier kernel and checks that each element receives its
/// own linear index.
/// </summary>
public void WarpBarrier(int length)
{
    var warpSize = Accelerator.WarpSize;
    using (var buffer = Accelerator.Allocate <int>(length * warpSize))
    {
        var extent = new GroupedIndex(
            length,
            warpSize);
        Execute(extent, buffer.View);

        var expected = Enumerable.Range(0, length * warpSize).ToArray();
        Verify(buffer, expected);
    }
}
/// <summary>
/// Demonstrates the use of a group-wide barrier: all threads of a group
/// synchronize before classifying their element against the constant.
/// </summary>
static void GroupedKernelBarrier(
    GroupedIndex index,                 // The grouped thread index (1D in this case)
    ArrayView <int> dataView,           // A view to a chunk of memory (1D in this case)
    ArrayView <int> outputView,         // A view to a chunk of memory (1D in this case)
    int constant)                       // A sample uniform constant
{
    var globalIndex = index.ComputeGlobalIndex();

    // All threads of the group rendezvous here before proceeding.
    Group.Barrier();

    // Classify the in-range elements: 1 if above the constant, else 0.
    if (globalIndex >= dataView.Length)
        return;
    outputView[globalIndex] = dataView[globalIndex] > constant ? 1 : 0;
}
/// <summary>
/// Explicitly-grouped kernels receive an index type (first parameter) of type:
/// <see cref="GroupedIndex"/>, <see cref="GroupedIndex2"/> or <see cref="GroupedIndex3"/>.
/// These kernel types expose the underlying blocking/grouping semantics of a GPU
/// and allow for highly efficient implementation of kernels for different GPUs.
/// The semantics of theses kernels are equivalent to kernel implementations in CUDA.
/// An explicitly-grouped kernel can be loaded with:
/// - LoadImplicitlyGroupedKernel
/// - LoadAutoGroupedKernel.
///
/// Note that you must not use warp-shuffle functionality within implicitly grouped
/// kernels since not all lanes of a warp are guaranteed to participate in the warp shuffle.
/// </summary>
/// <param name="index">The current thread index.</param>
/// <param name="dataView">The view pointing to our memory buffer.</param>
/// <param name="constant">A nice uniform constant.</param>
static void GroupedKernel(
    GroupedIndex index,                 // The grouped thread index (1D in this case)
    ArrayView <int> dataView,           // A view to a chunk of memory (1D in this case)
    int constant)                       // A sample uniform constant
{
    // Flatten the grouped index into a global 1D position.
    var globalIndex = index.ComputeGlobalIndex();

    // Guard against out-of-range threads in the last group.
    if (globalIndex >= dataView.Length)
        return;
    dataView[globalIndex] = globalIndex + constant;

    // Note: this explicitly grouped kernel implements the same functionality
    // as MyKernel in the ImplicitlyGroupedKernels sample.
}
/// <summary>
/// Runs the group-barrier kernel over power-of-two group sizes and checks
/// that each element receives its own linear index.
/// </summary>
public void GroupBarrier(int length)
{
    // Sweep power-of-two group sizes, starting at 1.
    for (int groupSize = 1; groupSize < Accelerator.MaxNumThreadsPerGroup; groupSize <<= 1)
    {
        using var buffer = Accelerator.Allocate <int>(length * groupSize);
        var extent = new GroupedIndex(
            length,
            groupSize);
        Execute(extent, buffer.View);

        var expected = Enumerable.Range(0, length * groupSize).ToArray();
        Verify(buffer, expected);
    }
}
/// <summary>
/// Explicitly grouped kernels receive an index type (first parameter) of type:
/// <see cref="GroupedIndex"/>, <see cref="GroupedIndex2"/> or <see cref="GroupedIndex3"/>.
/// Note that you can use warp-shuffle functionality only within
/// explicitly-grouped kernels.
/// </summary>
/// <typeparam name="TShuffleOperation">The type of the shuffle operation.</typeparam>
/// <param name="index">The current thread index.</param>
/// <param name="dataView">The view pointing to our memory buffer.</param>
static void ShuffleDownKernel <TShuffleOperation>(
    GroupedIndex index,                 // The grouped thread index (1D in this case)
    ArrayView <long> dataView)          // A view to a chunk of memory (1D in this case)
    where TShuffleOperation : struct, IShuffleDown <long>
{
    // Flatten the grouped index into a global 1D position.
    var globalIndex = index.ComputeGlobalIndex();

    // Each lane contributes its group-relative index, shifted down by 2 lanes
    // via the user-provided shuffle strategy.
    long laneValue = index.GroupIdx;
    TShuffleOperation shuffle = default;
    dataView[globalIndex] = shuffle.ShuffleDown(laneValue, 2);
}
/// <summary>
/// Demonstrates a pre-defined warp-reduction functionality.
/// </summary>
/// <param name="index">The current thread index.</param>
/// <param name="dataView">The view pointing to our memory buffer.</param>
static void ReduceKernel(
    GroupedIndex index,                 // The grouped thread index (1D in this case)
    ArrayView <int> dataView)           // A view to a chunk of memory (1D in this case)
{
    // Flatten the grouped index into a global 1D position.
    var globalIndex = index.ComputeGlobalIndex();

    // Use native warp-reduce functionality to reduce all given
    // values in the scope of a single warp. Note that only
    // the first lane of a warp will contain the reduced value.
    // If all lanes should receive the reduced value,
    // use the Warp.AllReduce<...> function.
    dataView[globalIndex] = Warp.Reduce(
        1,
        new ShuffleDownInt32(),
        new AddInt32());
}
/// <summary>
/// Runs the grouped-index entry-point kernel over power-of-two group sizes
/// and checks that each element receives its own linear index.
/// </summary>
public void GroupedIndex1EntryPoint(int length)
{
    // Sweep power-of-two group sizes, starting at 1.
    for (int groupSize = 1; groupSize < Accelerator.MaxNumThreadsPerGroup; groupSize <<= 1)
    {
        var extent = new GroupedIndex(length, groupSize);
        using var buffer = Accelerator.Allocate <int>(extent.Size);
        Execute(extent, buffer.View, groupSize);

        // Build the expected linear index for each (grid, group) pair.
        var expected = new int[extent.Size];
        for (int grid = 0; grid < length; ++grid)
        {
            for (int group = 0; group < groupSize; ++group)
            {
                var idx = grid * groupSize + group;
                expected[idx] = idx;
            }
        }
        Verify(buffer, expected);
    }
}
/// <summary>
/// Demonstrates the use of a group-wide and-barrier: the output is 1 for every
/// in-range thread iff ALL threads of the group hold a value above the constant.
/// </summary>
static void GroupedKernelAndBarrier(
    GroupedIndex index,                 // The grouped thread index (1D in this case)
    ArrayView <int> dataView,           // A view to a chunk of memory (1D in this case)
    ArrayView <int> outputView,         // A view to a chunk of memory (1D in this case)
    int constant)                       // A sample uniform constant
{
    // Flatten the grouped index into a global 1D position.
    var globalIndex = index.ComputeGlobalIndex();

    // Out-of-range threads contribute constant + 1 so they never
    // invalidate the group-wide predicate.
    int value;
    if (globalIndex < dataView.Length)
        value = dataView[globalIndex];
    else
        value = constant + 1;

    // BarrierAnd synchronizes the group and returns true iff the predicate
    // evaluates to true on every thread of the group.
    var allAboveConstant = Group.BarrierAnd(value > constant);

    if (globalIndex < outputView.Length)
    {
        outputView[globalIndex] = allAboveConstant ? 1 : 0;
    }
}
/// <summary>
/// Launches a simple 1D kernel using the default explicit-grouping functionality.
/// </summary>
static void Main(string[] args)
{
    // Create main context
    using (var context = new Context())
    {
        // For each available accelerator...
        foreach (var acceleratorId in Accelerator.Accelerators)
        {
            // Create default accelerator for the given accelerator id
            using (var accelerator = Accelerator.Create(context, acceleratorId))
            {
                Console.WriteLine($"Performing operations on {accelerator}");
                var data = Enumerable.Range(1, 128).ToArray();

                var groupSize = accelerator.MaxNumThreadsPerGroup;
                var launchDimension = new GroupedIndex(
                    (data.Length + groupSize - 1) / groupSize,  // Compute the number of groups (round up)
                    groupSize);                                 // Use the given group size

                using (var dataSource = accelerator.Allocate <int>(data.Length))
                {
                    // Initialize data source
                    dataSource.CopyFrom(data, 0, 0, data.Length);

                    using (var dataTarget = accelerator.Allocate <int>(data.Length))
                    {
                        // Launch default grouped kernel
                        {
                            dataTarget.MemSetToZero();
                            var groupedKernel = accelerator.LoadStreamKernel <GroupedIndex, ArrayView <int>, int>(GroupedKernel);
                            groupedKernel(launchDimension, dataTarget.View, 64);
                            accelerator.Synchronize();

                            Console.WriteLine("Default grouped kernel");
                            var target = dataTarget.GetAsArray();
                            for (int i = 0, e = target.Length; i < e; ++i)
                            {
                                Console.WriteLine($"Data[{i}] = {target[i]}");
                            }
                        }

                        // Launch grouped kernel with barrier
                        {
                            dataTarget.MemSetToZero();
                            var groupedKernel = accelerator.LoadStreamKernel <GroupedIndex, ArrayView <int>, ArrayView <int>, int>(GroupedKernelBarrier);
                            // NOTE(review): dataSource (a buffer) is passed where a view
                            // parameter is declared — presumably via an implicit
                            // buffer-to-view conversion; confirm against the API.
                            groupedKernel(launchDimension, dataSource, dataTarget.View, 64);
                            accelerator.Synchronize();

                            Console.WriteLine("Grouped-barrier kernel");
                            var target = dataTarget.GetAsArray();
                            for (int i = 0, e = target.Length; i < e; ++i)
                            {
                                Console.WriteLine($"Data[{i}] = {target[i]}");
                            }
                        }

                        // Launch grouped kernel with and-barrier
                        {
                            dataTarget.MemSetToZero();
                            var groupedKernel = accelerator.LoadStreamKernel <GroupedIndex, ArrayView <int>, ArrayView <int>, int>(GroupedKernelAndBarrier);
                            groupedKernel(launchDimension, dataSource, dataTarget.View, 0);
                            accelerator.Synchronize();

                            Console.WriteLine("Grouped-and-barrier kernel");
                            var target = dataTarget.GetAsArray();
                            for (int i = 0, e = target.Length; i < e; ++i)
                            {
                                Console.WriteLine($"Data[{i}] = {target[i]}");
                            }
                        }

                        // Launch grouped kernel with or-barrier
                        {
                            dataTarget.MemSetToZero();
                            var groupedKernel = accelerator.LoadStreamKernel <GroupedIndex, ArrayView <int>, ArrayView <int>, int>(GroupedKernelOrBarrier);
                            groupedKernel(launchDimension, dataSource, dataTarget.View, 64);
                            accelerator.Synchronize();

                            Console.WriteLine("Grouped-or-barrier kernel");
                            var target = dataTarget.GetAsArray();
                            for (int i = 0, e = target.Length; i < e; ++i)
                            {
                                Console.WriteLine($"Data[{i}] = {target[i]}");
                            }
                        }

                        // Launch grouped kernel with popcount-barrier
                        {
                            dataTarget.MemSetToZero();
                            var groupedKernel = accelerator.LoadStreamKernel <GroupedIndex, ArrayView <int>, ArrayView <int>, int>(GroupedKernelPopCountBarrier);
                            groupedKernel(launchDimension, dataSource, dataTarget.View, 0);
                            accelerator.Synchronize();

                            Console.WriteLine("Grouped-popcount-barrier kernel");
                            var target = dataTarget.GetAsArray();
                            for (int i = 0, e = target.Length; i < e; ++i)
                            {
                                Console.WriteLine($"Data[{i}] = {target[i]}");
                            }
                        }
                    }
                }
            }
        }
    }
}