Ejemplo n.º 1
0
        /// <summary>
        /// Stages one value per thread in the shared-memory view and then stores the sum
        /// of the first <paramref name="x"/> staged values in the output view.
        /// </summary>
        /// <param name="index">The grouped thread index.</param>
        /// <param name="x">The number of leading shared-memory entries to add up.</param>
        /// <param name="input">The source elements; only their a16 member is read.</param>
        /// <param name="output">The target view, written at index.GridIdx.</param>
        /// <param name="tempData">Shared-memory scratch view declared with GroupSize elements.</param>
        static void FibonacciSharedKernel(
            GroupedIndex<Index> index,
            int x,
            ArrayView<Data> input,
            ArrayView<int> output,
            [SharedMemory(GroupSize)]
            ArrayView<int> tempData)
        {
            var targetIdx = index.GridIdx;
            var localIdx  = index.GroupIdx;

            // Every thread publishes its own element before anyone starts reading.
            tempData[localIdx] = input[localIdx].a16;
            index.Barrier();

            // NOTE(review): presumably x never exceeds GroupSize; otherwise the reads
            // below leave the declared shared-memory extent - confirm against the caller.
            int sum    = 0;
            int offset = 0;

            while (offset < x)
            {
                sum += tempData[offset];
                ++offset;
            }

            output[targetIdx] = sum;
        }
Ejemplo n.º 2
0
        /// <summary>
        /// Shows how a shared-memory array parameter can be used: every thread stores one
        /// element in shared memory and afterwards sums up all elements stored by its group.
        /// </summary>
        /// <param name="index">The current thread index.</param>
        /// <param name="dataView">The view that provides the input elements.</param>
        /// <param name="outputView">The view that receives the computed sums.</param>
        /// <param name="sharedArray">Implicit shared-memory parameter that is handled by the runtime.</param>
        static void SharedMemoryArrayKernel(
            GroupedIndex index,         // 1D grouped thread index
            ArrayView<int> dataView,    // 1D input view
            ArrayView<int> outputView,  // 1D output view

            [SharedMemory(128)]         // 128 int elements = 4 * 128 = 512 bytes
            ArrayView<int> sharedArray) // of shared memory per group
        {
            var globalIndex = index.ComputeGlobalIndex();

            // Threads outside the input range contribute a neutral 0.
            int loaded = 0;
            if (globalIndex < dataView.Length)
            {
                loaded = dataView[globalIndex];
            }

            // NOTE(review): assumes the group size does not exceed the 128 declared
            // shared-memory elements - confirm against the launch configuration.
            sharedArray[index.GroupIdx] = loaded;

            // All loads have to be finished before the summation starts.
            Group.Barrier();

            // Accumulate everything the group has stored.
            var groupSize = Group.Dimension.X;
            int total     = 0;

            for (var slot = 0; slot < groupSize; ++slot)
            {
                total += sharedArray[slot];
            }

            // Only threads with a valid output slot publish the result.
            if (globalIndex < outputView.Length)
            {
                outputView[globalIndex] = total;
            }
        }
        /// <summary>
        /// Reduction kernel: each thread folds the input elements it visits into a private
        /// value, the values are combined via GroupExtensions.Reduce, and the result is
        /// handed to a default-constructed TReductionFinalizer.
        /// </summary>
        /// <param name="index">The current thread index.</param>
        /// <param name="input">The elements to reduce.</param>
        /// <param name="output">The view passed on to the finalizer.</param>
        /// <param name="shuffleDown">The shuffle-down functionality forwarded to the group reduction.</param>
        /// <param name="reduction">The reduction logic (neutral element and reduce operation).</param>
        /// <param name="sharedMemory">Shared-memory scratch view (32 elements) forwarded to the group reduction.</param>
        private static void Kernel(
            GroupedIndex index,
            ArrayView<T> input,
            ArrayView<T> output,
            TShuffleDown shuffleDown,
            TReduction reduction,

            [SharedMemory(32)]
            ArrayView<T> sharedMemory)
        {
            var step        = GridExtensions.GridStrideLoopStride;
            var accumulated = reduction.NeutralElement;

            // Thread-local pass: start at the global index and advance by the stride.
            for (var position = index.ComputeGlobalIndex();
                 position < input.Length;
                 position += step)
            {
                accumulated = reduction.Reduce(accumulated, input[position]);
            }

            // Combine the thread-local values within the group.
            accumulated = GroupExtensions.Reduce(
                index.GroupIdx,
                accumulated,
                shuffleDown,
                reduction,
                sharedMemory);

            // The finalizer decides how the value ends up in the output view.
            var resultWriter = default(TReductionFinalizer);

            resultWriter.Finalize(index, output, accumulated, reduction);
        }
Ejemplo n.º 4
0
        /// <summary>
        /// Writes to each output slot its own position, where the position is computed as
        /// GridIdx.X * <paramref name="stride"/> + GroupIdx.X.
        /// </summary>
        /// <param name="index">The current thread index.</param>
        /// <param name="output">The view that receives the computed positions.</param>
        /// <param name="stride">The factor applied to the grid index.</param>
        internal static void GroupedIndex1EntryPointKernel(
            GroupedIndex index, ArrayView<int> output, int stride)
        {
            var groupOffset = index.GridIdx.X * stride;
            var position    = groupOffset + index.GroupIdx.X;

            output[position] = position;
        }
Ejemplo n.º 5
0
        /// <summary>
        /// Explicitly grouped kernels receive an index type (first parameter) of type:
        /// <see cref="GroupedIndex"/>, <see cref="GroupedIndex2"/> or <see cref="GroupedIndex3"/>.
        /// Shared memory is only supported in explicitly-grouped kernel contexts.
        /// Shared-memory parameters are handled automatically by the runtime and must be
        /// annotated with the SharedMemoryAttribute. Currently, the only supported
        /// shared-memory parameters are VariableViews and ArrayViews.
        /// This kernel resets a shared variable to 0, raises it via Atomic.Max with the
        /// in-range input elements and writes the resulting value to the output view.
        /// </summary>
        /// <param name="index">The current thread index.</param>
        /// <param name="dataView">The view that provides the input elements.</param>
        /// <param name="outputView">The view that receives the shared value.</param>
        /// <param name="sharedVariable">Implicit shared-memory parameter that is handled by the runtime.</param>
        static void SharedMemoryVariableKernel(
            GroupedIndex index,               // 1D grouped thread index
            ArrayView<int> dataView,          // 1D input view
            ArrayView<int> outputView,        // 1D output view

            [SharedMemory]                    // A single int variable in
            VariableView<int> sharedVariable) // shared memory (= 4 bytes)
        {
            var globalIndex = index.ComputeGlobalIndex();

            // The first thread of the group resets the shared variable ...
            if (index.GroupIdx.IsFirst)
            {
                sharedVariable.Value = 0;
            }

            // ... and everybody waits until the reset is in place.
            Group.Barrier();

            // Only threads that own a valid input element take part in the maximum.
            if (globalIndex < dataView.Length)
            {
                Atomic.Max(sharedVariable, dataView[globalIndex]);
            }

            // All contributions have to be finished before the value is read.
            Group.Barrier();

            // Publish the value of the shared variable in the output view.
            if (globalIndex < outputView.Length)
            {
                var maximum = sharedVariable.Value;
                outputView[globalIndex] = maximum;
            }
        }
Ejemplo n.º 6
0
        /// <summary>
        /// Stores for every thread the result of Warp.Broadcast applied to its group
        /// index, using Warp.WarpSize - 1 as the second argument.
        /// </summary>
        /// <param name="index">The current thread index.</param>
        /// <param name="data">The view that receives the broadcast results.</param>
        internal static void WarpBroadcastKernel(
            GroupedIndex index,
            ArrayView<int> data)
        {
            var groupOffset = index.GridIdx * Group.DimensionX;
            var target      = groupOffset + index.GroupIdx;

            var lastLane = Warp.WarpSize - 1;

            data[target] = Warp.Broadcast(index.GroupIdx.X, lastLane);
        }
Ejemplo n.º 7
0
 /// <summary>
 /// Placeholder for a scan kernel. The body is currently empty, so none of the
 /// parameters is read or written.
 /// </summary>
 /// <param name="index">The current thread index (unused so far).</param>
 /// <param name="input">The input view (unused so far).</param>
 /// <param name="output">The output view (unused so far).</param>
 /// <param name="shuffleDown">The shuffle-down functionality (unused so far).</param>
 /// <param name="scanOperation">The scan operation (unused so far).</param>
 private static void Kernel(
     GroupedIndex index,
     ArrayView <T> input,
     ArrayView <T> output,
     TShuffleDown shuffleDown,
     TScanOperation scanOperation)
 {
     // TODO: add final scan implementation
 }
Ejemplo n.º 8
0
        /// <summary>
        /// Stores 1 or 0 for every thread, depending on the result of Group.BarrierOr
        /// applied to the predicate "group index is below <paramref name="bound"/>".
        /// </summary>
        /// <param name="index">The current thread index.</param>
        /// <param name="data">The view that receives the 0/1 flags.</param>
        /// <param name="bound">The exclusive upper bound the group index is compared with.</param>
        internal static void GroupBarrierOrKernel(
            GroupedIndex index,
            ArrayView<int> data,
            Index bound)
        {
            var groupOffset = index.GridIdx * Group.DimensionX;
            var target      = groupOffset + index.GroupIdx;

            var belowBound = index.GroupIdx < bound;
            var anyBelow   = Group.BarrierOr(belowBound);

            data[target] = anyBelow ? 1 : 0;
        }
Ejemplo n.º 9
0
        /// <summary>
        /// Calls Warp.Barrier and afterwards stores in every slot its own position,
        /// computed as GridIdx * Group.DimensionX + GroupIdx.
        /// </summary>
        /// <param name="index">The current thread index.</param>
        /// <param name="data">The view that receives the computed positions.</param>
        internal static void WarpBarrierKernel(
            GroupedIndex index,
            ArrayView<int> data)
        {
            var groupOffset = index.GridIdx * Group.DimensionX;
            var position    = groupOffset + index.GroupIdx;

            Warp.Barrier();

            data[position] = position;
        }
Ejemplo n.º 10
0
        /// <summary>
        /// Explicitly grouped kernels receive an index type (first parameter) of type:
        /// <see cref="GroupedIndex"/>, <see cref="GroupedIndex2"/> or <see cref="GroupedIndex3"/>.
        /// Shared memory is only supported in explicitly-grouped kernel contexts and can be accessed
        /// via the static <see cref="ILGPU.SharedMemory"/> class.
        /// </summary>
        /// <param name="index">The current thread index.</param>
        /// <param name="dataView">The view pointing to our memory buffer.</param>
        static void SharedMemoryVariableKernel(
            GroupedIndex index,               // The grouped thread index (1D in this case)
            ArrayView <int> dataView,         // A view to a chunk of memory (1D in this case)
            ArrayView <int> outputView)       // A view to a chunk of memory (1D in this case)
        {
            // Compute the global 1D index for accessing the data view
            var globalIndex = index.ComputeGlobalIndex();

            // 'Allocate' a single shared memory variable of type int (= 4 bytes)
            ref int sharedVariable = ref ILGPU.SharedMemory.Allocate <int>();
Ejemplo n.º 11
0
        /// <summary>
        /// Stores in every slot its own global index and issues
        /// MemoryFence.GroupLevel afterwards.
        /// </summary>
        /// <param name="index">The current thread index.</param>
        /// <param name="data">The view that receives the global indices.</param>
        internal static void MemoryFenceGroupLevelKernel(
            GroupedIndex index,
            ArrayView<int> data)
        {
            var position = index.ComputeGlobalIndex();

            data[position] = position;

            // The fence is issued after the store above.
            MemoryFence.GroupLevel();
        }
Ejemplo n.º 12
0
        /// <summary>
        /// Verifies that executing with a group size of
        /// Accelerator.MaxNumThreadsPerGroup + 1 throws, and that the base exception is
        /// either a CudaException or a NotSupportedException.
        /// </summary>
        public void ExceedGroupSize()
        {
            var tooManyThreads = Accelerator.MaxNumThreadsPerGroup + 1;
            var launchExtent   = new GroupedIndex(2, tooManyThreads);

            Action launch = () => Execute(launchExtent, 0);

            var rootCause = launch.Should().Throw<Exception>()
                .Which.GetBaseException();

            rootCause.Should().Match(
                x => x is CudaException || x is NotSupportedException);
        }
Ejemplo n.º 13
0
 /// <summary>
 /// Runs a grid-stride loop by resolving the global index of the given grouped
 /// index and delegating to the overload that accepts this global index.
 /// </summary>
 /// <typeparam name="TLoopBody">The type of the loop body.</typeparam>
 /// <param name="index">The grouped index whose global index is the start index.</param>
 /// <param name="length">The global length.</param>
 /// <param name="loopBody">The loop body.</param>
 public static void GridStrideLoop<TLoopBody>(
     GroupedIndex index,
     Index length,
     ref TLoopBody loopBody)
     where TLoopBody : struct, IGridStrideLoopBody
 {
     var globalStart = index.ComputeGlobalIndex();

     GridStrideLoop(globalStart, length, ref loopBody);
 }
Ejemplo n.º 14
0
 /// <summary cref="IReductionFinalizer{T, TReduction}.Finalize(GroupedIndex, ArrayView{T}, T, TReduction)"/>
 public void Finalize(
     GroupedIndex index,
     ArrayView<T> output,
     T reducedValue,
     TReduction reduction)
 {
     // Only the first thread of a group contributes the reduced value.
     if (!index.GroupIdx.IsFirst)
     {
         return;
     }

     // Merge the value atomically into the variable view of the output.
     var target = output.GetVariableView();
     reduction.AtomicReduce(target, reducedValue);
 }
Ejemplo n.º 15
0
 /// <summary cref="IReductionFinalizer{T, TReduction}.Finalize(GroupedIndex, ArrayView{T}, T, TReduction)"/>
 public void Finalize(
     GroupedIndex index,
     ArrayView<T> output,
     T reducedValue,
     TReduction reduction)
 {
     // Only the first thread of a group stores the reduced value.
     if (!index.GroupIdx.IsFirst)
     {
         return;
     }

     // The value is stored at the grid index; the reduction itself is not used here.
     output[index.GridIdx] = reducedValue;
 }
Ejemplo n.º 16
0
        /// <summary>
        /// Executes the kernel under test for the group sizes 2, 4, 8, ... below
        /// Accelerator.MaxNumThreadsPerGroup and expects every buffer element to be
        /// equal to the group size minus one.
        /// </summary>
        /// <param name="length">The first component of the launch extent.</param>
        public void GroupBroadcast(int length)
        {
            for (var groupSize = 2;
                 groupSize < Accelerator.MaxNumThreadsPerGroup;
                 groupSize <<= 1)
            {
                using var buffer = Accelerator.Allocate<int>(length * groupSize);
                var launchExtent = new GroupedIndex(length, groupSize);
                Execute(launchExtent, buffer.View);

                var lastGroupIndex = groupSize - 1;
                var expected       = Enumerable.Repeat(lastGroupIndex, buffer.Length).ToArray();
                Verify(buffer, expected);
            }
        }
Ejemplo n.º 17
0
        /// <summary>
        /// Executes the kernel under test for the group sizes 1, 2, 4, ... below
        /// Accelerator.MaxNumThreadsPerGroup and expects the buffer to contain the
        /// ascending sequence 0, 1, 2, ... over the whole extent.
        /// </summary>
        public void MemoryFenceGroupLevel()
        {
            for (var groupSize = 1;
                 groupSize < Accelerator.MaxNumThreadsPerGroup;
                 groupSize <<= 1)
            {
                var launchExtent = new GroupedIndex(Length, groupSize);
                var elementCount = launchExtent.Size;

                using var buffer = Accelerator.Allocate<int>(elementCount);
                Execute(launchExtent, buffer.View);

                var expected = Enumerable.Range(0, elementCount).ToArray();
                Verify(buffer, expected);
            }
        }
Ejemplo n.º 18
0
        /// <summary>
        /// Runs the int-based and the generic long-based shuffle-down kernels on every
        /// available accelerator and prints the contents of the result buffers.
        /// </summary>
        static void Main()
        {
            using (var context = new Context())
            {
                // Visit all accelerators that are available
                foreach (var id in Accelerator.Accelerators)
                {
                    using (var accelerator = Accelerator.Create(context, id))
                    {
                        Console.WriteLine($"Performing operations on {accelerator}");

                        // Launch extent built from 1 and the warp size of the accelerator
                        var dimension = new GroupedIndex(1, accelerator.WarpSize);

                        // Int-based shuffle-down kernel
                        using (var intTarget = accelerator.Allocate<int>(accelerator.WarpSize))
                        {
                            var shuffleDownKernel =
                                accelerator.LoadStreamKernel<GroupedIndex, ArrayView<int>>(
                                    ShuffleDownKernel);
                            intTarget.MemSetToZero();

                            shuffleDownKernel(dimension, intTarget.View);
                            accelerator.Synchronize();

                            Console.WriteLine("Shuffle-down kernel");
                            var values = intTarget.GetAsArray();
                            for (var i = 0; i < values.Length; ++i)
                            {
                                Console.WriteLine($"Data[{i}] = {values[i]}");
                            }
                        }

                        // Generic shuffle-down kernel operating on long values
                        using (var longTarget = accelerator.Allocate<long>(accelerator.WarpSize))
                        {
                            var genericShuffleDownKernel =
                                accelerator.LoadStreamKernel<GroupedIndex, ArrayView<long>>(
                                    ShuffleDownKernel<ShuffleDownInt64>);
                            longTarget.MemSetToZero();

                            genericShuffleDownKernel(dimension, longTarget.View);
                            accelerator.Synchronize();

                            Console.WriteLine("Generic shuffle-down kernel");
                            var values = longTarget.GetAsArray();
                            for (var i = 0; i < values.Length; ++i)
                            {
                                Console.WriteLine($"Data[{i}] = {values[i]}");
                            }
                        }
                    }
                }
            }
        }
Ejemplo n.º 19
0
 /// <summary>
 /// Runs a functional grid-stride loop by resolving the global index of the given
 /// grouped index and delegating to the overload that accepts this global index.
 /// </summary>
 /// <typeparam name="T">The element type of the intermediate values.</typeparam>
 /// <typeparam name="TLoopBody">The type of the loop body.</typeparam>
 /// <param name="index">The grouped index whose global index is the start index.</param>
 /// <param name="length">The global length.</param>
 /// <param name="input">The initial input value.</param>
 /// <param name="loopBody">The loop body.</param>
 /// <returns>The value returned by the delegated loop.</returns>
 public static T GridStrideLoop<T, TLoopBody>(
     GroupedIndex index,
     Index length,
     T input,
     TLoopBody loopBody)
     where T : struct
     where TLoopBody : struct, IGridStrideLoopBody<T>
 {
     var globalStart = index.ComputeGlobalIndex();

     return GridStrideLoop(globalStart, length, input, loopBody);
 }
Ejemplo n.º 20
0
        /// <summary>
        /// Explicitly grouped kernels receive an index type (first parameter) of type:
        /// <see cref="GroupedIndex"/>, <see cref="GroupedIndex2"/> or <see cref="GroupedIndex3"/>.
        /// Note that warp-shuffle functionality can only be used within
        /// explicitly-grouped kernels.
        /// This kernel passes the group index of every thread through Warp.ShuffleDown
        /// with a delta of 2 and stores the result at the global index.
        /// </summary>
        /// <param name="index">The current thread index.</param>
        /// <param name="dataView">The view that receives the shuffled values.</param>
        static void ShuffleDownKernel(
            GroupedIndex index,       // 1D grouped thread index
            ArrayView<int> dataView)  // 1D output view
        {
            var globalIndex = index.ComputeGlobalIndex();

            // Every thread contributes its own group index ...
            int laneValue = index.GroupIdx;

            // ... and receives the value delivered by the native shuffle-down (delta 2).
            int shuffled = Warp.ShuffleDown(laneValue, 2);

            dataView[globalIndex] = shuffled;
        }
Ejemplo n.º 21
0
        /// <summary>
        /// Executes the kernel under test with a group size equal to the warp size and
        /// expects every buffer element to be equal to the warp size minus one.
        /// </summary>
        /// <param name="length">The first component of the launch extent.</param>
        public void WarpBroadcast(int length)
        {
            var warpSize = Accelerator.WarpSize;

            using (var buffer = Accelerator.Allocate<int>(length * warpSize))
            {
                var launchExtent = new GroupedIndex(length, warpSize);
                Execute(launchExtent, buffer.View);

                var lastLane = warpSize - 1;
                var expected = Enumerable.Repeat(lastLane, length * warpSize).ToArray();
                Verify(buffer, expected);
            }
        }
Ejemplo n.º 22
0
        /// <summary>
        /// Executes the kernel under test with a group size equal to the warp size and
        /// expects the buffer to contain the ascending sequence 0, 1, 2, ...
        /// </summary>
        /// <param name="length">The first component of the launch extent.</param>
        public void WarpBarrier(int length)
        {
            var warpSize = Accelerator.WarpSize;

            using var buffer = Accelerator.Allocate<int>(length * warpSize);
            var launchExtent = new GroupedIndex(length, warpSize);

            Execute(launchExtent, buffer.View);

            var expected = Enumerable.Range(0, length * warpSize).ToArray();
            Verify(buffer, expected);
        }
Ejemplo n.º 23
0
        /// <summary>
        /// Shows a group-wide barrier: after the barrier every in-range thread stores 1
        /// if its input element is greater than the constant and 0 otherwise.
        /// </summary>
        /// <param name="index">The current thread index.</param>
        /// <param name="dataView">The view that provides the input elements.</param>
        /// <param name="outputView">The view that receives the 0/1 flags.</param>
        /// <param name="constant">The value the input elements are compared with.</param>
        static void GroupedKernelBarrier(
            GroupedIndex index,         // 1D grouped thread index
            ArrayView<int> dataView,    // 1D input view
            ArrayView<int> outputView,  // 1D output view
            int constant)               // Uniform comparison constant
        {
            var globalIndex = index.ComputeGlobalIndex();

            // Every thread of the group has to arrive here before anyone continues
            Group.Barrier();

            if (globalIndex < dataView.Length)
            {
                var exceedsConstant = dataView[globalIndex] > constant;

                outputView[globalIndex] = exceedsConstant ? 1 : 0;
            }
        }
Ejemplo n.º 24
0
        /// <summary>
        /// Explicitly-grouped kernels receive an index type (first parameter) of type:
        /// <see cref="GroupedIndex"/>, <see cref="GroupedIndex2"/> or <see cref="GroupedIndex3"/>.
        /// These kernel types expose the underlying blocking/grouping semantics of a GPU
        /// and allow for highly efficient implementations of kernels for different GPUs.
        /// The semantics of these kernels are equivalent to kernel implementations in CUDA.
        /// The previous version of this comment named LoadImplicitlyGroupedKernel and
        /// LoadAutoGroupedKernel as the loaders for explicitly-grouped kernels.
        /// NOTE(review): those names suggest implicitly grouped kernels - confirm which
        /// loader is the intended one for this kernel.
        ///
        /// Note that you must not use warp-shuffle functionality within implicitly grouped
        /// kernels since not all lanes of a warp are guaranteed to participate in the warp shuffle.
        ///
        /// This kernel stores globalIndex + constant in every in-range element.
        /// </summary>
        /// <param name="index">The current thread index.</param>
        /// <param name="dataView">The view pointing to our memory buffer.</param>
        /// <param name="constant">The uniform value added to the global index.</param>
        static void GroupedKernel(
            GroupedIndex index,       // 1D grouped thread index
            ArrayView<int> dataView,  // 1D view that is written
            int constant)             // Uniform constant
        {
            var globalIndex = index.ComputeGlobalIndex();

            // Threads beyond the end of the view have nothing to do
            if (globalIndex < dataView.Length)
            {
                dataView[globalIndex] = globalIndex + constant;
            }

            // Note: according to the original sample, this explicitly grouped kernel
            // implements the same functionality as MyKernel in the
            // ImplicitlyGroupedKernels sample.
        }
Ejemplo n.º 25
0
        /// <summary>
        /// Executes the kernel under test for the group sizes 1, 2, 4, ... below
        /// Accelerator.MaxNumThreadsPerGroup and expects the buffer to contain the
        /// ascending sequence 0, 1, 2, ...
        /// </summary>
        /// <param name="length">The first component of the launch extent.</param>
        public void GroupBarrier(int length)
        {
            for (var groupSize = 1;
                 groupSize < Accelerator.MaxNumThreadsPerGroup;
                 groupSize <<= 1)
            {
                using (var buffer = Accelerator.Allocate<int>(length * groupSize))
                {
                    var launchExtent = new GroupedIndex(length, groupSize);
                    Execute(launchExtent, buffer.View);

                    var expected = Enumerable.Range(0, length * groupSize).ToArray();
                    Verify(buffer, expected);
                }
            }
        }
Ejemplo n.º 26
0
        /// <summary>
        /// Explicitly grouped kernels receive an index type (first parameter) of type:
        /// <see cref="GroupedIndex"/>, <see cref="GroupedIndex2"/> or <see cref="GroupedIndex3"/>.
        /// Note that warp-shuffle functionality can only be used within
        /// explicitly-grouped kernels.
        /// This kernel passes the group index of every thread through the ShuffleDown
        /// method of a default-constructed shuffle operation (delta 2) and stores the
        /// result at the global index.
        /// </summary>
        /// <typeparam name="TShuffleOperation">The type of the shuffle operation.</typeparam>
        /// <param name="index">The current thread index.</param>
        /// <param name="dataView">The view that receives the shuffled values.</param>
        static void ShuffleDownKernel<TShuffleOperation>(
            GroupedIndex index,        // 1D grouped thread index
            ArrayView<long> dataView)  // 1D output view
            where TShuffleOperation : struct, IShuffleDown<long>
        {
            var globalIndex = index.ComputeGlobalIndex();

            // The custom shuffle logic is a struct and can simply be default-constructed
            var shuffle = default(TShuffleOperation);

            // Every thread contributes its own group index ...
            long laneValue = index.GroupIdx;

            // ... and receives the value delivered by the custom shuffle-down (delta 2).
            long shuffled = shuffle.ShuffleDown(laneValue, 2);

            dataView[globalIndex] = shuffled;
        }
Ejemplo n.º 27
0
        /// <summary>
        /// Shows the pre-defined warp-reduction functionality: every thread contributes
        /// the value 1 to Warp.Reduce and stores the returned value at its global index.
        /// </summary>
        /// <param name="index">The current thread index.</param>
        /// <param name="dataView">The view that receives the reduction results.</param>
        static void ReduceKernel(
            GroupedIndex index,       // 1D grouped thread index
            ArrayView<int> dataView)  // 1D output view
        {
            var globalIndex = index.ComputeGlobalIndex();

            // Building blocks of the reduction: how to shuffle and how to combine
            var shuffleDown = new ShuffleDownInt32();
            var addition    = new AddInt32();

            // Reduce all contributed values in the scope of a single warp. Note that
            // only the first lane of a warp will contain the reduced value. Use the
            // Warp.AllReduce<...> function if all lanes should receive it.
            var reduced = Warp.Reduce(1, shuffleDown, addition);

            dataView[globalIndex] = reduced;
        }
Ejemplo n.º 28
0
        /// <summary>
        /// Executes the kernel under test for the group sizes 1, 2, 4, ... below
        /// Accelerator.MaxNumThreadsPerGroup, passing the group size as additional
        /// argument, and expects every element at position p &lt; length * groupSize to
        /// contain p.
        /// </summary>
        /// <param name="length">The first component of the launch extent.</param>
        public void GroupedIndex1EntryPoint(int length)
        {
            for (var groupSize = 1;
                 groupSize < Accelerator.MaxNumThreadsPerGroup;
                 groupSize <<= 1)
            {
                var launchExtent = new GroupedIndex(length, groupSize);
                using var buffer = Accelerator.Allocate<int>(launchExtent.Size);
                Execute(launchExtent, buffer.View, groupSize);

                var expected = new int[launchExtent.Size];

                // group * groupSize + lane enumerates 0 .. length * groupSize - 1 in
                // ascending order, so a single flat loop fills the same slots.
                var filledCount = length * groupSize;
                for (var position = 0; position < filledCount; ++position)
                {
                    expected[position] = position;
                }

                Verify(buffer, expected);
            }
        }
Ejemplo n.º 29
0
        /// <summary>
        /// Shows a group-wide and-barrier: every in-range thread stores 1 if
        /// Group.BarrierAnd reports that the predicate "value &gt; constant" holds and 0
        /// otherwise.
        /// </summary>
        /// <param name="index">The current thread index.</param>
        /// <param name="dataView">The view that provides the input elements.</param>
        /// <param name="outputView">The view that receives the 0/1 flags.</param>
        /// <param name="constant">The value the input elements are compared with.</param>
        static void GroupedKernelAndBarrier(
            GroupedIndex index,         // 1D grouped thread index
            ArrayView<int> dataView,    // 1D input view
            ArrayView<int> outputView,  // 1D output view
            int constant)               // Uniform comparison constant
        {
            var globalIndex = index.ComputeGlobalIndex();

            // Out-of-range threads use constant + 1, which satisfies the predicate below
            int candidate;
            if (globalIndex < dataView.Length)
            {
                candidate = dataView[globalIndex];
            }
            else
            {
                candidate = constant + 1;
            }

            // BarrierAnd waits for all threads of the group and, as described by the
            // original sample, returns true iff the predicate is true for all of them.
            var allAbove = Group.BarrierAnd(candidate > constant);

            if (globalIndex < outputView.Length)
            {
                outputView[globalIndex] = allAbove ? 1 : 0;
            }
        }
Ejemplo n.º 30
0
        /// <summary>
        /// Runs the grouped sample kernels (default, barrier, and-barrier, or-barrier and
        /// popcount-barrier) on every available accelerator and prints the contents of
        /// the target buffer after each launch.
        /// </summary>
        /// <param name="args">The command-line arguments (not used).</param>
        static void Main(string[] args)
        {
            using (var context = new Context())
            {
                // Visit all accelerators that are available
                foreach (var id in Accelerator.Accelerators)
                {
                    using (var accelerator = Accelerator.Create(context, id))
                    {
                        Console.WriteLine($"Performing operations on {accelerator}");

                        var data = Enumerable.Range(1, 128).ToArray();

                        // Use the maximum group size and round the number of groups up
                        var groupSize       = accelerator.MaxNumThreadsPerGroup;
                        var numGroups       = (data.Length + groupSize - 1) / groupSize;
                        var launchDimension = new GroupedIndex(numGroups, groupSize);

                        using (var dataSource = accelerator.Allocate<int>(data.Length))
                        {
                            // Upload the input values
                            dataSource.CopyFrom(data, 0, 0, data.Length);

                            using (var dataTarget = accelerator.Allocate<int>(data.Length))
                            {
                                // Prints a title followed by the current target contents
                                Action<string> printTarget = title =>
                                {
                                    Console.WriteLine(title);
                                    var values = dataTarget.GetAsArray();
                                    for (var i = 0; i < values.Length; ++i)
                                    {
                                        Console.WriteLine($"Data[{i}] = {values[i]}");
                                    }
                                };

                                // Default grouped kernel
                                dataTarget.MemSetToZero();
                                var defaultKernel =
                                    accelerator.LoadStreamKernel<GroupedIndex, ArrayView<int>, int>(
                                        GroupedKernel);
                                defaultKernel(launchDimension, dataTarget.View, 64);
                                accelerator.Synchronize();
                                printTarget("Default grouped kernel");

                                // Grouped kernel with barrier
                                dataTarget.MemSetToZero();
                                var barrierKernel =
                                    accelerator.LoadStreamKernel<GroupedIndex, ArrayView<int>, ArrayView<int>, int>(
                                        GroupedKernelBarrier);
                                barrierKernel(launchDimension, dataSource, dataTarget.View, 64);
                                accelerator.Synchronize();
                                printTarget("Grouped-barrier kernel");

                                // Grouped kernel with and-barrier
                                dataTarget.MemSetToZero();
                                var andBarrierKernel =
                                    accelerator.LoadStreamKernel<GroupedIndex, ArrayView<int>, ArrayView<int>, int>(
                                        GroupedKernelAndBarrier);
                                andBarrierKernel(launchDimension, dataSource, dataTarget.View, 0);
                                accelerator.Synchronize();
                                printTarget("Grouped-and-barrier kernel");

                                // Grouped kernel with or-barrier
                                dataTarget.MemSetToZero();
                                var orBarrierKernel =
                                    accelerator.LoadStreamKernel<GroupedIndex, ArrayView<int>, ArrayView<int>, int>(
                                        GroupedKernelOrBarrier);
                                orBarrierKernel(launchDimension, dataSource, dataTarget.View, 64);
                                accelerator.Synchronize();
                                printTarget("Grouped-or-barrier kernel");

                                // Grouped kernel with popcount-barrier
                                dataTarget.MemSetToZero();
                                var popCountBarrierKernel =
                                    accelerator.LoadStreamKernel<GroupedIndex, ArrayView<int>, ArrayView<int>, int>(
                                        GroupedKernelPopCountBarrier);
                                popCountBarrierKernel(launchDimension, dataSource, dataTarget.View, 0);
                                accelerator.Synchronize();
                                printTarget("Grouped-popcount-barrier kernel");
                            }
                        }
                    }
                }
            }
        }