/// <summary>
/// Entry point: for every available accelerator, launches the three
/// specialized-kernel scenarios (plain int, custom structure, generic long).
/// </summary>
static void Main()
{
    using (var context = new Context())
    {
        // Visit every accelerator that is reported as available.
        foreach (var acceleratorId in Accelerator.Accelerators)
        {
            // Create the default accelerator for this id; it is disposed at the end of the iteration.
            using (var device = Accelerator.Create(context, acceleratorId))
            {
                Console.WriteLine($"Performing operations on {device}");
                int threadsPerGroup = device.MaxNumThreadsPerGroup;

                // Scenario 1: simple version - one launch per specialized int value.
                using (var intBuffer = device.Allocate<int>(threadsPerGroup))
                {
                    var intKernel = device.LoadStreamKernel<
                        ArrayView<int>,
                        SpecializedValue<int>>(SpecializedKernel);

                    foreach (var value in new[] { 2, 23, 42 })
                    {
                        intKernel((1, threadsPerGroup), intBuffer.View, SpecializedValue.New(value));
                    }
                }

                // Scenario 2: custom structure used as the specialized value.
                using (var structBuffer = device.Allocate<int>(threadsPerGroup))
                {
                    var structKernel = device.LoadStreamKernel<
                        ArrayView<int>,
                        SpecializedValue<CustomStruct>>(SpecializedCustomStructKernel);

                    structKernel(
                        (1, threadsPerGroup),
                        structBuffer.View,
                        SpecializedValue.New(new CustomStruct(1, 7)));
                    structKernel(
                        (1, threadsPerGroup),
                        structBuffer.View,
                        SpecializedValue.New(new CustomStruct(23, 42)));
                }

                // Scenario 3: generic kernel instantiated for long.
                using (var longBuffer = device.Allocate<long>(threadsPerGroup))
                {
                    var longKernel = device.LoadStreamKernel<
                        ArrayView<long>,
                        SpecializedValue<long>>(SpecializedGenericKernel);

                    foreach (var value in new[] { 23L, 42L })
                    {
                        longKernel((1, threadsPerGroup), longBuffer.View, SpecializedValue.New(value));
                    }
                }
            }
        }
    }
}
/// <summary>
/// Launches <c>MultiplyUInt128Kernel</c> over a 1024-element UInt128 buffer and
/// prints every resulting element. The kernel demonstrates the mul.hi.u64 and
/// mul.lo.u64 inline PTX instructions, which multiply two UInt64 values to
/// produce a UInt128 value.
/// </summary>
/// <param name="accelerator">The Cuda accelerator used for allocation and kernel launch.</param>
static void MultiplyUInt128(CudaAccelerator accelerator)
{
    using var products = accelerator.Allocate1D<UInt128>(1024);

    var multiply = accelerator.LoadAutoGroupedStreamKernel<
        Index1D,
        ArrayView<UInt128>,
        SpecializedValue<ulong>>(MultiplyUInt128Kernel);

    // One kernel index per buffer element; ulong.MaxValue is passed as the specialized operand.
    int extent = (int)products.Length;
    multiply(extent, products.View, SpecializedValue.New(ulong.MaxValue));

    // Fetch the buffer contents to the host and print them in index order.
    var hostValues = products.GetAsArray1D();
    for (var index = 0; index < hostValues.Length; index++)
    {
        Console.WriteLine($"[{index}] = {hostValues[index]}");
    }
}
/// <summary>
/// Runs <paramref name="kernelFunc"/> on the given accelerator, reports its
/// timing under <paramref name="name"/>, and copies the n-by-n result back
/// into <paramref name="mSquaredDistances"/>.
/// </summary>
/// <param name="gpu">Accelerator used for allocation, launch and synchronization.</param>
/// <param name="mSquaredDistances">Host array that receives the n-by-n result.</param>
/// <param name="mCoordinates">Host coordinate data used to allocate the device input buffer.</param>
/// <param name="c">Passed to the kernel as a specialized value and to the performance report (presumably the coordinate dimension; confirm against callers).</param>
/// <param name="n">Extent of each axis of the result buffer; also passed to the kernel as a regular argument.</param>
/// <param name="name">Label handed to the performance report.</param>
/// <param name="kernelFunc">Kernel to launch.</param>
private static void IlGpuOptimisedImpl(
    CudaAccelerator gpu,
    Real[] mSquaredDistances,
    Real[] mCoordinates,
    int c,
    int n,
    string name,
    Action<ArrayView2D<Real>, ArrayView<Real>, SpecializedValue<int>, SpecializedValue<int>, int> kernelFunc)
{
    // Threads per group along X; also handed to the kernel as a specialized value.
    const int blockSize = 128;

    using var deviceDistances = gpu.Allocate<Real>(n, n);
    using var deviceCoordinates = gpu.Allocate(mCoordinates);

    // The timer starts after the device buffers exist, so allocation is not measured.
    var stopwatch = Stopwatch.StartNew();

    // Grid of (blocksPerAxis x blocksPerAxis x 1) groups, each (blockSize x 1 x 1) threads.
    var blocksPerAxis = Util.DivUp(n, blockSize);
    var launchConfig = ((blocksPerAxis, blocksPerAxis, 1), (blockSize, 1, 1));

    gpu.Launch(
        kernelFunc,
        gpu.DefaultStream,
        launchConfig,
        deviceDistances.View,
        deviceCoordinates.View,
        SpecializedValue.New(blockSize),
        SpecializedValue.New(c),
        n);

    // Wait for the accelerator before reading the timer.
    gpu.Synchronize();

    Util.PrintPerformance(stopwatch, name, n, c, n);

    deviceDistances.CopyTo(mSquaredDistances, (0, 0), 0, (n, n));
}