/// <summary>
/// Runs <c>IlGpuKernel</c> on the given accelerator for the AddVector benchmark and
/// copies the device matrix back into <paramref name="matrix"/>.
/// </summary>
/// <param name="gpu">Accelerator used for allocation, launch and synchronization.</param>
/// <param name="matrix">Host matrix data; overwritten with the device contents after the kernel ran.</param>
/// <param name="vector">Host vector data handed to the kernel.</param>
/// <param name="m">Extent covered by the Y dimension of the launch grid (presumably the row count - confirm against the kernel).</param>
/// <param name="n">Extent covered by the X dimension of the launch grid (presumably the column count - confirm against the kernel).</param>
public static void IlGpu(CudaAccelerator gpu, Real[] matrix, Real[] vector, int m, int n)
{
    // Thread-block shape used for the launch: 32 threads in X, 8 threads in Y.
    const int threadsX = 32;
    const int threadsY = 8;

    using (var deviceMatrix = gpu.Allocate(matrix))
    using (var deviceVector = gpu.Allocate(vector))
    {
        // The stopwatch starts after allocation, so only launch setup,
        // kernel execution and synchronization are measured.
        var stopwatch = Stopwatch.StartNew();

        // Round the grid up so that every element in n (X) and m (Y) is covered.
        var blocksX = Util.DivUp(n, threadsX);
        var blocksY = Util.DivUp(m, threadsY);
        var launchParameters = ((blocksX, blocksY, 1), (threadsX, threadsY));

        gpu.Launch(
            IlGpuKernel,
            gpu.DefaultStream,
            launchParameters,
            deviceMatrix.View,
            deviceVector.View,
            m,
            n);

        // Block until the kernel has finished before reporting the timing.
        gpu.Synchronize();

        Util.PrintPerformance(stopwatch, "AddVector.IlGpu", 3, m, n);

        // Bring the result back to the caller's array.
        deviceMatrix.CopyTo(matrix, 0, 0, matrix.Length);
    }
}
/// <summary>
/// Runs the <c>MatrixMulShared</c> kernel on a CUDA accelerator for two N x N matrices
/// and returns the flattened result.
/// </summary>
/// <param name="a">Left operand; flattened with <c>FlatternArr</c> before upload.</param>
/// <param name="b">Right operand; flattened with <c>FlatternArr</c> before upload.</param>
/// <param name="N">Matrix dimension; each device buffer holds <c>N * N</c> elements.</param>
/// <param name="sw">Stopwatch that is restarted right before the launch and stopped after synchronization.</param>
/// <returns>The contents of the output device buffer as a host array of length <c>N * N</c>.</returns>
/// <remarks>
/// The context, the accelerator and all device buffers are disposed before returning
/// (also when an exception is thrown), so repeated calls do not leak native resources.
/// </remarks>
public static float[] RunMatrixMulShared(float[][] a, float[][] b, int N, ref Stopwatch sw)
{
    // Create context and accelerator; both are released when the method exits.
    using (var context = new Context())
    using (var gpu = new CudaAccelerator(context))
    {
        // Create typed launcher
        var matrixMulKernelShared = gpu.LoadStreamKernel<
            ArrayView<float>,
            ArrayView<float>,
            ArrayView<float>,
            int>(MatrixMulShared);

        // Allocate device memory; the using blocks free it even if a copy or the launch fails.
        var buffSize = N * N;
        using (MemoryBuffer<float> d_a = gpu.Allocate<float>(buffSize))
        using (MemoryBuffer<float> d_b = gpu.Allocate<float>(buffSize))
        using (MemoryBuffer<float> d_c = gpu.Allocate<float>(buffSize))
        {
            d_a.CopyFrom(FlatternArr(a), 0, Index1.Zero, buffSize);
            d_b.CopyFrom(FlatternArr(b), 0, Index1.Zero, buffSize);

            // Groups per grid dimension, rounded up so the whole matrix is covered.
            int GrPerDim = (int)Math.Ceiling((float)N / groupSize);
            KernelConfig dimension = (
                new Index2(GrPerDim, GrPerDim),   // Number of groups
                new Index2(groupSize, groupSize)); // Group size (thread count in group)

            // Only the launch and the wait for completion are timed.
            sw.Restart();
            matrixMulKernelShared(dimension, d_a.View, d_b.View, d_c.View, N);

            // Wait for the kernel to finish...
            gpu.Synchronize();
            sw.Stop();

            // Copy the result to the host before the buffers are disposed.
            return d_c.GetAsArray();
        }
    }
}
/// <summary>
/// Runs <c>IlGpuKernel</c> on the given accelerator for the SquaredDistance benchmark and
/// copies the device result back into <paramref name="mSquaredDistances"/>.
/// </summary>
/// <param name="gpu">Accelerator used for allocation, launch and synchronization.</param>
/// <param name="mSquaredDistances">Host array for the output; overwritten with the device contents.</param>
/// <param name="mCoordinates">Host coordinate data handed to the kernel.</param>
/// <param name="c">Forwarded to the kernel unchanged (presumably the number of coordinate components - confirm against the kernel).</param>
/// <param name="n">Forwarded to the kernel; the launch covers <c>n * n</c> threads rounded up to whole blocks.</param>
public static void IlGpu(
    CudaAccelerator gpu,
    Real[] mSquaredDistances,
    Real[] mCoordinates,
    int c,
    int n)
{
    // Threads per block for the one-dimensional launch.
    const int threadsPerBlock = 128;

    using (var deviceDistances = gpu.Allocate(mSquaredDistances))
    using (var deviceCoordinates = gpu.Allocate(mCoordinates))
    {
        // Timing starts after allocation.
        var stopwatch = Stopwatch.StartNew();

        // Enough blocks to cover n * n work items.
        var blockCount = Util.DivUp(n * n, threadsPerBlock);
        var launchParameters = (blockCount, threadsPerBlock);

        gpu.Launch(
            IlGpuKernel,
            gpu.DefaultStream,
            launchParameters,
            deviceDistances.View,
            deviceCoordinates.View,
            c,
            n);

        // Block until the kernel has finished before reporting the timing.
        gpu.Synchronize();

        Util.PrintPerformance(stopwatch, "SquaredDistance.IlGpu", n, c, n);

        // Bring the result back to the caller's array.
        deviceDistances.CopyTo(mSquaredDistances, 0, 0, mSquaredDistances.Length);
    }
}
/// <summary>
/// Shared driver for the optimised SquaredDistance kernels: allocates an n x n output
/// buffer, launches <paramref name="kernelFunc"/> on a two-dimensional grid, reports the
/// timing under <paramref name="name"/> and copies the result to the host.
/// </summary>
/// <param name="gpu">Accelerator used for allocation, launch and synchronization.</param>
/// <param name="mSquaredDistances">Host array receiving the n x n result.</param>
/// <param name="mCoordinates">Host coordinate data handed to the kernel.</param>
/// <param name="c">Passed to the kernel as a specialized value.</param>
/// <param name="n">Extent of both output dimensions; also passed to the kernel.</param>
/// <param name="name">Label forwarded to <c>Util.PrintPerformance</c>.</param>
/// <param name="kernelFunc">Kernel to launch; receives the output view, the coordinate view, the specialized block size, the specialized <paramref name="c"/> and <paramref name="n"/>.</param>
private static void IlGpuOptimisedImpl(
    CudaAccelerator gpu,
    Real[] mSquaredDistances,
    Real[] mCoordinates,
    int c,
    int n,
    string name,
    Action<ArrayView2D<Real>, ArrayView<Real>, SpecializedValue<int>, SpecializedValue<int>, int> kernelFunc)
{
    // Threads per block along X; also handed to the kernel as a specialized constant.
    const int threadsPerBlock = 128;

    // The output buffer is allocated by extent only; it is not seeded from mSquaredDistances.
    using (var deviceDistances = gpu.Allocate<Real>(n, n))
    using (var deviceCoordinates = gpu.Allocate(mCoordinates))
    {
        // Timing starts after allocation.
        var stopwatch = Stopwatch.StartNew();

        // The same rounded-up block count is used for both grid dimensions.
        var blocksPerDimension = Util.DivUp(n, threadsPerBlock);
        var launchParameters = (
            (blocksPerDimension, blocksPerDimension, 1),
            (threadsPerBlock, 1, 1));

        gpu.Launch(
            kernelFunc,
            gpu.DefaultStream,
            launchParameters,
            deviceDistances.View,
            deviceCoordinates.View,
            SpecializedValue.New(threadsPerBlock),
            SpecializedValue.New(c),
            n);

        // Block until the kernel has finished before reporting the timing.
        gpu.Synchronize();

        Util.PrintPerformance(stopwatch, name, n, c, n);

        // Copy the full n x n extent back into the flat host array.
        deviceDistances.CopyTo(mSquaredDistances, (0, 0), 0, (n, n));
    }
}
/// <summary>
/// Micro-benchmark: for every width/height pair, launches <c>MathKernel</c>
/// <c>size</c> times from a <see cref="Parallel.For(int, int, Action{int})"/> loop,
/// allocating a fresh 2D device buffer per iteration, and prints the elapsed wall time.
/// </summary>
/// <remarks>
/// The per-iteration buffer is held in a <c>using</c> block so it is released even when the
/// launch or the synchronization throws.
/// NOTE(review): all iterations share one accelerator and its default stream from multiple
/// threads - confirm that this usage is supported by the ILGPU version in use.
/// </remarks>
private static void Performance()
{
    using (var context = new Context())
    using (var accelerator = new CudaAccelerator(context))
    using (var backend = accelerator.CreateBackend())
    using (var compileUnit = accelerator.Context.CreateCompileUnit(backend))
    {
        // Explicit backend compilation of the kernel method; the compiled result is not
        // used afterwards, the call is kept so the compile step still runs as before.
        var method = typeof(Program).GetMethod(nameof(MathKernel), BindingFlags.Static | BindingFlags.Public);
        backend.Compile(compileUnit, method);

        // Typed launcher that implicitly uses the accelerator's default stream.
        var kernel = accelerator.LoadAutoGroupedStreamKernel<Index2, ArrayView2D<float>>(MathKernel);

        // Number of kernel launches per width/height combination.
        int size = 100000;
        var W = new[] { 50 };
        var H = new[] { 50 };

        foreach (int x in W)
        {
            foreach (int y in H)
            {
                Console.WriteLine($"\n\nW {x}, H {y} \n\n");

                var watch = Stopwatch.StartNew();
                Parallel.For(0, size, k =>
                {
                    var idx = new Index2(x, y);

                    // Dispose the buffer deterministically, also on failure.
                    using (var buffer = accelerator.Allocate<float>(idx))
                    {
                        kernel(idx, buffer.View);
                        accelerator.Synchronize();
                    }
                });
                watch.Stop();

                Console.WriteLine($"Elapsed GPU Time Parallel: {watch.ElapsedMilliseconds}ms\n\n");

                // Collect between measurements so garbage from one run does not
                // distort the timing of the next one.
                GC.Collect();
            }
        }
    }
}
/// <summary>
/// Uploads the <c>DataGenerator</c> inputs to a CUDA accelerator, runs <c>MyKernel</c> with one
/// thread per element of the first input, and stores the two kernel outputs in
/// <c>resultsBytes</c> and <c>calculatables</c>.
/// </summary>
/// <remarks>
/// The output buffers are sized from the current lengths of <c>resultsBytes</c> and
/// <c>calculatables</c>, so both must be non-null before the call.
/// All device buffers are disposed before the accelerator and the context are released,
/// including when an exception is thrown.
/// </remarks>
public void ProccessOld()
{
    // Read each host input once so allocation size and copy source always agree.
    var in1 = DataGenerator.In1;
    var in2 = DataGenerator.In2;
    var in3 = DataGenerator.In3;
    var in4 = DataGenerator.In4_3_bytes;

    // Create the required ILGPU context
    using (var context = new Context())
    using (var accelerator = new CudaAccelerator(context)) // test with CPUAccelerator
    {
        // accelerator.LoadAutoGroupedStreamKernel creates a typed launcher
        // that implicitly uses the default accelerator stream.
        var myKernel = accelerator.LoadAutoGroupedStreamKernel<
            Index,
            ArrayView<byte>,
            ArrayView<double>,
            ArrayView<int>,
            ArrayView<int>,
            ArrayView<double>,
            ArrayView<byte>,
            int>(MyKernel);

        // Allocate input buffers and output buffers; the using blocks free the
        // device memory even if a copy or the kernel launch fails.
        using (var input1_dev = accelerator.Allocate<int>(in1.Length))
        using (var input2_dev = accelerator.Allocate<int>(in2.Length))
        using (var input3_dev = accelerator.Allocate<double>(in3.Length))
        using (var input4_dev = accelerator.Allocate<byte>(in4.Length))
        using (var result_dev = accelerator.Allocate<byte>(resultsBytes.Length))
        using (var resultCalc_dev = accelerator.Allocate<double>(calculatables.Length))
        {
            // Upload the inputs.
            input1_dev.CopyFrom(in1, 0, 0, in1.Length);
            input2_dev.CopyFrom(in2, 0, 0, in2.Length);
            input3_dev.CopyFrom(in3, 0, 0, in3.Length);
            input4_dev.CopyFrom(in4, 0, 0, in4.Length);

            // Launch input1_dev.Length many threads.
            myKernel(
                input1_dev.Length,
                result_dev.View,
                resultCalc_dev.View,
                input1_dev.View,
                input2_dev.View,
                input3_dev.View,
                input4_dev.View,
                DataGenerator.Width);

            // Wait for the kernel to finish...
            accelerator.Synchronize();

            // Resolve data (host copies are taken before the buffers are disposed).
            resultsBytes = result_dev.GetAsArray();
            calculatables = resultCalc_dev.GetAsArray();
        }
    }
}