/// <summary>
/// Runs a cuBLAS GEMM over device buffers created from <paramref name="left"/> and
/// <paramref name="right"/>, reports the elapsed time and copies the device result
/// into <paramref name="result"/>.
/// </summary>
/// <param name="gpu">CUDA accelerator used for the allocations and the cuBLAS handle.</param>
/// <param name="result">Host array that receives the content of the result buffer.</param>
/// <param name="left">Host array backing the first GEMM operand.</param>
/// <param name="right">Host array backing the second GEMM operand.</param>
/// <param name="n">Value used for all three GEMM dimensions and all three leading dimensions.</param>
public static void IlGpu(CudaAccelerator gpu, Real[] result, Real[] left, Real[] right, int n)
{
    // Using declarations are released in reverse order when the method exits:
    // cuBLAS handle first, then the right, left and result buffers.
    using var deviceResult = gpu.Allocate(result);
    using var deviceLeft = gpu.Allocate(left);
    using var deviceRight = gpu.Allocate(right);
    using var blas = new CuBlas(gpu, CuBlasAPIVersion.V11);

    var timer = Stopwatch.StartNew();

    blas.Gemm(
        CuBlasOperation.NonTranspose,
        CuBlasOperation.NonTranspose,
        n, n, n,
        1,                      // alpha
        deviceLeft.View, n,
        deviceRight.View, n,
        0,                      // beta
        deviceResult.View, n);

    // Wait for the GEMM to finish so that the timer covers the actual work.
    gpu.Synchronize();
    PrintPerformance(timer, "MatrixMultiplication.IlGpu.cuBLAS", n, n, n);

    deviceResult.CopyTo(result, 0, 0, result.Length);
}
/// <summary>
/// Shared driver for the optimised squared-distance kernels: allocates an n-by-n device
/// result buffer, launches <paramref name="kernelFunc"/> with a dynamic shared memory
/// request, reports the timing and copies the result into <paramref name="mSquaredDistances"/>.
/// </summary>
/// <typeparam name="TInt">Type of the coordinate-count argument handed to the kernel.</typeparam>
/// <param name="gpu">CUDA accelerator to run on.</param>
/// <param name="mSquaredDistances">Host array that receives the device result.</param>
/// <param name="mCoordinates">Host array backing the coordinate buffer.</param>
/// <param name="c">Coordinate count; also scales the shared memory request.</param>
/// <param name="n">Extent of the result buffer in both dimensions.</param>
/// <param name="name">Label passed to the performance report.</param>
/// <param name="kernelFunc">Kernel to launch.</param>
/// <param name="numCoordGetter">Converts <paramref name="c"/> into the kernel's <typeparamref name="TInt"/> argument.</param>
private static void IlGpuOptimisedImpl<TInt>(
    CudaAccelerator gpu,
    Real[] mSquaredDistances,
    Real[] mCoordinates,
    int c,
    int n,
    string name,
    Action<ArrayView2D<Real>, ArrayView<Real>, TInt, int> kernelFunc,
    Func<int, TInt> numCoordGetter)
    where TInt : struct
{
    const int threadsPerBlock = 128;

    using var deviceDistances = gpu.Allocate<Real>(n, n);
    using var deviceCoordinates = gpu.Allocate(mCoordinates);

    var timer = Stopwatch.StartNew();

    var blocksPerAxis = Util.DivUp(n, threadsPerBlock);
    var gridDim = (blocksPerAxis, blocksPerAxis, 1);
    var blockDim = (threadsPerBlock, 1, 1);
    // Dynamic shared memory: 2 * c * threadsPerBlock elements of Real.
    var sharedMemory = SharedMemoryConfig.RequestDynamic<Real>(2 * c * threadsPerBlock);
    var launchParameters = (gridDim, blockDim, sharedMemory);

    gpu.Launch(
        kernelFunc,
        gpu.DefaultStream,
        launchParameters,
        deviceDistances.View,
        deviceCoordinates.View,
        numCoordGetter(c),
        n);
    gpu.Synchronize();

    Util.PrintPerformance(timer, name, n, c, n);

    deviceDistances.CopyTo(mSquaredDistances, (0, 0), 0, (n, n));
}
/// <summary>
/// Launches <c>IlGpuKernel</c> over device buffers created from the given host arrays,
/// reports the elapsed time and copies the first buffer back into <paramref name="mIntraReturn"/>.
/// </summary>
/// <param name="gpu">CUDA accelerator to run on.</param>
/// <param name="mIntraReturn">Host array backing the first kernel argument; overwritten with the device content afterwards.</param>
/// <param name="vClose">Host array backing the second kernel argument.</param>
/// <param name="vIsAlive">Host array backing the third kernel argument.</param>
/// <param name="vIsValidDay">Host array backing the fourth kernel argument.</param>
/// <param name="m">Size used for the grid's Y dimension; forwarded to the kernel.</param>
/// <param name="n">Size used for the grid's X dimension; forwarded to the kernel.</param>
public static void IlGpu(
    CudaAccelerator gpu,
    Real[] mIntraReturn,
    Real[] vClose,
    Real[] vIsAlive,
    Real[] vIsValidDay,
    int m,
    int n)
{
    const int blockX = 32;
    const int blockY = 8;

    using var deviceIntraReturn = gpu.Allocate(mIntraReturn);
    using var deviceClose = gpu.Allocate(vClose);
    using var deviceIsAlive = gpu.Allocate(vIsAlive);
    using var deviceIsValidDay = gpu.Allocate(vIsValidDay);

    var timer = Stopwatch.StartNew();

    // One 32x8 block per tile; the grid is rounded up to cover n columns and m rows.
    var blocksX = Util.DivUp(n, blockX);
    var blocksY = Util.DivUp(m, blockY);
    var launchParameters = ((blocksX, blocksY, 1), (blockX, blockY));

    gpu.Launch(
        IlGpuKernel,
        gpu.DefaultStream,
        launchParameters,
        deviceIntraReturn.View,
        deviceClose.View,
        deviceIsAlive.View,
        deviceIsValidDay.View,
        m,
        n);
    gpu.Synchronize();

    Util.PrintPerformance(timer, "IntraReturn.IlGpu", 5, m, n);

    deviceIntraReturn.CopyTo(mIntraReturn, 0, 0, mIntraReturn.Length);
}
/// <summary>
/// Runs the <c>MatrixMul</c> kernel over an N-by-N index space on a CUDA accelerator
/// and returns the content of the output buffer.
/// </summary>
/// <param name="a">First input; flattened with <c>FlatternArr</c> before upload.</param>
/// <param name="b">Second input; flattened with <c>FlatternArr</c> before upload.</param>
/// <param name="N">Edge length; each device buffer holds N * N floats.</param>
/// <returns>Host copy of the output buffer (N * N elements).</returns>
public static float[] RunMatrixMul(float[][] a, float[][] b, int N)
{
    // Create context and accelerator. Both own native resources, so they are
    // disposed (after the buffers declared below) when the method returns.
    using var context = new Context();
    using var gpu = new CudaAccelerator(context);

    // Create typed launcher
    var matrixMulKernel = gpu.LoadAutoGroupedStreamKernel<
        Index2,
        ArrayView<float>,
        ArrayView<float>,
        ArrayView<float>,
        int>(MatrixMul);

    // Allocate memory; the device buffers are released on exit instead of leaking.
    int buffSize = N * N;
    using MemoryBuffer<float> d_a = gpu.Allocate<float>(buffSize);
    using MemoryBuffer<float> d_b = gpu.Allocate<float>(buffSize);
    using MemoryBuffer<float> d_c = gpu.Allocate<float>(buffSize);

    d_a.CopyFrom(FlatternArr(a), 0, Index1.Zero, buffSize);
    d_b.CopyFrom(FlatternArr(b), 0, Index1.Zero, buffSize);

    matrixMulKernel(new Index2(N, N), d_a.View, d_b.View, d_c.View, N);

    // Wait for the kernel to finish...
    gpu.Synchronize();

    // The result is copied to a host array before the buffers are disposed.
    return d_c.GetAsArray();
}
/// <summary>
/// Walks over every available CUDA accelerator and exercises the cuBLAS <c>Nrm2</c>
/// binding, first with manual pointer-mode handling and then with the
/// <c>AutomaticMode</c> pointer-mode handler.
/// </summary>
static void Main()
{
    const int DataSize = 1024;
    const CuBlasAPIVersion CuBlasVersion = CuBlasAPIVersion.V10;

    using (var context = new Context())
    {
        // Enable algorithms library
        context.EnableAlgorithms();

        // Check for Cuda support
        foreach (var acceleratorId in CudaAccelerator.CudaAccelerators)
        {
            using (var accelerator = new CudaAccelerator(context, acceleratorId))
            {
                Console.WriteLine($"Performing operations on {accelerator}");

                // Release both device buffers deterministically at the end of each
                // iteration rather than leaving them without an explicit Dispose.
                using (var buf = accelerator.Allocate<float>(DataSize))
                using (var buf2 = accelerator.Allocate<float>(DataSize))
                {
                    accelerator.Initialize(accelerator.DefaultStream, buf, 1.0f);
                    accelerator.Initialize(accelerator.DefaultStream, buf2.View, 1.0f);

                    // Initialize the CuBlas library using manual pointer mode handling
                    // (default behavior)
                    using (var blas = new CuBlas(accelerator, CuBlasVersion))
                    {
                        // Set pointer mode to Host to enable data transfer to CPU memory
                        blas.PointerMode = CuBlasPointerMode.Host;
                        float output = blas.Nrm2(buf);

                        // Set pointer mode to Device to enable data transfer to GPU memory
                        blas.PointerMode = CuBlasPointerMode.Device;
                        blas.Nrm2(buf, buf2);

                        // Use pointer mode scopes to recover the previous pointer mode
                        using (var scope = blas.BeginPointerScope(CuBlasPointerMode.Host))
                        {
                            float output2 = blas.Nrm2(buf);
                        }
                    }

                    // Initialize the CuBlas<T> library using custom pointer mode handlers
                    using (var blas = new CuBlas<CuBlasPointerModeHandlers.AutomaticMode>(accelerator, CuBlasVersion))
                    {
                        // Automatic transfer to host
                        float output = blas.Nrm2(buf);

                        // Automatic transfer to device
                        blas.Nrm2(buf, buf2);
                    }
                }
            }
        }
    }
}
/// <summary>
/// Uploads <paramref name="a"/> to a CUDA accelerator, runs the <c>OddEvenSort2</c>
/// kernel once over <c>a.Length / 2</c> indices and returns the content of the data buffer.
/// </summary>
/// <param name="a">Values to upload; the array itself is not modified by this method.</param>
/// <returns>Host copy of the device data buffer after the kernel has finished.</returns>
public static float[] RunOddEvenSort2(float[] a)
{
    int N = a.Length;
    bool evenArr = (N % 2) == 0;

    // Create context and accelerator. Both are disposed, after the buffers
    // declared below, when the method returns.
    using var context = new Context();
    using var gpu = new CudaAccelerator(context);

    // Create typed launcher
    var oddEvenKernel = gpu.LoadAutoGroupedStreamKernel<
        Index1,
        ArrayView<float>,
        VariableView<byte>,
        int,
        bool>(OddEvenSort2);

    // Allocate memory; the buffers are released on exit instead of leaking.
    using MemoryBuffer<float> d_a = gpu.Allocate<float>(N);
    using MemoryBuffer<byte> d_stopFlag = gpu.AllocateZero<byte>(1);

    d_a.CopyFrom(a, 0, Index1.Zero, N);

    // Run kernel
    oddEvenKernel(N / 2, d_a, d_stopFlag.View.GetVariableView(0), N, evenArr);
    gpu.Synchronize();

    // The result is copied to a host array before the buffers are disposed.
    return d_a.GetAsArray();
}
/// <summary>
/// Uploads the flattened matrix <paramref name="a"/> and launches the <c>FloydWarshall</c>
/// kernel N times (once per value of k), timing the launch loop with <paramref name="sw"/>.
/// </summary>
/// <param name="a">Input matrix; flattened with <c>FlatternArr</c> before upload.</param>
/// <param name="N">Number of kernel launches; the device buffer holds N * N floats.</param>
/// <param name="sw">Stopwatch restarted before the launch loop and stopped after it.</param>
/// <returns>Host copy of the device buffer after the last launch.</returns>
public static float[] RunFloydWarshall(float[][] a, int N, ref Stopwatch sw)
{
    // Create context and accelerator. Both are disposed, after the buffer
    // declared below, when the method returns.
    using var context = new Context();
    using var gpu = new CudaAccelerator(context);

    // Create typed launcher
    var floydWarshallKernel = gpu.LoadAutoGroupedStreamKernel<
        Index1,
        int,
        ArrayView<float>,
        int>(FloydWarshall);

    // Allocate memory; the buffer is released on exit instead of leaking.
    var bufSize = N * N;
    using MemoryBuffer<float> d_graphMinDist = gpu.Allocate<float>(bufSize);

    d_graphMinDist.CopyFrom(FlatternArr(a), 0, Index1.Zero, bufSize);

    sw.Restart();
    for (int k = 0; k < N; k++)
    {
        floydWarshallKernel(bufSize, k, d_graphMinDist, N);
        gpu.Synchronize();
    }
    sw.Stop();

    // The result is copied to a host array before the buffer is disposed.
    return d_graphMinDist.GetAsArray();
}
/// <summary>
/// Launches <c>IlGpuKernel</c> over device buffers created from <paramref name="matrix"/>
/// and <paramref name="vector"/>, reports the elapsed time and copies the matrix buffer
/// back into <paramref name="matrix"/>.
/// </summary>
/// <param name="gpu">CUDA accelerator to run on.</param>
/// <param name="matrix">Host array backing the first kernel argument; overwritten with the device content afterwards.</param>
/// <param name="vector">Host array backing the second kernel argument.</param>
/// <param name="m">Size used for the grid's Y dimension; forwarded to the kernel.</param>
/// <param name="n">Size used for the grid's X dimension; forwarded to the kernel.</param>
public static void IlGpu(CudaAccelerator gpu, Real[] matrix, Real[] vector, int m, int n)
{
    const int blockX = 32;
    const int blockY = 8;

    using var deviceMatrix = gpu.Allocate(matrix);
    using var deviceVector = gpu.Allocate(vector);

    var timer = Stopwatch.StartNew();

    // One 32x8 block per tile; the grid is rounded up to cover n columns and m rows.
    var blocksX = Util.DivUp(n, blockX);
    var blocksY = Util.DivUp(m, blockY);
    var launchParameters = ((blocksX, blocksY, 1), (blockX, blockY));

    gpu.Launch(
        IlGpuKernel,
        gpu.DefaultStream,
        launchParameters,
        deviceMatrix.View,
        deviceVector.View,
        m,
        n);
    gpu.Synchronize();

    Util.PrintPerformance(timer, "AddVector.IlGpu", 3, m, n);

    deviceMatrix.CopyTo(matrix, 0, 0, matrix.Length);
}
/// <summary>
/// Runs the <c>MatrixMulShared</c> kernel with an explicit group configuration on a
/// CUDA accelerator, timing the launch with <paramref name="sw"/>, and returns the
/// content of the output buffer.
/// </summary>
/// <param name="a">First input; flattened with <c>FlatternArr</c> before upload.</param>
/// <param name="b">Second input; flattened with <c>FlatternArr</c> before upload.</param>
/// <param name="N">Edge length; each device buffer holds N * N floats.</param>
/// <param name="sw">Stopwatch restarted before the launch and stopped after synchronization.</param>
/// <returns>Host copy of the output buffer (N * N elements).</returns>
public static float[] RunMatrixMulShared(float[][] a, float[][] b, int N, ref Stopwatch sw)
{
    // Create context and accelerator. Both are disposed, after the buffers
    // declared below, when the method returns.
    using var context = new Context();
    using var gpu = new CudaAccelerator(context);

    // Create typed launcher
    var matrixMulKernelShared = gpu.LoadStreamKernel<
        ArrayView<float>,
        ArrayView<float>,
        ArrayView<float>,
        int>(MatrixMulShared);

    // Allocate memory; the buffers are released on exit instead of leaking.
    var buffSize = N * N;
    using MemoryBuffer<float> d_a = gpu.Allocate<float>(buffSize);
    using MemoryBuffer<float> d_b = gpu.Allocate<float>(buffSize);
    using MemoryBuffer<float> d_c = gpu.Allocate<float>(buffSize);

    d_a.CopyFrom(FlatternArr(a), 0, Index1.Zero, buffSize);
    d_b.CopyFrom(FlatternArr(b), 0, Index1.Zero, buffSize);

    // Groups per grid dimension: ceil(N / groupSize) in integer arithmetic.
    int GrPerDim = (N + groupSize - 1) / groupSize;
    KernelConfig dimension = (
        new Index2(GrPerDim, GrPerDim),    // Number of groups
        new Index2(groupSize, groupSize)); // Group size (thread count in group)

    sw.Restart();
    matrixMulKernelShared(dimension, d_a.View, d_b.View, d_c.View, N);

    // Wait for the kernel to finish...
    gpu.Synchronize();
    sw.Stop();

    // The result is copied to a host array before the buffers are disposed.
    return d_c.GetAsArray();
}
/// <summary>
/// Uploads <paramref name="a"/> to a CUDA accelerator and repeatedly launches the
/// <c>OddEvenSort</c> kernel, alternating the <c>iterationEven</c> argument, until two
/// consecutive launches leave the device flag at zero. The launch loop is timed with
/// <paramref name="sw"/>.
/// </summary>
/// <param name="a">Values to upload; the array itself is not modified by this method.</param>
/// <param name="sw">Stopwatch restarted before the launch loop and stopped after it.</param>
/// <returns>Host copy of the device data buffer after the loop has finished.</returns>
public static float[] RunOddEvenSort(float[] a, ref Stopwatch sw)
{
    int N = a.Length;
    bool evenArr = (N % 2) == 0;
    bool iterationEven = true;

    // Create context and accelerator. Both are disposed, after the buffers
    // declared below, when the method returns.
    using var context = new Context();
    using var gpu = new CudaAccelerator(context);

    // Create typed launcher
    var oddEvenKernel = gpu.LoadAutoGroupedStreamKernel<
        Index1,
        ArrayView<float>,
        VariableView<byte>,
        bool,
        int,
        bool>(OddEvenSort);

    // Allocate memory; the buffers are released on exit instead of leaking.
    using MemoryBuffer<float> d_a = gpu.Allocate<float>(N);
    using MemoryBuffer<byte> d_stopFlag = gpu.AllocateZero<byte>(1);

    d_a.CopyFrom(a, 0, Index1.Zero, N);

    sw.Restart();

    // Run kernel
    byte[] zero_val = new byte[1];

    // NOTE(review): assumes each launch performs only the phase selected by
    // iterationEven and sets the flag when it swaps something - confirm against
    // OddEvenSort. Under that assumption a single quiet phase does not prove the
    // data is sorted (e.g. 1,3,2,4 has a quiet even phase), so the loop only
    // stops after two consecutive quiet phases, i.e. one even and one odd.
    int quietPhases = 0;
    while (quietPhases < 2)
    {
        // Reset the device flag before every launch.
        d_stopFlag.CopyFrom(zero_val, 0, 0, 1);

        oddEvenKernel(N / 2, d_a, d_stopFlag.View.GetVariableView(), iterationEven, N, evenArr);
        gpu.Synchronize();

        bool flagRaised = d_stopFlag.GetAsArray()[0] > 0;
        quietPhases = flagRaised ? 0 : quietPhases + 1;

        iterationEven = !iterationEven;
    }

    sw.Stop();

    // The result is copied to a host array before the buffers are disposed.
    return d_a.GetAsArray();
}
/// <summary>
/// Prepares the ILGPU state used by this implementation: creates the context and the
/// CUDA accelerator, loads the <c>MyKernel</c> launcher, allocates the device buffers
/// and uploads the four input arrays from <c>DataGenerator</c>.
/// </summary>
public override void Init()
{
    GeneratePTX(); // alternative approach through this?

    context = new Context();
    accelerator = new CudaAccelerator(context);

    // The reflection lookup of MyKernel that used to sit here was never used;
    // the launcher is created directly from the method group.
    myKernel = accelerator.LoadAutoGroupedStreamKernel<
        Index,
        ArrayView<byte>,
        ArrayView<double>,
        ArrayView<int>,
        ArrayView<int>,
        ArrayView<double>,
        ArrayView<byte>,
        int>(MyKernel);

    // Allocate some memory
    input1_dev = accelerator.Allocate<int>(DataGenerator.In1.Length);
    input2_dev = accelerator.Allocate<int>(DataGenerator.In2.Length);
    input3_dev = accelerator.Allocate<double>(DataGenerator.In3.Length);
    input4_dev = accelerator.Allocate<byte>(DataGenerator.In4_3_bytes.Length);

    // init output parameters
    result_dev = accelerator.Allocate<byte>(resultsBytes.Length);
    resultCalc_dev = accelerator.Allocate<double>(calculatables.Length);

    // Upload the inputs
    input1_dev.CopyFrom(DataGenerator.In1, 0, 0, DataGenerator.In1.Length);
    input2_dev.CopyFrom(DataGenerator.In2, 0, 0, DataGenerator.In2.Length);
    input3_dev.CopyFrom(DataGenerator.In3, 0, 0, DataGenerator.In3.Length);
    input4_dev.CopyFrom(DataGenerator.In4_3_bytes, 0, 0, DataGenerator.In4_3_bytes.Length);
}
/// <summary>
/// Launches <c>IlGpuKernel</c> with blocks of 128 threads covering n * n work items,
/// reports the elapsed time and copies the device result into
/// <paramref name="mSquaredDistances"/>.
/// </summary>
/// <param name="gpu">CUDA accelerator to run on.</param>
/// <param name="mSquaredDistances">Host array backing the result buffer; overwritten with the device content afterwards.</param>
/// <param name="mCoordinates">Host array backing the coordinate buffer.</param>
/// <param name="c">Forwarded to the kernel and to the performance report.</param>
/// <param name="n">Forwarded to the kernel; n * n determines the grid size.</param>
public static void IlGpu(
    CudaAccelerator gpu,
    Real[] mSquaredDistances,
    Real[] mCoordinates,
    int c,
    int n)
{
    const int threadsPerBlock = 128;

    using var deviceDistances = gpu.Allocate(mSquaredDistances);
    using var deviceCoordinates = gpu.Allocate(mCoordinates);

    var timer = Stopwatch.StartNew();

    // Round the block count up so that all n * n work items are covered.
    var blockCount = Util.DivUp(n * n, threadsPerBlock);
    var launchParameters = (blockCount, threadsPerBlock);

    gpu.Launch(
        IlGpuKernel,
        gpu.DefaultStream,
        launchParameters,
        deviceDistances.View,
        deviceCoordinates.View,
        c,
        n);
    gpu.Synchronize();

    Util.PrintPerformance(timer, "SquaredDistance.IlGpu", n, c, n);

    deviceDistances.CopyTo(mSquaredDistances, 0, 0, mSquaredDistances.Length);
}
/// <summary>
/// Shared driver for squared-distance kernels that take the block size and the
/// coordinate count as <c>SpecializedValue</c> arguments: allocates an n-by-n device
/// result buffer, launches <paramref name="kernelFunc"/>, reports the timing and copies
/// the result into <paramref name="mSquaredDistances"/>.
/// </summary>
/// <param name="gpu">CUDA accelerator to run on.</param>
/// <param name="mSquaredDistances">Host array that receives the device result.</param>
/// <param name="mCoordinates">Host array backing the coordinate buffer.</param>
/// <param name="c">Coordinate count; passed to the kernel as a specialized value.</param>
/// <param name="n">Extent of the result buffer in both dimensions.</param>
/// <param name="name">Label passed to the performance report.</param>
/// <param name="kernelFunc">Kernel to launch.</param>
private static void IlGpuOptimisedImpl(
    CudaAccelerator gpu,
    Real[] mSquaredDistances,
    Real[] mCoordinates,
    int c,
    int n,
    string name,
    Action<ArrayView2D<Real>, ArrayView<Real>, SpecializedValue<int>, SpecializedValue<int>, int> kernelFunc)
{
    const int threadsPerBlock = 128;

    using var deviceDistances = gpu.Allocate<Real>(n, n);
    using var deviceCoordinates = gpu.Allocate(mCoordinates);

    var timer = Stopwatch.StartNew();

    var blocksPerAxis = Util.DivUp(n, threadsPerBlock);
    var gridDim = (blocksPerAxis, blocksPerAxis, 1);
    var blockDim = (threadsPerBlock, 1, 1);
    var launchParameters = (gridDim, blockDim);

    gpu.Launch(
        kernelFunc,
        gpu.DefaultStream,
        launchParameters,
        deviceDistances.View,
        deviceCoordinates.View,
        SpecializedValue.New(threadsPerBlock),
        SpecializedValue.New(c),
        n);
    gpu.Synchronize();

    Util.PrintPerformance(timer, name, n, c, n);

    deviceDistances.CopyTo(mSquaredDistances, (0, 0), 0, (n, n));
}
/// <summary>
/// Renders one 1920x1080 frame on a CUDA accelerator: runs the per-pixel ray
/// intersection kernel, converts the canvas into a 24-bit byte buffer, saves it as
/// <c>out.bmp</c> and opens the file through <c>cmd.exe</c>.
/// </summary>
public static void Main()
{
    using Context context = new Context();
    context.EnableAlgorithms();

    using Accelerator device = new CudaAccelerator(context);

    int width = 1920;
    int height = 1080;

    byte[] h_bitmapData = new byte[width * height * 3];

    using MemoryBuffer2D<Vec3> canvasData = device.Allocate<Vec3>(width, height);
    using MemoryBuffer<byte> d_bitmapData = device.Allocate<byte>(width * height * 3);

    CanvasData c = new CanvasData(canvasData, d_bitmapData, width, height);

    Camera camera = new Camera(
        new Vec3(0, 50, -100),  // pos
        new Vec3(0, 0, 0),      // look at
        new Vec3(0, -1, 0),     // up
        width, height, 40f);

    WorldData world = new WorldData(device);
    //world.loadMesh(new Vec3(10, 0, 0), "./Assets/defaultcube.obj");
    world.loadMesh(new Vec3(0, 0, 0), "./Assets/cat.obj");

    var frameBufferToBitmap = device.LoadAutoGroupedStreamKernel<Index2, CanvasData>(CanvasData.CanvasToBitmap);
    var RTMethod = device.LoadAutoGroupedStreamKernel<Index2, CanvasData, dWorldBuffer, Camera>(PerPixelRayIntersectionMethod);

    //do rasterization here

    Stopwatch timer = new Stopwatch();
    timer.Start();

    RTMethod(new Index2(width, height), c, world.getDeviceWorldBuffer(), camera);
    frameBufferToBitmap(canvasData.Extent, c);
    device.Synchronize();

    d_bitmapData.CopyTo(h_bitmapData, 0, 0, d_bitmapData.Extent);

    timer.Stop();
    Console.WriteLine("Rendered in: " + timer.Elapsed);

    // The Bitmap constructor below wraps the caller's memory instead of copying it,
    // so the managed array must stay pinned for as long as the bitmap is alive;
    // otherwise the GC may move the array and leave the bitmap with a stale pointer.
    // The stride passed is width * 3, i.e. rows are assumed to need no padding;
    // this only works while width * 3 is a multiple of four.
    GCHandle pinnedPixels = GCHandle.Alloc(h_bitmapData, GCHandleType.Pinned);
    try
    {
        using Bitmap b = new Bitmap(
            width,
            height,
            width * 3,
            PixelFormat.Format24bppRgb,
            pinnedPixels.AddrOfPinnedObject());

        b.Save("out.bmp");
    }
    finally
    {
        // The bitmap has been disposed by now, so the array can be unpinned.
        pinnedPixels.Free();
    }

    Process.Start("cmd.exe", "/c out.bmp");
}
/// <summary>
/// Micro-benchmark: for every (W, H) pair, launches the <c>MathKernel</c> kernel
/// <c>size</c> times from a <c>Parallel.For</c> loop, allocating a fresh W-by-H buffer
/// per launch, and prints the elapsed wall-clock time.
/// </summary>
private static void Performance()
{
    using (var context = new Context())
    using (var accelerator = new CudaAccelerator(context))
    using (var b = accelerator.CreateBackend())
    using (var c = accelerator.Context.CreateCompileUnit(b))
    {
        // Explicit compilation through the backend. The compiled kernel is not
        // launched below; the call is kept so that this step still runs.
        var method = typeof(Program).GetMethod(nameof(MathKernel), BindingFlags.Static | BindingFlags.Public);
        var compiled = b.Compile(c, method);

        var kernel = accelerator.LoadAutoGroupedStreamKernel<Index2, ArrayView2D<float>>(MathKernel);

        int size = 100000;
        var W = new[] { 50 };
        var H = new[] { 50 };

        foreach (int x in W)
        {
            foreach (int y in H)
            {
                Console.WriteLine($"\n\nW {x}, H {y} \n\n");

                // NOTE(review): the launches below are issued concurrently from
                // thread-pool threads against a single accelerator - confirm that
                // this is supported by the ILGPU version in use.
                var watch = Stopwatch.StartNew();
                Parallel.For(0, size, k =>
                {
                    var idx = new Index2(x, y);

                    // Dispose the buffer even if the launch or the synchronization throws.
                    using (var buffer = accelerator.Allocate<float>(idx))
                    {
                        kernel(idx, buffer.View);
                        accelerator.Synchronize();
                    }
                });
                watch.Stop();

                Console.WriteLine($"Elapsed GPU Time Parallel: {watch.ElapsedMilliseconds}ms\n\n");
                GC.Collect();
            }
        }
    }
}
/// <summary>
/// Self-contained variant of the processing run: creates its own context and CUDA
/// accelerator, uploads the four <c>DataGenerator</c> inputs, launches <c>MyKernel</c>
/// once per element of the first input and stores the two output buffers in
/// <c>resultsBytes</c> and <c>calculatables</c>.
/// </summary>
public void ProccessOld()
{
    // Create the required ILGPU context. Everything declared with 'using' below is
    // disposed in reverse order when the method returns: buffers, accelerator, context.
    using var context = new Context();
    using var accelerator = new CudaAccelerator(context); // test with CPUAccelerator

    // accelerator.LoadAutoGroupedStreamKernel creates a typed launcher
    // that implicitly uses the default accelerator stream.
    var myKernel = accelerator.LoadAutoGroupedStreamKernel<
        Index,
        ArrayView<byte>,
        ArrayView<double>,
        ArrayView<int>,
        ArrayView<int>,
        ArrayView<double>,
        ArrayView<byte>,
        int>(MyKernel);

    // Allocate some memory; the device buffers are released on exit instead of leaking.
    using var input1_dev = accelerator.Allocate<int>(DataGenerator.In1.Length);
    using var input2_dev = accelerator.Allocate<int>(DataGenerator.In2.Length);
    using var input3_dev = accelerator.Allocate<double>(DataGenerator.In3.Length);
    using var input4_dev = accelerator.Allocate<byte>(DataGenerator.In4_3_bytes.Length);

    // init output parameters
    using var result_dev = accelerator.Allocate<byte>(resultsBytes.Length);
    using var resultCalc_dev = accelerator.Allocate<double>(calculatables.Length);

    // Upload the inputs
    input1_dev.CopyFrom(DataGenerator.In1, 0, 0, DataGenerator.In1.Length);
    input2_dev.CopyFrom(DataGenerator.In2, 0, 0, DataGenerator.In2.Length);
    input3_dev.CopyFrom(DataGenerator.In3, 0, 0, DataGenerator.In3.Length);
    input4_dev.CopyFrom(DataGenerator.In4_3_bytes, 0, 0, DataGenerator.In4_3_bytes.Length);

    // Launch input1_dev.Length many threads
    myKernel(
        input1_dev.Length,
        result_dev.View,
        resultCalc_dev.View,
        input1_dev.View,
        input2_dev.View,
        input3_dev.View,
        input4_dev.View,
        DataGenerator.Width);

    // Wait for the kernel to finish...
    accelerator.Synchronize();

    // Resolve data (copied to host arrays before the buffers are disposed)
    resultsBytes = result_dev.GetAsArray();
    calculatables = resultCalc_dev.GetAsArray();
}