/// <summary>
/// Multiplies two N x N matrices on the GPU using the MatrixMul kernel.
/// </summary>
/// <param name="a">Left operand as a jagged N x N matrix.</param>
/// <param name="b">Right operand as a jagged N x N matrix.</param>
/// <param name="N">Matrix dimension (rows == columns == N).</param>
/// <returns>The product matrix flattened into an N*N float array.</returns>
public static float[] RunMatrixMul(float[][] a, float[][] b, int N)
{
    // Context, accelerator and device buffers are IDisposable; they were
    // previously leaked, so dispose them deterministically with using declarations.
    using var context = new Context();
    using var gpu = new CudaAccelerator(context);

    // Create typed launcher (implicitly uses the default accelerator stream).
    var matrixMulKernel = gpu.LoadAutoGroupedStreamKernel<
        Index2, ArrayView<float>, ArrayView<float>, ArrayView<float>, int>(MatrixMul);

    // Allocate device memory.
    int buffSize = N * N;
    using MemoryBuffer<float> d_a = gpu.Allocate<float>(buffSize);
    using MemoryBuffer<float> d_b = gpu.Allocate<float>(buffSize);
    using MemoryBuffer<float> d_c = gpu.Allocate<float>(buffSize);

    // Upload the flattened host matrices.
    d_a.CopyFrom(FlatternArr(a), 0, Index1.Zero, buffSize);
    d_b.CopyFrom(FlatternArr(b), 0, Index1.Zero, buffSize);

    // One thread per output element.
    matrixMulKernel(new Index2(N, N), d_a.View, d_b.View, d_c.View, N);

    // Wait for the kernel to finish before reading the result back.
    gpu.Synchronize();

    return d_c.GetAsArray();
}
/// <summary>
/// Initializes the ILGPU context and CUDA accelerator, loads MyKernel,
/// allocates the device buffers, and uploads the generated input data.
/// NOTE(review): the accelerator and buffers are stored in fields and are
/// assumed to be disposed elsewhere (e.g. a teardown method) — TODO confirm.
/// </summary>
public override void Init()
{
    GeneratePTX(); // alternative approach through this?

    context = new Context();
    accelerator = new CudaAccelerator(context);

    // Typed launcher using the default accelerator stream.
    // (The previous reflection lookup of MyKernel was unused and has been removed.)
    myKernel = accelerator.LoadAutoGroupedStreamKernel<
        Index,
        ArrayView<byte>,
        ArrayView<double>,
        ArrayView<int>,
        ArrayView<int>,
        ArrayView<double>,
        ArrayView<byte>,
        int>(MyKernel);

    // Allocate input buffers sized to the generated data.
    input1_dev = accelerator.Allocate<int>(DataGenerator.In1.Length);
    input2_dev = accelerator.Allocate<int>(DataGenerator.In2.Length);
    input3_dev = accelerator.Allocate<double>(DataGenerator.In3.Length);
    input4_dev = accelerator.Allocate<byte>(DataGenerator.In4_3_bytes.Length);

    // Allocate output buffers.
    result_dev = accelerator.Allocate<byte>(resultsBytes.Length);
    resultCalc_dev = accelerator.Allocate<double>(calculatables.Length);

    // Upload inputs to the device.
    input1_dev.CopyFrom(DataGenerator.In1, 0, 0, DataGenerator.In1.Length);
    input2_dev.CopyFrom(DataGenerator.In2, 0, 0, DataGenerator.In2.Length);
    input3_dev.CopyFrom(DataGenerator.In3, 0, 0, DataGenerator.In3.Length);
    input4_dev.CopyFrom(DataGenerator.In4_3_bytes, 0, 0, DataGenerator.In4_3_bytes.Length);
}
/// <summary>
/// Sorts the given array on the GPU with the OddEvenSort2 kernel and
/// returns a sorted copy; the input array is not modified.
/// </summary>
/// <param name="a">Host array to sort.</param>
/// <returns>The sorted values read back from the device.</returns>
public static float[] RunOddEvenSort2(float[] a)
{
    int N = a.Length;
    bool evenArr = (N % 2) == 0; // redundant '? true : false' removed

    // Context, accelerator and buffers are IDisposable; they were
    // previously leaked, so dispose them with using declarations.
    using var context = new Context();
    using var gpu = new CudaAccelerator(context);

    // Create typed launcher.
    var oddEvenKernel = gpu.LoadAutoGroupedStreamKernel<
        Index1, ArrayView<float>, VariableView<byte>, int, bool>(OddEvenSort2);

    // Allocate device memory; the stop flag starts zero-initialized.
    using MemoryBuffer<float> d_a = gpu.Allocate<float>(N);
    using MemoryBuffer<byte> d_stopFlag = gpu.AllocateZero<byte>(1);

    d_a.CopyFrom(a, 0, Index1.Zero, N);

    // Launch one thread per element pair.
    oddEvenKernel(N / 2, d_a.View, d_stopFlag.View.GetVariableView(0), N, evenArr);
    gpu.Synchronize();

    return d_a.GetAsArray();
}
/// <summary>
/// Runs the Floyd-Warshall all-pairs shortest path algorithm on the GPU.
/// </summary>
/// <param name="a">Adjacency/distance matrix as a jagged N x N array.</param>
/// <param name="N">Number of graph vertices.</param>
/// <param name="sw">Stopwatch used to time only the kernel iterations.</param>
/// <returns>The shortest-distance matrix flattened into N*N floats.</returns>
public static float[] RunFloydWarshall(float[][] a, int N, ref Stopwatch sw)
{
    // Context, accelerator and buffer are IDisposable; previously leaked.
    using var context = new Context();
    using var gpu = new CudaAccelerator(context);

    // Create typed launcher.
    var floydWarshallKernel = gpu.LoadAutoGroupedStreamKernel<
        Index1, int, ArrayView<float>, int>(FloydWarshall);

    // Allocate and upload the flattened distance matrix.
    var bufSize = N * N;
    using MemoryBuffer<float> d_graphMinDist = gpu.Allocate<float>(bufSize);
    d_graphMinDist.CopyFrom(FlatternArr(a), 0, Index1.Zero, bufSize);

    sw.Restart();
    for (int k = 0; k < N; k++)
    {
        // Iteration k+1 reads distances written in iteration k, so each
        // launch is synchronized before the next pivot is processed.
        floydWarshallKernel(bufSize, k, d_graphMinDist.View, N);
        gpu.Synchronize();
    }
    sw.Stop();

    return d_graphMinDist.GetAsArray();
}
/// <summary>
/// Renders a mesh with a GPU ray tracer into a 1920x1080 bitmap, saves it
/// as out.bmp, and opens it with the shell.
/// </summary>
public static void Main()
{
    using Context context = new Context();
    context.EnableAlgorithms();
    using Accelerator device = new CudaAccelerator(context);

    int width = 1920;
    int height = 1080;
    // 3 bytes per pixel (24bpp RGB).
    byte[] h_bitmapData = new byte[width * height * 3];

    using MemoryBuffer2D<Vec3> canvasData = device.Allocate<Vec3>(width, height);
    using MemoryBuffer<byte> d_bitmapData = device.Allocate<byte>(width * height * 3);
    CanvasData c = new CanvasData(canvasData, d_bitmapData, width, height);

    // Camera: position, look-at target, up vector, viewport size, FOV.
    Camera camera = new Camera(new Vec3(0, 50, -100), new Vec3(0, 0, 0), new Vec3(0, -1, 0), width, height, 40f);

    WorldData world = new WorldData(device);
    //world.loadMesh(new Vec3(10, 0, 0), "./Assets/defaultcube.obj");
    world.loadMesh(new Vec3(0, 0, 0), "./Assets/cat.obj");

    var frameBufferToBitmap = device.LoadAutoGroupedStreamKernel<Index2, CanvasData>(CanvasData.CanvasToBitmap);
    var RTMethod = device.LoadAutoGroupedStreamKernel<Index2, CanvasData, dWorldBuffer, Camera>(PerPixelRayIntersectionMethod);

    Stopwatch timer = new Stopwatch();
    timer.Start();

    // One ray-trace thread per pixel, then convert the canvas to bitmap bytes.
    RTMethod(new Index2(width, height), c, world.getDeviceWorldBuffer(), camera);
    frameBufferToBitmap(canvasData.Extent, c);
    device.Synchronize();
    d_bitmapData.CopyTo(h_bitmapData, 0, 0, d_bitmapData.Extent);

    timer.Stop();
    Console.WriteLine("Rendered in: " + timer.Elapsed);

    // BUG FIX: the array was never actually pinned — calling
    // Marshal.UnsafeAddrOfPinnedArrayElement on an unpinned array yields an
    // address the GC may invalidate while Bitmap still holds it. Pin
    // explicitly for the Bitmap's lifetime.
    GCHandle pin = GCHandle.Alloc(h_bitmapData, GCHandleType.Pinned);
    try
    {
        // Stride is width * 3 — valid only because rows are tightly packed.
        using Bitmap b = new Bitmap(width, height, width * 3, PixelFormat.Format24bppRgb, pin.AddrOfPinnedObject());
        b.Save("out.bmp");
    }
    finally
    {
        pin.Free();
    }

    Process.Start("cmd.exe", "/c out.bmp");
}
/// <summary>
/// Sorts the given array on the GPU with an iterative odd-even
/// transposition sort, alternating even/odd phases until a pass makes
/// no swaps. Returns a sorted copy; the input array is not modified.
/// </summary>
/// <param name="a">Host array to sort.</param>
/// <param name="sw">Stopwatch used to time only the sorting loop.</param>
/// <returns>The sorted values read back from the device.</returns>
public static float[] RunOddEvenSort(float[] a, ref Stopwatch sw)
{
    int N = a.Length;
    bool evenArr = (N % 2) == 0; // redundant '? true : false' removed
    bool sorted = false;
    bool iterationEven = true;

    // Context, accelerator and buffers are IDisposable; previously leaked.
    using var context = new Context();
    using var gpu = new CudaAccelerator(context);

    // Create typed launcher.
    var oddEvenKernel = gpu.LoadAutoGroupedStreamKernel<
        Index1, ArrayView<float>, VariableView<byte>, bool, int, bool>(OddEvenSort);

    // Allocate device memory; the swap flag starts zero-initialized.
    using MemoryBuffer<float> d_a = gpu.Allocate<float>(N);
    using MemoryBuffer<byte> d_stopFlag = gpu.AllocateZero<byte>(1);
    d_a.CopyFrom(a, 0, Index1.Zero, N);

    sw.Restart();
    byte[] zeroVal = { 0 };
    while (!sorted)
    {
        sorted = true;
        // Reset the device-side swap flag before each pass.
        d_stopFlag.CopyFrom(zeroVal, 0, 0, 1);

        oddEvenKernel(N / 2, d_a.View, d_stopFlag.View.GetVariableView(), iterationEven, N, evenArr);
        gpu.Synchronize();

        // A non-zero flag means the kernel swapped at least one pair,
        // so another pass is required.
        if (d_stopFlag.GetAsArray()[0] > 0)
        {
            sorted = false;
        }
        iterationEven = !iterationEven;
    }
    sw.Stop();

    return d_a.GetAsArray();
}
/// <summary>
/// Demonstrates a block statement, with local register declaration.
/// </summary>
static void AddUsingTempRegister(CudaAccelerator accelerator)
{
    using var buffer = accelerator.Allocate1D<double>(1024);

    // Build the launcher and run one thread per buffer element.
    var launcher = accelerator.LoadAutoGroupedStreamKernel<
        Index1D, ArrayView<double>>(MultipleInstructionKernel);
    launcher((int)buffer.Length, buffer.View);

    // Copy back and print every element with its index.
    var output = buffer.GetAsArray1D();
    for (var idx = 0; idx < output.Length; idx++)
    {
        Console.WriteLine($"[{idx}] = {output[idx]}");
    }
}
/// <summary>
/// Demonstrates using EmitRef.
/// </summary>
static void SubtractUsingEmitRef(CudaAccelerator accelerator)
{
    using var buffer = accelerator.Allocate1D<long>(32);

    // Build the launcher and run one thread per buffer element.
    var launcher = accelerator.LoadAutoGroupedStreamKernel<
        Index1D, ArrayView<long>>(SubtractEmitRefKernel);
    launcher((int)buffer.Length, buffer.View);

    // Copy back and print every element with its index.
    var output = buffer.GetAsArray1D();
    for (var idx = 0; idx < output.Length; idx++)
    {
        Console.WriteLine($"[{idx}] = {output[idx]}");
    }
}
/// <summary>
/// Demonstrates using the mul.hi.u64 and mul.lo.u64 inline PTX instructions to
/// multiply two UInt64 values to produce a UInt128 value.
/// </summary>
static void MultiplyUInt128(CudaAccelerator accelerator)
{
    using var buffer = accelerator.Allocate1D<UInt128>(1024);

    // Build the launcher; the multiplier is baked in as a specialized value.
    var launcher = accelerator.LoadAutoGroupedStreamKernel<
        Index1D, ArrayView<UInt128>, SpecializedValue<ulong>>(MultiplyUInt128Kernel);
    launcher(
        (int)buffer.Length,
        buffer.View,
        SpecializedValue.New(ulong.MaxValue));

    // Copy back and print every element with its index.
    var output = buffer.GetAsArray1D();
    for (var idx = 0; idx < output.Length; idx++)
    {
        Console.WriteLine($"[{idx}] = {output[idx]}");
    }
}
/// <summary>
/// Benchmarks launching MathKernel from many threads in parallel and prints
/// the elapsed GPU time for each width/height combination.
/// Dead commented-out benchmark variants and the unused backend/compile-unit
/// artifacts (<c>method</c>, <c>compiled</c>, <c>kn</c>) have been removed.
/// </summary>
private static void Performance()
{
    using (var context = new Context())
    {
        using (var accelerator = new CudaAccelerator(context))
        {
            // Typed launcher that implicitly uses the default accelerator stream.
            var kernel = accelerator.LoadAutoGroupedStreamKernel<Index2, ArrayView2D<float>>(MathKernel);

            int size = 100000;
            var W = new[] { 50 };
            var H = new[] { 50 };

            for (int n = 0; n < W.Length; n++)
            {
                for (int m = 0; m < H.Length; m++)
                {
                    int x = W[n];
                    int y = H[m];
                    Console.WriteLine($"\n\nW {x}, H {y} \n\n");

                    var watch = Stopwatch.StartNew();
                    Parallel.For(0, size, k =>
                    {
                        var idx = new Index2(x, y);
                        var buffer = accelerator.Allocate<float>(idx);
                        kernel(idx, buffer.View);
                        accelerator.Synchronize();
                        buffer.Dispose();
                    });
                    watch.Stop();
                    Console.WriteLine($"Elapsed GPU Time Parallel: {watch.ElapsedMilliseconds}ms\n\n");

                    // Explicit collection between benchmark runs to keep
                    // allocation noise out of the next measurement.
                    GC.Collect();
                }
            }
        }
    }
}
/// <summary>
/// Runs MyKernel on the CUDA accelerator over the generated input data and
/// stores the results in <c>resultsBytes</c> and <c>calculatables</c>.
/// The unused reflection lookup and large blocks of commented-out code were
/// removed, and the device buffers (previously leaked) are now disposed.
/// </summary>
public void ProccessOld()
{
    // Create the required ILGPU context.
    using (var context = new Context())
    {
        using (var accelerator = new CudaAccelerator(context)) // test with CPUAccelerator
        {
            // Typed launcher that implicitly uses the default accelerator stream.
            var myKernel = accelerator.LoadAutoGroupedStreamKernel<
                Index,
                ArrayView<byte>,
                ArrayView<double>,
                ArrayView<int>,
                ArrayView<int>,
                ArrayView<double>,
                ArrayView<byte>,
                int>(MyKernel);

            // Allocate device memory; all buffers are disposed when the
            // using blocks exit.
            using (var input1_dev = accelerator.Allocate<int>(DataGenerator.In1.Length))
            using (var input2_dev = accelerator.Allocate<int>(DataGenerator.In2.Length))
            using (var input3_dev = accelerator.Allocate<double>(DataGenerator.In3.Length))
            using (var input4_dev = accelerator.Allocate<byte>(DataGenerator.In4_3_bytes.Length))
            using (var result_dev = accelerator.Allocate<byte>(resultsBytes.Length))
            using (var resultCalc_dev = accelerator.Allocate<double>(calculatables.Length))
            {
                // Upload inputs to the device.
                input1_dev.CopyFrom(DataGenerator.In1, 0, 0, DataGenerator.In1.Length);
                input2_dev.CopyFrom(DataGenerator.In2, 0, 0, DataGenerator.In2.Length);
                input3_dev.CopyFrom(DataGenerator.In3, 0, 0, DataGenerator.In3.Length);
                input4_dev.CopyFrom(DataGenerator.In4_3_bytes, 0, 0, DataGenerator.In4_3_bytes.Length);

                // Launch one thread per element of the first input.
                myKernel(input1_dev.Length, result_dev.View, resultCalc_dev.View,
                    input1_dev.View, input2_dev.View, input3_dev.View, input4_dev.View,
                    DataGenerator.Width);

                // Wait for the kernel to finish before reading back.
                accelerator.Synchronize();

                // Resolve data.
                resultsBytes = result_dev.GetAsArray();
                calculatables = resultCalc_dev.GetAsArray();
            }
        }
    }
}