/// <summary>
/// Entry point: configures the CUDA runner, builds a 1D Laplacian system with a
/// constant right-hand side and a zero initial guess, then calls
/// <c>ConjugateGradient</c>.
/// </summary>
/// <param name="args">Command-line arguments (unused).</param>
static void Main(string[] args)
{
    // Launch configuration is derived from the properties of device 0.
    cudaDeviceProp deviceProperties;
    cuda.GetDeviceProperties(out deviceProperties, 0);

    const int BLOCK_DIM = 256;
    int gridDimX = 16 * deviceProperties.multiProcessorCount;
    runner = HybRunner.Cuda().SetDistrib(gridDimX, 1, BLOCK_DIM, 1, 1, BLOCK_DIM * sizeof(float));
    wrapper = runner.Wrap(new Program());

    // Convergence is very slow because no preconditioner is used.
    int unknowns = 1000000;
    SparseMatrix laplacian = SparseMatrix.Laplacian_1D(unknowns);
    FloatResidentArray rightHandSide = new FloatResidentArray(unknowns);
    FloatResidentArray solution = new FloatResidentArray(unknowns);

    int maxIterations = 1000;
    float tolerance = 1.0e-09f;

    int row = 0;
    while (row < unknowns)
    {
        rightHandSide[row] = 1.0f; // right side
        solution[row] = 0.0f;      // starting point
        ++row;
    }

    ConjugateGradient(solution, laplacian, rightHandSide, maxIterations, tolerance);
}
/// <summary>
/// Entry point: loads a greyscale image, runs <c>ComputeSobel</c> through a
/// HybRunner wrapper, saves the result and tries to open it.
/// </summary>
/// <param name="args">Command-line arguments (unused).</param>
static void Main(string[] args)
{
    int width;
    int height;
    byte[] inputPixels;

    // The source bitmap is only needed while its pixels are copied out; disposing it
    // right away releases the GDI+ handle and the lock Image.FromFile keeps on the file.
    using (Bitmap baseImage = (Bitmap)Image.FromFile("../../images/lena_highres_greyscale.bmp"))
    {
        height = baseImage.Height;
        width = baseImage.Width;
        inputPixels = new byte[width * height];
        ReadImage(inputPixels, baseImage, width, height);
    }

    byte[] outputPixels = new byte[width * height];

    HybRunner runner = HybRunner.Cuda().SetDistrib(32, 32, 16, 16, 1, 0);
    dynamic wrapper = runner.Wrap(new Program());
    wrapper.ComputeSobel(outputPixels, inputPixels, width, height);

    SaveImage("lena_highres_sobel.bmp", outputPixels, width, height);

    // Best effort only: opening the image fails on non-interactive machines.
    try
    {
        Process.Start("lena_highres_sobel.bmp");
    }
    catch
    {
    }
}
/// <summary>
/// Entry point: loads the noisy image, calls <c>ParForGPU</c> through a HybRunner
/// wrapper with a window of 3, prints the elapsed time and saves the result.
/// </summary>
/// <param name="args">Command-line arguments (unused).</param>
static void Main(string[] args)
{
    GrayBitmap noisy = GrayBitmap.Load("../../images/lena_highres_greyscale_noise.bmp");
    GrayBitmap denoised = new GrayBitmap(noisy.Width, noisy.Height);

    ushort[] sourcePixels = noisy.PixelsUShort;
    ushort[] filteredPixels = new ushort[noisy.Width * noisy.Height];

    // Timing covers runner creation, wrapping and the call itself.
    Stopwatch watch = Stopwatch.StartNew();

    int window = 3;
    HybRunner runner = HybRunner.Cuda();
    dynamic wrapper = runner.Wrap(new Program());
    wrapper.ParForGPU(filteredPixels, sourcePixels, (int)noisy.Width, (int)noisy.Height, window);

    watch.Stop();

    string time = String.Format("{0:0.00}", watch.ElapsedMilliseconds * 1.0E-3);
    Console.WriteLine($"Naive GPU time : {time}");

    denoised.PixelsUShort = filteredPixels;
    denoised.Save("../../output-03-naive-gpu/denoised.bmp");
}
/// <summary>
/// Entry point: builds a 1D Laplacian sparse matrix and a constant vector, then
/// calls <c>Multiply</c> through a HybRunner wrapper <c>redo</c> times.
/// </summary>
/// <param name="args">Command-line arguments (unused).</param>
static void Main(string[] args)
{
    SparseMatrix A = SparseMatrix.Laplacian_1D(10000000);
    float[] X = VectorReader.GetSplatVector(10000000, 1.0F);

    int redo = 2;

    // Byte count computed from the matrix arrays (not used further in this method).
    double dataBytes = (double)(A.data.Length * sizeof(float));
    double rowBytes = (double)(2 * A.rows.Length * sizeof(uint));
    double indexBytes = (double)(A.indices.Length * sizeof(uint));
    double memoryOperationsSize = (double)redo * (3.0 * dataBytes + rowBytes + indexBytes);

    Console.WriteLine("matrix read --- starting computations");

    float[] B = new float[A.rows.Length - 1];

    #region CUDA
    cudaDeviceProp deviceProperties;
    cuda.GetDeviceProperties(out deviceProperties, 0);

    int gridDimX = 8 * deviceProperties.multiProcessorCount;
    HybRunner runner = HybRunner.Cuda("SparseMatrix_CUDA.dll").SetDistrib(gridDimX, 256);
    dynamic wrapper = runner.Wrap(new Program());

    int pass = 0;
    while (pass < redo)
    {
        wrapper.Multiply(B, A, X, X.Length);
        ++pass;
    }
    #endregion
}
// Entry point of an exercise template: loads the noisy image, calls ParForGPU
// through a HybRunner wrapper with an explicit grid/block distribution, prints the
// elapsed time and saves the result.
// NOTE: <gridX>, <gridY>, <blockX> and <blockY> below are placeholders that must be
// replaced with concrete dimensions before this method compiles.
static void Main(string[] args)
{
    GrayBitmap image = GrayBitmap.Load("../../images/lena_highres_greyscale_noise.bmp");
    GrayBitmap denoised = new GrayBitmap(image.Width, image.Height);
    ushort[] input = image.PixelsUShort;
    ushort[] output = new ushort[image.Width * image.Height];
    // Timing covers runner creation, wrapping and the call itself.
    Stopwatch watch = new Stopwatch();
    watch.Start();
    // Grid and block dimensions (placeholders, to be filled in).
    dim3 grid = new dim3(< gridX >, <gridY>, 1);
    dim3 block = new dim3(< blockX >, <blockY>, 1);
    // create an instance of runner
    HybRunner runner = HybRunner.Cuda();
    // wrap a new instance of Filter
    dynamic wrapper = runner.Wrap(new Filter());
    // run the method on GPU with the distribution chosen above
    wrapper.SetDistrib(grid, block).ParForGPU(output, input, (int)image.Width, (int)image.Height);
    watch.Stop();
    // Elapsed time is converted from milliseconds to seconds for display.
    string time = String.Format("{0:0.00}", watch.ElapsedMilliseconds * 1.0E-3);
    Console.WriteLine($"Parallel2D GPU time : {time}");
    denoised.PixelsUShort = output;
    denoised.Save("../../output-05-dice-gpu/denoised.bmp");
}
/// <summary>
/// Entry point: fills a 512x512 byte matrix with ones, calls <c>Run</c> through a
/// HybRunner wrapper and reports every cell that differs from <c>(i + j) % 256</c>.
/// </summary>
/// <param name="args">Command-line arguments (unused).</param>
static void Main(string[] args)
{
    const int size = 512;
    byte[,] input = new byte[size, size];

    for (int row = 0; row < size; ++row)
    {
        for (int col = 0; col < size; ++col)
        {
            input[row, col] = (byte)1;
        }
    }

    HybRunner runner = HybRunner.Cuda().SetDistrib(32, 32, 16, 16, 1, 0);
    dynamic wrapper = runner.Wrap(new Program());
    wrapper.Run(input);

    // Check every cell against the expected pattern.
    for (int i = 0; i < size; ++i)
    {
        for (int j = 0; j < size; ++j)
        {
            byte expected = (byte)((i + j) % 256);
            if (input[i, j] == expected)
            {
                continue;
            }

            Console.Out.WriteLine("error in " + i + " " + j);
        }
    }

    Console.Out.WriteLine("DONE");
}
/// <summary>
/// Entry point: fills an array with random 0/1 values, calls <c>ReduceAdd</c>
/// through a HybRunner wrapper and prints the result next to a host-side sum.
/// </summary>
/// <param name="args">Command-line arguments (unused).</param>
static void Main(string[] args)
{
    const int N = 1024 * 1024 * 32;
    const int BLOCK_DIM = 256;

    Random random = new Random();
    int[] a = new int[N];
    for (int k = 0; k < N; ++k)
    {
        // Each element is 1 when the draw is below 0.2, 0 otherwise.
        if (random.NextDouble() < 0.2)
        {
            a[k] = 1;
        }
        else
        {
            a[k] = 0;
        }
    }

    int[] result = new int[1];

    cudaDeviceProp deviceProperties;
    cuda.GetDeviceProperties(out deviceProperties, 0);

    int gridDimX = 16 * deviceProperties.multiProcessorCount;
    HybRunner runner = HybRunner.Cuda().SetDistrib(gridDimX, 1, BLOCK_DIM, 1, 1, BLOCK_DIM * sizeof(int));
    dynamic wrapped = runner.Wrap(new Program());

    wrapped.ReduceAdd(N, a, result);
    cuda.DeviceSynchronize();

    Console.Out.WriteLine("sum = {0}", result[0]);
    Console.Out.WriteLine("expected = {0}", a.Aggregate((left, right) => left + right));
}
/// <summary>
/// Entry point: calls <c>Add(dst, a, b, N)</c> through a HybRunner wrapper and
/// verifies that every <c>dst[i]</c> equals <c>i + 1</c>. Exits with code 6 on the
/// first mismatch.
/// </summary>
public static void Main()
{
    const int N = 1024 * 1024 * 256;

    float[] lhs = new float[N];
    float[] rhs = new float[N];
    float[] sum = new float[N];

    int k = 0;
    while (k < N)
    {
        lhs[k] = (float)k;
        rhs[k] = 1.0F;
        ++k;
    }

    // Create a CUDA runner from the current program and invoke the method on the GPU.
    dynamic wrapped = HybRunner.Cuda().Wrap(new Program());
    wrapped.Add(sum, lhs, rhs, N);

    // Kernel calls are asynchronous: wait for termination before reading results.
    cuda.DeviceSynchronize();

    for (int i = 0; i < N; ++i)
    {
        if (sum[i] == (float)i + 1.0F)
        {
            continue;
        }

        Console.Error.WriteLine("ERROR at {0} -- {1} != {2}", i, sum[i], i + 1);
        Environment.Exit(6); // abort
    }

    Console.Out.WriteLine("OK");
}
/// <summary>
/// Entry point: converts the image pixels to floats, calls <c>Sobel</c> through a
/// HybRunner wrapper, converts the result back to ushort and saves it.
/// </summary>
/// <param name="args">Command-line arguments (unused).</param>
static void Main(string[] args)
{
    HybRunner runner = HybRunner.Cuda().SetDistrib(32, 32, 16, 16, 1, 0);

    GrayBitmap source = GrayBitmap.Load("../../images/lena512.bmp");
    uint height = source.Height, width = source.Width;
    uint pixelCount = width * height;

    ushort[] sourcePixels = source.PixelsUShort;
    float[] imageFloat = new float[pixelCount];
    float[] imageCompute = new float[pixelCount];

    // The kernel works on floats, so widen the 16-bit pixels first.
    for (int p = 0; p < pixelCount; ++p)
    {
        imageFloat[p] = (float)sourcePixels[p];
    }

    dynamic wrapper = runner.Wrap(new Program());
    wrapper.Sobel(imageFloat, imageCompute, (int)width, (int)height);

    // Narrow the computed values back to 16-bit pixels.
    ushort[] edgePixels = new ushort[pixelCount];
    for (int p = 0; p < pixelCount; ++p)
    {
        edgePixels[p] = (ushort)imageCompute[p];
    }

    GrayBitmap imageSobel = new GrayBitmap(width, height) { PixelsUShort = edgePixels };
    imageSobel.Save("../../output-01-gpu/sobel.bmp");
}
/// <summary>
/// Entry point: calls <c>Add(a, b, N)</c> through a HybRunner wrapper and verifies
/// that every <c>a[i]</c> equals <c>i + 1</c> afterwards. Exits with code 6 on the
/// first mismatch.
/// </summary>
public static void Main()
{
    const int N = 1024 * 1024 * 32;

    float[] accumulator = new float[N];
    float[] increment = new float[N];

    int k = 0;
    while (k < N)
    {
        accumulator[k] = (float)k;
        increment[k] = 1.0F;
        ++k;
    }

    dynamic wrapped = HybRunner.Cuda().Wrap(new Program());
    wrapped.Add(accumulator, increment, N);
    cuda.DeviceSynchronize();

    for (int i = 0; i < N; ++i)
    {
        if (accumulator[i] == (float)i + 1.0F)
        {
            continue;
        }

        Console.Error.WriteLine("ERROR at {0} -- {1} != {2}", i, accumulator[i], i + 1);
        Environment.Exit(6); // abort
    }

    Console.Out.WriteLine("OK");
}
/// <summary>
/// Entry point: calls <c>Run</c> through a HybRunner wrapper with a 1x1
/// distribution, waits for it to finish and prints "DONE".
/// </summary>
/// <param name="args">Command-line arguments (unused).</param>
static void Main(string[] args)
{
    dynamic wrapped = HybRunner.Cuda().SetDistrib(1, 1).Wrap(new Program());
    wrapped.Run();

    // Kernel calls are asynchronous: without this wait the process could print
    // "DONE" and exit before the kernel has run, and any launch or execution error
    // would go unnoticed.
    cuda.ERROR_CHECK(cuda.DeviceSynchronize());

    Console.Out.WriteLine("DONE");
}
/// <summary>
/// Entry point of the streams exercise skeleton: allocates host and device buffers,
/// copies the inputs to the device and verifies that every <c>a[i]</c> equals
/// <c>i + 100</c>. The stream creation, kernel launches and device-to-host copy are
/// still to be written at the marked places.
/// </summary>
/// <param name="args">Command-line arguments (unused).</param>
unsafe static void Main(string[] args)
{
    int nStreams = 8;
    cudaStream_t[] streams = new cudaStream_t[nStreams];
    // TODO: create streams

    int N = 1024 * 1024 * 32;
    float[] a = new float[N];
    float[] b = new float[N];
    for (int k = 0; k < N; ++k)
    {
        a[k] = (float)k;
        b[k] = 1.0F;
    }

    // Device pointers.
    IntPtr d_a, d_b;
    cuda.Malloc(out d_a, N * sizeof(float));
    cuda.Malloc(out d_b, N * sizeof(float));

    // Pin the managed arrays so their addresses can be handed to cuda.Memcpy.
    GCHandle handle_a = GCHandle.Alloc(a, GCHandleType.Pinned);
    GCHandle handle_b = GCHandle.Alloc(b, GCHandleType.Pinned);
    IntPtr h_a = handle_a.AddrOfPinnedObject();
    IntPtr h_b = handle_b.AddrOfPinnedObject();

    cuda.DeviceSynchronize();

    cuda.Memcpy(d_a, h_a, N * sizeof(float), cudaMemcpyKind.cudaMemcpyHostToDevice);
    cuda.Memcpy(d_b, h_b, N * sizeof(float), cudaMemcpyKind.cudaMemcpyHostToDevice);

    int slice = N / nStreams; // size of the array computed by each stream

    dynamic wrapped = HybRunner.Cuda().Wrap(new Program());

    int start;
    int stop;
    // TODO: call kernel with each stream

    // TODO: copy data device to host

    // TODO: synchronize and destroy streams

    for (int i = 0; i < N; ++i)
    {
        // Report the value that is actually being compared against, not i + 1.
        float expected = (float)i + (1.0F * 100.0F);
        if (a[i] != expected)
        {
            Console.Error.WriteLine("ERROR at {0} -- {1} != {2}", i, a[i], expected);
            Environment.Exit(6); // abort
        }
    }

    handle_a.Free();
    handle_b.Free();
}
/// <summary>
/// Entry point: fills an array with seeded random floats, calls <c>ReduceMax</c>
/// and <c>ReduceAdd</c> through a HybRunner wrapper and checks both against
/// host-side reductions. Exits with code 1 when a check fails.
/// </summary>
/// <param name="args">Command-line arguments (unused).</param>
static void Main(string[] args)
{
    const int N = 1024 * 1024 * 32;
    float[] a = new float[N];

    // initialization
    // System.Random is not thread-safe, so one instance must not be shared by
    // Parallel.For workers: concurrent calls can corrupt its state. A sequential
    // fill is safe and makes the fixed seed reproducible.
    Random random = new Random(42);
    for (int i = 0; i < N; ++i)
    {
        a[i] = (float)random.NextDouble();
    }

    // hybridizer configuration
    cudaDeviceProp prop;
    cuda.GetDeviceProperties(out prop, 0);
    int gridDimX = 16 * prop.multiProcessorCount;
    int blockDimX = 256;
    cuda.DeviceSetCacheConfig(cudaFuncCache.cudaFuncCachePreferShared);
    HybRunner runner = HybRunner.Cuda().SetDistrib(gridDimX, 1, blockDimX, 1, 1, blockDimX * sizeof(float));

    float[] buffMax = new float[1];
    float[] buffAdd = new float[1];
    var maxReductor = new GridReductor<MaxReductor>();
    var addReductor = new GridReductor<AddReductor>();
    dynamic wrapped = runner.Wrap(new EntryPoints());

    // device reduction
    wrapped.ReduceMax(maxReductor, buffMax, a, N);
    wrapped.ReduceAdd(addReductor, buffAdd, a, N);
    cuda.ERROR_CHECK(cuda.DeviceSynchronize());

    // check results
    float expectedMax = a.AsParallel().Aggregate((x, y) => Math.Max(x, y));
    float expectedAdd = a.AsParallel().Aggregate((x, y) => x + y);
    bool hasError = false;

    if (buffMax[0] != expectedMax)
    {
        Console.Error.WriteLine($"MAX Error : {buffMax[0]} != {expectedMax}");
        hasError = true;
    }

    // addition is not associative, so results cannot be exactly the same
    // https://en.wikipedia.org/wiki/Associative_property#Nonassociativity_of_floating_point_calculation
    if (Math.Abs(buffAdd[0] - expectedAdd) / expectedAdd > 1.0E-5F)
    {
        Console.Error.WriteLine($"ADD Error : {buffAdd[0]} != {expectedAdd}");
        hasError = true;
    }

    if (hasError)
    {
        Environment.Exit(1);
    }

    Console.Out.WriteLine("OK");
}
/// <summary>
/// Entry point: builds two identical arrays of decorated <c>Element</c>s, runs
/// <c>Run</c> once through a HybRunner wrapper and once directly, and compares the
/// resulting <c>Data</c> values. Exits with code 1 on the first mismatch.
/// </summary>
/// <param name="args">Command-line arguments (unused).</param>
static void Main(string[] args)
{
    cudaDeviceProp deviceProperties;
    cuda.GetDeviceProperties(out deviceProperties, 0);

    int N = deviceProperties.multiProcessorCount * 256;
    var a = new Element[N];
    var a_verif = new Element[N];
    double[] b = new double[N];

    Random random = new Random(42);
    for (int k = 0; k < N; ++k)
    {
        a[k] = new Element((double)k);
        a_verif[k] = new Element((double)k);
        b[k] = 1.0;

        // Both arrays receive the same kind of decoration, chosen by one draw.
        double draw = random.NextDouble();
        if (draw >= 0.66)
        {
            a[k].Decoration.Add(Filters.A, new B());
            a_verif[k].Decoration.Add(Filters.A, new B());
        }
        else if (draw >= 0.33)
        {
            a[k].Decoration.Add(Filters.B, new A());
            a_verif[k].Decoration.Add(Filters.B, new A());
        }
        else
        {
            a[k].Decoration.Add(Filters.A, new A());
            a_verif[k].Decoration.Add(Filters.A, new A());
        }
    }

    CudaMarshaler.changeAggregation(true);
    var runner = HybRunner.Cuda();
    Console.WriteLine("Init done");

    dynamic wrap = runner.Wrap(new Program());
    wrap.Run(N, a, b, Filters.A);
    cuda.ERROR_CHECK(cuda.DeviceSynchronize());
    Console.WriteLine("Kernel done");

    // Reference run on the verification copy.
    Run(N, a_verif, b, Filters.A);

    for (int i = 0; i < N; ++i)
    {
        if (a[i].Data == a_verif[i].Data)
        {
            continue;
        }

        Console.WriteLine($"ERROR at {i} : {a[i].Data} != {a_verif[i].Data}");
        Environment.Exit(1);
    }

    Console.WriteLine("OK");
}
/// <summary>
/// Entry point: multiplies two matrices <c>redo</c> times through a HybRunner
/// wrapper and <c>redo</c> times on the host with <c>Parallel.For</c>.
/// </summary>
/// <param name="args">
/// Either empty (512x512 defaults are used) or four integers:
/// heightA widthA heightB widthB.
/// </param>
/// <exception cref="ArgumentException">
/// Fewer than four arguments were supplied, or widthA differs from heightB.
/// </exception>
static void Main(string[] args)
{
    if (args.Length == 0)
    {
        args = new string[] { "512", "512", "512", "512" };
    }
    else if (args.Length < 4)
    {
        // Fail with a clear message instead of an IndexOutOfRangeException below.
        throw new ArgumentException("invalid data -- expected 4 arguments: heightA widthA heightB widthB");
    }

    const int redo = 10;

    int heightA = Convert.ToInt32(args[0]);
    int widthA = Convert.ToInt32(args[1]);
    int heightB = Convert.ToInt32(args[2]);
    int widthB = Convert.ToInt32(args[3]);
    if (widthA != heightB)
    {
        throw new ArgumentException("invalid data -- incompatible matrices");
    }

    Console.WriteLine("Execution Naive matrix mul with sizes ({0}, {1}) x ({2}, {3})", heightA, widthA, heightB, widthB);

    NaiveMatrix matrixA = new NaiveMatrix(widthA, heightA);
    NaiveMatrix matrixB = new NaiveMatrix(widthB, heightB);

    NaiveMatrix res_net = new NaiveMatrix(widthB, heightA);
    NaiveMatrix res_cuda = new NaiveMatrix(widthB, heightA);

    // Operation count scaled by 3.0E-9 (not used further in this method).
    double numberCompute = ((double)matrixA.Height * (double)matrixA.Width * (double)matrixB.Width) * 3.0E-9;

    matrixA.FillMatrix();
    matrixB.FillMatrix();

    #region CUDA
    // NOTE(review): if the arguments are (gridX, gridY, blockX, blockY, blockZ, shared),
    // this requests 8 x 32 x 32 threads per block -- confirm this is intended.
    HybRunner runner = HybRunner.Cuda().SetDistrib(4, 5, 8, 32, 32, 0);
    dynamic wrapper = runner.Wrap(new Program());

    for (int i = 0; i < redo; ++i)
    {
        wrapper.ComputeRowsOfProduct(res_cuda, matrixA, matrixB, 0, res_cuda.Height);
    }
    #endregion

    #region C#
    for (int i = 0; i < redo; ++i)
    {
        // One result row per parallel iteration.
        Parallel.For(0, res_net.Height, (line) =>
        {
            ComputeRowsOfProduct(res_net, matrixA, matrixB, line, line + 1);
        });
    }
    #endregion

    Console.Out.WriteLine("DONE");
}
/// <summary>
/// Creates the 1024x1024 "Hybridizer Mandelbulb" window, runs <c>Init</c>, sets up
/// the CUDA runner and wrapper, and checks the CUDA error state.
/// </summary>
public Program()
    : base(1024, 1024, GraphicsMode.Default, "Hybridizer Mandelbulb", GameWindowFlags.Default)
{
    // A fixed border disables resizing.
    WindowBorder = WindowBorder.Fixed;
    Init();

    runner = HybRunner.Cuda().SetDistrib(32, 32, 16, 16, 1, 0);
    wrapped = runner.Wrap(new Mandelbulb());

    // Surface any pending error first, then any error raised while synchronizing.
    var pendingError = cuda.GetLastError();
    cuda.ERROR_CHECK(pendingError);

    var syncStatus = cuda.DeviceSynchronize();
    cuda.ERROR_CHECK(syncStatus);
}
/// <summary>
/// Entry point: derives a row chunk size from the current device's multiprocessor
/// count, calls <c>ParForGPU</c> through a HybRunner wrapper with an explicit
/// grid/block, prints total and kernel times and saves the result.
/// </summary>
/// <param name="args">Command-line arguments (unused).</param>
static void Main(string[] args)
{
    int currentDevice;
    cuda.GetDevice(out currentDevice);
    cudaDeviceProp prop;
    cuda.GetDeviceProperties(out prop, currentDevice);

    GrayBitmap noisy = GrayBitmap.Load("../../images/lena_highres_greyscale_noise.bmp");
    GrayBitmap denoised = new GrayBitmap(noisy.Width, noisy.Height);
    ushort[] input = noisy.PixelsUShort;
    ushort[] output = new ushort[noisy.Width * noisy.Height];

    Stopwatch watch = Stopwatch.StartNew();

    // Rows are divided over half the multiprocessors when major >= 6 and minor == 0,
    // over all of them otherwise (ceiling division in both cases).
    int divisor = prop.multiProcessorCount;
    if ((prop.major >= 6) && (prop.minor == 0))
    {
        divisor = prop.multiProcessorCount / 2;
    }

    int chunk = ((int)noisy.Height + divisor - 1) / divisor;
    Console.Out.WriteLine("Chunk size = {0}", chunk);

    int gridDimY = ((int)noisy.Height + chunk - 1) / chunk;
    dim3 grid = new dim3(16, gridDimY, 1);
    dim3 block = new dim3(128, 1, 1);

    HybRunner runner = HybRunner.Cuda();
    dynamic wrapper = runner.Wrap(new Filter());

    wrapper.SetDistrib(grid, block).ParForGPU(output, input, (int)noisy.Width, (int)noisy.Height, chunk);
    cuda.DeviceSynchronize();
    watch.Stop();

    // Both durations are converted from milliseconds to seconds for display.
    string time = String.Format("{0:0.00}", watch.ElapsedMilliseconds * 1.0E-3);
    string kernelTime = String.Format("{0:0.00}", runner.LastKernelDuration.ElapsedMilliseconds * 1.0E-3);
    Console.WriteLine($"SweepSort GPU time : {time}");
    Console.WriteLine($"SweepSort GPU -- kernel time : {kernelTime}");

    denoised.PixelsUShort = output;
    denoised.Save("../../output-07-cache-aware-gpu/denoised.bmp");

    cuda.DeviceReset();
}
/// <summary>
/// Entry point: fills random option inputs, runs <c>BlackScholes</c>
/// <c>NUM_ITERATIONS</c> times through a HybRunner wrapper and as many times on the
/// host with <c>Parallel.For</c>, then calls <c>WriteCalculationError</c> on both
/// result sets.
/// </summary>
/// <param name="args">Command-line arguments (unused).</param>
static void Main(string[] args)
{
    // Every array holds OPT_N / 4 float4 entries.
    int quadCount = OPT_N / 4;

    float4[] callResult_net = new float4[quadCount];
    float4[] putResult_net = new float4[quadCount];
    float4[] stockPrice_net = new float4[quadCount];
    float4[] optionStrike_net = new float4[quadCount];
    float4[] optionYears_net = new float4[quadCount];

    float4[] callResult_cuda = new float4[quadCount];
    float4[] putResult_cuda = new float4[quadCount];

    Random rand = new Random(Guid.NewGuid().GetHashCode());
    for (int q = 0; q < quadCount; ++q)
    {
        // Calls start at 0 and puts at -1 in both result sets.
        callResult_net[q] = new float4(0.0f, 0.0f, 0.0f, 0.0f);
        putResult_net[q] = new float4(-1.0f, -1.0f, -1.0f, -1.0f);
        callResult_cuda[q] = new float4(0.0f, 0.0f, 0.0f, 0.0f);
        putResult_cuda[q] = new float4(-1.0f, -1.0f, -1.0f, -1.0f);

        stockPrice_net[q] = rand.NextFloat4(5.0f, 30.0f);
        optionStrike_net[q] = rand.NextFloat4(1.0f, 100.0f);
        optionYears_net[q] = rand.NextFloat4(0.25f, 10f);
    }

    cudaDeviceProp deviceProperties;
    cuda.GetDeviceProperties(out deviceProperties, 0);

    int gridDimX = 8 * deviceProperties.multiProcessorCount;
    HybRunner runner = HybRunner.Cuda("BlackScholesFloat4_CUDA.dll").SetDistrib(gridDimX, 256);
    dynamic wrapper = runner.Wrap(new Program());

    for (int iteration = 0; iteration < NUM_ITERATIONS; ++iteration)
    {
        wrapper.BlackScholes(callResult_cuda, putResult_cuda, stockPrice_net, optionStrike_net, optionYears_net, 0, OPT_N / 4);
    }

    for (int iteration = 0; iteration < NUM_ITERATIONS; ++iteration)
    {
        // One entry per parallel iteration on the host.
        Parallel.For(0, OPT_N / 4, (opt) =>
        {
            BlackScholes(callResult_net, putResult_net, stockPrice_net, optionStrike_net, optionYears_net, opt, opt + 1);
        });
    }

    WriteCalculationError(callResult_net, callResult_cuda, putResult_net, putResult_cuda);
}
/// <summary>
/// Entry point: selects the first device whose <c>cooperativeLaunch</c> property
/// is non-zero, sizes the grid from <c>MaxBlocksPerSM</c>, calls
/// <c>reduceSinglePassMultiBlockCG</c> through a HybRunner wrapper and prints
/// <c>b[0]</c>. Exits with code 6 when no suitable device exists.
/// </summary>
/// <param name="args">Command-line arguments (unused).</param>
static void Main(string[] args)
{
    // TODO: enable cudaOccupancyCalculator
    int numThreads = 128;
    int multiProcessorCount = 1;

    int deviceCount;
    cuda.GetDeviceCount(out deviceCount);

    // The loop stops at the first device that qualifies.
    bool found = false;
    for (int i = 0; i < deviceCount && !found; ++i)
    {
        cudaDeviceProp prop;
        cuda.GetDeviceProperties(out prop, i);
        if (prop.cooperativeLaunch == 0)
        {
            continue;
        }

        cuda.SetDevice(i);
        numThreads = 128;
        multiProcessorCount = prop.multiProcessorCount;
        Console.Out.WriteLine($"running on device {i}");
        found = true;
    }

    if (!found)
    {
        Console.Error.WriteLine("No GPU Found supporting Cooperative Launch");
        Environment.Exit(6);
    }

    var runner = HybRunner.Cuda().SetGridSync(true);
    dynamic wrapped = runner.Wrap(new Program());

    // Grid size = blocks per multiprocessor reported by the wrapper x multiprocessors.
    int maxBlocksPerSM = wrapped.MaxBlocksPerSM(new Action<float[], float[], uint>(reduceSinglePassMultiBlockCG), numThreads, numThreads * sizeof(double) + 16);
    int numBlocks = maxBlocksPerSM * multiProcessorCount;
    wrapped.SetDistrib(numBlocks, 1, numThreads, 1, 1, numThreads * sizeof(double));

    const int N = 1024 * 1024;
    float[] a = new float[N];
    float[] b = new float[numBlocks];
    for (int k = 0; k < N; ++k)
    {
        a[k] = 1.0F;
    }

    cuda.ERROR_CHECK((cudaError_t)(int)wrapped.reduceSinglePassMultiBlockCG(a, b, N));
    cuda.ERROR_CHECK(cuda.DeviceSynchronize());

    Console.Out.WriteLine(b[0]);
}
/// <summary>
/// Calls <c>ParallelCalculateHeatMap</c> on this instance through a HybRunner
/// wrapper created from <paramref name="dllName"/>.
/// </summary>
/// <param name="dllName">Name passed to <c>HybRunner.Cuda</c>.</param>
/// <param name="heatMap">Heat map array forwarded to the wrapped method.</param>
/// <param name="width">Forwarded unchanged.</param>
/// <param name="height">Forwarded unchanged.</param>
/// <param name="widthTerrain">Forwarded unchanged.</param>
/// <param name="heightTerrain">Forwarded unchanged.</param>
/// <param name="destinationX">Forwarded unchanged.</param>
/// <param name="destinationY">Forwarded unchanged.</param>
public void ParallelCalculateHeatMap_Cuda(string dllName, double[,] heatMap, int width, int height, float widthTerrain, float heightTerrain, float destinationX, float destinationY)
{
    cudaDeviceProp deviceProperties;
    cuda.GetDeviceProperties(out deviceProperties, 0);

    // Distribution: 16 x multiprocessor count, and 128.
    int gridDimX = deviceProperties.multiProcessorCount * 16;
    const int blockDimX = 128;
    HybRunner runner = HybRunner.Cuda(dllName).SetDistrib(gridDimX, blockDimX);

    // Wrapper object used to call the GPU version instead of the C# one.
    dynamic wrapped = runner.Wrap(this);

    wrapped.ParallelCalculateHeatMap(
        heatMap,
        width,
        height,
        widthTerrain,
        heightTerrain,
        destinationX,
        destinationY);
}
/// <summary>
/// Entry point: calls <c>Run</c> on a small integer array through a HybRunner
/// wrapper with a 4x4 distribution and waits for the device to finish.
/// </summary>
/// <param name="args">Command-line arguments (unused).</param>
static void Main(string[] args)
{
    int[] a = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13 };

    // create an instance of HybRunner object to wrap calls on GPU
    HybRunner runner = HybRunner.Cuda().SetDistrib(4, 4);

    // create a wrapper object to call GPU methods instead of C#
    dynamic wrapped = runner.Wrap(new Program());

    // run the method on GPU
    wrapped.Run(a.Length, a);

    // Synchronize before exiting so that device-side output is flushed and any
    // launch or execution error is reported instead of being silently dropped.
    cuda.ERROR_CHECK(cuda.DeviceSynchronize());
}
/// <summary>
/// Entry point: computes the image <c>redo</c> times with <c>ComputeImage(_, false)</c>
/// and <c>redo</c> times with <c>ComputeImage(_, true)</c>, colors the second result
/// and saves it as "mandelbrot.png", then tries to open the file.
/// </summary>
/// <param name="args">Command-line arguments (unused).</param>
static void Main(string[] args)
{
    const int redo = 20;

    int[] light_net = new int[N * N];
    int[] light_cuda = new int[N * N];

    #region c#
    for (int i = 0; i < redo; ++i)
    {
        ComputeImage(light_net, false);
    }
    #endregion c#

    HybRunner runner = HybRunner.Cuda("Mandelbrot_CUDA.dll").SetDistrib(32, 32, 16, 16, 1, 0);
    wrapper = runner.Wrap(new Program());

    // profile with nsight to get performance
    #region cuda
    for (int i = 0; i < redo; ++i)
    {
        ComputeImage(light_cuda, true);
    }
    #endregion

    #region save to image
    // One color per iteration count; the last entry (maxiter) is black.
    Color[] colors = new Color[maxiter + 1];
    for (int k = 0; k < maxiter; ++k)
    {
        int red = (int)(127.0F * (float)k / (float)maxiter);
        int green = (int)(200.0F * (float)k / (float)maxiter);
        int blue = (int)(90.0F * (float)k / (float)maxiter);
        colors[k] = Color.FromArgb(red, green, blue);
    }
    colors[maxiter] = Color.Black;

    // Dispose the bitmap as soon as it is saved to release its GDI+ resources.
    using (Bitmap image = new Bitmap(N, N))
    {
        for (int i = 0; i < N; ++i)
        {
            for (int j = 0; j < N; ++j)
            {
                int index = i * N + j;
                image.SetPixel(i, j, colors[light_cuda[index]]);
            }
        }

        image.Save("mandelbrot.png", System.Drawing.Imaging.ImageFormat.Png);
    }
    #endregion

    // Best effort only: opening the image fails on non-interactive machines.
    try
    {
        Process.Start("mandelbrot.png");
    }
    catch
    {
    }
}
/// <summary>
/// Derives a prediction interval for the totient from the regressions in
/// <paramref name="data"/>, optionally prints diagnostics, then calls
/// <c>GuessTotient</c> through a HybRunner wrapper.
/// </summary>
/// <param name="data">Provides <c>SigDigitsRegression</c> and <c>MinRangeRegression</c>.</param>
/// <param name="n">The number whose decimal digits are compared with the totient's.</param>
/// <param name="keySize">Input to the significant-digits regression curve.</param>
/// <param name="e">Not used by this method.</param>
/// <param name="realTotient">Known totient, used for diagnostics and the target value.</param>
/// <param name="debug">When true, diagnostic lines are written to the console.</param>
public static void Run(Stats data, BigInteger n, int keySize, BigInteger e, BigInteger realTotient, bool debug)
{
    // BigInteger.ToString() is expensive; convert each value once and reuse it.
    var nStr = n.ToString();
    var totStr = realTotient.ToString();

    var expectedSigDigits = (int)Math.Round(data.SigDigitsRegression.GetRegressionCurve().ValueAt(keySize) - .5);
    var baseNString = nStr.Substring(0, expectedSigDigits);
    var dynamicNDouble = double.Parse(BigInteger.Parse(nStr.Substring(expectedSigDigits)).ToString());

    //Remove this block after testing
    // Count the leading digits shared by n and the totient. The scan is bounded by the
    // shorter string: the totient may have fewer digits than n, and indexing it up to
    // n's length would throw.
    var comparableLength = Math.Min(nStr.Length, totStr.Length);
    var actualShared = 0;
    while (actualShared < comparableLength && nStr[actualShared] == totStr[actualShared])
    {
        actualShared++;
    }

    var target = BigInteger.Parse(totStr.Substring(expectedSigDigits));

    var interval = data.MinRangeRegression.GetPredictionInterval(dynamicNDouble, .99);
    // A negative lower bound is clamped to zero.
    var min = interval.LowerBound < 0 ? 0 : new BigInteger(interval.LowerBound);
    var max = new BigInteger(interval.UpperBound);

    if (debug)
    {
        Console.WriteLine("\n N: " + nStr.Substring(actualShared));
        Console.WriteLine("Totient: " + totStr.Substring(actualShared));
        Console.WriteLine("Predicted Shared: " + expectedSigDigits);
        Console.WriteLine("Actual Shared: " + actualShared);
        Console.WriteLine("Diff: " + double.Parse(BigInteger.Subtract(n, realTotient).ToString()));
        //Console.WriteLine("Estimated Diff Magnitude: " + mag);
        //Console.WriteLine(max >= target && target >= min);
        Console.WriteLine("Range: " + interval);
    }

    cudaDeviceProp prop;
    cuda.GetDeviceProperties(out prop, 0);

    //if .SetDistrib is not used, the default is .SetDistrib(prop.multiProcessorCount * 16, 128)
    HybRunner runner = HybRunner.Cuda();

    // create a wrapper object to call GPU methods instead of C#
    dynamic wrapped = runner.Wrap(new CryptoExternal());

    wrapped.GuessTotient();
    //wrapped.GuessTotient(min,max,target);

    // Wait for the device and surface any error before reporting completion.
    cuda.ERROR_CHECK(cuda.DeviceSynchronize());

    Console.Out.WriteLine("DONE");
}
/// <summary>
/// Entry point: copies two arrays to device buffers, calls <c>Add</c> on the device
/// pointers through a HybRunner wrapper, copies the first buffer back and verifies
/// that every <c>a[i]</c> equals <c>i + 1</c>. Exits with code 6 on the first mismatch.
/// </summary>
/// <param name="args">Command-line arguments (unused).</param>
unsafe static void Main(string[] args)
{
    int N = 1024 * 1024 * 32;

    float[] a = new float[N];
    float[] b = new float[N];
    int k = 0;
    while (k < N)
    {
        a[k] = (float)k;
        b[k] = 1.0F;
        ++k;
    }

    // Device pointers.
    IntPtr d_a, d_b;
    cuda.Malloc(out d_a, N * sizeof(float));
    cuda.Malloc(out d_b, N * sizeof(float));

    // Pin the managed arrays so their addresses can be handed to cuda.Memcpy.
    GCHandle handle_a = GCHandle.Alloc(a, GCHandleType.Pinned);
    GCHandle handle_b = GCHandle.Alloc(b, GCHandleType.Pinned);
    IntPtr h_a = handle_a.AddrOfPinnedObject();
    IntPtr h_b = handle_b.AddrOfPinnedObject();

    cuda.DeviceSynchronize();

    // Host -> device.
    cuda.Memcpy(d_a, h_a, N * sizeof(float), cudaMemcpyKind.cudaMemcpyHostToDevice);
    cuda.Memcpy(d_b, h_b, N * sizeof(float), cudaMemcpyKind.cudaMemcpyHostToDevice);

    dynamic wrapped = HybRunner.Cuda().Wrap(new Program());
    wrapped.Add(d_a, d_b, N);

    cuda.DeviceSynchronize();

    // Device -> host (only the first buffer is read back).
    cuda.Memcpy(h_a, d_a, N * sizeof(float), cudaMemcpyKind.cudaMemcpyDeviceToHost);

    for (int i = 0; i < N; ++i)
    {
        if (a[i] == (float)i + 1.0F)
        {
            continue;
        }

        Console.Error.WriteLine("ERROR at {0} -- {1} != {2}", i, a[i], i + 1);
        Environment.Exit(6); // abort
    }

    handle_a.Free();
    handle_b.Free();
}
/// <summary>
/// Entry point: marshals the float image, creates a texture object from it through
/// <c>TextureHelpers</c>, calls <c>Sobel</c> with that texture through a HybRunner
/// wrapper and saves the result.
/// </summary>
/// <param name="args">Command-line arguments (unused).</param>
static void Main(string[] args)
{
    HybRunner runner = HybRunner.Cuda().SetDistrib(32, 32, 16, 16, 1, 0);

    GrayBitmap source = GrayBitmap.Load("../../images/lena512.bmp");
    uint height = source.Height, width = source.Width;
    uint pixelCount = width * height;

    ushort[] sourcePixels = source.PixelsUShort;
    float[] imageFloat = new float[pixelCount];
    float[] imageCompute = new float[pixelCount];

    // Widen the 16-bit pixels to floats.
    for (int p = 0; p < pixelCount; ++p)
    {
        imageFloat[p] = (float)sourcePixels[p];
    }

    IntPtr src = runner.Marshaller.MarshalManagedToNative(imageFloat);

    // Channel description, CUDA array and resource descriptor for the texture.
    cudaChannelFormatDesc channelDescTex = TextureHelpers.cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKind.cudaChannelFormatKindFloat);
    cudaArray_t cuArrayTex = TextureHelpers.CreateCudaArray(channelDescTex, src, (int)width, (int)height);
    cudaResourceDesc resDescTex = TextureHelpers.CreateCudaResourceDesc(cuArrayTex);

    // Texture descriptor.
    cudaTextureDesc texDesc = TextureHelpers.CreateCudaTextureDesc();

    // Texture object.
    cudaTextureObject_t texObj;
    cuda.CreateTextureObject(out texObj, ref resDescTex, ref texDesc);

    dynamic wrapper = runner.Wrap(new Program());
    wrapper.Sobel(texObj, imageCompute, (int)width, (int)height);

    // Narrow the computed values back to 16-bit pixels.
    ushort[] edgePixels = new ushort[pixelCount];
    for (int p = 0; p < pixelCount; ++p)
    {
        edgePixels[p] = (ushort)imageCompute[p];
    }

    GrayBitmap imageSobel = new GrayBitmap(width, height) { PixelsUShort = edgePixels };
    imageSobel.Save("../../output-03-surface/sobel.bmp");
}
/// <summary>
/// Entry point: multiplies two matrices <c>redo</c> times by calling
/// <c>Multiply</c> through a HybRunner wrapper, then computes the host result
/// with <c>Reference</c>.
/// </summary>
/// <param name="args">
/// Either empty (512x512 defaults are used) or four integers:
/// heightA widthA heightB widthB.
/// </param>
/// <exception cref="ArgumentException">
/// Fewer than four arguments were supplied, or widthA differs from heightB.
/// </exception>
static void Main(string[] args)
{
    if (args.Length == 0)
    {
        args = new string[] { "512", "512", "512", "512" };
    }
    else if (args.Length < 4)
    {
        // Fail with a clear message instead of an IndexOutOfRangeException below.
        throw new ArgumentException("invalid data -- expected 4 arguments: heightA widthA heightB widthB");
    }

    const int redo = 10;

    int heightA = Convert.ToInt32(args[0]);
    int widthA = Convert.ToInt32(args[1]);
    int heightB = Convert.ToInt32(args[2]);
    int widthB = Convert.ToInt32(args[3]);
    if (widthA != heightB)
    {
        throw new ArgumentException("invalid data -- incompatible matrices");
    }

    Console.WriteLine("Execution Naive matrix mul with sizes ({0}, {1}) x ({2}, {3})", heightA, widthA, heightB, widthB);

    NaiveMatrix matrixA = new NaiveMatrix(widthA, heightA);
    NaiveMatrix matrixB = new NaiveMatrix(widthB, heightB);

    NaiveMatrix res_net = new NaiveMatrix(widthB, heightA);
    NaiveMatrix res_cuda = new NaiveMatrix(widthB, heightA);

    // Operation count scaled by 3.0E-9 (not used further in this method).
    double numberCompute = ((double)matrixA.Height * (double)matrixA.Width * (double)matrixB.Width) * 3.0E-9;

    matrixA.FillMatrix();
    matrixB.FillMatrix();

    #region CUDA
    HybRunner runner = HybRunner.Cuda("SharedMatrix_CUDA.dll").SetDistrib(4, 5, 32, 32, 1, 1024 * 2 * 8);
    dynamic wrapper = runner.Wrap(new Program());

    for (int i = 0; i < redo; ++i)
    {
        wrapper.Multiply(res_cuda, matrixA, matrixB, matrixA.Width);
    }
    #endregion

    #region C#
    Reference(res_net, matrixA, matrixB);
    #endregion

    Console.Out.WriteLine("DONE");
}
/// <summary>
/// Entry point: allocates three arrays of one million doubles and calls
/// <c>Run</c> on them through a HybRunner wrapper.
/// </summary>
public static void Main()
{
    const int n = 1_000_000;

    double[] a = new double[n];
    double[] b = new double[n];
    double[] results = new double[n];

    cudaDeviceProp deviceProperties;
    cuda.GetDeviceProperties(out deviceProperties, 0);

    // Distribution: 16 x multiprocessor count, and 256.
    int gridDimX = deviceProperties.multiProcessorCount * 16;
    HybRunner runner = HybRunner.Cuda("DotnetosGPU.Hybridizer_CUDA.dll").SetDistrib(gridDimX, 256);

    // Wrapper object used to call the GPU version instead of the C# one.
    dynamic wrapped = runner.Wrap(new Program());

    wrapped.Run(n, a, b, results);
}
/// <summary>
/// Entry point: calls <c>Run</c> on a small integer array through a HybRunner
/// wrapper with a 4x4 distribution, then synchronizes and checks for errors.
/// </summary>
/// <param name="args">Command-line arguments (unused).</param>
static void Main(string[] args)
{
    int[] values = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13 };

    // Runner that wraps calls on the GPU.
    HybRunner runner = HybRunner.Cuda().SetDistrib(4, 4);

    // Wrapper object used to call the GPU version instead of the C# one.
    dynamic wrapped = runner.Wrap(new Program());

    wrapped.Run(values.Length, values);

    // Synchronizing flushes stdout on the device; the returned status is checked.
    var syncStatus = cuda.DeviceSynchronize();
    cuda.ERROR_CHECK(syncStatus);
}
/// <summary>
/// Entry point: prints the current device's name, multiprocessor count and a total
/// core count looked up from its compute capability, then prints the default
/// HybRunner grid/block configuration.
/// </summary>
/// <param name="args">Command-line arguments (unused).</param>
static void Main(string[] args)
{
    int currentDevice;
    cuda.GetDevice(out currentDevice);
    cudaDeviceProp prop;
    cuda.GetDeviceProperties(out prop, currentDevice);

    Console.Out.WriteLine("Device properties:");
    Console.Out.WriteLine("\tname : {0}", new String(prop.name));
    Console.Out.WriteLine("\tmultiProcessorCount : {0}", prop.multiProcessorCount);

    // Cores per multiprocessor, keyed by major * 10 + minor; unlisted values give 0.
    int capability = prop.major * 10 + prop.minor;
    int coresPerMultiprocessor;
    switch (capability)
    {
        case 35:
        case 37:
            coresPerMultiprocessor = 192;
            break;
        case 50:
        case 61:
            coresPerMultiprocessor = 128;
            break;
        case 60:
        case 70:
            coresPerMultiprocessor = 64;
            break;
        default:
            coresPerMultiprocessor = 0;
            break;
    }

    Console.Out.WriteLine("\tTotal cores : {0}", coresPerMultiprocessor * prop.multiProcessorCount);

    HybRunner runner = HybRunner.Cuda();

    var gridDimX = runner.GridDimX;
    var gridDimY = runner.GridDimY;
    var blockDimX = runner.BlockDimX;
    var blockDimY = runner.BlockDimY;
    var blockDimZ = runner.BlockDimZ;

    Console.Out.WriteLine("Runner configuration:");
    Console.Out.WriteLine("\tGridDimX = {0}", gridDimX);
    Console.Out.WriteLine("\tGridDimY = {0}", gridDimY);
    Console.Out.WriteLine("\tBlockDimX = {0}", blockDimX);
    Console.Out.WriteLine("\tBlockDimY = {0}", blockDimY);
    Console.Out.WriteLine("\tBlockDimZ = {0}", blockDimZ);

    Console.Out.WriteLine("TOTAL parallelization = {0}", gridDimX * gridDimY * blockDimX * blockDimY * blockDimZ);
}
/// <summary>
/// Runs <c>Run</c> once through a HybRunner wrapper created from
/// <paramref name="dllname"/> and once directly, on identical random inputs, and
/// prints "ERROR !" for every element where the two results differ.
/// </summary>
/// <param name="dllname">Name passed to <c>HybRunner.Cuda</c>.</param>
public void Test(string dllname)
{
    // 268 MB allocated on device -- should fit in every CUDA compatible GPU
    int N = 1024 * 1024 * 16;

    double[] acuda = new double[N];
    double[] adotnet = new double[N];
    double[] b = new double[N];

    // acuda and adotnet start with the same random values; b gets its own.
    Random rand = new Random();
    int i = 0;
    while (i < N)
    {
        double value = rand.NextDouble();
        acuda[i] = value;
        adotnet[i] = value;
        b[i] = rand.NextDouble();
        ++i;
    }

    cudaDeviceProp deviceProperties;
    cuda.GetDeviceProperties(out deviceProperties, 0);

    int gridDimX = deviceProperties.multiProcessorCount * 16;
    HybRunner runner = HybRunner.Cuda(dllname).SetDistrib(gridDimX, 128);

    // Wrapper object used to call the GPU version instead of the C# one.
    dynamic wrapped = runner.Wrap(this);

    // GPU run, then the plain .NET run on the second copy.
    wrapped.Run(N, acuda, b);
    Run(N, adotnet, b);

    // Compare the two results element by element.
    for (int k = 0; k < N; ++k)
    {
        if (acuda[k] == adotnet[k])
        {
            continue;
        }

        Console.Out.WriteLine("ERROR !");
    }

    Console.Out.WriteLine("DONE");
    //Thread.Sleep(10000);
}