Example #1
        static void Main(string[] args)
        {
            // configure CUDA
            cudaDeviceProp prop;

            cuda.GetDeviceProperties(out prop, 0);
            const int BLOCK_DIM = 256;

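            // launch configuration below: a 1D grid of 16 blocks per multiprocessor, 1D blocks of
            // BLOCK_DIM threads, and BLOCK_DIM * sizeof(float) bytes of shared memory per block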
            runner  = HybRunner.Cuda().SetDistrib(16 * prop.multiProcessorCount, 1, BLOCK_DIM, 1, 1, BLOCK_DIM * sizeof(float));
            wrapper = runner.Wrap(new Program());

            int                size = 1000000; // very slow convergence with no preconditioner
            SparseMatrix       A    = SparseMatrix.Laplacian_1D(size);
            FloatResidentArray B    = new FloatResidentArray(size);
            FloatResidentArray X    = new FloatResidentArray(size);

            int   maxiter = 1000;
            float eps     = 1.0e-09f;

            for (int i = 0; i < size; ++i)
            {
                B[i] = 1.0f; // right-hand side
                X[i] = 0.0f; // starting point
            }

            ConjugateGradient(X, A, B, maxiter, eps);
        }
Example #2
        static void Main(string[] args)
        {
            // open the input image
            Bitmap baseImage = (Bitmap)Image.FromFile("../../images/lena_highres_greyscale.bmp");
            int    height = baseImage.Height, width = baseImage.Width;

            // create the result image
            Bitmap resImage = new Bitmap(width, height);

            // allocate flat pixel buffers for the input and output images

            byte[] inputPixels  = new byte[width * height];
            byte[] outputPixels = new byte[width * height];

            ReadImage(inputPixels, baseImage, width, height);

            // configure the CUDA runner: a 32x32 grid of 16x16 thread blocks

            HybRunner runner  = HybRunner.Cuda().SetDistrib(32, 32, 16, 16, 1, 0);
            dynamic   wrapper = runner.Wrap(new Program());

            wrapper.ComputeSobel(outputPixels, inputPixels, width, height);

            // save the result image and try to open it
            SaveImage("lena_highres_sobel.bmp", outputPixels, width, height);
            try { Process.Start("lena_highres_sobel.bmp"); } catch {}            // catch exception for non interactives machines
        }
Example #3
        static void Main(string[] args)
        {
            GrayBitmap image    = GrayBitmap.Load("../../images/lena_highres_greyscale_noise.bmp");
            GrayBitmap denoised = new GrayBitmap(image.Width, image.Height);

            ushort[] input  = image.PixelsUShort;
            ushort[] output = new ushort[image.Width * image.Height];

            Stopwatch watch = new Stopwatch();

            watch.Start();

            int window = 3;

            // create an instance of runner
            HybRunner runner = HybRunner.Cuda();
            // wrap a new instance of Program
            dynamic wrapper = runner.Wrap(new Program());

            // run the method on GPU
            wrapper.ParForGPU(output, input, (int)image.Width, (int)image.Height, window);

            watch.Stop();
            string time = String.Format("{0:0.00}", watch.ElapsedMilliseconds * 1.0E-3);

            Console.WriteLine($"Naive GPU time : {time}");
            denoised.PixelsUShort = output;
            denoised.Save("../../output-03-naive-gpu/denoised.bmp");
        }
Example #4
        static void Main(string[] args)
        {
            SparseMatrix A = SparseMatrix.Laplacian_1D(10000000);

            float[] X = VectorReader.GetSplatVector(10000000, 1.0F);

            int    redo = 2;
            double memoryOperationsSize = (double)redo * (3.0 * (double)(A.data.Length * sizeof(float)) + (double)(2 * A.rows.Length * sizeof(uint)) + (double)(A.indices.Length * sizeof(uint)));

            Console.WriteLine("matrix read --- starting computations");

            float[] B = new float[A.rows.Length - 1];

            #region CUDA
            cudaDeviceProp prop;
            cuda.GetDeviceProperties(out prop, 0);

            HybRunner runner  = HybRunner.Cuda("SparseMatrix_CUDA.dll").SetDistrib(8 * prop.multiProcessorCount, 256);
            dynamic   wrapper = runner.Wrap(new Program());

            for (int i = 0; i < redo; ++i)
            {
                wrapper.Multiply(B, A, X, X.Length);
            }
            #endregion
        }
Example #5
        static void Main(string[] args)
        {
            GrayBitmap image    = GrayBitmap.Load("../../images/lena_highres_greyscale_noise.bmp");
            GrayBitmap denoised = new GrayBitmap(image.Width, image.Height);

            ushort[] input  = image.PixelsUShort;
            ushort[] output = new ushort[image.Width * image.Height];

            Stopwatch watch = new Stopwatch();

            watch.Start();

            dim3 grid  = new dim3(<gridX>, <gridY>, 1);
            dim3 block = new dim3(<blockX>, <blockY>, 1);

            // create an instance of runner
            HybRunner runner = HybRunner.Cuda();
            // wrap a new instance of Program
            dynamic wrapper = runner.Wrap(new Filter());

            // run the method on GPU
            wrapper.SetDistrib(grid, block).ParForGPU(output, input, (int)image.Width, (int)image.Height);

            watch.Stop();
            string time = String.Format("{0:0.00}", watch.ElapsedMilliseconds * 1.0E-3);

            Console.WriteLine($"Parallel2D GPU time : {time}");
            denoised.PixelsUShort = output;
            denoised.Save("../../output-05-dice-gpu/denoised.bmp");
        }
Example #6
        static void Main(string[] args)
        {
            const int size = 512;

            byte[,] input = new byte[size, size];
            for (int i = 0; i < size; ++i)
            {
                for (int j = 0; j < size; ++j)
                {
                    input[i, j] = (byte)1;
                }
            }

            dynamic wrapper = HybRunner.Cuda().SetDistrib(32, 32, 16, 16, 1, 0).Wrap(new Program());

            wrapper.Run(input);

            for (int i = 0; i < size; ++i)
            {
                for (int j = 0; j < size; ++j)
                {
                    if (input[i, j] != (byte)((i + j) % 256))
                    {
                        Console.Out.WriteLine("error in " + i + " " + j);
                    }
                }
            }

            Console.Out.WriteLine("DONE");
        }
Example #7
        static void Main(string[] args)
        {
            Random    random = new Random();
            const int N      = 1024 * 1024 * 32;

            int[] a = new int[N];
            for (int i = 0; i < N; ++i)
            {
                a[i] = (random.NextDouble() < 0.2) ? 1 : 0;
            }

            int[] result = new int[1];

            cudaDeviceProp prop;

            cuda.GetDeviceProperties(out prop, 0);

            const int BLOCK_DIM = 256;
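            // the last SetDistrib argument reserves BLOCK_DIM * sizeof(int) bytes of dynamic shared
            // memory per block, presumably used by ReduceAdd for its per-block partial sums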
            HybRunner runner    = HybRunner.Cuda().SetDistrib(16 * prop.multiProcessorCount, 1, BLOCK_DIM, 1, 1, BLOCK_DIM * sizeof(int));

            dynamic wrapped = runner.Wrap(new Program());

            wrapped.ReduceAdd(N, a, result);

            cuda.DeviceSynchronize();
            Console.Out.WriteLine("sum =      {0}", result[0]);
            Console.Out.WriteLine("expected = {0}", a.Aggregate((i, j) => i + j));
        }
Example #8
        public static void Main()
        {
            const int N = 1024 * 1024 * 256;

            float[] a   = new float[N];
            float[] b   = new float[N];
            float[] dst = new float[N];

            for (int i = 0; i < N; ++i)
            {
                a[i] = (float)i;
                b[i] = 1.0F;
            }

            dynamic wrapped = HybRunner.Cuda().Wrap(new Program()); // create a CUDA runner from current program

            wrapped.Add(dst, a, b, N);                              // invoke method on the GPU

            cuda.DeviceSynchronize();                               // kernel calls are asynchronous. We need to wait for it to terminate.

            for (int i = 0; i < N; ++i)
            {
                if (dst[i] != (float)i + 1.0F)
                {
                    Console.Error.WriteLine("ERROR at {0} -- {1} != {2}", i, dst[i], i + 1);
                    Environment.Exit(6); // abort
                }
            }

            Console.Out.WriteLine("OK");
        }
Example #9
        static void Main(string[] args)
        {
            HybRunner runner = HybRunner.Cuda().SetDistrib(32, 32, 16, 16, 1, 0);

            GrayBitmap image = GrayBitmap.Load("../../images/lena512.bmp");
            uint       height = image.Height, width = image.Width;

            ushort[] inputPixels = image.PixelsUShort;

            float[] imageFloat   = new float[width * height];
            float[] imageCompute = new float[width * height];
            for (int i = 0; i < width * height; ++i)
            {
                imageFloat[i] = (float)inputPixels[i];
            }

            dynamic wrapper = runner.Wrap(new Program());

            wrapper.Sobel(imageFloat, imageCompute, (int)width, (int)height);

            ushort[] outputPixel = new ushort[width * height];
            for (int i = 0; i < width * height; ++i)
            {
                outputPixel[i] = (ushort)imageCompute[i];
            }

            GrayBitmap imageSobel = new GrayBitmap(width, height);

            imageSobel.PixelsUShort = outputPixel;
            imageSobel.Save("../../output-01-gpu/sobel.bmp");
        }
Example #10
        public static void Main()
        {
            const int N = 1024 * 1024 * 32;

            float[] a = new float[N];
            float[] b = new float[N];

            for (int i = 0; i < N; ++i)
            {
                a[i] = (float)i;
                b[i] = 1.0F;
            }

            dynamic wrapped = HybRunner.Cuda().Wrap(new Program());

            wrapped.Add(a, b, N);

            cuda.DeviceSynchronize();

            for (int i = 0; i < N; ++i)
            {
                if (a[i] != (float)i + 1.0F)
                {
                    Console.Error.WriteLine("ERROR at {0} -- {1} != {2}", i, a[i], i + 1);
                    Environment.Exit(6); // abort
                }
            }

            Console.Out.WriteLine("OK");
        }
Example #11
        static void Main(string[] args)
        {
            dynamic wrapped = HybRunner.Cuda().SetDistrib(1, 1).Wrap(new Program());

            wrapped.Run();

            Console.Out.WriteLine("DONE");
        }
Example #12
        unsafe static void Main(string[] args)
        {
            int nStreams = 8;

            cudaStream_t[] streams = new cudaStream_t[nStreams];

            //create streams
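            // a possible completion of this step (assumption: the Hybridizer cuda class exposes
            // StreamCreate the same way it exposes Malloc/Memcpy used below)
            for (int s = 0; s < nStreams; ++s)
            {
                cuda.StreamCreate(out streams[s]);
            }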

            int N = 1024 * 1024 * 32;

            float[] a = new float[N];
            float[] b = new float[N];

            for (int k = 0; k < N; ++k)
            {
                a[k] = (float)k;
                b[k] = 1.0F;
            }

            IntPtr d_a, d_b; // device pointers

            cuda.Malloc(out d_a, N * sizeof(float));
            cuda.Malloc(out d_b, N * sizeof(float));

            GCHandle handle_a = GCHandle.Alloc(a, GCHandleType.Pinned);
            GCHandle handle_b = GCHandle.Alloc(b, GCHandleType.Pinned);
            IntPtr   h_a      = handle_a.AddrOfPinnedObject();
            IntPtr   h_b      = handle_b.AddrOfPinnedObject();

            cuda.DeviceSynchronize();

            cuda.Memcpy(d_a, h_a, N * sizeof(float), cudaMemcpyKind.cudaMemcpyHostToDevice);
            cuda.Memcpy(d_b, h_b, N * sizeof(float), cudaMemcpyKind.cudaMemcpyHostToDevice);

            int slice = N / nStreams; // number of elements processed by each stream

            dynamic wrapped = HybRunner.Cuda().Wrap(new Program());
            int     start;
            int     stop;

            // call kernel with each stream

            // copy data device to host
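            // a possible completion (assumption: cuda.MemcpyAsync mirrors cudaMemcpyAsync):
            // each stream copies its own slice of the result back to the pinned host buffer
            for (int k = 0; k < nStreams; ++k)
            {
                int offsetBytes = k * slice * sizeof(float);
                cuda.MemcpyAsync(IntPtr.Add(h_a, offsetBytes), IntPtr.Add(d_a, offsetBytes),
                                 slice * sizeof(float), cudaMemcpyKind.cudaMemcpyDeviceToHost, streams[k]);
            }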

            // synchronize and destroy streams
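            // a possible completion (assumption: cuda.StreamSynchronize / cuda.StreamDestroy
            // mirror their CUDA runtime counterparts)
            for (int s = 0; s < nStreams; ++s)
            {
                cuda.StreamSynchronize(streams[s]);
                cuda.StreamDestroy(streams[s]);
            }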

            for (int i = 0; i < N; ++i)
            {
                if (a[i] != (float)i + (1.0F * 100.0F))
                {
                    Console.Error.WriteLine("ERROR at {0} -- {1} != {2}", i, a[i], i + 1);
                    Environment.Exit(6); // abort
                }
            }

            handle_a.Free();
            handle_b.Free();
        }
Example #13
        static void Main(string[] args)
        {
            const int N = 1024 * 1024 * 32;

            float[] a = new float[N];

            // initialization
            Random random = new Random(42);

            Parallel.For(0, N, i => a[i] = (float)random.NextDouble());

            // hybridizer configuration
            cudaDeviceProp prop;

            cuda.GetDeviceProperties(out prop, 0);
            int gridDimX  = 16 * prop.multiProcessorCount;
            int blockDimX = 256;

            cuda.DeviceSetCacheConfig(cudaFuncCache.cudaFuncCachePreferShared);
            HybRunner runner = HybRunner.Cuda().SetDistrib(gridDimX, 1, blockDimX, 1, 1, blockDimX * sizeof(float));

            float[] buffMax     = new float[1];
            float[] buffAdd     = new float[1];
            var     maxReductor = new GridReductor<MaxReductor>();
            var     addReductor = new GridReductor<AddReductor>();
            dynamic wrapped     = runner.Wrap(new EntryPoints());

            // device reduction
            wrapped.ReduceMax(maxReductor, buffMax, a, N);
            wrapped.ReduceAdd(addReductor, buffAdd, a, N);
            cuda.ERROR_CHECK(cuda.DeviceSynchronize());

            // check results
            float expectedMax = a.AsParallel().Aggregate((x, y) => Math.Max(x, y));
            float expectedAdd = a.AsParallel().Aggregate((x, y) => x + y);
            bool  hasError    = false;

            if (buffMax[0] != expectedMax)
            {
                Console.Error.WriteLine($"MAX Error : {buffMax[0]} != {expectedMax}");
                hasError = true;
            }

            // floating-point addition is not associative, so the two sums cannot be expected to match exactly
            // https://en.wikipedia.org/wiki/Associative_property#Nonassociativity_of_floating_point_calculation
            if (Math.Abs(buffAdd[0] - expectedAdd) / expectedAdd > 1.0E-5F)
            {
                Console.Error.WriteLine($"ADD Error : {buffAdd[0]} != {expectedAdd}");
                hasError = true;
            }

            if (hasError)
            {
                Environment.Exit(1);
            }

            Console.Out.WriteLine("OK");
        }
Example #14
        static void Main(string[] args)
        {
            cudaDeviceProp prop;

            cuda.GetDeviceProperties(out prop, 0);
            int N       = prop.multiProcessorCount * 256;
            var a       = new Element[N];
            var a_verif = new Element[N];

            double[] b      = new double[N];
            Random   random = new Random(42);

            for (int i = 0; i < N; ++i)
            {
                a[i]       = new Element((double)i);
                a_verif[i] = new Element((double)i);
                b[i]       = 1.0;
                double rand = random.NextDouble();
                if (rand < 0.33)
                {
                    a[i].Decoration.Add(Filters.A, new A());
                    a_verif[i].Decoration.Add(Filters.A, new A());
                }
                else if (rand < 0.66)
                {
                    a[i].Decoration.Add(Filters.B, new A());
                    a_verif[i].Decoration.Add(Filters.B, new A());
                }
                else
                {
                    a[i].Decoration.Add(Filters.A, new B());
                    a_verif[i].Decoration.Add(Filters.A, new B());
                }
            }

            CudaMarshaler.changeAggregation(true);
            var runner = HybRunner.Cuda();

            Console.WriteLine("Init done");
            dynamic wrap = runner.Wrap(new Program());

            wrap.Run(N, a, b, Filters.A);
            cuda.ERROR_CHECK(cuda.DeviceSynchronize());
            Console.WriteLine("Kernel done");

            Run(N, a_verif, b, Filters.A);

            for (int i = 0; i < N; ++i)
            {
                if (a[i].Data != a_verif[i].Data)
                {
                    Console.WriteLine($"ERROR at {i} : {a[i].Data} != {a_verif[i].Data}");
                    Environment.Exit(1);
                }
            }

            Console.WriteLine("OK");
        }
Example #15
        static void Main(string[] args)
        {
            if (args.Length == 0)
            {
                args = new string[] { "512", "512", "512", "512" };
            }
            const int redo = 10;

            int heightA = Convert.ToInt32(args[0]);
            int widthA  = Convert.ToInt32(args[1]);
            int heightB = Convert.ToInt32(args[2]);
            int widthB  = Convert.ToInt32(args[3]);

            if (widthA != heightB)
            {
                throw new ArgumentException("invalid data -- incompatible matrices");
            }

            Console.WriteLine("Execution Naive matrix mul with sizes ({0}, {1}) x ({2}, {3})", heightA, widthA, heightB, widthB);

            NaiveMatrix matrixA = new NaiveMatrix(widthA, heightA);
            NaiveMatrix matrixB = new NaiveMatrix(widthB, heightB);

            NaiveMatrix res_net  = new NaiveMatrix(widthB, heightA);
            NaiveMatrix res_cuda = new NaiveMatrix(widthB, heightA);

            double numberCompute = ((double)matrixA.Height * (double)matrixA.Width * (double)matrixB.Width) * 3.0E-9;

            matrixA.FillMatrix();
            matrixB.FillMatrix();

            Random rand = new Random();

            #region CUDA

            HybRunner runner  = HybRunner.Cuda().SetDistrib(4, 5, 8, 32, 32, 0);
            dynamic   wrapper = runner.Wrap(new Program());

            for (int i = 0; i < redo; ++i)
            {
                wrapper.ComputeRowsOfProduct(res_cuda, matrixA, matrixB, 0, res_cuda.Height);
            }
            #endregion

            #region C#

            for (int i = 0; i < redo; ++i)
            {
                Parallel.For(0, res_net.Height, (line) =>
                {
                    ComputeRowsOfProduct(res_net, matrixA, matrixB, line, line + 1);
                });
            }
            #endregion

            Console.Out.WriteLine("DONE");
        }
Example #16
        public Program() : base(1024, 1024, GraphicsMode.Default, "Hybridizer Mandelbulb", GameWindowFlags.Default)
        {
            WindowBorder = WindowBorder.Fixed; // disable resize
            Init();
            runner  = HybRunner.Cuda().SetDistrib(32, 32, 16, 16, 1, 0);
            wrapped = runner.Wrap(new Mandelbulb());
            cuda.ERROR_CHECK(cuda.GetLastError());
            cuda.ERROR_CHECK(cuda.DeviceSynchronize());
        }
Example #17
        static void Main(string[] args)
        {
            int currentDevice;

            cuda.GetDevice(out currentDevice);
            cudaDeviceProp prop;

            cuda.GetDeviceProperties(out prop, currentDevice);

            GrayBitmap image    = GrayBitmap.Load("../../images/lena_highres_greyscale_noise.bmp");
            GrayBitmap denoised = new GrayBitmap(image.Width, image.Height);

            ushort[] input  = image.PixelsUShort;
            ushort[] output = new ushort[image.Width * image.Height];

            Stopwatch watch = new Stopwatch();

            watch.Start();

            int chunk;

            if ((prop.major >= 6) && (prop.minor == 0))
            {
                chunk = ((int)image.Height + (prop.multiProcessorCount / 2) - 1) / (prop.multiProcessorCount / 2);
            }
            else
            {
                chunk = ((int)image.Height + (prop.multiProcessorCount) - 1) / (prop.multiProcessorCount);
            }

            Console.Out.WriteLine("Chunk size = {0}", chunk);

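            // each grid row (grid.y below) presumably processes `chunk` consecutive image rows,
            // so the grid height is the number of such row chunks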
            dim3 grid  = new dim3(16, ((int)image.Height + chunk - 1) / chunk, 1);
            dim3 block = new dim3(128, 1, 1);

            // create an instance of runner
            HybRunner runner = HybRunner.Cuda();
            // wrap a new instance of Program
            dynamic wrapper = runner.Wrap(new Filter());

            // run the method on GPU
            wrapper.SetDistrib(grid, block).ParForGPU(output, input, (int)image.Width, (int)image.Height, chunk);
            cuda.DeviceSynchronize();

            watch.Stop();
            string time = String.Format("{0:0.00}", watch.ElapsedMilliseconds * 1.0E-3);

            string kernelTime = String.Format("{0:0.00}", runner.LastKernelDuration.ElapsedMilliseconds * 1.0E-3);

            Console.WriteLine($"SweepSort GPU time : {time}");
            Console.WriteLine($"SweepSort GPU -- kernel time : {kernelTime}");
            denoised.PixelsUShort = output;
            denoised.Save("../../output-07-cache-aware-gpu/denoised.bmp");

            cuda.DeviceReset();
        }
Example #18
        static void Main(string[] args)
        {
            float4[] callResult_net   = new float4[OPT_N / 4];
            float4[] putResult_net    = new float4[OPT_N / 4];
            float4[] stockPrice_net   = new float4[OPT_N / 4];
            float4[] optionStrike_net = new float4[OPT_N / 4];
            float4[] optionYears_net  = new float4[OPT_N / 4];

            float4[] callResult_cuda = new float4[OPT_N / 4];
            float4[] putResult_cuda  = new float4[OPT_N / 4];

            Random rand = new Random(Guid.NewGuid().GetHashCode());

            for (int i = 0; i < OPT_N / 4; ++i)
            {
                callResult_net[i]   = new float4(0.0f, 0.0f, 0.0f, 0.0f);
                putResult_net[i]    = new float4(-1.0f, -1.0f, -1.0f, -1.0f);
                callResult_cuda[i]  = new float4(0.0f, 0.0f, 0.0f, 0.0f);
                putResult_cuda[i]   = new float4(-1.0f, -1.0f, -1.0f, -1.0f);
                stockPrice_net[i]   = rand.NextFloat4(5.0f, 30.0f);
                optionStrike_net[i] = rand.NextFloat4(1.0f, 100.0f);
                optionYears_net[i]  = rand.NextFloat4(0.25f, 10f);
            }

            cudaDeviceProp prop;

            cuda.GetDeviceProperties(out prop, 0);
            HybRunner runner  = HybRunner.Cuda("BlackScholesFloat4_CUDA.dll").SetDistrib(8 * prop.multiProcessorCount, 256);
            dynamic   wrapper = runner.Wrap(new Program());

            for (int i = 0; i < NUM_ITERATIONS; ++i)
            {
                wrapper.BlackScholes(callResult_cuda,
                                     putResult_cuda,
                                     stockPrice_net,
                                     optionStrike_net,
                                     optionYears_net,
                                     0, OPT_N / 4);
            }
            for (int i = 0; i < NUM_ITERATIONS; ++i)
            {
                Parallel.For(0, OPT_N / 4, (opt) =>
                {
                    BlackScholes(callResult_net,
                                 putResult_net,
                                 stockPrice_net,
                                 optionStrike_net,
                                 optionYears_net,
                                 opt,
                                 opt + 1);
                });
            }

            WriteCalculationError(callResult_net, callResult_cuda, putResult_net, putResult_cuda);
        }
Example #19
        static void Main(string[] args)
        {
            // TODO: enable cudaOccupancyCalculator
            int deviceCount;
            int numThreads          = 128;
            int multiProcessorCount = 1;

            cuda.GetDeviceCount(out deviceCount);
            bool found = false;

            for (int i = 0; i < deviceCount; ++i)
            {
                cudaDeviceProp prop;
                cuda.GetDeviceProperties(out prop, i);
                if (prop.cooperativeLaunch != 0)
                {
                    cuda.SetDevice(i);
                    numThreads          = 128;
                    multiProcessorCount = prop.multiProcessorCount;
                    Console.Out.WriteLine($"running on device {i}");
                    found = true;
                    break;
                }
            }
            if (!found)
            {
                Console.Error.WriteLine("No GPU Found supporting Cooperative Launch");
                Environment.Exit(6);
            }


            var     runner         = HybRunner.Cuda().SetGridSync(true);
            dynamic wrapped        = runner.Wrap(new Program());
            int     maxBlocksPerSM = wrapped.MaxBlocksPerSM(new Action<float[], float[], uint>(reduceSinglePassMultiBlockCG), numThreads, numThreads * sizeof(double) + 16);
            int     numBlocks      = maxBlocksPerSM * multiProcessorCount;

            wrapped.SetDistrib(numBlocks, 1, numThreads, 1, 1, numThreads * sizeof(double));

            const int N = 1024 * 1024;

            float[] a = new float[N];
            float[] b = new float[numBlocks];

            for (int i = 0; i < N; ++i)
            {
                a[i] = 1.0F;
            }

            cuda.ERROR_CHECK((cudaError_t)(int)wrapped.reduceSinglePassMultiBlockCG(a, b, N));
            cuda.ERROR_CHECK(cuda.DeviceSynchronize());
            Console.Out.WriteLine(b[0]);
        }
Example #20
        public void ParallelCalculateHeatMap_Cuda(string dllName, double[,] heatMap, int width, int height, float widthTerrain, float heightTerrain, float destinationX, float destinationY)
        {
            cudaDeviceProp prop;

            cuda.GetDeviceProperties(out prop, 0);
            HybRunner runner = HybRunner.Cuda(dllName).SetDistrib(prop.multiProcessorCount * 16, 128);

            // create a wrapper object to call GPU methods instead of C#
            dynamic wrapped = runner.Wrap(this);

            // run the method on GPU
            wrapped.ParallelCalculateHeatMap(heatMap, width, height, widthTerrain, heightTerrain, destinationX, destinationY);
        }
Example #21
        static void Main(string[] args)
        {
            int[] a = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13 };

            // create an instance of HybRunner object to wrap calls on GPU
            HybRunner runner = HybRunner.Cuda().SetDistrib(4, 4);

            // create a wrapper object to call GPU methods instead of C#
            dynamic wrapped = runner.Wrap(new Program());

            // run the method on GPU
            wrapped.Run(a.Length, a);
        }
Example #22
        static void Main(string[] args)
        {
            const int redo = 20;

            int[] light_net  = new int[N * N];
            int[] light_cuda = new int[N * N];

            #region c#
            for (int i = 0; i < redo; ++i)
            {
                ComputeImage(light_net, false);
            }
            #endregion c#

            HybRunner runner = HybRunner.Cuda("Mandelbrot_CUDA.dll").SetDistrib(32, 32, 16, 16, 1, 0);
            wrapper = runner.Wrap(new Program());
            // profile with nsight to get performance
            #region cuda
            for (int i = 0; i < redo; ++i)
            {
                ComputeImage(light_cuda, true);
            }
            #endregion

            #region save to image
            Color[] colors = new Color[maxiter + 1];

            for (int k = 0; k < maxiter; ++k)
            {
                int red   = (int)(127.0F * (float)k / (float)maxiter);
                int green = (int)(200.0F * (float)k / (float)maxiter);
                int blue  = (int)(90.0F * (float)k / (float)maxiter);
                colors[k] = Color.FromArgb(red, green, blue);
            }
            colors[maxiter] = Color.Black;

            Bitmap image = new Bitmap(N, N);
            for (int i = 0; i < N; ++i)
            {
                for (int j = 0; j < N; ++j)
                {
                    int index = i * N + j;
                    image.SetPixel(i, j, colors[light_cuda[index]]);
                }
            }

            image.Save("mandelbrot.png", System.Drawing.Imaging.ImageFormat.Png);
            #endregion

            try { Process.Start("mandelbrot.png"); } catch {}            // catch exception for non interactives machines
        }
Example #23
        public static void Run(Stats data, BigInteger n, int keySize, BigInteger e, BigInteger realTotient, bool debug)
        {
            var expectedSigDigits = (int)Math.Round(data.SigDigitsRegression.GetRegressionCurve().ValueAt(keySize) - .5);

            var baseNString    = n.ToString().Substring(0, expectedSigDigits);
            var dynamicNDouble = double.Parse(BigInteger.Parse(n.ToString().Substring(expectedSigDigits)).ToString());

            //Remove this block after testing
            var nStr         = n.ToString();
            var totStr       = realTotient.ToString();
            var actualShared = 0;

            for (; actualShared < Math.Min(nStr.Length, totStr.Length); actualShared++)
            {
                if (nStr[actualShared] != totStr[actualShared])
                {
                    break;
                }
            }
            var target = BigInteger.Parse(realTotient.ToString().Substring(expectedSigDigits));

            var interval = data.MinRangeRegression.GetPredictionInterval(dynamicNDouble, .99);
            var min      = interval.LowerBound < 0 ? 0 : new BigInteger(interval.LowerBound);
            var max      = new BigInteger(interval.UpperBound);

            if (debug)
            {
                Console.WriteLine("\n      N: " + nStr.Substring(actualShared));
                Console.WriteLine("Totient: " + totStr.Substring(actualShared));
                Console.WriteLine("Predicted Shared: " + expectedSigDigits);
                Console.WriteLine("Actual Shared: " + actualShared);
                Console.WriteLine("Diff: " + double.Parse(BigInteger.Subtract(n, realTotient).ToString()));
                //Console.WriteLine("Estimated Diff Magnitude: " + mag);
                //Console.WriteLine(max >= target && target >= min);
                Console.WriteLine("Range: " + interval);
            }

            cudaDeviceProp prop;

            cuda.GetDeviceProperties(out prop, 0);
            //if .SetDistrib is not used, the default is .SetDistrib(prop.multiProcessorCount * 16, 128)
            HybRunner runner = HybRunner.Cuda();

            // create a wrapper object to call GPU methods instead of C#
            dynamic wrapped = runner.Wrap(new CryptoExternal());

            wrapped.GuessTotient();
            //wrapped.GuessTotient(min,max,target);

            Console.Out.WriteLine("DONE");
        }
Example #24
        unsafe static void Main(string[] args)
        {
            int N = 1024 * 1024 * 32;

            float[] a = new float[N];
            float[] b = new float[N];

            for (int k = 0; k < N; ++k)
            {
                a[k] = (float)k;
                b[k] = 1.0F;
            }

            IntPtr d_a, d_b; // device pointers

            cuda.Malloc(out d_a, N * sizeof(float));
            cuda.Malloc(out d_b, N * sizeof(float));

            GCHandle handle_a = GCHandle.Alloc(a, GCHandleType.Pinned);
            GCHandle handle_b = GCHandle.Alloc(b, GCHandleType.Pinned);
            IntPtr   h_a      = handle_a.AddrOfPinnedObject();
            IntPtr   h_b      = handle_b.AddrOfPinnedObject();

            cuda.DeviceSynchronize();

            cuda.Memcpy(d_a, h_a, N * sizeof(float), cudaMemcpyKind.cudaMemcpyHostToDevice);
            cuda.Memcpy(d_b, h_b, N * sizeof(float), cudaMemcpyKind.cudaMemcpyHostToDevice);

            dynamic wrapped = HybRunner.Cuda().Wrap(new Program());

            wrapped.Add(d_a, d_b, N);

            cuda.DeviceSynchronize();

            cuda.Memcpy(h_a, d_a, N * sizeof(float), cudaMemcpyKind.cudaMemcpyDeviceToHost);

            for (int i = 0; i < N; ++i)
            {
                if (a[i] != (float)i + 1.0F)
                {
                    Console.Error.WriteLine("ERROR at {0} -- {1} != {2}", i, a[i], i + 1);
                    Environment.Exit(6); // abort
                }
            }

            handle_a.Free();
            handle_b.Free();
        }
Example #25
        static void Main(string[] args)
        {
            HybRunner runner = HybRunner.Cuda().SetDistrib(32, 32, 16, 16, 1, 0);

            GrayBitmap image = GrayBitmap.Load("../../images/lena512.bmp");
            uint       height = image.Height, width = image.Width;

            ushort[] inputPixels = image.PixelsUShort;

            float[] imageFloat   = new float[width * height];
            float[] imageCompute = new float[width * height];
            for (int i = 0; i < width * height; ++i)
            {
                imageFloat[i] = (float)inputPixels[i];
            }

            IntPtr src = runner.Marshaller.MarshalManagedToNative(imageFloat);

            //bind texture
            cudaChannelFormatDesc channelDescTex = TextureHelpers.cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKind.cudaChannelFormatKindFloat);
            cudaArray_t           cuArrayTex     = TextureHelpers.CreateCudaArray(channelDescTex, src, (int)width, (int)height);
            cudaResourceDesc      resDescTex     = TextureHelpers.CreateCudaResourceDesc(cuArrayTex);

            //create Texture descriptor
            cudaTextureDesc texDesc = TextureHelpers.CreateCudaTextureDesc();

            //create Texture object
            cudaTextureObject_t texObj;

            cuda.CreateTextureObject(out texObj, ref resDescTex, ref texDesc);

            //create and bind surface

            dynamic wrapper = runner.Wrap(new Program());

            wrapper.Sobel(texObj, imageCompute, (int)width, (int)height);

            ushort[] outputPixel = new ushort[width * height];
            for (int i = 0; i < width * height; ++i)
            {
                outputPixel[i] = (ushort)imageCompute[i];
            }

            GrayBitmap imageSobel = new GrayBitmap(width, height);

            imageSobel.PixelsUShort = outputPixel;
            imageSobel.Save("../../output-03-surface/sobel.bmp");
        }
Example #26
        static void Main(string[] args)
        {
            if (args.Length == 0)
            {
                args = new string[] { "512", "512", "512", "512" };
            }
            const int redo = 10;

            int heightA = Convert.ToInt32(args[0]);
            int widthA  = Convert.ToInt32(args[1]);
            int heightB = Convert.ToInt32(args[2]);
            int widthB  = Convert.ToInt32(args[3]);

            if (widthA != heightB)
            {
                throw new ArgumentException("invalid data -- incompatible matrices");
            }

            Console.WriteLine("Execution Naive matrix mul with sizes ({0}, {1}) x ({2}, {3})", heightA, widthA, heightB, widthB);

            NaiveMatrix matrixA = new NaiveMatrix(widthA, heightA);
            NaiveMatrix matrixB = new NaiveMatrix(widthB, heightB);

            NaiveMatrix res_net  = new NaiveMatrix(widthB, heightA);
            NaiveMatrix res_cuda = new NaiveMatrix(widthB, heightA);

            double numberCompute = ((double)matrixA.Height * (double)matrixA.Width * (double)matrixB.Width) * 3.0E-9;

            matrixA.FillMatrix();
            matrixB.FillMatrix();

            #region CUDA

            HybRunner runner  = HybRunner.Cuda("SharedMatrix_CUDA.dll").SetDistrib(4, 5, 32, 32, 1, 1024 * 2 * 8);
            dynamic   wrapper = runner.Wrap(new Program());

            for (int i = 0; i < redo; ++i)
            {
                wrapper.Multiply(res_cuda, matrixA, matrixB, matrixA.Width);
            }
            #endregion

            #region C#
            Reference(res_net, matrixA, matrixB);
            #endregion

            Console.Out.WriteLine("DONE");
        }
Example #27
        public static void Main()
        {
            const int n       = 1_000_000;
            var       a       = new double[n];
            var       b       = new double[n];
            var       results = new double[n];

            cuda.GetDeviceProperties(out var prop, 0);
            HybRunner runner = HybRunner.Cuda("DotnetosGPU.Hybridizer_CUDA.dll")
                               .SetDistrib(prop.multiProcessorCount * 16, 256);

            // create a wrapper object to call GPU methods instead of C#
            dynamic wrapped = runner.Wrap(new Program());

            // run the method on GPU
            wrapped.Run(n, a, b, results);
        }
Example #28
        static void Main(string[] args)
        {
            int[] a = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13 };

            // create an instance of HybRunner object to wrap calls on GPU
            HybRunner runner = HybRunner.Cuda().SetDistrib(4, 4);

            // create a wrapper object to call GPU methods instead of C#
            dynamic wrapped = runner.Wrap(new Program());

            // run the method on GPU
            wrapped.Run(a.Length, a);

            // synchronize the GPU to flush stdout on the device
            // add error checking
            cuda.ERROR_CHECK(cuda.DeviceSynchronize());
        }
Example #29
        static void Main(string[] args)
        {
            int currentDevice;

            cuda.GetDevice(out currentDevice);

            cudaDeviceProp prop;

            cuda.GetDeviceProperties(out prop, currentDevice);
            Console.Out.WriteLine("Device properties:");
            Console.Out.WriteLine("\tname : {0}", new String(prop.name));
            Console.Out.WriteLine("\tmultiProcessorCount : {0}", prop.multiProcessorCount);

            int mm    = prop.major * 10 + prop.minor;
            int count = 0;

            switch (mm)
            {
            case 35: count = 192; break;

            case 37: count = 192; break;

            case 50: count = 128; break;

            case 60: count = 64; break;

            case 61: count = 128; break;

            case 70: count = 64; break;

            default: count = 0; break;
            }

            Console.Out.WriteLine("\tTotal cores : {0}", count * prop.multiProcessorCount);

            HybRunner runner = HybRunner.Cuda();

            Console.Out.WriteLine("Runner configuration:");
            Console.Out.WriteLine("\tGridDimX = {0}", runner.GridDimX);
            Console.Out.WriteLine("\tGridDimY = {0}", runner.GridDimY);
            Console.Out.WriteLine("\tBlockDimX = {0}", runner.BlockDimX);
            Console.Out.WriteLine("\tBlockDimY = {0}", runner.BlockDimY);
            Console.Out.WriteLine("\tBlockDimZ = {0}", runner.BlockDimZ);

            Console.Out.WriteLine("TOTAL parallelization = {0}", runner.GridDimX * runner.GridDimY * runner.BlockDimX * runner.BlockDimY * runner.BlockDimZ);
        }
Example #30
        public void Test(string dllname)
        {
            // 268 MB allocated on device -- should fit in every CUDA compatible GPU
            int N = 1024 * 1024 * 16;

            double[] acuda   = new double[N];
            double[] adotnet = new double[N];

            double[] b = new double[N];

            Random rand = new Random();

            // Initialize acuda, adotnet, and b with random doubles; acuda and adotnet hold the same values.
            for (int i = 0; i < N; ++i)
            {
                acuda[i]   = rand.NextDouble();
                adotnet[i] = acuda[i];
                b[i]       = rand.NextDouble();
            }

            cudaDeviceProp prop;

            cuda.GetDeviceProperties(out prop, 0);
            HybRunner runner = HybRunner.Cuda(dllname).SetDistrib(prop.multiProcessorCount * 16, 128);

            // create a wrapper object to call GPU methods instead of C#
            dynamic wrapped = runner.Wrap(this);

            // run the method on GPU
            wrapped.Run(N, acuda, b);

            // run .Net method
            Run(N, adotnet, b);

            // verify the results
            for (int k = 0; k < N; ++k)
            {
                if (acuda[k] != adotnet[k])
                {
                    Console.Out.WriteLine("ERROR !");
                }
            }
            Console.Out.WriteLine("DONE");
            //Thread.Sleep(10000);
        }