CudaAccelerator C# (CSharp) 코드 예제들

예제 #1

0

파일 보기

파일: Demo.cs 프로젝트: Joey9801/ilgpu-arrayview-sketch

        /// <summary>
        /// Entry point: creates an ILGPU context and a Cuda accelerator on it, then
        /// runs the three 2D array-view demos one after another.
        /// </summary>
        static void Main(string[] args)
        {
            using (var context = new Context())
            using (var accelerator = new CudaAccelerator(context))
            {
                Demo2DDenseX(accelerator);
                Demo2DDenseY(accelerator);
                Demo2DTile(accelerator);
            }
        }

예제 #2

0

파일 보기

파일: Program.cs 프로젝트: phoyd/ILGPU.Samples

        /// <summary>
        /// Runs the CuBlas Nrm2 sample on every Cuda accelerator reported by
        /// <c>CudaAccelerator.CudaAccelerators</c>, once with manual pointer-mode
        /// handling and once with the automatic pointer-mode handler.
        /// </summary>
        static void Main()
        {
            const int DataSize = 1024;
            const CuBlasAPIVersion CuBlasVersion = CuBlasAPIVersion.V10;

            using (var context = new Context())
            {
                // Enable algorithms library
                context.EnableAlgorithms();

                // Check for Cuda support
                foreach (var acceleratorId in CudaAccelerator.CudaAccelerators)
                {
                    using (var accelerator = new CudaAccelerator(context, acceleratorId))
                    {
                        Console.WriteLine($"Performing operations on {accelerator}");

                        // Both buffers are released deterministically, even if one of
                        // the CuBlas calls below throws.
                        using (var buf = accelerator.Allocate<float>(DataSize))
                        using (var buf2 = accelerator.Allocate<float>(DataSize))
                        {
                            accelerator.Initialize(accelerator.DefaultStream, buf, 1.0f);
                            accelerator.Initialize(accelerator.DefaultStream, buf2.View, 1.0f);

                            // Initialize the CuBlas library using manual pointer mode handling
                            // (default behavior)
                            using (var blas = new CuBlas(accelerator, CuBlasVersion))
                            {
                                // Set pointer mode to Host to enable data transfer to CPU memory
                                blas.PointerMode = CuBlasPointerMode.Host;
                                float output = blas.Nrm2(buf);

                                // Set pointer mode to Device to enable data transfer to GPU memory
                                blas.PointerMode = CuBlasPointerMode.Device;
                                blas.Nrm2(buf, buf2);

                                // Use pointer mode scopes to recover the previous pointer mode
                                using (var scope = blas.BeginPointerScope(CuBlasPointerMode.Host))
                                {
                                    float output2 = blas.Nrm2(buf);
                                }
                            }

                            // Initialize the CuBlas<T> library using custom pointer mode handlers
                            using (var blas = new CuBlas<CuBlasPointerModeHandlers.AutomaticMode>(accelerator, CuBlasVersion))
                            {
                                // Automatic transfer to host
                                float output = blas.Nrm2(buf);

                                // Automatic transfer to device
                                blas.Nrm2(buf, buf2);
                            }
                        }
                    }
                }
            }
        }

예제 #3

0

파일 보기

        /// <summary>
        /// Sorts a copy of <paramref name="a"/> on the GPU by repeatedly launching the
        /// <c>OddEvenSort</c> kernel, alternating between even and odd phases.
        /// </summary>
        /// <param name="a">The values to sort; the array itself is not modified here.</param>
        /// <param name="sw">
        /// Stopwatch that is restarted just before the first launch and stopped after
        /// the last one, so it covers the kernel iterations but not the initial upload.
        /// </param>
        /// <returns>The contents of the device buffer after the last phase.</returns>
        public static float[] RunOddEvenSort(float[] a, ref Stopwatch sw)
        {
            int  N       = a.Length;
            bool evenArr = (N % 2) == 0;

            bool iterationEven = true;

            // Context, accelerator and device buffers are all disposable; release them
            // when the sort finishes or fails instead of leaking them on every call.
            using (var context = new Context())
            using (var gpu = new CudaAccelerator(context))
            {
                //Create typed launcher
                var oddEvenKernel = gpu.LoadAutoGroupedStreamKernel<
                    Index1,
                    ArrayView<float>,
                    VariableView<byte>,
                    bool,
                    int,
                    bool>(OddEvenSort);

                //Allocate memory
                using (MemoryBuffer<float> d_a = gpu.Allocate<float>(N))
                using (MemoryBuffer<byte> d_stopFlag = gpu.AllocateZero<byte>(1))
                {
                    d_a.CopyFrom(a, 0, Index1.Zero, N);

                    sw.Restart();

                    // Host-side zero used to reset the device flag before each phase.
                    byte[] zero_val = new byte[1];

                    // NOTE(review): the kernel is assumed to set the flag to a non-zero
                    // value whenever a phase changed the data - confirm against OddEvenSort.
                    // A single clean phase (only even or only odd pairs) does not prove the
                    // array is sorted, so keep going until an even and an odd phase in a
                    // row both leave the flag at zero.
                    int cleanPhases = 0;
                    while (cleanPhases < 2)
                    {
                        d_stopFlag.CopyFrom(zero_val, 0, 0, 1);
                        oddEvenKernel(N / 2, d_a, d_stopFlag.View.GetVariableView(), iterationEven, N, evenArr);
                        gpu.Synchronize();

                        bool changed = d_stopFlag.GetAsArray()[0] > 0;
                        cleanPhases = changed ? 0 : cleanPhases + 1;

                        iterationEven = !iterationEven;
                    }
                    sw.Stop();

                    return d_a.GetAsArray();
                }
            }
        }

예제 #4

0

파일 보기

 /// <summary>
 /// Constructs the buffer and backs it with memory obtained from
 /// <c>CudaAPI.CurrentAPI.AllocateHostMemory</c>.
 /// </summary>
 /// <param name="accelerator">The accelerator forwarded to the base class.</param>
 /// <param name="length">The length forwarded to the base class.</param>
 /// <param name="elementSize">The element size forwarded to the base class.</param>
 public CudaProgressMemoryBuffer(
     CudaAccelerator accelerator,
     long length,
     int elementSize)
     : base(accelerator, length, elementSize)
 {
     var byteCount = new IntPtr(LengthInBytes);
     var status = CudaAPI.CurrentAPI.AllocateHostMemory(
         out IntPtr allocation,
         byteCount);

     // Throws if the allocation call reported a failure.
     CudaException.ThrowIfFailed(status);
     NativePtr = allocation;
 }

예제 #5

0

파일 보기

파일: CudaDirectXBuffer.cs 프로젝트: Jimbobicus/ILGPU.SharpDX

 /// <summary>
 /// Constructs a new Cuda buffer and registers the DX buffer as a Cuda
 /// graphics resource.
 /// </summary>
 /// <param name="accelerator">The target accelerator.</param>
 /// <param name="d3dDevice">The target DX device.</param>
 /// <param name="buffer">The target DX buffer.</param>
 /// <param name="bufferFlags">The buffer flags.</param>
 /// <param name="viewFlags">The used view flags.</param>
 internal CudaDirectXBuffer(
     CudaAccelerator accelerator,
     Device d3dDevice,
     Buffer buffer,
     DirectXBufferFlags bufferFlags,
     DirectXViewFlags viewFlags)
     : base(accelerator, d3dDevice, buffer, bufferFlags, viewFlags)
 {
     // Registers the member Buffer (not the constructor parameter) and stores
     // the resulting handle in cudaGraphicsResource.
     CudaDirectXAccelerator.RegisterResource(
         Buffer,
         viewFlags,
         out cudaGraphicsResource);
 }

예제 #6

0

파일 보기

파일: CudaDirectXTexture2DArray.cs 프로젝트: Jimbobicus/ILGPU.SharpDX

 /// <summary>
 /// Constructs a new Cuda texture 2D array and registers the DX texture as a
 /// Cuda graphics resource.
 /// </summary>
 /// <param name="accelerator">The target accelerator.</param>
 /// <param name="d3dDevice">The target DX device.</param>
 /// <param name="texture">The target DX texture.</param>
 /// <param name="bufferFlags">The used buffer flags.</param>
 /// <param name="viewFlags">The used view flags.</param>
 internal CudaDirectXTexture2DArray(
     CudaAccelerator accelerator,
     Device d3dDevice,
     Texture2D texture,
     DirectXBufferFlags bufferFlags,
     DirectXViewFlags viewFlags)
     : base(accelerator, d3dDevice, texture, bufferFlags, viewFlags)
 {
     // Registers the texture passed to this constructor and stores the
     // resulting handle in cudaGraphicsResource.
     CudaDirectXAccelerator.RegisterResource(
         texture,
         viewFlags,
         out cudaGraphicsResource);
 }

예제 #7

0

파일 보기

파일: Program.cs 프로젝트: m4rs-mt/ILGPU

        /// <summary>
        /// Launches <c>SubtractEmitRefKernel</c> over a 32-element <c>long</c> buffer
        /// and prints every element of the result (EmitRef demonstration).
        /// </summary>
        static void SubtractUsingEmitRef(CudaAccelerator accelerator)
        {
            using var data = accelerator.Allocate1D<long>(32);
            var subtract = accelerator.LoadAutoGroupedStreamKernel<Index1D, ArrayView<long>>(
                SubtractEmitRefKernel);

            subtract((int)data.Length, data.View);

            // Copy the buffer back to the host and print it element by element.
            var position = 0;
            foreach (var value in data.GetAsArray1D())
            {
                Console.WriteLine($"[{position}] = {value}");
                position++;
            }
        }

예제 #8

0

파일 보기

        /// <summary>
        /// Launches <c>MultipleInstructionKernel</c> over a 1024-element
        /// <c>double</c> buffer and prints every element of the result
        /// (block statement with a local register declaration).
        /// </summary>
        static void AddUsingTempRegister(CudaAccelerator accelerator)
        {
            using var data = accelerator.Allocate1D<double>(1024);
            var add = accelerator.LoadAutoGroupedStreamKernel<Index1D, ArrayView<double>>(
                MultipleInstructionKernel);

            add((int)data.Length, data.View);

            // Copy the buffer back to the host and print it element by element.
            var position = 0;
            foreach (var value in data.GetAsArray1D())
            {
                Console.WriteLine($"[{position}] = {value}");
                position++;
            }
        }

예제 #9

0

파일 보기

파일: SquaredDistance.cs 프로젝트: GPSnoopy/GpuSandbox

        /// <summary>
        /// Runs the shared optimised implementation with <c>IlGpuKernelConstants</c>,
        /// wrapping the kernel's integer constant in a <c>SpecializedValue</c>.
        /// </summary>
        /// <exception cref="ArgumentException">Thrown when <paramref name="n"/> is odd.</exception>
        public static void IlGpuConstants(
            CudaAccelerator gpu,
            Real[] mSquaredDistances,
            Real[] mCoordinates,
            int c,
            int n)
        {
            var nIsOdd = n % 2 != 0;
            if (nIsOdd)
            {
                throw new ArgumentException("n must be a multiple of 2");
            }

            IlGpuOptimisedImpl(
                gpu,
                mSquaredDistances,
                mCoordinates,
                c,
                n,
                "SquaredDistance.IlGpuConstants",
                IlGpuKernelConstants,
                i => new SpecializedValue<int>(i));
        }

예제 #10

0

파일 보기

파일: SquaredDistance.cs 프로젝트: GPSnoopy/GpuSandbox

        /// <summary>
        /// Runs the shared optimised implementation with <c>IlGpuKernelLocalMemory</c>.
        /// </summary>
        /// <exception cref="ArgumentException">Thrown when <paramref name="n"/> is odd.</exception>
        public static void IlGpuLocalMemory(
            CudaAccelerator gpu,
            Real[] mSquaredDistances,
            Real[] mCoordinates,
            int c,
            int n)
        {
            var nIsOdd = n % 2 != 0;
            if (nIsOdd)
            {
                throw new ArgumentException("n must be a multiple of 2");
            }

            IlGpuOptimisedImpl(
                gpu,
                mSquaredDistances,
                mCoordinates,
                c,
                n,
                "SquaredDistance.IlGpuLocalMemory",
                IlGpuKernelLocalMemory);
        }

예제 #11

0

파일 보기

        // Performs an inverse Z2Z transform of `input` through the low-level CuFFT
        // API (plan -> set stream -> exec -> destroy), divides every output value
        // by the output length, and prints the real parts.
        static void DoInversePlan(
            CudaAccelerator accelerator,
            CuFFTAPI api,
            Complex[] input,
            out Complex[] output)
        {
            using var fftStream   = accelerator.CreateStream() as CudaStream;
            using var source      = accelerator.Allocate1D(input);
            using var destination = accelerator.Allocate1D<Complex>(input.Length);

            var planStatus = api.Plan1D(
                out var plan,
                input.Length,
                CuFFTType.CUFFT_Z2Z,
                batch: 1);
            CuFFTException.ThrowIfFailed(planStatus);

            try
            {
                CuFFTException.ThrowIfFailed(api.SetStream(plan, fftStream));

                var execStatus = api.ExecZ2Z(
                    plan,
                    source.View.BaseView,
                    destination.View.BaseView,
                    CuFFTDirection.INVERSE);
                CuFFTException.ThrowIfFailed(execStatus);

                output = destination.GetAsArray1D(fftStream);
            }
            finally
            {
                // The plan is destroyed whether or not the transform succeeded.
                CuFFTException.ThrowIfFailed(api.Destroy(plan));
            }
            WorkaroundKnownIssue(accelerator, api);

            // Scale the output to obtain the inverse.
            for (var k = 0; k < output.Length; k++)
            {
                output[k] /= output.Length;
            }

            Console.WriteLine("Inverse Values:");
            var position = 0;
            foreach (var value in output)
            {
                Console.WriteLine($"  [{position}] = {value.Real}");
                position++;
            }
        }

예제 #12

0

파일 보기

        /// <summary>
        /// Constructs a page lock scope for the accelerator by registering the given
        /// host memory with the Cuda API.
        /// </summary>
        /// <param name="accelerator">The associated accelerator.</param>
        /// <param name="hostPtr">The host buffer pointer to page lock.</param>
        /// <param name="numElements">The number of elements in the buffer.</param>
        /// <exception cref="NotSupportedException">
        /// Thrown when the device does not support mapping host memory.
        /// </exception>
        internal CudaPageLockScope(
            CudaAccelerator accelerator,
            IntPtr hostPtr,
            long numElements)
            : base(accelerator, numElements)
        {
            var device = accelerator.Device;
            if (!device.SupportsMappingHostMemory)
            {
                throw new NotSupportedException(
                          RuntimeErrorMessages.NotSupportedPageLock);
            }
            HostPtr = hostPtr;

            bool canReuseHostPointer =
                device.SupportsUsingHostPointerForRegisteredMemory;

            // Registration is always portable; a device mapping is requested only
            // when the host pointer cannot be used directly.
            var registerFlags = canReuseHostPointer
                ? MemHostRegisterFlags.CU_MEMHOSTREGISTER_PORTABLE
                : MemHostRegisterFlags.CU_MEMHOSTREGISTER_PORTABLE
                  | MemHostRegisterFlags.CU_MEMHOSTREGISTER_DEVICEMAP;

            var registerStatus = CurrentAPI.MemHostRegister(
                hostPtr,
                new IntPtr(LengthInBytes),
                registerFlags);
            CudaException.ThrowIfFailed(registerStatus);

            // Without host-pointer reuse the matching device pointer has to be
            // queried explicitly; otherwise the host pointer serves for all operations.
            if (!canReuseHostPointer)
            {
                CudaException.ThrowIfFailed(
                    CurrentAPI.MemHostGetDevicePointer(
                        out IntPtr mappedPtr,
                        hostPtr,
                        0));
                AddrOfLockedObject = mappedPtr;
            }
            else
            {
                AddrOfLockedObject = hostPtr;
            }
        }

예제 #13

0

파일 보기

파일: Program.cs 프로젝트: NullandKale/RasterizationTest

    /// <summary>
    /// Renders one 1920x1080 frame of the cat mesh on the Cuda accelerator, copies
    /// the resulting 24-bit pixel data back to the host, saves it as "out.bmp" and
    /// opens the file through cmd.exe.
    /// </summary>
    public static void Main()
    {
        using Context context = new Context();
        context.EnableAlgorithms();
        using Accelerator device = new CudaAccelerator(context);

        int width  = 1920;
        int height = 1080;

        // Three bytes per pixel, matching PixelFormat.Format24bppRgb below.
        byte[] h_bitmapData = new byte[width * height * 3];

        using MemoryBuffer2D<Vec3> canvasData = device.Allocate<Vec3>(width, height);
        using MemoryBuffer<byte> d_bitmapData = device.Allocate<byte>(width * height * 3);

        CanvasData c = new CanvasData(canvasData, d_bitmapData, width, height);

        // Camera arguments: position, look-at target, up vector, then the image
        // size and a final float (40f).
        Camera camera = new Camera(new Vec3(0, 50, -100), new Vec3(0, 0, 0), new Vec3(0, -1, 0), width, height, 40f);

        WorldData world = new WorldData(device);

        //world.loadMesh(new Vec3(10, 0, 0), "./Assets/defaultcube.obj");
        world.loadMesh(new Vec3(0, 0, 0), "./Assets/cat.obj");

        var frameBufferToBitmap = device.LoadAutoGroupedStreamKernel<Index2, CanvasData>(CanvasData.CanvasToBitmap);
        var RTMethod            = device.LoadAutoGroupedStreamKernel<Index2, CanvasData, dWorldBuffer, Camera>(PerPixelRayIntersectionMethod);

        //do rasterization here

        Stopwatch timer = new Stopwatch();

        timer.Start();

        RTMethod(new Index2(width, height), c, world.getDeviceWorldBuffer(), camera);
        frameBufferToBitmap(canvasData.Extent, c);
        device.Synchronize();

        d_bitmapData.CopyTo(h_bitmapData, 0, 0, d_bitmapData.Extent);

        timer.Stop();
        Console.WriteLine("Rendered in: " + timer.Elapsed);

        // The Bitmap constructor below wraps the caller's memory instead of copying
        // it, so the managed array must stay pinned for the bitmap's whole lifetime.
        // UnsafeAddrOfPinnedArrayElement does not pin anything by itself.
        GCHandle pinnedPixels = GCHandle.Alloc(h_bitmapData, GCHandleType.Pinned);
        try
        {
            // Stride is width * 3 with no row padding (5760 bytes for this width).
            using (Bitmap b = new Bitmap(width, height, width * 3, PixelFormat.Format24bppRgb, pinnedPixels.AddrOfPinnedObject()))
            {
                b.Save("out.bmp");
            }
        }
        finally
        {
            pinnedPixels.Free();
        }

        Process.Start("cmd.exe", "/c out.bmp");
    }

예제 #14

0

파일 보기

        /// <summary>
        /// Launches <c>MultiplyUInt128Kernel</c> over a 1024-element <c>UInt128</c>
        /// buffer with <c>ulong.MaxValue</c> as the specialized operand and prints
        /// every result. The kernel demonstrates the mul.hi.u64 and mul.lo.u64 inline
        /// PTX instructions for multiplying two UInt64 values into a UInt128 value.
        /// </summary>
        static void MultiplyUInt128(CudaAccelerator accelerator)
        {
            using var products = accelerator.Allocate1D<UInt128>(1024);
            var multiply = accelerator.LoadAutoGroupedStreamKernel<
                Index1D,
                ArrayView<UInt128>,
                SpecializedValue<ulong>>(MultiplyUInt128Kernel);

            var factor = SpecializedValue.New(ulong.MaxValue);
            multiply((int)products.Length, products.View, factor);

            // Copy the buffer back to the host and print it element by element.
            var position = 0;
            foreach (var value in products.GetAsArray1D())
            {
                Console.WriteLine($"[{position}] = {value}");
                position++;
            }
        }

예제 #15

0

파일 보기

        /// <summary>
        /// Benchmarks the managed and Alea implementations of the "many matrix
        /// multiplication" workload and compares their results.
        /// </summary>
        /// <param name="aleaGpu">The Alea GPU passed to the Alea implementation.</param>
        /// <param name="ilGpu">
        /// The ILGPU accelerator. NOTE(review): not used by any delegate in this
        /// method - confirm whether an ILGPU variant is still to be added.
        /// </param>
        private static void RunManyMatrixMultiplication(Gpu aleaGpu, CudaAccelerator ilGpu)
        {
            // m and n size the arrays below: left and both result arrays hold
            // m * n * n values, right holds n * n values.
            const int m = 100;
            const int n = 250 - 1;

            var resultM = new Real[m * n * n];
            var resultC = new Real[m * n * n];
            var left    = new Real[m * n * n];
            var right   = new Real[n * n];

            // NOTE(review): the first two delegates are identical; presumably
            // Benchmark.Run expects one initialiser per result set - verify against
            // the Benchmark.Run signature.
            Benchmark.Run(Loops,
                          () => ManyMatrixMultiplication.Initialise(left, right, m, n),
                          () => ManyMatrixMultiplication.Initialise(left, right, m, n),
                          () => AssertAreEqual(resultM, resultC, m * n, n),
                          () => ManyMatrixMultiplication.Managed(resultM, left, right, m, n),
                          () => ManyMatrixMultiplication.Alea(aleaGpu, resultC, left, right, m, n));
        }

예제 #16

0

파일 보기

파일: ConvexHullGpuParallelized.cs 프로젝트: timiskhakov/ComputingTheConvexHullOnGpu

        /// <summary>
        /// Computes the convex hull of the given points: finds the points with the
        /// smallest and largest X on the host, uploads both coordinate arrays to a
        /// Cuda accelerator and calls <c>FindHull</c> once for each side (1 and -1)
        /// of the line between those two points.
        /// </summary>
        /// <param name="points">The X and Y coordinate arrays; both must have the same length.</param>
        /// <returns>The set that both <c>FindHull</c> calls were given to fill.</returns>
        /// <exception cref="ArgumentException">
        /// Thrown when the coordinate arrays differ in length or hold fewer than three points.
        /// </exception>
        public static HashSet<Point> QuickHull(Points points)
        {
            var xs    = points.Xs;
            var ys    = points.Ys;
            var count = xs.Length;

            if (count != ys.Length)
            {
                throw new ArgumentException($"Invalid {nameof(Points)} structure");
            }
            if (count <= 2)
            {
                throw new ArgumentException($"Too little points: {count}, expected 3 or more");
            }

            // Track the extreme points along X; on ties the first occurrence wins.
            var leftmost  = (X: xs[0], Y: ys[0]);
            var rightmost = leftmost;
            for (var i = 0; i < count; i++)
            {
                var x = xs[i];
                if (x < leftmost.X)
                {
                    leftmost = (x, ys[i]);
                }
                if (x > rightmost.X)
                {
                    rightmost = (x, ys[i]);
                }
            }

            var hull = new HashSet<Point>();

            using var context = new Context();
            context.EnableAlgorithms();

            using var accelerator = new CudaAccelerator(context);
            using var deviceXs    = accelerator.Allocate<float>(count);
            using var deviceYs    = accelerator.Allocate<float>(ys.Length);
            deviceXs.CopyFrom(xs, 0, 0, count);
            deviceYs.CopyFrom(ys, 0, 0, ys.Length);

            FindHull(in points, leftmost.X, leftmost.Y, rightmost.X, rightmost.Y, 1, hull, accelerator, deviceXs.View, deviceYs.View);
            FindHull(in points, leftmost.X, leftmost.Y, rightmost.X, rightmost.Y, -1, hull, accelerator, deviceXs.View, deviceYs.View);

            return hull;
        }

예제 #17

0

파일 보기

        /// <summary>
        /// Benchmarks the managed and ILGPU implementations of the add-vector
        /// workload (plus the Alea implementation when USE_ALEA is defined) and
        /// compares their results.
        /// </summary>
        /// <param name="aleaGpu">The Alea GPU; only used when USE_ALEA is defined.</param>
        /// <param name="ilGpu">The ILGPU accelerator passed to the ILGPU implementation.</param>
        private static void RunAddVector(Gpu aleaGpu, CudaAccelerator ilGpu)
        {
            // Both matrices hold m * n values; the vector holds n values.
            const int m = 2 * 24 * 12;
            const int n = 2 * 25600 - 1;

            var matrixM = new Real[m * n];
            var matrixC = new Real[m * n];
            var vector  = new Real[n];

            // matrixM receives the managed result, matrixC the GPU result(s); the
            // third delegate compares the two.
            Benchmark.Run(Loops,
                          () => AddVector.Initialise(matrixM, vector, m, n),
                          () => AddVector.Initialise(matrixC, vector, m, n),
                          () => AssertAreEqual(matrixM, matrixC, m, n),
                          () => AddVector.Managed(matrixM, vector, m, n),
#if USE_ALEA
                          () => AddVector.Alea(aleaGpu, matrixC, vector, m, n),
#endif
                          () => AddVector.IlGpu(ilGpu, matrixC, vector, m, n));
        }

예제 #18

0

파일 보기

        /// <summary>
        /// Uploads the matrix and vector, launches <c>IlGpuKernel</c> with 32x8
        /// groups on a grid sized to cover n by m, reports the elapsed time and
        /// copies the matrix back into <paramref name="matrix"/>.
        /// </summary>
        public static void IlGpu(CudaAccelerator gpu, Real[] matrix, Real[] vector, int m, int n)
        {
            const int groupWidth  = 32;
            const int groupHeight = 8;

            using (var deviceMatrix = gpu.Allocate(matrix))
            using (var deviceVector = gpu.Allocate(vector))
            {
                var stopwatch = Stopwatch.StartNew();

                // Enough groups to cover n in X and m in Y.
                var launchConfig = (
                    (Util.DivUp(n, groupWidth), Util.DivUp(m, groupHeight), 1),
                    (groupWidth, groupHeight));

                gpu.Launch(IlGpuKernel, gpu.DefaultStream, launchConfig, deviceMatrix.View, deviceVector.View, m, n);
                gpu.Synchronize();

                Util.PrintPerformance(stopwatch, "AddVector.IlGpu", 3, m, n);

                deviceMatrix.CopyTo(matrix, 0, 0, matrix.Length);
            }
        }

예제 #19

0

파일 보기

        /// <summary>
        /// Multiplies two N x N matrices on the GPU with the <c>MatrixMulShared</c>
        /// kernel and returns the flattened product.
        /// </summary>
        /// <param name="a">Left matrix; flattened with <c>FlatternArr</c> before upload.</param>
        /// <param name="b">Right matrix; flattened with <c>FlatternArr</c> before upload.</param>
        /// <param name="N">Matrix dimension; each device buffer holds N * N values.</param>
        /// <param name="sw">
        /// Stopwatch that is restarted just before the launch and stopped after the
        /// accelerator has synchronized, so it excludes the transfers.
        /// </param>
        /// <returns>The contents of the result buffer as a flat array.</returns>
        public static float[] RunMatrixMulShared(float[][] a, float[][] b, int N, ref Stopwatch sw)
        {
            // Context, accelerator and device buffers are all disposable; release them
            // when the call finishes or fails instead of leaking them on every call.
            using (var context = new Context())
            using (var gpu = new CudaAccelerator(context))
            {
                //Create typed launcher
                var matrixMulKernelShared = gpu.LoadStreamKernel<
                    ArrayView<float>,
                    ArrayView<float>,
                    ArrayView<float>,
                    int>(MatrixMulShared);

                //Allocate memory
                var buffSize = N * N;
                using (MemoryBuffer<float> d_a = gpu.Allocate<float>(buffSize))
                using (MemoryBuffer<float> d_b = gpu.Allocate<float>(buffSize))
                using (MemoryBuffer<float> d_c = gpu.Allocate<float>(buffSize))
                {
                    d_a.CopyFrom(FlatternArr(a), 0, Index1.Zero, buffSize);
                    d_b.CopyFrom(FlatternArr(b), 0, Index1.Zero, buffSize);

                    //Groups per grid dimension (rounded up so the grid covers all of N)
                    int GrPerDim = (int)Math.Ceiling((float)N / groupSize);

                    KernelConfig dimension = (
                        new Index2(GrPerDim, GrPerDim),     // Number of groups
                        new Index2(groupSize, groupSize));  // Group size (thread count in group)

                    sw.Restart();

                    matrixMulKernelShared(dimension, d_a.View, d_b.View, d_c.View, N);

                    // Wait for the kernel to finish...
                    gpu.Synchronize();

                    sw.Stop();

                    return d_c.GetAsArray();
                }
            }
        }

예제 #20

0

파일 보기

        /// <summary>
        /// Constructs a page lock scope for the accelerator by registering the given
        /// host memory with the Cuda API.
        /// </summary>
        /// <param name="accelerator">The associated accelerator.</param>
        /// <param name="hostPtr">The host buffer pointer to page lock.</param>
        /// <param name="numElements">The number of elements in the buffer.</param>
        /// <exception cref="NotSupportedException">
        /// Thrown when the device does not support mapping host memory.
        /// </exception>
        internal CudaPageLockScope(
            CudaAccelerator accelerator,
            IntPtr hostPtr,
            long numElements)
            : base(accelerator)
        {
            var device = accelerator.Device;
            if (!device.SupportsMappingHostMemory)
            {
                throw new NotSupportedException(
                          RuntimeErrorMessages.NotSupportedPageLock);
            }
            HostPtr = hostPtr;
            Length  = numElements;

            bool canReuseHostPointer =
                device.SupportsUsingHostPointerForRegisteredMemory;

            // Registration is always portable; a device mapping is requested only
            // when the host pointer cannot be used directly.
            var registerFlags = canReuseHostPointer
                ? MemHostRegisterFlags.CU_MEMHOSTREGISTER_PORTABLE
                : MemHostRegisterFlags.CU_MEMHOSTREGISTER_PORTABLE
                  | MemHostRegisterFlags.CU_MEMHOSTREGISTER_DEVICEMAP;

            var registerStatus = CurrentAPI.MemHostRegister(
                hostPtr,
                new IntPtr(LengthInBytes),
                registerFlags);
            CudaException.ThrowIfFailed(registerStatus);

            // Without host-pointer reuse the matching device pointer has to be
            // queried explicitly.
            if (!canReuseHostPointer)
            {
                CudaException.ThrowIfFailed(
                    CurrentAPI.MemHostGetDevicePointer(
                        out IntPtr mappedPtr,
                        hostPtr,
                        0));
                AddrOfLockedObject = mappedPtr;
            }
            else
            {
                AddrOfLockedObject = hostPtr;
            }
        }

예제 #21

0

파일 보기

파일: SquaredDistance.cs 프로젝트: GPSnoopy/GpuSandbox

        /// <summary>
        /// Uploads both arrays, launches <c>IlGpuKernel</c> with 128-wide blocks on a
        /// grid sized to cover n * n, reports the elapsed time and copies the
        /// distances back into <paramref name="mSquaredDistances"/>.
        /// </summary>
        public static void IlGpu(
            CudaAccelerator gpu,
            Real[] mSquaredDistances,
            Real[] mCoordinates,
            int c,
            int n)
        {
            const int blockSize = 128;

            using var deviceDistances   = gpu.Allocate(mSquaredDistances);
            using var deviceCoordinates = gpu.Allocate(mCoordinates);
            var stopwatch = Stopwatch.StartNew();

            // Enough blocks to cover n * n.
            var launchConfig = (Util.DivUp(n * n, blockSize), blockSize);

            gpu.Launch(
                IlGpuKernel,
                gpu.DefaultStream,
                launchConfig,
                deviceDistances.View,
                deviceCoordinates.View,
                c,
                n);
            gpu.Synchronize();

            Util.PrintPerformance(stopwatch, "SquaredDistance.IlGpu", n, c, n);

            deviceDistances.CopyTo(mSquaredDistances, 0, 0, mSquaredDistances.Length);
        }

예제 #22

0

파일 보기

파일: ConvexHullGpuParallelized.cs 프로젝트: timiskhakov/ComputingTheConvexHullOnGpu

        /// <summary>
        /// Computes the convex hull of the given points: finds the points with the
        /// smallest and largest X on the host, uploads the points to a Cuda
        /// accelerator and calls <c>FindHull</c> once for each side (1 and -1) of
        /// the line between those two points.
        /// </summary>
        /// <param name="points">The input points; at least three are required.</param>
        /// <returns>The set that both <c>FindHull</c> calls were given to fill.</returns>
        /// <exception cref="ArgumentException">Thrown when fewer than three points are given.</exception>
        public static HashSet<Point> QuickHull(Point[] points)
        {
            var count = points.Length;
            if (count <= 2)
            {
                throw new ArgumentException($"Too little points: {count}, expected 3 or more");
            }

            // Track the extreme points along X; on ties the first occurrence wins.
            var leftmost  = points[0];
            var rightmost = points[0];
            for (var i = 1; i < count; i++)
            {
                var candidate = points[i];
                if (candidate.X < leftmost.X)
                {
                    leftmost = candidate;
                }
                if (candidate.X > rightmost.X)
                {
                    rightmost = candidate;
                }
            }

            var hull = new HashSet<Point>();

            using var context = new Context();
            context.EnableAlgorithms();

            using var accelerator  = new CudaAccelerator(context);
            using var devicePoints = accelerator.Allocate<Point>(count);
            devicePoints.CopyFrom(points, 0, 0, count);

            FindHull(points, leftmost, rightmost, 1, hull, accelerator, devicePoints.View);
            FindHull(points, leftmost, rightmost, -1, hull, accelerator, devicePoints.View);

            return hull;
        }

예제 #23

0

파일 보기

파일: SquaredDistance.cs 프로젝트: GPSnoopy/GpuSandbox

        /// <summary>
        /// Shared driver for the optimised squared-distance kernels: allocates an
        /// n x n result buffer, uploads the coordinates, launches
        /// <paramref name="kernelFunc"/> with 128-wide blocks, reports the elapsed
        /// time under <paramref name="name"/> and copies the result back.
        /// </summary>
        private static void IlGpuOptimisedImpl(
            CudaAccelerator gpu,
            Real[] mSquaredDistances,
            Real[] mCoordinates,
            int c,
            int n,
            string name,
            Action<ArrayView2D<Real>, ArrayView<Real>, SpecializedValue<int>, SpecializedValue<int>, int> kernelFunc)
        {
            const int blockSize = 128;

            using var deviceDistances   = gpu.Allocate<Real>(n, n);
            using var deviceCoordinates = gpu.Allocate(mCoordinates);
            var stopwatch = Stopwatch.StartNew();

            // The same block count is used for the grid's X and Y dimensions.
            var blocksPerAxis = Util.DivUp(n, blockSize);
            var launchConfig  = ((blocksPerAxis, blocksPerAxis, 1), (blockSize, 1, 1));

            gpu.Launch(
                kernelFunc,
                gpu.DefaultStream,
                launchConfig,
                deviceDistances.View,
                deviceCoordinates.View,
                SpecializedValue.New(blockSize),
                SpecializedValue.New(c),
                n);
            gpu.Synchronize();

            Util.PrintPerformance(stopwatch, name, n, c, n);

            deviceDistances.CopyTo(mSquaredDistances, (0, 0), 0, (n, n));
        }

예제 #24

0

파일 보기

        /// <summary>
        /// Times <c>size</c> launches of <c>MathKernel</c> issued concurrently through
        /// <c>Parallel.For</c>, for every combination of the widths in <c>W</c> and
        /// heights in <c>H</c>, and prints the elapsed milliseconds.
        /// </summary>
        private static void Performance()
        {
            using (var context = new Context())
            using (var accelerator = new CudaAccelerator(context))
            using (var b = accelerator.CreateBackend())
            using (var c = accelerator.Context.CreateCompileUnit(b))
            {
                // nameof keeps the reflection lookup in step with a rename of MathKernel.
                var method = typeof(Program).GetMethod(nameof(MathKernel), BindingFlags.Static | BindingFlags.Public);

                // NOTE(review): the compiled kernel is only referenced by the
                // commented-out loader below; the explicit compile is kept in case it
                // is intended as a warm-up - confirm.
                var compiled = b.Compile(c, method);

                var kernel = accelerator.LoadAutoGroupedStreamKernel<Index2, ArrayView2D<float>>(MathKernel);
                //var kernel = accelerator.LoadAutoGroupedKernel(compiled);

                int size = 100000;
                var W = new[] { 50 };
                var H = new[] { 50 };

                foreach (var x in W)
                {
                    foreach (var y in H)
                    {
                        Console.WriteLine($"\n\nW {x}, H {y} \n\n");

                        //var watch = Stopwatch.StartNew();
                        //for (int k = 0; k < size; k++)
                        //{
                        //    var v = new float[x, y];
                        //    for (int i = 0; i < x; i++)
                        //    {
                        //        for (int j = 0; j < y; j++)
                        //        {
                        //            v[i, j] = (float)Math.Sqrt(i * j);
                        //        }
                        //    }
                        //}
                        //watch.Stop();
                        //Console.WriteLine($"\n\nElapsed CPU Time Linear: {watch.ElapsedMilliseconds}ms\n");
                        //GC.Collect();
                        //
                        //watch = Stopwatch.StartNew();
                        //Parallel.For(0, size, k =>
                        //{
                        //    var v = new float[x, y];
                        //    Parallel.For(0, x, i =>
                        //    {
                        //        Parallel.For(0, y, j =>
                        //        {
                        //            v[i, j] = (float)Math.Sqrt(i * j);
                        //        });
                        //    });
                        //});
                        //watch.Stop();
                        //Console.WriteLine($"Elapsed CPU Time Parallel: {watch.ElapsedMilliseconds}ms\n\n");
                        //GC.Collect();

                        //var watch = Stopwatch.StartNew();
                        //for (int k = 0; k < size; k++)
                        //{
                        //    var idx = new Index2(x, y);
                        //    var buffer = accelerator.Allocate<float>(idx);
                        //    kernel(idx, buffer.View);
                        //    accelerator.Synchronize();
                        //    buffer.Dispose();
                        //}
                        //watch.Stop();
                        //Console.WriteLine($"\n\nElapsed GPU Time Linear: {watch.ElapsedMilliseconds}ms\n");
                        //GC.Collect();

                        // The former `kn` list (the same kernel delegate repeated `size`
                        // times) was never used, so it is no longer built.

                        var watch = Stopwatch.StartNew();
                        Parallel.For(0, size, k =>
                        {
                            var idx = new Index2(x, y);

                            // `using` releases the buffer even if the launch or the
                            // synchronization throws.
                            using (var buffer = accelerator.Allocate<float>(idx))
                            {
                                //kernel.Launch(idx, buffer.View);
                                kernel(idx, buffer.View);
                                accelerator.Synchronize();
                            }
                        });
                        watch.Stop();
                        Console.WriteLine($"Elapsed GPU Time Parallel: {watch.ElapsedMilliseconds}ms\n\n");
                        GC.Collect();
                    }
                }
            }
        }

예제 #25

0

파일 보기

파일: Program.cs 프로젝트: PhonicCanine/Number-String-Counter

        /// <summary>
        /// Entry point: scans the range from <c>min</c> up to <c>max</c> in batches of
        /// <c>allocatedMemory</c> values, running the search kernel and then the result
        /// kernel on each batch, and finally prints the smallest reported value.
        /// </summary>
        /// <remarks>
        /// Note: This program runs a *lot* faster in Release mode than Debug mode
        /// (because bounds checking is disabled in ILGPU).
        /// A Cuda accelerator is tried first; if constructing it throws, the CPU
        /// accelerator is used instead.
        /// </remarks>
        /// <param name="args">Command line arguments (not used).</param>
        static void Main(string[] args)
        {
            //needed for the other method (Parallel.For)
            //int maxLength = 0;
            //long iterations = 0;

            //As a note, it takes around 2 hours on CPU (Core i7 4790K) to search ~120B numbers
            long min = 0;
            long max = 113373373373;

            //min is increased as the program runs, so we need a copy of its original value for calculating the % done.
            long originalMin = min;

            //Length of an array of longs.
            //Since longs are int64, multiply by ~8 for actual memory use in bytes
            const long allocatedMemory = 200000000;

            //Cache to store results that work in. As this grows, so does the time to search it for good results.
            const int resultCacheSize = 100;

            //Stores the minimum value for the specific depth
            long minForMax = long.MaxValue;

            //Remembers whether any batch reported a value, so the final summary
            //does not present the long.MaxValue placeholder as a real result.
            bool anyFound = false;

            //The chain length to search for:
            //E.G. 100 (OneHundred) -> 10 (Ten) -> 3 (Three) -> 5 (Five) -> 4 [end] is of length 4.
            const int chainLength = 8;

            //Include punctuation in the count or not.
            //E.G., With punctuation 137 -> "One Hundred and Thirty-Seven" (28 characters)
            //& Without punctuation, 137 -> "OneHundredandThirtySeven" (24 characters)
            const bool includePunctuation = false;

            //Stop when one number is found with specified chain length (obviously invalidates the percentage count)
            const bool stopAtOneFound = false;

            using (var context = new Context())
            {
                Accelerator acc;

                try
                {
                    acc = new CudaAccelerator(context);
                }
                catch (Exception)
                {
                    //no cuda
                    acc = new CPUAccelerator(context);
                }

                //The accelerator is disposable; scope it so it is released
                //before the context that created it.
                using (var a = acc)
                {
                    Console.WriteLine("Performing ops on " + a.Name + ". " + a.NumMultiprocessors.ToString() + " processors.");

                    //Set up two kernels to get the data
                    var searchKernel = a.LoadAutoGroupedStreamKernel <Index, ArrayView <UInt64>, long, long, bool>(SearchForChain);
                    var resultKernel = a.LoadAutoGroupedStreamKernel <Index, ArrayView <UInt64>, ArrayView <UInt64> >(FindNonZero);

                    using (var buffa = a.Allocate <UInt64>((int)allocatedMemory))
                    {
                        using (var buffb = a.Allocate <UInt64>(resultCacheSize))
                        {
                            //Loop while we haven't gone over the maximum value in search range
                            while (min < max)
                            {
                                //Search for numbers first (Kernel)
                                searchKernel((int)allocatedMemory, buffa.View, min, chainLength, !includePunctuation);
                                a.Synchronize();

                                //Read back array to find nonzero entries (Kernel)
                                resultKernel(resultCacheSize, buffb, buffa);
                                a.Synchronize();

                                var  arr   = buffb.GetAsArray();
                                bool found = false;

                                //Read back the results array for nonzero entries (Normal .net)
                                for (int i = 0; i < buffb.Length; i++)
                                {
                                    if (arr[i] != 0)
                                    {
                                        found    = true;
                                        anyFound = true;
                                        Console.WriteLine(arr[i]);
                                        if (arr[i] < (ulong)minForMax)
                                        {
                                            minForMax = (long)arr[i];
                                        }
                                    }
                                }

                                //break if we have found a number and had to stop at one found
                                if (found && stopAtOneFound)
                                {
                                    break;
                                }
                                min += allocatedMemory;
                                long total = max - originalMin;

                                //The last batch can step past max; clamp so the
                                //reported progress never exceeds 100%.
                                long diff = Math.Min(min, max) - originalMin;

                                //For displaying the percentage complete
                                Console.WriteLine((((decimal)diff / (decimal)total) * 100).ToString() + "% complete");
                            }
                        }
                    }
                }
            }


            //The code commented below is for doing this via a Parallel.for.

            /*
             * Parallel.For(min,max,(i)=> {
             *  int chainLength = searchGPU(i,true);
             *  if (chainLength > maxLength)
             *  {
             *      maxLength = chainLength;
             *      Console.WriteLine(NumberToString(i) + "   <=>   (" + i.ToString() + ") gave chain length: " + chainLength.ToString());
             *      minForMax = long.MaxValue;
             *  }
             *
             *  if (chainLength == maxLength && Math.Abs(i) < Math.Abs(minForMax))
             *  {
             *      minForMax = i;
             *      Console.WriteLine(i.ToString() + " was a better candidate for " + chainLength.ToString());
             *  }
             *
             *  iterations++;
             *  if (iterations % ((max - min) / 10000) == 0)
             *  {
             *
             *      decimal percent = (decimal)iterations / (decimal)((max - min));
             *
             *      Console.WriteLine((percent * 100)+" Percent done.");
             *  }
             * });*/

            if (!anyFound)
            {
                //Nothing was reported by any batch; minForMax still holds its placeholder.
                Console.WriteLine("No number with chain length " + chainLength.ToString() + " was found in the searched range.");
                return;
            }

            Console.WriteLine(NumberToString(minForMax) + "   <=>   (" + minForMax.ToString() + ") gave chain length: " + chainLength.ToString());
        }

예제 #26

0

파일 보기

 /// <summary>
 /// Creates a CuBlas wrapper (access to the Nvidia cublas library) for the
 /// given Cuda accelerator. All work is delegated to the base constructor.
 /// </summary>
 /// <param name="accelerator">The Cuda accelerator this instance is associated with.</param>
 public CuBlas(CudaAccelerator accelerator) : base(accelerator) { }

예제 #27

0

파일 보기

 /// <summary>
 /// Creates a CuBlas wrapper (access to the Nvidia cublas library) for the
 /// given Cuda accelerator and cuBlas API version. Both arguments are
 /// forwarded unchanged to the base constructor.
 /// </summary>
 /// <param name="accelerator">The Cuda accelerator this instance is associated with.</param>
 /// <param name="apiVersion">The cuBlas API version to use.</param>
 public CuBlas(CudaAccelerator accelerator, CuBlasAPIVersion apiVersion) : base(accelerator, apiVersion) { }

예제 #28

0

파일 보기

 /// <summary>
 /// Creates a Cuda DX-interop accelerator. Both arguments are forwarded
 /// unchanged to the base constructor.
 /// </summary>
 /// <param name="accelerator">The Cuda accelerator to target.</param>
 /// <param name="d3dDevice">The DX device to target.</param>
 internal CudaDirectXAccelerator(CudaAccelerator accelerator, Device d3dDevice) : base(accelerator, d3dDevice) { }

예제 #29

0

파일 보기

파일: CudaScanProvider.cs 프로젝트: kant2002/ILGPU.Lightning

 /// <summary>
 /// Builds the Cuda-specific scan provider implementation for the given accelerator.
 /// </summary>
 /// <param name="accelerator">The Cuda accelerator passed to the implementation's constructor.</param>
 /// <returns>A newly constructed <c>Cuda.CudaScanProviderImplementation</c>.</returns>
 public ScanProviderImplementation CreateCudaExtension(CudaAccelerator accelerator)
 {
     var implementation = new Cuda.CudaScanProviderImplementation(accelerator);
     return implementation;
 }

예제 #30

0

파일 보기

 /// <summary>
 /// Creates a new ILGPU context and a Cuda accelerator on top of it, storing
 /// both in the instance fields.
 /// </summary>
 public ImplCuda()
 {
     // The context must exist first: the accelerator is constructed from it.
     this._context = new Context();
     this._gpu = new CudaAccelerator(this._context);
 }

C# (CSharp) CudaAccelerator 예제들