예제 #1
0
        /// <summary>
        /// Runs a cuBLAS GEMM on device buffers created from the given host arrays and
        /// copies the output buffer back into <paramref name="result"/>.
        /// </summary>
        /// <param name="gpu">CUDA accelerator used for the allocations and the cuBLAS handle.</param>
        /// <param name="result">Host array backing the GEMM output buffer; overwritten with the device contents.</param>
        /// <param name="left">Host array backing the first GEMM matrix operand.</param>
        /// <param name="right">Host array backing the second GEMM matrix operand.</param>
        /// <param name="n">Value passed for all three GEMM dimensions and all three leading dimensions.</param>
        public static void IlGpu(CudaAccelerator gpu, Real[] result, Real[] left, Real[] right, int n)
        {
            // Declared in the same order as before so disposal (reverse order) is unchanged
            using var deviceResult = gpu.Allocate(result);
            using var deviceLeft   = gpu.Allocate(left);
            using var deviceRight  = gpu.Allocate(right);
            using var cublas       = new CuBlas(gpu, CuBlasAPIVersion.V11);

            // The stopwatch starts after the allocations, so they are excluded from the timing
            var stopwatch = Stopwatch.StartNew();

            var noTranspose = CuBlasOperation.NonTranspose;
            cublas.Gemm(
                noTranspose,
                noTranspose,
                n, n, n,
                1, deviceLeft.View, n,
                deviceRight.View, n, 0,
                deviceResult.View, n);

            // Wait for the device before reporting the elapsed time
            gpu.Synchronize();

            PrintPerformance(stopwatch, "MatrixMultiplication.IlGpu.cuBLAS", n, n, n);

            deviceResult.CopyTo(result, 0, 0, result.Length);
        }
예제 #2
0
        /// <summary>
        /// Launches <paramref name="kernelFunc"/> over a 2D grid with dynamic shared memory,
        /// reports the elapsed time under <paramref name="name"/> and copies the n-by-n
        /// device buffer back into <paramref name="mSquaredDistances"/>.
        /// </summary>
        /// <typeparam name="TInt">Type of the kernel argument produced by <paramref name="numCoordGetter"/>.</typeparam>
        /// <param name="gpu">CUDA accelerator used for allocation and launch.</param>
        /// <param name="mSquaredDistances">Host array receiving the n-by-n output buffer.</param>
        /// <param name="mCoordinates">Host array backing the coordinate buffer passed to the kernel.</param>
        /// <param name="c">Value fed to <paramref name="numCoordGetter"/>; also scales the shared-memory request.</param>
        /// <param name="n">Extent of the output buffer in both dimensions; also the last kernel argument.</param>
        /// <param name="name">Label passed to <c>Util.PrintPerformance</c>.</param>
        /// <param name="kernelFunc">Kernel to launch.</param>
        /// <param name="numCoordGetter">Converts <paramref name="c"/> into the kernel's <typeparamref name="TInt"/> argument.</param>
        private static void IlGpuOptimisedImpl <TInt>(
            CudaAccelerator gpu,
            Real[] mSquaredDistances,
            Real[] mCoordinates,
            int c,
            int n,
            string name,
            Action <ArrayView2D <Real>, ArrayView <Real>, TInt, int> kernelFunc,
            Func <int, TInt> numCoordGetter)
            where TInt : struct
        {
            const int threadsPerBlock = 128;

            using var deviceDistances   = gpu.Allocate <Real>(n, n);
            using var deviceCoordinates = gpu.Allocate(mCoordinates);
            var stopwatch = Stopwatch.StartNew();

            // gridSize x gridSize blocks of threadsPerBlock threads, plus
            // 2 * c * threadsPerBlock dynamically sized shared-memory elements
            var blocksPerAxis = Util.DivUp(n, threadsPerBlock);
            var sharedMemory  = SharedMemoryConfig.RequestDynamic <Real>(2 * c * threadsPerBlock);
            var launchConfig  = ((blocksPerAxis, blocksPerAxis, 1), (threadsPerBlock, 1, 1), sharedMemory);

            gpu.Launch(
                kernelFunc,
                gpu.DefaultStream,
                launchConfig,
                deviceDistances.View,
                deviceCoordinates.View,
                numCoordGetter(c),
                n);
            gpu.Synchronize();

            Util.PrintPerformance(stopwatch, name, n, c, n);

            deviceDistances.CopyTo(mSquaredDistances, (0, 0), 0, (n, n));
        }
예제 #3
0
        /// <summary>
        /// Launches <c>IlGpuKernel</c> over device buffers created from the given host arrays,
        /// reports the elapsed time and copies the first buffer back into
        /// <paramref name="mIntraReturn"/>.
        /// </summary>
        /// <param name="gpu">CUDA accelerator used for allocation and launch.</param>
        /// <param name="mIntraReturn">Host array backing the first kernel buffer; overwritten with the device contents.</param>
        /// <param name="vClose">Host array backing the second kernel buffer.</param>
        /// <param name="vIsAlive">Host array backing the third kernel buffer.</param>
        /// <param name="vIsValidDay">Host array backing the fourth kernel buffer.</param>
        /// <param name="m">Drives the grid's Y dimension (blocks of 8); forwarded to the kernel.</param>
        /// <param name="n">Drives the grid's X dimension (blocks of 32); forwarded to the kernel.</param>
        public static void IlGpu(
            CudaAccelerator gpu,
            Real[] mIntraReturn,
            Real[] vClose,
            Real[] vIsAlive,
            Real[] vIsValidDay,
            int m,
            int n)
        {
            // Declared in the same order as before so disposal (reverse order) is unchanged
            using var deviceIntraReturn = gpu.Allocate(mIntraReturn);
            using var deviceClose       = gpu.Allocate(vClose);
            using var deviceIsAlive     = gpu.Allocate(vIsAlive);
            using var deviceIsValidDay  = gpu.Allocate(vIsValidDay);

            var stopwatch = Stopwatch.StartNew();

            // 32x8 threads per block; enough blocks to cover n columns and m rows
            var launchConfig = ((Util.DivUp(n, 32), Util.DivUp(m, 8), 1), (32, 8));

            gpu.Launch(
                IlGpuKernel,
                gpu.DefaultStream,
                launchConfig,
                deviceIntraReturn.View,
                deviceClose.View,
                deviceIsAlive.View,
                deviceIsValidDay.View,
                m,
                n);

            gpu.Synchronize();
            Util.PrintPerformance(stopwatch, "IntraReturn.IlGpu", 5, m, n);

            deviceIntraReturn.CopyTo(mIntraReturn, 0, 0, mIntraReturn.Length);
        }
예제 #4
0
        /// <summary>
        /// Flattens both input matrices, runs the <c>MatrixMul</c> kernel over an
        /// N-by-N index space on a CUDA accelerator and returns the output buffer
        /// as a host array of N * N elements.
        /// </summary>
        /// <param name="a">First input matrix; flattened with <c>FlatternArr</c> before upload.</param>
        /// <param name="b">Second input matrix; flattened with <c>FlatternArr</c> before upload.</param>
        /// <param name="N">Extent of the kernel index space in both dimensions; also passed to the kernel.</param>
        /// <returns>Host copy of the output device buffer.</returns>
        public static float[] RunMatrixMul(float[][] a, float[][] b, int N)
        {
            // Create context and accelerator. Both wrap native resources, so they are
            // disposed on every exit path instead of being left to the finalizer.
            using (var context = new Context())
            using (var gpu = new CudaAccelerator(context))
            {
                // Create typed launcher
                var matrixMulKernel = gpu.LoadAutoGroupedStreamKernel <
                    Index2,
                    ArrayView <float>,
                    ArrayView <float>,
                    ArrayView <float>,
                    int>(MatrixMul);

                // Allocate memory
                int buffSize = N * N;
                using (MemoryBuffer <float> d_a = gpu.Allocate <float>(buffSize))
                using (MemoryBuffer <float> d_b = gpu.Allocate <float>(buffSize))
                using (MemoryBuffer <float> d_c = gpu.Allocate <float>(buffSize))
                {
                    d_a.CopyFrom(FlatternArr(a), 0, Index1.Zero, buffSize);
                    d_b.CopyFrom(FlatternArr(b), 0, Index1.Zero, buffSize);

                    matrixMulKernel(new Index2(N, N), d_a.View, d_b.View, d_c.View, N);

                    // Wait for the kernel to finish...
                    gpu.Synchronize();

                    // Copy to the host before the device buffers are released
                    return d_c.GetAsArray();
                }
            }
        }
예제 #5
0
        /// <summary>
        /// For every available CUDA accelerator, fills two device buffers with 1.0f and
        /// calls cuBLAS <c>Nrm2</c> under the different pointer-mode handling options.
        /// </summary>
        static void Main()
        {
            const int DataSize = 1024;
            const CuBlasAPIVersion CuBlasVersion = CuBlasAPIVersion.V10;

            using (var context = new Context())
            {
                // Enable algorithms library
                context.EnableAlgorithms();

                // Check for Cuda support
                foreach (var acceleratorId in CudaAccelerator.CudaAccelerators)
                {
                    using (var accelerator = new CudaAccelerator(context, acceleratorId))
                    {
                        Console.WriteLine($"Performing operations on {accelerator}");

                        // Release the device buffers deterministically, before the accelerator
                        using (var buf = accelerator.Allocate <float>(DataSize))
                        using (var buf2 = accelerator.Allocate <float>(DataSize))
                        {
                            accelerator.Initialize(accelerator.DefaultStream, buf, 1.0f);
                            accelerator.Initialize(accelerator.DefaultStream, buf2.View, 1.0f);

                            // Initialize the CuBlas library using manual pointer mode handling
                            // (default behavior)
                            using (var blas = new CuBlas(accelerator, CuBlasVersion))
                            {
                                // Set pointer mode to Host to enable data transfer to CPU memory
                                blas.PointerMode = CuBlasPointerMode.Host;
                                float output = blas.Nrm2(buf);

                                // Set pointer mode to Device to enable data transfer to GPU memory
                                blas.PointerMode = CuBlasPointerMode.Device;
                                blas.Nrm2(buf, buf2);

                                // Use pointer mode scopes to recover the previous pointer mode
                                using (var scope = blas.BeginPointerScope(CuBlasPointerMode.Host))
                                {
                                    float output2 = blas.Nrm2(buf);
                                }
                            }

                            // Initialize the CuBlas<T> library using custom pointer mode handlers
                            using (var blas = new CuBlas <CuBlasPointerModeHandlers.AutomaticMode>(accelerator, CuBlasVersion))
                            {
                                // Automatic transfer to host
                                float output = blas.Nrm2(buf);

                                // Automatic transfer to device
                                blas.Nrm2(buf, buf2);
                            }
                        }
                    }
                }
            }
        }
예제 #6
0
        /// <summary>
        /// Uploads <paramref name="a"/> to a CUDA accelerator, launches the
        /// <c>OddEvenSort2</c> kernel once with N / 2 threads and returns the device
        /// buffer contents as a new host array.
        /// </summary>
        /// <param name="a">Input values; the array itself is not modified.</param>
        /// <returns>Host copy of the device buffer after the kernel has finished.</returns>
        public static float[] RunOddEvenSort2(float[] a)
        {
            int  N       = a.Length;
            bool evenArr = (N % 2) == 0;

            // Create context and accelerator. Both wrap native resources, so they are
            // disposed on every exit path instead of being left to the finalizer.
            using (var context = new Context())
            using (var gpu = new CudaAccelerator(context))
            {
                // Create typed launcher
                var oddEvenKernel = gpu.LoadAutoGroupedStreamKernel <
                    Index1,
                    ArrayView <float>,
                    VariableView <byte>,
                    int,
                    bool>(OddEvenSort2);

                // Allocate memory
                using (MemoryBuffer <float> d_a = gpu.Allocate <float>(N))
                using (MemoryBuffer <byte> d_stopFlag = gpu.AllocateZero <byte>(1))
                {
                    d_a.CopyFrom(a, 0, Index1.Zero, N);

                    // Run kernel
                    oddEvenKernel(N / 2, d_a, d_stopFlag.View.GetVariableView(0), N, evenArr);
                    gpu.Synchronize();

                    // Copy to the host before the device buffers are released
                    return d_a.GetAsArray();
                }
            }
        }
예제 #7
0
        /// <summary>
        /// Uploads the flattened matrix <paramref name="a"/> to a CUDA accelerator and
        /// launches the <c>FloydWarshall</c> kernel once for each k in [0, N),
        /// synchronizing after every launch.
        /// </summary>
        /// <param name="a">Input matrix; flattened with <c>FlatternArr</c> before upload.</param>
        /// <param name="N">Number of kernel iterations; N * N is the buffer size and launch extent.</param>
        /// <param name="sw">Stopwatch restarted before the kernel loop and stopped right after it.</param>
        /// <returns>Host copy of the device buffer after the last iteration.</returns>
        public static float[] RunFloydWarshall(float[][] a, int N, ref Stopwatch sw)
        {
            // Create context and accelerator. Both wrap native resources, so they are
            // disposed on every exit path instead of being left to the finalizer.
            using (var context = new Context())
            using (var gpu = new CudaAccelerator(context))
            {
                // Create typed launcher
                var floydWarshallKernel = gpu.LoadAutoGroupedStreamKernel <
                    Index1,
                    int,
                    ArrayView <float>,
                    int>(FloydWarshall);

                // Allocate memory
                var bufSize = N * N;
                using (MemoryBuffer <float> d_graphMinDist = gpu.Allocate <float>(bufSize))
                {
                    d_graphMinDist.CopyFrom(FlatternArr(a), 0, Index1.Zero, bufSize);

                    sw.Restart();

                    // One launch per k, each completed before the next one starts
                    for (int k = 0; k < N; k++)
                    {
                        floydWarshallKernel(bufSize, k, d_graphMinDist, N);
                        gpu.Synchronize();
                    }

                    sw.Stop();

                    // Copy to the host before the device buffer is released
                    return d_graphMinDist.GetAsArray();
                }
            }
        }
예제 #8
0
        /// <summary>
        /// Launches <c>IlGpuKernel</c> over device buffers created from
        /// <paramref name="matrix"/> and <paramref name="vector"/>, reports the elapsed
        /// time and copies the matrix buffer back into <paramref name="matrix"/>.
        /// </summary>
        /// <param name="gpu">CUDA accelerator used for allocation and launch.</param>
        /// <param name="matrix">Host array backing the first kernel buffer; overwritten with the device contents.</param>
        /// <param name="vector">Host array backing the second kernel buffer.</param>
        /// <param name="m">Drives the grid's Y dimension (blocks of 8); forwarded to the kernel.</param>
        /// <param name="n">Drives the grid's X dimension (blocks of 32); forwarded to the kernel.</param>
        public static void IlGpu(CudaAccelerator gpu, Real[] matrix, Real[] vector, int m, int n)
        {
            // Declared in the same order as before so disposal (reverse order) is unchanged
            using var deviceMatrix = gpu.Allocate(matrix);
            using var deviceVector = gpu.Allocate(vector);

            var stopwatch = Stopwatch.StartNew();

            // 32x8 threads per block; enough blocks to cover n columns and m rows
            var launchConfig = ((Util.DivUp(n, 32), Util.DivUp(m, 8), 1), (32, 8));

            gpu.Launch(
                IlGpuKernel,
                gpu.DefaultStream,
                launchConfig,
                deviceMatrix.View,
                deviceVector.View,
                m,
                n);

            gpu.Synchronize();
            Util.PrintPerformance(stopwatch, "AddVector.IlGpu", 3, m, n);

            deviceMatrix.CopyTo(matrix, 0, 0, matrix.Length);
        }
예제 #9
0
        /// <summary>
        /// Flattens both input matrices, launches the <c>MatrixMulShared</c> kernel with
        /// an explicit group configuration on a CUDA accelerator and returns the output
        /// buffer as a host array of N * N elements.
        /// </summary>
        /// <param name="a">First input matrix; flattened with <c>FlatternArr</c> before upload.</param>
        /// <param name="b">Second input matrix; flattened with <c>FlatternArr</c> before upload.</param>
        /// <param name="N">Used to size the buffers (N * N) and the grid; also passed to the kernel.</param>
        /// <param name="sw">Stopwatch restarted before the launch and stopped after synchronization.</param>
        /// <returns>Host copy of the output device buffer.</returns>
        public static float[] RunMatrixMulShared(float[][] a, float[][] b, int N, ref Stopwatch sw)
        {
            // Create context and accelerator. Both wrap native resources, so they are
            // disposed on every exit path instead of being left to the finalizer.
            using (var context = new Context())
            using (var gpu = new CudaAccelerator(context))
            {
                // Create typed launcher
                var matrixMulKernelShared = gpu.LoadStreamKernel <
                    ArrayView <float>,
                    ArrayView <float>,
                    ArrayView <float>,
                    int>(MatrixMulShared);

                // Allocate memory
                var buffSize = N * N;
                using (MemoryBuffer <float> d_a = gpu.Allocate <float>(buffSize))
                using (MemoryBuffer <float> d_b = gpu.Allocate <float>(buffSize))
                using (MemoryBuffer <float> d_c = gpu.Allocate <float>(buffSize))
                {
                    d_a.CopyFrom(FlatternArr(a), 0, Index1.Zero, buffSize);
                    d_b.CopyFrom(FlatternArr(b), 0, Index1.Zero, buffSize);

                    // Groups per grid dimension (N / groupSize, rounded up)
                    int GrPerDim = (int)Math.Ceiling((float)N / groupSize);

                    KernelConfig dimension = (
                        new Index2(GrPerDim, GrPerDim),                 // Number of groups
                        new Index2(groupSize, groupSize));              // Group size (thread count in group)

                    sw.Restart();

                    matrixMulKernelShared(dimension, d_a.View, d_b.View, d_c.View, N);

                    // Wait for the kernel to finish...
                    gpu.Synchronize();

                    sw.Stop();

                    // Copy to the host before the device buffers are released
                    return d_c.GetAsArray();
                }
            }
        }
예제 #10
0
        /// <summary>
        /// Uploads <paramref name="a"/> to a CUDA accelerator and repeatedly launches the
        /// <c>OddEvenSort</c> kernel, alternating the <c>iterationEven</c> argument, until
        /// a launch leaves the device-side flag at zero.
        /// </summary>
        /// <param name="a">Input values; the array itself is not modified.</param>
        /// <param name="sw">Stopwatch restarted before the kernel loop and stopped right after it.</param>
        /// <returns>Host copy of the device buffer after the loop has finished.</returns>
        public static float[] RunOddEvenSort(float[] a, ref Stopwatch sw)
        {
            int  N       = a.Length;
            bool evenArr = (N % 2) == 0;

            bool iterationEven = true;

            // Create context and accelerator. Both wrap native resources, so they are
            // disposed on every exit path instead of being left to the finalizer.
            using (var context = new Context())
            using (var gpu = new CudaAccelerator(context))
            {
                // Create typed launcher
                var oddEvenKernel = gpu.LoadAutoGroupedStreamKernel <
                    Index1,
                    ArrayView <float>,
                    VariableView <byte>,
                    bool,
                    int,
                    bool>(OddEvenSort);

                // Allocate memory
                using (MemoryBuffer <float> d_a = gpu.Allocate <float>(N))
                using (MemoryBuffer <byte> d_stopFlag = gpu.AllocateZero <byte>(1))
                {
                    d_a.CopyFrom(a, 0, Index1.Zero, N);

                    sw.Restart();

                    // Host-side zero used to reset the device flag before every launch
                    byte[] zero_val = new byte[1];

                    // NOTE(review): the loop stops after the first launch that leaves the flag
                    // at zero. If the kernel handles only one phase (odd or even) per launch,
                    // a single clean phase may not guarantee a sorted array - confirm against
                    // the OddEvenSort kernel.
                    bool flagRaised;
                    do
                    {
                        d_stopFlag.CopyFrom(zero_val, 0, 0, 1);
                        oddEvenKernel(N / 2, d_a, d_stopFlag.View.GetVariableView(), iterationEven, N, evenArr);
                        gpu.Synchronize();

                        // A non-zero flag after the launch requests another iteration
                        flagRaised = d_stopFlag.GetAsArray()[0] > 0;

                        iterationEven = !iterationEven;
                    }
                    while (flagRaised);

                    sw.Stop();

                    // Copy to the host before the device buffers are released
                    return d_a.GetAsArray();
                }
            }
        }
예제 #11
0
        /// <summary>
        /// Creates the ILGPU context and CUDA accelerator, loads the <c>MyKernel</c>
        /// launcher, allocates the input and output device buffers and uploads the
        /// four input arrays from <c>DataGenerator</c>.
        /// </summary>
        public override void Init()
        {
            GeneratePTX(); // alternative approach through this?

            context     = new Context();
            accelerator = new CudaAccelerator(context);

            // The kernel is bound through its method group; no reflection lookup is needed
            myKernel = accelerator.LoadAutoGroupedStreamKernel <Index,
                                                                ArrayView <byte>,
                                                                ArrayView <double>,
                                                                ArrayView <int>,
                                                                ArrayView <int>,
                                                                ArrayView <double>,
                                                                ArrayView <byte>,
                                                                int
                                                                >(MyKernel);

            // Allocate some memory
            input1_dev = accelerator.Allocate <int>(DataGenerator.In1.Length);
            input2_dev = accelerator.Allocate <int>(DataGenerator.In2.Length);
            input3_dev = accelerator.Allocate <double>(DataGenerator.In3.Length);
            input4_dev = accelerator.Allocate <byte>(DataGenerator.In4_3_bytes.Length);

            // init output parameters
            result_dev     = accelerator.Allocate <byte>(resultsBytes.Length);
            resultCalc_dev = accelerator.Allocate <double>(calculatables.Length);

            // Upload the inputs
            input1_dev.CopyFrom(DataGenerator.In1, 0, 0, DataGenerator.In1.Length);
            input2_dev.CopyFrom(DataGenerator.In2, 0, 0, DataGenerator.In2.Length);
            input3_dev.CopyFrom(DataGenerator.In3, 0, 0, DataGenerator.In3.Length);
            input4_dev.CopyFrom(DataGenerator.In4_3_bytes, 0, 0, DataGenerator.In4_3_bytes.Length);
        }
예제 #12
0
        /// <summary>
        /// Launches <c>IlGpuKernel</c> over a 1D grid covering n * n elements, reports the
        /// elapsed time and copies the first buffer back into
        /// <paramref name="mSquaredDistances"/>.
        /// </summary>
        /// <param name="gpu">CUDA accelerator used for allocation and launch.</param>
        /// <param name="mSquaredDistances">Host array backing the first kernel buffer; overwritten with the device contents.</param>
        /// <param name="mCoordinates">Host array backing the second kernel buffer.</param>
        /// <param name="c">Forwarded to the kernel and to <c>Util.PrintPerformance</c>.</param>
        /// <param name="n">n * n sizes the grid; forwarded to the kernel.</param>
        public static void IlGpu(
            CudaAccelerator gpu,
            Real[] mSquaredDistances,
            Real[] mCoordinates,
            int c,
            int n)
        {
            const int threadsPerBlock = 128;

            using var deviceDistances   = gpu.Allocate(mSquaredDistances);
            using var deviceCoordinates = gpu.Allocate(mCoordinates);
            var stopwatch = Stopwatch.StartNew();

            // Enough blocks of threadsPerBlock threads to cover n * n elements
            var launchConfig = (Util.DivUp(n * n, threadsPerBlock), threadsPerBlock);

            gpu.Launch(
                IlGpuKernel,
                gpu.DefaultStream,
                launchConfig,
                deviceDistances.View,
                deviceCoordinates.View,
                c,
                n);
            gpu.Synchronize();

            Util.PrintPerformance(stopwatch, "SquaredDistance.IlGpu", n, c, n);

            deviceDistances.CopyTo(mSquaredDistances, 0, 0, mSquaredDistances.Length);
        }
예제 #13
0
        /// <summary>
        /// Launches <paramref name="kernelFunc"/> over a 2D grid, passing the block size
        /// and <paramref name="c"/> as specialized values, reports the elapsed time under
        /// <paramref name="name"/> and copies the n-by-n device buffer back into
        /// <paramref name="mSquaredDistances"/>.
        /// </summary>
        /// <param name="gpu">CUDA accelerator used for allocation and launch.</param>
        /// <param name="mSquaredDistances">Host array receiving the n-by-n output buffer.</param>
        /// <param name="mCoordinates">Host array backing the coordinate buffer passed to the kernel.</param>
        /// <param name="c">Wrapped in a <c>SpecializedValue</c> for the kernel; also passed to <c>Util.PrintPerformance</c>.</param>
        /// <param name="n">Extent of the output buffer in both dimensions; also the last kernel argument.</param>
        /// <param name="name">Label passed to <c>Util.PrintPerformance</c>.</param>
        /// <param name="kernelFunc">Kernel to launch.</param>
        private static void IlGpuOptimisedImpl(
            CudaAccelerator gpu,
            Real[] mSquaredDistances,
            Real[] mCoordinates,
            int c,
            int n,
            string name,
            Action <ArrayView2D <Real>, ArrayView <Real>, SpecializedValue <int>, SpecializedValue <int>, int> kernelFunc)
        {
            const int threadsPerBlock = 128;

            using var deviceDistances   = gpu.Allocate <Real>(n, n);
            using var deviceCoordinates = gpu.Allocate(mCoordinates);
            var stopwatch = Stopwatch.StartNew();

            // blocksPerAxis x blocksPerAxis blocks, each threadsPerBlock x 1 x 1 threads
            var blocksPerAxis = Util.DivUp(n, threadsPerBlock);
            var launchConfig  = ((blocksPerAxis, blocksPerAxis, 1), (threadsPerBlock, 1, 1));

            gpu.Launch(
                kernelFunc,
                gpu.DefaultStream,
                launchConfig,
                deviceDistances.View,
                deviceCoordinates.View,
                SpecializedValue.New(threadsPerBlock),
                SpecializedValue.New(c),
                n);
            gpu.Synchronize();

            Util.PrintPerformance(stopwatch, name, n, c, n);

            deviceDistances.CopyTo(mSquaredDistances, (0, 0), 0, (n, n));
        }
예제 #14
0
    /// <summary>
    /// Renders one 1920x1080 frame of "./Assets/cat.obj" on a CUDA accelerator,
    /// converts the canvas into a 24-bit RGB byte buffer, saves it as "out.bmp"
    /// and opens the file through <c>cmd.exe</c>.
    /// </summary>
    public static void Main()
    {
        using Context context = new Context();
        context.EnableAlgorithms();
        using Accelerator device = new CudaAccelerator(context);

        int width  = 1920;
        int height = 1080;

        byte[] h_bitmapData = new byte[width * height * 3];

        using MemoryBuffer2D <Vec3> canvasData = device.Allocate <Vec3>(width, height);
        using MemoryBuffer <byte> d_bitmapData = device.Allocate <byte>(width * height * 3);

        CanvasData c = new CanvasData(canvasData, d_bitmapData, width, height);
        // pos              // look at         // up
        Camera camera = new Camera(new Vec3(0, 50, -100), new Vec3(0, 0, 0), new Vec3(0, -1, 0), width, height, 40f);

        WorldData world = new WorldData(device);

        //world.loadMesh(new Vec3(10, 0, 0), "./Assets/defaultcube.obj");
        world.loadMesh(new Vec3(0, 0, 0), "./Assets/cat.obj");

        var frameBufferToBitmap = device.LoadAutoGroupedStreamKernel <Index2, CanvasData>(CanvasData.CanvasToBitmap);
        var RTMethod            = device.LoadAutoGroupedStreamKernel <Index2, CanvasData, dWorldBuffer, Camera>(PerPixelRayIntersectionMethod);

        //do rasterization here

        Stopwatch timer = new Stopwatch();

        timer.Start();

        RTMethod(new Index2(width, height), c, world.getDeviceWorldBuffer(), camera);
        frameBufferToBitmap(canvasData.Extent, c);
        device.Synchronize();

        d_bitmapData.CopyTo(h_bitmapData, 0, 0, d_bitmapData.Extent);

        timer.Stop();
        Console.WriteLine("Rendered in: " + timer.Elapsed);

        // The Bitmap reads straight from the managed array through a raw pointer, so the
        // array must be pinned for the bitmap's whole lifetime; otherwise the GC may move it.
        // The stride (width * 3) carries no row padding, which only works when it is a
        // multiple of 4 (true for width = 1920) - be careful when changing the width.
        GCHandle pinnedPixels = GCHandle.Alloc(h_bitmapData, GCHandleType.Pinned);
        try
        {
            using (Bitmap b = new Bitmap(width, height, width * 3, PixelFormat.Format24bppRgb, pinnedPixels.AddrOfPinnedObject()))
            {
                b.Save("out.bmp");
            }
        }
        finally
        {
            // Unpin only after the bitmap has been disposed
            pinnedPixels.Free();
        }

        Process.Start("cmd.exe", "/c out.bmp");
    }
예제 #15
0
        /// <summary>
        /// Micro-benchmark: for each configured width/height pair, allocates a 2D buffer,
        /// launches <c>MathKernel</c> and synchronizes, <c>size</c> times from a
        /// <c>Parallel.For</c> loop, then prints the elapsed wall-clock time.
        /// </summary>
        private static void Performance()
        {
            using (var context = new Context())
            using (var accelerator = new CudaAccelerator(context))
            using (var b = accelerator.CreateBackend())
            using (var c = accelerator.Context.CreateCompileUnit(b))
            {
                var method = typeof(Program).GetMethod(nameof(MathKernel), BindingFlags.Static | BindingFlags.Public);

                // The compiled result is not used below; the call itself is kept as before
                b.Compile(c, method);

                var kernel = accelerator.LoadAutoGroupedStreamKernel<Index2, ArrayView2D<float>>(MathKernel);

                int size = 100000;
                var W = new[] { 50 };
                var H = new[] { 50 };

                for (int n = 0; n < W.Length; n++)
                {
                    for (int m = 0; m < H.Length; m++)
                    {
                        int x = W[n];
                        int y = H[m];

                        Console.WriteLine($"\n\nW {x}, H {y} \n\n");

                        // Only the parallel GPU variant is measured here
                        var watch = Stopwatch.StartNew();
                        Parallel.For(0, size, k =>
                        {
                            var idx = new Index2(x, y);

                            // using guarantees the device buffer is released even if the launch throws
                            using (var buffer = accelerator.Allocate<float>(idx))
                            {
                                kernel(idx, buffer.View);
                                accelerator.Synchronize();
                            }
                        });
                        watch.Stop();
                        Console.WriteLine($"Elapsed GPU Time Parallel: {watch.ElapsedMilliseconds}ms\n\n");
                        GC.Collect();
                    }
                }
            }
        }
예제 #16
0
        /// <summary>
        /// One-shot variant of the processing pipeline: creates a context and CUDA
        /// accelerator, uploads the four <c>DataGenerator</c> inputs, runs
        /// <c>MyKernel</c> with one thread per element of the first input and stores the
        /// two output buffers in <c>resultsBytes</c> and <c>calculatables</c>.
        /// </summary>
        public void ProccessOld()
        {
            // Create the required ILGPU context
            using (var context = new Context())
            using (var accelerator = new CudaAccelerator(context)) // test with CPUAccelerator
            {
                // accelerator.LoadAutoGroupedStreamKernel creates a typed launcher
                // that implicitly uses the default accelerator stream.
                var myKernel = accelerator.LoadAutoGroupedStreamKernel <Index,
                                                                        ArrayView <byte>,
                                                                        ArrayView <double>,
                                                                        ArrayView <int>,
                                                                        ArrayView <int>,
                                                                        ArrayView <double>,
                                                                        ArrayView <byte>,
                                                                        int
                                                                        >(MyKernel);

                // Allocate input and output buffers; using releases them deterministically
                using (var input1_dev = accelerator.Allocate <int>(DataGenerator.In1.Length))
                using (var input2_dev = accelerator.Allocate <int>(DataGenerator.In2.Length))
                using (var input3_dev = accelerator.Allocate <double>(DataGenerator.In3.Length))
                using (var input4_dev = accelerator.Allocate <byte>(DataGenerator.In4_3_bytes.Length))
                using (var result_dev = accelerator.Allocate <byte>(resultsBytes.Length))
                using (var resultCalc_dev = accelerator.Allocate <double>(calculatables.Length))
                {
                    // Upload the inputs
                    input1_dev.CopyFrom(DataGenerator.In1, 0, 0, DataGenerator.In1.Length);
                    input2_dev.CopyFrom(DataGenerator.In2, 0, 0, DataGenerator.In2.Length);
                    input3_dev.CopyFrom(DataGenerator.In3, 0, 0, DataGenerator.In3.Length);
                    input4_dev.CopyFrom(DataGenerator.In4_3_bytes, 0, 0, DataGenerator.In4_3_bytes.Length);

                    // Launch input1_dev.Length many threads
                    myKernel(input1_dev.Length, result_dev.View, resultCalc_dev.View, input1_dev.View, input2_dev.View, input3_dev.View, input4_dev.View, DataGenerator.Width);

                    // Wait for the kernel to finish...
                    accelerator.Synchronize();

                    // Resolve data before the device buffers are released
                    resultsBytes  = result_dev.GetAsArray();
                    calculatables = resultCalc_dev.GetAsArray();
                }
            }
        }