Ejemplo n.º 1
0
        private static void SetBaseIndex(int base_index)
        {
            unsafe
            {
                IntPtr[] x1      = new IntPtr[] { new IntPtr(base_index) };
                GCHandle handle2 = GCHandle.Alloc(x1, GCHandleType.Pinned);
                var      parm1   = handle2.AddrOfPinnedObject();

                IntPtr[] kp = new IntPtr[] { parm1 };

                CUmodule module = RUNTIME.RuntimeModule;
                CudaHelpers.CheckCudaError(Cuda.cuModuleGetFunction(out CUfunction function, module, "_Z21set_kernel_base_indexi"));
                Campy.Utils.CudaHelpers.MakeLinearTiling(1,
                                                         out Campy.Utils.CudaHelpers.dim3 tile_size,
                                                         out Campy.Utils.CudaHelpers.dim3 tiles);
                CUresult res;
                fixed(IntPtr *kernelParams = kp)
                {
                    res = Cuda.cuLaunchKernel(
                        function,
                        tiles.x, tiles.y, tiles.z,             // grid has one block.
                        tile_size.x, tile_size.y, tile_size.z, // n threads.
                        0,                                     // no shared memory
                        default(CUstream),
                        (IntPtr)kernelParams,
                        (IntPtr)IntPtr.Zero
                        );
                }

                Utils.CudaHelpers.CheckCudaError(res);
                res = Cuda.cuCtxSynchronize(); // Make sure it's copied back to host.
                Utils.CudaHelpers.CheckCudaError(res);
            }
        }
Ejemplo n.º 2
0
        public static void For(int number_of_threads, SimpleKernel simpleKernel)
        {
            if (Campy.Utils.Options.IsOn("import-only"))
            {
                JustImport(simpleKernel);
                return;
            }

            GCHandle handle1 = default(GCHandle);
            GCHandle handle2 = default(GCHandle);

            try
            {
                unsafe
                {
                    System.Reflection.MethodInfo method_info = simpleKernel.Method;
                    String kernel_assembly_file_name         = method_info.DeclaringType.Assembly.Location;
                    Mono.Cecil.ModuleDefinition md           = Campy.Meta.StickyReadMod.StickyReadModule(
                        kernel_assembly_file_name, new ReaderParameters {
                        ReadSymbols = true
                    });
                    MethodReference method_reference = md.ImportReference(method_info);

                    CUfunction ptr_to_kernel = default(CUfunction);
                    CUmodule   module        = default(CUmodule);

                    Campy.Utils.TimePhase.Time("compile     ", () =>
                    {
                        IntPtr image = Singleton._compiler.Compile(method_reference, simpleKernel.Target);
                        module       = Singleton._compiler.SetModule(method_reference, image);
                        Singleton._compiler.StoreJits(module);
                        ptr_to_kernel = Singleton._compiler.GetCudaFunction(method_reference, module);
                    });

                    RUNTIME.BclCheckHeap();

                    BUFFERS buffer = Singleton.Buffer;
                    IntPtr  kernel_target_object = IntPtr.Zero;

                    Campy.Utils.TimePhase.Time("deep copy ", () =>
                    {
                        int count = simpleKernel.Method.GetParameters().Length;
                        var bb    = Singleton._compiler.GetBasicBlock(method_reference);
                        if (bb.HasThis)
                        {
                            count++;
                        }
                        if (!(count == 1 || count == 2))
                        {
                            throw new Exception("Expecting at least one parameter for kernel.");
                        }

                        if (bb.HasThis)
                        {
                            kernel_target_object = buffer.AddDataStructure(simpleKernel.Target);
                        }
                    });

                    Campy.Utils.TimePhase.Time("kernel cctor set up", () =>
                    {
                        // For each cctor, run on GPU.
                        // Construct dependency graph of methods.
                        List <MethodReference> order_list = COMPILER.Singleton.ConstructCctorOrder();

                        // Finally, call cctors.
                        foreach (var bb in order_list)
                        {
                            if (Campy.Utils.Options.IsOn("trace-cctors"))
                            {
                                System.Console.WriteLine("Executing cctor "
                                                         + bb.FullName);
                            }
                            var cctor = Singleton._compiler.GetCudaFunction(bb, module);

                            var res = CUresult.CUDA_SUCCESS;
                            Campy.Utils.CudaHelpers.MakeLinearTiling(1,
                                                                     out Campy.Utils.CudaHelpers.dim3 tile_size, out Campy.Utils.CudaHelpers.dim3 tiles);

                            res = Cuda.cuLaunchKernel(
                                cctor,
                                tiles.x, tiles.y, tiles.z,             // grid has one block.
                                tile_size.x, tile_size.y, tile_size.z, // n threads.
                                0,                                     // no shared memory
                                default(CUstream),
                                (IntPtr)IntPtr.Zero,
                                (IntPtr)IntPtr.Zero
                                );

                            CudaHelpers.CheckCudaError(res);
                            res = Cuda.cuCtxSynchronize(); // Make sure it's copied back to host.
                            CudaHelpers.CheckCudaError(res);
                        }
                    });

                    if (Campy.Utils.Options.IsOn("trace-cctors"))
                    {
                        System.Console.WriteLine("Done with cctors");
                    }

                    Campy.Utils.TimePhase.Time("kernel call ", () =>
                    {
                        IntPtr[] parm1 = new IntPtr[1];
                        IntPtr[] parm2 = new IntPtr[1];

                        parm1[0] = kernel_target_object;
                        parm2[0] = buffer.New(BUFFERS.SizeOf(typeof(int)));

                        IntPtr[] x1     = parm1;
                        handle1         = GCHandle.Alloc(x1, GCHandleType.Pinned);
                        IntPtr pointer1 = handle1.AddrOfPinnedObject();

                        IntPtr[] x2     = parm2;
                        handle2         = GCHandle.Alloc(x2, GCHandleType.Pinned);
                        IntPtr pointer2 = handle2.AddrOfPinnedObject();

                        IntPtr[] kp = new IntPtr[] { pointer1, pointer2 };
                        var res     = CUresult.CUDA_SUCCESS;
                        fixed(IntPtr * kernelParams = kp)
                        {
                            Campy.Utils.CudaHelpers.MakeLinearTiling(number_of_threads,
                                                                     out Campy.Utils.CudaHelpers.dim3 tile_size, out Campy.Utils.CudaHelpers.dim3 tiles);

                            //MakeLinearTiling(1, out dim3 tile_size, out dim3 tiles);

                            res = Cuda.cuLaunchKernel(
                                ptr_to_kernel,
                                tiles.x, tiles.y, tiles.z,             // grid has one block.
                                tile_size.x, tile_size.y, tile_size.z, // n threads.
                                0,                                     // no shared memory
                                default(CUstream),
                                (IntPtr)kernelParams,
                                (IntPtr)IntPtr.Zero
                                );
                        }
Ejemplo n.º 3
0
        public static void For(int number_of_threads, SimpleKernel simpleKernel)
        {
            GCHandle handle1 = default(GCHandle);
            GCHandle handle2 = default(GCHandle);

            try
            {
                unsafe
                {
                    //////// COMPILE KERNEL INTO GPU CODE ///////
                    /////////////////////////////////////////////
                    var stopwatch_cuda_compile = new Stopwatch();
                    stopwatch_cuda_compile.Start();
                    IntPtr     image               = Singleton()._converter.Compile(simpleKernel.Method, simpleKernel.Target);
                    CUfunction ptr_to_kernel       = Singleton()._converter.GetCudaFunction(simpleKernel.Method, image);
                    var        elapse_cuda_compile = stopwatch_cuda_compile.Elapsed;

                    RUNTIME.CheckHeap();

                    //////// COPY DATA INTO GPU /////////////////
                    /////////////////////////////////////////////
                    var stopwatch_deep_copy_to = new Stopwatch();
                    stopwatch_deep_copy_to.Reset();
                    stopwatch_deep_copy_to.Start();
                    BUFFERS buffer = Singleton().Buffer;

                    // Set up parameters.
                    int count = simpleKernel.Method.GetParameters().Length;
                    var bb    = Singleton()._converter.GetBasicBlock(simpleKernel.Method);
                    if (bb.HasThis)
                    {
                        count++;
                    }
                    if (!(count == 1 || count == 2))
                    {
                        throw new Exception("Expecting at least one parameter for kernel.");
                    }

                    IntPtr[] parm1 = new IntPtr[1];
                    IntPtr[] parm2 = new IntPtr[1];
                    IntPtr   ptr   = IntPtr.Zero;

                    // The method really should have a "this" because it's a closure
                    // object.
                    if (bb.HasThis)
                    {
                        RUNTIME.CheckHeap();
                        ptr      = buffer.AddDataStructure(simpleKernel.Target);
                        parm1[0] = ptr;
                    }

                    {
                        Type btype = typeof(int);
                        var  s     = BUFFERS.SizeOf(btype);
                        var  ptr2  = buffer.New(s);
                        // buffer.DeepCopyToImplementation(index, ptr2);
                        parm2[0] = ptr2;
                    }

                    stopwatch_deep_copy_to.Start();
                    var elapse_deep_copy_to = stopwatch_cuda_compile.Elapsed;

                    var stopwatch_call_kernel = new Stopwatch();
                    stopwatch_call_kernel.Reset();
                    stopwatch_call_kernel.Start();

                    IntPtr[] x1 = parm1;
                    handle1 = GCHandle.Alloc(x1, GCHandleType.Pinned);
                    IntPtr pointer1 = handle1.AddrOfPinnedObject();

                    IntPtr[] x2 = parm2;
                    handle2 = GCHandle.Alloc(x2, GCHandleType.Pinned);
                    IntPtr pointer2 = handle2.AddrOfPinnedObject();

                    RUNTIME.CheckHeap();

                    IntPtr[] kp  = new IntPtr[] { pointer1, pointer2 };
                    var      res = CUresult.CUDA_SUCCESS;
                    fixed(IntPtr *kernelParams = kp)
                    {
                        Campy.Utils.CudaHelpers.MakeLinearTiling(number_of_threads, out Campy.Utils.CudaHelpers.dim3 tile_size, out Campy.Utils.CudaHelpers.dim3 tiles);

                        //MakeLinearTiling(1, out dim3 tile_size, out dim3 tiles);

                        res = Cuda.cuLaunchKernel(
                            ptr_to_kernel,
                            tiles.x, tiles.y, tiles.z,             // grid has one block.
                            tile_size.x, tile_size.y, tile_size.z, // n threads.
                            0,                                     // no shared memory
                            default(CUstream),
                            (IntPtr)kernelParams,
                            (IntPtr)IntPtr.Zero
                            );
                    }
                    CudaHelpers.CheckCudaError(res);
                    res = Cuda.cuCtxSynchronize(); // Make sure it's copied back to host.
                    CudaHelpers.CheckCudaError(res);

                    stopwatch_call_kernel.Stop();
                    var elapse_call_kernel = stopwatch_call_kernel.Elapsed;

                    if (Campy.Utils.Options.IsOn("jit_trace"))
                    {
                        System.Console.WriteLine("cuda compile  " + elapse_cuda_compile);
                        System.Console.WriteLine("deep copy in  " + elapse_deep_copy_to);
                        System.Console.WriteLine("cuda kernel   " + elapse_call_kernel);
                    }

                    {
                        var stopwatch_deep_copy_back = new Stopwatch();
                        stopwatch_deep_copy_back.Reset();

                        RUNTIME.CheckHeap();

                        stopwatch_deep_copy_back.Start();

                        buffer.SynchDataStructures();

                        stopwatch_deep_copy_back.Stop();

                        RUNTIME.CheckHeap();

                        var elapse_deep_copy_back = stopwatch_deep_copy_back.Elapsed;
                        if (Campy.Utils.Options.IsOn("jit_trace"))
                        {
                            System.Console.WriteLine("deep copy out " + elapse_deep_copy_back);
                        }
                    }
                }
            }
            catch (Exception e)
            {
                Console.WriteLine(e);
                throw e;
            }
            finally
            {
                if (default(GCHandle) != handle1)
                {
                    handle1.Free();
                }
                if (default(GCHandle) != handle2)
                {
                    handle2.Free();
                }
            }
        }
Ejemplo n.º 4
0
        static unsafe void Part1()
        {
            Cuda.cuInit(0);
            var res = Cuda.cuDeviceGet(out int device, 0);

            if (res != CUresult.CUDA_SUCCESS)
            {
                throw new Exception();
            }
            res = Cuda.cuDeviceGetPCIBusId(out string pciBusId, 100, device);
            if (res != CUresult.CUDA_SUCCESS)
            {
                throw new Exception();
            }
            res = Cuda.cuDeviceGetName(out string name, 100, device);
            if (res != CUresult.CUDA_SUCCESS)
            {
                throw new Exception();
            }
            res = Cuda.cuCtxCreate_v2(out CUcontext cuContext, 0, device);
            if (res != CUresult.CUDA_SUCCESS)
            {
                throw new Exception();
            }
            string       assembly_directory = AssemblyDirectory;
            StreamReader sr     = new StreamReader(assembly_directory + @"/../../Project2/x64/Debug/stuff.ptx");
            string       kernel = sr.ReadToEnd();
            IntPtr       ptr    = Marshal.StringToHGlobalAnsi(kernel);

            int[]    option_values        = new int[] { };
            GCHandle option_values_handle = GCHandle.Alloc(option_values, GCHandleType.Pinned);
            IntPtr   options_values_ptr   = option_values_handle.AddrOfPinnedObject();

            CUjit_option[] options = new CUjit_option[] { };
            res = Cuda.cuModuleLoadDataEx(out CUmodule cuModule, ptr, (uint)0, options, options_values_ptr);
            if (res != CUresult.CUDA_SUCCESS)
            {
                throw new Exception();
            }
            res = Cuda.cuModuleGetFunction(out CUfunction helloWorld, cuModule, "_Z5helloPc");
            if (res != CUresult.CUDA_SUCCESS)
            {
                throw new Exception();
            }
            byte[]   v       = { (byte)'G', (byte)'d', (byte)'k', (byte)'k', (byte)'n', 31, (byte)'v', (byte)'n', (byte)'q', (byte)'k', (byte)'c', 0 };
            GCHandle handle  = GCHandle.Alloc(v, GCHandleType.Pinned);
            IntPtr   pointer = IntPtr.Zero;

            pointer = handle.AddrOfPinnedObject();
            res     = Cuda.cuMemAlloc_v2(out IntPtr dptr, 11 * sizeof(byte));
            if (res != CUresult.CUDA_SUCCESS)
            {
                throw new Exception();
            }
            res = Cuda.cuMemcpyHtoD_v2(dptr, pointer, 11 * sizeof(byte));
            if (res != CUresult.CUDA_SUCCESS)
            {
                throw new Exception();
            }
            IntPtr[] x        = new IntPtr[] { dptr };
            GCHandle handle2  = GCHandle.Alloc(x, GCHandleType.Pinned);
            IntPtr   pointer2 = IntPtr.Zero;

            pointer2 = handle2.AddrOfPinnedObject();
            IntPtr[] kp = new IntPtr[] { pointer2 };
            fixed(IntPtr *kernelParams = kp)
            {
                res = Cuda.cuLaunchKernel(helloWorld,
                                          1, 1, 1,  // grid has one block.
                                          11, 1, 1, // block has 11 threads.
                                          0,        // no shared memory
                                          default(CUstream),
                                          (IntPtr)kernelParams,
                                          (IntPtr)IntPtr.Zero
                                          );
            }

            if (res != CUresult.CUDA_SUCCESS)
            {
                throw new Exception();
            }
            res = Cuda.cuMemcpyDtoH_v2(pointer, dptr, 11 * sizeof(byte));
            if (res != CUresult.CUDA_SUCCESS)
            {
                throw new Exception();
            }
            for (int i = 0; i < 11; ++i)
            {
                System.Console.Write((char)v[i]);
            }
            System.Console.WriteLine();
            Cuda.cuCtxDestroy_v2(cuContext);
        }
Ejemplo n.º 5
0
        static unsafe void Part2()
        {
            Cuda.cuInit(0);
            var res = Cuda.cuDeviceGet(out int device, 0);

            if (res != CUresult.CUDA_SUCCESS)
            {
                throw new Exception();
            }
            res = Cuda.cuDeviceGetPCIBusId(out string pciBusId, 100, device);
            if (res != CUresult.CUDA_SUCCESS)
            {
                throw new Exception();
            }
            res = Cuda.cuDeviceGetName(out string name, 100, device);
            if (res != CUresult.CUDA_SUCCESS)
            {
                throw new Exception();
            }
            res = Cuda.cuCtxCreate_v2(out CUcontext cuContext, 0, device);
            if (res != CUresult.CUDA_SUCCESS)
            {
                throw new Exception();
            }
            string     assembly_directory = AssemblyDirectory;
            FileStream fs      = new FileStream(assembly_directory + @"/../../Project2/x64/Debug/stuff.cu.obj", FileMode.Open);
            var        len     = fs.Length;
            var        gpu_obj = new byte[len];

            fs.Read(gpu_obj, 0, (int)len);

            uint num_ops_link = 5;
            var  op_link      = new CUjit_option[num_ops_link];

            ulong[] op_values_link = new ulong[num_ops_link];

            int size = 1024 * 100;

            op_link[0]        = CUjit_option.CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES;
            op_values_link[0] = (ulong)size;

            op_link[1] = CUjit_option.CU_JIT_INFO_LOG_BUFFER;
            byte[] info_log_buffer        = new byte[size];
            var    info_log_buffer_handle = GCHandle.Alloc(info_log_buffer, GCHandleType.Pinned);
            var    info_log_buffer_intptr = info_log_buffer_handle.AddrOfPinnedObject();

            op_values_link[1] = (ulong)info_log_buffer_intptr;

            op_link[2]        = CUjit_option.CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES;
            op_values_link[2] = (ulong)size;

            op_link[3] = CUjit_option.CU_JIT_ERROR_LOG_BUFFER;
            byte[] error_log_buffer        = new byte[size];
            var    error_log_buffer_handle = GCHandle.Alloc(error_log_buffer, GCHandleType.Pinned);
            var    error_log_buffer_intptr = error_log_buffer_handle.AddrOfPinnedObject();

            op_values_link[3] = (ulong)error_log_buffer_intptr;

            op_link[4]        = CUjit_option.CU_JIT_LOG_VERBOSE;
            op_values_link[4] = (ulong)1;

            var op_values_link_handle = GCHandle.Alloc(op_values_link, GCHandleType.Pinned);
            var op_values_link_intptr = op_values_link_handle.AddrOfPinnedObject();

            res = Cuda.cuLinkCreate_v2(num_ops_link, op_link, op_values_link_intptr, out CUlinkState linkState);
            {
                string info = Marshal.PtrToStringAnsi(info_log_buffer_intptr);
                System.Console.WriteLine(info);
                string error = Marshal.PtrToStringAnsi(error_log_buffer_intptr);
                System.Console.WriteLine(error);
            }

            uint num_ops = 0;

            CUjit_option[] op               = new CUjit_option[0];
            ulong[]        op_values        = new ulong[0];
            var            op_values_handle = GCHandle.Alloc(op_values, GCHandleType.Pinned);
            var            op_values_intptr = op_values_handle.AddrOfPinnedObject();

            var    kernel_handle      = GCHandle.Alloc(gpu_obj, GCHandleType.Pinned);
            IntPtr gpu_bcl_obj_intptr = kernel_handle.AddrOfPinnedObject();

            res = Cuda.cuLinkAddData_v2(linkState, CUjitInputType.CU_JIT_INPUT_OBJECT,
                                        gpu_bcl_obj_intptr, (uint)len,
                                        "", num_ops, op, op_values_intptr);
            {
                string info = Marshal.PtrToStringAnsi(info_log_buffer_intptr);
                System.Console.WriteLine(info);
                string error = Marshal.PtrToStringAnsi(error_log_buffer_intptr);
                System.Console.WriteLine(error);
            }

            IntPtr image;

            res = Cuda.cuLinkComplete(linkState, out image, out ulong sz);
            res = Cuda.cuModuleLoadDataEx(out CUmodule module, image, 0, op, op_values_link_intptr);

            res = Cuda.cuModuleGetFunction(out CUfunction helloWorld, module, "_Z5helloPc");
            if (res != CUresult.CUDA_SUCCESS)
            {
                throw new Exception();
            }
            byte[]   v       = { (byte)'G', (byte)'d', (byte)'k', (byte)'k', (byte)'n', 31, (byte)'v', (byte)'n', (byte)'q', (byte)'k', (byte)'c', 0 };
            GCHandle handle  = GCHandle.Alloc(v, GCHandleType.Pinned);
            IntPtr   pointer = IntPtr.Zero;

            pointer = handle.AddrOfPinnedObject();
            res     = Cuda.cuMemAlloc_v2(out IntPtr dptr, 11 * sizeof(byte));
            if (res != CUresult.CUDA_SUCCESS)
            {
                throw new Exception();
            }
            res = Cuda.cuMemcpyHtoD_v2(dptr, pointer, 11 * sizeof(byte));
            if (res != CUresult.CUDA_SUCCESS)
            {
                throw new Exception();
            }
            IntPtr[] x        = new IntPtr[] { dptr };
            GCHandle handle2  = GCHandle.Alloc(x, GCHandleType.Pinned);
            IntPtr   pointer2 = IntPtr.Zero;

            pointer2 = handle2.AddrOfPinnedObject();
            IntPtr[] kp = new IntPtr[] { pointer2 };
            fixed(IntPtr *kernelParams = kp)
            {
                res = Cuda.cuLaunchKernel(helloWorld,
                                          1, 1, 1,  // grid has one block.
                                          11, 1, 1, // block has 11 threads.
                                          0,        // no shared memory
                                          default(CUstream),
                                          (IntPtr)kernelParams,
                                          (IntPtr)IntPtr.Zero
                                          );
            }

            if (res != CUresult.CUDA_SUCCESS)
            {
                throw new Exception();
            }
            res = Cuda.cuMemcpyDtoH_v2(pointer, dptr, 11 * sizeof(byte));
            if (res != CUresult.CUDA_SUCCESS)
            {
                throw new Exception();
            }
            for (int i = 0; i < 11; ++i)
            {
                System.Console.Write((char)v[i]);
            }
            System.Console.WriteLine();
            Cuda.cuCtxDestroy_v2(cuContext);
        }
Ejemplo n.º 6
0
        static unsafe void Main(string[] args)
        {
            System.Console.WriteLine("1");
            Cuda.cuInit(0);
            System.Console.WriteLine("2");

            // Device api.
            var res = Cuda.cuDeviceGet(out int device, 0);

            if (res != CUresult.CUDA_SUCCESS)
            {
                throw new Exception();
            }
            res = Cuda.cuDeviceGetPCIBusId(out string pciBusId, 100, device);
            if (res != CUresult.CUDA_SUCCESS)
            {
                throw new Exception();
            }
            res = Cuda.cuDeviceGetName(out string name, 100, device);
            if (res != CUresult.CUDA_SUCCESS)
            {
                throw new Exception();
            }

            System.Console.WriteLine("3");

            res = Cuda.cuCtxCreate_v2(out CUcontext cuContext, 0, device);
            if (res != CUresult.CUDA_SUCCESS)
            {
                throw new Exception();
            }
            string cu_kernel      = @"
#include <stdio.h>

__global__
void kern(int * ar)
{
	int i = threadIdx.x;
	if (i < 11)
		ar[i] = ar[i] + 1;
}
";
            string compile_string = @"
nvcc --ptx --gpu-architecture=sm_20 -ccbin ""C:\Program Files(x86)\Microsoft Visual Studio 14.0\VC\bin"" y.cu";

            string kernel = @"
//
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: CL-21373419
// Cuda compilation tools, release 8.0, V8.0.55
// Based on LLVM 3.4svn
//

.version 5.0
.target sm_20
.address_size 64

	// .globl	_Z4kernPi

.visible .entry _Z4kernPi(
	.param .u64 _Z4kernPi_param_0
)
{
	.reg .pred  %p<2>;
	.reg .b32   %r<4>;
	.reg .b64   %rd<5>;


	ld.param.u64    %rd1, [_Z4kernPi_param_0];
	mov.u32     %r1, %tid.x;
	setp.gt.s32	%p1, %r1, 10;
	@%p1 bra    BB0_2;

	cvta.to.global.u64  %rd2, %rd1;
	mul.wide.s32    %rd3, %r1, 4;
	add.s64     %rd4, %rd2, %rd3;
	ld.global.u32   %r2, [%rd4];
	add.s32     %r3, %r2, 1;
	st.global.u32   [%rd4], %r3;

BB0_2:
	ret;
}
";
            IntPtr ptr    = Marshal.StringToHGlobalAnsi(kernel);

            res = Cuda.cuModuleLoadData(out CUmodule cuModule, ptr);
            if (res != CUresult.CUDA_SUCCESS)
            {
                throw new Exception();
            }
            res = Cuda.cuModuleGetFunction(out CUfunction helloWorld, cuModule, "_Z4kernPi");
            if (res != CUresult.CUDA_SUCCESS)
            {
                throw new Exception();
            }
            int[]    v       = { 'G', 'd', 'k', 'k', 'n', (char)31, 'v', 'n', 'q', 'k', 'c' };
            GCHandle handle  = GCHandle.Alloc(v, GCHandleType.Pinned);
            IntPtr   pointer = IntPtr.Zero;

            pointer = handle.AddrOfPinnedObject();
            res     = Cuda.cuMemAlloc_v2(out IntPtr dptr, 11 * sizeof(int));
            if (res != CUresult.CUDA_SUCCESS)
            {
                throw new Exception();
            }
            res = Cuda.cuMemcpyHtoD_v2(dptr, pointer, 11 * sizeof(int));
            if (res != CUresult.CUDA_SUCCESS)
            {
                throw new Exception();
            }

            IntPtr[] x        = new IntPtr[] { dptr };
            GCHandle handle2  = GCHandle.Alloc(x, GCHandleType.Pinned);
            IntPtr   pointer2 = IntPtr.Zero;

            pointer2 = handle2.AddrOfPinnedObject();

            IntPtr[] kp = new IntPtr[] { pointer2 };
            fixed(IntPtr *kernelParams = kp)
            {
                res = Cuda.cuLaunchKernel(helloWorld,
                                          1, 1, 1,  // grid has one block.
                                          11, 1, 1, // block has 11 threads.
                                          0,        // no shared memory
                                          default(CUstream),
                                          (IntPtr)kernelParams,
                                          (IntPtr)IntPtr.Zero
                                          );
            }

            if (res != CUresult.CUDA_SUCCESS)
            {
                throw new Exception();
            }
            res = Cuda.cuMemcpyDtoH_v2(pointer, dptr, 11 * sizeof(int));
            for (int i = 0; i < 11; ++i)
            {
                System.Console.Write(Convert.ToChar(v[i]));
            }
            System.Console.WriteLine();
            if (res != CUresult.CUDA_SUCCESS)
            {
                throw new Exception();
            }
            Cuda.cuCtxDestroy_v2(cuContext);
        }