public static void Run(int size, double density) { try { Console.WriteLine("Running CUDA tests (Double) ... [N = {0}]", size); Console.WriteLine(); var A = Generate.Random(size, size, density); var B = Generate.RandomSymmetric(size, density, true); // Initialize CUDA device. Cuda.Initialize(); RunCudaTest(A, B); Console.WriteLine(); } catch (CudaException e) { Console.WriteLine(e.Result); } catch (Exception e) { Console.WriteLine(e.Message); } }
private static void SetBaseIndex(int base_index) { unsafe { IntPtr[] x1 = new IntPtr[] { new IntPtr(base_index) }; GCHandle handle2 = GCHandle.Alloc(x1, GCHandleType.Pinned); var parm1 = handle2.AddrOfPinnedObject(); IntPtr[] kp = new IntPtr[] { parm1 }; CUmodule module = RUNTIME.RuntimeModule; CudaHelpers.CheckCudaError(Cuda.cuModuleGetFunction(out CUfunction function, module, "_Z21set_kernel_base_indexi")); Campy.Utils.CudaHelpers.MakeLinearTiling(1, out Campy.Utils.CudaHelpers.dim3 tile_size, out Campy.Utils.CudaHelpers.dim3 tiles); CUresult res; fixed(IntPtr *kernelParams = kp) { res = Cuda.cuLaunchKernel( function, tiles.x, tiles.y, tiles.z, // grid has one block. tile_size.x, tile_size.y, tile_size.z, // n threads. 0, // no shared memory default(CUstream), (IntPtr)kernelParams, (IntPtr)IntPtr.Zero ); } Utils.CudaHelpers.CheckCudaError(res); res = Cuda.cuCtxSynchronize(); // Make sure it's copied back to host. Utils.CudaHelpers.CheckCudaError(res); } }
public Engine(Dictionary <RenderingPaneHwndDescription> renderingPaneHwndDescriptions) { Console.WriteLine("\nMojo initializing...\n"); D3D11.Initialize(out mDxgiFactory, out mD3D11Device); Cuda.Initialize(mD3D11Device); Thrust.Initialize(); Segmenter = new Segmenter { Interop = new Interop.Segmenter(mD3D11Device, mD3D11Device.ImmediateContext, Constants.Parameters) }; Viewers = new Dictionary <Viewer> { { "Segmenter2D", new Viewer { RenderingPane = new RenderingPane(mDxgiFactory, mD3D11Device, mD3D11Device.ImmediateContext, renderingPaneHwndDescriptions.Get("Segmenter2D").Handle, renderingPaneHwndDescriptions.Get("Segmenter2D").Width, renderingPaneHwndDescriptions.Get("Segmenter2D").Height, new Segmenter2D.RenderingStrategy(mD3D11Device, mD3D11Device.ImmediateContext, Segmenter)), UserInputHandler = new Segmenter2D.UserInputHandler(Segmenter) } } }; }
public static void CheckCudaError(Swigged.Cuda.CUresult res) { if (res != CUresult.CUDA_SUCCESS) { Cuda.cuGetErrorString(res, out IntPtr pStr); var cuda_error = Marshal.PtrToStringAnsi(pStr); throw new Exception("CUDA error: " + cuda_error); } }
public void Dispose() { Viewers.Internal.Values.ToList().ForEach(viewer => viewer.Dispose()); Segmenter.Dispose(); Thrust.Terminate(); Cuda.Terminate(); D3D11.Terminate(ref mDxgiFactory, ref mD3D11Device); Console.WriteLine("\nMojo terminating...\n"); }
/// <summary> /// Allocated a GPU managed buffer. /// Code based on https://www.codeproject.com/Articles/32125/Unmanaged-Arrays-in-C-No-Problem /// </summary> public IntPtr New(int bytes) { if (false) { // Let's try allocating a block of memory on the host. cuMemHostAlloc allocates bytesize // bytes of host memory that is page-locked and accessible to the device. // Note: cuMemHostAlloc and cuMemAllocHost seem to be almost identical except for the // third parameter to cuMemHostAlloc that is used for the type of memory allocation. var res = Cuda.cuMemHostAlloc(out IntPtr p, 10, (uint)Cuda.CU_MEMHOSTALLOC_DEVICEMAP); if (res == CUresult.CUDA_SUCCESS) { System.Console.WriteLine("Worked."); } else { System.Console.WriteLine("Did not work."); } } if (false) { // Allocate CPU memory, pin it, then register it with GPU. int f = new int(); GCHandle handle = GCHandle.Alloc(f, GCHandleType.Pinned); IntPtr pointer = (IntPtr)handle; var size = Marshal.SizeOf(f); var res = Cuda.cuMemHostRegister_v2(pointer, (uint)size, (uint)Cuda.CU_MEMHOSTALLOC_DEVICEMAP); if (res == CUresult.CUDA_SUCCESS) { System.Console.WriteLine("Worked."); } else { System.Console.WriteLine("Did not work."); } } { // Allocate Unified Memory. var size = bytes; var res = Cuda.cuMemAllocManaged(out IntPtr pointer, (uint)size, (uint)Swigged.Cuda.CUmemAttach_flags.CU_MEM_ATTACH_GLOBAL); if (res != CUresult.CUDA_SUCCESS) { throw new Exception("cuMemAllocManged failed."); } return(pointer); } if (false) { return(Marshal.AllocHGlobal(bytes)); } }
protected override void Factorize(int rows, int columns, int nnz, CuSparseContext <Complex> A) { var ap = A.ColumnPointers; var ai = A.RowIndices; var ax = A.Values; var desc = A.MatrixDescriptor; // Analyze chol(A) to know structure of L. Check(NativeMethods.cusolverSpXcsrcholAnalysis(_p, rows, nnz, desc, ap, ai, _info)); int size_internal = 0, size_chol = 0; // Workspace for chol(A). Check(NativeMethods.cusolverSpZcsrcholBufferInfo(_p, rows, nnz, desc, ax, ap, ai, _info, ref size_internal, ref size_chol)); Cuda.Malloc(ref _buffer, sizeof(char) * size_chol); // Compute A = L*L^T. Check(NativeMethods.cusolverSpZcsrcholFactor(_p, rows, nnz, desc, ax, ap, ai, _info, _buffer)); }
protected override void Factorize(int rows, int columns, int nnz, CuSparseContext <double> A) { var ap = A.ColumnPointers; var ai = A.RowIndices; var ax = A.Values; var desc = A.MatrixDescriptor; // Analyze qr(A) to know structure of L. Check(NativeMethods.cusolverSpXcsrqrAnalysis(_p, rows, columns, nnz, desc, ap, ai, _info)); int size_internal = 0, size_qr = 0; // Workspace for qr(A). Check(NativeMethods.cusolverSpDcsrqrBufferInfo(_p, rows, columns, nnz, desc, ax, ap, ai, _info, ref size_internal, ref size_qr)); Cuda.Malloc(ref _buffer, sizeof(char) * size_qr); Check(NativeMethods.cusolverSpDcsrqrSetup(_p, rows, columns, nnz, desc, ax, ap, ai, 0.0, _info)); // Compute A = Q*R. Check(NativeMethods.cusolverSpDcsrqrFactor(_p, rows, columns, nnz, IntPtr.Zero, IntPtr.Zero, _info, _buffer)); }
public static void For(int number_of_threads, SimpleKernel simpleKernel) { if (Campy.Utils.Options.IsOn("import-only")) { JustImport(simpleKernel); return; } GCHandle handle1 = default(GCHandle); GCHandle handle2 = default(GCHandle); try { unsafe { System.Reflection.MethodInfo method_info = simpleKernel.Method; String kernel_assembly_file_name = method_info.DeclaringType.Assembly.Location; Mono.Cecil.ModuleDefinition md = Campy.Meta.StickyReadMod.StickyReadModule( kernel_assembly_file_name, new ReaderParameters { ReadSymbols = true }); MethodReference method_reference = md.ImportReference(method_info); CUfunction ptr_to_kernel = default(CUfunction); CUmodule module = default(CUmodule); Campy.Utils.TimePhase.Time("compile ", () => { IntPtr image = Singleton._compiler.Compile(method_reference, simpleKernel.Target); module = Singleton._compiler.SetModule(method_reference, image); Singleton._compiler.StoreJits(module); ptr_to_kernel = Singleton._compiler.GetCudaFunction(method_reference, module); }); RUNTIME.BclCheckHeap(); BUFFERS buffer = Singleton.Buffer; IntPtr kernel_target_object = IntPtr.Zero; Campy.Utils.TimePhase.Time("deep copy ", () => { int count = simpleKernel.Method.GetParameters().Length; var bb = Singleton._compiler.GetBasicBlock(method_reference); if (bb.HasThis) { count++; } if (!(count == 1 || count == 2)) { throw new Exception("Expecting at least one parameter for kernel."); } if (bb.HasThis) { kernel_target_object = buffer.AddDataStructure(simpleKernel.Target); } }); Campy.Utils.TimePhase.Time("kernel cctor set up", () => { // For each cctor, run on GPU. // Construct dependency graph of methods. List <MethodReference> order_list = COMPILER.Singleton.ConstructCctorOrder(); // Finally, call cctors. foreach (var bb in order_list) { if (Campy.Utils.Options.IsOn("trace-cctors")) { System.Console.WriteLine("Executing cctor " + bb.FullName); } var cctor = Singleton._compiler.GetCudaFunction(bb, module); var res = CUresult.CUDA_SUCCESS; Campy.Utils.CudaHelpers.MakeLinearTiling(1, out Campy.Utils.CudaHelpers.dim3 tile_size, out Campy.Utils.CudaHelpers.dim3 tiles); res = Cuda.cuLaunchKernel( cctor, tiles.x, tiles.y, tiles.z, // grid has one block. tile_size.x, tile_size.y, tile_size.z, // n threads. 0, // no shared memory default(CUstream), (IntPtr)IntPtr.Zero, (IntPtr)IntPtr.Zero ); CudaHelpers.CheckCudaError(res); res = Cuda.cuCtxSynchronize(); // Make sure it's copied back to host. CudaHelpers.CheckCudaError(res); } }); if (Campy.Utils.Options.IsOn("trace-cctors")) { System.Console.WriteLine("Done with cctors"); } Campy.Utils.TimePhase.Time("kernel call ", () => { IntPtr[] parm1 = new IntPtr[1]; IntPtr[] parm2 = new IntPtr[1]; parm1[0] = kernel_target_object; parm2[0] = buffer.New(BUFFERS.SizeOf(typeof(int))); IntPtr[] x1 = parm1; handle1 = GCHandle.Alloc(x1, GCHandleType.Pinned); IntPtr pointer1 = handle1.AddrOfPinnedObject(); IntPtr[] x2 = parm2; handle2 = GCHandle.Alloc(x2, GCHandleType.Pinned); IntPtr pointer2 = handle2.AddrOfPinnedObject(); IntPtr[] kp = new IntPtr[] { pointer1, pointer2 }; var res = CUresult.CUDA_SUCCESS; fixed(IntPtr * kernelParams = kp) { Campy.Utils.CudaHelpers.MakeLinearTiling(number_of_threads, out Campy.Utils.CudaHelpers.dim3 tile_size, out Campy.Utils.CudaHelpers.dim3 tiles); //MakeLinearTiling(1, out dim3 tile_size, out dim3 tiles); res = Cuda.cuLaunchKernel( ptr_to_kernel, tiles.x, tiles.y, tiles.z, // grid has one block. tile_size.x, tile_size.y, tile_size.z, // n threads. 0, // no shared memory default(CUstream), (IntPtr)kernelParams, (IntPtr)IntPtr.Zero ); }
public static void For(int number_of_threads, SimpleKernel simpleKernel) { GCHandle handle1 = default(GCHandle); GCHandle handle2 = default(GCHandle); try { unsafe { //////// COMPILE KERNEL INTO GPU CODE /////// ///////////////////////////////////////////// var stopwatch_cuda_compile = new Stopwatch(); stopwatch_cuda_compile.Start(); IntPtr image = Singleton()._converter.Compile(simpleKernel.Method, simpleKernel.Target); CUfunction ptr_to_kernel = Singleton()._converter.GetCudaFunction(simpleKernel.Method, image); var elapse_cuda_compile = stopwatch_cuda_compile.Elapsed; RUNTIME.CheckHeap(); //////// COPY DATA INTO GPU ///////////////// ///////////////////////////////////////////// var stopwatch_deep_copy_to = new Stopwatch(); stopwatch_deep_copy_to.Reset(); stopwatch_deep_copy_to.Start(); BUFFERS buffer = Singleton().Buffer; // Set up parameters. int count = simpleKernel.Method.GetParameters().Length; var bb = Singleton()._converter.GetBasicBlock(simpleKernel.Method); if (bb.HasThis) { count++; } if (!(count == 1 || count == 2)) { throw new Exception("Expecting at least one parameter for kernel."); } IntPtr[] parm1 = new IntPtr[1]; IntPtr[] parm2 = new IntPtr[1]; IntPtr ptr = IntPtr.Zero; // The method really should have a "this" because it's a closure // object. if (bb.HasThis) { RUNTIME.CheckHeap(); ptr = buffer.AddDataStructure(simpleKernel.Target); parm1[0] = ptr; } { Type btype = typeof(int); var s = BUFFERS.SizeOf(btype); var ptr2 = buffer.New(s); // buffer.DeepCopyToImplementation(index, ptr2); parm2[0] = ptr2; } stopwatch_deep_copy_to.Start(); var elapse_deep_copy_to = stopwatch_cuda_compile.Elapsed; var stopwatch_call_kernel = new Stopwatch(); stopwatch_call_kernel.Reset(); stopwatch_call_kernel.Start(); IntPtr[] x1 = parm1; handle1 = GCHandle.Alloc(x1, GCHandleType.Pinned); IntPtr pointer1 = handle1.AddrOfPinnedObject(); IntPtr[] x2 = parm2; handle2 = GCHandle.Alloc(x2, GCHandleType.Pinned); IntPtr pointer2 = handle2.AddrOfPinnedObject(); RUNTIME.CheckHeap(); IntPtr[] kp = new IntPtr[] { pointer1, pointer2 }; var res = CUresult.CUDA_SUCCESS; fixed(IntPtr *kernelParams = kp) { Campy.Utils.CudaHelpers.MakeLinearTiling(number_of_threads, out Campy.Utils.CudaHelpers.dim3 tile_size, out Campy.Utils.CudaHelpers.dim3 tiles); //MakeLinearTiling(1, out dim3 tile_size, out dim3 tiles); res = Cuda.cuLaunchKernel( ptr_to_kernel, tiles.x, tiles.y, tiles.z, // grid has one block. tile_size.x, tile_size.y, tile_size.z, // n threads. 0, // no shared memory default(CUstream), (IntPtr)kernelParams, (IntPtr)IntPtr.Zero ); } CudaHelpers.CheckCudaError(res); res = Cuda.cuCtxSynchronize(); // Make sure it's copied back to host. CudaHelpers.CheckCudaError(res); stopwatch_call_kernel.Stop(); var elapse_call_kernel = stopwatch_call_kernel.Elapsed; if (Campy.Utils.Options.IsOn("jit_trace")) { System.Console.WriteLine("cuda compile " + elapse_cuda_compile); System.Console.WriteLine("deep copy in " + elapse_deep_copy_to); System.Console.WriteLine("cuda kernel " + elapse_call_kernel); } { var stopwatch_deep_copy_back = new Stopwatch(); stopwatch_deep_copy_back.Reset(); RUNTIME.CheckHeap(); stopwatch_deep_copy_back.Start(); buffer.SynchDataStructures(); stopwatch_deep_copy_back.Stop(); RUNTIME.CheckHeap(); var elapse_deep_copy_back = stopwatch_deep_copy_back.Elapsed; if (Campy.Utils.Options.IsOn("jit_trace")) { System.Console.WriteLine("deep copy out " + elapse_deep_copy_back); } } } } catch (Exception e) { Console.WriteLine(e); throw e; } finally { if (default(GCHandle) != handle1) { handle1.Free(); } if (default(GCHandle) != handle2) { handle2.Free(); } } }
static unsafe void Part1() { Cuda.cuInit(0); var res = Cuda.cuDeviceGet(out int device, 0); if (res != CUresult.CUDA_SUCCESS) { throw new Exception(); } res = Cuda.cuDeviceGetPCIBusId(out string pciBusId, 100, device); if (res != CUresult.CUDA_SUCCESS) { throw new Exception(); } res = Cuda.cuDeviceGetName(out string name, 100, device); if (res != CUresult.CUDA_SUCCESS) { throw new Exception(); } res = Cuda.cuCtxCreate_v2(out CUcontext cuContext, 0, device); if (res != CUresult.CUDA_SUCCESS) { throw new Exception(); } string assembly_directory = AssemblyDirectory; StreamReader sr = new StreamReader(assembly_directory + @"/../../Project2/x64/Debug/stuff.ptx"); string kernel = sr.ReadToEnd(); IntPtr ptr = Marshal.StringToHGlobalAnsi(kernel); int[] option_values = new int[] { }; GCHandle option_values_handle = GCHandle.Alloc(option_values, GCHandleType.Pinned); IntPtr options_values_ptr = option_values_handle.AddrOfPinnedObject(); CUjit_option[] options = new CUjit_option[] { }; res = Cuda.cuModuleLoadDataEx(out CUmodule cuModule, ptr, (uint)0, options, options_values_ptr); if (res != CUresult.CUDA_SUCCESS) { throw new Exception(); } res = Cuda.cuModuleGetFunction(out CUfunction helloWorld, cuModule, "_Z5helloPc"); if (res != CUresult.CUDA_SUCCESS) { throw new Exception(); } byte[] v = { (byte)'G', (byte)'d', (byte)'k', (byte)'k', (byte)'n', 31, (byte)'v', (byte)'n', (byte)'q', (byte)'k', (byte)'c', 0 }; GCHandle handle = GCHandle.Alloc(v, GCHandleType.Pinned); IntPtr pointer = IntPtr.Zero; pointer = handle.AddrOfPinnedObject(); res = Cuda.cuMemAlloc_v2(out IntPtr dptr, 11 * sizeof(byte)); if (res != CUresult.CUDA_SUCCESS) { throw new Exception(); } res = Cuda.cuMemcpyHtoD_v2(dptr, pointer, 11 * sizeof(byte)); if (res != CUresult.CUDA_SUCCESS) { throw new Exception(); } IntPtr[] x = new IntPtr[] { dptr }; GCHandle handle2 = GCHandle.Alloc(x, GCHandleType.Pinned); IntPtr pointer2 = IntPtr.Zero; pointer2 = handle2.AddrOfPinnedObject(); IntPtr[] kp = new IntPtr[] { pointer2 }; fixed(IntPtr *kernelParams = kp) { res = Cuda.cuLaunchKernel(helloWorld, 1, 1, 1, // grid has one block. 11, 1, 1, // block has 11 threads. 0, // no shared memory default(CUstream), (IntPtr)kernelParams, (IntPtr)IntPtr.Zero ); } if (res != CUresult.CUDA_SUCCESS) { throw new Exception(); } res = Cuda.cuMemcpyDtoH_v2(pointer, dptr, 11 * sizeof(byte)); if (res != CUresult.CUDA_SUCCESS) { throw new Exception(); } for (int i = 0; i < 11; ++i) { System.Console.Write((char)v[i]); } System.Console.WriteLine(); Cuda.cuCtxDestroy_v2(cuContext); }
static unsafe void Part2() { Cuda.cuInit(0); var res = Cuda.cuDeviceGet(out int device, 0); if (res != CUresult.CUDA_SUCCESS) { throw new Exception(); } res = Cuda.cuDeviceGetPCIBusId(out string pciBusId, 100, device); if (res != CUresult.CUDA_SUCCESS) { throw new Exception(); } res = Cuda.cuDeviceGetName(out string name, 100, device); if (res != CUresult.CUDA_SUCCESS) { throw new Exception(); } res = Cuda.cuCtxCreate_v2(out CUcontext cuContext, 0, device); if (res != CUresult.CUDA_SUCCESS) { throw new Exception(); } string assembly_directory = AssemblyDirectory; FileStream fs = new FileStream(assembly_directory + @"/../../Project2/x64/Debug/stuff.cu.obj", FileMode.Open); var len = fs.Length; var gpu_obj = new byte[len]; fs.Read(gpu_obj, 0, (int)len); uint num_ops_link = 5; var op_link = new CUjit_option[num_ops_link]; ulong[] op_values_link = new ulong[num_ops_link]; int size = 1024 * 100; op_link[0] = CUjit_option.CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES; op_values_link[0] = (ulong)size; op_link[1] = CUjit_option.CU_JIT_INFO_LOG_BUFFER; byte[] info_log_buffer = new byte[size]; var info_log_buffer_handle = GCHandle.Alloc(info_log_buffer, GCHandleType.Pinned); var info_log_buffer_intptr = info_log_buffer_handle.AddrOfPinnedObject(); op_values_link[1] = (ulong)info_log_buffer_intptr; op_link[2] = CUjit_option.CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES; op_values_link[2] = (ulong)size; op_link[3] = CUjit_option.CU_JIT_ERROR_LOG_BUFFER; byte[] error_log_buffer = new byte[size]; var error_log_buffer_handle = GCHandle.Alloc(error_log_buffer, GCHandleType.Pinned); var error_log_buffer_intptr = error_log_buffer_handle.AddrOfPinnedObject(); op_values_link[3] = (ulong)error_log_buffer_intptr; op_link[4] = CUjit_option.CU_JIT_LOG_VERBOSE; op_values_link[4] = (ulong)1; var op_values_link_handle = GCHandle.Alloc(op_values_link, GCHandleType.Pinned); var op_values_link_intptr = op_values_link_handle.AddrOfPinnedObject(); res = Cuda.cuLinkCreate_v2(num_ops_link, op_link, op_values_link_intptr, out CUlinkState linkState); { string info = Marshal.PtrToStringAnsi(info_log_buffer_intptr); System.Console.WriteLine(info); string error = Marshal.PtrToStringAnsi(error_log_buffer_intptr); System.Console.WriteLine(error); } uint num_ops = 0; CUjit_option[] op = new CUjit_option[0]; ulong[] op_values = new ulong[0]; var op_values_handle = GCHandle.Alloc(op_values, GCHandleType.Pinned); var op_values_intptr = op_values_handle.AddrOfPinnedObject(); var kernel_handle = GCHandle.Alloc(gpu_obj, GCHandleType.Pinned); IntPtr gpu_bcl_obj_intptr = kernel_handle.AddrOfPinnedObject(); res = Cuda.cuLinkAddData_v2(linkState, CUjitInputType.CU_JIT_INPUT_OBJECT, gpu_bcl_obj_intptr, (uint)len, "", num_ops, op, op_values_intptr); { string info = Marshal.PtrToStringAnsi(info_log_buffer_intptr); System.Console.WriteLine(info); string error = Marshal.PtrToStringAnsi(error_log_buffer_intptr); System.Console.WriteLine(error); } IntPtr image; res = Cuda.cuLinkComplete(linkState, out image, out ulong sz); res = Cuda.cuModuleLoadDataEx(out CUmodule module, image, 0, op, op_values_link_intptr); res = Cuda.cuModuleGetFunction(out CUfunction helloWorld, module, "_Z5helloPc"); if (res != CUresult.CUDA_SUCCESS) { throw new Exception(); } byte[] v = { (byte)'G', (byte)'d', (byte)'k', (byte)'k', (byte)'n', 31, (byte)'v', (byte)'n', (byte)'q', (byte)'k', (byte)'c', 0 }; GCHandle handle = GCHandle.Alloc(v, GCHandleType.Pinned); IntPtr pointer = IntPtr.Zero; pointer = handle.AddrOfPinnedObject(); res = Cuda.cuMemAlloc_v2(out IntPtr dptr, 11 * sizeof(byte)); if (res != CUresult.CUDA_SUCCESS) { throw new Exception(); } res = Cuda.cuMemcpyHtoD_v2(dptr, pointer, 11 * sizeof(byte)); if (res != CUresult.CUDA_SUCCESS) { throw new Exception(); } IntPtr[] x = new IntPtr[] { dptr }; GCHandle handle2 = GCHandle.Alloc(x, GCHandleType.Pinned); IntPtr pointer2 = IntPtr.Zero; pointer2 = handle2.AddrOfPinnedObject(); IntPtr[] kp = new IntPtr[] { pointer2 }; fixed(IntPtr *kernelParams = kp) { res = Cuda.cuLaunchKernel(helloWorld, 1, 1, 1, // grid has one block. 11, 1, 1, // block has 11 threads. 0, // no shared memory default(CUstream), (IntPtr)kernelParams, (IntPtr)IntPtr.Zero ); } if (res != CUresult.CUDA_SUCCESS) { throw new Exception(); } res = Cuda.cuMemcpyDtoH_v2(pointer, dptr, 11 * sizeof(byte)); if (res != CUresult.CUDA_SUCCESS) { throw new Exception(); } for (int i = 0; i < 11; ++i) { System.Console.Write((char)v[i]); } System.Console.WriteLine(); Cuda.cuCtxDestroy_v2(cuContext); }
static unsafe void Main(string[] args) { System.Console.WriteLine("1"); Cuda.cuInit(0); System.Console.WriteLine("2"); // Device api. var res = Cuda.cuDeviceGet(out int device, 0); if (res != CUresult.CUDA_SUCCESS) { throw new Exception(); } res = Cuda.cuDeviceGetPCIBusId(out string pciBusId, 100, device); if (res != CUresult.CUDA_SUCCESS) { throw new Exception(); } res = Cuda.cuDeviceGetName(out string name, 100, device); if (res != CUresult.CUDA_SUCCESS) { throw new Exception(); } System.Console.WriteLine("3"); res = Cuda.cuCtxCreate_v2(out CUcontext cuContext, 0, device); if (res != CUresult.CUDA_SUCCESS) { throw new Exception(); } string cu_kernel = @" #include <stdio.h> __global__ void kern(int * ar) { int i = threadIdx.x; if (i < 11) ar[i] = ar[i] + 1; } "; string compile_string = @" nvcc --ptx --gpu-architecture=sm_20 -ccbin ""C:\Program Files(x86)\Microsoft Visual Studio 14.0\VC\bin"" y.cu"; string kernel = @" // // Generated by NVIDIA NVVM Compiler // // Compiler Build ID: CL-21373419 // Cuda compilation tools, release 8.0, V8.0.55 // Based on LLVM 3.4svn // .version 5.0 .target sm_20 .address_size 64 // .globl _Z4kernPi .visible .entry _Z4kernPi( .param .u64 _Z4kernPi_param_0 ) { .reg .pred %p<2>; .reg .b32 %r<4>; .reg .b64 %rd<5>; ld.param.u64 %rd1, [_Z4kernPi_param_0]; mov.u32 %r1, %tid.x; setp.gt.s32 %p1, %r1, 10; @%p1 bra BB0_2; cvta.to.global.u64 %rd2, %rd1; mul.wide.s32 %rd3, %r1, 4; add.s64 %rd4, %rd2, %rd3; ld.global.u32 %r2, [%rd4]; add.s32 %r3, %r2, 1; st.global.u32 [%rd4], %r3; BB0_2: ret; } "; IntPtr ptr = Marshal.StringToHGlobalAnsi(kernel); res = Cuda.cuModuleLoadData(out CUmodule cuModule, ptr); if (res != CUresult.CUDA_SUCCESS) { throw new Exception(); } res = Cuda.cuModuleGetFunction(out CUfunction helloWorld, cuModule, "_Z4kernPi"); if (res != CUresult.CUDA_SUCCESS) { throw new Exception(); } int[] v = { 'G', 'd', 'k', 'k', 'n', (char)31, 'v', 'n', 'q', 'k', 'c' }; GCHandle handle = GCHandle.Alloc(v, GCHandleType.Pinned); IntPtr pointer = IntPtr.Zero; pointer = handle.AddrOfPinnedObject(); res = Cuda.cuMemAlloc_v2(out IntPtr dptr, 11 * sizeof(int)); if (res != CUresult.CUDA_SUCCESS) { throw new Exception(); } res = Cuda.cuMemcpyHtoD_v2(dptr, pointer, 11 * sizeof(int)); if (res != CUresult.CUDA_SUCCESS) { throw new Exception(); } IntPtr[] x = new IntPtr[] { dptr }; GCHandle handle2 = GCHandle.Alloc(x, GCHandleType.Pinned); IntPtr pointer2 = IntPtr.Zero; pointer2 = handle2.AddrOfPinnedObject(); IntPtr[] kp = new IntPtr[] { pointer2 }; fixed(IntPtr *kernelParams = kp) { res = Cuda.cuLaunchKernel(helloWorld, 1, 1, 1, // grid has one block. 11, 1, 1, // block has 11 threads. 0, // no shared memory default(CUstream), (IntPtr)kernelParams, (IntPtr)IntPtr.Zero ); } if (res != CUresult.CUDA_SUCCESS) { throw new Exception(); } res = Cuda.cuMemcpyDtoH_v2(pointer, dptr, 11 * sizeof(int)); for (int i = 0; i < 11; ++i) { System.Console.Write(Convert.ToChar(v[i])); } System.Console.WriteLine(); if (res != CUresult.CUDA_SUCCESS) { throw new Exception(); } Cuda.cuCtxDestroy_v2(cuContext); }
public Buffers() { _asm = new Asm(); Cuda.cuInit(0); Cuda.cuCtxCreate_v2(out _pctx, (uint)Swigged.Cuda.CUctx_flags.CU_CTX_MAP_HOST, 0); }