public static void cuParamSetv(CUfunction hfunc, int offset, Object obj) { Wrap(() => { try { obj.AssertNotNull().GetType().IsValueType.AssertTrue(); var size = Marshal.SizeOf(obj); var ptr = Marshal.AllocHGlobal(size); try { Marshal.StructureToPtr(obj, ptr, false); var error = nativeParamSetv(hfunc, offset, ptr, (uint)size); if (error != CUresult.CUDA_SUCCESS) throw new CudaException(error); } finally { Marshal.FreeHGlobal(ptr); } } catch (CudaException) { throw; } catch (DllNotFoundException dnfe) { throw new CudaException(CudaError.NoDriver, dnfe); } catch (Exception e) { throw new CudaException(CudaError.Unknown, e); } }); }
public JittedFunction(CUfunction handle, String name) { CudaDriver.Ensure(); Handle = handle.AssertThat(h => h.IsNotNull); Name = name ?? "N/A"; MaxThreadsPerBlock = nvcuda.cuFuncGetAttribute(CUfunction_attribute.MaxThreadsPerBlock, this); SharedSizeBytes = nvcuda.cuFuncGetAttribute(CUfunction_attribute.SharedSizeBytes, this); ConstSizeBytes = nvcuda.cuFuncGetAttribute(CUfunction_attribute.ConstSizeBytes, this); LocalSizeBytes = nvcuda.cuFuncGetAttribute(CUfunction_attribute.LocalSizeBytes, this); NumRegs = nvcuda.cuFuncGetAttribute(CUfunction_attribute.NumRegs, this); PtxVersion = (HardwareIsa)nvcuda.cuFuncGetAttribute(CUfunction_attribute.PtxVersion, this); BinaryVersion = (HardwareIsa)nvcuda.cuFuncGetAttribute(CUfunction_attribute.BinaryVersion, this); }
public static void cuParamSetf(CUfunction hfunc, int offset, float value) { Wrap(() => { try { var error = nativeParamSetf(hfunc, offset, value); if (error != CUresult.CUDA_SUCCESS) throw new CudaException(error); } catch (CudaException) { throw; } catch (DllNotFoundException dnfe) { throw new CudaException(CudaError.NoDriver, dnfe); } catch (Exception e) { throw new CudaException(CudaError.Unknown, e); } }); }
public static void cuLaunchGrid(CUfunction f, dim3 dim) { // wow here we ain't able to specify the Z dimension if (dim.Z != 1) throw new CudaException(CudaError.InvalidGridDim); cuLaunchGrid(f, dim.X, dim.Y); }
public static void cuLaunchGrid(CUfunction f, int grid_width, int grid_height) { Wrap(() => { try { // if we don't verify this here, we'll get a strange message from the driver var caps = CudaDevice.Current.Caps.GridCaps; var valid_grid_dim = caps.MaxGridDim >= new dim3(grid_width, grid_height, 1); if (!valid_grid_dim) throw new CudaException(CudaError.InvalidGridDim); var error = nativeLaunchGrid(f, grid_width, grid_height); if (error != CUresult.CUDA_SUCCESS) throw new CudaException(error); } catch (CudaException) { throw; } catch (DllNotFoundException dnfe) { throw new CudaException(CudaError.NoDriver, dnfe); } catch (Exception e) { throw new CudaException(CudaError.Unknown, e); } }); }
// http://developer.download.nvidia.com/compute/cuda/3_1/toolkit/docs/online/group__CUEXEC_g1a99f308b2f1655df734eb62452fafd3.html private static extern CUresult nativeLaunchGrid(CUfunction f, int grid_width, int grid_height);
public static void cuFuncSetBlockShape(CUfunction hfunc, dim3 dim) { cuFuncSetBlockShape(hfunc, dim.X, dim.Y, dim.Z); }
public static void cuFuncSetBlockShape(CUfunction hfunc, int x, int y, int z) { Wrap(() => { try { // if we don't verify this here, we'll get a strange message from the driver var caps = CudaDevice.Current.Caps.GridCaps; var valid_block_dim = caps.MaxBlockDim >= new dim3(x, y, z); if (!valid_block_dim) throw new CudaException(CudaError.InvalidBlockDim); var error = nativeFuncSetBlockShape(hfunc, x, y, z); if (error != CUresult.CUDA_SUCCESS) throw new CudaException(error); } catch (CudaException) { throw; } catch (DllNotFoundException dnfe) { throw new CudaException(CudaError.NoDriver, dnfe); } catch (Exception e) { throw new CudaException(CudaError.Unknown, e); } }); }
public CudaProvider(string cudaKernelPath, bool stochastic, int memoryCacheSize) { _stochastic = stochastic; _cache = new DeviceMemory(memoryCacheSize); _cuda = new CudaContext(); _kernel = new KernelModule(_cuda, cudaKernelPath); _blas = new CudaBlas(AtomicsMode.Allowed); _cuda.SetCurrent(); _pointwiseMultiply = _kernel.LoadFunction("PointwiseMultiply"); _addInPlace = _kernel.LoadFunction("AddInPlace"); _subtractInPlace = _kernel.LoadFunction("SubtractInPlace"); _addToEachRow = _kernel.LoadFunction("AddToEachRow"); _addToEachColumn = _kernel.LoadFunction("AddToEachColumn"); _tanh = _kernel.LoadFunction("TanH"); _tanhDerivative = _kernel.LoadFunction("TanHDerivative"); _sigmoid = _kernel.LoadFunction("Sigmoid"); _sigmoidDerivative = _kernel.LoadFunction("SigmoidDerivative"); _sumRows = _kernel.LoadFunction("SumRows"); _relu = _kernel.LoadFunction("RELU"); _reluDerivative = _kernel.LoadFunction("RELUDerivative"); _memClear = _kernel.LoadFunction("MemClear"); _sumColumns = _kernel.LoadFunction("SumColumns"); _pointwiseDivide = _kernel.LoadFunction("PointwiseDivide"); _sqrt = _kernel.LoadFunction("Sqrt"); _findMinAndMax = _kernel.LoadFunction("FindMinAndMax"); _findSum = _kernel.LoadFunction("FindSum"); _findStdDev = _kernel.LoadFunction("FindStdDev"); _constrain = _kernel.LoadFunction("Constrain"); _pow = _kernel.LoadFunction("Pow"); _diagonal = _kernel.LoadFunction("Diagonal"); _l1Regularisation = _kernel.LoadFunction("L1Regularisation"); _leakyRelu = _kernel.LoadFunction("LeakyRELU"); _leakyReluDerivative = _kernel.LoadFunction("LeakyRELUDerivative"); _pointwiseDivideRows = _kernel.LoadFunction("PointwiseDivideRows"); _pointwiseDivideColumns = _kernel.LoadFunction("PointwiseDivideColumns"); _splitRows = _kernel.LoadFunction("SplitRows"); _splitColumns = _kernel.LoadFunction("SplitColumns"); _concatRows = _kernel.LoadFunction("ConcatRows"); _concatColumns = _kernel.LoadFunction("ConcatColumns"); _euclideanDistance = _kernel.LoadFunction("EuclideanDistance"); _manhattanDistance = _kernel.LoadFunction("ManhattanDistance"); _abs = _kernel.LoadFunction("Abs"); _normalise = _kernel.LoadFunction("Normalise"); _softmaxVector = _kernel.LoadFunction("SoftmaxVector"); _multiEuclidean = _kernel.LoadFunction("MultiEuclideanDistance"); _multiManhattan = _kernel.LoadFunction("MultiManhattanDistance"); _log = _kernel.LoadFunction("Log"); _vectorAdd = _kernel.LoadFunction("VectorAdd"); _vectorCopyRandom = _kernel.LoadFunction("VectorCopyRandom"); _copyToMatrix = _kernel.LoadFunction("CopyToMatrix"); _vectorSplit = _kernel.LoadFunction("VectorSplit"); _tensorConvertToVector = _kernel.LoadFunction("TensorConvertToVector"); _tensorConvertToMatrix = _kernel.LoadFunction("TensorConvertToMatrix"); _tensorAddPadding = _kernel.LoadFunction("TensorAddPadding"); _tensorRemovePadding = _kernel.LoadFunction("TensorRemovePadding"); _tensorIm2Col = _kernel.LoadFunction("TensorIm2Col"); _softmaxDerivative = _kernel.LoadFunction("SoftmaxDerivative"); _reverse = _kernel.LoadFunction("Reverse"); _rotate = _kernel.LoadFunction("Rotate"); _tensorMaxPool = _kernel.LoadFunction("TensorMaxPool"); _tensorReverseMaxPool = _kernel.LoadFunction("TensorReverseMaxPool"); _tensorReverseIm2Col = _kernel.LoadFunction("TensorReverseIm2Col"); }
// http://developer.download.nvidia.com/compute/cuda/3_1/toolkit/docs/online/group__CUEXEC_gf4466f8fe7043de8c97137990bb2e1e9.html private static extern CUresult nativeParamSetv(CUfunction hfunc, int offset, IntPtr ptr, uint numbytes);
// http://developer.download.nvidia.com/compute/cuda/3_1/toolkit/docs/online/group__CUEXEC_g91d75e10ed90df3fd3ecf2488f2cb27f.html private static extern CUresult nativeFuncSetCacheConfig(CUfunction hfunc, CUfunc_cache config);
public static void cuFuncSetSharedSize(CUfunction hfunc, uint bytes) { Wrap(() => { try { var error = nativeFuncSetSharedSize(hfunc, bytes); if (error != CUresult.CUDA_SUCCESS) throw new CudaException(error); } catch (CudaException) { throw; } catch (DllNotFoundException dnfe) { throw new CudaException(CudaError.NoDriver, dnfe); } catch (Exception e) { throw new CudaException(CudaError.Unknown, e); } }); }
// http://developer.download.nvidia.com/compute/cuda/3_1/toolkit/docs/online/group__CUEXEC_g65bc1dcf1127400d8f7ff935edbd333c.html private static extern CUresult nativeFuncSetSharedSize(CUfunction hfunc, uint bytes);
public static int cuFuncGetAttribute(CUfunction_attribute attrib, CUfunction hfunc) { return Wrap(() => { try { int i; var error = nativeFuncGetAttribute(out i, attrib, hfunc); if (error != CUresult.CUDA_SUCCESS) throw new CudaException(error); return i; } catch (CudaException) { throw; } catch (DllNotFoundException dnfe) { throw new CudaException(CudaError.NoDriver, dnfe); } catch (Exception e) { throw new CudaException(CudaError.Unknown, e); } }); }
// http://developer.download.nvidia.com/compute/cuda/3_1/toolkit/docs/online/group__CUEXEC_g7bde0e4a4ce32ce7460348e01a91f45f.html private static extern CUresult nativeFuncGetAttribute(out int pi, CUfunction_attribute attrib, CUfunction hfunc);
// http://developer.download.nvidia.com/compute/cuda/3_1/toolkit/docs/online/group__CUEXEC_g1946eac8c4d9d74f6aba01b0aab2c3dd.html private static extern CUresult nativeParamSetSize(CUfunction hfunc, uint numbytes);
// http://developer.download.nvidia.com/compute/cuda/3_1/toolkit/docs/online/group__CUMODULE_ge18a9f0d853ae3a96a38416a0671606b.html private static extern CUresult nativeModuleGetFunction(out CUfunction hfunc, CUmodule hmod, String name);
// http://developer.download.nvidia.com/compute/cuda/3_1/toolkit/docs/online/group__CUEXEC_g27981d94ac79ac7ebe8590a858785b3b.html private static extern CUresult nativeParamSetf(CUfunction hfunc, int offset, float value);
public JittedFunction(CUfunction handle) : this(handle, null) { }
public static void For(int number_of_threads, SimpleKernel simpleKernel) { if (Campy.Utils.Options.IsOn("import-only")) { JustImport(simpleKernel); return; } GCHandle handle1 = default(GCHandle); GCHandle handle2 = default(GCHandle); try { unsafe { System.Reflection.MethodInfo method_info = simpleKernel.Method; String kernel_assembly_file_name = method_info.DeclaringType.Assembly.Location; Mono.Cecil.ModuleDefinition md = Campy.Meta.StickyReadMod.StickyReadModule( kernel_assembly_file_name, new ReaderParameters { ReadSymbols = true }); MethodReference method_reference = md.ImportReference(method_info); CUfunction ptr_to_kernel = default(CUfunction); CUmodule module = default(CUmodule); Campy.Utils.TimePhase.Time("compile ", () => { IntPtr image = Singleton._compiler.Compile(method_reference, simpleKernel.Target); module = Singleton._compiler.SetModule(method_reference, image); Singleton._compiler.StoreJits(module); ptr_to_kernel = Singleton._compiler.GetCudaFunction(method_reference, module); }); RUNTIME.BclCheckHeap(); BUFFERS buffer = Singleton.Buffer; IntPtr kernel_target_object = IntPtr.Zero; Campy.Utils.TimePhase.Time("deep copy ", () => { int count = simpleKernel.Method.GetParameters().Length; var bb = Singleton._compiler.GetBasicBlock(method_reference); if (bb.HasThis) { count++; } if (!(count == 1 || count == 2)) { throw new Exception("Expecting at least one parameter for kernel."); } if (bb.HasThis) { kernel_target_object = buffer.AddDataStructure(simpleKernel.Target); } }); Campy.Utils.TimePhase.Time("kernel cctor set up", () => { // For each cctor, run on GPU. // Construct dependency graph of methods. List <MethodReference> order_list = COMPILER.Singleton.ConstructCctorOrder(); // Finally, call cctors. foreach (var bb in order_list) { if (Campy.Utils.Options.IsOn("trace-cctors")) { System.Console.WriteLine("Executing cctor " + bb.FullName); } var cctor = Singleton._compiler.GetCudaFunction(bb, module); var res = new CUresult(cudaError_enum.CUDA_SUCCESS); Campy.Utils.CudaHelpers.MakeLinearTiling(1, out Campy.Utils.CudaHelpers.dim3 tile_size, out Campy.Utils.CudaHelpers.dim3 tiles); res = Functions.cuLaunchKernel( cctor, tiles.x, tiles.y, tiles.z, // grid has one block. tile_size.x, tile_size.y, tile_size.z, // n threads. 0, // no shared memory default(CUstream), (IntPtr)IntPtr.Zero, (IntPtr)IntPtr.Zero ); CudaHelpers.CheckCudaError(res); res = Functions.cuCtxSynchronize(); // Make sure it's copied back to host. CudaHelpers.CheckCudaError(res); } }); if (Campy.Utils.Options.IsOn("trace-cctors")) { System.Console.WriteLine("Done with cctors"); } Campy.Utils.TimePhase.Time("kernel call ", () => { IntPtr[] parm1 = new IntPtr[1]; IntPtr[] parm2 = new IntPtr[1]; parm1[0] = kernel_target_object; parm2[0] = buffer.New(BUFFERS.SizeOf(typeof(int))); IntPtr[] x1 = parm1; handle1 = GCHandle.Alloc(x1, GCHandleType.Pinned); IntPtr pointer1 = handle1.AddrOfPinnedObject(); IntPtr[] x2 = parm2; handle2 = GCHandle.Alloc(x2, GCHandleType.Pinned); IntPtr pointer2 = handle2.AddrOfPinnedObject(); IntPtr[] kp = new IntPtr[] { pointer1, pointer2 }; var res = new CUresult(cudaError_enum.CUDA_SUCCESS); fixed(IntPtr * kernelParams = kp) { Campy.Utils.CudaHelpers.MakeLinearTiling(number_of_threads, out Campy.Utils.CudaHelpers.dim3 tile_size, out Campy.Utils.CudaHelpers.dim3 tiles); //MakeLinearTiling(1, out dim3 tile_size, out dim3 tiles); res = Functions.cuLaunchKernel( ptr_to_kernel, tiles.x, tiles.y, tiles.z, // grid has one block. tile_size.x, tile_size.y, tile_size.z, // n threads. 0, // no shared memory default(CUstream), (IntPtr)kernelParams, (IntPtr)IntPtr.Zero ); }
public static void cuFuncSetCacheConfig(CUfunction hfunc, CUfunc_cache config) { Wrap(() => { try { var error = nativeFuncSetCacheConfig(hfunc, config); if (error != CUresult.CUDA_SUCCESS) throw new CudaException(error); } catch (CudaException) { throw; } catch (DllNotFoundException dnfe) { throw new CudaException(CudaError.NoDriver, dnfe); } catch (Exception e) { throw new CudaException(CudaError.Unknown, e); } }); }
public static unsafe void Gpu() { Cuda.CUcontext ctx = new CUcontext(IntPtr.Zero); try { Cuda.Functions.cuInit(0); int count = 0; var res = Cuda.Functions.cuDeviceGetCount(ref count); if (res.Value != cudaError_enum.CUDA_SUCCESS) { throw new Exception(); } for (int deviceID = 0; deviceID < count; ++deviceID) { CUdevice device = default(CUdevice); res = Cuda.Functions.cuDeviceGet(ref device, deviceID); if (res.Value != cudaError_enum.CUDA_SUCCESS) { throw new Exception(); } byte[] name = new byte[100]; fixed(void *p = name) { res = Cuda.Functions.cuDeviceGetName((IntPtr)p, 100, device); } if (res.Value != cudaError_enum.CUDA_SUCCESS) { throw new Exception(); } System.Text.ASCIIEncoding enc = new System.Text.ASCIIEncoding(); string n = enc.GetString(name).Replace("\0", ""); CUdevprop props = default(CUdevprop); res = Cuda.Functions.cuDeviceGetProperties(ref props, device); if (res.Value != cudaError_enum.CUDA_SUCCESS) { throw new Exception(); } System.Console.WriteLine("--------"); System.Console.WriteLine(Helper.OutProps(props, n)); } { CUdevice device = default(CUdevice); res = Cuda.Functions.cuDeviceGet(ref device, 0); if (res.Value != cudaError_enum.CUDA_SUCCESS) { throw new Exception(); } CUcontext cuContext = default(CUcontext); res = Cuda.Functions.cuCtxCreate_v2(ref cuContext, 0, device); if (res.Value != cudaError_enum.CUDA_SUCCESS) { throw new Exception(); } string kernel = @" // // Generated by NVIDIA NVVM Compiler // // Compiler Build ID: CL-21373419 // Cuda compilation tools, release 8.0, V8.0.55 // Based on LLVM 3.4svn // .version 5.0 .target sm_20 .address_size 64 // .globl _Z4kernPi .visible .entry _Z4kernPi( .param .u64 _Z4kernPi_param_0 ) { .reg .pred %p<2>; .reg .b32 %r<4>; .reg .b64 %rd<5>; ld.param.u64 %rd1, [_Z4kernPi_param_0]; mov.u32 %r1, %tid.x; setp.gt.s32 %p1, %r1, 10; @%p1 bra BB0_2; cvta.to.global.u64 %rd2, %rd1; mul.wide.s32 %rd3, %r1, 4; add.s64 %rd4, %rd2, %rd3; ld.global.u32 %r2, [%rd4]; add.s32 %r3, %r2, 1; st.global.u32 [%rd4], %r3; BB0_2: ret; } "; IntPtr ptr = Marshal.StringToHGlobalAnsi(kernel); CUmodule cuModule = default(CUmodule); res = Functions.cuModuleLoadData(ref cuModule, ptr); if (res.Value != cudaError_enum.CUDA_SUCCESS) { throw new Exception(); } CUfunction helloWorld = default(CUfunction); var fun = Marshal.StringToHGlobalAnsi("_Z4kernPi"); res = Functions.cuModuleGetFunction(ref helloWorld, cuModule, fun); if (res.Value != cudaError_enum.CUDA_SUCCESS) { throw new Exception(); } int[] v = { 'G', 'd', 'k', 'k', 'n', (char)31, 'v', 'n', 'q', 'k', 'c' }; GCHandle handle = GCHandle.Alloc(v, GCHandleType.Pinned); IntPtr pointer = IntPtr.Zero; pointer = handle.AddrOfPinnedObject(); CUdeviceptr dptr = default(CUdeviceptr); res = Functions.cuMemAlloc_v2(ref dptr, 11 * sizeof(int)); if (res.Value != cudaError_enum.CUDA_SUCCESS) { throw new Exception(); } res = Functions.cuMemcpyHtoD_v2(dptr, pointer, 11 * sizeof(int)); if (res.Value != cudaError_enum.CUDA_SUCCESS) { throw new Exception(); } IntPtr[] x = new IntPtr[] { (IntPtr)dptr.Value }; GCHandle handle2 = GCHandle.Alloc(x, GCHandleType.Pinned); IntPtr pointer2 = handle2.AddrOfPinnedObject(); IntPtr[] kp = new IntPtr[] { pointer2 }; fixed(IntPtr *kernelParams = kp) { res = Functions.cuLaunchKernel(helloWorld, 1, 1, 1, // grid has one block. 11, 1, 1, // block has 11 threads. 0, // no shared memory default(CUstream), (IntPtr)kernelParams, (IntPtr)IntPtr.Zero ); } if (res.Value != cudaError_enum.CUDA_SUCCESS) { throw new Exception(); } res = Functions.cuMemcpyDtoH_v2(pointer, dptr, 11 * sizeof(int)); if (res.Value != cudaError_enum.CUDA_SUCCESS) { throw new Exception(); } Cuda.Functions.cuCtxDestroy_v2(cuContext); var aofc = v.Select(c => (char)c).ToArray(); System.Console.WriteLine("Result = " + new string(aofc)); } { CUdevice device = default(CUdevice); res = Cuda.Functions.cuDeviceGet(ref device, 0); if (res.Value != cudaError_enum.CUDA_SUCCESS) { throw new Exception(); } CUcontext cuContext = default(CUcontext); res = Cuda.Functions.cuCtxCreate_v2(ref cuContext, 0, device); if (res.Value != cudaError_enum.CUDA_SUCCESS) { throw new Exception(); } string path = Assembly.GetAssembly(typeof(Program)).Location; path = Path.GetDirectoryName(path); path = Path.GetFullPath(path + @"\..\..\..\.."); path = path + @"\cuda\x64\Debug\vector-sum.ptx"; StreamReader sr = new StreamReader(path); String ptx = sr.ReadToEnd(); IntPtr ptr = Marshal.StringToHGlobalAnsi(ptx); CUmodule module = default(CUmodule); res = Cuda.Functions.cuModuleLoadData(ref module, ptr); if (res.Value != cudaError_enum.CUDA_SUCCESS) { throw new Exception(); } CUfunction helloWorld = default(CUfunction); var fun = Marshal.StringToHGlobalAnsi("VectorSumParallel"); res = Cuda.Functions.cuModuleGetFunction(ref helloWorld, module, fun); if (res.Value != cudaError_enum.CUDA_SUCCESS) { throw new Exception(); } int n = 3; int[] a = Enumerable.Range(1, 3).Select(v => v * 3).ToArray(); int[] b = Enumerable.Range(1, 3).Select(v => v * 2).ToArray(); int[] c = new int[3]; CUdeviceptr d_a = default(CUdeviceptr); CUdeviceptr d_b = default(CUdeviceptr); CUdeviceptr d_c = default(CUdeviceptr); res = Cuda.Functions.cuMemAlloc_v2(ref d_a, n * sizeof(int)); if (res.Value != cudaError_enum.CUDA_SUCCESS) { throw new Exception(); } res = Cuda.Functions.cuMemAlloc_v2(ref d_b, n * sizeof(int)); if (res.Value != cudaError_enum.CUDA_SUCCESS) { throw new Exception(); } res = Cuda.Functions.cuMemAlloc_v2(ref d_c, n * sizeof(int)); if (res.Value != cudaError_enum.CUDA_SUCCESS) { throw new Exception(); } var ha = GCHandle.Alloc(a, GCHandleType.Pinned); var hb = GCHandle.Alloc(b, GCHandleType.Pinned); var hc = GCHandle.Alloc(c, GCHandleType.Pinned); var pa = ha.AddrOfPinnedObject(); var pb = hb.AddrOfPinnedObject(); var pc = hc.AddrOfPinnedObject(); res = Cuda.Functions.cuMemcpyHtoD_v2(d_a, pa, sizeof(int) * n); if (res.Value != cudaError_enum.CUDA_SUCCESS) { throw new Exception(); } res = Cuda.Functions.cuMemcpyHtoD_v2(d_b, pb, sizeof(int) * n); if (res.Value != cudaError_enum.CUDA_SUCCESS) { throw new Exception(); } IntPtr[] xa = new IntPtr[] { (IntPtr)d_a.Value }; GCHandle handlea = GCHandle.Alloc(xa, GCHandleType.Pinned); IntPtr pointera = handlea.AddrOfPinnedObject(); IntPtr[] xb = new IntPtr[] { (IntPtr)d_b.Value }; GCHandle handleb = GCHandle.Alloc(xb, GCHandleType.Pinned); IntPtr pointerb = handleb.AddrOfPinnedObject(); IntPtr[] xc = new IntPtr[] { (IntPtr)d_c.Value }; GCHandle handlec = GCHandle.Alloc(xc, GCHandleType.Pinned); IntPtr pointerc = handlec.AddrOfPinnedObject(); IntPtr[] xn = new IntPtr[] { (IntPtr)n }; GCHandle handlen = GCHandle.Alloc(xn, GCHandleType.Pinned); IntPtr pointern = handlen.AddrOfPinnedObject(); IntPtr[] kp = new IntPtr[] { pointera, pointerb, pointerc, pointern }; fixed(IntPtr *kernelParams = kp) { res = Cuda.Functions.cuLaunchKernel( helloWorld, 1, 1, 1, // grid has one block. (uint)n, 1, 1, // block has 3 threads. 0, // no shared memory default(CUstream), (IntPtr)kernelParams, (IntPtr)IntPtr.Zero ); } if (res.Value != cudaError_enum.CUDA_SUCCESS) { throw new Exception(); } res = Cuda.Functions.cuMemcpyDtoH_v2(hc.AddrOfPinnedObject(), d_c, n * sizeof(int)); if (res.Value != cudaError_enum.CUDA_SUCCESS) { throw new Exception(); } Cuda.Functions.cuCtxDestroy_v2(cuContext); System.Console.WriteLine(String.Join(" ", c)); } } finally { } }
// http://developer.download.nvidia.com/compute/cuda/3_1/toolkit/docs/online/group__CUEXEC_g3196abfe0d52f6806eced043ea1e1fb4.html private static extern CUresult nativeFuncSetBlockShape(CUfunction hfunc, int x, int y, int z);
public override void DoLayout() { CUdeviceptr p1 = new CUdeviceptr(); CUDADriver.cuMemAlloc(ref p1, 1 << 10); byte[] b = new byte[1 << 10]; CUDADriver.cuMemcpyHtoD(p1, b, (uint)b.Length); CUfunction func = new CUfunction(); CUResult res; int nnodes = (int)Network.VertexCount * 2; int blocks = 32; if (nnodes < 1024 * blocks) { nnodes = 1024 * blocks; } while ((nnodes & (prop.SIMDWidth - 1)) != 0) { nnodes++; } nnodes--; //float dtime = 0.025f; float dthf = dtime * 0.5f; //float epssq = 0.05f * 0.05f; //float itolsq = 1.0f / (0.5f * 0.5f); CUDADriver.cuModuleGetFunction(ref func, mod, "dummy"); // Float4[] data = new Float4[100]; CUdeviceptr ptr = new CUdeviceptr(); //CUDADriver.cuMemAlloc(ref ptr, (uint) 100 * System.Runtime.InteropServices.Marshal.SizeOf(Float4)); CUDADriver.cuParamSeti(func, 0, (uint)ptr.Pointer); CUDADriver.cuParamSetSize(func, 4); res = CUDADriver.cuLaunch(func); if (res != CUResult.Success) { Logger.AddMessage(LogEntryType.Warning, "CUDA Error in dummy function: " + res.ToString()); } // InitializationKernel<<<1, 1>>>(); CUDADriver.cuModuleGetFunction(ref func, mod, "InitializationKernel"); res = CUDADriver.cuLaunch(func); if (res != CUResult.Success) { Logger.AddMessage(LogEntryType.Warning, "CUDA Error in InitializationKernel: " + res.ToString()); } // BoundingBoxKernel<<<blocks * FACTOR1, THREADS1>>>(); CUDADriver.cuModuleGetFunction(ref func, mod, "BoundingBoxKernel: " + res.ToString()); CUDADriver.cuLaunch(func); if (res != CUResult.Success) { Logger.AddMessage(LogEntryType.Warning, "CUDA Error in BoundingBoxKernel: " + res.ToString()); } // TreeBuildingKernel<<<blocks * FACTOR2, THREADS2>>>(); CUDADriver.cuModuleGetFunction(ref func, mod, "TreeBuildingKernel: " + res.ToString()); CUDADriver.cuLaunch(func); if (res != CUResult.Success) { Logger.AddMessage(LogEntryType.Warning, "CUDA Error in TreeBuildingKernel: " + res.ToString()); } // SummarizationKernel<<<blocks * FACTOR3, THREADS3>>>(); CUDADriver.cuModuleGetFunction(ref func, mod, "SummarizationKernel: " + res.ToString()); CUDADriver.cuLaunch(func); if (res != CUResult.Success) { Logger.AddMessage(LogEntryType.Warning, "CUDA Error in SummarizationKernel: " + res.ToString()); } // ForceCalculationKernel<<<blocks * FACTOR5, THREADS5>>>(); CUDADriver.cuModuleGetFunction(ref func, mod, "ForceCalculationKernel: " + res.ToString()); CUDADriver.cuLaunch(func); if (res != CUResult.Success) { Logger.AddMessage(LogEntryType.Warning, "CUDA Error in ForceCalculationKernel: " + res.ToString()); } // IntegrationKernel<<<blocks * FACTOR6, THREADS6>>>(); CUDADriver.cuModuleGetFunction(ref func, mod, "IntegrationKernel"); CUDADriver.cuLaunch(func); if (res != CUResult.Success) { Logger.AddMessage(LogEntryType.Warning, "CUDA Error in IntegrationKernel: " + res.ToString()); } }