// General GPU Device CUDA Initialization static int gpuDeviceInit(int devID) { int deviceCount = CudaContext.GetDeviceCount(); if (deviceCount == 0) { Console.Write("gpuDeviceInit() CUDA error: no devices supporting CUDA.\n"); Environment.Exit(-1); } if (devID < 0) { devID = 0; } if (devID > deviceCount - 1) { Console.Write("\n"); Console.Write(">> {0} CUDA capable GPU device(s) detected. <<\n", deviceCount); Console.Write(">> gpuDeviceInit (-device={0}) is not a valid GPU device. <<\n", devID); Console.Write("\n"); return(-devID); } if (CudaContext.GetDeviceComputeCapability(devID).Major < 1) { Console.Write("gpuDeviceInit(): GPU device does not support CUDA.\n"); Environment.Exit(-1); } ctx = new CudaContext(devID); Console.Write("> gpuDeviceInit() CUDA device [{0}]: {1}\n", devID, ctx.GetDeviceName()); return(devID); }
public GPU_Functionality(int deviceID = 0) { ctx = new CudaContext(deviceID); version = ctx.GetDeviceComputeCapability(); Trace.WriteLine($"cuda compute capability {version.Major}.{version.Minor}"); CUmodule collision_module = ctx.LoadModulePTX("collision_kernels.ptx"); kNarrowPhase = new CudaKernel("kNarrowPhase_new", collision_module, ctx); kFindClosestFace = new CudaKernel("kFindClosestFace", collision_module, ctx); kCollisionResponseForce = new CudaKernel("kCollisionResponseForce", collision_module, ctx); dim3 block = new dim3(block_size, 1, 1); kNarrowPhase.BlockDimensions = block; kFindClosestFace.BlockDimensions = block; kCollisionResponseForce.BlockDimensions = block; // cz CUmodule module_cz_kernels = ctx.LoadModulePTX("cz_kernels.ptx"); kczCZForce = new CudaKernel("kczCZForce", module_cz_kernels, ctx); kczCZForce.BlockDimensions = block; // elem CUmodule module_elem_kernels = ctx.LoadModulePTX("elem_kernels.ptx"); kelElementElasticityForce = new CudaKernel("kelElementElasticityForce", module_elem_kernels, ctx); kelElementElasticityForce.BlockDimensions = block; }
public void CompileKernel() { //generate as output language obviously from strict code var code = @"extern ""C"" __global__ void blur(unsigned char* image, unsigned char* output, size_t width, size_t height) { size_t tid = blockIdx.x * blockDim.x + threadIdx.x; if (tid > width && tid < width*height-width) { output[tid] = image[tid];// (image[tid-2048]+image[tid-1]+image[tid]+image[tid+1]+image[tid+2048])/5; } }"; using var rtc = new CudaRuntimeCompiler(code, "blur"); try { // Use max capabilities on actual hardware we have at runtime var computeVersion = CudaContext.GetDeviceComputeCapability(0); var shaderModelVersion = "" + computeVersion.Major + computeVersion.Minor; Console.WriteLine("ShaderModelVersion=" + shaderModelVersion); // see http://docs.nvidia.com/cuda/nvrtc/index.html for usage and options //https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ //nvcc .\vectorAdd.cu -use_fast_math -ptx -m 64 -arch compute_61 -code sm_61 -o .\vectorAdd.ptx //https://docs.nvidia.com/cuda/nvrtc/index.html#group__options rtc.Compile(new[] { "--gpu-architecture=compute_" + shaderModelVersion }); Console.WriteLine("Cuda compile log: " + rtc.GetLogAsString()); var deviceID = 0; var ctx = new CudaContext(deviceID); kernel = ctx.LoadKernelPTX(rtc.GetPTX(), "blur"); kernel.GridDimensions = (Size + 511) / 512; kernel.BlockDimensions = 512; //unused: float[] copyInput = new float[Size]; input = image; output = new CudaDeviceVariable <byte>(Size); } catch (NVRTCException ex) { Console.WriteLine("Cuda compile log: " + rtc.GetLogAsString()); throw new Exception(ex.NVRTCError + " " + ex); } }
private static async Task <CudaModule> CompileAsync( IMethod method, IEnumerable <ITypeMember> memberRoots, IEnumerable <IType> typeRoots, int threadIdParamIndex, ClrAssembly assembly, CudaContext context) { // Figure out which members we need to compile. var desc = await CreateContentDescriptionAsync(method, memberRoots, typeRoots, assembly); // Compile those members to LLVM IR. Use an Itanium name mangling scheme. var mangler = new ItaniumMangler(assembly.Resolver.TypeEnvironment); var moduleBuilder = LlvmBackend.Compile(desc, assembly.Resolver.TypeEnvironment); var module = moduleBuilder.Module; // Generate type metadata for all type roots. foreach (var type in typeRoots) { moduleBuilder.Metadata.GetMetadata(type, moduleBuilder); } // Get the compiled kernel function. var kernelFuncName = mangler.Mangle(method, true); var kernelFunc = LLVM.GetNamedFunction(module, kernelFuncName); if (threadIdParamIndex >= 0) { // If we have a thread ID parameter, then we need to generate a thunk // kernel function that calls our actual kernel function. This thunk's // responsibility is to determine the thread ID of the kernel. var thunkKernelName = "kernel"; var thunkTargetType = kernelFunc.TypeOf().GetElementType(); var thunkParamTypes = new List <LLVMTypeRef>(thunkTargetType.GetParamTypes()); if (threadIdParamIndex < thunkParamTypes.Count) { thunkParamTypes.RemoveAt(threadIdParamIndex); } var thunkKernel = LLVM.AddFunction( module, thunkKernelName, LLVM.FunctionType( thunkTargetType.GetReturnType(), thunkParamTypes.ToArray(), thunkTargetType.IsFunctionVarArg)); using (var builder = new IRBuilder(moduleBuilder.Context)) { builder.PositionBuilderAtEnd(thunkKernel.AppendBasicBlock("entry")); var args = new List <LLVMValueRef>(thunkKernel.GetParams()); args.Insert(threadIdParamIndex, ComputeUniqueThreadId(builder, module)); var call = builder.CreateCall(kernelFunc, args.ToArray(), ""); if (call.TypeOf().TypeKind == LLVMTypeKind.LLVMVoidTypeKind) { builder.CreateRetVoid(); } else { builder.CreateRet(call); } } kernelFuncName = thunkKernelName; kernelFunc = thunkKernel; } // Mark the compiled kernel as a kernel symbol. LLVM.AddNamedMetadataOperand( module, "nvvm.annotations", LLVM.MDNode(new LLVMValueRef[] { kernelFunc, MDString("kernel"), LLVM.ConstInt(LLVM.Int32TypeInContext(LLVM.GetModuleContext(module)), 1, false) })); // LLVM.DumpModule(module); // Compile that LLVM IR down to PTX. LLVMTargetMachineRef machine; var ptx = CompileToPtx(module, context.GetDeviceComputeCapability(), out machine); // Console.WriteLine(System.Text.Encoding.UTF8.GetString(ptx)); // Load the PTX kernel. return(new CudaModule(assembly, moduleBuilder, machine, context.LoadModulePTX(ptx), kernelFuncName, context)); }