Example #1
0
        // General GPU Device CUDA Initialization
        static int gpuDeviceInit(int devID)
        {
            int deviceCount = CudaContext.GetDeviceCount();

            if (deviceCount == 0)
            {
                Console.Write("gpuDeviceInit() CUDA error: no devices supporting CUDA.\n");
                Environment.Exit(-1);
            }
            if (devID < 0)
            {
                devID = 0;
            }
            if (devID > deviceCount - 1)
            {
                Console.Write("\n");
                Console.Write(">> {0} CUDA capable GPU device(s) detected. <<\n", deviceCount);
                Console.Write(">> gpuDeviceInit (-device={0}) is not a valid GPU device. <<\n", devID);
                Console.Write("\n");
                return(-devID);
            }


            if (CudaContext.GetDeviceComputeCapability(devID).Major < 1)
            {
                Console.Write("gpuDeviceInit(): GPU device does not support CUDA.\n");
                Environment.Exit(-1);
            }
            ctx = new CudaContext(devID);
            Console.Write("> gpuDeviceInit() CUDA device [{0}]: {1}\n", devID, ctx.GetDeviceName());
            return(devID);
        }
Example #2
0
        public GPU_Functionality(int deviceID = 0)
        {
            ctx     = new CudaContext(deviceID);
            version = ctx.GetDeviceComputeCapability();
            Trace.WriteLine($"cuda compute capability {version.Major}.{version.Minor}");

            CUmodule collision_module = ctx.LoadModulePTX("collision_kernels.ptx");

            kNarrowPhase            = new CudaKernel("kNarrowPhase_new", collision_module, ctx);
            kFindClosestFace        = new CudaKernel("kFindClosestFace", collision_module, ctx);
            kCollisionResponseForce = new CudaKernel("kCollisionResponseForce", collision_module, ctx);
            dim3 block = new dim3(block_size, 1, 1);

            kNarrowPhase.BlockDimensions            = block;
            kFindClosestFace.BlockDimensions        = block;
            kCollisionResponseForce.BlockDimensions = block;

            // cz
            CUmodule module_cz_kernels = ctx.LoadModulePTX("cz_kernels.ptx");

            kczCZForce = new CudaKernel("kczCZForce", module_cz_kernels, ctx);
            kczCZForce.BlockDimensions = block;

            // elem
            CUmodule module_elem_kernels = ctx.LoadModulePTX("elem_kernels.ptx");

            kelElementElasticityForce = new CudaKernel("kelElementElasticityForce", module_elem_kernels, ctx);
            kelElementElasticityForce.BlockDimensions = block;
        }
        public void CompileKernel()
        {
            //generate as output language obviously from strict code
            var code =
                @"extern ""C"" __global__ void blur(unsigned char* image, unsigned char* output, size_t width, size_t height)
{
  size_t tid = blockIdx.x * blockDim.x + threadIdx.x;
  if (tid > width && tid < width*height-width) {
    output[tid] = image[tid];// (image[tid-2048]+image[tid-1]+image[tid]+image[tid+1]+image[tid+2048])/5;
  }
}";

            using var rtc = new CudaRuntimeCompiler(code, "blur");
            try
            {
                // Use max capabilities on actual hardware we have at runtime
                var computeVersion     = CudaContext.GetDeviceComputeCapability(0);
                var shaderModelVersion = "" + computeVersion.Major + computeVersion.Minor;
                Console.WriteLine("ShaderModelVersion=" + shaderModelVersion);
                // see http://docs.nvidia.com/cuda/nvrtc/index.html for usage and options
                //https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/
                //nvcc .\vectorAdd.cu -use_fast_math -ptx -m 64 -arch compute_61 -code sm_61 -o .\vectorAdd.ptx
                //https://docs.nvidia.com/cuda/nvrtc/index.html#group__options
                rtc.Compile(new[] { "--gpu-architecture=compute_" + shaderModelVersion });
                Console.WriteLine("Cuda compile log: " + rtc.GetLogAsString());
                var deviceID = 0;
                var ctx      = new CudaContext(deviceID);
                kernel = ctx.LoadKernelPTX(rtc.GetPTX(), "blur");
                kernel.GridDimensions  = (Size + 511) / 512;
                kernel.BlockDimensions = 512;
                //unused: float[] copyInput = new float[Size];
                input  = image;
                output = new CudaDeviceVariable <byte>(Size);
            }
            catch (NVRTCException ex)
            {
                Console.WriteLine("Cuda compile log: " + rtc.GetLogAsString());
                throw new Exception(ex.NVRTCError + " " + ex);
            }
        }
Example #4
0
        private static async Task <CudaModule> CompileAsync(
            IMethod method,
            IEnumerable <ITypeMember> memberRoots,
            IEnumerable <IType> typeRoots,
            int threadIdParamIndex,
            ClrAssembly assembly,
            CudaContext context)
        {
            // Figure out which members we need to compile.
            var desc = await CreateContentDescriptionAsync(method, memberRoots, typeRoots, assembly);

            // Compile those members to LLVM IR. Use an Itanium name mangling scheme.
            var mangler       = new ItaniumMangler(assembly.Resolver.TypeEnvironment);
            var moduleBuilder = LlvmBackend.Compile(desc, assembly.Resolver.TypeEnvironment);
            var module        = moduleBuilder.Module;

            // Generate type metadata for all type roots.
            foreach (var type in typeRoots)
            {
                moduleBuilder.Metadata.GetMetadata(type, moduleBuilder);
            }

            // Get the compiled kernel function.
            var kernelFuncName = mangler.Mangle(method, true);
            var kernelFunc     = LLVM.GetNamedFunction(module, kernelFuncName);

            if (threadIdParamIndex >= 0)
            {
                // If we have a thread ID parameter, then we need to generate a thunk
                // kernel function that calls our actual kernel function. This thunk's
                // responsibility is to determine the thread ID of the kernel.
                var thunkKernelName = "kernel";
                var thunkTargetType = kernelFunc.TypeOf().GetElementType();
                var thunkParamTypes = new List <LLVMTypeRef>(thunkTargetType.GetParamTypes());
                if (threadIdParamIndex < thunkParamTypes.Count)
                {
                    thunkParamTypes.RemoveAt(threadIdParamIndex);
                }
                var thunkKernel = LLVM.AddFunction(
                    module,
                    thunkKernelName,
                    LLVM.FunctionType(
                        thunkTargetType.GetReturnType(),
                        thunkParamTypes.ToArray(),
                        thunkTargetType.IsFunctionVarArg));

                using (var builder = new IRBuilder(moduleBuilder.Context))
                {
                    builder.PositionBuilderAtEnd(thunkKernel.AppendBasicBlock("entry"));
                    var args = new List <LLVMValueRef>(thunkKernel.GetParams());
                    args.Insert(threadIdParamIndex, ComputeUniqueThreadId(builder, module));
                    var call = builder.CreateCall(kernelFunc, args.ToArray(), "");
                    if (call.TypeOf().TypeKind == LLVMTypeKind.LLVMVoidTypeKind)
                    {
                        builder.CreateRetVoid();
                    }
                    else
                    {
                        builder.CreateRet(call);
                    }
                }

                kernelFuncName = thunkKernelName;
                kernelFunc     = thunkKernel;
            }

            // Mark the compiled kernel as a kernel symbol.
            LLVM.AddNamedMetadataOperand(
                module,
                "nvvm.annotations",
                LLVM.MDNode(new LLVMValueRef[]
            {
                kernelFunc,
                MDString("kernel"),
                LLVM.ConstInt(LLVM.Int32TypeInContext(LLVM.GetModuleContext(module)), 1, false)
            }));

            // LLVM.DumpModule(module);

            // Compile that LLVM IR down to PTX.
            LLVMTargetMachineRef machine;
            var ptx = CompileToPtx(module, context.GetDeviceComputeCapability(), out machine);

            // Console.WriteLine(System.Text.Encoding.UTF8.GetString(ptx));

            // Load the PTX kernel.
            return(new CudaModule(assembly, moduleBuilder, machine, context.LoadModulePTX(ptx), kernelFuncName, context));
        }