/// <summary>
/// Test-compiles <c>MyKernel</c> with a PTX backend (SM_50 / ISA_50, x64) and
/// writes the resulting PTX assembly to the relative path "MyKernel.ptx".
/// </summary>
/// <exception cref="System.InvalidOperationException">
/// Thrown when no public static method named <c>MyKernel</c> exists on
/// <see cref="Impl_ILGPU"/>, or when the backend does not produce a
/// <see cref="PTXCompiledKernel"/>.
/// </exception>
public void GeneratePTX()
{
    using (var context = new Context())
    {
        context.EnableAlgorithms();

        using (Backend b = new PTXBackend(
            context,
            PTXArchitecture.SM_50,
            PTXInstructionSet.ISA_50,
            TargetPlatform.X64))
        {
            // Look the kernel up among the public static methods of Impl_ILGPU.
            var method = typeof(Impl_ILGPU)
                .GetMethods(BindingFlags.Static | BindingFlags.Public)
                .FirstOrDefault(f => f.Name == nameof(MyKernel));
            if (method == null)
            {
                // Fail with a clear message instead of passing null to Compile.
                throw new System.InvalidOperationException(
                    $"Public static method '{nameof(MyKernel)}' was not found on " +
                    $"'{typeof(Impl_ILGPU).FullName}'.");
            }

            var compiledKernel = b.Compile(method, default);

            // Only a PTX kernel exposes the textual assembly written below.
            if (!(compiledKernel is PTXCompiledKernel ptxKernel))
            {
                throw new System.InvalidOperationException(
                    $"Compiling '{nameof(MyKernel)}' did not yield a " +
                    $"{nameof(PTXCompiledKernel)}.");
            }

            System.IO.File.WriteAllText("MyKernel.ptx", ptxKernel.PTXAssembly);
        }
    }
}
/// <summary>
/// Cuda (PTX) code generation for the intrinsic: emits one 32-bit integer
/// multiplication of the first call argument by the constant 2.
/// </summary>
/// <remarks>
/// The signature of this function matches the PTX-backend specific
/// delegate type <see cref="PTXIntrinsic.Handler"/>.
/// </remarks>
static void GeneratePTXCode(
    PTXBackend backend,
    PTXCodeGenerator codeGenerator,
    Value value)
{
    // 'value' is the call node here; its first operand is the X parameter.
    var source = codeGenerator.LoadPrimitive(value[0]);

    // Register that will receive the result of the multiplication.
    var destination = codeGenerator.AllocateHardware(value);

    // Resolve the multiply instruction for Int32 operands on this backend.
    var multiplyInt32 = PTXInstructions.GetArithmeticOperation(
        BinaryArithmeticKind.Mul,
        ArithmeticBasicValueType.Int32,
        backend.Capabilities,
        false);

    // Emit: destination = source * 2
    using (var emitter = codeGenerator.BeginCommand(multiplyInt32))
    {
        emitter.AppendArgument(destination);
        emitter.AppendArgument(source);
        emitter.AppendConstant(2);
    }
}
/// <summary>
/// Generates intrinsic math instructions for the following kinds:
/// Rcp, Sqrt, Sin, Cos, Exp2, Log2, IsInf, IsNaN
/// </summary>
/// <param name="backend">The current backend.</param>
/// <param name="codeGenerator">The code generator.</param>
/// <param name="value">
/// The value to generate code for. Must be a <see cref="UnaryArithmeticValue"/>.
/// </param>
/// <exception cref="System.ArgumentException">
/// Thrown when <paramref name="value"/> is null or is not a
/// <see cref="UnaryArithmeticValue"/>.
/// </exception>
public static void GenerateMathIntrinsic(
    PTXBackend backend,
    PTXCodeGenerator codeGenerator,
    Value value)
{
    // Reject unexpected nodes up front rather than dereferencing a null cast.
    if (!(value is UnaryArithmeticValue arithmeticValue))
    {
        throw new System.ArgumentException(
            $"Expected a {nameof(UnaryArithmeticValue)}.",
            nameof(value));
    }

    // Select the instruction for this kind/type, honoring the fast-math setting.
    var instruction = PTXInstructions.GetArithmeticOperation(
        arithmeticValue.Kind,
        arithmeticValue.ArithmeticBasicValueType,
        codeGenerator.FastMath);

    // Load the operand first, then allocate the register for the result.
    var argument = codeGenerator.LoadPrimitive(arithmeticValue.Value);
    var targetRegister = codeGenerator.AllocateHardware(arithmeticValue);

    using var command = codeGenerator.BeginCommand(instruction);
    command.AppendArgument(targetRegister);
    command.AppendArgument(argument);
}
/// <summary>
/// Binds the accelerator and queries the device for the properties
/// exposed by this instance (name, architecture, memory size, limits,
/// clock rate, warp size and cache configuration).
/// </summary>
private void SetupAccelerator()
{
    Bind();

    // Device name and default stream
    CudaException.ThrowIfFailed(
        CurrentAPI.GetDeviceName(out string deviceName, DeviceId));
    Name = deviceName;
    DefaultStream = new CudaStream(this, IntPtr.Zero);

    // Compute capability -> PTX architecture
    CudaException.ThrowIfFailed(
        CurrentAPI.GetDeviceComputeCapability(
            out int ccMajor,
            out int ccMinor,
            DeviceId));
    Architecture = PTXBackend.GetArchitecture(ccMajor, ccMinor);

    // Total device memory
    CudaException.ThrowIfFailed(
        CurrentAPI.GetTotalDeviceMemory(out long totalMemory, DeviceId));
    MemorySize = totalMemory;

    // Maximum grid dimensions
    MaxGridSize = new Index3(
        QueryAttribute(DeviceAttribute.CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X),
        QueryAttribute(DeviceAttribute.CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y),
        QueryAttribute(DeviceAttribute.CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z));

    // Maximum group (block) dimensions
    MaxGroupSize = new Index3(
        QueryAttribute(DeviceAttribute.CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X),
        QueryAttribute(DeviceAttribute.CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y),
        QueryAttribute(DeviceAttribute.CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z));

    // Maximum number of threads per group
    MaxNumThreadsPerGroup = QueryAttribute(
        DeviceAttribute.CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK);

    // Maximum shared memory per block
    MaxSharedMemoryPerGroup = QueryAttribute(
        DeviceAttribute.CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK);

    // Total constant memory
    MaxConstantMemory = QueryAttribute(
        DeviceAttribute.CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY);

    // Clock rate
    ClockRate = QueryAttribute(
        DeviceAttribute.CU_DEVICE_ATTRIBUTE_CLOCK_RATE);

    // Warp size
    WarpSize = QueryAttribute(
        DeviceAttribute.CU_DEVICE_ATTRIBUTE_WARP_SIZE);

    // Number of multiprocessors
    NumMultiprocessors = QueryAttribute(
        DeviceAttribute.CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT);

    // Maximum number of threads per multiprocessor
    MaxNumThreadsPerMultiprocessor = QueryAttribute(
        DeviceAttribute.CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR);

    // Shared-memory and cache configuration
    CudaException.ThrowIfFailed(
        CurrentAPI.GetSharedMemoryConfig(out sharedMemoryConfiguration));
    CudaException.ThrowIfFailed(
        CurrentAPI.GetCacheConfig(out cacheConfiguration));

    // Reads a single attribute of this accelerator's device.
    int QueryAttribute(DeviceAttribute attribute) =>
        CurrentAPI.GetDeviceAttribute(attribute, DeviceId);
}