Exemple #1
0
        /// <summary>
        /// Compiles the public static <c>MyKernel</c> method of <c>Impl_ILGPU</c>
        /// with a PTX backend (SM_50 / ISA_50 / X64) and writes the generated PTX
        /// assembly text to the file "MyKernel.ptx".
        /// </summary>
        /// <exception cref="System.InvalidOperationException">
        /// Thrown when the kernel method cannot be found via reflection, or when the
        /// backend does not return a <c>PTXCompiledKernel</c>.
        /// </exception>
        public void GeneratePTX()
        {
            using (var context = new Context())
            {
                context.EnableAlgorithms();

                using (Backend b = new PTXBackend(context, PTXArchitecture.SM_50, PTXInstructionSet.ISA_50, TargetPlatform.X64))
                {
                    // Look the kernel up among the public static methods; the first
                    // method carrying the expected name wins.
                    var method = typeof(Impl_ILGPU)
                        .GetMethods(BindingFlags.Static | BindingFlags.Public)
                        .FirstOrDefault(f => f.Name == nameof(MyKernel));
                    if (method is null)
                    {
                        // Fail here with a clear message instead of handing a null
                        // method to the backend.
                        throw new System.InvalidOperationException(
                            "Could not find a public static method named '" +
                            nameof(MyKernel) + "' on " + typeof(Impl_ILGPU).FullName + ".");
                    }

                    var compiledKernel = b.Compile(method, default);

                    // The PTX text is only exposed by the PTX-specific kernel type, so
                    // verify the result type rather than dereferencing an unchecked
                    // 'as' cast.
                    if (!(compiledKernel is PTXCompiledKernel ptxKernel))
                    {
                        throw new System.InvalidOperationException(
                            "The backend did not produce a PTXCompiledKernel for '" +
                            nameof(MyKernel) + "'.");
                    }

                    System.IO.File.WriteAllText("MyKernel.ptx", ptxKernel.PTXAssembly);
                }
            }
        }
Exemple #2
0
        /// <summary>
        /// Cuda (PTX) code generator for the intrinsic: emits one 32-bit integer
        /// multiply of the call's first argument by the constant 2.
        /// </summary>
        /// <remarks>
        /// The signature of this function matches the PTX-backend specific
        /// delegate type <see cref="PTXIntrinsic.Handler"/>.
        /// </remarks>
        /// <param name="backend">The PTX backend; its capabilities are used to select the instruction.</param>
        /// <param name="codeGenerator">The code generator that receives the emitted command.</param>
        /// <param name="value">The call node; its first argument is the operand to multiply.</param>
        static void GeneratePTXCode(
            PTXBackend backend,
            PTXCodeGenerator codeGenerator,
            Value value)
        {
            // 'value' is the call node: fetch the register holding its first argument.
            var operand = codeGenerator.LoadPrimitive(value[0]);

            // Reserve the register that will receive the result of the call.
            var result = codeGenerator.AllocateHardware(value);

            // Pick the Int32 multiply instruction for this backend.
            // NOTE(review): the meaning of the trailing 'false' flag is not visible
            // here - confirm against PTXInstructions.GetArithmeticOperation.
            var multiply = PTXInstructions.GetArithmeticOperation(
                BinaryArithmeticKind.Mul,
                ArithmeticBasicValueType.Int32,
                backend.Capabilities,
                false);

            // Emit the command with the arguments: result, operand, 2.
            using (var command = codeGenerator.BeginCommand(multiply))
            {
                command.AppendArgument(result);
                command.AppendArgument(operand);
                command.AppendConstant(2);
            }
        }
Exemple #3
0
        /// <summary>
        /// Generates intrinsic math instructions for the following kinds:
        /// Rcp, Sqrt, Sin, Cos, Exp2, Log2, IsInf, IsNaN
        /// </summary>
        /// <param name="backend">The current backend.</param>
        /// <param name="codeGenerator">The code generator.</param>
        /// <param name="value">
        /// The value to generate code for. Must be a <see cref="UnaryArithmeticValue"/>.
        /// </param>
        /// <exception cref="System.ArgumentNullException">
        /// Thrown when <paramref name="value"/> is null.
        /// </exception>
        /// <exception cref="System.ArgumentException">
        /// Thrown when <paramref name="value"/> is not a <see cref="UnaryArithmeticValue"/>.
        /// </exception>
        public static void GenerateMathIntrinsic(
            PTXBackend backend,
            PTXCodeGenerator codeGenerator,
            Value value)
        {
            if (value is null)
            {
                throw new System.ArgumentNullException(nameof(value));
            }

            // Validate the node type up front: an unchecked 'as' cast would only
            // fail later with a NullReferenceException.
            if (!(value is UnaryArithmeticValue arithmeticValue))
            {
                throw new System.ArgumentException(
                    "Expected a " + nameof(UnaryArithmeticValue) + " but received " +
                    value.GetType().Name + ".",
                    nameof(value));
            }

            // Select the instruction for this operation kind and operand type; the
            // generator's FastMath setting is passed along to the lookup.
            var instruction = PTXInstructions.GetArithmeticOperation(
                arithmeticValue.Kind,
                arithmeticValue.ArithmeticBasicValueType,
                codeGenerator.FastMath);

            var argument       = codeGenerator.LoadPrimitive(arithmeticValue.Value);
            var targetRegister = codeGenerator.AllocateHardware(arithmeticValue);

            // Emit the command: target register first, then the source operand.
            using var command = codeGenerator.BeginCommand(instruction);
            command.AppendArgument(targetRegister);
            command.AppendArgument(argument);
        }
        /// <summary>
        /// Sets up all required settings by querying the device identified by
        /// <c>DeviceId</c> and storing the results on this accelerator.
        /// </summary>
        private void SetupAccelerator()
        {
            // Bind before any device query is issued.
            Bind();

            // Device name and default stream (created from a zero handle).
            CudaException.ThrowIfFailed(
                CurrentAPI.GetDeviceName(out string deviceName, DeviceId));
            Name          = deviceName;
            DefaultStream = new CudaStream(this, IntPtr.Zero);

            // Compute capability -> PTX architecture.
            CudaException.ThrowIfFailed(
                CurrentAPI.GetDeviceComputeCapability(
                    out int capabilityMajor,
                    out int capabilityMinor,
                    DeviceId));
            Architecture = PTXBackend.GetArchitecture(capabilityMajor, capabilityMinor);

            // Total device memory.
            CudaException.ThrowIfFailed(
                CurrentAPI.GetTotalDeviceMemory(out long totalMemory, DeviceId));
            MemorySize = totalMemory;

            // Maximum grid size, queried per axis in X, Y, Z order.
            var gridDimX = CurrentAPI.GetDeviceAttribute(
                DeviceAttribute.CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, DeviceId);
            var gridDimY = CurrentAPI.GetDeviceAttribute(
                DeviceAttribute.CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y, DeviceId);
            var gridDimZ = CurrentAPI.GetDeviceAttribute(
                DeviceAttribute.CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z, DeviceId);
            MaxGridSize = new Index3(gridDimX, gridDimY, gridDimZ);

            // Maximum group (block) size, queried per axis in X, Y, Z order.
            var blockDimX = CurrentAPI.GetDeviceAttribute(
                DeviceAttribute.CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, DeviceId);
            var blockDimY = CurrentAPI.GetDeviceAttribute(
                DeviceAttribute.CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y, DeviceId);
            var blockDimZ = CurrentAPI.GetDeviceAttribute(
                DeviceAttribute.CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z, DeviceId);
            MaxGroupSize = new Index3(blockDimX, blockDimY, blockDimZ);

            // Maximum number of threads per group
            MaxNumThreadsPerGroup = CurrentAPI.GetDeviceAttribute(
                DeviceAttribute.CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK,
                DeviceId);

            // Maximum shared memory per block
            MaxSharedMemoryPerGroup = CurrentAPI.GetDeviceAttribute(
                DeviceAttribute.CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK,
                DeviceId);

            // Total constant memory
            MaxConstantMemory = CurrentAPI.GetDeviceAttribute(
                DeviceAttribute.CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY,
                DeviceId);

            // Clock rate
            ClockRate = CurrentAPI.GetDeviceAttribute(
                DeviceAttribute.CU_DEVICE_ATTRIBUTE_CLOCK_RATE,
                DeviceId);

            // Warp size
            WarpSize = CurrentAPI.GetDeviceAttribute(
                DeviceAttribute.CU_DEVICE_ATTRIBUTE_WARP_SIZE,
                DeviceId);

            // Number of multiprocessors
            NumMultiprocessors = CurrentAPI.GetDeviceAttribute(
                DeviceAttribute.CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT,
                DeviceId);

            // Maximum number of threads per multiprocessor
            MaxNumThreadsPerMultiprocessor = CurrentAPI.GetDeviceAttribute(
                DeviceAttribute.CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR,
                DeviceId);

            // Shared-memory and cache configuration, read into the backing fields.
            CudaException.ThrowIfFailed(
                CurrentAPI.GetSharedMemoryConfig(out sharedMemoryConfiguration));
            CudaException.ThrowIfFailed(
                CurrentAPI.GetCacheConfig(out cacheConfiguration));
        }