Exemple #1
0
        /// <summary>
        /// Creates a Cuda kernel by loading the PTX assembly of a compiled kernel as
        /// a module and resolving the kernel function inside that module.
        /// </summary>
        /// <param name="accelerator">The associated accelerator.</param>
        /// <param name="kernel">The compiled PTX kernel to load.</param>
        /// <param name="launcher">The launcher method for the given kernel.</param>
        internal CudaKernel(
            CudaAccelerator accelerator,
            PTXCompiledKernel kernel,
            MethodInfo launcher)
            : base(accelerator, kernel, launcher)
        {
            var loadStatus = CurrentAPI.LoadModule(
                out modulePtr,
                kernel.PTXAssembly,
                out string loaderLog);

            // Emit the loader diagnostics to the trace output before the failure
            // is turned into an exception below
            if (loadStatus != CudaError.CUDA_SUCCESS)
            {
                Trace.WriteLine("PTX Kernel loading failed:");
                Trace.WriteLine(
                    string.IsNullOrWhiteSpace(loaderLog)
                    ? ">> No error information available"
                    : loaderLog);
            }
            CudaException.ThrowIfFailed(loadStatus);

            // Look up the kernel function by name in the loaded module
            var lookupStatus = CurrentAPI.GetModuleFunction(
                out functionPtr,
                modulePtr,
                kernel.Name);
            CudaException.ThrowIfFailed(lookupStatus);
        }
Exemple #2
0
        /// <summary>
        /// Queries the memory-related device properties and stores them in the
        /// corresponding properties of this instance.
        /// </summary>
        private void InitMemoryInfo()
        {
            // Total amount of device memory
            var totalMemoryStatus = CurrentAPI.GetTotalDeviceMemory(
                out long totalMemory,
                DeviceId);
            ThrowIfFailed(totalMemoryStatus);
            MemorySize = totalMemory;

            // Maximum amount of shared memory per block
            var sharedMemoryPerBlock = CurrentAPI.GetDeviceAttribute(
                DeviceAttributeKind.CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK,
                DeviceId);
            MaxSharedMemoryPerGroup = sharedMemoryPerBlock;

            // Maximum amount of shared memory per multiprocessor
            var sharedMemoryPerMultiprocessor = CurrentAPI.GetDeviceAttribute(
                DeviceAttributeKind.
                CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_MULTIPROCESSOR,
                DeviceId);
            MaxSharedMemoryPerMultiprocessor = sharedMemoryPerMultiprocessor;

            // Total amount of constant memory
            var totalConstantMemory = CurrentAPI.GetDeviceAttribute(
                DeviceAttributeKind.CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY,
                DeviceId);
            MaxConstantMemory = totalConstantMemory;

            // Memory clock rate; the reported value is scaled down by 1000
            var reportedMemoryClockRate = CurrentAPI.GetDeviceAttribute(
                DeviceAttributeKind.CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE,
                DeviceId);
            MemoryClockRate = reportedMemoryClockRate / 1000;

            // Width of the global memory bus
            var globalMemoryBusWidth = CurrentAPI.GetDeviceAttribute(
                DeviceAttributeKind.CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH,
                DeviceId);
            MemoryBusWidth = globalMemoryBusWidth;
        }
Exemple #3
0
        /// <summary>
        /// Initializes the grid and group size limits of this device.
        /// </summary>
        private void InitGridInfo()
        {
            // Use at least three entries since three work-item sizes are read
            // unconditionally below
            int reportedDimensions = CurrentAPI.GetDeviceInfo<int>(
                DeviceId,
                CLDeviceInfoType.CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS);
            int workItemDimensions = IntrinsicMath.Max(reportedDimensions, 3);

            // OpenCL does not report maximum grid sizes. The MaxGridSize value is
            // consistent with the CPU accelerator and the values returned by Cuda
            // accelerators. MaxGridSize is ultimately constrained by system and
            // device memory and by how each kernel manages memory.
            MaxGridSize = new Index3D(int.MaxValue, ushort.MaxValue, ushort.MaxValue);

            // Maximum number of threads per group
            IntPtr maxWorkGroupSize = CurrentAPI.GetDeviceInfo<IntPtr>(
                DeviceId,
                CLDeviceInfoType.CL_DEVICE_MAX_WORK_GROUP_SIZE);
            MaxNumThreadsPerGroup = maxWorkGroupSize.ToInt32();

            // Maximum work-item sizes per dimension
            var workItemSizes = new IntPtr[workItemDimensions];
            CurrentAPI.GetDeviceInfo(
                DeviceId,
                CLDeviceInfoType.CL_DEVICE_MAX_WORK_ITEM_SIZES,
                workItemSizes);

            int maxGroupSizeX = workItemSizes[0].ToInt32();
            int maxGroupSizeY = workItemSizes[1].ToInt32();
            int maxGroupSizeZ = workItemSizes[2].ToInt32();
            MaxGroupSize = new Index3D(maxGroupSizeX, maxGroupSizeY, maxGroupSizeZ);

            // The per-multiprocessor thread limit mirrors the per-group limit
            MaxNumThreadsPerMultiprocessor = MaxNumThreadsPerGroup;
        }
Exemple #4
0
        /// <inheritdoc/>
        public unsafe override void Synchronize()
        {
            // Wait on a stack-allocated, single-element event list
            ReadOnlySpan<IntPtr> pendingEvents = stackalloc IntPtr[1] { EventPtr };

            var waitStatus = CurrentAPI.WaitForEvents(pendingEvents);
            CLException.ThrowIfFailed(waitStatus);
        }
Exemple #5
0
 /// <inheritdoc/>
 protected override void DisposeAcceleratorObject(bool disposing)
 {
     // Release the native event, verify the result and clear the handle
     var releaseStatus = CurrentAPI.clReleaseEvent(EventPtr);
     CLException.VerifyDisposed(disposing, releaseStatus);

     EventPtr = IntPtr.Zero;
 }
Exemple #6
0
 /// <summary>
 /// Determines whether the device supports the generic address space and stores
 /// the result in the capability context.
 /// </summary>
 private void InitGenericAddressSpaceSupport()
 {
     // Device versions below 2.0 are treated as not supporting the feature
     if (DeviceVersion < CLDeviceVersion.CL20)
     {
         Capabilities.GenericAddressSpace = false;
         return;
     }

     // Device versions from 2.0 up to (but excluding) 3.0 are treated as
     // supporting the feature
     if (DeviceVersion < CLDeviceVersion.CL30)
     {
         Capabilities.GenericAddressSpace = true;
         return;
     }

     // Otherwise the device has to be queried explicitly; a failing query is
     // interpreted as "not supported"
     bool isSupported;
     try
     {
         int queryResult = CurrentAPI.GetDeviceInfo<int>(
             DeviceId,
             CLDeviceInfoType.CL_DEVICE_GENERIC_ADDRESS_SPACE_SUPPORT);
         isSupported = queryResult != 0;
     }
     catch (CLException)
     {
         isSupported = false;
     }
     Capabilities.GenericAddressSpace = isSupported;
 }
Exemple #7
0
 /// <summary>
 /// Frees the native memory of this Cuda buffer and clears the native pointer.
 /// </summary>
 /// <param name="disposing">
 /// The disposing flag that is forwarded to the disposal verification.
 /// </param>
 protected override void DisposeAcceleratorObject(bool disposing)
 {
     var freeStatus = CurrentAPI.FreeMemory(NativePtr);
     CudaException.VerifyDisposed(disposing, freeStatus);

     NativePtr = IntPtr.Zero;
 }
Exemple #8
0
 /// <summary>
 /// Initializes major vendor features.
 /// </summary>
 /// <remarks>
 /// Nothing is resolved here for Nvidia and AMD devices. For all other vendors
 /// a dummy kernel is compiled and its preferred work-group size multiple is
 /// used as the warp size.
 /// </remarks>
 private void InitVendorFeatures()
 {
     // Check major vendor features
     if (Device.Vendor == CLDeviceVendor.Nvidia ||
         Device.Vendor == CLDeviceVendor.AMD)
     {
         return;
     }

     // Compile dummy kernel to resolve additional information
     CLException.ThrowIfFailed(CLKernel.LoadKernel(
         this,
         DummyKernelName,
         DummyKernelSource,
         CVersion,
         out IntPtr programPtr,
         out IntPtr kernelPtr,
         out var _));
     try
     {
         // Resolve information
         WarpSize = CurrentAPI.GetKernelWorkGroupInfo<IntPtr>(
             kernelPtr,
             DeviceId,
             CLKernelWorkGroupInfoType
             .CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE).ToInt32();
     }
     finally
     {
         // Always attempt to release both native handles before reporting an
         // error. Throwing right after a failed kernel release would skip the
         // program release and leak the program handle.
         var kernelReleaseStatus = CurrentAPI.ReleaseKernel(kernelPtr);
         var programReleaseStatus = CurrentAPI.ReleaseProgram(programPtr);

         CLException.ThrowIfFailed(kernelReleaseStatus);
         CLException.ThrowIfFailed(programReleaseStatus);
     }
 }
Exemple #9
0
        /// <summary>
        /// Queries general device properties (name, clock rate, warp size,
        /// multiprocessor information and driver mode) and stores them in the
        /// corresponding properties of this instance.
        /// </summary>
        private void InitDeviceInfo()
        {
            // Device name
            var nameStatus = CurrentAPI.GetDeviceName(out string deviceName, DeviceId);
            ThrowIfFailed(nameStatus);
            Name = deviceName;

            // Clock rate; the reported value is scaled down by 1000
            var reportedClockRate = CurrentAPI.GetDeviceAttribute(
                DeviceAttributeKind.CU_DEVICE_ATTRIBUTE_CLOCK_RATE,
                DeviceId);
            ClockRate = reportedClockRate / 1000;

            // Warp size
            var warpSize = CurrentAPI.GetDeviceAttribute(
                DeviceAttributeKind.CU_DEVICE_ATTRIBUTE_WARP_SIZE,
                DeviceId);
            WarpSize = warpSize;

            // Number of multiprocessors
            var multiprocessorCount = CurrentAPI.GetDeviceAttribute(
                DeviceAttributeKind.CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT,
                DeviceId);
            NumMultiprocessors = multiprocessorCount;

            // Maximum number of threads per multiprocessor
            var threadsPerMultiprocessor = CurrentAPI.GetDeviceAttribute(
                DeviceAttributeKind.CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR,
                DeviceId);
            MaxNumThreadsPerMultiprocessor = threadsPerMultiprocessor;

            // Current driver mode, derived from the TCC driver attribute
            var tccDriverAttribute = CurrentAPI.GetDeviceAttribute(
                DeviceAttributeKind.CU_DEVICE_ATTRIBUTE_TCC_DRIVER,
                DeviceId);
            DriverMode = (DeviceDriverMode)tccDriverAttribute;
        }
Exemple #10
0
        /// <summary>
        /// Enumerates all OpenCL platforms and registers every valid and available
        /// device of each platform in the given registry.
        /// </summary>
        /// <param name="predicate">
        /// The predicate to include a given device.
        /// </param>
        /// <param name="registry">The registry to add all devices to.</param>
        private static void GetDevicesInternal(
            Predicate<CLDevice> predicate,
            DeviceRegistry registry)
        {
            // Shared scratch buffer that is reused for every platform
            var deviceBuffer = new IntPtr[MaxNumDevicesPerPlatform];

            // Bail out if OpenCL is not usable or does not expose any platform
            if (!CurrentAPI.IsSupported)
            {
                return;
            }
            if (CurrentAPI.GetNumPlatforms(out int numPlatforms) != CLError.CL_SUCCESS)
            {
                return;
            }
            if (numPlatforms < 1)
            {
                return;
            }

            var platforms = new IntPtr[numPlatforms];
            var platformStatus = CurrentAPI.GetPlatforms(platforms, ref numPlatforms);
            if (platformStatus != CLError.CL_SUCCESS)
            {
                return;
            }

            for (int platformIndex = 0;
                platformIndex < platforms.Length;
                ++platformIndex)
            {
                var platform = platforms[platformIndex];

                // Reset the scratch buffer and query the devices of this platform;
                // platforms whose query fails are skipped
                Array.Clear(deviceBuffer, 0, deviceBuffer.Length);
                var deviceStatus = CurrentAPI.GetDevices(
                    platform,
                    CLDeviceType.CL_DEVICE_TYPE_ALL,
                    deviceBuffer,
                    out int numDevices);
                if (deviceStatus != CLError.CL_SUCCESS)
                {
                    continue;
                }

                for (int deviceIndex = 0; deviceIndex < numDevices; ++deviceIndex)
                {
                    var device = deviceBuffer[deviceIndex];

                    // Ignore invalid handles and devices that are not available
                    if (device == IntPtr.Zero ||
                        CurrentAPI.GetDeviceInfo<int>(
                            device,
                            CLDeviceInfoType.CL_DEVICE_AVAILABLE) == 0)
                    {
                        continue;
                    }

                    registry.Register(new CLDevice(platform, device), predicate);
                }
            }
        }
Exemple #11
0
        /// <summary>
        /// Estimates a group size for the given OpenCL kernel based on the
        /// work-group size that the device reports for it.
        /// </summary>
        /// <param name="kernel">The kernel; has to be an OpenCL kernel.</param>
        /// <param name="dynamicSharedMemorySizeInBytes">
        /// The amount of dynamic shared memory in bytes. Values greater than zero
        /// are rejected.
        /// </param>
        /// <param name="maxGroupSize">
        /// The maximum group size. Values smaller than one are replaced by the
        /// maximum number of threads per group.
        /// </param>
        /// <param name="minGridSize">
        /// The number of groups that is required to cover the maximum number of
        /// threads with the estimated group size (rounded up).
        /// </param>
        /// <returns>The estimated group size.</returns>
        /// <exception cref="ArgumentOutOfRangeException">
        /// Thrown if <paramref name="dynamicSharedMemorySizeInBytes"/> is greater
        /// than zero.
        /// </exception>
        /// <exception cref="ArgumentException">
        /// Thrown if <paramref name="kernel"/> is null or not an OpenCL kernel.
        /// </exception>
        protected override int EstimateGroupSizeInternal(
            Kernel kernel,
            int dynamicSharedMemorySizeInBytes,
            int maxGroupSize,
            out int minGridSize)
        {
            if (dynamicSharedMemorySizeInBytes > 0)
            {
                throw new ArgumentOutOfRangeException(
                    nameof(dynamicSharedMemorySizeInBytes));
            }

            if (maxGroupSize < 1)
            {
                maxGroupSize = MaxNumThreadsPerGroup;
            }

            // Reject foreign or missing kernels explicitly instead of failing with
            // a NullReferenceException when the kernel pointer is accessed
            if (!(kernel is CLKernel clKernel))
            {
                throw new ArgumentException(
                    "The kernel must be a non-null OpenCL kernel.",
                    nameof(kernel));
            }

            var workGroupSizeNative = CurrentAPI.GetKernelWorkGroupInfo<IntPtr>(
                clKernel.KernelPtr,
                DeviceId,
                CLKernelWorkGroupInfoType.CL_KERNEL_WORK_GROUP_SIZE);

            // Clamp the reported work-group size to the requested maximum
            int workGroupSize = IntrinsicMath.Min(
                workGroupSizeNative.ToInt32(),
                maxGroupSize);
            minGridSize = IntrinsicMath.DivRoundUp(MaxNumThreads, workGroupSize);

            return workGroupSize;
        }
Exemple #12
0
        /// <summary>
        /// Constructs a new OpenCL accelerator reference.
        /// </summary>
        /// <remarks>
        /// Validates both ids, stores them and then runs the individual
        /// initialization steps in a fixed order before the capability context is
        /// created.
        /// </remarks>
        /// <param name="platformId">The OpenCL platform id.</param>
        /// <param name="deviceId">The OpenCL device id.</param>
        /// <exception cref="ArgumentOutOfRangeException">
        /// Thrown if <paramref name="platformId"/> or <paramref name="deviceId"/>
        /// is <see cref="IntPtr.Zero"/>.
        /// </exception>
        public CLDevice(IntPtr platformId, IntPtr deviceId)
        {
            // Reject null handles before touching any native API
            if (platformId == IntPtr.Zero)
            {
                throw new ArgumentOutOfRangeException(nameof(platformId));
            }
            if (deviceId == IntPtr.Zero)
            {
                throw new ArgumentOutOfRangeException(nameof(deviceId));
            }

            Backends.Backend.EnsureRunningOnNativePlatform();

            // The ids have to be assigned before the Init* methods are invoked
            PlatformId = platformId;
            DeviceId   = deviceId;

            InitPlatformInfo();
            InitDeviceInfo();
            InitGridInfo();
            InitVendorAndWarpSizeInfo();
            InitMemoryInfo();
            InitCInfo();
            InitExtensions();

            // Resolve extension method
            getKernelSubGroupInfo = CurrentAPI.GetExtension <clGetKernelSubGroupInfoKHR>(
                platformId);

            // Init capabilities
            // NOTE: the capability context has to exist before the generic address
            // space support is resolved, since that step writes into Capabilities
            Capabilities = new CLCapabilityContext(this);
            InitGenericAddressSpaceSupport();
        }
Exemple #13
0
        /// <summary>
        /// Loads the binary representation of the given OpenCL kernel.
        /// </summary>
        /// <remarks>
        /// Both queries pass a buffer with room for a single pointer-sized entry,
        /// i.e. this presumably assumes a program that was built for exactly one
        /// device (TODO confirm against the callers).
        /// </remarks>
        /// <param name="program">The program pointer.</param>
        /// <returns>
        /// The binary representation of the underlying kernel. The array is empty
        /// if the reported binary size is zero.
        /// </returns>
        public static unsafe byte[] LoadBinaryRepresentation(IntPtr program)
        {
            // Query the size of the program binary
            IntPtr kernelSize;
            CLException.ThrowIfFailed(
                CurrentAPI.GetProgramInfo(
                    program,
                    CLProgramInfo.CL_PROGRAM_BINARY_SIZES,
                    new IntPtr(IntPtr.Size),
                    &kernelSize,
                    out var _));

            var programBinary = new byte[kernelSize.ToInt32()];

            // Nothing to fetch for an empty binary; pinning the first element of
            // an empty array would raise an IndexOutOfRangeException
            if (programBinary.Length < 1)
            {
                return programBinary;
            }

            fixed (byte* binPtr = &programBinary[0])
            {
                // The query expects a list of target pointers; pass the address
                // of a local that holds the pinned buffer pointer
                byte* binaryTarget = binPtr;
                CLException.ThrowIfFailed(
                    CurrentAPI.GetProgramInfo(
                        program,
                        CLProgramInfo.CL_PROGRAM_BINARIES,
                        new IntPtr(IntPtr.Size),
                        &binaryTarget,
                        out var _));
            }

            return programBinary;
        }
Exemple #14
0
        /// <summary>
        /// Queries general device properties (name, type, version, clock rate and
        /// number of compute units) and stores them in this instance.
        /// </summary>
        private void InitDeviceInfo()
        {
            // Device name
            Name = CurrentAPI.GetDeviceInfo(
                DeviceId,
                CLDeviceInfoType.CL_DEVICE_NAME);

            // Device type
            long deviceTypeValue = CurrentAPI.GetDeviceInfo<long>(
                DeviceId,
                CLDeviceInfoType.CL_DEVICE_TYPE);
            DeviceType = (CLDeviceType)deviceTypeValue;

            // Device version; falls back to CL10 if the version string cannot be
            // parsed
            var versionString = CurrentAPI.GetDeviceInfo(
                DeviceId,
                CLDeviceInfoType.CL_DEVICE_VERSION);
            if (CLDeviceVersion.TryParse(versionString, out var parsedVersion))
            {
                DeviceVersion = parsedVersion;
            }
            else
            {
                DeviceVersion = CLDeviceVersion.CL10;
            }

            // Clock rate
            int maxClockFrequency = CurrentAPI.GetDeviceInfo<int>(
                DeviceId,
                CLDeviceInfoType.CL_DEVICE_MAX_CLOCK_FREQUENCY);
            ClockRate = maxClockFrequency;

            // Number of multiprocessors
            int maxComputeUnits = CurrentAPI.GetDeviceInfo<int>(
                DeviceId,
                CLDeviceInfoType.CL_DEVICE_MAX_COMPUTE_UNITS);
            NumMultiprocessors = maxComputeUnits;
        }
Exemple #15
0
        /// <summary>
        /// Init grid information.
        /// </summary>
        /// <remarks>
        /// CL_DEVICE_MAX_WORK_ITEM_SIZES describes the per-dimension limits of a
        /// single work-group. These values are therefore stored as the maximum
        /// group size and not as the maximum grid size.
        /// </remarks>
        private void InitGridInfo()
        {
            // Use at least three entries since three work-item sizes are read
            // unconditionally below
            int workItemDimensions = IntrinsicMath.Max(
                CurrentAPI.GetDeviceInfo<int>(
                    DeviceId,
                    CLDeviceInfoType.CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS),
                3);

            // OpenCL does not report a maximum grid size. Use fixed upper bounds
            // instead of the per-work-group limits.
            MaxGridSize = new Index3D(int.MaxValue, ushort.MaxValue, ushort.MaxValue);

            // Resolve max threads per group
            MaxNumThreadsPerGroup = CurrentAPI.GetDeviceInfo<IntPtr>(
                DeviceId,
                CLDeviceInfoType.CL_DEVICE_MAX_WORK_GROUP_SIZE).ToInt32();

            // Resolve the maximum work-item sizes per dimension
            var workItemSizes = new IntPtr[workItemDimensions];
            CurrentAPI.GetDeviceInfo(
                DeviceId,
                CLDeviceInfoType.CL_DEVICE_MAX_WORK_ITEM_SIZES,
                workItemSizes);
            MaxGroupSize = new Index3D(
                workItemSizes[0].ToInt32(),
                workItemSizes[1].ToInt32(),
                workItemSizes[2].ToInt32());

            // Resolve max number of threads per multiprocessor
            MaxNumThreadsPerMultiprocessor = MaxNumThreadsPerGroup;
        }
Exemple #16
0
        /// <summary>
        /// Copies the contents of the given source view into this buffer using an
        /// asynchronous memory copy on the given stream.
        /// </summary>
        /// <param name="stream">The stream to enqueue the copy on.</param>
        /// <param name="source">
        /// The source view; has to belong to a CPU or a Cuda accelerator.
        /// </param>
        /// <param name="targetOffset">The target offset within this buffer.</param>
        /// <exception cref="NotSupportedException">
        /// Thrown if the accelerator type of the source view is not supported.
        /// </exception>
        protected internal unsafe override void CopyFromView(
            AcceleratorStream stream,
            ArrayView<T> source,
            LongIndex1 targetOffset)
        {
            var binding = Accelerator.BindScoped();
            try
            {
                var sourceAddress = new IntPtr(source.LoadEffectiveAddress());
                var targetAddress = new IntPtr(ComputeEffectiveAddress(targetOffset));
                var lengthInBytes = new IntPtr(source.LengthInBytes);

                switch (source.AcceleratorType)
                {
                    case AcceleratorType.CPU:
                    case AcceleratorType.Cuda:
                        CudaException.ThrowIfFailed(
                            CurrentAPI.MemcpyAsync(
                                targetAddress,
                                sourceAddress,
                                lengthInBytes,
                                stream));
                        break;

                    default:
                        throw new NotSupportedException(
                            RuntimeErrorMessages.NotSupportedTargetAccelerator);
                }
            }
            finally
            {
                // Recover the previous binding even if the copy failed or the
                // source accelerator type is not supported
                binding.Recover();
            }
        }
Exemple #17
0
 /// <summary>
 /// Releases the native OpenCL buffer and clears the native pointer.
 /// </summary>
 /// <param name="disposing">
 /// The disposing flag that is forwarded to the disposal verification.
 /// </param>
 protected override void DisposeAcceleratorObject(bool disposing)
 {
     var releaseStatus = CurrentAPI.ReleaseBuffer(NativePtr);
     CLException.VerifyDisposed(disposing, releaseStatus);

     NativePtr = IntPtr.Zero;
 }
Exemple #18
0
 /// <summary>
 /// Fills the whole buffer with zero bytes using the command queue of the
 /// given stream.
 /// </summary>
 /// <param name="stream">The stream; has to be an OpenCL stream.</param>
 public override void MemSetToZero(AcceleratorStream stream)
 {
     var commandQueue = ((CLStream)stream).CommandQueue;

     // Byte pattern 0, starting at offset zero, covering the full length
     var fillStatus = CurrentAPI.FillBuffer<byte>(
         commandQueue,
         NativePtr,
         0,
         IntPtr.Zero,
         new IntPtr(LengthInBytes));
     CLException.ThrowIfFailed(fillStatus);
 }
Exemple #19
0
        /// <inheritdoc/>
        protected override ProfilingMarker AddProfilingMarkerInternal()
        {
            // Create a new marker and record its event on this stream
            var marker = new CudaProfilingMarker();
            var recordStatus = CurrentAPI.RecordEvent(marker.EventPtr, StreamPtr);
            CudaException.ThrowIfFailed(recordStatus);

            return marker;
        }
Exemple #20
0
 /// <summary>
 /// Destroys the native module of this Cuda kernel and clears the module and
 /// function pointers.
 /// </summary>
 /// <param name="disposing">
 /// The disposing flag that is forwarded to the disposal verification.
 /// </param>
 protected override void DisposeAcceleratorObject(bool disposing)
 {
     var destroyStatus = CurrentAPI.DestroyModule(modulePtr);
     CudaException.VerifyDisposed(disposing, destroyStatus);

     // Both handles are stale once the module has been destroyed
     modulePtr = IntPtr.Zero;
     functionPtr = IntPtr.Zero;
 }
Exemple #21
0
 /// <summary>
 /// Frees the native host memory (if any) and forwards the call to the base
 /// implementation.
 /// </summary>
 /// <param name="disposing">The flag passed on to the base class.</param>
 protected override void Dispose(bool disposing)
 {
     var hostPtr = NativePtr;
     if (hostPtr != IntPtr.Zero)
     {
         // The result of the free operation is intentionally not checked
         CurrentAPI.FreeHostMemory(hostPtr);
         NativePtr = IntPtr.Zero;
     }

     base.Dispose(disposing);
 }
Exemple #22
0
 /// <summary>
 /// Creates a new native Cuda stream with the given <see cref="StreamFlags"/>
 /// and marks this instance as responsible for the created handle.
 /// </summary>
 /// <param name="accelerator">The associated accelerator.</param>
 /// <param name="flag">
 /// Stream flag to use. Allows blocking and non-blocking streams.
 /// </param>
 internal CudaStream(Accelerator accelerator, StreamFlags flag)
     : base(accelerator)
 {
     var createStatus = CurrentAPI.CreateStream(out streamPtr, flag);
     CudaException.ThrowIfFailed(createStatus);

     // The handle was created here, so this instance owns it
     responsibleForHandle = true;
 }
Exemple #23
0
 /// <summary>
 /// Sets kernel argument 0 using the dynamic array size of the shared memory
 /// configuration as its size and a null value pointer.
 /// </summary>
 /// <param name="stream">The stream (not used by this method).</param>
 /// <param name="kernel">The kernel whose argument is set.</param>
 /// <param name="config">
 /// The kernel configuration providing the shared memory configuration.
 /// </param>
 /// <returns>The error status of the underlying API call.</returns>
 public readonly CLError PreLaunchKernel(
     CLStream stream,
     CLKernel kernel,
     RuntimeKernelConfig config)
 {
     // NOTE(review): presumably this reserves the dynamic shared memory of the
     // kernel -- confirm against the kernel argument layout
     var dynamicArraySize = config.SharedMemoryConfig.DynamicArraySize;

     return CurrentAPI.SetKernelArgumentUnsafeWithKernel(
         kernel,
         0,
         dynamicArraySize,
         null);
 }
Exemple #24
0
        /// <summary>
        /// Synchronizes all queued operations of this stream.
        /// </summary>
        /// <remarks>
        /// The accelerator is bound for the duration of the call and the previous
        /// binding is recovered afterwards, even if the synchronization fails.
        /// </remarks>
        public override void Synchronize()
        {
            var binding = Accelerator.BindScoped();
            try
            {
                CudaException.ThrowIfFailed(
                    CurrentAPI.SynchronizeStream(streamPtr));
            }
            finally
            {
                // Without the finally block a failing synchronization would leave
                // the scoped binding active
                binding.Recover();
            }
        }
Exemple #25
0
        /// <inheritdoc/>
        protected override ProfilingMarker AddProfilingMarkerInternal()
        {
            // Keep the accelerator bound while the marker is created and recorded
            using var binding = Accelerator.BindScoped();

            var marker = new CudaProfilingMarker(Accelerator);
            var recordStatus = CurrentAPI.RecordEvent(marker.EventPtr, StreamPtr);
            CudaException.ThrowIfFailed(recordStatus);

            return marker;
        }
Exemple #26
0
        /// <inheritdoc/>
        public unsafe override void Synchronize()
        {
            // Keep the accelerator bound while waiting for the event
            using var binding = Accelerator.BindScoped();

            // Wait on a stack-allocated, single-element event list
            ReadOnlySpan<IntPtr> pendingEvents = stackalloc IntPtr[1] { EventPtr };
            var waitStatus = CurrentAPI.WaitForEvents(pendingEvents);
            CLException.ThrowIfFailed(waitStatus);
        }
Exemple #27
0
 /// <summary>
 /// Disposes the base resources and releases the native OpenCL context.
 /// </summary>
 /// <param name="disposing">
 /// The disposing flag. A failed context release is reported only if this
 /// flag is true.
 /// </param>
 protected override void Dispose(bool disposing)
 {
     base.Dispose(disposing);
     if (contextPtr == IntPtr.Zero)
     {
         return;
     }

     // Reset the handle before reporting an error so that a failed release is
     // never attempted a second time
     var releaseStatus = CurrentAPI.ReleaseContext(contextPtr);
     contextPtr = IntPtr.Zero;

     // Do not throw when invoked with disposing == false (conventionally the
     // finalizer path of the dispose pattern)
     if (disposing)
     {
         CLException.ThrowIfFailed(releaseStatus);
     }
 }
Exemple #28
0
 /// <summary>
 /// Frees the native Cuda memory (if any) and forwards the call to the base
 /// implementation.
 /// </summary>
 /// <param name="disposing">
 /// The disposing flag. A failed free operation is reported only if this flag
 /// is true.
 /// </param>
 protected override void Dispose(bool disposing)
 {
     try
     {
         if (NativePtr != IntPtr.Zero)
         {
             // Reset the pointer before reporting an error so that a failed
             // free is never attempted a second time
             var freeStatus = CurrentAPI.FreeMemory(NativePtr);
             NativePtr = IntPtr.Zero;

             // Do not throw when invoked with disposing == false
             // (conventionally the finalizer path of the dispose pattern)
             if (disposing)
             {
                 CudaException.ThrowIfFailed(freeStatus);
             }
         }
     }
     finally
     {
         // The base implementation has to run even if the free failed
         base.Dispose(disposing);
     }
 }
Exemple #29
0
 /// <summary>
 /// Releases the native OpenCL buffer (if any) and forwards the call to the
 /// base implementation.
 /// </summary>
 /// <param name="disposing">
 /// The disposing flag. A failed release is reported only if this flag is
 /// true.
 /// </param>
 protected override void Dispose(bool disposing)
 {
     try
     {
         if (NativePtr != IntPtr.Zero)
         {
             // Reset the pointer before reporting an error so that a failed
             // release is never attempted a second time
             var releaseStatus = CurrentAPI.ReleaseBuffer(NativePtr);
             NativePtr = IntPtr.Zero;

             // Do not throw when invoked with disposing == false
             // (conventionally the finalizer path of the dispose pattern)
             if (disposing)
             {
                 CLException.ThrowIfFailed(releaseStatus);
             }
         }
     }
     finally
     {
         // The base implementation has to run even if the release failed
         base.Dispose(disposing);
     }
 }
Exemple #30
0
            /// <summary>
            /// Frees the native Cuda host memory if it has not been freed yet and
            /// clears the native pointer.
            /// </summary>
            /// <param name="disposing">The disposing flag (not used here).</param>
            protected override void DisposeAcceleratorObject(bool disposing)
            {
                if (NativePtr != IntPtr.Zero)
                {
                    // The result of the free operation is intentionally not checked
                    CurrentAPI.FreeHostMemory(NativePtr);
                    NativePtr = IntPtr.Zero;
                }
            }