/// <summary>
/// Loads a compiled kernel into the given Cuda context as kernel program.
/// </summary>
/// <param name="accelerator">The associated accelerator.</param>
/// <param name="kernel">The source kernel.</param>
/// <param name="launcher">The launcher method for the given kernel.</param>
internal CudaKernel(
    CudaAccelerator accelerator,
    PTXCompiledKernel kernel,
    MethodInfo launcher)
    : base(accelerator, kernel, launcher)
{
    var kernelLoaded = CurrentAPI.LoadModule(
        out modulePtr,
        kernel.PTXAssembly,
        out string errorLog);
    if (kernelLoaded != CudaError.CUDA_SUCCESS)
    {
        Trace.WriteLine("PTX Kernel loading failed:");
        if (string.IsNullOrWhiteSpace(errorLog))
        {
            Trace.WriteLine(">> No error information available");
        }
        else
        {
            Trace.WriteLine(errorLog);
        }
    }
    CudaException.ThrowIfFailed(kernelLoaded);

    CudaException.ThrowIfFailed(
        CurrentAPI.GetModuleFunction(
            out functionPtr,
            modulePtr,
            kernel.Name));
}
/// <summary>
/// Init memory information.
/// </summary>
private void InitMemoryInfo()
{
    // Resolve the total memory size
    ThrowIfFailed(
        CurrentAPI.GetTotalDeviceMemory(out long total, DeviceId));
    MemorySize = total;

    // Resolve max shared memory per block
    MaxSharedMemoryPerGroup = CurrentAPI.GetDeviceAttribute(
        DeviceAttributeKind.CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK,
        DeviceId);

    // Resolve the maximum amount of shared memory per multiprocessor
    MaxSharedMemoryPerMultiprocessor = CurrentAPI.GetDeviceAttribute(
        DeviceAttributeKind.
            CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_MULTIPROCESSOR,
        DeviceId);

    // Resolve total constant memory
    MaxConstantMemory = CurrentAPI.GetDeviceAttribute(
        DeviceAttributeKind.CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY,
        DeviceId);

    // Resolve memory clock rate
    MemoryClockRate = CurrentAPI.GetDeviceAttribute(
        DeviceAttributeKind.CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE,
        DeviceId) / 1000;

    // Resolve the bus width
    MemoryBusWidth = CurrentAPI.GetDeviceAttribute(
        DeviceAttributeKind.CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH,
        DeviceId);
}
/// <summary>
/// Init grid information.
/// </summary>
private void InitGridInfo()
{
    int workItemDimensions = IntrinsicMath.Max(
        CurrentAPI.GetDeviceInfo<int>(
            DeviceId,
            CLDeviceInfoType.CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS),
        3);

    // OpenCL does not report maximum grid sizes; this MaxGridSize value is
    // consistent with the CPU accelerator and the values returned by CUDA
    // accelerators. MaxGridSize is ultimately constrained by system and
    // device memory and by how each kernel manages memory.
    MaxGridSize = new Index3D(int.MaxValue, ushort.MaxValue, ushort.MaxValue);

    // Resolve max threads per group
    MaxNumThreadsPerGroup = CurrentAPI.GetDeviceInfo<IntPtr>(
        DeviceId,
        CLDeviceInfoType.CL_DEVICE_MAX_WORK_GROUP_SIZE).ToInt32();

    // Max work item thread dimensions
    var workItemSizes = new IntPtr[workItemDimensions];
    CurrentAPI.GetDeviceInfo(
        DeviceId,
        CLDeviceInfoType.CL_DEVICE_MAX_WORK_ITEM_SIZES,
        workItemSizes);
    MaxGroupSize = new Index3D(
        workItemSizes[0].ToInt32(),
        workItemSizes[1].ToInt32(),
        workItemSizes[2].ToInt32());

    // Resolve max number of threads per multiprocessor
    MaxNumThreadsPerMultiprocessor = MaxNumThreadsPerGroup;
}
/// <inheritdoc/>
public unsafe override void Synchronize()
{
    ReadOnlySpan<IntPtr> events = stackalloc[] { EventPtr };
    CLException.ThrowIfFailed(
        CurrentAPI.WaitForEvents(events));
}
/// <inheritdoc/>
protected override void DisposeAcceleratorObject(bool disposing)
{
    CLException.VerifyDisposed(
        disposing,
        CurrentAPI.clReleaseEvent(EventPtr));
    EventPtr = IntPtr.Zero;
}
private void InitGenericAddressSpaceSupport()
{
    if (DeviceVersion < CLDeviceVersion.CL20)
    {
        Capabilities.GenericAddressSpace = false;
    }
    else if (DeviceVersion < CLDeviceVersion.CL30)
    {
        Capabilities.GenericAddressSpace = true;
    }
    else
    {
        try
        {
            Capabilities.GenericAddressSpace =
                CurrentAPI.GetDeviceInfo<int>(
                    DeviceId,
                    CLDeviceInfoType.CL_DEVICE_GENERIC_ADDRESS_SPACE_SUPPORT)
                != 0;
        }
        catch (CLException)
        {
            Capabilities.GenericAddressSpace = false;
        }
    }
}
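// Background for the branches above: pre-2.0 devices never expose a generic
// address space, OpenCL 2.x requires it as a core feature, and OpenCL 3.0
// made it optional again. That is why 3.0+ devices are probed explicitly via
// CL_DEVICE_GENERIC_ADDRESS_SPACE_SUPPORT, falling back to false if the
// query is not understood by the runtime.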
/// <summary>
/// Disposes this Cuda buffer.
/// </summary>
protected override void DisposeAcceleratorObject(bool disposing)
{
    CudaException.VerifyDisposed(
        disposing,
        CurrentAPI.FreeMemory(NativePtr));
    NativePtr = IntPtr.Zero;
}
/// <summary>
/// Initializes major vendor features.
/// </summary>
private void InitVendorFeatures()
{
    // Check major vendor features
    if (Device.Vendor == CLDeviceVendor.Nvidia ||
        Device.Vendor == CLDeviceVendor.AMD)
    {
        return;
    }

    // Compile dummy kernel to resolve additional information
    CLException.ThrowIfFailed(CLKernel.LoadKernel(
        this,
        DummyKernelName,
        DummyKernelSource,
        CVersion,
        out IntPtr programPtr,
        out IntPtr kernelPtr,
        out var _));
    try
    {
        // Resolve information
        WarpSize = CurrentAPI.GetKernelWorkGroupInfo<IntPtr>(
            kernelPtr,
            DeviceId,
            CLKernelWorkGroupInfoType
                .CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE).ToInt32();
    }
    finally
    {
        CLException.ThrowIfFailed(
            CurrentAPI.ReleaseKernel(kernelPtr));
        CLException.ThrowIfFailed(
            CurrentAPI.ReleaseProgram(programPtr));
    }
}
/// <summary>
/// Init general device information.
/// </summary>
private void InitDeviceInfo()
{
    // Get the device name
    ThrowIfFailed(
        CurrentAPI.GetDeviceName(out string name, DeviceId));
    Name = name;

    // Resolve clock rate
    ClockRate = CurrentAPI.GetDeviceAttribute(
        DeviceAttributeKind.CU_DEVICE_ATTRIBUTE_CLOCK_RATE,
        DeviceId) / 1000;

    // Resolve warp size
    WarpSize = CurrentAPI.GetDeviceAttribute(
        DeviceAttributeKind.CU_DEVICE_ATTRIBUTE_WARP_SIZE,
        DeviceId);

    // Resolve number of multiprocessors
    NumMultiprocessors = CurrentAPI.GetDeviceAttribute(
        DeviceAttributeKind.CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT,
        DeviceId);

    // Resolve max number of threads per multiprocessor
    MaxNumThreadsPerMultiprocessor = CurrentAPI.GetDeviceAttribute(
        DeviceAttributeKind.CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR,
        DeviceId);

    // Resolve the current driver mode
    DriverMode = (DeviceDriverMode)CurrentAPI.GetDeviceAttribute(
        DeviceAttributeKind.CU_DEVICE_ATTRIBUTE_TCC_DRIVER,
        DeviceId);
}
/// <summary>
/// Detects OpenCL devices.
/// </summary>
/// <param name="predicate">
/// The predicate to include a given device.
/// </param>
/// <param name="registry">The registry to add all devices to.</param>
private static void GetDevicesInternal(
    Predicate<CLDevice> predicate,
    DeviceRegistry registry)
{
    var devices = new IntPtr[MaxNumDevicesPerPlatform];

    // Resolve all platforms
    if (!CurrentAPI.IsSupported ||
        CurrentAPI.GetNumPlatforms(out int numPlatforms) !=
            CLError.CL_SUCCESS ||
        numPlatforms < 1)
    {
        return;
    }

    var platforms = new IntPtr[numPlatforms];
    if (CurrentAPI.GetPlatforms(platforms, ref numPlatforms) !=
        CLError.CL_SUCCESS)
    {
        return;
    }

    foreach (var platform in platforms)
    {
        // Resolve all devices
        int numDevices = devices.Length;
        Array.Clear(devices, 0, numDevices);
        if (CurrentAPI.GetDevices(
            platform,
            CLDeviceType.CL_DEVICE_TYPE_ALL,
            devices,
            out numDevices) != CLError.CL_SUCCESS)
        {
            continue;
        }

        for (int i = 0; i < numDevices; ++i)
        {
            // Resolve device and ignore invalid devices
            var device = devices[i];
            if (device == IntPtr.Zero)
            {
                continue;
            }

            // Check for available device
            if (CurrentAPI.GetDeviceInfo<int>(
                device,
                CLDeviceInfoType.CL_DEVICE_AVAILABLE) == 0)
            {
                continue;
            }

            var desc = new CLDevice(platform, device);
            registry.Register(desc, predicate);
        }
    }
}
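// Note on the enumeration above: it follows the usual two-phase OpenCL query
// pattern (first ask how many platforms exist, then fetch that many handles)
// and silently skips platforms that fail to enumerate as well as devices the
// runtime reports but marks as unavailable (CL_DEVICE_AVAILABLE == 0).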
/// <summary cref="Accelerator.EstimateGroupSizeInternal( /// Kernel, int, int, out int)"/> protected override int EstimateGroupSizeInternal( Kernel kernel, int dynamicSharedMemorySizeInBytes, int maxGroupSize, out int minGridSize) { if (dynamicSharedMemorySizeInBytes > 0) { throw new ArgumentOutOfRangeException( nameof(dynamicSharedMemorySizeInBytes)); } if (maxGroupSize < 1) { maxGroupSize = MaxNumThreadsPerGroup; } var clKernel = kernel as CLKernel; var workGroupSizeNative = CurrentAPI.GetKernelWorkGroupInfo <IntPtr>( clKernel.KernelPtr, DeviceId, CLKernelWorkGroupInfoType.CL_KERNEL_WORK_GROUP_SIZE); int workGroupSize = workGroupSizeNative.ToInt32(); workGroupSize = IntrinsicMath.Min(workGroupSize, maxGroupSize); minGridSize = IntrinsicMath.DivRoundUp(MaxNumThreads, workGroupSize); return(workGroupSize); }
/// <summary>
/// Constructs a new OpenCL accelerator reference.
/// </summary>
/// <param name="platformId">The OpenCL platform id.</param>
/// <param name="deviceId">The OpenCL device id.</param>
public CLDevice(IntPtr platformId, IntPtr deviceId)
{
    if (platformId == IntPtr.Zero)
    {
        throw new ArgumentOutOfRangeException(nameof(platformId));
    }
    if (deviceId == IntPtr.Zero)
    {
        throw new ArgumentOutOfRangeException(nameof(deviceId));
    }

    Backends.Backend.EnsureRunningOnNativePlatform();

    PlatformId = platformId;
    DeviceId = deviceId;

    InitPlatformInfo();
    InitDeviceInfo();
    InitGridInfo();
    InitVendorAndWarpSizeInfo();
    InitMemoryInfo();
    InitCInfo();
    InitExtensions();

    // Resolve extension method
    getKernelSubGroupInfo = CurrentAPI.GetExtension<clGetKernelSubGroupInfoKHR>(
        platformId);

    // Init capabilities
    Capabilities = new CLCapabilityContext(this);
    InitGenericAddressSpaceSupport();
}
/// <summary>
/// Loads the binary representation of the given OpenCL kernel.
/// </summary>
/// <param name="program">The program pointer.</param>
/// <returns>The binary representation of the underlying kernel.</returns>
public static unsafe byte[] LoadBinaryRepresentation(IntPtr program)
{
    IntPtr kernelSize;
    CLException.ThrowIfFailed(
        CurrentAPI.GetProgramInfo(
            program,
            CLProgramInfo.CL_PROGRAM_BINARY_SIZES,
            new IntPtr(IntPtr.Size),
            &kernelSize,
            out var _));

    var programBinary = new byte[kernelSize.ToInt32()];
    fixed (byte* binPtr = &programBinary[0])
    {
        CLException.ThrowIfFailed(
            CurrentAPI.GetProgramInfo(
                program,
                CLProgramInfo.CL_PROGRAM_BINARIES,
                new IntPtr(IntPtr.Size),
                &binPtr,
                out var _));
    }
    return programBinary;
}
/// <summary>
/// Init general device information.
/// </summary>
private void InitDeviceInfo()
{
    // Resolve general device information
    Name = CurrentAPI.GetDeviceInfo(
        DeviceId,
        CLDeviceInfoType.CL_DEVICE_NAME);
    DeviceType = (CLDeviceType)CurrentAPI.GetDeviceInfo<long>(
        DeviceId,
        CLDeviceInfoType.CL_DEVICE_TYPE);
    DeviceVersion = CLDeviceVersion.TryParse(
        CurrentAPI.GetDeviceInfo(
            DeviceId,
            CLDeviceInfoType.CL_DEVICE_VERSION),
        out var deviceVersion)
        ? deviceVersion
        : CLDeviceVersion.CL10;

    // Resolve clock rate
    ClockRate = CurrentAPI.GetDeviceInfo<int>(
        DeviceId,
        CLDeviceInfoType.CL_DEVICE_MAX_CLOCK_FREQUENCY);

    // Resolve number of multiprocessors
    NumMultiprocessors = CurrentAPI.GetDeviceInfo<int>(
        DeviceId,
        CLDeviceInfoType.CL_DEVICE_MAX_COMPUTE_UNITS);
}
/// <summary>
/// Init grid information.
/// </summary>
private void InitGridInfo()
{
    // Max grid size
    int workItemDimensions = IntrinsicMath.Max(
        CurrentAPI.GetDeviceInfo<int>(
            DeviceId,
            CLDeviceInfoType.CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS),
        3);
    var workItemSizes = new IntPtr[workItemDimensions];
    CurrentAPI.GetDeviceInfo(
        DeviceId,
        CLDeviceInfoType.CL_DEVICE_MAX_WORK_ITEM_SIZES,
        workItemSizes);
    MaxGridSize = new Index3D(
        workItemSizes[0].ToInt32(),
        workItemSizes[1].ToInt32(),
        workItemSizes[2].ToInt32());

    // Resolve max threads per group
    MaxNumThreadsPerGroup = CurrentAPI.GetDeviceInfo<IntPtr>(
        DeviceId,
        CLDeviceInfoType.CL_DEVICE_MAX_WORK_GROUP_SIZE).ToInt32();
    MaxGroupSize = new Index3D(
        MaxNumThreadsPerGroup,
        MaxNumThreadsPerGroup,
        MaxNumThreadsPerGroup);

    // Resolve max number of threads per multiprocessor
    MaxNumThreadsPerMultiprocessor = MaxNumThreadsPerGroup;
}
/// <summary cref="MemoryBuffer{T, TIndex}.CopyFromView( /// AcceleratorStream, ArrayView{T}, LongIndex1)"/> protected internal unsafe override void CopyFromView( AcceleratorStream stream, ArrayView <T> source, LongIndex1 targetOffset) { var binding = Accelerator.BindScoped(); var sourceAddress = new IntPtr(source.LoadEffectiveAddress()); var targetAddress = new IntPtr(ComputeEffectiveAddress(targetOffset)); var lengthInBytes = new IntPtr(source.LengthInBytes); switch (source.AcceleratorType) { case AcceleratorType.CPU: case AcceleratorType.Cuda: CudaException.ThrowIfFailed( CurrentAPI.MemcpyAsync( targetAddress, sourceAddress, lengthInBytes, stream)); break; default: throw new NotSupportedException( RuntimeErrorMessages.NotSupportedTargetAccelerator); } binding.Recover(); }
/// <summary>
/// Disposes this OpenCL buffer.
/// </summary>
protected override void DisposeAcceleratorObject(bool disposing)
{
    CLException.VerifyDisposed(
        disposing,
        CurrentAPI.ReleaseBuffer(NativePtr));
    NativePtr = IntPtr.Zero;
}
/// <summary cref="MemoryBuffer.MemSetToZero(AcceleratorStream)"/> public override void MemSetToZero(AcceleratorStream stream) => CLException.ThrowIfFailed( CurrentAPI.FillBuffer <byte>( ((CLStream)stream).CommandQueue, NativePtr, 0, IntPtr.Zero, new IntPtr(LengthInBytes)));
/// <inheritdoc/>
protected override ProfilingMarker AddProfilingMarkerInternal()
{
    var profilingMarker = new CudaProfilingMarker();
    CudaException.ThrowIfFailed(
        CurrentAPI.RecordEvent(profilingMarker.EventPtr, StreamPtr));
    return profilingMarker;
}
/// <summary>
/// Disposes this Cuda kernel.
/// </summary>
protected override void DisposeAcceleratorObject(bool disposing)
{
    CudaException.VerifyDisposed(
        disposing,
        CurrentAPI.DestroyModule(modulePtr));
    functionPtr = IntPtr.Zero;
    modulePtr = IntPtr.Zero;
}
/// <summary cref="DisposeBase.Dispose(bool)"/> protected override void Dispose(bool disposing) { if (NativePtr != IntPtr.Zero) { CurrentAPI.FreeHostMemory(NativePtr); NativePtr = IntPtr.Zero; } base.Dispose(disposing); }
/// <summary>
/// Constructs a new Cuda stream with the given <see cref="StreamFlags"/>.
/// </summary>
/// <param name="accelerator">The associated accelerator.</param>
/// <param name="flag">
/// Stream flag to use. Allows blocking and non-blocking streams.
/// </param>
internal CudaStream(Accelerator accelerator, StreamFlags flag)
    : base(accelerator)
{
    CudaException.ThrowIfFailed(
        CurrentAPI.CreateStream(
            out streamPtr,
            flag));
    responsibleForHandle = true;
}
public readonly CLError PreLaunchKernel(
    CLStream stream,
    CLKernel kernel,
    RuntimeKernelConfig config) =>
    CurrentAPI.SetKernelArgumentUnsafeWithKernel(
        kernel,
        0,
        config.SharedMemoryConfig.DynamicArraySize,
        null);
/// <summary cref="AcceleratorStream.Synchronize"/> public override void Synchronize() { var binding = Accelerator.BindScoped(); CudaException.ThrowIfFailed( CurrentAPI.SynchronizeStream(streamPtr)); binding.Recover(); }
/// <inheritdoc/>
protected override ProfilingMarker AddProfilingMarkerInternal()
{
    using var binding = Accelerator.BindScoped();
    var profilingMarker = new CudaProfilingMarker(Accelerator);
    CudaException.ThrowIfFailed(
        CurrentAPI.RecordEvent(profilingMarker.EventPtr, StreamPtr));
    return profilingMarker;
}
/// <inheritdoc/>
public unsafe override void Synchronize()
{
    using var binding = Accelerator.BindScoped();

    ReadOnlySpan<IntPtr> events = stackalloc[] { EventPtr };
    CLException.ThrowIfFailed(
        CurrentAPI.WaitForEvents(events));
}
/// <summary cref="DisposeBase.Dispose(bool)"/> protected override void Dispose(bool disposing) { base.Dispose(disposing); if (contextPtr != IntPtr.Zero) { CLException.ThrowIfFailed( CurrentAPI.ReleaseContext(contextPtr)); contextPtr = IntPtr.Zero; } }
/// <summary cref="DisposeBase.Dispose(bool)"/> protected override void Dispose(bool disposing) { if (NativePtr != IntPtr.Zero) { CudaException.ThrowIfFailed( CurrentAPI.FreeMemory(NativePtr)); NativePtr = IntPtr.Zero; } base.Dispose(disposing); }
/// <summary cref="DisposeBase.Dispose(bool)"/> protected override void Dispose(bool disposing) { if (NativePtr != IntPtr.Zero) { CLException.ThrowIfFailed( CurrentAPI.ReleaseBuffer(NativePtr)); NativePtr = IntPtr.Zero; } base.Dispose(disposing); }
/// <summary>
/// Frees the Cuda host memory.
/// </summary>
protected override void DisposeAcceleratorObject(bool disposing)
{
    if (NativePtr == IntPtr.Zero)
    {
        return;
    }

    CurrentAPI.FreeHostMemory(NativePtr);
    NativePtr = IntPtr.Zero;
}
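// The dispose snippets above share the same guarded-release pattern: release
// the native handle at most once, then clear it so repeated disposal is a
// no-op. A minimal standalone sketch of that pattern follows; NativeHandle
// and NativeApi are hypothetical names used only for illustration and are
// not part of the library.
using System;

sealed class NativeHandle : IDisposable
{
    private IntPtr nativePtr;

    public NativeHandle(IntPtr nativePtr) => this.nativePtr = nativePtr;

    public void Dispose()
    {
        // Guard against double disposal before touching the native API.
        if (nativePtr == IntPtr.Zero)
            return;
        NativeApi.Release(nativePtr);
        nativePtr = IntPtr.Zero;
    }
}

static class NativeApi
{
    // Placeholder for the actual driver call (e.g. FreeMemory or
    // ReleaseBuffer in the snippets above).
    public static void Release(IntPtr handle) { }
}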