/// <summary>
/// Queries DCGM for this group's information and returns its name together with
/// the entity identifiers of its members.
/// </summary>
/// <returns>
/// A <see cref="GroupInfo"/> whose <c>GroupName</c> and <c>DeviceIds</c> are copied out of the
/// native structure filled in by <c>dcgmGroupGetInfo</c>.
/// </returns>
/// <exception cref="InvalidOperationException">
/// The DCGM handle is zero, or <c>dcgmGroupGetInfo</c> returned a status other than <c>dcgm_return.Ok</c>.
/// </exception>
/// <exception cref="NullReferenceException">
/// The group identifier is zero.
/// </exception>
public GroupInfo GetInfo()
{
    // Comparand and replacement are both zero, so these calls only read the fields.
    // NOTE(review): presumably Interlocked.CompareExchange brought in via `using static` - confirm.
    if (IntPtr.Zero == CompareExchange(ref _dcgmHandle, IntPtr.Zero, IntPtr.Zero))
    {
        throw new InvalidOperationException($"Failed operation because {nameof(Metrics)} has not been initialized.");
    }

    if (IntPtr.Zero == CompareExchange(ref _groupId, IntPtr.Zero, IntPtr.Zero))
    {
        // NOTE(review): throwing NullReferenceException directly is discouraged, but the
        // exception type is kept because callers may already catch it.
        throw new NullReferenceException("Failed operation because group identity has not been initialized.");
    }

    // The version field is set before the struct is handed to the native call.
    var nativeInfo = new dcgm_group_info_v2 { version = Utils.dcgmGroupInfoVersion() };

    dcgm_return queryResult = libdcgm.dcgmGroupGetInfo(_dcgmHandle, _groupId, &nativeInfo);
    if (queryResult != dcgm_return.Ok)
    {
        throw new InvalidOperationException($"Error getting group information. {Utils.errorString(queryResult)}.");
    }

    // Walk the first `count` entries of the native entity list and collect their ids.
    dcgm_group_entity_pair* members = (dcgm_group_entity_pair*)nativeInfo.entityList;
    var memberIds = new List<int>();
    for (int index = 0; index < nativeInfo.count; ++index)
    {
        memberIds.Add(members[index].entity_id);
    }

    return new GroupInfo
    {
        GroupName = new string((sbyte*)nativeInfo.groupName),
        DeviceIds = memberIds,
    };
}
/// <summary>
/// Maps a DCGM status code to a human-readable description.
/// </summary>
/// <param name="result">
/// Status code returned by a DCGM call.
/// </param>
/// <returns>
/// The description for <paramref name="result"/>, or an empty string when the code is not
/// one of the values listed here.
/// </returns>
internal static string errorString(dcgm_return result)
{
    switch (result)
    {
        case dcgm_return.Ok:
            return "Success";
        case dcgm_return.BadParam:
            return "Bad parameter passed to function";
        case dcgm_return.GenericError:
            return "Generic unspecified error";
        case dcgm_return.Memory:
            return "Out of memory error";
        case dcgm_return.NotConfigured:
            return "Setting not configured";
        case dcgm_return.NotSupported:
            return "Feature not supported";
        case dcgm_return.InitError:
            return "DCGM initialization error";
        case dcgm_return.NvmlError:
            return "NVML error";
        case dcgm_return.Pending:
            return "Object is in a pending state";
        case dcgm_return.Uninitialized:
            return "Object is in an undefined state";
        case dcgm_return.Timeout:
            return "Timeout";
        case dcgm_return.VersionMismatch:
            return "API version mismatch";
        case dcgm_return.UnknownField:
            return "Unknown field identifier";
        case dcgm_return.NoData:
            return "No data is available";
        case dcgm_return.StaleData:
            return "Only stale data is available";
        case dcgm_return.NotWatched:
            return "Field is not being watched";
        case dcgm_return.NoPermission:
            return "No permission";
        case dcgm_return.GpuIsLost:
            return "GPU is lost";
        case dcgm_return.ResetRequired:
            return "GPU requires reset";
        case dcgm_return.ConnectionNotValid:
            return "Host engine connection invalid/disconnected";
        case dcgm_return.GpuNotSupported:
            return "This GPU is not supported by DCGM";
        case dcgm_return.GroupIncompatible:
            return "The GPUs of this group are incompatible with each other for the requested operation";
        case dcgm_return.MaxLimit:
            return "Max limit reached for the object";
        case dcgm_return.LibraryNotFound:
            return "DCGM library could not be found";
        case dcgm_return.DuplicateKey:
            return "Duplicate Key passed to function";
        case dcgm_return.GpuInSyncBoostGroup:
            return "GPU is a part of a Sync Boost Group";
        case dcgm_return.GpuNotInSyncBoostGroup:
            return "GPU is not a part of Sync Boost Group";
        case dcgm_return.RequiresRoot:
            // Kept on a single line: a regular string literal cannot span physical lines.
            return "Host engine is running as non-root";
        case dcgm_return.NvvsError:
            return "DCGM GPU Diagnostic returned an error";
        case dcgm_return.InsufficientSize:
            return "An input argument is not large enough";
        case dcgm_return.FieldUnsupportedByApi:
            return "The given field ID is not supported by the API being called";
        case dcgm_return.ModuleNotLoaded:
            return "This request is serviced by a module of DCGM that is not currently loaded";
        case dcgm_return.InUse:
            return "The requested operation could not be completed because the affected resource is in use";
        case dcgm_return.GroupIsEmpty:
            return "The specified group is empty, and this operation is incompatible with an empty group";
        case dcgm_return.ProfilingNotSupported:
            return "Profiling is not supported for this group of GPUs or GPU";
        case dcgm_return.ProfilingLibraryError:
            return "The third-party Profiling module returned an unrecoverable error";
        case dcgm_return.ProfilingMultiPass:
            return "The requested profiling metrics cannot be collected in a single pass";
        case dcgm_return.DiagAlreadyRunning:
            return "A diag instance is already running, cannot run a new diag until the current one finishes";
        case dcgm_return.DiagBadJson:
            return "The GPU Diagnostic returned Json that cannot be parsed.";
        case dcgm_return.DiagBadLaunch:
            return "Error while launching the GPU Diagnostic.";
        case dcgm_return.DiagVariance:
            return "The results of training DCGM GPU Diagnostic cannot be trusted because they vary too much from run to run";
        case dcgm_return.DiagThresholdExceeded:
            return "A field value met or exceeded the error threshold.";
        case dcgm_return.InsufficientDriverVersion:
            return "The installed driver version is insufficient for this API";
        default:
            // Wrong error codes should be handled by the caller
            return "";
    }
}
/// <summary>
/// Gpu metrics watcher.
/// <para/>
/// Initializes watcher and starts recording updates for metrics collection. If no device identifiers are specified, all supported devices are watched.
/// <para/>
/// </summary>
/// <param name="dcgmHandle">
/// DCGM Handle.
/// </param>
/// <param name="name">
/// Name of the watcher. It prefixes the names of the GPU group and field group created here.
/// </param>
/// <param name="updateFrequency">
/// How often to update metrics. Used both for the DCGM watch and as the interval of the polling timer.
/// </param>
/// <param name="deviceIds">
/// Device identifiers of GPUs to watch.
/// </param>
/// <exception cref="ArgumentNullException">
/// <paramref name="dcgmHandle"/> is zero, or <paramref name="name"/> or <paramref name="deviceIds"/> is null.
/// </exception>
/// <exception cref="InvalidOperationException">
/// Creating the GPU group, adding a device, creating the field group, or setting the watches
/// returned a status other than <c>dcgm_return.Ok</c>.
/// </exception>
internal Watcher(IntPtr dcgmHandle, string name, TimeSpan updateFrequency, params int[] deviceIds)
{
    if (dcgmHandle == IntPtr.Zero)
    {
        throw new ArgumentNullException(nameof(dcgmHandle));
    }

    if (name is null)
    {
        throw new ArgumentNullException(nameof(name));
    }

    if (deviceIds is null)
    {
        throw new ArgumentNullException(nameof(deviceIds));
    }

    _syncpoint = new object();
    _latestMetrics = new List<LatestGpuMetrics>();
    _dcgmHandle = dcgmHandle;

    // Generate new guid for this watcher so group names are unique per instance.
    var watcherId = Guid.NewGuid().ToString();

    // Create GPU group.
    // NOTE(review): if any later step throws, nothing in this constructor releases the
    // group created here - confirm whether cleanup is required.
    var groupName = name + "_gpu_group_" + watcherId;
    var groupId = IntPtr.Zero;
    fixed (char* groupNameChars = groupName)
    {
        // Encode the name and terminate it with a zero byte; one byte is reserved for the terminator.
        byte* groupNameBuffer = stackalloc byte[StringBufferSize];
        int groupNameLength = Utf8.GetBytes(groupNameChars, groupName.Length, groupNameBuffer, StringBufferSize - 1);
        groupNameBuffer[groupNameLength] = 0;

        // With no explicit devices the Default group type is used; otherwise start from an
        // Empty group and add each requested device.
        var groupType = deviceIds.Length == 0 ? dcgm_group_type.Default : dcgm_group_type.Empty;

        var result = libdcgm.dcgmGroupCreate(dcgmHandle, groupType, groupNameBuffer, &groupId);
        if (result != dcgm_return.Ok)
        {
            throw new InvalidOperationException($"Error creating group. {Utils.errorString(result)}.");
        }

        // No-op when deviceIds is empty.
        foreach (uint id in deviceIds)
        {
            result = libdcgm.dcgmGroupAddDevice(dcgmHandle, groupId, id);
            if (result != dcgm_return.Ok)
            {
                throw new InvalidOperationException($"Error adding gpu id {id} to group. {Utils.errorString(result)}.");
            }
        }
    }

    // Instantiate GPU group
    _gpuGroupId = groupId;
    _gpuGroup = new GpuGroup(_dcgmHandle, _gpuGroupId);

    // Create GPU metrics field group
    var gpuMetricsGroupName = name + "_field_group_" + watcherId;
    var fieldIds = DefaultFieldIds;
    var fieldGroupId = IntPtr.Zero;
    fixed (ushort* fieldIdsPtr = fieldIds)
    {
        fixed (char* fieldGroupNameChars = gpuMetricsGroupName)
        {
            byte* fieldGroupNameBuffer = stackalloc byte[StringBufferSize];
            int fieldGroupNameLength = Utf8.GetBytes(fieldGroupNameChars, gpuMetricsGroupName.Length, fieldGroupNameBuffer, StringBufferSize - 1);
            fieldGroupNameBuffer[fieldGroupNameLength] = 0;

            dcgm_return result = libdcgm.dcgmFieldGroupCreate(_dcgmHandle, fieldIds.Length, fieldIdsPtr, fieldGroupNameBuffer, &fieldGroupId);
            if (result != dcgm_return.Ok)
            {
                throw new InvalidOperationException($"Error creating field group. {Utils.errorString(result)}.");
            }
        }
    }

    // Instantiate GPU metrics field group
    _gpuMetricsGroupId = fieldGroupId;
    _gpuMetricsFieldGroup = new FieldGroup(_dcgmHandle, _gpuMetricsGroupId);

    // Scale milliseconds by 1000 BEFORE truncating to an integer so that sub-millisecond
    // frequencies are not rounded down to zero (a cast binds tighter than multiplication).
    // NOTE(review): the scaled value is presumably microseconds, per the DCGM
    // dcgmWatchFields documentation - confirm against the binding.
    var updateFrequencyScaled = (ulong)(updateFrequency.TotalMilliseconds * 1000);
    var status = libdcgm.dcgmWatchFields(_dcgmHandle, _gpuGroupId, _gpuMetricsGroupId, updateFrequencyScaled, Math.Ceiling(updateFrequency.TotalSeconds), 0);
    if (status != dcgm_return.Ok)
    {
        throw new InvalidOperationException($"Error setting watches. {Utils.errorString(status)}.");
    }

    // Poll on a repeating timer, and fetch once immediately.
    _pollTimer = new System.Timers.Timer(updateFrequency.TotalMilliseconds);
    _pollTimer.Elapsed += (object source, System.Timers.ElapsedEventArgs e) => GetLatestInternal();
    _pollTimer.AutoReset = true;
    _pollTimer.Enabled = true;

    GetLatestInternal();
}