Example #1
0
        /// <summary>
        /// Queries DCGM for this group's information and returns its name and entity identifiers.
        /// </summary>
        /// <returns>
        /// A <see cref="GroupInfo"/> populated with the group name and the entity identifiers reported by <c>dcgmGroupGetInfo</c>.
        /// </returns>
        /// <exception cref="InvalidOperationException">
        /// The DCGM handle is zero, or <c>dcgmGroupGetInfo</c> returned a status other than <c>Ok</c>.
        /// </exception>
        /// <exception cref="NullReferenceException">
        /// The group identifier is zero.
        /// </exception>
        public GroupInfo GetInfo()
        {
            // Exchanging zero for zero never modifies the field; the call is used purely to read its current value.
            if (IntPtr.Zero == CompareExchange(ref _dcgmHandle, IntPtr.Zero, IntPtr.Zero))
            {
                throw new InvalidOperationException($"Failed operation because {nameof(Metrics)} has not been initialized.");
            }

            if (IntPtr.Zero == CompareExchange(ref _groupId, IntPtr.Zero, IntPtr.Zero))
            {
                throw new NullReferenceException("Failed operation because group identity has not been initialized.");
            }

            // The version field is filled in before the struct is handed to the native call.
            dcgm_group_info_v2 nativeInfo = new dcgm_group_info_v2 { version = Utils.dcgmGroupInfoVersion() };

            dcgm_return queryResult = libdcgm.dcgmGroupGetInfo(_dcgmHandle, _groupId, &nativeInfo);
            if (queryResult != dcgm_return.Ok)
            {
                throw new InvalidOperationException($"Error getting group information. {Utils.errorString(queryResult)}.");
            }

            // Walk the first `count` entries of the entity list and collect their identifiers.
            dcgm_group_entity_pair *pairs = (dcgm_group_entity_pair *)nativeInfo.entityList;
            var collectedIds = new List <int>();
            int index        = 0;

            while (index < nativeInfo.count)
            {
                collectedIds.Add(pairs[index].entity_id);
                index++;
            }

            return new GroupInfo
            {
                // groupName is read as a null-terminated sbyte string.
                GroupName = new string((sbyte *)nativeInfo.groupName),
                DeviceIds = collectedIds,
            };
        }
Example #2
0
        /// <summary>
        /// Translates a DCGM return code into a human-readable description.
        /// </summary>
        /// <param name="result">
        /// Return code to describe.
        /// </param>
        /// <returns>
        /// The description associated with <paramref name="result"/>, or an empty string when the code has no entry in this table.
        /// </returns>
        internal static string errorString(dcgm_return result)
        {
            string description;

            switch (result)
            {
            case dcgm_return.Ok:                        description = "Success"; break;
            case dcgm_return.BadParam:                  description = "Bad parameter passed to function"; break;
            case dcgm_return.GenericError:              description = "Generic unspecified error"; break;
            case dcgm_return.Memory:                    description = "Out of memory error"; break;
            case dcgm_return.NotConfigured:             description = "Setting not configured"; break;
            case dcgm_return.NotSupported:              description = "Feature not supported"; break;
            case dcgm_return.InitError:                 description = "DCGM initialization error"; break;
            case dcgm_return.NvmlError:                 description = "NVML error"; break;
            case dcgm_return.Pending:                   description = "Object is in a pending state"; break;
            case dcgm_return.Uninitialized:             description = "Object is in an undefined state"; break;
            case dcgm_return.Timeout:                   description = "Timeout"; break;
            case dcgm_return.VersionMismatch:           description = "API version mismatch"; break;
            case dcgm_return.UnknownField:              description = "Unknown field identifier"; break;
            case dcgm_return.NoData:                    description = "No data is available"; break;
            case dcgm_return.StaleData:                 description = "Only stale data is available"; break;
            case dcgm_return.NotWatched:                description = "Field is not being watched"; break;
            case dcgm_return.NoPermission:              description = "No permission"; break;
            case dcgm_return.GpuIsLost:                 description = "GPU is lost"; break;
            case dcgm_return.ResetRequired:             description = "GPU requires reset"; break;
            case dcgm_return.ConnectionNotValid:        description = "Host engine connection invalid/disconnected"; break;
            case dcgm_return.GpuNotSupported:           description = "This GPU is not supported by DCGM"; break;
            case dcgm_return.GroupIncompatible:         description = "The GPUs of this group are incompatible with each other for the requested operation"; break;
            case dcgm_return.MaxLimit:                  description = "Max limit reached for the object"; break;
            case dcgm_return.LibraryNotFound:           description = "DCGM library could not be found"; break;
            case dcgm_return.DuplicateKey:              description = "Duplicate Key passed to function"; break;
            case dcgm_return.GpuInSyncBoostGroup:       description = "GPU is a part of a Sync Boost Group"; break;
            case dcgm_return.GpuNotInSyncBoostGroup:    description = "GPU is not a part of Sync Boost Group"; break;
            case dcgm_return.RequiresRoot:              description = "Host engine is running as non-root"; break;
            case dcgm_return.NvvsError:                 description = "DCGM GPU Diagnostic returned an error"; break;
            case dcgm_return.InsufficientSize:          description = "An input argument is not large enough"; break;
            case dcgm_return.FieldUnsupportedByApi:     description = "The given field ID is not supported by the API being called"; break;
            case dcgm_return.ModuleNotLoaded:           description = "This request is serviced by a module of DCGM that is not currently loaded"; break;
            case dcgm_return.InUse:                     description = "The requested operation could not be completed because the affected resource is in use"; break;
            case dcgm_return.GroupIsEmpty:              description = "The specified group is empty, and this operation is incompatible with an empty group"; break;
            case dcgm_return.ProfilingNotSupported:     description = "Profiling is not supported for this group of GPUs or GPU"; break;
            case dcgm_return.ProfilingLibraryError:     description = "The third-party Profiling module returned an unrecoverable error"; break;
            case dcgm_return.ProfilingMultiPass:        description = "The requested profiling metrics cannot be collected in a single pass"; break;
            case dcgm_return.DiagAlreadyRunning:        description = "A diag instance is already running, cannot run a new diag until the current one finishes"; break;
            case dcgm_return.DiagBadJson:               description = "The GPU Diagnostic returned Json that cannot be parsed."; break;
            case dcgm_return.DiagBadLaunch:             description = "Error while launching the GPU Diagnostic."; break;
            case dcgm_return.DiagVariance:              description = "The results of training DCGM GPU Diagnostic cannot be trusted because they vary too much from run to run"; break;
            case dcgm_return.DiagThresholdExceeded:     description = "A field value met or exceeded the error threshold."; break;
            case dcgm_return.InsufficientDriverVersion: description = "The installed driver version is insufficient for this API"; break;

            default:
                // Codes without an entry produce an empty description; dealing with them is left to the caller.
                description = "";
                break;
            }

            return description;
        }
Example #3
0
        /// <summary>
        /// Gpu metrics watcher.
        /// <para/>
        /// Creates a GPU group and a metrics field group, registers a watch on them and starts a timer that
        /// polls the latest metrics. If no device identifiers are specified, a group of the default type is
        /// created (all supported devices); otherwise an empty group is created and each device is added to it.
        /// <para/>
        /// </summary>
        /// <param name="dcgmHandle">
        /// DCGM Handle. Must not be <see cref="IntPtr.Zero"/>.
        /// </param>
        /// <param name="name">
        /// Name of the watcher. Used as the prefix of the generated group and field group names.
        /// </param>
        /// <param name="updateFrequency">
        /// How often to update metrics. Must be greater than zero and no longer than <see cref="int.MaxValue"/> milliseconds.
        /// </param>
        /// <param name="deviceIds">
        /// Device identifiers of GPUs to watch.
        /// </param>
        /// <exception cref="ArgumentNullException">
        /// <paramref name="dcgmHandle"/> is zero, or <paramref name="name"/> or <paramref name="deviceIds"/> is null.
        /// </exception>
        /// <exception cref="ArgumentOutOfRangeException">
        /// <paramref name="updateFrequency"/> is outside the range accepted by the polling timer.
        /// </exception>
        /// <exception cref="InvalidOperationException">
        /// A DCGM call (group creation, device addition, field group creation or watch registration) did not return <c>Ok</c>.
        /// </exception>
        internal Watcher(IntPtr dcgmHandle, string name, TimeSpan updateFrequency, params int[] deviceIds)
        {
            if (dcgmHandle == IntPtr.Zero)
            {
                throw new ArgumentNullException(nameof(dcgmHandle));
            }
            if (name is null)
            {
                throw new ArgumentNullException(nameof(name));
            }
            if (deviceIds is null)
            {
                throw new ArgumentNullException(nameof(deviceIds));
            }

            // Reject intervals the polling timer would refuse *before* any native resource is created;
            // otherwise the failure would only surface after the groups and the watch already exist.
            if (updateFrequency <= TimeSpan.Zero || Math.Ceiling(updateFrequency.TotalMilliseconds) > int.MaxValue)
            {
                throw new ArgumentOutOfRangeException(nameof(updateFrequency),
                                                      updateFrequency,
                                                      "Update frequency must be greater than zero and at most Int32.MaxValue milliseconds.");
            }

            _syncpoint     = new object();
            _latestMetrics = new List <LatestGpuMetrics>();
            _dcgmHandle    = dcgmHandle;

            // Generate new guid for this watcher
            var watcherId = Guid.NewGuid().ToString();

            // Create GPU Group
            var groupName = name + "_gpu_group_" + watcherId;
            var groupId   = IntPtr.Zero;

            // With no explicit devices a default-type group is requested; otherwise start empty and add each device.
            var groupType = deviceIds.Length == 0 ? dcgm_group_type.Default : dcgm_group_type.Empty;

            fixed(char *c = groupName)
            {
                // One byte of the buffer is reserved for the terminating zero written below.
                var string_buffer = stackalloc byte[StringBufferSize];
                var len           = Utf8.GetBytes(c, groupName.Length, string_buffer, StringBufferSize - 1);

                string_buffer[len] = 0;

                var result = libdcgm.dcgmGroupCreate(dcgmHandle, groupType, string_buffer, &groupId);
                if (result != dcgm_return.Ok)
                {
                    throw new InvalidOperationException($"Error creating group. {Utils.errorString(result)}.");
                }

                // No iterations when deviceIds is empty.
                foreach (uint id in deviceIds)
                {
                    result = libdcgm.dcgmGroupAddDevice(dcgmHandle, groupId, id);
                    if (result != dcgm_return.Ok)
                    {
                        throw new InvalidOperationException($"Error adding gpu id {id} to group. {Utils.errorString(result)}.");
                    }
                }
            }

            // Instantiate GPU group
            _gpuGroupId = groupId;
            _gpuGroup   = new GpuGroup(_dcgmHandle, _gpuGroupId);

            // Create GPU metrics field group
            var gpuMetricsGroupName = name + "_field_group_" + watcherId;
            var fieldIds            = DefaultFieldIds;
            var fieldGroupId        = IntPtr.Zero;

            fixed(ushort *f = fieldIds)
            {
                fixed(char *c = gpuMetricsGroupName)
                {
                    // One byte of the buffer is reserved for the terminating zero written below.
                    byte *string_buffer = stackalloc byte[StringBufferSize];
                    int   len           = Utf8.GetBytes(c, gpuMetricsGroupName.Length, string_buffer, StringBufferSize - 1);

                    string_buffer[len] = 0;

                    dcgm_return result = libdcgm.dcgmFieldGroupCreate(_dcgmHandle, fieldIds.Length, f, string_buffer, &fieldGroupId);

                    if (result != dcgm_return.Ok)
                    {
                        throw new InvalidOperationException($"Error creating field group. {Utils.errorString(result)}.");
                    }
                }
            }

            // Instantiate GPU metrics field group
            _gpuMetricsGroupId    = fieldGroupId;
            _gpuMetricsFieldGroup = new FieldGroup(_dcgmHandle, _gpuMetricsGroupId);

            // Convert to whole microseconds from ticks so the sub-millisecond part of the interval is kept
            // (casting TotalMilliseconds to an integer before multiplying would truncate it).
            var updateFrequencyMicroseconds = (ulong)(updateFrequency.Ticks / (TimeSpan.TicksPerMillisecond / 1000));

            var status = libdcgm.dcgmWatchFields(_dcgmHandle,
                                                 _gpuGroupId,
                                                 _gpuMetricsGroupId,
                                                 updateFrequencyMicroseconds,
                                                 Math.Ceiling(updateFrequency.TotalSeconds),
                                                 0);

            if (status != dcgm_return.Ok)
            {
                throw new InvalidOperationException($"Error setting watches. {Utils.errorString(status)}.");
            }

            _pollTimer           = new System.Timers.Timer(updateFrequency.TotalMilliseconds);
            _pollTimer.Elapsed  += (object source, System.Timers.ElapsedEventArgs e) => GetLatestInternal();
            _pollTimer.AutoReset = true;
            _pollTimer.Enabled   = true;

            try
            {
                GetLatestInternal();
            }
            catch
            {
                // The timer is already running with AutoReset enabled; without this it would keep invoking
                // GetLatestInternal for a watcher whose construction failed and that the caller never received.
                _pollTimer.Dispose();
                throw;
            }
        }