/// <summary>
/// Creates the monitor objects for the supplied devices on a thread-pool task.
/// </summary>
/// <param name="devices">
/// Devices to build monitors for. Only <c>CPUDevice</c>, <c>AMDDevice</c> and
/// <c>CUDADevice</c> instances are used; everything else is ignored.
/// </param>
/// <param name="isDCHDriver">
/// AND-ed with <c>UseNvmlFallback.Enabled</c> and passed as the second argument
/// of <c>NvidiaMonitorManager.Init</c>.
/// </param>
/// <returns>
/// A task yielding the created monitors, in the order CPU, AMD, NVIDIA.
/// </returns>
public static Task<List<DeviceMonitor>> GetDeviceMonitors(IEnumerable<BaseDevice> devices, bool isDCHDriver)
{
    return Task.Run(() =>
    {
        // Snapshot the input exactly once: 'devices' may be a lazy query, and
        // enumerating it separately per device kind could re-run it or observe
        // different contents on each pass.
        var allDevices = devices.ToList();
        var cpus = allDevices.OfType<CPUDevice>().ToList();
        var amds = allDevices.OfType<AMDDevice>().ToList();
        var nvidias = allDevices.OfType<CUDADevice>().ToList();

        var monitors = new List<DeviceMonitor>();

        foreach (var cpu in cpus)
        {
            monitors.Add(new DeviceMonitorCPU(cpu.UUID));
        }

        if (amds.Count > 0)
        {
            // Keyed by PCIe bus id; ToDictionary throws if two devices share a bus id.
            var amdBusIdAndUuids = amds.ToDictionary(amd => amd.PCIeBusID, amd => amd.UUID);
            // The first tuple element is discarded (presumably a success flag - confirm
            // that amdInfos is never null when the query fails).
            var (_, amdInfos) = QueryAdl.TryQuery(amdBusIdAndUuids);
            foreach (var amdInfo in amdInfos)
            {
                monitors.Add(new DeviceMonitorAMD(amdInfo));
            }
        }

        if (nvidias.Count > 0)
        {
            // Keyed by UUID; ToDictionary throws if two devices share a UUID.
            var nvidiaUUIDAndBusIds = nvidias.ToDictionary(nvidia => nvidia.UUID, nvidia => nvidia.PCIeBusID);
            var nvidiaInfos = NvidiaMonitorManager.Init(nvidiaUUIDAndBusIds, isDCHDriver && UseNvmlFallback.Enabled);
            foreach (var nvidiaInfo in nvidiaInfos)
            {
                monitors.Add(new DeviceMonitorNVIDIA(nvidiaInfo));
            }
        }

        return monitors;
    });
}
/// <summary>
/// Creates the monitor objects for the supplied devices on a thread-pool task.
/// </summary>
/// <param name="devices">
/// Devices to build monitors for. Only <c>CPUDevice</c>, <c>AMDDevice</c> and
/// <c>CUDADevice</c> instances are used; everything else is ignored.
/// </param>
/// <returns>
/// A task yielding the created monitors, in the order CPU, AMD, NVIDIA.
/// AMD devices are skipped (and logged) when <c>nhm_amd_init</c> or
/// <c>nhm_amd_has_adapter</c> returns a non-zero code.
/// </returns>
public static Task<List<DeviceMonitor>> GetDeviceMonitors(IEnumerable<BaseDevice> devices)
{
    return Task.Run(() =>
    {
        // Snapshot the input exactly once: 'devices' may be a lazy query, and
        // enumerating it separately per device kind could re-run it or observe
        // different contents on each pass.
        var allDevices = devices.ToList();
        var cpus = allDevices.OfType<CPUDevice>().ToList();
        var amds = allDevices.OfType<AMDDevice>().ToList();
        var nvidias = allDevices.OfType<CUDADevice>().ToList();

        var monitors = new List<DeviceMonitor>();

        foreach (var cpu in cpus)
        {
            monitors.Add(new DeviceMonitorCPU(cpu.UUID));
        }

        if (amds.Count > 0)
        {
            // Configure logging before initializing the AMD library.
            AMD_ODN.nhm_amd_set_debug_log_level(_amdDebugLogLevel);
            AMD_ODN.nhm_amd_reg_log_cb(_amdLog);

            var amdInit = AMD_ODN.nhm_amd_init();
            if (amdInit == 0)
            {
                foreach (var amd in amds)
                {
                    // Only adapters for which the library returns 0 get a monitor.
                    var hasRet = AMD_ODN.nhm_amd_has_adapter(amd.PCIeBusID);
                    if (hasRet == 0)
                    {
                        monitors.Add(new DeviceMonitorAMD(amd.UUID, amd.PCIeBusID));
                    }
                    else
                    {
                        Logger.Info("DeviceMonitorManager", $"AMD nhm_amd_has_adapter {hasRet} for BusID {amd.PCIeBusID}");
                    }
                }
            }
            else
            {
                Logger.Info("DeviceMonitorManager", $"AMD nhm_amd_init {amdInit}");
            }
        }

        if (nvidias.Count > 0)
        {
            // 500ms per GPU or initial MAX of 5seconds
            var initialNvmlRestartTimeWait = Math.Min(500 * nvidias.Count, 5000);
            var firstMaxTimeoutAfterNvmlRestart = TimeSpan.FromMilliseconds(initialNvmlRestartTimeWait);

            // Keyed by UUID; ToDictionary throws if two devices share a UUID.
            var nvidiaUUIDAndBusIds = nvidias.ToDictionary(nvidia => nvidia.UUID, nvidia => nvidia.PCIeBusID);
            NvidiaMonitorManager.Init(nvidiaUUIDAndBusIds);

            foreach (var nvidia in nvidias)
            {
                monitors.Add(new DeviceMonitorNVIDIA(nvidia.UUID, nvidia.PCIeBusID, firstMaxTimeoutAfterNvmlRestart));
            }
        }

        return monitors;
    });
}
/// <summary>
/// Shuts NVML down, initializes it again and hands every returned device info
/// to the NVIDIA monitor with the matching UUID so it can reset its handles.
/// The whole sequence runs while holding <c>DeviceMonitorNVIDIA._lock</c>.
/// </summary>
private static void RestartNVIDIAMonitoring()
{
    lock (DeviceMonitorNVIDIA._lock)
    {
        NvidiaMonitorManager.ShutdownNvml();

        var useFallback = _isDCHDriver && UseNvmlFallback.Enabled;
        var refreshedInfos = NvidiaMonitorManager.Init(_nvidiaUUIDAndBusIds, useFallback);

        foreach (var info in refreshedInfos)
        {
            // Infos without a known monitor are ignored.
            var monitor = _deviceMonitorNVIDIAs.FirstOrDefault(candidate => candidate.UUID == info.UUID);
            if (monitor == null)
            {
                continue;
            }

            monitor.ResetHandles(info);
        }
    }
}
// NVML is thread-safe according to the documentation
/// <summary>
/// Runs an NVML call behind the init/restart guards and the device watchdog.
/// </summary>
/// <typeparam name="T">Result type of the NVML call.</typeparam>
/// <param name="failReturn">Value returned when the call is skipped or throws.</param>
/// <param name="tag">Caller label included in the log messages.</param>
/// <param name="nvmlExecFun">The NVML call to execute.</param>
/// <returns>
/// The result of <paramref name="nvmlExecFun"/> when it completes without throwing;
/// otherwise <paramref name="failReturn"/>.
/// </returns>
private T ExecNvmlProcedure<T>(T failReturn, string tag, Func<T> nvmlExecFun)
{
    // Guard: NVML never initialized successfully.
    if (!NvidiaMonitorManager.InitalNVMLInitSuccess)
    {
        Logger.ErrorDelayed(LogTag, $"{tag} InitalNVMLInitSuccess==FALSE", TimeSpan.FromMinutes(5));
        return failReturn;
    }

    // Guard: a restart is in progress, skip this call.
    if (NvidiaMonitorManager.IsNVMLRestarting)
    {
        Logger.ErrorDelayed(LogTag, $"Skipping {tag} NVML IsRestarting", TimeSpan.FromSeconds(5));
        return failReturn;
    }

    try
    {
        var result = nvmlExecFun();
        // Reaching this point without an exception counts as a success for the watchdog.
        _deviceMonitorWatchdog.Reset();
        return result;
    }
    catch (Exception ex)
    {
        Logger.ErrorDelayed(LogTag, ex.ToString(), TimeSpan.FromSeconds(30));

        // Recovery is only considered for NVML errors whose return code is not on the skip list.
        if (!(ex is NvmlException nvmlError) || SkipNvmlErrorRecovery(nvmlError.ReturnCode))
        {
            return failReturn;
        }

        if (_deviceMonitorWatchdog.IsAttemptErrorRecoveryPermanentlyDisabled())
        {
            Logger.ErrorDelayed(LogTag, $"{tag} Will NOT RESTART NVML. Recovery for this device is permanently disabled.", TimeSpan.FromSeconds(30));
            return failReturn;
        }

        _deviceMonitorWatchdog.SetErrorTime();
        if (_deviceMonitorWatchdog.ShouldAttemptErrorRecovery())
        {
            _deviceMonitorWatchdog.UpdateTickError();
            Logger.Info(LogTag, $"{tag} Will call NVML restart");
            NvidiaMonitorManager.AttemptRestartNVML();
        }

        return failReturn;
    }
}
/// <summary>
/// Creates the monitor objects for the supplied devices on a thread-pool task.
/// </summary>
/// <param name="devices">
/// Devices to build monitors for. Only <c>CPUDevice</c>, <c>AMDDevice</c> and
/// <c>CUDADevice</c> instances are used; everything else is ignored.
/// </param>
/// <param name="isDCHDriver">
/// AND-ed with <c>UseNvmlFallback.Enabled</c> and passed as the second argument
/// of <c>NvidiaMonitorManager.Init</c>.
/// </param>
/// <returns>
/// A task yielding the created monitors, in the order CPU, AMD, NVIDIA.
/// </returns>
public static Task<List<DeviceMonitor>> GetDeviceMonitors(IEnumerable<BaseDevice> devices, bool isDCHDriver)
{
    return Task.Run(() =>
    {
        // Snapshot the input exactly once: 'devices' may be a lazy query, and
        // enumerating it separately per device kind could re-run it or observe
        // different contents on each pass.
        var allDevices = devices.ToList();
        var cpus = allDevices.OfType<CPUDevice>().ToList();
        var amds = allDevices.OfType<AMDDevice>().ToList();
        var nvidias = allDevices.OfType<CUDADevice>().ToList();

        var monitors = new List<DeviceMonitor>();

        foreach (var cpu in cpus)
        {
            monitors.Add(new DeviceMonitorCPU(cpu.UUID));
        }

        if (amds.Count > 0)
        {
            // Keyed by PCIe bus id; ToDictionary throws if two devices share a bus id.
            var amdBusIdAndUuids = amds.ToDictionary(amd => amd.PCIeBusID, amd => amd.UUID);
            // The first tuple element is discarded (presumably a success flag - confirm
            // that amdInfos is never null when the query fails).
            var (_, amdInfos) = QueryAdl.TryQuery(amdBusIdAndUuids);
            foreach (var amd in amds)
            {
                // Every AMD device gets a monitor, carrying only the infos reported for its bus id
                // (possibly none).
                var currentAmdInfos = amdInfos.Where(info => info.BusID == amd.PCIeBusID);
                monitors.Add(new DeviceMonitorAMD(amd.UUID, amd.PCIeBusID, currentAmdInfos.ToArray()));
            }
        }

        if (nvidias.Count > 0)
        {
            // 500ms per GPU or initial MAX of 5seconds
            var initialNvmlRestartTimeWait = Math.Min(500 * nvidias.Count, 5000);
            var firstMaxTimeoutAfterNvmlRestart = TimeSpan.FromMilliseconds(initialNvmlRestartTimeWait);

            // Keyed by UUID; ToDictionary throws if two devices share a UUID.
            var nvidiaUUIDAndBusIds = nvidias.ToDictionary(nvidia => nvidia.UUID, nvidia => nvidia.PCIeBusID);
            NvidiaMonitorManager.Init(nvidiaUUIDAndBusIds, isDCHDriver && UseNvmlFallback.Enabled);

            foreach (var nvidia in nvidias)
            {
                monitors.Add(new DeviceMonitorNVIDIA(nvidia.UUID, nvidia.PCIeBusID, firstMaxTimeoutAfterNvmlRestart));
            }
        }

        return monitors;
    });
}