/// <summary>
/// Timer callback that checks whether both NVIDIA backends (NVAPI and NVML) report alive
/// and restarts them through <see cref="RestartDrivers"/> when either one does not.
/// </summary>
/// <remarks>
/// The check period backs off as consecutive failures accumulate: 60000 ms on the 20th
/// failure and 3600000 ms on the 30th. The first healthy check clears the failure counter
/// and restores the 10000 ms period.
/// </remarks>
/// <param name="objectInfo">State object supplied by the timer; not used.</param>
private static void CheckDriverLife(object objectInfo)
{
    // NOTE(review): the TryLock instance is never inspected, so whether a failed acquisition
    // should skip this check cannot be told from here - confirm against TryLock.
    using var tryLock = new TryLock(_lock);

    // Same short-circuit as "!nvapi || !nvml": NVML is only queried when NVAPI is alive.
    var driversAlive = NVIDIA_MON.nhm_nvidia_is_nvapi_alive() && NVIDIA_MON.nhm_nvidia_is_nvml_alive();
    if (driversAlive)
    {
        FailCounter = 0;
        if (CurrentTimeout != 10000)
        {
            // Record the restored period before re-arming the timer. Without this write
            // CurrentTimeout keeps its backed-off value, so every later healthy check would
            // re-arm the timer again (presumably a System.Threading.Timer, where a due time
            // of 0 fires the callback immediately - TODO confirm).
            CurrentTimeout = 10000;
            DriverAliveCheckTimer.Change(0, 10000);
        }

        return;
    }

    FailCounter++;
    RestartDrivers();

    // Back off only on the exact thresholds so the timer is re-armed once per step.
    if (FailCounter == 20)
    {
        DriverAliveCheckTimer.Change(0, 60000);
        CurrentTimeout = 60000;
    }
    else if (FailCounter == 30)
    {
        DriverAliveCheckTimer.Change(0, 3600000);
        CurrentTimeout = 3600000;
    }
}
/// <summary>
/// Restarts the NVIDIA monitoring backend by calling <c>nhm_nvidia_deinit</c> followed by
/// <c>nhm_nvidia_init</c>.
/// </summary>
private static void RestartDrivers()
{
    NVIDIA_MON.nhm_nvidia_deinit();

    // The status returned by init is not inspected here.
    _ = NVIDIA_MON.nhm_nvidia_init();
}
/// <summary>
/// Builds the list of device monitors for the given devices inside a <see cref="Task.Run(System.Func{TResult})"/> task.
/// </summary>
/// <remarks>
/// Monitors are added in the order CPU, AMD, NVIDIA:
/// <list type="bullet">
/// <item><description>Every CPU device gets a monitor.</description></item>
/// <item><description>AMD devices get a monitor only when <c>nhm_amd_init</c> returns 0 and
/// <c>nhm_amd_has_adapter</c> returns 0 for the device's PCIe bus ID; other results are logged.</description></item>
/// <item><description>NVIDIA devices get a monitor only when <c>nhm_nvidia_init</c> returns 0;
/// a non-zero result is logged and no NVIDIA monitor is added.</description></item>
/// </list>
/// </remarks>
/// <param name="devices">Devices to create monitors for.</param>
/// <returns>A task whose result is the list of created monitors.</returns>
public static Task<List<DeviceMonitor>> GetDeviceMonitors(IEnumerable<BaseDevice> devices)
{
    return Task.Run(() =>
    {
        const string logTag = "DeviceMonitorManager";
        var monitors = new List<DeviceMonitor>();

        void AddCpuMonitors()
        {
            foreach (var cpu in devices.GetDeviceTypes<CPUDevice>())
            {
                monitors.Add(new DeviceMonitorCPU(cpu.UUID));
            }
        }

        void AddAmdMonitors()
        {
            var amds = devices.GetDeviceTypes<AMDDevice>();
            if (!amds.Any())
            {
                return;
            }

            // Logging is configured before init so that init itself can report through the callback.
            AMD_ODN.nhm_amd_set_debug_log_level(_amdDebugLogLevel);
            AMD_ODN.nhm_amd_reg_log_cb(_amdLog);
            var amdInit = AMD_ODN.nhm_amd_init();
            if (amdInit != 0)
            {
                Logger.Info(logTag, $"AMD nhm_amd_init {amdInit}");
                return;
            }

            foreach (var amd in amds)
            {
                // 0 means the adapter with this bus ID is known to the AMD library.
                var hasAdapter = AMD_ODN.nhm_amd_has_adapter(amd.PCIeBusID);
                if (hasAdapter == 0)
                {
                    monitors.Add(new DeviceMonitorAMD(amd.UUID, amd.PCIeBusID));
                }
                else
                {
                    Logger.Info(logTag, $"AMD nhm_amd_has_adapter {hasAdapter} for BusID {amd.PCIeBusID}");
                }
            }
        }

        void AddNvidiaMonitors()
        {
            var nvidias = devices.GetDeviceTypes<CUDADevice>();
            if (!nvidias.Any())
            {
                return;
            }

            NVIDIA_MON.nhm_nvidia_set_debug_log_level(_nvidiaDebugLogLevel);
            NVIDIA_MON.nhm_nvidia_reg_log_cb(_nvidiaLog);
            var nvidiaInit = NVIDIA_MON.nhm_nvidia_init();
            // NOTE(review): the callback is registered a second time after init, as in the
            // original code - presumably init may reset it; confirm against the native library.
            NVIDIA_MON.nhm_nvidia_reg_log_cb(_nvidiaLog);
            // Runs regardless of the init result, matching the original statement order.
            DeviceMonitorNVIDIA.Init();
            if (nvidiaInit != 0)
            {
                // Previously logged with an "AMD" prefix, which mislabelled NVIDIA init failures.
                Logger.Info(logTag, $"NVIDIA nhm_nvidia_init {nvidiaInit}");
                return;
            }

            foreach (var nvidia in nvidias)
            {
                monitors.Add(new DeviceMonitorNVIDIA(nvidia.UUID, nvidia.PCIeBusID));
            }
        }

        AddCpuMonitors();
        AddAmdMonitors();
        AddNvidiaMonitors();
        return monitors;
    });
}