Пример #1
0
 /// <summary>
 /// Checks whether both NVIDIA monitoring backends (NVAPI and NVML) report alive and
 /// restarts them when either one does not, backing off the check period on repeated failures.
 /// </summary>
 /// <remarks>
 /// Every failed check increments <c>FailCounter</c> and calls <c>RestartDrivers</c>.
 /// On the 20th consecutive failure the timer period is changed to 60000, on the 30th to 3600000.
 /// A successful check resets <c>FailCounter</c> and, if the period had been changed, restores it to 10000.
 /// The signature matches a timer callback; presumably this is invoked by <c>DriverAliveCheckTimer</c>
 /// and the periods are milliseconds — confirm against the timer's construction.
 /// </remarks>
 /// <param name="objectInfo">Callback state object; not read by this method.</param>
 private static void CheckDriverLife(object objectInfo)
 {
     const int normalPeriod = 10000;
     const int slowPeriod = 60000;
     const int verySlowPeriod = 3600000;

     // NOTE(review): the outcome of the lock attempt is not inspected here; confirm whether
     // TryLock exposes an "acquired" flag that should gate the rest of this method.
     using var tryLock = new TryLock(_lock);

     var driversAlive = NVIDIA_MON.nhm_nvidia_is_nvapi_alive() && NVIDIA_MON.nhm_nvidia_is_nvml_alive();
     if (driversAlive)
     {
         FailCounter = 0;
         if (CurrentTimeout != normalPeriod)
         {
             DriverAliveCheckTimer.Change(0, normalPeriod);
             // Record the restored period. Without this, every later successful check would
             // re-arm the timer with a due time of 0 and the callback would fire back-to-back.
             CurrentTimeout = normalPeriod;
         }
         return;
     }

     FailCounter++;
     RestartDrivers();
     if (FailCounter == 20)
     {
         // First back-off step after 20 consecutive failures.
         DriverAliveCheckTimer.Change(0, slowPeriod);
         CurrentTimeout = slowPeriod;
     }
     else if (FailCounter == 30)
     {
         // Second back-off step after 30 consecutive failures.
         DriverAliveCheckTimer.Change(0, verySlowPeriod);
         CurrentTimeout = verySlowPeriod;
     }
 }
Пример #2
0
 /// <summary>
 /// Restarts the NVIDIA monitoring library by de-initializing it and then initializing it again.
 /// </summary>
 private static void RestartDrivers()
 {
     // Tear down first, then bring the library back up.
     NVIDIA_MON.nhm_nvidia_deinit();

     // The value returned by init is discarded; callers get no success/failure signal from here.
     _ = NVIDIA_MON.nhm_nvidia_init();
 }
        /// <summary>
        /// Builds the list of device monitors for the given devices on a background task.
        /// </summary>
        /// <remarks>
        /// Monitors are appended in a fixed order: CPUs, then AMD GPUs, then NVIDIA GPUs.
        /// The AMD and NVIDIA vendor libraries are only initialized when at least one device of
        /// that vendor is present. If a vendor's init call returns non-zero, the code is logged
        /// and no monitors are added for that vendor.
        /// </remarks>
        /// <param name="devices">Devices to create monitors for.</param>
        /// <returns>A task producing the created monitors.</returns>
        public static Task<List<DeviceMonitor>> GetDeviceMonitors(IEnumerable<BaseDevice> devices)
        {
            return Task.Run(() =>
            {
                var ret = new List<DeviceMonitor>();

                // CPU monitors need no vendor library; one monitor per CPU device.
                void addCPUs()
                {
                    var cpus = devices.GetDeviceTypes<CPUDevice>();
                    foreach (var cpu in cpus)
                    {
                        ret.Add(new DeviceMonitorCPU(cpu.UUID));
                    }
                }

                // AMD monitors: configure logging, init the library, then add only adapters it reports.
                void addAMDs()
                {
                    var amds = devices.GetDeviceTypes<AMDDevice>();
                    if (!amds.Any())
                    {
                        return;
                    }

                    AMD_ODN.nhm_amd_set_debug_log_level(_amdDebugLogLevel);
                    AMD_ODN.nhm_amd_reg_log_cb(_amdLog);
                    var amdInit = AMD_ODN.nhm_amd_init();
                    if (0 != amdInit)
                    {
                        Logger.Info("DeviceMonitorManager", $"AMD nhm_amd_init {amdInit}");
                        return;
                    }
                    foreach (var amd in amds)
                    {
                        // A zero result is treated as "adapter present"; anything else is logged and skipped.
                        var hasRet = AMD_ODN.nhm_amd_has_adapter(amd.PCIeBusID);
                        if (0 == hasRet)
                        {
                            ret.Add(new DeviceMonitorAMD(amd.UUID, amd.PCIeBusID));
                        }
                        else
                        {
                            Logger.Info("DeviceMonitorManager", $"AMD nhm_amd_has_adapter {hasRet} for BusID {amd.PCIeBusID}");
                        }
                    }
                }

                // NVIDIA monitors: configure logging, init the library, then add one monitor per device.
                void addNVIDIAs()
                {
                    var nvidias = devices.GetDeviceTypes<CUDADevice>();
                    if (!nvidias.Any())
                    {
                        return;
                    }

                    NVIDIA_MON.nhm_nvidia_set_debug_log_level(_nvidiaDebugLogLevel);
                    NVIDIA_MON.nhm_nvidia_reg_log_cb(_nvidiaLog);
                    var nvidiaInit = NVIDIA_MON.nhm_nvidia_init();
                    // NOTE(review): the log callback is registered a second time after init;
                    // kept as-is in case init resets the callback — confirm whether it is needed.
                    NVIDIA_MON.nhm_nvidia_reg_log_cb(_nvidiaLog);
                    // NOTE(review): Init() runs even when nhm_nvidia_init failed; kept as-is — confirm intent.
                    DeviceMonitorNVIDIA.Init();

                    if (nvidiaInit != 0)
                    {
                        // Label this as NVIDIA so the log line is not mistaken for an AMD failure.
                        Logger.Info("DeviceMonitorManager", $"NVIDIA nhm_nvidia_init {nvidiaInit}");
                        return;
                    }

                    foreach (var nvidia in nvidias)
                    {
                        ret.Add(new DeviceMonitorNVIDIA(nvidia.UUID, nvidia.PCIeBusID));
                    }
                }

                addCPUs();
                addAMDs();
                addNVIDIAs();
                return ret;
            });
        }