Exemplo n.º 1
0
        /// <summary>
        /// Builds the device monitors on a thread-pool task: one <c>DeviceMonitorCPU</c> per CPU device,
        /// one <c>DeviceMonitorAMD</c> per entry returned by <c>QueryAdl.TryQuery</c>, and one
        /// <c>DeviceMonitorNVIDIA</c> per entry returned by <c>NvidiaMonitorManager.Init</c>.
        /// </summary>
        /// <param name="devices">Devices to create monitors for; enumerated exactly once, inside the task.</param>
        /// <param name="isDCHDriver">AND-ed with <c>UseNvmlFallback.Enabled</c> and passed to <c>NvidiaMonitorManager.Init</c>.</param>
        /// <returns>A task yielding the monitors in CPU, AMD, NVIDIA order.</returns>
        public static Task<List<DeviceMonitor>> GetDeviceMonitors(IEnumerable<BaseDevice> devices, bool isDCHDriver)
        {
            return Task.Run(() =>
            {
                var ret = new List<DeviceMonitor>();

                // Snapshot the input once: the original filtered `devices` three times, which
                // re-evaluates a lazy sequence on every pass.
                var allDevices = devices.ToList();
                var amds = allDevices.OfType<AMDDevice>().ToList();
                var nvidias = allDevices.OfType<CUDADevice>().ToList();

                foreach (var cpu in allDevices.OfType<CPUDevice>())
                {
                    ret.Add(new DeviceMonitorCPU(cpu.UUID));
                }

                if (amds.Count > 0)
                {
                    // ToDictionary throws on duplicate keys; assumes PCIe bus IDs are unique per AMD device.
                    var amdBusIdAndUuids = amds.ToDictionary(amd => amd.PCIeBusID, amd => amd.UUID);
                    // NOTE(review): the first tuple element (presumably a success flag) is discarded — confirm
                    // that amdInfos is never null when the query fails.
                    var (_, amdInfos) = QueryAdl.TryQuery(amdBusIdAndUuids);
                    foreach (var amdInfo in amdInfos)
                    {
                        ret.Add(new DeviceMonitorAMD(amdInfo));
                    }
                }

                if (nvidias.Count > 0)
                {
                    var nvidiaUUIDAndBusIds = nvidias.ToDictionary(nvidia => nvidia.UUID, nvidia => nvidia.PCIeBusID);
                    var nvidiaInfos = NvidiaMonitorManager.Init(nvidiaUUIDAndBusIds, isDCHDriver && UseNvmlFallback.Enabled);
                    foreach (var nvidiaInfo in nvidiaInfos)
                    {
                        ret.Add(new DeviceMonitorNVIDIA(nvidiaInfo));
                    }
                }

                return ret;
            });
        }
        /// <summary>
        /// Builds the device monitors on a thread-pool task: one <c>DeviceMonitorCPU</c> per CPU device,
        /// one <c>DeviceMonitorAMD</c> per AMD device for which <c>AMD_ODN.nhm_amd_has_adapter</c> returns 0,
        /// and one <c>DeviceMonitorNVIDIA</c> per CUDA device.
        /// </summary>
        /// <param name="devices">Devices to create monitors for; enumerated exactly once, inside the task.</param>
        /// <returns>A task yielding the monitors in CPU, AMD, NVIDIA order.</returns>
        public static Task<List<DeviceMonitor>> GetDeviceMonitors(IEnumerable<BaseDevice> devices)
        {
            return Task.Run(() =>
            {
                var ret = new List<DeviceMonitor>();

                // Snapshot the input once: the original filtered `devices` three times, which
                // re-evaluates a lazy sequence on every pass.
                var allDevices = devices.ToList();
                var amds = allDevices.OfType<AMDDevice>().ToList();
                var nvidias = allDevices.OfType<CUDADevice>().ToList();

                foreach (var cpu in allDevices.OfType<CPUDevice>())
                {
                    ret.Add(new DeviceMonitorCPU(cpu.UUID));
                }

                if (amds.Count > 0)
                {
                    // Logging is configured before nhm_amd_init is called, same order as before.
                    AMD_ODN.nhm_amd_set_debug_log_level(_amdDebugLogLevel);
                    AMD_ODN.nhm_amd_reg_log_cb(_amdLog);
                    var amdInit = AMD_ODN.nhm_amd_init();
                    if (amdInit != 0)
                    {
                        // Non-zero init result: no AMD monitors are created, the code is only logged.
                        Logger.Info("DeviceMonitorManager", $"AMD nhm_amd_init {amdInit}");
                    }
                    else
                    {
                        foreach (var amd in amds)
                        {
                            var hasRet = AMD_ODN.nhm_amd_has_adapter(amd.PCIeBusID);
                            if (hasRet != 0)
                            {
                                // Non-zero result: this bus ID gets no monitor, the code is only logged.
                                Logger.Info("DeviceMonitorManager", $"AMD nhm_amd_has_adapter {hasRet} for BusID {amd.PCIeBusID}");
                                continue;
                            }
                            ret.Add(new DeviceMonitorAMD(amd.UUID, amd.PCIeBusID));
                        }
                    }
                }

                if (nvidias.Count > 0)
                {
                    // 500ms per GPU, capped at 5 seconds.
                    var initialNvmlRestartTimeWait = Math.Min(500 * nvidias.Count, 5000);
                    var firstMaxTimeoutAfterNvmlRestart = TimeSpan.FromMilliseconds(initialNvmlRestartTimeWait);
                    var nvidiaUUIDAndBusIds = nvidias.ToDictionary(nvidia => nvidia.UUID, nvidia => nvidia.PCIeBusID);
                    NvidiaMonitorManager.Init(nvidiaUUIDAndBusIds);
                    foreach (var nvidia in nvidias)
                    {
                        ret.Add(new DeviceMonitorNVIDIA(nvidia.UUID, nvidia.PCIeBusID, firstMaxTimeoutAfterNvmlRestart));
                    }
                }

                return ret;
            });
        }
Exemplo n.º 3
0
 /// <summary>
 /// While holding <c>DeviceMonitorNVIDIA._lock</c>: shuts NVML down, calls
 /// <c>NvidiaMonitorManager.Init</c> again, and passes each returned info to the
 /// existing monitor with the same UUID via <c>ResetHandles</c>.
 /// Infos without a matching monitor are ignored.
 /// </summary>
 private static void RestartNVIDIAMonitoring()
 {
     lock (DeviceMonitorNVIDIA._lock)
     {
         NvidiaMonitorManager.ShutdownNvml();

         var useFallback = _isDCHDriver && UseNvmlFallback.Enabled;
         var refreshedInfos = NvidiaMonitorManager.Init(_nvidiaUUIDAndBusIds, useFallback);

         foreach (var info in refreshedInfos)
         {
             // First monitor whose UUID matches, or null when none is registered for it.
             var monitor = _deviceMonitorNVIDIAs.FirstOrDefault(candidate => candidate.UUID == info.UUID);
             if (monitor != null)
             {
                 monitor.ResetHandles(info);
             }
         }
     }
 }
 // NVML is thread-safe according to the documentation
 /// <summary>
 /// Runs <paramref name="nvmlExecFun"/> and returns its result, or <paramref name="failReturn"/>
 /// when the call is skipped or throws.
 /// </summary>
 /// <param name="failReturn">Value returned on every non-success path.</param>
 /// <param name="tag">Label included in the log messages for this call.</param>
 /// <param name="nvmlExecFun">The NVML call to execute.</param>
 /// <returns>
 /// The delegate's result on success; <paramref name="failReturn"/> when
 /// <c>NvidiaMonitorManager.InitalNVMLInitSuccess</c> is false,
 /// <c>NvidiaMonitorManager.IsNVMLRestarting</c> is true, or an exception was caught.
 /// </returns>
 private T ExecNvmlProcedure<T>(T failReturn, string tag, Func<T> nvmlExecFun)
 {
     if (!NvidiaMonitorManager.InitalNVMLInitSuccess)
     {
         Logger.ErrorDelayed(LogTag, $"{tag} InitalNVMLInitSuccess==FALSE", TimeSpan.FromMinutes(5));
         return failReturn;
     }

     if (NvidiaMonitorManager.IsNVMLRestarting)
     {
         Logger.ErrorDelayed(LogTag, $"Skipping {tag} NVML IsRestarting", TimeSpan.FromSeconds(5));
         return failReturn;
     }

     try
     {
         var result = nvmlExecFun();
         // Reaching this line means the delegate did not throw, which counts as success.
         _deviceMonitorWatchdog.Reset();
         return result;
     }
     catch (Exception e)
     {
         Logger.ErrorDelayed(LogTag, e.ToString(), TimeSpan.FromSeconds(30));

         // Recovery is considered only for NvmlException whose return code is not on the skip list.
         var isRecoverable = e is NvmlException nvmlError && !SkipNvmlErrorRecovery(nvmlError.ReturnCode);
         if (!isRecoverable)
         {
             return failReturn;
         }

         if (_deviceMonitorWatchdog.IsAttemptErrorRecoveryPermanentlyDisabled())
         {
             Logger.ErrorDelayed(LogTag, $"{tag} Will NOT RESTART NVML. Recovery for this device is permanently disabled.", TimeSpan.FromSeconds(30));
             return failReturn;
         }

         // The error time is recorded before the watchdog is asked whether to recover.
         _deviceMonitorWatchdog.SetErrorTime();
         if (!_deviceMonitorWatchdog.ShouldAttemptErrorRecovery())
         {
             return failReturn;
         }

         _deviceMonitorWatchdog.UpdateTickError();
         Logger.Info(LogTag, $"{tag} Will call NVML restart");
         NvidiaMonitorManager.AttemptRestartNVML();
         return failReturn;
     }
 }
Exemplo n.º 5
0
        /// <summary>
        /// Builds the device monitors on a thread-pool task: one <c>DeviceMonitorCPU</c> per CPU device,
        /// one <c>DeviceMonitorAMD</c> per AMD device (given the <c>QueryAdl.TryQuery</c> entries whose
        /// <c>BusID</c> equals that device's <c>PCIeBusID</c>), and one <c>DeviceMonitorNVIDIA</c> per CUDA device.
        /// </summary>
        /// <param name="devices">Devices to create monitors for; enumerated exactly once, inside the task.</param>
        /// <param name="isDCHDriver">AND-ed with <c>UseNvmlFallback.Enabled</c> and passed to <c>NvidiaMonitorManager.Init</c>.</param>
        /// <returns>A task yielding the monitors in CPU, AMD, NVIDIA order.</returns>
        public static Task<List<DeviceMonitor>> GetDeviceMonitors(IEnumerable<BaseDevice> devices, bool isDCHDriver)
        {
            return Task.Run(() =>
            {
                var ret = new List<DeviceMonitor>();

                // Snapshot the input once: the original filtered `devices` three times, which
                // re-evaluates a lazy sequence on every pass.
                var allDevices = devices.ToList();
                var amds = allDevices.OfType<AMDDevice>().ToList();
                var nvidias = allDevices.OfType<CUDADevice>().ToList();

                foreach (var cpu in allDevices.OfType<CPUDevice>())
                {
                    ret.Add(new DeviceMonitorCPU(cpu.UUID));
                }

                if (amds.Count > 0)
                {
                    // ToDictionary throws on duplicate keys; assumes PCIe bus IDs are unique per AMD device.
                    var amdBusIdAndUuids = amds.ToDictionary(amd => amd.PCIeBusID, amd => amd.UUID);
                    // NOTE(review): the first tuple element (presumably a success flag) is discarded — confirm
                    // that amdInfos is never null when the query fails.
                    var (_, amdInfos) = QueryAdl.TryQuery(amdBusIdAndUuids);
                    foreach (var amd in amds)
                    {
                        // A monitor is created for every AMD device, even when no query entry matches its bus ID.
                        var currentAmdInfos = amdInfos.Where(info => info.BusID == amd.PCIeBusID).ToArray();
                        ret.Add(new DeviceMonitorAMD(amd.UUID, amd.PCIeBusID, currentAmdInfos));
                    }
                }

                if (nvidias.Count > 0)
                {
                    // 500ms per GPU, capped at 5 seconds.
                    var initialNvmlRestartTimeWait = Math.Min(500 * nvidias.Count, 5000);
                    var firstMaxTimeoutAfterNvmlRestart = TimeSpan.FromMilliseconds(initialNvmlRestartTimeWait);
                    var nvidiaUUIDAndBusIds = nvidias.ToDictionary(nvidia => nvidia.UUID, nvidia => nvidia.PCIeBusID);
                    NvidiaMonitorManager.Init(nvidiaUUIDAndBusIds, isDCHDriver && UseNvmlFallback.Enabled);
                    foreach (var nvidia in nvidias)
                    {
                        ret.Add(new DeviceMonitorNVIDIA(nvidia.UUID, nvidia.PCIeBusID, firstMaxTimeoutAfterNvmlRestart));
                    }
                }

                return ret;
            });
        }