/// <summary>
/// The loop that watches the watchdog. Each iteration waits for an event on either server (exit, reboot, inactive startup), a newer .dmb, or changed launch parameters, hands every completed event to <c>HandlerMonitorWakeup</c>, and performs a full relaunch with back-off when the resulting action is <see cref="MonitorAction.Restart"/>
/// </summary>
/// <param name="cancellationToken">The <see cref="CancellationToken"/> for the operation</param>
/// <returns>A <see cref="Task"/> representing the running operation</returns>
async Task MonitorLifetimes(CancellationToken cancellationToken)
{
	logger.LogTrace("Entered MonitorLifetimes");
	var iteration = 1;
	for (var monitorState = new MonitorState(); monitorState.NextAction != MonitorAction.Exit; ++iteration)
	{
		// every iteration starts out as Continue, the handlers may change it
		monitorState.NextAction = MonitorAction.Continue;
		logger.LogDebug("Iteration {0} of monitor loop", iteration);
		try
		{
			// dump the current state to the logs
			if (AlphaIsActive)
			{
				logger.LogDebug("Alpha is the active server");
			}
			else
			{
				logger.LogDebug("Bravo is the active server");
			}

			if (monitorState.InactiveServerHasStagedDmb)
			{
				logger.LogDebug("Inactive server has staged .dmb");
			}

			if (monitorState.RebootingInactiveServer)
			{
				logger.LogDebug("Inactive server is rebooting");
			}

			monitorState.ActiveServer = AlphaIsActive ? alphaServer : bravoServer;
			monitorState.InactiveServer = AlphaIsActive ? bravoServer : alphaServer;

			if (monitorState.ActiveServer.ClosePortOnReboot)
			{
				logger.LogDebug("Active server will close port on reboot");
			}

			if (monitorState.InactiveServer.ClosePortOnReboot)
			{
				logger.LogDebug("Inactive server will close port on reboot");
			}

			// snapshot the activation tasks; each local is nulled below once its event has been handled
			var activeServerLifetime = monitorState.ActiveServer.Lifetime;
			var inactiveServerLifetime = monitorState.InactiveServer.Lifetime;
			var activeServerReboot = monitorState.ActiveServer.OnReboot;
			var inactiveServerReboot = monitorState.InactiveServer.OnReboot;
			var inactiveServerStartup = monitorState.RebootingInactiveServer ? monitorState.InactiveServer.LaunchResult : null;
			var activeLaunchParametersChanged = activeParametersUpdated.Task;
			var newDmbAvailable = dmbFactory.OnNewerDmb;

			// completes (as cancelled) when cancellation is requested so the wait below wakes up
			var cancelTcs = new TaskCompletionSource<object>();
			using (cancellationToken.Register(() => cancelTcs.SetCanceled()))
			{
				var toWaitOn = Task.WhenAny(activeServerLifetime, inactiveServerLifetime, activeServerReboot, inactiveServerReboot, newDmbAvailable, cancelTcs.Task, activeLaunchParametersChanged);
				if (monitorState.RebootingInactiveServer)
				{
					// the startup task only exists while the inactive server is rebooting
					toWaitOn = Task.WhenAny(toWaitOn, inactiveServerStartup);
				}

				await toWaitOn.ConfigureAwait(false);
				cancellationToken.ThrowIfCancellationRequested();
			}

			var chatTask = Task.CompletedTask;
			using (await SemaphoreSlimContext.Lock(semaphore, cancellationToken).ConfigureAwait(false))
			{
				MonitorActivationReason activationReason = default;

				//multiple things may have happened, handle them one at a time
				for (var moreActivationsToProcess = true; moreActivationsToProcess && monitorState.NextAction == MonitorAction.Continue;)
				{
					if (activeServerLifetime?.IsCompleted == true)
					{
						activationReason = MonitorActivationReason.ActiveServerCrashed;
						activeServerLifetime = null;
					}
					else if (inactiveServerLifetime?.IsCompleted == true)
					{
						activationReason = MonitorActivationReason.InactiveServerCrashed;
						inactiveServerLifetime = null;
					}
					else if (activeServerReboot?.IsCompleted == true)
					{
						activationReason = MonitorActivationReason.ActiveServerRebooted;
						activeServerReboot = null;
					}
					else if (inactiveServerReboot?.IsCompleted == true)
					{
						activationReason = MonitorActivationReason.InactiveServerRebooted;
						inactiveServerReboot = null;
					}
					else if (inactiveServerStartup?.IsCompleted == true)
					{
						activationReason = MonitorActivationReason.InactiveServerStartupComplete;
						inactiveServerStartup = null;
					}
					else if (newDmbAvailable?.IsCompleted == true)
					{
						activationReason = MonitorActivationReason.NewDmbAvailable;
						newDmbAvailable = null;
					}
					else if (activeLaunchParametersChanged?.IsCompleted == true)
					{
						activationReason = MonitorActivationReason.ActiveLaunchParametersUpdated;
						activeLaunchParametersChanged = null;
					}
					else
					{
						moreActivationsToProcess = false;
					}

					if (moreActivationsToProcess)
					{
						await HandlerMonitorWakeup(activationReason, monitorState, cancellationToken).ConfigureAwait(false);
					}
				}

				//writeback alphaServer and bravoServer, the handler may have swapped or replaced them
				alphaServer = AlphaIsActive ? monitorState.ActiveServer : monitorState.InactiveServer;
				bravoServer = !AlphaIsActive ? monitorState.ActiveServer : monitorState.InactiveServer;
			}

			//full reboot required
			if (monitorState.NextAction == MonitorAction.Restart)
			{
				logger.LogDebug("Next state action is to restart");
				DisposeAndNullControllers();
				chatTask = chat.SendWatchdogMessage("Restarting entirely due to complications...", cancellationToken);
				for (var retryAttempts = 1; monitorState.NextAction == MonitorAction.Restart; ++retryAttempts)
				{
					WatchdogLaunchResult result;
					using (await SemaphoreSlimContext.Lock(semaphore, cancellationToken).ConfigureAwait(false))
					{
						result = await LaunchNoLock(false, false, false, cancellationToken).ConfigureAwait(false);
						if (Running)
						{
							logger.LogDebug("Relaunch successful, resetting monitor state...");
							monitorState = new MonitorState(); //clean the slate
						}
					}

					await chatTask.ConfigureAwait(false);
					if (!Running)
					{
						logger.LogWarning("Failed to automatically restart the watchdog! Alpha: {0}; Bravo: {1}", result.Alpha.ToString(), result.Bravo.ToString());
						var retryDelay = Math.Min(Math.Pow(2, retryAttempts), 3600); //in seconds, max of one hour
						chatTask = chat.SendWatchdogMessage(String.Format(CultureInfo.InvariantCulture, "Failed to restart watchdog (Attempt: {0}), retrying in {1} seconds...", retryAttempts, retryDelay), cancellationToken);

						// retryDelay is in seconds; Task.Delay(int) would interpret it as milliseconds
						await Task.WhenAll(Task.Delay(TimeSpan.FromSeconds(retryDelay), cancellationToken), chatTask).ConfigureAwait(false);
					}
				}
			}
		}
		catch (OperationCanceledException)
		{
			logger.LogDebug("Monitor cancelled");
			break;
		}
		catch (Exception e)
		{
			// include the exception itself, otherwise the "full details in logs" promised to chat do not exist
			logger.LogError("Monitor crashed! Iteration: {0}, State: {1}, Exception: {2}", iteration, JsonConvert.SerializeObject(monitorState), e);
			await chat.SendWatchdogMessage(String.Format(CultureInfo.InvariantCulture, "Monitor crashed, this should NEVER happen! Please report this, full details in logs! Restarting monitor... Error: {0}", e.Message), cancellationToken).ConfigureAwait(false);
		}
	}
}
/// <inheritdoc />
protected override async Task<MonitorAction> HandleMonitorWakeup(MonitorActivationReason reason, CancellationToken cancellationToken)
{
	// Every branch below either returns the next action or throws.
	switch (reason)
	{
		case MonitorActivationReason.ActiveServerCrashed:
		{
			var terminationVerb = Server.TerminationWasRequested ? "exited" : "crashed";
			var shutdownPending = Server.RebootState == Session.RebootState.Shutdown;

			// A pending graceful shutdown is honoured now, otherwise the server gets relaunched.
			var announcement = String.Format(
				CultureInfo.InvariantCulture,
				shutdownPending
					? "Server {0}! Shutting down due to graceful termination request..."
					: "Server {0}! Rebooting...",
				terminationVerb);
			await Chat.QueueWatchdogMessage(announcement, cancellationToken).ConfigureAwait(false);
			return shutdownPending ? MonitorAction.Exit : MonitorAction.Restart;
		}

		case MonitorActivationReason.ActiveServerRebooted:
		{
			var observedState = Server.RebootState;
			if (gracefulRebootRequired && observedState == Session.RebootState.Normal)
			{
				// The flag says a restart was pending, so treat the reboot as one.
				Logger.LogError("Watchdog reached normal reboot state with gracefulRebootRequired set!");
				observedState = Session.RebootState.Restart;
			}

			gracefulRebootRequired = false;
			Server.ResetRebootState();

			if (observedState == Session.RebootState.Normal)
			{
				return await HandleNormalReboot(cancellationToken).ConfigureAwait(false);
			}

			if (observedState == Session.RebootState.Restart)
			{
				return MonitorAction.Restart;
			}

			if (observedState != Session.RebootState.Shutdown)
			{
				throw new InvalidOperationException($"Invalid reboot state: {observedState}");
			}

			// Graceful shutdown was requested and the reboot is the moment to act on it.
			await Chat.QueueWatchdogMessage(
				"Active server rebooted! Shutting down due to graceful termination request...",
				cancellationToken)
				.ConfigureAwait(false);
			return MonitorAction.Exit;
		}

		case MonitorActivationReason.ActiveLaunchParametersUpdated:
			// Ask the server to restart at its next reboot and remember that we did.
			await Server.SetRebootState(Session.RebootState.Restart, cancellationToken).ConfigureAwait(false);
			gracefulRebootRequired = true;
			return MonitorAction.Continue;

		case MonitorActivationReason.NewDmbAvailable:
			await HandleNewDmbAvailable(cancellationToken).ConfigureAwait(false);
			return MonitorAction.Continue;

		case MonitorActivationReason.Heartbeat:
		default:
			throw new InvalidOperationException($"Invalid activation reason: {reason}");
	}
}
/// <summary>
/// Handles the actions to take when the monitor has to "wake up"
/// </summary>
/// <param name="activationReason">The <see cref="MonitorActivationReason"/> that caused the invocation</param>
/// <param name="monitorState">The current <see cref="MonitorState"/>. Will be modified upon return</param>
/// <param name="cancellationToken">The <see cref="CancellationToken"/> for the operation</param>
/// <returns>A <see cref="Task"/> representing the running operation</returns>
async Task HandlerMonitorWakeup(MonitorActivationReason activationReason, MonitorState monitorState, CancellationToken cancellationToken)
{
	logger.LogDebug("Monitor activation. Reason: {0}", activationReason);

	//returns true if the inactive server can't be used immediately
	//in that case the monitor is also set to restart
	bool FullRestartDeadInactive()
	{
		if (monitorState.RebootingInactiveServer || monitorState.InactiveServerCritFail)
		{
			logger.LogInformation("Inactive server is {0}! Restarting monitor...", monitorState.InactiveServerCritFail ? "critically failed" : "still rebooting");
			monitorState.NextAction = MonitorAction.Restart; //will dispose server
			return true;
		}

		return false;
	}

	//tries to set inactive server's port to the public port
	//doesn't handle closing active server's port
	//on success swaps the active and inactive servers and returns true, on failure sets the monitor to restart and returns false
	async Task<bool> MakeInactiveActive()
	{
		logger.LogDebug("Setting inactive server to port {0}...", ActiveLaunchParameters.PrimaryPort.Value);
		var result = await monitorState.InactiveServer.SetPort(ActiveLaunchParameters.PrimaryPort.Value, cancellationToken).ConfigureAwait(false);
		if (!result)
		{
			logger.LogWarning("Failed to activate inactive server! Restarting monitor...");
			monitorState.NextAction = MonitorAction.Restart; //will dispose server
			return false;
		}

		//inactive server should always be using active launch parameters
		LastLaunchParameters = ActiveLaunchParameters;

		var tmp = monitorState.ActiveServer;
		monitorState.ActiveServer = monitorState.InactiveServer;
		monitorState.InactiveServer = tmp;
		AlphaIsActive = !AlphaIsActive;
		return true;
	}

	// Tries to load inactive server with latest dmb, falling back to current dmb on failure. Requires a lock on <see cref="semaphore"/>
	// Returns false only when the fallback dmb was launched successfully, true otherwise (including when both launches failed)
	async Task<bool> RestartInactiveServer()
	{
		logger.LogInformation("Rebooting inactive server...");
		var newDmb = dmbFactory.LockNextDmb(1);
		bool usedMostRecentDmb;
		try
		{
			monitorState.InactiveServer = await sessionControllerFactory.LaunchNew(ActiveLaunchParameters, newDmb, null, false, !monitorState.ActiveServer.IsPrimary, false, cancellationToken).ConfigureAwait(false);
			usedMostRecentDmb = true;
		}
		catch (OperationCanceledException)
		{
			throw;
		}
		catch (Exception e)
		{
			logger.LogError("Error occurred while recreating server! Attempting backup strategy of running DMB of running server! Exception: {0}", e.ToString());

			//ahh jeez, what do we do here?
			//this is our fault, so it should never happen but
			//idk maybe a database error while handling the newest dmb?
			//either way try to start it using the active server's dmb as a backup
			try
			{
				var dmbBackup = await dmbFactory.FromCompileJob(monitorState.ActiveServer.Dmb.CompileJob, cancellationToken).ConfigureAwait(false);

				if (dmbBackup == null)
				{
					//just give up, if THAT compile job is failing then the ActiveServer is gonna crash soon too or already has
					throw new JobException("Creating backup DMB provider failed!");
				}

				monitorState.InactiveServer = await sessionControllerFactory.LaunchNew(ActiveLaunchParameters, dmbBackup, null, false, !monitorState.ActiveServer.IsPrimary, false, cancellationToken).ConfigureAwait(false);
				usedMostRecentDmb = false;

				//the template has a {0} placeholder, fill it with the reason the first launch failed instead of sending it raw
				await chat.SendWatchdogMessage(String.Format(CultureInfo.InvariantCulture, "Staging newest DMB on inactive server failed: {0} Falling back to previous dmb...", e.Message), cancellationToken).ConfigureAwait(false);
			}
			catch (OperationCanceledException)
			{
				throw;
			}
			catch (Exception e2)
			{
				//both the primary and the backup launch failed
				logger.LogError("Backup strategy failed! Monitor will restart when active server reboots! Exception: {0}", e2.ToString());
				monitorState.InactiveServerCritFail = true;
				await chat.SendWatchdogMessage("Attempted reboot of inactive server failed. Watchdog will reset when active server fails or exits", cancellationToken).ConfigureAwait(false);
				return true; //we didn't use the old dmb
			}
		}

		logger.LogInformation("Successfully relaunched inactive server!");
		monitorState.RebootingInactiveServer = true;
		return usedMostRecentDmb;
	}

	//disposes the inactive server and relaunches it, then sets the next action to Break or Continue depending on breakAfter
	async Task UpdateAndRestartInactiveServer(bool breakAfter)
	{
		//replace the notification tcs here so that the next loop will read a fresh one
		activeParametersUpdated = new TaskCompletionSource<object>();
		monitorState.InactiveServer.Dispose(); //kill or recycle it
		monitorState.NextAction = breakAfter ? MonitorAction.Break : MonitorAction.Continue;

		var usedLatestDmb = await RestartInactiveServer().ConfigureAwait(false);

		//only touch these flags if the next action was left as we set it
		if (monitorState.NextAction == (breakAfter ? MonitorAction.Break : MonitorAction.Continue))
		{
			monitorState.ActiveServer.ClosePortOnReboot = false;
			if (monitorState.InactiveServerHasStagedDmb && !usedLatestDmb)
			{
				monitorState.InactiveServerHasStagedDmb = false; //don't try to load it again though
			}
		}
	}

	string ExitWord(ISessionController controller) => controller.TerminationWasRequested ? "exited" : "crashed";

	//reason handling
	switch (activationReason)
	{
		case MonitorActivationReason.ActiveServerCrashed:
			if (monitorState.ActiveServer.RebootState == Components.Watchdog.RebootState.Shutdown)
			{
				await chat.SendWatchdogMessage(String.Format(CultureInfo.InvariantCulture, "Active server {0}! Exiting due to graceful termination request...", ExitWord(monitorState.ActiveServer)), cancellationToken).ConfigureAwait(false);
				monitorState.NextAction = MonitorAction.Exit;
				break;
			}

			if (FullRestartDeadInactive())
			{
				//the monitor has already been set to restart
				await chat.SendWatchdogMessage(String.Format(CultureInfo.InvariantCulture, "Active server {0}! Inactive server unable to online!", ExitWord(monitorState.ActiveServer)), cancellationToken).ConfigureAwait(false);
				break;
			}

			await chat.SendWatchdogMessage(String.Format(CultureInfo.InvariantCulture, "Active server {0}! Onlining inactive server...", ExitWord(monitorState.ActiveServer)), cancellationToken).ConfigureAwait(false);
			if (!await MakeInactiveActive().ConfigureAwait(false))
			{
				break;
			}

			await UpdateAndRestartInactiveServer(true).ConfigureAwait(false);
			break;
		case MonitorActivationReason.InactiveServerCrashed:
			await chat.SendWatchdogMessage(String.Format(CultureInfo.InvariantCulture, "Inactive server {0}! Rebooting...", ExitWord(monitorState.InactiveServer)), cancellationToken).ConfigureAwait(false);
			await UpdateAndRestartInactiveServer(false).ConfigureAwait(false);
			break;
		case MonitorActivationReason.ActiveServerRebooted:
			if (FullRestartDeadInactive())
			{
				break;
			}

			//what matters here is the RebootState
			bool restartOnceSwapped = false;
			var rebootState = monitorState.ActiveServer.RebootState;
			monitorState.ActiveServer.ResetRebootState(); //the DMAPI has already done this internally
			switch (rebootState)
			{
				case Components.Watchdog.RebootState.Normal:
					break;
				case Components.Watchdog.RebootState.Restart:
					restartOnceSwapped = true;
					break;
				case Components.Watchdog.RebootState.Shutdown:
					await chat.SendWatchdogMessage("Active server rebooted! Exiting due to graceful termination request...", cancellationToken).ConfigureAwait(false);
					DisposeAndNullControllers();
					monitorState.NextAction = MonitorAction.Exit;
					return;
			}

			var sameCompileJob = monitorState.InactiveServer.Dmb.CompileJob.Id == monitorState.ActiveServer.Dmb.CompileJob.Id;
			if (sameCompileJob && monitorState.InactiveServerHasStagedDmb)
			{
				//both servers up to date
				monitorState.InactiveServerHasStagedDmb = false;
			}

			if (!sameCompileJob || ActiveLaunchParameters != LastLaunchParameters)
			{
				//need a new launch in ActiveServer
				restartOnceSwapped = true;
			}

			if (restartOnceSwapped && !monitorState.ActiveServer.ClosePortOnReboot)
			{
				//we need to manually restart active server
				//it won't listen to us right now so just kill it
				monitorState.ActiveServer.Dispose();
			}

			//nothing to swap if the active server keeps its port and needs no restart; a failed swap has already set the monitor to restart
			if ((!restartOnceSwapped && !monitorState.ActiveServer.ClosePortOnReboot) || !await MakeInactiveActive().ConfigureAwait(false))
			{
				break;
			}

			//servers are swapped from here on
			monitorState.ActiveServer.ClosePortOnReboot = true;

			if (!restartOnceSwapped)
			{
				monitorState.InactiveServer.ClosePortOnReboot = false;

				//try to reopen inactive server on the private port so it's not pinging all the time
				//failing that, just reboot it
				restartOnceSwapped = !await monitorState.InactiveServer.SetPort(ActiveLaunchParameters.SecondaryPort.Value, cancellationToken).ConfigureAwait(false);
			}

			if (restartOnceSwapped) //for one reason or another
			{
				await UpdateAndRestartInactiveServer(true).ConfigureAwait(false); //break because worse case, active server is still booting
			}
			else
			{
				monitorState.InactiveServer.ClosePortOnReboot = false;
				monitorState.NextAction = MonitorAction.Break;
			}

			break;
		case MonitorActivationReason.InactiveServerRebooted:
			monitorState.RebootingInactiveServer = true;
			monitorState.InactiveServer.ResetRebootState();
			monitorState.ActiveServer.ClosePortOnReboot = false;
			monitorState.NextAction = MonitorAction.Continue;
			break;
		case MonitorActivationReason.InactiveServerStartupComplete:
			//easiest case, the inactive server is usable again
			monitorState.RebootingInactiveServer = false;
			monitorState.ActiveServer.ClosePortOnReboot = true;
			monitorState.NextAction = MonitorAction.Continue;
			break;
		case MonitorActivationReason.NewDmbAvailable:
			monitorState.InactiveServerHasStagedDmb = true;
			await UpdateAndRestartInactiveServer(true).ConfigureAwait(false); //same relaunch as the next case, but breaks afterwards
			break;
		case MonitorActivationReason.ActiveLaunchParametersUpdated:
			await UpdateAndRestartInactiveServer(false).ConfigureAwait(false);
			break;
	}
}
/// <summary>
/// Reacts to a single monitor activation for the server and reports what the monitor loop should do next
/// </summary>
/// <param name="reason">The <see cref="MonitorActivationReason"/> that caused the invocation</param>
/// <param name="cancellationToken">The <see cref="CancellationToken"/> for the operation</param>
/// <returns>A <see cref="Task{TResult}"/> resulting in the <see cref="MonitorAction"/> to take</returns>
async Task<MonitorAction> HandleMonitorWakeup(MonitorActivationReason reason, CancellationToken cancellationToken)
{
	switch (reason)
	{
		case MonitorActivationReason.ActiveServerCrashed:
		{
			var terminationVerb = Server.TerminationWasRequested ? "exited" : "crashed";
			if (Server.RebootState != Watchdog.RebootState.Shutdown)
			{
				// No shutdown was pending, so bring the server back.
				await Chat.SendWatchdogMessage(String.Format(CultureInfo.InvariantCulture, "Server {0}! Rebooting...", terminationVerb), cancellationToken).ConfigureAwait(false);
				return MonitorAction.Restart;
			}

			// A graceful shutdown was pending and the server is gone, so finish the job.
			await Chat.SendWatchdogMessage(String.Format(CultureInfo.InvariantCulture, "Server {0}! Exiting due to graceful termination request...", terminationVerb), cancellationToken).ConfigureAwait(false);
			DisposeAndNullControllers();
			return MonitorAction.Exit;
		}

		case MonitorActivationReason.ActiveServerRebooted:
		{
			// Read the state before resetting it; the decision below uses the captured value.
			var stateAtReboot = Server.RebootState;
			Server.ResetRebootState();

			if (stateAtReboot == Watchdog.RebootState.Normal)
			{
				return HandleNormalReboot();
			}

			if (stateAtReboot == Watchdog.RebootState.Restart)
			{
				return MonitorAction.Restart;
			}

			if (stateAtReboot != Watchdog.RebootState.Shutdown)
			{
				throw new InvalidOperationException($"Invalid reboot state: {stateAtReboot}");
			}

			// Graceful shutdown time.
			await Chat.SendWatchdogMessage("Active server rebooted! Exiting due to graceful termination request...", cancellationToken).ConfigureAwait(false);
			DisposeAndNullControllers();
			return MonitorAction.Exit;
		}

		case MonitorActivationReason.ActiveLaunchParametersUpdated:
			await Server.SetRebootState(Watchdog.RebootState.Restart, cancellationToken).ConfigureAwait(false);
			return MonitorAction.Continue;

		case MonitorActivationReason.NewDmbAvailable:
			await HandleNewDmbAvailable(cancellationToken).ConfigureAwait(false);
			return MonitorAction.Continue;

		// Activation reasons that concern an inactive server are rejected here.
		case MonitorActivationReason.InactiveServerCrashed:
		case MonitorActivationReason.InactiveServerRebooted:
		case MonitorActivationReason.InactiveServerStartupComplete:
			throw new NotSupportedException($"Unsupported activation reason: {reason}");

		default:
			throw new InvalidOperationException($"Invalid activation reason: {reason}");
	}
}
/// <inheritdoc />
protected sealed override async Task MonitorLifetimes(CancellationToken cancellationToken)
{
	Logger.LogTrace("Entered MonitorLifetimes");

	// Waits for something to happen to the server, dispatches every completed activation to
	// HandleMonitorWakeup and performs a full relaunch (with back-off) when asked to restart.
	var pass = 1;
	for (var pendingAction = MonitorAction.Continue; pendingAction != MonitorAction.Exit; ++pass)
	{
		// each pass starts out as Continue
		pendingAction = MonitorAction.Continue;

		Logger.LogDebug("Iteration {0} of monitor loop", pass);
		try
		{
			Logger.LogDebug("Server Compile Job ID: {0}", Server.Dmb.CompileJob.Id);

			// snapshot the activation tasks, each local is nulled once it has been examined
			Task lifetimeTask = Server.Lifetime;
			var rebootTask = Server.OnReboot;
			Task parametersChangedTask = ActiveParametersUpdated.Task;
			var newerDmbTask = DmbFactory.OnNewerDmb;

			// lets a cancellation request interrupt the wait
			var cancellationSignal = new TaskCompletionSource<object>();
			using (cancellationToken.Register(() => cancellationSignal.SetCanceled()))
			{
				// sleep until any of the activations (or cancellation) fires
				await Task.WhenAny(lifetimeTask, rebootTask, newerDmbTask, cancellationSignal.Task, parametersChangedTask).ConfigureAwait(false);
				cancellationToken.ThrowIfCancellationRequested();
			}

			var pendingChat = Task.CompletedTask;
			using (await SemaphoreSlimContext.Lock(Semaphore, cancellationToken).ConfigureAwait(false))
			{
				// HandleMonitorWakeup is only ever called while holding the semaphore
				// several activations may have fired at once, drain them one by one
				var keepDraining = true;
				while (keepDraining && (pendingAction == MonitorAction.Continue || pendingAction == MonitorAction.Skip))
				{
					MonitorActivationReason firedReason = default; // always assigned before it is read

					// Examines one activation task and forgets it.
					// A pending Skip swallows the examined task and reverts the action to Continue.
					bool Consume(ref Task candidate, MonitorActivationReason candidateReason)
					{
						var finished = candidate?.IsCompleted == true;
						candidate = null;

						if (pendingAction == MonitorAction.Skip)
						{
							pendingAction = MonitorAction.Continue;
							return false;
						}

						if (!finished)
						{
							return false;
						}

						firedReason = candidateReason;
						return true;
					}

					// the order matters: the first completed task wins this round
					var somethingFired = Consume(ref lifetimeTask, MonitorActivationReason.ActiveServerCrashed)
						|| Consume(ref rebootTask, MonitorActivationReason.ActiveServerRebooted)
						|| Consume(ref newerDmbTask, MonitorActivationReason.NewDmbAvailable)
						|| Consume(ref parametersChangedTask, MonitorActivationReason.ActiveLaunchParametersUpdated);

					if (!somethingFired)
					{
						keepDraining = false;
						continue;
					}

					Logger.LogTrace("Monitor activation: {0}", firedReason);
					pendingAction = await HandleMonitorWakeup(firedReason, cancellationToken).ConfigureAwait(false);
				}
			}

			// full reboot required
			if (pendingAction == MonitorAction.Restart)
			{
				Logger.LogDebug("Next state action is to restart");
				DisposeAndNullControllers();

				for (var attempt = 1; pendingAction == MonitorAction.Restart; ++attempt)
				{
					Exception relaunchFailure = null;
					using (await SemaphoreSlimContext.Lock(Semaphore, cancellationToken).ConfigureAwait(false))
					{
						try
						{
							// relaunch without announcements and without restarting the monitor
							await LaunchImplNoLock(false, false, null, cancellationToken).ConfigureAwait(false);
							if (Running)
							{
								Logger.LogDebug("Relaunch successful, resetting monitor state...");
								break; // back to the main loop
							}
						}
						catch (OperationCanceledException)
						{
							throw;
						}
						catch (Exception launchError)
						{
							relaunchFailure = launchError;
						}
					}

					await pendingChat.ConfigureAwait(false);
					if (Running)
					{
						continue;
					}

					if (relaunchFailure == null)
					{
						Logger.LogWarning("Failed to automatically restart the watchdog! Attempt: {0}", attempt);
					}
					else
					{
						Logger.LogWarning("Failed to automatically restart the watchdog! Attempt: {0}, Exception: {1}", attempt, relaunchFailure);
					}

					// doubles with every attempt, capped at one hour
					var backoffSeconds = Math.Min(Math.Pow(2, attempt), 3600);
					pendingChat = Chat.SendWatchdogMessage(String.Format(CultureInfo.InvariantCulture, "Failed to restart watchdog (Attempt: {0}), retrying in {1} seconds...", attempt, backoffSeconds), cancellationToken);
					await Task.WhenAll(AsyncDelayer.Delay(TimeSpan.FromSeconds(backoffSeconds), cancellationToken), pendingChat).ConfigureAwait(false);
				}
			}
		}
		catch (OperationCanceledException)
		{
			Logger.LogDebug("Monitor cancelled");
			break;
		}
		catch (Exception monitorFailure)
		{
			// this is not expected to ever happen, log it, tell chat and go around again
			Logger.LogError("Monitor crashed! Iteration: {0}, NextAction: {1}, Exception: {2}", pass, pendingAction, monitorFailure);
			await Chat.SendWatchdogMessage(String.Format(CultureInfo.InvariantCulture, "Monitor crashed, this should NEVER happen! Please report this, full details in logs! Restarting monitor... Error: {0}", monitorFailure.Message), cancellationToken).ConfigureAwait(false);
		}
	}

	Logger.LogTrace("Monitor exiting...");
}
/// <summary>
/// Handles the actions to take when the monitor has to "wake up"
/// </summary>
/// <param name="activationReason">The <see cref="MonitorActivationReason"/> that caused the invocation</param>
/// <param name="monitorState">The current <see cref="MonitorState"/>. Will be modified upon return</param>
/// <param name="cancellationToken">The <see cref="CancellationToken"/> for the operation</param>
/// <returns>A <see cref="Task"/> representing the running operation</returns>
#pragma warning disable CA1502 // TODO: Decomplexify
async Task HandlerMonitorWakeup(MonitorActivationReason activationReason, MonitorState monitorState, CancellationToken cancellationToken)
{
	Logger.LogDebug("Monitor activation. Reason: {0}", activationReason);

	// this is where the bulk of the watchdog handling code lives, split into the local functions below

	// returns true if the inactive server can't be used immediately
	// also sets monitor to restart if the above holds
	bool FullRestartDeadInactive()
	{
		if (monitorState.RebootingInactiveServer || monitorState.InactiveServerCritFail)
		{
			Logger.LogInformation("Inactive server is {0}! Restarting monitor...", monitorState.InactiveServerCritFail ? "critically failed" : "still rebooting");
			monitorState.NextAction = MonitorAction.Restart; // will dispose server
			return true;
		}

		return false;
	}

	// tries to set inactive server's port to the public game port
	// doesn't handle closing active server's port
	// returns true on success and swaps inactiveserver and activeserver also sets LastLaunchParameters to ActiveLaunchParameters
	// on failure, sets monitor to restart
	async Task<bool> MakeInactiveActive()
	{
		Logger.LogDebug("Setting inactive server to port {0}...", ActiveLaunchParameters.PrimaryPort.Value);
		var result = await monitorState.InactiveServer.SetPort(ActiveLaunchParameters.PrimaryPort.Value, cancellationToken).ConfigureAwait(false);
		if (!result)
		{
			Logger.LogWarning("Failed to activate inactive server! Restarting monitor...");
			monitorState.NextAction = MonitorAction.Restart; // will dispose server
			return false;
		}

		// inactive server should always be using active launch parameters
		LastLaunchParameters = ActiveLaunchParameters;

		var tmp = monitorState.ActiveServer;
		monitorState.ActiveServer = monitorState.InactiveServer;
		monitorState.InactiveServer = tmp;
		alphaIsActive = !AlphaIsActive;
		monitorState.ActiveServer.EnableCustomChatCommands();
		return true;
	}

	// Kills and tries to launch inactive server with the latest dmb
	// falls back to the active server's dmb on failure
	// Sets critfail on inactive server failing that
	// Sets the next action to Break or Continue depending on breakAfter, returns nothing
	async Task UpdateAndRestartInactiveServer(bool breakAfter)
	{
		// replace the notification tcs so later reads of ActiveParametersUpdated get a fresh one
		ActiveParametersUpdated = new TaskCompletionSource<object>();
		monitorState.InactiveServer.Dispose(); // kill or recycle it
		monitorState.NextAction = breakAfter ? MonitorAction.Break : MonitorAction.Continue;

		Logger.LogInformation("Rebooting inactive server...");
		var newDmb = DmbFactory.LockNextDmb(1);
		try
		{
			monitorState.InactiveServer = await SessionControllerFactory.LaunchNew(ActiveLaunchParameters, newDmb, null, false, !monitorState.ActiveServer.IsPrimary, false, cancellationToken).ConfigureAwait(false);
			monitorState.InactiveServer.SetHighPriority();
		}
		catch (OperationCanceledException)
		{
			throw;
		}
		catch (Exception e)
		{
			Logger.LogError("Error occurred while recreating server! Attempting backup strategy of running DMB of running server! Exception: {0}", e.ToString());

			// ahh jeez, what do we do here?
			// this is our fault, so it should never happen but
			// idk maybe a database error while handling the newest dmb?
			// either way try to start it using the active server's dmb as a backup
			try
			{
				var dmbBackup = await DmbFactory.FromCompileJob(monitorState.ActiveServer.Dmb.CompileJob, cancellationToken).ConfigureAwait(false);

				if (dmbBackup == null)
				{
					// just give up, if THAT compile job is failing then the ActiveServer is gonna crash soon too or already has
					throw new JobException("Creating backup DMB provider failed!");
				}

				monitorState.InactiveServer = await SessionControllerFactory.LaunchNew(ActiveLaunchParameters, dmbBackup, null, false, !monitorState.ActiveServer.IsPrimary, false, cancellationToken).ConfigureAwait(false);
				monitorState.InactiveServer.SetHighPriority();

				// the template has a {0} placeholder, fill it with the reason the first launch failed instead of sending it raw
				await Chat.SendWatchdogMessage(String.Format(CultureInfo.InvariantCulture, "Staging newest DMB on inactive server failed: {0} Falling back to previous dmb...", e.Message), cancellationToken).ConfigureAwait(false);
			}
			catch (OperationCanceledException)
			{
				throw;
			}
			catch (Exception e2)
			{
				// both the primary and the backup launch failed
				Logger.LogError("Backup strategy failed! Monitor will restart when active server reboots! Exception: {0}", e2.ToString());
				monitorState.InactiveServerCritFail = true;
				await Chat.SendWatchdogMessage("Attempted reboot of inactive server failed. Watchdog will reset when active server fails or exits", cancellationToken).ConfigureAwait(false);
				return;
			}
		}

		Logger.LogInformation("Successfully relaunched inactive server!");
		monitorState.RebootingInactiveServer = true;
	}

	string ExitWord(ISessionController controller) => controller.TerminationWasRequested ? "exited" : "crashed";

	// reason handling
	switch (activationReason)
	{
		case MonitorActivationReason.ActiveServerCrashed:
			if (monitorState.ActiveServer.RebootState == Watchdog.RebootState.Shutdown)
			{
				// the time for graceful shutdown is now
				await Chat.SendWatchdogMessage(String.Format(CultureInfo.InvariantCulture, "Active server {0}! Exiting due to graceful termination request...", ExitWord(monitorState.ActiveServer)), cancellationToken).ConfigureAwait(false);
				DisposeAndNullControllers();
				monitorState.NextAction = MonitorAction.Exit;
				break;
			}

			if (FullRestartDeadInactive())
			{
				// tell chat about it and go ahead
				await Chat.SendWatchdogMessage(String.Format(CultureInfo.InvariantCulture, "Active server {0}! Inactive server unable to online!", ExitWord(monitorState.ActiveServer)), cancellationToken).ConfigureAwait(false);

				// we've already been set to restart
				break;
			}

			// tell chat about it
			await Chat.SendWatchdogMessage(String.Format(CultureInfo.InvariantCulture, "Active server {0}! Onlining inactive server...", ExitWord(monitorState.ActiveServer)), cancellationToken).ConfigureAwait(false);

			// try to activate the inactive server
			if (!await MakeInactiveActive().ConfigureAwait(false))
			{
				break; // failing that, we've already been set to restart
			}

			// bring up another inactive server
			await UpdateAndRestartInactiveServer(true).ConfigureAwait(false);
			break;
		case MonitorActivationReason.InactiveServerCrashed:
			// just announce and try to bring it back
			await Chat.SendWatchdogMessage(String.Format(CultureInfo.InvariantCulture, "Inactive server {0}! Rebooting...", ExitWord(monitorState.InactiveServer)), cancellationToken).ConfigureAwait(false);
			await UpdateAndRestartInactiveServer(false).ConfigureAwait(false);
			break;
		case MonitorActivationReason.ActiveServerRebooted:
			// ideal goal: active server just closed its port
			// tell inactive server to open its port and that's now the active server
			var rebootState = monitorState.ActiveServer.RebootState;
			monitorState.ActiveServer.ResetRebootState(); // the DMAPI has already done this internally

			if (FullRestartDeadInactive() && rebootState != Watchdog.RebootState.Shutdown)
			{
				break; // full restart if the inactive server is unusable, unless a graceful shutdown is pending
			}

			// what matters here is the RebootState
			var restartOnceSwapped = false;
			switch (rebootState)
			{
				case Watchdog.RebootState.Normal:
					// life as normal
					break;
				case Watchdog.RebootState.Restart:
					// reboot the current active server once the inactive one activates
					restartOnceSwapped = true;
					break;
				case Watchdog.RebootState.Shutdown:
					// graceful shutdown time
					await Chat.SendWatchdogMessage("Active server rebooted! Exiting due to graceful termination request...", cancellationToken).ConfigureAwait(false);
					DisposeAndNullControllers();
					monitorState.NextAction = MonitorAction.Exit;
					return;
				default:
					throw new InvalidOperationException($"Invalid reboot state: {rebootState}");
			}

			// are both servers now running the same CompileJob?
			var sameCompileJob = monitorState.InactiveServer.Dmb.CompileJob.Id == monitorState.ActiveServer.Dmb.CompileJob.Id;
			if (!sameCompileJob || ActiveLaunchParameters != LastLaunchParameters)
			{
				restartOnceSwapped = true; // need a new launch to update either settings or compile job
			}

			if (restartOnceSwapped)
			{
				/*
				 * we need to manually restart active server
				 * just kill it here, easier that way
				 */
				monitorState.ActiveServer.Dispose();
			}

			var activeServerStillHasPortOpen = !restartOnceSwapped && !monitorState.ActiveServer.ClosePortOnReboot;
			if (activeServerStillHasPortOpen)
			{
				/*
				 * we didn't want active server to swap for some reason and it still has its port open
				 * just continue as normal
				 */
				break;
			}

			if (!await MakeInactiveActive().ConfigureAwait(false))
			{
				break; // monitor will restart
			}

			// servers now swapped
			// enable this now if inactive server is not still valid
			monitorState.ActiveServer.ClosePortOnReboot = restartOnceSwapped;

			if (!restartOnceSwapped)
			{
				/*
				 * now try to reopen it on the private port
				 * failing that, just reboot it
				 */
				restartOnceSwapped = !await monitorState.InactiveServer.SetPort(ActiveLaunchParameters.SecondaryPort.Value, cancellationToken).ConfigureAwait(false);
			}

			// break either way because any issues past this point would be solved by the reboot
			if (restartOnceSwapped) // for one reason or another
			{
				await UpdateAndRestartInactiveServer(true).ConfigureAwait(false); // update and reboot
			}
			else
			{
				monitorState.NextAction = MonitorAction.Skip; // only skip checking inactive server rebooted, it's guaranteed InactiveServerStartup complete wouldn't fire this iteration
			}

			break;
		case MonitorActivationReason.InactiveServerRebooted:
			// just don't let the active server close its port if the inactive server isn't ready
			monitorState.RebootingInactiveServer = true;
			monitorState.InactiveServer.ResetRebootState();
			monitorState.ActiveServer.ClosePortOnReboot = false;
			monitorState.NextAction = MonitorAction.Continue;
			break;
		case MonitorActivationReason.InactiveServerStartupComplete:
			// opposite of above case
			monitorState.RebootingInactiveServer = false;
			monitorState.ActiveServer.ClosePortOnReboot = true;
			monitorState.NextAction = MonitorAction.Continue;
			break;
		case MonitorActivationReason.NewDmbAvailable:
		case MonitorActivationReason.ActiveLaunchParametersUpdated:
			// just reload the inactive server and wait for a swap to apply the changes
			await UpdateAndRestartInactiveServer(true).ConfigureAwait(false);
			break;
		default:
			Trace.Assert(false, String.Format(CultureInfo.InvariantCulture, "Invalid monitor activation reason: {0}!", activationReason));
			break;
	}
}
/// <inheritdoc />
#pragma warning disable CA1502 // TODO: Decomplexify
protected override async Task HandleMonitorWakeup(MonitorActivationReason activationReason, MonitorState monitorState, CancellationToken cancellationToken)
{
	Logger.LogDebug("Monitor activation. Reason: {0}", activationReason);

	// This is where the bulk of the watchdog handling code lives.
	// It is built from local functions that capture monitorState and cancellationToken; each is described below.

	// Returns true if the inactive server can't be used immediately
	// (it is flagged as still rebooting or as critically failed).
	// When that holds, it also sets the monitor's next action to Restart.
	bool FullRestartDeadInactive()
	{
		if (monitorState.RebootingInactiveServer || monitorState.InactiveServerCritFail)
		{
			Logger.LogInformation("Inactive server is {0}! Restarting monitor...", monitorState.InactiveServerCritFail ? "critically failed" : "still rebooting");
			monitorState.NextAction = MonitorAction.Restart; // will dispose server
			return true;
		}

		return false;
	}

	// Tries to set the inactive server's port to the public game port (ActiveLaunchParameters.PrimaryPort).
	// Doesn't handle closing the active server's port.
	// On success: sets LastLaunchParameters to ActiveLaunchParameters, swaps ActiveServer and InactiveServer,
	// flips which server is considered active, enables custom chat commands on the new active server and returns true.
	// On failure: sets the monitor's next action to Restart and returns false.
	async Task<bool> MakeInactiveActive()
	{
		Logger.LogDebug("Setting inactive server to port {0}...", ActiveLaunchParameters.PrimaryPort.Value);
		var result = await monitorState.InactiveServer.SetPort(ActiveLaunchParameters.PrimaryPort.Value, cancellationToken).ConfigureAwait(false);

		if (!result)
		{
			Logger.LogWarning("Failed to activate inactive server! Restarting monitor...");
			monitorState.NextAction = MonitorAction.Restart; // will dispose server
			return false;
		}

		// inactive server should always be using active launch parameters
		LastLaunchParameters = ActiveLaunchParameters;

		var tmp = monitorState.ActiveServer;
		monitorState.ActiveServer = monitorState.InactiveServer;
		monitorState.InactiveServer = tmp;
		alphaIsActive = !AlphaIsActive;
		monitorState.ActiveServer.EnableCustomChatCommands();
		return true;
	}

	// Disposes the inactive server and tries to launch a replacement with the latest dmb.
	// If that launch throws, falls back to a dmb built from the active server's compile job.
	// If the fallback also throws, sets InactiveServerCritFail and returns without a usable inactive server.
	// The monitor's next action is set to Break when breakAfter is true and to Continue otherwise.
	// On a successful launch (either dmb) RebootingInactiveServer is set to true.
	// Cancellation is never swallowed: OperationCanceledException always propagates to the caller.
	async Task UpdateAndRestartInactiveServer(bool breakAfter)
	{
		// replace the completion source that signals launch parameter changes with a fresh one
		ActiveParametersUpdated = new TaskCompletionSource<object>();
		monitorState.InactiveServer.Dispose(); // kill or recycle it
		monitorState.NextAction = breakAfter ? MonitorAction.Break : MonitorAction.Continue;

		Logger.LogInformation("Rebooting inactive server...");
		var newDmb = DmbFactory.LockNextDmb(1);
		try
		{
			monitorState.InactiveServer = await SessionControllerFactory.LaunchNew(
				newDmb,
				null,
				ActiveLaunchParameters,
				false,
				!monitorState.ActiveServer.IsPrimary,
				false,
				cancellationToken)
				.ConfigureAwait(false);
			monitorState.InactiveServer.SetHighPriority();
		}
		catch (OperationCanceledException)
		{
			throw;
		}
		catch (Exception e)
		{
			Logger.LogError("Error occurred while recreating server! Attempting backup strategy of running DMB of running server! Exception: {0}", e.ToString());

			// ahh jeez, what do we do here?
			// this is our fault, so it should never happen but
			// idk maybe a database error while handling the newest dmb?
			// either way try to start it using the active server's dmb as a backup
			try
			{
				var dmbBackup = await DmbFactory.FromCompileJob(monitorState.ActiveServer.Dmb.CompileJob, cancellationToken).ConfigureAwait(false);

				if (dmbBackup == null) // NANI!?
				{
					// just give up, if THAT compile job is failing then the ActiveServer is gonna crash soon too or already has
					throw new InvalidOperationException("Watchdog double crit-fail!");
				}

				monitorState.InactiveServer = await SessionControllerFactory.LaunchNew(
					dmbBackup,
					null,
					ActiveLaunchParameters,
					false,
					!monitorState.ActiveServer.IsPrimary,
					false,
					cancellationToken)
					.ConfigureAwait(false);
				monitorState.InactiveServer.SetHighPriority();

				// include the reason the first launch failed; previously the "{0}" placeholder was sent to chat verbatim
				await Chat.SendWatchdogMessage(
					$"Staging newest DMB on inactive server failed: {e.Message} Falling back to previous dmb...",
					false,
					cancellationToken).ConfigureAwait(false);
			}
			catch (OperationCanceledException)
			{
				throw;
			}
			catch (Exception e2)
			{
				// both the primary and the backup launch failed
				Logger.LogError("Backup strategy failed! Monitor will restart when active server reboots! Exception: {0}", e2.ToString());
				monitorState.InactiveServerCritFail = true;
				await Chat.SendWatchdogMessage(
					"Attempted reboot of inactive server failed. Watchdog will reset when active server fails or exits",
					false,
					cancellationToken).ConfigureAwait(false);
				return;
			}
		}

		Logger.LogInformation("Successfully relaunched inactive server!");
		monitorState.RebootingInactiveServer = true;
	}