private async Task ProcessAskModeCoreAsync(CancellationToken cancellationToken)
{
    traceType.WriteInfo("Starting Autopilot coordination");
    this.lastSuccessfulRunAt = DateTimeOffset.UtcNow;

    while (!cancellationToken.IsCancellationRequested)
    {
        traceType.WriteInfo(
            "Autopilot coordinator: service name = {0}, partition ID = {1}, start time = {2:O}",
            this.environment.ServiceName,
            this.partitionId,
            this.coordinatorRunAsyncStartTime);

        ClearFailureMessages();

        Stopwatch watch = Stopwatch.StartNew();
        var context = new AskModeContext(this.configReader, this.environment.ServiceName);

        try
        {
            await ProcessAskModeOnce(context, cancellationToken);
        }
        catch (Exception ex)
        {
            if (!cancellationToken.IsCancellationRequested)
            {
                traceType.WriteWarning("Ask mode processing failed: {0}", ex);
                AddFailureMessage(false, $"Ask mode processing failed: {ex.Message}");
            }
        }

        TimeSpan timerPeriod = this.configReader.JobPollingInterval;
        ProcessRunResults(context, watch.Elapsed, timerPeriod);

        if (!cancellationToken.IsCancellationRequested)
        {
            traceType.WriteInfo("Next run will occur in {0} at {1:O}", timerPeriod, DateTimeOffset.UtcNow + timerPeriod);

            try
            {
                await Task.Delay(timerPeriod, cancellationToken);
            }
            catch (OperationCanceledException)
            {
                // Cancelled during the delay; fall through to the loop condition
                // so the shutdown trace below is still emitted.
            }
        }
    }

    traceType.WriteInfo("Cancellation requested. Stopping Autopilot coordination.");
}
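// For reference, a minimal sketch of the failure-message helpers used above.
// These are hypothetical: the real implementations live elsewhere in this
// class. The (isFatal, message) tuple shape is inferred from ProcessRunResults
// below, which reads Item1 as the fatal flag and Item2 as the message text.
private readonly List<Tuple<bool, string>> failureMessages = new List<Tuple<bool, string>>();

private void ClearFailureMessages()
{
    this.failureMessages.Clear();
}

private void AddFailureMessage(bool isFatal, string message)
{
    this.failureMessages.Add(Tuple.Create(isFatal, message));
}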
private async Task ProcessAskModeOnce(AskModeContext context, CancellationToken cancellationToken)
{
    cancellationToken.ThrowIfCancellationRequested();
    var machineInfo = await this.dmClient.GetMachineInfoAsync();

    cancellationToken.ThrowIfCancellationRequested();
    var pendingRecords = await this.dmClient.GetDevicesPendingMaintenanceAsync();

    foreach (var record in machineInfo)
    {
        if (record.Id.RepairType != RepairType.None)
        {
            traceType.WriteInfo(
                "GetMachineInfo: {0},{1},{2},{3}",
                record.Id.MachineName,
                record.Status,
                record.Id.RepairType,
                record.RepairActionState);

            context.SetRepairRecord(MachineMaintenanceRecord.FromActiveRepair(record.Id));
        }

        if (!this.configReader.AllowRepairTaskCompletionInMachineStates.Contains(record.Status))
        {
            traceType.WriteInfo(
                "Disallowing completion of repair tasks for {0} because its state is {1}",
                record.Id.MachineName,
                record.Status);

            context.DoNotCancelRepairsForMachine(record.Id.MachineName);
        }
    }

    foreach (var record in pendingRecords)
    {
        context.SetRepairRecord(record);
    }

    // TODO: consider retrying the fetch/reconcile/execute block, to avoid
    // extending repairs on transient RM errors.
    try
    {
        cancellationToken.ThrowIfCancellationRequested();

        // Fetch active repair tasks owned by this service
        IList<IRepairTask> repairTasks = await this.repairManager.GetRepairTaskListAsync(
            Guid.Empty,
            null,
            RepairTaskStateFilter.Active,
            this.environment.ServiceName);

        context.SetRepairTasks(repairTasks);

        // Reconcile DM and RM
        IEnumerable<IAction> actions = context.Reconcile();

        // Execute actions
        foreach (IAction action in actions)
        {
            cancellationToken.ThrowIfCancellationRequested();
            await ExecuteActionAsync(action);
        }
    }
    catch (Exception e) when (!(e is OperationCanceledException))
    {
        // Let cancellation propagate; report anything else as a non-fatal failure.
        traceType.WriteWarning("Reconcile/execute failed: {0}", e);
        AddFailureMessage(false, $"Reconcile/execute failed: {e.Message}");
    }

    // Modify delay times
    context.UpdateRepairDelays();

    // Push updates back to the DM
    foreach (var record in context.RepairRecords.Where(r => r.IsDelayModified))
    {
        cancellationToken.ThrowIfCancellationRequested();
        await UpdateMaintenanceDelayAsync(record);
    }

    // Report health against the nodes
    if (this.configReader.EnableNodeHealthReports)
    {
        foreach (var record in machineInfo)
        {
            // Treat machine status codes H and P as OK; anything else raises a warning.
            ReportNodeHealth(
                "Autopilot:MachineStatus",
                record.Id.MachineName,
                (record.Status == "H" || record.Status == "P") ? HealthState.Ok : HealthState.Warning,
                record.Status);

            ReportNodeHealth(
                "Autopilot:Repair",
                record.Id.MachineName,
                (record.Id.RepairType == RepairType.None) ? HealthState.Ok : HealthState.Warning,
                $"{record.Id.RepairType},{record.RepairActionState}");
        }
    }
}
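// Hypothetical sketch of the per-node health reporting used above. The
// signature (property, node name, state, description) is inferred from the
// call sites; the Service Fabric health API calls are standard, but the
// fabricClient field, source ID, and TTL policy here are assumptions.
private void ReportNodeHealth(string property, string nodeName, HealthState state, string description)
{
    var healthInformation = new HealthInformation("AutopilotCoordinator", property, state)
    {
        Description = description,
        TimeToLive = TimeSpan.FromMinutes(30), // illustrative TTL; lets stale reports age out
        RemoveWhenExpired = true,
    };

    // Assumes Autopilot machine names map directly to Service Fabric node names.
    this.fabricClient.HealthManager.ReportHealth(new NodeHealthReport(nodeName, healthInformation));
}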
private void ProcessRunResults(AskModeContext context, TimeSpan runDuration, TimeSpan runInterval)
{
    TimeSpan warningThreshold = this.configReader.CoordinatorFailureWarningThreshold;
    TimeSpan maxRetryDuration = this.configReader.CoordinatorFailureRetryDuration;

    DateTimeOffset now = DateTimeOffset.UtcNow;
    TimeSpan elapsed = now - this.lastSuccessfulRunAt;

    int errorCount = this.failureMessages.Count;
    int fatalCount = this.failureMessages.Count(m => m.Item1); // isFatal == true

    traceType.WriteInfo(
        "ProcessRunResults: error count = {0}, fatal error count = {1}, last successful run = {2:O} ({3} ago), run duration = {4} ms, run interval = {5} ms",
        errorCount,
        fatalCount,
        this.lastSuccessfulRunAt,
        elapsed,
        runDuration.TotalMilliseconds,
        runInterval.TotalMilliseconds);

    if (errorCount == 0)
    {
        this.lastSuccessfulRunAt = now;

        int pendingCount = context.RepairRecords.Count(r => r.IsPendingApproval);
        int activeCount = context.RepairRecords.Count() - pendingCount;
        int overdueCount = context.OverdueRepairTaskCount;

        if (overdueCount > 0)
        {
            UpdateCoordinatorHealthStatus(
                HealthState.Warning,
                $"Autopilot coordinator is operating normally, but processing of some repair tasks is taking longer than expected. " +
                $"(overdue: {overdueCount}, pending: {pendingCount}, active: {activeCount})");
        }
        else if (runDuration.TotalMilliseconds > (runInterval.TotalMilliseconds * this.configReader.CoordinatorDutyCycleWarningThreshold))
        {
            UpdateCoordinatorHealthStatus(
                HealthState.Warning,
                $"Autopilot coordinator processing is taking longer than expected. " +
                $"(last run duration: {runDuration}, pending: {pendingCount}, active: {activeCount})");
        }
        else
        {
            UpdateCoordinatorHealthStatus(
                HealthState.Ok,
                $"Autopilot coordinator is operating normally. (pending: {pendingCount}, active: {activeCount})");
        }
    }
    else if (fatalCount > 0)
    {
        // A fatal error forces an immediate restart of the coordinator.
        string firstFatalError = this.failureMessages.First(m => m.Item1).Item2;
        string message = $"Autopilot coordinator is restarting due to error: {firstFatalError}";
        UpdateCoordinatorHealthStatus(HealthState.Warning, message);
        throw new ApplicationException(message);
    }
    else if (elapsed > maxRetryDuration)
    {
        // Non-fatal errors are tolerated only until the retry window runs out.
        string firstError = this.failureMessages.First().Item2;
        string message = $"Autopilot coordinator is restarting due to lack of recent success: {firstError}";
        UpdateCoordinatorHealthStatus(HealthState.Warning, message);
        throw new ApplicationException(message);
    }
    else if (elapsed > warningThreshold)
    {
        string firstError = this.failureMessages.First().Item2;
        string message = $"Autopilot coordinator encountered errors: {firstError}";
        UpdateCoordinatorHealthStatus(HealthState.Warning, message);
    }
}
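// Hypothetical sketch of the coordinator-level health update used above,
// reporting against this service's partition so restarts and stalls surface
// in cluster health. The fabricClient field and the source/property names are
// assumptions; the real helper may route through a different abstraction.
private void UpdateCoordinatorHealthStatus(HealthState healthState, string description)
{
    var healthInformation = new HealthInformation("AutopilotCoordinator", "Coordination", healthState)
    {
        Description = description,
    };

    this.fabricClient.HealthManager.ReportHealth(new PartitionHealthReport(this.partitionId, healthInformation));
}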