/// <summary>
/// Tests every candidate repair task against a single maintenance record and,
/// when none of them matches, emits an action that creates a new repair task
/// for the record.
/// </summary>
/// <param name="record">The maintenance record being reconciled.</param>
/// <param name="candidateRepairTasks">The repair tasks to test against <paramref name="record"/>.</param>
/// <returns>The candidates that did not match <paramref name="record"/>, in enumeration order.</returns>
private IEnumerable<IRepairTask> ReconcileRepairRecord(MachineMaintenanceRecord record, IEnumerable<IRepairTask> candidateRepairTasks)
{
    string idPrefix = GenerateTaskId(record.RecordId, null);
    var leftovers = new List<IRepairTask>();
    bool anyMatched = false;

    foreach (IRepairTask candidate in candidateRepairTasks)
    {
        if (!TryReconcileRepair(record, idPrefix, candidate))
        {
            // Not matched, put it back in the queue
            leftovers.Add(candidate);
            continue;
        }

        anyMatched = true;
    }

    if (anyMatched)
    {
        return leftovers;
    }

    // No repair task corresponds to this record: warn if the record is not
    // pending approval, then request creation of a repair task for it.
    if (!record.IsPendingApproval)
    {
        traceType.WriteWarning(
            "Repair {0} is executing without approval from a repair task",
            record.RecordId);
    }

    EmitActionCreateRepairTaskInPreparing(record);
    return leftovers;
}
/// <summary>
/// Initializes the action with the maintenance record, service name, node
/// impact level, and health-check flag it was created for.
/// </summary>
/// <param name="record">The maintenance record; passed through <c>Validate</c>.</param>
/// <param name="serviceName">The service name; passed through <c>Validate</c>.</param>
/// <param name="impactLevel">The node impact level stored on the action.</param>
/// <param name="performHealthCheck">The health-check flag stored on the action.</param>
public CreateRepairTaskInPreparingAction(
    MachineMaintenanceRecord record,
    string serviceName,
    NodeImpactLevel impactLevel,
    bool performHealthCheck)
{
    // Run both validations up front, in parameter order, before touching any field.
    // NOTE(review): Validate presumably rejects null arguments - confirm against its definition.
    var checkedRecord = record.Validate("record");
    var checkedServiceName = serviceName.Validate("serviceName");

    this.record = checkedRecord;
    this.serviceName = checkedServiceName;
    this.impactLevel = impactLevel;
    this.performHealthCheck = performHealthCheck;
}
/// <summary>
/// Stores a repair record in the maintenance map, keyed by its record ID,
/// unless the record's repair type is <c>RepairType.NoOp</c>.
/// </summary>
/// <param name="record">The repair record to store; passed through <c>Validate</c> first.</param>
/// <returns>true if the record was added, false if it was ignored</returns>
public bool SetRepairRecord(MachineMaintenanceRecord record)
{
    record.Validate("record");

    bool isNoOp = record.RepairType == RepairType.NoOp;
    if (!isNoOp)
    {
        // Indexer assignment; presumably overwrites an existing entry with the
        // same record ID (verify against the map's type).
        this.maintenanceMap[record.RecordId] = record;
        return true;
    }

    traceType.WriteInfo("Ignoring repair {0}", record.RecordId);
    return false;
}
/// <summary>
/// Logs and emits a <c>CreateRepairTaskInPreparingAction</c> for the given record,
/// using the node impact translated from the record's repair type.
/// </summary>
/// <param name="record">The maintenance record that needs a repair task.</param>
private void EmitActionCreateRepairTaskInPreparing(MachineMaintenanceRecord record)
{
    // TODO allow config overrides
    NodeImpactLevel nodeImpact = TranslateRepairToNodeImpact(record.RepairType);

    // TODO
    bool healthCheck = false;

    traceType.WriteInfo(
        "Creating repair task for {0} (impact = {1}, health check = {2})",
        record.RecordId,
        nodeImpact,
        healthCheck);

    var createAction = new CreateRepairTaskInPreparingAction(
        record,
        this.serviceName,
        nodeImpact,
        healthCheck);

    EmitAction(createAction);
}
/// <summary>
/// Sends the record's new maintenance delay to the DM client. Any exception is
/// logged as a warning and recorded as a failure message instead of being rethrown.
/// </summary>
/// <param name="record">The record whose <c>NewDelay</c> is sent for its <c>MachineName</c>.</param>
/// <returns>A task that completes once the update attempt has finished.</returns>
private async Task UpdateMaintenanceDelayAsync(MachineMaintenanceRecord record)
{
    try
    {
        traceType.WriteInfo(
            "Updating maintenance delay for {0}: {1} -> {2}",
            record.RecordId,
            record.OriginalDelay,
            record.NewDelay);

        var pendingUpdate = this.dmClient.UpdateMaintenanceDelayAsync(
            record.MachineName,
            record.NewDelay);

        await pendingUpdate;
    }
    catch (Exception failure)
    {
        // Best effort: note the failure and let the caller continue.
        traceType.WriteWarning("Failed to process {0}: {1}", record.RecordId, failure);
        AddFailureMessage(false, $"Failed to update maintenance delay: {record.RecordId}");
    }
}
/// <summary>
/// Fetches the devices-pending-maintenance response from the DM client, verifies it,
/// and parses its CSV payload into one pending-repair record per row.
/// </summary>
/// <remarks>
/// Reads the MACHINENAME, ACTION and DELAYINSEC columns. DELAYINSEC is parsed as an
/// invariant-culture integer so that the result does not depend on the culture of the
/// current thread; a value that cannot be parsed is logged and replaced by a small
/// fallback delay.
/// </remarks>
/// <returns>The parsed records, in the order the rows were read.</returns>
public async Task<IList<MachineMaintenanceRecord>> GetDevicesPendingMaintenanceAsync()
{
    // Used when DELAYINSEC cannot be parsed: some small value, so that we assume
    // the repair is already approved.
    const int fallbackDelaySeconds = 59;

    var response = await dmClient.GetDevicesPendingMaintenanceAsync();
    VerifyResponse(response);

    using (var csvReader = new CsvReader(new StringReader(response.Response)))
    {
        List<MachineMaintenanceRecord> records = new List<MachineMaintenanceRecord>();

        while (csvReader.Read())
        {
            string machineName = csvReader["MACHINENAME"];
            string repairType = csvReader["ACTION"];
            string delayInSec = csvReader["DELAYINSEC"];

            // The payload is machine-generated, so parse with the invariant culture
            // rather than whatever culture the current thread happens to use.
            int delay;
            bool parsed = int.TryParse(
                delayInSec,
                System.Globalization.NumberStyles.Integer,
                System.Globalization.CultureInfo.InvariantCulture,
                out delay);

            if (!parsed)
            {
                traceType.WriteWarning(
                    "Unable to parse current delay: MACHINENAME='{0}' ACTION='{1}' DELAYINSEC='{2}'",
                    machineName,
                    repairType,
                    delayInSec);

                delay = fallbackDelaySeconds;
            }

            TimeSpan delaySpan = TimeSpan.FromSeconds(delay);
            var recordId = new MaintenanceRecordId(machineName, repairType);
            var record = MachineMaintenanceRecord.FromPendingRepair(recordId, delaySpan);
            records.Add(record);
        }

        return records;
    }
}
/// <summary>
/// Fetches the repair list response from the DM client, verifies it, and parses its
/// CSV payload into one active-repair record per row.
/// </summary>
/// <remarks>Reads the MACHINENAME and REQUESTTYPE columns of each row.</remarks>
/// <returns>The parsed records, in the order the rows were read.</returns>
public async Task<IList<MachineMaintenanceRecord>> GetRepairListAsync()
{
    var response = await dmClient.GetRepairListAsync();
    VerifyResponse(response);

    using (var reader = new CsvReader(new StringReader(response.Response)))
    {
        var activeRepairs = new List<MachineMaintenanceRecord>();

        while (reader.Read())
        {
            string machine = reader["MACHINENAME"];
            string requestType = reader["REQUESTTYPE"];

            var id = new MaintenanceRecordId(machine, requestType);
            activeRepairs.Add(MachineMaintenanceRecord.FromActiveRepair(id));
        }

        return activeRepairs;
    }
}
/// <summary>
/// Initializes the action with the maintenance record and the repair task it is paired with.
/// </summary>
/// <param name="record">The maintenance record stored on the action (not validated here).</param>
/// <param name="repairTask">The repair task stored on the action (not validated here).</param>
public ExecuteRepairAction(MachineMaintenanceRecord record, IRepairTask repairTask)
{
    // Plain field capture; the two assignments are independent of each other.
    this.repairTask = repairTask;
    this.record = record;
}
/// <summary>
/// Logs and emits an <c>ExecuteRepairAction</c> pairing the record with the given repair task.
/// </summary>
/// <param name="record">The maintenance record whose repair is to be executed.</param>
/// <param name="repairTask">The repair task named in the log message as the source of approval.</param>
private void EmitActionExecuteRepair(MachineMaintenanceRecord record, IRepairTask repairTask)
{
    traceType.WriteInfo(
        "Executing repair {0} with approval from repair task {1}",
        record.RecordId,
        repairTask.TaskId);

    var executeAction = new ExecuteRepairAction(record, repairTask);
    EmitAction(executeAction);
}
/// <summary>
/// Checks whether a repair task belongs to the given maintenance record (its task ID
/// starts with <paramref name="taskIdPrefix"/>) and, if so, acts on the task's state:
/// logs overdue or unapproved conditions and emits an execute-repair action when the
/// record is still pending approval and the task allows execution.
/// </summary>
/// <param name="record">The maintenance record being reconciled.</param>
/// <param name="taskIdPrefix">The task ID prefix that identifies repair tasks for <paramref name="record"/>.</param>
/// <param name="repairTask">The repair task to test.</param>
/// <returns>
/// true if the task ID has the prefix and the task is in the Preparing, Approved,
/// Executing or Restoring state; otherwise false.
/// </returns>
private bool TryReconcileRepair(MachineMaintenanceRecord record, string taskIdPrefix, IRepairTask repairTask)
{
    // Task IDs are identifiers, not linguistic text, so compare ordinally instead of
    // using the culture-sensitive default of string.StartsWith(string).
    if (!repairTask.TaskId.StartsWith(taskIdPrefix, StringComparison.Ordinal))
    {
        return false;
    }

    if (repairTask.State != RepairTaskState.Completed)
    {
        traceType.WriteInfo(
            "{3} repair {0} matched repair task {1} in state {2}",
            record.RecordId,
            repairTask.TaskId,
            repairTask.State,
            record.IsPendingApproval ? "Pending" : "Active");
    }

    switch (repairTask.State)
    {
        case RepairTaskState.Preparing:
            if (IsTimestampOld(repairTask.PreparingTimestamp, this.configReader.OverdueRepairTaskPreparingThreshold))
            {
                traceType.WriteWarning(
                    "Repair task {0} has been in Preparing since {1:O} ({2} ago)",
                    repairTask.TaskId,
                    repairTask.PreparingTimestamp,
                    DateTime.UtcNow - repairTask.PreparingTimestamp);

                this.OverdueRepairTaskCount++;
            }

            if (!record.IsPendingApproval)
            {
                traceType.WriteWarning(
                    "Repair {0} is executing without approval from repair task {1}",
                    record.RecordId,
                    repairTask.TaskId);
            }

            // Nothing else to do
            return true;

        case RepairTaskState.Approved:
            if (!IsTimestampOld(repairTask.ApprovedTimestamp, this.configReader.PostApprovalExecutionDelay))
            {
                // Approval is more recent than the configured post-approval delay.
                traceType.WriteInfo(
                    "Delaying execution of {0} because repair task {1} was approved recently (at {2:O})",
                    record.RecordId,
                    repairTask.TaskId,
                    repairTask.ApprovedTimestamp);
            }
            else if (record.IsPendingApproval)
            {
                EmitActionExecuteRepair(record, repairTask);
            }
            else
            {
                traceType.WriteWarning(
                    "Repair {0} is executing without approval from repair task {1}",
                    record.RecordId,
                    repairTask.TaskId);
            }

            return true;

        case RepairTaskState.Executing:
            if (record.IsPendingApproval)
            {
                EmitActionExecuteRepair(record, repairTask);
            }

            return true;

        case RepairTaskState.Restoring:
            if (IsTimestampOld(repairTask.RestoringTimestamp, this.configReader.OverdueRepairTaskRestoringThreshold))
            {
                traceType.WriteWarning(
                    "Repair task {0} has been in Restoring since {1:O} ({2} ago)",
                    repairTask.TaskId,
                    repairTask.RestoringTimestamp,
                    DateTime.UtcNow - repairTask.RestoringTimestamp);

                this.OverdueRepairTaskCount++;
            }

            return true;

        default:
            // Any other state (including Completed) is reported as "no match",
            // even though the task ID carries the record's prefix.
            return false;
    }
}
/// <summary>
/// Runs one ask-mode pass: loads machine info and pending maintenance from the DM
/// client into <paramref name="context"/>, reconciles them against the active repair
/// tasks from the repair manager, executes the resulting actions, pushes modified
/// delays back to the DM, and optionally reports node health.
/// </summary>
/// <remarks>
/// Failures in the fetch/reconcile/execute step are logged and recorded as a failure
/// message, and the pass continues with the delay update. Cancellation is not treated
/// as such a failure: when <paramref name="cancellationToken"/> has been cancelled, the
/// <see cref="OperationCanceledException"/> propagates to the caller.
/// </remarks>
/// <param name="context">The ask-mode context that collects records and repair tasks and produces actions.</param>
/// <param name="cancellationToken">Token checked before each remote call and each executed action.</param>
/// <returns>A task that completes when the pass has finished.</returns>
/// <exception cref="OperationCanceledException">The token was cancelled during the pass.</exception>
private async Task ProcessAskModeOnce(AskModeContext context, CancellationToken cancellationToken)
{
    cancellationToken.ThrowIfCancellationRequested();
    var machineInfo = await this.dmClient.GetMachineInfoAsync();

    cancellationToken.ThrowIfCancellationRequested();
    var pendingRecords = await this.dmClient.GetDevicesPendingMaintenanceAsync();

    foreach (var record in machineInfo)
    {
        if (record.Id.RepairType != RepairType.None)
        {
            traceType.WriteInfo(
                "GetMachineInfo: {0},{1},{2},{3}",
                record.Id.MachineName,
                record.Status,
                record.Id.RepairType,
                record.RepairActionState);

            context.SetRepairRecord(MachineMaintenanceRecord.FromActiveRepair(record.Id));
        }

        if (!this.configReader.AllowRepairTaskCompletionInMachineStates.Contains(record.Status))
        {
            traceType.WriteInfo(
                "Disallowing completion of repair tasks for {0} because its state is {1}",
                record.Id.MachineName,
                record.Status);

            context.DoNotCancelRepairsForMachine(record.Id.MachineName);
        }
    }

    foreach (var record in pendingRecords)
    {
        context.SetRepairRecord(record);
    }

    // TODO consider retrying the fetch/reconcile/execute block, to avoid
    // extending repairs on transient RM errors
    try
    {
        cancellationToken.ThrowIfCancellationRequested();

        // Fetch active repair tasks owned by this service
        IList<IRepairTask> repairTasks = await this.repairManager.GetRepairTaskListAsync(
            Guid.Empty,
            null,
            RepairTaskStateFilter.Active,
            this.environment.ServiceName);

        context.SetRepairTasks(repairTasks);

        // Reconcile DM and RM
        IEnumerable<IAction> actions = context.Reconcile();

        // Execute actions
        foreach (IAction action in actions)
        {
            cancellationToken.ThrowIfCancellationRequested();
            await ExecuteActionAsync(action);
        }
    }
    catch (OperationCanceledException) when (cancellationToken.IsCancellationRequested)
    {
        // Cancellation was requested by the caller; it is not a reconcile/execute
        // failure, so do not log it as one or continue with the rest of the pass.
        throw;
    }
    catch (Exception e)
    {
        traceType.WriteWarning("Reconcile/execute failed: {0}", e);
        AddFailureMessage(false, $"Reconcile/execute failed: {e.Message}");
    }

    // Modify delay times
    context.UpdateRepairDelays();

    // Push updates back to the DM
    foreach (var record in context.RepairRecords.Where(r => r.IsDelayModified))
    {
        cancellationToken.ThrowIfCancellationRequested();
        await UpdateMaintenanceDelayAsync(record);
    }

    // Report health against the nodes
    if (this.configReader.EnableNodeHealthReports)
    {
        foreach (var record in machineInfo)
        {
            // Machine status "H" or "P" is reported as Ok, anything else as Warning.
            ReportNodeHealth(
                "Autopilot:MachineStatus",
                record.Id.MachineName,
                (record.Status == "H" || record.Status == "P") ? HealthState.Ok : HealthState.Warning,
                record.Status);

            // A machine with any repair type other than None is reported as Warning.
            ReportNodeHealth(
                "Autopilot:Repair",
                record.Id.MachineName,
                (record.Id.RepairType == RepairType.None) ? HealthState.Ok : HealthState.Warning,
                $"{record.Id.RepairType},{record.RepairActionState}");
        }
    }
}