Пример #1
0
        /// <summary>
        /// Tries to reconcile one maintenance record against each candidate repair task.
        /// When none of the candidates matches, an action to create a repair task in
        /// Preparing is emitted for the record.
        /// </summary>
        /// <param name="record">The maintenance record being reconciled.</param>
        /// <param name="candidateRepairTasks">Repair tasks that may belong to the record.</param>
        /// <returns>The candidate repair tasks that did not match this record.</returns>
        private IEnumerable <IRepairTask> ReconcileRepairRecord(MachineMaintenanceRecord record, IEnumerable <IRepairTask> candidateRepairTasks)
        {
            var leftoverTasks = new List<IRepairTask>();
            bool anyTaskMatched = false;

            // Prefix built from the record ID alone (second argument is null).
            string idPrefix = GenerateTaskId(record.RecordId, null);

            foreach (IRepairTask candidate in candidateRepairTasks)
            {
                if (!TryReconcileRepair(record, idPrefix, candidate))
                {
                    // No match: hand the task back so the caller can keep processing it.
                    leftoverTasks.Add(candidate);
                    continue;
                }

                anyTaskMatched = true;
            }

            if (anyTaskMatched)
            {
                return leftoverTasks;
            }

            // Nothing matched. A record that is not pending approval is already
            // running without a backing repair task, so flag that before creating one.
            if (!record.IsPendingApproval)
            {
                traceType.WriteWarning(
                    "Repair {0} is executing without approval from a repair task",
                    record.RecordId);
            }

            EmitActionCreateRepairTaskInPreparing(record);

            return leftoverTasks;
        }
Пример #2
0
 /// <summary>
 /// Captures the inputs for an action that creates a repair task in Preparing.
 /// </summary>
 /// <param name="record">Maintenance record the repair task is created for; passed through Validate before being stored.</param>
 /// <param name="serviceName">Service name stored with the action; passed through Validate before being stored.</param>
 /// <param name="impactLevel">Node impact level stored with the action.</param>
 /// <param name="performHealthCheck">Health-check flag stored with the action.</param>
 public CreateRepairTaskInPreparingAction(
     MachineMaintenanceRecord record,
     string serviceName,
     NodeImpactLevel impactLevel,
     bool performHealthCheck)
 {
     // Plain value copies first; they cannot fail.
     this.impactLevel = impactLevel;
     this.performHealthCheck = performHealthCheck;

     // Validated arguments, checked in declaration order (record, then serviceName).
     // NOTE(review): Validate is presumably a null/empty guard - confirm against its definition.
     this.record = record.Validate("record");
     this.serviceName = serviceName.Validate("serviceName");
 }
Пример #3
0
        /// <summary>
        /// Stores a repair record in the maintenance map, keyed by its record ID
        /// (assigned through the map's indexer). Records whose repair type is
        /// <see cref="RepairType.NoOp"/> are logged and not stored.
        /// </summary>
        /// <param name="record">The repair record to store; passed through Validate first.</param>
        /// <returns>true if the record was added, false if it was ignored</returns>
        public bool SetRepairRecord(MachineMaintenanceRecord record)
        {
            record.Validate("record");

            if (record.RepairType != RepairType.NoOp)
            {
                this.maintenanceMap[record.RecordId] = record;
                return true;
            }

            // NoOp repairs are not tracked.
            traceType.WriteInfo("Ignoring repair {0}", record.RecordId);
            return false;
        }
Пример #4
0
        /// <summary>
        /// Logs and emits a <see cref="CreateRepairTaskInPreparingAction"/> for the given record.
        /// The node impact level is translated from the record's repair type; the
        /// health check flag is currently always false.
        /// </summary>
        /// <param name="record">The maintenance record that needs a repair task.</param>
        private void EmitActionCreateRepairTaskInPreparing(MachineMaintenanceRecord record)
        {
            // TODO allow config overrides
            NodeImpactLevel nodeImpact = TranslateRepairToNodeImpact(record.RepairType);

            // TODO: not configurable yet, health checks are always off
            bool healthCheckRequested = false;

            traceType.WriteInfo(
                "Creating repair task for {0} (impact = {1}, health check = {2})",
                record.RecordId,
                nodeImpact,
                healthCheckRequested);

            var createAction = new CreateRepairTaskInPreparingAction(
                record,
                this.serviceName,
                nodeImpact,
                healthCheckRequested);

            EmitAction(createAction);
        }
Пример #5
0
        /// <summary>
        /// Pushes the record's new maintenance delay to the DM client.
        /// Best-effort: any exception is traced as a warning and recorded through
        /// AddFailureMessage rather than being rethrown.
        /// </summary>
        /// <param name="record">The record whose delay should be updated.</param>
        private async Task UpdateMaintenanceDelayAsync(MachineMaintenanceRecord record)
        {
            try
            {
                traceType.WriteInfo("Updating maintenance delay for {0}: {1} -> {2}", record.RecordId, record.OriginalDelay, record.NewDelay);

                var pendingUpdate = this.dmClient.UpdateMaintenanceDelayAsync(record.MachineName, record.NewDelay);
                await pendingUpdate;
            }
            catch (Exception failure)
            {
                // Deliberately swallowed so one failed update does not abort the caller.
                traceType.WriteWarning("Failed to process {0}: {1}", record.RecordId, failure);
                AddFailureMessage(false, $"Failed to update maintenance delay: {record.RecordId}");
            }
        }
Пример #6
0
        /// <summary>
        /// Fetches the devices-pending-maintenance response from the DM client, verifies it,
        /// and converts each CSV row into a pending-repair maintenance record.
        /// </summary>
        /// <remarks>
        /// Reads the MACHINENAME, ACTION and DELAYINSEC columns. A DELAYINSEC value that
        /// cannot be parsed as an integer is logged and replaced by a small fallback delay.
        /// The value is parsed with the invariant culture because it is machine-generated
        /// data, not user-facing text.
        /// </remarks>
        /// <returns>One record per CSV row, in the order the rows were read.</returns>
        public async Task <IList <MachineMaintenanceRecord> > GetDevicesPendingMaintenanceAsync()
        {
            // Used when DELAYINSEC is unparseable: some small value, so that we assume
            // the repair is already approved.
            const int UnparseableDelayFallbackSeconds = 59;

            var response = await dmClient.GetDevicesPendingMaintenanceAsync();

            VerifyResponse(response);

            using (var csvReader = new CsvReader(new StringReader(response.Response)))
            {
                var records = new List<MachineMaintenanceRecord>();

                while (csvReader.Read())
                {
                    string machineName = csvReader["MACHINENAME"];
                    string repairType  = csvReader["ACTION"];
                    string delayInSec  = csvReader["DELAYINSEC"];

                    int delay;
                    bool parsed = int.TryParse(
                        delayInSec,
                        System.Globalization.NumberStyles.Integer,
                        System.Globalization.CultureInfo.InvariantCulture,
                        out delay);

                    if (!parsed)
                    {
                        traceType.WriteWarning(
                            "Unable to parse current delay: MACHINENAME='{0}' ACTION='{1}' DELAYINSEC='{2}'",
                            machineName,
                            repairType,
                            delayInSec);

                        delay = UnparseableDelayFallbackSeconds;
                    }

                    TimeSpan delaySpan = TimeSpan.FromSeconds(delay);

                    var recordId = new MaintenanceRecordId(machineName, repairType);
                    records.Add(MachineMaintenanceRecord.FromPendingRepair(recordId, delaySpan));
                }

                return records;
            }
        }
Пример #7
0
        /// <summary>
        /// Fetches the repair list response from the DM client, verifies it, and converts
        /// each CSV row (MACHINENAME, REQUESTTYPE columns) into an active-repair record.
        /// </summary>
        /// <returns>One record per CSV row, in the order the rows were read.</returns>
        public async Task <IList <MachineMaintenanceRecord> > GetRepairListAsync()
        {
            var response = await dmClient.GetRepairListAsync();
            VerifyResponse(response);

            var activeRepairs = new List<MachineMaintenanceRecord>();

            using (var rows = new CsvReader(new StringReader(response.Response)))
            {
                while (rows.Read())
                {
                    string machine     = rows["MACHINENAME"];
                    string requestType = rows["REQUESTTYPE"];

                    var id = new MaintenanceRecordId(machine, requestType);
                    activeRepairs.Add(MachineMaintenanceRecord.FromActiveRepair(id));
                }
            }

            return activeRepairs;
        }
Пример #8
0
 /// <summary>
 /// Captures the maintenance record to execute and the repair task associated with it.
 /// </summary>
 /// <param name="record">The maintenance record whose repair is to be executed. Must not be null.</param>
 /// <param name="repairTask">The repair task paired with the record. Must not be null.</param>
 /// <exception cref="System.ArgumentNullException">
 /// <paramref name="record"/> or <paramref name="repairTask"/> is null.
 /// </exception>
 public ExecuteRepairAction(MachineMaintenanceRecord record, IRepairTask repairTask)
 {
     // Fail fast here instead of with a NullReferenceException when the action is
     // used later; the sibling create-repair-task action also validates its inputs.
     if (record == null)
     {
         throw new System.ArgumentNullException(nameof(record));
     }

     if (repairTask == null)
     {
         throw new System.ArgumentNullException(nameof(repairTask));
     }

     this.record     = record;
     this.repairTask = repairTask;
 }
Пример #9
0
 /// <summary>
 /// Logs and emits an <see cref="ExecuteRepairAction"/> pairing the record with the repair task.
 /// </summary>
 /// <param name="record">The maintenance record whose repair should be executed.</param>
 /// <param name="repairTask">The repair task named in the log message and passed to the action.</param>
 private void EmitActionExecuteRepair(MachineMaintenanceRecord record, IRepairTask repairTask)
 {
     traceType.WriteInfo(
         "Executing repair {0} with approval from repair task {1}",
         record.RecordId,
         repairTask.TaskId);

     var executeAction = new ExecuteRepairAction(record, repairTask);
     EmitAction(executeAction);
 }
Пример #10
0
        /// <summary>
        /// Checks whether a repair task belongs to the given maintenance record (its task ID
        /// starts with <paramref name="taskIdPrefix"/>) and, if so, reacts to the task's state.
        /// </summary>
        /// <remarks>
        /// The prefix comparison is ordinal: task IDs are identifiers, not linguistic text,
        /// so the match must not depend on the current culture.
        /// Behaviour per state:
        /// Preparing - warns (and counts the task as overdue) if it has been preparing too long,
        /// and warns if the repair is already running without approval.
        /// Approved - executes the repair once the post-approval delay has passed and the record
        /// is still pending approval; otherwise logs why nothing was done.
        /// Executing - (re)emits the execute action if the record is still pending approval.
        /// Restoring - warns (and counts the task as overdue) if it has been restoring too long.
        /// Any other state is treated as not matching.
        /// </remarks>
        /// <param name="record">The maintenance record being reconciled.</param>
        /// <param name="taskIdPrefix">Task ID prefix generated for the record.</param>
        /// <param name="repairTask">The candidate repair task.</param>
        /// <returns>true if the task matched the record and was in a handled state; otherwise false.</returns>
        private bool TryReconcileRepair(MachineMaintenanceRecord record, string taskIdPrefix, IRepairTask repairTask)
        {
            if (!repairTask.TaskId.StartsWith(taskIdPrefix, StringComparison.Ordinal))
            {
                return false;
            }

            if (repairTask.State != RepairTaskState.Completed)
            {
                traceType.WriteInfo(
                    "{3} repair {0} matched repair task {1} in state {2}",
                    record.RecordId,
                    repairTask.TaskId,
                    repairTask.State,
                    record.IsPendingApproval ? "Pending" : "Active");
            }

            switch (repairTask.State)
            {
            case RepairTaskState.Preparing:

                if (IsTimestampOld(repairTask.PreparingTimestamp, this.configReader.OverdueRepairTaskPreparingThreshold))
                {
                    traceType.WriteWarning(
                        "Repair task {0} has been in Preparing since {1:O} ({2} ago)",
                        repairTask.TaskId,
                        repairTask.PreparingTimestamp,
                        DateTime.UtcNow - repairTask.PreparingTimestamp);

                    this.OverdueRepairTaskCount++;
                }

                if (!record.IsPendingApproval)
                {
                    // The repair is already active even though the task has not been approved yet.
                    traceType.WriteWarning(
                        "Repair {0} is executing without approval from repair task {1}",
                        record.RecordId,
                        repairTask.TaskId);
                }

                // Nothing else to do
                return true;

            case RepairTaskState.Approved:

                if (!IsTimestampOld(repairTask.ApprovedTimestamp, this.configReader.PostApprovalExecutionDelay))
                {
                    // Approved too recently: wait out the configured post-approval delay.
                    traceType.WriteInfo(
                        "Delaying execution of {0} because repair task {1} was approved recently (at {2:O})",
                        record.RecordId,
                        repairTask.TaskId,
                        repairTask.ApprovedTimestamp);
                }
                else if (record.IsPendingApproval)
                {
                    EmitActionExecuteRepair(record, repairTask);
                }
                else
                {
                    traceType.WriteWarning(
                        "Repair {0} is executing without approval from repair task {1}",
                        record.RecordId,
                        repairTask.TaskId);
                }

                return true;

            case RepairTaskState.Executing:

                // Task says Executing but the record is still pending: emit the execute action.
                if (record.IsPendingApproval)
                {
                    EmitActionExecuteRepair(record, repairTask);
                }

                return true;

            case RepairTaskState.Restoring:

                if (IsTimestampOld(repairTask.RestoringTimestamp, this.configReader.OverdueRepairTaskRestoringThreshold))
                {
                    traceType.WriteWarning(
                        "Repair task {0} has been in Restoring since {1:O} ({2} ago)",
                        repairTask.TaskId,
                        repairTask.RestoringTimestamp,
                        DateTime.UtcNow - repairTask.RestoringTimestamp);

                    this.OverdueRepairTaskCount++;
                }

                return true;

            default:
                // States not handled above do not count as a match for this record.
                return false;
            }
        }
Пример #11
0
        /// <summary>
        /// Runs a single ask-mode pass: loads machine info and pending maintenance from the DM,
        /// records them in the context, fetches active repair tasks from the repair manager,
        /// reconciles the two and executes the resulting actions, then pushes modified
        /// maintenance delays back to the DM and optionally reports node health.
        /// </summary>
        /// <remarks>
        /// Failures in the fetch/reconcile/execute phase are logged and recorded through
        /// AddFailureMessage, and the pass continues with the delay update. Cancellation
        /// requested through <paramref name="cancellationToken"/> is not treated as such a
        /// failure; it propagates to the caller.
        /// </remarks>
        /// <param name="context">The ask-mode context that accumulates records, tasks and actions for this pass.</param>
        /// <param name="cancellationToken">Token checked between steps to abort the pass.</param>
        /// <exception cref="OperationCanceledException">The token was cancelled during the pass.</exception>
        private async Task ProcessAskModeOnce(AskModeContext context, CancellationToken cancellationToken)
        {
            cancellationToken.ThrowIfCancellationRequested();
            var machineInfo = await this.dmClient.GetMachineInfoAsync();

            cancellationToken.ThrowIfCancellationRequested();
            var pendingRecords = await this.dmClient.GetDevicesPendingMaintenanceAsync();

            foreach (var record in machineInfo)
            {
                // A machine with a repair type other than None has an active repair.
                if (record.Id.RepairType != RepairType.None)
                {
                    traceType.WriteInfo("GetMachineInfo: {0},{1},{2},{3}", record.Id.MachineName, record.Status, record.Id.RepairType, record.RepairActionState);
                    context.SetRepairRecord(MachineMaintenanceRecord.FromActiveRepair(record.Id));
                }

                if (!this.configReader.AllowRepairTaskCompletionInMachineStates.Contains(record.Status))
                {
                    traceType.WriteInfo(
                        "Disallowing completion of repair tasks for {0} because its state is {1}",
                        record.Id.MachineName,
                        record.Status);

                    context.DoNotCancelRepairsForMachine(record.Id.MachineName);
                }
            }

            foreach (var record in pendingRecords)
            {
                context.SetRepairRecord(record);
            }

            // TODO consider retrying the fetch/reconcile/execute block, to avoid
            // extending repairs on transient RM errors
            try
            {
                cancellationToken.ThrowIfCancellationRequested();

                // Fetch active repair tasks owned by this service
                IList <IRepairTask> repairTasks = await this.repairManager.GetRepairTaskListAsync(
                    Guid.Empty,
                    null,
                    RepairTaskStateFilter.Active,
                    this.environment.ServiceName);

                context.SetRepairTasks(repairTasks);

                // Reconcile DM and RM
                IEnumerable <IAction> actions = context.Reconcile();

                // Execute actions
                foreach (IAction action in actions)
                {
                    cancellationToken.ThrowIfCancellationRequested();
                    await ExecuteActionAsync(action);
                }
            }
            catch (OperationCanceledException) when (cancellationToken.IsCancellationRequested)
            {
                // Cancellation requested by the caller is not a reconcile failure: let it
                // propagate instead of recording a failure message and carrying on.
                throw;
            }
            catch (Exception e)
            {
                traceType.WriteWarning("Reconcile/execute failed: {0}", e);
                AddFailureMessage(false, $"Reconcile/execute failed: {e.Message}");
            }

            // Modify delay times
            context.UpdateRepairDelays();

            // Push updates back to the DM
            foreach (var record in context.RepairRecords.Where(r => r.IsDelayModified))
            {
                cancellationToken.ThrowIfCancellationRequested();
                await UpdateMaintenanceDelayAsync(record);
            }

            // Report health against the nodes
            if (this.configReader.EnableNodeHealthReports)
            {
                foreach (var record in machineInfo)
                {
                    // Machine statuses "H" and "P" are reported as Ok; every other status is a Warning.
                    ReportNodeHealth(
                        "Autopilot:MachineStatus",
                        record.Id.MachineName,
                        (record.Status == "H" || record.Status == "P") ? HealthState.Ok : HealthState.Warning,
                        record.Status);

                    // Any repair type other than None is reported as a Warning.
                    ReportNodeHealth(
                        "Autopilot:Repair",
                        record.Id.MachineName,
                        (record.Id.RepairType == RepairType.None) ? HealthState.Ok : HealthState.Warning,
                        $"{record.Id.RepairType},{record.RepairActionState}");
                }
            }
        }