Example #1
        private async Task ProcessAskModeCoreAsync(CancellationToken cancellationToken)
        {
            traceType.WriteInfo("Starting Autopilot coordination");

            this.lastSuccessfulRunAt = DateTimeOffset.UtcNow;

            while (!cancellationToken.IsCancellationRequested)
            {
                traceType.WriteInfo(
                    "Autopilot coordinator: service name = {0}, partition ID = {1}, start time = {2:O}",
                    this.environment.ServiceName,
                    this.partitionId,
                    this.coordinatorRunAsyncStartTime);

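                // Reset the failure list from the previous iteration and time this pass.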
                ClearFailureMessages();
                Stopwatch watch = Stopwatch.StartNew();

                var context = new AskModeContext(this.configReader, this.environment.ServiceName);

                try
                {
                    await ProcessAskModeOnce(context, cancellationToken);
                }
                catch (Exception ex)
                {
                    if (!cancellationToken.IsCancellationRequested)
                    {
                        traceType.WriteWarning("Ask mode processing failed: {0}", ex);
                        AddFailureMessage(false, $"Ask mode processing failed: {ex.Message}");
                    }
                }

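                // Determine the polling interval, then evaluate this run; ProcessRunResults may throw to force a restart on fatal or persistent errors.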
                TimeSpan timerPeriod = this.configReader.JobPollingInterval;

                ProcessRunResults(context, watch.Elapsed, timerPeriod);

                if (!cancellationToken.IsCancellationRequested)
                {
                    traceType.WriteInfo("Next run will occur in {0} at {1:O}", timerPeriod, DateTimeOffset.UtcNow + timerPeriod);
                    await Task.Delay(timerPeriod, cancellationToken);
                }
            }

            traceType.WriteInfo("Cancellation requested. Stopping Autopilot coordination.");
        }
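
Example #1 and Example #3 both rely on the ClearFailureMessages / AddFailureMessage helpers, which are not shown in these excerpts. A minimal sketch follows, assuming (as Example #3 suggests by reading Item1 as an is-fatal flag and Item2 as the message text) that failures are kept in a list of Tuple<bool, string>; the field declaration and its initialization are assumptions, not the original code.

        // Assumed backing store for per-run failures: Item1 = isFatal, Item2 = message.
        private readonly List<Tuple<bool, string>> failureMessages = new List<Tuple<bool, string>>();

        private void ClearFailureMessages()
        {
            this.failureMessages.Clear();
        }

        private void AddFailureMessage(bool isFatal, string message)
        {
            this.failureMessages.Add(Tuple.Create(isFatal, message));
        }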
Example #2
        private async Task ProcessAskModeOnce(AskModeContext context, CancellationToken cancellationToken)
        {
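            // Query the DM for current machine status and for devices already pending maintenance.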
            cancellationToken.ThrowIfCancellationRequested();
            var machineInfo = await this.dmClient.GetMachineInfoAsync();

            cancellationToken.ThrowIfCancellationRequested();
            var pendingRecords = await this.dmClient.GetDevicesPendingMaintenanceAsync();

            foreach (var record in machineInfo)
            {
                if (record.Id.RepairType != RepairType.None)
                {
                    traceType.WriteInfo("GetMachineInfo: {0},{1},{2},{3}", record.Id.MachineName, record.Status, record.Id.RepairType, record.RepairActionState);
                    context.SetRepairRecord(MachineMaintenanceRecord.FromActiveRepair(record.Id));
                }

                if (!this.configReader.AllowRepairTaskCompletionInMachineStates.Contains(record.Status))
                {
                    traceType.WriteInfo(
                        "Disallowing completion of repair tasks for {0} because its state is {1}",
                        record.Id.MachineName,
                        record.Status);

                    context.DoNotCancelRepairsForMachine(record.Id.MachineName);
                }
            }

            foreach (var record in pendingRecords)
            {
                context.SetRepairRecord(record);
            }

            // TODO consider retrying the fetch/reconcile/execute block, to avoid
            // extending repairs on transient RM errors
            try
            {
                cancellationToken.ThrowIfCancellationRequested();

                // Fetch active repair tasks owned by this service
                IList<IRepairTask> repairTasks = await this.repairManager.GetRepairTaskListAsync(
                    Guid.Empty,
                    null,
                    RepairTaskStateFilter.Active,
                    this.environment.ServiceName);

                context.SetRepairTasks(repairTasks);

                // Reconcile DM and RM
                IEnumerable<IAction> actions = context.Reconcile();

                // Execute actions
                foreach (IAction action in actions)
                {
                    cancellationToken.ThrowIfCancellationRequested();
                    await ExecuteActionAsync(action);
                }
            }
            catch (Exception e)
            {
                traceType.WriteWarning("Reconcile/execute failed: {0}", e);
                AddFailureMessage(false, $"Reconcile/execute failed: {e.Message}");
            }

            // Modify delay times
            context.UpdateRepairDelays();

            // Push updates back to the DM
            foreach (var record in context.RepairRecords.Where(r => r.IsDelayModified))
            {
                cancellationToken.ThrowIfCancellationRequested();
                await UpdateMaintenanceDelayAsync(record);
            }

            // Report health against the nodes
            if (this.configReader.EnableNodeHealthReports)
            {
                foreach (var record in machineInfo)
                {
                    ReportNodeHealth(
                        "Autopilot:MachineStatus",
                        record.Id.MachineName,
                        (record.Status == "H" || record.Status == "P") ? HealthState.Ok : HealthState.Warning,
                        record.Status);

                    ReportNodeHealth(
                        "Autopilot:Repair",
                        record.Id.MachineName,
                        (record.Id.RepairType == RepairType.None) ? HealthState.Ok : HealthState.Warning,
                        $"{record.Id.RepairType},{record.RepairActionState}");
                }
            }
        }
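
The TODO in Example #2 suggests retrying the fetch/reconcile/execute block so that a transient Repair Manager error does not extend repair delays unnecessarily. One possible shape for such a wrapper is sketched below; the helper name, parameters, and retry policy are illustrative assumptions rather than part of the original service.

        // Illustrative retry helper (not in the original code): runs the operation up to
        // maxAttempts times, waiting between attempts, and lets the final failure propagate
        // to the caller's existing catch block.
        private static async Task RetryAsync(
            Func<Task> operation,
            int maxAttempts,
            TimeSpan delayBetweenAttempts,
            CancellationToken cancellationToken)
        {
            for (int attempt = 1; ; attempt++)
            {
                try
                {
                    await operation();
                    return;
                }
                catch (Exception) when (attempt < maxAttempts && !cancellationToken.IsCancellationRequested)
                {
                    await Task.Delay(delayBetweenAttempts, cancellationToken);
                }
            }
        }

The fetch/reconcile/execute body could then be passed in as a lambda with a small attempt count, keeping the overall polling cadence of Example #1 intact.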
Example #3
        private void ProcessRunResults(AskModeContext context, TimeSpan runDuration, TimeSpan runInterval)
        {
            TimeSpan       warningThreshold = this.configReader.CoordinatorFailureWarningThreshold;
            TimeSpan       maxRetryDuration = this.configReader.CoordinatorFailureRetryDuration;
            DateTimeOffset now     = DateTimeOffset.UtcNow;
            TimeSpan       elapsed = now - lastSuccessfulRunAt;

            int errorCount = this.failureMessages.Count;
            int fatalCount = this.failureMessages.Count(m => m.Item1); // isFatal == true

            traceType.WriteInfo(
                "ProcessRunResults: error count = {0}, fatal error count = {1}, last successful run = {2:O} ({3} ago), run duration = {4} ms, run interval = {5} ms",
                errorCount,
                fatalCount,
                lastSuccessfulRunAt,
                elapsed,
                runDuration.TotalMilliseconds,
                runInterval.TotalMilliseconds);

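            // No errors this run: record the success time and report health based on repair workload and duty cycle.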
            if (errorCount == 0)
            {
                this.lastSuccessfulRunAt = now;

                int pendingCount = context.RepairRecords.Count(r => r.IsPendingApproval);
                int activeCount  = context.RepairRecords.Count() - pendingCount;
                int overdueCount = context.OverdueRepairTaskCount;

                if (overdueCount > 0)
                {
                    UpdateCoordinatorHealthStatus(
                        HealthState.Warning,
                        $"Autopilot coordinator is operating normally, but processing of some repair tasks is taking longer than expected. " +
                        $"(overdue: {overdueCount}, pending: {pendingCount}, active: {activeCount})");
                }
                else if (runDuration.TotalMilliseconds > (runInterval.TotalMilliseconds * this.configReader.CoordinatorDutyCycleWarningThreshold))
                {
                    UpdateCoordinatorHealthStatus(
                        HealthState.Warning,
                        $"Autopilot coordinator processing is taking longer than expected. " +
                        $"(last run duration: {runDuration}, pending: {pendingCount}, active: {activeCount})");
                }
                else
                {
                    UpdateCoordinatorHealthStatus(
                        HealthState.Ok,
                        $"Autopilot coordinator is operating normally. (pending: {pendingCount}, active: {activeCount})");
                }
            }
            else if (fatalCount > 0)
            {
                string firstFatalError = this.failureMessages.First(m => m.Item1).Item2;
                string message         = $"Autopilot coordinator is restarting due to error: {firstFatalError}";
                UpdateCoordinatorHealthStatus(HealthState.Warning, message);
                throw new ApplicationException(message);
            }
            else if (elapsed > maxRetryDuration)
            {
                string firstError = this.failureMessages.First().Item2;
                string message    = $"Autopilot coordinator is restarting due to lack of recent success: {firstError}";
                UpdateCoordinatorHealthStatus(HealthState.Warning, message);
                throw new ApplicationException(message);
            }
            else if (elapsed > warningThreshold)
            {
                string firstError = this.failureMessages.First().Item2;
                string message    = $"Autopilot coordinator encountered errors: {firstError}";
                UpdateCoordinatorHealthStatus(HealthState.Warning, message);
            }
        }
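
UpdateCoordinatorHealthStatus is referenced throughout Example #3 but its implementation is not shown. The sketch below assumes it merely traces the transition and remembers the last reported state; in the real service it would also publish a Service Fabric health report, and the field name here is a hypothetical placeholder.

        // Hypothetical cache of the last reported coordinator health state (assumption for illustration).
        private HealthState lastReportedCoordinatorState = HealthState.Unknown;

        private void UpdateCoordinatorHealthStatus(HealthState state, string description)
        {
            if (state != this.lastReportedCoordinatorState)
            {
                traceType.WriteInfo("Coordinator health state changed to {0}: {1}", state, description);
                this.lastReportedCoordinatorState = state;
            }

            // A production implementation would also emit a health report for the
            // coordinator's partition or service here.
        }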