/// <remarks>
        /// Returning bool instead of void since we want to not 'throw' from this method to preserve exception stack info.
        /// </remarks>
        /// <param name="ex">The exception raised while processing the policy agent document.</param>
        /// <param name="lastSuccessfulRunAt">Time of the last successful run; the elapsed time since then drives the decisions below.</param>
        /// <returns>
        /// <c>false</c> when <paramref name="ex"/> is a <c>ManagementChannelTerminallyUnhealthyException</c> or when the
        /// configured retry duration has been exceeded; <c>true</c> otherwise.
        /// </returns>
        private bool HandleCoordinatorException(Exception ex, DateTimeOffset lastSuccessfulRunAt)
        {
            // This exception type short-circuits everything else: no config lookup, no tracing.
            if (ex is ManagementChannelTerminallyUnhealthyException)
            {
                return false;
            }

            // Both thresholds are stored in config as a number of seconds (defaults: 120 and 900).
            var warningSeconds = configSection.ReadConfigValue(Constants.ConfigKeys.CoordinatorFailureWarningThreshold, 120);
            var warnAfter      = TimeSpan.FromSeconds(warningSeconds);

            var retrySeconds = configSection.ReadConfigValue(Constants.ConfigKeys.CoordinatorFailureRetryDuration, 900);
            var retryWindow  = TimeSpan.FromSeconds(retrySeconds);

            TimeSpan sinceLastSuccess = DateTimeOffset.UtcNow - lastSuccessfulRunAt;

            // Past the retry window: log as an error and report that no further retry should happen.
            if (sinceLastSuccess > retryWindow)
            {
                traceType.WriteError(
                    "Error while processing policy agent document. {0}. Exception: {1}",
                    "Not retrying further since 'CoordinatorFailureRetryDuration' of {0} has been exceeded".ToString(retryWindow),
                    ex);

                return false;
            }

            // Stays null (and so formats as empty text in the trace below) unless the warning threshold was crossed.
            string healthNote = null;

            if (sinceLastSuccess > warnAfter)
            {
                var healthDescription =
                    "Azure coordinator encountered errors. Last successful run was at: {0:O}. Last error: {1}"
                    .ToString(lastSuccessfulRunAt, ex);

                UpdateCoordinatorHealthStatus(HealthState.Warning, healthDescription);

                healthNote =
                    "Health warning reported since 'CoordinatorFailureWarningThreshold' of {0} has exceeded. ".ToString(warnAfter);
            }

            traceType.WriteWarning(
                "Error while processing policy agent document. Last successful run was at: {0:O}. {1}Retrying until 'CoordinatorFailureRetryDuration' of {2} is exceeded. Exception: {3}",
                lastSuccessfulRunAt,
                healthNote,
                retryWindow,
                ex);

            return true;
        }
Пример #2
0
        /// <summary>
        /// Reads a boolean config value whose key is built by formatting <paramref name="keyFormat"/>
        /// with the impact action of <paramref name="tenantJob"/>.
        /// </summary>
        /// <param name="tenantJob">The job whose impact action selects the config key.</param>
        /// <param name="configSection">The config section to read from.</param>
        /// <param name="keyFormat">Format string that receives the impact action as its argument.</param>
        /// <param name="defaultValue">Default passed through to the config read.</param>
        /// <returns>The value returned by the config read for the derived key.</returns>
        private static bool DoesJobRequireHealthCheck(ITenantJob tenantJob, IConfigSection configSection, string keyFormat, bool defaultValue)
        {
            tenantJob.Validate("tenantJob");
            configSection.Validate("configSection");

            return configSection.ReadConfigValue(
                keyFormat.ToString(tenantJob.GetImpactAction()),
                defaultValue);
        }
Пример #3
0
        /// <summary>
        /// Reads the validity time interval from config, substituting
        /// <c>DefaultLearningModeValidityTimeSpan</c> when the configured value is zero or negative.
        /// </summary>
        /// <returns>A strictly positive configured interval, or the default.</returns>
        private TimeSpan GetValidityTimeInterval()
        {
            var configured = configSection.ReadConfigValue(ConfigLearningModeValidityTimeSpanKeyName, DefaultLearningModeValidityTimeSpan);

            // Only a strictly positive interval is accepted as-is.
            return configured > TimeSpan.Zero
                ? configured
                : DefaultLearningModeValidityTimeSpan;
        }
Пример #4
0
        /// <summary>
        /// Reads a config value and, when a config update handler is present, first registers the
        /// section name / key pair with it.
        /// </summary>
        /// <typeparam name="T">Type of the config value.</typeparam>
        /// <param name="configSection">The config section to read from.</param>
        /// <param name="keyName">The key to read (and to register with the handler).</param>
        /// <param name="defaultValue">Default passed through to the config read.</param>
        /// <returns>The value returned by the config read.</returns>
        private T ReadFactoryConfig <T>(
            IConfigSection configSection,
            string keyName,
            T defaultValue = default(T))
        {
            var updateHandler = this.configUpdateHandler;

            // Registration is skipped entirely when no handler has been set.
            if (updateHandler != null)
            {
                updateHandler.RegisterKey(configSection.Name, keyName);
            }

            T configured = configSection.ReadConfigValue(keyName, defaultValue);
            return configured;
        }
Пример #5
0
        /// <summary>
        /// Registers the repair actions: the generic commands (resolved through config), the generic
        /// ReimageOS command, and the explicitly namespaced commands.
        /// </summary>
        private void BuildRepairActionMap()
        {
            // Generic commands (e.g. Reboot): the config setting decides which concrete action
            // (VM-level or Host-level) the generic command maps to.
            RepairActionTypeEnum rebootTarget = configSection.ReadConfigValue(
                Constants.ConfigKeys.RebootMaintenanceAction,
                RepairActionTypeEnum.Reboot);
            RepairActionTypeEnum fullReimageTarget = configSection.ReadConfigValue(
                Constants.ConfigKeys.FullReimageMaintenanceAction,
                RepairActionTypeEnum.RepaveData);

            AddRepairAction(rebootTarget, AbstractRebootMaintenanceAction);
            AddRepairAction(fullReimageTarget, AbstractFullReimageMaintenanceAction);

            // ReimageOS currently has no Host/VM target choice; it only applies to VMs.
            AddRepairAction(RepairActionTypeEnum.ReimageOS);

            // Explicit commands (e.g. System.Azure.Reboot) are registered regardless of the
            // *MaintenanceAction config settings.
            // Note: unlike ReimageOS above, there is no generic 'Heal' action; users must name it explicitly.
            var explicitActions = new[]
            {
                RepairActionTypeEnum.Reboot,
                RepairActionTypeEnum.ReimageOS,
                RepairActionTypeEnum.RepaveData,
                RepairActionTypeEnum.HostReboot,
                RepairActionTypeEnum.HostRepaveData,
                RepairActionTypeEnum.Heal,
            };

            foreach (var explicitAction in explicitActions)
            {
                AddRepairActionWithNamespace(explicitAction);
            }
        }
Пример #6
0
        /// <summary>
        /// Gets the tenant ID from the "WindowsAzure.TenantID" config value, falling back to a
        /// platform-specific lookup when that value is null, empty or whitespace.
        /// </summary>
        /// <param name="configSection">The config section to read from.</param>
        /// <returns>The configured tenant ID, or the result of the platform-specific lookup.</returns>
        public static string GetTenantId(IConfigSection configSection)
        {
            configSection.Validate("configSection");

            string configuredTenantId = configSection.ReadConfigValue <string>("WindowsAzure.TenantID");

            if (!string.IsNullOrWhiteSpace(configuredTenantId))
            {
                return configuredTenantId;
            }

            // Nothing usable in config; which fallback is compiled in depends on the build symbol.
#if DotNetCoreClrLinux
            return GetTenantIdOnLinux();
#else
            return GetTenantIdOnWindows();
#endif
        }
Пример #7
0
        /// <summary>
        /// Resolves the tenant policy agent endpoint: the configured URI string is parsed as an
        /// absolute URI when present, otherwise the URI is obtained from the registry helper.
        /// </summary>
        /// <param name="configSection">The config section to read the URI string from.</param>
        /// <returns>The resolved endpoint URI.</returns>
        /// <remarks>Any exception is traced as a warning and then rethrown unchanged.</remarks>
        private Uri GetMRZeroSdkUri(IConfigSection configSection)
        {
            try
            {
                string configuredUri = configSection.ReadConfigValue <string>(Constants.ConfigKeys.MRZeroSdkUri);

                Uri endpoint;

                if (string.IsNullOrWhiteSpace(configuredUri))
                {
                    endpoint = GetUriFromRegistry();
                }
                else
                {
                    endpoint = new Uri(configuredUri, UriKind.Absolute);
                }

                // OriginalString is traced because a regular ToString() will not display the port no.,
                // which can be confusing while looking up traces.
                traceType.WriteInfo("Tenant policy agent endpoint: {0}", endpoint.OriginalString);

                return endpoint;
            }
            catch (Exception ex)
            {
                traceWriteConditionalWarning("Error while getting tenant policy agent endpoint. Exception: {0}", ex);
                throw;
            }
        }
Пример #8
0
        /// <summary>
        /// Gets the string value for a config key, using the entry in <c>defaultConfigKeys</c>
        /// when the config store yields a null or empty value.
        /// </summary>
        /// <param name="key">The config key to look up.</param>
        /// <returns>The configured value if non-empty; otherwise the default registered for the key.</returns>
        public string GetConfigValue(string key)
        {
            string configured = configSection.ReadConfigValue <string>(key);

            if (!string.IsNullOrEmpty(configured))
            {
                return configured;
            }

            // Not defined in the config store: fall back to the default and leave a trace of that.
            string fallback = defaultConfigKeys[key];

            Trace.WriteNoise(
                RoleInstanceHealthConstants.TraceType,
                "Config key '{0}' not defined in config store in watchdog health config policy '{1}'. Using default value of '{2}'",
                key,
                Name,
                fallback);

            return fallback;
        }
Пример #9
0
 /// <summary>
 /// Reads a polling interval, expressed in seconds, from config.
 /// </summary>
 /// <param name="keyName">The config key holding the interval.</param>
 /// <param name="defaultValue">The default value, in seconds, passed to the config read.</param>
 /// <returns>The interval converted to a <see cref="TimeSpan"/>.</returns>
 private TimeSpan GetPollingInterval(string keyName, int defaultValue)
 {
     var intervalSeconds = configSection.ReadConfigValue(keyName, defaultValue);

     return TimeSpan.FromSeconds(intervalSeconds);
 }
Пример #10
0
 /// <summary>
 /// Reads the max parallel job count for one impact action. The config key is the
 /// <c>MaxParallelJobCountKeyPrefix</c> constant followed by the impact action's name.
 /// </summary>
 /// <param name="jobType">The impact action whose limit is read.</param>
 /// <returns>The value returned by the config read for that key.</returns>
 private int GetMaxCountFromConfig(ImpactActionEnum jobType)
 {
     var limitKey = Constants.ConfigKeys.MaxParallelJobCountKeyPrefix + jobType;

     return configSection.ReadConfigValue(limitKey, DefaultMaxParallelJobCountPerImpactAction);
 }
Пример #11
0
        // TODO, ensure stability
        // i.e. on re-execution, the same things should re-execute (if the state hasn't changed)

        /// <summary>
        /// Limits how many jobs may run in parallel: pending jobs that are allowed to Prepare are
        /// checked against the total, per-type and update-job limits, and Prepare is denied for
        /// every job that would exceed any of them.
        /// </summary>
        /// <param name="activityId">Not used by this method.</param>
        /// <param name="coordinatorContext">Context holding the mapped tenant jobs to evaluate.</param>
        /// <returns>An already-completed task.</returns>
        public override Task ApplyAsync(Guid activityId, CoordinatorContext coordinatorContext)
        {
            coordinatorContext.Validate("coordinatorContext");

            // With no jobs at all there is nothing to count or deny.
            if (coordinatorContext.MappedTenantJobs.Count == 0)
            {
                return Task.FromResult(0);
            }

            var jobs        = coordinatorContext.MappedTenantJobs.Values;
            var runningJobs = jobs.Where(candidate => candidate.IsActive).ToList();

            // Running tallies: overall, update jobs only, and by job type (held by the counter).
            int runningTotal   = runningJobs.Count;
            int runningUpdates = 0;
            var perTypeCounter = new JobTypeCounter(this.configSection);

            foreach (var running in runningJobs)
            {
                traceType.WriteInfo("Active job {0} ({1})", running.Id, running.TenantJob.GetImpactAction());

                perTypeCounter.AddActiveJob(running.TenantJob);

                if (running.TenantJob.IsUpdateJobType())
                {
                    runningUpdates++;
                }
            }

            int totalLimit = configSection.ReadConfigValue(
                Constants.ConfigKeys.MaxParallelJobCountTotal,
                DefaultMaxParallelJobCountTotal);

            int updateLimit = configSection.ReadConfigValue(
                Constants.ConfigKeys.MaxParallelJobCountUpdate,
                DefaultMaxParallelJobCountUpdate);

            traceType.WriteInfo(
                "Active/max job counts: Total: {0}/{1}, Updates: {2}/{3}, {4}",
                runningTotal,
                totalLimit,
                runningUpdates,
                updateLimit,
                perTypeCounter);

            // Jobs waiting to prepare: not active, and Prepare is among their allowed actions.
            // Ordering by impact action applies a default static priority based on job type.
            var waitingJobs = jobs
                              .Where(candidate => !candidate.IsActive)
                              .Where(candidate => (candidate.AllowedActions & ActionType.Prepare) == ActionType.Prepare)
                              .OrderBy(candidate => candidate.TenantJob.GetImpactAction())
                              .ToList();

            // TODO, ensure that we don't ack too many in the 2nd pass just after acking once
            // choose the simplest logic for now. In future, we will pick based on oldest document incarnation number etc.

            foreach (var waiting in waitingJobs)
            {
                // Every check runs even after an earlier one has failed, so that all blocking reasons are logged.
                bool blocked = false;

                if (runningTotal >= totalLimit)
                {
                    traceType.WriteInfo(
                        "Not starting job {0} because it would exceed max total parallel job count ({1}/{2})",
                        waiting.Id,
                        runningTotal,
                        totalLimit);

                    blocked = true;
                }

                JobCount typeCount;
                if (!perTypeCounter.CanAddActiveJob(waiting.TenantJob, out typeCount))
                {
                    traceType.WriteInfo(
                        "Not starting job {0} because it would exceed max parallel job count for type {1} ({2})",
                        waiting.Id,
                        waiting.TenantJob.GetImpactAction(),
                        typeCount);

                    blocked = true;
                }

                if (waiting.TenantJob.IsUpdateJobType() && (runningUpdates >= updateLimit))
                {
                    traceType.WriteInfo(
                        "Not starting job {0} because it would exceed max parallel update job count ({1}/{2})",
                        waiting.Id,
                        runningUpdates,
                        updateLimit);

                    blocked = true;
                }

                if (blocked)
                {
                    waiting.DenyActions(traceType, ActionType.Prepare);
                    continue;
                }

                // The job is allowed: fold it into the tallies so the remaining pending jobs are
                // checked against the updated counts.
                ++runningTotal;
                perTypeCounter.AddActiveJob(waiting.TenantJob);

                if (waiting.TenantJob.IsUpdateJobType())
                {
                    ++runningUpdates;
                }

                traceType.WriteInfo("Allowing job {0} to start", waiting.Id);
            }

            return Task.FromResult(0);
        }