/// <summary>
/// Decides whether the coordinator should keep retrying after a failed run. Traces the failure and,
/// once the time since the last successful run passes the warning threshold, reports a health warning.
/// </summary>
/// <param name="ex">The exception raised by the failed run.</param>
/// <param name="lastSuccessfulRunAt">The time of the last successful run.</param>
/// <returns>
/// <c>false</c> when <paramref name="ex"/> is a <c>ManagementChannelTerminallyUnhealthyException</c> or when
/// the time since the last successful run exceeds the configured retry duration; otherwise <c>true</c>.
/// </returns>
/// <remarks>
/// Returning bool instead of void since we want to not 'throw' from this method to preserve exception stack info.
/// </remarks>
private bool HandleCoordinatorException(Exception ex, DateTimeOffset lastSuccessfulRunAt)
{
    if (ex is ManagementChannelTerminallyUnhealthyException)
    {
        return false;
    }

    // Both settings are read as seconds (defaults: 120 for the warning threshold, 900 for the retry duration).
    var warningSeconds = configSection.ReadConfigValue(Constants.ConfigKeys.CoordinatorFailureWarningThreshold, 120);
    var warningThreshold = TimeSpan.FromSeconds(warningSeconds);

    var retrySeconds = configSection.ReadConfigValue(Constants.ConfigKeys.CoordinatorFailureRetryDuration, 900);
    var maxRetryDuration = TimeSpan.FromSeconds(retrySeconds);

    var timeSinceLastSuccess = DateTimeOffset.UtcNow - lastSuccessfulRunAt;

    bool retryWindowExceeded = timeSinceLastSuccess > maxRetryDuration;
    if (retryWindowExceeded)
    {
        var giveUpReason = "Not retrying further since 'CoordinatorFailureRetryDuration' of {0} has been exceeded".ToString(maxRetryDuration);
        traceType.WriteError("Error while processing policy agent document. {0}. Exception: {1}", giveUpReason, ex);
        return false;
    }

    // Stays null (formats as empty text in the trace below) unless a health warning is reported.
    string healthWarningNote = null;

    bool warningThresholdExceeded = timeSinceLastSuccess > warningThreshold;
    if (warningThresholdExceeded)
    {
        var healthDescription =
            "Azure coordinator encountered errors. Last successful run was at: {0:O}. Last error: {1}"
                .ToString(lastSuccessfulRunAt, ex);
        UpdateCoordinatorHealthStatus(HealthState.Warning, healthDescription);

        healthWarningNote =
            "Health warning reported since 'CoordinatorFailureWarningThreshold' of {0} has exceeded. ".ToString(
                warningThreshold);
    }

    traceType.WriteWarning(
        "Error while processing policy agent document. Last successful run was at: {0:O}. {1}Retrying until 'CoordinatorFailureRetryDuration' of {2} is exceeded. Exception: {3}",
        lastSuccessfulRunAt,
        healthWarningNote,
        maxRetryDuration,
        ex);

    return true;
}
/// <summary>
/// Reads, from configuration, whether a health check is required for the given tenant job.
/// </summary>
/// <param name="tenantJob">The tenant job whose impact action selects the config key.</param>
/// <param name="configSection">The config section to read from.</param>
/// <param name="keyFormat">Format string for the config key; the job's impact action is its argument.</param>
/// <param name="defaultValue">Value passed to the config read as the default.</param>
/// <returns>The value read from configuration for the derived key.</returns>
private static bool DoesJobRequireHealthCheck(ITenantJob tenantJob, IConfigSection configSection, string keyFormat, bool defaultValue)
{
    tenantJob.Validate("tenantJob");
    configSection.Validate("configSection");

    // The config key is built from the job's impact action.
    var configKey = keyFormat.ToString(tenantJob.GetImpactAction());
    bool healthCheckRequired = configSection.ReadConfigValue(configKey, defaultValue);

    return healthCheckRequired;
}
/// <summary>
/// Gets the configured learning-mode validity time span, substituting the default
/// when the configured value is zero or negative.
/// </summary>
/// <returns>A validity interval that is either the configured value or the default.</returns>
private TimeSpan GetValidityTimeInterval()
{
    var configuredInterval = configSection.ReadConfigValue(
        ConfigLearningModeValidityTimeSpanKeyName,
        DefaultLearningModeValidityTimeSpan);

    // Non-positive values are replaced by the default.
    return configuredInterval > TimeSpan.Zero
        ? configuredInterval
        : DefaultLearningModeValidityTimeSpan;
}
/// <summary>
/// Reads a config value and, when a config update handler is present, first registers the key with it.
/// </summary>
/// <typeparam name="T">Type of the config value.</typeparam>
/// <param name="configSection">The config section to read from.</param>
/// <param name="keyName">Name of the config key.</param>
/// <param name="defaultValue">Value passed to the config read as the default.</param>
/// <returns>The value returned by the config section for <paramref name="keyName"/>.</returns>
private T ReadFactoryConfig <T>(
    IConfigSection configSection,
    string keyName,
    T defaultValue = default(T))
{
    var updateHandler = this.configUpdateHandler;
    if (updateHandler != null)
    {
        // Registration happens before the read, keyed by section name and key name.
        updateHandler.RegisterKey(configSection.Name, keyName);
    }

    T configuredValue = configSection.ReadConfigValue(keyName, defaultValue);
    return configuredValue;
}
/// <summary>
/// Registers the repair actions: the generic commands (resolved through config settings), the generic
/// ReimageOS command, and then every explicitly namespaced command.
/// </summary>
private void BuildRepairActionMap()
{
    // if users specify a generic command. E.g. Reboot, then apply the config setting policy and determine
    // whether to apply it on the VM or on the Host
    RepairActionTypeEnum rebootAction = configSection.ReadConfigValue(
        Constants.ConfigKeys.RebootMaintenanceAction,
        RepairActionTypeEnum.Reboot);
    RepairActionTypeEnum fullReimageAction = configSection.ReadConfigValue(
        Constants.ConfigKeys.FullReimageMaintenanceAction,
        RepairActionTypeEnum.RepaveData);

    AddRepairAction(rebootAction, AbstractRebootMaintenanceAction);
    AddRepairAction(fullReimageAction, AbstractFullReimageMaintenanceAction);

    // There is no Host/VM target for ReimageOS currently. It is just applicable to VMs
    AddRepairAction(RepairActionTypeEnum.ReimageOS);

    // if users specify the command explicitly. E.g. System.Azure.Reboot, then apply the foll. irrespective of the
    // *MaintenanceAction config setting
    // Note: Unlike ReimageOS above, we haven't added a generic 'Heal' action. Instead users have to specify this explicitly
    RepairActionTypeEnum[] explicitActions =
    {
        RepairActionTypeEnum.Reboot,
        RepairActionTypeEnum.ReimageOS,
        RepairActionTypeEnum.RepaveData,
        RepairActionTypeEnum.HostReboot,
        RepairActionTypeEnum.HostRepaveData,
        RepairActionTypeEnum.Heal,
    };

    // Registered in the array's order.
    foreach (RepairActionTypeEnum explicitAction in explicitActions)
    {
        AddRepairActionWithNamespace(explicitAction);
    }
}
/// <summary>
/// Gets the tenant id, preferring the "WindowsAzure.TenantID" config value and falling back to the
/// platform-specific lookup when that value is null, empty or whitespace.
/// </summary>
/// <param name="configSection">The config section to read from.</param>
/// <returns>The configured tenant id, or the result of the platform-specific lookup.</returns>
public static string GetTenantId(IConfigSection configSection)
{
    configSection.Validate("configSection");

    string configuredTenantId = configSection.ReadConfigValue <string>("WindowsAzure.TenantID");
    if (!string.IsNullOrWhiteSpace(configuredTenantId))
    {
        return configuredTenantId;
    }

    // Nothing usable in config; the fallback is chosen at compile time.
#if DotNetCoreClrLinux
    return GetTenantIdOnLinux();
#else
    return GetTenantIdOnWindows();
#endif
}
/// <summary>
/// Resolves the tenant policy agent endpoint from the MRZeroSdkUri config value, or from the registry
/// when that value is null, empty or whitespace.
/// </summary>
/// <param name="configSection">The config section to read from.</param>
/// <returns>The resolved endpoint.</returns>
/// <remarks>Any exception is traced as a warning and then rethrown unchanged.</remarks>
private Uri GetMRZeroSdkUri(IConfigSection configSection)
{
    try
    {
        string configuredUri = configSection.ReadConfigValue <string>(Constants.ConfigKeys.MRZeroSdkUri);

        Uri endpoint;
        if (string.IsNullOrWhiteSpace(configuredUri))
        {
            endpoint = GetUriFromRegistry();
        }
        else
        {
            endpoint = new Uri(configuredUri, UriKind.Absolute);
        }

        // using original string since a regular ToString() will not display the port no. which can be confusing while looking up traces
        traceType.WriteInfo("Tenant policy agent endpoint: {0}", endpoint.OriginalString);

        return endpoint;
    }
    catch (Exception ex)
    {
        traceWriteConditionalWarning("Error while getting tenant policy agent endpoint. Exception: {0}", ex);
        throw;
    }
}
/// <summary>
/// Gets the value for a config key, using the entry in <c>defaultConfigKeys</c> when the
/// config store has no (or an empty) value for it.
/// </summary>
/// <param name="key">The config key to look up.</param>
/// <returns>The configured value when non-empty; otherwise the default for <paramref name="key"/>.</returns>
public string GetConfigValue(string key)
{
    string configured = configSection.ReadConfigValue <string>(key);
    if (!string.IsNullOrEmpty(configured))
    {
        return configured;
    }

    // Not defined in the config store: use the default and trace that it was used.
    string fallback = defaultConfigKeys[key];

    Trace.WriteNoise(
        RoleInstanceHealthConstants.TraceType,
        "Config key '{0}' not defined in config store in watchdog health config policy '{1}'. Using default value of '{2}'",
        key,
        Name,
        fallback);

    return fallback;
}
/// <summary>
/// Reads a polling interval, expressed in seconds, from configuration.
/// </summary>
/// <param name="keyName">Name of the config key holding the interval.</param>
/// <param name="defaultValue">The default value in seconds, passed to the config read.</param>
/// <returns>The polling interval as a <see cref="TimeSpan"/>.</returns>
private TimeSpan GetPollingInterval(string keyName, int defaultValue)
{
    var intervalInSeconds = configSection.ReadConfigValue(keyName, defaultValue);
    TimeSpan pollingInterval = TimeSpan.FromSeconds(intervalInSeconds);

    return pollingInterval;
}
/// <summary>
/// Reads the maximum parallel job count configured for an impact action.
/// </summary>
/// <param name="jobType">The impact action; its text is appended to the key prefix to form the config key.</param>
/// <returns>The value read from configuration, with the per-impact-action default passed to the read.</returns>
private int GetMaxCountFromConfig(ImpactActionEnum jobType)
{
    // Key is "<prefix><jobType>".
    var configKey = Constants.ConfigKeys.MaxParallelJobCountKeyPrefix + jobType;
    var maxCount = configSection.ReadConfigValue(configKey, DefaultMaxParallelJobCountPerImpactAction);

    return maxCount;
}
// TODO, ensure stability
// i.e. on re-execution, the same things should re-execute (if the state hasn't changed)
/// <summary>
/// Limits how many jobs may run in parallel: counts the active jobs, then walks the jobs waiting to
/// prepare and denies the Prepare action for any that would exceed the total, per-type or update limits.
/// </summary>
/// <param name="activityId">The activity id (not used by this method's body).</param>
/// <param name="coordinatorContext">Context holding the mapped tenant jobs to evaluate.</param>
/// <returns>A completed task.</returns>
public override Task ApplyAsync(Guid activityId, CoordinatorContext coordinatorContext)
{
    coordinatorContext.Validate("coordinatorContext");

    if (coordinatorContext.MappedTenantJobs.Count == 0)
    {
        return Task.FromResult(0);
    }

    var mappedJobs = coordinatorContext.MappedTenantJobs.Values;
    var runningJobs = mappedJobs.Where(job => job.IsActive).ToList();

    // Running totals: all active jobs, active update jobs, and active jobs by type
    int runningTotal = runningJobs.Count;
    int runningUpdates = 0;
    JobTypeCounter perTypeCounter = new JobTypeCounter(this.configSection);

    foreach (var runningJob in runningJobs)
    {
        traceType.WriteInfo("Active job {0} ({1})", runningJob.Id, runningJob.TenantJob.GetImpactAction());

        perTypeCounter.AddActiveJob(runningJob.TenantJob);

        if (runningJob.TenantJob.IsUpdateJobType())
        {
            runningUpdates++;
        }
    }

    int maxParallelJobCount = configSection.ReadConfigValue(
        Constants.ConfigKeys.MaxParallelJobCountTotal,
        DefaultMaxParallelJobCountTotal);
    int maxParallelUpdateJobCount = configSection.ReadConfigValue(
        Constants.ConfigKeys.MaxParallelJobCountUpdate,
        DefaultMaxParallelJobCountUpdate);

    traceType.WriteInfo(
        "Active/max job counts: Total: {0}/{1}, Updates: {2}/{3}, {4}",
        runningTotal,
        maxParallelJobCount,
        runningUpdates,
        maxParallelUpdateJobCount,
        perTypeCounter);

    // Find all jobs that are waiting to prepare
    var waitingJobs = mappedJobs
        .Where(job => !job.IsActive && ((job.AllowedActions & ActionType.Prepare) == ActionType.Prepare))
        .OrderBy(job => job.TenantJob.GetImpactAction()) // Apply default static priority based on job type
        .ToList();

    // TODO, ensure that we don't ack too many in the 2nd pass just after acking once
    // choose the simplest logic for now. In future, we will pick based on oldest document incarnation number etc.
    foreach (var waitingJob in waitingJobs)
    {
        // Every check runs (no short-circuit between them), so that all blocking reasons are logged
        bool blocked = false;

        if (runningTotal >= maxParallelJobCount)
        {
            traceType.WriteInfo(
                "Not starting job {0} because it would exceed max total parallel job count ({1}/{2})",
                waitingJob.Id,
                runningTotal,
                maxParallelJobCount);
            blocked = true;
        }

        JobCount typeCount;
        if (!perTypeCounter.CanAddActiveJob(waitingJob.TenantJob, out typeCount))
        {
            traceType.WriteInfo(
                "Not starting job {0} because it would exceed max parallel job count for type {1} ({2})",
                waitingJob.Id,
                waitingJob.TenantJob.GetImpactAction(),
                typeCount);
            blocked = true;
        }

        if (waitingJob.TenantJob.IsUpdateJobType() && (runningUpdates >= maxParallelUpdateJobCount))
        {
            traceType.WriteInfo(
                "Not starting job {0} because it would exceed max parallel update job count ({1}/{2})",
                waitingJob.Id,
                runningUpdates,
                maxParallelUpdateJobCount);
            blocked = true;
        }

        if (blocked)
        {
            waitingJob.DenyActions(traceType, ActionType.Prepare);
            continue;
        }

        // The job is allowed: count it as active so later pending jobs see the updated totals.
        runningTotal++;
        perTypeCounter.AddActiveJob(waitingJob.TenantJob);

        if (waitingJob.TenantJob.IsUpdateJobType())
        {
            runningUpdates++;
        }

        traceType.WriteInfo("Allowing job {0} to start", waitingJob.Id);
    }

    return Task.FromResult(0);
}