public RedisAutoscalingRule(Configuration configuration, RedisAutoscalingAgent redisAutoscalingAgent, IRedisInstance primaryRedisInstance, IRedisInstance secondaryRedisInstance) : base(configuration) { Contract.Assert(primaryRedisInstance != secondaryRedisInstance); _configuration = configuration; _redisAutoscalingAgent = redisAutoscalingAgent; _primaryRedisInstance = primaryRedisInstance; _secondaryRedisInstance = secondaryRedisInstance; }
private async Task CreateIcmForFailedStateIfNeededAsync(IRedisInstance instance) { if (instance.State != "Failed") { return; } try { await EmitIcmAsync( severity : 3, title : $"{instance.Name} is in a failed state", description : SingleFailedInstanceDescription, machines : null, correlationIds : null, cacheTimeToLive : _configuration.IcmIncidentCacheTtl); } catch (Exception e) { _configuration.Logger.Error($"Failed to emit IcM for failed instance {instance.Name}: {e}"); } }
private async Task CreateIcmForFailedStateIfNeededAsync(IRedisInstance instance) { if (instance.State != "Failed") { return; } try { await EmitIcmAsync( severity : 3, title : $"{instance.Name} is in a failed state", description : "Instance fell into a failed state. Please monitor it and open a Sev 2 IcM against the Windows Azure Cache team (https://aka.ms/redisicm) for support if needed.", machines : null, correlationIds : null, cacheTimeToLive : _configuration.IcmIncidentCacheTtl); } catch (Exception e) { _configuration.Logger.Error($"Failed to emit IcM for failed instance {instance.Name}: {e}"); } }
private async Task <bool> AttemptToScaleAsync(RuleContext context, IRedisInstance redisInstance, CancellationToken cancellationToken) { Contract.Requires(redisInstance.IsReadyToScale); // Fetch which cluster size we want, and start scaling operation if needed. var currentClusterSize = redisInstance.ClusterSize; var targetClusterSizeResult = await _redisAutoscalingAgent.EstimateBestClusterSizeAsync(context.IntoOperationContext(_configuration.Logger), redisInstance); if (!targetClusterSizeResult.Succeeded) { Emit(context, "Autoscale", Severity.Error, $"Failed to find best plan for instance `{redisInstance.Name}` in plan `{currentClusterSize}`. Result=[{targetClusterSizeResult}]"); return(false); } var modelOutput = targetClusterSizeResult.Value; Contract.AssertNotNull(modelOutput); var targetClusterSize = modelOutput.TargetClusterSize; if (targetClusterSize.Equals(redisInstance.ClusterSize) || modelOutput.ScalePath.Count == 0) { return(false); } Emit(context, "Autoscale", Severity.Warning, $"Autoscaling from `{currentClusterSize}` to `{targetClusterSize}` via scale path `{currentClusterSize} -> {string.Join(" -> ", modelOutput.ScalePath)}` for instance `{redisInstance.Name}`. Solution cost is `{modelOutput.Cost}`"); var scaleResult = await redisInstance.ScaleAsync(modelOutput.ScalePath, cancellationToken); if (!scaleResult) { Emit(context, "Autoscale", Severity.Error, $"Autoscale attempt from `{currentClusterSize}` to `{targetClusterSize}` for instance `{redisInstance.Name}` failed. Result=[{scaleResult}]"); scaleResult.ThrowIfFailure(); } return(true); }
private async Task <bool> AttemptToScaleAsync(RuleContext context, IRedisInstance redisInstance) { Contract.Requires(redisInstance.IsReadyToScale); var operationContext = context.IntoOperationContext(_configuration.Logger); // Fetch which cluster size we want, and start scaling operation if needed. var currentClusterSize = redisInstance.ClusterSize; var targetClusterSizeResult = await _redisAutoscalingAgent.EstimateBestClusterSizeAsync(operationContext, redisInstance); if (!targetClusterSizeResult.Succeeded) { Emit(context, "Autoscale", Severity.Error, $"Failed to find best plan for instance `{redisInstance.Name}` in plan `{currentClusterSize}`. Result=[{targetClusterSizeResult}]"); return(false); } var modelOutput = targetClusterSizeResult.Value; Contract.AssertNotNull(modelOutput); var targetClusterSize = modelOutput.TargetClusterSize; if (targetClusterSize.Equals(currentClusterSize) || modelOutput.ScalePath.Count == 0) { return(false); } if (RedisScalingUtilities.IsDownScale(currentClusterSize, targetClusterSize)) { // Downscales are typically about saving money rather than system health, hence, it's deprioritized. // Force downscales to happen during very comfortable business hours in PST, to ensure we're always // available if things go wrong. We disregard holidays because it's a pain to handle them. if (!TimeConstraints.BusinessHours.SatisfiedPST(_configuration.Clock.UtcNow)) { Emit(context, "Autoscale", Severity.Info, $"Refused autoscale from `{currentClusterSize}` to `{targetClusterSize}` via scale path `{currentClusterSize} -> {string.Join(" -> ", modelOutput.ScalePath)}` for instance `{redisInstance.Name}` due to business hours constraints"); return(false); } // Downscales are performed in phases instead of all at once. If the model proposes an autoscale, we'll // only take the first step of it in the current iteration, and force wait some amount of time until we // allow this instance to be downscaled again. This gives some time to evaluate the effects of the last // downscale (which typically takes time because migration's effects on instance memory and cpu load // take some time to see). // // The intent of this measure is to avoid situations where our downscale causes heightened load in the // remaining shards, forcing us to scale back to our original size after some time. This effect creates // "autoscale loops" over time. modelOutput.ScalePath = modelOutput.ScalePath.Take(1).ToList(); if (_lastAutoscaleTimeUtc.TryGetValue(redisInstance.Id, out var lastAutoscaleTimeUtc)) { var now = _configuration.Clock.UtcNow; if (now - lastAutoscaleTimeUtc < _configuration.MinimumWaitTimeBetweenDownscaleSteps) { return(false); } } } Emit(context, "Autoscale", Severity.Warning, $"Autoscaling from `{currentClusterSize}` ({currentClusterSize.MonthlyCostUsd} USD/mo) to `{targetClusterSize}` ({targetClusterSize.MonthlyCostUsd} USD/mo) via scale path `{currentClusterSize} -> {string.Join(" -> ", modelOutput.ScalePath)}` for instance `{redisInstance.Name}`. CostFunction=[{modelOutput.Cost}]"); var scaleResult = await redisInstance.ScaleAsync(operationContext, modelOutput.ScalePath); _lastAutoscaleTimeUtc[redisInstance.Id] = _configuration.Clock.UtcNow; if (!scaleResult) { Emit(context, "Autoscale", Severity.Error, $"Autoscale attempt from `{currentClusterSize}` to `{targetClusterSize}` for instance `{redisInstance.Name}` failed. Result=[{scaleResult}]"); scaleResult.ThrowIfFailure(); } return(true); }
private async Task <ValidationOutcome> ValidateAndScaleAsync(RuleContext context, IRedisInstance primary, IRedisInstance secondary, bool allowFailedStateReporting) { // Last refresh time may be arbitrarily long, either because the rule hasn't been run for a long time, or // because there was an autoscale that happened before. Hence, we need to refresh what we know. await Task.WhenAll(primary.RefreshAsync(context.CancellationToken), secondary.RefreshAsync(context.CancellationToken)).ThrowIfFailureAsync(); if (allowFailedStateReporting) { if (!primary.IsReadyToScale) { Emit(context, "Autoscale", Severity.Warning, $"Instance `{primary.Name}` is undergoing maintenance or autoscaling operation. State=[{primary.State}]"); await CreateIcmForFailedStateIfNeededAsync(primary); } if (!secondary.IsReadyToScale) { Emit(context, "Autoscale", Severity.Warning, $"Instance `{secondary.Name}` is undergoing maintenance or autoscaling operation. State=[{secondary.State}]"); await CreateIcmForFailedStateIfNeededAsync(secondary); } } // Both instances in a failed state means we need to open a Sev 2 against our own rotation to get them // fixed as quickly as possible. if (primary.IsFailed && secondary.IsFailed) { try { await EmitIcmAsync( severity : _configuration.Environment.IsProduction()? 2 : 3, title : $"Redis instances {_primaryRedisInstance.Name} and {_secondaryRedisInstance.Name} are in a failed state", description : TwoFailedInstancesDescription, machines : null, correlationIds : null, cacheTimeToLive : _configuration.IcmIncidentCacheTtl); } catch (Exception e) { _configuration.Logger.Error($"Failed to emit IcM for failed instances {primary.Name} and {secondary.Name}: {e}"); } } // We are willing to scale iff: // 1. The instance is ready to scale // 2. The other instance is not being scaled, but may be not ready to scale if (!primary.IsReadyToScale) { if (primary.IsFailed) { return(ValidationOutcome.PrimaryFailed); } else { return(ValidationOutcome.PrimaryUndergoingAutoscale); } } if (!secondary.IsReadyToScale && !secondary.IsFailed) { return(ValidationOutcome.SecondaryUndergoingAutoscale); } await AttemptToScaleAsync(context, primary); return(ValidationOutcome.Success); }
private async Task <ValidationOutcome> ValidateAndScaleAsync(RuleContext context, IRedisInstance primary, IRedisInstance secondary) { // Last refresh time may be arbitrarily long, either because the rule hasn't been run for a long time, or // because there was an autoscale that happened before. Hence, we need to refresh what we know. await Task.WhenAll(primary.RefreshAsync(context.CancellationToken), secondary.RefreshAsync(context.CancellationToken)).ThrowIfFailureAsync(); // We are willing to scale iff: // 1. The instance is ready to scale // 2. The other instance is not being scaled, but may be not ready to scale if (!primary.IsReadyToScale) { Emit(context, "Autoscale", Severity.Warning, $"Instance `{primary.Name}` is undergoing maintenance or autoscaling operation. State=[{primary.State}]"); await CreateIcmForFailedStateIfNeededAsync(primary); if (primary.IsFailed) { return(ValidationOutcome.PrimaryFailed); } else { return(ValidationOutcome.PrimaryUndergoingAutoscale); } } if (!secondary.IsReadyToScale && !secondary.IsFailed) { Emit(context, "Autoscale", Severity.Warning, $"Instance `{secondary.Name}` is undergoing maintenance or autoscaling operation. State=[{secondary.State}]"); await CreateIcmForFailedStateIfNeededAsync(secondary); return(ValidationOutcome.SecondaryUndergoingAutoscale); } await AttemptToScaleAsync(context, primary, context.CancellationToken); return(ValidationOutcome.Success); }
public async Task <Result <ModelOutput> > EstimateBestClusterSizeAsync(OperationContext context, IRedisInstance redisInstance) { var now = DateTime.UtcNow; var redisAzureId = redisInstance.Id; var currentClusterSize = redisInstance.ClusterSize; var modelContext = await ComputeFeaturesAsync(context, now, redisAzureId, currentClusterSize); return(Predict(currentClusterSize, modelContext)); }