public static Result<ModelOutput> Predict(RedisClusterSize currentClusterSize, ModelContext modelContext)
{
    var shortestPaths = ComputeAllowedPaths(currentClusterSize, modelContext);

    var eligibleClusterSizes = shortestPaths
        .Select(kvp => (Size: kvp.Key, Node: kvp.Value))
        // Find all plans that we can reach from the current one via scaling operations, and that we
        // allow scaling to
        .Where(entry => entry.Node.ShortestDistanceFromSource != double.PositiveInfinity
            && IsScalingAllowed(currentClusterSize, entry.Size, modelContext))
        // Compute the cost of taking the given route
        .Select(entry => (entry.Size, entry.Node, Cost: CostFunction(currentClusterSize, entry.Size, modelContext, shortestPaths)))
        .ToList();

    // Rank them by cost, ascending
    var costSorted = eligibleClusterSizes
        .OrderBy(pair => pair.Cost)
        .ToList();

    if (costSorted.Count == 0)
    {
        return new Result<ModelOutput>(errorMessage: "No cluster size available for scaling");
    }

    return new ModelOutput(
        targetClusterSize: costSorted[0].Size,
        modelContext: modelContext,
        cost: costSorted[0].Cost,
        scalePath: RedisScalingUtilities.ComputeShortestPath(shortestPaths, currentClusterSize, costSorted[0].Size));
}
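// For reference: a hypothetical sketch of the ModelOutput shape implied by Predict's return
// statement above and by AttemptToScaleAsync below. The property names follow the call sites;
// the actual definition is not part of this listing and may differ.
public class ModelOutput
{
    public RedisClusterSize TargetClusterSize { get; }
    public ModelContext ModelContext { get; }
    public double Cost { get; }

    // Settable because AttemptToScaleAsync truncates the path to its first step for downscales.
    public IReadOnlyList<RedisClusterSize> ScalePath { get; set; }

    public ModelOutput(RedisClusterSize targetClusterSize, ModelContext modelContext, double cost, IReadOnlyList<RedisClusterSize> scalePath)
    {
        TargetClusterSize = targetClusterSize;
        ModelContext = modelContext;
        Cost = cost;
        ScalePath = scalePath;
    }
}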
private Task<BoolResult> RequestScaleAsync(OperationContext context, RedisClusterSize targetClusterSize)
{
    string extraMessage = $"CurrentClusterSize=[{ClusterSize}] TargetClusterSize=[{targetClusterSize}]";

    return context.PerformOperationAsync(Tracer, async () =>
    {
        if (ClusterSize.Equals(targetClusterSize))
        {
            return new BoolResult(errorMessage: $"No-op scale request attempted (`{ClusterSize}` -> `{targetClusterSize}`) on instance `{Name}`");
        }

        if (!RedisScalingUtilities.CanScale(ClusterSize, targetClusterSize))
        {
            return new BoolResult(errorMessage: $"Scale request `{ClusterSize}` -> `{targetClusterSize}` on instance `{Name}` is disallowed by Azure Cache for Redis");
        }

        if (!IsReadyToScale)
        {
            return new BoolResult(errorMessage: $"Redis instance `{Name}` is not ready to scale, current provisioning state is `{RedisCache.ProvisioningState}`");
        }

        var instance = RedisCache.Update();

        if (!ClusterSize.Tier.Equals(targetClusterSize.Tier))
        {
            switch (targetClusterSize.Tier.Plan)
            {
                case RedisPlan.Basic:
                    instance = instance.WithBasicSku(targetClusterSize.Tier.Capacity);
                    break;
                case RedisPlan.Standard:
                    instance = instance.WithStandardSku(targetClusterSize.Tier.Capacity);
                    break;
                case RedisPlan.Premium:
                    instance = instance.WithPremiumSku(targetClusterSize.Tier.Capacity);
                    break;
            }
        }

        if (ClusterSize.Shards != targetClusterSize.Shards)
        {
            instance = instance.WithShardCount(targetClusterSize.Shards);
        }

        await instance.ApplyAsync(context.Token);

        return BoolResult.Success;
    },
    extraStartMessage: extraMessage,
    extraEndMessage: _ => extraMessage,
    pendingOperationTracingInterval: TimeSpan.FromMinutes(1));
}
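// ScaleAsync, which AttemptToScaleAsync calls further below, is not part of this listing. A
// hypothetical sketch, assuming it simply walks the precomputed scale path and issues one scale
// request per hop; the real method presumably also waits for each provisioning operation to
// complete before taking the next hop, which is elided here:
public async Task<BoolResult> ScaleAsync(OperationContext context, IReadOnlyList<RedisClusterSize> scalePath)
{
    foreach (var step in scalePath)
    {
        var result = await RequestScaleAsync(context, step);
        if (!result.Succeeded)
        {
            return result;
        }
    }

    return BoolResult.Success;
}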
public static IReadOnlyDictionary<RedisClusterSize, RedisScalingUtilities.Node> ComputeAllowedPaths(RedisClusterSize currentClusterSize, ModelContext modelContext)
{
    // We need to reach the target cluster size, but we can't do it in one shot because business rules won't
    // let us, so we need to compute a path to get to it. This is probably the most complex part of the
    // algorithm; there are several competing aspects we want to optimize for, in descending importance:
    //  - We want memory to get to the target level ASAP
    //  - We want to keep the number of shards as stable as possible, given that changing them can cause
    //    build failures
    //  - We'd like to get there in the least amount of time possible
    //  - The route needs to be deterministic, so that if we are forced to stop and re-compute it we'll take
    //    the same route
    //  - We'd like to minimize the cost of the route
    // Multi-constraint optimization over graphs is NP-complete and algorithms are hard to come up with, so
    // we do our best.
    Func<RedisClusterSize, IEnumerable<RedisClusterSize>> neighbors = clusterSize =>
        clusterSize.ScaleEligibleSizes.Where(targetClusterSize =>
        {
            // Constrain paths to downscale at most one shard at a time. This only makes paths longer, so it
            // is safe. The reason behind this is that the service doesn't tolerate big reductions well.
            if (targetClusterSize.Shards < clusterSize.Shards)
            {
                return targetClusterSize.Shards == clusterSize.Shards - 1;
            }

            return true;
        });

    Func<RedisClusterSize, RedisClusterSize, double> weight = (from, to) =>
    {
        // This factor is used to avoid transitioning through any kind of intermediate plan that may cause a
        // production outage. Without it, we may transition into a state in which we have less cluster
        // memory available than we need. By adjusting the weight function, we guarantee that this only
        // happens if there is no better path; moreover, we will always choose the lesser of two evils if
        // given no choice.
        double clusterMemoryPenalization = 0;

        var delta = to.ClusterMemorySizeMb - modelContext.MinimumAllowedClusterMemoryMb;
        if (delta < 0)
        {
            // The amount of cluster memory is less than we need, so we penalize taking this path by
            // adding the amount of memory that keeps us away from the target.
            clusterMemoryPenalization = -delta;
        }

        // This needs to be at least one so we don't pick minimum paths that are arbitrarily long
        return 1 + clusterMemoryPenalization;
    };

    return RedisScalingUtilities.ComputeOneToAllShortestPath(vertices: RedisClusterSize.Instances, neighbors: neighbors, weight: weight, from: currentClusterSize);
}
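// RedisScalingUtilities.ComputeOneToAllShortestPath and its Node type are not shown in this
// listing. A minimal Dijkstra-style sketch that matches the usage here (one source, all
// destinations, Node carrying ShortestDistanceFromSource) could look like the following. The
// Predecessor field is an assumption needed for path reconstruction; only
// ShortestDistanceFromSource is referenced directly in this listing.
public class Node
{
    public Node(RedisClusterSize clusterSize) => ClusterSize = clusterSize;

    public RedisClusterSize ClusterSize { get; }
    public double ShortestDistanceFromSource { get; set; } = double.PositiveInfinity;
    public RedisClusterSize Predecessor { get; set; }
}

public static IReadOnlyDictionary<RedisClusterSize, Node> ComputeOneToAllShortestPath(
    IEnumerable<RedisClusterSize> vertices,
    Func<RedisClusterSize, IEnumerable<RedisClusterSize>> neighbors,
    Func<RedisClusterSize, RedisClusterSize, double> weight,
    RedisClusterSize from)
{
    var nodes = vertices.ToDictionary(v => v, v => new Node(v));
    nodes[from].ShortestDistanceFromSource = 0;

    var visited = new HashSet<RedisClusterSize>();
    while (visited.Count < nodes.Count)
    {
        // Pick the closest unvisited vertex. A priority queue would be asymptotically faster, but
        // the graph of cluster sizes is small enough that a linear scan is fine.
        var current = nodes.Values
            .Where(n => !visited.Contains(n.ClusterSize))
            .OrderBy(n => n.ShortestDistanceFromSource)
            .First();
        visited.Add(current.ClusterSize);

        if (double.IsPositiveInfinity(current.ShortestDistanceFromSource))
        {
            // Every remaining vertex is unreachable from the source
            break;
        }

        foreach (var neighbor in neighbors(current.ClusterSize))
        {
            var candidate = current.ShortestDistanceFromSource + weight(current.ClusterSize, neighbor);
            if (candidate < nodes[neighbor].ShortestDistanceFromSource)
            {
                nodes[neighbor].ShortestDistanceFromSource = candidate;
                nodes[neighbor].Predecessor = current.ClusterSize;
            }
        }
    }

    return nodes;
}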
/// <summary>
/// This function embodies the concept of "how much does it cost to switch from
/// <paramref name="current"/> to <paramref name="target"/>". At this point, we can assume that:
///  - The two input sizes are valid states to be in
///  - We can reach the target from current via some number of autoscaling operations
/// Hence, we're just ranking amongst the many potential states.
/// </summary>
public static double CostFunction(RedisClusterSize current, RedisClusterSize target, ModelContext modelContext, IReadOnlyDictionary<RedisClusterSize, RedisScalingUtilities.Node> shortestPaths)
{
    // Switching to the same size (i.e. a no-op) is free
    if (current.Equals(target))
    {
        return 0;
    }

    var shortestPath = RedisScalingUtilities.ComputeShortestPath(shortestPaths, current, target);
    Contract.Assert(shortestPath.Count > 0);

    // Positive if we are spending more money, negative if we are saving
    return (double)(target.MonthlyCostUsd - current.MonthlyCostUsd);
}
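// ComputeShortestPath, used above and in Predict, is likewise not shown. Assuming the Node
// predecessor chain from the Dijkstra sketch earlier, a plausible reconstruction walks backwards
// from the target and reverses. The returned path excludes the source and includes the target,
// which is consistent with how AttemptToScaleAsync prints scale paths; an empty list means the
// target is unreachable.
public static IReadOnlyList<RedisClusterSize> ComputeShortestPath(
    IReadOnlyDictionary<RedisClusterSize, Node> shortestPaths,
    RedisClusterSize from,
    RedisClusterSize to)
{
    if (double.IsPositiveInfinity(shortestPaths[to].ShortestDistanceFromSource))
    {
        return new List<RedisClusterSize>();
    }

    var path = new List<RedisClusterSize>();
    for (var current = to; !current.Equals(from); current = shortestPaths[current].Predecessor)
    {
        path.Add(current);
    }

    path.Reverse();
    return path;
}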
private async Task<BoolResult> RequestScaleAsync(RedisClusterSize targetClusterSize, CancellationToken cancellationToken = default)
{
    if (ClusterSize.Equals(targetClusterSize))
    {
        return new BoolResult(errorMessage: $"No-op scale request attempted (`{ClusterSize}` -> `{targetClusterSize}`) on instance `{Name}`");
    }

    if (!RedisScalingUtilities.CanScale(ClusterSize, targetClusterSize))
    {
        return new BoolResult(errorMessage: $"Scale request `{ClusterSize}` -> `{targetClusterSize}` on instance `{Name}` is disallowed by Azure Cache for Redis");
    }

    if (!IsReadyToScale)
    {
        return new BoolResult(errorMessage: $"Redis instance `{Name}` is not ready to scale, current provisioning state is `{RedisCache.ProvisioningState}`");
    }

    var instance = RedisCache.Update();

    if (!ClusterSize.Tier.Equals(targetClusterSize.Tier))
    {
        switch (targetClusterSize.Tier.Plan)
        {
            case RedisPlan.Basic:
                instance = instance.WithBasicSku(targetClusterSize.Tier.Capacity);
                break;
            case RedisPlan.Standard:
                instance = instance.WithStandardSku(targetClusterSize.Tier.Capacity);
                break;
            case RedisPlan.Premium:
                instance = instance.WithPremiumSku(targetClusterSize.Tier.Capacity);
                break;
        }
    }

    if (ClusterSize.Shards != targetClusterSize.Shards)
    {
        instance = instance.WithShardCount(targetClusterSize.Shards);
    }

    await instance.ApplyAsync(cancellationToken);

    return BoolResult.Success;
}
private async Task<BoolResult> RequestScaleAsync(RedisClusterSize targetClusterSize, CancellationToken cancellationToken = default)
{
    if (ClusterSize.Equals(targetClusterSize))
    {
        return new BoolResult(errorMessage: $"No-op scale request attempted (`{ClusterSize}` -> `{targetClusterSize}`) on instance `{Name}`");
    }

    if (!RedisScalingUtilities.CanScale(ClusterSize, targetClusterSize))
    {
        return new BoolResult(errorMessage: $"Scale request `{ClusterSize}` -> `{targetClusterSize}` on instance `{Name}` is disallowed by Azure Cache for Redis");
    }

    if (!IsReadyToScale)
    {
        return new BoolResult(errorMessage: $"Redis instance `{Name}` is not ready to scale, current provisioning state is `{RedisCache.ProvisioningState}`");
    }

    return await SubmitScaleRequestAsync(targetClusterSize, cancellationToken);
}
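// SubmitScaleRequestAsync is not part of this listing. Judging by the earlier inline version of
// RequestScaleAsync, it plausibly wraps the Azure management SDK update calls like this (a
// sketch reconstructed from the inline version, not a verified extraction):
private async Task<BoolResult> SubmitScaleRequestAsync(RedisClusterSize targetClusterSize, CancellationToken cancellationToken)
{
    var instance = RedisCache.Update();

    // Only touch the SKU when the tier actually changes
    if (!ClusterSize.Tier.Equals(targetClusterSize.Tier))
    {
        switch (targetClusterSize.Tier.Plan)
        {
            case RedisPlan.Basic:
                instance = instance.WithBasicSku(targetClusterSize.Tier.Capacity);
                break;
            case RedisPlan.Standard:
                instance = instance.WithStandardSku(targetClusterSize.Tier.Capacity);
                break;
            case RedisPlan.Premium:
                instance = instance.WithPremiumSku(targetClusterSize.Tier.Capacity);
                break;
        }
    }

    if (ClusterSize.Shards != targetClusterSize.Shards)
    {
        instance = instance.WithShardCount(targetClusterSize.Shards);
    }

    await instance.ApplyAsync(cancellationToken);

    return BoolResult.Success;
}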
private async Task<bool> AttemptToScaleAsync(RuleContext context, IRedisInstance redisInstance)
{
    Contract.Requires(redisInstance.IsReadyToScale);
    var operationContext = context.IntoOperationContext(_configuration.Logger);

    // Fetch which cluster size we want, and start the scaling operation if needed.
    var currentClusterSize = redisInstance.ClusterSize;
    var targetClusterSizeResult = await _redisAutoscalingAgent.EstimateBestClusterSizeAsync(operationContext, redisInstance);
    if (!targetClusterSizeResult.Succeeded)
    {
        Emit(context, "Autoscale", Severity.Error, $"Failed to find best plan for instance `{redisInstance.Name}` in plan `{currentClusterSize}`. Result=[{targetClusterSizeResult}]");
        return false;
    }

    var modelOutput = targetClusterSizeResult.Value;
    Contract.AssertNotNull(modelOutput);

    var targetClusterSize = modelOutput.TargetClusterSize;
    if (targetClusterSize.Equals(currentClusterSize) || modelOutput.ScalePath.Count == 0)
    {
        return false;
    }

    if (RedisScalingUtilities.IsDownScale(currentClusterSize, targetClusterSize))
    {
        // Downscales are typically about saving money rather than system health; hence, they are
        // deprioritized. Force downscales to happen during very comfortable business hours in PST, to
        // ensure we're always available if things go wrong. We disregard holidays because it's a pain
        // to handle them.
        if (!TimeConstraints.BusinessHours.SatisfiedPST(_configuration.Clock.UtcNow))
        {
            Emit(context, "Autoscale", Severity.Info, $"Refused autoscale from `{currentClusterSize}` to `{targetClusterSize}` via scale path `{currentClusterSize} -> {string.Join(" -> ", modelOutput.ScalePath)}` for instance `{redisInstance.Name}` due to business hours constraints");
            return false;
        }

        // Downscales are performed in phases instead of all at once. If the model proposes an autoscale,
        // we'll only take the first step of it in the current iteration, and force a wait for some amount
        // of time until we allow this instance to be downscaled again. This gives us time to evaluate the
        // effects of the last downscale (which typically takes a while, because migration's effects on
        // instance memory and CPU load take time to show).
        //
        // The intent of this measure is to avoid situations where our downscale causes heightened load in
        // the remaining shards, forcing us to scale back to our original size after some time. This effect
        // creates "autoscale loops" over time.
        modelOutput.ScalePath = modelOutput.ScalePath.Take(1).ToList();

        if (_lastAutoscaleTimeUtc.TryGetValue(redisInstance.Id, out var lastAutoscaleTimeUtc))
        {
            var now = _configuration.Clock.UtcNow;
            if (now - lastAutoscaleTimeUtc < _configuration.MinimumWaitTimeBetweenDownscaleSteps)
            {
                return false;
            }
        }
    }

    Emit(context, "Autoscale", Severity.Warning, $"Autoscaling from `{currentClusterSize}` ({currentClusterSize.MonthlyCostUsd} USD/mo) to `{targetClusterSize}` ({targetClusterSize.MonthlyCostUsd} USD/mo) via scale path `{currentClusterSize} -> {string.Join(" -> ", modelOutput.ScalePath)}` for instance `{redisInstance.Name}`. CostFunction=[{modelOutput.Cost}]");

    var scaleResult = await redisInstance.ScaleAsync(operationContext, modelOutput.ScalePath);
    _lastAutoscaleTimeUtc[redisInstance.Id] = _configuration.Clock.UtcNow;

    if (!scaleResult)
    {
        Emit(context, "Autoscale", Severity.Error, $"Autoscale attempt from `{currentClusterSize}` to `{targetClusterSize}` for instance `{redisInstance.Name}` failed. Result=[{scaleResult}]");
        scaleResult.ThrowIfFailure();
    }

    return true;
}
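// TimeConstraints.BusinessHours.SatisfiedPST is not shown in this listing. A hypothetical sketch
// follows; the 10:00-16:00 Monday-Friday window is an assumption made for illustration, not the
// actual configured constraint.
public static class TimeConstraints
{
    public static readonly BusinessHoursConstraint BusinessHours = new BusinessHoursConstraint();

    public class BusinessHoursConstraint
    {
        private static readonly TimeZoneInfo Pst =
            TimeZoneInfo.FindSystemTimeZoneById("Pacific Standard Time");

        public bool SatisfiedPST(DateTime utcNow)
        {
            // Convert the wall clock to PST and check we are inside the (assumed) comfortable window
            var pstNow = TimeZoneInfo.ConvertTimeFromUtc(utcNow, Pst);
            return pstNow.DayOfWeek >= DayOfWeek.Monday
                && pstNow.DayOfWeek <= DayOfWeek.Friday
                && pstNow.Hour >= 10
                && pstNow.Hour < 16;
        }
    }
}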
public RedisClusterSize(RedisTier tier, int shards)
{
    Contract.Requires(1 <= shards && shards <= 10, "Number of shards out of bounds, must be between 1 and 10");

    Tier = tier;
    Shards = shards;

    _scaleEligibleSizes = new Lazy<IReadOnlyList<RedisClusterSize>>(
        () => Instances.Where(to => RedisScalingUtilities.CanScale(this, to)).ToList(),
        System.Threading.LazyThreadSafetyMode.PublicationOnly);
}
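// A usage sketch for the constructor above, showing how ScaleEligibleSizes feeds the neighbor
// function in ComputeAllowedPaths. The RedisTier constructor arguments are assumptions; this
// listing only establishes that a tier exposes Plan and Capacity.
public static void PrintScaleEligibleSizes()
{
    var size = new RedisClusterSize(new RedisTier(RedisPlan.Premium, capacity: 1), shards: 2);

    // Lazily computed on first access: every instance we are allowed to scale to in one step
    foreach (var candidate in size.ScaleEligibleSizes)
    {
        Console.WriteLine($"{size} -> {candidate}");
    }
}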