public static Result <ModelOutput> Predict(RedisClusterSize currentClusterSize, ModelContext modelContext)
        {
            var shortestPaths = ComputeAllowedPaths(currentClusterSize, modelContext);

            var eligibleClusterSizes = shortestPaths
                                       .Select(kvp => (Size: kvp.Key, Node: kvp.Value))
                                       // Find all plans that we can reach from the current one via scaling operations, and that we allow scaling to
                                       .Where(entry => entry.Node.ShortestDistanceFromSource != double.PositiveInfinity && IsScalingAllowed(currentClusterSize, entry.Size, modelContext))
                                       // Compute the cost of taking the given route
                                       .Select(entry => (entry.Size, entry.Node, Cost: CostFunction(currentClusterSize, entry.Size, modelContext, shortestPaths)))
                                       .ToList();

            // Rank them by cost ascending
            var costSorted = eligibleClusterSizes
                             .OrderBy(pair => pair.Cost)
                             .ToList();

            if (costSorted.Count == 0)
            {
                return(new Result <ModelOutput>(errorMessage: "No cluster size available for scaling"));
            }

            return(new ModelOutput(
                       targetClusterSize: costSorted[0].Size,
                       modelContext: modelContext,
                       cost: costSorted[0].Cost,
                       scalePath: RedisScalingUtilities.ComputeShortestPath(shortestPaths, currentClusterSize, costSorted[0].Size)));
        }
Esempio n. 2
0
        private Task <BoolResult> RequestScaleAsync(OperationContext context, RedisClusterSize targetClusterSize)
        {
            string extraMessage = $"CurrentClusterSize=[{ClusterSize}] TargetClusterSize=[{targetClusterSize}]";

            return(context.PerformOperationAsync(Tracer, async() =>
            {
                if (ClusterSize.Equals(targetClusterSize))
                {
                    return new BoolResult(errorMessage: $"No-op scale request attempted (`{ClusterSize}` -> `{targetClusterSize}`) on instance `{Name}`");
                }

                if (!RedisScalingUtilities.CanScale(ClusterSize, targetClusterSize))
                {
                    return new BoolResult(errorMessage: $"Scale request `{ClusterSize}` -> `{targetClusterSize}` on instance `{Name}` is disallowed by Azure Cache for Redis");
                }

                if (!IsReadyToScale)
                {
                    return new BoolResult(errorMessage: $"Redis instance `{Name}` is not ready to scale, current provisioning state is `{RedisCache.ProvisioningState}`");
                }

                var instance = RedisCache.Update();

                if (!ClusterSize.Tier.Equals(targetClusterSize.Tier))
                {
                    switch (targetClusterSize.Tier.Plan)
                    {
                    case RedisPlan.Basic:
                        instance = instance.WithBasicSku(targetClusterSize.Tier.Capacity);
                        break;

                    case RedisPlan.Standard:
                        instance = instance.WithStandardSku(targetClusterSize.Tier.Capacity);
                        break;

                    case RedisPlan.Premium:
                        instance = instance.WithPremiumSku(targetClusterSize.Tier.Capacity);
                        break;
                    }
                }

                if (ClusterSize.Shards != targetClusterSize.Shards)
                {
                    instance = instance.WithShardCount(targetClusterSize.Shards);
                }

                await instance.ApplyAsync(context.Token);

                return BoolResult.Success;
            },
                                                 extraStartMessage: extraMessage,
                                                 extraEndMessage: _ => extraMessage,
                                                 pendingOperationTracingInterval: TimeSpan.FromMinutes(1)));
        }
        public static IReadOnlyDictionary <RedisClusterSize, RedisScalingUtilities.Node> ComputeAllowedPaths(RedisClusterSize currentClusterSize, ModelContext modelContext)
        {
            // We need to reach the target cluster size, but we can't do it in one shot because business rules won't
            // let us, so we need to compute a path to get to it. This is probably the most complex part of the
            // algorithm, there are several competing aspects we want to optimize for, in descending importance:
            //  - We want for memory to get to the target level ASAP
            //  - We want to keep the number of shards as stable as possible, given that changing them can cause build
            //    failures
            //  - We'd like to get there in the fewest amount of time possible
            //  - The route needs to be deterministic, so that if we are forced to stop and re-compute it we'll take
            //    the same route.
            //  - We'd like to minimize the cost of the route
            // Multi-constraint optimization over graphs is NP-complete and algorithms are hard to come up with, so we
            // do our best.

            Func <RedisClusterSize, IEnumerable <RedisClusterSize> > neighbors =
                currentClusterSize => currentClusterSize.ScaleEligibleSizes.Where(targetClusterSize =>
            {
                // Constrain paths to downscale at most one shard at the time. This only makes paths longer, so it
                // is safe. The reason behind this is that the service doesn't really tolerate big reductions.
                if (targetClusterSize.Shards < currentClusterSize.Shards)
                {
                    return(targetClusterSize.Shards == currentClusterSize.Shards - 1);
                }

                return(true);
            });

            Func <RedisClusterSize, RedisClusterSize, double> weight =
                (from, to) =>
            {
                // This factor is used to avoid transitioning to any kind of intermediate plan that may cause a
                // production outage. If we don't have it, we may transition into a state in which we have less
                // cluster memory available than we need. By adjusting the weight function, we guarantee that
                // this only happens iff there is no better path; moreover, we will always choose the lesser of
                // two evils if given no choice.
                double clusterMemoryPenalization = 0;

                var delta = to.ClusterMemorySizeMb - modelContext.MinimumAllowedClusterMemoryMb;
                if (delta < 0)
                {
                    // The amount of cluster memory is less than we need, so we penalize taking this path by
                    // adding the amount of memory that keeps us away from the target.
                    clusterMemoryPenalization = -delta;
                }

                // This needs to be at least one so we don't pick minimum paths that are arbitrarily long
                return(1 + clusterMemoryPenalization);
            };


            return(RedisScalingUtilities.ComputeOneToAllShortestPath(vertices: RedisClusterSize.Instances, neighbors: neighbors, weight: weight, from: currentClusterSize));
        }
        /// <summary>
        /// This function embodies the concept of "how much does it cost to switch from
        /// <paramref name="current"/> to <paramref name="target"/>". At this point, we can assume that:
        ///     - The two input sizes are valid states to be in
        ///     - We can reach the target from current via some amount of autoscaling operations
        /// Hence, we're just ranking amonst the many potential states.
        /// </summary>
        public static double CostFunction(RedisClusterSize current, RedisClusterSize target, ModelContext modelContext, IReadOnlyDictionary <RedisClusterSize, RedisScalingUtilities.Node> shortestPaths)
        {
            // Switching to the same size (i.e. no op) is free
            if (current.Equals(target))
            {
                return(0);
            }

            var shortestPath = RedisScalingUtilities.ComputeShortestPath(shortestPaths, current, target);

            Contract.Assert(shortestPath.Count > 0);

            // Positive if we are spending more money, negative if we are saving
            return((double)(target.MonthlyCostUsd - current.MonthlyCostUsd));
        }
Esempio n. 5
0
        private async Task <BoolResult> RequestScaleAsync(RedisClusterSize targetClusterSize, CancellationToken cancellationToken = default)
        {
            if (ClusterSize.Equals(targetClusterSize))
            {
                return(new BoolResult(errorMessage: $"No-op scale request attempted (`{ClusterSize}` -> `{targetClusterSize}`) on instance `{Name}`"));
            }

            if (!RedisScalingUtilities.CanScale(ClusterSize, targetClusterSize))
            {
                return(new BoolResult(errorMessage: $"Scale request `{ClusterSize}` -> `{targetClusterSize}` on instance `{Name}` is disallowed by Azure Cache for Redis"));
            }

            if (!IsReadyToScale)
            {
                return(new BoolResult(errorMessage: $"Redis instance `{Name}` is not ready to scale, current provisioning state is `{RedisCache.ProvisioningState}`"));
            }

            var instance = RedisCache.Update();

            if (!ClusterSize.Tier.Equals(targetClusterSize.Tier))
            {
                switch (targetClusterSize.Tier.Plan)
                {
                case RedisPlan.Basic:
                    instance = instance.WithBasicSku(targetClusterSize.Tier.Capacity);
                    break;

                case RedisPlan.Standard:
                    instance = instance.WithStandardSku(targetClusterSize.Tier.Capacity);
                    break;

                case RedisPlan.Premium:
                    instance = instance.WithPremiumSku(targetClusterSize.Tier.Capacity);
                    break;
                }
            }

            if (ClusterSize.Shards != targetClusterSize.Shards)
            {
                instance = instance.WithShardCount(targetClusterSize.Shards);
            }

            await instance.ApplyAsync(cancellationToken);

            return(BoolResult.Success);
        }
Esempio n. 6
0
        private async Task <BoolResult> RequestScaleAsync(RedisClusterSize targetClusterSize, CancellationToken cancellationToken = default)
        {
            if (ClusterSize.Equals(targetClusterSize))
            {
                return(new BoolResult(errorMessage: $"No-op scale request attempted (`{ClusterSize}` -> `{targetClusterSize}`) on instance `{Name}`"));
            }

            if (!RedisScalingUtilities.CanScale(ClusterSize, targetClusterSize))
            {
                return(new BoolResult(errorMessage: $"Scale request `{ClusterSize}` -> `{targetClusterSize}` on instance `{Name}` is disallowed by Azure Cache for Redis"));
            }

            if (!IsReadyToScale)
            {
                return(new BoolResult(errorMessage: $"Redis instance `{Name}` is not ready to scale, current provisioning state is `{RedisCache.ProvisioningState}`"));
            }

            return(await SubmitScaleRequestAsync(targetClusterSize, cancellationToken));
        }
Esempio n. 7
0
        private async Task <bool> AttemptToScaleAsync(RuleContext context, IRedisInstance redisInstance)
        {
            Contract.Requires(redisInstance.IsReadyToScale);
            var operationContext = context.IntoOperationContext(_configuration.Logger);

            // Fetch which cluster size we want, and start scaling operation if needed.
            var currentClusterSize      = redisInstance.ClusterSize;
            var targetClusterSizeResult = await _redisAutoscalingAgent.EstimateBestClusterSizeAsync(operationContext, redisInstance);

            if (!targetClusterSizeResult.Succeeded)
            {
                Emit(context, "Autoscale", Severity.Error, $"Failed to find best plan for instance `{redisInstance.Name}` in plan `{currentClusterSize}`. Result=[{targetClusterSizeResult}]");
                return(false);
            }

            var modelOutput = targetClusterSizeResult.Value;

            Contract.AssertNotNull(modelOutput);

            var targetClusterSize = modelOutput.TargetClusterSize;

            if (targetClusterSize.Equals(currentClusterSize) || modelOutput.ScalePath.Count == 0)
            {
                return(false);
            }

            if (RedisScalingUtilities.IsDownScale(currentClusterSize, targetClusterSize))
            {
                // Downscales are typically about saving money rather than system health, hence, it's deprioritized.

                // Force downscales to happen during very comfortable business hours in PST, to ensure we're always
                // available if things go wrong. We disregard holidays because it's a pain to handle them.
                if (!TimeConstraints.BusinessHours.SatisfiedPST(_configuration.Clock.UtcNow))
                {
                    Emit(context, "Autoscale", Severity.Info, $"Refused autoscale from `{currentClusterSize}` to `{targetClusterSize}` via scale path `{currentClusterSize} -> {string.Join(" -> ", modelOutput.ScalePath)}` for instance `{redisInstance.Name}` due to business hours constraints");
                    return(false);
                }

                // Downscales are performed in phases instead of all at once. If the model proposes an autoscale, we'll
                // only take the first step of it in the current iteration, and force wait some amount of time until we
                // allow this instance to be downscaled again. This gives some time to evaluate the effects of the last
                // downscale (which typically takes time because migration's effects on instance memory and cpu load
                // take some time to see).
                //
                // The intent of this measure is to avoid situations where our downscale causes heightened load in the
                // remaining shards, forcing us to scale back to our original size after some time. This effect creates
                // "autoscale loops" over time.
                modelOutput.ScalePath = modelOutput.ScalePath.Take(1).ToList();
                if (_lastAutoscaleTimeUtc.TryGetValue(redisInstance.Id, out var lastAutoscaleTimeUtc))
                {
                    var now = _configuration.Clock.UtcNow;
                    if (now - lastAutoscaleTimeUtc < _configuration.MinimumWaitTimeBetweenDownscaleSteps)
                    {
                        return(false);
                    }
                }
            }

            Emit(context, "Autoscale", Severity.Warning, $"Autoscaling from `{currentClusterSize}` ({currentClusterSize.MonthlyCostUsd} USD/mo) to `{targetClusterSize}` ({targetClusterSize.MonthlyCostUsd} USD/mo) via scale path `{currentClusterSize} -> {string.Join(" -> ", modelOutput.ScalePath)}` for instance `{redisInstance.Name}`. CostFunction=[{modelOutput.Cost}]");

            var scaleResult = await redisInstance.ScaleAsync(operationContext, modelOutput.ScalePath);

            _lastAutoscaleTimeUtc[redisInstance.Id] = _configuration.Clock.UtcNow;
            if (!scaleResult)
            {
                Emit(context, "Autoscale", Severity.Error, $"Autoscale attempt from `{currentClusterSize}` to `{targetClusterSize}` for instance `{redisInstance.Name}` failed. Result=[{scaleResult}]");
                scaleResult.ThrowIfFailure();
            }

            return(true);
        }
Esempio n. 8
0
        public RedisClusterSize(RedisTier tier, int shards)
        {
            Contract.Requires(1 <= shards && shards <= 10, "Number of shards out of bounds, must be between 1 and 10");
            Tier   = tier;
            Shards = shards;

            _scaleEligibleSizes = new Lazy <IReadOnlyList <RedisClusterSize> >(() => Instances.Where(to => RedisScalingUtilities.CanScale(this, to)).ToList(), System.Threading.LazyThreadSafetyMode.PublicationOnly);
        }