Exemple #1
0
        /// <summary>
        /// Instantiates rules once for every environment present in the watchlist and registers them with the scheduler.
        /// </summary>
        /// <param name="generator">Produces the rule instantiations for one environment from the arguments built for it.</param>
        /// <param name="watchlist">Watchlist whose entries supply the environments and their cache table names.</param>
        /// <remarks>
        /// The environment-to-table map is built with <c>ToDictionary</c> keyed by environment, so an environment
        /// whose entries carry more than one distinct cache table name makes that call throw on the duplicate key.
        /// </remarks>
        private void OncePerEnvironment(Func<MultiStampRuleArguments, IEnumerable<Instantiation>> generator, Watchlist watchlist)
        {
            // Collapse the watchlist into one (environment -> cache table) mapping.
            var cacheTableByEnvironment = watchlist.Entries
                .Select(entry => (Environment: entry.Key.Environment, TableName: entry.Value.CacheTableName))
                .Distinct()
                .ToDictionary(pair => pair.Environment, pair => pair.TableName);

            foreach (var mapping in cacheTableByEnvironment)
            {
                var environment = mapping.Key;
                var cacheTableName = mapping.Value;
                var environmentResources = _environmentResources[environment];

                var baseConfiguration = new MultiStampRuleConfiguration(
                    _clock,
                    _logger,
                    _alertNotifier,
                    environmentResources.KustoQueryClient,
                    _icmClient,
                    _configuration.Environments[environment].KustoDatabaseName,
                    cacheTableName,
                    environment,
                    watchlist);

                var arguments = new MultiStampRuleArguments
                {
                    BaseConfiguration = baseConfiguration,
                    EnvironmentResources = environmentResources,
                };

                // Every instantiation must carry a rule before it is handed to the scheduler.
                foreach (var instantiation in generator(arguments))
                {
                    Contract.AssertNotNull(instantiation.Rule);
                    _scheduler.Add(instantiation.Rule, instantiation.PollingPeriod, instantiation.ForceRun);
                }
            }
        }
Exemple #2
0
        /// <summary>
        /// Builds the rule schedule: configures every monitoring rule and registers it together with its polling period.
        /// </summary>
        /// <param name="watchlist">Watchlist forwarded to each <c>OncePerEnvironment</c> / <c>OncePerStamp</c> call.</param>
        /// <remarks>
        /// Rules are registered in the order they appear below. All of them are scheduled per environment, except
        /// the ones produced by <c>GenerateRedisAutoscalingRules</c>, which are scheduled per stamp.
        /// </remarks>
        private void CreateSchedule(Watchlist watchlist)
        {
            // TODO: per-stamp configuration (some stamps are more important than others, query frequency should reflect that)
            // TODO: query weight (how much does it cost). We should adapt scheduling policy to have lighter queries prioritize earlier than the others.
            // TODO: stamp configuration knowledge. Stamp configuration affects what our thresholds should be. We should reflect that here.
            // TODO: add jitter to rules, so that queries to Kusto are spread out over time instead of all at once

            // Polling periods derived from a rule's own window are set five minutes shorter than that window.
            var pollingMargin = TimeSpan.FromMinutes(5);

            OncePerEnvironment(args =>
            {
                var ruleConfiguration = new LastProducedCheckpointRule.Configuration(args.BaseConfiguration);
                var instantiation = new Instantiation
                {
                    Rule = new LastProducedCheckpointRule(ruleConfiguration),
                    PollingPeriod = TimeSpan.FromMinutes(40),
                };

                return Analysis.Utilities.Yield(instantiation);
            }, watchlist);

            OncePerEnvironment(args =>
            {
                var ruleConfiguration = new LastRestoredCheckpointRule.Configuration(args.BaseConfiguration);
                var instantiation = new Instantiation
                {
                    Rule = new LastRestoredCheckpointRule(ruleConfiguration),
                    PollingPeriod = TimeSpan.FromMinutes(30),
                };

                return Analysis.Utilities.Yield(instantiation);
            }, watchlist);

            OncePerEnvironment(args =>
            {
                var ruleConfiguration = new CheckpointSizeRule.Configuration(args.BaseConfiguration);
                var instantiation = new Instantiation
                {
                    Rule = new CheckpointSizeRule(ruleConfiguration),
                    PollingPeriod = ruleConfiguration.AnomalyDetectionHorizon - pollingMargin,
                };

                return Analysis.Utilities.Yield(instantiation);
            }, watchlist);

            // TODO: this rule is too noisy and inaccurate, we should make it work again
            //OncePerStamp(arguments =>
            //{
            //    var configuration = new ActiveMachinesRule.Configuration(arguments.BaseConfiguration);
            //    return Analysis.Utilities.Yield(new Instantiation()
            //    {
            //        Rule = new ActiveMachinesRule(configuration),
            //        PollingPeriod = configuration.AnomalyDetectionHorizon - TimeSpan.FromMinutes(5),
            //    });
            //}, watchlist);

            OncePerEnvironment(args =>
            {
                var ruleConfiguration = new EventHubProcessingDelayRule.Configuration(args.BaseConfiguration);
                var instantiation = new Instantiation
                {
                    Rule = new EventHubProcessingDelayRule(ruleConfiguration),
                    PollingPeriod = TimeSpan.FromMinutes(30),
                };

                return Analysis.Utilities.Yield(instantiation);
            }, watchlist);

            OncePerEnvironment(args =>
            {
                var ruleConfiguration = new BuildFailuresRule.Configuration(args.BaseConfiguration);
                var instantiation = new Instantiation
                {
                    Rule = new BuildFailuresRule(ruleConfiguration),
                    PollingPeriod = TimeSpan.FromMinutes(45),
                };

                return Analysis.Utilities.Yield(instantiation);
            }, watchlist);

            // TODO: fire-and-forget exceptions are now being reported on the dashboards. We should see if this can be recycled.
            //OncePerStamp(arguments =>
            //{
            //    var configuration = new FireAndForgetExceptionsRule.Configuration(arguments.BaseConfiguration);
            //    return Analysis.Utilities.Yield(new Instantiation()
            //    {
            //        Rule = new FireAndForgetExceptionsRule(configuration),
            //        PollingPeriod = configuration.LookbackPeriod - TimeSpan.FromMinutes(5),
            //    });
            //}, watchlist);

            // TODO: this was just too noisy
            //OncePerStamp(arguments =>
            //{
            //    var configuration = new ContractViolationsRule.Configuration(arguments.BaseConfiguration);
            //    return Analysis.Utilities.Yield(new Instantiation() {
            //        Rule = new ContractViolationsRule(configuration),
            //        PollingPeriod = configuration.LookbackPeriod,
            //    });
            //}, watchlist);

            // One OperationFailureCheckRule is instantiated per entry of this list.
            var failureChecks = new List<OperationFailureCheckRule.Check>
            {
                new OperationFailureCheckRule.Check { Match = "StartupAsync", Constraint = "Component != 'GrpcCopyClient'" },
                new OperationFailureCheckRule.Check { Match = "ShutdownAsync" },
                new OperationFailureCheckRule.Check { Match = "RestoreCheckpointAsync" },
                new OperationFailureCheckRule.Check { Match = "CreateCheckpointAsync" },
                new OperationFailureCheckRule.Check { Match = "ReconcileAsync" },
                new OperationFailureCheckRule.Check { Match = "ProcessEventsCoreAsync" },
                // TODO(jubayard): lower severity
                new OperationFailureCheckRule.Check { Match = "SendEventsCoreAsync" },
            };

            OncePerEnvironment(args =>
            {
                return failureChecks.Select(failureCheck =>
                {
                    var ruleConfiguration = new OperationFailureCheckRule.Configuration(args.BaseConfiguration, failureCheck);

                    return new Instantiation
                    {
                        Rule = new OperationFailureCheckRule(ruleConfiguration),
                        PollingPeriod = ruleConfiguration.LookbackPeriod - pollingMargin,
                    };
                });
            }, watchlist);

            // One OperationPerformanceOutliersRule is instantiated per entry of this list.
            var performanceChecks = new List<OperationPerformanceOutliersRule.DynamicCheck>
            {
                new OperationPerformanceOutliersRule.DynamicCheck
                {
                    LookbackPeriod = TimeSpan.FromMinutes(60),
                    DetectionPeriod = TimeSpan.FromMinutes(30),
                    Match = "LocalCacheServer.StartupAsync",
                    Constraint = $"Duration >= {CslTimeSpanLiteral.AsCslString(TimeSpan.FromMinutes(1))}",
                },
                new OperationPerformanceOutliersRule.DynamicCheck
                {
                    LookbackPeriod = TimeSpan.FromHours(12),
                    DetectionPeriod = TimeSpan.FromHours(1),
                    Match = "CheckpointManager.CreateCheckpointAsync",
                    Constraint = $"Duration >= {CslTimeSpanLiteral.AsCslString(TimeSpan.FromMinutes(1))}",
                },
                new OperationPerformanceOutliersRule.DynamicCheck
                {
                    LookbackPeriod = TimeSpan.FromHours(12),
                    DetectionPeriod = TimeSpan.FromHours(1),
                    Match = "CheckpointManager.RestoreCheckpointAsync",
                    Constraint = $"Duration >= P95 and P95 >= {CslTimeSpanLiteral.AsCslString(TimeSpan.FromMinutes(30))}",
                },
            };

            OncePerEnvironment(args =>
            {
                return performanceChecks.Select(performanceCheck =>
                {
                    var ruleConfiguration = new OperationPerformanceOutliersRule.Configuration(args.BaseConfiguration, performanceCheck);

                    return new Instantiation
                    {
                        Rule = new OperationPerformanceOutliersRule(ruleConfiguration),
                        PollingPeriod = performanceCheck.DetectionPeriod - pollingMargin,
                    };
                });
            }, watchlist);

            OncePerEnvironment(args =>
            {
                var ruleConfiguration = new ServiceRestartsRule.Configuration(args.BaseConfiguration);
                var instantiation = new Instantiation
                {
                    Rule = new ServiceRestartsRule(ruleConfiguration),
                    PollingPeriod = TimeSpan.FromMinutes(30),
                };

                return Analysis.Utilities.Yield(instantiation);
            }, watchlist);

            OncePerEnvironment(args =>
            {
                var ruleConfiguration = new LongCopyRule.Configuration(args.BaseConfiguration);
                var instantiation = new Instantiation
                {
                    Rule = new LongCopyRule(ruleConfiguration),
                    PollingPeriod = TimeSpan.FromMinutes(30),
                };

                return Analysis.Utilities.Yield(instantiation);
            }, watchlist);

            OncePerEnvironment(args =>
            {
                var ruleConfiguration = new MachineReimagesRule.Configuration(args.BaseConfiguration);
                var instantiation = new Instantiation
                {
                    Rule = new MachineReimagesRule(ruleConfiguration),
                    PollingPeriod = TimeSpan.FromMinutes(30),
                };

                return Analysis.Utilities.Yield(instantiation);
            }, watchlist);

            OncePerStamp(GenerateRedisAutoscalingRules, watchlist);
        }
Exemple #3
0
        /// <summary>
        /// Instantiates rules once for every stamp listed in the watchlist and registers them with the scheduler.
        /// </summary>
        /// <param name="generator">Produces the rule instantiations for one stamp from the arguments built for it.</param>
        /// <param name="watchlist">Watchlist whose entries pair each stamp identifier with its dynamic properties.</param>
        private void OncePerStamp(Func<SingleStampRuleArguments, IEnumerable<Instantiation>> generator, Watchlist watchlist)
        {
            foreach (var (stampId, properties) in watchlist.Entries)
            {
                // Both lookups are keyed by the environment the stamp belongs to.
                var perEnvironmentSettings = _configuration.Environments[stampId.Environment];
                var perEnvironmentResources = _environmentResources[stampId.Environment];

                var stampConfiguration = new SingleStampRuleConfiguration(
                    _clock,
                    _logger,
                    _alertNotifier,
                    perEnvironmentResources.KustoQueryClient,
                    _icmClient,
                    perEnvironmentSettings.KustoDatabaseName,
                    properties.CacheTableName,
                    stampId);

                var arguments = new SingleStampRuleArguments
                {
                    StampId = stampId,
                    DynamicStampProperties = properties,
                    BaseConfiguration = stampConfiguration,
                    EnvironmentResources = perEnvironmentResources,
                };

                // Every instantiation must carry a rule before it is handed to the scheduler.
                foreach (var instantiation in generator(arguments))
                {
                    Contract.AssertNotNull(instantiation.Rule);
                    _scheduler.Add(instantiation.Rule, instantiation.PollingPeriod, instantiation.ForceRun);
                }
            }
        }
Exemple #4
0
        /// <summary>
        /// Instantiates rules once for every watchlist entry and registers them with the scheduler.
        /// </summary>
        /// <param name="generator">Produces the rule instantiations for one entry from the Kusto configuration built for it.</param>
        /// <param name="watchlist">Watchlist providing the entries and the cache table name of each of them.</param>
        private void OncePerStamp(Func<KustoRuleConfiguration, IEnumerable<Instantiation>> generator, Watchlist watchlist)
        {
            foreach (var stamp in watchlist.Entries)
            {
                // The entry comes from the watchlist itself, so its table name is asserted to be resolvable.
                var hasTableName = watchlist.TryGetCacheTableName(stamp, out var tableName);
                Contract.Assert(hasTableName);

                var ruleConfiguration = new KustoRuleConfiguration
                {
                    Clock = _clock,
                    Logger = _logger,
                    Notifier = _alertNotifier,
                    CslQueryProvider = _cslQueryProvider,
                    KustoDatabaseName = EnvironmentToKustoDatabaseName[stamp.Environment],
                    Environment = stamp.Environment,
                    Stamp = stamp.Stamp,
                    CacheTableName = tableName,
                };

                // Every instantiation must carry a rule before it is handed to the scheduler.
                foreach (var instantiation in generator(ruleConfiguration))
                {
                    Contract.AssertNotNull(instantiation.Rule);
                    _scheduler.Add(instantiation.Rule, instantiation.PollingPeriod, instantiation.ForceRun);
                }
            }
        }