/// <summary>
/// Runs <paramref name="generator"/> once for every distinct environment present in
/// <paramref name="watchlist"/> and adds each rule it produces to the scheduler.
/// </summary>
/// <param name="generator">Produces the rule instantiations for one environment from that environment's arguments.</param>
/// <param name="watchlist">Source of the (environment, cache table name) pairs; also passed to every rule configuration.</param>
private void OncePerEnvironment(Func<MultiStampRuleArguments, IEnumerable<Instantiation>> generator, Watchlist watchlist)
{
    // Reduce the watchlist to one cache table name per environment. ToDictionary throws when an
    // environment shows up with more than one distinct cache table name.
    var cacheTableByEnvironment = watchlist.Entries
        .Select(entry => (Environment: entry.Key.Environment, CacheTableName: entry.Value.CacheTableName))
        .Distinct()
        .ToDictionary(pair => pair.Environment, pair => pair.CacheTableName);

    foreach (var entry in cacheTableByEnvironment)
    {
        var environment = entry.Key;
        var cacheTableName = entry.Value;

        var resources = _environmentResources[environment];
        var queryClient = resources.KustoQueryClient;
        var databaseName = _configuration.Environments[environment].KustoDatabaseName;

        var baseConfiguration = new MultiStampRuleConfiguration(
            _clock,
            _logger,
            _alertNotifier,
            queryClient,
            _icmClient,
            databaseName,
            cacheTableName,
            environment,
            watchlist);

        var arguments = new MultiStampRuleArguments
        {
            BaseConfiguration = baseConfiguration,
            EnvironmentResources = resources,
        };

        foreach (var instantiation in generator(arguments))
        {
            var rule = instantiation.Rule;
            Contract.AssertNotNull(rule);
            _scheduler.Add(rule, instantiation.PollingPeriod, instantiation.ForceRun);
        }
    }
}
/// <summary>
/// Builds the schedule of rules to run: configures every enabled rule and registers it,
/// either once per environment or once per stamp.
/// </summary>
/// <param name="watchlist">Watchlist forwarded to <c>OncePerEnvironment</c> and <c>OncePerStamp</c>.</param>
private void CreateSchedule(Watchlist watchlist)
{
    // TODO: per-stamp configuration (some stamps are more important than others, query frequency should reflect that)
    // TODO: query weight (how much does it cost). We should adapt scheduling policy to have lighter queries prioritize earlier than the others.
    // TODO: stamp configuration knowledge. Stamp configuration affects what our thresholds should be. We should reflect that here.
    // TODO: add jitter to rules, so that queries to Kusto are spread out over time instead of all at once

    // Several polling periods below are a configured window minus five minutes.
    var fiveMinutes = TimeSpan.FromMinutes(5);

    // Registers, for every environment, the single rule instantiation built by createRule.
    void SchedulePerEnvironment(Func<MultiStampRuleArguments, Instantiation> createRule)
    {
        OncePerEnvironment(args => Analysis.Utilities.Yield(createRule(args)), watchlist);
    }

    SchedulePerEnvironment(args =>
    {
        var ruleConfiguration = new LastProducedCheckpointRule.Configuration(args.BaseConfiguration);
        return new Instantiation
        {
            Rule = new LastProducedCheckpointRule(ruleConfiguration),
            PollingPeriod = TimeSpan.FromMinutes(40),
        };
    });

    SchedulePerEnvironment(args =>
    {
        var ruleConfiguration = new LastRestoredCheckpointRule.Configuration(args.BaseConfiguration);
        return new Instantiation
        {
            Rule = new LastRestoredCheckpointRule(ruleConfiguration),
            PollingPeriod = TimeSpan.FromMinutes(30),
        };
    });

    SchedulePerEnvironment(args =>
    {
        var ruleConfiguration = new CheckpointSizeRule.Configuration(args.BaseConfiguration);
        return new Instantiation
        {
            Rule = new CheckpointSizeRule(ruleConfiguration),
            PollingPeriod = ruleConfiguration.AnomalyDetectionHorizon - fiveMinutes,
        };
    });

    // TODO: this rule is too noisy and inaccurate, we should make it work again
    // Disabled: ActiveMachinesRule, previously scheduled once per stamp with
    // PollingPeriod = configuration.AnomalyDetectionHorizon - 5 minutes.

    SchedulePerEnvironment(args =>
    {
        var ruleConfiguration = new EventHubProcessingDelayRule.Configuration(args.BaseConfiguration);
        return new Instantiation
        {
            Rule = new EventHubProcessingDelayRule(ruleConfiguration),
            PollingPeriod = TimeSpan.FromMinutes(30),
        };
    });

    SchedulePerEnvironment(args =>
    {
        var ruleConfiguration = new BuildFailuresRule.Configuration(args.BaseConfiguration);
        return new Instantiation
        {
            Rule = new BuildFailuresRule(ruleConfiguration),
            PollingPeriod = TimeSpan.FromMinutes(45),
        };
    });

    // TODO: fire-and-forget exceptions are now being reported on the dashboards. We should see if this can be recycled.
    // Disabled: FireAndForgetExceptionsRule, previously scheduled once per stamp with
    // PollingPeriod = configuration.LookbackPeriod - 5 minutes.

    // TODO: this was just too noisy
    // Disabled: ContractViolationsRule, previously scheduled once per stamp with
    // PollingPeriod = configuration.LookbackPeriod.

    // One OperationFailureCheckRule is scheduled per entry, per environment.
    var failureChecks = new List<OperationFailureCheckRule.Check>
    {
        new OperationFailureCheckRule.Check
        {
            Match = "StartupAsync",
            Constraint = "Component != 'GrpcCopyClient'",
        },
        new OperationFailureCheckRule.Check { Match = "ShutdownAsync" },
        new OperationFailureCheckRule.Check { Match = "RestoreCheckpointAsync" },
        new OperationFailureCheckRule.Check { Match = "CreateCheckpointAsync" },
        new OperationFailureCheckRule.Check { Match = "ReconcileAsync" },
        new OperationFailureCheckRule.Check { Match = "ProcessEventsCoreAsync" },
        new OperationFailureCheckRule.Check
        {
            // TODO(jubayard): lower severity
            Match = "SendEventsCoreAsync",
        },
    };

    OncePerEnvironment(
        args => failureChecks.Select(check =>
        {
            var ruleConfiguration = new OperationFailureCheckRule.Configuration(args.BaseConfiguration, check);
            return new Instantiation
            {
                Rule = new OperationFailureCheckRule(ruleConfiguration),
                PollingPeriod = ruleConfiguration.LookbackPeriod - fiveMinutes,
            };
        }),
        watchlist);

    // One OperationPerformanceOutliersRule is scheduled per entry, per environment.
    var performanceChecks = new List<OperationPerformanceOutliersRule.DynamicCheck>
    {
        new OperationPerformanceOutliersRule.DynamicCheck
        {
            LookbackPeriod = TimeSpan.FromMinutes(60),
            DetectionPeriod = TimeSpan.FromMinutes(30),
            Match = "LocalCacheServer.StartupAsync",
            Constraint = $"Duration >= {CslTimeSpanLiteral.AsCslString(TimeSpan.FromMinutes(1))}",
        },
        new OperationPerformanceOutliersRule.DynamicCheck
        {
            LookbackPeriod = TimeSpan.FromHours(12),
            DetectionPeriod = TimeSpan.FromHours(1),
            Match = "CheckpointManager.CreateCheckpointAsync",
            Constraint = $"Duration >= {CslTimeSpanLiteral.AsCslString(TimeSpan.FromMinutes(1))}",
        },
        new OperationPerformanceOutliersRule.DynamicCheck
        {
            LookbackPeriod = TimeSpan.FromHours(12),
            DetectionPeriod = TimeSpan.FromHours(1),
            Match = "CheckpointManager.RestoreCheckpointAsync",
            Constraint = $"Duration >= P95 and P95 >= {CslTimeSpanLiteral.AsCslString(TimeSpan.FromMinutes(30))}",
        },
    };

    OncePerEnvironment(
        args => performanceChecks.Select(check =>
        {
            var ruleConfiguration = new OperationPerformanceOutliersRule.Configuration(args.BaseConfiguration, check);
            return new Instantiation
            {
                Rule = new OperationPerformanceOutliersRule(ruleConfiguration),
                PollingPeriod = check.DetectionPeriod - fiveMinutes,
            };
        }),
        watchlist);

    SchedulePerEnvironment(args =>
    {
        var ruleConfiguration = new ServiceRestartsRule.Configuration(args.BaseConfiguration);
        return new Instantiation
        {
            Rule = new ServiceRestartsRule(ruleConfiguration),
            PollingPeriod = TimeSpan.FromMinutes(30),
        };
    });

    SchedulePerEnvironment(args =>
    {
        var ruleConfiguration = new LongCopyRule.Configuration(args.BaseConfiguration);
        return new Instantiation
        {
            Rule = new LongCopyRule(ruleConfiguration),
            PollingPeriod = TimeSpan.FromMinutes(30),
        };
    });

    SchedulePerEnvironment(args =>
    {
        var ruleConfiguration = new MachineReimagesRule.Configuration(args.BaseConfiguration);
        return new Instantiation
        {
            Rule = new MachineReimagesRule(ruleConfiguration),
            PollingPeriod = TimeSpan.FromMinutes(30),
        };
    });

    OncePerStamp(GenerateRedisAutoscalingRules, watchlist);
}
/// <summary>
/// Runs <paramref name="generator"/> once for every stamp entry in <paramref name="watchlist"/>
/// and adds each rule it produces to the scheduler.
/// </summary>
/// <param name="generator">Produces the rule instantiations for one stamp from that stamp's arguments.</param>
/// <param name="watchlist">Supplies the stamps, and their dynamic properties, to schedule over.</param>
private void OncePerStamp(Func<SingleStampRuleArguments, IEnumerable<Instantiation>> generator, Watchlist watchlist)
{
    foreach (var (stamp, stampProperties) in watchlist.Entries)
    {
        // Both lookups are keyed by the environment the stamp belongs to.
        var environmentSettings = _configuration.Environments[stamp.Environment];
        var resources = _environmentResources[stamp.Environment];

        var queryClient = resources.KustoQueryClient;
        var databaseName = environmentSettings.KustoDatabaseName;
        var cacheTableName = stampProperties.CacheTableName;

        var baseConfiguration = new SingleStampRuleConfiguration(
            _clock,
            _logger,
            _alertNotifier,
            queryClient,
            _icmClient,
            databaseName,
            cacheTableName,
            stamp);

        var arguments = new SingleStampRuleArguments
        {
            StampId = stamp,
            DynamicStampProperties = stampProperties,
            BaseConfiguration = baseConfiguration,
            EnvironmentResources = resources,
        };

        foreach (var instantiation in generator(arguments))
        {
            var rule = instantiation.Rule;
            Contract.AssertNotNull(rule);
            _scheduler.Add(rule, instantiation.PollingPeriod, instantiation.ForceRun);
        }
    }
}
/// <summary>
/// Runs <paramref name="generator"/> once for every entry in <paramref name="watchlist"/>, handing it a
/// Kusto rule configuration built for that entry, and adds each rule it produces to the scheduler.
/// </summary>
/// <param name="generator">Produces the rule instantiations for one entry from its Kusto rule configuration.</param>
/// <param name="watchlist">Supplies the entries to schedule over and resolves their cache table names.</param>
private void OncePerStamp(Func<KustoRuleConfiguration, IEnumerable<Instantiation>> generator, Watchlist watchlist)
{
    foreach (var stamp in watchlist.Entries)
    {
        // Every watchlist entry is expected to resolve to a cache table name.
        var hasCacheTable = watchlist.TryGetCacheTableName(stamp, out var tableName);
        Contract.Assert(hasCacheTable);

        var ruleConfiguration = new KustoRuleConfiguration
        {
            Clock = _clock,
            Logger = _logger,
            Notifier = _alertNotifier,
            CslQueryProvider = _cslQueryProvider,
            KustoDatabaseName = EnvironmentToKustoDatabaseName[stamp.Environment],
            Environment = stamp.Environment,
            Stamp = stamp.Stamp,
            CacheTableName = tableName,
        };

        foreach (var instantiation in generator(ruleConfiguration))
        {
            var rule = instantiation.Rule;
            Contract.AssertNotNull(rule);
            _scheduler.Add(rule, instantiation.PollingPeriod, instantiation.ForceRun);
        }
    }
}