/// <summary>
/// Builds the schedule of rules to run and supplies each rule with its configuration.
/// </summary>
/// <param name="watchlist">Forwarded unchanged to every <c>OncePerEnvironment</c> / <c>OncePerStamp</c> registration below.</param>
private void CreateSchedule(Watchlist watchlist)
{
    // TODO: per-stamp configuration (some stamps are more important than others, query frequency should reflect that)
    // TODO: query weight (how much does it cost). We should adapt scheduling policy to have lighter queries prioritize earlier than the others.
    // TODO: stamp configuration knowledge. Stamp configuration affects what our thresholds should be. We should reflect that here.
    // TODO: add jitter to rules, so that queries to Kusto are spread out over time instead of all at once

    // Subtracted from a rule's horizon / lookback / detection period to obtain its polling period.
    var pollingMargin = TimeSpan.FromMinutes(5);

    // Polling period shared by most of the fixed-interval rules below.
    var halfHour = TimeSpan.FromMinutes(30);

    // --- Checkpoint rules ---
    OncePerEnvironment(
        args =>
        {
            var ruleConfiguration = new LastProducedCheckpointRule.Configuration(args.BaseConfiguration);
            return Analysis.Utilities.Yield(new Instantiation
            {
                Rule = new LastProducedCheckpointRule(ruleConfiguration),
                PollingPeriod = TimeSpan.FromMinutes(40),
            });
        },
        watchlist);

    OncePerEnvironment(
        args =>
        {
            var ruleConfiguration = new LastRestoredCheckpointRule.Configuration(args.BaseConfiguration);
            return Analysis.Utilities.Yield(new Instantiation
            {
                Rule = new LastRestoredCheckpointRule(ruleConfiguration),
                PollingPeriod = halfHour,
            });
        },
        watchlist);

    OncePerEnvironment(
        args =>
        {
            var ruleConfiguration = new CheckpointSizeRule.Configuration(args.BaseConfiguration);
            return Analysis.Utilities.Yield(new Instantiation
            {
                Rule = new CheckpointSizeRule(ruleConfiguration),
                PollingPeriod = ruleConfiguration.AnomalyDetectionHorizon - pollingMargin,
            });
        },
        watchlist);

    // TODO: this rule is too noisy and inaccurate, we should make it work again
    //OncePerStamp(arguments =>
    //{
    //    var configuration = new ActiveMachinesRule.Configuration(arguments.BaseConfiguration);
    //    return Analysis.Utilities.Yield(new Instantiation()
    //    {
    //        Rule = new ActiveMachinesRule(configuration),
    //        PollingPeriod = configuration.AnomalyDetectionHorizon - TimeSpan.FromMinutes(5),
    //    });
    //}, watchlist);

    // --- Event hub and build rules ---
    OncePerEnvironment(
        args =>
        {
            var ruleConfiguration = new EventHubProcessingDelayRule.Configuration(args.BaseConfiguration);
            return Analysis.Utilities.Yield(new Instantiation
            {
                Rule = new EventHubProcessingDelayRule(ruleConfiguration),
                PollingPeriod = halfHour,
            });
        },
        watchlist);

    OncePerEnvironment(
        args =>
        {
            var ruleConfiguration = new BuildFailuresRule.Configuration(args.BaseConfiguration);
            return Analysis.Utilities.Yield(new Instantiation
            {
                Rule = new BuildFailuresRule(ruleConfiguration),
                PollingPeriod = TimeSpan.FromMinutes(45),
            });
        },
        watchlist);

    // TODO: fire-and-forget exceptions are now being reported on the dashboards. We should see if this can be recycled.
    //OncePerStamp(arguments =>
    //{
    //    var configuration = new FireAndForgetExceptionsRule.Configuration(arguments.BaseConfiguration);
    //    return Analysis.Utilities.Yield(new Instantiation()
    //    {
    //        Rule = new FireAndForgetExceptionsRule(configuration),
    //        PollingPeriod = configuration.LookbackPeriod - TimeSpan.FromMinutes(5),
    //    });
    //}, watchlist);

    // TODO: this was just too noisy
    //OncePerStamp(arguments =>
    //{
    //    var configuration = new ContractViolationsRule.Configuration(arguments.BaseConfiguration);
    //    return Analysis.Utilities.Yield(new Instantiation() {
    //        Rule = new ContractViolationsRule(configuration),
    //        PollingPeriod = configuration.LookbackPeriod,
    //    });
    //}, watchlist);

    // --- Operation failure checks: one rule instance is produced per entry ---
    var failureChecks = new List<OperationFailureCheckRule.Check>
    {
        new OperationFailureCheckRule.Check
        {
            Match = "StartupAsync",
            Constraint = "Component != 'GrpcCopyClient'",
        },
        new OperationFailureCheckRule.Check { Match = "ShutdownAsync" },
        new OperationFailureCheckRule.Check { Match = "RestoreCheckpointAsync" },
        new OperationFailureCheckRule.Check { Match = "CreateCheckpointAsync" },
        new OperationFailureCheckRule.Check { Match = "ReconcileAsync" },
        new OperationFailureCheckRule.Check { Match = "ProcessEventsCoreAsync" },
        new OperationFailureCheckRule.Check
        {
            // TODO(jubayard): lower severity
            Match = "SendEventsCoreAsync",
        },
    };

    // The projection is returned lazily (no ToList), exactly as handed to OncePerEnvironment.
    OncePerEnvironment(
        args => failureChecks.Select(failureCheck =>
        {
            var ruleConfiguration = new OperationFailureCheckRule.Configuration(args.BaseConfiguration, failureCheck);
            return new Instantiation
            {
                Rule = new OperationFailureCheckRule(ruleConfiguration),
                PollingPeriod = ruleConfiguration.LookbackPeriod - pollingMargin,
            };
        }),
        watchlist);

    // --- Operation performance outlier checks: one rule instance is produced per entry ---
    var performanceChecks = new List<OperationPerformanceOutliersRule.DynamicCheck>
    {
        new OperationPerformanceOutliersRule.DynamicCheck
        {
            LookbackPeriod = TimeSpan.FromMinutes(60),
            DetectionPeriod = TimeSpan.FromMinutes(30),
            Match = "LocalCacheServer.StartupAsync",
            Constraint = $"Duration >= {CslTimeSpanLiteral.AsCslString(TimeSpan.FromMinutes(1))}",
        },
        new OperationPerformanceOutliersRule.DynamicCheck
        {
            LookbackPeriod = TimeSpan.FromHours(12),
            DetectionPeriod = TimeSpan.FromHours(1),
            Match = "CheckpointManager.CreateCheckpointAsync",
            Constraint = $"Duration >= {CslTimeSpanLiteral.AsCslString(TimeSpan.FromMinutes(1))}",
        },
        new OperationPerformanceOutliersRule.DynamicCheck
        {
            LookbackPeriod = TimeSpan.FromHours(12),
            DetectionPeriod = TimeSpan.FromHours(1),
            Match = "CheckpointManager.RestoreCheckpointAsync",
            Constraint = $"Duration >= P95 and P95 >= {CslTimeSpanLiteral.AsCslString(TimeSpan.FromMinutes(30))}",
        },
    };

    OncePerEnvironment(
        args => performanceChecks.Select(performanceCheck =>
        {
            var ruleConfiguration = new OperationPerformanceOutliersRule.Configuration(args.BaseConfiguration, performanceCheck);
            return new Instantiation
            {
                Rule = new OperationPerformanceOutliersRule(ruleConfiguration),
                // Polling period is derived from the check's own detection period.
                PollingPeriod = performanceCheck.DetectionPeriod - pollingMargin,
            };
        }),
        watchlist);

    // --- Service and machine rules, all polled every 30 minutes ---
    OncePerEnvironment(
        args =>
        {
            var ruleConfiguration = new ServiceRestartsRule.Configuration(args.BaseConfiguration);
            return Analysis.Utilities.Yield(new Instantiation
            {
                Rule = new ServiceRestartsRule(ruleConfiguration),
                PollingPeriod = halfHour,
            });
        },
        watchlist);

    OncePerEnvironment(
        args =>
        {
            var ruleConfiguration = new LongCopyRule.Configuration(args.BaseConfiguration);
            return Analysis.Utilities.Yield(new Instantiation
            {
                Rule = new LongCopyRule(ruleConfiguration),
                PollingPeriod = halfHour,
            });
        },
        watchlist);

    OncePerEnvironment(
        args =>
        {
            var ruleConfiguration = new MachineReimagesRule.Configuration(args.BaseConfiguration);
            return Analysis.Utilities.Yield(new Instantiation
            {
                Rule = new MachineReimagesRule(ruleConfiguration),
                PollingPeriod = halfHour,
            });
        },
        watchlist);

    OncePerEnvironment(
        args =>
        {
            var ruleConfiguration = new DiskCorruptionRule.Configuration(args.BaseConfiguration);
            return Analysis.Utilities.Yield(new Instantiation
            {
                Rule = new DiskCorruptionRule(ruleConfiguration),
                PollingPeriod = halfHour,
            });
        },
        watchlist);

    // The only registration made per stamp; its rules come from GenerateRedisAutoscalingRules.
    OncePerStamp(GenerateRedisAutoscalingRules, watchlist);
}
/// <summary>
/// Builds the schedule of rules to run and supplies each rule with its configuration.
/// Every rule in this overload is registered through <c>OncePerStamp</c>.
/// </summary>
private void CreateSchedule()
{
    // Subtracted from a rule's horizon / lookback / detection period to obtain its polling period.
    var pollingMargin = TimeSpan.FromMinutes(5);

    // Polling period shared by several of the fixed-interval rules below.
    var halfHour = TimeSpan.FromMinutes(30);

    // --- Checkpoint rules ---
    OncePerStamp(stampConfiguration =>
    {
        var ruleConfiguration = new LastProducedCheckpointRule.Configuration(stampConfiguration);
        return Utilities.Yield(new Instantiation
        {
            Rule = new LastProducedCheckpointRule(ruleConfiguration),
            PollingPeriod = halfHour,
        });
    });

    OncePerStamp(stampConfiguration =>
    {
        var ruleConfiguration = new LastRestoredCheckpointRule.Configuration(stampConfiguration);
        return Utilities.Yield(new Instantiation
        {
            Rule = new LastRestoredCheckpointRule(ruleConfiguration),
            PollingPeriod = halfHour,
        });
    });

    OncePerStamp(stampConfiguration =>
    {
        var ruleConfiguration = new CheckpointSizeRule.Configuration(stampConfiguration);
        return Utilities.Yield(new Instantiation
        {
            Rule = new CheckpointSizeRule(ruleConfiguration),
            PollingPeriod = ruleConfiguration.AnomalyDetectionHorizon - pollingMargin,
        });
    });

    // --- Machine, event hub and build rules ---
    OncePerStamp(stampConfiguration =>
    {
        var ruleConfiguration = new ActiveMachinesRule.Configuration(stampConfiguration);
        return Utilities.Yield(new Instantiation
        {
            Rule = new ActiveMachinesRule(ruleConfiguration),
            PollingPeriod = ruleConfiguration.AnomalyDetectionHorizon - pollingMargin,
        });
    });

    OncePerStamp(stampConfiguration =>
    {
        var ruleConfiguration = new EventHubProcessingDelayRule.Configuration(stampConfiguration);
        return Utilities.Yield(new Instantiation
        {
            Rule = new EventHubProcessingDelayRule(ruleConfiguration),
            PollingPeriod = TimeSpan.FromMinutes(20),
        });
    });

    OncePerStamp(stampConfiguration =>
    {
        var ruleConfiguration = new BuildFailuresRule.Configuration(stampConfiguration);
        return Utilities.Yield(new Instantiation
        {
            Rule = new BuildFailuresRule(ruleConfiguration),
            PollingPeriod = TimeSpan.FromMinutes(15),
        });
    });

    OncePerStamp(stampConfiguration =>
    {
        var ruleConfiguration = new FireAndForgetExceptionsRule.Configuration(stampConfiguration);
        return Utilities.Yield(new Instantiation
        {
            Rule = new FireAndForgetExceptionsRule(ruleConfiguration),
            PollingPeriod = ruleConfiguration.LookbackPeriod - pollingMargin,
        });
    });

    // Currently disabled:
    //OncePerStamp(baseConfiguration =>
    //{
    //    var configuration = new ContractViolationsRule.Configuration(baseConfiguration);
    //    return Utilities.Yield(new Instantiation() {
    //        Rule = new ContractViolationsRule(configuration),
    //        PollingPeriod = configuration.LookbackPeriod,
    //    });
    //});

    // --- Operation failure checks: one rule instance is produced per entry ---
    var failureChecks = new List<OperationFailureCheckRule.Check>
    {
        new OperationFailureCheckRule.Check { Match = "StartupAsync" },
        new OperationFailureCheckRule.Check { Match = "ShutdownAsync" },
        new OperationFailureCheckRule.Check { Match = "RestoreCheckpointAsync" },
        new OperationFailureCheckRule.Check { Match = "CreateCheckpointAsync" },
        new OperationFailureCheckRule.Check { Match = "ReconcileAsync" },
        new OperationFailureCheckRule.Check { Match = "ProcessEventsCoreAsync" },
        new OperationFailureCheckRule.Check
        {
            // TODO(jubayard): lower severity
            Match = "SendEventsCoreAsync",
        },
    };

    // The projection is returned lazily (no ToList), exactly as handed to OncePerStamp.
    OncePerStamp(stampConfiguration => failureChecks.Select(failureCheck =>
    {
        var ruleConfiguration = new OperationFailureCheckRule.Configuration(stampConfiguration)
        {
            Check = failureCheck,
        };

        return new Instantiation
        {
            Rule = new OperationFailureCheckRule(ruleConfiguration),
            PollingPeriod = ruleConfiguration.LookbackPeriod - pollingMargin,
        };
    }));

    // --- Operation performance outlier checks: one rule instance is produced per entry ---
    // Constraints compare TimeMs against thresholds expressed in milliseconds.
    var performanceChecks = new List<OperationPerformanceOutliersRule.DynamicCheck>
    {
        new OperationPerformanceOutliersRule.DynamicCheck
        {
            LookbackPeriod = TimeSpan.FromMinutes(60),
            DetectionPeriod = TimeSpan.FromMinutes(30),
            Match = "LocalContentServer.StartupAsync",
            Constraint = $"TimeMs >= {TimeSpan.FromMinutes(1).TotalMilliseconds}",
        },
        new OperationPerformanceOutliersRule.DynamicCheck
        {
            LookbackPeriod = TimeSpan.FromMinutes(60),
            DetectionPeriod = TimeSpan.FromMinutes(30),
            Match = "LocalCacheServer.StartupAsync",
            Constraint = $"TimeMs >= {TimeSpan.FromMinutes(1).TotalMilliseconds}",
        },
        new OperationPerformanceOutliersRule.DynamicCheck
        {
            LookbackPeriod = TimeSpan.FromMinutes(60),
            DetectionPeriod = TimeSpan.FromMinutes(30),
            Match = "RedisGlobalStore.RegisterLocalLocationAsync",
            Constraint = $"TimeMs >= {TimeSpan.FromSeconds(30).TotalMilliseconds}",
        },
        new OperationPerformanceOutliersRule.DynamicCheck
        {
            LookbackPeriod = TimeSpan.FromDays(1),
            DetectionPeriod = TimeSpan.FromHours(1),
            Match = "CheckpointManager.CreateCheckpointAsync",
            Constraint = $"TimeMs >= {TimeSpan.FromMinutes(1).TotalMilliseconds}",
        },
        new OperationPerformanceOutliersRule.DynamicCheck
        {
            LookbackPeriod = TimeSpan.FromDays(1),
            DetectionPeriod = TimeSpan.FromHours(1),
            Match = "CheckpointManager.RestoreCheckpointAsync",
            Constraint = $"TimeMs >= P95 and P95 >= {TimeSpan.FromMinutes(30).TotalMilliseconds}",
        },
    };

    OncePerStamp(stampConfiguration => performanceChecks.Select(performanceCheck =>
    {
        var ruleConfiguration = new OperationPerformanceOutliersRule.Configuration(stampConfiguration)
        {
            Check = performanceCheck,
        };

        return new Instantiation
        {
            Rule = new OperationPerformanceOutliersRule(ruleConfiguration),
            // Polling period is derived from the check's own detection period.
            PollingPeriod = performanceCheck.DetectionPeriod - pollingMargin,
        };
    }));

    // --- Service restarts ---
    OncePerStamp(stampConfiguration =>
    {
        var ruleConfiguration = new ServiceRestartsRule.Configuration(stampConfiguration);
        return Utilities.Yield(new Instantiation
        {
            Rule = new ServiceRestartsRule(ruleConfiguration),
            PollingPeriod = halfHour,
        });
    });
}