Пример #1
0
        /// <summary>
        /// Builds the schedule of rules that will be run. Every rule is created together with its
        /// configuration and polling period, and handed to <c>OncePerEnvironment</c> or <c>OncePerStamp</c>.
        /// </summary>
        /// <param name="watchlist">Passed unchanged to every scheduling call made by this method.</param>
        private void CreateSchedule(Watchlist watchlist)
        {
            // TODO: per-stamp configuration (some stamps are more important than others, query frequency should reflect that)
            // TODO: query weight (how much does it cost). We should adapt scheduling policy to have lighter queries prioritize earlier than the others.
            // TODO: stamp configuration knowledge. Stamp configuration affects what our thresholds should be. We should reflect that here.
            // TODO: add jitter to rules, so that queries to Kusto are spread out over time instead of all at once

            // Rules whose polling period is derived from a configured window poll this much sooner than the window length.
            var pollingMargin = TimeSpan.FromMinutes(5);

            OncePerEnvironment(args =>
            {
                var settings = new LastProducedCheckpointRule.Configuration(args.BaseConfiguration);
                var instantiation = new Instantiation
                {
                    Rule = new LastProducedCheckpointRule(settings),
                    PollingPeriod = TimeSpan.FromMinutes(40),
                };

                return Analysis.Utilities.Yield(instantiation);
            }, watchlist);

            OncePerEnvironment(args =>
            {
                var settings = new LastRestoredCheckpointRule.Configuration(args.BaseConfiguration);
                var instantiation = new Instantiation
                {
                    Rule = new LastRestoredCheckpointRule(settings),
                    PollingPeriod = TimeSpan.FromMinutes(30),
                };

                return Analysis.Utilities.Yield(instantiation);
            }, watchlist);

            OncePerEnvironment(args =>
            {
                var settings = new CheckpointSizeRule.Configuration(args.BaseConfiguration);
                var instantiation = new Instantiation
                {
                    Rule = new CheckpointSizeRule(settings),
                    PollingPeriod = settings.AnomalyDetectionHorizon - pollingMargin,
                };

                return Analysis.Utilities.Yield(instantiation);
            }, watchlist);

            // TODO: this rule is too noisy and inaccurate, we should make it work again
            //OncePerStamp(args =>
            //{
            //    var settings = new ActiveMachinesRule.Configuration(args.BaseConfiguration);
            //    return Analysis.Utilities.Yield(new Instantiation
            //    {
            //        Rule = new ActiveMachinesRule(settings),
            //        PollingPeriod = settings.AnomalyDetectionHorizon - pollingMargin,
            //    });
            //}, watchlist);

            OncePerEnvironment(args =>
            {
                var settings = new EventHubProcessingDelayRule.Configuration(args.BaseConfiguration);
                var instantiation = new Instantiation
                {
                    Rule = new EventHubProcessingDelayRule(settings),
                    PollingPeriod = TimeSpan.FromMinutes(30),
                };

                return Analysis.Utilities.Yield(instantiation);
            }, watchlist);

            OncePerEnvironment(args =>
            {
                var settings = new BuildFailuresRule.Configuration(args.BaseConfiguration);
                var instantiation = new Instantiation
                {
                    Rule = new BuildFailuresRule(settings),
                    PollingPeriod = TimeSpan.FromMinutes(45),
                };

                return Analysis.Utilities.Yield(instantiation);
            }, watchlist);

            // TODO: fire-and-forget exceptions are now being reported on the dashboards. We should see if this can be recycled.
            //OncePerStamp(args =>
            //{
            //    var settings = new FireAndForgetExceptionsRule.Configuration(args.BaseConfiguration);
            //    return Analysis.Utilities.Yield(new Instantiation
            //    {
            //        Rule = new FireAndForgetExceptionsRule(settings),
            //        PollingPeriod = settings.LookbackPeriod - pollingMargin,
            //    });
            //}, watchlist);

            // TODO: this was just too noisy
            //OncePerStamp(args =>
            //{
            //    var settings = new ContractViolationsRule.Configuration(args.BaseConfiguration);
            //    return Analysis.Utilities.Yield(new Instantiation
            //    {
            //        Rule = new ContractViolationsRule(settings),
            //        PollingPeriod = settings.LookbackPeriod,
            //    });
            //}, watchlist);

            // One OperationFailureCheckRule is instantiated for each entry below.
            var failureChecks = new List<OperationFailureCheckRule.Check>
            {
                new OperationFailureCheckRule.Check { Match = "StartupAsync", Constraint = "Component != 'GrpcCopyClient'" },
                new OperationFailureCheckRule.Check { Match = "ShutdownAsync" },
                new OperationFailureCheckRule.Check { Match = "RestoreCheckpointAsync" },
                new OperationFailureCheckRule.Check { Match = "CreateCheckpointAsync" },
                new OperationFailureCheckRule.Check { Match = "ReconcileAsync" },
                new OperationFailureCheckRule.Check { Match = "ProcessEventsCoreAsync" },

                // TODO(jubayard): lower severity
                new OperationFailureCheckRule.Check { Match = "SendEventsCoreAsync" },
            };

            OncePerEnvironment(args => failureChecks.Select(failureCheck =>
            {
                var settings = new OperationFailureCheckRule.Configuration(args.BaseConfiguration, failureCheck);

                return new Instantiation
                {
                    Rule = new OperationFailureCheckRule(settings),
                    PollingPeriod = settings.LookbackPeriod - pollingMargin,
                };
            }), watchlist);

            // One OperationPerformanceOutliersRule is instantiated for each entry below.
            var performanceChecks = new List<OperationPerformanceOutliersRule.DynamicCheck>
            {
                new OperationPerformanceOutliersRule.DynamicCheck
                {
                    LookbackPeriod = TimeSpan.FromMinutes(60),
                    DetectionPeriod = TimeSpan.FromMinutes(30),
                    Match = "LocalCacheServer.StartupAsync",
                    Constraint = $"Duration >= {CslTimeSpanLiteral.AsCslString(TimeSpan.FromMinutes(1))}",
                },
                new OperationPerformanceOutliersRule.DynamicCheck
                {
                    LookbackPeriod = TimeSpan.FromHours(12),
                    DetectionPeriod = TimeSpan.FromHours(1),
                    Match = "CheckpointManager.CreateCheckpointAsync",
                    Constraint = $"Duration >= {CslTimeSpanLiteral.AsCslString(TimeSpan.FromMinutes(1))}",
                },
                new OperationPerformanceOutliersRule.DynamicCheck
                {
                    LookbackPeriod = TimeSpan.FromHours(12),
                    DetectionPeriod = TimeSpan.FromHours(1),
                    Match = "CheckpointManager.RestoreCheckpointAsync",
                    Constraint = $"Duration >= P95 and P95 >= {CslTimeSpanLiteral.AsCslString(TimeSpan.FromMinutes(30))}",
                },
            };

            OncePerEnvironment(args => performanceChecks.Select(performanceCheck =>
            {
                var settings = new OperationPerformanceOutliersRule.Configuration(args.BaseConfiguration, performanceCheck);

                return new Instantiation
                {
                    Rule = new OperationPerformanceOutliersRule(settings),
                    PollingPeriod = performanceCheck.DetectionPeriod - pollingMargin,
                };
            }), watchlist);

            OncePerEnvironment(args =>
            {
                var settings = new ServiceRestartsRule.Configuration(args.BaseConfiguration);
                var instantiation = new Instantiation
                {
                    Rule = new ServiceRestartsRule(settings),
                    PollingPeriod = TimeSpan.FromMinutes(30),
                };

                return Analysis.Utilities.Yield(instantiation);
            }, watchlist);

            OncePerEnvironment(args =>
            {
                var settings = new LongCopyRule.Configuration(args.BaseConfiguration);
                var instantiation = new Instantiation
                {
                    Rule = new LongCopyRule(settings),
                    PollingPeriod = TimeSpan.FromMinutes(30),
                };

                return Analysis.Utilities.Yield(instantiation);
            }, watchlist);

            OncePerEnvironment(args =>
            {
                var settings = new MachineReimagesRule.Configuration(args.BaseConfiguration);
                var instantiation = new Instantiation
                {
                    Rule = new MachineReimagesRule(settings),
                    PollingPeriod = TimeSpan.FromMinutes(30),
                };

                return Analysis.Utilities.Yield(instantiation);
            }, watchlist);

            OncePerEnvironment(args =>
            {
                var settings = new DiskCorruptionRule.Configuration(args.BaseConfiguration);
                var instantiation = new Instantiation
                {
                    Rule = new DiskCorruptionRule(settings),
                    PollingPeriod = TimeSpan.FromMinutes(30),
                };

                return Analysis.Utilities.Yield(instantiation);
            }, watchlist);

            OncePerStamp(GenerateRedisAutoscalingRules, watchlist);
        }
Пример #2
0
        /// <summary>
        /// Builds the schedule of rules that will be run. Every rule is created together with its
        /// configuration and polling period, and handed to <c>OncePerStamp</c>.
        /// </summary>
        private void CreateSchedule()
        {
            // Rules whose polling period is derived from a configured window poll this much sooner than the window length.
            var pollingMargin = TimeSpan.FromMinutes(5);

            OncePerStamp(baseConfiguration =>
            {
                var configuration = new LastProducedCheckpointRule.Configuration(baseConfiguration);
                return Utilities.Yield(new Instantiation
                {
                    Rule = new LastProducedCheckpointRule(configuration),
                    PollingPeriod = TimeSpan.FromMinutes(30),
                });
            });

            OncePerStamp(baseConfiguration =>
            {
                var configuration = new LastRestoredCheckpointRule.Configuration(baseConfiguration);
                return Utilities.Yield(new Instantiation
                {
                    Rule = new LastRestoredCheckpointRule(configuration),
                    PollingPeriod = TimeSpan.FromMinutes(30),
                });
            });

            OncePerStamp(baseConfiguration =>
            {
                var configuration = new CheckpointSizeRule.Configuration(baseConfiguration);
                return Utilities.Yield(new Instantiation
                {
                    Rule = new CheckpointSizeRule(configuration),
                    PollingPeriod = configuration.AnomalyDetectionHorizon - pollingMargin,
                });
            });

            OncePerStamp(baseConfiguration =>
            {
                var configuration = new ActiveMachinesRule.Configuration(baseConfiguration);
                return Utilities.Yield(new Instantiation
                {
                    Rule = new ActiveMachinesRule(configuration),
                    PollingPeriod = configuration.AnomalyDetectionHorizon - pollingMargin,
                });
            });

            OncePerStamp(baseConfiguration =>
            {
                var configuration = new EventHubProcessingDelayRule.Configuration(baseConfiguration);
                return Utilities.Yield(new Instantiation
                {
                    Rule = new EventHubProcessingDelayRule(configuration),
                    PollingPeriod = TimeSpan.FromMinutes(20),
                });
            });

            OncePerStamp(baseConfiguration =>
            {
                var configuration = new BuildFailuresRule.Configuration(baseConfiguration);
                return Utilities.Yield(new Instantiation
                {
                    Rule = new BuildFailuresRule(configuration),
                    PollingPeriod = TimeSpan.FromMinutes(15),
                });
            });

            OncePerStamp(baseConfiguration =>
            {
                var configuration = new FireAndForgetExceptionsRule.Configuration(baseConfiguration);
                return Utilities.Yield(new Instantiation
                {
                    Rule = new FireAndForgetExceptionsRule(configuration),
                    PollingPeriod = configuration.LookbackPeriod - pollingMargin,
                });
            });

            // The ContractViolationsRule registration below is currently disabled.
            //OncePerStamp(baseConfiguration =>
            //{
            //    var configuration = new ContractViolationsRule.Configuration(baseConfiguration);
            //    return Utilities.Yield(new Instantiation() {
            //        Rule = new ContractViolationsRule(configuration),
            //        PollingPeriod = configuration.LookbackPeriod,
            //    });
            //});

            // One OperationFailureCheckRule is instantiated for each entry below.
            var failureChecks = new List<OperationFailureCheckRule.Check>
            {
                new OperationFailureCheckRule.Check { Match = "StartupAsync" },
                new OperationFailureCheckRule.Check { Match = "ShutdownAsync" },
                new OperationFailureCheckRule.Check { Match = "RestoreCheckpointAsync" },
                new OperationFailureCheckRule.Check { Match = "CreateCheckpointAsync" },
                new OperationFailureCheckRule.Check { Match = "ReconcileAsync" },
                new OperationFailureCheckRule.Check { Match = "ProcessEventsCoreAsync" },

                // TODO(jubayard): lower severity
                new OperationFailureCheckRule.Check { Match = "SendEventsCoreAsync" },
            };

            OncePerStamp(baseConfiguration => failureChecks.Select(check =>
            {
                var configuration = new OperationFailureCheckRule.Configuration(baseConfiguration)
                {
                    Check = check,
                };

                return new Instantiation
                {
                    Rule = new OperationFailureCheckRule(configuration),
                    PollingPeriod = configuration.LookbackPeriod - pollingMargin,
                };
            }));

            // One OperationPerformanceOutliersRule is instantiated for each entry below.
            // The constraints are query text, not user-facing text, so the millisecond thresholds are
            // formatted with the invariant culture rather than whatever culture the current thread has.
            var performanceChecks = new List<OperationPerformanceOutliersRule.DynamicCheck>
            {
                new OperationPerformanceOutliersRule.DynamicCheck
                {
                    LookbackPeriod = TimeSpan.FromMinutes(60),
                    DetectionPeriod = TimeSpan.FromMinutes(30),
                    Match = "LocalContentServer.StartupAsync",
                    Constraint = FormattableString.Invariant($"TimeMs >= {TimeSpan.FromMinutes(1).TotalMilliseconds}"),
                },
                new OperationPerformanceOutliersRule.DynamicCheck
                {
                    LookbackPeriod = TimeSpan.FromMinutes(60),
                    DetectionPeriod = TimeSpan.FromMinutes(30),
                    Match = "LocalCacheServer.StartupAsync",
                    Constraint = FormattableString.Invariant($"TimeMs >= {TimeSpan.FromMinutes(1).TotalMilliseconds}"),
                },
                new OperationPerformanceOutliersRule.DynamicCheck
                {
                    LookbackPeriod = TimeSpan.FromMinutes(60),
                    DetectionPeriod = TimeSpan.FromMinutes(30),
                    Match = "RedisGlobalStore.RegisterLocalLocationAsync",
                    Constraint = FormattableString.Invariant($"TimeMs >= {TimeSpan.FromSeconds(30).TotalMilliseconds}"),
                },
                new OperationPerformanceOutliersRule.DynamicCheck
                {
                    LookbackPeriod = TimeSpan.FromDays(1),
                    DetectionPeriod = TimeSpan.FromHours(1),
                    Match = "CheckpointManager.CreateCheckpointAsync",
                    Constraint = FormattableString.Invariant($"TimeMs >= {TimeSpan.FromMinutes(1).TotalMilliseconds}"),
                },
                new OperationPerformanceOutliersRule.DynamicCheck
                {
                    LookbackPeriod = TimeSpan.FromDays(1),
                    DetectionPeriod = TimeSpan.FromHours(1),
                    Match = "CheckpointManager.RestoreCheckpointAsync",
                    Constraint = FormattableString.Invariant($"TimeMs >= P95 and P95 >= {TimeSpan.FromMinutes(30).TotalMilliseconds}"),
                },
            };

            OncePerStamp(baseConfiguration => performanceChecks.Select(check =>
            {
                var configuration = new OperationPerformanceOutliersRule.Configuration(baseConfiguration)
                {
                    Check = check,
                };

                return new Instantiation
                {
                    Rule = new OperationPerformanceOutliersRule(configuration),
                    PollingPeriod = check.DetectionPeriod - pollingMargin,
                };
            }));

            OncePerStamp(baseConfiguration =>
            {
                var configuration = new ServiceRestartsRule.Configuration(baseConfiguration);
                return Utilities.Yield(new Instantiation
                {
                    Rule = new ServiceRestartsRule(configuration),
                    PollingPeriod = TimeSpan.FromMinutes(30),
                });
            });
        }