#pragma warning restore CS0649

        public override async Task Run(RuleContext context)
        {
            var now   = _configuration.Clock.UtcNow;
            var query =
                $@"
                let end = now();
                let start = end - {CslTimeSpanLiteral.AsCslString(_configuration.LookbackPeriod)};
                table(""{_configuration.CacheTableName}"")
                | where PreciseTimeStamp between (start .. end)
                | where Stamp == ""{_configuration.Stamp}""
                | where Service == ""{Constants.ServiceName}"" or Service == ""{Constants.MasterServiceName}""
                | where Message has ""Critical error occurred:""
                | project PreciseTimeStamp, Machine, Message
                | parse Message with * ""occurred: "" ErrorMessage:string "". Diagnostics: "" Exception:string
                | parse ErrorMessage with Operation:string "" stop "" *
                | parse Exception with ExceptionType:string "": "" ExceptionMessage ""\n"" * // Fetch first line of the exception
                | extend Operation=extract(""^((\\w+).(\\w+)).*"", 1, Operation) // Prune unnecessary parameters
                | project-away Message, ErrorMessage
                | where ExceptionType != ""System.UnauthorizedAccessException""
                | summarize Machines=dcount(Machine, 2), Count=count() by ExceptionType, ExceptionMessage, Operation
                | where not(isnull(Machines))";
            var results = await QueryKustoAsync <Result>(context, query);

            foreach (var result in results)
            {
                Emit(context, $"ContractViolations_Operation_{result.Operation}", Severity.Error,
                     $"`{result.Machines}` machine(s) had `{result.Count}` contract violations (`{result.ExceptionType}`) in operation `{result.Operation}`. Example message: {result.ExceptionMessage}",
                     $"`{result.Machines}` machine(s) had `{result.Count}` contract violations (`{result.ExceptionType}`) in operation `{result.Operation}`",
                     eventTimeUtc: now);
            }
        }
#pragma warning restore CS0649

        public override async Task Run(RuleContext context)
        {
            var now   = _configuration.Clock.UtcNow;
            var query =
                $@"
                let end = now();
                let start = end - {CslTimeSpanLiteral.AsCslString(_configuration.LookbackPeriod)};
                table('{_configuration.CacheTableName}')
                | where PreciseTimeStamp between (start .. end)
                | where Result == '{Constants.ResultCode.CriticalFailure}'
                | project PreciseTimeStamp, Machine, Message, Stamp, Component, Operation
                | parse Message with * 'occurred: ' ErrorMessage:string 'Diagnostics=[' Exception:string
                | extend Operation = strcat(Component, '.', Operation)
                | parse Exception with ExceptionType:string ': ' ExceptionMessage '\n' * // Fetch first line of the exception
                | project-away Message, ErrorMessage
                | where ExceptionType != 'System.UnauthorizedAccessException'
                | summarize Machines=dcount(Machine, 2), Count=count() by ExceptionType, ExceptionMessage, Operation, Stamp
                | where not(isnull(Machines))";
            var results = await QueryKustoAsync <Result>(context, query);

            foreach (var result in results)
            {
                Emit(context, $"ContractViolations_Operation_{result.Operation}", Severity.Error,
                     $"`{result.Machines}` machine(s) had `{result.Count}` contract violations (`{result.ExceptionType}`) in operation `{result.Operation}`. Example message: {result.ExceptionMessage}",
                     stamp: result.Stamp,
                     $"`{result.Machines}` machine(s) had `{result.Count}` contract violations (`{result.ExceptionType}`) in operation `{result.Operation}`",
                     eventTimeUtc: now);
            }
        }
Пример #3
0
#pragma warning restore CS0649

        public override async Task Run(RuleContext context)
        {
            var now   = _configuration.Clock.UtcNow;
            var query =
                $@"
                let end = now();
                let start = end - {CslTimeSpanLiteral.AsCslString(_configuration.LookbackPeriod)};
                CacheInvocationsWithErrors(""{_configuration.Stamp}"", start, end)
                | summarize Total=count(), Failed=countif(CacheImplicated)
                | extend FailureRate=(toreal(Failed)/toreal(Total))
                | where not(isnull(Failed))";
            var results = (await QueryKustoAsync <Result>(context, query)).ToList();

            if (results.Count == 0)
            {
                Emit(context, "NoBuilds", Severity.Info,
                     $"Stamp hasn't completed any builds in at least `{_configuration.LookbackPeriod}`",
                     eventTimeUtc: now);
                return;
            }

            var failureRate = results[0].FailureRate;

            _configuration.FailureRateThresholds.Check(failureRate, (severity, threshold) =>
            {
                Emit(context, "FailureRate", severity,
                     $"Build failure rate `{Math.Round(failureRate * 100.0, 4, MidpointRounding.AwayFromZero)}%` over last `{_configuration.LookbackPeriod}``",
                     eventTimeUtc: now);
            });
        }
Пример #4
0
#pragma warning restore CS0649

        public override async Task Run(RuleContext context)
        {
            var now   = _configuration.Clock.UtcNow;
            var query =
                $@"
                let end = now();
                let start = end - {CslTimeSpanLiteral.AsCslString(_configuration.LookbackPeriod)};
                table(""{_configuration.CacheTableName}"")
                | where PreciseTimeStamp between (start .. end)
                | where Stamp == ""{_configuration.Stamp}""
                | where Service == ""{Constants.MasterServiceName}""
                | where Message contains ""CreateCheckpointAsync stop""
                | summarize (PreciseTimeStamp, Machine)=arg_max(PreciseTimeStamp, Machine)
                | extend Age = end - PreciseTimeStamp
                | where not(isnull(PreciseTimeStamp))";
            var results = (await QueryKustoAsync <Result>(context, query)).ToList();

            if (results.Count == 0)
            {
                Emit(context, "NoLogs", Severity.Fatal,
                     $"No checkpoints produced for at least {_configuration.LookbackPeriod}",
                     eventTimeUtc: now);
                return;
            }

            var age = results[0].Age;

            _configuration.AgeThresholds.Check(age, (severity, threshold) =>
            {
                Emit(context, "CreationThreshold", severity,
                     $"Newest checkpoint age `{age}` above threshold `{threshold}`. Master is {results[0].Machine}",
                     eventTimeUtc: results[0].PreciseTimeStamp);
            });
        }
Пример #5
0
#pragma warning restore CS0649

        public override async Task Run(RuleContext context)
        {
            // NOTE(jubayard): The Kusto ingestion delay needs to be taken into consideration to avoid the number of
            // machines being reported as much less than they actually are.
            var now   = _configuration.Clock.UtcNow - Constants.KustoIngestionDelay;
            var query =
                $@"
                let end = now() - {CslTimeSpanLiteral.AsCslString(Constants.KustoIngestionDelay)};
                let start = end - {CslTimeSpanLiteral.AsCslString(_configuration.LookbackPeriod)};
                table(""{_configuration.CacheTableName}"")
                | where PreciseTimeStamp between (start .. end)
                | where Service == ""{Constants.ServiceName}"" or Service == ""{Constants.MasterServiceName}""
                | where Stamp == ""{_configuration.Stamp}""
                | summarize ActiveMachines=dcount(Machine, {_configuration.DistinctCountPrecision}) by bin(PreciseTimeStamp, {CslTimeSpanLiteral.AsCslString(_configuration.BinningPeriod)})
                | sort by PreciseTimeStamp asc
                | where not(isnull(PreciseTimeStamp))";
            var results = (await QueryKustoAsync <Result>(context, query)).ToList();

            if (results.Count == 0)
            {
                Emit(context, "NoLogs", Severity.Fatal,
                     $"Machines haven't produced any logs for at least `{_configuration.LookbackPeriod}`",
                     eventTimeUtc: now);
                return;
            }

            var detectionHorizon = now - _configuration.AnomalyDetectionHorizon;

            var training   = new List <Result>();
            var prediction = new List <Result>();

            results.SplitBy(r => r.PreciseTimeStamp <= detectionHorizon, training, prediction);

            if (prediction.Count == 0)
            {
                Emit(context, "NoLogs", Severity.Fatal,
                     $"Machines haven't produced any logs for at least `{_configuration.AnomalyDetectionHorizon}`",
                     eventTimeUtc: now);
                return;
            }

            if (training.Count < _configuration.MinimumTrainingPercentOfData * results.Count)
            {
                return;
            }

            var lookbackSizes = training.Select(r => r.ActiveMachines);
            var lookbackMin   = (long)Math.Ceiling((1 - _configuration.MaximumGrowthWrtToLookback) * lookbackSizes.Min());
            var lookbackMax   = (long)Math.Floor((1 + _configuration.MaximumGrowthWrtToLookback) * lookbackSizes.Max());

            prediction.Select(p => p.ActiveMachines).NotInRange(lookbackMin, lookbackMax).Perform(index =>
            {
                Emit(context, "ExpectedRange", Severity.Warning,
                     $"`{prediction[index].ActiveMachines}` active machines, outside of expected range [`{lookbackMin}`, `{lookbackMax}`]",
                     eventTimeUtc: prediction[index].PreciseTimeStamp);
            });
        }
Пример #6
0
#pragma warning restore CS0649

        public override async Task Run(RuleContext context)
        {
            var now   = _configuration.Clock.UtcNow;
            var query =
                $@"
                let end = now();
                let start = end - {CslTimeSpanLiteral.AsCslString(_configuration.LookbackPeriod)};
                let Exceptions = CloudBuildLogEvent
                | where PreciseTimeStamp between (start .. end)
                | where Service == ""{Constants.ServiceName}"" or Service == ""{Constants.MasterServiceName}""
                | where Stamp == ""{_configuration.Stamp}""
                | where Message has ""Unhandled Exception in service.""
                | project PreciseTimeStamp, Stamp, Machine, Service, Exception
                | extend KnownFormat=(Exception has ""Error=["" and Exception has ""Diagnostics=["");
                let FormattedExceptions = Exceptions
                | where KnownFormat
                | parse Exception with ExceptionType:string "": Error=["" Error:string ""] Diagnostics=["" Diagnostics:string
                | parse Diagnostics with ExceptionTypeFromDiagnostics:string "": "" *
                | extend ExceptionType = iif(isnull(ExceptionTypeFromDiagnostics), ExceptionType, ExceptionTypeFromDiagnostics)
                | project-away ExceptionTypeFromDiagnostics
                | project-away Exception;
                let UnknownExceptions = Exceptions
                | where not(KnownFormat)
                | parse Exception with ExceptionType:string "": "" *
                | project-rename Diagnostics=Exception;
                FormattedExceptions
                | union UnknownExceptions
                | extend ExceptionType=iif(isempty(ExceptionType) or isnull(ExceptionType), ""Unknown"", ExceptionType)
                | extend ExceptionType=iif(ExceptionType has "" "", extract(""([A-Za-z0-9.]+) .*"", 1, ExceptionType), ExceptionType)
                | summarize Total=count(), LatestTimeUtc=max(PreciseTimeStamp), EarliestTimeUtc=min(PreciseTimeStamp) by Machine, ExceptionType
                | where not(isnull(Total));";
            var results = (await QuerySingleResultSetAsync <Result>(context, query)).ToList();

            if (results.Count == 0)
            {
                return;
            }

            foreach (var group in results.GroupBy(result => result.ExceptionType))
            {
                var numberOfMachines   = group.Select(result => result.Machine).Distinct().Count();
                var numberOfExceptions = group.Sum(result => result.Total);

                var minTimeUtc   = group.Min(result => result.EarliestTimeUtc);
                var eventTimeUtc = group.Max(result => result.LatestTimeUtc);

                BiThreshold(numberOfMachines, numberOfExceptions, _configuration.MachinesThresholds, _configuration.ExceptionsThresholds, (severity, machinesThreshold, exceptionsThreshold) =>
                {
                    var examples = string.Join(".\r\n", group
                                               .Select(result => $"\t`{result.Machine}` had `{result.Total}` restart(s) between `{result.EarliestTimeUtc}` and `{result.LatestTimeUtc}` UTC"));
                    Emit(context, $"ServiceRestarts_{group.Key}", severity,
                         $"`{numberOfMachines}` machine(s) had `{numberOfExceptions}` unexpected service restart(s) due to exception `{group.Key}` since `{now - minTimeUtc}` ago. Examples:\r\n{examples}",
                         $"`{numberOfMachines}` machine(s) had `{numberOfExceptions}` unexpected service restart(s) due to exception `{group.Key}` since `{now - minTimeUtc}` ago",
                         eventTimeUtc: eventTimeUtc);
                });
            }
        }
#pragma warning restore CS0649

        public override async Task Run(RuleContext context)
        {
            var now   = _configuration.Clock.UtcNow;
            var query =
                $@"
                let lookbackBuilds = {_configuration.AmountOfBuilds};
                let lookbackTime = {CslTimeSpanLiteral.AsCslString(_configuration.LookbackPeriod)};
                let Builds = materialize(BuildInfo
                | where PreciseTimeStamp between (ago(lookbackTime) .. now())
                | where InProbation != 1
                | partition by OwnerStampId (top lookbackBuilds by EndTime));
                let times = materialize(
                    Builds
                    | summarize TotalBuilds = count(), Start = min(StartTime), End = max(StartTime) by OwnerStampId
                    | where TotalBuilds == lookbackBuilds);
                let start = toscalar(times| summarize min(Start));
                let end = toscalar(times| summarize max(End));
                let failures = CacheInvocationsWithErrors('', start, end) | where CacheImplicated;
                Builds | lookup kind=inner failures on $left.BuildId == $right.BuildId
                | summarize Failed = count() by Stamp = OwnerStampId";
            var results = (await QueryKustoAsync <Result>(context, query)).ToList();

            await GroupByStampAndCallHelperAsync <Result>(results, result => result.Stamp, buildFailuresHelper);

            async Task buildFailuresHelper(string stamp, List <Result> results)
            {
                if (results.Count == 0)
                {
                    return;
                }

                var failed      = results[0].Failed;
                var failureRate = failed * 1.0 / _configuration.AmountOfBuilds;

                _configuration.FailureRateThresholds.Check(failureRate, (severity, threshold) =>
                {
                    Emit(context, "FailureRate", severity,
                         $"Build failure rate {Math.Round(failureRate * 100.0, 4, MidpointRounding.AwayFromZero)}%` over last `{_configuration.AmountOfBuilds} builds.``",
                         stamp,
                         eventTimeUtc: now);
                });

                await _configuration.FailureRateIcmThresholds.CheckAsync(failureRate, (severity, threshold) =>
                {
                    return(EmitIcmAsync(
                               severity,
                               title: $"{stamp}: build failure rate over last {_configuration.AmountOfBuilds} builds is higher than {threshold*100}%",
                               stamp,
                               machines: null,
                               correlationIds: null,
                               description: $"Build failure rate `{failed}/{_configuration.AmountOfBuilds}={Math.Round(failureRate * 100.0, 4, MidpointRounding.AwayFromZero)}%` over last `{_configuration.AmountOfBuilds} builds``",
                               eventTimeUtc: now,
                               cacheTimeToLive: _configuration.IcmIncidentCacheTtl));
                });
            }
        }
Пример #8
0
#pragma warning restore CS0649

        public override async Task Run(RuleContext context)
        {
            var query =
                $@"
                let end = now();
                let start = end - {CslTimeSpanLiteral.AsCslString(_configuration.LookbackPeriod)};
                let Machines = table('{_configuration.CacheTableName}')
                | where PreciseTimeStamp between (start .. end)
                | summarize Total=dcount(Machine) by Stamp;
                table('{_configuration.CacheTableName}')
                | where PreciseTimeStamp between (start .. end)
                | where Message has 'MemoryContentDirectory' and Message has 'starting with 0 entries from no file'
                | parse Message with 'MemoryContentDirectory(' Path ') starting ' *
                | project PreciseTimeStamp, Machine, Stamp, Path
                | where Path !has 'launcher' and Path !has 'proxy' and Path !has 'dcs'
                | summarize Reimaged = dcount(Machine), Machines = make_set(Machine) by Stamp
                | extend Machines = tostring(Machines)
                | lookup Machines on Stamp
                | where not(isnull(Total))
                | where isnotempty(Stamp)";
            var results = (await QueryKustoAsync <Result>(context, query)).ToList();

            GroupByStampAndCallHelper(results, result => result.Stamp, rule);

            void rule(string stamp, List <Result> results)
            {
                if (results.Count == 0)
                {
                    return;
                }

                var result = results[0];

                var reimageRate = (double)result.Reimaged / (double)result.Total;

                _configuration.ReimageRateThresholds.Check(
                    reimageRate,
                    (severity, cut) =>
                {
                    var examples = string.Empty;
                    if (!string.IsNullOrEmpty(result.Machines))
                    {
                        var machines = JsonSerializer.Deserialize <List <string> >(result.Machines) !;
                        examples     = ". Examples: " + string.Join(", ", machines.Take(_configuration.MaximumExamplesOnAlert).Select(m => $"`{m}`"));
                    }

                    Emit(
                        context,
                        "ReimageRate",
                        severity,
                        $"Detected `{Math.Round(reimageRate * 100.0, 4, MidpointRounding.AwayFromZero)}%` ({result.Reimaged} / {result.Total}) of the stamp has been reimaged in over the last `{_configuration.LookbackPeriod}` period{examples}",
                        stamp);
                });
            }
        }
Пример #9
0
#pragma warning restore CS0649

        public override async Task Run(RuleContext context)
        {
            var now   = _configuration.Clock.UtcNow;
            var query =
                $@"
                let end = now();
                let start = end - {CslTimeSpanLiteral.AsCslString(_configuration.LookbackPeriod)};
                CacheInvocationsWithErrors('', start, end)
                | summarize Total=count(), Failed=countif(CacheImplicated) by Stamp
                | extend FailureRate=(toreal(Failed)/toreal(Total))
                | where not(isnull(Failed))";
            var results = (await QueryKustoAsync <Result>(context, query)).ToList();

            await GroupByStampAndCallHelperAsync <Result>(results, result => result.Stamp, buildFailuresHelper);

            async Task buildFailuresHelper(string stamp, List <Result> results)
            {
                if (results.Count == 0)
                {
                    Emit(context, "NoBuilds", Severity.Info,
                         $"Stamp hasn't completed any builds in at least `{_configuration.LookbackPeriod}`",
                         stamp,
                         eventTimeUtc: now);
                    return;
                }

                var failureRate = results[0].FailureRate;
                var failed      = results[0].Failed;
                var total       = results[0].Total;

                _configuration.FailureRateThresholds.Check(failureRate, (severity, threshold) =>
                {
                    Emit(context, "FailureRate", severity,
                         $"Build failure rate `{failed}/{total}={Math.Round(failureRate * 100.0, 4, MidpointRounding.AwayFromZero)}%` over last `{_configuration.LookbackPeriod}``",
                         stamp,
                         eventTimeUtc: now);
                });

                if (total >= _configuration.MinimumAmountOfBuildsForIcm)
                {
                    await _configuration.FailureRateIcmThresholds.CheckAsync(failureRate, (severity, threshold) =>
                    {
                        return(EmitIcmAsync(
                                   severity,
                                   title: $"{stamp}: build failure rate is higher than {threshold*100}%",
                                   stamp,
                                   machines: null,
                                   correlationIds: null,
                                   description: $"Build failure rate `{failed}/{total}={Math.Round(failureRate * 100.0, 4, MidpointRounding.AwayFromZero)}%` over last `{_configuration.LookbackPeriod}``",
                                   eventTimeUtc: now,
                                   cacheTimeToLive: _configuration.IcmIncidentCacheTtl));
                    });
                }
            }
        }
#pragma warning restore CS0649

        public override async Task Run(RuleContext context)
        {
            var now   = _configuration.Clock.UtcNow;
            var query =
                $@"
                let end = now();
                let start = end - {CslTimeSpanLiteral.AsCslString(_configuration.LookbackPeriod)};
                table('{_configuration.CacheTableName}')
                | where PreciseTimeStamp between (start .. end)
                | where Role == 'Master'
                | where Operation == 'CreateCheckpointAsync' and isnotempty(Duration)
                | where Result == '{Constants.ResultCode.Success}'
                | summarize (PreciseTimeStamp, Machine)=arg_max(PreciseTimeStamp, Machine) by Stamp
                | extend Age = end - PreciseTimeStamp
                | where not(isnull(PreciseTimeStamp))";
            var results = (await QueryKustoAsync <Result>(context, query)).ToList();

            GroupByStampAndCallHelper <Result>(results, result => result.Stamp, lastProducedCheckpointHelper);

            void lastProducedCheckpointHelper(string stamp, List <Result> results)
            {
                if (results.Count == 0)
                {
                    Emit(context, "NoLogs", Severity.Fatal,
                         $"No checkpoints produced for at least {_configuration.LookbackPeriod}",
                         stamp,
                         eventTimeUtc: now);
                    return;
                }

                var age = results[0].Age;

                _configuration.AgeThresholds.Check(age, (severity, threshold) =>
                {
                    Emit(context, "CreationThreshold", severity,
                         $"Newest checkpoint age `{age}` above threshold `{threshold}`. Master is {results[0].Machine}",
                         stamp,
                         eventTimeUtc: results[0].PreciseTimeStamp);
                });
            }
        }
Пример #11
0
#pragma warning restore CS0649

        public override async Task Run(RuleContext context)
        {
            // NOTE(jubayard): When a summarize is run over an empty result set, Kusto produces a single (null) row,
            // which is why we need to filter it out.
            var now   = _configuration.Clock.UtcNow;
            var query =
                $@"
                let end = now();
                let start = end - {CslTimeSpanLiteral.AsCslString(_configuration.LookbackPeriod)};
                table(""{_configuration.CacheTableName}"")
                | where PreciseTimeStamp between (start .. end)
                | where Stamp == ""{_configuration.Stamp}""
                | where Service == ""{Constants.ServiceName}"" or Service == ""{Constants.MasterServiceName}""
                | where Message has ""Unhandled exception in fire and forget task""
                | where Message !has ""RedisConnectionException"" // This is a transient error (i.e. server closed the socket)
                | where Message !has ""TaskCanceledException"" // This is irrelevant
                | parse Message with * ""operation '"" Operation:string ""'"" * ""FullException="" Exception:string
                | project PreciseTimeStamp, Machine, Operation, Exception
                | summarize Machines=dcount(Machine), Count=count() by Operation
                | where not(isnull(Machines))";
            var results = await QueryKustoAsync <Result>(context, query);

            foreach (var result in results)
            {
                _configuration.MachinesThresholds.Check(result.Machines, (severity, threshold) =>
                {
                    if (result.Count < _configuration.MinimumErrorsThreshold)
                    {
                        return;
                    }

                    Emit(context, $"FireAndForgetExceptions_Operation_{result.Operation}", severity,
                         $"`{result.Machines}` machines had `{result.Count}` errors in fire and forget tasks for operation `{result.Operation}`",
                         eventTimeUtc: now);
                });
            }
        }
Пример #12
0
#pragma warning restore CS0649

        public override async Task Run(RuleContext context)
        {
            var now   = _configuration.Clock.UtcNow;
            var query =
                $@"
                let end = now();
                let start = end - {CslTimeSpanLiteral.AsCslString(_configuration.LookbackPeriod)};
                let activity = end - {CslTimeSpanLiteral.AsCslString(_configuration.ActivityPeriod)};
                let masterActivity = end - {CslTimeSpanLiteral.AsCslString(_configuration.MasterActivityPeriod)};
                let Events = table('{_configuration.CacheTableName}')
                | where PreciseTimeStamp between (start .. end);
                let Machines = Events
                | where PreciseTimeStamp >= activity
                | summarize LastActivityTime=max(PreciseTimeStamp) by Machine, Stamp
                | where not(isnull(Machine));
                let CurrentMaster = Events
                | where PreciseTimeStamp >= masterActivity
                | where Role == 'Master'
                | where Operation == 'CreateCheckpointAsync' and isnotempty(Duration)
                | where Result == '{Constants.ResultCode.Success}'
                | summarize (PreciseTimeStamp, Master)=arg_max(PreciseTimeStamp, Machine)
                | project-away PreciseTimeStamp
                | where not(isnull(Master));
                let MachinesWithRole = CurrentMaster
                | join hint.strategy=broadcast kind=rightouter Machines on $left.Master == $right.Machine
                | extend IsWorkerMachine=iif(isnull(Master) or isempty(Master), true, false)
                | project-away Master;
                let Restores = Events
                | where Operation == 'RestoreCheckpointAsync' and isnotempty(Duration)
                | where Result == '{Constants.ResultCode.Success}'
                | summarize LastRestoreTime=max(PreciseTimeStamp) by Machine;
                MachinesWithRole
                | where IsWorkerMachine
                | project-away IsWorkerMachine
                | join hint.strategy=broadcast kind=leftouter Restores on Machine
                | project-away Machine1
                | extend Age=LastActivityTime - LastRestoreTime";
            var results = (await QueryKustoAsync <Result>(context, query)).ToList();

            GroupByStampAndCallHelper <Result>(results, result => result.Stamp, lastRestoredCheckpointHelper);

            void lastRestoredCheckpointHelper(string stamp, List <Result> results)
            {
                if (results.Count == 0)
                {
                    Emit(context, "NoLogs", Severity.Fatal,
                         $"No machines logged anything in the last `{_configuration.ActivityPeriod}`",
                         stamp,
                         eventTimeUtc: now);
                    return;
                }

                var missing  = new List <Result>();
                var failures = new List <Result>();

                foreach (var result in results)
                {
                    if (!result.LastRestoreTime.HasValue)
                    {
                        missing.Add(result);
                        continue;
                    }

                    Contract.AssertNotNull(result.Age);
                    if (result.Age.Value >= _configuration.CheckpointAgeErrorThreshold)
                    {
                        failures.Add(result);
                    }
                }

                _configuration.MissingRestoreMachinesThresholds.Check(missing.Count, (severity, threshold) =>
                {
                    var formattedMissing = missing.Select(m => $"`{m.Machine}`");
                    var machinesCsv      = string.Join(", ", formattedMissing);
                    var shortMachinesCsv = string.Join(", ", formattedMissing.Take(5));
                    Emit(context, "NoRestoresThreshold", severity,
                         $"Found `{missing.Count}` machine(s) active in the last `{_configuration.ActivityPeriod}`, but without checkpoints restored in at least `{_configuration.LookbackPeriod}`: {machinesCsv}",
                         stamp,
                         $"`{missing.Count}` machine(s) haven't restored checkpoints in at least `{_configuration.LookbackPeriod}`. Examples: {shortMachinesCsv}",
                         eventTimeUtc: now);
                });

                _configuration.OldRestoreMachinesThresholds.Check(failures.Count, (severity, threshold) =>
                {
                    var formattedFailures = failures.Select(f => $"`{f.Machine}` ({f.Age ?? TimeSpan.Zero})");
                    var machinesCsv       = string.Join(", ", formattedFailures);
                    var shortMachinesCsv  = string.Join(", ", formattedFailures.Take(5));
                    Emit(context, "OldRestores", severity,
                         $"Found `{failures.Count}` machine(s) active in the last `{_configuration.ActivityPeriod}`, but with old checkpoints (at least `{_configuration.CheckpointAgeErrorThreshold}`): {machinesCsv}",
                         stamp,
                         $"`{failures.Count}` machine(s) have checkpoints older than `{_configuration.CheckpointAgeErrorThreshold}`. Examples: {shortMachinesCsv}",
                         eventTimeUtc: now);
                });
            }
        }
Пример #13
0
#pragma warning restore CS0649

        public override async Task Run(RuleContext context)
        {
            var now   = _configuration.Clock.UtcNow;
            var query =
                $@"
                let end = now();
                let start = end - {CslTimeSpanLiteral.AsCslString(_configuration.LookbackPeriod)};
                let binWidth = {CslTimeSpanLiteral.AsCslString(_configuration.BinWidth)};
                let MasterEvents = table('{_configuration.CacheTableName}')
                | where PreciseTimeStamp between (start .. end)
                | where Role == 'Master'
                | where Component has 'EventHubContentLocationEventStore' or Component has 'ContentLocationEventStore';
                let MaximumDelayFromReceivedEvents = MasterEvents
                | where Message has 'ReceivedEvent'
                | parse Message with * 'ProcessingDelay=' Lag:timespan ',' *
                | project PreciseTimeStamp, Stamp, Machine, ServiceVersion, Message, Duration=Lag
                | summarize MaxDelay=arg_max(Duration, Machine) by Stamp, PreciseTimeStamp=bin(PreciseTimeStamp, binWidth);
                MaximumDelayFromReceivedEvents
                | project Stamp, Machine, PreciseTimeStamp, MaxDelay
                | order by PreciseTimeStamp desc";
            var results = (await QueryKustoAsync <Result>(context, query)).ToList();

            await GroupByStampAndCallHelperAsync <Result>(results, result => result.Stamp, eventHubProcessingDelayHelper);

            async Task eventHubProcessingDelayHelper(string stamp, List <Result> results)
            {
                if (results.Count == 0)
                {
                    // If we reached this point, it means we don't have messages from master saying that it processed
                    // anything. We now need to check if clients actually sent anything at all!
                    var outstandingQuery = $@"
                    let end = now();
                    let start = end - {CslTimeSpanLiteral.AsCslString(_configuration.LookbackPeriod)};
                    let Events = table('{_configuration.CacheTableName}')
                    | where PreciseTimeStamp between (start .. end)
                    | where Stamp == '{stamp}'
                    | where Component has 'EventHubContentLocationEventStore' or Component has 'ContentLocationEventStore'
                    | project PreciseTimeStamp, CorrelationId, Role, Machine, Stamp, Message;
                    let Enqueued = Events
                    | where Role == 'Worker'
                    | where Message has 'sending' and Message has 'OpId'
                    | parse Message with * '/' NumEvents:long 'event.' *
                    | project-away Message;
                    let Processed = Events
                    | where Role == 'Master'
                    | where Message has 'OpId'
                    | project-away Message;
                    let Outstanding = Enqueued
                    | join kind=leftanti Processed on CorrelationId
                    | summarize OutstandingBatches=dcount(CorrelationId), OutstandingEvents=sum(NumEvents);
                    let Done = Processed
                    | summarize ProcessedBatches=dcount(CorrelationId);             
                    Outstanding | extend dummy=1 | join kind=inner (Done | extend dummy=1) on dummy | project-away dummy, dummy1";

                    var outstandingResults = (await QueryKustoAsync <Result2>(context, outstandingQuery)).ToList();

                    if (outstandingResults.Count != 1)
                    {
                        Emit(context, "NoLogs", Severity.Fatal,
                             $"No events processed for at least `{_configuration.LookbackPeriod}`",
                             stamp,
                             eventTimeUtc: now);

                        await EmitIcmAsync(
                            severity : _configuration.Environment.IsProduction()? 2 : 3,
                            title : $"{stamp}: master event processing issues",
                            stamp,
                            machines : null,
                            correlationIds : null,
                            description : $"No events processed for at least `{_configuration.LookbackPeriod}`",
                            eventTimeUtc : now,
                            cacheTimeToLive : _configuration.LookbackPeriod);

                        return;
                    }

                    var result = outstandingResults[0];
                    if (result.OutstandingBatches.HasValue && result.OutstandingBatches.Value > 0)
                    {
                        if (result.ProcessedBatches.HasValue && result.ProcessedBatches.Value == 0)
                        {
                            Emit(context, "MasterStuck", Severity.Fatal,
                                 $"Master hasn't processed any events in the last `{_configuration.LookbackPeriod}`, but has `{result.OutstandingBatches.Value}` batches pending.",
                                 stamp,
                                 eventTimeUtc: now);

                            await EmitIcmAsync(
                                severity : _configuration.Environment.IsProduction()? 2 : 3,
                                title : $"{stamp}: master event processing issues",
                                stamp,
                                machines : null,
                                correlationIds : null,
                                description : $"Master hasn't processed any events in the last `{_configuration.LookbackPeriod}`, but has `{result.OutstandingBatches.Value}` batches pending.",
                                eventTimeUtc : now,
                                cacheTimeToLive : _configuration.LookbackPeriod);
                        }
                    }

                    return;
                }

                var delay = results[0].MaxDelay;

                _configuration.Thresholds.Check(delay, (severity, threshold) =>
                {
                    Emit(context, "DelayThreshold", severity,
                         $"EventHub processing delay `{delay}` above threshold `{threshold}`. Master is `{results[0].Machine}`",
                         stamp,
                         eventTimeUtc: results[0].PreciseTimeStamp);
                });

                await _configuration.EventProcessingDelayIcmThresholdsInHours.CheckAsync(delay.TotalHours, (severity, threshold) =>
                {
                    if (severity >= 2)
                    {
                        if (!_configuration.Environment.IsProduction())
                        {
                            severity = 3;
                        }
                        else if (!TimeConstraints.BusinessHours.SatisfiedPST(utcNow: _configuration.Clock.UtcNow))
                        {
                            severity = 3;
                        }
                    }

                    return(EmitIcmAsync(
                               severity,
                               title: $"{stamp}: master event processing issues",
                               stamp,
                               machines: new[] { results[0].Machine },
                               correlationIds: null,
                               description: $"EventHub processing delay `{delay}` above threshold `{threshold}`. Master is `{results[0].Machine}`",
                               eventTimeUtc: now,
                               cacheTimeToLive: _configuration.LookbackPeriod));
                });
            }
        }
Пример #14
0
        /// <summary>
        /// Creates the schedule of rules that will be run. Also responsible for configuring them.
        /// </summary>
        private void CreateSchedule(Watchlist watchlist)
        {
            // TODO: per-stamp configuration (some stamps are more important than others, query frequency should reflect that)
            // TODO: query weight (how much does it cost). We should adapt scheduling policy to have lighter queries prioritize earlier than the others.
            // TODO: stamp configuration knowledge. Stamp configuration affects what our thresholds should be. We should reflect that here.
            // TODO: add jitter to rules, so that queries to Kusto are spread out over time instead of all at once
            OncePerEnvironment(
                arguments =>
            {
                var configuration = new LastProducedCheckpointRule.Configuration(arguments.BaseConfiguration);
                return(Analysis.Utilities.Yield(new Instantiation()
                {
                    Rule = new LastProducedCheckpointRule(configuration),
                    PollingPeriod = TimeSpan.FromMinutes(40),
                }));
            }, watchlist);

            OncePerEnvironment(arguments =>
            {
                var configuration = new LastRestoredCheckpointRule.Configuration(arguments.BaseConfiguration);
                return(Analysis.Utilities.Yield(new Instantiation()
                {
                    Rule = new LastRestoredCheckpointRule(configuration),
                    PollingPeriod = TimeSpan.FromMinutes(30),
                }));
            }, watchlist);

            OncePerEnvironment(arguments =>
            {
                var configuration = new CheckpointSizeRule.Configuration(arguments.BaseConfiguration);
                return(Analysis.Utilities.Yield(new Instantiation()
                {
                    Rule = new CheckpointSizeRule(configuration),
                    PollingPeriod = configuration.AnomalyDetectionHorizon - TimeSpan.FromMinutes(5),
                }));
            }, watchlist);

            // TODO: this rule is too noisy and inaccurate, we should make it work again
            //OncePerStamp(arguments =>
            //{
            //    var configuration = new ActiveMachinesRule.Configuration(arguments.BaseConfiguration);
            //    return Analysis.Utilities.Yield(new Instantiation()
            //    {
            //        Rule = new ActiveMachinesRule(configuration),
            //        PollingPeriod = configuration.AnomalyDetectionHorizon - TimeSpan.FromMinutes(5),
            //    });
            //}, watchlist);

            OncePerEnvironment(arguments =>
            {
                var configuration = new EventHubProcessingDelayRule.Configuration(arguments.BaseConfiguration);
                return(Analysis.Utilities.Yield(new Instantiation()
                {
                    Rule = new EventHubProcessingDelayRule(configuration),
                    PollingPeriod = TimeSpan.FromMinutes(30),
                }));
            }, watchlist);

            OncePerEnvironment(arguments =>
            {
                var configuration = new BuildFailuresRule.Configuration(arguments.BaseConfiguration);
                return(Analysis.Utilities.Yield(new Instantiation()
                {
                    Rule = new BuildFailuresRule(configuration),
                    PollingPeriod = TimeSpan.FromMinutes(45),
                }));
            }, watchlist);

            // TODO: fire-and-forget exceptions are now being reported on the dashboards. We should see if this can be recycled.
            //OncePerStamp(arguments =>
            //{
            //    var configuration = new FireAndForgetExceptionsRule.Configuration(arguments.BaseConfiguration);
            //    return Analysis.Utilities.Yield(new Instantiation()
            //    {
            //        Rule = new FireAndForgetExceptionsRule(configuration),
            //        PollingPeriod = configuration.LookbackPeriod - TimeSpan.FromMinutes(5),
            //    });
            //}, watchlist);

            // TODO: this was just too noisy
            //OncePerStamp(arguments =>
            //{
            //    var configuration = new ContractViolationsRule.Configuration(arguments.BaseConfiguration);
            //    return Analysis.Utilities.Yield(new Instantiation() {
            //        Rule = new ContractViolationsRule(configuration),
            //        PollingPeriod = configuration.LookbackPeriod,
            //    });
            //}, watchlist);

            var failureChecks = new List <OperationFailureCheckRule.Check>()
            {
                new OperationFailureCheckRule.Check()
                {
                    Match      = "StartupAsync",
                    Constraint = "Component != 'GrpcCopyClient'",
                },
                new OperationFailureCheckRule.Check()
                {
                    Match = "ShutdownAsync",
                },
                new OperationFailureCheckRule.Check()
                {
                    Match = "RestoreCheckpointAsync",
                },
                new OperationFailureCheckRule.Check()
                {
                    Match = "CreateCheckpointAsync",
                },
                new OperationFailureCheckRule.Check()
                {
                    Match = "ReconcileAsync",
                },
                new OperationFailureCheckRule.Check()
                {
                    Match = "ProcessEventsCoreAsync",
                },
                new OperationFailureCheckRule.Check()
                {
                    // TODO(jubayard): lower severity
                    Match = "SendEventsCoreAsync",
                },
            };

            OncePerEnvironment(arguments =>
            {
                return(failureChecks.Select(check =>
                {
                    var configuration = new OperationFailureCheckRule.Configuration(arguments.BaseConfiguration, check);

                    return new Instantiation()
                    {
                        Rule = new OperationFailureCheckRule(configuration),
                        PollingPeriod = configuration.LookbackPeriod - TimeSpan.FromMinutes(5),
                    };
                }));
            }, watchlist);

            var performanceChecks = new List <OperationPerformanceOutliersRule.DynamicCheck>()
            {
                new OperationPerformanceOutliersRule.DynamicCheck()
                {
                    LookbackPeriod  = TimeSpan.FromMinutes(60),
                    DetectionPeriod = TimeSpan.FromMinutes(30),
                    Match           = "LocalCacheServer.StartupAsync",
                    Constraint      = $"Duration >= {CslTimeSpanLiteral.AsCslString(TimeSpan.FromMinutes(1))}",
                },
                new OperationPerformanceOutliersRule.DynamicCheck()
                {
                    LookbackPeriod  = TimeSpan.FromHours(12),
                    DetectionPeriod = TimeSpan.FromHours(1),
                    Match           = "CheckpointManager.CreateCheckpointAsync",
                    Constraint      = $"Duration >= {CslTimeSpanLiteral.AsCslString(TimeSpan.FromMinutes(1))}",
                },
                new OperationPerformanceOutliersRule.DynamicCheck()
                {
                    LookbackPeriod  = TimeSpan.FromHours(12),
                    DetectionPeriod = TimeSpan.FromHours(1),
                    Match           = "CheckpointManager.RestoreCheckpointAsync",
                    Constraint      = $"Duration >= P95 and P95 >= {CslTimeSpanLiteral.AsCslString(TimeSpan.FromMinutes(30))}",
                },
            };

            OncePerEnvironment(arguments =>
            {
                return(performanceChecks.Select(check =>
                {
                    var configuration = new OperationPerformanceOutliersRule.Configuration(arguments.BaseConfiguration, check);

                    return new Instantiation
                    {
                        Rule = new OperationPerformanceOutliersRule(configuration),
                        PollingPeriod = check.DetectionPeriod - TimeSpan.FromMinutes(5),
                    };
                }));
            }, watchlist);

            OncePerEnvironment(arguments =>
            {
                var configuration = new ServiceRestartsRule.Configuration(arguments.BaseConfiguration);
                return(Analysis.Utilities.Yield(new Instantiation()
                {
                    Rule = new ServiceRestartsRule(configuration),
                    PollingPeriod = TimeSpan.FromMinutes(30),
                }));
            }, watchlist);

            OncePerEnvironment(arguments =>
            {
                var configuration = new LongCopyRule.Configuration(arguments.BaseConfiguration);
                return(Analysis.Utilities.Yield(new Instantiation()
                {
                    Rule = new LongCopyRule(configuration),
                    PollingPeriod = TimeSpan.FromMinutes(30),
                }));
            }, watchlist);

            OncePerEnvironment(arguments =>
            {
                var configuration = new MachineReimagesRule.Configuration(arguments.BaseConfiguration);

                return(Analysis.Utilities.Yield(new Instantiation()
                {
                    Rule = new MachineReimagesRule(configuration),
                    PollingPeriod = TimeSpan.FromMinutes(30),
                }));
            }, watchlist);

            OncePerEnvironment(arguments =>
            {
                var configuration = new DiskCorruptionRule.Configuration(arguments.BaseConfiguration);

                return(Analysis.Utilities.Yield(new Instantiation()
                {
                    Rule = new DiskCorruptionRule(configuration),
                    PollingPeriod = TimeSpan.FromMinutes(30),
                }));
            }, watchlist);

            OncePerStamp(GenerateRedisAutoscalingRules, watchlist);
        }
Пример #15
0
#pragma warning restore CS0649

        public override async Task Run(RuleContext context)
        {
            var now   = _configuration.Clock.UtcNow;
            var query =
                $@"
                let end = now();
                let start = end - {CslTimeSpanLiteral.AsCslString(_configuration.LookbackPeriod)};
                let activity = end - {CslTimeSpanLiteral.AsCslString(_configuration.ActivityPeriod)};
                let masterActivity = end - {CslTimeSpanLiteral.AsCslString(_configuration.MasterActivityPeriod)};
                let Events = CloudBuildLogEvent
                | where PreciseTimeStamp between (start .. end)
                | where Stamp == ""{_configuration.Stamp}""
                | where Service == ""{Constants.ServiceName}"" or Service == ""{Constants.MasterServiceName}"";
                let Machines = Events
                | where PreciseTimeStamp >= activity
                | summarize LastActivityTime=max(PreciseTimeStamp) by Machine
                | where not(isnull(Machine));
                let CurrentMaster = Events
                | where PreciseTimeStamp >= masterActivity
                | where Service == ""{Constants.MasterServiceName}""
                | where Message has ""CreateCheckpointAsync stop""
                | summarize (PreciseTimeStamp, Master)=arg_max(PreciseTimeStamp, Machine)
                | project-away PreciseTimeStamp
                | where not(isnull(Master));
                let MachinesWithRole = CurrentMaster
                | join hint.strategy=broadcast kind=rightouter Machines on $left.Master == $right.Machine
                | extend IsWorkerMachine=iif(isnull(Master) or isempty(Master), true, false)
                | project-away Master;
                let Restores = Events
                | where Message has ""RestoreCheckpointAsync stop""
                | summarize LastRestoreTime=max(PreciseTimeStamp) by Machine;
                MachinesWithRole
                | where IsWorkerMachine
                | project-away IsWorkerMachine
                | join hint.strategy=broadcast kind=leftouter Restores on Machine
                | project-away Machine1
                | extend Age=LastActivityTime - LastRestoreTime";
            var results = (await QuerySingleResultSetAsync <Result>(context, query)).ToList();

            if (results.Count == 0)
            {
                Emit(context, "NoLogs", Severity.Fatal,
                     $"No machines logged anything in the last `{_configuration.ActivityPeriod}`",
                     eventTimeUtc: now);
                return;
            }

            var missing  = new List <Result>();
            var failures = new List <Result>();

            foreach (var result in results)
            {
                if (!result.LastRestoreTime.HasValue)
                {
                    missing.Add(result);
                    continue;
                }

                if (result.Age.Value >= _configuration.CheckpointAgeErrorThreshold)
                {
                    failures.Add(result);
                }
            }

            _configuration.MissingRestoreMachinesThresholds.Check(missing.Count, (severity, threshold) =>
            {
                var formattedMissing = missing.Select(m => $"`{m.Machine}`");
                var machinesCsv      = string.Join(", ", formattedMissing);
                var shortMachinesCsv = string.Join(", ", formattedMissing.Take(5));
                Emit(context, "NoRestoresThreshold", severity,
                     $"Found `{missing.Count}` machine(s) active in the last `{_configuration.ActivityPeriod}`, but without checkpoints restored in at least `{_configuration.LookbackPeriod}`: {machinesCsv}",
                     $"`{missing.Count}` machine(s) haven't restored checkpoints in at least `{_configuration.LookbackPeriod}`. Examples: {shortMachinesCsv}",
                     eventTimeUtc: now);
            });

            _configuration.OldRestoreMachinesThresholds.Check(failures.Count, (severity, threshold) =>
            {
                var formattedFailures = failures.Select(f => $"`{f.Machine}` ({f.Age.Value})");
                var machinesCsv       = string.Join(", ", formattedFailures);
                var shortMachinesCsv  = string.Join(", ", formattedFailures.Take(5));
                Emit(context, "OldRestores", severity,
                     $"Found `{failures.Count}` machine(s) active in the last `{_configuration.ActivityPeriod}`, but with old checkpoints (at least `{_configuration.CheckpointAgeErrorThreshold}`): {machinesCsv}",
                     $"`{failures.Count}` machine(s) have checkpoints older than `{_configuration.CheckpointAgeErrorThreshold}`. Examples: {shortMachinesCsv}",
                     eventTimeUtc: now);
            });
        }
Пример #16
0
#pragma warning restore CS0649

        public override async Task Run(RuleContext context)
        {
            var now   = _configuration.Clock.UtcNow;
            var query =
                $@"
                let end = now();
                let start = end - {CslTimeSpanLiteral.AsCslString(_configuration.LookbackPeriod)};
                let Exceptions = table('{_configuration.CacheTableName}')
                | where PreciseTimeStamp between (start .. end)
                | where Message has 'Unhandled Exception in service.'
                | project PreciseTimeStamp, Stamp, Machine, Service, Exception
                | extend KnownFormat=(Exception has 'Error=[' and Exception has 'Diagnostics=[');
                let FormattedExceptions = Exceptions
                | where KnownFormat
                | parse Exception with ExceptionType:string ': Error=[' Error:string '] Diagnostics=[' Diagnostics:string
                | parse Diagnostics with ExceptionTypeFromDiagnostics:string ': ' *
                | extend ExceptionType = iif(isnull(ExceptionTypeFromDiagnostics), ExceptionType, ExceptionTypeFromDiagnostics)
                | project-away ExceptionTypeFromDiagnostics
                | project-away Exception;
                let UnknownExceptions = Exceptions
                | where not(KnownFormat)
                | parse Exception with ExceptionType:string ': ' *
                | project-rename Diagnostics=Exception;
                FormattedExceptions
                | union UnknownExceptions
                | extend ExceptionType=iif(isempty(ExceptionType) or isnull(ExceptionType), 'Unknown', ExceptionType)
                | extend ExceptionType=iif(ExceptionType has ' ', extract('([A-Za-z0-9.]+) .*', 1, ExceptionType), ExceptionType)
                | summarize Total=count(), LatestTimeUtc=max(PreciseTimeStamp), EarliestTimeUtc=min(PreciseTimeStamp) by Machine, ExceptionType, Stamp
                | where not(isnull(Total));";
            var results = (await QueryKustoAsync <Result>(context, query)).ToList();

            GroupByStampAndCallHelper <Result>(results, result => result.Stamp, serviceRestartsHelper);

            void serviceRestartsHelper(string stamp, List <Result> results)
            {
                if (results.Count == 0)
                {
                    return;
                }

                foreach (var group in results.GroupBy(result => result.ExceptionType))
                {
                    var numberOfMachines   = group.Select(result => result.Machine).Distinct().Count();
                    var numberOfExceptions = group.Sum(result => result.Total);

                    var minTimeUtc   = group.Min(result => result.EarliestTimeUtc);
                    var eventTimeUtc = group.Max(result => result.LatestTimeUtc);

                    BiThreshold(numberOfMachines, numberOfExceptions, _configuration.MachinesThresholds, _configuration.ExceptionsThresholds, (severity, machinesThreshold, exceptionsThreshold) =>
                    {
                        var examples = string.Join(".\r\n", group
                                                   .Select(result => $"\t`{result.Machine}` had `{result.Total}` restart(s) between `{result.EarliestTimeUtc}` and `{result.LatestTimeUtc}` UTC"));
                        Emit(context, $"ServiceRestarts_{group.Key}", severity,
                             $"`{numberOfMachines}` machine(s) had `{numberOfExceptions}` unexpected service restart(s) due to exception `{group.Key}` since `{now - minTimeUtc}` ago. Examples:\r\n{examples}",
                             stamp,
                             $"`{numberOfMachines}` machine(s) had `{numberOfExceptions}` unexpected service restart(s) due to exception `{group.Key}` since `{now - minTimeUtc}` ago",
                             eventTimeUtc: eventTimeUtc);
                    });
                }
            }
        }
Пример #17
0
#pragma warning restore CS0649

        public override async Task Run(RuleContext context)
        {
            var now   = _configuration.Clock.UtcNow;
            var query =
                $@"
                let end = now();
                let start = end - {CslTimeSpanLiteral.AsCslString(_configuration.LookbackPeriod)};
                let MasterEvents = table('{_configuration.CacheTableName}')
                | where PreciseTimeStamp between (start .. end)
                | where Role == 'Master'
                | where Component has 'EventHubContentLocationEventStore';
                let MaximumDelayFromReceivedEvents = MasterEvents
                | where Message has 'ReceivedEvent'
                | parse Message with * 'ProcessingDelay=' Lag:timespan ',' *
                | project PreciseTimeStamp, Stamp, Machine, ServiceVersion, Message, Duration=Lag
                | summarize MaxDelay=max(Duration) by Stamp, PreciseTimeStamp=bin(PreciseTimeStamp, {CslTimeSpanLiteral.AsCslString(_configuration.BinWidth)});
                let MaximumDelayFromEH = MasterEvents
                | where Message has 'processed' 
                | parse Message with * 'processed ' MessageCount:long 'message(s) by ' Duration:long 'ms' * 'TotalMessagesSize=' TotalSize:long *
                | project PreciseTimeStamp, Stamp, MessageCount, Duration, TotalSize, Message, Machine
                | summarize MessageCount=count(), ProcessingDurationMs=percentile(Duration, 50), MaxProcessingDurationMs=max(Duration), SumMessageCount=sum(MessageCount), SumMessageSize=sum(TotalSize) by Stamp, Machine, PreciseTimeStamp=bin(PreciseTimeStamp, {CslTimeSpanLiteral.AsCslString(_configuration.BinWidth)});
                MaximumDelayFromReceivedEvents
                | join MaximumDelayFromEH on Stamp, PreciseTimeStamp
                | project Stamp, Machine, PreciseTimeStamp, MaxDelay, AvgProcessingDuration=totimespan(ProcessingDurationMs*10000), MaxProcessingDuration=totimespan(MaxProcessingDurationMs*10000), MessageCount, SumMessageCount, SumMessageSizeMb=SumMessageSize / (1024*1024)
                | order by PreciseTimeStamp desc";
            var results = (await QueryKustoAsync <Result>(context, query)).ToList();

            GroupByStampAndCallHelper <Result>(results, result => result.Stamp, eventHubProcessingDelayHelper);

            async void eventHubProcessingDelayHelper(string stamp, List <Result> results)
            {
                if (results.Count == 0)
                {
                    // If we reached this point, it means we don't have messages from master saying that it processed
                    // anything. We now need to check if clients actually sent anything at all!
                    var outstandingQuery = $@"
                    let end = now();
                    let start = end - {CslTimeSpanLiteral.AsCslString(_configuration.LookbackPeriod)};
                    let Events = table('{_configuration.CacheTableName}')
                    | where PreciseTimeStamp between (start .. end)
                    | where Stamp == '{stamp}'
                    | where Component has 'EventHubContentLocationEventStore'
                    | project PreciseTimeStamp, Role, Machine, Stamp, Message;
                    let Enqueued = Events
                    | where Role == 'Worker'
                    | where Message has 'sending' and Message has 'OpId'
                    | parse Message with * '/' NumEvents:long 'event.' * 'OpId=' OperationId ',' *
                    | project-away Message;
                    let Processed = Events
                    | where Role == 'Master'
                    | where Message has 'OpId'
                    | parse Message with * 'OpId=' OperationId ',' *
                    | project-away Message;
                    let Outstanding = Enqueued
                    | join kind=leftanti Processed on OperationId
                    | summarize OutstandingBatches=dcount(OperationId), OutstandingEvents=sum(NumEvents);
                    let Done = Processed
                    | summarize ProcessedBatches=dcount(OperationId);
                    Outstanding | extend dummy=1 | join kind=inner (Done | extend dummy=1) on dummy | project-away dummy, dummy1";

                    var outstandingResults = (await QueryKustoAsync <Result2>(context, outstandingQuery)).ToList();

                    if (outstandingResults.Count != 1)
                    {
                        Emit(context, "NoLogs", Severity.Fatal,
                             $"No events processed for at least `{_configuration.LookbackPeriod}`",
                             stamp,
                             eventTimeUtc: now);
                        return;
                    }

                    var result = outstandingResults[0];
                    if (result.OutstandingBatches.HasValue && result.OutstandingBatches.Value > 0)
                    {
                        if (result.ProcessedBatches.HasValue && result.ProcessedBatches.Value == 0)
                        {
                            Emit(context, "MasterStuck", Severity.Fatal,
                                 $"Master hasn't processed any events in the last `{_configuration.LookbackPeriod}`, but has `{result.OutstandingBatches.Value}` batches pending.",
                                 stamp,
                                 eventTimeUtc: now);
                        }
                    }

                    return;
                }

                var delay = results[0].MaxDelay;

                _configuration.Thresholds.Check(delay, (severity, threshold) =>
                {
                    Emit(context, "DelayThreshold", severity,
                         $"EventHub processing delay `{delay}` above threshold `{threshold}`. Master is `{results[0].Machine}`",
                         stamp,
                         eventTimeUtc: results[0].PreciseTimeStamp);
                });
            }
        }
Пример #18
0
#pragma warning restore CS0649

        public override async Task Run(RuleContext context)
        {
            var now   = _configuration.Clock.UtcNow;
            var query =
                $@"
                let end = now();
                let start = end - {CslTimeSpanLiteral.AsCslString(_configuration.LookbackPeriod)};
                table(""{_configuration.CacheTableName}"")
                | where PreciseTimeStamp between (start .. end)
                | where Role == 'Master'
                | where Operation == 'CreateCheckpointAsync' and Component == 'CheckpointManager' and isnotempty(Duration)
                | where Result == '{Constants.ResultCode.Success}'
                | project PreciseTimeStamp, Message, Stamp
                | parse Message with * 'SizeMb=[' SizeMb:double ']' *
                | parse Message with * ' SizeOnDiskMb = ' SizeOnDiskMb:double ' ' *
                | extend SizeMb = coalesce(SizeMb, SizeOnDiskMb)
                | project PreciseTimeStamp, TotalSize=tolong(SizeMb * 1000000), Stamp 
                | sort by PreciseTimeStamp asc";

            var results = (await QueryKustoAsync <Result>(context, query)).ToList();

            GroupByStampAndCallHelper <Result>(results, result => result.Stamp, checkpointSizeHelper);

            void checkpointSizeHelper(string stamp, List <Result> results)
            {
                if (results.Count == 0)
                {
                    _configuration.Logger.Error($"No checkpoints have been produced for at least {_configuration.LookbackPeriod}");
                    return;
                }

                var detectionHorizon = now - _configuration.AnomalyDetectionHorizon;

                results
                .Select(r => (double)r.TotalSize)
                .OverPercentualDifference(_configuration.MaximumPercentualDifference)
                .Where(evaluation => results[evaluation.Index].PreciseTimeStamp >= detectionHorizon)
                .PerformOnLast(index =>
                {
                    var result         = results[index];
                    var previousResult = results[index - 1];
                    Emit(context, "SizeDerivative", Severity.Warning,
                         $"Checkpoint size went from `{previousResult.TotalSize.ToSizeExpression()}` to `{result.TotalSize.ToSizeExpression()}`, which is higher than the threshold of `{_configuration.MaximumPercentualDifference * 100.0}%`",
                         stamp,
                         $"Checkpoint size went from `{previousResult.TotalSize.ToSizeExpression()}` to `{result.TotalSize.ToSizeExpression()}`",
                         eventTimeUtc: result.PreciseTimeStamp);
                });

                var training   = new List <Result>();
                var prediction = new List <Result>();

                results.SplitBy(r => r.PreciseTimeStamp <= detectionHorizon, training, prediction);

                if (prediction.Count == 0)
                {
                    _configuration.Logger.Error($"No checkpoints have been produced for at least {_configuration.AnomalyDetectionHorizon}");
                    return;
                }

                prediction
                .Select(p => p.TotalSize)
                .NotInRange(_configuration.MinimumValidSizeBytes, _configuration.MaximumValidSizeBytes)
                .PerformOnLast(index =>
                {
                    var result = prediction[index];
                    Emit(context, "SizeValidRange", Severity.Warning,
                         $"Checkpoint size `{result.TotalSize.ToSizeExpression()}` out of valid range [`{_configuration.MinimumValidSize}`, `{_configuration.MaximumValidSize}`]",
                         stamp,
                         eventTimeUtc: result.PreciseTimeStamp);
                });

                if (training.Count < _configuration.MinimumTrainingPercentOfData * results.Count)
                {
                    return;
                }

                var lookbackSizes = training.Select(r => r.TotalSize);
                var expectedMin   = (long)Math.Floor((1 - _configuration.MaximumGrowthWrtToLookback) * lookbackSizes.Min());
                var expectedMax   = (long)Math.Ceiling((1 + _configuration.MaximumGrowthWrtToLookback) * lookbackSizes.Max());

                prediction
                .Select(p => p.TotalSize)
                .NotInRange(expectedMin, expectedMax)
                .PerformOnLast(index =>
                {
                    var result = prediction[index];
                    Emit(context, "SizeExpectedRange", Severity.Warning,
                         $"Checkpoint size `{result.TotalSize.ToSizeExpression()}` out of expected range [`{expectedMin.ToSizeExpression()}`, `{expectedMax.ToSizeExpression()}`]",
                         stamp,
                         eventTimeUtc: result.PreciseTimeStamp);
                });
            }
        }
Пример #19
0
#pragma warning restore CS0649

        public override async Task Run(RuleContext context)
        {
            var now   = _configuration.Clock.UtcNow;
            var query =
                $@"
                let end = now();
                let start = end - {CslTimeSpanLiteral.AsCslString(_configuration.LookbackPeriod)};
                CloudBuildLogEvent
                | where PreciseTimeStamp between (start .. end)
                | where Service == ""{Constants.MasterServiceName}""
                | where Stamp == ""{_configuration.Stamp}""
                | where Message has ""Touching blob"" or Message has ""Uploading blob""
                | project PreciseTimeStamp, Machine, Message
                | parse Message with Id "" "" * ""of size "" Size: long "" "" *
                | summarize PreciseTimeStamp = min(PreciseTimeStamp), TotalSize = sum(Size) by Id
                | project PreciseTimeStamp, TotalSize
                | sort by PreciseTimeStamp asc
                | where not(isnull(PreciseTimeStamp))";
            var results = (await QuerySingleResultSetAsync <Result>(context, query)).ToList();

            if (results.Count == 0)
            {
                _configuration.Logger.Error($"No checkpoints have been produced for at least {_configuration.LookbackPeriod}");
                return;
            }

            var detectionHorizon = now - _configuration.AnomalyDetectionHorizon;

            results
            .Select(r => (double)r.TotalSize)
            .OverPercentualDifference(_configuration.MaximumPercentualDifference)
            .Where(evaluation => results[evaluation.Index].PreciseTimeStamp >= detectionHorizon)
            .PerformOnLast(index =>
            {
                var result         = results[index];
                var previousResult = results[index - 1];
                Emit(context, "SizeDerivative", Severity.Warning,
                     $"Checkpoint size went from `{previousResult.TotalSize.ToSizeExpression()}` to `{result.TotalSize.ToSizeExpression()}`, which is higher than the threshold of `{_configuration.MaximumPercentualDifference * 100.0}%`",
                     $"Checkpoint size went from `{previousResult.TotalSize.ToSizeExpression()}` to `{result.TotalSize.ToSizeExpression()}`",
                     eventTimeUtc: result.PreciseTimeStamp);
            });

            var training   = new List <Result>();
            var prediction = new List <Result>();

            results.SplitBy(r => r.PreciseTimeStamp <= detectionHorizon, training, prediction);

            if (prediction.Count == 0)
            {
                _configuration.Logger.Error($"No checkpoints have been produced for at least {_configuration.AnomalyDetectionHorizon}");
                return;
            }

            prediction
            .Select(p => p.TotalSize)
            .NotInRange(_configuration.MinimumValidSizeBytes, _configuration.MaximumValidSizeBytes)
            .PerformOnLast(index => {
                var result = prediction[index];
                Emit(context, "SizeValidRange", Severity.Warning,
                     $"Checkpoint size `{result.TotalSize.ToSizeExpression()}` out of valid range [`{_configuration.MinimumValidSize}`, `{_configuration.MaximumValidSize}`]",
                     eventTimeUtc: result.PreciseTimeStamp);
            });

            if (training.Count < _configuration.MinimumTrainingPercentOfData * results.Count)
            {
                return;
            }

            var lookbackSizes = training.Select(r => r.TotalSize);
            var expectedMin   = (long)Math.Floor((1 - _configuration.MaximumGrowthWrtToLookback) * lookbackSizes.Min());
            var expectedMax   = (long)Math.Ceiling((1 + _configuration.MaximumGrowthWrtToLookback) * lookbackSizes.Max());

            prediction
            .Select(p => p.TotalSize)
            .NotInRange(expectedMin, expectedMax)
            .PerformOnLast(index => {
                var result = prediction[index];
                Emit(context, "SizeExpectedRange", Severity.Warning,
                     $"Checkpoint size `{result.TotalSize.ToSizeExpression()}` out of expected range [`{expectedMin.ToSizeExpression()}`, `{expectedMax.ToSizeExpression()}`]",
                     eventTimeUtc: result.PreciseTimeStamp);
            });
        }
Пример #20
0
#pragma warning restore CS0649

        public override async Task Run(RuleContext context)
        {
            var now   = _configuration.Clock.UtcNow;
            var query =
                $@"
                let end = now();
                let start = end - {CslTimeSpanLiteral.AsCslString(_configuration.LookbackPeriod)};
                let MasterEvents = CloudBuildLogEvent
                | where PreciseTimeStamp between (start .. end)
                | where Stamp == ""{_configuration.Stamp}""
                | where Service == ""{Constants.MasterServiceName}"";
                let MaximumDelayFromReceivedEvents = MasterEvents
                | where Message has ""ReceivedEvent""
                | parse Message with * ""ProcessingDelay="" Lag:timespan "","" *
                | project PreciseTimeStamp, Stamp, Machine, ServiceVersion, Message, Duration=Lag
                | summarize MaxDelay=max(Duration) by Stamp, PreciseTimeStamp=bin(PreciseTimeStamp, {CslTimeSpanLiteral.AsCslString(_configuration.BinWidth)});
                let MaximumDelayFromEH = MasterEvents
                | where (Message has ""EventHubContentLocationEventStore"" and Message has ""processed"") 
                | parse Message with * ""processed "" MessageCount:long ""message(s) by "" Duration:long ""ms"" * ""TotalMessagesSize="" TotalSize:long *
                | project PreciseTimeStamp, Stamp, MessageCount, Duration, TotalSize, Message, Machine
                | summarize MessageCount=count(), ProcessingDurationMs=percentile(Duration, 50), MaxProcessingDurationMs=max(Duration), SumMessageCount=sum(MessageCount), SumMessageSize=sum(TotalSize) by Stamp, Machine, PreciseTimeStamp=bin(PreciseTimeStamp, {CslTimeSpanLiteral.AsCslString(_configuration.BinWidth)});
                MaximumDelayFromReceivedEvents
                | join MaximumDelayFromEH on Stamp, PreciseTimeStamp
                | project Machine, PreciseTimeStamp, MaxDelay, AvgProcessingDuration=totimespan(ProcessingDurationMs*10000), MaxProcessingDuration=totimespan(MaxProcessingDurationMs*10000), MessageCount, SumMessageCount, SumMessageSizeMb=SumMessageSize / (1024*1024)
                | order by PreciseTimeStamp desc";
            var results = (await QuerySingleResultSetAsync <Result>(context, query)).ToList();

            if (results.Count == 0)
            {
                // If we reached this point, it means we don't have messages from master saying that it processed
                // anything. We now need to check if clients actually sent anything at all!
                var outstandingQuery = $@"
                    let end = now();
                    let start = end - {CslTimeSpanLiteral.AsCslString(_configuration.LookbackPeriod)};
                    let Events = CloudBuildLogEvent
                    | where PreciseTimeStamp between (start .. end)
                    | where Stamp == ""{_configuration.Stamp}""
                    | project PreciseTimeStamp, Service, Machine, Stamp, LogLevelFriendly, Message;
                    let Enqueued = Events
                    | where Service == ""{Constants.ServiceName}""
                    | where Message has ""EventHubContentLocationEventStore"" and Message has ""sending"" and Message has ""OpId""
                    | parse Message with * ""/"" NumEvents:long ""event."" * ""OpId="" OperationId "","" *
                    | project-away Message;
                    let Processed = Events
                    | where Service == ""{Constants.MasterServiceName}""
                    | where Message has ""EventHubContentLocationEventStore"" and Message has ""OpId""
                    | parse Message with * ""OpId="" OperationId "","" *
                    | project-away Message;
                    let Outstanding = Enqueued
                    | join kind=leftanti Processed on OperationId
                    | summarize OutstandingBatches=dcount(OperationId), OutstandingEvents=sum(NumEvents);
                    let Done = Processed
                    | summarize ProcessedBatches=dcount(OperationId);
                    Outstanding | extend dummy=1 | join kind=inner (Done | extend dummy=1) on dummy | project-away dummy, dummy1";

                var outstandingResults = (await QuerySingleResultSetAsync <Result2>(context, outstandingQuery)).ToList();

                if (outstandingResults.Count != 1)
                {
                    Emit(context, "NoLogs", Severity.Fatal,
                         $"No events processed for at least `{_configuration.LookbackPeriod}`",
                         eventTimeUtc: now);
                    return;
                }

                var result = outstandingResults[0];
                if (result.OutstandingBatches.HasValue && result.OutstandingBatches.Value > 0)
                {
                    if (result.ProcessedBatches.HasValue && result.ProcessedBatches.Value == 0)
                    {
                        Emit(context, "MasterStuck", Severity.Fatal,
                             $"Master hasn't processed any events in the last `{_configuration.LookbackPeriod}`, but has `{result.OutstandingBatches.Value}` batches pending.",
                             eventTimeUtc: now);
                    }
                }

                return;
            }

            var delay = results[0].MaxDelay;

            _configuration.Thresholds.Check(delay, (severity, threshold) =>
            {
                Emit(context, "DelayThreshold", severity,
                     $"EventHub processing delay `{delay}` above threshold `{threshold}`. Master is `{results[0].Machine}`",
                     eventTimeUtc: results[0].PreciseTimeStamp);
            });
        }
Пример #21
0
#pragma warning restore CS0649

        public override async Task Run(RuleContext context)
        {
            // There is a lot of drama here around whether the Launcher is running or not. This is because the launcher
            // launcher creates its own MemoryContentDirectory, so the only message that we can use to do this is used
            // for two different reasons.
            var query =
                $@"
                let end = now();
                let start = end - {CslTimeSpanLiteral.AsCslString(_configuration.LookbackPeriod)};
                table('{_configuration.CacheTableName}')
                | where PreciseTimeStamp between (start .. end)
                | summarize Total=dcount(Machine), Reimaged=dcountif(Machine, Message has 'MemoryContentDirectory starting with 0 entries from no file'), Machines=make_set_if(Machine, Message has 'MemoryContentDirectory starting with 0 entries from no file') by Stamp, Service
                | extend Machines=tostring(Machines)
                | where not(isnull(Total))";
            var results = (await QueryKustoAsync <Result>(context, query)).ToList();

            GroupByStampAndCallHelper(results, result => result.Stamp, rule);

            void rule(string stamp, List <Result> results)
            {
                if (results.Count == 0)
                {
                    return;
                }

                // We skip stamps with the Launcher because things get really weird. This should be removed in the near future.
                var isLauncherRunning = results.Any(r => r.Service.Equals(Constants.CacheService, StringComparison.InvariantCultureIgnoreCase));

                if (isLauncherRunning)
                {
                    return;
                }

                var result = results.Where(r => r.Service.Equals(Constants.ContentAddressableStoreService, StringComparison.InvariantCultureIgnoreCase)).First();

                if (result.Reimaged == 0)
                {
                    return;
                }

                var reimageRate = result.Reimaged / (double)result.Total;

                _configuration.ReimageRateThresholds.Check(
                    reimageRate,
                    (severity, cut) =>
                {
                    var examples = string.Empty;
                    if (!string.IsNullOrEmpty(result.Machines))
                    {
                        var machines = JsonSerializer.Deserialize <List <string> >(result.Machines) !;
                        examples     = ". Examples: " + string.Join(", ", machines.Take(_configuration.MaximumExamplesOnAlert).Select(m => $"`{m}`"));
                    }

                    Emit(
                        context,
                        "ReimageRate",
                        severity,
                        $"Detected `{Math.Round(reimageRate * 100.0, 4, MidpointRounding.AwayFromZero)}%` ({result.Reimaged} / {result.Total}) of the stamp has been reimaged in over the last `{_configuration.LookbackPeriod}` period{examples}",
                        stamp);
                });
            }
        }