#pragma warning restore CS0649 public override async Task Run(RuleContext context) { var now = _configuration.Clock.UtcNow; var query = $@" let end = now(); let start = end - {CslTimeSpanLiteral.AsCslString(_configuration.LookbackPeriod)}; table(""{_configuration.CacheTableName}"") | where PreciseTimeStamp between (start .. end) | where Stamp == ""{_configuration.Stamp}"" | where Service == ""{Constants.ServiceName}"" or Service == ""{Constants.MasterServiceName}"" | where Message has ""Critical error occurred:"" | project PreciseTimeStamp, Machine, Message | parse Message with * ""occurred: "" ErrorMessage:string "". Diagnostics: "" Exception:string | parse ErrorMessage with Operation:string "" stop "" * | parse Exception with ExceptionType:string "": "" ExceptionMessage ""\n"" * // Fetch first line of the exception | extend Operation=extract(""^((\\w+).(\\w+)).*"", 1, Operation) // Prune unnecessary parameters | project-away Message, ErrorMessage | where ExceptionType != ""System.UnauthorizedAccessException"" | summarize Machines=dcount(Machine, 2), Count=count() by ExceptionType, ExceptionMessage, Operation | where not(isnull(Machines))"; var results = await QueryKustoAsync <Result>(context, query); foreach (var result in results) { Emit(context, $"ContractViolations_Operation_{result.Operation}", Severity.Error, $"`{result.Machines}` machine(s) had `{result.Count}` contract violations (`{result.ExceptionType}`) in operation `{result.Operation}`. Example message: {result.ExceptionMessage}", $"`{result.Machines}` machine(s) had `{result.Count}` contract violations (`{result.ExceptionType}`) in operation `{result.Operation}`", eventTimeUtc: now); } }
#pragma warning restore CS0649 public override async Task Run(RuleContext context) { var now = _configuration.Clock.UtcNow; var query = $@" let end = now(); let start = end - {CslTimeSpanLiteral.AsCslString(_configuration.LookbackPeriod)}; table('{_configuration.CacheTableName}') | where PreciseTimeStamp between (start .. end) | where Result == '{Constants.ResultCode.CriticalFailure}' | project PreciseTimeStamp, Machine, Message, Stamp, Component, Operation | parse Message with * 'occurred: ' ErrorMessage:string 'Diagnostics=[' Exception:string | extend Operation = strcat(Component, '.', Operation) | parse Exception with ExceptionType:string ': ' ExceptionMessage '\n' * // Fetch first line of the exception | project-away Message, ErrorMessage | where ExceptionType != 'System.UnauthorizedAccessException' | summarize Machines=dcount(Machine, 2), Count=count() by ExceptionType, ExceptionMessage, Operation, Stamp | where not(isnull(Machines))"; var results = await QueryKustoAsync <Result>(context, query); foreach (var result in results) { Emit(context, $"ContractViolations_Operation_{result.Operation}", Severity.Error, $"`{result.Machines}` machine(s) had `{result.Count}` contract violations (`{result.ExceptionType}`) in operation `{result.Operation}`. Example message: {result.ExceptionMessage}", stamp: result.Stamp, $"`{result.Machines}` machine(s) had `{result.Count}` contract violations (`{result.ExceptionType}`) in operation `{result.Operation}`", eventTimeUtc: now); } }
#pragma warning restore CS0649 public override async Task Run(RuleContext context) { var now = _configuration.Clock.UtcNow; var query = $@" let end = now(); let start = end - {CslTimeSpanLiteral.AsCslString(_configuration.LookbackPeriod)}; CacheInvocationsWithErrors(""{_configuration.Stamp}"", start, end) | summarize Total=count(), Failed=countif(CacheImplicated) | extend FailureRate=(toreal(Failed)/toreal(Total)) | where not(isnull(Failed))"; var results = (await QueryKustoAsync <Result>(context, query)).ToList(); if (results.Count == 0) { Emit(context, "NoBuilds", Severity.Info, $"Stamp hasn't completed any builds in at least `{_configuration.LookbackPeriod}`", eventTimeUtc: now); return; } var failureRate = results[0].FailureRate; _configuration.FailureRateThresholds.Check(failureRate, (severity, threshold) => { Emit(context, "FailureRate", severity, $"Build failure rate `{Math.Round(failureRate * 100.0, 4, MidpointRounding.AwayFromZero)}%` over last `{_configuration.LookbackPeriod}``", eventTimeUtc: now); }); }
#pragma warning restore CS0649 public override async Task Run(RuleContext context) { var now = _configuration.Clock.UtcNow; var query = $@" let end = now(); let start = end - {CslTimeSpanLiteral.AsCslString(_configuration.LookbackPeriod)}; table(""{_configuration.CacheTableName}"") | where PreciseTimeStamp between (start .. end) | where Stamp == ""{_configuration.Stamp}"" | where Service == ""{Constants.MasterServiceName}"" | where Message contains ""CreateCheckpointAsync stop"" | summarize (PreciseTimeStamp, Machine)=arg_max(PreciseTimeStamp, Machine) | extend Age = end - PreciseTimeStamp | where not(isnull(PreciseTimeStamp))"; var results = (await QueryKustoAsync <Result>(context, query)).ToList(); if (results.Count == 0) { Emit(context, "NoLogs", Severity.Fatal, $"No checkpoints produced for at least {_configuration.LookbackPeriod}", eventTimeUtc: now); return; } var age = results[0].Age; _configuration.AgeThresholds.Check(age, (severity, threshold) => { Emit(context, "CreationThreshold", severity, $"Newest checkpoint age `{age}` above threshold `{threshold}`. Master is {results[0].Machine}", eventTimeUtc: results[0].PreciseTimeStamp); }); }
#pragma warning restore CS0649 public override async Task Run(RuleContext context) { // NOTE(jubayard): The Kusto ingestion delay needs to be taken into consideration to avoid the number of // machines being reported as much less than they actually are. var now = _configuration.Clock.UtcNow - Constants.KustoIngestionDelay; var query = $@" let end = now() - {CslTimeSpanLiteral.AsCslString(Constants.KustoIngestionDelay)}; let start = end - {CslTimeSpanLiteral.AsCslString(_configuration.LookbackPeriod)}; table(""{_configuration.CacheTableName}"") | where PreciseTimeStamp between (start .. end) | where Service == ""{Constants.ServiceName}"" or Service == ""{Constants.MasterServiceName}"" | where Stamp == ""{_configuration.Stamp}"" | summarize ActiveMachines=dcount(Machine, {_configuration.DistinctCountPrecision}) by bin(PreciseTimeStamp, {CslTimeSpanLiteral.AsCslString(_configuration.BinningPeriod)}) | sort by PreciseTimeStamp asc | where not(isnull(PreciseTimeStamp))"; var results = (await QueryKustoAsync <Result>(context, query)).ToList(); if (results.Count == 0) { Emit(context, "NoLogs", Severity.Fatal, $"Machines haven't produced any logs for at least `{_configuration.LookbackPeriod}`", eventTimeUtc: now); return; } var detectionHorizon = now - _configuration.AnomalyDetectionHorizon; var training = new List <Result>(); var prediction = new List <Result>(); results.SplitBy(r => r.PreciseTimeStamp <= detectionHorizon, training, prediction); if (prediction.Count == 0) { Emit(context, "NoLogs", Severity.Fatal, $"Machines haven't produced any logs for at least `{_configuration.AnomalyDetectionHorizon}`", eventTimeUtc: now); return; } if (training.Count < _configuration.MinimumTrainingPercentOfData * results.Count) { return; } var lookbackSizes = training.Select(r => r.ActiveMachines); var lookbackMin = (long)Math.Ceiling((1 - _configuration.MaximumGrowthWrtToLookback) * lookbackSizes.Min()); var lookbackMax = (long)Math.Floor((1 + _configuration.MaximumGrowthWrtToLookback) * lookbackSizes.Max()); prediction.Select(p => p.ActiveMachines).NotInRange(lookbackMin, lookbackMax).Perform(index => { Emit(context, "ExpectedRange", Severity.Warning, $"`{prediction[index].ActiveMachines}` active machines, outside of expected range [`{lookbackMin}`, `{lookbackMax}`]", eventTimeUtc: prediction[index].PreciseTimeStamp); }); }
#pragma warning restore CS0649 public override async Task Run(RuleContext context) { var now = _configuration.Clock.UtcNow; var query = $@" let end = now(); let start = end - {CslTimeSpanLiteral.AsCslString(_configuration.LookbackPeriod)}; let Exceptions = CloudBuildLogEvent | where PreciseTimeStamp between (start .. end) | where Service == ""{Constants.ServiceName}"" or Service == ""{Constants.MasterServiceName}"" | where Stamp == ""{_configuration.Stamp}"" | where Message has ""Unhandled Exception in service."" | project PreciseTimeStamp, Stamp, Machine, Service, Exception | extend KnownFormat=(Exception has ""Error=["" and Exception has ""Diagnostics=[""); let FormattedExceptions = Exceptions | where KnownFormat | parse Exception with ExceptionType:string "": Error=["" Error:string ""] Diagnostics=["" Diagnostics:string | parse Diagnostics with ExceptionTypeFromDiagnostics:string "": "" * | extend ExceptionType = iif(isnull(ExceptionTypeFromDiagnostics), ExceptionType, ExceptionTypeFromDiagnostics) | project-away ExceptionTypeFromDiagnostics | project-away Exception; let UnknownExceptions = Exceptions | where not(KnownFormat) | parse Exception with ExceptionType:string "": "" * | project-rename Diagnostics=Exception; FormattedExceptions | union UnknownExceptions | extend ExceptionType=iif(isempty(ExceptionType) or isnull(ExceptionType), ""Unknown"", ExceptionType) | extend ExceptionType=iif(ExceptionType has "" "", extract(""([A-Za-z0-9.]+) .*"", 1, ExceptionType), ExceptionType) | summarize Total=count(), LatestTimeUtc=max(PreciseTimeStamp), EarliestTimeUtc=min(PreciseTimeStamp) by Machine, ExceptionType | where not(isnull(Total));"; var results = (await QuerySingleResultSetAsync <Result>(context, query)).ToList(); if (results.Count == 0) { return; } foreach (var group in results.GroupBy(result => result.ExceptionType)) { var numberOfMachines = group.Select(result => result.Machine).Distinct().Count(); var numberOfExceptions = group.Sum(result => result.Total); var minTimeUtc = group.Min(result => result.EarliestTimeUtc); var eventTimeUtc = group.Max(result => result.LatestTimeUtc); BiThreshold(numberOfMachines, numberOfExceptions, _configuration.MachinesThresholds, _configuration.ExceptionsThresholds, (severity, machinesThreshold, exceptionsThreshold) => { var examples = string.Join(".\r\n", group .Select(result => $"\t`{result.Machine}` had `{result.Total}` restart(s) between `{result.EarliestTimeUtc}` and `{result.LatestTimeUtc}` UTC")); Emit(context, $"ServiceRestarts_{group.Key}", severity, $"`{numberOfMachines}` machine(s) had `{numberOfExceptions}` unexpected service restart(s) due to exception `{group.Key}` since `{now - minTimeUtc}` ago. Examples:\r\n{examples}", $"`{numberOfMachines}` machine(s) had `{numberOfExceptions}` unexpected service restart(s) due to exception `{group.Key}` since `{now - minTimeUtc}` ago", eventTimeUtc: eventTimeUtc); }); } }
#pragma warning restore CS0649 public override async Task Run(RuleContext context) { var now = _configuration.Clock.UtcNow; var query = $@" let lookbackBuilds = {_configuration.AmountOfBuilds}; let lookbackTime = {CslTimeSpanLiteral.AsCslString(_configuration.LookbackPeriod)}; let Builds = materialize(BuildInfo | where PreciseTimeStamp between (ago(lookbackTime) .. now()) | where InProbation != 1 | partition by OwnerStampId (top lookbackBuilds by EndTime)); let times = materialize( Builds | summarize TotalBuilds = count(), Start = min(StartTime), End = max(StartTime) by OwnerStampId | where TotalBuilds == lookbackBuilds); let start = toscalar(times| summarize min(Start)); let end = toscalar(times| summarize max(End)); let failures = CacheInvocationsWithErrors('', start, end) | where CacheImplicated; Builds | lookup kind=inner failures on $left.BuildId == $right.BuildId | summarize Failed = count() by Stamp = OwnerStampId"; var results = (await QueryKustoAsync <Result>(context, query)).ToList(); await GroupByStampAndCallHelperAsync <Result>(results, result => result.Stamp, buildFailuresHelper); async Task buildFailuresHelper(string stamp, List <Result> results) { if (results.Count == 0) { return; } var failed = results[0].Failed; var failureRate = failed * 1.0 / _configuration.AmountOfBuilds; _configuration.FailureRateThresholds.Check(failureRate, (severity, threshold) => { Emit(context, "FailureRate", severity, $"Build failure rate {Math.Round(failureRate * 100.0, 4, MidpointRounding.AwayFromZero)}%` over last `{_configuration.AmountOfBuilds} builds.``", stamp, eventTimeUtc: now); }); await _configuration.FailureRateIcmThresholds.CheckAsync(failureRate, (severity, threshold) => { return(EmitIcmAsync( severity, title: $"{stamp}: build failure rate over last {_configuration.AmountOfBuilds} builds is higher than {threshold*100}%", stamp, machines: null, correlationIds: null, description: $"Build failure rate `{failed}/{_configuration.AmountOfBuilds}={Math.Round(failureRate * 100.0, 4, MidpointRounding.AwayFromZero)}%` over last `{_configuration.AmountOfBuilds} builds``", eventTimeUtc: now, cacheTimeToLive: _configuration.IcmIncidentCacheTtl)); }); } }
#pragma warning restore CS0649 public override async Task Run(RuleContext context) { var query = $@" let end = now(); let start = end - {CslTimeSpanLiteral.AsCslString(_configuration.LookbackPeriod)}; let Machines = table('{_configuration.CacheTableName}') | where PreciseTimeStamp between (start .. end) | summarize Total=dcount(Machine) by Stamp; table('{_configuration.CacheTableName}') | where PreciseTimeStamp between (start .. end) | where Message has 'MemoryContentDirectory' and Message has 'starting with 0 entries from no file' | parse Message with 'MemoryContentDirectory(' Path ') starting ' * | project PreciseTimeStamp, Machine, Stamp, Path | where Path !has 'launcher' and Path !has 'proxy' and Path !has 'dcs' | summarize Reimaged = dcount(Machine), Machines = make_set(Machine) by Stamp | extend Machines = tostring(Machines) | lookup Machines on Stamp | where not(isnull(Total)) | where isnotempty(Stamp)"; var results = (await QueryKustoAsync <Result>(context, query)).ToList(); GroupByStampAndCallHelper(results, result => result.Stamp, rule); void rule(string stamp, List <Result> results) { if (results.Count == 0) { return; } var result = results[0]; var reimageRate = (double)result.Reimaged / (double)result.Total; _configuration.ReimageRateThresholds.Check( reimageRate, (severity, cut) => { var examples = string.Empty; if (!string.IsNullOrEmpty(result.Machines)) { var machines = JsonSerializer.Deserialize <List <string> >(result.Machines) !; examples = ". Examples: " + string.Join(", ", machines.Take(_configuration.MaximumExamplesOnAlert).Select(m => $"`{m}`")); } Emit( context, "ReimageRate", severity, $"Detected `{Math.Round(reimageRate * 100.0, 4, MidpointRounding.AwayFromZero)}%` ({result.Reimaged} / {result.Total}) of the stamp has been reimaged in over the last `{_configuration.LookbackPeriod}` period{examples}", stamp); }); } }
#pragma warning restore CS0649 public override async Task Run(RuleContext context) { var now = _configuration.Clock.UtcNow; var query = $@" let end = now(); let start = end - {CslTimeSpanLiteral.AsCslString(_configuration.LookbackPeriod)}; CacheInvocationsWithErrors('', start, end) | summarize Total=count(), Failed=countif(CacheImplicated) by Stamp | extend FailureRate=(toreal(Failed)/toreal(Total)) | where not(isnull(Failed))"; var results = (await QueryKustoAsync <Result>(context, query)).ToList(); await GroupByStampAndCallHelperAsync <Result>(results, result => result.Stamp, buildFailuresHelper); async Task buildFailuresHelper(string stamp, List <Result> results) { if (results.Count == 0) { Emit(context, "NoBuilds", Severity.Info, $"Stamp hasn't completed any builds in at least `{_configuration.LookbackPeriod}`", stamp, eventTimeUtc: now); return; } var failureRate = results[0].FailureRate; var failed = results[0].Failed; var total = results[0].Total; _configuration.FailureRateThresholds.Check(failureRate, (severity, threshold) => { Emit(context, "FailureRate", severity, $"Build failure rate `{failed}/{total}={Math.Round(failureRate * 100.0, 4, MidpointRounding.AwayFromZero)}%` over last `{_configuration.LookbackPeriod}``", stamp, eventTimeUtc: now); }); if (total >= _configuration.MinimumAmountOfBuildsForIcm) { await _configuration.FailureRateIcmThresholds.CheckAsync(failureRate, (severity, threshold) => { return(EmitIcmAsync( severity, title: $"{stamp}: build failure rate is higher than {threshold*100}%", stamp, machines: null, correlationIds: null, description: $"Build failure rate `{failed}/{total}={Math.Round(failureRate * 100.0, 4, MidpointRounding.AwayFromZero)}%` over last `{_configuration.LookbackPeriod}``", eventTimeUtc: now, cacheTimeToLive: _configuration.IcmIncidentCacheTtl)); }); } } }
#pragma warning restore CS0649 public override async Task Run(RuleContext context) { var now = _configuration.Clock.UtcNow; var query = $@" let end = now(); let start = end - {CslTimeSpanLiteral.AsCslString(_configuration.LookbackPeriod)}; table('{_configuration.CacheTableName}') | where PreciseTimeStamp between (start .. end) | where Role == 'Master' | where Operation == 'CreateCheckpointAsync' and isnotempty(Duration) | where Result == '{Constants.ResultCode.Success}' | summarize (PreciseTimeStamp, Machine)=arg_max(PreciseTimeStamp, Machine) by Stamp | extend Age = end - PreciseTimeStamp | where not(isnull(PreciseTimeStamp))"; var results = (await QueryKustoAsync <Result>(context, query)).ToList(); GroupByStampAndCallHelper <Result>(results, result => result.Stamp, lastProducedCheckpointHelper); void lastProducedCheckpointHelper(string stamp, List <Result> results) { if (results.Count == 0) { Emit(context, "NoLogs", Severity.Fatal, $"No checkpoints produced for at least {_configuration.LookbackPeriod}", stamp, eventTimeUtc: now); return; } var age = results[0].Age; _configuration.AgeThresholds.Check(age, (severity, threshold) => { Emit(context, "CreationThreshold", severity, $"Newest checkpoint age `{age}` above threshold `{threshold}`. Master is {results[0].Machine}", stamp, eventTimeUtc: results[0].PreciseTimeStamp); }); } }
#pragma warning restore CS0649 public override async Task Run(RuleContext context) { // NOTE(jubayard): When a summarize is run over an empty result set, Kusto produces a single (null) row, // which is why we need to filter it out. var now = _configuration.Clock.UtcNow; var query = $@" let end = now(); let start = end - {CslTimeSpanLiteral.AsCslString(_configuration.LookbackPeriod)}; table(""{_configuration.CacheTableName}"") | where PreciseTimeStamp between (start .. end) | where Stamp == ""{_configuration.Stamp}"" | where Service == ""{Constants.ServiceName}"" or Service == ""{Constants.MasterServiceName}"" | where Message has ""Unhandled exception in fire and forget task"" | where Message !has ""RedisConnectionException"" // This is a transient error (i.e. server closed the socket) | where Message !has ""TaskCanceledException"" // This is irrelevant | parse Message with * ""operation '"" Operation:string ""'"" * ""FullException="" Exception:string | project PreciseTimeStamp, Machine, Operation, Exception | summarize Machines=dcount(Machine), Count=count() by Operation | where not(isnull(Machines))"; var results = await QueryKustoAsync <Result>(context, query); foreach (var result in results) { _configuration.MachinesThresholds.Check(result.Machines, (severity, threshold) => { if (result.Count < _configuration.MinimumErrorsThreshold) { return; } Emit(context, $"FireAndForgetExceptions_Operation_{result.Operation}", severity, $"`{result.Machines}` machines had `{result.Count}` errors in fire and forget tasks for operation `{result.Operation}`", eventTimeUtc: now); }); } }
#pragma warning restore CS0649 public override async Task Run(RuleContext context) { var now = _configuration.Clock.UtcNow; var query = $@" let end = now(); let start = end - {CslTimeSpanLiteral.AsCslString(_configuration.LookbackPeriod)}; let activity = end - {CslTimeSpanLiteral.AsCslString(_configuration.ActivityPeriod)}; let masterActivity = end - {CslTimeSpanLiteral.AsCslString(_configuration.MasterActivityPeriod)}; let Events = table('{_configuration.CacheTableName}') | where PreciseTimeStamp between (start .. end); let Machines = Events | where PreciseTimeStamp >= activity | summarize LastActivityTime=max(PreciseTimeStamp) by Machine, Stamp | where not(isnull(Machine)); let CurrentMaster = Events | where PreciseTimeStamp >= masterActivity | where Role == 'Master' | where Operation == 'CreateCheckpointAsync' and isnotempty(Duration) | where Result == '{Constants.ResultCode.Success}' | summarize (PreciseTimeStamp, Master)=arg_max(PreciseTimeStamp, Machine) | project-away PreciseTimeStamp | where not(isnull(Master)); let MachinesWithRole = CurrentMaster | join hint.strategy=broadcast kind=rightouter Machines on $left.Master == $right.Machine | extend IsWorkerMachine=iif(isnull(Master) or isempty(Master), true, false) | project-away Master; let Restores = Events | where Operation == 'RestoreCheckpointAsync' and isnotempty(Duration) | where Result == '{Constants.ResultCode.Success}' | summarize LastRestoreTime=max(PreciseTimeStamp) by Machine; MachinesWithRole | where IsWorkerMachine | project-away IsWorkerMachine | join hint.strategy=broadcast kind=leftouter Restores on Machine | project-away Machine1 | extend Age=LastActivityTime - LastRestoreTime"; var results = (await QueryKustoAsync <Result>(context, query)).ToList(); GroupByStampAndCallHelper <Result>(results, result => result.Stamp, lastRestoredCheckpointHelper); void lastRestoredCheckpointHelper(string stamp, List <Result> results) { if (results.Count == 0) { Emit(context, "NoLogs", Severity.Fatal, $"No machines logged anything in the last `{_configuration.ActivityPeriod}`", stamp, eventTimeUtc: now); return; } var missing = new List <Result>(); var failures = new List <Result>(); foreach (var result in results) { if (!result.LastRestoreTime.HasValue) { missing.Add(result); continue; } Contract.AssertNotNull(result.Age); if (result.Age.Value >= _configuration.CheckpointAgeErrorThreshold) { failures.Add(result); } } _configuration.MissingRestoreMachinesThresholds.Check(missing.Count, (severity, threshold) => { var formattedMissing = missing.Select(m => $"`{m.Machine}`"); var machinesCsv = string.Join(", ", formattedMissing); var shortMachinesCsv = string.Join(", ", formattedMissing.Take(5)); Emit(context, "NoRestoresThreshold", severity, $"Found `{missing.Count}` machine(s) active in the last `{_configuration.ActivityPeriod}`, but without checkpoints restored in at least `{_configuration.LookbackPeriod}`: {machinesCsv}", stamp, $"`{missing.Count}` machine(s) haven't restored checkpoints in at least `{_configuration.LookbackPeriod}`. Examples: {shortMachinesCsv}", eventTimeUtc: now); }); _configuration.OldRestoreMachinesThresholds.Check(failures.Count, (severity, threshold) => { var formattedFailures = failures.Select(f => $"`{f.Machine}` ({f.Age ?? TimeSpan.Zero})"); var machinesCsv = string.Join(", ", formattedFailures); var shortMachinesCsv = string.Join(", ", formattedFailures.Take(5)); Emit(context, "OldRestores", severity, $"Found `{failures.Count}` machine(s) active in the last `{_configuration.ActivityPeriod}`, but with old checkpoints (at least `{_configuration.CheckpointAgeErrorThreshold}`): {machinesCsv}", stamp, $"`{failures.Count}` machine(s) have checkpoints older than `{_configuration.CheckpointAgeErrorThreshold}`. Examples: {shortMachinesCsv}", eventTimeUtc: now); }); } }
#pragma warning restore CS0649 public override async Task Run(RuleContext context) { var now = _configuration.Clock.UtcNow; var query = $@" let end = now(); let start = end - {CslTimeSpanLiteral.AsCslString(_configuration.LookbackPeriod)}; let binWidth = {CslTimeSpanLiteral.AsCslString(_configuration.BinWidth)}; let MasterEvents = table('{_configuration.CacheTableName}') | where PreciseTimeStamp between (start .. end) | where Role == 'Master' | where Component has 'EventHubContentLocationEventStore' or Component has 'ContentLocationEventStore'; let MaximumDelayFromReceivedEvents = MasterEvents | where Message has 'ReceivedEvent' | parse Message with * 'ProcessingDelay=' Lag:timespan ',' * | project PreciseTimeStamp, Stamp, Machine, ServiceVersion, Message, Duration=Lag | summarize MaxDelay=arg_max(Duration, Machine) by Stamp, PreciseTimeStamp=bin(PreciseTimeStamp, binWidth); MaximumDelayFromReceivedEvents | project Stamp, Machine, PreciseTimeStamp, MaxDelay | order by PreciseTimeStamp desc"; var results = (await QueryKustoAsync <Result>(context, query)).ToList(); await GroupByStampAndCallHelperAsync <Result>(results, result => result.Stamp, eventHubProcessingDelayHelper); async Task eventHubProcessingDelayHelper(string stamp, List <Result> results) { if (results.Count == 0) { // If we reached this point, it means we don't have messages from master saying that it processed // anything. We now need to check if clients actually sent anything at all! var outstandingQuery = $@" let end = now(); let start = end - {CslTimeSpanLiteral.AsCslString(_configuration.LookbackPeriod)}; let Events = table('{_configuration.CacheTableName}') | where PreciseTimeStamp between (start .. end) | where Stamp == '{stamp}' | where Component has 'EventHubContentLocationEventStore' or Component has 'ContentLocationEventStore' | project PreciseTimeStamp, CorrelationId, Role, Machine, Stamp, Message; let Enqueued = Events | where Role == 'Worker' | where Message has 'sending' and Message has 'OpId' | parse Message with * '/' NumEvents:long 'event.' * | project-away Message; let Processed = Events | where Role == 'Master' | where Message has 'OpId' | project-away Message; let Outstanding = Enqueued | join kind=leftanti Processed on CorrelationId | summarize OutstandingBatches=dcount(CorrelationId), OutstandingEvents=sum(NumEvents); let Done = Processed | summarize ProcessedBatches=dcount(CorrelationId); Outstanding | extend dummy=1 | join kind=inner (Done | extend dummy=1) on dummy | project-away dummy, dummy1"; var outstandingResults = (await QueryKustoAsync <Result2>(context, outstandingQuery)).ToList(); if (outstandingResults.Count != 1) { Emit(context, "NoLogs", Severity.Fatal, $"No events processed for at least `{_configuration.LookbackPeriod}`", stamp, eventTimeUtc: now); await EmitIcmAsync( severity : _configuration.Environment.IsProduction()? 2 : 3, title : $"{stamp}: master event processing issues", stamp, machines : null, correlationIds : null, description : $"No events processed for at least `{_configuration.LookbackPeriod}`", eventTimeUtc : now, cacheTimeToLive : _configuration.LookbackPeriod); return; } var result = outstandingResults[0]; if (result.OutstandingBatches.HasValue && result.OutstandingBatches.Value > 0) { if (result.ProcessedBatches.HasValue && result.ProcessedBatches.Value == 0) { Emit(context, "MasterStuck", Severity.Fatal, $"Master hasn't processed any events in the last `{_configuration.LookbackPeriod}`, but has `{result.OutstandingBatches.Value}` batches pending.", stamp, eventTimeUtc: now); await EmitIcmAsync( severity : _configuration.Environment.IsProduction()? 2 : 3, title : $"{stamp}: master event processing issues", stamp, machines : null, correlationIds : null, description : $"Master hasn't processed any events in the last `{_configuration.LookbackPeriod}`, but has `{result.OutstandingBatches.Value}` batches pending.", eventTimeUtc : now, cacheTimeToLive : _configuration.LookbackPeriod); } } return; } var delay = results[0].MaxDelay; _configuration.Thresholds.Check(delay, (severity, threshold) => { Emit(context, "DelayThreshold", severity, $"EventHub processing delay `{delay}` above threshold `{threshold}`. Master is `{results[0].Machine}`", stamp, eventTimeUtc: results[0].PreciseTimeStamp); }); await _configuration.EventProcessingDelayIcmThresholdsInHours.CheckAsync(delay.TotalHours, (severity, threshold) => { if (severity >= 2) { if (!_configuration.Environment.IsProduction()) { severity = 3; } else if (!TimeConstraints.BusinessHours.SatisfiedPST(utcNow: _configuration.Clock.UtcNow)) { severity = 3; } } return(EmitIcmAsync( severity, title: $"{stamp}: master event processing issues", stamp, machines: new[] { results[0].Machine }, correlationIds: null, description: $"EventHub processing delay `{delay}` above threshold `{threshold}`. Master is `{results[0].Machine}`", eventTimeUtc: now, cacheTimeToLive: _configuration.LookbackPeriod)); }); } }
/// <summary> /// Creates the schedule of rules that will be run. Also responsible for configuring them. /// </summary> private void CreateSchedule(Watchlist watchlist) { // TODO: per-stamp configuration (some stamps are more important than others, query frequency should reflect that) // TODO: query weight (how much does it cost). We should adapt scheduling policy to have lighter queries prioritize earlier than the others. // TODO: stamp configuration knowledge. Stamp configuration affects what our thresholds should be. We should reflect that here. // TODO: add jitter to rules, so that queries to Kusto are spread out over time instead of all at once OncePerEnvironment( arguments => { var configuration = new LastProducedCheckpointRule.Configuration(arguments.BaseConfiguration); return(Analysis.Utilities.Yield(new Instantiation() { Rule = new LastProducedCheckpointRule(configuration), PollingPeriod = TimeSpan.FromMinutes(40), })); }, watchlist); OncePerEnvironment(arguments => { var configuration = new LastRestoredCheckpointRule.Configuration(arguments.BaseConfiguration); return(Analysis.Utilities.Yield(new Instantiation() { Rule = new LastRestoredCheckpointRule(configuration), PollingPeriod = TimeSpan.FromMinutes(30), })); }, watchlist); OncePerEnvironment(arguments => { var configuration = new CheckpointSizeRule.Configuration(arguments.BaseConfiguration); return(Analysis.Utilities.Yield(new Instantiation() { Rule = new CheckpointSizeRule(configuration), PollingPeriod = configuration.AnomalyDetectionHorizon - TimeSpan.FromMinutes(5), })); }, watchlist); // TODO: this rule is too noisy and inaccurate, we should make it work again //OncePerStamp(arguments => //{ // var configuration = new ActiveMachinesRule.Configuration(arguments.BaseConfiguration); // return Analysis.Utilities.Yield(new Instantiation() // { // Rule = new ActiveMachinesRule(configuration), // PollingPeriod = configuration.AnomalyDetectionHorizon - TimeSpan.FromMinutes(5), // }); //}, watchlist); OncePerEnvironment(arguments => { var configuration = new EventHubProcessingDelayRule.Configuration(arguments.BaseConfiguration); return(Analysis.Utilities.Yield(new Instantiation() { Rule = new EventHubProcessingDelayRule(configuration), PollingPeriod = TimeSpan.FromMinutes(30), })); }, watchlist); OncePerEnvironment(arguments => { var configuration = new BuildFailuresRule.Configuration(arguments.BaseConfiguration); return(Analysis.Utilities.Yield(new Instantiation() { Rule = new BuildFailuresRule(configuration), PollingPeriod = TimeSpan.FromMinutes(45), })); }, watchlist); // TODO: fire-and-forget exceptions are now being reported on the dashboards. We should see if this can be recycled. //OncePerStamp(arguments => //{ // var configuration = new FireAndForgetExceptionsRule.Configuration(arguments.BaseConfiguration); // return Analysis.Utilities.Yield(new Instantiation() // { // Rule = new FireAndForgetExceptionsRule(configuration), // PollingPeriod = configuration.LookbackPeriod - TimeSpan.FromMinutes(5), // }); //}, watchlist); // TODO: this was just too noisy //OncePerStamp(arguments => //{ // var configuration = new ContractViolationsRule.Configuration(arguments.BaseConfiguration); // return Analysis.Utilities.Yield(new Instantiation() { // Rule = new ContractViolationsRule(configuration), // PollingPeriod = configuration.LookbackPeriod, // }); //}, watchlist); var failureChecks = new List <OperationFailureCheckRule.Check>() { new OperationFailureCheckRule.Check() { Match = "StartupAsync", Constraint = "Component != 'GrpcCopyClient'", }, new OperationFailureCheckRule.Check() { Match = "ShutdownAsync", }, new OperationFailureCheckRule.Check() { Match = "RestoreCheckpointAsync", }, new OperationFailureCheckRule.Check() { Match = "CreateCheckpointAsync", }, new OperationFailureCheckRule.Check() { Match = "ReconcileAsync", }, new OperationFailureCheckRule.Check() { Match = "ProcessEventsCoreAsync", }, new OperationFailureCheckRule.Check() { // TODO(jubayard): lower severity Match = "SendEventsCoreAsync", }, }; OncePerEnvironment(arguments => { return(failureChecks.Select(check => { var configuration = new OperationFailureCheckRule.Configuration(arguments.BaseConfiguration, check); return new Instantiation() { Rule = new OperationFailureCheckRule(configuration), PollingPeriod = configuration.LookbackPeriod - TimeSpan.FromMinutes(5), }; })); }, watchlist); var performanceChecks = new List <OperationPerformanceOutliersRule.DynamicCheck>() { new OperationPerformanceOutliersRule.DynamicCheck() { LookbackPeriod = TimeSpan.FromMinutes(60), DetectionPeriod = TimeSpan.FromMinutes(30), Match = "LocalCacheServer.StartupAsync", Constraint = $"Duration >= {CslTimeSpanLiteral.AsCslString(TimeSpan.FromMinutes(1))}", }, new OperationPerformanceOutliersRule.DynamicCheck() { LookbackPeriod = TimeSpan.FromHours(12), DetectionPeriod = TimeSpan.FromHours(1), Match = "CheckpointManager.CreateCheckpointAsync", Constraint = $"Duration >= {CslTimeSpanLiteral.AsCslString(TimeSpan.FromMinutes(1))}", }, new OperationPerformanceOutliersRule.DynamicCheck() { LookbackPeriod = TimeSpan.FromHours(12), DetectionPeriod = TimeSpan.FromHours(1), Match = "CheckpointManager.RestoreCheckpointAsync", Constraint = $"Duration >= P95 and P95 >= {CslTimeSpanLiteral.AsCslString(TimeSpan.FromMinutes(30))}", }, }; OncePerEnvironment(arguments => { return(performanceChecks.Select(check => { var configuration = new OperationPerformanceOutliersRule.Configuration(arguments.BaseConfiguration, check); return new Instantiation { Rule = new OperationPerformanceOutliersRule(configuration), PollingPeriod = check.DetectionPeriod - TimeSpan.FromMinutes(5), }; })); }, watchlist); OncePerEnvironment(arguments => { var configuration = new ServiceRestartsRule.Configuration(arguments.BaseConfiguration); return(Analysis.Utilities.Yield(new Instantiation() { Rule = new ServiceRestartsRule(configuration), PollingPeriod = TimeSpan.FromMinutes(30), })); }, watchlist); OncePerEnvironment(arguments => { var configuration = new LongCopyRule.Configuration(arguments.BaseConfiguration); return(Analysis.Utilities.Yield(new Instantiation() { Rule = new LongCopyRule(configuration), PollingPeriod = TimeSpan.FromMinutes(30), })); }, watchlist); OncePerEnvironment(arguments => { var configuration = new MachineReimagesRule.Configuration(arguments.BaseConfiguration); return(Analysis.Utilities.Yield(new Instantiation() { Rule = new MachineReimagesRule(configuration), PollingPeriod = TimeSpan.FromMinutes(30), })); }, watchlist); OncePerEnvironment(arguments => { var configuration = new DiskCorruptionRule.Configuration(arguments.BaseConfiguration); return(Analysis.Utilities.Yield(new Instantiation() { Rule = new DiskCorruptionRule(configuration), PollingPeriod = TimeSpan.FromMinutes(30), })); }, watchlist); OncePerStamp(GenerateRedisAutoscalingRules, watchlist); }
#pragma warning restore CS0649 public override async Task Run(RuleContext context) { var now = _configuration.Clock.UtcNow; var query = $@" let end = now(); let start = end - {CslTimeSpanLiteral.AsCslString(_configuration.LookbackPeriod)}; let activity = end - {CslTimeSpanLiteral.AsCslString(_configuration.ActivityPeriod)}; let masterActivity = end - {CslTimeSpanLiteral.AsCslString(_configuration.MasterActivityPeriod)}; let Events = CloudBuildLogEvent | where PreciseTimeStamp between (start .. end) | where Stamp == ""{_configuration.Stamp}"" | where Service == ""{Constants.ServiceName}"" or Service == ""{Constants.MasterServiceName}""; let Machines = Events | where PreciseTimeStamp >= activity | summarize LastActivityTime=max(PreciseTimeStamp) by Machine | where not(isnull(Machine)); let CurrentMaster = Events | where PreciseTimeStamp >= masterActivity | where Service == ""{Constants.MasterServiceName}"" | where Message has ""CreateCheckpointAsync stop"" | summarize (PreciseTimeStamp, Master)=arg_max(PreciseTimeStamp, Machine) | project-away PreciseTimeStamp | where not(isnull(Master)); let MachinesWithRole = CurrentMaster | join hint.strategy=broadcast kind=rightouter Machines on $left.Master == $right.Machine | extend IsWorkerMachine=iif(isnull(Master) or isempty(Master), true, false) | project-away Master; let Restores = Events | where Message has ""RestoreCheckpointAsync stop"" | summarize LastRestoreTime=max(PreciseTimeStamp) by Machine; MachinesWithRole | where IsWorkerMachine | project-away IsWorkerMachine | join hint.strategy=broadcast kind=leftouter Restores on Machine | project-away Machine1 | extend Age=LastActivityTime - LastRestoreTime"; var results = (await QuerySingleResultSetAsync <Result>(context, query)).ToList(); if (results.Count == 0) { Emit(context, "NoLogs", Severity.Fatal, $"No machines logged anything in the last `{_configuration.ActivityPeriod}`", eventTimeUtc: now); return; } var missing = new List <Result>(); var failures = new List <Result>(); foreach (var result in results) { if (!result.LastRestoreTime.HasValue) { missing.Add(result); continue; } if (result.Age.Value >= _configuration.CheckpointAgeErrorThreshold) { failures.Add(result); } } _configuration.MissingRestoreMachinesThresholds.Check(missing.Count, (severity, threshold) => { var formattedMissing = missing.Select(m => $"`{m.Machine}`"); var machinesCsv = string.Join(", ", formattedMissing); var shortMachinesCsv = string.Join(", ", formattedMissing.Take(5)); Emit(context, "NoRestoresThreshold", severity, $"Found `{missing.Count}` machine(s) active in the last `{_configuration.ActivityPeriod}`, but without checkpoints restored in at least `{_configuration.LookbackPeriod}`: {machinesCsv}", $"`{missing.Count}` machine(s) haven't restored checkpoints in at least `{_configuration.LookbackPeriod}`. Examples: {shortMachinesCsv}", eventTimeUtc: now); }); _configuration.OldRestoreMachinesThresholds.Check(failures.Count, (severity, threshold) => { var formattedFailures = failures.Select(f => $"`{f.Machine}` ({f.Age.Value})"); var machinesCsv = string.Join(", ", formattedFailures); var shortMachinesCsv = string.Join(", ", formattedFailures.Take(5)); Emit(context, "OldRestores", severity, $"Found `{failures.Count}` machine(s) active in the last `{_configuration.ActivityPeriod}`, but with old checkpoints (at least `{_configuration.CheckpointAgeErrorThreshold}`): {machinesCsv}", $"`{failures.Count}` machine(s) have checkpoints older than `{_configuration.CheckpointAgeErrorThreshold}`. Examples: {shortMachinesCsv}", eventTimeUtc: now); }); }
#pragma warning restore CS0649 public override async Task Run(RuleContext context) { var now = _configuration.Clock.UtcNow; var query = $@" let end = now(); let start = end - {CslTimeSpanLiteral.AsCslString(_configuration.LookbackPeriod)}; let Exceptions = table('{_configuration.CacheTableName}') | where PreciseTimeStamp between (start .. end) | where Message has 'Unhandled Exception in service.' | project PreciseTimeStamp, Stamp, Machine, Service, Exception | extend KnownFormat=(Exception has 'Error=[' and Exception has 'Diagnostics=['); let FormattedExceptions = Exceptions | where KnownFormat | parse Exception with ExceptionType:string ': Error=[' Error:string '] Diagnostics=[' Diagnostics:string | parse Diagnostics with ExceptionTypeFromDiagnostics:string ': ' * | extend ExceptionType = iif(isnull(ExceptionTypeFromDiagnostics), ExceptionType, ExceptionTypeFromDiagnostics) | project-away ExceptionTypeFromDiagnostics | project-away Exception; let UnknownExceptions = Exceptions | where not(KnownFormat) | parse Exception with ExceptionType:string ': ' * | project-rename Diagnostics=Exception; FormattedExceptions | union UnknownExceptions | extend ExceptionType=iif(isempty(ExceptionType) or isnull(ExceptionType), 'Unknown', ExceptionType) | extend ExceptionType=iif(ExceptionType has ' ', extract('([A-Za-z0-9.]+) .*', 1, ExceptionType), ExceptionType) | summarize Total=count(), LatestTimeUtc=max(PreciseTimeStamp), EarliestTimeUtc=min(PreciseTimeStamp) by Machine, ExceptionType, Stamp | where not(isnull(Total));"; var results = (await QueryKustoAsync <Result>(context, query)).ToList(); GroupByStampAndCallHelper <Result>(results, result => result.Stamp, serviceRestartsHelper); void serviceRestartsHelper(string stamp, List <Result> results) { if (results.Count == 0) { return; } foreach (var group in results.GroupBy(result => result.ExceptionType)) { var numberOfMachines = group.Select(result => result.Machine).Distinct().Count(); var numberOfExceptions = group.Sum(result => result.Total); var minTimeUtc = group.Min(result => result.EarliestTimeUtc); var eventTimeUtc = group.Max(result => result.LatestTimeUtc); BiThreshold(numberOfMachines, numberOfExceptions, _configuration.MachinesThresholds, _configuration.ExceptionsThresholds, (severity, machinesThreshold, exceptionsThreshold) => { var examples = string.Join(".\r\n", group .Select(result => $"\t`{result.Machine}` had `{result.Total}` restart(s) between `{result.EarliestTimeUtc}` and `{result.LatestTimeUtc}` UTC")); Emit(context, $"ServiceRestarts_{group.Key}", severity, $"`{numberOfMachines}` machine(s) had `{numberOfExceptions}` unexpected service restart(s) due to exception `{group.Key}` since `{now - minTimeUtc}` ago. Examples:\r\n{examples}", stamp, $"`{numberOfMachines}` machine(s) had `{numberOfExceptions}` unexpected service restart(s) due to exception `{group.Key}` since `{now - minTimeUtc}` ago", eventTimeUtc: eventTimeUtc); }); } } }
#pragma warning restore CS0649 public override async Task Run(RuleContext context) { var now = _configuration.Clock.UtcNow; var query = $@" let end = now(); let start = end - {CslTimeSpanLiteral.AsCslString(_configuration.LookbackPeriod)}; let MasterEvents = table('{_configuration.CacheTableName}') | where PreciseTimeStamp between (start .. end) | where Role == 'Master' | where Component has 'EventHubContentLocationEventStore'; let MaximumDelayFromReceivedEvents = MasterEvents | where Message has 'ReceivedEvent' | parse Message with * 'ProcessingDelay=' Lag:timespan ',' * | project PreciseTimeStamp, Stamp, Machine, ServiceVersion, Message, Duration=Lag | summarize MaxDelay=max(Duration) by Stamp, PreciseTimeStamp=bin(PreciseTimeStamp, {CslTimeSpanLiteral.AsCslString(_configuration.BinWidth)}); let MaximumDelayFromEH = MasterEvents | where Message has 'processed' | parse Message with * 'processed ' MessageCount:long 'message(s) by ' Duration:long 'ms' * 'TotalMessagesSize=' TotalSize:long * | project PreciseTimeStamp, Stamp, MessageCount, Duration, TotalSize, Message, Machine | summarize MessageCount=count(), ProcessingDurationMs=percentile(Duration, 50), MaxProcessingDurationMs=max(Duration), SumMessageCount=sum(MessageCount), SumMessageSize=sum(TotalSize) by Stamp, Machine, PreciseTimeStamp=bin(PreciseTimeStamp, {CslTimeSpanLiteral.AsCslString(_configuration.BinWidth)}); MaximumDelayFromReceivedEvents | join MaximumDelayFromEH on Stamp, PreciseTimeStamp | project Stamp, Machine, PreciseTimeStamp, MaxDelay, AvgProcessingDuration=totimespan(ProcessingDurationMs*10000), MaxProcessingDuration=totimespan(MaxProcessingDurationMs*10000), MessageCount, SumMessageCount, SumMessageSizeMb=SumMessageSize / (1024*1024) | order by PreciseTimeStamp desc"; var results = (await QueryKustoAsync <Result>(context, query)).ToList(); GroupByStampAndCallHelper <Result>(results, result => result.Stamp, eventHubProcessingDelayHelper); async void eventHubProcessingDelayHelper(string stamp, List <Result> results) { if (results.Count == 0) { // If we reached this point, it means we don't have messages from master saying that it processed // anything. We now need to check if clients actually sent anything at all! var outstandingQuery = $@" let end = now(); let start = end - {CslTimeSpanLiteral.AsCslString(_configuration.LookbackPeriod)}; let Events = table('{_configuration.CacheTableName}') | where PreciseTimeStamp between (start .. end) | where Stamp == '{stamp}' | where Component has 'EventHubContentLocationEventStore' | project PreciseTimeStamp, Role, Machine, Stamp, Message; let Enqueued = Events | where Role == 'Worker' | where Message has 'sending' and Message has 'OpId' | parse Message with * '/' NumEvents:long 'event.' * 'OpId=' OperationId ',' * | project-away Message; let Processed = Events | where Role == 'Master' | where Message has 'OpId' | parse Message with * 'OpId=' OperationId ',' * | project-away Message; let Outstanding = Enqueued | join kind=leftanti Processed on OperationId | summarize OutstandingBatches=dcount(OperationId), OutstandingEvents=sum(NumEvents); let Done = Processed | summarize ProcessedBatches=dcount(OperationId); Outstanding | extend dummy=1 | join kind=inner (Done | extend dummy=1) on dummy | project-away dummy, dummy1"; var outstandingResults = (await QueryKustoAsync <Result2>(context, outstandingQuery)).ToList(); if (outstandingResults.Count != 1) { Emit(context, "NoLogs", Severity.Fatal, $"No events processed for at least `{_configuration.LookbackPeriod}`", stamp, eventTimeUtc: now); return; } var result = outstandingResults[0]; if (result.OutstandingBatches.HasValue && result.OutstandingBatches.Value > 0) { if (result.ProcessedBatches.HasValue && result.ProcessedBatches.Value == 0) { Emit(context, "MasterStuck", Severity.Fatal, $"Master hasn't processed any events in the last `{_configuration.LookbackPeriod}`, but has `{result.OutstandingBatches.Value}` batches pending.", stamp, eventTimeUtc: now); } } return; } var delay = results[0].MaxDelay; _configuration.Thresholds.Check(delay, (severity, threshold) => { Emit(context, "DelayThreshold", severity, $"EventHub processing delay `{delay}` above threshold `{threshold}`. Master is `{results[0].Machine}`", stamp, eventTimeUtc: results[0].PreciseTimeStamp); }); } }
#pragma warning restore CS0649 public override async Task Run(RuleContext context) { var now = _configuration.Clock.UtcNow; var query = $@" let end = now(); let start = end - {CslTimeSpanLiteral.AsCslString(_configuration.LookbackPeriod)}; table(""{_configuration.CacheTableName}"") | where PreciseTimeStamp between (start .. end) | where Role == 'Master' | where Operation == 'CreateCheckpointAsync' and Component == 'CheckpointManager' and isnotempty(Duration) | where Result == '{Constants.ResultCode.Success}' | project PreciseTimeStamp, Message, Stamp | parse Message with * 'SizeMb=[' SizeMb:double ']' * | parse Message with * ' SizeOnDiskMb = ' SizeOnDiskMb:double ' ' * | extend SizeMb = coalesce(SizeMb, SizeOnDiskMb) | project PreciseTimeStamp, TotalSize=tolong(SizeMb * 1000000), Stamp | sort by PreciseTimeStamp asc"; var results = (await QueryKustoAsync <Result>(context, query)).ToList(); GroupByStampAndCallHelper <Result>(results, result => result.Stamp, checkpointSizeHelper); void checkpointSizeHelper(string stamp, List <Result> results) { if (results.Count == 0) { _configuration.Logger.Error($"No checkpoints have been produced for at least {_configuration.LookbackPeriod}"); return; } var detectionHorizon = now - _configuration.AnomalyDetectionHorizon; results .Select(r => (double)r.TotalSize) .OverPercentualDifference(_configuration.MaximumPercentualDifference) .Where(evaluation => results[evaluation.Index].PreciseTimeStamp >= detectionHorizon) .PerformOnLast(index => { var result = results[index]; var previousResult = results[index - 1]; Emit(context, "SizeDerivative", Severity.Warning, $"Checkpoint size went from `{previousResult.TotalSize.ToSizeExpression()}` to `{result.TotalSize.ToSizeExpression()}`, which is higher than the threshold of `{_configuration.MaximumPercentualDifference * 100.0}%`", stamp, $"Checkpoint size went from `{previousResult.TotalSize.ToSizeExpression()}` to `{result.TotalSize.ToSizeExpression()}`", eventTimeUtc: result.PreciseTimeStamp); }); var training = new List <Result>(); var prediction = new List <Result>(); results.SplitBy(r => r.PreciseTimeStamp <= detectionHorizon, training, prediction); if (prediction.Count == 0) { _configuration.Logger.Error($"No checkpoints have been produced for at least {_configuration.AnomalyDetectionHorizon}"); return; } prediction .Select(p => p.TotalSize) .NotInRange(_configuration.MinimumValidSizeBytes, _configuration.MaximumValidSizeBytes) .PerformOnLast(index => { var result = prediction[index]; Emit(context, "SizeValidRange", Severity.Warning, $"Checkpoint size `{result.TotalSize.ToSizeExpression()}` out of valid range [`{_configuration.MinimumValidSize}`, `{_configuration.MaximumValidSize}`]", stamp, eventTimeUtc: result.PreciseTimeStamp); }); if (training.Count < _configuration.MinimumTrainingPercentOfData * results.Count) { return; } var lookbackSizes = training.Select(r => r.TotalSize); var expectedMin = (long)Math.Floor((1 - _configuration.MaximumGrowthWrtToLookback) * lookbackSizes.Min()); var expectedMax = (long)Math.Ceiling((1 + _configuration.MaximumGrowthWrtToLookback) * lookbackSizes.Max()); prediction .Select(p => p.TotalSize) .NotInRange(expectedMin, expectedMax) .PerformOnLast(index => { var result = prediction[index]; Emit(context, "SizeExpectedRange", Severity.Warning, $"Checkpoint size `{result.TotalSize.ToSizeExpression()}` out of expected range [`{expectedMin.ToSizeExpression()}`, `{expectedMax.ToSizeExpression()}`]", stamp, eventTimeUtc: result.PreciseTimeStamp); }); } }
#pragma warning restore CS0649 public override async Task Run(RuleContext context) { var now = _configuration.Clock.UtcNow; var query = $@" let end = now(); let start = end - {CslTimeSpanLiteral.AsCslString(_configuration.LookbackPeriod)}; CloudBuildLogEvent | where PreciseTimeStamp between (start .. end) | where Service == ""{Constants.MasterServiceName}"" | where Stamp == ""{_configuration.Stamp}"" | where Message has ""Touching blob"" or Message has ""Uploading blob"" | project PreciseTimeStamp, Machine, Message | parse Message with Id "" "" * ""of size "" Size: long "" "" * | summarize PreciseTimeStamp = min(PreciseTimeStamp), TotalSize = sum(Size) by Id | project PreciseTimeStamp, TotalSize | sort by PreciseTimeStamp asc | where not(isnull(PreciseTimeStamp))"; var results = (await QuerySingleResultSetAsync <Result>(context, query)).ToList(); if (results.Count == 0) { _configuration.Logger.Error($"No checkpoints have been produced for at least {_configuration.LookbackPeriod}"); return; } var detectionHorizon = now - _configuration.AnomalyDetectionHorizon; results .Select(r => (double)r.TotalSize) .OverPercentualDifference(_configuration.MaximumPercentualDifference) .Where(evaluation => results[evaluation.Index].PreciseTimeStamp >= detectionHorizon) .PerformOnLast(index => { var result = results[index]; var previousResult = results[index - 1]; Emit(context, "SizeDerivative", Severity.Warning, $"Checkpoint size went from `{previousResult.TotalSize.ToSizeExpression()}` to `{result.TotalSize.ToSizeExpression()}`, which is higher than the threshold of `{_configuration.MaximumPercentualDifference * 100.0}%`", $"Checkpoint size went from `{previousResult.TotalSize.ToSizeExpression()}` to `{result.TotalSize.ToSizeExpression()}`", eventTimeUtc: result.PreciseTimeStamp); }); var training = new List <Result>(); var prediction = new List <Result>(); results.SplitBy(r => r.PreciseTimeStamp <= detectionHorizon, training, prediction); if (prediction.Count == 0) { _configuration.Logger.Error($"No checkpoints have been produced for at least {_configuration.AnomalyDetectionHorizon}"); return; } prediction .Select(p => p.TotalSize) .NotInRange(_configuration.MinimumValidSizeBytes, _configuration.MaximumValidSizeBytes) .PerformOnLast(index => { var result = prediction[index]; Emit(context, "SizeValidRange", Severity.Warning, $"Checkpoint size `{result.TotalSize.ToSizeExpression()}` out of valid range [`{_configuration.MinimumValidSize}`, `{_configuration.MaximumValidSize}`]", eventTimeUtc: result.PreciseTimeStamp); }); if (training.Count < _configuration.MinimumTrainingPercentOfData * results.Count) { return; } var lookbackSizes = training.Select(r => r.TotalSize); var expectedMin = (long)Math.Floor((1 - _configuration.MaximumGrowthWrtToLookback) * lookbackSizes.Min()); var expectedMax = (long)Math.Ceiling((1 + _configuration.MaximumGrowthWrtToLookback) * lookbackSizes.Max()); prediction .Select(p => p.TotalSize) .NotInRange(expectedMin, expectedMax) .PerformOnLast(index => { var result = prediction[index]; Emit(context, "SizeExpectedRange", Severity.Warning, $"Checkpoint size `{result.TotalSize.ToSizeExpression()}` out of expected range [`{expectedMin.ToSizeExpression()}`, `{expectedMax.ToSizeExpression()}`]", eventTimeUtc: result.PreciseTimeStamp); }); }
#pragma warning restore CS0649 public override async Task Run(RuleContext context) { var now = _configuration.Clock.UtcNow; var query = $@" let end = now(); let start = end - {CslTimeSpanLiteral.AsCslString(_configuration.LookbackPeriod)}; let MasterEvents = CloudBuildLogEvent | where PreciseTimeStamp between (start .. end) | where Stamp == ""{_configuration.Stamp}"" | where Service == ""{Constants.MasterServiceName}""; let MaximumDelayFromReceivedEvents = MasterEvents | where Message has ""ReceivedEvent"" | parse Message with * ""ProcessingDelay="" Lag:timespan "","" * | project PreciseTimeStamp, Stamp, Machine, ServiceVersion, Message, Duration=Lag | summarize MaxDelay=max(Duration) by Stamp, PreciseTimeStamp=bin(PreciseTimeStamp, {CslTimeSpanLiteral.AsCslString(_configuration.BinWidth)}); let MaximumDelayFromEH = MasterEvents | where (Message has ""EventHubContentLocationEventStore"" and Message has ""processed"") | parse Message with * ""processed "" MessageCount:long ""message(s) by "" Duration:long ""ms"" * ""TotalMessagesSize="" TotalSize:long * | project PreciseTimeStamp, Stamp, MessageCount, Duration, TotalSize, Message, Machine | summarize MessageCount=count(), ProcessingDurationMs=percentile(Duration, 50), MaxProcessingDurationMs=max(Duration), SumMessageCount=sum(MessageCount), SumMessageSize=sum(TotalSize) by Stamp, Machine, PreciseTimeStamp=bin(PreciseTimeStamp, {CslTimeSpanLiteral.AsCslString(_configuration.BinWidth)}); MaximumDelayFromReceivedEvents | join MaximumDelayFromEH on Stamp, PreciseTimeStamp | project Machine, PreciseTimeStamp, MaxDelay, AvgProcessingDuration=totimespan(ProcessingDurationMs*10000), MaxProcessingDuration=totimespan(MaxProcessingDurationMs*10000), MessageCount, SumMessageCount, SumMessageSizeMb=SumMessageSize / (1024*1024) | order by PreciseTimeStamp desc"; var results = (await QuerySingleResultSetAsync <Result>(context, query)).ToList(); if (results.Count == 0) { // If we reached this point, it means we don't have messages from master saying that it processed // anything. We now need to check if clients actually sent anything at all! var outstandingQuery = $@" let end = now(); let start = end - {CslTimeSpanLiteral.AsCslString(_configuration.LookbackPeriod)}; let Events = CloudBuildLogEvent | where PreciseTimeStamp between (start .. end) | where Stamp == ""{_configuration.Stamp}"" | project PreciseTimeStamp, Service, Machine, Stamp, LogLevelFriendly, Message; let Enqueued = Events | where Service == ""{Constants.ServiceName}"" | where Message has ""EventHubContentLocationEventStore"" and Message has ""sending"" and Message has ""OpId"" | parse Message with * ""/"" NumEvents:long ""event."" * ""OpId="" OperationId "","" * | project-away Message; let Processed = Events | where Service == ""{Constants.MasterServiceName}"" | where Message has ""EventHubContentLocationEventStore"" and Message has ""OpId"" | parse Message with * ""OpId="" OperationId "","" * | project-away Message; let Outstanding = Enqueued | join kind=leftanti Processed on OperationId | summarize OutstandingBatches=dcount(OperationId), OutstandingEvents=sum(NumEvents); let Done = Processed | summarize ProcessedBatches=dcount(OperationId); Outstanding | extend dummy=1 | join kind=inner (Done | extend dummy=1) on dummy | project-away dummy, dummy1"; var outstandingResults = (await QuerySingleResultSetAsync <Result2>(context, outstandingQuery)).ToList(); if (outstandingResults.Count != 1) { Emit(context, "NoLogs", Severity.Fatal, $"No events processed for at least `{_configuration.LookbackPeriod}`", eventTimeUtc: now); return; } var result = outstandingResults[0]; if (result.OutstandingBatches.HasValue && result.OutstandingBatches.Value > 0) { if (result.ProcessedBatches.HasValue && result.ProcessedBatches.Value == 0) { Emit(context, "MasterStuck", Severity.Fatal, $"Master hasn't processed any events in the last `{_configuration.LookbackPeriod}`, but has `{result.OutstandingBatches.Value}` batches pending.", eventTimeUtc: now); } } return; } var delay = results[0].MaxDelay; _configuration.Thresholds.Check(delay, (severity, threshold) => { Emit(context, "DelayThreshold", severity, $"EventHub processing delay `{delay}` above threshold `{threshold}`. Master is `{results[0].Machine}`", eventTimeUtc: results[0].PreciseTimeStamp); }); }
#pragma warning restore CS0649 public override async Task Run(RuleContext context) { // There is a lot of drama here around whether the Launcher is running or not. This is because the launcher // launcher creates its own MemoryContentDirectory, so the only message that we can use to do this is used // for two different reasons. var query = $@" let end = now(); let start = end - {CslTimeSpanLiteral.AsCslString(_configuration.LookbackPeriod)}; table('{_configuration.CacheTableName}') | where PreciseTimeStamp between (start .. end) | summarize Total=dcount(Machine), Reimaged=dcountif(Machine, Message has 'MemoryContentDirectory starting with 0 entries from no file'), Machines=make_set_if(Machine, Message has 'MemoryContentDirectory starting with 0 entries from no file') by Stamp, Service | extend Machines=tostring(Machines) | where not(isnull(Total))"; var results = (await QueryKustoAsync <Result>(context, query)).ToList(); GroupByStampAndCallHelper(results, result => result.Stamp, rule); void rule(string stamp, List <Result> results) { if (results.Count == 0) { return; } // We skip stamps with the Launcher because things get really weird. This should be removed in the near future. var isLauncherRunning = results.Any(r => r.Service.Equals(Constants.CacheService, StringComparison.InvariantCultureIgnoreCase)); if (isLauncherRunning) { return; } var result = results.Where(r => r.Service.Equals(Constants.ContentAddressableStoreService, StringComparison.InvariantCultureIgnoreCase)).First(); if (result.Reimaged == 0) { return; } var reimageRate = result.Reimaged / (double)result.Total; _configuration.ReimageRateThresholds.Check( reimageRate, (severity, cut) => { var examples = string.Empty; if (!string.IsNullOrEmpty(result.Machines)) { var machines = JsonSerializer.Deserialize <List <string> >(result.Machines) !; examples = ". Examples: " + string.Join(", ", machines.Take(_configuration.MaximumExamplesOnAlert).Select(m => $"`{m}`")); } Emit( context, "ReimageRate", severity, $"Detected `{Math.Round(reimageRate * 100.0, 4, MidpointRounding.AwayFromZero)}%` ({result.Reimaged} / {result.Total}) of the stamp has been reimaged in over the last `{_configuration.LookbackPeriod}` period{examples}", stamp); }); } }