#pragma warning restore CS0649 public override async Task Run(RuleContext context) { var now = _configuration.Clock.UtcNow; var query = $@" let end = now(); let start = end - {CslTimeSpanLiteral.AsCslString(_configuration.LookbackPeriod)}; table(""{_configuration.CacheTableName}"") | where PreciseTimeStamp between (start .. end) | where Stamp == ""{_configuration.Stamp}"" | where Service == ""{Constants.MasterServiceName}"" | where Message has ""CreateCheckpointAsync stop"" | project PreciseTimeStamp, Message | parse Message with * ""SizeMb=["" SizeMb:double ""]"" * | project PreciseTimeStamp, TotalSize=tolong(SizeMb * 1000000) | sort by PreciseTimeStamp asc"; var results = (await QueryKustoAsync <Result>(context, query)).ToList(); if (results.Count == 0) { _configuration.Logger.Error($"No checkpoints have been produced for at least {_configuration.LookbackPeriod}"); return; } var detectionHorizon = now - _configuration.AnomalyDetectionHorizon; results .Select(r => (double)r.TotalSize) .OverPercentualDifference(_configuration.MaximumPercentualDifference) .Where(evaluation => results[evaluation.Index].PreciseTimeStamp >= detectionHorizon) .PerformOnLast(index => { var result = results[index]; var previousResult = results[index - 1]; Emit(context, "SizeDerivative", Severity.Warning, $"Checkpoint size went from `{previousResult.TotalSize.ToSizeExpression()}` to `{result.TotalSize.ToSizeExpression()}`, which is higher than the threshold of `{_configuration.MaximumPercentualDifference * 100.0}%`", $"Checkpoint size went from `{previousResult.TotalSize.ToSizeExpression()}` to `{result.TotalSize.ToSizeExpression()}`", eventTimeUtc: result.PreciseTimeStamp); }); var training = new List <Result>(); var prediction = new List <Result>(); results.SplitBy(r => r.PreciseTimeStamp <= detectionHorizon, training, prediction); if (prediction.Count == 0) { _configuration.Logger.Error($"No checkpoints have been produced for at least {_configuration.AnomalyDetectionHorizon}"); return; } prediction .Select(p => p.TotalSize) .NotInRange(_configuration.MinimumValidSizeBytes, _configuration.MaximumValidSizeBytes) .PerformOnLast(index => { var result = prediction[index]; Emit(context, "SizeValidRange", Severity.Warning, $"Checkpoint size `{result.TotalSize.ToSizeExpression()}` out of valid range [`{_configuration.MinimumValidSize}`, `{_configuration.MaximumValidSize}`]", eventTimeUtc: result.PreciseTimeStamp); }); if (training.Count < _configuration.MinimumTrainingPercentOfData * results.Count) { return; } var lookbackSizes = training.Select(r => r.TotalSize); var expectedMin = (long)Math.Floor((1 - _configuration.MaximumGrowthWrtToLookback) * lookbackSizes.Min()); var expectedMax = (long)Math.Ceiling((1 + _configuration.MaximumGrowthWrtToLookback) * lookbackSizes.Max()); prediction .Select(p => p.TotalSize) .NotInRange(expectedMin, expectedMax) .PerformOnLast(index => { var result = prediction[index]; Emit(context, "SizeExpectedRange", Severity.Warning, $"Checkpoint size `{result.TotalSize.ToSizeExpression()}` out of expected range [`{expectedMin.ToSizeExpression()}`, `{expectedMax.ToSizeExpression()}`]", eventTimeUtc: result.PreciseTimeStamp); }); }
public abstract Task Run(RuleContext context);
#pragma warning restore CS0649 public override async Task Run(RuleContext context) { var now = _configuration.Clock.UtcNow; var query = $@" let end = now(); let start = end - {CslTimeSpanLiteral.AsCslString(_configuration.LookbackPeriod)}; let MasterEvents = table(""{_configuration.CacheTableName}"") | where PreciseTimeStamp between (start .. end) | where Stamp == ""{_configuration.Stamp}"" | where Service == ""{Constants.MasterServiceName}""; let MaximumDelayFromReceivedEvents = MasterEvents | where Message has ""ReceivedEvent"" | parse Message with * ""ProcessingDelay="" Lag:timespan "","" * | project PreciseTimeStamp, Stamp, Machine, ServiceVersion, Message, Duration=Lag | summarize MaxDelay=max(Duration) by Stamp, PreciseTimeStamp=bin(PreciseTimeStamp, {CslTimeSpanLiteral.AsCslString(_configuration.BinWidth)}); let MaximumDelayFromEH = MasterEvents | where (Message has ""EventHubContentLocationEventStore"" and Message has ""processed"") | parse Message with * ""processed "" MessageCount:long ""message(s) by "" Duration:long ""ms"" * ""TotalMessagesSize="" TotalSize:long * | project PreciseTimeStamp, Stamp, MessageCount, Duration, TotalSize, Message, Machine | summarize MessageCount=count(), ProcessingDurationMs=percentile(Duration, 50), MaxProcessingDurationMs=max(Duration), SumMessageCount=sum(MessageCount), SumMessageSize=sum(TotalSize) by Stamp, Machine, PreciseTimeStamp=bin(PreciseTimeStamp, {CslTimeSpanLiteral.AsCslString(_configuration.BinWidth)}); MaximumDelayFromReceivedEvents | join MaximumDelayFromEH on Stamp, PreciseTimeStamp | project Machine, PreciseTimeStamp, MaxDelay, AvgProcessingDuration=totimespan(ProcessingDurationMs*10000), MaxProcessingDuration=totimespan(MaxProcessingDurationMs*10000), MessageCount, SumMessageCount, SumMessageSizeMb=SumMessageSize / (1024*1024) | order by PreciseTimeStamp desc"; var results = (await QueryKustoAsync <Result>(context, query)).ToList(); if (results.Count == 0) { // If we reached this point, it means we don't have messages from master saying that it processed // anything. We now need to check if clients actually sent anything at all! var outstandingQuery = $@" let end = now(); let start = end - {CslTimeSpanLiteral.AsCslString(_configuration.LookbackPeriod)}; let Events = table(""{_configuration.CacheTableName}"") | where PreciseTimeStamp between (start .. end) | where Stamp == ""{_configuration.Stamp}"" | project PreciseTimeStamp, Service, Machine, Stamp, LogLevelFriendly, Message; let Enqueued = Events | where Service == ""{Constants.ServiceName}"" | where Message has ""EventHubContentLocationEventStore"" and Message has ""sending"" and Message has ""OpId"" | parse Message with * ""/"" NumEvents:long ""event."" * ""OpId="" OperationId "","" * | project-away Message; let Processed = Events | where Service == ""{Constants.MasterServiceName}"" | where Message has ""EventHubContentLocationEventStore"" and Message has ""OpId"" | parse Message with * ""OpId="" OperationId "","" * | project-away Message; let Outstanding = Enqueued | join kind=leftanti Processed on OperationId | summarize OutstandingBatches=dcount(OperationId), OutstandingEvents=sum(NumEvents); let Done = Processed | summarize ProcessedBatches=dcount(OperationId); Outstanding | extend dummy=1 | join kind=inner (Done | extend dummy=1) on dummy | project-away dummy, dummy1"; var outstandingResults = (await QueryKustoAsync <Result2>(context, outstandingQuery)).ToList(); if (outstandingResults.Count != 1) { Emit(context, "NoLogs", Severity.Fatal, $"No events processed for at least `{_configuration.LookbackPeriod}`", eventTimeUtc: now); return; } var result = outstandingResults[0]; if (result.OutstandingBatches.HasValue && result.OutstandingBatches.Value > 0) { if (result.ProcessedBatches.HasValue && result.ProcessedBatches.Value == 0) { Emit(context, "MasterStuck", Severity.Fatal, $"Master hasn't processed any events in the last `{_configuration.LookbackPeriod}`, but has `{result.OutstandingBatches.Value}` batches pending.", eventTimeUtc: now); } } return; } var delay = results[0].MaxDelay; _configuration.Thresholds.Check(delay, (severity, threshold) => { Emit(context, "DelayThreshold", severity, $"EventHub processing delay `{delay}` above threshold `{threshold}`. Master is `{results[0].Machine}`", eventTimeUtc: results[0].PreciseTimeStamp); }); }