예제 #1
0
#pragma warning restore CS0649

        public override async Task Run(RuleContext context)
        {
            var now   = _configuration.Clock.UtcNow;
            var query =
                $@"
                let end = now();
                let start = end - {CslTimeSpanLiteral.AsCslString(_configuration.LookbackPeriod)};
                table(""{_configuration.CacheTableName}"")
                | where PreciseTimeStamp between (start .. end)
                | where Stamp == ""{_configuration.Stamp}""
                | where Service == ""{Constants.MasterServiceName}""
                | where Message has ""CreateCheckpointAsync stop""
                | project PreciseTimeStamp, Message
                | parse Message with * ""SizeMb=["" SizeMb:double ""]"" *
                | project PreciseTimeStamp, TotalSize=tolong(SizeMb * 1000000)
                | sort by PreciseTimeStamp asc";
            var results = (await QueryKustoAsync <Result>(context, query)).ToList();

            if (results.Count == 0)
            {
                _configuration.Logger.Error($"No checkpoints have been produced for at least {_configuration.LookbackPeriod}");
                return;
            }

            var detectionHorizon = now - _configuration.AnomalyDetectionHorizon;

            results
            .Select(r => (double)r.TotalSize)
            .OverPercentualDifference(_configuration.MaximumPercentualDifference)
            .Where(evaluation => results[evaluation.Index].PreciseTimeStamp >= detectionHorizon)
            .PerformOnLast(index =>
            {
                var result         = results[index];
                var previousResult = results[index - 1];
                Emit(context, "SizeDerivative", Severity.Warning,
                     $"Checkpoint size went from `{previousResult.TotalSize.ToSizeExpression()}` to `{result.TotalSize.ToSizeExpression()}`, which is higher than the threshold of `{_configuration.MaximumPercentualDifference * 100.0}%`",
                     $"Checkpoint size went from `{previousResult.TotalSize.ToSizeExpression()}` to `{result.TotalSize.ToSizeExpression()}`",
                     eventTimeUtc: result.PreciseTimeStamp);
            });

            var training   = new List <Result>();
            var prediction = new List <Result>();

            results.SplitBy(r => r.PreciseTimeStamp <= detectionHorizon, training, prediction);

            if (prediction.Count == 0)
            {
                _configuration.Logger.Error($"No checkpoints have been produced for at least {_configuration.AnomalyDetectionHorizon}");
                return;
            }

            prediction
            .Select(p => p.TotalSize)
            .NotInRange(_configuration.MinimumValidSizeBytes, _configuration.MaximumValidSizeBytes)
            .PerformOnLast(index => {
                var result = prediction[index];
                Emit(context, "SizeValidRange", Severity.Warning,
                     $"Checkpoint size `{result.TotalSize.ToSizeExpression()}` out of valid range [`{_configuration.MinimumValidSize}`, `{_configuration.MaximumValidSize}`]",
                     eventTimeUtc: result.PreciseTimeStamp);
            });

            if (training.Count < _configuration.MinimumTrainingPercentOfData * results.Count)
            {
                return;
            }

            var lookbackSizes = training.Select(r => r.TotalSize);
            var expectedMin   = (long)Math.Floor((1 - _configuration.MaximumGrowthWrtToLookback) * lookbackSizes.Min());
            var expectedMax   = (long)Math.Ceiling((1 + _configuration.MaximumGrowthWrtToLookback) * lookbackSizes.Max());

            prediction
            .Select(p => p.TotalSize)
            .NotInRange(expectedMin, expectedMax)
            .PerformOnLast(index => {
                var result = prediction[index];
                Emit(context, "SizeExpectedRange", Severity.Warning,
                     $"Checkpoint size `{result.TotalSize.ToSizeExpression()}` out of expected range [`{expectedMin.ToSizeExpression()}`, `{expectedMax.ToSizeExpression()}`]",
                     eventTimeUtc: result.PreciseTimeStamp);
            });
        }
예제 #2
0
 public abstract Task Run(RuleContext context);
예제 #3
0
#pragma warning restore CS0649

        public override async Task Run(RuleContext context)
        {
            var now   = _configuration.Clock.UtcNow;
            var query =
                $@"
                let end = now();
                let start = end - {CslTimeSpanLiteral.AsCslString(_configuration.LookbackPeriod)};
                let MasterEvents = table(""{_configuration.CacheTableName}"")
                | where PreciseTimeStamp between (start .. end)
                | where Stamp == ""{_configuration.Stamp}""
                | where Service == ""{Constants.MasterServiceName}"";
                let MaximumDelayFromReceivedEvents = MasterEvents
                | where Message has ""ReceivedEvent""
                | parse Message with * ""ProcessingDelay="" Lag:timespan "","" *
                | project PreciseTimeStamp, Stamp, Machine, ServiceVersion, Message, Duration=Lag
                | summarize MaxDelay=max(Duration) by Stamp, PreciseTimeStamp=bin(PreciseTimeStamp, {CslTimeSpanLiteral.AsCslString(_configuration.BinWidth)});
                let MaximumDelayFromEH = MasterEvents
                | where (Message has ""EventHubContentLocationEventStore"" and Message has ""processed"") 
                | parse Message with * ""processed "" MessageCount:long ""message(s) by "" Duration:long ""ms"" * ""TotalMessagesSize="" TotalSize:long *
                | project PreciseTimeStamp, Stamp, MessageCount, Duration, TotalSize, Message, Machine
                | summarize MessageCount=count(), ProcessingDurationMs=percentile(Duration, 50), MaxProcessingDurationMs=max(Duration), SumMessageCount=sum(MessageCount), SumMessageSize=sum(TotalSize) by Stamp, Machine, PreciseTimeStamp=bin(PreciseTimeStamp, {CslTimeSpanLiteral.AsCslString(_configuration.BinWidth)});
                MaximumDelayFromReceivedEvents
                | join MaximumDelayFromEH on Stamp, PreciseTimeStamp
                | project Machine, PreciseTimeStamp, MaxDelay, AvgProcessingDuration=totimespan(ProcessingDurationMs*10000), MaxProcessingDuration=totimespan(MaxProcessingDurationMs*10000), MessageCount, SumMessageCount, SumMessageSizeMb=SumMessageSize / (1024*1024)
                | order by PreciseTimeStamp desc";
            var results = (await QueryKustoAsync <Result>(context, query)).ToList();

            if (results.Count == 0)
            {
                // If we reached this point, it means we don't have messages from master saying that it processed
                // anything. We now need to check if clients actually sent anything at all!
                var outstandingQuery = $@"
                    let end = now();
                    let start = end - {CslTimeSpanLiteral.AsCslString(_configuration.LookbackPeriod)};
                    let Events = table(""{_configuration.CacheTableName}"")
                    | where PreciseTimeStamp between (start .. end)
                    | where Stamp == ""{_configuration.Stamp}""
                    | project PreciseTimeStamp, Service, Machine, Stamp, LogLevelFriendly, Message;
                    let Enqueued = Events
                    | where Service == ""{Constants.ServiceName}""
                    | where Message has ""EventHubContentLocationEventStore"" and Message has ""sending"" and Message has ""OpId""
                    | parse Message with * ""/"" NumEvents:long ""event."" * ""OpId="" OperationId "","" *
                    | project-away Message;
                    let Processed = Events
                    | where Service == ""{Constants.MasterServiceName}""
                    | where Message has ""EventHubContentLocationEventStore"" and Message has ""OpId""
                    | parse Message with * ""OpId="" OperationId "","" *
                    | project-away Message;
                    let Outstanding = Enqueued
                    | join kind=leftanti Processed on OperationId
                    | summarize OutstandingBatches=dcount(OperationId), OutstandingEvents=sum(NumEvents);
                    let Done = Processed
                    | summarize ProcessedBatches=dcount(OperationId);
                    Outstanding | extend dummy=1 | join kind=inner (Done | extend dummy=1) on dummy | project-away dummy, dummy1";

                var outstandingResults = (await QueryKustoAsync <Result2>(context, outstandingQuery)).ToList();

                if (outstandingResults.Count != 1)
                {
                    Emit(context, "NoLogs", Severity.Fatal,
                         $"No events processed for at least `{_configuration.LookbackPeriod}`",
                         eventTimeUtc: now);
                    return;
                }

                var result = outstandingResults[0];
                if (result.OutstandingBatches.HasValue && result.OutstandingBatches.Value > 0)
                {
                    if (result.ProcessedBatches.HasValue && result.ProcessedBatches.Value == 0)
                    {
                        Emit(context, "MasterStuck", Severity.Fatal,
                             $"Master hasn't processed any events in the last `{_configuration.LookbackPeriod}`, but has `{result.OutstandingBatches.Value}` batches pending.",
                             eventTimeUtc: now);
                    }
                }

                return;
            }

            var delay = results[0].MaxDelay;

            _configuration.Thresholds.Check(delay, (severity, threshold) =>
            {
                Emit(context, "DelayThreshold", severity,
                     $"EventHub processing delay `{delay}` above threshold `{threshold}`. Master is `{results[0].Machine}`",
                     eventTimeUtc: results[0].PreciseTimeStamp);
            });
        }