/// <summary>
/// Creates every resource usage data container that has not been created yet.
/// Containers that already exist are left untouched, so calling this repeatedly is harmless.
/// </summary>
private void InitializeDataContainers()
{
    // CPU.
    if (allCpuDataPrivTime == null)
    {
        allCpuDataPrivTime = new FabricResourceUsageData<float>("Total CPU Time", "SysCpuTimePct");
    }

    // Memory (committed MB).
    if (allMemDataCommittedBytes == null)
    {
        allMemDataCommittedBytes = new FabricResourceUsageData<float>("Memory Consumption MB", "SysMemoryCommittedMb");
    }

    // Firewall rules.
    if (firewallData == null)
    {
        firewallData = new FabricResourceUsageData<int>("Active Firewall Rules", "ActiveFirewallRules");
    }

    // Ports (all, then ephemeral).
    if (activePortsData == null)
    {
        activePortsData = new FabricResourceUsageData<int>("All Active Ports", "AllPortsInUse");
    }

    if (ephemeralPortsData == null)
    {
        ephemeralPortsData = new FabricResourceUsageData<int>("Ephemeral Active Ports", "EphemeralPortsInUse");
    }

    // Memory (percent in use).
    if (allMemDataPercentUsed == null)
    {
        allMemDataPercentUsed = new FabricResourceUsageData<int>("Memory Consumption %", "SysMemoryPercentUsed");
    }
}
/// <summary>
/// Creates every resource usage data container that has not been created yet.
/// CPU and memory containers are sized with <c>DataCapacity</c> and honor <c>UseCircularBuffer</c>;
/// the firewall and port containers are created with a third argument of 1
/// (presumably a capacity of one sample - confirm against FabricResourceUsageData).
/// </summary>
private void InitializeDataContainers()
{
    // CPU.
    if (AllCpuTimeData is null)
    {
        AllCpuTimeData = new FabricResourceUsageData<float>(
            ErrorWarningProperty.TotalCpuTime,
            "TotalCpuTime",
            DataCapacity,
            UseCircularBuffer);
    }

    // Memory: committed MB, then percent in use.
    if (allMemDataCommittedBytes is null)
    {
        allMemDataCommittedBytes = new FabricResourceUsageData<float>(
            ErrorWarningProperty.TotalMemoryConsumptionMb,
            "MemoryConsumedMb",
            DataCapacity,
            UseCircularBuffer);
    }

    if (allMemDataPercentUsed is null)
    {
        allMemDataPercentUsed = new FabricResourceUsageData<int>(
            ErrorWarningProperty.TotalMemoryConsumptionPct,
            "MemoryConsumedPercentage",
            DataCapacity,
            UseCircularBuffer);
    }

    // Firewall rules.
    if (firewallData is null)
    {
        firewallData = new FabricResourceUsageData<int>(
            ErrorWarningProperty.TotalActiveFirewallRules,
            "ActiveFirewallRules",
            1);
    }

    // Ports: all active, then ephemeral.
    if (activePortsData is null)
    {
        activePortsData = new FabricResourceUsageData<int>(
            ErrorWarningProperty.TotalActivePorts,
            "AllPortsInUse",
            1);
    }

    if (ephemeralPortsData is null)
    {
        ephemeralPortsData = new FabricResourceUsageData<int>(
            ErrorWarningProperty.TotalEphemeralPorts,
            "EphemeralPortsInUse",
            1);
    }
}
/// <summary>
/// Compares the average value held by <paramref name="data"/> against the supplied Error and
/// Warning thresholds and emits the matching telemetry event, ETW event and Service Fabric
/// health report. When the value is back below both thresholds and the container still has an
/// active Error/Warning, an Ok health report is emitted to clear it. The container's data is
/// cleared before the method returns normally.
/// </summary>
/// <typeparam name="T">Value type of the monitored data.</typeparam>
/// <param name="data">Resource usage data container to evaluate.</param>
/// <param name="thresholdError">Threshold that produces an Error health state.</param>
/// <param name="thresholdWarning">Threshold that produces a Warning health state.</param>
/// <param name="healthReportTtl">Time to live for an Error/Warning health report.</param>
/// <param name="healthReportType">Type of health report to create. Application reports get app/service identity attached.</param>
/// <param name="replicaOrInstance">Replica or instance the data belongs to, or null.</param>
/// <param name="dumpOnError">Whether to dump the host process when the Error threshold is reached.</param>
/// <exception cref="ArgumentNullException"><paramref name="data"/> is null.</exception>
internal void ProcessResourceDataReportHealth<T>(
    FabricResourceUsageData<T> data,
    T thresholdError,
    T thresholdWarning,
    TimeSpan healthReportTtl,
    HealthReportType healthReportType = HealthReportType.Node,
    ReplicaOrInstanceMonitoringInfo replicaOrInstance = null,
    bool dumpOnError = false)
    where T : struct
{
    if (data == null)
    {
        // ArgumentNullException derives from ArgumentException, so existing catch blocks still apply.
        throw new ArgumentNullException(nameof(data), "Supply all required parameters with non-null value.");
    }

    var thresholdName = "Minimum";
    bool warningOrError = false;
    string name = null, id = null, procName = null;
    T threshold = thresholdWarning;
    var healthState = HealthState.Ok;
    Uri appName = null;
    Uri serviceName = null;
    TelemetryData telemetryData;

    // DiskObserver metrics are prefixed with the id of the data container.
    string drive = this.ObserverName == ObserverConstants.DiskObserverName ? $"{data.Id}: " : string.Empty;

    // Evaluate the average once; every event below reports the same sample.
    double averageValue = Convert.ToDouble(data.AverageDataValue);
    double roundedAverage = Math.Round(averageValue, 1);

    if (healthReportType == HealthReportType.Application)
    {
        if (replicaOrInstance != null)
        {
            // Create a unique id which will be used for health Warnings and OKs (clears).
            appName = replicaOrInstance.ApplicationName;
            serviceName = replicaOrInstance.ServiceName;
            name = appName.OriginalString.Replace("fabric:/", string.Empty);
        }
        else
        {
            appName = new Uri("fabric:/System");
            name = data.Id;
        }

        id = name + "_" + data.Property.Replace(" ", string.Empty);

        // The health event description will be a serialized instance of telemetryData,
        // so it should be completely constructed (filled with data) regardless
        // of user telemetry settings.
        telemetryData = new TelemetryData(this.FabricClientInstance, this.Token)
        {
            ApplicationName = appName?.OriginalString ?? string.Empty,
            Code = FoErrorWarningCodes.Ok,
            HealthState = Enum.GetName(typeof(HealthState), HealthState.Ok),
            NodeName = this.NodeName,
            ObserverName = this.ObserverName,
            Metric = data.Property,
            Value = roundedAverage,
            PartitionId = replicaOrInstance?.PartitionId.ToString(),
            ReplicaId = replicaOrInstance?.ReplicaOrInstanceId.ToString(),
            ServiceName = serviceName?.OriginalString ?? string.Empty,
            Source = ObserverConstants.FabricObserverName,
        };

        try
        {
            if (replicaOrInstance != null)
            {
                // Process is IDisposable; release its handle as soon as the name has been read.
                using (Process hostProcess = Process.GetProcessById((int)replicaOrInstance.HostProcessId))
                {
                    procName = hostProcess.ProcessName;
                }
            }
            else
            {
                // The name of the target service process is always the id for data containers coming from FSO.
                procName = data.Id;
            }

            telemetryData.ServiceName = procName;

            if (this.IsTelemetryProviderEnabled && this.IsObserverTelemetryEnabled)
            {
                _ = this.TelemetryClient?.ReportMetricAsync(
                    telemetryData,
                    this.Token).ConfigureAwait(false);
            }

            if (this.IsEtwEnabled)
            {
                Logger.EtwLogger?.Write(
                    ObserverConstants.FabricObserverETWEventName,
                    new
                    {
                        ApplicationName = appName?.OriginalString ?? string.Empty,
                        Code = FoErrorWarningCodes.Ok,
                        HealthState = Enum.GetName(typeof(HealthState), HealthState.Ok),
                        NodeName = this.NodeName,
                        ObserverName = this.ObserverName,
                        Metric = data.Property,
                        Value = roundedAverage,
                        PartitionId = replicaOrInstance?.PartitionId.ToString(),
                        ReplicaId = replicaOrInstance?.ReplicaOrInstanceId.ToString(),
                        ServiceName = procName,
                        Source = ObserverConstants.FabricObserverName,
                    });
            }
        }
        catch (ArgumentException)
        {
            // GetProcessById throws this when the host process is no longer running.
            // NOTE(review): returning here skips the data clear at the end of the method - confirm this is intended.
            return;
        }
        catch (InvalidOperationException)
        {
            return;
        }
    }
    else
    {
        // The health event description will be a serialized instance of telemetryData,
        // so it should be completely constructed (filled with data) regardless
        // of user telemetry settings.
        telemetryData = new TelemetryData(this.FabricClientInstance, this.Token)
        {
            Code = FoErrorWarningCodes.Ok,
            HealthState = Enum.GetName(typeof(HealthState), HealthState.Ok),
            NodeName = this.NodeName,
            ObserverName = this.ObserverName,
            Metric = $"{drive}{data.Property}",
            Source = ObserverConstants.FabricObserverName,
            Value = roundedAverage,
        };

        if (this.IsTelemetryProviderEnabled && this.IsObserverTelemetryEnabled)
        {
            _ = this.TelemetryClient?.ReportMetricAsync(
                telemetryData,
                this.Token);
        }

        if (this.IsEtwEnabled)
        {
            Logger.EtwLogger?.Write(
                ObserverConstants.FabricObserverETWEventName,
                new
                {
                    Code = FoErrorWarningCodes.Ok,
                    HealthState = Enum.GetName(typeof(HealthState), HealthState.Ok),
                    NodeName = this.NodeName,
                    ObserverName = this.ObserverName,
                    Metric = $"{drive}{data.Property}",
                    Source = ObserverConstants.FabricObserverName,
                    Value = roundedAverage,
                });
        }
    }

    string appNameString = appName?.OriginalString ?? string.Empty;
    string serviceNameString = serviceName?.OriginalString ?? string.Empty;

    // Health Error
    if (data.IsUnhealthy(thresholdError))
    {
        thresholdName = "Maximum";
        threshold = thresholdError;
        warningOrError = true;
        healthState = HealthState.Error;

        // This is primarily useful for AppObserver, but makes sense to be
        // part of the base class for future use, like for FSO.
        if (replicaOrInstance != null && dumpOnError)
        {
            try
            {
                int procId = (int)replicaOrInstance.HostProcessId;

                // Single lookup; dumpCount is 0 when the process has not been seen before.
                if (!this.serviceDumpCountDictionary.TryGetValue(procName, out var dumpCount))
                {
                    this.serviceDumpCountDictionary.Add(procName, 0);
                }

                // DumpServiceProcess defaults to a Full dump with
                // process memory, handles and thread data.
                if (dumpCount < this.maxDumps && this.DumpServiceProcess(procId))
                {
                    this.serviceDumpCountDictionary[procName] = dumpCount + 1;
                }
            }
            catch (Exception e) when (e is ArgumentException || e is InvalidOperationException)
            {
                // Ignore these, it just means no dmp will be created. This is not
                // critical to FO. Log as info, not warning.
                this.ObserverLogger.LogInfo($"Unable to generate dmp file:{Environment.NewLine}{e}");
            }
        }
    }

    // Health Warning
    if (!warningOrError && data.IsUnhealthy(thresholdWarning))
    {
        warningOrError = true;
        healthState = HealthState.Warning;
    }

    if (warningOrError)
    {
        bool isError = healthState == HealthState.Error;
        string errorWarningCode = null;

        switch (data.Property)
        {
            case ErrorWarningProperty.TotalCpuTime when healthReportType == HealthReportType.Application:
                errorWarningCode = isError ? FoErrorWarningCodes.AppErrorCpuTime : FoErrorWarningCodes.AppWarningCpuTime;
                break;

            case ErrorWarningProperty.TotalCpuTime:
                errorWarningCode = isError ? FoErrorWarningCodes.NodeErrorCpuTime : FoErrorWarningCodes.NodeWarningCpuTime;
                break;

            case ErrorWarningProperty.DiskSpaceUsagePercentage:
                errorWarningCode = isError ? FoErrorWarningCodes.NodeErrorDiskSpacePercentUsed : FoErrorWarningCodes.NodeWarningDiskSpacePercentUsed;
                break;

            case ErrorWarningProperty.DiskSpaceUsageMb:
                errorWarningCode = isError ? FoErrorWarningCodes.NodeErrorDiskSpaceMb : FoErrorWarningCodes.NodeWarningDiskSpaceMb;
                break;

            case ErrorWarningProperty.TotalMemoryConsumptionMb when healthReportType == HealthReportType.Application:
                errorWarningCode = isError ? FoErrorWarningCodes.AppErrorMemoryCommittedMb : FoErrorWarningCodes.AppWarningMemoryCommittedMb;
                break;

            case ErrorWarningProperty.TotalMemoryConsumptionMb:
                errorWarningCode = isError ? FoErrorWarningCodes.NodeErrorMemoryCommittedMb : FoErrorWarningCodes.NodeWarningMemoryCommittedMb;
                break;

            // NOTE(review): unlike the other App cases this one keys off replicaOrInstance, not healthReportType - confirm.
            case ErrorWarningProperty.TotalMemoryConsumptionPct when replicaOrInstance != null:
                errorWarningCode = isError ? FoErrorWarningCodes.AppErrorMemoryPercentUsed : FoErrorWarningCodes.AppWarningMemoryPercentUsed;
                break;

            case ErrorWarningProperty.TotalMemoryConsumptionPct:
                errorWarningCode = isError ? FoErrorWarningCodes.NodeErrorMemoryPercentUsed : FoErrorWarningCodes.NodeWarningMemoryPercentUsed;
                break;

            case ErrorWarningProperty.DiskAverageQueueLength:
                errorWarningCode = isError ? FoErrorWarningCodes.NodeErrorDiskAverageQueueLength : FoErrorWarningCodes.NodeWarningDiskAverageQueueLength;
                break;

            case ErrorWarningProperty.TotalActiveFirewallRules:
                errorWarningCode = isError ? FoErrorWarningCodes.ErrorTooManyFirewallRules : FoErrorWarningCodes.WarningTooManyFirewallRules;
                break;

            case ErrorWarningProperty.TotalActivePorts when healthReportType == HealthReportType.Application:
                errorWarningCode = isError ? FoErrorWarningCodes.AppErrorTooManyActiveTcpPorts : FoErrorWarningCodes.AppWarningTooManyActiveTcpPorts;
                break;

            case ErrorWarningProperty.TotalActivePorts:
                errorWarningCode = isError ? FoErrorWarningCodes.NodeErrorTooManyActiveTcpPorts : FoErrorWarningCodes.NodeWarningTooManyActiveTcpPorts;
                break;

            case ErrorWarningProperty.TotalEphemeralPorts when healthReportType == HealthReportType.Application:
                errorWarningCode = isError ? FoErrorWarningCodes.AppErrorTooManyActiveEphemeralPorts : FoErrorWarningCodes.AppWarningTooManyActiveEphemeralPorts;
                break;

            case ErrorWarningProperty.TotalEphemeralPorts:
                errorWarningCode = isError ? FoErrorWarningCodes.NodeErrorTooManyActiveEphemeralPorts : FoErrorWarningCodes.NodeWarningTooManyActiveEphemeralPorts;
                break;
        }

        string metricName = $"{drive}{data.Property}";
        string healthStateName = Enum.GetName(typeof(HealthState), healthState);

        // The message ends with a line break, matching the previous StringBuilder.AppendLine output.
        string healthMessage =
            $"{drive}{data.Property} is at or above the specified {thresholdName} limit ({threshold}{data.Units})" +
            $" - {data.Property}: {Math.Round(averageValue)}{data.Units}{Environment.NewLine}";

        // The health event description will be a serialized instance of telemetryData,
        // so it should be completely constructed (filled with data) regardless
        // of user telemetry settings.
        telemetryData.ApplicationName = appNameString;
        telemetryData.Code = errorWarningCode;
        telemetryData.HealthState = healthStateName;
        telemetryData.HealthEventDescription = healthMessage;
        telemetryData.Metric = metricName;
        telemetryData.ServiceName = serviceNameString;
        telemetryData.Source = ObserverConstants.FabricObserverName;
        telemetryData.Value = roundedAverage;

        // Send Health Report as Telemetry event (perhaps it signals an Alert from App Insights, for example.).
        if (this.IsTelemetryProviderEnabled && this.IsObserverTelemetryEnabled)
        {
            _ = this.TelemetryClient?.ReportMetricAsync(
                telemetryData,
                this.Token);
        }

        // ETW.
        if (this.IsEtwEnabled)
        {
            Logger.EtwLogger?.Write(
                ObserverConstants.FabricObserverETWEventName,
                new
                {
                    ApplicationName = appNameString,
                    Code = errorWarningCode,
                    HealthState = healthStateName,
                    HealthEventDescription = healthMessage,
                    Metric = metricName,
                    Node = this.NodeName,
                    ServiceName = serviceNameString,
                    Source = ObserverConstants.FabricObserverName,
                    Value = roundedAverage,
                });
        }

        var healthReport = new HealthReport
        {
            AppName = appName,
            Code = errorWarningCode,
            EmitLogEvent = true,
            HealthData = telemetryData,
            HealthMessage = healthMessage,
            HealthReportTimeToLive = healthReportTtl,
            ReportType = healthReportType,
            State = healthState,
            NodeName = this.NodeName,
            Observer = this.ObserverName,
            ResourceUsageDataProperty = data.Property,
        };

        // From FSO.
        if (replicaOrInstance == null && healthReportType == HealthReportType.Application)
        {
            healthReport.Property = id;
        }

        // Emit a Fabric Health Report and optionally a local log write.
        this.HealthReporter.ReportHealthToServiceFabric(healthReport);

        // Set internal health state info on data instance.
        data.ActiveErrorOrWarning = true;
        data.ActiveErrorOrWarningCode = errorWarningCode;

        // This means this observer created a Warning or Error SF Health Report
        this.HasActiveFabricErrorOrWarning = true;
    }
    else if (data.ActiveErrorOrWarning)
    {
        string okStateName = Enum.GetName(typeof(HealthState), HealthState.Ok);
        string okMessage = $"{data.Property} is now within normal/expected range.";

        // The health event description will be a serialized instance of telemetryData,
        // so it should be completely constructed (filled with data) regardless
        // of user telemetry settings.
        telemetryData.ApplicationName = appNameString;
        telemetryData.Code = data.ActiveErrorOrWarningCode;
        telemetryData.HealthState = okStateName;
        telemetryData.HealthEventDescription = okMessage;
        telemetryData.Metric = data.Property;
        telemetryData.Source = ObserverConstants.FabricObserverName;
        telemetryData.Value = roundedAverage;

        // Telemetry
        if (this.IsTelemetryProviderEnabled && this.IsObserverTelemetryEnabled)
        {
            _ = this.TelemetryClient?.ReportMetricAsync(
                telemetryData,
                this.Token);
        }

        // ETW.
        if (this.IsEtwEnabled)
        {
            Logger.EtwLogger?.Write(
                ObserverConstants.FabricObserverETWEventName,
                new
                {
                    ApplicationName = appNameString,
                    Code = data.ActiveErrorOrWarningCode,
                    HealthState = okStateName,
                    HealthEventDescription = okMessage,
                    Metric = data.Property,
                    Node = this.NodeName,
                    ServiceName = name ?? string.Empty,
                    Source = ObserverConstants.FabricObserverName,
                    Value = roundedAverage,
                });
        }

        var healthReport = new HealthReport
        {
            AppName = appName,
            Code = data.ActiveErrorOrWarningCode,
            EmitLogEvent = true,
            HealthData = telemetryData,
            HealthMessage = okMessage,
            HealthReportTimeToLive = default(TimeSpan),
            ReportType = healthReportType,
            State = HealthState.Ok,
            NodeName = this.NodeName,
            Observer = this.ObserverName,
            ResourceUsageDataProperty = data.Property,
        };

        // From FSO.
        if (replicaOrInstance == null && healthReportType == HealthReportType.Application)
        {
            healthReport.Property = id;
        }

        // Emit an Ok Health Report to clear Fabric Health warning.
        this.HealthReporter.ReportHealthToServiceFabric(healthReport);

        // Reset health states.
        data.ActiveErrorOrWarning = false;
        data.ActiveErrorOrWarningCode = FoErrorWarningCodes.Ok;
        this.HasActiveFabricErrorOrWarning = false;
    }

    // No need to keep data in memory.
    if (data.Data is List<T> list)
    {
        // List<T> impl.
        list.Clear();
        list.TrimExcess();
    }
    else
    {
        // CircularBufferCollection<T> impl.
        data.Data.Clear();
    }
}
/// <summary>
/// Compares the average value held by <paramref name="data"/> against the supplied Error and
/// Warning thresholds. A breach produces a Service Fabric health report plus optional telemetry
/// and ETW events; when the value is no longer in breach and the container still has an active
/// Error/Warning, an Ok health report is emitted to clear it. The container's data is cleared
/// before the method returns.
/// </summary>
/// <typeparam name="T">Type of the monitored data.</typeparam>
/// <param name="data">Resource usage data container to evaluate.</param>
/// <param name="thresholdError">Threshold that produces an Error health state.</param>
/// <param name="thresholdWarning">Threshold that produces a Warning health state.</param>
/// <param name="healthReportTtl">Time to live for an Error/Warning health report.</param>
/// <param name="healthReportType">Type of health report to create.</param>
/// <param name="app">Application name (a URI string such as fabric:/MyApp), or null.</param>
/// <param name="replicaOrInstance">Replica the data belongs to, or null.</param>
/// <param name="dumpOnError">Whether to dump the host process when the Error threshold is reached.</param>
/// <exception cref="ArgumentNullException"><paramref name="data"/> is null.</exception>
public void ProcessResourceDataReportHealth<T>(
    FabricResourceUsageData<T> data,
    T thresholdError,
    T thresholdWarning,
    TimeSpan healthReportTtl,
    HealthReportType healthReportType = HealthReportType.Node,
    string app = null,
    ReplicaMonitoringInfo replicaOrInstance = null,
    bool dumpOnError = false)
{
    if (data == null)
    {
        // ArgumentNullException derives from ArgumentException, so existing catch blocks still apply.
        throw new ArgumentNullException(nameof(data), "Supply all required parameters with non-null value...");
    }

    var thresholdName = "Minimum";
    bool warningOrError = false;
    string repPartitionId = null, repOrInstanceId = null, name = null, id = null, procName = null;
    T threshold = thresholdWarning;
    var healthState = HealthState.Ok;
    Uri appName = null;

    if (replicaOrInstance != null)
    {
        repPartitionId = $"Partition: {replicaOrInstance.Partitionid}";
        repOrInstanceId = $"Replica: {replicaOrInstance.ReplicaOrInstanceId}";

        try
        {
            // GetProcessById never returns null: it throws when the process is gone.
            // Process is IDisposable, so release the handle once the name has been read.
            using (Process hostProcess = Process.GetProcessById((int)replicaOrInstance.ReplicaHostProcessId))
            {
                procName = hostProcess.ProcessName;
            }
        }
        catch (Exception e) when (e is ArgumentException || e is InvalidOperationException)
        {
            // The host process has exited. procName stays null, which disables the dump step below.
            this.ObserverLogger.LogInfo($"Unable to resolve host process name:\n{e}");
        }
    }

    // Create a unique node id which may be used in the case of warnings or OK clears...
    if (app != null)
    {
        appName = new Uri(app);
        name = app.Replace("fabric:/", string.Empty);
        id = name + "_" + data.Property.Replace(" ", string.Empty);
    }

    // Telemetry...
    if (this.IsTelemetryEnabled)
    {
        _ = this.ObserverTelemetryClient?.ReportMetricAsync(
            $"{this.NodeName}-{app}-{data.Id}-{data.Property}",
            data.AverageDataValue,
            this.Token);
    }

    // ETW...
    if (this.IsEtwEnabled)
    {
        Logger.EtwLogger?.Write(
            "FabricObserverDataEvent",
            new
            {
                Level = 0, // Info
                Node = this.NodeName,
                Observer = this.ObserverName,
                Property = data.Property,
                Id = data.Id,
                Value = $"{Math.Round(data.AverageDataValue)}",
                Unit = data.Units,
            });
    }

    // Health Error
    if (data.IsUnhealthy(thresholdError))
    {
        thresholdName = "Maximum";
        threshold = thresholdError;
        warningOrError = true;
        healthState = HealthState.Error;

        // This is primarily useful for AppObserver, but makes sense to be
        // part of the base class for future use, like for FSO...
        if (replicaOrInstance != null && procName != null && dumpOnError)
        {
            try
            {
                int procId = (int)replicaOrInstance.ReplicaHostProcessId;

                // Single lookup; dumpCount is 0 when the process has not been seen before.
                if (!this.serviceDumpCountDictionary.TryGetValue(procName, out var dumpCount))
                {
                    this.serviceDumpCountDictionary.Add(procName, 0);
                }

                // DumpServiceProcess defaults to a Full dump with
                // process memory, handles and thread data...
                if (dumpCount < this.maxDumps && this.DumpServiceProcess(procId))
                {
                    this.serviceDumpCountDictionary[procName] = dumpCount + 1;
                }
            }
            catch (Exception e) when (e is ArgumentException || e is InvalidOperationException)
            {
                // Ignore these, it just means no dmp will be created. This is not
                // critical to FO... Log as info, not warning...
                this.ObserverLogger.LogInfo($"Unable to generate dmp file:\n{e}");
            }
        }
    }

    // Health Warning
    if (!warningOrError && data.IsUnhealthy(thresholdWarning))
    {
        warningOrError = true;
        healthState = HealthState.Warning;
    }

    if (warningOrError)
    {
        bool isError = healthState == HealthState.Error;
        string property = data.Property;
        string errorWarningKind = null;

        // The first two checks are case-insensitive; use an ordinal comparison so the result
        // does not depend on the current culture.
        if (property.IndexOf("cpu", StringComparison.OrdinalIgnoreCase) >= 0)
        {
            errorWarningKind = isError ? ErrorWarningCode.ErrorCpuTime : ErrorWarningCode.WarningCpuTime;
        }
        else if (property.IndexOf("disk space", StringComparison.OrdinalIgnoreCase) >= 0)
        {
            errorWarningKind = isError ? ErrorWarningCode.ErrorDiskSpace : ErrorWarningCode.WarningDiskSpace;
        }
        else if (property == "Memory Consumption MB")
        {
            errorWarningKind = isError ? ErrorWarningCode.ErrorMemoryCommitted : ErrorWarningCode.WarningMemoryCommitted;
        }
        else if (property == "Memory Consumption %")
        {
            errorWarningKind = isError ? ErrorWarningCode.ErrorMemoryPercentUsed : ErrorWarningCode.WarningMemoryPercentUsed;
        }
        else if (property.Contains("Read"))
        {
            errorWarningKind = isError ? ErrorWarningCode.ErrorDiskIoReads : ErrorWarningCode.WarningDiskIoReads;
        }
        else if (property.Contains("Write"))
        {
            errorWarningKind = isError ? ErrorWarningCode.ErrorDiskIoWrites : ErrorWarningCode.WarningDiskIoWrites;
        }
        else if (property.Contains("Queue"))
        {
            errorWarningKind = isError ? ErrorWarningCode.ErrorDiskAverageQueueLength : ErrorWarningCode.WarningDiskAverageQueueLength;
        }
        else if (property.Contains("Firewall"))
        {
            errorWarningKind = isError ? ErrorWarningCode.ErrorTooManyFirewallRules : ErrorWarningCode.WarningTooManyFirewallRules;
        }
        else if (property.Contains("Ports"))
        {
            errorWarningKind = isError ? ErrorWarningCode.ErrorTooManyActivePorts : ErrorWarningCode.WarningTooManyActivePorts;
        }

        // The service identity prefix is only present when an application name was supplied.
        string servicePrefix = name != null
            ? $"{name} (Service Process: {procName}, {repPartitionId}, {repOrInstanceId}): "
            : string.Empty;

        // The message ends with a line break, matching the previous StringBuilder.AppendLine output.
        string healthMessage =
            servicePrefix +
            $"{data.Property} is at or above the specified {thresholdName} limit ({threshold}{data.Units})" +
            $" - Average {data.Property}: {Math.Round(data.AverageDataValue)}{data.Units}{Environment.NewLine}";

        // Set internal fabric health states...
        data.ActiveErrorOrWarning = true;

        // This means this observer created a Warning or Error SF Health Report
        this.HasActiveFabricErrorOrWarning = true;

        var healthReport = new Utilities.HealthReport
        {
            AppName = appName,
            Code = errorWarningKind,
            EmitLogEvent = true,
            HealthMessage = healthMessage,
            HealthReportTimeToLive = healthReportTtl,
            ReportType = healthReportType,
            State = healthState,
            NodeName = this.NodeName,
            Observer = this.ObserverName,
        };

        // Emit a Fabric Health Report and optionally a local log write...
        this.HealthReporter.ReportHealthToServiceFabric(healthReport);

        // Send Health Report as Telemetry event (perhaps it signals an Alert from App Insights, for example...)...
        if (this.IsTelemetryEnabled)
        {
            _ = this.ObserverTelemetryClient?.ReportHealthAsync(
                id,
                this.FabricServiceContext.ServiceName.OriginalString,
                "FabricObserver",
                this.ObserverName,
                $"{this.NodeName}/{errorWarningKind}/{data.Property}/{Math.Round(data.AverageDataValue)}",
                healthState,
                this.Token);
        }

        // ETW...
        if (this.IsEtwEnabled)
        {
            Logger.EtwLogger?.Write(
                "FabricObserverDataEvent",
                new
                {
                    Level = (healthState == HealthState.Warning) ? 1 : 2,
                    Node = this.NodeName,
                    Observer = this.ObserverName,
                    HealthEventErrorCode = errorWarningKind,
                    HealthEventDescription = healthMessage,
                    Property = data.Property,
                    Id = data.Id,
                    Value = $"{Math.Round(data.AverageDataValue)}",
                    Unit = data.Units,
                });
        }
    }
    else if (data.ActiveErrorOrWarning)
    {
        var report = new Utilities.HealthReport
        {
            AppName = appName,
            EmitLogEvent = true,
            HealthMessage = $"{data.Id}: {data.Property} is now within normal/expected range.",
            HealthReportTimeToLive = default(TimeSpan),
            ReportType = healthReportType,
            State = HealthState.Ok,
            NodeName = this.NodeName,
            Observer = this.ObserverName,
        };

        // Emit an Ok Health Report to clear Fabric Health warning...
        this.HealthReporter.ReportHealthToServiceFabric(report);

        // Reset health states...
        data.ActiveErrorOrWarning = false;
        this.HasActiveFabricErrorOrWarning = false;
    }

    // No need to keep data in memory...
    data.Data.Clear();
    data.Data.TrimExcess();
}
/// <summary> /// This function processes numeric data held in FRUD instances and generates Application or Node level Health Reports depending on supplied thresholds. /// </summary> /// <typeparam name="T">This represents the numeric type of data this function will operate on.</typeparam> /// <param name="data">FabricResourceUsageData instance.</param> /// <param name="thresholdError">Error threshold (numeric)</param> /// <param name="thresholdWarning">Warning threshold (numeric)</param> /// <param name="healthReportTtl">Health report Time to Live (TimeSpan)</param> /// <param name="healthReportType">HealthReport type. Note, only Application and Node health report types are supported.</param> /// <param name="replicaOrInstance">Replica or Instance information contained in a type.</param> /// <param name="dumpOnError">Wheter or not to dump process if Error threshold has been reached.</param> public void ProcessResourceDataReportHealth <T>( FabricResourceUsageData <T> data, T thresholdError, T thresholdWarning, TimeSpan healthReportTtl, HealthReportType healthReportType = HealthReportType.Node, ReplicaOrInstanceMonitoringInfo replicaOrInstance = null, bool dumpOnError = false) where T : struct { if (data == null) { throw new ArgumentException("Supply all required parameters with non-null value."); } if (healthReportType != HealthReportType.Application && healthReportType != HealthReportType.Node) { this.ObserverLogger.LogWarning($"ProcessResourceDataReportHealth: Unsupported HealthReport type -> {Enum.GetName(typeof(HealthReportType), healthReportType)}"); return; } var thresholdName = "Minimum"; bool warningOrError = false; string repPartitionId = null, repOrInstanceId = null, name = null, id = null, procName = null; T threshold = thresholdWarning; var healthState = HealthState.Ok; Uri appName = null; Uri serviceName = null; TelemetryData telemetryData = null; if (healthReportType == HealthReportType.Application) { if (replicaOrInstance != null) { repPartitionId = 
$"Partition: {replicaOrInstance.PartitionId}"; repOrInstanceId = $"Replica: {replicaOrInstance.ReplicaOrInstanceId}"; // Create a unique id which will be used for health Warnings and OKs (clears). appName = replicaOrInstance.ApplicationName; serviceName = replicaOrInstance.ServiceName; name = appName.OriginalString.Replace("fabric:/", string.Empty); } else { appName = new Uri("fabric:/System"); name = data.Id; } id = name + "_" + data.Property.Replace(" ", string.Empty); // The health event description will be a serialized instance of telemetryData, // so it should be completely constructed (filled with data) regardless // of user telemetry settings. telemetryData = new TelemetryData(FabricClientInstance, Token) { ApplicationName = appName?.OriginalString ?? string.Empty, Code = FOErrorWarningCodes.Ok, HealthState = Enum.GetName(typeof(HealthState), HealthState.Ok), NodeName = NodeName, ObserverName = ObserverName, Metric = data.Property, Value = Math.Round(data.AverageDataValue, 1), PartitionId = replicaOrInstance?.PartitionId.ToString(), ReplicaId = replicaOrInstance?.ReplicaOrInstanceId.ToString(), ServiceName = serviceName?.OriginalString ?? string.Empty, Source = ObserverConstants.FabricObserverName, }; try { if (replicaOrInstance != null && replicaOrInstance.HostProcessId > 0) { procName = Process.GetProcessById((int)replicaOrInstance.HostProcessId).ProcessName; } else { // The name of the target service process is always the id for data containers coming from FSO. procName = data.Id; } telemetryData.ServiceName = procName; if (IsTelemetryProviderEnabled && IsObserverTelemetryEnabled) { _ = TelemetryClient?.ReportMetricAsync( telemetryData, Token).ConfigureAwait(false); } if (IsEtwEnabled) { Logger.EtwLogger?.Write( ObserverConstants.FabricObserverETWEventName, new { ApplicationName = appName?.OriginalString ?? 
string.Empty, Code = FOErrorWarningCodes.Ok, HealthState = Enum.GetName(typeof(HealthState), HealthState.Ok), NodeName, ObserverName, Metric = data.Property, Value = Math.Round(data.AverageDataValue, 1), PartitionId = replicaOrInstance?.PartitionId.ToString(), ReplicaId = replicaOrInstance?.ReplicaOrInstanceId.ToString(), ServiceName = procName, Source = ObserverConstants.FabricObserverName, }); } } catch (ArgumentException) { return; } catch (InvalidOperationException) { return; } } else { string drive = string.Empty; if (ObserverName == ObserverConstants.DiskObserverName) { drive = $"{data.Id}: "; if (RuntimeInformation.IsOSPlatform(OSPlatform.Windows)) { drive = $"{data.Id.Remove(1, 2)}: "; } } // The health event description will be a serialized instance of telemetryData, // so it should be completely constructed (filled with data) regardless // of user telemetry settings. telemetryData = new TelemetryData(FabricClientInstance, Token) { Code = FOErrorWarningCodes.Ok, HealthState = Enum.GetName(typeof(HealthState), HealthState.Ok), NodeName = NodeName, ObserverName = ObserverName, Metric = $"{drive}{data.Property}", Source = ObserverConstants.FabricObserverName, Value = Math.Round(data.AverageDataValue, 1), }; if (IsTelemetryProviderEnabled && IsObserverTelemetryEnabled) { _ = TelemetryClient?.ReportMetricAsync( telemetryData, Token); } if (IsEtwEnabled) { Logger.EtwLogger?.Write( ObserverConstants.FabricObserverETWEventName, new { Code = FOErrorWarningCodes.Ok, HealthState = Enum.GetName(typeof(HealthState), HealthState.Ok), NodeName, ObserverName, Metric = $"{drive}{data.Property}", Source = ObserverConstants.FabricObserverName, Value = Math.Round(data.AverageDataValue, 1), }); } } // Health Error if (data.IsUnhealthy(thresholdError)) { thresholdName = "Maximum"; threshold = thresholdError; warningOrError = true; healthState = HealthState.Error; // This is primarily useful for AppObserver, but makes sense to be // part of the base class for future use, like 
for FSO. if (replicaOrInstance != null && dumpOnError) { try { int procId = (int)replicaOrInstance.HostProcessId; if (!this.serviceDumpCountDictionary.ContainsKey(procName)) { this.serviceDumpCountDictionary.Add(procName, 0); } if (this.serviceDumpCountDictionary[procName] < this.maxDumps) { // DumpServiceProcess defaults to a Full dump with // process memory, handles and thread data. bool success = DumpServiceProcess(procId); if (success) { this.serviceDumpCountDictionary[procName]++; } } } // Ignore these, it just means no dmp will be created.This is not // critical to FO. Log as info, not warning. catch (Exception e) when(e is ArgumentException || e is InvalidOperationException) { ObserverLogger.LogInfo($"Unable to generate dmp file:{Environment.NewLine}{e}"); } } } // Health Warning if (!warningOrError && data.IsUnhealthy(thresholdWarning)) { warningOrError = true; healthState = HealthState.Warning; } if (warningOrError) { string errorWarningCode = null; switch (data.Property) { case ErrorWarningProperty.TotalCpuTime when healthReportType == HealthReportType.Application: errorWarningCode = (healthState == HealthState.Error) ? FOErrorWarningCodes.AppErrorCpuPercent : FOErrorWarningCodes.AppWarningCpuPercent; break; case ErrorWarningProperty.TotalCpuTime: errorWarningCode = (healthState == HealthState.Error) ? FOErrorWarningCodes.NodeErrorCpuPercent : FOErrorWarningCodes.NodeWarningCpuPercent; break; case ErrorWarningProperty.DiskSpaceUsagePercentage: errorWarningCode = (healthState == HealthState.Error) ? FOErrorWarningCodes.NodeErrorDiskSpacePercent : FOErrorWarningCodes.NodeWarningDiskSpacePercent; break; case ErrorWarningProperty.DiskSpaceUsageMb: errorWarningCode = (healthState == HealthState.Error) ? FOErrorWarningCodes.NodeErrorDiskSpaceMB : FOErrorWarningCodes.NodeWarningDiskSpaceMB; break; case ErrorWarningProperty.TotalMemoryConsumptionMb when healthReportType == HealthReportType.Application: errorWarningCode = (healthState == HealthState.Error) ? 
FOErrorWarningCodes.AppErrorMemoryMB : FOErrorWarningCodes.AppWarningMemoryMB; break; case ErrorWarningProperty.TotalMemoryConsumptionMb: errorWarningCode = (healthState == HealthState.Error) ? FOErrorWarningCodes.NodeErrorMemoryMB : FOErrorWarningCodes.NodeWarningMemoryMB; break; case ErrorWarningProperty.TotalMemoryConsumptionPct when replicaOrInstance != null: errorWarningCode = (healthState == HealthState.Error) ? FOErrorWarningCodes.AppErrorMemoryPercent : FOErrorWarningCodes.AppWarningMemoryPercent; break; case ErrorWarningProperty.TotalMemoryConsumptionPct: errorWarningCode = (healthState == HealthState.Error) ? FOErrorWarningCodes.NodeErrorMemoryPercent : FOErrorWarningCodes.NodeWarningMemoryPercent; break; case ErrorWarningProperty.DiskAverageQueueLength: errorWarningCode = (healthState == HealthState.Error) ? FOErrorWarningCodes.NodeErrorDiskAverageQueueLength : FOErrorWarningCodes.NodeWarningDiskAverageQueueLength; break; case ErrorWarningProperty.TotalActiveFirewallRules: errorWarningCode = (healthState == HealthState.Error) ? FOErrorWarningCodes.ErrorTooManyFirewallRules : FOErrorWarningCodes.WarningTooManyFirewallRules; break; case ErrorWarningProperty.TotalActivePorts when healthReportType == HealthReportType.Application: errorWarningCode = (healthState == HealthState.Error) ? FOErrorWarningCodes.AppErrorTooManyActiveTcpPorts : FOErrorWarningCodes.AppWarningTooManyActiveTcpPorts; break; case ErrorWarningProperty.TotalActivePorts: errorWarningCode = (healthState == HealthState.Error) ? FOErrorWarningCodes.NodeErrorTooManyActiveTcpPorts : FOErrorWarningCodes.NodeWarningTooManyActiveTcpPorts; break; case ErrorWarningProperty.TotalEphemeralPorts when healthReportType == HealthReportType.Application: errorWarningCode = (healthState == HealthState.Error) ? 
FOErrorWarningCodes.AppErrorTooManyActiveEphemeralPorts : FOErrorWarningCodes.AppWarningTooManyActiveEphemeralPorts; break; case ErrorWarningProperty.TotalEphemeralPorts: errorWarningCode = (healthState == HealthState.Error) ? FOErrorWarningCodes.NodeErrorTooManyActiveEphemeralPorts : FOErrorWarningCodes.NodeWarningTooManyActiveEphemeralPorts; break; } var healthMessage = new StringBuilder(); string drive = string.Empty; if (ObserverName == ObserverConstants.DiskObserverName) { drive = $"{data.Id}: "; if (RuntimeInformation.IsOSPlatform(OSPlatform.Windows)) { drive = $"{data.Id.Remove(1, 2)}: "; } } _ = healthMessage.Append($"{drive}{data.Property} is at or above the specified {thresholdName} limit ({threshold}{data.Units})"); _ = healthMessage.AppendLine($" - {data.Property}: {Math.Round(data.AverageDataValue)}{data.Units}"); // The health event description will be a serialized instance of telemetryData, // so it should be completely constructed (filled with data) regardless // of user telemetry settings. telemetryData.ApplicationName = appName?.OriginalString ?? string.Empty; telemetryData.Code = errorWarningCode; if (replicaOrInstance != null && !string.IsNullOrEmpty(replicaOrInstance.ContainerId)) { telemetryData.ContainerId = replicaOrInstance.ContainerId; } telemetryData.HealthState = Enum.GetName(typeof(HealthState), healthState); telemetryData.HealthEventDescription = healthMessage.ToString(); telemetryData.Metric = $"{drive}{data.Property}"; telemetryData.ServiceName = serviceName?.OriginalString ?? string.Empty; telemetryData.Source = ObserverConstants.FabricObserverName; telemetryData.Value = Math.Round(data.AverageDataValue, 1); // Send Health Report as Telemetry event (perhaps it signals an Alert from App Insights, for example.). if (IsTelemetryProviderEnabled && IsObserverTelemetryEnabled) { _ = TelemetryClient?.ReportHealthAsync( telemetryData, Token); } // ETW. 
if (IsEtwEnabled) { Logger.EtwLogger?.Write( ObserverConstants.FabricObserverETWEventName, new { ApplicationName = appName?.OriginalString ?? string.Empty, Code = errorWarningCode, ContainerId = replicaOrInstance != null ? replicaOrInstance.ContainerId ?? string.Empty : string.Empty, HealthState = Enum.GetName(typeof(HealthState), healthState), HealthEventDescription = healthMessage.ToString(), Metric = $"{drive}{data.Property}", Node = NodeName, ServiceName = serviceName?.OriginalString ?? string.Empty, Source = ObserverConstants.FabricObserverName, Value = Math.Round(data.AverageDataValue, 1), }); } var healthReport = new HealthReport { AppName = appName, Code = errorWarningCode, EmitLogEvent = true, HealthData = telemetryData, HealthMessage = healthMessage.ToString(), HealthReportTimeToLive = healthReportTtl, ReportType = healthReportType, State = healthState, NodeName = NodeName, Observer = ObserverName, ResourceUsageDataProperty = data.Property, }; if (!AppNames.Any(a => a == appName?.OriginalString)) { AppNames.Add(appName?.OriginalString); } // From FSO. if (replicaOrInstance == null && healthReportType == HealthReportType.Application) { HealthReportProperties.Add(id); } else { if (HealthReportProperties.Count == 0) { HealthReportProperties.Add(ObserverName switch { ObserverConstants.AppObserverName => "ApplicationHealth", ObserverConstants.CertificateObserverName => "SecurityHealth", ObserverConstants.DiskObserverName => "DiskHealth", ObserverConstants.FabricSystemObserverName => "FabricSystemServiceHealth", ObserverConstants.NetworkObserverName => "NetworkHealth", ObserverConstants.OSObserverName => "MachineInformation", ObserverConstants.NodeObserverName => "MachineResourceHealth", _ => $"{data.Property}", });
/// <summary>
/// Evaluates the supplied resource usage data against the Error and Warning thresholds,
/// emits the raw metric as telemetry/ETW (when enabled), and sends a Warning/Error health
/// report through <c>HealthReporter.ReportHealthToServiceFabric</c> when a threshold is breached.
/// If no threshold is breached and <c>data.ActiveErrorOrWarning</c> is set, an Ok report is sent
/// to clear the earlier one. The collected samples in <c>data.Data</c> are cleared before returning.
/// </summary>
/// <typeparam name="T">Numeric type of the samples held by <paramref name="data"/>.</typeparam>
/// <param name="data">Resource usage data to evaluate. Must not be null.</param>
/// <param name="thresholdError">Threshold passed to <c>data.IsUnhealthy</c> for the Error check (checked first).</param>
/// <param name="thresholdWarning">Threshold passed to <c>data.IsUnhealthy</c> for the Warning check.</param>
/// <param name="healthReportTtl">Time-to-live assigned to a Warning/Error health report.</param>
/// <param name="healthReportType">Report type assigned to the generated health reports.</param>
/// <param name="replicaOrInstance">
/// Optional replica/instance the data belongs to. When supplied, the report is attributed to its
/// application and the App* error codes are used. If its host process cannot be resolved,
/// the method returns immediately without reporting and without clearing <c>data.Data</c>.
/// </param>
/// <param name="dumpOnError">
/// When true, and the Error threshold is breached for a replica/instance, a dump of the host process
/// is attempted as long as fewer than <c>maxDumps</c> dumps were already taken for that process name.
/// </param>
/// <exception cref="ArgumentException">Thrown when <paramref name="data"/> is null.</exception>
internal void ProcessResourceDataReportHealth<T>(
    FabricResourceUsageData<T> data,
    T thresholdError,
    T thresholdWarning,
    TimeSpan healthReportTtl,
    HealthReportType healthReportType = HealthReportType.Node,
    ReplicaOrInstanceMonitoringInfo replicaOrInstance = null,
    bool dumpOnError = false)
{
    if (data == null)
    {
        throw new ArgumentException("Supply all required parameters with non-null value.");
    }

    var thresholdName = "Minimum";
    bool warningOrError = false;
    string repPartitionId = null, repOrInstanceId = null, name = null, id = null, procName = null;
    T threshold = thresholdWarning;
    var healthState = HealthState.Ok;
    Uri appName = null;

    if (replicaOrInstance != null)
    {
        repPartitionId = $"Partition: {replicaOrInstance.PartitionId}";
        repOrInstanceId = $"Replica: {replicaOrInstance.ReplicaOrInstanceId}";

        // Create a unique id which may be used in the case of warnings or OK clears.
        appName = replicaOrInstance.ApplicationName;
        name = appName.OriginalString.Replace("fabric:/", string.Empty);
        id = name + "_" + data.Property.Replace(" ", string.Empty);

        // Telemetry.
        if (this.IsTelemetryEnabled)
        {
            this.ObserverTelemetryClient?.ReportMetricAsync(
                $"{this.NodeName}-{name}-{data.Id}-{data.Property}",
                data.AverageDataValue,
                this.Token);
        }

        try
        {
            // Process is IDisposable: dispose it as soon as the name has been read
            // so that the underlying process handle is not leaked on every call.
            using (Process hostProcess = Process.GetProcessById((int)replicaOrInstance.HostProcessId))
            {
                procName = hostProcess.ProcessName;
            }
        }
        catch (ArgumentException)
        {
            // No process with this id is running: nothing to report on.
            return;
        }
        catch (InvalidOperationException)
        {
            // The process went away while it was being inspected: nothing to report on.
            return;
        }
    }
    else
    {
        // Telemetry.
        if (this.IsTelemetryEnabled)
        {
            this.ObserverTelemetryClient?.ReportMetricAsync(
                $"{this.NodeName}-{data.Id}-{data.Property}",
                data.AverageDataValue,
                this.Token);
        }
    }

    // ETW.
    if (this.IsEtwEnabled)
    {
        Logger.EtwLogger?.Write(
            "FabricObserverDataEvent",
            new
            {
                Level = 0, // Info
                Node = this.NodeName,
                Observer = this.ObserverName,
                data.Property,
                data.Id,
                Value = $"{Math.Round(data.AverageDataValue)}",
                Unit = data.Units,
            });
    }

    // Health Error
    if (data.IsUnhealthy(thresholdError))
    {
        thresholdName = "Maximum";
        threshold = thresholdError;
        warningOrError = true;
        healthState = HealthState.Error;

        // This is primarily useful for AppObserver, but makes sense to be
        // part of the base class for future use, like for FSO.
        if (replicaOrInstance != null && dumpOnError)
        {
            try
            {
                int procId = (int)replicaOrInstance.HostProcessId;

                if (!this.serviceDumpCountDictionary.ContainsKey(procName))
                {
                    this.serviceDumpCountDictionary.Add(procName, 0);
                }

                if (this.serviceDumpCountDictionary[procName] < this.maxDumps)
                {
                    // DumpServiceProcess defaults to a Full dump with
                    // process memory, handles and thread data.
                    bool success = this.DumpServiceProcess(procId);

                    if (success)
                    {
                        this.serviceDumpCountDictionary[procName]++;
                    }
                }
            }

            // Ignore these, it just means no dmp will be created. This is not
            // critical to FO. Log as info, not warning.
            catch (Exception e) when (e is ArgumentException || e is InvalidOperationException)
            {
                this.ObserverLogger.LogInfo($"Unable to generate dmp file:{Environment.NewLine}{e}");
            }
        }
    }

    // Health Warning (only evaluated when the Error threshold was not breached).
    if (!warningOrError && data.IsUnhealthy(thresholdWarning))
    {
        warningOrError = true;
        healthState = HealthState.Warning;
    }

    if (warningOrError)
    {
        bool isError = healthState == HealthState.Error;

        // A supplied replica/instance selects the App* codes, otherwise the Node* codes are used.
        bool isAppScope = replicaOrInstance != null;

        // Stays null when data.Property matches none of the known properties below.
        string errorWarningKind = null;

        switch (data.Property)
        {
            case ErrorWarningProperty.TotalCpuTime when isAppScope:
                errorWarningKind = isError ? FoErrorWarningCodes.AppErrorCpuTime : FoErrorWarningCodes.AppWarningCpuTime;
                break;

            case ErrorWarningProperty.TotalCpuTime:
                errorWarningKind = isError ? FoErrorWarningCodes.NodeErrorCpuTime : FoErrorWarningCodes.NodeWarningCpuTime;
                break;

            case ErrorWarningProperty.DiskSpaceUsagePercentage:
                errorWarningKind = isError ? FoErrorWarningCodes.NodeErrorDiskSpacePercentUsed : FoErrorWarningCodes.NodeWarningDiskSpacePercentUsed;
                break;

            case ErrorWarningProperty.DiskSpaceUsageMb:
                errorWarningKind = isError ? FoErrorWarningCodes.NodeErrorDiskSpaceMb : FoErrorWarningCodes.NodeWarningDiskSpaceMb;
                break;

            case ErrorWarningProperty.TotalMemoryConsumptionMb when isAppScope:
                errorWarningKind = isError ? FoErrorWarningCodes.AppErrorMemoryCommittedMb : FoErrorWarningCodes.AppWarningMemoryCommittedMb;
                break;

            case ErrorWarningProperty.TotalMemoryConsumptionMb:
                errorWarningKind = isError ? FoErrorWarningCodes.NodeErrorMemoryCommittedMb : FoErrorWarningCodes.NodeWarningMemoryCommittedMb;
                break;

            case ErrorWarningProperty.TotalMemoryConsumptionPct when isAppScope:
                errorWarningKind = isError ? FoErrorWarningCodes.AppErrorMemoryPercentUsed : FoErrorWarningCodes.AppWarningMemoryPercentUsed;
                break;

            case ErrorWarningProperty.TotalMemoryConsumptionPct:
                errorWarningKind = isError ? FoErrorWarningCodes.NodeErrorMemoryPercentUsed : FoErrorWarningCodes.NodeWarningMemoryPercentUsed;
                break;

            case ErrorWarningProperty.DiskAverageQueueLength:
                errorWarningKind = isError ? FoErrorWarningCodes.NodeErrorDiskAverageQueueLength : FoErrorWarningCodes.NodeWarningDiskAverageQueueLength;
                break;

            case ErrorWarningProperty.TotalActiveFirewallRules:
                errorWarningKind = isError ? FoErrorWarningCodes.ErrorTooManyFirewallRules : FoErrorWarningCodes.WarningTooManyFirewallRules;
                break;

            case ErrorWarningProperty.TotalActivePorts when isAppScope:
                errorWarningKind = isError ? FoErrorWarningCodes.AppErrorTooManyActiveTcpPorts : FoErrorWarningCodes.AppWarningTooManyActiveTcpPorts;
                break;

            case ErrorWarningProperty.TotalActivePorts:
                errorWarningKind = isError ? FoErrorWarningCodes.NodeErrorTooManyActiveTcpPorts : FoErrorWarningCodes.NodeWarningTooManyActiveTcpPorts;
                break;

            case ErrorWarningProperty.TotalEphemeralPorts when isAppScope:
                errorWarningKind = isError ? FoErrorWarningCodes.AppErrorTooManyActiveEphemeralPorts : FoErrorWarningCodes.AppWarningTooManyActiveEphemeralPorts;
                break;

            case ErrorWarningProperty.TotalEphemeralPorts:
                errorWarningKind = isError ? FoErrorWarningCodes.NodeErrorTooManyActiveEphemeralPorts : FoErrorWarningCodes.NodeWarningTooManyActiveEphemeralPorts;
                break;
        }

        var healthMessage = new StringBuilder();

        if (name != null)
        {
            healthMessage.Append($"{name} (Service Process: {procName}, {repPartitionId}, {repOrInstanceId}): ");
        }

        // Disk-related properties are prefixed with the data id in the message and metric name.
        string drive = string.Empty;

        if (data.Property.Contains("Disk"))
        {
            drive = $"{data.Id}: ";
        }

        healthMessage.Append($"{drive}{data.Property} is at or above the specified {thresholdName} limit ({threshold}{data.Units})");
        healthMessage.AppendLine($" - Average {data.Property}: {Math.Round(data.AverageDataValue)}{data.Units}");

        // Built once; used by both the health report and the ETW event below.
        string healthMessageText = healthMessage.ToString();

        var healthReport = new HealthReport
        {
            AppName = appName,
            Code = errorWarningKind,
            EmitLogEvent = true,
            HealthMessage = healthMessageText,
            HealthReportTimeToLive = healthReportTtl,
            ReportType = healthReportType,
            State = healthState,
            NodeName = this.NodeName,
            Observer = this.ObserverName,
            ResourceUsageDataProperty = data.Property,
        };

        // Emit a Fabric Health Report and optionally a local log write.
        this.HealthReporter.ReportHealthToServiceFabric(healthReport);

        // Set internal fabric health states.
        data.ActiveErrorOrWarning = true;

        // This means this observer created a Warning or Error SF Health Report
        this.HasActiveFabricErrorOrWarning = true;

        // Send Health Report as Telemetry event (perhaps it signals an Alert from App Insights, for example.).
        if (this.IsTelemetryEnabled)
        {
            this.ObserverTelemetryClient?.ReportHealthAsync(
                !string.IsNullOrEmpty(id) ? HealthScope.Application : HealthScope.Node,
                $"{(appName != null ? appName.OriginalString : this.NodeName)}",
                healthState,
                $"{this.NodeName}/{errorWarningKind}/{drive}{data.Property}/{Math.Round(data.AverageDataValue)}",
                this.ObserverName,
                this.Token);
        }

        // ETW.
        if (this.IsEtwEnabled)
        {
            Logger.EtwLogger?.Write(
                "FabricObserverDataEvent",
                new
                {
                    Level = (healthState == HealthState.Warning) ? 1 : 2,
                    Node = this.NodeName,
                    Observer = this.ObserverName,
                    HealthEventErrorCode = errorWarningKind,
                    HealthEventDescription = healthMessageText,
                    data.Property,
                    data.Id,
                    Value = $"{Math.Round(data.AverageDataValue)}",
                    Unit = data.Units,
                });
        }
    }
    else
    {
        if (data.ActiveErrorOrWarning)
        {
            // NOTE(review): this Ok report uses "ObserverName(data.Id)" as Observer, while the
            // Warning/Error report above uses plain ObserverName - confirm the reporter still
            // matches the two so the earlier health event is actually cleared.
            var report = new HealthReport
            {
                AppName = appName,
                EmitLogEvent = true,
                HealthMessage = $"{data.Property} is now within normal/expected range.",
                HealthReportTimeToLive = default(TimeSpan),
                ReportType = healthReportType,
                State = HealthState.Ok,
                NodeName = this.NodeName,
                Observer = $"{this.ObserverName}({data.Id})",
                ResourceUsageDataProperty = data.Property,
            };

            // Emit an Ok Health Report to clear Fabric Health warning.
            this.HealthReporter.ReportHealthToServiceFabric(report);

            // Reset health states.
            data.ActiveErrorOrWarning = false;
            this.HasActiveFabricErrorOrWarning = false;
        }
    }

    // No need to keep data in memory.
    data.Data.Clear();
    data.Data.TrimExcess();
}