/// <summary>
/// When <paramref name="disposing"/> is true, sends an Ok application health report for every
/// tracked connection whose health is Error or Warning, clearing the earlier report.
/// </summary>
/// <param name="disposing">When false, the method returns without doing anything.</param>
protected override void Dispose(bool disposing)
{
    if (disposing)
    {
        var unhealthyConnections = this.connectionStatus.Where(
            c => c.Health == HealthState.Error || c.Health == HealthState.Warning);

        foreach (var unhealthy in unhealthyConnections)
        {
            // An Ok report with the same code and property replaces the existing Error/Warning.
            var clearingReport = new HealthReport
            {
                AppName = new Uri(unhealthy.TargetApp),
                Code = FOErrorWarningCodes.AppWarningNetworkEndpointUnreachable,
                EmitLogEvent = true,
                HealthMessage = $"Clearing NetworkObserver's Health Error/Warning for {unhealthy.TargetApp}/{unhealthy.HostName} connection state since FO is stopping.",
                HealthReportTimeToLive = default(TimeSpan),
                State = HealthState.Ok,
                NodeName = NodeName,
                Observer = ObserverName,
                Property = $"EndpointUnreachable({unhealthy.HostName})",
                ReportType = HealthReportType.Application,
            };

            HealthReporter.ReportHealthToServiceFabric(clearingReport);
        }
    }
}
/// <summary>
/// Publishes the accumulated message as an Ok node-level report: writes it to the local log,
/// sends a Service Fabric health report, emits telemetry and ETW when those are enabled,
/// and finally clears the message buffer.
/// </summary>
/// <param name="token">Not read by this method; <c>this.Token</c> is what is handed to telemetry.</param>
/// <returns>An already-completed task.</returns>
public override Task ReportAsync(CancellationToken token)
{
    string description = this.message.ToString();

    // Local log.
    this.ObserverLogger.LogInfo(description);

    // Service Fabric health report, sent through a reporter created for this call.
    var fabricReport = new Utilities.HealthReport
    {
        Code = FoErrorWarningCodes.Ok,
        HealthMessage = description,
        NodeName = this.NodeName,
        Observer = this.ObserverName,
        ReportType = HealthReportType.Node,
        State = HealthState.Ok,
    };

    new ObserverHealthReporter(this.ObserverLogger).ReportHealthToServiceFabric(fabricReport);

    // Telemetry goes to whichever provider is configured in FabricObserver's Settings.xml.
    var telemetry = new TelemetryData(this.FabricClientInstance, this.Token)
    {
        Code = FoErrorWarningCodes.Ok,
        HealthEventDescription = description,
        HealthState = "Ok",
        NodeName = this.NodeName,
        ObserverName = this.ObserverName,
        Source = ObserverConstants.FabricObserverName,
    };

    bool sendTelemetry = this.IsTelemetryProviderEnabled && this.IsObserverTelemetryEnabled;

    if (sendTelemetry)
    {
        _ = this.TelemetryClient?.ReportHealthAsync(telemetry, this.Token);
    }

    // ETW.
    if (this.IsEtwEnabled)
    {
        var etwPayload = new
        {
            Code = FoErrorWarningCodes.Ok,
            HealthEventDescription = description,
            HealthState = "Ok",
            this.NodeName,
            this.ObserverName,
            Source = ObserverConstants.FabricObserverName,
        };

        Logger.EtwLogger?.Write(ObserverConstants.FabricObserverETWEventName, etwPayload);
    }

    this.message.Clear();

    return Task.CompletedTask;
}
/// <inheritdoc/>
/// <remarks>
/// Sends one node-level health report: Ok when all three warning lists are empty, otherwise a
/// Warning whose message lists the expired, not-found and expiring entries (in that order).
/// The three lists are set to null afterwards, so another report needs a fresh observation run.
/// </remarks>
public override Task ReportAsync(CancellationToken token)
{
    // Nothing to do when cancelled, or when any list is unset (no observation run since the last report).
    if (token.IsCancellationRequested
        || this.ExpiringWarnings == null
        || this.ExpiredWarnings == null
        || this.NotFoundWarnings == null)
    {
        return Task.CompletedTask;
    }

    // Both report flavours share the same time-to-live rule.
    var timeToLive = this.RunInterval > TimeSpan.MinValue ? this.RunInterval : this.HealthReportTimeToLive;

    bool anyWarnings = this.ExpiringWarnings.Count != 0
        || this.ExpiredWarnings.Count != 0
        || this.NotFoundWarnings.Count != 0;

    HealthReport healthReport;

    if (anyWarnings)
    {
        // Each entry is prefixed with a newline; the first two sections also end with one.
        string expiredSection = string.Empty;
        string notFoundSection = string.Empty;
        string expiringSection = string.Empty;

        if (this.ExpiredWarnings.Count != 0)
        {
            expiredSection = string.Concat(this.ExpiredWarnings.Select(w => "\n" + w)) + "\n";
        }

        if (this.NotFoundWarnings.Count != 0)
        {
            notFoundSection = string.Concat(this.NotFoundWarnings.Select(w => "\n" + w)) + "\n";
        }

        if (this.ExpiringWarnings.Count != 0)
        {
            expiringSection = string.Concat(this.ExpiringWarnings.Select(w => "\n" + w));
        }

        healthReport = new HealthReport
        {
            Code = FoErrorWarningCodes.WarningCertificateExpiration,
            Observer = this.ObserverName,
            ReportType = HealthReportType.Node,
            EmitLogEvent = true,
            NodeName = this.NodeName,
            HealthMessage = expiredSection + notFoundSection + expiringSection,
            State = HealthState.Warning,
            HealthReportTimeToLive = timeToLive,
        };

        this.HasActiveFabricErrorOrWarning = true;
    }
    else
    {
        healthReport = new HealthReport
        {
            Observer = this.ObserverName,
            ReportType = HealthReportType.Node,
            EmitLogEvent = true,
            NodeName = this.NodeName,
            HealthMessage = "All cluster and monitored app certificates are healthy.",
            State = HealthState.Ok,
            HealthReportTimeToLive = timeToLive,
        };

        this.HasActiveFabricErrorOrWarning = false;
    }

    this.HealthReporter.ReportHealthToServiceFabric(healthReport);

    // Reset the lists so a report cannot be repeated without a new observation run.
    this.ExpiredWarnings = null;
    this.ExpiringWarnings = null;
    this.NotFoundWarnings = null;

    this.LastRunDateTime = DateTime.Now;

    return Task.CompletedTask;
}
/// <summary>
/// Reports the state of the tracked endpoint connections, per configured target app: a Warning
/// application health report (plus optional telemetry and ETW) for each connection that is not
/// connected, and an Ok report that clears the earlier Warning/Error for each connection that is
/// connected again. Connected entries are then removed from the tracking list and the endpoint
/// test results are cleared.
/// </summary>
/// <param name="token">Checked before each app config and each connection.</param>
/// <returns>An already-completed task.</returns>
/// <exception cref="OperationCanceledException">Thrown when <paramref name="token"/> is cancelled.</exception>
public override Task ReportAsync(CancellationToken token)
{
    var timeToLiveWarning = SetHealthReportTimeToLive();

    // Report on connection state.
    foreach (var config in this.userConfig)
    {
        token.ThrowIfCancellationRequested();

        foreach (var conn in this.connectionStatus.Where(cs => cs.TargetApp == config.TargetApp))
        {
            token.ThrowIfCancellationRequested();

            if (!conn.Connected)
            {
                this.healthState = HealthState.Warning;
                var healthMessage = $"Outbound Internet connection failure detected for endpoint {conn.HostName}{Environment.NewLine}";

                // Health telemetry (it may drive an alert in AppInsights or LogAnalytics).
                // The same instance is attached to the health report below as HealthData.
                var telemetryData = new TelemetryData(FabricClientInstance, token)
                {
                    ApplicationName = conn.TargetApp,
                    Code = FOErrorWarningCodes.AppWarningNetworkEndpointUnreachable,
                    HealthState = "Warning",
                    HealthEventDescription = healthMessage,
                    ObserverName = ObserverName,
                    Metric = ErrorWarningProperty.InternetConnectionFailure,
                    NodeName = NodeName,
                };

                if (IsTelemetryProviderEnabled && IsObserverTelemetryEnabled)
                {
                    _ = TelemetryClient?.ReportMetricAsync(
                        telemetryData,
                        Token);
                }

                var report = new HealthReport
                {
                    AppName = new Uri(conn.TargetApp),
                    EmitLogEvent = true,
                    HealthData = telemetryData,
                    HealthMessage = healthMessage,
                    HealthReportTimeToLive = timeToLiveWarning,
                    State = this.healthState,
                    NodeName = NodeName,
                    Observer = ObserverName,
                    Property = $"EndpointUnreachable({conn.HostName})",
                    ReportType = HealthReportType.Application,
                    ResourceUsageDataProperty = $"{ErrorWarningProperty.InternetConnectionFailure}: {conn.HostName}",
                };

                // Send health report Warning and log event locally.
                HealthReporter.ReportHealthToServiceFabric(report);

                // This means this observer created a Warning or Error SF Health Report.
                HasActiveFabricErrorOrWarning = true;

                // ETW.
                if (IsEtwEnabled)
                {
                    Logger.EtwLogger?.Write(
                        ObserverConstants.FabricObserverETWEventName,
                        new
                        {
                            ApplicationName = conn.TargetApp,
                            Code = FOErrorWarningCodes.AppWarningNetworkEndpointUnreachable,
                            HealthState = "Warning",
                            HealthEventDescription = healthMessage,
                            ObserverName,
                            Metric = ErrorWarningProperty.InternetConnectionFailure,
                            NodeName,
                        });
                }
            }
            else
            {
                // Only a connection previously flagged Warning or Error has something to clear.
                // The two inequalities must be AND-ed: OR-ing them is always true (a value cannot
                // equal both states), which skipped every connection and left this branch dead.
                if (conn.Health != HealthState.Warning && conn.Health != HealthState.Error)
                {
                    continue;
                }

                this.healthState = HealthState.Ok;
                var healthMessage = $"Outbound Internet connection successful for {conn.HostName} from node {NodeName}.";

                // Clear existing Health Warning.
                var report = new HealthReport
                {
                    AppName = new Uri(conn.TargetApp),
                    Code = FOErrorWarningCodes.AppWarningNetworkEndpointUnreachable,
                    EmitLogEvent = true,
                    HealthMessage = healthMessage,
                    HealthReportTimeToLive = default(TimeSpan),
                    State = HealthState.Ok,
                    NodeName = NodeName,
                    Observer = ObserverName,
                    Property = $"EndpointUnreachable({conn.HostName})",
                    ReportType = HealthReportType.Application,
                };

                HealthReporter.ReportHealthToServiceFabric(report);

                // Telemetry.
                if (IsTelemetryProviderEnabled && IsObserverTelemetryEnabled)
                {
                    var telemetryData = new TelemetryData(FabricClientInstance, token)
                    {
                        ApplicationName = conn.TargetApp,
                        Code = FOErrorWarningCodes.Ok,
                        HealthState = "Ok",
                        HealthEventDescription = healthMessage,
                        ObserverName = ObserverName,
                        Metric = "Internet Connection State",
                        NodeName = NodeName,
                    };

                    _ = TelemetryClient?.ReportMetricAsync(
                        telemetryData,
                        Token);
                }

                // ETW.
                if (IsEtwEnabled)
                {
                    Logger.EtwLogger?.Write(
                        ObserverConstants.FabricObserverETWEventName,
                        new
                        {
                            ApplicationName = conn.TargetApp,
                            Code = FOErrorWarningCodes.Ok,
                            HealthState = "Ok",
                            HealthEventDescription = healthMessage,
                            ObserverName,
                            Metric = "Internet Connection State",
                            NodeName,
                        });
                }

                // Reset health state.
                // NOTE(review): this resets the flag even if another connection in this same run
                // was just reported as Warning - confirm whether that is intended.
                HasActiveFabricErrorOrWarning = false;
            }
        }
    }

    // Drop entries that are connected; entries that are still failing stay tracked.
    _ = this.connectionStatus.RemoveAll(conn => conn.Connected);
    this.connectionStatus.TrimExcess();
    this.connEndpointTestResults.Clear();

    return Task.CompletedTask;
}
/// <inheritdoc/>
/// <remarks>
/// Sends one node-level health report: Ok when all three warning lists are empty, otherwise a
/// Warning whose message lists the expired, not-found and expiring entries (in that order).
/// In the Warning case the same information is also emitted as telemetry and ETW when enabled.
/// The three lists are set to null afterwards, so another report needs a fresh observation run.
/// </remarks>
public override Task ReportAsync(CancellationToken token)
{
    // Nothing to do when cancelled, or when any list is unset (no observation run since the last report).
    if (token.IsCancellationRequested
        || this.ExpiringWarnings == null
        || this.ExpiredWarnings == null
        || this.NotFoundWarnings == null)
    {
        return Task.CompletedTask;
    }

    // Both report flavours share the same time-to-live rule.
    var timeToLive = this.RunInterval > TimeSpan.MinValue ? this.RunInterval : this.HealthReportTimeToLive;

    bool anyWarnings = this.ExpiringWarnings.Count != 0
        || this.ExpiredWarnings.Count != 0
        || this.NotFoundWarnings.Count != 0;

    HealthReport healthReport;

    if (!anyWarnings)
    {
        healthReport = new HealthReport
        {
            Observer = this.ObserverName,
            ReportType = HealthReportType.Node,
            EmitLogEvent = true,
            NodeName = this.NodeName,
            HealthMessage = "All cluster and monitored app certificates are healthy.",
            State = HealthState.Ok,
            HealthReportTimeToLive = timeToLive,
        };

        this.HasActiveFabricErrorOrWarning = false;
    }
    else
    {
        // Each entry is prefixed with a newline; the first two sections also end with one.
        string expiredSection = this.ExpiredWarnings.Count != 0
            ? string.Concat(this.ExpiredWarnings.Select(w => "\n" + w)) + "\n"
            : string.Empty;

        string notFoundSection = this.NotFoundWarnings.Count != 0
            ? string.Concat(this.NotFoundWarnings.Select(w => "\n" + w)) + "\n"
            : string.Empty;

        string expiringSection = this.ExpiringWarnings.Count != 0
            ? string.Concat(this.ExpiringWarnings.Select(w => "\n" + w))
            : string.Empty;

        string healthMessage = expiredSection + notFoundSection + expiringSection;

        healthReport = new HealthReport
        {
            Code = FoErrorWarningCodes.WarningCertificateExpiration,
            Observer = this.ObserverName,
            ReportType = HealthReportType.Node,
            EmitLogEvent = true,
            NodeName = this.NodeName,
            HealthMessage = healthMessage,
            State = HealthState.Warning,
            HealthReportTimeToLive = timeToLive,
        };

        this.HasActiveFabricErrorOrWarning = true;

        // Telemetry.
        if (this.IsTelemetryProviderEnabled && this.IsObserverTelemetryEnabled)
        {
            var warningTelemetry = new TelemetryData(this.FabricClientInstance, token)
            {
                Code = FoErrorWarningCodes.WarningCertificateExpiration,
                HealthState = "Warning",
                NodeName = this.NodeName,
                Metric = ErrorWarningProperty.CertificateExpiration,
                HealthEventDescription = healthMessage,
                ObserverName = this.ObserverName,
                Source = ObserverConstants.FabricObserverName,
                Value = FoErrorWarningCodes.GetErrorWarningNameFromFOCode(
                    FoErrorWarningCodes.WarningCertificateExpiration,
                    HealthScope.Node),
            };

            _ = this.TelemetryClient?.ReportMetricAsync(warningTelemetry, this.Token);
        }

        // ETW.
        if (this.IsEtwEnabled)
        {
            var etwPayload = new
            {
                Code = FoErrorWarningCodes.WarningCertificateExpiration,
                HealthState = "Warning",
                NodeName = this.NodeName,
                Metric = ErrorWarningProperty.CertificateExpiration,
                HealthEventDescription = healthMessage,
                ObserverName = this.ObserverName,
                Source = ObserverConstants.FabricObserverName,
                Value = FoErrorWarningCodes.GetErrorWarningNameFromFOCode(
                    FoErrorWarningCodes.WarningCertificateExpiration,
                    HealthScope.Node),
            };

            Logger.EtwLogger?.Write(ObserverConstants.FabricObserverETWEventName, etwPayload);
        }
    }

    this.HealthReporter.ReportHealthToServiceFabric(healthReport);

    // Reset the lists so a report cannot be repeated without a new observation run.
    this.ExpiredWarnings = null;
    this.ExpiringWarnings = null;
    this.NotFoundWarnings = null;

    this.LastRunDateTime = DateTime.Now;

    return Task.CompletedTask;
}
/// <summary>
/// Converts a <see cref="HealthReport"/> into a Service Fabric health report and submits it through
/// the fabric client's HealthManager: as an application report when the report type is Application
/// and an app name is present, otherwise as a node report. Optionally writes a local log entry first.
/// </summary>
/// <param name="healthReport">The report to send. A null value is ignored.</param>
public void ReportHealthToServiceFabric(HealthReport healthReport)
{
    if (healthReport == null)
    {
        return;
    }

    // Ok reports are sent immediately so they clear Warning/Error states quickly;
    // everything else is sent without the Immediate flag.
    var sendOptions = new HealthReportSendOptions
    {
        Immediate = healthReport.State == HealthState.Ok,
    };

    // Five minutes unless the report carries its own (non-default) time-to-live.
    TimeSpan timeToLive = healthReport.HealthReportTimeToLive != default(TimeSpan)
        ? healthReport.HealthReportTimeToLive
        : TimeSpan.FromMinutes(5);

    // The source id is the FO error/warning code when the report has one, else the observer name.
    string sourceId = string.IsNullOrEmpty(healthReport.Code)
        ? healthReport.Observer
        : healthReport.Code;

    var healthInformation = new HealthInformation(sourceId, ResolveProperty(healthReport.Observer), healthReport.State)
    {
        Description = healthReport.HealthMessage,
        TimeToLive = timeToLive,
        RemoveWhenExpired = true,
    };

    // Log locally only when the ObserverWebApi (REST log reader) app is deployed
    // and the report asked for a log event.
    if (ObserverManager.ObserverWebAppDeployed && healthReport.EmitLogEvent)
    {
        string logFormat = healthReport.NodeName + ": {0}";

        switch (healthReport.State)
        {
            case HealthState.Error:
                this.logger.LogError(logFormat, healthInformation.Description);
                break;

            case HealthState.Warning:
                this.logger.LogWarning(logFormat, healthInformation.Description);
                break;

            default:
                this.logger.LogInfo(logFormat, healthInformation.Description);
                break;
        }
    }

    // Anything that is not an application report with a known app name goes out as a node report.
    bool isNodeReport = healthReport.ReportType != HealthReportType.Application || healthReport.AppName == null;

    if (isNodeReport)
    {
        var nodeHealthReport = new NodeHealthReport(healthReport.NodeName, healthInformation);
        this.fabricClient.HealthManager.ReportHealth(nodeHealthReport, sendOptions);
        return;
    }

    var appHealthReport = new ApplicationHealthReport(healthReport.AppName, healthInformation);
    this.fabricClient.HealthManager.ReportHealth(appHealthReport, sendOptions);

    // Maps the reporting observer to the health property name used for its events.
    string ResolveProperty(string observerName)
    {
        switch (observerName)
        {
            case ObserverConstants.AppObserverName:
                return "AppHealth";

            case ObserverConstants.CertificateObserverName:
                return "SecurityHealth";

            case ObserverConstants.DiskObserverName:
                return "DiskHealth";

            case ObserverConstants.FabricSystemObserverName:
                return "FabricSystemServiceHealth";

            case ObserverConstants.NetworkObserverName:
                return "NetworkingHealth";

            case ObserverConstants.OsObserverName:
                return "MachineInformation";

            case ObserverConstants.NodeObserverName:
                return "MachineResourceHealth";

            default:
                return "FOGenericHealth";
        }
    }
}
/// <inheritdoc/>
/// <remarks>
/// Reports an Error when the recorded OS status is anything other than "OK" (compared without
/// regard to case), clears a previously raised Error/Warning once the status is "OK" again,
/// writes SysInfo.txt when the ObserverWebApi app is deployed, and always finishes with an Ok
/// report that carries the OS information text.
/// </remarks>
/// <exception cref="OperationCanceledException">Thrown when <paramref name="token"/> is cancelled.</exception>
public override Task ReportAsync(CancellationToken token)
{
    try
    {
        token.ThrowIfCancellationRequested();

        // Ordinal, case-insensitive comparison: the status is a machine token, so it must not be
        // compared through culture-sensitive ToUpper(). A null status is neither healthy nor unhealthy.
        bool osReportsOk = string.Equals(this.osStatus, "OK", StringComparison.OrdinalIgnoreCase);

        // OS Health.
        if (this.osStatus != null && !osReportsOk)
        {
            string healthMessage = $"OS reporting unhealthy: {this.osStatus}";

            var healthReport = new HealthReport
            {
                Observer = this.ObserverName,
                NodeName = this.NodeName,
                HealthMessage = healthMessage,
                State = HealthState.Error,
                HealthReportTimeToLive = this.SetTimeToLiveWarning(),
            };

            this.HealthReporter.ReportHealthToServiceFabric(healthReport);

            // This means this observer created a Warning or Error SF Health Report.
            this.HasActiveFabricErrorOrWarning = true;

            // Send Health Report as Telemetry (perhaps it signals an Alert from App Insights, for example).
            if (this.IsTelemetryEnabled)
            {
                _ = this.ObserverTelemetryClient?.ReportHealthAsync(
                    HealthScope.Application,
                    FabricRuntime.GetActivationContext().ApplicationName,
                    HealthState.Error,
                    $"{this.NodeName} - OS reporting unhealthy: {this.osStatus}",
                    this.ObserverName,
                    this.Token);
            }
        }
        else if (this.HasActiveFabricErrorOrWarning && osReportsOk)
        {
            // Clear Error or Warning with an OK Health Report.
            string healthMessage = $"OS reporting healthy: {this.osStatus}";

            var healthReport = new HealthReport
            {
                Observer = this.ObserverName,
                NodeName = this.NodeName,
                HealthMessage = healthMessage,
                State = HealthState.Ok,
                HealthReportTimeToLive = default(TimeSpan),
            };

            this.HealthReporter.ReportHealthToServiceFabric(healthReport);

            // Reset internal health state.
            this.HasActiveFabricErrorOrWarning = false;
        }

        if (ObserverManager.ObserverWebAppDeployed)
        {
            var logPath = Path.Combine(this.ObserverLogger.LogFolderBasePath, "SysInfo.txt");

            // Invariant culture keeps the timestamp layout (and its '/' separators) the same
            // regardless of the culture the service process runs under.
            string lastUpdated = DateTime.UtcNow.ToString(
                "M/d/yyyy HH:mm:ss",
                System.Globalization.CultureInfo.InvariantCulture);

            // This file is used by the web application (log reader).
            if (!this.ObserverLogger.TryWriteLogFile(logPath, $"Last updated on {lastUpdated} UTC<br/>{this.osReport}"))
            {
                this.HealthReporter.ReportFabricObserverServiceHealth(
                    this.FabricServiceContext.ServiceName.OriginalString,
                    this.ObserverName,
                    HealthState.Warning,
                    "Unable to create SysInfo.txt file.");
            }
        }

        var report = new HealthReport
        {
            Observer = this.ObserverName,
            HealthMessage = this.osReport,
            State = HealthState.Ok,
            NodeName = this.NodeName,
            HealthReportTimeToLive = this.SetTimeToLiveWarning(),
        };

        this.HealthReporter.ReportHealthToServiceFabric(report);

        return Task.CompletedTask;
    }
    catch (Exception e) when (!(e is OperationCanceledException))
    {
        // Cancellation is an expected shutdown path, so it is excluded by the filter and
        // propagates without being reported as an Error health event. Everything else is
        // reported and rethrown.
        this.HealthReporter.ReportFabricObserverServiceHealth(
            this.FabricServiceContext.ServiceName.OriginalString,
            this.ObserverName,
            HealthState.Error,
            $"Unhandled exception processing OS information: {e.Message}: \n {e.StackTrace}");

        throw;
    }
}
/// <summary>
/// Samples resource usage for the host process of every entry in ReplicaOrInstanceList:
/// CPU and memory are sampled every 250 ms for the monitoring duration (15 seconds unless
/// MonitorDuration is set), then the active and ephemeral port counts are recorded once.
/// </summary>
/// <param name="token">Checked before each replica/instance and on every sampling iteration.</param>
/// <returns>A task that completes when all entries have been sampled.</returns>
/// <exception cref="OperationCanceledException">Thrown when cancellation is requested.</exception>
private async Task MonitorDeployedAppsAsync(CancellationToken token)
{
    Process currentProcess = null;

    foreach (var repOrInst in ReplicaOrInstanceList)
    {
        token.ThrowIfCancellationRequested();

        var timer = new Stopwatch();
        int processId = (int)repOrInst.HostProcessId;
        var cpuUsage = new CpuUsage();

        try
        {
            // App level.
            currentProcess = Process.GetProcessById(processId);

            token.ThrowIfCancellationRequested();

            var procName = currentProcess.ProcessName;
            string appNameOrType = GetAppNameOrType(repOrInst);

            var id = $"{appNameOrType}:{procName}";

            // Add new resource data structures for each app service process.
            if (this.allAppCpuData.All(list => list.Id != id))
            {
                this.allAppCpuData.Add(new FabricResourceUsageData<double>(ErrorWarningProperty.TotalCpuTime, id, DataCapacity, UseCircularBuffer));
                this.allAppMemDataMb.Add(new FabricResourceUsageData<float>(ErrorWarningProperty.TotalMemoryConsumptionMb, id, DataCapacity, UseCircularBuffer));
                this.allAppMemDataPercent.Add(new FabricResourceUsageData<double>(ErrorWarningProperty.TotalMemoryConsumptionPct, id, DataCapacity, UseCircularBuffer));
                this.allAppTotalActivePortsData.Add(new FabricResourceUsageData<int>(ErrorWarningProperty.TotalActivePorts, id, 1));
                this.allAppEphemeralPortsData.Add(new FabricResourceUsageData<int>(ErrorWarningProperty.TotalEphemeralPorts, id, 1));
            }

            TimeSpan duration = TimeSpan.FromSeconds(15);

            if (MonitorDuration > TimeSpan.MinValue)
            {
                duration = MonitorDuration;
            }

            // Warm up the counters.
            _ = cpuUsage.GetCpuUsagePercentageProcess(currentProcess);
            _ = ProcessInfoProvider.Instance.GetProcessPrivateWorkingSetInMB(currentProcess.Id);

            timer.Start();

            // Compare whole TimeSpans. TimeSpan.Seconds is only the 0-59 seconds component, so
            // comparing it made any duration of a minute or more end far too early (e.g. a
            // 1-minute duration has Seconds == 0 and sampled for about one second).
            while (!currentProcess.HasExited && timer.Elapsed <= duration)
            {
                token.ThrowIfCancellationRequested();

                // CPU (all cores). Negative readings are skipped; readings above 100 are capped.
                double cpu = cpuUsage.GetCpuUsagePercentageProcess(currentProcess);

                if (cpu >= 0)
                {
                    if (cpu > 100)
                    {
                        cpu = 100;
                    }

                    this.allAppCpuData.FirstOrDefault(x => x.Id == id).Data.Add(cpu);
                }

                // Memory (private working set (process)).
                var processMem = ProcessInfoProvider.Instance.GetProcessPrivateWorkingSetInMB(currentProcess.Id);
                this.allAppMemDataMb.FirstOrDefault(x => x.Id == id).Data.Add(processMem);

                // Memory (percent in use (total)). Only the total is needed from the tuple.
                var (totalMemory, _) = OperatingSystemInfoProvider.Instance.TupleGetTotalPhysicalMemorySizeAndPercentInUse();
                long totalMem = totalMemory;

                if (totalMem > -1)
                {
                    double usedPct = Math.Round(((double)(processMem * 100)) / (totalMem * 1024), 2);
                    this.allAppMemDataPercent.FirstOrDefault(x => x.Id == id).Data.Add(Math.Round(usedPct, 1));
                }

                await Task.Delay(250, Token);
            }

            timer.Stop();
            timer.Reset();

            // Total and Ephemeral ports.
            this.allAppTotalActivePortsData.FirstOrDefault(x => x.Id == id)
                .Data.Add(OperatingSystemInfoProvider.Instance.GetActivePortCount(currentProcess.Id, FabricServiceContext));

            this.allAppEphemeralPortsData.FirstOrDefault(x => x.Id == id)
                .Data.Add(OperatingSystemInfoProvider.Instance.GetActiveEphemeralPortCount(currentProcess.Id, FabricServiceContext));
        }
        catch (Exception e)
        {
#if DEBUG
            // DEBUG INFO
            var healthReport = new Utilities.HealthReport
            {
                AppName = repOrInst.ApplicationName,
                HealthMessage = $"Error: {e}\n\n",
                State = HealthState.Ok,
                Code = FOErrorWarningCodes.Ok,
                NodeName = NodeName,
                Observer = ObserverName,
                Property = $"{e.Source}",
                ReportType = HealthReportType.Application,
            };

            HealthReporter.ReportHealthToServiceFabric(healthReport);
#endif
            if (e is Win32Exception || e is ArgumentException || e is InvalidOperationException)
            {
                // These exception types are logged as information and monitoring moves on to the next entry.
                // The message previously contained a literal "/n" where a newline was intended.
                WriteToLogWithLevel(
                    ObserverName,
                    $"MonitorAsync failed to find current service process for {repOrInst.ApplicationName?.OriginalString ?? repOrInst.ApplicationTypeName}\n{e}",
                    LogLevel.Information);
            }
            else
            {
                // Cancellation is rethrown without being logged; anything else is logged first.
                if (!(e is OperationCanceledException || e is TaskCanceledException))
                {
                    WriteToLogWithLevel(
                        ObserverName,
                        $"Unhandled exception in MonitorAsync: \n {e}",
                        LogLevel.Warning);
                }

                throw;
            }
        }
        finally
        {
            currentProcess?.Dispose();
            currentProcess = null;
        }
    }
}
/// <summary> /// This function processes numeric data held in FRUD instances and generates Application or Node level Health Reports depending on supplied thresholds. /// </summary> /// <typeparam name="T">This represents the numeric type of data this function will operate on.</typeparam> /// <param name="data">FabricResourceUsageData instance.</param> /// <param name="thresholdError">Error threshold (numeric)</param> /// <param name="thresholdWarning">Warning threshold (numeric)</param> /// <param name="healthReportTtl">Health report Time to Live (TimeSpan)</param> /// <param name="healthReportType">HealthReport type. Note, only Application and Node health report types are supported.</param> /// <param name="replicaOrInstance">Replica or Instance information contained in a type.</param> /// <param name="dumpOnError">Wheter or not to dump process if Error threshold has been reached.</param> public void ProcessResourceDataReportHealth <T>( FabricResourceUsageData <T> data, T thresholdError, T thresholdWarning, TimeSpan healthReportTtl, HealthReportType healthReportType = HealthReportType.Node, ReplicaOrInstanceMonitoringInfo replicaOrInstance = null, bool dumpOnError = false) where T : struct { if (data == null) { throw new ArgumentException("Supply all required parameters with non-null value."); } if (healthReportType != HealthReportType.Application && healthReportType != HealthReportType.Node) { this.ObserverLogger.LogWarning($"ProcessResourceDataReportHealth: Unsupported HealthReport type -> {Enum.GetName(typeof(HealthReportType), healthReportType)}"); return; } var thresholdName = "Minimum"; bool warningOrError = false; string repPartitionId = null, repOrInstanceId = null, name = null, id = null, procName = null; T threshold = thresholdWarning; var healthState = HealthState.Ok; Uri appName = null; Uri serviceName = null; TelemetryData telemetryData = null; if (healthReportType == HealthReportType.Application) { if (replicaOrInstance != null) { repPartitionId = 
$"Partition: {replicaOrInstance.PartitionId}"; repOrInstanceId = $"Replica: {replicaOrInstance.ReplicaOrInstanceId}"; // Create a unique id which will be used for health Warnings and OKs (clears). appName = replicaOrInstance.ApplicationName; serviceName = replicaOrInstance.ServiceName; name = appName.OriginalString.Replace("fabric:/", string.Empty); } else { appName = new Uri("fabric:/System"); name = data.Id; } id = name + "_" + data.Property.Replace(" ", string.Empty); // The health event description will be a serialized instance of telemetryData, // so it should be completely constructed (filled with data) regardless // of user telemetry settings. telemetryData = new TelemetryData(FabricClientInstance, Token) { ApplicationName = appName?.OriginalString ?? string.Empty, Code = FOErrorWarningCodes.Ok, HealthState = Enum.GetName(typeof(HealthState), HealthState.Ok), NodeName = NodeName, ObserverName = ObserverName, Metric = data.Property, Value = Math.Round(data.AverageDataValue, 1), PartitionId = replicaOrInstance?.PartitionId.ToString(), ReplicaId = replicaOrInstance?.ReplicaOrInstanceId.ToString(), ServiceName = serviceName?.OriginalString ?? string.Empty, Source = ObserverConstants.FabricObserverName, }; try { if (replicaOrInstance != null && replicaOrInstance.HostProcessId > 0) { procName = Process.GetProcessById((int)replicaOrInstance.HostProcessId).ProcessName; } else { // The name of the target service process is always the id for data containers coming from FSO. procName = data.Id; } telemetryData.ServiceName = procName; if (IsTelemetryProviderEnabled && IsObserverTelemetryEnabled) { _ = TelemetryClient?.ReportMetricAsync( telemetryData, Token).ConfigureAwait(false); } if (IsEtwEnabled) { Logger.EtwLogger?.Write( ObserverConstants.FabricObserverETWEventName, new { ApplicationName = appName?.OriginalString ?? 
string.Empty, Code = FOErrorWarningCodes.Ok, HealthState = Enum.GetName(typeof(HealthState), HealthState.Ok), NodeName, ObserverName, Metric = data.Property, Value = Math.Round(data.AverageDataValue, 1), PartitionId = replicaOrInstance?.PartitionId.ToString(), ReplicaId = replicaOrInstance?.ReplicaOrInstanceId.ToString(), ServiceName = procName, Source = ObserverConstants.FabricObserverName, }); } } catch (ArgumentException) { return; } catch (InvalidOperationException) { return; } } else { string drive = string.Empty; if (ObserverName == ObserverConstants.DiskObserverName) { drive = $"{data.Id}: "; if (RuntimeInformation.IsOSPlatform(OSPlatform.Windows)) { drive = $"{data.Id.Remove(1, 2)}: "; } } // The health event description will be a serialized instance of telemetryData, // so it should be completely constructed (filled with data) regardless // of user telemetry settings. telemetryData = new TelemetryData(FabricClientInstance, Token) { Code = FOErrorWarningCodes.Ok, HealthState = Enum.GetName(typeof(HealthState), HealthState.Ok), NodeName = NodeName, ObserverName = ObserverName, Metric = $"{drive}{data.Property}", Source = ObserverConstants.FabricObserverName, Value = Math.Round(data.AverageDataValue, 1), }; if (IsTelemetryProviderEnabled && IsObserverTelemetryEnabled) { _ = TelemetryClient?.ReportMetricAsync( telemetryData, Token); } if (IsEtwEnabled) { Logger.EtwLogger?.Write( ObserverConstants.FabricObserverETWEventName, new { Code = FOErrorWarningCodes.Ok, HealthState = Enum.GetName(typeof(HealthState), HealthState.Ok), NodeName, ObserverName, Metric = $"{drive}{data.Property}", Source = ObserverConstants.FabricObserverName, Value = Math.Round(data.AverageDataValue, 1), }); } } // Health Error if (data.IsUnhealthy(thresholdError)) { thresholdName = "Maximum"; threshold = thresholdError; warningOrError = true; healthState = HealthState.Error; // This is primarily useful for AppObserver, but makes sense to be // part of the base class for future use, like 
for FSO. if (replicaOrInstance != null && dumpOnError) { try { int procId = (int)replicaOrInstance.HostProcessId; if (!this.serviceDumpCountDictionary.ContainsKey(procName)) { this.serviceDumpCountDictionary.Add(procName, 0); } if (this.serviceDumpCountDictionary[procName] < this.maxDumps) { // DumpServiceProcess defaults to a Full dump with // process memory, handles and thread data. bool success = DumpServiceProcess(procId); if (success) { this.serviceDumpCountDictionary[procName]++; } } } // Ignore these, it just means no dmp will be created.This is not // critical to FO. Log as info, not warning. catch (Exception e) when(e is ArgumentException || e is InvalidOperationException) { ObserverLogger.LogInfo($"Unable to generate dmp file:{Environment.NewLine}{e}"); } } } // Health Warning if (!warningOrError && data.IsUnhealthy(thresholdWarning)) { warningOrError = true; healthState = HealthState.Warning; } if (warningOrError) { string errorWarningCode = null; switch (data.Property) { case ErrorWarningProperty.TotalCpuTime when healthReportType == HealthReportType.Application: errorWarningCode = (healthState == HealthState.Error) ? FOErrorWarningCodes.AppErrorCpuPercent : FOErrorWarningCodes.AppWarningCpuPercent; break; case ErrorWarningProperty.TotalCpuTime: errorWarningCode = (healthState == HealthState.Error) ? FOErrorWarningCodes.NodeErrorCpuPercent : FOErrorWarningCodes.NodeWarningCpuPercent; break; case ErrorWarningProperty.DiskSpaceUsagePercentage: errorWarningCode = (healthState == HealthState.Error) ? FOErrorWarningCodes.NodeErrorDiskSpacePercent : FOErrorWarningCodes.NodeWarningDiskSpacePercent; break; case ErrorWarningProperty.DiskSpaceUsageMb: errorWarningCode = (healthState == HealthState.Error) ? FOErrorWarningCodes.NodeErrorDiskSpaceMB : FOErrorWarningCodes.NodeWarningDiskSpaceMB; break; case ErrorWarningProperty.TotalMemoryConsumptionMb when healthReportType == HealthReportType.Application: errorWarningCode = (healthState == HealthState.Error) ? 
FOErrorWarningCodes.AppErrorMemoryMB : FOErrorWarningCodes.AppWarningMemoryMB; break; case ErrorWarningProperty.TotalMemoryConsumptionMb: errorWarningCode = (healthState == HealthState.Error) ? FOErrorWarningCodes.NodeErrorMemoryMB : FOErrorWarningCodes.NodeWarningMemoryMB; break; case ErrorWarningProperty.TotalMemoryConsumptionPct when replicaOrInstance != null: errorWarningCode = (healthState == HealthState.Error) ? FOErrorWarningCodes.AppErrorMemoryPercent : FOErrorWarningCodes.AppWarningMemoryPercent; break; case ErrorWarningProperty.TotalMemoryConsumptionPct: errorWarningCode = (healthState == HealthState.Error) ? FOErrorWarningCodes.NodeErrorMemoryPercent : FOErrorWarningCodes.NodeWarningMemoryPercent; break; case ErrorWarningProperty.DiskAverageQueueLength: errorWarningCode = (healthState == HealthState.Error) ? FOErrorWarningCodes.NodeErrorDiskAverageQueueLength : FOErrorWarningCodes.NodeWarningDiskAverageQueueLength; break; case ErrorWarningProperty.TotalActiveFirewallRules: errorWarningCode = (healthState == HealthState.Error) ? FOErrorWarningCodes.ErrorTooManyFirewallRules : FOErrorWarningCodes.WarningTooManyFirewallRules; break; case ErrorWarningProperty.TotalActivePorts when healthReportType == HealthReportType.Application: errorWarningCode = (healthState == HealthState.Error) ? FOErrorWarningCodes.AppErrorTooManyActiveTcpPorts : FOErrorWarningCodes.AppWarningTooManyActiveTcpPorts; break; case ErrorWarningProperty.TotalActivePorts: errorWarningCode = (healthState == HealthState.Error) ? FOErrorWarningCodes.NodeErrorTooManyActiveTcpPorts : FOErrorWarningCodes.NodeWarningTooManyActiveTcpPorts; break; case ErrorWarningProperty.TotalEphemeralPorts when healthReportType == HealthReportType.Application: errorWarningCode = (healthState == HealthState.Error) ? 
FOErrorWarningCodes.AppErrorTooManyActiveEphemeralPorts : FOErrorWarningCodes.AppWarningTooManyActiveEphemeralPorts; break; case ErrorWarningProperty.TotalEphemeralPorts: errorWarningCode = (healthState == HealthState.Error) ? FOErrorWarningCodes.NodeErrorTooManyActiveEphemeralPorts : FOErrorWarningCodes.NodeWarningTooManyActiveEphemeralPorts; break; } var healthMessage = new StringBuilder(); string drive = string.Empty; if (ObserverName == ObserverConstants.DiskObserverName) { drive = $"{data.Id}: "; if (RuntimeInformation.IsOSPlatform(OSPlatform.Windows)) { drive = $"{data.Id.Remove(1, 2)}: "; } } _ = healthMessage.Append($"{drive}{data.Property} is at or above the specified {thresholdName} limit ({threshold}{data.Units})"); _ = healthMessage.AppendLine($" - {data.Property}: {Math.Round(data.AverageDataValue)}{data.Units}"); // The health event description will be a serialized instance of telemetryData, // so it should be completely constructed (filled with data) regardless // of user telemetry settings. telemetryData.ApplicationName = appName?.OriginalString ?? string.Empty; telemetryData.Code = errorWarningCode; if (replicaOrInstance != null && !string.IsNullOrEmpty(replicaOrInstance.ContainerId)) { telemetryData.ContainerId = replicaOrInstance.ContainerId; } telemetryData.HealthState = Enum.GetName(typeof(HealthState), healthState); telemetryData.HealthEventDescription = healthMessage.ToString(); telemetryData.Metric = $"{drive}{data.Property}"; telemetryData.ServiceName = serviceName?.OriginalString ?? string.Empty; telemetryData.Source = ObserverConstants.FabricObserverName; telemetryData.Value = Math.Round(data.AverageDataValue, 1); // Send Health Report as Telemetry event (perhaps it signals an Alert from App Insights, for example.). if (IsTelemetryProviderEnabled && IsObserverTelemetryEnabled) { _ = TelemetryClient?.ReportHealthAsync( telemetryData, Token); } // ETW. 
if (IsEtwEnabled) { Logger.EtwLogger?.Write( ObserverConstants.FabricObserverETWEventName, new { ApplicationName = appName?.OriginalString ?? string.Empty, Code = errorWarningCode, ContainerId = replicaOrInstance != null ? replicaOrInstance.ContainerId ?? string.Empty : string.Empty, HealthState = Enum.GetName(typeof(HealthState), healthState), HealthEventDescription = healthMessage.ToString(), Metric = $"{drive}{data.Property}", Node = NodeName, ServiceName = serviceName?.OriginalString ?? string.Empty, Source = ObserverConstants.FabricObserverName, Value = Math.Round(data.AverageDataValue, 1), }); } var healthReport = new HealthReport { AppName = appName, Code = errorWarningCode, EmitLogEvent = true, HealthData = telemetryData, HealthMessage = healthMessage.ToString(), HealthReportTimeToLive = healthReportTtl, ReportType = healthReportType, State = healthState, NodeName = NodeName, Observer = ObserverName, ResourceUsageDataProperty = data.Property, }; if (!AppNames.Any(a => a == appName?.OriginalString)) { AppNames.Add(appName?.OriginalString); } // From FSO. if (replicaOrInstance == null && healthReportType == HealthReportType.Application) { HealthReportProperties.Add(id); } else { if (HealthReportProperties.Count == 0) { HealthReportProperties.Add(ObserverName switch { ObserverConstants.AppObserverName => "ApplicationHealth", ObserverConstants.CertificateObserverName => "SecurityHealth", ObserverConstants.DiskObserverName => "DiskHealth", ObserverConstants.FabricSystemObserverName => "FabricSystemServiceHealth", ObserverConstants.NetworkObserverName => "NetworkHealth", ObserverConstants.OSObserverName => "MachineInformation", ObserverConstants.NodeObserverName => "MachineResourceHealth", _ => $"{data.Property}", });
/// <summary>
/// Reports the outcome of the most recent endpoint connectivity checks to Service Fabric.
/// For each configured endpoint group whose target application is deployed on this node,
/// a Warning application health report is emitted for every connection entry that is not
/// connected, and an Ok report is emitted for entries that are connected again after
/// having been in Warning. Connected entries are removed from the tracked list afterwards.
/// </summary>
/// <param name="token">Cancellation token, checked before each endpoint group and each connection entry.</param>
/// <returns>A task that completes once all reports have been submitted.</returns>
/// <exception cref="OperationCanceledException">Thrown when <paramref name="token"/> is canceled.</exception>
public override async Task ReportAsync(CancellationToken token)
{
    var timeToLiveWarning = this.SetTimeToLiveWarning();

    // Report on connection state.
    for (int j = 0; j < this.userEndpoints.Count; j++)
    {
        token.ThrowIfCancellationRequested();

        string appTarget = this.userEndpoints[j].AppTarget;
        var appUri = new Uri(appTarget);

        var deployedApps = await this.FabricClientInstance.QueryManager
            .GetDeployedApplicationListAsync(this.NodeName, appUri)
            .ConfigureAwait(true);

        // We only care about deployed apps.
        if (deployedApps == null || deployedApps.Count < 1)
        {
            continue;
        }

        for (int i = 0; i < this.connectionStatus.Count; i++)
        {
            token.ThrowIfCancellationRequested();

            var connStatus = this.connectionStatus[i];

            if (!connStatus.Connected)
            {
                this.healthState = HealthState.Warning;
                var healthMessage = $"Outbound Internet connection failure detected for endpoint {connStatus.HostName}\n";

                var report = new HealthReport
                {
                    AppName = appUri,
                    Code = FoErrorWarningCodes.AppWarningNetworkEndpointUnreachable,
                    EmitLogEvent = true,
                    HealthMessage = healthMessage,
                    HealthReportTimeToLive = timeToLiveWarning,
                    State = this.healthState,
                    NodeName = this.NodeName,
                    Observer = this.ObserverName,
                    ReportType = HealthReportType.Application,

                    // The host name must be interpolated; previously the literal text
                    // "connStatus.HostName" was emitted because the braces were missing.
                    ResourceUsageDataProperty = $"{ErrorWarningProperty.InternetConnectionFailure}: {connStatus.HostName}",
                };

                // Send health report Warning and log event locally.
                this.HealthReporter.ReportHealthToServiceFabric(report);

                // This means this observer created a Warning or Error SF Health Report
                this.HasActiveFabricErrorOrWarning = true;

                // Send Health Report as Telemetry (perhaps it signals an Alert from App Insights, for example.).
                // Fire-and-forget: the returned task is intentionally not awaited.
                if (this.IsTelemetryEnabled)
                {
                    _ = this.ObserverTelemetryClient?.ReportHealthAsync(
                        HealthScope.Application,
                        appTarget,
                        HealthState.Warning,
                        $"{this.NodeName}/{FoErrorWarningCodes.AppWarningNetworkEndpointUnreachable}: {healthMessage}",
                        this.ObserverName,
                        this.Token);
                }
            }
            else if (connStatus.Health == HealthState.Warning)
            {
                this.healthState = HealthState.Ok;
                var healthMessage = "Outbound Internet connection test successful.";

                // Clear existing Health Warning.
                var report = new HealthReport
                {
                    AppName = appUri,
                    EmitLogEvent = true,
                    HealthMessage = healthMessage,
                    HealthReportTimeToLive = default(TimeSpan),
                    State = this.healthState,
                    NodeName = this.NodeName,
                    Observer = this.ObserverName,
                    ReportType = HealthReportType.Application,
                };

                this.HealthReporter.ReportHealthToServiceFabric(report);

                // Reset health state.
                this.HasActiveFabricErrorOrWarning = false;
            }
        }
    }

    // Drop entries that are connected; only failing connections are kept for the next run.
    this.connectionStatus.RemoveAll(conn => conn.Connected == true);
    this.connectionStatus.TrimExcess();
}
/// <summary>
/// Evaluates the collected CPU, memory and port usage data for every monitored
/// replica/instance that matches a configured target application (by type and/or name)
/// and hands each data container to <c>ProcessResourceDataReportHealth</c> for health reporting.
/// </summary>
/// <param name="token">
/// Not consulted by this implementation; cancellation is observed through <c>this.Token</c>.
/// </param>
/// <returns>A completed task.</returns>
/// <remarks>
/// Any exception is logged at Error level and rethrown.
/// </remarks>
public override Task ReportAsync(CancellationToken token)
{
    try
    {
        this.Token.ThrowIfCancellationRequested();
        var healthReportTimeToLive = this.SetHealthReportTimeToLive();

        // App-specific reporting.
        foreach (var app in this.targetList)
        {
            this.Token.ThrowIfCancellationRequested();

            // Process data for reporting.
            foreach (var repOrInst in this.ReplicaOrInstanceList)
            {
                this.Token.ThrowIfCancellationRequested();

                // Skip replicas that do not belong to the configured target app type.
                if (!string.IsNullOrEmpty(app.TargetAppType)
                    && !string.Equals(
                        repOrInst.ApplicationTypeName,
                        app.TargetAppType,
                        StringComparison.CurrentCultureIgnoreCase))
                {
                    continue;
                }

                // Skip replicas that do not belong to the configured target app.
                if (!string.IsNullOrEmpty(app.TargetApp)
                    && !string.Equals(
                        repOrInst.ApplicationName.OriginalString,
                        app.TargetApp,
                        StringComparison.CurrentCultureIgnoreCase))
                {
                    continue;
                }

                Process p = null;

                // Process is IDisposable; the outer try/finally releases it on every path,
                // including the early "continue" exits below.
                try
                {
                    try
                    {
                        p = Process.GetProcessById((int)repOrInst.HostProcessId);

                        // If the process is no longer running, then don't report on it.
                        if (p.HasExited)
                        {
                            continue;
                        }
                    }
                    catch (ArgumentException)
                    {
                        continue;
                    }
                    catch (InvalidOperationException)
                    {
                        continue;
                    }
                    catch (Win32Exception)
                    {
                        continue;
                    }

                    string appNameOrType = GetAppNameOrType(repOrInst);
                    var id = $"{appNameOrType}:{p.ProcessName}";

                    // Log (csv) CPU/Mem/DiskIO per app.
                    if (this.CsvFileLogger != null && this.CsvFileLogger.EnableCsvLogging)
                    {
                        this.LogAllAppResourceDataToCsv(id);
                    }

#if DEBUG
                    // DEBUG \\
                    if (id.Contains("CpuStress"))
                    {
                        // Look the container up once; it may be absent, so access it null-safely.
                        var debugCpuData = this.allAppCpuData.FirstOrDefault(x => x.Id == id);

                        // Emit an Ok Health Report for debug output.
                        var healthReport = new Utilities.HealthReport
                        {
                            AppName = new Uri("fabric:/CpuStress"),
                            HealthMessage = $"{p.Id} CpuData Count: {debugCpuData?.Data.Count}\n" +
                                            $"Average: {debugCpuData?.AverageDataValue}",
                            State = HealthState.Ok,
                            Code = FoErrorWarningCodes.Ok,
                            NodeName = this.NodeName,
                            Observer = this.ObserverName,
                            Property = id,
                            ReportType = HealthReportType.Application,
                        };

                        this.HealthReporter.ReportHealthToServiceFabric(healthReport);
                    }
#endif

                    // CPU
                    this.ProcessResourceDataReportHealth(
                        this.allAppCpuData.FirstOrDefault(x => x.Id == id),
                        app.CpuErrorLimitPercent,
                        app.CpuWarningLimitPercent,
                        healthReportTimeToLive,
                        HealthReportType.Application,
                        repOrInst,
                        app.DumpProcessOnError);

                    // Memory (MB)
                    this.ProcessResourceDataReportHealth(
                        this.allAppMemDataMb.FirstOrDefault(x => x.Id == id),
                        app.MemoryErrorLimitMb,
                        app.MemoryWarningLimitMb,
                        healthReportTimeToLive,
                        HealthReportType.Application,
                        repOrInst,
                        app.DumpProcessOnError);

                    // Memory (percent)
                    this.ProcessResourceDataReportHealth(
                        this.allAppMemDataPercent.FirstOrDefault(x => x.Id == id),
                        app.MemoryErrorLimitPercent,
                        app.MemoryWarningLimitPercent,
                        healthReportTimeToLive,
                        HealthReportType.Application,
                        repOrInst,
                        app.DumpProcessOnError);

                    // Active ports
                    this.ProcessResourceDataReportHealth(
                        this.allAppTotalActivePortsData.FirstOrDefault(x => x.Id == id),
                        app.NetworkErrorActivePorts,
                        app.NetworkWarningActivePorts,
                        healthReportTimeToLive,
                        HealthReportType.Application,
                        repOrInst);

                    // Ephemeral ports
                    this.ProcessResourceDataReportHealth(
                        this.allAppEphemeralPortsData.FirstOrDefault(x => x.Id == id),
                        app.NetworkErrorEphemeralPorts,
                        app.NetworkWarningEphemeralPorts,
                        healthReportTimeToLive,
                        HealthReportType.Application,
                        repOrInst);
                }
                finally
                {
                    p?.Dispose();
                }
            }
        }

        return Task.CompletedTask;
    }
    catch (Exception e)
    {
        this.WriteToLogWithLevel(
            this.ObserverName,
            $"Unhandled exception in ReportAsync: \n{e}",
            LogLevel.Error);

        throw;
    }
}
/// <summary>
/// Publishes the accumulated observer message as an Ok node health report, and mirrors it
/// to the local log, to the telemetry provider and to ETW when those are enabled.
/// The message buffer is cleared afterwards.
/// </summary>
/// <param name="token">Not consulted by this implementation; telemetry calls use <c>Token</c>.</param>
/// <returns>A completed task.</returns>
public override Task ReportAsync(CancellationToken token)
{
    // Snapshot the buffer once; it is not modified until the Clear() at the end.
    string description = this.message.ToString();

    // Local log.
    ObserverLogger.LogInfo(description);

    /* Report to Fabric */

    // Property and source id values are kept across observer runs so that matching Ok
    // health events (same property and sourceid) can be reported to clear errors/warnings
    // when FO is safely taken down (e.g., app is being uninstalled, safe restart of the
    // fabric node it's running on, etc.).
    HealthReportProperties.Add("SomePropertyName");
    HealthReportSourceIds.Add($"{ObserverName}_SomethingUniqueToThisReport");

    int newestPropertyIndex = HealthReportProperties.Count - 1;

    var okReport = new Utilities.HealthReport
    {
        Code = FOErrorWarningCodes.Ok,
        HealthMessage = description,
        NodeName = NodeName,
        Observer = ObserverName,
        Property = HealthReportProperties[newestPropertyIndex],
        ReportType = HealthReportType.Node,
        State = HealthState.Ok,
    };

    var reporter = new ObserverHealthReporter(ObserverLogger, FabricClientInstance);
    reporter.ReportHealthToServiceFabric(okReport);

    // Emit Telemetry - This will use whatever telemetry provider you have configured in FabricObserver Settings.xml.
    var telemetry = new TelemetryData(FabricClientInstance, Token)
    {
        Code = FOErrorWarningCodes.Ok,
        HealthEventDescription = description,
        HealthState = "Ok",
        NodeName = NodeName,
        ObserverName = ObserverName,
        Source = ObserverConstants.FabricObserverName,
    };

    bool sendTelemetry = IsTelemetryProviderEnabled && IsObserverTelemetryEnabled;

    if (sendTelemetry)
    {
        // Fire-and-forget: the returned task is intentionally discarded.
        _ = TelemetryClient?.ReportHealthAsync(telemetry, Token);
    }

    // ETW.
    if (IsEtwEnabled)
    {
        var etwPayload = new
        {
            Code = FOErrorWarningCodes.Ok,
            HealthEventDescription = description,
            HealthState = "Ok",
            NodeName,
            ObserverName,
            Source = ObserverConstants.FabricObserverName,
        };

        Logger.EtwLogger?.Write(ObserverConstants.FabricObserverETWEventName, etwPayload);
    }

    this.message.Clear();

    return Task.CompletedTask;
}
/// <summary>
/// Reports certificate health for the node. When none of the warning lists has entries,
/// an Ok report is sent; otherwise a single Warning report is sent whose message lists the
/// expired, not-found and expiring entries (in that order), and the same information is
/// forwarded to telemetry and ETW when enabled.
/// </summary>
/// <param name="token">Cancellation token; checked on entry and passed to the telemetry data object.</param>
/// <returns>A completed task.</returns>
/// <exception cref="OperationCanceledException">Thrown when <paramref name="token"/> is already canceled.</exception>
public override Task ReportAsync(CancellationToken token)
{
    token.ThrowIfCancellationRequested();

    // Someone calling without observing first, must be run after a new run of ObserveAsync
    bool observed = ExpiringWarnings != null && ExpiredWarnings != null && NotFoundWarnings != null;

    if (!observed)
    {
        return Task.CompletedTask;
    }

    // Prefer the run interval as time-to-live when one has been set.
    TimeSpan reportTtl = RunInterval > TimeSpan.MinValue ? RunInterval : HealthReportTimeToLive;

    bool anyWarnings = ExpiringWarnings.Count != 0 || ExpiredWarnings.Count != 0 || NotFoundWarnings.Count != 0;

    if (!anyWarnings)
    {
        var okReport = new HealthReport
        {
            Observer = ObserverName,
            ReportType = HealthReportType.Node,
            EmitLogEvent = true,
            NodeName = NodeName,
            HealthMessage = "All cluster and monitored app certificates are healthy.",
            State = HealthState.Ok,
            HealthReportTimeToLive = reportTtl,
        };

        HasActiveFabricErrorOrWarning = false;
        HealthReporter.ReportHealthToServiceFabric(okReport);

        return Task.CompletedTask;
    }

    // Every entry is prefixed with a newline; the expired and not-found sections also end
    // with one, the expiring section does not.
    string expiredText = ExpiredWarnings.Count == 0
        ? string.Empty
        : "\n" + string.Join("\n", ExpiredWarnings) + "\n";

    string notFoundText = NotFoundWarnings.Count == 0
        ? string.Empty
        : "\n" + string.Join("\n", NotFoundWarnings) + "\n";

    string expiringText = ExpiringWarnings.Count == 0
        ? string.Empty
        : "\n" + string.Join("\n", ExpiringWarnings);

    string healthMessage = expiredText + notFoundText + expiringText;

    var warningReport = new HealthReport
    {
        Code = FOErrorWarningCodes.WarningCertificateExpiration,
        Observer = ObserverName,
        ReportType = HealthReportType.Node,
        EmitLogEvent = true,
        NodeName = NodeName,
        HealthMessage = healthMessage,
        State = HealthState.Warning,
        HealthReportTimeToLive = reportTtl,
    };

    HasActiveFabricErrorOrWarning = true;

    if (IsTelemetryProviderEnabled && IsObserverTelemetryEnabled)
    {
        var warningTelemetry = new TelemetryData(FabricClientInstance, token)
        {
            Code = FOErrorWarningCodes.WarningCertificateExpiration,
            HealthState = "Warning",
            NodeName = NodeName,
            Metric = ErrorWarningProperty.CertificateExpiration,
            HealthEventDescription = healthMessage,
            ObserverName = ObserverName,
            OS = RuntimeInformation.IsOSPlatform(OSPlatform.Windows) ? "Windows" : "Linux",
            Source = ObserverConstants.FabricObserverName,
            Value = FOErrorWarningCodes.GetErrorWarningNameFromFOCode(FOErrorWarningCodes.WarningCertificateExpiration),
        };

        // Fire-and-forget: the returned task is intentionally discarded.
        _ = TelemetryClient?.ReportMetricAsync(warningTelemetry, Token);
    }

    if (IsEtwEnabled)
    {
        var etwPayload = new
        {
            Code = FOErrorWarningCodes.WarningCertificateExpiration,
            HealthState = "Warning",
            NodeName,
            Metric = ErrorWarningProperty.CertificateExpiration,
            HealthEventDescription = healthMessage,
            ObserverName,
            OS = RuntimeInformation.IsOSPlatform(OSPlatform.Windows) ? "Windows" : "Linux",
            Source = ObserverConstants.FabricObserverName,
            Value = FOErrorWarningCodes.GetErrorWarningNameFromFOCode(FOErrorWarningCodes.WarningCertificateExpiration),
        };

        Logger.EtwLogger?.Write(ObserverConstants.FabricObserverETWEventName, etwPayload);
    }

    HealthReporter.ReportHealthToServiceFabric(warningReport);

    return Task.CompletedTask;
}
/// <summary>
/// Compares a resource usage data container against its error and warning thresholds and
/// emits the matching Service Fabric health report (Error, Warning, or an Ok that clears a
/// previously active error/warning). The raw metric is also forwarded to telemetry and ETW
/// when enabled. The container's collected data is cleared before returning normally.
/// </summary>
/// <typeparam name="T">Numeric type of the measured values and thresholds.</typeparam>
/// <param name="data">Usage data to evaluate. Must not be null.</param>
/// <param name="thresholdError">Threshold passed to <c>data.IsUnhealthy</c> for the Error check.</param>
/// <param name="thresholdWarning">Threshold passed to <c>data.IsUnhealthy</c> for the Warning check.</param>
/// <param name="healthReportTtl">Time-to-live applied to Error/Warning health reports.</param>
/// <param name="healthReportType">Whether the report targets the node or an application.</param>
/// <param name="replicaOrInstance">
/// Replica/instance the data belongs to; when supplied, app-level error codes are used and
/// the host process name is included in the health message.
/// </param>
/// <param name="dumpOnError">
/// When true and <paramref name="replicaOrInstance"/> is supplied, a process dump is attempted
/// on Error, limited to <c>maxDumps</c> per process name.
/// </param>
/// <exception cref="ArgumentNullException">Thrown when <paramref name="data"/> is null.</exception>
internal void ProcessResourceDataReportHealth<T>(
    FabricResourceUsageData<T> data,
    T thresholdError,
    T thresholdWarning,
    TimeSpan healthReportTtl,
    HealthReportType healthReportType = HealthReportType.Node,
    ReplicaOrInstanceMonitoringInfo replicaOrInstance = null,
    bool dumpOnError = false)
{
    if (data == null)
    {
        // ArgumentNullException derives from ArgumentException, so existing handlers still apply.
        throw new ArgumentNullException(nameof(data), "Supply all required parameters with non-null value.");
    }

    var thresholdName = "Minimum";
    bool warningOrError = false;
    string repPartitionId = null, repOrInstanceId = null, name = null, id = null, procName = null;
    T threshold = thresholdWarning;
    var healthState = HealthState.Ok;
    Uri appName = null;

    if (replicaOrInstance != null)
    {
        repPartitionId = $"Partition: {replicaOrInstance.PartitionId}";
        repOrInstanceId = $"Replica: {replicaOrInstance.ReplicaOrInstanceId}";

        // Create a unique id which may be used in the case of warnings or OK clears.
        appName = replicaOrInstance.ApplicationName;
        name = appName.OriginalString.Replace("fabric:/", string.Empty);
        id = name + "_" + data.Property.Replace(" ", string.Empty);

        // Telemetry (fire-and-forget).
        if (this.IsTelemetryEnabled)
        {
            _ = this.ObserverTelemetryClient?.ReportMetricAsync(
                $"{this.NodeName}-{name}-{data.Id}-{data.Property}",
                data.AverageDataValue,
                this.Token);
        }

        try
        {
            // Process is IDisposable; release it as soon as the name has been read.
            using (Process hostProcess = Process.GetProcessById((int)replicaOrInstance.HostProcessId))
            {
                procName = hostProcess.ProcessName;
            }
        }
        catch (ArgumentException)
        {
            // The process is not running; nothing to report on.
            return;
        }
        catch (InvalidOperationException)
        {
            return;
        }
    }
    else
    {
        // Telemetry (fire-and-forget).
        if (this.IsTelemetryEnabled)
        {
            _ = this.ObserverTelemetryClient?.ReportMetricAsync(
                $"{this.NodeName}-{data.Id}-{data.Property}",
                data.AverageDataValue,
                this.Token);
        }
    }

    // ETW.
    if (this.IsEtwEnabled)
    {
        Logger.EtwLogger?.Write(
            "FabricObserverDataEvent",
            new
            {
                Level = 0, // Info
                Node = this.NodeName,
                Observer = this.ObserverName,
                data.Property,
                data.Id,
                Value = $"{Math.Round(data.AverageDataValue)}",
                Unit = data.Units,
            });
    }

    // Health Error
    if (data.IsUnhealthy(thresholdError))
    {
        thresholdName = "Maximum";
        threshold = thresholdError;
        warningOrError = true;
        healthState = HealthState.Error;

        // This is primarily useful for AppObserver, but makes sense to be
        // part of the base class for future use, like for FSO.
        if (replicaOrInstance != null && dumpOnError)
        {
            try
            {
                int procId = (int)replicaOrInstance.HostProcessId;

                if (!this.serviceDumpCountDictionary.ContainsKey(procName))
                {
                    this.serviceDumpCountDictionary.Add(procName, 0);
                }

                if (this.serviceDumpCountDictionary[procName] < this.maxDumps)
                {
                    // DumpServiceProcess defaults to a Full dump with
                    // process memory, handles and thread data.
                    bool success = this.DumpServiceProcess(procId);

                    if (success)
                    {
                        this.serviceDumpCountDictionary[procName]++;
                    }
                }
            }

            // Ignore these, it just means no dmp will be created. This is not
            // critical to FO. Log as info, not warning.
            catch (Exception e) when (e is ArgumentException || e is InvalidOperationException)
            {
                this.ObserverLogger.LogInfo($"Unable to generate dmp file:{Environment.NewLine}{e}");
            }
        }
    }

    // Health Warning
    if (!warningOrError && data.IsUnhealthy(thresholdWarning))
    {
        warningOrError = true;
        healthState = HealthState.Warning;
    }

    if (warningOrError)
    {
        string errorWarningKind = GetErrorWarningCode(data.Property, healthState, replicaOrInstance != null);

        var healthMessage = new StringBuilder();

        if (name != null)
        {
            healthMessage.Append($"{name} (Service Process: {procName}, {repPartitionId}, {repOrInstanceId}): ");
        }

        // Disk metrics are prefixed with the data id so the message identifies which disk is meant.
        string drive = string.Empty;

        if (data.Property.Contains("Disk"))
        {
            drive = $"{data.Id}: ";
        }

        healthMessage.Append($"{drive}{data.Property} is at or above the specified {thresholdName} limit ({threshold}{data.Units})");
        healthMessage.AppendLine($" - Average {data.Property}: {Math.Round(data.AverageDataValue)}{data.Units}");

        var healthReport = new HealthReport
        {
            AppName = appName,
            Code = errorWarningKind,
            EmitLogEvent = true,
            HealthMessage = healthMessage.ToString(),
            HealthReportTimeToLive = healthReportTtl,
            ReportType = healthReportType,
            State = healthState,
            NodeName = this.NodeName,
            Observer = this.ObserverName,
            ResourceUsageDataProperty = data.Property,
        };

        // Emit a Fabric Health Report and optionally a local log write.
        this.HealthReporter.ReportHealthToServiceFabric(healthReport);

        // Set internal fabric health states.
        data.ActiveErrorOrWarning = true;

        // This means this observer created a Warning or Error SF Health Report
        this.HasActiveFabricErrorOrWarning = true;

        // Send Health Report as Telemetry event (perhaps it signals an Alert from App Insights, for example.).
        if (this.IsTelemetryEnabled)
        {
            _ = this.ObserverTelemetryClient?.ReportHealthAsync(
                !string.IsNullOrEmpty(id) ? HealthScope.Application : HealthScope.Node,
                $"{(appName != null ? appName.OriginalString : this.NodeName)}",
                healthState,
                $"{this.NodeName}/{errorWarningKind}/{drive}{data.Property}/{Math.Round(data.AverageDataValue)}",
                this.ObserverName,
                this.Token);
        }

        // ETW.
        if (this.IsEtwEnabled)
        {
            Logger.EtwLogger?.Write(
                "FabricObserverDataEvent",
                new
                {
                    Level = (healthState == HealthState.Warning) ? 1 : 2,
                    Node = this.NodeName,
                    Observer = this.ObserverName,
                    HealthEventErrorCode = errorWarningKind,
                    HealthEventDescription = healthMessage.ToString(),
                    data.Property,
                    data.Id,
                    Value = $"{Math.Round(data.AverageDataValue)}",
                    Unit = data.Units,
                });
        }

        // Clean up sb.
        healthMessage.Clear();
    }
    else if (data.ActiveErrorOrWarning)
    {
        // NOTE(review): this Ok report sets Observer to "{ObserverName}({data.Id})" and no Code,
        // while the Error/Warning report above uses ObserverName plus a Code. If the reporter
        // derives the health event source id from Observer/Code, this Ok may not replace the
        // earlier event - confirm against the health reporter implementation.
        var report = new HealthReport
        {
            AppName = appName,
            EmitLogEvent = true,
            HealthMessage = $"{data.Property} is now within normal/expected range.",
            HealthReportTimeToLive = default(TimeSpan),
            ReportType = healthReportType,
            State = HealthState.Ok,
            NodeName = this.NodeName,
            Observer = $"{this.ObserverName}({data.Id})",
            ResourceUsageDataProperty = data.Property,
        };

        // Emit an Ok Health Report to clear Fabric Health warning.
        this.HealthReporter.ReportHealthToServiceFabric(report);

        // Reset health states.
        data.ActiveErrorOrWarning = false;
        this.HasActiveFabricErrorOrWarning = false;
    }

    // No need to keep data in memory.
    data.Data.Clear();
    data.Data.TrimExcess();
}

/// <summary>
/// Maps a monitored property and health state to the matching FO error/warning code.
/// App-level codes are chosen when <paramref name="isAppLevel"/> is true and the property
/// has an app-level variant; otherwise the node-level code is used.
/// </summary>
/// <param name="property">One of the <c>ErrorWarningProperty</c> constants.</param>
/// <param name="healthState">Error selects the error code; any other state selects the warning code.</param>
/// <param name="isAppLevel">True when the data belongs to a replica/instance.</param>
/// <returns>The code, or null when the property is not recognized.</returns>
private static string GetErrorWarningCode(string property, HealthState healthState, bool isAppLevel)
{
    bool isError = healthState == HealthState.Error;

    switch (property)
    {
        case ErrorWarningProperty.TotalCpuTime when isAppLevel:
            return isError ? FoErrorWarningCodes.AppErrorCpuTime : FoErrorWarningCodes.AppWarningCpuTime;

        case ErrorWarningProperty.TotalCpuTime:
            return isError ? FoErrorWarningCodes.NodeErrorCpuTime : FoErrorWarningCodes.NodeWarningCpuTime;

        case ErrorWarningProperty.DiskSpaceUsagePercentage:
            return isError ? FoErrorWarningCodes.NodeErrorDiskSpacePercentUsed : FoErrorWarningCodes.NodeWarningDiskSpacePercentUsed;

        case ErrorWarningProperty.DiskSpaceUsageMb:
            return isError ? FoErrorWarningCodes.NodeErrorDiskSpaceMb : FoErrorWarningCodes.NodeWarningDiskSpaceMb;

        case ErrorWarningProperty.TotalMemoryConsumptionMb when isAppLevel:
            return isError ? FoErrorWarningCodes.AppErrorMemoryCommittedMb : FoErrorWarningCodes.AppWarningMemoryCommittedMb;

        case ErrorWarningProperty.TotalMemoryConsumptionMb:
            return isError ? FoErrorWarningCodes.NodeErrorMemoryCommittedMb : FoErrorWarningCodes.NodeWarningMemoryCommittedMb;

        case ErrorWarningProperty.TotalMemoryConsumptionPct when isAppLevel:
            return isError ? FoErrorWarningCodes.AppErrorMemoryPercentUsed : FoErrorWarningCodes.AppWarningMemoryPercentUsed;

        case ErrorWarningProperty.TotalMemoryConsumptionPct:
            return isError ? FoErrorWarningCodes.NodeErrorMemoryPercentUsed : FoErrorWarningCodes.NodeWarningMemoryPercentUsed;

        case ErrorWarningProperty.DiskAverageQueueLength:
            return isError ? FoErrorWarningCodes.NodeErrorDiskAverageQueueLength : FoErrorWarningCodes.NodeWarningDiskAverageQueueLength;

        case ErrorWarningProperty.TotalActiveFirewallRules:
            return isError ? FoErrorWarningCodes.ErrorTooManyFirewallRules : FoErrorWarningCodes.WarningTooManyFirewallRules;

        case ErrorWarningProperty.TotalActivePorts when isAppLevel:
            return isError ? FoErrorWarningCodes.AppErrorTooManyActiveTcpPorts : FoErrorWarningCodes.AppWarningTooManyActiveTcpPorts;

        case ErrorWarningProperty.TotalActivePorts:
            return isError ? FoErrorWarningCodes.NodeErrorTooManyActiveTcpPorts : FoErrorWarningCodes.NodeWarningTooManyActiveTcpPorts;

        case ErrorWarningProperty.TotalEphemeralPorts when isAppLevel:
            return isError ? FoErrorWarningCodes.AppErrorTooManyActiveEphemeralPorts : FoErrorWarningCodes.AppWarningTooManyActiveEphemeralPorts;

        case ErrorWarningProperty.TotalEphemeralPorts:
            return isError ? FoErrorWarningCodes.NodeErrorTooManyActiveEphemeralPorts : FoErrorWarningCodes.NodeWarningTooManyActiveEphemeralPorts;

        default:
            return null;
    }
}
/// <summary>
/// Reports operating system status for the node: an Error report when the OS status is
/// something other than "OK", an Ok report that clears a previously active error, an
/// informational Ok report carrying the OS report text, and (on Windows) a Warning when
/// Windows Update automatic download is enabled. When the observer web app is deployed,
/// the OS report is also written to SysInfo.txt for the log reader.
/// </summary>
/// <param name="token">Cancellation token; checked on entry and passed to the telemetry data object.</param>
/// <returns>A completed task.</returns>
/// <remarks>
/// Any exception is reported as an Error on the FabricObserver service health and rethrown.
/// </remarks>
public override Task ReportAsync(CancellationToken token)
{
    try
    {
        token.ThrowIfCancellationRequested();

        // OS Health.
        if (this.osStatus != null && !string.Equals(this.osStatus, "OK", StringComparison.OrdinalIgnoreCase))
        {
            string healthMessage = $"OS reporting unhealthy: {this.osStatus}";

            // NOTE(review): this Error report and the informational Ok report further down both
            // use ObserverName with no Property/Code; if the reporter keys health events on
            // those values, the later Ok may replace this Error - confirm.
            var healthReport = new HealthReport
            {
                Observer = ObserverName,
                NodeName = NodeName,
                HealthMessage = healthMessage,
                State = HealthState.Error,
                HealthReportTimeToLive = SetHealthReportTimeToLive(),
            };

            HealthReporter.ReportHealthToServiceFabric(healthReport);

            // This means this observer created a Warning or Error SF Health Report
            HasActiveFabricErrorOrWarning = true;

            // Send Health Report as Telemetry (perhaps it signals an Alert from App Insights, for example.).
            if (IsTelemetryProviderEnabled && IsObserverTelemetryEnabled)
            {
                _ = TelemetryClient?.ReportHealthAsync(
                    HealthScope.Application,
                    FabricRuntime.GetActivationContext().ApplicationName,
                    HealthState.Error,
                    $"{NodeName} - OS reporting unhealthy: {this.osStatus}",
                    ObserverName,
                    Token);
            }
        }
        else if (HasActiveFabricErrorOrWarning && string.Equals(this.osStatus, "OK", StringComparison.OrdinalIgnoreCase))
        {
            // Clear Error or Warning with an OK Health Report.
            string healthMessage = $"OS reporting healthy: {this.osStatus}";

            var healthReport = new HealthReport
            {
                Observer = ObserverName,
                NodeName = NodeName,
                HealthMessage = healthMessage,
                State = HealthState.Ok,
                HealthReportTimeToLive = default(TimeSpan),
            };

            HealthReporter.ReportHealthToServiceFabric(healthReport);

            // Reset internal health state.
            HasActiveFabricErrorOrWarning = false;
        }

        if (ObserverManager.ObserverWebAppDeployed)
        {
            var logPath = Path.Combine(ObserverLogger.LogFolderBasePath, "SysInfo.txt");

            // Format with the invariant culture: "/" and ":" in a custom format string are
            // culture-specific separators, so the timestamp would otherwise vary by host culture.
            string lastUpdated = DateTime.UtcNow.ToString(
                "M/d/yyyy HH:mm:ss",
                System.Globalization.CultureInfo.InvariantCulture);

            // This file is used by the web application (log reader.).
            if (!ObserverLogger.TryWriteLogFile(logPath, $"Last updated on {lastUpdated} UTC<br/>{this.osReport}"))
            {
                HealthReporter.ReportFabricObserverServiceHealth(
                    FabricServiceContext.ServiceName.OriginalString,
                    ObserverName,
                    HealthState.Warning,
                    "Unable to create SysInfo.txt file.");
            }
        }

        // Informational report carrying the OS report text.
        var report = new HealthReport
        {
            Observer = ObserverName,
            HealthMessage = this.osReport,
            State = HealthState.Ok,
            NodeName = NodeName,
            HealthReportTimeToLive = SetHealthReportTimeToLive(),
        };

        HealthReporter.ReportHealthToServiceFabric(report);

        bool isWindows = RuntimeInformation.IsOSPlatform(OSPlatform.Windows);

        // Windows Update automatic download enabled?
        if (isWindows && this.isWindowsUpdateAutoDownloadEnabled)
        {
            string linkText =
                $"{Environment.NewLine}For clusters of Silver durability or above, " +
                "please consider <a href=\"https://docs.microsoft.com/azure/virtual-machine-scale-sets/virtual-machine-scale-sets-automatic-upgrade\" target=\"blank\">" +
                "enabling VMSS automatic OS image upgrades</a> to prevent unexpected VM reboots. " +
                "For Bronze durability clusters, please consider deploying the " +
                "<a href=\"https://docs.microsoft.com/azure/service-fabric/service-fabric-patch-orchestration-application\" target=\"blank\">Patch Orchestration Service</a>.";

            string auServiceEnabledMessage = $"Windows Update Automatic Download is enabled.{linkText}";

            report = new HealthReport
            {
                Observer = ObserverName,
                Property = "OSConfiguration",
                HealthMessage = auServiceEnabledMessage,
                State = HealthState.Warning,
                NodeName = NodeName,
                HealthReportTimeToLive = SetHealthReportTimeToLive(),
            };

            HealthReporter.ReportHealthToServiceFabric(report);

            // The platform is already known to be Windows inside this branch, so the
            // telemetry and ETW guards only need to check their own enable flags.
            if (IsTelemetryProviderEnabled && IsObserverTelemetryEnabled)
            {
                // Send Health Report as Telemetry (perhaps it signals an Alert from App Insights, for example.).
                var telemetryData = new TelemetryData(FabricClientInstance, token)
                {
                    HealthEventDescription = auServiceEnabledMessage,
                    HealthState = "Warning",
                    Metric = "WUAutoDownloadEnabled",
                    Value = this.isWindowsUpdateAutoDownloadEnabled,
                    NodeName = NodeName,
                    ObserverName = ObserverName,
                    Source = ObserverConstants.FabricObserverName,
                };

                _ = TelemetryClient?.ReportMetricAsync(telemetryData, Token);
            }

            // ETW.
            if (IsEtwEnabled)
            {
                Logger.EtwLogger?.Write(
                    ObserverConstants.FabricObserverETWEventName,
                    new
                    {
                        HealthState = "Warning",
                        HealthEventDescription = auServiceEnabledMessage,
                        ObserverName,
                        Metric = "WUAutoDownloadEnabled",
                        Value = this.isWindowsUpdateAutoDownloadEnabled,
                        NodeName,
                    });
            }
        }

        if (isWindows)
        {
            // reset au globals for fresh detection during next observer run.
            this.isWindowsUpdateAutoDownloadEnabled = false;
            this.auStateUnknown = false;
            this.isWUADSettingEnabled = false;
        }

        return Task.CompletedTask;
    }
    catch (Exception e)
    {
        HealthReporter.ReportFabricObserverServiceHealth(
            FabricServiceContext.ServiceName.OriginalString,
            ObserverName,
            HealthState.Error,
            $"Unhandled exception processing OS information:{Environment.NewLine}{e}");

        throw;
    }
}
/// <summary>
/// Translates an observer <see cref="HealthReport"/> into a Service Fabric health event and
/// submits it as an application report (when the report type is Application and an app name
/// is present) or as a node report otherwise. Optionally mirrors the event to the local log.
/// </summary>
/// <param name="healthReport">Report to send; a null report is ignored.</param>
public void ReportHealthToServiceFabric(HealthReport healthReport)
{
    if (healthReport == null)
    {
        return;
    }

    // Only Ok reports are sent immediately (they clear warning/error states); sending
    // errors/warnings immediately would only add unnecessary stress to the Health subsystem.
    var sendOptions = new HealthReportSendOptions
    {
        Immediate = healthReport.State == HealthState.Ok,
    };

    // Fall back to five minutes when the report carries no explicit time-to-live.
    TimeSpan timeToLive = healthReport.HealthReportTimeToLive != default
        ? healthReport.HealthReportTimeToLive
        : TimeSpan.FromMinutes(5);

    // Set property for health event: explicit value wins, otherwise derive it from the observer.
    string property = string.IsNullOrEmpty(healthReport.Property)
        ? GetDefaultHealthProperty(healthReport.Observer)
        : healthReport.Property;

    // Only use FOErrorWarningCode for source
    string sourceId = string.IsNullOrEmpty(healthReport.Code)
        ? healthReport.Observer
        : $"{healthReport.Observer}({healthReport.Code})";

    string errWarnPreamble = string.Empty;
    bool isErrorOrWarning = healthReport.State == HealthState.Error || healthReport.State == HealthState.Warning;

    if (isErrorOrWarning)
    {
        // OSObserver does not monitor resources and therefore does not support related usage threshold configuration.
        bool isOsConfigurationIssue =
            healthReport.Observer == ObserverConstants.OsObserverName && property == "OSConfiguration";

        errWarnPreamble = isOsConfigurationIssue
            ? $"{ObserverConstants.OsObserverName} detected potential problem with OS configuration: "
            : $"{healthReport.Observer} detected {Enum.GetName(typeof(HealthState), healthReport.State)} threshold breach. ";
    }

    // Structured health data, when present, replaces the plain-text message entirely.
    TelemetryData healthData = healthReport.HealthData;
    string message = healthData != null
        ? JsonConvert.SerializeObject(healthData)
        : $"{errWarnPreamble}{healthReport.HealthMessage}";

    var healthInformation = new HealthInformation(sourceId, property, healthReport.State)
    {
        Description = message ?? string.Empty,
        TimeToLive = timeToLive,
        RemoveWhenExpired = true,
    };

    // Log event only if ObserverWebApi (REST API Log reader service) app is deployed.
    if (ObserverManager.ObserverWebAppDeployed && healthReport.EmitLogEvent)
    {
        string logFormat = healthReport.NodeName + ": {0}";

        switch (healthReport.State)
        {
            case HealthState.Error:
                this.logger.LogError(logFormat, healthInformation.Description);
                break;

            case HealthState.Warning:
                this.logger.LogWarning(logFormat, healthInformation.Description);
                break;

            default:
                this.logger.LogInfo(logFormat, healthInformation.Description);
                break;
        }
    }

    // To SFX.
    bool reportOnApplication =
        healthReport.ReportType == HealthReportType.Application && healthReport.AppName != null;

    if (reportOnApplication)
    {
        this.fabricClient.HealthManager.ReportHealth(
            new ApplicationHealthReport(healthReport.AppName, healthInformation),
            sendOptions);
        return;
    }

    this.fabricClient.HealthManager.ReportHealth(
        new NodeHealthReport(healthReport.NodeName, healthInformation),
        sendOptions);
}

/// <summary>
/// Returns the health event property used for a given observer when the report does not
/// name one explicitly; unknown observers map to "FOGenericHealth".
/// </summary>
private static string GetDefaultHealthProperty(string observerName)
{
    switch (observerName)
    {
        case ObserverConstants.AppObserverName:
            return "ApplicationHealth";

        case ObserverConstants.CertificateObserverName:
            return "SecurityHealth";

        case ObserverConstants.DiskObserverName:
            return "DiskHealth";

        case ObserverConstants.FabricSystemObserverName:
            return "FabricSystemServiceHealth";

        case ObserverConstants.NetworkObserverName:
            return "NetworkHealth";

        case ObserverConstants.OsObserverName:
            return "MachineInformation";

        case ObserverConstants.NodeObserverName:
            return "MachineResourceHealth";

        default:
            return "FOGenericHealth";
    }
}
internal void ProcessResourceDataReportHealth <T>( FabricResourceUsageData <T> data, T thresholdError, T thresholdWarning, TimeSpan healthReportTtl, HealthReportType healthReportType = HealthReportType.Node, ReplicaOrInstanceMonitoringInfo replicaOrInstance = null, bool dumpOnError = false) where T : struct { if (data == null) { throw new ArgumentException("Supply all required parameters with non-null value."); } var thresholdName = "Minimum"; bool warningOrError = false; string repPartitionId = null, repOrInstanceId = null, name = null, id = null, procName = null; T threshold = thresholdWarning; var healthState = HealthState.Ok; Uri appName = null; Uri serviceName = null; TelemetryData telemetryData = null; if (healthReportType == HealthReportType.Application) { if (replicaOrInstance != null) { repPartitionId = $"Partition: {replicaOrInstance.PartitionId}"; repOrInstanceId = $"Replica: {replicaOrInstance.ReplicaOrInstanceId}"; // Create a unique id which will be used for health Warnings and OKs (clears). appName = replicaOrInstance.ApplicationName; serviceName = replicaOrInstance.ServiceName; name = appName.OriginalString.Replace("fabric:/", string.Empty); } else { appName = new Uri("fabric:/System"); name = data.Id; } id = name + "_" + data.Property.Replace(" ", string.Empty); // The health event description will be a serialized instance of telemetryData, // so it should be completely constructed (filled with data) regardless // of user telemetry settings. telemetryData = new TelemetryData(this.FabricClientInstance, this.Token) { ApplicationName = appName?.OriginalString ?? 
string.Empty, Code = FoErrorWarningCodes.Ok, HealthState = Enum.GetName(typeof(HealthState), HealthState.Ok), NodeName = this.NodeName, ObserverName = this.ObserverName, Metric = data.Property, Value = Math.Round(Convert.ToDouble(data.AverageDataValue), 1), PartitionId = replicaOrInstance?.PartitionId.ToString(), ReplicaId = replicaOrInstance?.ReplicaOrInstanceId.ToString(), ServiceName = serviceName?.OriginalString ?? string.Empty, Source = ObserverConstants.FabricObserverName, }; try { if (replicaOrInstance != null) { procName = Process.GetProcessById((int)replicaOrInstance.HostProcessId).ProcessName; } else { // The name of the target service process is always the id for data containers coming from FSO. procName = data.Id; } telemetryData.ServiceName = procName; if (this.IsTelemetryProviderEnabled && this.IsObserverTelemetryEnabled) { _ = this.TelemetryClient?.ReportMetricAsync( telemetryData, this.Token).ConfigureAwait(false); } if (this.IsEtwEnabled) { Logger.EtwLogger?.Write( ObserverConstants.FabricObserverETWEventName, new { ApplicationName = appName?.OriginalString ?? string.Empty, Code = FoErrorWarningCodes.Ok, HealthState = Enum.GetName(typeof(HealthState), HealthState.Ok), NodeName = this.NodeName, ObserverName = this.ObserverName, Metric = data.Property, Value = Math.Round(Convert.ToDouble(data.AverageDataValue), 1), PartitionId = replicaOrInstance?.PartitionId.ToString(), ReplicaId = replicaOrInstance?.ReplicaOrInstanceId.ToString(), ServiceName = procName, Source = ObserverConstants.FabricObserverName, }); } } catch (ArgumentException) { return; } catch (InvalidOperationException) { return; } } else { string drive = string.Empty; if (this.ObserverName == ObserverConstants.DiskObserverName) { drive = $"{data.Id}: "; } // The health event description will be a serialized instance of telemetryData, // so it should be completely constructed (filled with data) regardless // of user telemetry settings. 
telemetryData = new TelemetryData(this.FabricClientInstance, this.Token) { Code = FoErrorWarningCodes.Ok, HealthState = Enum.GetName(typeof(HealthState), HealthState.Ok), NodeName = this.NodeName, ObserverName = this.ObserverName, Metric = $"{drive}{data.Property}", Source = ObserverConstants.FabricObserverName, Value = Math.Round(Convert.ToDouble(data.AverageDataValue), 1), }; if (this.IsTelemetryProviderEnabled && this.IsObserverTelemetryEnabled) { _ = this.TelemetryClient?.ReportMetricAsync( telemetryData, this.Token); } if (this.IsEtwEnabled) { Logger.EtwLogger?.Write( ObserverConstants.FabricObserverETWEventName, new { Code = FoErrorWarningCodes.Ok, HealthState = Enum.GetName(typeof(HealthState), HealthState.Ok), NodeName = this.NodeName, ObserverName = this.ObserverName, Metric = $"{drive}{data.Property}", Source = ObserverConstants.FabricObserverName, Value = Math.Round(Convert.ToDouble(data.AverageDataValue), 1), }); } } // Health Error if (data.IsUnhealthy(thresholdError)) { thresholdName = "Maximum"; threshold = thresholdError; warningOrError = true; healthState = HealthState.Error; // This is primarily useful for AppObserver, but makes sense to be // part of the base class for future use, like for FSO. if (replicaOrInstance != null && dumpOnError) { try { int procId = (int)replicaOrInstance.HostProcessId; if (!this.serviceDumpCountDictionary.ContainsKey(procName)) { this.serviceDumpCountDictionary.Add(procName, 0); } if (this.serviceDumpCountDictionary[procName] < this.maxDumps) { // DumpServiceProcess defaults to a Full dump with // process memory, handles and thread data. bool success = this.DumpServiceProcess(procId); if (success) { this.serviceDumpCountDictionary[procName]++; } } } // Ignore these, it just means no dmp will be created.This is not // critical to FO. Log as info, not warning. 
catch (Exception e) when(e is ArgumentException || e is InvalidOperationException) { this.ObserverLogger.LogInfo($"Unable to generate dmp file:{Environment.NewLine}{e}"); } } } // Health Warning if (!warningOrError && data.IsUnhealthy(thresholdWarning)) { warningOrError = true; healthState = HealthState.Warning; } if (warningOrError) { string errorWarningCode = null; switch (data.Property) { case ErrorWarningProperty.TotalCpuTime when healthReportType == HealthReportType.Application: errorWarningCode = (healthState == HealthState.Error) ? FoErrorWarningCodes.AppErrorCpuTime : FoErrorWarningCodes.AppWarningCpuTime; break; case ErrorWarningProperty.TotalCpuTime: errorWarningCode = (healthState == HealthState.Error) ? FoErrorWarningCodes.NodeErrorCpuTime : FoErrorWarningCodes.NodeWarningCpuTime; break; case ErrorWarningProperty.DiskSpaceUsagePercentage: errorWarningCode = (healthState == HealthState.Error) ? FoErrorWarningCodes.NodeErrorDiskSpacePercentUsed : FoErrorWarningCodes.NodeWarningDiskSpacePercentUsed; break; case ErrorWarningProperty.DiskSpaceUsageMb: errorWarningCode = (healthState == HealthState.Error) ? FoErrorWarningCodes.NodeErrorDiskSpaceMb : FoErrorWarningCodes.NodeWarningDiskSpaceMb; break; case ErrorWarningProperty.TotalMemoryConsumptionMb when healthReportType == HealthReportType.Application: errorWarningCode = (healthState == HealthState.Error) ? FoErrorWarningCodes.AppErrorMemoryCommittedMb : FoErrorWarningCodes.AppWarningMemoryCommittedMb; break; case ErrorWarningProperty.TotalMemoryConsumptionMb: errorWarningCode = (healthState == HealthState.Error) ? FoErrorWarningCodes.NodeErrorMemoryCommittedMb : FoErrorWarningCodes.NodeWarningMemoryCommittedMb; break; case ErrorWarningProperty.TotalMemoryConsumptionPct when replicaOrInstance != null: errorWarningCode = (healthState == HealthState.Error) ? 
FoErrorWarningCodes.AppErrorMemoryPercentUsed : FoErrorWarningCodes.AppWarningMemoryPercentUsed; break; case ErrorWarningProperty.TotalMemoryConsumptionPct: errorWarningCode = (healthState == HealthState.Error) ? FoErrorWarningCodes.NodeErrorMemoryPercentUsed : FoErrorWarningCodes.NodeWarningMemoryPercentUsed; break; case ErrorWarningProperty.DiskAverageQueueLength: errorWarningCode = (healthState == HealthState.Error) ? FoErrorWarningCodes.NodeErrorDiskAverageQueueLength : FoErrorWarningCodes.NodeWarningDiskAverageQueueLength; break; case ErrorWarningProperty.TotalActiveFirewallRules: errorWarningCode = (healthState == HealthState.Error) ? FoErrorWarningCodes.ErrorTooManyFirewallRules : FoErrorWarningCodes.WarningTooManyFirewallRules; break; case ErrorWarningProperty.TotalActivePorts when healthReportType == HealthReportType.Application: errorWarningCode = (healthState == HealthState.Error) ? FoErrorWarningCodes.AppErrorTooManyActiveTcpPorts : FoErrorWarningCodes.AppWarningTooManyActiveTcpPorts; break; case ErrorWarningProperty.TotalActivePorts: errorWarningCode = (healthState == HealthState.Error) ? FoErrorWarningCodes.NodeErrorTooManyActiveTcpPorts : FoErrorWarningCodes.NodeWarningTooManyActiveTcpPorts; break; case ErrorWarningProperty.TotalEphemeralPorts when healthReportType == HealthReportType.Application: errorWarningCode = (healthState == HealthState.Error) ? FoErrorWarningCodes.AppErrorTooManyActiveEphemeralPorts : FoErrorWarningCodes.AppWarningTooManyActiveEphemeralPorts; break; case ErrorWarningProperty.TotalEphemeralPorts: errorWarningCode = (healthState == HealthState.Error) ? 
FoErrorWarningCodes.NodeErrorTooManyActiveEphemeralPorts : FoErrorWarningCodes.NodeWarningTooManyActiveEphemeralPorts; break; } var healthMessage = new StringBuilder(); /*if (name != null) * { * string partitionAndReplicaInfo = string.Empty; * * if (replicaOrInstance != null) * { * partitionAndReplicaInfo = $", {repPartitionId}, {repOrInstanceId}"; * } * * _ = healthMessage.Append($"{name} (Node: {this.NodeName}, Service Process: {procName}.exe{partitionAndReplicaInfo}): "); * }*/ string drive = string.Empty; if (this.ObserverName == ObserverConstants.DiskObserverName) { drive = $"{data.Id}: "; } _ = healthMessage.Append($"{drive}{data.Property} is at or above the specified {thresholdName} limit ({threshold}{data.Units})"); _ = healthMessage.AppendLine($" - {data.Property}: {Math.Round(Convert.ToDouble(data.AverageDataValue))}{data.Units}"); // The health event description will be a serialized instance of telemetryData, // so it should be completely constructed (filled with data) regardless // of user telemetry settings. telemetryData.ApplicationName = appName?.OriginalString ?? string.Empty; telemetryData.Code = errorWarningCode; telemetryData.HealthState = Enum.GetName(typeof(HealthState), healthState); telemetryData.HealthEventDescription = healthMessage.ToString(); telemetryData.Metric = $"{drive}{data.Property}"; telemetryData.ServiceName = serviceName?.OriginalString ?? string.Empty; telemetryData.Source = ObserverConstants.FabricObserverName; telemetryData.Value = Math.Round(Convert.ToDouble(data.AverageDataValue), 1); // Send Health Report as Telemetry event (perhaps it signals an Alert from App Insights, for example.). if (this.IsTelemetryProviderEnabled && this.IsObserverTelemetryEnabled) { _ = this.TelemetryClient?.ReportMetricAsync( telemetryData, this.Token); } // ETW. if (this.IsEtwEnabled) { Logger.EtwLogger?.Write( ObserverConstants.FabricObserverETWEventName, new { ApplicationName = appName?.OriginalString ?? 
string.Empty, Code = errorWarningCode, HealthState = Enum.GetName(typeof(HealthState), healthState), HealthEventDescription = healthMessage.ToString(), Metric = $"{drive}{data.Property}", Node = this.NodeName, ServiceName = serviceName?.OriginalString ?? string.Empty, Source = ObserverConstants.FabricObserverName, Value = Math.Round(Convert.ToDouble(data.AverageDataValue), 1), }); } var healthReport = new HealthReport { AppName = appName, Code = errorWarningCode, EmitLogEvent = true, HealthData = telemetryData, HealthMessage = healthMessage.ToString(), HealthReportTimeToLive = healthReportTtl, ReportType = healthReportType, State = healthState, NodeName = this.NodeName, Observer = this.ObserverName, ResourceUsageDataProperty = data.Property, }; // From FSO. if (replicaOrInstance == null && healthReportType == HealthReportType.Application) { healthReport.Property = id; } // Emit a Fabric Health Report and optionally a local log write. this.HealthReporter.ReportHealthToServiceFabric(healthReport); // Set internal health state info on data instance. data.ActiveErrorOrWarning = true; data.ActiveErrorOrWarningCode = errorWarningCode; // This means this observer created a Warning or Error SF Health Report this.HasActiveFabricErrorOrWarning = true; // Clean up sb. _ = healthMessage.Clear(); } else { if (data.ActiveErrorOrWarning) { // The health event description will be a serialized instance of telemetryData, // so it should be completely constructed (filled with data) regardless // of user telemetry settings. telemetryData.ApplicationName = appName?.OriginalString ?? 
string.Empty; telemetryData.Code = data.ActiveErrorOrWarningCode; telemetryData.HealthState = Enum.GetName(typeof(HealthState), HealthState.Ok); telemetryData.HealthEventDescription = $"{data.Property} is now within normal/expected range."; telemetryData.Metric = data.Property; telemetryData.Source = ObserverConstants.FabricObserverName; telemetryData.Value = Math.Round(Convert.ToDouble(data.AverageDataValue), 1); // Telemetry if (this.IsTelemetryProviderEnabled && this.IsObserverTelemetryEnabled) { _ = this.TelemetryClient?.ReportMetricAsync( telemetryData, this.Token); } // ETW. if (this.IsEtwEnabled) { Logger.EtwLogger?.Write( ObserverConstants.FabricObserverETWEventName, new { ApplicationName = appName != null ? appName.OriginalString : string.Empty, Code = data.ActiveErrorOrWarningCode, HealthState = Enum.GetName(typeof(HealthState), HealthState.Ok), HealthEventDescription = $"{data.Property} is now within normal/expected range.", Metric = data.Property, Node = this.NodeName, ServiceName = name ?? string.Empty, Source = ObserverConstants.FabricObserverName, Value = Math.Round(Convert.ToDouble(data.AverageDataValue), 1), }); } var healthReport = new HealthReport { AppName = appName, Code = data.ActiveErrorOrWarningCode, EmitLogEvent = true, HealthData = telemetryData, HealthMessage = $"{data.Property} is now within normal/expected range.", HealthReportTimeToLive = default(TimeSpan), ReportType = healthReportType, State = HealthState.Ok, NodeName = this.NodeName, Observer = this.ObserverName, ResourceUsageDataProperty = data.Property, }; // From FSO. if (replicaOrInstance == null && healthReportType == HealthReportType.Application) { healthReport.Property = id; } // Emit an Ok Health Report to clear Fabric Health warning. this.HealthReporter.ReportHealthToServiceFabric(healthReport); // Reset health states. 
data.ActiveErrorOrWarning = false; data.ActiveErrorOrWarningCode = FoErrorWarningCodes.Ok; this.HasActiveFabricErrorOrWarning = false; } } // No need to keep data in memory. if (data.Data is List <T> list) { // List<T> impl. list.Clear(); list.TrimExcess(); } else { // CircularBufferCollection<T> impl. data.Data.Clear(); } }
/// <summary>
/// Samples resource usage of the host process behind every entry of ReplicaOrInstanceList
/// whose application name or application type matches a configured target in deployedTargetList.
/// Port counts are captured once per process; CPU and memory are sampled every 250 ms for the
/// monitoring duration and stored in the per-process data containers (AllApp*Data), keyed by
/// "{app name or type}:{process name}".
/// </summary>
/// <param name="token">Cancellation token, checked between replicas and between samples.</param>
/// <returns>A task that completes once every matching replica has been sampled.</returns>
/// <remarks>
/// Win32Exception, ArgumentException and InvalidOperationException are logged at Information level
/// and the replica is skipped. Any other exception is rethrown (and logged as a Warning first
/// unless it is a cancellation).
/// </remarks>
private async Task MonitorDeployedAppsAsync(CancellationToken token)
{
    Process currentProcess = null;

    foreach (var repOrInst in ReplicaOrInstanceList)
    {
        token.ThrowIfCancellationRequested();

        int processId = (int)repOrInst.HostProcessId;
        var cpuUsage = new CpuUsage();
        string replicaAppName = repOrInst.ApplicationName?.OriginalString;
        string replicaAppType = repOrInst.ApplicationTypeName;

        // Ordinal, case-insensitive matching: these are identifiers, not linguistic text, so the
        // result must not depend on the current culture (as ToLower() comparisons do).
        var application = this.deployedTargetList?.FirstOrDefault(
            app => string.Equals(app?.TargetApp, replicaAppName, StringComparison.OrdinalIgnoreCase) ||
                   string.Equals(app?.TargetAppType, replicaAppType, StringComparison.OrdinalIgnoreCase));

        if (application?.TargetApp == null && application?.TargetAppType == null)
        {
            continue;
        }

        try
        {
            // App level.
            currentProcess = Process.GetProcessById(processId);

            token.ThrowIfCancellationRequested();

            var procName = currentProcess.ProcessName;
            string appNameOrType = GetAppNameOrType(repOrInst);
            var id = $"{appNameOrType}:{procName}";

            // Add new resource data structures for each app service process where the metric is
            // specified in configuration for related observation. Each container is then looked up
            // once; a null container means "this metric is not monitored for this process".
            if (this.AllAppCpuData.All(list => list.Id != id)
                && (application.CpuErrorLimitPercent > 0 || application.CpuWarningLimitPercent > 0))
            {
                this.AllAppCpuData.Add(new FabricResourceUsageData<double>(ErrorWarningProperty.TotalCpuTime, id, DataCapacity, UseCircularBuffer));
            }

            var cpuData = this.AllAppCpuData.FirstOrDefault(list => list.Id == id);

            if (this.AllAppMemDataMb.All(list => list.Id != id)
                && (application.MemoryErrorLimitMb > 0 || application.MemoryWarningLimitMb > 0))
            {
                this.AllAppMemDataMb.Add(new FabricResourceUsageData<float>(ErrorWarningProperty.TotalMemoryConsumptionMb, id, DataCapacity, UseCircularBuffer));
            }

            var memMbData = this.AllAppMemDataMb.FirstOrDefault(list => list.Id == id);

            if (this.AllAppMemDataPercent.All(list => list.Id != id)
                && (application.MemoryErrorLimitPercent > 0 || application.MemoryWarningLimitPercent > 0))
            {
                this.AllAppMemDataPercent.Add(new FabricResourceUsageData<double>(ErrorWarningProperty.TotalMemoryConsumptionPct, id, DataCapacity, UseCircularBuffer));
            }

            var memPctData = this.AllAppMemDataPercent.FirstOrDefault(list => list.Id == id);

            if (this.AllAppTotalActivePortsData.All(list => list.Id != id)
                && (application.NetworkErrorActivePorts > 0 || application.NetworkWarningActivePorts > 0))
            {
                this.AllAppTotalActivePortsData.Add(new FabricResourceUsageData<int>(ErrorWarningProperty.TotalActivePorts, id, 1));
            }

            var totalPortsData = this.AllAppTotalActivePortsData.FirstOrDefault(list => list.Id == id);

            if (this.AllAppEphemeralPortsData.All(list => list.Id != id)
                && (application.NetworkErrorEphemeralPorts > 0 || application.NetworkWarningEphemeralPorts > 0))
            {
                this.AllAppEphemeralPortsData.Add(new FabricResourceUsageData<int>(ErrorWarningProperty.TotalEphemeralPorts, id, 1));
            }

            var ephemeralPortsData = this.AllAppEphemeralPortsData.FirstOrDefault(list => list.Id == id);

            // Measure Total and Ephemeral ports (single reading per process).
            if (totalPortsData != null)
            {
                totalPortsData.Data.Add(OperatingSystemInfoProvider.Instance.GetActivePortCount(currentProcess.Id, FabricServiceContext));
            }

            if (ephemeralPortsData != null)
            {
                ephemeralPortsData.Data.Add(OperatingSystemInfoProvider.Instance.GetActiveEphemeralPortCount(currentProcess.Id, FabricServiceContext));
            }

            bool checkCpu = cpuData != null;
            bool checkMemMb = memMbData != null;
            bool checkMemPct = memPctData != null;

            // No need to proceed further if no cpu and mem thresholds are specified in configuration.
            if (!checkCpu && !checkMemMb && !checkMemPct)
            {
                continue;
            }

            /* CPU and Memory Usage */

            // Default sampling window is 15 seconds. A non-positive MonitorDuration is treated as
            // "not configured" so the loop below always gets a usable window.
            TimeSpan duration = TimeSpan.FromSeconds(15);

            if (MonitorDuration > TimeSpan.Zero)
            {
                duration = MonitorDuration;
            }

            // Warm up the counters.
            if (checkCpu)
            {
                _ = cpuUsage.GetCpuUsagePercentageProcess(currentProcess);
            }

            if (checkMemMb || checkMemPct)
            {
                _ = ProcessInfoProvider.Instance.GetProcessPrivateWorkingSetInMB(currentProcess.Id);
            }

            var timer = Stopwatch.StartNew();

            // Compare the full TimeSpan values. TimeSpan.Seconds is only the 0-59 seconds component,
            // so comparing it would truncate or wrap any duration of one minute or longer.
            while (!currentProcess.HasExited && timer.Elapsed <= duration)
            {
                token.ThrowIfCancellationRequested();

                if (checkCpu)
                {
                    // CPU (all cores). Negative readings are discarded; readings above 100 are clamped.
                    double cpu = cpuUsage.GetCpuUsagePercentageProcess(currentProcess);

                    if (cpu >= 0)
                    {
                        if (cpu > 100)
                        {
                            cpu = 100;
                        }

                        cpuData.Data.Add(cpu);
                    }
                }

                float processMem = 0;

                if (checkMemMb || checkMemPct)
                {
                    processMem = ProcessInfoProvider.Instance.GetProcessPrivateWorkingSetInMB(currentProcess.Id);
                }

                if (checkMemMb)
                {
                    // Memory (private working set (process)).
                    memMbData.Data.Add(processMem);
                }

                if (checkMemPct)
                {
                    // Memory (percent in use (total)). Only the total size is needed here.
                    var (totalMemory, _) = OperatingSystemInfoProvider.Instance.TupleGetTotalPhysicalMemorySizeAndPercentInUse();
                    long totalMem = totalMemory;

                    if (totalMem > 0)
                    {
                        // NOTE(review): the * 1024 suggests totalMem is reported in GB while processMem is MB - confirm.
                        double usedPct = Math.Round(((double)(processMem * 100)) / (totalMem * 1024), 2);
                        memPctData.Data.Add(Math.Round(usedPct, 1));
                    }
                }

                await Task.Delay(250, Token);
            }

            timer.Stop();
        }
        catch (Exception e)
        {
#if DEBUG
            // DEBUG INFO
            var healthReport = new Utilities.HealthReport
            {
                AppName = repOrInst.ApplicationName,
                HealthMessage = $"Error:{Environment.NewLine}{e}{Environment.NewLine}",
                State = HealthState.Ok,
                Code = FOErrorWarningCodes.Ok,
                NodeName = NodeName,
                Observer = ObserverName,
                Property = $"{e.Source}",
                ReportType = HealthReportType.Application,
            };

            HealthReporter.ReportHealthToServiceFabric(healthReport);
#endif
            if (e is Win32Exception || e is ArgumentException || e is InvalidOperationException)
            {
                // Logged and swallowed so the remaining replicas are still monitored.
                WriteToLogWithLevel(
                    ObserverName,
                    $"MonitorAsync failed to find current service process for {replicaAppName ?? replicaAppType}{Environment.NewLine}{e}",
                    LogLevel.Information);
            }
            else
            {
                // Cancellation is expected during shutdown; only log genuinely unexpected failures.
                if (!(e is OperationCanceledException || e is TaskCanceledException))
                {
                    WriteToLogWithLevel(
                        ObserverName,
                        $"Unhandled exception in MonitorAsync:{Environment.NewLine}{e}",
                        LogLevel.Warning);
                }

                throw;
            }
        }
        finally
        {
            // Always release the process handle, including on the early "continue" paths.
            currentProcess?.Dispose();
            currentProcess = null;
        }
    }
}
/// <summary>
/// Emits an informational Ok health report summarizing port and memory use of the Fabric system
/// services, evaluates the collected CPU, memory and TCP port data against the error/warning
/// thresholds, and - on Windows, when the observer web app is deployed and event log monitoring is
/// enabled - writes the collected event log records to EventVwrErrors.txt as HTML.
/// </summary>
/// <param name="token">Cancellation token, checked while the event records are rendered.</param>
/// <returns>A completed task; all work is done synchronously.</returns>
public override Task ReportAsync(CancellationToken token)
{
    Token.ThrowIfCancellationRequested();

    // Informational report. For now, Linux is where we pay close attention to memory use by Fabric system services as there are still a few issues in that realm..
    var timeToLiveWarning = SetHealthReportTimeToLive();

    string healthMessage =
        $"Number of ports in use by Fabric services: {TotalActivePortCountAllSystemServices}{Environment.NewLine}" +
        $"Number of ephemeral ports in use by Fabric services: {TotalActiveEphemeralPortCountAllSystemServices}{Environment.NewLine}" +
        $"Fabric memory use MB: {this.allMemData.FirstOrDefault(x => x.Id == "Fabric")?.AverageDataValue}{Environment.NewLine}";

    if (RuntimeInformation.IsOSPlatform(OSPlatform.Linux))
    {
        healthMessage +=
            $"FabricGateway memory use MB: {this.allMemData.FirstOrDefault(x => x.Id == "FabricGateway.exe")?.AverageDataValue}{Environment.NewLine}" +
            $"FabricHost memory use MB: {this.allMemData.FirstOrDefault(x => x.Id == "FabricHost")?.AverageDataValue}{Environment.NewLine}";
    }

    var portInformationReport = new HealthReport
    {
        Observer = ObserverName,
        NodeName = NodeName,
        HealthMessage = healthMessage,
        State = HealthState.Ok,
        HealthReportTimeToLive = timeToLiveWarning,
    };

    HealthReporter.ReportHealthToServiceFabric(portInformationReport);

    // Reset ports counters.
    TotalActivePortCountAllSystemServices = 0;
    TotalActiveEphemeralPortCountAllSystemServices = 0;

    // CPU
    ProcessResourceDataList(
        this.allCpuData,
        CpuErrorUsageThresholdPct,
        CpuWarnUsageThresholdPct);

    // Memory
    ProcessResourceDataList(
        this.allMemData,
        MemErrorUsageThresholdMb,
        MemWarnUsageThresholdMb);

    // Ports - Active TCP
    ProcessResourceDataList(
        this.allActiveTcpPortData,
        ActiveTcpPortCountError,
        ActiveTcpPortCountWarning);

    // Ports - Ephemeral
    ProcessResourceDataList(
        this.allEphemeralTcpPortData,
        ActiveEphemeralPortCountError,
        ActiveEphemeralPortCountWarning);

    // Windows Event Log
    if (RuntimeInformation.IsOSPlatform(OSPlatform.Windows)
        && ObserverManager.ObserverWebAppDeployed
        && this.monitorWinEventLog)
    {
        // SF Eventlog Errors?
        // Write this out to a new file, for use by the web front end log viewer.
        // Format = HTML.
        int count = this.evtRecordList.Count();
        var logPath = Path.Combine(ObserverLogger.LogFolderBasePath, "EventVwrErrors.txt");

        // Remove existing file. Best effort: a locked or protected file is simply left in place.
        if (File.Exists(logPath))
        {
            try
            {
                File.Delete(logPath);
            }
            catch (IOException)
            {
            }
            catch (UnauthorizedAccessException)
            {
            }
        }

        // The HTML summary is only produced once at least 10 records have been collected.
        if (count >= 10)
        {
            var sb = new StringBuilder();

            _ = sb.AppendLine("<br/><div><strong>" +
                              "<a href='javascript:toggle(\"evtContainer\")'>" +
                              "<div id=\"plus\" style=\"display: inline; font-size: 25px;\">+</div> " + count +
                              " Error Events in ServiceFabric and System</a> " +
                              "Event logs</strong>.<br/></div>");

            _ = sb.AppendLine("<div id='evtContainer' style=\"display: none;\">");

            foreach (var evt in this.evtRecordList.Distinct())
            {
                token.ThrowIfCancellationRequested();

                try
                {
                    // Access event properties. Event text is written by arbitrary processes and may
                    // contain markup characters, so it is HTML-encoded before being embedded.
                    _ = sb.AppendLine("<div>" + System.Net.WebUtility.HtmlEncode(evt.LogName) + "</div>");
                    _ = sb.AppendLine("<div>" + System.Net.WebUtility.HtmlEncode(evt.LevelDisplayName) + "</div>");

                    if (evt.TimeCreated.HasValue)
                    {
                        _ = sb.AppendLine("<div>" + evt.TimeCreated.Value.ToShortDateString() + "</div>");
                    }

                    foreach (var prop in evt.Properties)
                    {
                        // Convert once; null and empty values are skipped.
                        string propText = Convert.ToString(prop.Value);

                        if (!string.IsNullOrEmpty(propText))
                        {
                            _ = sb.AppendLine("<div>" + System.Net.WebUtility.HtmlEncode(propText) + "</div>");
                        }
                    }
                }
                catch (EventLogException)
                {
                    // A record that cannot be read is skipped; whatever was already appended for it stays.
                }
            }

            _ = sb.AppendLine("</div>");

            _ = ObserverLogger.TryWriteLogFile(logPath, sb.ToString());
            _ = sb.Clear();
        }

        // Clean up.
        if (count > 0)
        {
            this.evtRecordList.Clear();
        }
    }

    ClearDataContainers();

    return Task.CompletedTask;
}
/// <inheritdoc/>
public override Task ReportAsync(CancellationToken token)
{
    this.Token.ThrowIfCancellationRequested();

    // Informational Ok report with the current port counts of the Fabric services.
    var timeToLiveWarning = this.SetHealthReportTimeToLive();
    var portInformationReport = new HealthReport
    {
        Observer = this.ObserverName,
        NodeName = this.NodeName,
        HealthMessage = $"Number of ports in use by Fabric services: {this.TotalActivePortCount}\n" +
                        $"Number of ephemeral ports in use by Fabric services: {this.TotalActiveEphemeralPortCount}",
        State = HealthState.Ok,
        HealthReportTimeToLive = timeToLiveWarning,
    };

    // TODO: Report on port count based on thresholds PortCountWarning/Error.
    this.HealthReporter.ReportHealthToServiceFabric(portInformationReport);

    // Reset ports counters.
    this.TotalActivePortCount = 0;
    this.TotalActiveEphemeralPortCount = 0;

    // CPU
    this.ProcessResourceDataList(
        this.allCpuData,
        this.CpuErrorUsageThresholdPct,
        this.CpuWarnUsageThresholdPct);

    // Memory
    this.ProcessResourceDataList(
        this.allMemData,
        this.MemErrorUsageThresholdMb,
        this.MemWarnUsageThresholdMb);

    // Windows Event Log
    if (ObserverManager.ObserverWebAppDeployed && this.monitorWinEventLog)
    {
        // SF Eventlog Errors?
        // Write this out to a new file, for use by the web front end log viewer.
        // Format = HTML.
        int count = this.evtRecordList.Count();
        var logPath = Path.Combine(this.ObserverLogger.LogFolderBasePath, "EventVwrErrors.txt");

        // Remove existing file. Best effort: a locked or protected file is simply left in place.
        if (File.Exists(logPath))
        {
            try
            {
                File.Delete(logPath);
            }
            catch (IOException)
            {
            }
            catch (UnauthorizedAccessException)
            {
            }
        }

        // The HTML summary is only produced once at least 10 records have been collected.
        if (count >= 10)
        {
            var sb = new StringBuilder();

            _ = sb.AppendLine("<br/><div><strong>" +
                              "<a href='javascript:toggle(\"evtContainer\")'>" +
                              "<div id=\"plus\" style=\"display: inline; font-size: 25px;\">+</div> " + count +
                              " Error Events in ServiceFabric and System</a> " +
                              "Event logs</strong>.<br/></div>");

            _ = sb.AppendLine("<div id='evtContainer' style=\"display: none;\">");

            foreach (var evt in this.evtRecordList.Distinct())
            {
                token.ThrowIfCancellationRequested();

                try
                {
                    // Access event properties. Event text is written by arbitrary processes and may
                    // contain markup characters, so it is HTML-encoded before being embedded.
                    _ = sb.AppendLine("<div>" + System.Net.WebUtility.HtmlEncode(evt.LogName) + "</div>");
                    _ = sb.AppendLine("<div>" + System.Net.WebUtility.HtmlEncode(evt.LevelDisplayName) + "</div>");

                    if (evt.TimeCreated.HasValue)
                    {
                        _ = sb.AppendLine("<div>" + evt.TimeCreated.Value.ToShortDateString() + "</div>");
                    }

                    foreach (var prop in evt.Properties)
                    {
                        // Convert once; null and empty values are skipped.
                        string propText = Convert.ToString(prop.Value);

                        if (!string.IsNullOrEmpty(propText))
                        {
                            _ = sb.AppendLine("<div>" + System.Net.WebUtility.HtmlEncode(propText) + "</div>");
                        }
                    }
                }
                catch (EventLogException)
                {
                    // A record that cannot be read is skipped; whatever was already appended for it stays.
                }
            }

            _ = sb.AppendLine("</div>");

            _ = this.ObserverLogger.TryWriteLogFile(logPath, sb.ToString());
            _ = sb.Clear();
        }

        // Clean up.
        if (count > 0)
        {
            this.evtRecordList.Clear();
        }
    }

    return Task.CompletedTask;
}