/// <summary>
/// Writes one formatted entry to the observer logger at the requested severity and then flushes the logger.
/// </summary>
/// <param name="property">Value placed first in the log entry.</param>
/// <param name="description">Text placed last in the log entry.</param>
/// <param name="level">Severity to log at; Information, Warning and Error are handled.</param>
/// <exception cref="ArgumentOutOfRangeException">Thrown when <paramref name="level"/> is any other value.</exception>
public void WriteToLogWithLevel(string property, string description, LogLevel level)
{
    // Every severity shares the same entry layout.
    const string entryFormat = "{0} logged at level {1}: {2}";

    if (level == LogLevel.Information)
    {
        ObserverLogger.LogInfo(entryFormat, property, level, description);
    }
    else if (level == LogLevel.Warning)
    {
        ObserverLogger.LogWarning(entryFormat, property, level, description);
    }
    else if (level == LogLevel.Error)
    {
        ObserverLogger.LogError(entryFormat, property, level, description);
    }
    else
    {
        // Unsupported severity: fail before flushing.
        throw new ArgumentOutOfRangeException(nameof(level), level, null);
    }

    Logger.Flush();
}
/// <summary>
/// Records whether the Windows Update automatic-updates service is enabled with its notification level
/// set to scheduled installation. If the query fails with one of the handled exception types, the
/// failure is logged as a warning and the state is flagged as unknown instead.
/// </summary>
/// <param name="token">Cancellation token; checked once before any work is done.</param>
/// <returns>An already completed task.</returns>
private Task CheckWuAutoDownloadEnabledAsync(CancellationToken token)
{
    token.ThrowIfCancellationRequested();

    // Windows Update Automatic Download enabled (automatically downloading an update without notification beforehand)?
    // If so, it's best to disable this and deploy either POA (for Bronze durability clusters)
    // or enable VMSS automatic OS image upgrades for Silver+ durability clusters.
    // This is important to prevent unexpected, concurrent VM reboots due to Windows Updates.
    try
    {
        var automaticUpdates = new AutomaticUpdatesClass();
        bool scheduledInstallActive = false;

        // Settings are only consulted when the service itself is enabled.
        if (automaticUpdates.ServiceEnabled)
        {
            scheduledInstallActive =
                automaticUpdates.Settings.NotificationLevel == AutomaticUpdatesNotificationLevel.aunlScheduledInstallation;
        }

        this.isWindowsUpdateAutoDownloadEnabled = scheduledInstallActive;
    }
    catch (Exception e) when (
        e is COMException ||
        e is InvalidOperationException ||
        e is SecurityException ||
        e is Win32Exception)
    {
        ObserverLogger.LogWarning(
            $"{AuStateUnknownMessage}{Environment.NewLine}{e}");

        this.auStateUnknown = true;
    }

    return Task.CompletedTask;
}
/// <summary>
/// Writes one formatted entry to the observer logger at the requested severity and then flushes the logger.
/// Severities other than Information, Warning and Error produce no entry; the flush still happens.
/// </summary>
/// <param name="property">Value placed first in the log entry.</param>
/// <param name="description">Text placed last in the log entry.</param>
/// <param name="level">Severity to log at.</param>
public void WriteToLogWithLevel(string property, string description, LogLevel level)
{
    // Every severity shares the same entry layout.
    const string entryFormat = "{0} logged at level {1}: {2}";

    if (level == LogLevel.Information)
    {
        ObserverLogger.LogInfo(entryFormat, property, level, description);
    }
    else if (level == LogLevel.Warning)
    {
        ObserverLogger.LogWarning(entryFormat, property, level, description);
    }
    else if (level == LogLevel.Error)
    {
        ObserverLogger.LogError(entryFormat, property, level, description);
    }

    Logger.Flush();
}
public override Task ReportAsync(CancellationToken token) { // Local log. ObserverLogger.LogInfo(message.ToString()); /* Report to Fabric */ // These values will be preserved across observer runs and are useful for clearing warnings // by reporting Ok health state health events with the same property and sourceid values // as the error/warning health events when FO is safely taken down (e.g., app is being uninstalled, // safe restart of fabric node it's running on, etc.). HealthReportProperties.Add("SomePropertyName"); HealthReportSourceIds.Add($"{ObserverName}_SomethingUniqueToThisReport"); var healthReporter = new ObserverHealthReporter(ObserverLogger); var healthReport = new Utilities.HealthReport { Code = FOErrorWarningCodes.Ok, HealthMessage = this.message.ToString(), NodeName = NodeName, Observer = ObserverName, Property = HealthReportProperties[^ 1],
/// <summary>
/// Reports the data collected for Fabric system services: an informational health report with port counts
/// and memory use, threshold processing of the CPU, memory and port data lists and, on Windows when the
/// observer web app is deployed and event log monitoring is on, an HTML listing of the collected event
/// records written to EventVwrErrors.txt. Data containers are cleared before returning.
/// </summary>
/// <param name="token">Cancellation token checked for each event record that is rendered.</param>
/// <returns>An already completed task.</returns>
public override Task ReportAsync(CancellationToken token)
{
    Token.ThrowIfCancellationRequested();

    // Informational report. For now, Linux is where we pay close attention to memory use by Fabric system services as there are still a few issues in that realm..
    var timeToLiveWarning = SetHealthReportTimeToLive();
    var portInformationReport = new HealthReport
    {
        Observer = ObserverName,
        NodeName = NodeName,
        HealthMessage = $"Number of ports in use by Fabric services: {TotalActivePortCountAllSystemServices}{Environment.NewLine}" +
                        $"Number of ephemeral ports in use by Fabric services: {TotalActiveEphemeralPortCountAllSystemServices}{Environment.NewLine}" +
                        $"Fabric memory use MB: {this.allMemData.FirstOrDefault(x => x.Id == "Fabric")?.AverageDataValue}{Environment.NewLine}" +
                        (RuntimeInformation.IsOSPlatform(OSPlatform.Linux) ?
                        $"FabricGateway memory use MB: {this.allMemData.FirstOrDefault(x => x.Id == "FabricGateway.exe")?.AverageDataValue}{Environment.NewLine}" +
                        $"FabricHost memory use MB: {this.allMemData.FirstOrDefault(x => x.Id == "FabricHost")?.AverageDataValue}{Environment.NewLine}" : string.Empty),
        State = HealthState.Ok,
        HealthReportTimeToLive = timeToLiveWarning,
    };

    HealthReporter.ReportHealthToServiceFabric(portInformationReport);

    // Reset ports counters.
    TotalActivePortCountAllSystemServices = 0;
    TotalActiveEphemeralPortCountAllSystemServices = 0;

    // CPU
    ProcessResourceDataList(
        this.allCpuData,
        CpuErrorUsageThresholdPct,
        CpuWarnUsageThresholdPct);

    // Memory
    ProcessResourceDataList(
        this.allMemData,
        MemErrorUsageThresholdMb,
        MemWarnUsageThresholdMb);

    // Ports - Active TCP
    ProcessResourceDataList(
        this.allActiveTcpPortData,
        ActiveTcpPortCountError,
        ActiveTcpPortCountWarning);

    // Ports - Ephemeral
    ProcessResourceDataList(
        this.allEphemeralTcpPortData,
        ActiveEphemeralPortCountError,
        ActiveEphemeralPortCountWarning);

    // Windows Event Log
    if (RuntimeInformation.IsOSPlatform(OSPlatform.Windows) &&
        ObserverManager.ObserverWebAppDeployed &&
        this.monitorWinEventLog)
    {
        // SF Eventlog Errors?
        // Write this out to a new file, for use by the web front end log viewer.
        // Format = HTML.
        int count = this.evtRecordList.Count();
        var logPath = Path.Combine(ObserverLogger.LogFolderBasePath, "EventVwrErrors.txt");

        // Remove existing file.
        if (File.Exists(logPath))
        {
            try
            {
                File.Delete(logPath);
            }
            catch (Exception e) when (e is IOException || e is UnauthorizedAccessException)
            {
                // Best effort: a stale file that cannot be removed must not stop reporting.
            }
        }

        // The listing is only produced once at least 10 records have been collected.
        if (count >= 10)
        {
            var sb = new StringBuilder();

            _ = sb.AppendLine("<br/><div><strong>" +
                              "<a href='javascript:toggle(\"evtContainer\")'>" +
                              "<div id=\"plus\" style=\"display: inline; font-size: 25px;\">+</div> " + count +
                              " Error Events in ServiceFabric and System</a> " +
                              "Event logs</strong>.<br/></div>");

            _ = sb.AppendLine("<div id='evtContainer' style=\"display: none;\">");

            foreach (var evt in this.evtRecordList.Distinct())
            {
                token.ThrowIfCancellationRequested();

                try
                {
                    // Access event properties. Event text comes from arbitrary event sources,
                    // so it is HTML-encoded before being embedded in the markup.
                    _ = sb.AppendLine("<div>" + System.Net.WebUtility.HtmlEncode(evt.LogName) + "</div>");
                    _ = sb.AppendLine("<div>" + System.Net.WebUtility.HtmlEncode(evt.LevelDisplayName) + "</div>");

                    if (evt.TimeCreated.HasValue)
                    {
                        _ = sb.AppendLine("<div>" + evt.TimeCreated.Value.ToShortDateString() + "</div>");
                    }

                    foreach (var prop in evt.Properties)
                    {
                        // Convert once; null and empty values are skipped.
                        string propertyText = Convert.ToString(prop.Value);

                        if (!string.IsNullOrEmpty(propertyText))
                        {
                            _ = sb.AppendLine("<div>" + System.Net.WebUtility.HtmlEncode(propertyText) + "</div>");
                        }
                    }
                }
                catch (EventLogException)
                {
                    // Skip a record whose details cannot be read; the remaining records are still rendered.
                }
            }

            _ = sb.AppendLine("</div>");

            _ = ObserverLogger.TryWriteLogFile(logPath, sb.ToString());
        }

        // Clean up.
        if (count > 0)
        {
            this.evtRecordList.Clear();
        }
    }

    ClearDataContainers();

    return Task.CompletedTask;
}
/// <summary>
/// Reports OS health and OS information as Service Fabric health reports. A non-"OK" OS status produces an
/// Error report (and optional telemetry); a previously raised error is cleared with an Ok report once the
/// status is "OK" again. When the observer web app is deployed, the OS report is also written to SysInfo.txt.
/// On Windows, a Warning report (plus optional telemetry and ETW events) is emitted when Windows Update
/// automatic download was detected, and the detection state is reset for the next run.
/// </summary>
/// <param name="token">Cancellation token; checked once at the start.</param>
/// <returns>An already completed task.</returns>
/// <remarks>
/// Any exception other than <see cref="OperationCanceledException"/> is reported as an Error health event
/// for the observer service and then rethrown. Cancellation propagates without an error report.
/// </remarks>
public override Task ReportAsync(CancellationToken token)
{
    try
    {
        token.ThrowIfCancellationRequested();

        // OS Health.
        if (this.osStatus != null &&
            !string.Equals(this.osStatus, "OK", StringComparison.OrdinalIgnoreCase))
        {
            string healthMessage = $"OS reporting unhealthy: {this.osStatus}";
            var healthReport = new HealthReport
            {
                Observer = ObserverName,
                NodeName = NodeName,
                HealthMessage = healthMessage,
                State = HealthState.Error,
                HealthReportTimeToLive = SetHealthReportTimeToLive(),
            };

            HealthReporter.ReportHealthToServiceFabric(healthReport);

            // This means this observer created a Warning or Error SF Health Report
            HasActiveFabricErrorOrWarning = true;

            // Send Health Report as Telemetry (perhaps it signals an Alert from App Insights, for example.).
            if (IsTelemetryProviderEnabled && IsObserverTelemetryEnabled)
            {
                _ = TelemetryClient?.ReportHealthAsync(
                    HealthScope.Application,
                    FabricRuntime.GetActivationContext().ApplicationName,
                    HealthState.Error,
                    $"{NodeName} - OS reporting unhealthy: {this.osStatus}",
                    ObserverName,
                    Token);
            }
        }
        else if (HasActiveFabricErrorOrWarning &&
                 string.Equals(this.osStatus, "OK", StringComparison.OrdinalIgnoreCase))
        {
            // Clear Error or Warning with an OK Health Report.
            string healthMessage = $"OS reporting healthy: {this.osStatus}";

            var healthReport = new HealthReport
            {
                Observer = ObserverName,
                NodeName = NodeName,
                HealthMessage = healthMessage,
                State = HealthState.Ok,
                HealthReportTimeToLive = default(TimeSpan),
            };

            HealthReporter.ReportHealthToServiceFabric(healthReport);

            // Reset internal health state.
            HasActiveFabricErrorOrWarning = false;
        }

        if (ObserverManager.ObserverWebAppDeployed)
        {
            var logPath = Path.Combine(ObserverLogger.LogFolderBasePath, "SysInfo.txt");

            // Fixed, culture-independent timestamp so the text is the same on every machine.
            string lastUpdated = DateTime.UtcNow.ToString(
                "M/d/yyyy HH:mm:ss",
                System.Globalization.CultureInfo.InvariantCulture);

            // This file is used by the web application (log reader.).
            if (!ObserverLogger.TryWriteLogFile(logPath, $"Last updated on {lastUpdated} UTC<br/>{this.osReport}"))
            {
                HealthReporter.ReportFabricObserverServiceHealth(
                    FabricServiceContext.ServiceName.OriginalString,
                    ObserverName,
                    HealthState.Warning,
                    "Unable to create SysInfo.txt file.");
            }
        }

        var report = new HealthReport
        {
            Observer = ObserverName,
            HealthMessage = this.osReport,
            State = HealthState.Ok,
            NodeName = NodeName,
            HealthReportTimeToLive = SetHealthReportTimeToLive(),
        };

        HealthReporter.ReportHealthToServiceFabric(report);

        bool isWindows = RuntimeInformation.IsOSPlatform(OSPlatform.Windows);

        // Windows Update automatic download enabled?
        if (isWindows && this.isWindowsUpdateAutoDownloadEnabled)
        {
            string linkText =
                $"{Environment.NewLine}For clusters of Silver durability or above, " +
                $"please consider <a href=\"https://docs.microsoft.com/azure/virtual-machine-scale-sets/virtual-machine-scale-sets-automatic-upgrade\" target=\"blank\">" +
                $"enabling VMSS automatic OS image upgrades</a> to prevent unexpected VM reboots. " +
                $"For Bronze durability clusters, please consider deploying the " +
                $"<a href=\"https://docs.microsoft.com/azure/service-fabric/service-fabric-patch-orchestration-application\" target=\"blank\">Patch Orchestration Service</a>.";

            string auServiceEnabledMessage = $"Windows Update Automatic Download is enabled.{linkText}";

            report = new HealthReport
            {
                Observer = ObserverName,
                Property = "OSConfiguration",
                HealthMessage = auServiceEnabledMessage,
                State = HealthState.Warning,
                NodeName = NodeName,
                HealthReportTimeToLive = SetHealthReportTimeToLive(),
            };

            HealthReporter.ReportHealthToServiceFabric(report);

            // This branch only runs on Windows, so no further platform checks are needed below.
            if (IsTelemetryProviderEnabled && IsObserverTelemetryEnabled)
            {
                // Send Health Report as Telemetry (perhaps it signals an Alert from App Insights, for example.).
                var telemetryData = new TelemetryData(FabricClientInstance, token)
                {
                    HealthEventDescription = auServiceEnabledMessage,
                    HealthState = "Warning",
                    Metric = "WUAutoDownloadEnabled",
                    Value = this.isWindowsUpdateAutoDownloadEnabled,
                    NodeName = NodeName,
                    ObserverName = ObserverName,
                    Source = ObserverConstants.FabricObserverName,
                };

                _ = TelemetryClient?.ReportMetricAsync(
                    telemetryData,
                    Token);
            }

            // ETW.
            if (IsEtwEnabled)
            {
                Logger.EtwLogger?.Write(
                    ObserverConstants.FabricObserverETWEventName,
                    new
                    {
                        HealthState = "Warning",
                        HealthEventDescription = auServiceEnabledMessage,
                        ObserverName,
                        Metric = "WUAutoDownloadEnabled",
                        Value = this.isWindowsUpdateAutoDownloadEnabled,
                        NodeName,
                    });
            }
        }

        if (isWindows)
        {
            // reset au globals for fresh detection during next observer run.
            this.isWindowsUpdateAutoDownloadEnabled = false;
            this.auStateUnknown = false;
            this.isWUADSettingEnabled = false;
        }

        return Task.CompletedTask;
    }
    catch (Exception e) when (!(e is OperationCanceledException))
    {
        // Cancellation is an expected shutdown path and must not surface as an Error health event;
        // it bypasses this handler and propagates to the caller unchanged.
        HealthReporter.ReportFabricObserverServiceHealth(
            FabricServiceContext.ServiceName.OriginalString,
            ObserverName,
            HealthState.Error,
            $"Unhandled exception processing OS information:{Environment.NewLine}{e}");

        throw;
    }
}
/// <summary>
/// Prepares the observer for a network monitoring run: logs the start, writes NetInfo.txt once when the
/// observer web app is deployed, loads the configured endpoint list file and keeps the entries whose
/// target application is deployed on this node.
/// </summary>
/// <returns>
/// <c>true</c> for test runs and when at least one usable endpoint configuration is available;
/// <c>false</c> when the endpoint list file is not specified, does not exist, or yields no usable entries.
/// </returns>
private async Task<bool> InitializeAsync()
{
    WriteToLogWithLevel(
        ObserverName,
        $"Initializing {ObserverName} for network monitoring. | {NodeName}",
        LogLevel.Information);

    this.cancellationToken.ThrowIfCancellationRequested();

    // This only needs to be logged once.
    // This file is used by the ObserverWebApi application.
    if (ObserverManager.ObserverWebAppDeployed && !this.hasRun)
    {
        var logPath = Path.Combine(ObserverLogger.LogFolderBasePath, "NetInfo.txt");

        if (!ObserverLogger.TryWriteLogFile(logPath, GetNetworkInterfaceInfo(this.cancellationToken)))
        {
            HealthReporter.ReportFabricObserverServiceHealth(
                FabricServiceContext.ServiceName.OriginalString,
                ObserverName,
                HealthState.Warning,
                "Unable to create NetInfo.txt file.");
        }
    }

    // Is this a unit test run?
    if (IsTestRun)
    {
        return true;
    }

    var settings = FabricServiceContext.CodePackageActivationContext.GetConfigurationPackageObject(
        ObserverConstants.ObserverConfigurationPackageName)?.Settings;

    this.configSettings.Initialize(
        settings,
        ConfigurationSectionName,
        "NetworkObserverDataFileName");

    // Validate the configured file name itself. Once combined with the data package path the result
    // is never blank, and a null file name would make Path.Combine throw.
    var configuredFileName = this.configSettings.NetworkObserverConfigFileName;

    if (string.IsNullOrWhiteSpace(configuredFileName))
    {
        ObserverLogger.LogError(
            "Endpoint list file is not specified. " +
            "Please Add file containing endpoints that need to be monitored.");

        return false;
    }

    var networkObserverConfigFileName = Path.Combine(this.dataPackagePath, configuredFileName);

    if (!File.Exists(networkObserverConfigFileName))
    {
        ObserverLogger.LogError(
            $"Endpoint list file does not exist: {networkObserverConfigFileName}. " +
            "Please Add file containing endpoints that need to be monitored.");

        return false;
    }

    if (this.userConfig.Count == 0)
    {
        using (Stream stream = new FileStream(
            networkObserverConfigFileName,
            FileMode.Open,
            FileAccess.Read,
            FileShare.Read))
        {
            var configs = JsonHelper.ReadFromJsonStream<NetworkObserverConfig[]>(stream);

            // A file that does not deserialize to any entries is treated like an empty list.
            if (configs != null)
            {
                foreach (var netConfig in configs)
                {
                    var deployedApps = await FabricClientInstance.QueryManager.GetDeployedApplicationListAsync(
                        NodeName,
                        new Uri(netConfig.TargetApp)).ConfigureAwait(false);

                    // Only keep entries whose target application is deployed on this node.
                    if (deployedApps == null || deployedApps.Count < 1)
                    {
                        continue;
                    }

                    this.userConfig.Add(netConfig);
                }
            }
        }

        if (this.userConfig.Count == 0)
        {
            HealthReporter.ReportFabricObserverServiceHealth(
                FabricServiceContext.ServiceName.ToString(),
                ObserverName,
                HealthState.Warning,
                "Missing required configuration data: endpoints.");

            return false;
        }
    }

    return true;
}
/// <summary>
/// Loads the application targets from the observer's JSON configuration file and populates the
/// replica/instance list for each valid target.
/// This runs each time ObserveAsync is run to ensure that any new app targets and config changes will
/// be up to date across observer loop iterations.
/// </summary>
/// <returns>
/// <c>true</c> when at least one target with a required setting was processed; <c>false</c> when the
/// configuration file is missing, contains no targets, or none of the targets specify an app name or app type.
/// </returns>
private async Task<bool> InitializeAsync()
{
    if (ReplicaOrInstanceList == null)
    {
        ReplicaOrInstanceList = new List<ReplicaOrInstanceMonitoringInfo>();
    }

    if (!IsTestRun)
    {
        configSettings.Initialize(
            FabricServiceContext.CodePackageActivationContext.GetConfigurationPackageObject(
                ObserverConstants.ObserverConfigurationPackageName)?.Settings,
            ConfigurationSectionName,
            "AppObserverDataFileName");
    }

    // For unit tests, this path will be an empty string and not generate an exception.
    var appObserverConfigFileName = Path.Combine(
        ConfigPackagePath ?? string.Empty,
        configSettings.AppObserverConfigFileName ?? string.Empty);

    if (!File.Exists(appObserverConfigFileName))
    {
        WriteToLogWithLevel(
            ObserverName,
            $"Will not observe resource consumption as no configuration parameters have been supplied. | {NodeName}",
            LogLevel.Information);

        return false;
    }

    // This code runs each time ObserveAsync is called,
    // so clear app list and deployed replica/instance list in case a new app has been added to watch list.
    if (this.userTargetList.Count > 0)
    {
        this.userTargetList.Clear();
        ReplicaOrInstanceList.Clear();
    }

    if (this.deployedTargetList.Count > 0)
    {
        this.deployedTargetList.Clear();
    }

    // Scope the stream so the file handle is released before the awaited Fabric queries below.
    using (Stream stream = new FileStream(
        appObserverConfigFileName,
        FileMode.Open,
        FileAccess.Read,
        FileShare.Read))
    {
        if (stream.Length > 0
            && JsonHelper.IsJson<List<ApplicationInfo>>(File.ReadAllText(appObserverConfigFileName)))
        {
            this.userTargetList.AddRange(JsonHelper.ReadFromJsonStream<ApplicationInfo[]>(stream));
        }
    }

    // Are any of the config-supplied apps deployed?.
    if (this.userTargetList.Count == 0)
    {
        WriteToLogWithLevel(
            ObserverName,
            $"Will not observe resource consumption as no configuration parameters have been supplied. | {NodeName}",
            LogLevel.Information);

        return false;
    }

    int settingSFail = 0;

    foreach (var application in this.userTargetList)
    {
        if (string.IsNullOrWhiteSpace(application.TargetApp)
            && string.IsNullOrWhiteSpace(application.TargetAppType))
        {
            HealthReporter.ReportFabricObserverServiceHealth(
                FabricServiceContext.ServiceName.ToString(),
                ObserverName,
                HealthState.Warning,
                $"Initialize() | {application.TargetApp}: Required setting, target, is not set.");

            settingSFail++;

            continue;
        }

        // Same blank test as the validation above, so a whitespace-only app type falls back to the app name.
        if (!string.IsNullOrWhiteSpace(application.TargetAppType))
        {
            await SetDeployedApplicationReplicaOrInstanceListAsync(
                null,
                application.TargetAppType).ConfigureAwait(false);
        }
        else
        {
            await SetDeployedApplicationReplicaOrInstanceListAsync(new Uri(application.TargetApp))
                .ConfigureAwait(false);
        }
    }

    // No required settings supplied for deployed application(s).
    // Evaluated after the loop: inside it, invalid entries skip ahead with continue,
    // so the failure count could never equal the list size at the point of the check.
    if (settingSFail == this.userTargetList.Count)
    {
        return false;
    }

    foreach (var repOrInst in ReplicaOrInstanceList)
    {
        ObserverLogger.LogInfo(
            $"Will observe resource consumption by {repOrInst.ApplicationName?.OriginalString} " +
            $"on Node {NodeName}.");
    }

    return true;
}
/// <summary>
/// Samples CPU, memory and TCP port usage for every service process hosting a replica or instance of the
/// given application target and appends the samples to the observer's per-process data containers.
/// </summary>
/// <param name="application">Target description; its TargetType is used when set, otherwise its Target URI.</param>
/// <returns>A task that completes when all host processes of the target have been sampled.</returns>
/// <remarks>
/// Win32Exception, ArgumentException and InvalidOperationException raised while sampling a process are logged
/// and the next process is sampled. Any other exception is rethrown; it is logged as a warning first unless
/// it is an <see cref="OperationCanceledException"/>.
/// </remarks>
private async Task MonitorAppAsync(ApplicationInfo application)
{
    List<ReplicaOrInstanceMonitoringInfo> repOrInstList;

    if (!string.IsNullOrEmpty(application.TargetType))
    {
        repOrInstList = await GetDeployedApplicationReplicaOrInstanceListAsync(null, application.TargetType).ConfigureAwait(true);
    }
    else
    {
        repOrInstList = await GetDeployedApplicationReplicaOrInstanceListAsync(new Uri(application.Target)).ConfigureAwait(true);
    }

    if (repOrInstList.Count == 0)
    {
        ObserverLogger.LogInfo("No target or targetType specified.");

        return;
    }

    Process currentProcess = null;

    foreach (var repOrInst in repOrInstList)
    {
        Token.ThrowIfCancellationRequested();

        int processid = (int)repOrInst.HostProcessId;
        var cpuUsage = new CpuUsage();

        try
        {
            // App level.
            currentProcess = Process.GetProcessById(processid);

            Token.ThrowIfCancellationRequested();

            var procName = currentProcess.ProcessName;
            string appNameOrType = GetAppNameOrType(repOrInst);

            var id = $"{appNameOrType}:{procName}";

            // Add new resource data structures for each app service process.
            if (!allAppCpuData.Any(list => list.Id == id))
            {
                allAppCpuData.Add(new FabricResourceUsageData<int>(ErrorWarningProperty.TotalCpuTime, id));
                allAppMemDataMB.Add(new FabricResourceUsageData<long>(ErrorWarningProperty.TotalMemoryConsumptionMb, id));
                allAppMemDataPercent.Add(new FabricResourceUsageData<double>(ErrorWarningProperty.TotalMemoryConsumptionPct, id));
                allAppTotalActivePortsData.Add(new FabricResourceUsageData<int>(ErrorWarningProperty.TotalActivePorts, id));
                allAppEphemeralPortsData.Add(new FabricResourceUsageData<int>(ErrorWarningProperty.TotalEphemeralPorts, id));
            }

            // Resolve the containers once per process instead of searching the lists on every sample.
            var cpuData = allAppCpuData.FirstOrDefault(x => x.Id == id);
            var memDataMb = allAppMemDataMB.FirstOrDefault(x => x.Id == id);
            var memDataPercent = allAppMemDataPercent.FirstOrDefault(x => x.Id == id);
            var activePortsData = allAppTotalActivePortsData.FirstOrDefault(x => x.Id == id);
            var ephemeralPortsData = allAppEphemeralPortsData.FirstOrDefault(x => x.Id == id);

            // CPU (all cores).
            int i = Environment.ProcessorCount + 10;

            while (!currentProcess.HasExited && i > 0)
            {
                Token.ThrowIfCancellationRequested();

                int cpu = cpuUsage.GetCpuUsageProcess(currentProcess);

                if (cpu >= 0)
                {
                    cpuData.Data.Add(cpu);
                }

                // Memory (private working set (process)).
                var processMem = perfCounters.PerfCounterGetProcessPrivateWorkingSetMb(currentProcess.ProcessName);
                memDataMb.Data.Add((long)processMem);

                // Memory (percent in use (total)).
                var memInfo = ObserverManager.TupleGetTotalPhysicalMemorySizeAndPercentInUse();
                long totalMem = memInfo.TotalMemory;

                if (totalMem > -1)
                {
                    double usedPct = Math.Round(((double)(processMem * 100)) / (totalMem * 1024), 2);
                    memDataPercent.Data.Add(usedPct);
                }

                --i;

                // Pause between samples without blocking the thread; cancellation ends the wait early.
                await Task.Delay(250, Token).ConfigureAwait(true);
            }

            // Total and Ephemeral ports..
            activePortsData.Data.Add(NetworkUsage.GetActivePortCount(currentProcess.Id));
            ephemeralPortsData.Data.Add(NetworkUsage.GetActiveEphemeralPortCount(currentProcess.Id));
        }
        catch (Exception e) when (e is Win32Exception || e is ArgumentException || e is InvalidOperationException)
        {
            // The process is gone or not accessible; move on to the next replica/instance.
            WriteToLogWithLevel(
                ObserverName,
                $"MonitorAsync failed to find current service process for {application.Target}{Environment.NewLine}{e}",
                LogLevel.Information);
        }
        catch (Exception e) when (!(e is OperationCanceledException))
        {
            WriteToLogWithLevel(
                ObserverName,
                $"Unhandled exception in MonitorAsync: \n {e}",
                LogLevel.Warning);

            throw;
        }
        finally
        {
            currentProcess?.Dispose();
            currentProcess = null;
        }
    }
}
/// <summary>
/// Loads the application targets from the observer's JSON configuration file.
/// Initialize() runs each time ObserveAsync is run to ensure that any new app targets and config changes
/// will be up to date across observer loop iterations.
/// </summary>
/// <returns>
/// <c>true</c> for test runs and when at least one target specifies a target or target type;
/// <c>false</c> when the configuration file is missing, contains no targets, or no target has a required setting.
/// </returns>
private bool Initialize()
{
    if (replicaOrInstanceList == null)
    {
        replicaOrInstanceList = new List<ReplicaOrInstanceMonitoringInfo>();
    }

    // Is this a unit test run?
    if (IsTestRun)
    {
        replicaOrInstanceList.Add(new ReplicaOrInstanceMonitoringInfo
        {
            ApplicationName = new Uri("fabric:/TestApp"),
            PartitionId = Guid.NewGuid(),
            HostProcessId = 0,
            ReplicaOrInstanceId = default(long),
        });

        return true;
    }

    ConfigSettings.Initialize(
        FabricServiceContext.CodePackageActivationContext.GetConfigurationPackageObject(
            ObserverConstants.ObserverConfigurationPackageName)?.Settings,
        ObserverConstants.AppObserverConfigurationSectionName,
        "AppObserverDataFileName");

    var appObserverConfigFileName = Path.Combine(configPackagePath, ConfigSettings.AppObserverDataFileName);

    if (!File.Exists(appObserverConfigFileName))
    {
        WriteToLogWithLevel(
            ObserverName,
            $"Will not observe resource consumption as no configuration parameters have been supplied. | {NodeName}",
            LogLevel.Information);

        return false;
    }

    // this code runs each time ObserveAsync is called,
    // so clear app list and deployed replica/instance list in case a new app has been added to watch list.
    if (targetList.Count > 0)
    {
        targetList.Clear();
        replicaOrInstanceList.Clear();
    }

    using (Stream stream = new FileStream(appObserverConfigFileName, FileMode.Open, FileAccess.Read, FileShare.Read))
    {
        // Any non-empty file is handed to the JSON check; short but valid configurations must not be skipped.
        if (stream.Length > 0
            && JsonHelper.IsJson<List<ApplicationInfo>>(File.ReadAllText(appObserverConfigFileName)))
        {
            targetList.AddRange(JsonHelper.ReadFromJsonStream<ApplicationInfo[]>(stream));
        }
    }

    // Are any of the config-supplied apps deployed?.
    if (targetList.Count == 0)
    {
        WriteToLogWithLevel(
            ObserverName,
            $"Will not observe resource consumption as no configuration parameters have been supplied. | {NodeName}",
            LogLevel.Information);

        return false;
    }

    int settingsFail = 0;

    foreach (var application in targetList)
    {
        if (string.IsNullOrWhiteSpace(application.Target)
            && string.IsNullOrWhiteSpace(application.TargetType))
        {
            HealthReporter.ReportFabricObserverServiceHealth(
                FabricServiceContext.ServiceName.ToString(),
                ObserverName,
                HealthState.Warning,
                $"Initialize() | {application.Target}: Required setting, target, is not set.");

            settingsFail++;

            continue;
        }

        ObserverLogger.LogInfo(
            $"Will observe resource consumption by {application.Target ?? application.TargetType} " +
            $"on Node {NodeName}.");
    }

    // No required settings supplied for deployed application(s).
    // Evaluated after the loop: inside it, invalid entries skip ahead with continue,
    // so the failure count could never equal the list size at the point of the check.
    if (settingsFail == targetList.Count)
    {
        return false;
    }

    return true;
}
/// <summary> /// This function processes numeric data held in FRUD instances and generates Application or Node level Health Reports depending on supplied thresholds. /// </summary> /// <typeparam name="T">This represents the numeric type of data this function will operate on.</typeparam> /// <param name="data">FabricResourceUsageData instance.</param> /// <param name="thresholdError">Error threshold (numeric)</param> /// <param name="thresholdWarning">Warning threshold (numeric)</param> /// <param name="healthReportTtl">Health report Time to Live (TimeSpan)</param> /// <param name="healthReportType">HealthReport type. Note, only Application and Node health report types are supported.</param> /// <param name="replicaOrInstance">Replica or Instance information contained in a type.</param> /// <param name="dumpOnError">Wheter or not to dump process if Error threshold has been reached.</param> public void ProcessResourceDataReportHealth <T>( FabricResourceUsageData <T> data, T thresholdError, T thresholdWarning, TimeSpan healthReportTtl, HealthReportType healthReportType = HealthReportType.Node, ReplicaOrInstanceMonitoringInfo replicaOrInstance = null, bool dumpOnError = false) where T : struct { if (data == null) { throw new ArgumentException("Supply all required parameters with non-null value."); } if (healthReportType != HealthReportType.Application && healthReportType != HealthReportType.Node) { this.ObserverLogger.LogWarning($"ProcessResourceDataReportHealth: Unsupported HealthReport type -> {Enum.GetName(typeof(HealthReportType), healthReportType)}"); return; } var thresholdName = "Minimum"; bool warningOrError = false; string repPartitionId = null, repOrInstanceId = null, name = null, id = null, procName = null; T threshold = thresholdWarning; var healthState = HealthState.Ok; Uri appName = null; Uri serviceName = null; TelemetryData telemetryData = null; if (healthReportType == HealthReportType.Application) { if (replicaOrInstance != null) { repPartitionId = 
$"Partition: {replicaOrInstance.PartitionId}"; repOrInstanceId = $"Replica: {replicaOrInstance.ReplicaOrInstanceId}"; // Create a unique id which will be used for health Warnings and OKs (clears). appName = replicaOrInstance.ApplicationName; serviceName = replicaOrInstance.ServiceName; name = appName.OriginalString.Replace("fabric:/", string.Empty); } else { appName = new Uri("fabric:/System"); name = data.Id; } id = name + "_" + data.Property.Replace(" ", string.Empty); // The health event description will be a serialized instance of telemetryData, // so it should be completely constructed (filled with data) regardless // of user telemetry settings. telemetryData = new TelemetryData(FabricClientInstance, Token) { ApplicationName = appName?.OriginalString ?? string.Empty, Code = FOErrorWarningCodes.Ok, HealthState = Enum.GetName(typeof(HealthState), HealthState.Ok), NodeName = NodeName, ObserverName = ObserverName, Metric = data.Property, Value = Math.Round(data.AverageDataValue, 1), PartitionId = replicaOrInstance?.PartitionId.ToString(), ReplicaId = replicaOrInstance?.ReplicaOrInstanceId.ToString(), ServiceName = serviceName?.OriginalString ?? string.Empty, Source = ObserverConstants.FabricObserverName, }; try { if (replicaOrInstance != null && replicaOrInstance.HostProcessId > 0) { procName = Process.GetProcessById((int)replicaOrInstance.HostProcessId).ProcessName; } else { // The name of the target service process is always the id for data containers coming from FSO. procName = data.Id; } telemetryData.ServiceName = procName; if (IsTelemetryProviderEnabled && IsObserverTelemetryEnabled) { _ = TelemetryClient?.ReportMetricAsync( telemetryData, Token).ConfigureAwait(false); } if (IsEtwEnabled) { Logger.EtwLogger?.Write( ObserverConstants.FabricObserverETWEventName, new { ApplicationName = appName?.OriginalString ?? 
string.Empty, Code = FOErrorWarningCodes.Ok, HealthState = Enum.GetName(typeof(HealthState), HealthState.Ok), NodeName, ObserverName, Metric = data.Property, Value = Math.Round(data.AverageDataValue, 1), PartitionId = replicaOrInstance?.PartitionId.ToString(), ReplicaId = replicaOrInstance?.ReplicaOrInstanceId.ToString(), ServiceName = procName, Source = ObserverConstants.FabricObserverName, }); } } catch (ArgumentException) { return; } catch (InvalidOperationException) { return; } } else { string drive = string.Empty; if (ObserverName == ObserverConstants.DiskObserverName) { drive = $"{data.Id}: "; if (RuntimeInformation.IsOSPlatform(OSPlatform.Windows)) { drive = $"{data.Id.Remove(1, 2)}: "; } } // The health event description will be a serialized instance of telemetryData, // so it should be completely constructed (filled with data) regardless // of user telemetry settings. telemetryData = new TelemetryData(FabricClientInstance, Token) { Code = FOErrorWarningCodes.Ok, HealthState = Enum.GetName(typeof(HealthState), HealthState.Ok), NodeName = NodeName, ObserverName = ObserverName, Metric = $"{drive}{data.Property}", Source = ObserverConstants.FabricObserverName, Value = Math.Round(data.AverageDataValue, 1), }; if (IsTelemetryProviderEnabled && IsObserverTelemetryEnabled) { _ = TelemetryClient?.ReportMetricAsync( telemetryData, Token); } if (IsEtwEnabled) { Logger.EtwLogger?.Write( ObserverConstants.FabricObserverETWEventName, new { Code = FOErrorWarningCodes.Ok, HealthState = Enum.GetName(typeof(HealthState), HealthState.Ok), NodeName, ObserverName, Metric = $"{drive}{data.Property}", Source = ObserverConstants.FabricObserverName, Value = Math.Round(data.AverageDataValue, 1), }); } } // Health Error if (data.IsUnhealthy(thresholdError)) { thresholdName = "Maximum"; threshold = thresholdError; warningOrError = true; healthState = HealthState.Error; // This is primarily useful for AppObserver, but makes sense to be // part of the base class for future use, like 
for FSO. if (replicaOrInstance != null && dumpOnError) { try { int procId = (int)replicaOrInstance.HostProcessId; if (!this.serviceDumpCountDictionary.ContainsKey(procName)) { this.serviceDumpCountDictionary.Add(procName, 0); } if (this.serviceDumpCountDictionary[procName] < this.maxDumps) { // DumpServiceProcess defaults to a Full dump with // process memory, handles and thread data. bool success = DumpServiceProcess(procId); if (success) { this.serviceDumpCountDictionary[procName]++; } } } // Ignore these, it just means no dmp will be created.This is not // critical to FO. Log as info, not warning. catch (Exception e) when(e is ArgumentException || e is InvalidOperationException) { ObserverLogger.LogInfo($"Unable to generate dmp file:{Environment.NewLine}{e}"); } } } // Health Warning if (!warningOrError && data.IsUnhealthy(thresholdWarning)) { warningOrError = true; healthState = HealthState.Warning; } if (warningOrError) { string errorWarningCode = null; switch (data.Property) { case ErrorWarningProperty.TotalCpuTime when healthReportType == HealthReportType.Application: errorWarningCode = (healthState == HealthState.Error) ? FOErrorWarningCodes.AppErrorCpuPercent : FOErrorWarningCodes.AppWarningCpuPercent; break; case ErrorWarningProperty.TotalCpuTime: errorWarningCode = (healthState == HealthState.Error) ? FOErrorWarningCodes.NodeErrorCpuPercent : FOErrorWarningCodes.NodeWarningCpuPercent; break; case ErrorWarningProperty.DiskSpaceUsagePercentage: errorWarningCode = (healthState == HealthState.Error) ? FOErrorWarningCodes.NodeErrorDiskSpacePercent : FOErrorWarningCodes.NodeWarningDiskSpacePercent; break; case ErrorWarningProperty.DiskSpaceUsageMb: errorWarningCode = (healthState == HealthState.Error) ? FOErrorWarningCodes.NodeErrorDiskSpaceMB : FOErrorWarningCodes.NodeWarningDiskSpaceMB; break; case ErrorWarningProperty.TotalMemoryConsumptionMb when healthReportType == HealthReportType.Application: errorWarningCode = (healthState == HealthState.Error) ? 
FOErrorWarningCodes.AppErrorMemoryMB : FOErrorWarningCodes.AppWarningMemoryMB; break; case ErrorWarningProperty.TotalMemoryConsumptionMb: errorWarningCode = (healthState == HealthState.Error) ? FOErrorWarningCodes.NodeErrorMemoryMB : FOErrorWarningCodes.NodeWarningMemoryMB; break; case ErrorWarningProperty.TotalMemoryConsumptionPct when replicaOrInstance != null: errorWarningCode = (healthState == HealthState.Error) ? FOErrorWarningCodes.AppErrorMemoryPercent : FOErrorWarningCodes.AppWarningMemoryPercent; break; case ErrorWarningProperty.TotalMemoryConsumptionPct: errorWarningCode = (healthState == HealthState.Error) ? FOErrorWarningCodes.NodeErrorMemoryPercent : FOErrorWarningCodes.NodeWarningMemoryPercent; break; case ErrorWarningProperty.DiskAverageQueueLength: errorWarningCode = (healthState == HealthState.Error) ? FOErrorWarningCodes.NodeErrorDiskAverageQueueLength : FOErrorWarningCodes.NodeWarningDiskAverageQueueLength; break; case ErrorWarningProperty.TotalActiveFirewallRules: errorWarningCode = (healthState == HealthState.Error) ? FOErrorWarningCodes.ErrorTooManyFirewallRules : FOErrorWarningCodes.WarningTooManyFirewallRules; break; case ErrorWarningProperty.TotalActivePorts when healthReportType == HealthReportType.Application: errorWarningCode = (healthState == HealthState.Error) ? FOErrorWarningCodes.AppErrorTooManyActiveTcpPorts : FOErrorWarningCodes.AppWarningTooManyActiveTcpPorts; break; case ErrorWarningProperty.TotalActivePorts: errorWarningCode = (healthState == HealthState.Error) ? FOErrorWarningCodes.NodeErrorTooManyActiveTcpPorts : FOErrorWarningCodes.NodeWarningTooManyActiveTcpPorts; break; case ErrorWarningProperty.TotalEphemeralPorts when healthReportType == HealthReportType.Application: errorWarningCode = (healthState == HealthState.Error) ? 
FOErrorWarningCodes.AppErrorTooManyActiveEphemeralPorts : FOErrorWarningCodes.AppWarningTooManyActiveEphemeralPorts; break; case ErrorWarningProperty.TotalEphemeralPorts: errorWarningCode = (healthState == HealthState.Error) ? FOErrorWarningCodes.NodeErrorTooManyActiveEphemeralPorts : FOErrorWarningCodes.NodeWarningTooManyActiveEphemeralPorts; break; } var healthMessage = new StringBuilder(); string drive = string.Empty; if (ObserverName == ObserverConstants.DiskObserverName) { drive = $"{data.Id}: "; if (RuntimeInformation.IsOSPlatform(OSPlatform.Windows)) { drive = $"{data.Id.Remove(1, 2)}: "; } } _ = healthMessage.Append($"{drive}{data.Property} is at or above the specified {thresholdName} limit ({threshold}{data.Units})"); _ = healthMessage.AppendLine($" - {data.Property}: {Math.Round(data.AverageDataValue)}{data.Units}"); // The health event description will be a serialized instance of telemetryData, // so it should be completely constructed (filled with data) regardless // of user telemetry settings. telemetryData.ApplicationName = appName?.OriginalString ?? string.Empty; telemetryData.Code = errorWarningCode; if (replicaOrInstance != null && !string.IsNullOrEmpty(replicaOrInstance.ContainerId)) { telemetryData.ContainerId = replicaOrInstance.ContainerId; } telemetryData.HealthState = Enum.GetName(typeof(HealthState), healthState); telemetryData.HealthEventDescription = healthMessage.ToString(); telemetryData.Metric = $"{drive}{data.Property}"; telemetryData.ServiceName = serviceName?.OriginalString ?? string.Empty; telemetryData.Source = ObserverConstants.FabricObserverName; telemetryData.Value = Math.Round(data.AverageDataValue, 1); // Send Health Report as Telemetry event (perhaps it signals an Alert from App Insights, for example.). if (IsTelemetryProviderEnabled && IsObserverTelemetryEnabled) { _ = TelemetryClient?.ReportHealthAsync( telemetryData, Token); } // ETW. 
if (IsEtwEnabled) { Logger.EtwLogger?.Write( ObserverConstants.FabricObserverETWEventName, new { ApplicationName = appName?.OriginalString ?? string.Empty, Code = errorWarningCode, ContainerId = replicaOrInstance != null ? replicaOrInstance.ContainerId ?? string.Empty : string.Empty, HealthState = Enum.GetName(typeof(HealthState), healthState), HealthEventDescription = healthMessage.ToString(), Metric = $"{drive}{data.Property}", Node = NodeName, ServiceName = serviceName?.OriginalString ?? string.Empty, Source = ObserverConstants.FabricObserverName, Value = Math.Round(data.AverageDataValue, 1), }); } var healthReport = new HealthReport { AppName = appName, Code = errorWarningCode, EmitLogEvent = true, HealthData = telemetryData, HealthMessage = healthMessage.ToString(), HealthReportTimeToLive = healthReportTtl, ReportType = healthReportType, State = healthState, NodeName = NodeName, Observer = ObserverName, ResourceUsageDataProperty = data.Property, }; if (!AppNames.Any(a => a == appName?.OriginalString)) { AppNames.Add(appName?.OriginalString); } // From FSO. if (replicaOrInstance == null && healthReportType == HealthReportType.Application) { HealthReportProperties.Add(id); } else { if (HealthReportProperties.Count == 0) { HealthReportProperties.Add(ObserverName switch { ObserverConstants.AppObserverName => "ApplicationHealth", ObserverConstants.CertificateObserverName => "SecurityHealth", ObserverConstants.DiskObserverName => "DiskHealth", ObserverConstants.FabricSystemObserverName => "FabricSystemServiceHealth", ObserverConstants.NetworkObserverName => "NetworkHealth", ObserverConstants.OSObserverName => "MachineInformation", ObserverConstants.NodeObserverName => "MachineResourceHealth", _ => $"{data.Property}", });
/// <summary>
/// Windows process dmp creator. Writes a dump (.dmp) file for the specified process
/// into the folder held in <c>dumpsPath</c>.
/// </summary>
/// <param name="processId">Id of the process to dump.</param>
/// <param name="dumpType">
/// Selects which MINIDUMP_TYPE flags are passed to MiniDumpWriteDump.
/// Defaults to <see cref="DumpType.Full"/>.
/// </param>
/// <returns>
/// true when the dump file was written. false when no dumps path is set, the process name
/// is empty, the disk-usage check fails, or an ArgumentException, InvalidOperationException
/// or Win32Exception occurs (these are logged, not rethrown).
/// </returns>
/// <exception cref="ArgumentOutOfRangeException">
/// <paramref name="dumpType"/> is not one of the handled DumpType values.
/// </exception>
public bool DumpServiceProcess(int processId, DumpType dumpType = DumpType.Full)
{
    if (string.IsNullOrEmpty(this.dumpsPath))
    {
        return false;
    }

    string processName = string.Empty;
    NativeMethods.MINIDUMP_TYPE miniDumpType;

    switch (dumpType)
    {
        case DumpType.Full:
            miniDumpType = NativeMethods.MINIDUMP_TYPE.MiniDumpWithFullMemory |
                           NativeMethods.MINIDUMP_TYPE.MiniDumpWithFullMemoryInfo |
                           NativeMethods.MINIDUMP_TYPE.MiniDumpWithHandleData |
                           NativeMethods.MINIDUMP_TYPE.MiniDumpWithThreadInfo |
                           NativeMethods.MINIDUMP_TYPE.MiniDumpWithUnloadedModules;
            break;

        case DumpType.MiniPlus:
            miniDumpType = NativeMethods.MINIDUMP_TYPE.MiniDumpWithPrivateReadWriteMemory |
                           NativeMethods.MINIDUMP_TYPE.MiniDumpWithDataSegs |
                           NativeMethods.MINIDUMP_TYPE.MiniDumpWithHandleData |
                           NativeMethods.MINIDUMP_TYPE.MiniDumpWithFullMemoryInfo |
                           NativeMethods.MINIDUMP_TYPE.MiniDumpWithThreadInfo |
                           NativeMethods.MINIDUMP_TYPE.MiniDumpWithUnloadedModules;
            break;

        case DumpType.Mini:
            miniDumpType = NativeMethods.MINIDUMP_TYPE.MiniDumpWithIndirectlyReferencedMemory |
                           NativeMethods.MINIDUMP_TYPE.MiniDumpScanMemory;
            break;

        default:
            throw new ArgumentOutOfRangeException(nameof(dumpType), dumpType, null);
    }

    try
    {
        // Look the process up exactly once and dispose it when done. The Process object owns
        // the native handle passed to MiniDumpWriteDump, so it must stay alive until the dump
        // has been written. A single lookup also guarantees that the name and the handle
        // belong to the same process instance.
        using (Process process = Process.GetProcessById(processId))
        {
            // This is to ensure friendly-name of resulting dmp file.
            processName = process.ProcessName;

            if (string.IsNullOrEmpty(processName))
            {
                return false;
            }

            IntPtr processHandle = process.Handle;
            processName += "_" + DateTime.Now.ToString("ddMMyyyyHHmmss") + ".dmp";

            // Check disk space availability before writing dump file.
            // This will not work on Linux (the first two characters of the path are
            // treated as the drive name, e.g. "C:").
            string driveName = this.dumpsPath.Substring(0, 2);

            if (DiskUsage.GetCurrentDiskSpaceUsedPercent(driveName) > 90)
            {
                HealthReporter.ReportFabricObserverServiceHealth(
                    FabricServiceContext.ServiceName.OriginalString,
                    ObserverName,
                    HealthState.Warning,
                    "Not enough disk space available for dump file creation.");

                return false;
            }

            using (var file = File.Create(Path.Combine(this.dumpsPath, processName)))
            {
                if (!NativeMethods.MiniDumpWriteDump(
                        processHandle,
                        (uint)processId,
                        file.SafeFileHandle,
                        miniDumpType,
                        IntPtr.Zero,
                        IntPtr.Zero,
                        IntPtr.Zero))
                {
                    throw new Win32Exception(Marshal.GetLastWin32Error());
                }
            }

            return true;
        }
    }
    catch (Exception e) when (e is ArgumentException || e is InvalidOperationException || e is Win32Exception)
    {
        // Best effort: a failed dump is logged and reported to the caller as false.
        ObserverLogger.LogInfo(
            $"Unable to generate dump file {processName} with error{Environment.NewLine}{e}");
    }

    return false;
}
/// <summary>
/// Runs the collected disk data through ProcessResourceDataReportHealth and, when the
/// observer web app is deployed, writes the accumulated disk info text to "disks.txt"
/// in the observer log folder.
/// </summary>
/// <param name="token">Cancellation token, checked before each data item is processed.</param>
/// <returns>A completed task.</returns>
/// <exception cref="OperationCanceledException">
/// Propagated to the caller when <paramref name="token"/> is cancelled.
/// </exception>
public override Task ReportAsync(CancellationToken token)
{
    try
    {
        var timeToLiveWarning = SetHealthReportTimeToLive();

        // User-supplied Disk Space Usage % thresholds from Settings.xml.
        foreach (var data in this.DiskSpaceUsagePercentageData)
        {
            token.ThrowIfCancellationRequested();

            ProcessResourceDataReportHealth(
                data,
                DiskSpacePercentErrorThreshold,
                DiskSpacePercentWarningThreshold,
                timeToLiveWarning);
        }

        // User-supplied Average disk queue length thresholds from Settings.xml.
        foreach (var data in this.DiskAverageQueueLengthData)
        {
            token.ThrowIfCancellationRequested();

            ProcessResourceDataReportHealth(
                data,
                AverageQueueLengthErrorThreshold,
                AverageQueueLengthWarningThreshold,
                timeToLiveWarning);
        }

        /* For ETW Only - These calls will just produce ETW (note the thresholds). */
        if (IsEtwEnabled)
        {
            // Disk Space Available
            foreach (var data in this.DiskSpaceAvailableMbData)
            {
                token.ThrowIfCancellationRequested();

                ProcessResourceDataReportHealth(
                    data,
                    0,
                    0,
                    timeToLiveWarning);
            }

            // Disk Space Total
            foreach (var data in this.DiskSpaceTotalMbData)
            {
                token.ThrowIfCancellationRequested();

                ProcessResourceDataReportHealth(
                    data,
                    0,
                    0,
                    timeToLiveWarning);
            }
        }

        token.ThrowIfCancellationRequested();

        // This section only needs to run if you have the FabricObserverWebApi app installed.
        if (!ObserverManager.ObserverWebAppDeployed)
        {
            return Task.CompletedTask;
        }

        var diskInfoPath = Path.Combine(ObserverLogger.LogFolderBasePath, "disks.txt");

        _ = ObserverLogger.TryWriteLogFile(diskInfoPath, this.diskInfo.ToString());
        _ = this.diskInfo.Clear();

        return Task.CompletedTask;
    }
    catch (AggregateException e) when (e.InnerException is OperationCanceledException ||
                                       e.InnerException is TaskCanceledException ||
                                       e.InnerException is TimeoutException)
    {
        // Wrapped cancellation/timeout is not treated as a failure.
        return Task.CompletedTask;
    }
    catch (Exception e) when (!(e is OperationCanceledException))
    {
        // Cancellation raised by token.ThrowIfCancellationRequested() is excluded by the filter:
        // it still propagates to the caller, but is not reported as an unhandled failure.
        HealthReporter.ReportFabricObserverServiceHealth(
            FabricServiceContext.ServiceName.OriginalString,
            ObserverName,
            HealthState.Warning,
            $"Unhandled exception in ReportAsync:{Environment.NewLine}{e}");

        throw;
    }
}
/// <summary>
/// Builds a text summary of the Service Fabric configuration values held by this observer,
/// appends the deployed-apps information, and writes the result to "SFInfraInfo.txt" in the
/// observer log folder. A Warning is reported if the file cannot be written.
/// </summary>
/// <param name="token">Cancellation token, checked on entry and again before the file is written.</param>
/// <returns>A task that completes when the report has been written (or the write has failed).</returns>
public override async Task ReportAsync(CancellationToken token)
{
    token.ThrowIfCancellationRequested();

    var report = new StringBuilder();

    _ = report.AppendLine("\nService Fabric information:\n");

    // Each entry is emitted only when its backing value is present.
    if (!string.IsNullOrEmpty(SFVersion))
    {
        _ = report.AppendLine($"Runtime Version: {SFVersion}");
    }

    if (SFBinRoot != null)
    {
        _ = report.AppendLine($"Fabric Bin root directory: {SFBinRoot}");
    }

    if (SFCodePath != null)
    {
        _ = report.AppendLine($"Fabric Code Path: {SFCodePath}");
    }

    if (!string.IsNullOrEmpty(SFDataRoot))
    {
        _ = report.AppendLine($"Data root directory: {SFDataRoot}");
    }

    if (!string.IsNullOrEmpty(SFLogRoot))
    {
        _ = report.AppendLine($"Log root directory: {SFLogRoot}");
    }

    if (SFVolumeDiskServiceEnabled != null)
    {
        _ = report.AppendLine($"Volume Disk Service Enabled: {SFVolumeDiskServiceEnabled}");
    }

    if (unsupportedPreviewFeaturesEnabled != null)
    {
        _ = report.AppendLine($"Unsupported Preview Features Enabled: {unsupportedPreviewFeaturesEnabled}");
    }

    if (SFCompatibilityJsonPath != null)
    {
        _ = report.AppendLine($"Compatibility Json path: {SFCompatibilityJsonPath}");
    }

    if (SFEnableCircularTraceSession != null)
    {
        _ = report.AppendLine($"Enable Circular trace session: {SFEnableCircularTraceSession}");
    }

    var deployedAppsInfo = await GetDeployedAppsInfoAsync(token).ConfigureAwait(true);

    _ = report.Append(deployedAppsInfo);
    _ = report.AppendLine();

    token.ThrowIfCancellationRequested();

    // This file is used by the web application (ObserverWebApi).
    var infoFilePath = Path.Combine(ObserverLogger.LogFolderBasePath, "SFInfraInfo.txt");
    bool written = ObserverLogger.TryWriteLogFile(infoFilePath, report.ToString());

    if (!written)
    {
        HealthReporter.ReportFabricObserverServiceHealth(
            FabricServiceContext.ServiceName.OriginalString,
            ObserverName,
            HealthState.Warning,
            "Unable to create SFInfraInfo.txt file.");
    }

    _ = report.Clear();
}
/// <summary>
/// Publishes the accumulated message: writes it to the local log, reports it to
/// Service Fabric as an Ok node-level health report, optionally forwards it to the
/// telemetry provider and ETW, and then clears the message buffer.
/// </summary>
/// <param name="token">Cancellation token supplied by the caller (not read by this method).</param>
/// <returns>A completed task.</returns>
public override Task ReportAsync(CancellationToken token)
{
    // Snapshot the message once; the same text feeds every sink below.
    string summary = this.message.ToString();

    // Local log.
    ObserverLogger.LogInfo(summary);

    /* Report to Fabric */

    // The property and source id are kept across observer runs so that a later Ok health event
    // carrying the same values can clear error/warning health events when FO is safely taken
    // down (e.g., app is being uninstalled, safe restart of the fabric node it's running on, etc.).
    HealthReportProperties.Add("SomePropertyName");
    HealthReportSourceIds.Add($"{ObserverName}_SomethingUniqueToThisReport");

    var fabricReporter = new ObserverHealthReporter(ObserverLogger, FabricClientInstance);
    int newestPropertyIndex = HealthReportProperties.Count - 1;

    var okReport = new Utilities.HealthReport
    {
        Code = FOErrorWarningCodes.Ok,
        HealthMessage = summary,
        NodeName = NodeName,
        Observer = ObserverName,
        Property = HealthReportProperties[newestPropertyIndex],
        ReportType = HealthReportType.Node,
        State = HealthState.Ok,
    };

    fabricReporter.ReportHealthToServiceFabric(okReport);

    // Emit Telemetry - This will use whatever telemetry provider you have configured in FabricObserver Settings.xml.
    var okTelemetry = new TelemetryData(FabricClientInstance, Token)
    {
        Code = FOErrorWarningCodes.Ok,
        HealthEventDescription = summary,
        HealthState = "Ok",
        NodeName = NodeName,
        ObserverName = ObserverName,
        Source = ObserverConstants.FabricObserverName,
    };

    bool sendTelemetry = IsTelemetryProviderEnabled && IsObserverTelemetryEnabled;

    if (sendTelemetry)
    {
        _ = TelemetryClient?.ReportHealthAsync(okTelemetry, Token);
    }

    // ETW.
    if (IsEtwEnabled)
    {
        Logger.EtwLogger?.Write(
            ObserverConstants.FabricObserverETWEventName,
            new
            {
                Code = FOErrorWarningCodes.Ok,
                HealthEventDescription = summary,
                HealthState = "Ok",
                NodeName,
                ObserverName,
                Source = ObserverConstants.FabricObserverName,
            });
    }

    this.message.Clear();

    return Task.CompletedTask;
}