/// <inheritdoc/>
public override Task ReportAsync(CancellationToken token)
{
    this.Token.ThrowIfCancellationRequested();

    var timeToLiveWarning = this.SetTimeToLiveWarning(this.runtime.Seconds);

    // Informational (Ok) report carrying the current port counters.
    var portInformationReport = new Utilities.HealthReport
    {
        Observer = this.ObserverName,
        NodeName = this.NodeName,
        HealthMessage = $"Number of ports in use by Fabric services: {this.TotalActivePortCount}\n" +
                        $"Number of ephemeral ports in use by Fabric services: {this.TotalActiveEphemeralPortCount}",
        State = HealthState.Ok,
        HealthReportTimeToLive = timeToLiveWarning,
    };

    // TODO: Report on port count based on thresholds PortCountWarning/Error...
    this.HealthReporter.ReportHealthToServiceFabric(portInformationReport);

    // Reset ports counters...
    this.TotalActivePortCount = 0;
    this.TotalActiveEphemeralPortCount = 0;

    // CPU
    this.ProcessResourceDataList(
        this.allCpuData,
        this.CpuErrorUsageThresholdPct,
        this.CpuWarnUsageThresholdPct);

    // Memory
    this.ProcessResourceDataList(
        this.allMemData,
        this.MemErrorUsageThresholdMB,
        this.MemWarnUsageThresholdMB);

    // Disk IO - Reads
    this.ProcessResourceDataList(
        this.allAppDiskReadsData,
        this.DiskErrorIOReadsThresholdMS,
        this.DiskWarnIOReadsThresholdMS);

    // Disk IO - Writes
    this.ProcessResourceDataList(
        this.allAppDiskWritesData,
        this.DiskErrorIOWritesThresholdMS,
        this.DiskWarnIOWritesThresholdMS);

    // Windows Event Log
    if (this.monitorWinEventLog)
    {
        // SF Eventlog Errors?
        // Write this out to a new file, for use by the web front end log viewer...
        // Format = HTML...
        int count = this.evtRecordList.Count();
        var logPath = Path.Combine(this.ObserverLogger.LogFolderBasePath, "EventVwrErrors.txt");

        // Remove existing file. Best effort: a failed delete is deliberately ignored.
        if (File.Exists(logPath))
        {
            try
            {
                File.Delete(logPath);
            }
            catch (IOException)
            {
            }
            catch (UnauthorizedAccessException)
            {
            }
        }

        if (count >= 10)
        {
            var sb = new StringBuilder();

            sb.AppendLine("<br/><div><strong>" +
                          "<a href='javascript:toggle(\"evtContainer\")'>" +
                          "<div id=\"plus\" style=\"display: inline; font-size: 25px;\">+</div> " + count +
                          " Error Events in ServiceFabric and System</a> " +
                          "Event logs</strong>.<br/></div>");

            sb.AppendLine("<div id='evtContainer' style=\"display: none;\">");

            foreach (var evt in this.evtRecordList.Distinct())
            {
                token.ThrowIfCancellationRequested();

                try
                {
                    // Event text is arbitrary, so it is HTML-encoded before being embedded;
                    // otherwise characters such as '<' or '&' would break (or inject) markup.
                    sb.AppendLine("<div>" + System.Net.WebUtility.HtmlEncode(evt.LogName) + "</div>");
                    sb.AppendLine("<div>" + System.Net.WebUtility.HtmlEncode(evt.LevelDisplayName) + "</div>");

                    if (evt.TimeCreated.HasValue)
                    {
                        sb.AppendLine("<div>" + evt.TimeCreated.Value.ToShortDateString() + "</div>");
                    }

                    foreach (var prop in evt.Properties)
                    {
                        if (prop.Value == null)
                        {
                            continue;
                        }

                        string propertyText = Convert.ToString(prop.Value);

                        if (!string.IsNullOrEmpty(propertyText))
                        {
                            sb.AppendLine("<div>" + System.Net.WebUtility.HtmlEncode(propertyText) + "</div>");
                        }
                    }
                }
                catch (EventLogException)
                {
                    // Reading this record failed; skip it and continue with the next one.
                }
            }

            sb.AppendLine("</div>");

            this.ObserverLogger.TryWriteLogFile(logPath, sb.ToString());
            sb.Clear();
        }

        // Clean up...
        if (count > 0)
        {
            this.evtRecordList.Clear();
        }
    }

    return Task.CompletedTask;
}
/// <inheritdoc/>
public override Task ReportAsync(CancellationToken token)
{
    try
    {
        token.ThrowIfCancellationRequested();

        // Ordinal, case-insensitive comparison: the status is an identifier, not linguistic text.
        // A null status is treated as neither healthy nor unhealthy.
        bool osIsHealthy = string.Equals(this.osStatus, "OK", StringComparison.OrdinalIgnoreCase);

        // OS Health...
        if (this.osStatus != null && !osIsHealthy)
        {
            string healthMessage = $"OS reporting unhealthy: {this.osStatus}";

            var healthReport = new Utilities.HealthReport
            {
                Observer = this.ObserverName,
                NodeName = this.NodeName,
                HealthMessage = healthMessage,
                State = HealthState.Error,
                HealthReportTimeToLive = this.SetTimeToLiveWarning(),
            };

            this.HealthReporter.ReportHealthToServiceFabric(healthReport);

            // This means this observer created a Warning or Error SF Health Report
            this.HasActiveFabricErrorOrWarning = true;

            // Send Health Report as Telemetry (perhaps it signals an Alert from App Insights, for example...)...
            if (this.IsTelemetryEnabled)
            {
                _ = this.ObserverTelemetryClient?.ReportHealthAsync(
                    FabricRuntime.GetActivationContext().ApplicationName,
                    this.FabricServiceContext.ServiceName.OriginalString,
                    "FabricObserver",
                    this.ObserverName,
                    $"{this.NodeName}/OS reporting unhealthy: {this.osStatus}",
                    HealthState.Error,
                    token);
            }
        }
        else if (this.HasActiveFabricErrorOrWarning && osIsHealthy)
        {
            // Clear Error or Warning with an OK Health Report...
            string healthMessage = $"OS reporting healthy: {this.osStatus}";

            var healthReport = new Utilities.HealthReport
            {
                Observer = this.ObserverName,
                NodeName = this.NodeName,
                HealthMessage = healthMessage,
                State = HealthState.Ok,
                HealthReportTimeToLive = default(TimeSpan),
            };

            this.HealthReporter.ReportHealthToServiceFabric(healthReport);

            // Reset internal health state...
            this.HasActiveFabricErrorOrWarning = false;
        }

        var logPath = Path.Combine(this.ObserverLogger.LogFolderBasePath, "SysInfo.txt");
        string lastUpdated = DateTime.UtcNow.ToString("M/d/yyyy HH:mm:ss");

        // This file is used by the web application (log reader...)...
        if (!this.ObserverLogger.TryWriteLogFile(logPath, $"Last updated on {lastUpdated} UTC<br/>{this.osReport}"))
        {
            this.HealthReporter.ReportFabricObserverServiceHealth(
                this.FabricServiceContext.ServiceName.OriginalString,
                this.ObserverName,
                HealthState.Warning,
                "Unable to create SysInfo.txt file...");
        }

        // Informational (Ok) report carrying the collected OS report text.
        var osReport = new Utilities.HealthReport
        {
            Observer = this.ObserverName,
            HealthMessage = this.osReport,
            State = HealthState.Ok,
            NodeName = this.NodeName,
            HealthReportTimeToLive = this.SetTimeToLiveWarning(),
        };

        this.HealthReporter.ReportHealthToServiceFabric(osReport);

        return Task.CompletedTask;
    }
    catch (Exception e) when (!(e is OperationCanceledException))
    {
        // Cancellation is excluded by the filter above: it propagates to the caller
        // unchanged instead of being reported as an Error health event.
        this.HealthReporter.ReportFabricObserverServiceHealth(
            this.FabricServiceContext.ServiceName.OriginalString,
            this.ObserverName,
            HealthState.Error,
            $"Unhandled exception processing OS information: {e.Message}: \n {e.StackTrace}");

        throw;
    }
}
/// <inheritdoc/>
public override async Task ReportAsync(CancellationToken token)
{
    var timeToLiveWarning = this.SetTimeToLiveWarning();

    // Report on connection state.
    for (int j = 0; j < this.userEndpoints.Count; j++)
    {
        token.ThrowIfCancellationRequested();

        // Built once per endpoint; used for the deployment query and for every report below.
        var appUri = new Uri(this.userEndpoints[j].AppTarget);

        var deployedApps = await this.FabricClientInstance.QueryManager
            .GetDeployedApplicationListAsync(this.NodeName, appUri)
            .ConfigureAwait(true);

        // We only care about deployed apps.
        if (deployedApps == null || deployedApps.Count < 1)
        {
            continue;
        }

        for (int i = 0; i < this.connectionStatus.Count; i++)
        {
            token.ThrowIfCancellationRequested();

            var connStatus = this.connectionStatus[i];

            if (!connStatus.Connected)
            {
                this.healthState = HealthState.Warning;
                var healthMessage = "Outbound Internet connection failure detected for endpoint " + connStatus.HostName + "\n";

                var report = new Utilities.HealthReport
                {
                    AppName = appUri,
                    Code = FOErrorWarningCodes.AppWarningNetworkEndpointUnreachable,
                    EmitLogEvent = true,
                    HealthMessage = healthMessage,
                    HealthReportTimeToLive = timeToLiveWarning,
                    State = this.healthState,
                    NodeName = this.NodeName,
                    Observer = this.ObserverName,
                    ReportType = HealthReportType.Application,

                    // The host name is interpolated here; previously the literal text
                    // "connStatus.HostName" was emitted because the braces were missing.
                    ResourceUsageDataProperty = $"{ErrorWarningProperty.InternetConnectionFailure}: {connStatus.HostName}",
                };

                // Send health report Warning and log event locally.
                this.HealthReporter.ReportHealthToServiceFabric(report);

                // This means this observer created a Warning or Error SF Health Report
                this.HasActiveFabricErrorOrWarning = true;

                // Send Health Report as Telemetry (perhaps it signals an Alert from App Insights, for example.).
                if (this.IsTelemetryEnabled)
                {
                    _ = this.ObserverTelemetryClient?.ReportHealthAsync(
                        Utilities.Telemetry.HealthScope.Application,
                        this.userEndpoints[j].AppTarget,
                        HealthState.Warning,
                        $"{this.NodeName}/{FOErrorWarningCodes.AppWarningNetworkEndpointUnreachable}: {healthMessage}",
                        this.ObserverName,
                        this.Token);
                }
            }
            else if (connStatus.Health == HealthState.Warning)
            {
                this.healthState = HealthState.Ok;
                var healthMessage = "Outbound Internet connection test successful.";

                // Clear existing Health Warning.
                var report = new Utilities.HealthReport
                {
                    AppName = appUri,
                    EmitLogEvent = true,
                    HealthMessage = healthMessage,
                    HealthReportTimeToLive = default(TimeSpan),
                    State = this.healthState,
                    NodeName = this.NodeName,
                    Observer = this.ObserverName,
                    ReportType = HealthReportType.Application,
                };

                this.HealthReporter.ReportHealthToServiceFabric(report);

                // Reset health state.
                this.HasActiveFabricErrorOrWarning = false;
            }
        }
    }

    // Drop the entries that are connected; entries that failed are kept.
    this.connectionStatus.RemoveAll(conn => conn.Connected);
    this.connectionStatus.TrimExcess();
}
/// <summary>
/// Evaluates <paramref name="data"/> against the supplied Error and Warning thresholds and
/// emits the matching Service Fabric health report (plus telemetry and ETW events when enabled).
/// When the data is within both thresholds and a previous Error/Warning is still flagged on
/// <paramref name="data"/>, an Ok report is emitted to clear it. The collected data values
/// are cleared before returning.
/// </summary>
/// <typeparam name="T">Type of the data values and thresholds.</typeparam>
/// <param name="data">Resource usage data to evaluate. Must not be null.</param>
/// <param name="thresholdError">Threshold passed to <c>data.IsUnhealthy</c> for the Error check.</param>
/// <param name="thresholdWarning">Threshold passed to <c>data.IsUnhealthy</c> for the Warning check.</param>
/// <param name="healthReportTtl">Time to live assigned to an Error/Warning health report.</param>
/// <param name="healthReportType">Type of health report to emit.</param>
/// <param name="app">Optional application name; must be a valid URI string when supplied.</param>
/// <param name="replicaOrInstance">Optional replica/instance information used to enrich the health message.</param>
/// <param name="dumpOnError">Whether to attempt a process dump when the Error threshold is breached.</param>
/// <exception cref="ArgumentNullException"><paramref name="data"/> is null.</exception>
public void ProcessResourceDataReportHealth<T>(
    FabricResourceUsageData<T> data,
    T thresholdError,
    T thresholdWarning,
    TimeSpan healthReportTtl,
    HealthReportType healthReportType = HealthReportType.Node,
    string app = null,
    ReplicaMonitoringInfo replicaOrInstance = null,
    bool dumpOnError = false)
{
    if (data == null)
    {
        // ArgumentNullException derives from ArgumentException, so existing catch sites still apply.
        throw new ArgumentNullException(nameof(data), "Supply all required parameters with non-null value...");
    }

    var thresholdName = "Minimum";
    string repPartitionId = null, repOrInstanceId = null, name = null, id = null, procName = null;
    T threshold = thresholdWarning;
    var healthState = HealthState.Ok;
    Uri appName = null;

    if (replicaOrInstance != null)
    {
        repPartitionId = $"Partition: {replicaOrInstance.Partitionid}";
        repOrInstanceId = $"Replica: {replicaOrInstance.ReplicaOrInstanceId}";

        // GetProcessById throws (it never returns null) when the process is no longer running.
        // A missing host process must not abort reporting, so procName simply stays null.
        try
        {
            using (var hostProcess = Process.GetProcessById((int)replicaOrInstance.ReplicaHostProcessId))
            {
                procName = hostProcess.ProcessName;
            }
        }
        catch (ArgumentException ae)
        {
            this.ObserverLogger.LogInfo($"Unable to resolve service process name:\n{ae.ToString()}");
        }
        catch (InvalidOperationException ioe)
        {
            this.ObserverLogger.LogInfo($"Unable to resolve service process name:\n{ioe.ToString()}");
        }
    }

    // Create a unique node id which may be used in the case of warnings or OK clears...
    if (app != null)
    {
        appName = new Uri(app);
        name = app.Replace("fabric:/", string.Empty);
        id = name + "_" + data.Property.Replace(" ", string.Empty);
    }

    // Telemetry...
    if (this.IsTelemetryEnabled)
    {
        _ = this.ObserverTelemetryClient?.ReportMetricAsync(
            $"{this.NodeName}-{app}-{data.Id}-{data.Property}",
            data.AverageDataValue,
            this.Token);
    }

    // ETW...
    if (this.IsEtwEnabled)
    {
        Logger.EtwLogger?.Write(
            "FabricObserverDataEvent",
            new
            {
                Level = 0, // Info
                Node = this.NodeName,
                Observer = this.ObserverName,
                Property = data.Property,
                Id = data.Id,
                Value = $"{Math.Round(data.AverageDataValue)}",
                Unit = data.Units,
            });
    }

    if (data.IsUnhealthy(thresholdError))
    {
        // Health Error
        thresholdName = "Maximum";
        threshold = thresholdError;
        healthState = HealthState.Error;

        // This is primarily useful for AppObserver, but makes sense to be
        // part of the base class for future use, like for FSO...
        if (replicaOrInstance != null && procName != null && dumpOnError)
        {
            try
            {
                int procId = (int)replicaOrInstance.ReplicaHostProcessId;

                // Single lookup instead of ContainsKey followed by the indexer.
                if (!this.serviceDumpCountDictionary.TryGetValue(procName, out var dumpCount))
                {
                    dumpCount = 0;
                    this.serviceDumpCountDictionary.Add(procName, dumpCount);
                }

                if (dumpCount < this.maxDumps)
                {
                    // DumpServiceProcess defaults to a Full dump with
                    // process memory, handles and thread data...
                    bool success = this.DumpServiceProcess(procId);

                    if (success)
                    {
                        this.serviceDumpCountDictionary[procName] = dumpCount + 1;
                    }
                }
            }

            // Ignore these, it just means no dmp will be created. This is not
            // critical to FO... Log as info, not warning...
            catch (ArgumentException ae)
            {
                this.ObserverLogger.LogInfo($"Unable to generate dmp file:\n{ae.ToString()}");
            }
            catch (InvalidOperationException ioe)
            {
                this.ObserverLogger.LogInfo($"Unable to generate dmp file:\n{ioe.ToString()}");
            }
        }
    }
    else if (data.IsUnhealthy(thresholdWarning))
    {
        // Health Warning
        healthState = HealthState.Warning;
    }

    if (healthState != HealthState.Ok)
    {
        bool isError = healthState == HealthState.Error;
        string property = data.Property;
        string errorWarningKind = null;

        // The case-insensitive matches use ordinal comparison so they do not depend on the
        // current culture (culture-sensitive lower-casing mis-handles 'I' under e.g. tr-TR).
        if (property.IndexOf("cpu", StringComparison.OrdinalIgnoreCase) >= 0)
        {
            errorWarningKind = isError ? ErrorWarningCode.ErrorCpuTime : ErrorWarningCode.WarningCpuTime;
        }
        else if (property.IndexOf("disk space", StringComparison.OrdinalIgnoreCase) >= 0)
        {
            errorWarningKind = isError ? ErrorWarningCode.ErrorDiskSpace : ErrorWarningCode.WarningDiskSpace;
        }
        else if (property == "Memory Consumption MB")
        {
            errorWarningKind = isError ? ErrorWarningCode.ErrorMemoryCommitted : ErrorWarningCode.WarningMemoryCommitted;
        }
        else if (property == "Memory Consumption %")
        {
            errorWarningKind = isError ? ErrorWarningCode.ErrorMemoryPercentUsed : ErrorWarningCode.WarningMemoryPercentUsed;
        }
        else if (property.Contains("Read"))
        {
            errorWarningKind = isError ? ErrorWarningCode.ErrorDiskIoReads : ErrorWarningCode.WarningDiskIoReads;
        }
        else if (property.Contains("Write"))
        {
            errorWarningKind = isError ? ErrorWarningCode.ErrorDiskIoWrites : ErrorWarningCode.WarningDiskIoWrites;
        }
        else if (property.Contains("Queue"))
        {
            errorWarningKind = isError ? ErrorWarningCode.ErrorDiskAverageQueueLength : ErrorWarningCode.WarningDiskAverageQueueLength;
        }
        else if (property.Contains("Firewall"))
        {
            errorWarningKind = isError ? ErrorWarningCode.ErrorTooManyFirewallRules : ErrorWarningCode.WarningTooManyFirewallRules;
        }
        else if (property.Contains("Ports"))
        {
            errorWarningKind = isError ? ErrorWarningCode.ErrorTooManyActivePorts : ErrorWarningCode.WarningTooManyActivePorts;
        }

        var healthMessage = new StringBuilder();

        if (name != null)
        {
            healthMessage.Append($"{name} (Service Process: {procName}, {repPartitionId}, {repOrInstanceId}): ");
        }

        healthMessage.Append($"{data.Property} is at or above the specified {thresholdName} limit ({threshold}{data.Units})");
        healthMessage.AppendLine($" - Average {data.Property}: {Math.Round(data.AverageDataValue)}{data.Units}");

        string healthDescription = healthMessage.ToString();

        // Set internal fabric health states...
        data.ActiveErrorOrWarning = true;

        // This means this observer created a Warning or Error SF Health Report
        this.HasActiveFabricErrorOrWarning = true;

        var healthReport = new Utilities.HealthReport
        {
            AppName = appName,
            Code = errorWarningKind,
            EmitLogEvent = true,
            HealthMessage = healthDescription,
            HealthReportTimeToLive = healthReportTtl,
            ReportType = healthReportType,
            State = healthState,
            NodeName = this.NodeName,
            Observer = this.ObserverName,
        };

        // Emit a Fabric Health Report and optionally a local log write...
        this.HealthReporter.ReportHealthToServiceFabric(healthReport);

        // Send Health Report as Telemetry event (perhaps it signals an Alert from App Insights, for example...)...
        if (this.IsTelemetryEnabled)
        {
            _ = this.ObserverTelemetryClient?.ReportHealthAsync(
                id,
                this.FabricServiceContext.ServiceName.OriginalString,
                "FabricObserver",
                this.ObserverName,
                $"{this.NodeName}/{errorWarningKind}/{data.Property}/{Math.Round(data.AverageDataValue)}",
                healthState,
                this.Token);
        }

        // ETW...
        if (this.IsEtwEnabled)
        {
            Logger.EtwLogger?.Write(
                "FabricObserverDataEvent",
                new
                {
                    Level = (healthState == HealthState.Warning) ? 1 : 2,
                    Node = this.NodeName,
                    Observer = this.ObserverName,
                    HealthEventErrorCode = errorWarningKind,
                    HealthEventDescription = healthDescription,
                    Property = data.Property,
                    Id = data.Id,
                    Value = $"{Math.Round(data.AverageDataValue)}",
                    Unit = data.Units,
                });
        }
    }
    else if (data.ActiveErrorOrWarning)
    {
        var report = new Utilities.HealthReport
        {
            AppName = appName,
            EmitLogEvent = true,
            HealthMessage = $"{data.Id}: {data.Property} is now within normal/expected range.",
            HealthReportTimeToLive = default(TimeSpan),
            ReportType = healthReportType,
            State = HealthState.Ok,
            NodeName = this.NodeName,
            Observer = this.ObserverName,
        };

        // Emit an Ok Health Report to clear Fabric Health warning...
        this.HealthReporter.ReportHealthToServiceFabric(report);

        // Reset health states...
        data.ActiveErrorOrWarning = false;
        this.HasActiveFabricErrorOrWarning = false;
    }

    // No need to keep data in memory...
    data.Data.Clear();
    data.Data.TrimExcess();
}
/// <inheritdoc/>
public override Task ReportAsync(CancellationToken token)
{
    // Nothing is reported once cancellation has been requested.
    if (token.IsCancellationRequested)
    {
        return Task.CompletedTask;
    }

    // The three warning lists must all be present (this method nulls them again before
    // returning); if any is missing there is nothing to report.
    bool listsPresent = this.ExpiringWarnings != null
                        && this.ExpiredWarnings != null
                        && this.NotFoundWarnings != null;

    if (!listsPresent)
    {
        return Task.CompletedTask;
    }

    bool anyWarnings = this.ExpiringWarnings.Count != 0
                       || this.ExpiredWarnings.Count != 0
                       || this.NotFoundWarnings.Count != 0;

    var reportTtl = this.RunInterval > TimeSpan.MinValue ? this.RunInterval : this.HealthReportTimeToLive;

    // Fields common to both the Ok and the Warning report.
    var healthReport = new HealthReport
    {
        Observer = this.ObserverName,
        ReportType = HealthReportType.Node,
        EmitLogEvent = true,
        NodeName = this.NodeName,
        HealthReportTimeToLive = reportTtl,
    };

    if (anyWarnings)
    {
        // Every warning is prefixed with a newline; the expired and not-found sections
        // are additionally terminated with one.
        string expiredSection = this.ExpiredWarnings.Count == 0
            ? string.Empty
            : string.Concat(this.ExpiredWarnings.Select(warning => "\n" + warning)) + "\n";

        string notFoundSection = this.NotFoundWarnings.Count == 0
            ? string.Empty
            : string.Concat(this.NotFoundWarnings.Select(warning => "\n" + warning)) + "\n";

        string expiringSection = this.ExpiringWarnings.Count == 0
            ? string.Empty
            : string.Concat(this.ExpiringWarnings.Select(warning => "\n" + warning));

        healthReport.Code = FOErrorWarningCodes.WarningCertificateExpiration;
        healthReport.HealthMessage = expiredSection + notFoundSection + expiringSection;
        healthReport.State = HealthState.Warning;
    }
    else
    {
        healthReport.HealthMessage = "All cluster and monitored app certificates are healthy.";
        healthReport.State = HealthState.Ok;
    }

    this.HasActiveFabricErrorOrWarning = anyWarnings;

    this.HealthReporter.ReportHealthToServiceFabric(healthReport);

    // Drop the consumed lists so the next call requires fresh ones.
    this.ExpiredWarnings = null;
    this.ExpiringWarnings = null;
    this.NotFoundWarnings = null;

    this.LastRunDateTime = DateTime.Now;

    return Task.CompletedTask;
}
/// <summary>
/// Converts <paramref name="healthReport"/> into a Service Fabric health report and sends it,
/// as an application report when the report type is Application and an application name is
/// present, otherwise as a node report. A null report is ignored.
/// </summary>
/// <param name="healthReport">The observer health report to send.</param>
public void ReportHealthToServiceFabric(HealthReport healthReport)
{
    if (healthReport == null)
    {
        return;
    }

    // Maps the reporting observer to the health property name shown for the event.
    string ResolveHealthProperty(string observerName)
    {
        switch (observerName)
        {
            case ObserverConstants.AppObserverName:
                return "AppHealth";
            case ObserverConstants.CertificateObserverName:
                return "SecurityHealth";
            case ObserverConstants.DiskObserverName:
                return "DiskHealth";
            case ObserverConstants.FabricSystemObserverName:
                return "FabricSystemServiceHealth";
            case ObserverConstants.NetworkObserverName:
                return "NetworkingHealth";
            case ObserverConstants.OSObserverName:
                return "MachineInformation";
            case ObserverConstants.NodeObserverName:
                return "MachineResourceHealth";
            default:
                return "FOGenericHealth";
        }
    }

    // Only Ok reports (which clear warning/error states) are sent immediately; sending
    // errors/warnings immediately would only add unnecessary stress to the Health subsystem.
    var sendOptions = new HealthReportSendOptions
    {
        Immediate = healthReport.State == HealthState.Ok,
    };

    // Fall back to five minutes when the report carries no explicit time to live.
    TimeSpan timeToLive = healthReport.HealthReportTimeToLive != default(TimeSpan)
        ? healthReport.HealthReportTimeToLive
        : TimeSpan.FromMinutes(5);

    string property = ResolveHealthProperty(healthReport.Observer);

    // The source id is the error/warning code when the report has one, otherwise the
    // observer name, so that events with different codes get distinct source ids.
    string sourceId = string.IsNullOrEmpty(healthReport.Code)
        ? healthReport.Observer
        : healthReport.Code;

    var healthInformation = new HealthInformation(sourceId, property, healthReport.State)
    {
        Description = healthReport.HealthMessage,
        TimeToLive = timeToLive,
        RemoveWhenExpired = true,
    };

    // Log event only if ObserverWebApi (REST Log reader.) app is deployed.
    if (ObserverManager.ObserverWebAppDeployed && healthReport.EmitLogEvent)
    {
        string logFormat = healthReport.NodeName + ": {0}";

        switch (healthReport.State)
        {
            case HealthState.Error:
                this.logger.LogError(logFormat, healthInformation.Description);
                break;
            case HealthState.Warning:
                this.logger.LogWarning(logFormat, healthInformation.Description);
                break;
            default:
                this.logger.LogInfo(logFormat, healthInformation.Description);
                break;
        }
    }

    // To SFX and Telemetry provider.
    bool sendAsApplicationReport = healthReport.ReportType == HealthReportType.Application
                                   && healthReport.AppName != null;

    if (sendAsApplicationReport)
    {
        this.fabricClient.HealthManager.ReportHealth(
            new ApplicationHealthReport(healthReport.AppName, healthInformation),
            sendOptions);
    }
    else
    {
        this.fabricClient.HealthManager.ReportHealth(
            new NodeHealthReport(healthReport.NodeName, healthInformation),
            sendOptions);
    }
}