/// <summary>
/// Queries the replicas deployed on this node for the given application and returns monitoring
/// info for those that belong to one of the supplied services: stateful replicas whose role is
/// Primary, and stateless service instances.
/// </summary>
/// <param name="appName">Name of the application whose deployed replicas are queried.</param>
/// <param name="services">Services whose replicas or instances should be included in the result.</param>
/// <returns>One <see cref="ReplicaMonitoringInfo"/> per matching replica or instance; empty when none match.</returns>
private async Task<List<ReplicaMonitoringInfo>> GetDeployedPrimaryReplicaAsync(Uri appName, ServiceList services)
{
    var replicasOnNode = await this.FabricClientInstance.QueryManager
        .GetDeployedReplicaListAsync(this.NodeName, appName)
        .ConfigureAwait(true);

    var monitoredReplicas = new List<ReplicaMonitoringInfo>();

    foreach (var candidate in replicasOnNode)
    {
        switch (candidate)
        {
            // Stateful services: only the Primary replica of a requested service is tracked.
            case DeployedStatefulServiceReplica stateful
                when stateful.ReplicaRole == ReplicaRole.Primary
                     && services.Any(s => s.ServiceName == stateful.ServiceName):

                monitoredReplicas.Add(new ReplicaMonitoringInfo()
                {
                    ApplicationName = appName,
                    ReplicaHostProcessId = stateful.HostProcessId,
                    ReplicaOrInstanceId = stateful.ReplicaId,
                    Partitionid = stateful.Partitionid,
                });
                break;

            // Stateless services: every instance of a requested service is tracked.
            case DeployedStatelessServiceInstance stateless
                when services.Any(s => s.ServiceName == stateless.ServiceName):

                monitoredReplicas.Add(new ReplicaMonitoringInfo()
                {
                    ApplicationName = appName,
                    ReplicaHostProcessId = stateless.HostProcessId,
                    ReplicaOrInstanceId = stateless.InstanceId,
                    Partitionid = stateless.Partitionid,
                });
                break;
        }
    }

    return monitoredReplicas;
}
/// <summary>
/// Evaluates <paramref name="data"/> against the error and warning thresholds and reports the
/// outcome: a metric is always sent to telemetry/ETW (when enabled); an Error or Warning health
/// report is emitted when a threshold is breached; an Ok health report is emitted to clear a
/// previously raised Error/Warning once the data is healthy again. The collected samples in
/// <paramref name="data"/> are cleared before returning.
/// </summary>
/// <typeparam name="T">Type of the threshold values accepted by <paramref name="data"/>.</typeparam>
/// <param name="data">Resource usage data to evaluate. Must not be null.</param>
/// <param name="thresholdError">Threshold that puts the data into the Error state.</param>
/// <param name="thresholdWarning">Threshold that puts the data into the Warning state.</param>
/// <param name="healthReportTtl">Time-to-live applied to an Error/Warning health report.</param>
/// <param name="healthReportType">Kind of health report to emit (defaults to Node).</param>
/// <param name="app">Application name in URI form (e.g. "fabric:/MyApp"), or null for non-app reports.</param>
/// <param name="replicaOrInstance">Replica/instance the data belongs to, or null.</param>
/// <param name="dumpOnError">When true, attempts to dump the replica host process on Error.</param>
/// <exception cref="ArgumentNullException">
/// <paramref name="data"/> is null. (Derives from <see cref="ArgumentException"/>, which is
/// what this method threw previously, so existing catch clauses keep working.)
/// </exception>
public void ProcessResourceDataReportHealth<T>(
    FabricResourceUsageData<T> data,
    T thresholdError,
    T thresholdWarning,
    TimeSpan healthReportTtl,
    HealthReportType healthReportType = HealthReportType.Node,
    string app = null,
    ReplicaMonitoringInfo replicaOrInstance = null,
    bool dumpOnError = false)
{
    if (data == null)
    {
        throw new ArgumentNullException(nameof(data), "Supply all required parameters with non-null value...");
    }

    var thresholdName = "Minimum";
    string repPartitionId = null, repOrInstanceId = null, name = null, id = null, procName = null;
    T threshold = thresholdWarning;
    var healthState = HealthState.Ok;
    Uri appName = null;

    if (replicaOrInstance != null)
    {
        repPartitionId = $"Partition: {replicaOrInstance.Partitionid}";
        repOrInstanceId = $"Replica: {replicaOrInstance.ReplicaOrInstanceId}";

        // May be null when the host process has already exited; reporting continues regardless.
        procName = this.GetReplicaHostProcessName(replicaOrInstance);
    }

    // Create a unique id which may be used in the case of warnings or OK clears...
    if (app != null)
    {
        appName = new Uri(app);
        name = app.Replace("fabric:/", string.Empty);
        id = name + "_" + data.Property.Replace(" ", string.Empty);
    }

    // Telemetry...
    if (this.IsTelemetryEnabled)
    {
        _ = this.ObserverTelemetryClient?.ReportMetricAsync(
            $"{this.NodeName}-{app}-{data.Id}-{data.Property}",
            data.AverageDataValue,
            this.Token);
    }

    // ETW...
    if (this.IsEtwEnabled)
    {
        Logger.EtwLogger?.Write(
            "FabricObserverDataEvent",
            new
            {
                Level = 0, // Info
                Node = this.NodeName,
                Observer = this.ObserverName,
                Property = data.Property,
                Id = data.Id,
                Value = $"{Math.Round(data.AverageDataValue)}",
                Unit = data.Units,
            });
    }

    if (data.IsUnhealthy(thresholdError))
    {
        // Health Error
        thresholdName = "Maximum";
        threshold = thresholdError;
        healthState = HealthState.Error;

        // This is primarily useful for AppObserver, but makes sense to be
        // part of the base class for future use, like for FSO...
        if (replicaOrInstance != null && procName != null && dumpOnError)
        {
            this.TryDumpReplicaHostProcess(replicaOrInstance, procName);
        }
    }
    else if (data.IsUnhealthy(thresholdWarning))
    {
        // Health Warning
        healthState = HealthState.Warning;
    }

    if (healthState != HealthState.Ok)
    {
        string errorWarningKind = GetErrorWarningCodeForProperty(data.Property, healthState);

        var healthMessage = new StringBuilder();

        if (name != null)
        {
            healthMessage.Append($"{name} (Service Process: {procName}, {repPartitionId}, {repOrInstanceId}): ");
        }

        healthMessage.Append($"{data.Property} is at or above the specified {thresholdName} limit ({threshold}{data.Units})");
        healthMessage.AppendLine($" - Average {data.Property}: {Math.Round(data.AverageDataValue)}{data.Units}");

        string healthMessageText = healthMessage.ToString();

        // Set internal fabric health states...
        data.ActiveErrorOrWarning = true;

        // This means this observer created a Warning or Error SF Health Report
        this.HasActiveFabricErrorOrWarning = true;

        var healthReport = new Utilities.HealthReport
        {
            AppName = appName,
            Code = errorWarningKind,
            EmitLogEvent = true,
            HealthMessage = healthMessageText,
            HealthReportTimeToLive = healthReportTtl,
            ReportType = healthReportType,
            State = healthState,
            NodeName = this.NodeName,
            Observer = this.ObserverName,
        };

        // Emit a Fabric Health Report and optionally a local log write...
        this.HealthReporter.ReportHealthToServiceFabric(healthReport);

        // Send Health Report as Telemetry event (perhaps it signals an Alert from App Insights, for example...)...
        if (this.IsTelemetryEnabled)
        {
            _ = this.ObserverTelemetryClient?.ReportHealthAsync(
                id,
                this.FabricServiceContext.ServiceName.OriginalString,
                "FabricObserver",
                this.ObserverName,
                $"{this.NodeName}/{errorWarningKind}/{data.Property}/{Math.Round(data.AverageDataValue)}",
                healthState,
                this.Token);
        }

        // ETW...
        if (this.IsEtwEnabled)
        {
            Logger.EtwLogger?.Write(
                "FabricObserverDataEvent",
                new
                {
                    Level = (healthState == HealthState.Warning) ? 1 : 2,
                    Node = this.NodeName,
                    Observer = this.ObserverName,
                    HealthEventErrorCode = errorWarningKind,
                    HealthEventDescription = healthMessageText,
                    Property = data.Property,
                    Id = data.Id,
                    Value = $"{Math.Round(data.AverageDataValue)}",
                    Unit = data.Units,
                });
        }
    }
    else if (data.ActiveErrorOrWarning)
    {
        var report = new Utilities.HealthReport
        {
            AppName = appName,
            EmitLogEvent = true,
            HealthMessage = $"{data.Id}: {data.Property} is now within normal/expected range.",
            HealthReportTimeToLive = default(TimeSpan),
            ReportType = healthReportType,
            State = HealthState.Ok,
            NodeName = this.NodeName,
            Observer = this.ObserverName,
        };

        // Emit an Ok Health Report to clear Fabric Health warning...
        this.HealthReporter.ReportHealthToServiceFabric(report);

        // Reset health states...
        data.ActiveErrorOrWarning = false;
        this.HasActiveFabricErrorOrWarning = false;
    }

    // No need to keep data in memory...
    data.Data.Clear();
    data.Data.TrimExcess();
}

/// <summary>
/// Returns the name of the process hosting the given replica/instance, or null when the
/// process cannot be resolved (for example because it has already exited).
/// </summary>
private string GetReplicaHostProcessName(ReplicaMonitoringInfo replicaOrInstance)
{
    try
    {
        // Process.GetProcessById throws (it never returns null) when the id is not running,
        // and the returned Process holds a handle that must be released.
        using (var hostProcess = Process.GetProcessById((int)replicaOrInstance.ReplicaHostProcessId))
        {
            return hostProcess.ProcessName;
        }
    }
    catch (Exception e) when (e is ArgumentException || e is InvalidOperationException)
    {
        // Not critical: the health report is still emitted, just without a process name.
        this.ObserverLogger.LogInfo($"Unable to resolve replica host process name:\n{e}");
        return null;
    }
}

/// <summary>
/// Attempts to create a dump of the replica host process, honoring the per-process-name
/// dump limit. Failures are logged as info and otherwise ignored.
/// </summary>
private void TryDumpReplicaHostProcess(ReplicaMonitoringInfo replicaOrInstance, string procName)
{
    try
    {
        int procId = (int)replicaOrInstance.ReplicaHostProcessId;

        // dumpCount is 0 when the process name has not been seen before.
        if (!this.serviceDumpCountDictionary.TryGetValue(procName, out var dumpCount))
        {
            this.serviceDumpCountDictionary.Add(procName, 0);
        }

        // DumpServiceProcess defaults to a Full dump with
        // process memory, handles and thread data...
        if (dumpCount < this.maxDumps && this.DumpServiceProcess(procId))
        {
            this.serviceDumpCountDictionary[procName] = dumpCount + 1;
        }
    }
    catch (Exception e) when (e is ArgumentException || e is InvalidOperationException)
    {
        // Ignore these, it just means no dmp will be created. This is not
        // critical to FO... Log as info, not warning...
        this.ObserverLogger.LogInfo($"Unable to generate dmp file:\n{e}");
    }
}

/// <summary>
/// Maps a resource property name to the matching error or warning code for the given health
/// state. Returns null when the property matches none of the known resource kinds.
/// </summary>
private static string GetErrorWarningCodeForProperty(string property, HealthState healthState)
{
    bool isError = healthState == HealthState.Error;

    // Ordinal, case-insensitive search: culture-aware lower-casing (e.g. tr-TR "I" -> "ı")
    // must not influence which code is selected.
    if (property.IndexOf("cpu", StringComparison.OrdinalIgnoreCase) >= 0)
    {
        return isError ? ErrorWarningCode.ErrorCpuTime : ErrorWarningCode.WarningCpuTime;
    }

    if (property.IndexOf("disk space", StringComparison.OrdinalIgnoreCase) >= 0)
    {
        return isError ? ErrorWarningCode.ErrorDiskSpace : ErrorWarningCode.WarningDiskSpace;
    }

    if (property == "Memory Consumption MB")
    {
        return isError ? ErrorWarningCode.ErrorMemoryCommitted : ErrorWarningCode.WarningMemoryCommitted;
    }

    if (property == "Memory Consumption %")
    {
        return isError ? ErrorWarningCode.ErrorMemoryPercentUsed : ErrorWarningCode.WarningMemoryPercentUsed;
    }

    if (property.Contains("Read"))
    {
        return isError ? ErrorWarningCode.ErrorDiskIoReads : ErrorWarningCode.WarningDiskIoReads;
    }

    if (property.Contains("Write"))
    {
        return isError ? ErrorWarningCode.ErrorDiskIoWrites : ErrorWarningCode.WarningDiskIoWrites;
    }

    if (property.Contains("Queue"))
    {
        return isError ? ErrorWarningCode.ErrorDiskAverageQueueLength : ErrorWarningCode.WarningDiskAverageQueueLength;
    }

    if (property.Contains("Firewall"))
    {
        return isError ? ErrorWarningCode.ErrorTooManyFirewallRules : ErrorWarningCode.WarningTooManyFirewallRules;
    }

    if (property.Contains("Ports"))
    {
        return isError ? ErrorWarningCode.ErrorTooManyActivePorts : ErrorWarningCode.WarningTooManyActivePorts;
    }

    return null;
}