/// <summary>
        /// Sends an Ok application health report for every tracked connection whose health is
        /// Error or Warning, replacing the earlier report because the observer is stopping.
        /// </summary>
        /// <param name="disposing">When false, the method returns without doing anything.</param>
        protected override void Dispose(bool disposing)
        {
            if (disposing)
            {
                // Only connections that currently carry an Error or Warning need to be cleared.
                foreach (var unhealthyConn in this.connectionStatus.Where(
                    c => c.Health == HealthState.Error || c.Health == HealthState.Warning))
                {
                    var targetAppUri = new Uri(unhealthyConn.TargetApp);
                    string clearMessage = $"Clearing NetworkObserver's Health Error/Warning for {unhealthyConn.TargetApp}/{unhealthyConn.HostName} connection state since FO is stopping.";
                    string propertyName = $"EndpointUnreachable({unhealthyConn.HostName})";

                    // An Ok report with the same code and property as the earlier warning.
                    var okReport = new HealthReport
                    {
                        AppName = targetAppUri,
                        Code = FOErrorWarningCodes.AppWarningNetworkEndpointUnreachable,
                        EmitLogEvent = true,
                        HealthMessage = clearMessage,
                        HealthReportTimeToLive = default(TimeSpan),
                        State = HealthState.Ok,
                        NodeName = NodeName,
                        Observer = ObserverName,
                        Property = propertyName,
                        ReportType = HealthReportType.Application,
                    };

                    HealthReporter.ReportHealthToServiceFabric(okReport);
                }
            }
        }
Example #2
0
        /// <summary>
        /// Writes the accumulated message to the local log, reports it as an Ok node-level health
        /// event, optionally forwards it to the telemetry client and to ETW, then clears the message.
        /// </summary>
        /// <param name="token">Not read by this method; the <c>Token</c> member is passed to telemetry instead.</param>
        /// <returns>A completed task.</returns>
        public override Task ReportAsync(CancellationToken token)
        {
            // The message is not modified until the Clear() call at the end, so read it once.
            string text = this.message.ToString();

            // Local log.
            this.ObserverLogger.LogInfo(text);

            // Report to Fabric.
            var reporter = new ObserverHealthReporter(this.ObserverLogger);
            var okReport = new Utilities.HealthReport
            {
                Code = FoErrorWarningCodes.Ok,
                HealthMessage = text,
                NodeName = this.NodeName,
                Observer = this.ObserverName,
                ReportType = HealthReportType.Node,
                State = HealthState.Ok,
            };
            reporter.ReportHealthToServiceFabric(okReport);

            // Telemetry payload is built unconditionally; it is only sent when both switches are on.
            var telemetry = new TelemetryData(this.FabricClientInstance, this.Token)
            {
                Code = FoErrorWarningCodes.Ok,
                HealthEventDescription = text,
                HealthState = "Ok",
                NodeName = this.NodeName,
                ObserverName = this.ObserverName,
                Source = ObserverConstants.FabricObserverName,
            };

            bool sendTelemetry = this.IsTelemetryProviderEnabled && this.IsObserverTelemetryEnabled;

            if (sendTelemetry)
            {
                _ = this.TelemetryClient?.ReportHealthAsync(telemetry, this.Token);
            }

            // ETW.
            if (this.IsEtwEnabled)
            {
                var etwPayload = new
                {
                    Code = FoErrorWarningCodes.Ok,
                    HealthEventDescription = text,
                    HealthState = "Ok",
                    NodeName = this.NodeName,
                    ObserverName = this.ObserverName,
                    Source = ObserverConstants.FabricObserverName,
                };

                Logger.EtwLogger?.Write(ObserverConstants.FabricObserverETWEventName, etwPayload);
            }

            this.message.Clear();

            return Task.CompletedTask;
        }
Example #3
0
        /// <inheritdoc/>
        public override Task ReportAsync(CancellationToken token)
        {
            // Nothing to do when cancelled, or when no observation run has populated the warning lists yet.
            if (token.IsCancellationRequested ||
                this.ExpiringWarnings == null ||
                this.ExpiredWarnings == null ||
                this.NotFoundWarnings == null)
            {
                return Task.CompletedTask;
            }

            bool noWarnings = this.ExpiringWarnings.Count == 0 &&
                              this.ExpiredWarnings.Count == 0 &&
                              this.NotFoundWarnings.Count == 0;

            // Prefer the run interval as the report lifetime when one has been set.
            var reportTtl = this.RunInterval > TimeSpan.MinValue ? this.RunInterval : this.HealthReportTimeToLive;

            // Fields shared by the Ok and the Warning report.
            var healthReport = new HealthReport
            {
                Observer = this.ObserverName,
                ReportType = HealthReportType.Node,
                EmitLogEvent = true,
                NodeName = this.NodeName,
                HealthReportTimeToLive = reportTtl,
            };

            if (noWarnings)
            {
                healthReport.HealthMessage = "All cluster and monitored app certificates are healthy.";
                healthReport.State = HealthState.Ok;

                this.HasActiveFabricErrorOrWarning = false;
            }
            else
            {
                // Each warning goes on its own line, preceded by a newline; the expired and
                // not-found sections also end with a newline.
                string expiredPart = this.ExpiredWarnings.Count == 0
                    ? string.Empty
                    : "\n" + string.Join("\n", this.ExpiredWarnings) + "\n";

                string notFoundPart = this.NotFoundWarnings.Count == 0
                    ? string.Empty
                    : "\n" + string.Join("\n", this.NotFoundWarnings) + "\n";

                string expiringPart = this.ExpiringWarnings.Count == 0
                    ? string.Empty
                    : "\n" + string.Join("\n", this.ExpiringWarnings);

                healthReport.Code = FoErrorWarningCodes.WarningCertificateExpiration;
                healthReport.HealthMessage = expiredPart + notFoundPart + expiringPart;
                healthReport.State = HealthState.Warning;

                this.HasActiveFabricErrorOrWarning = true;
            }

            this.HealthReporter.ReportHealthToServiceFabric(healthReport);

            // Reset the lists so a report cannot be repeated without a fresh observation run.
            this.ExpiredWarnings = null;
            this.ExpiringWarnings = null;
            this.NotFoundWarnings = null;

            this.LastRunDateTime = DateTime.Now;

            return Task.CompletedTask;
        }
        /// <summary>
        /// Reports the state of every tracked connection that belongs to a configured target app.
        /// An unreachable endpoint produces a Warning application health report (plus optional
        /// telemetry and ETW events). A reachable endpoint whose recorded health is Warning or Error
        /// produces an Ok report that clears the earlier one. Connected entries are then removed
        /// from the tracking list and the endpoint test results are cleared.
        /// </summary>
        /// <param name="token">Checked before each configuration entry and before each connection.</param>
        /// <returns>A completed task.</returns>
        /// <exception cref="OperationCanceledException">Thrown when <paramref name="token"/> is cancelled.</exception>
        public override Task ReportAsync(CancellationToken token)
        {
            var timeToLiveWarning = SetHealthReportTimeToLive();

            // Track what happened in this pass so the observer-level flag is only reset when a
            // warning was cleared and no endpoint is still failing.
            bool warningReported = false;
            bool warningCleared  = false;

            // Report on connection state.
            foreach (var config in this.userConfig)
            {
                token.ThrowIfCancellationRequested();

                foreach (var conn in this.connectionStatus.Where(cs => cs.TargetApp == config.TargetApp))
                {
                    token.ThrowIfCancellationRequested();

                    if (!conn.Connected)
                    {
                        this.healthState = HealthState.Warning;
                        var healthMessage = $"Outbound Internet connection failure detected for endpoint {conn.HostName}{Environment.NewLine}";

                        // Send Health Telemetry (perhaps it signals an Alert in AppInsights or LogAnalytics).
                        // The same object is also attached to the health report as HealthData.
                        var telemetryData = new TelemetryData(FabricClientInstance, token)
                        {
                            ApplicationName        = conn.TargetApp,
                            Code                   = FOErrorWarningCodes.AppWarningNetworkEndpointUnreachable,
                            HealthState            = "Warning",
                            HealthEventDescription = healthMessage,
                            ObserverName           = ObserverName,
                            Metric                 = ErrorWarningProperty.InternetConnectionFailure,
                            NodeName               = NodeName,
                        };

                        if (IsTelemetryProviderEnabled && IsObserverTelemetryEnabled)
                        {
                            _ = TelemetryClient?.ReportMetricAsync(
                                telemetryData,
                                Token);
                        }

                        var report = new HealthReport
                        {
                            AppName                = new Uri(conn.TargetApp),
                            EmitLogEvent           = true,
                            HealthData             = telemetryData,
                            HealthMessage          = healthMessage,
                            HealthReportTimeToLive = timeToLiveWarning,
                            State      = this.healthState,
                            NodeName   = NodeName,
                            Observer   = ObserverName,
                            Property   = $"EndpointUnreachable({conn.HostName})",
                            ReportType = HealthReportType.Application,
                            ResourceUsageDataProperty = $"{ErrorWarningProperty.InternetConnectionFailure}: {conn.HostName}",
                        };

                        // Send health report Warning and log event locally.
                        HealthReporter.ReportHealthToServiceFabric(report);

                        // This means this observer created a Warning or Error SF Health Report
                        HasActiveFabricErrorOrWarning = true;
                        warningReported = true;

                        // ETW.
                        if (IsEtwEnabled)
                        {
                            Logger.EtwLogger?.Write(
                                ObserverConstants.FabricObserverETWEventName,
                                new
                            {
                                ApplicationName        = conn.TargetApp,
                                Code                   = FOErrorWarningCodes.AppWarningNetworkEndpointUnreachable,
                                HealthState            = "Warning",
                                HealthEventDescription = healthMessage,
                                ObserverName,
                                Metric = ErrorWarningProperty.InternetConnectionFailure,
                                NodeName,
                            });
                        }
                    }
                    else
                    {
                        // Only endpoints that previously raised a Warning or Error need an Ok report.
                        // (The earlier "!= Warning || != Error" test was always true, so this branch
                        // never got past the continue and existing warnings were never cleared here.)
                        if (conn.Health != HealthState.Warning &&
                            conn.Health != HealthState.Error)
                        {
                            continue;
                        }

                        this.healthState = HealthState.Ok;
                        var healthMessage = $"Outbound Internet connection successful for {conn.HostName} from node {NodeName}.";

                        // Clear existing Health Warning.
                        var report = new HealthReport
                        {
                            AppName                = new Uri(conn.TargetApp),
                            Code                   = FOErrorWarningCodes.AppWarningNetworkEndpointUnreachable,
                            EmitLogEvent           = true,
                            HealthMessage          = healthMessage,
                            HealthReportTimeToLive = default(TimeSpan),
                            State                  = HealthState.Ok,
                            NodeName               = NodeName,
                            Observer               = ObserverName,
                            Property               = $"EndpointUnreachable({conn.HostName})",
                            ReportType             = HealthReportType.Application,
                        };

                        HealthReporter.ReportHealthToServiceFabric(report);

                        // Telemetry.
                        if (IsTelemetryProviderEnabled && IsObserverTelemetryEnabled)
                        {
                            var telemetryData = new TelemetryData(FabricClientInstance, token)
                            {
                                ApplicationName        = conn.TargetApp,
                                Code                   = FOErrorWarningCodes.Ok,
                                HealthState            = "Ok",
                                HealthEventDescription = healthMessage,
                                ObserverName           = ObserverName,
                                Metric                 = "Internet Connection State",
                                NodeName               = NodeName,
                            };

                            _ = TelemetryClient?.ReportMetricAsync(
                                telemetryData,
                                Token);
                        }

                        // ETW.
                        if (IsEtwEnabled)
                        {
                            Logger.EtwLogger?.Write(
                                ObserverConstants.FabricObserverETWEventName,
                                new
                            {
                                ApplicationName        = conn.TargetApp,
                                Code                   = FOErrorWarningCodes.Ok,
                                HealthState            = "Ok",
                                HealthEventDescription = healthMessage,
                                ObserverName,
                                Metric = "Internet Connection State",
                                NodeName,
                            });
                        }

                        warningCleared = true;
                    }
                }
            }

            // Reset health state only when something was cleared and nothing is still failing;
            // resetting inside the loop would let a recovered endpoint hide one that is still down.
            if (warningCleared && !warningReported)
            {
                HasActiveFabricErrorOrWarning = false;
            }

            // Drop entries for endpoints that are reachable and release the spare capacity.
            _ = this.connectionStatus.RemoveAll(conn => conn.Connected);
            this.connectionStatus.TrimExcess();
            this.connEndpointTestResults.Clear();

            return Task.CompletedTask;
        }
Example #5
0
        /// <inheritdoc/>
        public override Task ReportAsync(CancellationToken token)
        {
            // Bail out when cancelled, or when no observation run has populated the warning lists yet.
            if (token.IsCancellationRequested ||
                this.ExpiringWarnings == null ||
                this.ExpiredWarnings == null ||
                this.NotFoundWarnings == null)
            {
                return Task.CompletedTask;
            }

            bool noWarnings = this.ExpiringWarnings.Count == 0 &&
                              this.ExpiredWarnings.Count == 0 &&
                              this.NotFoundWarnings.Count == 0;

            // Prefer the run interval as the report lifetime when one has been set.
            var reportTtl = this.RunInterval > TimeSpan.MinValue ? this.RunInterval : this.HealthReportTimeToLive;

            // Fields shared by the Ok and the Warning report.
            var healthReport = new HealthReport
            {
                Observer = this.ObserverName,
                ReportType = HealthReportType.Node,
                EmitLogEvent = true,
                NodeName = this.NodeName,
                HealthReportTimeToLive = reportTtl,
            };

            if (noWarnings)
            {
                healthReport.HealthMessage = "All cluster and monitored app certificates are healthy.";
                healthReport.State = HealthState.Ok;

                this.HasActiveFabricErrorOrWarning = false;
            }
            else
            {
                // Each warning goes on its own line, preceded by a newline; the expired and
                // not-found sections also end with a newline.
                string expiredPart = this.ExpiredWarnings.Count == 0
                    ? string.Empty
                    : "\n" + string.Join("\n", this.ExpiredWarnings) + "\n";

                string notFoundPart = this.NotFoundWarnings.Count == 0
                    ? string.Empty
                    : "\n" + string.Join("\n", this.NotFoundWarnings) + "\n";

                string expiringPart = this.ExpiringWarnings.Count == 0
                    ? string.Empty
                    : "\n" + string.Join("\n", this.ExpiringWarnings);

                string warningText = expiredPart + notFoundPart + expiringPart;

                healthReport.Code = FoErrorWarningCodes.WarningCertificateExpiration;
                healthReport.HealthMessage = warningText;
                healthReport.State = HealthState.Warning;

                this.HasActiveFabricErrorOrWarning = true;

                // Telemetry: sent only when both the provider and this observer have it enabled.
                bool sendTelemetry = this.IsTelemetryProviderEnabled && this.IsObserverTelemetryEnabled;

                if (sendTelemetry)
                {
                    var telemetry = new TelemetryData(this.FabricClientInstance, token)
                    {
                        Code = FoErrorWarningCodes.WarningCertificateExpiration,
                        HealthState = "Warning",
                        NodeName = this.NodeName,
                        Metric = ErrorWarningProperty.CertificateExpiration,
                        HealthEventDescription = warningText,
                        ObserverName = this.ObserverName,
                        Source = ObserverConstants.FabricObserverName,
                        Value = FoErrorWarningCodes.GetErrorWarningNameFromFOCode(
                            FoErrorWarningCodes.WarningCertificateExpiration,
                            HealthScope.Node),
                    };

                    _ = this.TelemetryClient?.ReportMetricAsync(telemetry, this.Token);
                }

                // ETW: same fields as the telemetry payload.
                if (this.IsEtwEnabled)
                {
                    var etwPayload = new
                    {
                        Code = FoErrorWarningCodes.WarningCertificateExpiration,
                        HealthState = "Warning",
                        NodeName = this.NodeName,
                        Metric = ErrorWarningProperty.CertificateExpiration,
                        HealthEventDescription = warningText,
                        ObserverName = this.ObserverName,
                        Source = ObserverConstants.FabricObserverName,
                        Value = FoErrorWarningCodes.GetErrorWarningNameFromFOCode(
                            FoErrorWarningCodes.WarningCertificateExpiration,
                            HealthScope.Node),
                    };

                    Logger.EtwLogger?.Write(ObserverConstants.FabricObserverETWEventName, etwPayload);
                }
            }

            this.HealthReporter.ReportHealthToServiceFabric(healthReport);

            // Reset the lists so a report cannot be repeated without a fresh observation run.
            this.ExpiredWarnings = null;
            this.ExpiringWarnings = null;
            this.NotFoundWarnings = null;
            this.LastRunDateTime = DateTime.Now;

            return Task.CompletedTask;
        }
        /// <summary>
        /// Turns a <see cref="HealthReport"/> into a Service Fabric health report and sends it
        /// through the fabric client's health manager. An application report is sent when the
        /// report type is Application and an app name is present; otherwise a node report is sent.
        /// </summary>
        /// <param name="healthReport">The report to send. A null report is ignored.</param>
        public void ReportHealthToServiceFabric(HealthReport healthReport)
        {
            if (healthReport == null)
            {
                return;
            }

            // Ok reports are sent immediately so they clear warning/error states quickly.
            // Errors and warnings are not, to avoid unnecessary stress on the health subsystem.
            var sendOptions = new HealthReportSendOptions
            {
                Immediate = healthReport.State == HealthState.Ok,
            };

            // Fall back to five minutes when the report does not carry its own time to live.
            TimeSpan timeToLive = healthReport.HealthReportTimeToLive != default(TimeSpan)
                ? healthReport.HealthReportTimeToLive
                : TimeSpan.FromMinutes(5);

            string property = GetHealthPropertyForObserver(healthReport.Observer);

            // The source id is the error/warning code when one is supplied, else the observer name.
            string sourceId = string.IsNullOrEmpty(healthReport.Code)
                ? healthReport.Observer
                : healthReport.Code;

            var healthInformation = new HealthInformation(sourceId, property, healthReport.State)
            {
                Description = healthReport.HealthMessage,
                TimeToLive = timeToLive,
                RemoveWhenExpired = true,
            };

            // Log event only if ObserverWebApi (REST Log reader.) app is deployed.
            if (ObserverManager.ObserverWebAppDeployed && healthReport.EmitLogEvent)
            {
                string format = healthReport.NodeName + ": {0}";

                switch (healthReport.State)
                {
                    case HealthState.Error:
                        this.logger.LogError(format, healthInformation.Description);
                        break;

                    case HealthState.Warning:
                        this.logger.LogWarning(format, healthInformation.Description);
                        break;

                    default:
                        this.logger.LogInfo(format, healthInformation.Description);
                        break;
                }
            }

            // To SFX and Telemetry provider.
            bool reportOnApplication = healthReport.ReportType == HealthReportType.Application &&
                                       healthReport.AppName != null;

            if (!reportOnApplication)
            {
                var nodeHealthReport = new NodeHealthReport(healthReport.NodeName, healthInformation);
                this.fabricClient.HealthManager.ReportHealth(nodeHealthReport, sendOptions);
                return;
            }

            var appHealthReport = new ApplicationHealthReport(healthReport.AppName, healthInformation);
            this.fabricClient.HealthManager.ReportHealth(appHealthReport, sendOptions);
        }

        /// <summary>
        /// Maps an observer name to the health property name used for its reports.
        /// Unrecognized names map to "FOGenericHealth".
        /// </summary>
        /// <param name="observerName">Name of the observer that produced the report.</param>
        /// <returns>The health property string for that observer.</returns>
        private static string GetHealthPropertyForObserver(string observerName)
        {
            switch (observerName)
            {
                case ObserverConstants.AppObserverName:
                    return "AppHealth";

                case ObserverConstants.CertificateObserverName:
                    return "SecurityHealth";

                case ObserverConstants.DiskObserverName:
                    return "DiskHealth";

                case ObserverConstants.FabricSystemObserverName:
                    return "FabricSystemServiceHealth";

                case ObserverConstants.NetworkObserverName:
                    return "NetworkingHealth";

                case ObserverConstants.OsObserverName:
                    return "MachineInformation";

                case ObserverConstants.NodeObserverName:
                    return "MachineResourceHealth";

                default:
                    return "FOGenericHealth";
            }
        }
        /// <inheritdoc/>
        public override Task ReportAsync(CancellationToken token)
        {
            try
            {
                token.ThrowIfCancellationRequested();

                // Case-insensitive ordinal comparison instead of ToUpper(): no temporary string and
                // no dependence on the current culture. A null status is never considered healthy.
                bool osReportsOk = string.Equals(this.osStatus, "OK", StringComparison.OrdinalIgnoreCase);

                // OS Health.
                if (this.osStatus != null && !osReportsOk)
                {
                    string healthMessage = $"OS reporting unhealthy: {this.osStatus}";
                    var    healthReport  = new HealthReport
                    {
                        Observer               = this.ObserverName,
                        NodeName               = this.NodeName,
                        HealthMessage          = healthMessage,
                        State                  = HealthState.Error,
                        HealthReportTimeToLive = this.SetTimeToLiveWarning(),
                    };

                    this.HealthReporter.ReportHealthToServiceFabric(healthReport);

                    // This means this observer created a Warning or Error SF Health Report
                    this.HasActiveFabricErrorOrWarning = true;

                    // Send Health Report as Telemetry (perhaps it signals an Alert from App Insights, for example.).
                    if (this.IsTelemetryEnabled)
                    {
                        _ = this.ObserverTelemetryClient?.ReportHealthAsync(
                            HealthScope.Application,
                            FabricRuntime.GetActivationContext().ApplicationName,
                            HealthState.Error,
                            $"{this.NodeName} - OS reporting unhealthy: {this.osStatus}",
                            this.ObserverName,
                            this.Token);
                    }
                }
                else if (this.HasActiveFabricErrorOrWarning && osReportsOk)
                {
                    // Clear Error or Warning with an OK Health Report.
                    string healthMessage = $"OS reporting healthy: {this.osStatus}";
                    var    healthReport  = new HealthReport
                    {
                        Observer               = this.ObserverName,
                        NodeName               = this.NodeName,
                        HealthMessage          = healthMessage,
                        State                  = HealthState.Ok,
                        HealthReportTimeToLive = default(TimeSpan),
                    };

                    this.HealthReporter.ReportHealthToServiceFabric(healthReport);

                    // Reset internal health state.
                    this.HasActiveFabricErrorOrWarning = false;
                }

                if (ObserverManager.ObserverWebAppDeployed)
                {
                    var logPath = Path.Combine(this.ObserverLogger.LogFolderBasePath, "SysInfo.txt");

                    // This file is used by the web application (log reader.).
                    if (!this.ObserverLogger.TryWriteLogFile(logPath, $"Last updated on {DateTime.UtcNow.ToString("M/d/yyyy HH:mm:ss")} UTC<br/>{this.osReport}"))
                    {
                        this.HealthReporter.ReportFabricObserverServiceHealth(
                            this.FabricServiceContext.ServiceName.OriginalString,
                            this.ObserverName,
                            HealthState.Warning,
                            "Unable to create SysInfo.txt file.");
                    }
                }

                // Informational report carrying the collected OS details; always sent as Ok.
                // NOTE(review): this uses the same Observer and no Code, like the Error report above.
                // If the reporter derives its source id and property from those fields, this Ok
                // report may replace the Error report in the same pass - confirm against the reporter.
                var report = new HealthReport
                {
                    Observer               = this.ObserverName,
                    HealthMessage          = this.osReport,
                    State                  = HealthState.Ok,
                    NodeName               = this.NodeName,
                    HealthReportTimeToLive = this.SetTimeToLiveWarning(),
                };

                this.HealthReporter.ReportHealthToServiceFabric(report);

                return Task.CompletedTask;
            }
            catch (Exception e) when (!(e is OperationCanceledException))
            {
                // Cancellation is a normal shutdown path: it is excluded by the filter so it
                // propagates to the caller without being reported as a service health Error.
                this.HealthReporter.ReportFabricObserverServiceHealth(
                    this.FabricServiceContext.ServiceName.OriginalString,
                    this.ObserverName,
                    HealthState.Error,
                    $"Unhandled exception processing OS information: {e.Message}: \n {e.StackTrace}");
                throw;
            }
        }
        /// <summary>
        /// Samples resource usage for the host process of every entry in ReplicaOrInstanceList.
        /// For each process it records CPU percentage, private working set (MB) and memory share (%)
        /// roughly every 250 ms for the monitoring duration, then records the active and ephemeral
        /// port counts once. Samples are stored in the per-app usage collections, keyed by
        /// "appNameOrType:processName".
        /// </summary>
        /// <param name="token">Checked before each replica/instance and before each sample.</param>
        /// <returns>A task that completes when all listed processes have been sampled.</returns>
        /// <exception cref="OperationCanceledException">Thrown when monitoring is cancelled.</exception>
        private async Task MonitorDeployedAppsAsync(CancellationToken token)
        {
            Process currentProcess = null;

            foreach (var repOrInst in ReplicaOrInstanceList)
            {
                token.ThrowIfCancellationRequested();

                var timer     = new Stopwatch();
                int processId = (int)repOrInst.HostProcessId;
                var cpuUsage  = new CpuUsage();

                try
                {
                    // App level.
                    currentProcess = Process.GetProcessById(processId);

                    token.ThrowIfCancellationRequested();

                    var    procName      = currentProcess.ProcessName;
                    string appNameOrType = GetAppNameOrType(repOrInst);

                    var id = $"{appNameOrType}:{procName}";

                    // Add new resource data structures for each app service process.
                    if (this.allAppCpuData.All(list => list.Id != id))
                    {
                        this.allAppCpuData.Add(new FabricResourceUsageData <double>(ErrorWarningProperty.TotalCpuTime, id, DataCapacity, UseCircularBuffer));
                        this.allAppMemDataMb.Add(new FabricResourceUsageData <float>(ErrorWarningProperty.TotalMemoryConsumptionMb, id, DataCapacity, UseCircularBuffer));
                        this.allAppMemDataPercent.Add(new FabricResourceUsageData <double>(ErrorWarningProperty.TotalMemoryConsumptionPct, id, DataCapacity, UseCircularBuffer));
                        this.allAppTotalActivePortsData.Add(new FabricResourceUsageData <int>(ErrorWarningProperty.TotalActivePorts, id, 1));
                        this.allAppEphemeralPortsData.Add(new FabricResourceUsageData <int>(ErrorWarningProperty.TotalEphemeralPorts, id, 1));
                    }

                    // Default to 15 seconds unless a monitoring duration has been set.
                    TimeSpan duration = TimeSpan.FromSeconds(15);

                    if (MonitorDuration > TimeSpan.MinValue)
                    {
                        duration = MonitorDuration;
                    }

                    // Warm up the counters.
                    _ = cpuUsage.GetCpuUsagePercentageProcess(currentProcess);
                    _ = ProcessInfoProvider.Instance.GetProcessPrivateWorkingSetInMB(currentProcess.Id);

                    timer.Start();

                    // Compare whole TimeSpans. The .Seconds property is only the 0-59 seconds
                    // component, so a duration of one minute or more would be cut short.
                    while (!currentProcess.HasExited && timer.Elapsed <= duration)
                    {
                        token.ThrowIfCancellationRequested();

                        // CPU (all cores).
                        double cpu = cpuUsage.GetCpuUsagePercentageProcess(currentProcess);

                        // Negative readings are skipped; readings above 100 are capped.
                        if (cpu >= 0)
                        {
                            if (cpu > 100)
                            {
                                cpu = 100;
                            }

                            this.allAppCpuData.FirstOrDefault(x => x.Id == id).Data.Add(cpu);
                        }

                        // Memory (private working set (process)).
                        var processMem = ProcessInfoProvider.Instance.GetProcessPrivateWorkingSetInMB(currentProcess.Id);
                        this.allAppMemDataMb.FirstOrDefault(x => x.Id == id).Data.Add(processMem);

                        // Memory (percent in use (total)). Only the total is needed here.
                        var (totalMemory, _) = OperatingSystemInfoProvider.Instance.TupleGetTotalPhysicalMemorySizeAndPercentInUse();
                        long totalMem = totalMemory;

                        // A total of -1 or below is not usable, so the percentage sample is skipped.
                        if (totalMem > -1)
                        {
                            // presumably totalMem is in GB and processMem in MB (hence * 1024) - TODO confirm.
                            double usedPct = Math.Round(((double)(processMem * 100)) / (totalMem * 1024), 2);
                            this.allAppMemDataPercent.FirstOrDefault(x => x.Id == id).Data.Add(Math.Round(usedPct, 1));
                        }

                        await Task.Delay(250, Token);
                    }

                    timer.Stop();
                    timer.Reset();

                    // Total and Ephemeral ports..
                    this.allAppTotalActivePortsData.FirstOrDefault(x => x.Id == id)
                    .Data.Add(OperatingSystemInfoProvider.Instance.GetActivePortCount(currentProcess.Id, FabricServiceContext));

                    this.allAppEphemeralPortsData.FirstOrDefault(x => x.Id == id)
                    .Data.Add(OperatingSystemInfoProvider.Instance.GetActiveEphemeralPortCount(currentProcess.Id, FabricServiceContext));
                }
                catch (Exception e)
                {
#if DEBUG
                    // DEBUG INFO
                    var healthReport = new Utilities.HealthReport
                    {
                        AppName       = repOrInst.ApplicationName,
                        HealthMessage = $"Error: {e}\n\n",
                        State         = HealthState.Ok,
                        Code          = FOErrorWarningCodes.Ok,
                        NodeName      = NodeName,
                        Observer      = ObserverName,
                        Property      = $"{e.Source}",
                        ReportType    = HealthReportType.Application,
                    };

                    HealthReporter.ReportHealthToServiceFabric(healthReport);
#endif
                    // These exception types are logged at Information level and monitoring
                    // moves on to the next replica/instance.
                    if (e is Win32Exception || e is ArgumentException || e is InvalidOperationException)
                    {
                        WriteToLogWithLevel(
                            ObserverName,
                            $"MonitorAsync failed to find current service process for {repOrInst.ApplicationName?.OriginalString ?? repOrInst.ApplicationTypeName}\n{e}",
                            LogLevel.Information);
                    }
                    else
                    {
                        // Cancellation is rethrown without a log entry; anything else is logged first.
                        if (!(e is OperationCanceledException || e is TaskCanceledException))
                        {
                            WriteToLogWithLevel(
                                ObserverName,
                                $"Unhandled exception in MonitorAsync: \n {e}",
                                LogLevel.Warning);
                        }

                        throw;
                    }
                }
                finally
                {
                    // Release the process handle before moving to the next replica/instance.
                    currentProcess?.Dispose();
                    currentProcess = null;
                }
            }
        }
        /// <summary>
        /// This function processes numeric data held in FRUD instances and generates Application or Node level Health Reports depending on supplied thresholds.
        /// </summary>
        /// <typeparam name="T">This represents the numeric type of data this function will operate on.</typeparam>
        /// <param name="data">FabricResourceUsageData instance.</param>
        /// <param name="thresholdError">Error threshold (numeric)</param>
        /// <param name="thresholdWarning">Warning threshold (numeric)</param>
        /// <param name="healthReportTtl">Health report Time to Live (TimeSpan)</param>
        /// <param name="healthReportType">HealthReport type. Note, only Application and Node health report types are supported.</param>
        /// <param name="replicaOrInstance">Replica or Instance information contained in a type.</param>
        /// <param name="dumpOnError">Whether or not to dump process if Error threshold has been reached.</param>
        public void ProcessResourceDataReportHealth <T>(
            FabricResourceUsageData <T> data,
            T thresholdError,
            T thresholdWarning,
            TimeSpan healthReportTtl,
            HealthReportType healthReportType = HealthReportType.Node,
            ReplicaOrInstanceMonitoringInfo replicaOrInstance = null,
            bool dumpOnError = false)
            where T : struct
        {
            if (data == null)
            {
                throw new ArgumentException("Supply all required parameters with non-null value.");
            }

            if (healthReportType != HealthReportType.Application && healthReportType != HealthReportType.Node)
            {
                this.ObserverLogger.LogWarning($"ProcessResourceDataReportHealth: Unsupported HealthReport type -> {Enum.GetName(typeof(HealthReportType), healthReportType)}");
                return;
            }

            var           thresholdName = "Minimum";
            bool          warningOrError = false;
            string        repPartitionId = null, repOrInstanceId = null, name = null, id = null, procName = null;
            T             threshold     = thresholdWarning;
            var           healthState   = HealthState.Ok;
            Uri           appName       = null;
            Uri           serviceName   = null;
            TelemetryData telemetryData = null;

            if (healthReportType == HealthReportType.Application)
            {
                if (replicaOrInstance != null)
                {
                    repPartitionId  = $"Partition: {replicaOrInstance.PartitionId}";
                    repOrInstanceId = $"Replica: {replicaOrInstance.ReplicaOrInstanceId}";

                    // Create a unique id which will be used for health Warnings and OKs (clears).
                    appName     = replicaOrInstance.ApplicationName;
                    serviceName = replicaOrInstance.ServiceName;
                    name        = appName.OriginalString.Replace("fabric:/", string.Empty);
                }
                else
                {
                    appName = new Uri("fabric:/System");
                    name    = data.Id;
                }

                id = name + "_" + data.Property.Replace(" ", string.Empty);

                // The health event description will be a serialized instance of telemetryData,
                // so it should be completely constructed (filled with data) regardless
                // of user telemetry settings.
                telemetryData = new TelemetryData(FabricClientInstance, Token)
                {
                    ApplicationName = appName?.OriginalString ?? string.Empty,
                    Code            = FOErrorWarningCodes.Ok,
                    HealthState     = Enum.GetName(typeof(HealthState), HealthState.Ok),
                    NodeName        = NodeName,
                    ObserverName    = ObserverName,
                    Metric          = data.Property,
                    Value           = Math.Round(data.AverageDataValue, 1),
                    PartitionId     = replicaOrInstance?.PartitionId.ToString(),
                    ReplicaId       = replicaOrInstance?.ReplicaOrInstanceId.ToString(),
                    ServiceName     = serviceName?.OriginalString ?? string.Empty,
                    Source          = ObserverConstants.FabricObserverName,
                };

                try
                {
                    if (replicaOrInstance != null && replicaOrInstance.HostProcessId > 0)
                    {
                        procName = Process.GetProcessById((int)replicaOrInstance.HostProcessId).ProcessName;
                    }
                    else
                    {
                        // The name of the target service process is always the id for data containers coming from FSO.
                        procName = data.Id;
                    }

                    telemetryData.ServiceName = procName;

                    if (IsTelemetryProviderEnabled && IsObserverTelemetryEnabled)
                    {
                        _ = TelemetryClient?.ReportMetricAsync(
                            telemetryData,
                            Token).ConfigureAwait(false);
                    }

                    if (IsEtwEnabled)
                    {
                        Logger.EtwLogger?.Write(
                            ObserverConstants.FabricObserverETWEventName,
                            new
                        {
                            ApplicationName = appName?.OriginalString ?? string.Empty,
                            Code            = FOErrorWarningCodes.Ok,
                            HealthState     = Enum.GetName(typeof(HealthState), HealthState.Ok),
                            NodeName,
                            ObserverName,
                            Metric      = data.Property,
                            Value       = Math.Round(data.AverageDataValue, 1),
                            PartitionId = replicaOrInstance?.PartitionId.ToString(),
                            ReplicaId   = replicaOrInstance?.ReplicaOrInstanceId.ToString(),
                            ServiceName = procName,
                            Source      = ObserverConstants.FabricObserverName,
                        });
                    }
                }
                catch (ArgumentException)
                {
                    return;
                }
                catch (InvalidOperationException)
                {
                    return;
                }
            }
            else
            {
                string drive = string.Empty;

                if (ObserverName == ObserverConstants.DiskObserverName)
                {
                    drive = $"{data.Id}: ";

                    if (RuntimeInformation.IsOSPlatform(OSPlatform.Windows))
                    {
                        drive = $"{data.Id.Remove(1, 2)}: ";
                    }
                }

                // The health event description will be a serialized instance of telemetryData,
                // so it should be completely constructed (filled with data) regardless
                // of user telemetry settings.
                telemetryData = new TelemetryData(FabricClientInstance, Token)
                {
                    Code         = FOErrorWarningCodes.Ok,
                    HealthState  = Enum.GetName(typeof(HealthState), HealthState.Ok),
                    NodeName     = NodeName,
                    ObserverName = ObserverName,
                    Metric       = $"{drive}{data.Property}",
                    Source       = ObserverConstants.FabricObserverName,
                    Value        = Math.Round(data.AverageDataValue, 1),
                };

                if (IsTelemetryProviderEnabled && IsObserverTelemetryEnabled)
                {
                    _ = TelemetryClient?.ReportMetricAsync(
                        telemetryData,
                        Token);
                }

                if (IsEtwEnabled)
                {
                    Logger.EtwLogger?.Write(
                        ObserverConstants.FabricObserverETWEventName,
                        new
                    {
                        Code        = FOErrorWarningCodes.Ok,
                        HealthState = Enum.GetName(typeof(HealthState), HealthState.Ok),
                        NodeName,
                        ObserverName,
                        Metric = $"{drive}{data.Property}",
                        Source = ObserverConstants.FabricObserverName,
                        Value  = Math.Round(data.AverageDataValue, 1),
                    });
                }
            }

            // Health Error
            if (data.IsUnhealthy(thresholdError))
            {
                thresholdName  = "Maximum";
                threshold      = thresholdError;
                warningOrError = true;
                healthState    = HealthState.Error;

                // This is primarily useful for AppObserver, but makes sense to be
                // part of the base class for future use, like for FSO.
                if (replicaOrInstance != null && dumpOnError)
                {
                    try
                    {
                        int procId = (int)replicaOrInstance.HostProcessId;

                        if (!this.serviceDumpCountDictionary.ContainsKey(procName))
                        {
                            this.serviceDumpCountDictionary.Add(procName, 0);
                        }

                        if (this.serviceDumpCountDictionary[procName] < this.maxDumps)
                        {
                            // DumpServiceProcess defaults to a Full dump with
                            // process memory, handles and thread data.
                            bool success = DumpServiceProcess(procId);

                            if (success)
                            {
                                this.serviceDumpCountDictionary[procName]++;
                            }
                        }
                    }

                    // Ignore these; it just means no dmp file will be created. This is not
                    // critical to FO. Log as info, not warning.
                    catch (Exception e) when(e is ArgumentException || e is InvalidOperationException)
                    {
                        ObserverLogger.LogInfo($"Unable to generate dmp file:{Environment.NewLine}{e}");
                    }
                }
            }

            // Health Warning
            if (!warningOrError && data.IsUnhealthy(thresholdWarning))
            {
                warningOrError = true;
                healthState    = HealthState.Warning;
            }

            if (warningOrError)
            {
                string errorWarningCode = null;

                switch (data.Property)
                {
                case ErrorWarningProperty.TotalCpuTime when healthReportType == HealthReportType.Application:
                    errorWarningCode = (healthState == HealthState.Error) ?
                                       FOErrorWarningCodes.AppErrorCpuPercent : FOErrorWarningCodes.AppWarningCpuPercent;
                    break;

                case ErrorWarningProperty.TotalCpuTime:
                    errorWarningCode = (healthState == HealthState.Error) ?
                                       FOErrorWarningCodes.NodeErrorCpuPercent : FOErrorWarningCodes.NodeWarningCpuPercent;
                    break;

                case ErrorWarningProperty.DiskSpaceUsagePercentage:
                    errorWarningCode = (healthState == HealthState.Error) ?
                                       FOErrorWarningCodes.NodeErrorDiskSpacePercent : FOErrorWarningCodes.NodeWarningDiskSpacePercent;
                    break;

                case ErrorWarningProperty.DiskSpaceUsageMb:
                    errorWarningCode = (healthState == HealthState.Error) ?
                                       FOErrorWarningCodes.NodeErrorDiskSpaceMB : FOErrorWarningCodes.NodeWarningDiskSpaceMB;
                    break;

                case ErrorWarningProperty.TotalMemoryConsumptionMb when healthReportType == HealthReportType.Application:
                    errorWarningCode = (healthState == HealthState.Error) ?
                                       FOErrorWarningCodes.AppErrorMemoryMB : FOErrorWarningCodes.AppWarningMemoryMB;
                    break;

                case ErrorWarningProperty.TotalMemoryConsumptionMb:
                    errorWarningCode = (healthState == HealthState.Error) ?
                                       FOErrorWarningCodes.NodeErrorMemoryMB : FOErrorWarningCodes.NodeWarningMemoryMB;
                    break;

                case ErrorWarningProperty.TotalMemoryConsumptionPct when replicaOrInstance != null:
                    errorWarningCode = (healthState == HealthState.Error) ?
                                       FOErrorWarningCodes.AppErrorMemoryPercent : FOErrorWarningCodes.AppWarningMemoryPercent;
                    break;

                case ErrorWarningProperty.TotalMemoryConsumptionPct:
                    errorWarningCode = (healthState == HealthState.Error) ?
                                       FOErrorWarningCodes.NodeErrorMemoryPercent : FOErrorWarningCodes.NodeWarningMemoryPercent;
                    break;

                case ErrorWarningProperty.DiskAverageQueueLength:
                    errorWarningCode = (healthState == HealthState.Error) ?
                                       FOErrorWarningCodes.NodeErrorDiskAverageQueueLength : FOErrorWarningCodes.NodeWarningDiskAverageQueueLength;
                    break;

                case ErrorWarningProperty.TotalActiveFirewallRules:
                    errorWarningCode = (healthState == HealthState.Error) ?
                                       FOErrorWarningCodes.ErrorTooManyFirewallRules : FOErrorWarningCodes.WarningTooManyFirewallRules;
                    break;

                case ErrorWarningProperty.TotalActivePorts when healthReportType == HealthReportType.Application:
                    errorWarningCode = (healthState == HealthState.Error) ?
                                       FOErrorWarningCodes.AppErrorTooManyActiveTcpPorts : FOErrorWarningCodes.AppWarningTooManyActiveTcpPorts;
                    break;

                case ErrorWarningProperty.TotalActivePorts:
                    errorWarningCode = (healthState == HealthState.Error) ?
                                       FOErrorWarningCodes.NodeErrorTooManyActiveTcpPorts : FOErrorWarningCodes.NodeWarningTooManyActiveTcpPorts;
                    break;

                case ErrorWarningProperty.TotalEphemeralPorts when healthReportType == HealthReportType.Application:
                    errorWarningCode = (healthState == HealthState.Error) ?
                                       FOErrorWarningCodes.AppErrorTooManyActiveEphemeralPorts : FOErrorWarningCodes.AppWarningTooManyActiveEphemeralPorts;
                    break;

                case ErrorWarningProperty.TotalEphemeralPorts:
                    errorWarningCode = (healthState == HealthState.Error) ?
                                       FOErrorWarningCodes.NodeErrorTooManyActiveEphemeralPorts : FOErrorWarningCodes.NodeWarningTooManyActiveEphemeralPorts;
                    break;
                }

                var healthMessage = new StringBuilder();

                string drive = string.Empty;

                if (ObserverName == ObserverConstants.DiskObserverName)
                {
                    drive = $"{data.Id}: ";

                    if (RuntimeInformation.IsOSPlatform(OSPlatform.Windows))
                    {
                        drive = $"{data.Id.Remove(1, 2)}: ";
                    }
                }

                _ = healthMessage.Append($"{drive}{data.Property} is at or above the specified {thresholdName} limit ({threshold}{data.Units})");
                _ = healthMessage.AppendLine($" - {data.Property}: {Math.Round(data.AverageDataValue)}{data.Units}");

                // The health event description will be a serialized instance of telemetryData,
                // so it should be completely constructed (filled with data) regardless
                // of user telemetry settings.
                telemetryData.ApplicationName = appName?.OriginalString ?? string.Empty;
                telemetryData.Code            = errorWarningCode;
                if (replicaOrInstance != null && !string.IsNullOrEmpty(replicaOrInstance.ContainerId))
                {
                    telemetryData.ContainerId = replicaOrInstance.ContainerId;
                }
                telemetryData.HealthState            = Enum.GetName(typeof(HealthState), healthState);
                telemetryData.HealthEventDescription = healthMessage.ToString();
                telemetryData.Metric      = $"{drive}{data.Property}";
                telemetryData.ServiceName = serviceName?.OriginalString ?? string.Empty;
                telemetryData.Source      = ObserverConstants.FabricObserverName;
                telemetryData.Value       = Math.Round(data.AverageDataValue, 1);

                // Send Health Report as Telemetry event (perhaps it signals an Alert from App Insights, for example.).
                if (IsTelemetryProviderEnabled && IsObserverTelemetryEnabled)
                {
                    _ = TelemetryClient?.ReportHealthAsync(
                        telemetryData,
                        Token);
                }

                // ETW.
                if (IsEtwEnabled)
                {
                    Logger.EtwLogger?.Write(
                        ObserverConstants.FabricObserverETWEventName,
                        new
                    {
                        ApplicationName        = appName?.OriginalString ?? string.Empty,
                        Code                   = errorWarningCode,
                        ContainerId            = replicaOrInstance != null ? replicaOrInstance.ContainerId ?? string.Empty : string.Empty,
                        HealthState            = Enum.GetName(typeof(HealthState), healthState),
                        HealthEventDescription = healthMessage.ToString(),
                        Metric                 = $"{drive}{data.Property}",
                        Node                   = NodeName,
                        ServiceName            = serviceName?.OriginalString ?? string.Empty,
                        Source                 = ObserverConstants.FabricObserverName,
                        Value                  = Math.Round(data.AverageDataValue, 1),
                    });
                }

                var healthReport = new HealthReport
                {
                    AppName                   = appName,
                    Code                      = errorWarningCode,
                    EmitLogEvent              = true,
                    HealthData                = telemetryData,
                    HealthMessage             = healthMessage.ToString(),
                    HealthReportTimeToLive    = healthReportTtl,
                    ReportType                = healthReportType,
                    State                     = healthState,
                    NodeName                  = NodeName,
                    Observer                  = ObserverName,
                    ResourceUsageDataProperty = data.Property,
                };

                if (!AppNames.Any(a => a == appName?.OriginalString))
                {
                    AppNames.Add(appName?.OriginalString);
                }

                // From FSO.
                if (replicaOrInstance == null && healthReportType == HealthReportType.Application)
                {
                    HealthReportProperties.Add(id);
                }
                else
                {
                    if (HealthReportProperties.Count == 0)
                    {
                        HealthReportProperties.Add(ObserverName switch
                        {
                            ObserverConstants.AppObserverName => "ApplicationHealth",
                            ObserverConstants.CertificateObserverName => "SecurityHealth",
                            ObserverConstants.DiskObserverName => "DiskHealth",
                            ObserverConstants.FabricSystemObserverName => "FabricSystemServiceHealth",
                            ObserverConstants.NetworkObserverName => "NetworkHealth",
                            ObserverConstants.OSObserverName => "MachineInformation",
                            ObserverConstants.NodeObserverName => "MachineResourceHealth",
                            _ => $"{data.Property}",
                        });
Example #10
0
        /// <inheritdoc/>
        /// <remarks>
        /// For every configured user endpoint whose target application is deployed on this node,
        /// walks the recorded connection states. A connection that is not connected produces an
        /// Application-level Warning health report (plus a telemetry event when telemetry is enabled).
        /// A connected entry that is still marked with Warning health produces an Ok health report.
        /// Entries that are connected are removed from the connection state list before returning.
        /// </remarks>
        /// <param name="token">Checked for cancellation before each endpoint and each connection state is processed.</param>
        /// <exception cref="OperationCanceledException">Thrown when <paramref name="token"/> has been cancelled.</exception>
        public override async Task ReportAsync(CancellationToken token)
        {
            var timeToLiveWarning = this.SetTimeToLiveWarning();

            // Report on connection state.
            for (int j = 0; j < this.userEndpoints.Count; j++)
            {
                token.ThrowIfCancellationRequested();

                // Resolve the target app once per endpoint; it is used for the deployment
                // query, the health reports and the telemetry event below.
                string appTarget = this.userEndpoints[j].AppTarget;
                var    appUri    = new Uri(appTarget);

                var deployedApps = await this.FabricClientInstance.QueryManager
                                   .GetDeployedApplicationListAsync(
                    this.NodeName,
                    appUri).ConfigureAwait(true);

                // We only care about deployed apps.
                if (deployedApps == null || deployedApps.Count < 1)
                {
                    continue;
                }

                for (int i = 0; i < this.connectionStatus.Count; i++)
                {
                    token.ThrowIfCancellationRequested();

                    var connStatus = this.connectionStatus[i];

                    if (!connStatus.Connected)
                    {
                        this.healthState = HealthState.Warning;
                        var healthMessage = $"Outbound Internet connection failure detected for endpoint {connStatus.HostName}\n";

                        HealthReport report = new HealthReport
                        {
                            AppName                   = appUri,
                            Code                      = FoErrorWarningCodes.AppWarningNetworkEndpointUnreachable,
                            EmitLogEvent              = true,
                            HealthMessage             = healthMessage,
                            HealthReportTimeToLive    = timeToLiveWarning,
                            State                     = this.healthState,
                            NodeName                  = this.NodeName,
                            Observer                  = this.ObserverName,
                            ReportType                = HealthReportType.Application,

                            // Interpolate the host name itself; previously the literal text
                            // "connStatus.HostName" was emitted for every endpoint.
                            ResourceUsageDataProperty = $"{ErrorWarningProperty.InternetConnectionFailure}: {connStatus.HostName}",
                        };

                        // Send health report Warning and log event locally.
                        this.HealthReporter.ReportHealthToServiceFabric(report);

                        // This means this observer created a Warning or Error SF Health Report
                        this.HasActiveFabricErrorOrWarning = true;

                        // Send Health Report as Telemetry (perhaps it signals an Alert from App Insights, for example.).
                        if (this.IsTelemetryEnabled)
                        {
                            _ = this.ObserverTelemetryClient?.ReportHealthAsync(
                                HealthScope.Application,
                                appTarget,
                                HealthState.Warning,
                                $"{this.NodeName}/{FoErrorWarningCodes.AppWarningNetworkEndpointUnreachable}: {healthMessage}",
                                this.ObserverName,
                                this.Token);
                        }
                    }
                    else if (connStatus.Health == HealthState.Warning)
                    {
                        this.healthState = HealthState.Ok;
                        var healthMessage = "Outbound Internet connection test successful.";

                        // Clear existing Health Warning.
                        HealthReport report = new HealthReport
                        {
                            AppName                = appUri,
                            EmitLogEvent           = true,
                            HealthMessage          = healthMessage,
                            HealthReportTimeToLive = default(TimeSpan),
                            State                  = this.healthState,
                            NodeName               = this.NodeName,
                            Observer               = this.ObserverName,
                            ReportType             = HealthReportType.Application,
                        };

                        this.HealthReporter.ReportHealthToServiceFabric(report);

                        // Reset health state.
                        this.HasActiveFabricErrorOrWarning = false;
                    }
                }
            }

            // Drop the entries that connected successfully and release the spare capacity.
            this.connectionStatus.RemoveAll(conn => conn.Connected);
            this.connectionStatus.TrimExcess();
        }
        public override Task ReportAsync(CancellationToken token)
        {
            try
            {
                this.Token.ThrowIfCancellationRequested();
                var healthReportTimeToLive = this.SetHealthReportTimeToLive();

                // App-specific reporting.
                foreach (var app in this.targetList)
                {
                    this.Token.ThrowIfCancellationRequested();

                    // Process data for reporting.
                    foreach (var repOrInst in this.ReplicaOrInstanceList)
                    {
                        this.Token.ThrowIfCancellationRequested();

                        if (!string.IsNullOrEmpty(app.TargetAppType) &&
                            !string.Equals(
                                repOrInst.ApplicationTypeName,
                                app.TargetAppType,
                                StringComparison.CurrentCultureIgnoreCase))
                        {
                            continue;
                        }

                        if (!string.IsNullOrEmpty(app.TargetApp) &&
                            !string.Equals(
                                repOrInst.ApplicationName.OriginalString,
                                app.TargetApp,
                                StringComparison.CurrentCultureIgnoreCase))
                        {
                            continue;
                        }

                        Process p;

                        try
                        {
                            p = Process.GetProcessById((int)repOrInst.HostProcessId);

                            // If the process is no longer running, then don't report on it.
                            if (p.HasExited)
                            {
                                continue;
                            }
                        }
                        catch (ArgumentException)
                        {
                            continue;
                        }
                        catch (InvalidOperationException)
                        {
                            continue;
                        }
                        catch (Win32Exception)
                        {
                            continue;
                        }

                        string appNameOrType = GetAppNameOrType(repOrInst);

                        var id = $"{appNameOrType}:{p.ProcessName}";

                        // Log (csv) CPU/Mem/DiskIO per app.
                        if (this.CsvFileLogger != null && this.CsvFileLogger.EnableCsvLogging)
                        {
                            this.LogAllAppResourceDataToCsv(id);
                        }
#if DEBUG
                        // DEBUG \\
                        if (id.Contains("CpuStress"))
                        {
                            // Emit an Ok Health Report for debug output.
                            var healthReport = new Utilities.HealthReport
                            {
                                AppName       = new Uri("fabric:/CpuStress"),
                                HealthMessage = $"{p.Id} CpuData Count: {this.allAppCpuData.FirstOrDefault(x => x.Id == id).Data.Count}\n" +
                                                $"Average: {this.allAppCpuData.FirstOrDefault(x => x.Id == id).AverageDataValue}",
                                State      = HealthState.Ok,
                                Code       = FoErrorWarningCodes.Ok,
                                NodeName   = this.NodeName,
                                Observer   = this.ObserverName,
                                Property   = id,
                                ReportType = HealthReportType.Application,
                            };

                            this.HealthReporter.ReportHealthToServiceFabric(healthReport);
                        }
#endif

                        // CPU
                        this.ProcessResourceDataReportHealth(
                            this.allAppCpuData.FirstOrDefault(x => x.Id == id),
                            app.CpuErrorLimitPercent,
                            app.CpuWarningLimitPercent,
                            healthReportTimeToLive,
                            HealthReportType.Application,
                            repOrInst,
                            app.DumpProcessOnError);

                        // Memory
                        this.ProcessResourceDataReportHealth(
                            this.allAppMemDataMb.FirstOrDefault(x => x.Id == id),
                            app.MemoryErrorLimitMb,
                            app.MemoryWarningLimitMb,
                            healthReportTimeToLive,
                            HealthReportType.Application,
                            repOrInst,
                            app.DumpProcessOnError);

                        this.ProcessResourceDataReportHealth(
                            this.allAppMemDataPercent.FirstOrDefault(x => x.Id == id),
                            app.MemoryErrorLimitPercent,
                            app.MemoryWarningLimitPercent,
                            healthReportTimeToLive,
                            HealthReportType.Application,
                            repOrInst,
                            app.DumpProcessOnError);

                        // Ports
                        this.ProcessResourceDataReportHealth(
                            this.allAppTotalActivePortsData.FirstOrDefault(x => x.Id == id),
                            app.NetworkErrorActivePorts,
                            app.NetworkWarningActivePorts,
                            healthReportTimeToLive,
                            HealthReportType.Application,
                            repOrInst);

                        // Ports
                        this.ProcessResourceDataReportHealth(
                            this.allAppEphemeralPortsData.FirstOrDefault(x => x.Id == id),
                            app.NetworkErrorEphemeralPorts,
                            app.NetworkWarningEphemeralPorts,
                            healthReportTimeToLive,
                            HealthReportType.Application,
                            repOrInst);
                    }
                }

                return(Task.CompletedTask);
            }
            catch (Exception e)
            {
                this.WriteToLogWithLevel(
                    this.ObserverName,
                    $"Unhandled exception in ReportAsync: \n{e}",
                    LogLevel.Error);

                throw;
            }
        }
        /// <summary>
        /// Publishes the text accumulated in <c>message</c> as an Ok, node-level health report and,
        /// when the corresponding sinks are enabled, as a telemetry health event and an ETW event.
        /// The message buffer is cleared before returning.
        /// </summary>
        /// <param name="token">Cancellation token supplied by the caller (not consulted by this method).</param>
        /// <returns>An already-completed task.</returns>
        public override Task ReportAsync(CancellationToken token)
        {
            // Snapshot the message once so every sink reports exactly the same text.
            string messageText = this.message.ToString();

            // Local log.
            ObserverLogger.LogInfo(messageText);

            /* Report to Fabric */

            // These values will be preserved across observer runs and are useful for clearing warnings
            // by reporting Ok health state health events with the same property and sourceid values
            // as the error/warning health events when FO is safely taken down (e.g., app is being uninstalled,
            // safe restart of fabric node it's running on, etc.).
            // Because they survive across runs, register each value only once; otherwise the
            // collections would gain a duplicate entry on every call.
            const string propertyName = "SomePropertyName";
            string sourceId = $"{ObserverName}_SomethingUniqueToThisReport";

            if (!HealthReportProperties.Contains(propertyName))
            {
                HealthReportProperties.Add(propertyName);
            }

            if (!HealthReportSourceIds.Contains(sourceId))
            {
                HealthReportSourceIds.Add(sourceId);
            }

            var healthReporter = new ObserverHealthReporter(ObserverLogger, FabricClientInstance);
            var healthReport   = new Utilities.HealthReport
            {
                Code          = FOErrorWarningCodes.Ok,
                HealthMessage = messageText,
                NodeName      = NodeName,
                Observer      = ObserverName,
                Property      = propertyName,
                ReportType    = HealthReportType.Node,
                State         = HealthState.Ok,
            };

            healthReporter.ReportHealthToServiceFabric(healthReport);

            // Emit Telemetry - This will use whatever telemetry provider you have configured in FabricObserver Settings.xml.
            // The payload is only built when it will actually be sent.
            if (IsTelemetryProviderEnabled && IsObserverTelemetryEnabled)
            {
                var telemetryData = new TelemetryData(FabricClientInstance, Token)
                {
                    Code = FOErrorWarningCodes.Ok,
                    HealthEventDescription = messageText,
                    HealthState            = "Ok",
                    NodeName     = NodeName,
                    ObserverName = ObserverName,
                    Source       = ObserverConstants.FabricObserverName,
                };

                // Fire-and-forget: the returned task is intentionally not awaited.
                _ = TelemetryClient?.ReportHealthAsync(
                    telemetryData,
                    Token);
            }

            // ETW.
            if (IsEtwEnabled)
            {
                Logger.EtwLogger?.Write(
                    ObserverConstants.FabricObserverETWEventName,
                    new
                    {
                        Code = FOErrorWarningCodes.Ok,
                        HealthEventDescription = messageText,
                        HealthState            = "Ok",
                        NodeName,
                        ObserverName,
                        Source = ObserverConstants.FabricObserverName,
                    });
            }

            this.message.Clear();

            return Task.CompletedTask;
        }
Example #13
0
        /// <summary>
        /// Emits a node-level health report summarizing the certificate warnings gathered by the
        /// preceding observation run: Ok when all three warning collections are empty, otherwise a
        /// Warning whose message lists expired, not-found and expiring entries (in that order).
        /// Warning reports are also forwarded to telemetry and ETW when those sinks are enabled.
        /// </summary>
        /// <param name="token">Cancellation token; checked once on entry.</param>
        /// <returns>An already-completed task.</returns>
        public override Task ReportAsync(CancellationToken token)
        {
            token.ThrowIfCancellationRequested();

            // Someone calling without observing first: all three collections must have been
            // populated by a fresh ObserveAsync run before there is anything to report.
            bool hasObserved = ExpiringWarnings != null &&
                               ExpiredWarnings != null &&
                               NotFoundWarnings != null;

            if (!hasObserved)
            {
                return Task.CompletedTask;
            }

            bool hasWarnings = ExpiringWarnings.Count != 0 ||
                               ExpiredWarnings.Count != 0 ||
                               NotFoundWarnings.Count != 0;

            HealthReport healthReport;

            if (hasWarnings)
            {
                // Every entry is prefixed with a newline; the expired and not-found sections are
                // additionally terminated by one so the sections stay visually separated.
                string expiredSection = string.Empty;

                if (ExpiredWarnings.Count != 0)
                {
                    expiredSection = string.Concat(ExpiredWarnings.Select(warning => "\n" + warning)) + "\n";
                }

                string notFoundSection = string.Empty;

                if (NotFoundWarnings.Count != 0)
                {
                    notFoundSection = string.Concat(NotFoundWarnings.Select(warning => "\n" + warning)) + "\n";
                }

                string expiringSection = string.Empty;

                if (ExpiringWarnings.Count != 0)
                {
                    expiringSection = string.Concat(ExpiringWarnings.Select(warning => "\n" + warning));
                }

                string healthMessage = expiredSection + notFoundSection + expiringSection;

                healthReport = new HealthReport
                {
                    Code                   = FOErrorWarningCodes.WarningCertificateExpiration,
                    Observer               = ObserverName,
                    ReportType             = HealthReportType.Node,
                    EmitLogEvent           = true,
                    NodeName               = NodeName,
                    HealthMessage          = healthMessage,
                    State                  = HealthState.Warning,
                    HealthReportTimeToLive = RunInterval > TimeSpan.MinValue ? RunInterval : HealthReportTimeToLive,
                };

                HasActiveFabricErrorOrWarning = true;

                if (IsTelemetryProviderEnabled && IsObserverTelemetryEnabled)
                {
                    var telemetryData = new TelemetryData(FabricClientInstance, token)
                    {
                        Code                   = FOErrorWarningCodes.WarningCertificateExpiration,
                        HealthState            = "Warning",
                        NodeName               = NodeName,
                        Metric                 = ErrorWarningProperty.CertificateExpiration,
                        HealthEventDescription = healthMessage,
                        ObserverName           = ObserverName,
                        OS     = RuntimeInformation.IsOSPlatform(OSPlatform.Windows) ? "Windows" : "Linux",
                        Source = ObserverConstants.FabricObserverName,
                        Value  = FOErrorWarningCodes.GetErrorWarningNameFromFOCode(FOErrorWarningCodes.WarningCertificateExpiration),
                    };

                    // Fire-and-forget: the returned task is intentionally not awaited.
                    _ = TelemetryClient?.ReportMetricAsync(telemetryData, Token);
                }

                if (IsEtwEnabled)
                {
                    Logger.EtwLogger?.Write(
                        ObserverConstants.FabricObserverETWEventName,
                        new
                        {
                            Code        = FOErrorWarningCodes.WarningCertificateExpiration,
                            HealthState = "Warning",
                            NodeName,
                            Metric = ErrorWarningProperty.CertificateExpiration,
                            HealthEventDescription = healthMessage,
                            ObserverName,
                            OS     = RuntimeInformation.IsOSPlatform(OSPlatform.Windows) ? "Windows" : "Linux",
                            Source = ObserverConstants.FabricObserverName,
                            Value  = FOErrorWarningCodes.GetErrorWarningNameFromFOCode(FOErrorWarningCodes.WarningCertificateExpiration),
                        });
                }
            }
            else
            {
                healthReport = new HealthReport
                {
                    Observer               = ObserverName,
                    ReportType             = HealthReportType.Node,
                    EmitLogEvent           = true,
                    NodeName               = NodeName,
                    HealthMessage          = "All cluster and monitored app certificates are healthy.",
                    State                  = HealthState.Ok,
                    HealthReportTimeToLive = RunInterval > TimeSpan.MinValue ? RunInterval : HealthReportTimeToLive,
                };

                HasActiveFabricErrorOrWarning = false;
            }

            HealthReporter.ReportHealthToServiceFabric(healthReport);

            return Task.CompletedTask;
        }
        /// <summary>
        /// Compares a resource usage data set against its error and warning thresholds and emits the
        /// matching Service Fabric health report: Error when <paramref name="thresholdError"/> is
        /// breached, Warning when only <paramref name="thresholdWarning"/> is breached, or an Ok
        /// report that clears a previously raised error/warning. Raw metric values and health events
        /// are also forwarded to telemetry and ETW when those sinks are enabled. On normal completion
        /// the collected data points are discarded.
        /// </summary>
        /// <typeparam name="T">Type of the threshold values, as accepted by <c>data.IsUnhealthy</c>.</typeparam>
        /// <param name="data">Resource usage data to evaluate. Must not be null.</param>
        /// <param name="thresholdError">Threshold that, when breached, produces an Error report.</param>
        /// <param name="thresholdWarning">Threshold that, when breached, produces a Warning report.</param>
        /// <param name="healthReportTtl">Time-to-live applied to Error/Warning reports.</param>
        /// <param name="healthReportType">Report type (node or application) placed on the health report.</param>
        /// <param name="replicaOrInstance">
        /// Replica/instance the data belongs to, or null for node-level data. When supplied, the
        /// application-level error codes are used and the host process name is included in the message.
        /// </param>
        /// <param name="dumpOnError">
        /// When true (and <paramref name="replicaOrInstance"/> is supplied), a process dump is attempted
        /// on Error, limited to <c>maxDumps</c> per process name.
        /// </param>
        /// <exception cref="ArgumentNullException"><paramref name="data"/> is null.</exception>
        internal void ProcessResourceDataReportHealth<T>(
            FabricResourceUsageData<T> data,
            T thresholdError,
            T thresholdWarning,
            TimeSpan healthReportTtl,
            HealthReportType healthReportType = HealthReportType.Node,
            ReplicaOrInstanceMonitoringInfo replicaOrInstance = null,
            bool dumpOnError = false)
        {
            if (data == null)
            {
                // ArgumentNullException derives from ArgumentException, so existing handlers still apply.
                throw new ArgumentNullException(nameof(data), "Supply all required parameters with non-null value.");
            }

            var    thresholdName = "Minimum";
            bool   warningOrError = false;
            string repPartitionId = null, repOrInstanceId = null, name = null, id = null, procName = null;
            T      threshold   = thresholdWarning;
            var    healthState = HealthState.Ok;
            Uri    appName     = null;
            bool   isAppLevel  = replicaOrInstance != null;

            if (isAppLevel)
            {
                repPartitionId  = $"Partition: {replicaOrInstance.PartitionId}";
                repOrInstanceId = $"Replica: {replicaOrInstance.ReplicaOrInstanceId}";

                // Create a unique id which may be used in the case of warnings or OK clears.
                appName = replicaOrInstance.ApplicationName;
                name    = appName.OriginalString.Replace("fabric:/", string.Empty);
                id      = name + "_" + data.Property.Replace(" ", string.Empty);

                // Telemetry.
                if (this.IsTelemetryEnabled)
                {
                    this.ObserverTelemetryClient?.ReportMetricAsync(
                        $"{this.NodeName}-{name}-{data.Id}-{data.Property}",
                        data.AverageDataValue,
                        this.Token);
                }

                try
                {
                    // Process owns an OS handle; release it as soon as the name has been read
                    // instead of leaving it for the finalizer.
                    using (Process hostProcess = Process.GetProcessById((int)replicaOrInstance.HostProcessId))
                    {
                        procName = hostProcess.ProcessName;
                    }
                }
                catch (Exception e) when (e is ArgumentException || e is InvalidOperationException)
                {
                    // The host process could not be resolved: nothing to report for it.
                    // Note that the data set is left untouched (not cleared) on this path.
                    return;
                }
            }
            else
            {
                // Telemetry.
                if (this.IsTelemetryEnabled)
                {
                    this.ObserverTelemetryClient?.ReportMetricAsync(
                        $"{this.NodeName}-{data.Id}-{data.Property}",
                        data.AverageDataValue,
                        this.Token);
                }
            }

            // ETW.
            if (this.IsEtwEnabled)
            {
                Logger.EtwLogger?.Write(
                    "FabricObserverDataEvent",
                    new
                    {
                        Level    = 0,  // Info
                        Node     = this.NodeName,
                        Observer = this.ObserverName,
                        data.Property,
                        data.Id,
                        Value = $"{Math.Round(data.AverageDataValue)}",
                        Unit  = data.Units,
                    });
            }

            // Health Error
            if (data.IsUnhealthy(thresholdError))
            {
                thresholdName  = "Maximum";
                threshold      = thresholdError;
                warningOrError = true;
                healthState    = HealthState.Error;

                // This is primarily useful for AppObserver, but makes sense to be
                // part of the base class for future use, like for FSO.
                if (isAppLevel && dumpOnError)
                {
                    try
                    {
                        int procId = (int)replicaOrInstance.HostProcessId;

                        // Single lookup: register the process with a zero count the first time it is seen.
                        if (!this.serviceDumpCountDictionary.TryGetValue(procName, out var dumpCount))
                        {
                            this.serviceDumpCountDictionary.Add(procName, 0);
                        }

                        if (dumpCount < this.maxDumps)
                        {
                            // DumpServiceProcess defaults to a Full dump with
                            // process memory, handles and thread data.
                            bool success = this.DumpServiceProcess(procId);

                            if (success)
                            {
                                this.serviceDumpCountDictionary[procName] = dumpCount + 1;
                            }
                        }
                    }

                    // Ignore these, it just means no dmp will be created. This is not
                    // critical to FO. Log as info, not warning.
                    catch (Exception e) when (e is ArgumentException || e is InvalidOperationException)
                    {
                        this.ObserverLogger.LogInfo($"Unable to generate dmp file:{Environment.NewLine}{e}");
                    }
                }
            }

            // Health Warning
            if (!warningOrError && data.IsUnhealthy(thresholdWarning))
            {
                warningOrError = true;
                healthState    = HealthState.Warning;
            }

            if (warningOrError)
            {
                bool   isError          = healthState == HealthState.Error;
                string errorWarningKind = null;

                // Map the measured property to its error/warning code. Properties that exist at both
                // scopes use the App* codes when a replica/instance was supplied, Node* codes otherwise.
                switch (data.Property)
                {
                    case ErrorWarningProperty.TotalCpuTime when isAppLevel:
                        errorWarningKind = isError ?
                                           FoErrorWarningCodes.AppErrorCpuTime : FoErrorWarningCodes.AppWarningCpuTime;
                        break;

                    case ErrorWarningProperty.TotalCpuTime:
                        errorWarningKind = isError ?
                                           FoErrorWarningCodes.NodeErrorCpuTime : FoErrorWarningCodes.NodeWarningCpuTime;
                        break;

                    case ErrorWarningProperty.DiskSpaceUsagePercentage:
                        errorWarningKind = isError ?
                                           FoErrorWarningCodes.NodeErrorDiskSpacePercentUsed : FoErrorWarningCodes.NodeWarningDiskSpacePercentUsed;
                        break;

                    case ErrorWarningProperty.DiskSpaceUsageMb:
                        errorWarningKind = isError ?
                                           FoErrorWarningCodes.NodeErrorDiskSpaceMb : FoErrorWarningCodes.NodeWarningDiskSpaceMb;
                        break;

                    case ErrorWarningProperty.TotalMemoryConsumptionMb when isAppLevel:
                        errorWarningKind = isError ?
                                           FoErrorWarningCodes.AppErrorMemoryCommittedMb : FoErrorWarningCodes.AppWarningMemoryCommittedMb;
                        break;

                    case ErrorWarningProperty.TotalMemoryConsumptionMb:
                        errorWarningKind = isError ?
                                           FoErrorWarningCodes.NodeErrorMemoryCommittedMb : FoErrorWarningCodes.NodeWarningMemoryCommittedMb;
                        break;

                    case ErrorWarningProperty.TotalMemoryConsumptionPct when isAppLevel:
                        errorWarningKind = isError ?
                                           FoErrorWarningCodes.AppErrorMemoryPercentUsed : FoErrorWarningCodes.AppWarningMemoryPercentUsed;
                        break;

                    case ErrorWarningProperty.TotalMemoryConsumptionPct:
                        errorWarningKind = isError ?
                                           FoErrorWarningCodes.NodeErrorMemoryPercentUsed : FoErrorWarningCodes.NodeWarningMemoryPercentUsed;
                        break;

                    case ErrorWarningProperty.DiskAverageQueueLength:
                        errorWarningKind = isError ?
                                           FoErrorWarningCodes.NodeErrorDiskAverageQueueLength : FoErrorWarningCodes.NodeWarningDiskAverageQueueLength;
                        break;

                    case ErrorWarningProperty.TotalActiveFirewallRules:
                        errorWarningKind = isError ?
                                           FoErrorWarningCodes.ErrorTooManyFirewallRules : FoErrorWarningCodes.WarningTooManyFirewallRules;
                        break;

                    case ErrorWarningProperty.TotalActivePorts when isAppLevel:
                        errorWarningKind = isError ?
                                           FoErrorWarningCodes.AppErrorTooManyActiveTcpPorts : FoErrorWarningCodes.AppWarningTooManyActiveTcpPorts;
                        break;

                    case ErrorWarningProperty.TotalActivePorts:
                        errorWarningKind = isError ?
                                           FoErrorWarningCodes.NodeErrorTooManyActiveTcpPorts : FoErrorWarningCodes.NodeWarningTooManyActiveTcpPorts;
                        break;

                    case ErrorWarningProperty.TotalEphemeralPorts when isAppLevel:
                        errorWarningKind = isError ?
                                           FoErrorWarningCodes.AppErrorTooManyActiveEphemeralPorts : FoErrorWarningCodes.AppWarningTooManyActiveEphemeralPorts;
                        break;

                    case ErrorWarningProperty.TotalEphemeralPorts:
                        errorWarningKind = isError ?
                                           FoErrorWarningCodes.NodeErrorTooManyActiveEphemeralPorts : FoErrorWarningCodes.NodeWarningTooManyActiveEphemeralPorts;
                        break;
                }

                // Rounded once and reused by the message, telemetry and ETW payloads below.
                var roundedAverage = Math.Round(data.AverageDataValue);
                var healthMessage  = new StringBuilder();

                if (name != null)
                {
                    healthMessage.Append($"{name} (Service Process: {procName}, {repPartitionId}, {repOrInstanceId}): ");
                }

                string drive = string.Empty;

                // For disk-related properties the data id is used as a message prefix.
                if (data.Property.Contains("Disk"))
                {
                    drive = $"{data.Id}: ";
                }

                healthMessage.Append($"{drive}{data.Property} is at or above the specified {thresholdName} limit ({threshold}{data.Units})");
                healthMessage.AppendLine($" - Average {data.Property}: {roundedAverage}{data.Units}");

                string healthMessageText = healthMessage.ToString();

                var healthReport = new HealthReport
                {
                    AppName                   = appName,
                    Code                      = errorWarningKind,
                    EmitLogEvent              = true,
                    HealthMessage             = healthMessageText,
                    HealthReportTimeToLive    = healthReportTtl,
                    ReportType                = healthReportType,
                    State                     = healthState,
                    NodeName                  = this.NodeName,
                    Observer                  = this.ObserverName,
                    ResourceUsageDataProperty = data.Property,
                };

                // Emit a Fabric Health Report and optionally a local log write.
                this.HealthReporter.ReportHealthToServiceFabric(healthReport);

                // Set internal fabric health states.
                data.ActiveErrorOrWarning = true;

                // This means this observer created a Warning or Error SF Health Report
                this.HasActiveFabricErrorOrWarning = true;

                // Send Health Report as Telemetry event (perhaps it signals an Alert from App Insights, for example.).
                if (this.IsTelemetryEnabled)
                {
                    this.ObserverTelemetryClient?.ReportHealthAsync(
                        !string.IsNullOrEmpty(id) ? HealthScope.Application : HealthScope.Node,
                        $"{(appName != null ? appName.OriginalString : this.NodeName)}",
                        healthState,
                        $"{this.NodeName}/{errorWarningKind}/{drive}{data.Property}/{roundedAverage}",
                        this.ObserverName,
                        this.Token);
                }

                // ETW.
                if (this.IsEtwEnabled)
                {
                    Logger.EtwLogger?.Write(
                        "FabricObserverDataEvent",
                        new
                        {
                            Level                  = (healthState == HealthState.Warning) ? 1 : 2,
                            Node                   = this.NodeName,
                            Observer               = this.ObserverName,
                            HealthEventErrorCode   = errorWarningKind,
                            HealthEventDescription = healthMessageText,
                            data.Property,
                            data.Id,
                            Value = $"{roundedAverage}",
                            Unit  = data.Units,
                        });
                }
            }
            else if (data.ActiveErrorOrWarning)
            {
                // NOTE(review): this Ok report's Observer value carries a "(data.Id)" suffix while the
                // Error/Warning report above uses the bare observer name - confirm the reporter maps
                // both to the same health event so the Ok report actually clears it.
                var report = new HealthReport
                {
                    AppName                = appName,
                    EmitLogEvent           = true,
                    HealthMessage          = $"{data.Property} is now within normal/expected range.",
                    HealthReportTimeToLive = default(TimeSpan),
                    ReportType             = healthReportType,
                    State    = HealthState.Ok,
                    NodeName = this.NodeName,
                    Observer = $"{this.ObserverName}({data.Id})",
                    ResourceUsageDataProperty = data.Property,
                };

                // Emit an Ok Health Report to clear Fabric Health warning.
                this.HealthReporter.ReportHealthToServiceFabric(report);

                // Reset health states.
                data.ActiveErrorOrWarning          = false;
                this.HasActiveFabricErrorOrWarning = false;
            }

            // No need to keep data in memory.
            data.Data.Clear();
            data.Data.TrimExcess();
        }
        /// <summary>
        /// Reports operating system health and information for the node:
        /// an Error report when the OS status is anything other than "OK", an Ok report that clears a
        /// previously raised error once the status is "OK" again, an informational Ok report carrying
        /// the OS report text and, on Windows, a Warning when Windows Update automatic download is
        /// enabled. When the observer web app is deployed, the OS report is also written to SysInfo.txt.
        /// </summary>
        /// <param name="token">Cancellation token; checked once on entry.</param>
        /// <returns>An already-completed task.</returns>
        /// <exception cref="OperationCanceledException">
        /// <paramref name="token"/> was cancelled. Cancellation propagates to the caller without
        /// being reported as an observer error.
        /// </exception>
        public override Task ReportAsync(CancellationToken token)
        {
            try
            {
                token.ThrowIfCancellationRequested();

                bool osIsOk = string.Equals(this.osStatus, "OK", StringComparison.OrdinalIgnoreCase);

                // OS Health.
                if (this.osStatus != null && !osIsOk)
                {
                    string healthMessage = $"OS reporting unhealthy: {this.osStatus}";
                    var    healthReport  = new HealthReport
                    {
                        Observer               = ObserverName,
                        NodeName               = NodeName,
                        HealthMessage          = healthMessage,
                        State                  = HealthState.Error,
                        HealthReportTimeToLive = SetHealthReportTimeToLive(),
                    };

                    HealthReporter.ReportHealthToServiceFabric(healthReport);

                    // This means this observer created a Warning or Error SF Health Report
                    HasActiveFabricErrorOrWarning = true;

                    // Send Health Report as Telemetry (perhaps it signals an Alert from App Insights, for example.).
                    if (IsTelemetryProviderEnabled && IsObserverTelemetryEnabled)
                    {
                        _ = TelemetryClient?.ReportHealthAsync(
                            HealthScope.Application,
                            FabricRuntime.GetActivationContext().ApplicationName,
                            HealthState.Error,
                            $"{NodeName} - OS reporting unhealthy: {this.osStatus}",
                            ObserverName,
                            Token);
                    }
                }
                else if (HasActiveFabricErrorOrWarning && osIsOk)
                {
                    // Clear Error or Warning with an OK Health Report.
                    string healthMessage = $"OS reporting healthy: {this.osStatus}";

                    var healthReport = new HealthReport
                    {
                        Observer               = ObserverName,
                        NodeName               = NodeName,
                        HealthMessage          = healthMessage,
                        State                  = HealthState.Ok,
                        HealthReportTimeToLive = default(TimeSpan),
                    };

                    HealthReporter.ReportHealthToServiceFabric(healthReport);

                    // Reset internal health state.
                    HasActiveFabricErrorOrWarning = false;
                }

                if (ObserverManager.ObserverWebAppDeployed)
                {
                    var logPath = Path.Combine(ObserverLogger.LogFolderBasePath, "SysInfo.txt");

                    // Format with the invariant culture so the timestamp layout does not depend on
                    // the culture of the thread/host the observer happens to run under.
                    string lastUpdated = DateTime.UtcNow.ToString(
                        "M/d/yyyy HH:mm:ss",
                        System.Globalization.CultureInfo.InvariantCulture);

                    // This file is used by the web application (log reader.).
                    if (!ObserverLogger.TryWriteLogFile(logPath, $"Last updated on {lastUpdated} UTC<br/>{this.osReport}"))
                    {
                        HealthReporter.ReportFabricObserverServiceHealth(
                            FabricServiceContext.ServiceName.OriginalString,
                            ObserverName,
                            HealthState.Warning,
                            "Unable to create SysInfo.txt file.");
                    }
                }

                var report = new HealthReport
                {
                    Observer               = ObserverName,
                    HealthMessage          = this.osReport,
                    State                  = HealthState.Ok,
                    NodeName               = NodeName,
                    HealthReportTimeToLive = SetHealthReportTimeToLive(),
                };

                HealthReporter.ReportHealthToServiceFabric(report);

                // Evaluated once; every Windows-only step below shares this result.
                bool isWindows = RuntimeInformation.IsOSPlatform(OSPlatform.Windows);

                // Windows Update automatic download enabled?
                if (isWindows && this.isWindowsUpdateAutoDownloadEnabled)
                {
                    string linkText =
                        $"{Environment.NewLine}For clusters of Silver durability or above, " +
                        $"please consider <a href=\"https://docs.microsoft.com/azure/virtual-machine-scale-sets/virtual-machine-scale-sets-automatic-upgrade\" target=\"blank\">" +
                        $"enabling VMSS automatic OS image upgrades</a> to prevent unexpected VM reboots. " +
                        $"For Bronze durability clusters, please consider deploying the " +
                        $"<a href=\"https://docs.microsoft.com/azure/service-fabric/service-fabric-patch-orchestration-application\" target=\"blank\">Patch Orchestration Service</a>.";

                    string auServiceEnabledMessage = $"Windows Update Automatic Download is enabled.{linkText}";

                    report = new HealthReport
                    {
                        Observer               = ObserverName,
                        Property               = "OSConfiguration",
                        HealthMessage          = auServiceEnabledMessage,
                        State                  = HealthState.Warning,
                        NodeName               = NodeName,
                        HealthReportTimeToLive = SetHealthReportTimeToLive(),
                    };

                    HealthReporter.ReportHealthToServiceFabric(report);

                    if (IsTelemetryProviderEnabled && IsObserverTelemetryEnabled)
                    {
                        // Send Health Report as Telemetry (perhaps it signals an Alert from App Insights, for example.).
                        var telemetryData = new TelemetryData(FabricClientInstance, token)
                        {
                            HealthEventDescription = auServiceEnabledMessage,
                            HealthState            = "Warning",
                            Metric       = "WUAutoDownloadEnabled",
                            Value        = this.isWindowsUpdateAutoDownloadEnabled,
                            NodeName     = NodeName,
                            ObserverName = ObserverName,
                            Source       = ObserverConstants.FabricObserverName,
                        };

                        _ = TelemetryClient?.ReportMetricAsync(
                            telemetryData,
                            Token);
                    }

                    // ETW.
                    if (IsEtwEnabled)
                    {
                        Logger.EtwLogger?.Write(
                            ObserverConstants.FabricObserverETWEventName,
                            new
                            {
                                HealthState            = "Warning",
                                HealthEventDescription = auServiceEnabledMessage,
                                ObserverName,
                                Metric = "WUAutoDownloadEnabled",
                                Value  = this.isWindowsUpdateAutoDownloadEnabled,
                                NodeName,
                            });
                    }
                }

                if (isWindows)
                {
                    // reset au globals for fresh detection during next observer run.
                    this.isWindowsUpdateAutoDownloadEnabled = false;
                    this.auStateUnknown       = false;
                    this.isWUADSettingEnabled = false;
                }

                return Task.CompletedTask;
            }
            catch (Exception e) when (!(e is OperationCanceledException))
            {
                // Cancellation is an expected shutdown signal, not a fault, so it is excluded by the
                // filter above and propagates without raising an Error health event.
                HealthReporter.ReportFabricObserverServiceHealth(
                    FabricServiceContext.ServiceName.OriginalString,
                    ObserverName,
                    HealthState.Error,
                    $"Unhandled exception processing OS information:{Environment.NewLine}{e}");

                throw;
            }
        }
        /// <summary>
        /// Converts the supplied observer health report into a Service Fabric health event and
        /// submits it through the fabric client's HealthManager. The event is reported against the
        /// application when the report type is Application and an application name is present;
        /// otherwise it is reported against the node.
        /// </summary>
        /// <param name="healthReport">The report to publish. A null report is ignored.</param>
        public void ReportHealthToServiceFabric(HealthReport healthReport)
        {
            if (healthReport == null)
            {
                return;
            }

            var state = healthReport.State;

            // Only Ok reports (which clear existing warning/error states) are sent immediately.
            // Errors and warnings are left to the regular send cycle to avoid adding unnecessary
            // stress to the Health subsystem.
            var sendOptions = new HealthReportSendOptions
            {
                Immediate = state == HealthState.Ok,
            };

            // A report without an explicit TTL lives for five minutes.
            TimeSpan timeToLive = healthReport.HealthReportTimeToLive != default
                ? healthReport.HealthReportTimeToLive
                : TimeSpan.FromMinutes(5);

            // Health event property: caller supplied, or a default derived from the observer name.
            string property = healthReport.Property;

            if (string.IsNullOrEmpty(property))
            {
                property = GetDefaultHealthProperty(healthReport.Observer);
            }

            // Health event source: the observer name, suffixed with the FO error/warning code when one is set.
            string sourceId = string.IsNullOrEmpty(healthReport.Code)
                ? healthReport.Observer
                : $"{healthReport.Observer}({healthReport.Code})";

            // Error/Warning descriptions get a preamble that names the reporting observer.
            string preamble = string.Empty;

            if (state == HealthState.Error || state == HealthState.Warning)
            {
                // OSObserver does not monitor resources and therefore does not support related
                // usage threshold configuration, so its OS configuration reports get different wording.
                bool isOsConfigurationReport =
                    healthReport.Observer == ObserverConstants.OsObserverName &&
                    property == "OSConfiguration";

                preamble = isOsConfigurationReport
                    ? $"{ObserverConstants.OsObserverName} detected potential problem with OS configuration: "
                    : $"{healthReport.Observer} detected {Enum.GetName(typeof(HealthState), state)} threshold breach. ";
            }

            // When structured health data is attached, its JSON form replaces the plain text description.
            TelemetryData healthData = healthReport.HealthData;
            string description = healthData != null
                ? JsonConvert.SerializeObject(healthData)
                : preamble + healthReport.HealthMessage;

            var healthInformation = new HealthInformation(sourceId, property, state)
            {
                Description       = description,
                TimeToLive        = timeToLive,
                RemoveWhenExpired = true,
            };

            // Log event only if ObserverWebApi (REST API Log reader service) app is deployed.
            if (ObserverManager.ObserverWebAppDeployed && healthReport.EmitLogEvent)
            {
                string logFormat = healthReport.NodeName + ": {0}";

                switch (state)
                {
                    case HealthState.Error:
                        this.logger.LogError(logFormat, healthInformation.Description);
                        break;

                    case HealthState.Warning:
                        this.logger.LogWarning(logFormat, healthInformation.Description);
                        break;

                    default:
                        this.logger.LogInfo(logFormat, healthInformation.Description);
                        break;
                }
            }

            // To SFX.
            bool reportAgainstApp =
                healthReport.ReportType == HealthReportType.Application &&
                healthReport.AppName != null;

            if (!reportAgainstApp)
            {
                var nodeHealthReport = new NodeHealthReport(healthReport.NodeName, healthInformation);
                this.fabricClient.HealthManager.ReportHealth(nodeHealthReport, sendOptions);

                return;
            }

            var appHealthReport = new ApplicationHealthReport(healthReport.AppName, healthInformation);
            this.fabricClient.HealthManager.ReportHealth(appHealthReport, sendOptions);
        }

        /// <summary>
        /// Maps an observer name to the health event property used when a report does not specify one.
        /// </summary>
        /// <param name="observerName">Name of the reporting observer; may be null.</param>
        /// <returns>The property for a known observer, or "FOGenericHealth" for any other value.</returns>
        private static string GetDefaultHealthProperty(string observerName)
        {
            switch (observerName)
            {
                case ObserverConstants.AppObserverName:
                    return "ApplicationHealth";

                case ObserverConstants.CertificateObserverName:
                    return "SecurityHealth";

                case ObserverConstants.DiskObserverName:
                    return "DiskHealth";

                case ObserverConstants.FabricSystemObserverName:
                    return "FabricSystemServiceHealth";

                case ObserverConstants.NetworkObserverName:
                    return "NetworkHealth";

                case ObserverConstants.OsObserverName:
                    return "MachineInformation";

                case ObserverConstants.NodeObserverName:
                    return "MachineResourceHealth";

                default:
                    return "FOGenericHealth";
            }
        }
        /// <summary>
        /// Emits telemetry/ETW for the supplied resource usage data, evaluates the data against the
        /// Error and Warning thresholds, and then either creates a Warning/Error Service Fabric health
        /// report or, when the data previously had an active Warning/Error, an Ok report that clears it.
        /// The values held by the data container are cleared before the method returns normally.
        /// </summary>
        /// <remarks>
        /// For Application reports, an <see cref="ArgumentException"/> or
        /// <see cref="InvalidOperationException"/> raised while resolving the host process name or
        /// emitting the initial telemetry/ETW makes the method return early: no thresholds are
        /// evaluated and the data container is not cleared.
        /// </remarks>
        /// <typeparam name="T">Value type of the collected samples; must be convertible with <see cref="Convert.ToDouble(object)"/>.</typeparam>
        /// <param name="data">Resource usage data to evaluate. Must not be null.</param>
        /// <param name="thresholdError">Threshold handed to <c>data.IsUnhealthy</c> to detect an Error state.</param>
        /// <param name="thresholdWarning">Threshold handed to <c>data.IsUnhealthy</c> to detect a Warning state; only checked when no Error was detected.</param>
        /// <param name="healthReportTtl">Time to live applied to Warning/Error health reports.</param>
        /// <param name="healthReportType">Whether the health report targets the application or the node.</param>
        /// <param name="replicaOrInstance">Replica or instance the data belongs to. When null for an Application report, the report targets fabric:/System and <c>data.Id</c> is used as the process name.</param>
        /// <param name="dumpOnError">When true and <paramref name="replicaOrInstance"/> is supplied, a dump of the host process is attempted on Error, up to <c>maxDumps</c> successful dumps per process name.</param>
        /// <exception cref="ArgumentNullException"><paramref name="data"/> is null.</exception>
        internal void ProcessResourceDataReportHealth <T>(
            FabricResourceUsageData <T> data,
            T thresholdError,
            T thresholdWarning,
            TimeSpan healthReportTtl,
            HealthReportType healthReportType = HealthReportType.Node,
            ReplicaOrInstanceMonitoringInfo replicaOrInstance = null,
            bool dumpOnError = false)
            where T : struct
        {
            if (data == null)
            {
                // ArgumentNullException derives from ArgumentException, so existing handlers still apply.
                throw new ArgumentNullException(nameof(data), "Supply all required parameters with non-null value.");
            }

            var           thresholdName   = "Minimum";
            bool          warningOrError  = false;
            string        name = null, id = null, procName = null;
            string        appNameText     = string.Empty;
            string        serviceNameText = string.Empty;
            T             threshold       = thresholdWarning;
            var           healthState     = HealthState.Ok;
            Uri           appName         = null;
            Uri           serviceName     = null;
            TelemetryData telemetryData   = null;

            // Values shared by every telemetry, ETW and health payload built below.
            string okStateName  = Enum.GetName(typeof(HealthState), HealthState.Ok);
            double roundedValue = Math.Round(Convert.ToDouble(data.AverageDataValue), 1);

            // For DiskObserver the data id is used as a prefix of the reported metric name.
            string drive = this.ObserverName == ObserverConstants.DiskObserverName ? $"{data.Id}: " : string.Empty;

            if (healthReportType == HealthReportType.Application)
            {
                if (replicaOrInstance != null)
                {
                    appName     = replicaOrInstance.ApplicationName;
                    serviceName = replicaOrInstance.ServiceName;
                    name        = appName.OriginalString.Replace("fabric:/", string.Empty);
                }
                else
                {
                    appName = new Uri("fabric:/System");
                    name    = data.Id;
                }

                appNameText     = appName?.OriginalString ?? string.Empty;
                serviceNameText = serviceName?.OriginalString ?? string.Empty;

                // Create a unique id which will be used for health Warnings and OKs (clears).
                id = name + "_" + data.Property.Replace(" ", string.Empty);

                // The health event description will be a serialized instance of telemetryData,
                // so it should be completely constructed (filled with data) regardless
                // of user telemetry settings.
                telemetryData = new TelemetryData(this.FabricClientInstance, this.Token)
                {
                    ApplicationName = appNameText,
                    Code            = FoErrorWarningCodes.Ok,
                    HealthState     = okStateName,
                    NodeName        = this.NodeName,
                    ObserverName    = this.ObserverName,
                    Metric          = data.Property,
                    Value           = roundedValue,
                    PartitionId     = replicaOrInstance?.PartitionId.ToString(),
                    ReplicaId       = replicaOrInstance?.ReplicaOrInstanceId.ToString(),
                    ServiceName     = serviceNameText,
                    Source          = ObserverConstants.FabricObserverName,
                };

                try
                {
                    if (replicaOrInstance != null)
                    {
                        // Process is IDisposable: release it as soon as the name has been read.
                        using (Process hostProcess = Process.GetProcessById((int)replicaOrInstance.HostProcessId))
                        {
                            procName = hostProcess.ProcessName;
                        }
                    }
                    else
                    {
                        // The name of the target service process is always the id for data containers coming from FSO.
                        procName = data.Id;
                    }

                    // The raw metric event carries the process name in the ServiceName field.
                    telemetryData.ServiceName = procName;

                    if (this.IsTelemetryProviderEnabled && this.IsObserverTelemetryEnabled)
                    {
                        _ = this.TelemetryClient?.ReportMetricAsync(
                            telemetryData,
                            this.Token);
                    }

                    if (this.IsEtwEnabled)
                    {
                        Logger.EtwLogger?.Write(
                            ObserverConstants.FabricObserverETWEventName,
                            new
                        {
                            ApplicationName = appNameText,
                            Code            = FoErrorWarningCodes.Ok,
                            HealthState     = okStateName,
                            NodeName        = this.NodeName,
                            ObserverName    = this.ObserverName,
                            Metric          = data.Property,
                            Value           = roundedValue,
                            PartitionId     = replicaOrInstance?.PartitionId.ToString(),
                            ReplicaId       = replicaOrInstance?.ReplicaOrInstanceId.ToString(),
                            ServiceName     = procName,
                            Source          = ObserverConstants.FabricObserverName,
                        });
                    }
                }
                catch (ArgumentException)
                {
                    // Raised by Process.GetProcessById when the process is not running: nothing to report.
                    return;
                }
                catch (InvalidOperationException)
                {
                    // Raised when the process has exited before its name could be read: nothing to report.
                    return;
                }
            }
            else
            {
                // The health event description will be a serialized instance of telemetryData,
                // so it should be completely constructed (filled with data) regardless
                // of user telemetry settings.
                telemetryData = new TelemetryData(this.FabricClientInstance, this.Token)
                {
                    Code         = FoErrorWarningCodes.Ok,
                    HealthState  = okStateName,
                    NodeName     = this.NodeName,
                    ObserverName = this.ObserverName,
                    Metric       = $"{drive}{data.Property}",
                    Source       = ObserverConstants.FabricObserverName,
                    Value        = roundedValue,
                };

                if (this.IsTelemetryProviderEnabled && this.IsObserverTelemetryEnabled)
                {
                    _ = this.TelemetryClient?.ReportMetricAsync(
                        telemetryData,
                        this.Token);
                }

                if (this.IsEtwEnabled)
                {
                    Logger.EtwLogger?.Write(
                        ObserverConstants.FabricObserverETWEventName,
                        new
                    {
                        Code         = FoErrorWarningCodes.Ok,
                        HealthState  = okStateName,
                        NodeName     = this.NodeName,
                        ObserverName = this.ObserverName,
                        Metric       = $"{drive}{data.Property}",
                        Source       = ObserverConstants.FabricObserverName,
                        Value        = roundedValue,
                    });
                }
            }

            // Health Error
            if (data.IsUnhealthy(thresholdError))
            {
                thresholdName  = "Maximum";
                threshold      = thresholdError;
                warningOrError = true;
                healthState    = HealthState.Error;

                // This is primarily useful for AppObserver, but makes sense to be
                // part of the base class for future use, like for FSO.
                if (replicaOrInstance != null && dumpOnError)
                {
                    try
                    {
                        int procId = (int)replicaOrInstance.HostProcessId;

                        if (!this.serviceDumpCountDictionary.ContainsKey(procName))
                        {
                            this.serviceDumpCountDictionary.Add(procName, 0);
                        }

                        if (this.serviceDumpCountDictionary[procName] < this.maxDumps)
                        {
                            // DumpServiceProcess defaults to a Full dump with
                            // process memory, handles and thread data.
                            bool success = this.DumpServiceProcess(procId);

                            if (success)
                            {
                                this.serviceDumpCountDictionary[procName]++;
                            }
                        }
                    }

                    // Ignore these, it just means no dmp will be created. This is not
                    // critical to FO. Log as info, not warning.
                    catch (Exception e) when(e is ArgumentException || e is InvalidOperationException)
                    {
                        this.ObserverLogger.LogInfo($"Unable to generate dmp file:{Environment.NewLine}{e}");
                    }
                }
            }

            // Health Warning
            if (!warningOrError && data.IsUnhealthy(thresholdWarning))
            {
                warningOrError = true;
                healthState    = HealthState.Warning;
            }

            if (warningOrError)
            {
                string errorWarningCode = GetErrorWarningCodeForProperty(
                    data.Property,
                    healthState == HealthState.Error,
                    healthReportType == HealthReportType.Application,
                    replicaOrInstance != null);

                string healthStateName = Enum.GetName(typeof(HealthState), healthState);

                // Note: the value in the message is rounded to a whole number, unlike the telemetry value.
                string healthMessage =
                    $"{drive}{data.Property} is at or above the specified {thresholdName} limit ({threshold}{data.Units})" +
                    $" - {data.Property}: {Math.Round(Convert.ToDouble(data.AverageDataValue))}{data.Units}{Environment.NewLine}";

                // The health event description will be a serialized instance of telemetryData,
                // so it should be completely constructed (filled with data) regardless
                // of user telemetry settings.
                telemetryData.ApplicationName        = appNameText;
                telemetryData.Code                   = errorWarningCode;
                telemetryData.HealthState            = healthStateName;
                telemetryData.HealthEventDescription = healthMessage;
                telemetryData.Metric                 = $"{drive}{data.Property}";
                telemetryData.ServiceName            = serviceNameText;
                telemetryData.Source                 = ObserverConstants.FabricObserverName;
                telemetryData.Value                  = roundedValue;

                // Send Health Report as Telemetry event (perhaps it signals an Alert from App Insights, for example.).
                if (this.IsTelemetryProviderEnabled && this.IsObserverTelemetryEnabled)
                {
                    _ = this.TelemetryClient?.ReportMetricAsync(
                        telemetryData,
                        this.Token);
                }

                // ETW.
                if (this.IsEtwEnabled)
                {
                    Logger.EtwLogger?.Write(
                        ObserverConstants.FabricObserverETWEventName,
                        new
                    {
                        ApplicationName        = appNameText,
                        Code                   = errorWarningCode,
                        HealthState            = healthStateName,
                        HealthEventDescription = healthMessage,
                        Metric                 = $"{drive}{data.Property}",
                        Node                   = this.NodeName,
                        ServiceName            = serviceNameText,
                        Source                 = ObserverConstants.FabricObserverName,
                        Value                  = roundedValue,
                    });
                }

                var healthReport = new HealthReport
                {
                    AppName                   = appName,
                    Code                      = errorWarningCode,
                    EmitLogEvent              = true,
                    HealthData                = telemetryData,
                    HealthMessage             = healthMessage,
                    HealthReportTimeToLive    = healthReportTtl,
                    ReportType                = healthReportType,
                    State                     = healthState,
                    NodeName                  = this.NodeName,
                    Observer                  = this.ObserverName,
                    ResourceUsageDataProperty = data.Property,
                };

                // From FSO.
                if (replicaOrInstance == null && healthReportType == HealthReportType.Application)
                {
                    healthReport.Property = id;
                }

                // Emit a Fabric Health Report and optionally a local log write.
                this.HealthReporter.ReportHealthToServiceFabric(healthReport);

                // Set internal health state info on data instance.
                data.ActiveErrorOrWarning     = true;
                data.ActiveErrorOrWarningCode = errorWarningCode;

                // This means this observer created a Warning or Error SF Health Report
                this.HasActiveFabricErrorOrWarning = true;
            }
            else if (data.ActiveErrorOrWarning)
            {
                string clearedMessage = $"{data.Property} is now within normal/expected range.";

                // The health event description will be a serialized instance of telemetryData,
                // so it should be completely constructed (filled with data) regardless
                // of user telemetry settings.
                telemetryData.ApplicationName        = appNameText;
                telemetryData.Code                   = data.ActiveErrorOrWarningCode;
                telemetryData.HealthState            = okStateName;
                telemetryData.HealthEventDescription = clearedMessage;
                telemetryData.Metric                 = data.Property;
                telemetryData.Source                 = ObserverConstants.FabricObserverName;
                telemetryData.Value                  = roundedValue;

                // Telemetry
                if (this.IsTelemetryProviderEnabled && this.IsObserverTelemetryEnabled)
                {
                    _ = this.TelemetryClient?.ReportMetricAsync(
                        telemetryData,
                        this.Token);
                }

                // ETW.
                if (this.IsEtwEnabled)
                {
                    Logger.EtwLogger?.Write(
                        ObserverConstants.FabricObserverETWEventName,
                        new
                    {
                        ApplicationName        = appNameText,
                        Code                   = data.ActiveErrorOrWarningCode,
                        HealthState            = okStateName,
                        HealthEventDescription = clearedMessage,
                        Metric                 = data.Property,
                        Node                   = this.NodeName,
                        ServiceName            = name ?? string.Empty,
                        Source                 = ObserverConstants.FabricObserverName,
                        Value                  = roundedValue,
                    });
                }

                var healthReport = new HealthReport
                {
                    AppName                   = appName,
                    Code                      = data.ActiveErrorOrWarningCode,
                    EmitLogEvent              = true,
                    HealthData                = telemetryData,
                    HealthMessage             = clearedMessage,
                    HealthReportTimeToLive    = default(TimeSpan),
                    ReportType                = healthReportType,
                    State                     = HealthState.Ok,
                    NodeName                  = this.NodeName,
                    Observer                  = this.ObserverName,
                    ResourceUsageDataProperty = data.Property,
                };

                // From FSO.
                if (replicaOrInstance == null && healthReportType == HealthReportType.Application)
                {
                    healthReport.Property = id;
                }

                // Emit an Ok Health Report to clear Fabric Health warning.
                this.HealthReporter.ReportHealthToServiceFabric(healthReport);

                // Reset health states.
                data.ActiveErrorOrWarning          = false;
                data.ActiveErrorOrWarningCode      = FoErrorWarningCodes.Ok;
                this.HasActiveFabricErrorOrWarning = false;
            }

            // No need to keep data in memory.
            if (data.Data is List <T> list)
            {
                // List<T> impl.
                list.Clear();
                list.TrimExcess();
            }
            else
            {
                // CircularBufferCollection<T> impl.
                data.Data.Clear();
            }
        }

        /// <summary>
        /// Selects the FO error/warning code that corresponds to a monitored resource property.
        /// </summary>
        /// <param name="property">The resource property name of the data being reported.</param>
        /// <param name="isError">True to select the Error code, false to select the Warning code.</param>
        /// <param name="isApplicationReport">True when the health report type is Application; selects App codes over Node codes where both exist.</param>
        /// <param name="hasReplicaOrInstance">True when replica/instance info was supplied; this alone decides App versus Node for the memory percentage property.</param>
        /// <returns>The matching code, or null when the property has no associated code.</returns>
        private static string GetErrorWarningCodeForProperty(
            string property,
            bool isError,
            bool isApplicationReport,
            bool hasReplicaOrInstance)
        {
            switch (property)
            {
                case ErrorWarningProperty.TotalCpuTime when isApplicationReport:
                    return isError ? FoErrorWarningCodes.AppErrorCpuTime : FoErrorWarningCodes.AppWarningCpuTime;

                case ErrorWarningProperty.TotalCpuTime:
                    return isError ? FoErrorWarningCodes.NodeErrorCpuTime : FoErrorWarningCodes.NodeWarningCpuTime;

                case ErrorWarningProperty.DiskSpaceUsagePercentage:
                    return isError ? FoErrorWarningCodes.NodeErrorDiskSpacePercentUsed : FoErrorWarningCodes.NodeWarningDiskSpacePercentUsed;

                case ErrorWarningProperty.DiskSpaceUsageMb:
                    return isError ? FoErrorWarningCodes.NodeErrorDiskSpaceMb : FoErrorWarningCodes.NodeWarningDiskSpaceMb;

                case ErrorWarningProperty.TotalMemoryConsumptionMb when isApplicationReport:
                    return isError ? FoErrorWarningCodes.AppErrorMemoryCommittedMb : FoErrorWarningCodes.AppWarningMemoryCommittedMb;

                case ErrorWarningProperty.TotalMemoryConsumptionMb:
                    return isError ? FoErrorWarningCodes.NodeErrorMemoryCommittedMb : FoErrorWarningCodes.NodeWarningMemoryCommittedMb;

                case ErrorWarningProperty.TotalMemoryConsumptionPct when hasReplicaOrInstance:
                    return isError ? FoErrorWarningCodes.AppErrorMemoryPercentUsed : FoErrorWarningCodes.AppWarningMemoryPercentUsed;

                case ErrorWarningProperty.TotalMemoryConsumptionPct:
                    return isError ? FoErrorWarningCodes.NodeErrorMemoryPercentUsed : FoErrorWarningCodes.NodeWarningMemoryPercentUsed;

                case ErrorWarningProperty.DiskAverageQueueLength:
                    return isError ? FoErrorWarningCodes.NodeErrorDiskAverageQueueLength : FoErrorWarningCodes.NodeWarningDiskAverageQueueLength;

                case ErrorWarningProperty.TotalActiveFirewallRules:
                    return isError ? FoErrorWarningCodes.ErrorTooManyFirewallRules : FoErrorWarningCodes.WarningTooManyFirewallRules;

                case ErrorWarningProperty.TotalActivePorts when isApplicationReport:
                    return isError ? FoErrorWarningCodes.AppErrorTooManyActiveTcpPorts : FoErrorWarningCodes.AppWarningTooManyActiveTcpPorts;

                case ErrorWarningProperty.TotalActivePorts:
                    return isError ? FoErrorWarningCodes.NodeErrorTooManyActiveTcpPorts : FoErrorWarningCodes.NodeWarningTooManyActiveTcpPorts;

                case ErrorWarningProperty.TotalEphemeralPorts when isApplicationReport:
                    return isError ? FoErrorWarningCodes.AppErrorTooManyActiveEphemeralPorts : FoErrorWarningCodes.AppWarningTooManyActiveEphemeralPorts;

                case ErrorWarningProperty.TotalEphemeralPorts:
                    return isError ? FoErrorWarningCodes.NodeErrorTooManyActiveEphemeralPorts : FoErrorWarningCodes.NodeWarningTooManyActiveEphemeralPorts;

                default:
                    return null;
            }
        }
        /// <summary>
        /// Samples CPU, memory and TCP port usage for the host process of every entry in
        /// <c>ReplicaOrInstanceList</c> that matches a configured target in <c>deployedTargetList</c>,
        /// and stores the samples in the per-process resource data containers.
        /// </summary>
        /// <param name="token">Cancellation token; checked before each process and on every sampling pass.</param>
        /// <returns>A task that completes once every matching process has been sampled.</returns>
        /// <remarks>
        /// <see cref="Win32Exception"/>, <see cref="ArgumentException"/> and <see cref="InvalidOperationException"/>
        /// raised while inspecting a process are logged and that process is skipped. Any other exception
        /// (including cancellation) is rethrown.
        /// </remarks>
        private async Task MonitorDeployedAppsAsync(CancellationToken token)
        {
            Process currentProcess = null;

            foreach (var repOrInst in ReplicaOrInstanceList)
            {
                token.ThrowIfCancellationRequested();

                var  timer = new Stopwatch();
                int  processId = (int)repOrInst.HostProcessId;
                var  cpuUsage = new CpuUsage();
                bool checkCpu = false, checkMemMb = false, checkMemPct = false, checkAllPorts = false, checkEphemeralPorts = false;

                // Match the configured target by application name or application type.
                // Ordinal, case-insensitive comparison avoids culture-dependent casing rules
                // (string.Equals treats two nulls as equal, same as the previous ToLower() == ToLower() form).
                var application = this.deployedTargetList?.FirstOrDefault(
                    app => string.Equals(app?.TargetApp, repOrInst.ApplicationName?.OriginalString, StringComparison.OrdinalIgnoreCase) ||
                           string.Equals(app?.TargetAppType, repOrInst.ApplicationTypeName, StringComparison.OrdinalIgnoreCase));

                if (application?.TargetApp == null && application?.TargetAppType == null)
                {
                    continue;
                }

                try
                {
                    // App level.
                    currentProcess = Process.GetProcessById(processId);

                    token.ThrowIfCancellationRequested();

                    var    procName      = currentProcess.ProcessName;
                    string appNameOrType = GetAppNameOrType(repOrInst);

                    // Key used to locate this process's entry in each data container.
                    var id = $"{appNameOrType}:{procName}";

                    // Add new resource data structures for each app service process where the metric is specified in configuration for related observation.
                    if (this.AllAppCpuData.All(list => list.Id != id) && (application.CpuErrorLimitPercent > 0 || application.CpuWarningLimitPercent > 0))
                    {
                        this.AllAppCpuData.Add(new FabricResourceUsageData <double>(ErrorWarningProperty.TotalCpuTime, id, DataCapacity, UseCircularBuffer));
                    }

                    if (this.AllAppCpuData.Any(list => list.Id == id))
                    {
                        checkCpu = true;
                    }

                    if (this.AllAppMemDataMb.All(list => list.Id != id) && (application.MemoryErrorLimitMb > 0 || application.MemoryWarningLimitMb > 0))
                    {
                        this.AllAppMemDataMb.Add(new FabricResourceUsageData <float>(ErrorWarningProperty.TotalMemoryConsumptionMb, id, DataCapacity, UseCircularBuffer));
                    }

                    if (this.AllAppMemDataMb.Any(list => list.Id == id))
                    {
                        checkMemMb = true;
                    }

                    if (this.AllAppMemDataPercent.All(list => list.Id != id) && (application.MemoryErrorLimitPercent > 0 || application.MemoryWarningLimitPercent > 0))
                    {
                        this.AllAppMemDataPercent.Add(new FabricResourceUsageData <double>(ErrorWarningProperty.TotalMemoryConsumptionPct, id, DataCapacity, UseCircularBuffer));
                    }

                    if (this.AllAppMemDataPercent.Any(list => list.Id == id))
                    {
                        checkMemPct = true;
                    }

                    if (this.AllAppTotalActivePortsData.All(list => list.Id != id) && (application.NetworkErrorActivePorts > 0 || application.NetworkWarningActivePorts > 0))
                    {
                        this.AllAppTotalActivePortsData.Add(new FabricResourceUsageData <int>(ErrorWarningProperty.TotalActivePorts, id, 1));
                    }

                    if (this.AllAppTotalActivePortsData.Any(list => list.Id == id))
                    {
                        checkAllPorts = true;
                    }

                    if (this.AllAppEphemeralPortsData.All(list => list.Id != id) && (application.NetworkErrorEphemeralPorts > 0 || application.NetworkWarningEphemeralPorts > 0))
                    {
                        this.AllAppEphemeralPortsData.Add(new FabricResourceUsageData <int>(ErrorWarningProperty.TotalEphemeralPorts, id, 1));
                    }

                    if (this.AllAppEphemeralPortsData.Any(list => list.Id == id))
                    {
                        checkEphemeralPorts = true;
                    }

                    // Measure Total and Ephemeral ports (single sample per run).
                    if (checkAllPorts)
                    {
                        this.AllAppTotalActivePortsData.FirstOrDefault(x => x.Id == id).Data.Add(OperatingSystemInfoProvider.Instance.GetActivePortCount(currentProcess.Id, FabricServiceContext));
                    }

                    if (checkEphemeralPorts)
                    {
                        this.AllAppEphemeralPortsData.FirstOrDefault(x => x.Id == id).Data.Add(OperatingSystemInfoProvider.Instance.GetActiveEphemeralPortCount(currentProcess.Id, FabricServiceContext));
                    }

                    // No need to proceed further if no cpu and mem thresholds are specified in configuration.
                    if (!checkCpu && !checkMemMb && !checkMemPct)
                    {
                        continue;
                    }

                    /* CPU and Memory Usage */

                    // Default sampling window; replaced by MonitorDuration when that is greater than TimeSpan.MinValue.
                    TimeSpan duration = TimeSpan.FromSeconds(15);

                    if (MonitorDuration > TimeSpan.MinValue)
                    {
                        duration = MonitorDuration;
                    }

                    // Warm up the counters.
                    if (checkCpu)
                    {
                        _ = cpuUsage.GetCpuUsagePercentageProcess(currentProcess);
                    }

                    if (checkMemMb || checkMemPct)
                    {
                        _ = ProcessInfoProvider.Instance.GetProcessPrivateWorkingSetInMB(currentProcess.Id);
                    }

                    timer.Start();

                    // Compare whole TimeSpan values. TimeSpan.Seconds is only the 0-59 seconds component,
                    // so comparing it mis-sized the window for any duration of a minute or more.
                    // NOTE(review): a MonitorDuration of TimeSpan.Zero now yields no samples instead of ~1s worth — confirm that is acceptable.
                    while (!currentProcess.HasExited && timer.Elapsed <= duration)
                    {
                        token.ThrowIfCancellationRequested();

                        if (checkCpu)
                        {
                            // CPU (all cores). Negative readings are discarded; readings above 100 are clamped.
                            double cpu = cpuUsage.GetCpuUsagePercentageProcess(currentProcess);

                            if (cpu >= 0)
                            {
                                if (cpu > 100)
                                {
                                    cpu = 100;
                                }

                                this.AllAppCpuData.FirstOrDefault(x => x.Id == id).Data.Add(cpu);
                            }
                        }

                        float processMem = 0;

                        if (checkMemMb || checkMemPct)
                        {
                            processMem = ProcessInfoProvider.Instance.GetProcessPrivateWorkingSetInMB(currentProcess.Id);
                        }

                        if (checkMemMb)
                        {
                            // Memory (private working set (process)).
                            this.AllAppMemDataMb.FirstOrDefault(x => x.Id == id).Data.Add(processMem);
                        }

                        if (checkMemPct)
                        {
                            // Memory (percent in use (total)). Only the total is needed here.
                            var (totalMemory, _) = OperatingSystemInfoProvider.Instance.TupleGetTotalPhysicalMemorySizeAndPercentInUse();
                            long totalMem = totalMemory;

                            if (totalMem > 0)
                            {
                                // presumably totalMem is in GB (x1024 to get MB) — TODO confirm against the provider.
                                double usedPct = Math.Round(((double)(processMem * 100)) / (totalMem * 1024), 2);
                                this.AllAppMemDataPercent.FirstOrDefault(x => x.Id == id).Data.Add(Math.Round(usedPct, 1));
                            }
                        }

                        // Honor the caller-supplied token, consistent with the rest of this method.
                        await Task.Delay(250, token);
                    }

                    timer.Stop();
                }
                catch (Exception e)
                {
#if DEBUG
                    // DEBUG INFO
                    var healthReport = new Utilities.HealthReport
                    {
                        AppName       = repOrInst.ApplicationName,
                        HealthMessage = $"Error:{Environment.NewLine}{e}{Environment.NewLine}",
                        State         = HealthState.Ok,
                        Code          = FOErrorWarningCodes.Ok,
                        NodeName      = NodeName,
                        Observer      = ObserverName,
                        Property      = $"{e.Source}",
                        ReportType    = HealthReportType.Application,
                    };

                    HealthReporter.ReportHealthToServiceFabric(healthReport);
#endif
                    // These exception types are logged and the process is skipped; everything else is rethrown below.
                    if (e is Win32Exception || e is ArgumentException || e is InvalidOperationException)
                    {
                        WriteToLogWithLevel(
                            ObserverName,
                            $"MonitorAsync failed to find current service process for {repOrInst.ApplicationName?.OriginalString ?? repOrInst.ApplicationTypeName}{Environment.NewLine}{e}",
                            LogLevel.Information);
                    }
                    else
                    {
                        // Cancellation is expected during shutdown; do not log it as a warning.
                        if (!(e is OperationCanceledException || e is TaskCanceledException))
                        {
                            WriteToLogWithLevel(
                                ObserverName,
                                $"Unhandled exception in MonitorAsync:{Environment.NewLine}{e}",
                                LogLevel.Warning);
                        }

                        throw;
                    }
                }
                finally
                {
                    // Runs on continue, normal completion and exceptions alike, so the Process handle is always released.
                    currentProcess?.Dispose();
                    currentProcess = null;
                }
            }
        }
Example #19
0
        /// <summary>
        /// Emits an informational health report with port and memory usage, evaluates the collected
        /// CPU, memory and port data against their thresholds, and (on Windows, when the observer web app
        /// is deployed and event log monitoring is enabled) writes collected event log errors to an HTML
        /// fragment file for the log viewer.
        /// </summary>
        /// <param name="token">Cancellation token; checked while the event log HTML is being built.</param>
        /// <returns>A completed task.</returns>
        public override Task ReportAsync(CancellationToken token)
        {
            Token.ThrowIfCancellationRequested();

            // Informational report. For now, Linux is where we pay close attention to memory use by Fabric system services as there are still a few issues in that realm.
            var timeToLiveWarning     = SetHealthReportTimeToLive();
            var portInformationReport = new HealthReport
            {
                Observer      = ObserverName,
                NodeName      = NodeName,
                HealthMessage = $"Number of ports in use by Fabric services: {TotalActivePortCountAllSystemServices}{Environment.NewLine}" +
                                $"Number of ephemeral ports in use by Fabric services: {TotalActiveEphemeralPortCountAllSystemServices}{Environment.NewLine}" +
                                $"Fabric memory use MB: {this.allMemData.FirstOrDefault(x => x.Id == "Fabric")?.AverageDataValue}{Environment.NewLine}" +
                                (RuntimeInformation.IsOSPlatform(OSPlatform.Linux) ?
                                 $"FabricGateway memory use MB: {this.allMemData.FirstOrDefault(x => x.Id == "FabricGateway.exe")?.AverageDataValue}{Environment.NewLine}" +
                                 $"FabricHost memory use MB: {this.allMemData.FirstOrDefault(x => x.Id == "FabricHost")?.AverageDataValue}{Environment.NewLine}" : string.Empty),

                State = HealthState.Ok,
                HealthReportTimeToLive = timeToLiveWarning,
            };

            HealthReporter.ReportHealthToServiceFabric(portInformationReport);

            // Reset ports counters.
            TotalActivePortCountAllSystemServices          = 0;
            TotalActiveEphemeralPortCountAllSystemServices = 0;

            // CPU
            ProcessResourceDataList(
                this.allCpuData,
                CpuErrorUsageThresholdPct,
                CpuWarnUsageThresholdPct);

            // Memory
            ProcessResourceDataList(
                this.allMemData,
                MemErrorUsageThresholdMb,
                MemWarnUsageThresholdMb);

            // Ports - Active TCP
            ProcessResourceDataList(
                this.allActiveTcpPortData,
                ActiveTcpPortCountError,
                ActiveTcpPortCountWarning);

            // Ports - Ephemeral
            ProcessResourceDataList(
                this.allEphemeralTcpPortData,
                ActiveEphemeralPortCountError,
                ActiveEphemeralPortCountWarning);

            // Windows Event Log
            if (RuntimeInformation.IsOSPlatform(OSPlatform.Windows) && ObserverManager.ObserverWebAppDeployed &&
                this.monitorWinEventLog)
            {
                // SF Eventlog Errors?
                // Write this out to a new file, for use by the web front end log viewer.
                // Format = HTML.
                int count   = this.evtRecordList.Count();
                var logPath = Path.Combine(ObserverLogger.LogFolderBasePath, "EventVwrErrors.txt");

                // Remove existing file.
                if (File.Exists(logPath))
                {
                    try
                    {
                        File.Delete(logPath);
                    }
                    catch (IOException)
                    {
                        // Best effort: the file may be in use; it is rewritten below when there is content.
                    }
                    catch (UnauthorizedAccessException)
                    {
                        // Best effort: a stale file is preferable to failing the report.
                    }
                }

                // The HTML summary is only produced once at least 10 events were collected.
                if (count >= 10)
                {
                    var sb = new StringBuilder();

                    _ = sb.AppendLine("<br/><div><strong>" +
                                      "<a href='javascript:toggle(\"evtContainer\")'>" +
                                      "<div id=\"plus\" style=\"display: inline; font-size: 25px;\">+</div> " + count +
                                      " Error Events in ServiceFabric and System</a> " +
                                      "Event logs</strong>.<br/></div>");

                    _ = sb.AppendLine("<div id='evtContainer' style=\"display: none;\">");

                    foreach (var evt in this.evtRecordList.Distinct())
                    {
                        token.ThrowIfCancellationRequested();

                        try
                        {
                            // Access event properties. Event text can be written by arbitrary processes and ends up in an
                            // HTML page, so it is HTML-encoded to keep markup characters from breaking or injecting into the viewer.
                            _ = sb.AppendLine("<div>" + System.Net.WebUtility.HtmlEncode(evt.LogName) + "</div>");
                            _ = sb.AppendLine("<div>" + System.Net.WebUtility.HtmlEncode(evt.LevelDisplayName) + "</div>");
                            if (evt.TimeCreated.HasValue)
                            {
                                _ = sb.AppendLine("<div>" + evt.TimeCreated.Value.ToShortDateString() + "</div>");
                            }

                            foreach (var prop in evt.Properties)
                            {
                                // Convert once; Convert.ToString yields an empty string for a null value.
                                string propText = Convert.ToString(prop.Value);

                                if (!string.IsNullOrEmpty(propText))
                                {
                                    _ = sb.AppendLine("<div>" + System.Net.WebUtility.HtmlEncode(propText) + "</div>");
                                }
                            }
                        }
                        catch (EventLogException)
                        {
                            // Skip the remainder of an event whose data cannot be read; keep processing the others.
                        }
                    }

                    _ = sb.AppendLine("</div>");

                    _ = ObserverLogger.TryWriteLogFile(logPath, sb.ToString());
                    _ = sb.Clear();
                }

                // Clean up.
                if (count > 0)
                {
                    this.evtRecordList.Clear();
                }
            }

            ClearDataContainers();

            return Task.CompletedTask;
        }
        /// <inheritdoc/>
        /// <remarks>
        /// Emits an informational port usage health report, evaluates the collected CPU and memory data
        /// against their thresholds, and (when the observer web app is deployed and event log monitoring is
        /// enabled) writes collected event log errors to an HTML fragment file for the log viewer.
        /// </remarks>
        public override Task ReportAsync(CancellationToken token)
        {
            this.Token.ThrowIfCancellationRequested();
            var timeToLiveWarning     = this.SetHealthReportTimeToLive();
            var portInformationReport = new HealthReport
            {
                Observer      = this.ObserverName,
                NodeName      = this.NodeName,
                HealthMessage = $"Number of ports in use by Fabric services: {this.TotalActivePortCount}\n" +
                                $"Number of ephemeral ports in use by Fabric services: {this.TotalActiveEphemeralPortCount}",
                State = HealthState.Ok,
                HealthReportTimeToLive = timeToLiveWarning,
            };

            // TODO: Report on port count based on thresholds PortCountWarning/Error.
            this.HealthReporter.ReportHealthToServiceFabric(portInformationReport);

            // Reset ports counters.
            this.TotalActivePortCount          = 0;
            this.TotalActiveEphemeralPortCount = 0;

            // CPU
            this.ProcessResourceDataList(
                this.allCpuData,
                this.CpuErrorUsageThresholdPct,
                this.CpuWarnUsageThresholdPct);

            // Memory
            this.ProcessResourceDataList(
                this.allMemData,
                this.MemErrorUsageThresholdMb,
                this.MemWarnUsageThresholdMb);

            // Windows Event Log
            if (ObserverManager.ObserverWebAppDeployed &&
                this.monitorWinEventLog)
            {
                // SF Eventlog Errors?
                // Write this out to a new file, for use by the web front end log viewer.
                // Format = HTML.
                int count   = this.evtRecordList.Count();
                var logPath = Path.Combine(this.ObserverLogger.LogFolderBasePath, "EventVwrErrors.txt");

                // Remove existing file.
                if (File.Exists(logPath))
                {
                    try
                    {
                        File.Delete(logPath);
                    }
                    catch (IOException)
                    {
                        // Best effort: the file may be in use; it is rewritten below when there is content.
                    }
                    catch (UnauthorizedAccessException)
                    {
                        // Best effort: a stale file is preferable to failing the report.
                    }
                }

                // The HTML summary is only produced once at least 10 events were collected.
                if (count >= 10)
                {
                    var sb = new StringBuilder();

                    _ = sb.AppendLine("<br/><div><strong>" +
                                      "<a href='javascript:toggle(\"evtContainer\")'>" +
                                      "<div id=\"plus\" style=\"display: inline; font-size: 25px;\">+</div> " + count +
                                      " Error Events in ServiceFabric and System</a> " +
                                      "Event logs</strong>.<br/></div>");

                    _ = sb.AppendLine("<div id='evtContainer' style=\"display: none;\">");

                    foreach (var evt in this.evtRecordList.Distinct())
                    {
                        token.ThrowIfCancellationRequested();

                        try
                        {
                            // Access event properties. Event text can be written by arbitrary processes and ends up in an
                            // HTML page, so it is HTML-encoded to keep markup characters from breaking or injecting into the viewer.
                            _ = sb.AppendLine("<div>" + System.Net.WebUtility.HtmlEncode(evt.LogName) + "</div>");
                            _ = sb.AppendLine("<div>" + System.Net.WebUtility.HtmlEncode(evt.LevelDisplayName) + "</div>");
                            if (evt.TimeCreated.HasValue)
                            {
                                _ = sb.AppendLine("<div>" + evt.TimeCreated.Value.ToShortDateString() + "</div>");
                            }

                            foreach (var prop in evt.Properties)
                            {
                                // Convert once; Convert.ToString yields an empty string for a null value.
                                string propText = Convert.ToString(prop.Value);

                                if (!string.IsNullOrEmpty(propText))
                                {
                                    _ = sb.AppendLine("<div>" + System.Net.WebUtility.HtmlEncode(propText) + "</div>");
                                }
                            }
                        }
                        catch (EventLogException)
                        {
                            // Skip the remainder of an event whose data cannot be read; keep processing the others.
                        }
                    }

                    _ = sb.AppendLine("</div>");

                    _ = this.ObserverLogger.TryWriteLogFile(logPath, sb.ToString());
                    _ = sb.Clear();
                }

                // Clean up.
                if (count > 0)
                {
                    this.evtRecordList.Clear();
                }
            }

            return Task.CompletedTask;
        }