Esempio n. 1
0
        public override Task ReportAsync(CancellationToken token)
        {
            // Local log.
            ObserverLogger.LogInfo(message.ToString());

            /* Report to Fabric */

            // These values will be preserved across observer runs and are useful for clearing warnings
            // by reporting Ok health state health events with the same property and sourceid values
            // as the error/warning health events when FO is safely taken down (e.g., app is being uninstalled,
            // safe restart of fabric node it's running on, etc.).
            HealthReportProperties.Add("SomePropertyName");
            HealthReportSourceIds.Add($"{ObserverName}_SomethingUniqueToThisReport");

            var healthReporter = new ObserverHealthReporter(ObserverLogger);
            var healthReport   = new Utilities.HealthReport
            {
                Code          = FOErrorWarningCodes.Ok,
                HealthMessage = this.message.ToString(),
                NodeName      = NodeName,
                Observer      = ObserverName,
                Property      = HealthReportProperties[^ 1],
        /// <summary>
        /// This function processes numeric data held in FRUD instances and generates Application or Node level Health Reports depending on supplied thresholds.
        /// </summary>
        /// <typeparam name="T">This represents the numeric type of data this function will operate on.</typeparam>
        /// <param name="data">FabricResourceUsageData instance.</param>
        /// <param name="thresholdError">Error threshold (numeric)</param>
        /// <param name="thresholdWarning">Warning threshold (numeric)</param>
        /// <param name="healthReportTtl">Health report Time to Live (TimeSpan)</param>
        /// <param name="healthReportType">HealthReport type. Note, only Application and Node health report types are supported.</param>
        /// <param name="replicaOrInstance">Replica or Instance information contained in a type.</param>
        /// <param name="dumpOnError">Wheter or not to dump process if Error threshold has been reached.</param>
        public void ProcessResourceDataReportHealth <T>(
            FabricResourceUsageData <T> data,
            T thresholdError,
            T thresholdWarning,
            TimeSpan healthReportTtl,
            HealthReportType healthReportType = HealthReportType.Node,
            ReplicaOrInstanceMonitoringInfo replicaOrInstance = null,
            bool dumpOnError = false)
            where T : struct
        {
            if (data == null)
            {
                throw new ArgumentException("Supply all required parameters with non-null value.");
            }

            if (healthReportType != HealthReportType.Application && healthReportType != HealthReportType.Node)
            {
                this.ObserverLogger.LogWarning($"ProcessResourceDataReportHealth: Unsupported HealthReport type -> {Enum.GetName(typeof(HealthReportType), healthReportType)}");
                return;
            }

            var           thresholdName = "Minimum";
            bool          warningOrError = false;
            string        repPartitionId = null, repOrInstanceId = null, name = null, id = null, procName = null;
            T             threshold     = thresholdWarning;
            var           healthState   = HealthState.Ok;
            Uri           appName       = null;
            Uri           serviceName   = null;
            TelemetryData telemetryData = null;

            if (healthReportType == HealthReportType.Application)
            {
                if (replicaOrInstance != null)
                {
                    repPartitionId  = $"Partition: {replicaOrInstance.PartitionId}";
                    repOrInstanceId = $"Replica: {replicaOrInstance.ReplicaOrInstanceId}";

                    // Create a unique id which will be used for health Warnings and OKs (clears).
                    appName     = replicaOrInstance.ApplicationName;
                    serviceName = replicaOrInstance.ServiceName;
                    name        = appName.OriginalString.Replace("fabric:/", string.Empty);
                }
                else
                {
                    appName = new Uri("fabric:/System");
                    name    = data.Id;
                }

                id = name + "_" + data.Property.Replace(" ", string.Empty);

                // The health event description will be a serialized instance of telemetryData,
                // so it should be completely constructed (filled with data) regardless
                // of user telemetry settings.
                telemetryData = new TelemetryData(FabricClientInstance, Token)
                {
                    ApplicationName = appName?.OriginalString ?? string.Empty,
                    Code            = FOErrorWarningCodes.Ok,
                    HealthState     = Enum.GetName(typeof(HealthState), HealthState.Ok),
                    NodeName        = NodeName,
                    ObserverName    = ObserverName,
                    Metric          = data.Property,
                    Value           = Math.Round(data.AverageDataValue, 1),
                    PartitionId     = replicaOrInstance?.PartitionId.ToString(),
                    ReplicaId       = replicaOrInstance?.ReplicaOrInstanceId.ToString(),
                    ServiceName     = serviceName?.OriginalString ?? string.Empty,
                    Source          = ObserverConstants.FabricObserverName,
                };

                try
                {
                    if (replicaOrInstance != null && replicaOrInstance.HostProcessId > 0)
                    {
                        procName = Process.GetProcessById((int)replicaOrInstance.HostProcessId).ProcessName;
                    }
                    else
                    {
                        // The name of the target service process is always the id for data containers coming from FSO.
                        procName = data.Id;
                    }

                    telemetryData.ServiceName = procName;

                    if (IsTelemetryProviderEnabled && IsObserverTelemetryEnabled)
                    {
                        _ = TelemetryClient?.ReportMetricAsync(
                            telemetryData,
                            Token).ConfigureAwait(false);
                    }

                    if (IsEtwEnabled)
                    {
                        Logger.EtwLogger?.Write(
                            ObserverConstants.FabricObserverETWEventName,
                            new
                        {
                            ApplicationName = appName?.OriginalString ?? string.Empty,
                            Code            = FOErrorWarningCodes.Ok,
                            HealthState     = Enum.GetName(typeof(HealthState), HealthState.Ok),
                            NodeName,
                            ObserverName,
                            Metric      = data.Property,
                            Value       = Math.Round(data.AverageDataValue, 1),
                            PartitionId = replicaOrInstance?.PartitionId.ToString(),
                            ReplicaId   = replicaOrInstance?.ReplicaOrInstanceId.ToString(),
                            ServiceName = procName,
                            Source      = ObserverConstants.FabricObserverName,
                        });
                    }
                }
                catch (ArgumentException)
                {
                    return;
                }
                catch (InvalidOperationException)
                {
                    return;
                }
            }
            else
            {
                string drive = string.Empty;

                if (ObserverName == ObserverConstants.DiskObserverName)
                {
                    drive = $"{data.Id}: ";

                    if (RuntimeInformation.IsOSPlatform(OSPlatform.Windows))
                    {
                        drive = $"{data.Id.Remove(1, 2)}: ";
                    }
                }

                // The health event description will be a serialized instance of telemetryData,
                // so it should be completely constructed (filled with data) regardless
                // of user telemetry settings.
                telemetryData = new TelemetryData(FabricClientInstance, Token)
                {
                    Code         = FOErrorWarningCodes.Ok,
                    HealthState  = Enum.GetName(typeof(HealthState), HealthState.Ok),
                    NodeName     = NodeName,
                    ObserverName = ObserverName,
                    Metric       = $"{drive}{data.Property}",
                    Source       = ObserverConstants.FabricObserverName,
                    Value        = Math.Round(data.AverageDataValue, 1),
                };

                if (IsTelemetryProviderEnabled && IsObserverTelemetryEnabled)
                {
                    _ = TelemetryClient?.ReportMetricAsync(
                        telemetryData,
                        Token);
                }

                if (IsEtwEnabled)
                {
                    Logger.EtwLogger?.Write(
                        ObserverConstants.FabricObserverETWEventName,
                        new
                    {
                        Code        = FOErrorWarningCodes.Ok,
                        HealthState = Enum.GetName(typeof(HealthState), HealthState.Ok),
                        NodeName,
                        ObserverName,
                        Metric = $"{drive}{data.Property}",
                        Source = ObserverConstants.FabricObserverName,
                        Value  = Math.Round(data.AverageDataValue, 1),
                    });
                }
            }

            // Health Error
            if (data.IsUnhealthy(thresholdError))
            {
                thresholdName  = "Maximum";
                threshold      = thresholdError;
                warningOrError = true;
                healthState    = HealthState.Error;

                // This is primarily useful for AppObserver, but makes sense to be
                // part of the base class for future use, like for FSO.
                if (replicaOrInstance != null && dumpOnError)
                {
                    try
                    {
                        int procId = (int)replicaOrInstance.HostProcessId;

                        if (!this.serviceDumpCountDictionary.ContainsKey(procName))
                        {
                            this.serviceDumpCountDictionary.Add(procName, 0);
                        }

                        if (this.serviceDumpCountDictionary[procName] < this.maxDumps)
                        {
                            // DumpServiceProcess defaults to a Full dump with
                            // process memory, handles and thread data.
                            bool success = DumpServiceProcess(procId);

                            if (success)
                            {
                                this.serviceDumpCountDictionary[procName]++;
                            }
                        }
                    }

                    // Ignore these, it just means no dmp will be created.This is not
                    // critical to FO. Log as info, not warning.
                    catch (Exception e) when(e is ArgumentException || e is InvalidOperationException)
                    {
                        ObserverLogger.LogInfo($"Unable to generate dmp file:{Environment.NewLine}{e}");
                    }
                }
            }

            // Health Warning
            if (!warningOrError && data.IsUnhealthy(thresholdWarning))
            {
                warningOrError = true;
                healthState    = HealthState.Warning;
            }

            if (warningOrError)
            {
                string errorWarningCode = null;

                switch (data.Property)
                {
                case ErrorWarningProperty.TotalCpuTime when healthReportType == HealthReportType.Application:
                    errorWarningCode = (healthState == HealthState.Error) ?
                                       FOErrorWarningCodes.AppErrorCpuPercent : FOErrorWarningCodes.AppWarningCpuPercent;
                    break;

                case ErrorWarningProperty.TotalCpuTime:
                    errorWarningCode = (healthState == HealthState.Error) ?
                                       FOErrorWarningCodes.NodeErrorCpuPercent : FOErrorWarningCodes.NodeWarningCpuPercent;
                    break;

                case ErrorWarningProperty.DiskSpaceUsagePercentage:
                    errorWarningCode = (healthState == HealthState.Error) ?
                                       FOErrorWarningCodes.NodeErrorDiskSpacePercent : FOErrorWarningCodes.NodeWarningDiskSpacePercent;
                    break;

                case ErrorWarningProperty.DiskSpaceUsageMb:
                    errorWarningCode = (healthState == HealthState.Error) ?
                                       FOErrorWarningCodes.NodeErrorDiskSpaceMB : FOErrorWarningCodes.NodeWarningDiskSpaceMB;
                    break;

                case ErrorWarningProperty.TotalMemoryConsumptionMb when healthReportType == HealthReportType.Application:
                    errorWarningCode = (healthState == HealthState.Error) ?
                                       FOErrorWarningCodes.AppErrorMemoryMB : FOErrorWarningCodes.AppWarningMemoryMB;
                    break;

                case ErrorWarningProperty.TotalMemoryConsumptionMb:
                    errorWarningCode = (healthState == HealthState.Error) ?
                                       FOErrorWarningCodes.NodeErrorMemoryMB : FOErrorWarningCodes.NodeWarningMemoryMB;
                    break;

                case ErrorWarningProperty.TotalMemoryConsumptionPct when replicaOrInstance != null:
                    errorWarningCode = (healthState == HealthState.Error) ?
                                       FOErrorWarningCodes.AppErrorMemoryPercent : FOErrorWarningCodes.AppWarningMemoryPercent;
                    break;

                case ErrorWarningProperty.TotalMemoryConsumptionPct:
                    errorWarningCode = (healthState == HealthState.Error) ?
                                       FOErrorWarningCodes.NodeErrorMemoryPercent : FOErrorWarningCodes.NodeWarningMemoryPercent;
                    break;

                case ErrorWarningProperty.DiskAverageQueueLength:
                    errorWarningCode = (healthState == HealthState.Error) ?
                                       FOErrorWarningCodes.NodeErrorDiskAverageQueueLength : FOErrorWarningCodes.NodeWarningDiskAverageQueueLength;
                    break;

                case ErrorWarningProperty.TotalActiveFirewallRules:
                    errorWarningCode = (healthState == HealthState.Error) ?
                                       FOErrorWarningCodes.ErrorTooManyFirewallRules : FOErrorWarningCodes.WarningTooManyFirewallRules;
                    break;

                case ErrorWarningProperty.TotalActivePorts when healthReportType == HealthReportType.Application:
                    errorWarningCode = (healthState == HealthState.Error) ?
                                       FOErrorWarningCodes.AppErrorTooManyActiveTcpPorts : FOErrorWarningCodes.AppWarningTooManyActiveTcpPorts;
                    break;

                case ErrorWarningProperty.TotalActivePorts:
                    errorWarningCode = (healthState == HealthState.Error) ?
                                       FOErrorWarningCodes.NodeErrorTooManyActiveTcpPorts : FOErrorWarningCodes.NodeWarningTooManyActiveTcpPorts;
                    break;

                case ErrorWarningProperty.TotalEphemeralPorts when healthReportType == HealthReportType.Application:
                    errorWarningCode = (healthState == HealthState.Error) ?
                                       FOErrorWarningCodes.AppErrorTooManyActiveEphemeralPorts : FOErrorWarningCodes.AppWarningTooManyActiveEphemeralPorts;
                    break;

                case ErrorWarningProperty.TotalEphemeralPorts:
                    errorWarningCode = (healthState == HealthState.Error) ?
                                       FOErrorWarningCodes.NodeErrorTooManyActiveEphemeralPorts : FOErrorWarningCodes.NodeWarningTooManyActiveEphemeralPorts;
                    break;
                }

                var healthMessage = new StringBuilder();

                string drive = string.Empty;

                if (ObserverName == ObserverConstants.DiskObserverName)
                {
                    drive = $"{data.Id}: ";

                    if (RuntimeInformation.IsOSPlatform(OSPlatform.Windows))
                    {
                        drive = $"{data.Id.Remove(1, 2)}: ";
                    }
                }

                _ = healthMessage.Append($"{drive}{data.Property} is at or above the specified {thresholdName} limit ({threshold}{data.Units})");
                _ = healthMessage.AppendLine($" - {data.Property}: {Math.Round(data.AverageDataValue)}{data.Units}");

                // The health event description will be a serialized instance of telemetryData,
                // so it should be completely constructed (filled with data) regardless
                // of user telemetry settings.
                telemetryData.ApplicationName = appName?.OriginalString ?? string.Empty;
                telemetryData.Code            = errorWarningCode;
                if (replicaOrInstance != null && !string.IsNullOrEmpty(replicaOrInstance.ContainerId))
                {
                    telemetryData.ContainerId = replicaOrInstance.ContainerId;
                }
                telemetryData.HealthState            = Enum.GetName(typeof(HealthState), healthState);
                telemetryData.HealthEventDescription = healthMessage.ToString();
                telemetryData.Metric      = $"{drive}{data.Property}";
                telemetryData.ServiceName = serviceName?.OriginalString ?? string.Empty;
                telemetryData.Source      = ObserverConstants.FabricObserverName;
                telemetryData.Value       = Math.Round(data.AverageDataValue, 1);

                // Send Health Report as Telemetry event (perhaps it signals an Alert from App Insights, for example.).
                if (IsTelemetryProviderEnabled && IsObserverTelemetryEnabled)
                {
                    _ = TelemetryClient?.ReportHealthAsync(
                        telemetryData,
                        Token);
                }

                // ETW.
                if (IsEtwEnabled)
                {
                    Logger.EtwLogger?.Write(
                        ObserverConstants.FabricObserverETWEventName,
                        new
                    {
                        ApplicationName        = appName?.OriginalString ?? string.Empty,
                        Code                   = errorWarningCode,
                        ContainerId            = replicaOrInstance != null ? replicaOrInstance.ContainerId ?? string.Empty : string.Empty,
                        HealthState            = Enum.GetName(typeof(HealthState), healthState),
                        HealthEventDescription = healthMessage.ToString(),
                        Metric                 = $"{drive}{data.Property}",
                        Node                   = NodeName,
                        ServiceName            = serviceName?.OriginalString ?? string.Empty,
                        Source                 = ObserverConstants.FabricObserverName,
                        Value                  = Math.Round(data.AverageDataValue, 1),
                    });
                }

                var healthReport = new HealthReport
                {
                    AppName                   = appName,
                    Code                      = errorWarningCode,
                    EmitLogEvent              = true,
                    HealthData                = telemetryData,
                    HealthMessage             = healthMessage.ToString(),
                    HealthReportTimeToLive    = healthReportTtl,
                    ReportType                = healthReportType,
                    State                     = healthState,
                    NodeName                  = NodeName,
                    Observer                  = ObserverName,
                    ResourceUsageDataProperty = data.Property,
                };

                if (!AppNames.Any(a => a == appName?.OriginalString))
                {
                    AppNames.Add(appName?.OriginalString);
                }

                // From FSO.
                if (replicaOrInstance == null && healthReportType == HealthReportType.Application)
                {
                    HealthReportProperties.Add(id);
                }
                else
                {
                    if (HealthReportProperties.Count == 0)
                    {
                        HealthReportProperties.Add(ObserverName switch
                        {
                            ObserverConstants.AppObserverName => "ApplicationHealth",
                            ObserverConstants.CertificateObserverName => "SecurityHealth",
                            ObserverConstants.DiskObserverName => "DiskHealth",
                            ObserverConstants.FabricSystemObserverName => "FabricSystemServiceHealth",
                            ObserverConstants.NetworkObserverName => "NetworkHealth",
                            ObserverConstants.OSObserverName => "MachineInformation",
                            ObserverConstants.NodeObserverName => "MachineResourceHealth",
                            _ => $"{data.Property}",
                        });
        public override Task ReportAsync(CancellationToken token)
        {
            // Local log.
            ObserverLogger.LogInfo(message.ToString());

            /* Report to Fabric */

            // These values will be preserved across observer runs and are useful for clearing warnings
            // by reporting Ok health state health events with the same property and sourceid values
            // as the error/warning health events when FO is safely taken down (e.g., app is being uninstalled,
            // safe restart of fabric node it's running on, etc.).
            HealthReportProperties.Add("SomePropertyName");
            HealthReportSourceIds.Add($"{ObserverName}_SomethingUniqueToThisReport");

            var healthReporter = new ObserverHealthReporter(ObserverLogger, FabricClientInstance);
            var healthReport   = new Utilities.HealthReport
            {
                Code          = FOErrorWarningCodes.Ok,
                HealthMessage = this.message.ToString(),
                NodeName      = NodeName,
                Observer      = ObserverName,
                Property      = HealthReportProperties[HealthReportProperties.Count - 1],
                ReportType    = HealthReportType.Node,
                State         = HealthState.Ok,
            };

            healthReporter.ReportHealthToServiceFabric(healthReport);

            // Emit Telemetry - This will use whatever telemetry provider you have configured in FabricObserver Settings.xml.
            var telemetryData = new TelemetryData(FabricClientInstance, Token)
            {
                Code = FOErrorWarningCodes.Ok,
                HealthEventDescription = this.message.ToString(),
                HealthState            = "Ok",
                NodeName     = NodeName,
                ObserverName = ObserverName,
                Source       = ObserverConstants.FabricObserverName,
            };

            if (IsTelemetryProviderEnabled && IsObserverTelemetryEnabled)
            {
                _ = TelemetryClient?.ReportHealthAsync(
                    telemetryData,
                    Token);
            }

            // ETW.
            if (IsEtwEnabled)
            {
                Logger.EtwLogger?.Write(
                    ObserverConstants.FabricObserverETWEventName,
                    new
                {
                    Code = FOErrorWarningCodes.Ok,
                    HealthEventDescription = this.message.ToString(),
                    HealthState            = "Ok",
                    NodeName,
                    ObserverName,
                    Source = ObserverConstants.FabricObserverName,
                });
            }

            this.message.Clear();

            return(Task.CompletedTask);
        }