예제 #1
0
        /// <summary>
        /// Reports the given description against the cluster patching status property of this service.
        /// The health state of the report depends on the result of CheckIfConsiderWarningAsErrorIsTrue:
        /// when it is true the report is posted as <see cref="HealthState.Ok"/>, otherwise as
        /// <see cref="HealthState.Warning"/>.
        /// </summary>
        /// <param name="warningDescription">Text used as the description of the health report.</param>
        /// <param name="timeToLiveInMinutes">Time-to-live value forwarded unchanged to the health report helper.</param>
        /// <returns>A task that completes once the report has been handed to the health report helper.</returns>
        internal async Task PostWarningOnCoordinatorService(string warningDescription, int timeToLiveInMinutes)
        {
            // NOTE(review): presumably the report is downgraded to Ok so that a cluster which treats
            // warnings as errors is not made unhealthy by this report itself - confirm with the owners.
            HealthState stateToReport = (await CheckIfConsiderWarningAsErrorIsTrue())
                ? HealthState.Ok
                : HealthState.Warning;

            HealthManagerHelper.PostNodeHealthReport(
                this.fabricClient,
                this.context.ServiceName,
                ClusterPatchingStatusProperty,
                warningDescription,
                stateToReport,
                timeToLiveInMinutes);
        }
        /// <summary>
        /// Expires health events that were reported on the service named by NodeAgentServiceName for
        /// nodes that are no longer returned by the cluster node query.
        /// </summary>
        /// <remarks>
        /// Only events whose property contains WUOperationStatus or WUOperationSetting are considered.
        /// An event is treated as orphaned when its property is not "&lt;prefix&gt;-&lt;nodeName&gt;" for
        /// any current node; it is then re-posted as <see cref="HealthState.Ok"/> with a TTL of 1 minute.
        /// Any exception (including cancellation) is logged and not propagated to the caller.
        /// </remarks>
        /// <param name="cancellationToken">Token used to cancel the fabric queries issued by this method.</param>
        /// <returns>A task that completes when the clean-up pass has finished or failed.</returns>
        public async Task ClearOrphanEvents(CancellationToken cancellationToken)
        {
            try
            {
                Uri nodeAgentServiceUri = new Uri(NodeAgentServiceName);

                // Use the timeout/cancellation-aware overload so this query honours the caller's token,
                // consistent with the node list query below.
                ServiceHealth health = await this.fabricClient.HealthManager.GetServiceHealthAsync(
                    nodeAgentServiceUri,
                    this.DefaultTimeoutForOperation,
                    cancellationToken);

                List<HealthEvent> healthEventsToCheck = new List<HealthEvent>();
                foreach (var e in health.HealthEvents)
                {
                    if (e.HealthInformation.Property.Contains(WUOperationStatus) || e.HealthInformation.Property.Contains(WUOperationSetting))
                    {
                        healthEventsToCheck.Add(e);
                    }
                }

                cancellationToken.ThrowIfCancellationRequested();

                NodeList nodeList = await this.fabricClient.QueryManager.GetNodeListAsync(null, null, this.DefaultTimeoutForOperation, cancellationToken);

                // Shortcut: exactly two matching events per node is taken to mean nothing is orphaned.
                if (healthEventsToCheck.Count == 2 * nodeList.Count)
                {
                    return;
                }

                // Properties that belong to nodes currently in the cluster.
                HashSet<string> expectedProperties = new HashSet<string>();
                foreach (var node in nodeList)
                {
                    expectedProperties.Add(WUOperationStatus + "-" + node.NodeName);
                    expectedProperties.Add(WUOperationSetting + "-" + node.NodeName);
                }

                const string nodeNotPartOfClusterDescription = "This node is no longer part of the cluster.";
                foreach (var e in healthEventsToCheck)
                {
                    if (!expectedProperties.Contains(e.HealthInformation.Property))
                    {
                        ServiceEventSource.Current.VerboseMessage("Property {0}'s event is removed from CoordinatorService by updating TTL to 1 minute.", e.HealthInformation.Property);

                        // Re-post under the same source and property with a 1 minute TTL so the event expires.
                        HealthManagerHelper.PostNodeHealthReport(this.fabricClient, nodeAgentServiceUri, e.HealthInformation.SourceId, e.HealthInformation.Property, nodeNotPartOfClusterDescription, HealthState.Ok, 1);
                    }
                }
            }
            catch (Exception ex)
            {
                // Best-effort clean-up: failures are logged and swallowed.
                ServiceEventSource.Current.ErrorMessage("ClearOrphanEvents failed with exception {0}", ex.ToString());
            }
        }
예제 #3
0
        /// <summary>
        /// Checks whether the repair manager service is enabled on the cluster and posts a matching
        /// health report on this service under the RepairManagerStatus property.
        /// </summary>
        /// <param name="cancellationToken">Cancellation token used to stop the asynchronous service list query.</param>
        /// <returns>true if the service list query for RepairManagerUri returned at least one entry, otherwise false.</returns>
        internal async Task<bool> CheckRepairManagerStatus(CancellationToken cancellationToken)
        {
            ServiceList matchingServices = await this.fabricClient.QueryManager.GetServiceListAsync(
                SystemUri,
                RepairManagerUri,
                this.DefaultTimeoutForOperation,
                cancellationToken);

            if (matchingServices.Count != 0)
            {
                // Repair manager was found: report Ok with a TTL argument of 1.
                string availableDescription = string.Format("{0} is available", RepairManagerUri);
                HealthManagerHelper.PostNodeHealthReport(
                    this.fabricClient,
                    this.context.ServiceName,
                    RepairManagerStatus,
                    availableDescription,
                    HealthState.Ok,
                    1);
                return true;
            }

            // Repair manager is missing: raise a warning (no explicit TTL is passed here).
            string missingDescription = string.Format(
                "{0} could not be found, Patch Orchestration Service requires RepairManager system service to be enable on the cluster. Consider adding RepairManager section in cluster manifest.",
                RepairManagerUri);
            HealthManagerHelper.PostNodeHealthReport(
                this.fabricClient,
                this.context.ServiceName,
                RepairManagerStatus,
                missingDescription,
                HealthState.Warning);
            return false;
        }
예제 #4
0
        /// <summary>
        /// Builds a summary of the repair task targets that are being processed and those that are
        /// still claimed, and posts it as an <see cref="HealthState.Ok"/> report on the cluster
        /// patching status property of this service.
        /// </summary>
        /// <param name="cancellationToken">Token forwarded to the fabric queries; also checked between the two task queries.</param>
        /// <returns>A task that completes once the health report has been handed to the helper.</returns>
        private async Task PostRMTaskNodeUpdate(CancellationToken cancellationToken)
        {
            NodeList clusterNodes = await this.fabricClient.QueryManager.GetNodeListAsync(null, null, this.DefaultTimeoutForOperation, cancellationToken);

            // Targets of claimed tasks are reported as "waiting to be patched".
            IList<RepairTask> claimedTasks = await this.GetClaimedRepairTasks(clusterNodes, cancellationToken);
            HashSet<string> waitingTargets = new HashSet<string>();
            foreach (var claimedTask in claimedTasks)
            {
                waitingTargets.Add(claimedTask.Target.ToString());
            }

            cancellationToken.ThrowIfCancellationRequested();

            // Targets of tasks under processing are reported as "currently being patched".
            RepairTaskList tasksInProgress = await this.GetRepairTasksUnderProcessing(cancellationToken);
            HashSet<string> activeTargets = new HashSet<string>();
            foreach (var activeTask in tasksInProgress)
            {
                activeTargets.Add(activeTask.Target.ToString());
            }

            // An empty joined list is shown as "None".
            string waitingJoined = string.Join(", ", waitingTargets);
            string waitingText = String.IsNullOrEmpty(waitingJoined) ? "None" : waitingJoined;

            string activeJoined = string.Join(", ", activeTargets);
            string activeText = String.IsNullOrEmpty(activeJoined) ? "None" : activeJoined;

            string description = string.Format(" Node currently being patched: {0} \nNodes waiting to be patched: {1}", activeText, waitingText);

            HealthManagerHelper.PostNodeHealthReport(this.fabricClient, this.context.ServiceName, ClusterPatchingStatusProperty, description, HealthState.Ok);
        }
예제 #5
0
        /// <summary>
        /// Fetches all the repair tasks which are approved or executing and checks
        /// if any of them has exceeded its execution timeout (plus GraceTimeForNtService).
        /// </summary>
        /// <remarks>
        /// Does nothing when ManageRepairTasksOnTimeout is false. For each timed-out task:
        /// if the target node is no longer returned by the node query the task is cancelled;
        /// if the executor sub-state is RestartRequested, RestartCompleted or InstallationCompleted
        /// only a warning health report is posted; otherwise the task is moved to
        /// <see cref="RepairTaskState.Restoring"/> through UpdateRepairTaskState.
        /// </remarks>
        /// <param name="cancellationToken">Token forwarded to the fabric queries and to the task state update.</param>
        /// <returns>A task that completes when every fetched repair task has been examined.</returns>
        internal async Task TimeoutRepairTasks(CancellationToken cancellationToken)
        {
            if (!this.ManageRepairTasksOnTimeout)
            {
                return;
            }

            // Get repair tasks which have been approved and are still under execution by POA
            RepairTaskList repairTasks = await this.fabricClient.RepairManager.GetRepairTaskListAsync(
                TaskIdPrefix,
                RepairTaskStateFilter.Approved | RepairTaskStateFilter.Executing,
                ExecutorName,
                this.DefaultTimeoutForOperation,
                cancellationToken);

            foreach (var task in repairTasks)
            {
                ExecutorDataForRmTask executorData =
                    SerializationUtility.Deserialize<ExecutorDataForRmTask>(task.ExecutorData);
                Debug.Assert(task.ApprovedTimestamp != null, "ApprovedTimestamp of an approved repair task can never be null");

                // Time since approval, compared against the task's own timeout plus the grace period.
                TimeSpan elapsedTime = DateTime.UtcNow.Subtract(task.ApprovedTimestamp.Value);
                TimeSpan allowedTime = TimeSpan.FromMinutes(executorData.ExecutorTimeoutInMinutes) + GraceTimeForNtService;
                if (elapsedTime <= allowedTime)
                {
                    continue;
                }

                // Check whether the target node is still returned by the node query.
                bool nodeExists = false;
                string nodeName = this.GetNodeNameFromRepairTask(task);
                NodeList nodeList = await this.fabricClient.QueryManager.GetNodeListAsync(nodeName, null, this.DefaultTimeoutForOperation, cancellationToken);

                foreach (var node in nodeList)
                {
                    if (node.NodeName.Equals(nodeName))
                    {
                        nodeExists = true;
                        break;
                    }
                }

                if (!nodeExists)
                {
                    // If node does not exist now, there is no point in waiting on the task.
                    ServiceEventSource.Current.VerboseMessage("Cancelling repair task {0} which is in {1} state as the node {2} does not exist anymore.", task.TaskId, task.State, nodeName);
                    await this.CancelRepairTask(task);

                    continue;
                }

                switch (executorData.ExecutorSubState)
                {
                    // These are special states where its best if NodeAgentNtService should move the repair task, just post warning in this case
                    case NodeAgentSfUtilityExitCodes.RestartRequested:
                    case NodeAgentSfUtilityExitCodes.RestartCompleted:
                    case NodeAgentSfUtilityExitCodes.InstallationCompleted:
                    {
                        string healthproperty = string.Format(
                            NodeTimeoutStatusFormat,
                            nodeName);

                        // The two sentences are separated by ". "; previously they were concatenated
                        // without any separator ("...to completeIn case...").
                        string healthDescription =
                            string.Format(
                                "Installation timeout {0} minutes alloted to repair task {1}, node {2} is over, however since node is in post-installation phase, wait for few more minutes for operation to complete. "
                                + "In case problem persists, please check if recent installations of updates has caused any problem on the node",
                                executorData.ExecutorTimeoutInMinutes,
                                task.TaskId,
                                nodeName);
                        ServiceEventSource.Current.ErrorMessage("Title = {0}, Description = {1}", healthproperty, healthDescription);
                        HealthManagerHelper.PostNodeHealthReport(
                            this.fabricClient,
                            this.context.ServiceName,
                            healthproperty,
                            healthDescription,
                            HealthState.Warning,
                            60);

                        break;
                    }

                    default:
                    {
                        await UpdateRepairTaskState(task, nodeName, RepairTaskState.Restoring, executorData.ExecutorTimeoutInMinutes, cancellationToken);

                        break;
                    }
                }
            }
        }
예제 #6
0
        /// <summary>
        /// Post the cluster patching status as a health event on this service.
        /// </summary>
        /// <remarks>
        /// Outcomes, based on the claimed and under-processing repair task lists:
        /// - tasks are being processed: the counter is reset and PostRMTaskNodeUpdate posts the node summary;
        /// - no claimed and no processing tasks: the counter is reset and an Ok report is posted;
        /// - claimed tasks but none processing: if the cluster's aggregated health is Error a warning is
        ///   posted immediately; otherwise postUpdateCount is incremented and a warning is posted once it
        ///   exceeds 60, after which the counter is reset.
        /// Any exception (including cancellation) is logged and not propagated to the caller.
        /// </remarks>
        /// <param name="cancellationToken">Token forwarded to the fabric queries issued by this method.</param>
        /// <returns>A task that completes when the status has been posted or the attempt has failed.</returns>
        public async Task PostClusterPatchingStatus(CancellationToken cancellationToken)
        {
            try
            {
                NodeList nodeList = await this.fabricClient.QueryManager.GetNodeListAsync(null, null, this.DefaultTimeoutForOperation, cancellationToken);

                IList<RepairTask> claimedTaskList = await this.GetClaimedRepairTasks(nodeList, cancellationToken);

                RepairTaskList processingTaskList = await this.GetRepairTasksUnderProcessing(cancellationToken);

                cancellationToken.ThrowIfCancellationRequested();

                if (processingTaskList.Any())
                {
                    // Patching is progressing: reset count and publish which nodes are being patched.
                    postUpdateCount = 0;
                    await PostRMTaskNodeUpdate(cancellationToken);
                    return;
                }

                if (!claimedTaskList.Any())
                {
                    // Post the health event saying that there is no repair task and things are working fine.
                    postUpdateCount = 0;
                    string description = "No claimed tasks and no processing tasks are found.";

                    // NOTE(review): -1 is passed as the TTL; its meaning is defined by HealthManagerHelper - confirm.
                    HealthManagerHelper.PostNodeHealthReport(this.fabricClient, this.context.ServiceName, ClusterPatchingStatusProperty, description, HealthState.Ok, -1);
                    return;
                }

                // Claimed tasks exist but none is being processed: repair tasks are not getting approved.
                // Use the timeout/cancellation-aware overload so the query honours the caller's token.
                ClusterHealth clusterHealth = await this.fabricClient.HealthManager.GetClusterHealthAsync(this.DefaultTimeoutForOperation, cancellationToken);

                if (clusterHealth.AggregatedHealthState == HealthState.Error)
                {
                    // Reset Count
                    postUpdateCount = 0;
                    string warningDescription = " Cluster is currently unhealthy. Nodes are currently not getting patched by Patch Orchestration Application. Please ensure the cluster becomes healthy for patching to continue.";
                    await PostWarningOnCoordinatorService(warningDescription, 1);
                    return;
                }

                postUpdateCount++;
                if (postUpdateCount > 60)
                {
                    // Reset count and post a warning saying the reason is unknown, but tasks are not being approved.
                    postUpdateCount = 0;
                    string warningDescription = "Patch Orchestration Application is currently not patching nodes. This could be possible if there is some node which is stuck in disabling state for long time.";
                    await PostWarningOnCoordinatorService(warningDescription, 61);
                }
            }
            catch (Exception ex)
            {
                // Best-effort status update: failures are logged and swallowed.
                ServiceEventSource.Current.ErrorMessage("PostClusterPatchingStatus failed with exception {0}", ex.ToString());
            }
        }