/// <summary>
/// Posts <paramref name="warningDescription"/> as a health report on the ClusterPatchingStatusProperty of this service.
/// The report carries HealthState.Ok when CheckIfConsiderWarningAsErrorIsTrue yields true, and HealthState.Warning otherwise.
/// </summary>
/// <param name="warningDescription">Description text attached to the health report.</param>
/// <param name="timeToLiveInMinutes">Time-to-live value, in minutes, forwarded to HealthManagerHelper.PostNodeHealthReport.</param>
internal async Task PostWarningOnCoordinatorService(string warningDescription, int timeToLiveInMinutes)
{
    // NOTE(review): the downgrade to Ok is presumably there so that a cluster which treats
    // warnings as errors is not made unhealthy by this report - confirm against the manifest setting.
    HealthState reportedState = (await CheckIfConsiderWarningAsErrorIsTrue())
        ? HealthState.Ok
        : HealthState.Warning;

    HealthManagerHelper.PostNodeHealthReport(
        this.fabricClient,
        this.context.ServiceName,
        ClusterPatchingStatusProperty,
        warningDescription,
        reportedState,
        timeToLiveInMinutes);
}
/// <summary>
/// Expires the WUOperationStatus / WUOperationSetting health events reported on the service named by
/// NodeAgentServiceName whose property does not correspond to any node currently returned by the node list query.
/// An event is expired by re-posting it with HealthState.Ok and a time-to-live of 1 minute.
/// </summary>
/// <param name="cancellationToken">Token used to cancel the health and node list queries.</param>
/// <returns>A task that completes once the stale events have been re-posted. Exceptions are logged and not rethrown.</returns>
public async Task ClearOrphanEvents(CancellationToken cancellationToken)
{
    try
    {
        Uri nodeAgentServiceUri = new Uri(NodeAgentServiceName);

        // Pass the operation timeout and the cancellation token like every other fabric query in this class,
        // so that this call cannot outlive a cancellation request.
        ServiceHealth health = await this.fabricClient.HealthManager.GetServiceHealthAsync(
            nodeAgentServiceUri,
            this.DefaultTimeoutForOperation,
            cancellationToken);

        List<HealthEvent> healthEventsToCheck = new List<HealthEvent>();
        foreach (var e in health.HealthEvents)
        {
            if (e.HealthInformation.Property.Contains(WUOperationStatus) || e.HealthInformation.Property.Contains(WUOperationSetting))
            {
                healthEventsToCheck.Add(e);
            }
        }

        cancellationToken.ThrowIfCancellationRequested();

        NodeList nodeList = await this.fabricClient.QueryManager.GetNodeListAsync(null, null, this.DefaultTimeoutForOperation, cancellationToken);

        // Two events (status + setting) are expected per node; when the counts line up
        // the events are treated as all belonging to live nodes and nothing is cleaned.
        if (healthEventsToCheck.Count == 2 * nodeList.Count)
        {
            return;
        }

        // Property names that belong to nodes which are still part of the cluster.
        HashSet<string> liveNodeProperties = new HashSet<string>();
        foreach (var node in nodeList)
        {
            liveNodeProperties.Add(WUOperationStatus + "-" + node.NodeName);
            liveNodeProperties.Add(WUOperationSetting + "-" + node.NodeName);
        }

        string nodeNotPartOfClusterDescription = "This node is no longer part of the cluster.";
        foreach (var e in healthEventsToCheck)
        {
            if (liveNodeProperties.Contains(e.HealthInformation.Property))
            {
                continue;
            }

            ServiceEventSource.Current.VerboseMessage("Property {0}'s event is removed from CoordinatorService by updating TTL to 1 minute.", e.HealthInformation.Property);
            HealthManagerHelper.PostNodeHealthReport(fabricClient, nodeAgentServiceUri, e.HealthInformation.SourceId, e.HealthInformation.Property, nodeNotPartOfClusterDescription, HealthState.Ok, 1);
        }
    }
    catch (Exception ex)
    {
        // Best-effort cleanup: failures (including cancellation) are logged and swallowed.
        ServiceEventSource.Current.ErrorMessage("ClearOrphanEvents failed with exception {0}", ex.ToString());
    }
}
/// <summary>
/// Checks whether the repair manager service is enabled on the cluster and posts a matching
/// health report on the RepairManagerStatus property of this service.
/// </summary>
/// <param name="cancellationToken">Cancellation token used to stop the asynchronous service list query.</param>
/// <returns>true if the service list query for RepairManagerUri under SystemUri returns at least one entry, otherwise false.</returns>
internal async Task<bool> CheckRepairManagerStatus(CancellationToken cancellationToken)
{
    ServiceList repairManagerServices = await this.fabricClient.QueryManager.GetServiceListAsync(
        SystemUri,
        RepairManagerUri,
        this.DefaultTimeoutForOperation,
        cancellationToken);

    bool repairManagerFound = repairManagerServices.Count != 0;
    if (repairManagerFound)
    {
        // Healthy case: report Ok with a time-to-live of 1 minute.
        string availableDescription = string.Format("{0} is available", RepairManagerUri);
        HealthManagerHelper.PostNodeHealthReport(this.fabricClient, this.context.ServiceName, RepairManagerStatus, availableDescription, HealthState.Ok, 1);
        return true;
    }

    // Missing case: report a Warning using the overload without an explicit time-to-live.
    string missingDescription = string.Format("{0} could not be found, Patch Orchestration Service requires RepairManager system service to be enable on the cluster. Consider adding RepairManager section in cluster manifest.", RepairManagerUri);
    HealthManagerHelper.PostNodeHealthReport(this.fabricClient, this.context.ServiceName, RepairManagerStatus, missingDescription, HealthState.Warning);
    return false;
}
/// <summary>
/// Builds a description listing the targets of the repair tasks under processing and of the claimed
/// repair tasks, and posts it with HealthState.Ok on the ClusterPatchingStatusProperty of this service.
/// </summary>
/// <param name="cancellationToken">Token forwarded to the node list and repair task queries.</param>
private async Task PostRMTaskNodeUpdate(CancellationToken cancellationToken)
{
    NodeList clusterNodes = await this.fabricClient.QueryManager.GetNodeListAsync(null, null, this.DefaultTimeoutForOperation, cancellationToken);

    // Targets of claimed tasks are reported as "waiting"; the set removes duplicates.
    IList<RepairTask> claimedTasks = await this.GetClaimedRepairTasks(clusterNodes, cancellationToken);
    HashSet<string> waitingTargets = new HashSet<string>(claimedTasks.Select(claimed => claimed.Target.ToString()));

    cancellationToken.ThrowIfCancellationRequested();

    // Targets of tasks under processing are reported as "currently being patched".
    RepairTaskList tasksInProgress = await this.GetRepairTasksUnderProcessing(cancellationToken);
    HashSet<string> activeTargets = new HashSet<string>(tasksInProgress.Select(inProgress => inProgress.Target.ToString()));

    string waitingText = string.Join(", ", waitingTargets);
    waitingText = String.IsNullOrEmpty(waitingText) ? "None" : waitingText;

    string activeText = string.Join(", ", activeTargets);
    activeText = String.IsNullOrEmpty(activeText) ? "None" : activeText;

    string description = string.Format(" Node currently being patched: {0} \nNodes waiting to be patched: {1}", activeText, waitingText);
    HealthManagerHelper.PostNodeHealthReport(fabricClient, this.context.ServiceName, ClusterPatchingStatusProperty, description, HealthState.Ok);
}
/// <summary>
/// Fetches all the repair tasks which are approved or executing and checks
/// if any of them has exceeded its execution timeout (plus GraceTimeForNtService).
/// A timed-out task is cancelled when its node no longer exists, reported as a warning when the
/// executor is in a restart / post-installation sub-state, and moved to Restoring otherwise.
/// Does nothing when ManageRepairTasksOnTimeout is false.
/// </summary>
/// <param name="cancellationToken">Token forwarded to the repair task and node list queries.</param>
/// <returns>A task that completes once every fetched repair task has been examined.</returns>
internal async Task TimeoutRepairTasks(CancellationToken cancellationToken)
{
    if (!this.ManageRepairTasksOnTimeout)
    {
        return;
    }

    // Get repair tasks which have been approved and are still under execution by POA
    RepairTaskList repairTasks = await this.fabricClient.RepairManager.GetRepairTaskListAsync(
        TaskIdPrefix,
        RepairTaskStateFilter.Approved | RepairTaskStateFilter.Executing,
        ExecutorName,
        this.DefaultTimeoutForOperation,
        cancellationToken);

    foreach (var task in repairTasks)
    {
        ExecutorDataForRmTask executorData = SerializationUtility.Deserialize<ExecutorDataForRmTask>(task.ExecutorData);
        Debug.Assert(task.ApprovedTimestamp != null, "ApprovedTimestamp of an approved repair task can never be null");

        TimeSpan elapsedTime = DateTime.UtcNow.Subtract(task.ApprovedTimestamp.Value);
        TimeSpan allowedTime = TimeSpan.FromMinutes(executorData.ExecutorTimeoutInMinutes) + GraceTimeForNtService;
        if (elapsedTime <= allowedTime)
        {
            // Task is still within its time budget.
            continue;
        }

        // Check whether the node targeted by the task is still part of the cluster.
        string nodeName = this.GetNodeNameFromRepairTask(task);
        NodeList nodeList = await this.fabricClient.QueryManager.GetNodeListAsync(nodeName, null, this.DefaultTimeoutForOperation, cancellationToken);
        bool nodeExists = nodeList.Any(node => node.NodeName.Equals(nodeName));

        if (!nodeExists)
        {
            // If node does not exist now, there is no point in waiting on the task.
            ServiceEventSource.Current.VerboseMessage("Cancelling repair task {0} which is in {1} state as the node {2} does not exist anymore.", task.TaskId, task.State, nodeName);
            await this.CancelRepairTask(task);
            continue;
        }

        switch (executorData.ExecutorSubState)
        {
            // These are special states where its best if NodeAgentNtService should move the repair task, just post warning in this case
            case NodeAgentSfUtilityExitCodes.RestartRequested:
            case NodeAgentSfUtilityExitCodes.RestartCompleted:
            case NodeAgentSfUtilityExitCodes.InstallationCompleted:
                {
                    string healthproperty = string.Format(
                        NodeTimeoutStatusFormat,
                        nodeName);

                    // The two sentences are separated by ". "; previously they were concatenated
                    // without any separator ("...to completeIn case problem persists...").
                    string healthDescription = string.Format(
                        "Installation timeout {0} minutes alloted to repair task {1}, node {2} is over, however since node is in post-installation phase, wait for few more minutes for operation to complete. " +
                        "In case problem persists, please check if recent installations of updates has caused any problem on the node",
                        executorData.ExecutorTimeoutInMinutes,
                        task.TaskId,
                        nodeName);

                    ServiceEventSource.Current.ErrorMessage("Title = {0}, Description = {1}", healthproperty, healthDescription);
                    HealthManagerHelper.PostNodeHealthReport(this.fabricClient, this.context.ServiceName, healthproperty, healthDescription, HealthState.Warning, 60);
                    break;
                }

            default:
                {
                    // Any other sub-state: move the timed-out task to Restoring.
                    await UpdateRepairTaskState(task, nodeName, RepairTaskState.Restoring, executorData.ExecutorTimeoutInMinutes, cancellationToken);
                    break;
                }
        }
    }
}
/// <summary>
/// Posts the cluster patching status as a health event on this service.
/// When repair tasks are under processing, the per-node status is posted via PostRMTaskNodeUpdate.
/// When tasks are claimed but none is being processed, a warning is posted: immediately if the cluster's
/// aggregated health is Error, otherwise only after this state was observed on more than 60 calls in a row.
/// When there are neither claimed nor processing tasks, an Ok event is posted.
/// </summary>
/// <param name="cancellationToken">Token forwarded to the node list, repair task and cluster health queries.</param>
/// <returns>A task that completes once the status has been posted. Exceptions are logged and not rethrown.</returns>
public async Task PostClusterPatchingStatus(CancellationToken cancellationToken)
{
    try
    {
        NodeList nodeList = await this.fabricClient.QueryManager.GetNodeListAsync(null, null, this.DefaultTimeoutForOperation, cancellationToken);
        IList<RepairTask> claimedTaskList = await this.GetClaimedRepairTasks(nodeList, cancellationToken);
        RepairTaskList processingTaskList = await this.GetRepairTasksUnderProcessing(cancellationToken);
        cancellationToken.ThrowIfCancellationRequested();

        if (processingTaskList.Any())
        {
            // Patching is progressing: reset the stall counter and publish the per-node status.
            postUpdateCount = 0;
            await PostRMTaskNodeUpdate(cancellationToken);
            return;
        }

        if (!claimedTaskList.Any())
        {
            // Post the health event saying that there is no repair task and things are working fine.
            postUpdateCount = 0;
            string description = "No claimed tasks and no processing tasks are found.";

            // NOTE(review): a time-to-live of -1 presumably means "never expires" - confirm in HealthManagerHelper.
            HealthManagerHelper.PostNodeHealthReport(this.fabricClient, this.context.ServiceName, ClusterPatchingStatusProperty, description, HealthState.Ok, -1);
            return;
        }

        // Tasks are claimed but none is being processed: repair tasks are not getting approved.
        // Pass the operation timeout and the cancellation token like the other fabric queries in this class.
        ClusterHealth clusterHealth = await this.fabricClient.HealthManager.GetClusterHealthAsync(this.DefaultTimeoutForOperation, cancellationToken);
        if (clusterHealth.AggregatedHealthState == HealthState.Error)
        {
            // Reset Count
            postUpdateCount = 0;
            string warningDescription = " Cluster is currently unhealthy. Nodes are currently not getting patched by Patch Orchestration Application. Please ensure the cluster becomes healthy for patching to continue.";
            await PostWarningOnCoordinatorService(warningDescription, 1);
            return;
        }

        postUpdateCount++;
        if (postUpdateCount > 60)
        {
            // Reset the count and post a warning on the service: the reason is unknown, but POA is not approving tasks.
            postUpdateCount = 0;
            string warningDescription = "Patch Orchestration Application is currently not patching nodes. This could be possible if there is some node which is stuck in disabling state for long time.";
            await PostWarningOnCoordinatorService(warningDescription, 61);
        }
    }
    catch (Exception ex)
    {
        // Best-effort status reporting: failures (including cancellation) are logged and swallowed.
        ServiceEventSource.Current.ErrorMessage("PostClusterPatchingStatus failed with exception {0}", ex.ToString());
    }
}