/// <summary>
/// Prepares the claimed repair tasks belonging to POS as per the TaskApprovalPolicy.
/// </summary>
/// <remarks>
/// NodeWise: only the oldest claimed task (by CreatedTimestamp) is prepared, and only when
/// no repair task is currently under processing.
/// UpgradeDomainWise: every claimed task whose upgrade domain matches the upgrade domain
/// currently under processing is prepared. When no upgrade domain is under processing, the
/// upgrade domain of the first claimed task that has one is adopted.
/// </remarks>
/// <param name="cancellationToken">Token forwarded to the node-list and repair-task queries.</param>
/// <returns>Task for the asynchronous operation</returns>
/// <exception cref="InvalidOperationException">Thrown when RmPolicy is not one of the handled policies.</exception>
internal async Task PrepareRepairTasks(CancellationToken cancellationToken)
{
    NodeList nodeList = await this.fabricClient.QueryManager.GetNodeListAsync(null, null, this.DefaultTimeoutForOperation, cancellationToken);
    IList<RepairTask> claimedTaskList = await this.GetClaimedRepairTasks(nodeList, cancellationToken);

    switch (RmPolicy)
    {
        case TaskApprovalPolicy.NodeWise:
        {
            RepairTaskList processingTaskList = await this.GetRepairTasksUnderProcessing(cancellationToken);

            // One node at a time: start a new task only when nothing is in flight.
            if (!processingTaskList.Any() && claimedTaskList.Any())
            {
                RepairTask oldestClaimedTask = claimedTaskList.Aggregate(
                    (curMin, task) => (task.CreatedTimestamp < curMin.CreatedTimestamp ? task : curMin));

                // Placeholders are {0}, {1}, {2}: the original reused {0} and never printed the task id.
                ServiceEventSource.Current.VerboseMessage(
                    "Out of {0} claimed tasks, Oldest repair task = {1} with node = {2} will be prepared",
                    claimedTaskList.Count,
                    oldestClaimedTask.TaskId,
                    oldestClaimedTask.Target);

                this.StartPreparingRepairTask(oldestClaimedTask);
            }

            break;
        }

        case TaskApprovalPolicy.UpgradeDomainWise:
        {
            string currentUpgradeDomain = await this.GetCurrentUpgradeDomainUnderProcessing(nodeList, cancellationToken);

            ServiceEventSource.Current.VerboseMessage(String.Format("{0} repair tasks were found in claimed state", claimedTaskList.Count));

            // Below line can be enabled for debugging
            // rmHelper.PrintRepairTasks(claimedTaskList);
            foreach (var claimedTask in claimedTaskList)
            {
                string udName = this.GetUpgradeDomainOfRepairTask(claimedTask, nodeList);

                // Nothing is under processing yet: adopt the first upgrade domain seen.
                if (string.IsNullOrEmpty(currentUpgradeDomain))
                {
                    currentUpgradeDomain = udName;
                }

                if (udName == currentUpgradeDomain)
                {
                    this.StartPreparingRepairTask(claimedTask);
                }
            }

            break;
        }

        default:
        {
            string errorMessage = String.Format("Illegal RmPolicy found: {0}", RmPolicy);
            ServiceEventSource.Current.ErrorMessage(errorMessage);
            throw new InvalidOperationException(errorMessage);
        }
    }
}
/// <summary>
/// Queries the repair manager for repair tasks matching the given filters and wraps
/// every returned task in a <c>ServiceFabricRepairTask</c>.
/// </summary>
/// <param name="activityId">Activity identifier. Not read by the current implementation (the activity logging that used it is disabled).</param>
/// <param name="taskIdFilter">Task id filter passed through to the repair manager; may be null.</param>
/// <param name="stateFilter">State filter passed through to the repair manager.</param>
/// <param name="executorFilter">Executor filter passed through to the repair manager; may be null.</param>
/// <returns>The wrapped repair tasks, in the order the repair manager returned them.</returns>
public async Task<IList<IRepairTask>> GetRepairTaskListAsync(
    Guid activityId,
    string taskIdFilter = null,
    RepairTaskStateFilter stateFilter = RepairTaskStateFilter.Default,
    string executorFilter = null)
{
    // Activity logging (activityLogger.LogOperation with a start time) is currently disabled in this method.
    try
    {
        // TODO, using the overload without timeout and cancellation token for now since there is some max timeout limit
        // being exercised somewhere. if timeout provided is more than that, repair task creation fails
        RepairTaskList fetchedTasks = await repairManager
            .GetRepairTaskListAsync(taskIdFilter, stateFilter, executorFilter)
            .ConfigureAwait(false);

        var wrappedTasks = new List<IRepairTask>(fetchedTasks.Count);

        foreach (var fetchedTask in fetchedTasks)
        {
            IRepairTask wrapped = new ServiceFabricRepairTask(fetchedTask);
            wrappedTasks.Add(wrapped);
        }

        return wrappedTasks;
    }
    catch (Exception ex)
    {
        // Trace and let the caller decide how to handle the failure.
        traceType.WriteWarning("Unable to get repair task list. Errors: {0}", ex.GetMessage());
        throw;
    }
}
/// <summary>
/// Gets the upgrade domain from current repair tasks under processing, ideally all the repair tasks under processing should've the same upgrade domain.
/// However if repair tasks belonging to multiple UpgradeDomains are found, then we consider the UD of first repair task in the list of repair tasks.
/// </summary>
/// <param name="nodeList">List of Nodes currently in cluster, used to get nodename and upgradedomain mapping</param>
/// <param name="cancellationToken">Token forwarded to the query for repair tasks under processing.</param>
/// <returns>
/// Upgrade domain of the first repair task (with a non-empty upgrade domain) among the repair tasks
/// under processing; null when no repair task is under processing.
/// </returns>
private async Task<string> GetCurrentUpgradeDomainUnderProcessing(NodeList nodeList, CancellationToken cancellationToken)
{
    string currentUpgradeDomain = null;

    RepairTaskList processingTaskList = await this.GetRepairTasksUnderProcessing(cancellationToken);
    ServiceEventSource.Current.VerboseMessage(String.Format("{0} repair tasks were found under processing", processingTaskList.Count));

    // All the tasks under processing should ideally be from the same UpgradeDomain.
    // However in case the cluster topology has changed or repair tasks were manually created from some other entity resulting in multiple repair tasks with target nodes belonging to different UD's.
    // In that case we'll consider the upgrade domain of the first repair task among the list of repair tasks which are already under processing.
    foreach (var task in processingTaskList)
    {
        string udName = this.GetUpgradeDomainOfRepairTask(task, nodeList);

        if (string.IsNullOrEmpty(currentUpgradeDomain))
        {
            currentUpgradeDomain = udName;
        }
        else if (currentUpgradeDomain != udName)
        {
            // {1} is the upgrade domain this task belongs to (udName), {2} is the one we expected
            // (currentUpgradeDomain). The original passed these two arguments in swapped order.
            ServiceEventSource.Current.ErrorMessage(
                String.Format(
                    "Found repair task {0} under processing belonging to UpgradeDomains {1}, Expected only repair tasks from {2}. cluster topology might've changed",
                    task.TaskId,
                    udName,
                    currentUpgradeDomain));
        }
    }

    return currentUpgradeDomain;
}
/// <summary>
/// Injects a health error on a healthy node, approves the resulting repair task, clears the
/// error and then checks that the task has been flagged with CancelRequested.
/// </summary>
public void Onebox_Successful_SelfRepairAfterTaskIsApproved()
{
    // Matches this test's name. The original value was copied from the "...AfterTaskIsCreated"
    // test, so the log line and the injected health property identified the wrong test.
    const string property = "Onebox_Successful_SelfRepairAfterTaskIsApproved";

    MockRepairExecutor executor = Executors.First();
    LogHelper.Log("Starting test {0}", property);

    Node node = GetHealthyNode(_fabricClient);

    // Inject error
    MockWatchdog watchdog = new MockWatchdog(_fabricClient.HealthManager, node.NodeName);
    watchdog.ReportError(property);

    RepairTask createdTask = PollingForCreatedTask(executor, node);
    Assert.IsNotNull(createdTask, "There must be a repair task created.");

    executor.Approve(ref createdTask);

    ClearHealthError(watchdog, property, executor);

    // Make sure the task is cancelled: wait for the configured repair manager latency before querying.
    Thread.Sleep(_repairManagerLatency);

    RepairTaskList repairTaskList = _fabricClient.RepairManager.GetRepairTaskListAsync(createdTask.TaskId, RepairTaskStateFilter.All, null).Result;

    // Exactly one task must match; this replaces the redundant "Any() && Count == 1" check.
    Assert.AreEqual(1, repairTaskList.Count, "There must be a unique repair task with task id = {0}", createdTask.TaskId);

    RepairTask cancelledTask = repairTaskList.First();
    Assert.IsTrue(cancelledTask.Flags == RepairTaskFlags.CancelRequested, "PE should request cancellation of the task");
}
/// <summary>
/// Returns the repair tasks (filtered by TaskIdPrefix and ExecutorName) that are currently being worked on.
/// At any point of time there will be only one UD which will have POS repair tasks in these states.
/// </summary>
/// <param name="cancellationToken">Token forwarded to the repair manager query.</param>
/// <returns>List of repair tasks in Preparing, Approved, Executing or Restoring state</returns>
internal async Task<RepairTaskList> GetRepairTasksUnderProcessing(CancellationToken cancellationToken)
{
    // States that count as "under processing".
    RepairTaskStateFilter underProcessingStates =
        RepairTaskStateFilter.Preparing
        | RepairTaskStateFilter.Approved
        | RepairTaskStateFilter.Executing
        | RepairTaskStateFilter.Restoring;

    return await this.fabricClient.RepairManager.GetRepairTaskListAsync(
        TaskIdPrefix,
        underProcessingStates,
        ExecutorName,
        this.DefaultTimeoutForOperation,
        cancellationToken);
}
/// <summary>
/// Debugging aid: fetches every repair task matching TaskIdPrefix and ExecutorName (all states)
/// and writes each one to the service event source.
/// </summary>
/// <param name="cancellationToken">Token forwarded to the repair manager query.</param>
/// <returns>Task for the asynchronous operation</returns>
internal async Task PrintRepairTasks(CancellationToken cancellationToken)
{
    RepairTaskList allTasks = await this.fabricClient.RepairManager.GetRepairTaskListAsync(
        TaskIdPrefix,
        RepairTaskStateFilter.All,
        ExecutorName,
        this.DefaultTimeoutForOperation,
        cancellationToken);

    ServiceEventSource.Current.VerboseMessage("Total {0} repair tasks were found for POS", allTasks.Count);

    foreach (var current in allTasks)
    {
        string stateText = current.State.ToString();
        string targetText = current.Target.ToString();

        ServiceEventSource.Current.PrintRepairTasks(
            current.TaskId,
            stateText,
            current.Action,
            current.Executor,
            current.Description,
            current.ExecutorData,
            targetText);
    }
}
/// <summary>
/// Posts the cluster patching status as an Ok health report listing the targets of the repair
/// tasks under processing and the targets of the claimed (waiting) repair tasks.
/// </summary>
/// <param name="cancellationToken">Token forwarded to the fabric queries and checked between them.</param>
/// <returns>Task for the asynchronous operation</returns>
private async Task PostRMTaskNodeUpdate(CancellationToken cancellationToken)
{
    NodeList clusterNodes = await this.fabricClient.QueryManager.GetNodeListAsync(null, null, this.DefaultTimeoutForOperation, cancellationToken);

    // Sets de-duplicate targets that appear in more than one repair task.
    var activeTargets = new HashSet<string>();
    var waitingTargets = new HashSet<string>();

    IList<RepairTask> claimedTasks = await this.GetClaimedRepairTasks(clusterNodes, cancellationToken);
    foreach (var claimed in claimedTasks)
    {
        waitingTargets.Add(claimed.Target.ToString());
    }

    cancellationToken.ThrowIfCancellationRequested();

    RepairTaskList activeTasks = await this.GetRepairTasksUnderProcessing(cancellationToken);
    foreach (var active in activeTasks)
    {
        activeTargets.Add(active.Target.ToString());
    }

    // An empty joined list is reported as "None".
    string waitingText = string.Join(", ", waitingTargets);
    waitingText = String.IsNullOrEmpty(waitingText) ? "None" : waitingText;

    string activeText = string.Join(", ", activeTargets);
    activeText = String.IsNullOrEmpty(activeText) ? "None" : activeText;

    string description = string.Format(
        " Node currently being patched: {0} \nNodes waiting to be patched: {1}",
        activeText,
        waitingText);

    HealthManagerHelper.PostNodeHealthReport(fabricClient, this.context.ServiceName, ClusterPatchingStatusProperty, description, HealthState.Ok);
}
/// <summary>
/// Fetches all the repair tasks which are under execution and checks
/// if any of them has exceeded the pre-specified execution timeout limit.
/// </summary>
/// <remarks>
/// Does nothing when ManageRepairTasksOnTimeout is false. For each timed-out task:
/// the task is cancelled if its node is no longer returned by the node query;
/// a Warning health report is posted if the executor sub-state is RestartRequested,
/// RestartCompleted or InstallationCompleted; otherwise the task is moved to Restoring.
/// </remarks>
/// <param name="cancellationToken">Token forwarded to the fabric queries and to the task state update.</param>
/// <returns>Task for the asynchronous operation</returns>
internal async Task TimeoutRepairTasks(CancellationToken cancellationToken)
{
    if (!this.ManageRepairTasksOnTimeout)
    {
        return;
    }

    // Get repair tasks which have been approved and are still under execution by POA
    RepairTaskList repairTasks = await this.fabricClient.RepairManager.GetRepairTaskListAsync(
        TaskIdPrefix,
        RepairTaskStateFilter.Approved | RepairTaskStateFilter.Executing,
        ExecutorName,
        this.DefaultTimeoutForOperation,
        cancellationToken);

    foreach (var task in repairTasks)
    {
        ExecutorDataForRmTask executorData = SerializationUtility.Deserialize<ExecutorDataForRmTask>(task.ExecutorData);
        Debug.Assert(task.ApprovedTimestamp != null, "ApprovedTimestamp of an approved repair task can never be null");

        // Time since approval, compared against the executor's own timeout plus the grace time.
        TimeSpan elapsedTime = DateTime.UtcNow.Subtract(task.ApprovedTimestamp.Value);
        if (elapsedTime <= (TimeSpan.FromMinutes(executorData.ExecutorTimeoutInMinutes) + GraceTimeForNtService))
        {
            continue;
        }

        // Check whether the target node is still returned by the node query.
        bool nodeExists = false;
        string nodeName = this.GetNodeNameFromRepairTask(task);
        NodeList nodeList = await this.fabricClient.QueryManager.GetNodeListAsync(nodeName, null, this.DefaultTimeoutForOperation, cancellationToken);
        foreach (var node in nodeList)
        {
            if (node.NodeName.Equals(nodeName))
            {
                nodeExists = true;
                break;
            }
        }

        if (!nodeExists)
        {
            // If node does not exist now, there is no point in waiting on the task.
            ServiceEventSource.Current.VerboseMessage("Cancelling repair task {0} which is in {1} state as the node {2} does not exist anymore.", task.TaskId, task.State, nodeName);
            await this.CancelRepairTask(task);
            continue;
        }

        switch (executorData.ExecutorSubState)
        {
            // These are special states where its best if NodeAgentNtService should move the repair task, just post warning in this case
            case NodeAgentSfUtilityExitCodes.RestartRequested:
            case NodeAgentSfUtilityExitCodes.RestartCompleted:
            case NodeAgentSfUtilityExitCodes.InstallationCompleted:
            {
                string healthproperty = string.Format(NodeTimeoutStatusFormat, nodeName);

                // The two sentences are joined with ". "; the original concatenation had no
                // separator and produced "...to completeIn case problem persists...".
                string healthDescription = string.Format(
                    "Installation timeout {0} minutes alloted to repair task {1}, node {2} is over, however since node is in post-installation phase, wait for few more minutes for operation to complete. "
                    + "In case problem persists, please check if recent installations of updates has caused any problem on the node",
                    executorData.ExecutorTimeoutInMinutes,
                    task.TaskId,
                    nodeName);

                ServiceEventSource.Current.ErrorMessage("Title = {0}, Description = {1}", healthproperty, healthDescription);
                HealthManagerHelper.PostNodeHealthReport(this.fabricClient, this.context.ServiceName, healthproperty, healthDescription, HealthState.Warning, 60);
                break;
            }

            default:
            {
                await UpdateRepairTaskState(task, nodeName, RepairTaskState.Restoring, executorData.ExecutorTimeoutInMinutes, cancellationToken);
                break;
            }
        }
    }
}
/// <summary>
/// Post the cluster patching status as events on CoordinatorService.
/// </summary>
/// <remarks>
/// Outcomes, by the state of the claimed and processing task lists:
/// processing tasks exist: the counter is reset and the per-node update is posted;
/// neither list has tasks: the counter is reset and an Ok "nothing to do" report is posted;
/// only claimed tasks exist: a warning is posted immediately if the cluster's aggregated health
/// is Error, otherwise postUpdateCount is incremented and a warning is posted once it exceeds 60.
/// Every exception (including cancellation) is logged and swallowed, so this method does not throw.
/// </remarks>
/// <param name="cancellationToken">Token forwarded to the fabric queries and checked after them.</param>
/// <returns>Task for the asynchronous operation</returns>
public async Task PostClusterPatchingStatus(CancellationToken cancellationToken)
{
    try
    {
        NodeList nodeList = await this.fabricClient.QueryManager.GetNodeListAsync(null, null, this.DefaultTimeoutForOperation, cancellationToken);
        IList<RepairTask> claimedTaskList = await this.GetClaimedRepairTasks(nodeList, cancellationToken);
        RepairTaskList processingTaskList = await this.GetRepairTasksUnderProcessing(cancellationToken);
        cancellationToken.ThrowIfCancellationRequested();

        bool hasClaimedTasks = claimedTaskList.Any();
        bool hasProcessingTasks = processingTaskList.Any();

        if (hasProcessingTasks)
        {
            // Patching is progressing: reset the count and report the nodes involved.
            postUpdateCount = 0;
            await PostRMTaskNodeUpdate(cancellationToken);
            return;
        }

        if (!hasClaimedTasks)
        {
            // Post the health event saying that there is no repair task and things are working fine.
            postUpdateCount = 0;
            string description = "No claimed tasks and no processing tasks are found.";
            HealthManagerHelper.PostNodeHealthReport(this.fabricClient, this.context.ServiceName, ClusterPatchingStatusProperty, description, HealthState.Ok, -1);
            return;
        }

        // Claimed tasks exist but nothing is processing: repair tasks are not getting approved.
        // Uses the timeout/cancellation overload, consistent with the other fabric calls in this method.
        ClusterHealth clusterHealth = await this.fabricClient.HealthManager.GetClusterHealthAsync(this.DefaultTimeoutForOperation, cancellationToken);

        if (clusterHealth.AggregatedHealthState == HealthState.Error)
        {
            // Reset Count
            postUpdateCount = 0;
            string warningDescription = " Cluster is currently unhealthy. Nodes are currently not getting patched by Patch Orchestration Application. Please ensure the cluster becomes healthy for patching to continue.";
            await PostWarningOnCoordinatorService(warningDescription, 1);
        }
        else
        {
            postUpdateCount++;
            if (postUpdateCount > 60)
            {
                // Reset the count and post a warning: the reason is unknown, but POA is not approving tasks.
                postUpdateCount = 0;
                string warningDescription = "Patch Orchestration Application is currently not patching nodes. This could be possible if there is some node which is stuck in disabling state for long time.";
                await PostWarningOnCoordinatorService(warningDescription, 61);
            }
        }
    }
    catch (Exception ex)
    {
        // Best-effort status reporting: log and continue.
        ServiceEventSource.Current.ErrorMessage("PostClusterPatchingStatus failed with exception {0}", ex.ToString());
    }
}
/// <summary>
/// Fetches all the repair tasks which are under execution and checks
/// if any of them has exceeded the pre-specified execution timeout limit.
/// </summary>
/// <remarks>
/// Does nothing when ManageRepairTasksOnTimeout is false. For each timed-out task:
/// the task is cancelled if its node is no longer returned by the node query;
/// otherwise the task is moved to Restoring, with a log message that depends on the executor sub-state.
/// </remarks>
/// <param name="cancellationToken">Token forwarded to the fabric queries and to the task state update.</param>
/// <returns>Task for the asynchronous operation</returns>
internal async Task TimeoutRepairTasks(CancellationToken cancellationToken)
{
    if (!this.ManageRepairTasksOnTimeout)
    {
        return;
    }

    // Get repair tasks which have been approved and are still under execution by POA
    RepairTaskList repairTasks = await this.fabricClient.RepairManager.GetRepairTaskListAsync(
        TaskIdPrefix,
        RepairTaskStateFilter.Approved | RepairTaskStateFilter.Executing,
        ExecutorName,
        this.DefaultTimeoutForOperation,
        cancellationToken);

    foreach (var task in repairTasks)
    {
        ExecutorDataForRmTask executorData = SerializationUtility.Deserialize<ExecutorDataForRmTask>(task.ExecutorData);
        Debug.Assert(task.ApprovedTimestamp != null, "ApprovedTimestamp of an approved repair task can never be null");

        // Time since approval, compared against the executor's own timeout plus the grace time.
        TimeSpan elapsedTime = DateTime.UtcNow.Subtract(task.ApprovedTimestamp.Value);
        if (elapsedTime <= (TimeSpan.FromMinutes(executorData.ExecutorTimeoutInMinutes) + GraceTimeForNtService))
        {
            continue;
        }

        // Check whether the target node is still returned by the node query.
        bool nodeExists = false;
        string nodeName = this.GetNodeNameFromRepairTask(task);
        NodeList nodeList = await this.fabricClient.QueryManager.GetNodeListAsync(nodeName, null, this.DefaultTimeoutForOperation, cancellationToken);
        foreach (var node in nodeList)
        {
            if (node.NodeName.Equals(nodeName))
            {
                nodeExists = true;
                break;
            }
        }

        if (!nodeExists)
        {
            // If node does not exist now, there is no point in waiting on the task.
            ServiceEventSource.Current.VerboseMessage("Cancelling repair task {0} which is in {1} state as the node {2} does not exist anymore.", task.TaskId, task.State, nodeName);
            await this.CancelRepairTask(task);
            continue;
        }

        switch (executorData.ExecutorSubState)
        {
            // Installation had already started on the node; both branches move the task to Restoring, only the log differs.
            case NodeAgentSfUtilityExitCodes.RestartRequested:
            case NodeAgentSfUtilityExitCodes.RestartCompleted:
            case NodeAgentSfUtilityExitCodes.InstallationCompleted:
            {
                string message = string.Format(
                    "Repair Task {0} did not complete within the Timeout period for node {1}. Since Installation was already started, updating Repair Task state to further proceed with Node enabling",
                    task.TaskId,
                    nodeName);
                ServiceEventSource.Current.InfoMessage(message);
                await UpdateRepairTaskState(task, nodeName, RepairTaskState.Restoring, executorData.ExecutorTimeoutInMinutes, cancellationToken);
                break;
            }

            default:
            {
                // This branch is only reached after the timeout has elapsed, so the message says
                // "did not complete" (the original wrongly said "completed within the Timeout period").
                string message = string.Format(
                    "Repair Task {0} did not complete within the Timeout period for node {1}. Updating Repair Task state to further proceed with Node enabling",
                    task.TaskId,
                    nodeName);
                ServiceEventSource.Current.InfoMessage(message);
                await UpdateRepairTaskState(task, nodeName, RepairTaskState.Restoring, executorData.ExecutorTimeoutInMinutes, cancellationToken);
                break;
            }
        }
    }
}
/// <summary>
/// Calls EndGetRepairTaskList on the native repair client for the given async operation
/// context and converts the native result via RepairTaskList.CreateFromNativeListResult.
/// </summary>
/// <param name="context">Native async operation context passed to EndGetRepairTaskList.</param>
/// <returns>The repair task list created from the native result.</returns>
private RepairTaskList GetRepairTaskListAsyncEndWrapper(NativeCommon.IFabricAsyncOperationContext context)
{
    var nativeListResult = this.nativeRepairClient.EndGetRepairTaskList(context);
    return RepairTaskList.CreateFromNativeListResult(nativeListResult);
}