/// <summary>
/// Prepares the claimed repair tasks belonging to POS as per the TaskApprovalPolicy
/// </summary>
/// <param name="cancellationToken">Token forwarded to the node-list and repair-task queries issued by this method.</param>
/// <returns>Task for the asynchronous operation</returns>
/// <exception cref="InvalidOperationException">Thrown when RmPolicy is neither NodeWise nor UpgradeDomainWise.</exception>
internal async Task PrepareRepairTasks(CancellationToken cancellationToken)
{
    NodeList nodeList = await this.fabricClient.QueryManager.GetNodeListAsync(null, null, this.DefaultTimeoutForOperation, cancellationToken);
    IList<RepairTask> claimedTaskList = await this.GetClaimedRepairTasks(nodeList, cancellationToken);

    switch (RmPolicy)
    {
        case TaskApprovalPolicy.NodeWise:
            {
                RepairTaskList processingTaskList = await this.GetRepairTasksUnderProcessing(cancellationToken);

                // Node-wise policy: start a new task only when nothing is currently being processed,
                // and then only the single oldest claimed task.
                if (!processingTaskList.Any() && claimedTaskList.Any())
                {
                    RepairTask oldestClaimedTask = claimedTaskList.Aggregate(
                        (curMin, task) => (task.CreatedTimestamp < curMin.CreatedTimestamp ? task : curMin));

                    // Three arguments need three distinct placeholders; the previous format string
                    // reused {0}, so the count was logged as the task id and the target was dropped.
                    ServiceEventSource.Current.VerboseMessage(
                        "Out of {0} claimed tasks, Oldest repair task = {1} with node = {2} will be prepared",
                        claimedTaskList.Count,
                        oldestClaimedTask.TaskId,
                        oldestClaimedTask.Target);

                    this.StartPreparingRepairTask(oldestClaimedTask);
                }

                break;
            }

        case TaskApprovalPolicy.UpgradeDomainWise:
            {
                string currentUpgradeDomain = await this.GetCurrentUpgradeDomainUnderProcessing(nodeList, cancellationToken);

                ServiceEventSource.Current.VerboseMessage(String.Format("{0} repair tasks were found in claimed state", claimedTaskList.Count));

                // Below line can be enabled for debugging
                // rmHelper.PrintRepairTasks(claimedTaskList);
                foreach (var claimedTask in claimedTaskList)
                {
                    string udName = this.GetUpgradeDomainOfRepairTask(claimedTask, nodeList);

                    // When no upgrade domain is under processing, the first claimed task's
                    // upgrade domain becomes the current one for the rest of this pass.
                    if (string.IsNullOrEmpty(currentUpgradeDomain))
                    {
                        currentUpgradeDomain = udName;
                    }

                    // Only tasks in the current upgrade domain are started.
                    if (udName == currentUpgradeDomain)
                    {
                        this.StartPreparingRepairTask(claimedTask);
                    }
                }

                break;
            }

        default:
            {
                string errorMessage = String.Format("Illegal RmPolicy found: {0}", RmPolicy);
                ServiceEventSource.Current.ErrorMessage(errorMessage);
                throw new InvalidOperationException(errorMessage);
            }
    }
}
public void Onebox_Successful_SelfRepairAfterTaskIsApproved()
{
    // Scenario: a health error is reported on a node, the resulting repair task is approved,
    // the health error is then cleared, and the test expects cancellation to be requested on the task.

    // Used both as the log label and as the health property name. It must identify THIS test;
    // it previously carried the "...AfterTaskIsCreated" name copied from a sibling test.
    const string property = "Onebox_Successful_SelfRepairAfterTaskIsApproved";
    MockRepairExecutor executor = Executors.First();
    LogHelper.Log("Starting test {0}", property);

    Node node = GetHealthyNode(_fabricClient);

    // Inject error
    MockWatchdog watchdog = new MockWatchdog(_fabricClient.HealthManager, node.NodeName);
    watchdog.ReportError(property);

    RepairTask createdTask = PollingForCreatedTask(executor, node);
    Assert.IsNotNull(createdTask, "There must be a repair task created.");

    // Approve first, then clear the error, so the cancellation path is exercised on an approved task.
    executor.Approve(ref createdTask);
    ClearHealthError(watchdog, property, executor);

    // Make sure the task is cancelled
    // Wait for the configured repair manager latency before querying the task state.
    Thread.Sleep(_repairManagerLatency);
    RepairTaskList repairTaskList = _fabricClient.RepairManager.GetRepairTaskListAsync(createdTask.TaskId, RepairTaskStateFilter.All, null).Result;

    // Count == 1 already implies the list is non-empty.
    Assert.IsTrue(repairTaskList.Count == 1, "There must be a unique repair task with task id = {0}", createdTask.TaskId);

    RepairTask cancelledTask = repairTaskList.First();
    Assert.IsTrue(cancelledTask.Flags == RepairTaskFlags.CancelRequested, "PE should request cancellation of the task");
}
/// <summary>
/// Post the cluster patching status as events on CoordinatorService
/// </summary>
/// <param name="cancellationToken">Token forwarded to the fabric queries and checked before the status is evaluated.</param>
/// <returns>
/// Task for the asynchronous operation. Any exception raised inside the method, including
/// cancellation, is logged through ServiceEventSource and not propagated to the caller.
/// </returns>
public async Task PostClusterPatchingStatus(CancellationToken cancellationToken)
{
    try
    {
        NodeList nodeList = await this.fabricClient.QueryManager.GetNodeListAsync(null, null, this.DefaultTimeoutForOperation, cancellationToken);
        IList<RepairTask> claimedTaskList = await this.GetClaimedRepairTasks(nodeList, cancellationToken);
        RepairTaskList processingTaskList = await this.GetRepairTasksUnderProcessing(cancellationToken);
        cancellationToken.ThrowIfCancellationRequested();

        if (processingTaskList.Any())
        {
            // Tasks are being processed (with or without further claimed tasks waiting):
            // reset the stall counter and post the per-node update.
            postUpdateCount = 0;
            await PostRMTaskNodeUpdate(cancellationToken);
        }
        else if (!claimedTaskList.Any())
        {
            // Reset Count
            postUpdateCount = 0;

            // Post the health event saying that there is no repair task and things are working fine.
            string description = "No claimed tasks and no processing tasks are found.";
            HealthManagerHelper.PostNodeHealthReport(this.fabricClient, this.context.ServiceName, ClusterPatchingStatusProperty, description, HealthState.Ok, -1);
        }
        else
        {
            // Claimed tasks exist but nothing is being processed.
            // This means that repair tasks are not getting approved.
            // Use the same timeout and cancellation token as the other fabric calls in this method,
            // so this query cannot outlive a cancelled or timed-out operation.
            ClusterHealth clusterHealth = await this.fabricClient.HealthManager.GetClusterHealthAsync(this.DefaultTimeoutForOperation, cancellationToken);
            if (clusterHealth.AggregatedHealthState == HealthState.Error)
            {
                // Reset Count
                postUpdateCount = 0;
                string warningDescription = " Cluster is currently unhealthy. Nodes are currently not getting patched by Patch Orchestration Application. Please ensure the cluster becomes healthy for patching to continue.";

                // NOTE(review): the meaning of the second argument (1 here, 61 below) is defined by
                // PostWarningOnCoordinatorService, which is not visible in this block.
                await PostWarningOnCoordinatorService(warningDescription, 1);
            }
            else
            {
                // The cluster is not in Error state, so count consecutive stalled invocations
                // and only warn after more than 60 of them.
                postUpdateCount++;
                if (postUpdateCount > 60)
                {
                    // Reset Count and post a warning on the service saying we don't know the reason, but POA is not approving tasks.
                    postUpdateCount = 0;
                    string warningDescription = "Patch Orchestration Application is currently not patching nodes. This could be possible if there is some node which is stuck in disabling state for long time.";
                    await PostWarningOnCoordinatorService(warningDescription, 61);
                }
            }
        }
    }
    catch (Exception ex)
    {
        // Status posting is best-effort: log the failure and let the caller continue.
        ServiceEventSource.Current.ErrorMessage("PostClusterPatchingStatus failed with exception {0}", ex.ToString());
    }
}