Exemple #1
0
        /// <summary>
        /// Prepares the claimed repair tasks belonging to POS as per the TaskApprovalPolicy
        /// </summary>
        /// <remarks>
        /// NodeWise: when no repair task is under processing, only the claimed task with the
        /// earliest CreatedTimestamp is moved to preparing.
        /// UpgradeDomainWise: every claimed task whose upgrade domain matches the upgrade domain
        /// currently under processing is moved to preparing; when no upgrade domain is under
        /// processing, the upgrade domain of the first claimed task is adopted.
        /// </remarks>
        /// <param name="cancellationToken">Token forwarded to the node list and repair task queries</param>
        /// <returns>Task for the asynchronous operation</returns>
        /// <exception cref="InvalidOperationException">Thrown when RmPolicy is neither NodeWise nor UpgradeDomainWise</exception>
        internal async Task PrepareRepairTasks(CancellationToken cancellationToken)
        {
            NodeList nodeList = await this.fabricClient.QueryManager.GetNodeListAsync(null, null, this.DefaultTimeoutForOperation, cancellationToken);

            IList<RepairTask> claimedTaskList = await this.GetClaimedRepairTasks(nodeList, cancellationToken);

            switch (RmPolicy)
            {
                case TaskApprovalPolicy.NodeWise:
                {
                    RepairTaskList processingTaskList = await this.GetRepairTasksUnderProcessing(cancellationToken);

                    // Only start a new task when nothing else is being processed.
                    if (!processingTaskList.Any() && claimedTaskList.Any())
                    {
                        // Fold over the list keeping the task with the smallest CreatedTimestamp.
                        RepairTask oldestClaimedTask = claimedTaskList.Aggregate(
                            (curMin, task) => (task.CreatedTimestamp < curMin.CreatedTimestamp ? task : curMin));

                        // Each argument needs its own placeholder index: count, task id, target.
                        ServiceEventSource.Current.VerboseMessage(
                            "Out of {0} claimed tasks, Oldest repair task = {1} with node = {2} will be prepared",
                            claimedTaskList.Count, oldestClaimedTask.TaskId, oldestClaimedTask.Target);
                        this.StartPreparingRepairTask(oldestClaimedTask);
                    }

                    break;
                }

                case TaskApprovalPolicy.UpgradeDomainWise:
                {
                    string currentUpgradeDomain = await this.GetCurrentUpgradeDomainUnderProcessing(nodeList, cancellationToken);

                    ServiceEventSource.Current.VerboseMessage(String.Format("{0} repair tasks were found in claimed state", claimedTaskList.Count));
                    // Below line can be enabled for debugging
                    // rmHelper.PrintRepairTasks(claimedTaskList);

                    foreach (var claimedTask in claimedTaskList)
                    {
                        string udName = this.GetUpgradeDomainOfRepairTask(claimedTask, nodeList);

                        // No upgrade domain is being processed yet: adopt the one of the
                        // first claimed task encountered.
                        if (string.IsNullOrEmpty(currentUpgradeDomain))
                        {
                            currentUpgradeDomain = udName;
                        }

                        if (udName == currentUpgradeDomain)
                        {
                            this.StartPreparingRepairTask(claimedTask);
                        }
                    }

                    break;
                }

                default:
                {
                    string errorMessage = String.Format("Illegal RmPolicy found: {0}", RmPolicy);
                    ServiceEventSource.Current.ErrorMessage(errorMessage);
                    throw new InvalidOperationException(errorMessage);
                }
            }
        }
Exemple #2
0
        /// <summary>
        /// Reports a health error on a healthy node, approves the repair task created for it,
        /// clears the error, and then checks that the approved task is flagged for cancellation.
        /// </summary>
        public void Onebox_Successful_SelfRepairAfterTaskIsApproved()
        {
            // Health property is named after this test so that its reports and log lines
            // are attributed to the right scenario.
            const string property = "Onebox_Successful_SelfRepairAfterTaskIsApproved";

            MockRepairExecutor executor = Executors.First();

            LogHelper.Log("Starting test {0}", property);

            Node node = GetHealthyNode(_fabricClient);

            // Inject error
            MockWatchdog watchdog = new MockWatchdog(_fabricClient.HealthManager, node.NodeName);
            watchdog.ReportError(property);

            RepairTask createdTask = PollingForCreatedTask(executor, node);
            Assert.IsNotNull(createdTask, "There must be a repair task created.");

            executor.Approve(ref createdTask);

            ClearHealthError(watchdog, property, executor);

            // Make sure the task is cancelled
            Thread.Sleep(_repairManagerLatency);
            RepairTaskList repairTaskList = _fabricClient.RepairManager.GetRepairTaskListAsync(createdTask.TaskId, RepairTaskStateFilter.All, null).Result;

            Assert.IsTrue(repairTaskList.Count == 1, "There must be a unique repair task with task id = {0}", createdTask.TaskId);

            RepairTask cancelledTask = repairTaskList.First();

            Assert.IsTrue(cancelledTask.Flags == RepairTaskFlags.CancelRequested, "PE should request cancellation of the task");
        }
Exemple #3
0
        /// <summary>
        /// Post the cluster patching status as events on CoordinatorService
        /// </summary>
        /// <remarks>
        /// postUpdateCount counts consecutive calls in which claimed tasks exist, no task is
        /// under processing and the cluster is not in Error state; it is reset on every other path.
        /// Any exception raised inside the method, including cancellation, is logged and not rethrown.
        /// </remarks>
        /// <param name="cancellationToken">Token forwarded to the fabric queries and checked before the status is evaluated</param>
        /// <returns>Task for the asynchronous operation</returns>
        public async Task PostClusterPatchingStatus(CancellationToken cancellationToken)
        {
            try
            {
                NodeList nodeList = await this.fabricClient.QueryManager.GetNodeListAsync(null, null, this.DefaultTimeoutForOperation, cancellationToken);

                IList<RepairTask> claimedTaskList = await this.GetClaimedRepairTasks(nodeList, cancellationToken);

                RepairTaskList processingTaskList = await this.GetRepairTasksUnderProcessing(cancellationToken);

                cancellationToken.ThrowIfCancellationRequested();
                if (claimedTaskList.Any())
                {
                    if (!processingTaskList.Any())
                    {
                        // This means that repair tasks are not getting approved.
                        // Use the same timeout and cancellation token as the other fabric queries so
                        // the health query cannot outlive a cancelled operation.
                        ClusterHealth clusterHealth = await this.fabricClient.HealthManager.GetClusterHealthAsync(this.DefaultTimeoutForOperation, cancellationToken);

                        if (clusterHealth.AggregatedHealthState == HealthState.Error)
                        {
                            // Reset Count
                            postUpdateCount = 0;
                            string warningDescription = " Cluster is currently unhealthy. Nodes are currently not getting patched by Patch Orchestration Application. Please ensure the cluster becomes healthy for patching to continue.";
                            await PostWarningOnCoordinatorService(warningDescription, 1);
                        }
                        else
                        {
                            postUpdateCount++;
                            if (postUpdateCount > 60)
                            {
                                // Reset the count and post a warning on the service: the reason is unknown,
                                // but POA is not approving tasks.
                                postUpdateCount = 0;
                                string warningDescription = "Patch Orchestration Application is currently not patching nodes. This could be possible if there is some node which is stuck in disabling state for long time.";
                                await PostWarningOnCoordinatorService(warningDescription, 61);
                            }
                        }
                    }
                    else
                    {
                        // Reset Count
                        postUpdateCount = 0;
                        await PostRMTaskNodeUpdate(cancellationToken);
                    }
                }
                else
                {
                    // Reset Count
                    postUpdateCount = 0;
                    if (processingTaskList.Any())
                    {
                        await PostRMTaskNodeUpdate(cancellationToken);
                    }
                    else
                    {
                        // Post the health event saying that there is no repair task and things are working fine.
                        string description = "No claimed tasks and no processing tasks are found.";
                        HealthManagerHelper.PostNodeHealthReport(this.fabricClient, this.context.ServiceName, ClusterPatchingStatusProperty, description, HealthState.Ok, -1);
                    }
                }
            }
            catch (Exception ex)
            {
                // Best effort: status reporting failures are logged and never propagated to the caller.
                ServiceEventSource.Current.ErrorMessage("PostClusterPatchingStatus failed with exception {0}", ex.ToString());
            }
        }