Example #1
0
        /// <summary>
        /// Prepares the claimed repair tasks belonging to POS as per the TaskApprovalPolicy.
        /// </summary>
        /// <remarks>
        /// NodeWise: when no repair task is under processing, only the claimed task with the smallest
        /// CreatedTimestamp is prepared.
        /// UpgradeDomainWise: every claimed task whose upgrade domain equals the upgrade domain under
        /// processing is prepared; while no upgrade domain is established, it is taken from the claimed
        /// task being examined.
        /// </remarks>
        /// <param name="cancellationToken">Token passed to the node-list and repair-task queries</param>
        /// <returns>Task for the asynchronous operation</returns>
        /// <exception cref="InvalidOperationException">RmPolicy is neither NodeWise nor UpgradeDomainWise</exception>
        internal async Task PrepareRepairTasks(CancellationToken cancellationToken)
        {
            NodeList nodeList = await this.fabricClient.QueryManager.GetNodeListAsync(null, null, this.DefaultTimeoutForOperation, cancellationToken);

            IList <RepairTask> claimedTaskList = await this.GetClaimedRepairTasks(nodeList, cancellationToken);

            switch (RmPolicy)
            {
            case TaskApprovalPolicy.NodeWise:
            {
                RepairTaskList processingTaskList = await this.GetRepairTasksUnderProcessing(cancellationToken);

                // Start a new task only when nothing is under processing and at least one task is waiting
                if (!processingTaskList.Any() && claimedTaskList.Any())
                {
                    RepairTask oldestClaimedTask = claimedTaskList.Aggregate(
                        (curMin, task) => (task.CreatedTimestamp < curMin.CreatedTimestamp ? task : curMin));

                    // Three arguments need three distinct placeholders; the count, task id and target are all logged
                    ServiceEventSource.Current.VerboseMessage(
                        "Out of {0} claimed tasks, Oldest repair task = {1} with node = {2} will be prepared",
                        claimedTaskList.Count, oldestClaimedTask.TaskId, oldestClaimedTask.Target);
                    this.StartPreparingRepairTask(oldestClaimedTask);
                }
                break;
            }

            case TaskApprovalPolicy.UpgradeDomainWise:
            {
                string currentUpgradeDomain = await this.GetCurrentUpgradeDomainUnderProcessing(nodeList, cancellationToken);

                ServiceEventSource.Current.VerboseMessage(String.Format("{0} repair tasks were found in claimed state", claimedTaskList.Count));
                // Below line can be enabled for debugging
                // rmHelper.PrintRepairTasks(claimedTaskList);

                foreach (var claimedTask in claimedTaskList)
                {
                    string udName = this.GetUpgradeDomainOfRepairTask(claimedTask, nodeList);

                    // No upgrade domain established yet: adopt the one of the task being examined
                    if (string.IsNullOrEmpty(currentUpgradeDomain))
                    {
                        currentUpgradeDomain = udName;
                    }

                    if (udName == currentUpgradeDomain)
                    {
                        this.StartPreparingRepairTask(claimedTask);
                    }
                }
                break;
            }

            default:
            {
                string errorMessage = String.Format("Illegal RmPolicy found: {0}", RmPolicy);
                ServiceEventSource.Current.ErrorMessage(errorMessage);
                throw new InvalidOperationException(errorMessage);
            }
            }
        }
        /// <summary>
        /// Queries the repair manager for the repair tasks matching the given filters and wraps
        /// each returned task in a <c>ServiceFabricRepairTask</c>.
        /// </summary>
        /// <param name="activityId">Activity identifier; not read by the current implementation (activity logging is disabled)</param>
        /// <param name="taskIdFilter">Task id filter forwarded to the repair manager</param>
        /// <param name="stateFilter">State filter forwarded to the repair manager</param>
        /// <param name="executorFilter">Executor filter forwarded to the repair manager</param>
        /// <returns>The wrapped repair tasks, in the order they were returned</returns>
        public async Task <IList <IRepairTask> > GetRepairTaskListAsync(
            Guid activityId,
            string taskIdFilter = null,
            RepairTaskStateFilter stateFilter = RepairTaskStateFilter.Default,
            string executorFilter             = null)
        {
            try
            {
                // TODO, using the overload without timeout and cancellation token for now since there is some max timeout limit
                // being exercised somewhere. if timeout provided is more than that, repair task creation fails
                RepairTaskList fetchedTasks = await repairManager
                                              .GetRepairTaskListAsync(taskIdFilter, stateFilter, executorFilter)
                                              .ConfigureAwait(false);

                // Pre-size the result to the number of fetched tasks
                var wrappedTasks = new List <IRepairTask>(fetchedTasks.Count);

                foreach (var fetchedTask in fetchedTasks)
                {
                    IRepairTask wrapper = new ServiceFabricRepairTask(fetchedTask);
                    wrappedTasks.Add(wrapper);
                }

                return wrappedTasks;
            }
            catch (Exception ex)
            {
                // Trace the failure, then let the caller see the original exception
                traceType.WriteWarning("Unable to get repair task list. Errors: {0}", ex.GetMessage());
                throw;
            }
        }
Example #3
0
        /// <summary>
        /// Gets the upgrade domain from current repair tasks under processing, ideally all the repair tasks under processing should've the same upgrade domain.
        /// However if repair tasks belonging to multiple UpgradeDomains are found, then we consider the UD of first repair task in the list of repair tasks.
        /// </summary>
        /// <param name="nodeList">List of Nodes currently in cluster, used to get nodename and upgradedomain mapping</param>
        /// <param name="cancellationToken">Token passed to the query for repair tasks under processing</param>
        /// <returns>
        /// Upgrade domain of the first repair task (with a non-empty upgrade domain) among the repair tasks under processing,
        /// or null when there are no such tasks
        /// </returns>
        private async Task <string> GetCurrentUpgradeDomainUnderProcessing(NodeList nodeList, CancellationToken cancellationToken)
        {
            string         currentUpgradeDomain = null;
            RepairTaskList processingTaskList   = await this.GetRepairTasksUnderProcessing(cancellationToken);

            ServiceEventSource.Current.VerboseMessage(String.Format("{0} repair tasks were found under processing",
                                                                    processingTaskList.Count));

            // All the tasks under processing should ideally be from the same UpgradeDomain.
            // However the cluster topology may have changed, or repair tasks may have been created manually by some other entity,
            // resulting in repair tasks whose target nodes belong to different UD's.
            // In that case we keep the upgrade domain of the first repair task and only log the mismatching ones.
            foreach (var task in processingTaskList)
            {
                string udName = this.GetUpgradeDomainOfRepairTask(task, nodeList);

                if (string.IsNullOrEmpty(currentUpgradeDomain))
                {
                    currentUpgradeDomain = udName;
                }
                else if (currentUpgradeDomain != udName)
                {
                    // {1} is the domain this task actually belongs to, {2} is the domain established by the earlier tasks
                    ServiceEventSource.Current.ErrorMessage(
                        String.Format(
                            "Found repair task {0} under processing belonging to UpgradeDomains {1}, Expected only repair tasks from {2}. cluster topology might've changed",
                            task.TaskId, udName, currentUpgradeDomain));
                }
            }

            return(currentUpgradeDomain);
        }
Example #4
0
        /// <summary>
        /// Reports a health error on a healthy node, approves the repair task created for that node,
        /// clears the health error, and then checks that the task carries the CancelRequested flag.
        /// </summary>
        public void Onebox_Successful_SelfRepairAfterTaskIsApproved()
        {
            // Health property / log label for this test; it must carry this test's own name so that the
            // log line and the reported health property are not confused with the "...AfterTaskIsCreated" test
            const string property = "Onebox_Successful_SelfRepairAfterTaskIsApproved";

            MockRepairExecutor executor = Executors.First();

            LogHelper.Log("Starting test {0}", property);

            Node node = GetHealthyNode(_fabricClient);
            // Inject error
            MockWatchdog watchdog = new MockWatchdog(_fabricClient.HealthManager, node.NodeName);

            watchdog.ReportError(property);

            RepairTask createdTask = PollingForCreatedTask(executor, node);

            Assert.IsNotNull(createdTask, "There must be a repair task created.");

            executor.Approve(ref createdTask);

            ClearHealthError(watchdog, property, executor);

            // Make sure the task is cancelled: wait for the repair manager, then re-read the task by id
            Thread.Sleep(_repairManagerLatency);
            RepairTaskList repairTaskList = _fabricClient.RepairManager.GetRepairTaskListAsync(createdTask.TaskId, RepairTaskStateFilter.All, null).Result;

            // Count == 1 already implies the list is non-empty
            Assert.IsTrue(repairTaskList.Count == 1, "There must be a unique repair task with task id = {0}", createdTask.TaskId);

            RepairTask cancelledTask = repairTaskList.First();

            Assert.IsTrue(cancelledTask.Flags == RepairTaskFlags.CancelRequested, "PE should request cancellation of the task");
        }
Example #5
0
        /// <summary>
        /// Queries the repair manager for the repair tasks that are currently being worked on,
        /// i.e. tasks matching TaskIdPrefix and ExecutorName whose state is Preparing, Approved,
        /// Executing or Restoring.
        /// </summary>
        /// <param name="cancellationToken">Token passed to the repair manager query</param>
        /// <returns>List of repair tasks in Preparing, Approved, Executing or Restoring state</returns>
        internal async Task <RepairTaskList> GetRepairTasksUnderProcessing(CancellationToken cancellationToken)
        {
            // States that count as "under processing"
            RepairTaskStateFilter underProcessingStates =
                RepairTaskStateFilter.Preparing |
                RepairTaskStateFilter.Approved |
                RepairTaskStateFilter.Executing |
                RepairTaskStateFilter.Restoring;

            return await this.fabricClient.RepairManager.GetRepairTaskListAsync(
                TaskIdPrefix,
                underProcessingStates,
                ExecutorName,
                this.DefaultTimeoutForOperation,
                cancellationToken);
        }
Example #6
0
        /// <summary>
        /// Debugging aid: fetches every repair task matching TaskIdPrefix and ExecutorName, in any state,
        /// and writes each of them to the service event source.
        /// </summary>
        /// <param name="cancellationToken">Token passed to the repair manager query</param>
        /// <returns>Task for the asynchronous operation</returns>
        internal async Task PrintRepairTasks(CancellationToken cancellationToken)
        {
            RepairTaskList allTasks = await this.fabricClient.RepairManager.GetRepairTaskListAsync(
                TaskIdPrefix,
                RepairTaskStateFilter.All,
                ExecutorName,
                this.DefaultTimeoutForOperation,
                cancellationToken);

            ServiceEventSource.Current.VerboseMessage("Total {0} repair tasks were found for POS", allTasks.Count);

            foreach (var repairTask in allTasks)
            {
                ServiceEventSource.Current.PrintRepairTasks(
                    repairTask.TaskId,
                    repairTask.State.ToString(),
                    repairTask.Action,
                    repairTask.Executor,
                    repairTask.Description,
                    repairTask.ExecutorData,
                    repairTask.Target.ToString());
            }
        }
Example #7
0
        /// <summary>
        /// Posts the cluster patching status as an Ok health report listing the targets of the repair
        /// tasks under processing and the targets of the claimed (waiting) repair tasks.
        /// </summary>
        /// <param name="cancellationToken">Token passed to the queries and checked between them</param>
        /// <returns>Task for the asynchronous operation</returns>
        private async Task PostRMTaskNodeUpdate(CancellationToken cancellationToken)
        {
            NodeList clusterNodes = await this.fabricClient.QueryManager.GetNodeListAsync(null, null, this.DefaultTimeoutForOperation, cancellationToken);

            // Targets of claimed tasks, de-duplicated
            IList <RepairTask> waitingTasks   = await this.GetClaimedRepairTasks(clusterNodes, cancellationToken);
            var                waitingTargets = new HashSet <string>();

            foreach (var waitingTask in waitingTasks)
            {
                waitingTargets.Add(waitingTask.Target.ToString());
            }

            cancellationToken.ThrowIfCancellationRequested();

            // Targets of tasks under processing, de-duplicated
            RepairTaskList activeTasks   = await this.GetRepairTasksUnderProcessing(cancellationToken);
            var            activeTargets = new HashSet <string>();

            foreach (var activeTask in activeTasks)
            {
                activeTargets.Add(activeTask.Target.ToString());
            }

            string waitingText = string.Join(", ", waitingTargets);
            string activeText  = string.Join(", ", activeTargets);

            // An empty joined string is reported as "None"
            waitingText = String.IsNullOrEmpty(waitingText) ? "None" : waitingText;
            activeText  = String.IsNullOrEmpty(activeText) ? "None" : activeText;

            string description = string.Format(" Node currently being patched: {0} \nNodes waiting to be patched: {1}", activeText, waitingText);

            HealthManagerHelper.PostNodeHealthReport(fabricClient, this.context.ServiceName, ClusterPatchingStatusProperty, description, HealthState.Ok);
        }
Example #8
0
        /// <summary>
        /// Fetches all the repair tasks which are under execution and checks
        /// if any of them has exceeded the pre-specified execution timeout limit
        /// </summary>
        /// <remarks>
        /// Does nothing when ManageRepairTasksOnTimeout is false. A task is considered timed out when the time
        /// elapsed since its ApprovedTimestamp exceeds its ExecutorTimeoutInMinutes plus GraceTimeForNtService.
        /// For a timed-out task: if its node is no longer returned by the node query the task is cancelled;
        /// if its executor sub-state is RestartRequested, RestartCompleted or InstallationCompleted only a
        /// warning health report is posted; otherwise the task is moved to the Restoring state.
        /// </remarks>
        /// <param name="cancellationToken">Token passed to the repair-task and node queries and to the state update</param>
        /// <returns>Task for the asynchronous operation</returns>
        internal async Task TimeoutRepairTasks(CancellationToken cancellationToken)
        {
            if (!this.ManageRepairTasksOnTimeout)
            {
                return;
            }

            // Get repair tasks which have been approved and are still under execution by POA
            RepairTaskList repairTasks = await this.fabricClient.RepairManager.GetRepairTaskListAsync(TaskIdPrefix,
                                                                                                      RepairTaskStateFilter.Approved | RepairTaskStateFilter.Executing, ExecutorName, this.DefaultTimeoutForOperation, cancellationToken);

            foreach (var task in repairTasks)
            {
                ExecutorDataForRmTask executorData =
                    SerializationUtility.Deserialize <ExecutorDataForRmTask>(task.ExecutorData);
                Debug.Assert(task.ApprovedTimestamp != null, "ApprovedTimestamp of an approved repair task can never be null");
                TimeSpan elapsedTime = DateTime.UtcNow.Subtract(task.ApprovedTimestamp.Value);
                if (elapsedTime > (TimeSpan.FromMinutes(executorData.ExecutorTimeoutInMinutes) + GraceTimeForNtService))
                {
                    // Check whether the node still exists; a task whose node is gone is cancelled below
                    bool     nodeExists = false;
                    string   nodeName   = this.GetNodeNameFromRepairTask(task);
                    NodeList nodeList   = await this.fabricClient.QueryManager.GetNodeListAsync(nodeName, null, this.DefaultTimeoutForOperation, cancellationToken);

                    foreach (var node in nodeList)
                    {
                        if (node.NodeName.Equals(nodeName))
                        {
                            // Node Exists.
                            nodeExists = true;
                            break;
                        }
                    }

                    if (!nodeExists)
                    {
                        // If node does not exist now, there is no point in waiting on the task.
                        ServiceEventSource.Current.VerboseMessage("Cancelling repair task {0} which is in {1} state as the node {2} does not exist anymore.", task.TaskId, task.State, nodeName);
                        await this.CancelRepairTask(task);

                        continue;
                    }

                    switch (executorData.ExecutorSubState)
                    {
                    // These are special states where its best if NodeAgentNtService should move the repair task, just post warning in this case
                    case NodeAgentSfUtilityExitCodes.RestartRequested:
                    case NodeAgentSfUtilityExitCodes.RestartCompleted:
                    case NodeAgentSfUtilityExitCodes.InstallationCompleted:
                    {
                        string healthproperty = string.Format(
                            NodeTimeoutStatusFormat,
                            nodeName);
                        // The two literals are concatenated, so the first must end with a sentence separator
                        string healthDescription =
                            string.Format(
                                "Installation timeout {0} minutes alloted to repair task {1}, node {2} is over, however since node is in post-installation phase, wait for few more minutes for operation to complete. "
                                + "In case problem persists, please check if recent installations of updates has caused any problem on the node",
                                executorData.ExecutorTimeoutInMinutes,
                                task.TaskId,
                                nodeName);
                        ServiceEventSource.Current.ErrorMessage("Title = {0}, Description = {1}", healthproperty, healthDescription);
                        HealthManagerHelper.PostNodeHealthReport(this.fabricClient,
                                                                 this.context.ServiceName,
                                                                 healthproperty,
                                                                 healthDescription,
                                                                 HealthState.Warning,
                                                                 60);

                        break;
                    }

                    default:
                    {
                        await UpdateRepairTaskState(task, nodeName, RepairTaskState.Restoring, executorData.ExecutorTimeoutInMinutes, cancellationToken);

                        break;
                    }
                    }
                }
            }
        }
Example #9
0
        /// <summary>
        /// Post the cluster patching status as events on CoordinatorService.
        /// </summary>
        /// <remarks>
        /// Any repair task under processing: the per-node status is posted.
        /// No claimed and no processing tasks: an Ok report saying nothing is pending is posted.
        /// Claimed tasks but none under processing: a warning is posted right away when the cluster's
        /// aggregated health is Error, otherwise only after more than 60 consecutive such observations.
        /// Exceptions are logged and not propagated.
        /// </remarks>
        /// <param name="cancellationToken">Token passed to the queries and checked after them</param>
        /// <returns>Task for the asynchronous operation</returns>
        public async Task PostClusterPatchingStatus(CancellationToken cancellationToken)
        {
            try
            {
                NodeList clusterNodes = await this.fabricClient.QueryManager.GetNodeListAsync(null, null, this.DefaultTimeoutForOperation, cancellationToken);

                IList <RepairTask> claimedTasks = await this.GetClaimedRepairTasks(clusterNodes, cancellationToken);

                RepairTaskList tasksInFlight = await this.GetRepairTasksUnderProcessing(cancellationToken);

                cancellationToken.ThrowIfCancellationRequested();

                bool anyClaimed  = claimedTasks.Any();
                bool anyInFlight = tasksInFlight.Any();

                if (anyInFlight)
                {
                    // Patching is progressing: reset the stall counter and report per-node status
                    postUpdateCount = 0;
                    await PostRMTaskNodeUpdate(cancellationToken);
                    return;
                }

                if (!anyClaimed)
                {
                    // Post the health event saying that there is no repair task and things are working fine.
                    postUpdateCount = 0;
                    string idleDescription = "No claimed tasks and no processing tasks are found.";
                    HealthManagerHelper.PostNodeHealthReport(this.fabricClient, this.context.ServiceName, ClusterPatchingStatusProperty, idleDescription, HealthState.Ok, -1);
                    return;
                }

                // Claimed tasks exist but none is under processing: repair tasks are not getting approved.
                ClusterHealth clusterHealth = await this.fabricClient.HealthManager.GetClusterHealthAsync();

                if (clusterHealth.AggregatedHealthState == HealthState.Error)
                {
                    // The reason is known, so reset the stall counter and warn immediately
                    postUpdateCount = 0;
                    string unhealthyWarning = " Cluster is currently unhealthy. Nodes are currently not getting patched by Patch Orchestration Application. Please ensure the cluster becomes healthy for patching to continue.";
                    await PostWarningOnCoordinatorService(unhealthyWarning, 1);
                    return;
                }

                postUpdateCount++;
                if (postUpdateCount > 60)
                {
                    // Reason unknown, but POA has not been approving tasks for a while: reset the counter and warn
                    postUpdateCount = 0;
                    string stalledWarning = "Patch Orchestration Application is currently not patching nodes. This could be possible if there is some node which is stuck in disabling state for long time.";
                    await PostWarningOnCoordinatorService(stalledWarning, 61);
                }
            }
            catch (Exception ex)
            {
                ServiceEventSource.Current.ErrorMessage("PostClusterPatchingStatus failed with exception {0}", ex.ToString());
            }
        }
Example #10
0
        /// <summary>
        /// Fetches all the repair tasks which are under execution and checks
        /// if any of them has exceeded the pre-specified execution timeout limit
        /// </summary>
        /// <remarks>
        /// Does nothing when ManageRepairTasksOnTimeout is false. A task is considered timed out when the time
        /// elapsed since its ApprovedTimestamp exceeds its ExecutorTimeoutInMinutes plus GraceTimeForNtService.
        /// For a timed-out task: if its node is no longer returned by the node query the task is cancelled;
        /// otherwise the task is moved to the Restoring state. The executor sub-state only selects the log message.
        /// </remarks>
        /// <param name="cancellationToken">Token passed to the repair-task and node queries and to the state update</param>
        /// <returns>Task for the asynchronous operation</returns>
        internal async Task TimeoutRepairTasks(CancellationToken cancellationToken)
        {
            if (!this.ManageRepairTasksOnTimeout)
            {
                return;
            }

            // Get repair tasks which have been approved and are still under execution by POA
            RepairTaskList repairTasks = await this.fabricClient.RepairManager.GetRepairTaskListAsync(TaskIdPrefix,
                                                                                                      RepairTaskStateFilter.Approved | RepairTaskStateFilter.Executing, ExecutorName, this.DefaultTimeoutForOperation, cancellationToken);

            foreach (var task in repairTasks)
            {
                ExecutorDataForRmTask executorData =
                    SerializationUtility.Deserialize <ExecutorDataForRmTask>(task.ExecutorData);
                Debug.Assert(task.ApprovedTimestamp != null, "ApprovedTimestamp of an approved repair task can never be null");
                TimeSpan elapsedTime = DateTime.UtcNow.Subtract(task.ApprovedTimestamp.Value);
                if (elapsedTime > (TimeSpan.FromMinutes(executorData.ExecutorTimeoutInMinutes) + GraceTimeForNtService))
                {
                    // Check whether the node still exists; a task whose node is gone is cancelled below
                    bool     nodeExists = false;
                    string   nodeName   = this.GetNodeNameFromRepairTask(task);
                    NodeList nodeList   = await this.fabricClient.QueryManager.GetNodeListAsync(nodeName, null, this.DefaultTimeoutForOperation, cancellationToken);

                    foreach (var node in nodeList)
                    {
                        if (node.NodeName.Equals(nodeName))
                        {
                            // Node Exists.
                            nodeExists = true;
                            break;
                        }
                    }

                    if (!nodeExists)
                    {
                        // If node does not exist now, there is no point in waiting on the task.
                        ServiceEventSource.Current.VerboseMessage("Cancelling repair task {0} which is in {1} state as the node {2} does not exist anymore.", task.TaskId, task.State, nodeName);
                        await this.CancelRepairTask(task);

                        continue;
                    }

                    switch (executorData.ExecutorSubState)
                    {
                    // Installation had already started on the node; the task is still moved on to Restoring
                    case NodeAgentSfUtilityExitCodes.RestartRequested:
                    case NodeAgentSfUtilityExitCodes.RestartCompleted:
                    case NodeAgentSfUtilityExitCodes.InstallationCompleted:
                    {
                        string message =
                            string.Format(
                                "Repair Task {0} did not complete within the Timeout period for node {1}.  Since Installation was already started, updating Repair Task state to further proceed with Node enabling",
                                task.TaskId,
                                nodeName);
                        ServiceEventSource.Current.InfoMessage(message);
                        await UpdateRepairTaskState(task, nodeName, RepairTaskState.Restoring, executorData.ExecutorTimeoutInMinutes, cancellationToken);

                        break;
                    }

                    default:
                    {
                        // This branch is only reached after the timeout has elapsed, so the task did NOT complete in time
                        string message =
                            string.Format(
                                "Repair Task {0} did not complete within the Timeout period for node {1}. Updating Repair Task state to further proceed with Node enabling",
                                task.TaskId,
                                nodeName);
                        ServiceEventSource.Current.InfoMessage(message);
                        await UpdateRepairTaskState(task, nodeName, RepairTaskState.Restoring, executorData.ExecutorTimeoutInMinutes, cancellationToken);

                        break;
                    }
                    }
                }
            }
        }
 /// <summary>
 /// Completes the native get-repair-task-list operation identified by <paramref name="context"/>
 /// and converts its native result into a <c>RepairTaskList</c>.
 /// </summary>
 /// <param name="context">Async operation context handed to the native EndGetRepairTaskList call</param>
 /// <returns>The list created from the native result</returns>
 private RepairTaskList GetRepairTaskListAsyncEndWrapper(
     NativeCommon.IFabricAsyncOperationContext context)
 {
     var nativeResult = this.nativeRepairClient.EndGetRepairTaskList(context);

     return RepairTaskList.CreateFromNativeListResult(nativeResult);
 }