Пример #1
0
        /// <summary>
        /// Overwrites the state, result and executor sub-state of the given repair task and
        /// submits the modified task to the Repair Manager.
        /// </summary>
        /// <param name="fc">Fabric client whose RepairManager is used to submit the update</param>
        /// <param name="task">Repair task to update; its State, ResultStatus, ResultDetails and ExecutorData are overwritten</param>
        /// <param name="taskState">State to assign to the repair task <see cref="RepairTaskState"/></param>
        /// <param name="taskResultStatus">Result status to assign to the repair task</param>
        /// <param name="resultDetails">Result details to assign to the repair task</param>
        /// <param name="executorState">Executor sub-state stored inside the task's executor data</param>
        /// <param name="timeout">Timeout for the async operation</param>
        /// <param name="cancellationToken">The cancellation token to cancel the async operation</param>
        /// <param name="restartRequesteDateTime">
        /// When it has a value, it is stored in the executor data as the restart-requested time;
        /// when it is null the previously stored value is left as it was
        /// </param>
        /// <returns>A Task representing the async operation</returns>
        private static async Task UpdateRepairTask(
            FabricClient fc,
            RepairTask task,
            RepairTaskState taskState,
            RepairTaskResult taskResultStatus,
            string resultDetails,
            NodeAgentSfUtilityExitCodes executorState,
            TimeSpan timeout,
            CancellationToken cancellationToken,
            DateTime? restartRequesteDateTime)
        {
            // Top-level task fields are overwritten with exactly what the caller supplied.
            task.State = taskState;
            task.ResultStatus = taskResultStatus;
            task.ResultDetails = resultDetails;

            // Round-trip the executor data so that only the fields set below change.
            ExecutorDataForRmTask data = SerializationUtility.Deserialize<ExecutorDataForRmTask>(task.ExecutorData);
            data.ExecutorSubState = executorState;

            if (restartRequesteDateTime != null)
            {
                data.RestartRequestedTime = restartRequesteDateTime.Value;
            }

            task.ExecutorData = SerializationUtility.Serialize(data);

            await fc.RepairManager.UpdateRepairExecutionStateAsync(task, timeout, cancellationToken);
        }
Пример #2
0
        /// <summary>
        /// Fetches all the repair tasks which are approved or executing and checks
        /// if any of them has exceeded its execution timeout (plus the grace time for the NT service).
        /// Timed-out tasks whose node no longer exists are cancelled. Tasks in a post-installation
        /// sub-state only raise a warning health report. All other timed-out tasks are moved to Restoring.
        /// Does nothing when ManageRepairTasksOnTimeout is false.
        /// </summary>
        /// <param name="cancellationToken">The cancellation token passed to the repair manager and query calls</param>
        /// <returns>A Task representing the async operation</returns>
        internal async Task TimeoutRepairTasks(CancellationToken cancellationToken)
        {
            if (!this.ManageRepairTasksOnTimeout)
            {
                return;
            }

            // Get repair tasks which have been approved and are still under execution by POA
            RepairTaskList repairTasks = await this.fabricClient.RepairManager.GetRepairTaskListAsync(TaskIdPrefix,
                                                                                                      RepairTaskStateFilter.Approved | RepairTaskStateFilter.Executing, ExecutorName, this.DefaultTimeoutForOperation, cancellationToken);

            foreach (var task in repairTasks)
            {
                ExecutorDataForRmTask executorData =
                    SerializationUtility.Deserialize <ExecutorDataForRmTask>(task.ExecutorData);
                Debug.Assert(task.ApprovedTimestamp != null, "ApprovedTimestamp of an approved repair task can never be null");

                // The timeout clock starts when the task was approved, not when it was created.
                TimeSpan elapsedTime = DateTime.UtcNow.Subtract(task.ApprovedTimestamp.Value);
                if (elapsedTime > (TimeSpan.FromMinutes(executorData.ExecutorTimeoutInMinutes) + GraceTimeForNtService))
                {
                    // Check whether the node targeted by this task still exists in the cluster.
                    bool     nodeExists = false;
                    string   nodeName   = this.GetNodeNameFromRepairTask(task);
                    NodeList nodeList   = await this.fabricClient.QueryManager.GetNodeListAsync(nodeName, null, this.DefaultTimeoutForOperation, cancellationToken);

                    foreach (var node in nodeList)
                    {
                        if (node.NodeName.Equals(nodeName))
                        {
                            // Node Exists.
                            nodeExists = true;
                            break;
                        }
                    }

                    if (!nodeExists)
                    {
                        // If node does not exist now, there is no point in waiting on the task.
                        ServiceEventSource.Current.VerboseMessage("Cancelling repair task {0} which is in {1} state as the node {2} does not exist anymore.", task.TaskId, task.State, nodeName);
                        await this.CancelRepairTask(task);

                        continue;
                    }

                    switch (executorData.ExecutorSubState)
                    {
                    // These are special states where its best if NodeAgentNtService should move the repair task, just post warning in this case
                    case NodeAgentSfUtilityExitCodes.RestartRequested:
                    case NodeAgentSfUtilityExitCodes.RestartCompleted:
                    case NodeAgentSfUtilityExitCodes.InstallationCompleted:
                    {
                        string healthproperty = string.Format(
                            NodeTimeoutStatusFormat,
                            nodeName);

                        // The two literals are concatenated, so the first must end with ". " to keep
                        // the sentences apart in the emitted description.
                        string healthDescription =
                            string.Format(
                                "Installation timeout {0} minutes alloted to repair task {1}, node {2} is over, however since node is in post-installation phase, wait for few more minutes for operation to complete. "
                                + "In case problem persists, please check if recent installations of updates has caused any problem on the node",
                                executorData.ExecutorTimeoutInMinutes,
                                task.TaskId,
                                nodeName);
                        ServiceEventSource.Current.ErrorMessage("Title = {0}, Description = {1}", healthproperty, healthDescription);

                        // NOTE(review): the trailing 60 is presumably the health report time-to-live; confirm against HealthManagerHelper.
                        HealthManagerHelper.PostNodeHealthReport(this.fabricClient,
                                                                 this.context.ServiceName,
                                                                 healthproperty,
                                                                 healthDescription,
                                                                 HealthState.Warning,
                                                                 60);

                        break;
                    }

                    default:
                    {
                        // Any other sub-state: stop waiting and move the task to Restoring.
                        await UpdateRepairTaskState(task, nodeName, RepairTaskState.Restoring, executorData.ExecutorTimeoutInMinutes, cancellationToken);

                        break;
                    }
                    }
                }
            }
        }
Пример #3
0
        /// <summary>
        /// Fetches all the repair tasks which are approved or executing and checks
        /// if any of them has exceeded its execution timeout (plus the grace time for the NT service).
        /// Timed-out tasks whose node no longer exists are cancelled. Every other timed-out task is
        /// moved to Restoring, with a log message that depends on the executor sub-state.
        /// Does nothing when ManageRepairTasksOnTimeout is false.
        /// </summary>
        /// <param name="cancellationToken">The cancellation token passed to the repair manager and query calls</param>
        /// <returns>A Task representing the async operation</returns>
        internal async Task TimeoutRepairTasks(CancellationToken cancellationToken)
        {
            if (!this.ManageRepairTasksOnTimeout)
            {
                return;
            }

            // Get repair tasks which have been approved and are still under execution by POA
            RepairTaskList repairTasks = await this.fabricClient.RepairManager.GetRepairTaskListAsync(TaskIdPrefix,
                                                                                                      RepairTaskStateFilter.Approved | RepairTaskStateFilter.Executing, ExecutorName, this.DefaultTimeoutForOperation, cancellationToken);

            foreach (var task in repairTasks)
            {
                ExecutorDataForRmTask executorData =
                    SerializationUtility.Deserialize <ExecutorDataForRmTask>(task.ExecutorData);
                Debug.Assert(task.ApprovedTimestamp != null, "ApprovedTimestamp of an approved repair task can never be null");

                // The timeout clock starts when the task was approved, not when it was created.
                TimeSpan elapsedTime = DateTime.UtcNow.Subtract(task.ApprovedTimestamp.Value);
                if (elapsedTime > (TimeSpan.FromMinutes(executorData.ExecutorTimeoutInMinutes) + GraceTimeForNtService))
                {
                    // Check whether the node targeted by this task still exists in the cluster.
                    bool     nodeExists = false;
                    string   nodeName   = this.GetNodeNameFromRepairTask(task);
                    NodeList nodeList   = await this.fabricClient.QueryManager.GetNodeListAsync(nodeName, null, this.DefaultTimeoutForOperation, cancellationToken);

                    foreach (var node in nodeList)
                    {
                        if (node.NodeName.Equals(nodeName))
                        {
                            // Node Exists.
                            nodeExists = true;
                            break;
                        }
                    }

                    if (!nodeExists)
                    {
                        // If node does not exist now, there is no point in waiting on the task.
                        ServiceEventSource.Current.VerboseMessage("Cancelling repair task {0} which is in {1} state as the node {2} does not exist anymore.", task.TaskId, task.State, nodeName);
                        await this.CancelRepairTask(task);

                        continue;
                    }

                    switch (executorData.ExecutorSubState)
                    {
                    // Post-installation sub-states: installation had already started when the timeout expired.
                    case NodeAgentSfUtilityExitCodes.RestartRequested:
                    case NodeAgentSfUtilityExitCodes.RestartCompleted:
                    case NodeAgentSfUtilityExitCodes.InstallationCompleted:
                    {
                        string message =
                            string.Format(
                                "Repair Task {0} did not complete within the Timeout period for node {1}.  Since Installation was already started, updating Repair Task state to further proceed with Node enabling",
                                task.TaskId,
                                nodeName);
                        ServiceEventSource.Current.InfoMessage(message);
                        await UpdateRepairTaskState(task, nodeName, RepairTaskState.Restoring, executorData.ExecutorTimeoutInMinutes, cancellationToken);

                        break;
                    }

                    default:
                    {
                        // This branch is only reachable after the timeout has elapsed, so the
                        // message must report a timeout rather than an in-time completion.
                        string message =
                            string.Format(
                                "Repair Task {0} did not complete within the Timeout period for node {1}. Updating Repair Task state to further proceed with Node enabling",
                                task.TaskId,
                                nodeName);
                        ServiceEventSource.Current.InfoMessage(message);
                        await UpdateRepairTaskState(task, nodeName, RepairTaskState.Restoring, executorData.ExecutorTimeoutInMinutes, cancellationToken);

                        break;
                    }
                    }
                }
            }
        }
Пример #4
0
        /// <summary>
        /// Builds a cluster repair task in the Claimed state for the given node, with this service
        /// as executor, and submits it to the Repair Manager.
        /// </summary>
        /// <param name="fc">Fabric client whose RepairManager is used to create the task</param>
        /// <param name="nodeName">Node the repair task targets; also embedded in the generated task id</param>
        /// <param name="taskDescription">Description stored on the repair task</param>
        /// <param name="resultDetails">Result details stored on the repair task</param>
        /// <param name="executorData">Executor data, serialized into the repair task</param>
        /// <param name="timeout">Timeout for the async operation</param>
        /// <param name="cancellationToken">The cancellation token to cancel the async operation</param>
        /// <returns>
        /// A Task whose result is <see cref="NodeAgentSfUtilityExitCodes.Success"/> when the task was created,
        /// <see cref="NodeAgentSfUtilityExitCodes.RetryableException"/> when creation threw a FabricTransientException,
        /// and <see cref="NodeAgentSfUtilityExitCodes.Failure"/> when it threw anything else
        /// </returns>
        internal static async Task<NodeAgentSfUtilityExitCodes> CreateRepairTaskForNode(
            FabricClient fc,
            string nodeName,
            string taskDescription,
            string resultDetails,
            ExecutorDataForRmTask executorData,
            TimeSpan timeout,
            CancellationToken cancellationToken)
        {
            // Task ids have the shape "<TaskIdPrefix>_<nodeName>_<guid>".
            string uniqueTaskId = string.Format("{0}_{1}_{2}", TaskIdPrefix, nodeName, Guid.NewGuid());

            ClusterRepairTask newTask = new ClusterRepairTask(uniqueTaskId, RepairAction)
            {
                Description   = taskDescription,
                State         = RepairTaskState.Claimed,
                Executor      = ExecutorName,
                ExecutorData  = SerializationUtility.Serialize(executorData),
                Target        = new NodeRepairTargetDescription(nodeName),
                ResultDetails = resultDetails
            };

            try
            {
                await fc.RepairManager.CreateRepairTaskAsync(newTask, timeout, cancellationToken);
            }
            catch (Exception e)
            {
                // Exceptions from the create call are logged and mapped to an exit code, never rethrown.
                ServiceEventSource.Current.ErrorMessage(
                    String.Format("RepairManagerHelper.CreateRepairTaskForNode failed. Exception details {0}", e));

                return e is FabricTransientException
                    ? NodeAgentSfUtilityExitCodes.RetryableException
                    : NodeAgentSfUtilityExitCodes.Failure;
            }

            return NodeAgentSfUtilityExitCodes.Success;
        }
Пример #5
0
        /// <summary>
        /// Reports the result of a search and download operation to the coordinator service (when a
        /// result is supplied) and then creates or updates the node's repair task according to
        /// <paramref name="updateState"/>.
        /// </summary>
        /// <param name="nodeName">Name of current service fabric node</param>
        /// <param name="applicationName">Uri of the Patch Orchestration Application.</param>
        /// <param name="updateState">
        /// State of the Wu operation. DownloadCompleted creates a repair task, OperationCompleted does nothing further,
        /// OperationAborted marks the repair task as Completed/Failed; any other value yields InvalidArgument
        /// </param>
        /// <param name="operationResult">
        /// Result of the search and download operation. May be null, in which case the coordinator update
        /// is skipped and the repair task description and result details stay null
        /// </param>
        /// <param name="installationTimeout">Stored in the executor data as the executor timeout in minutes</param>
        /// <param name="timeout">Timeout for the async operation</param>
        /// <param name="cancellationToken">The cancellation token to cancel the async operation</param>
        /// <returns>
        /// A Task representing the async operation, result of the task would be <see cref="NodeAgentSfUtilityExitCodes.Success"/> in case of success
        /// Any other <see cref="NodeAgentSfUtilityExitCodes"/> in case of failure
        /// </returns>
        public async Task<NodeAgentSfUtilityExitCodes> UpdateSearchAndDownloadStatusAsync(
            String nodeName,
            Uri applicationName,
            NodeAgentSfUtilityExitCodes updateState,
            WindowsUpdateOperationResult operationResult,
            int installationTimeout,
            TimeSpan timeout,
            CancellationToken cancellationToken)
        {
            NodeAgentSfUtilityExitCodes outcome;
            String repairTaskDescription   = null;
            String repairTaskResultDetails = null;

            ExecutorDataForRmTask dataForExecutor = new ExecutorDataForRmTask();
            dataForExecutor.ExecutorSubState         = updateState;
            dataForExecutor.ExecutorTimeoutInMinutes = installationTimeout;

            if (operationResult != null)
            {
                int total;
                int aborted;
                int succeeded;
                this.GetWuOperationResultCount(operationResult, out total, out aborted, out succeeded);

                // The coordinator must accept the result before any repair task work is attempted.
                outcome = await CoordinatorServiceHelper.UpdateWuOperationResult(
                    this.fabricClient,
                    applicationName,
                    operationResult,
                    timeout,
                    cancellationToken);
                if (outcome != NodeAgentSfUtilityExitCodes.Success)
                {
                    return outcome;
                }

                repairTaskDescription = String.Format(
                    "{0} updates successfully downloaded on {1}. Creating this repair task to install the downloaded updates",
                    succeeded,
                    operationResult.OperationTime);
                repairTaskResultDetails = String.Format(
                    "{0} updates searched. {1} downloaded successfully, {2} downloads were aborted",
                    operationResult.UpdateDetails.Count,
                    succeeded,
                    aborted);
            }

            if (updateState == NodeAgentSfUtilityExitCodes.DownloadCompleted)
            {
                // Downloads are ready: claim a repair task so the installation can be scheduled.
                outcome = await RepairManagerHelper.CreateRepairTaskForNode(
                    this.fabricClient,
                    nodeName,
                    repairTaskDescription,
                    repairTaskResultDetails,
                    dataForExecutor,
                    timeout,
                    cancellationToken);
            }
            else if (updateState == NodeAgentSfUtilityExitCodes.OperationCompleted)
            {
                outcome = NodeAgentSfUtilityExitCodes.Success;
            }
            else if (updateState == NodeAgentSfUtilityExitCodes.OperationAborted)
            {
                ServiceEventSource.Current.InfoMessage("Operation aborted for a claimed task");
                outcome = await RepairManagerHelper.UpdateRepairTask(
                    this.fabricClient,
                    nodeName,
                    RepairTaskState.Completed,
                    RepairTaskResult.Failed,
                    "Aborting the operation",
                    updateState,
                    timeout,
                    cancellationToken);
            }
            else
            {
                ServiceEventSource.Current.ErrorMessage(
                    String.Format("UpdateSearchAndDownloadStatusAsync called with invalid state {0}", updateState));
                outcome = NodeAgentSfUtilityExitCodes.InvalidArgument;
            }

            ServiceEventSource.Current.InfoMessage("UpdateSearchAndDownloadStatusAsync result = {0}", outcome);
            return outcome;
        }
Пример #6
0
        /// <summary>
        /// Derives the state of the Windows Update operation from the repair task of the given node.
        /// When a repair task exists, its approval timestamp and executor timeout are also written
        /// to the ExecutorDataForNtService file located next to the entry assembly.
        /// </summary>
        /// <param name="nodeName">Name of current Service Fabric node</param>
        /// <param name="timeout">Timeout for the async operation</param>
        /// <param name="cancellationToken">The cancellation token to cancel the async operation</param>
        /// <returns>
        /// A Task whose result is a <see cref="NodeAgentSfUtilityExitCodes"/>: None when the node has no repair task,
        /// RetryableException or Failure when fetching the task threw, otherwise a value mapped from the repair task state
        /// </returns>
        public async Task<NodeAgentSfUtilityExitCodes> GetWuOperationStateAsync(String nodeName, TimeSpan timeout, CancellationToken cancellationToken)
        {
            RepairTask nodeTask;

            try
            {
                nodeTask = await RepairManagerHelper.GetRepairTaskForNode(this.fabricClient, nodeName, timeout, cancellationToken);
            }
            catch (Exception e)
            {
                ServiceEventSource.Current.ErrorMessage(
                    String.Format("RepairManagerHelper.GetRepairTaskForNode failed. Exception details {0}", e));

                return e is FabricTransientException
                    ? NodeAgentSfUtilityExitCodes.RetryableException
                    : NodeAgentSfUtilityExitCodes.Failure;
            }

            if (nodeTask == null)
            {
                ServiceEventSource.Current.VerboseMessage(String.Format("No repair task found for this node, Operation State = {0}", NodeAgentSfUtilityExitCodes.None));
                return NodeAgentSfUtilityExitCodes.None;
            }

            ExecutorDataForRmTask rmData = SerializationUtility.Deserialize<ExecutorDataForRmTask>(nodeTask.ExecutorData);

            // Persist approval time and timeout to a file beside the entry assembly.
            ExecutorDataForNtService ntServiceData = new ExecutorDataForNtService();
            ntServiceData.ApprovedDateTime         = nodeTask.ApprovedTimestamp;
            ntServiceData.ExecutorTimeoutInMinutes = rmData.ExecutorTimeoutInMinutes;

            string ntServiceDataPath = Path.Combine(
                Path.GetDirectoryName(System.Reflection.Assembly.GetEntryAssembly().Location),
                ExecutorDataForNtServiceFileName);
            SerializationUtility.Serialize(ntServiceDataPath, ntServiceData);

            NodeAgentSfUtilityExitCodes wuState;
            RepairTaskState currentState = nodeTask.State;

            if (currentState == RepairTaskState.Claimed || currentState == RepairTaskState.Preparing)
            {
                wuState = NodeAgentSfUtilityExitCodes.DownloadCompleted;
            }
            else if (currentState == RepairTaskState.Approved)
            {
                wuState = NodeAgentSfUtilityExitCodes.InstallationApproved;
            }
            else if (currentState == RepairTaskState.Executing)
            {
                // While executing, the fine-grained state lives in the executor data.
                wuState = rmData.ExecutorSubState;

                // The restart status is only queried when a restart had actually been requested.
                bool restartFinished = wuState == NodeAgentSfUtilityExitCodes.RestartRequested
                                       && this.GetRestartStatus(rmData.RestartRequestedTime);
                if (restartFinished)
                {
                    NodeAgentSfUtilityExitCodes updateOutcome = await RepairManagerHelper.UpdateRepairTask(
                        this.fabricClient,
                        nodeName,
                        RepairTaskState.Executing,
                        RepairTaskResult.Pending,
                        "Installation of the updates completed, Restart post installation completed successfully",
                        NodeAgentSfUtilityExitCodes.RestartCompleted,
                        timeout,
                        cancellationToken);

                    if (updateOutcome == NodeAgentSfUtilityExitCodes.Success)
                    {
                        wuState = NodeAgentSfUtilityExitCodes.RestartCompleted;
                    }
                    else
                    {
                        // Any failure to record the completed restart is reported as retryable.
                        ServiceEventSource.Current.ErrorMessage(
                            String.Format("Post restart, update of Repair task failed with {0}", updateOutcome));
                        wuState = NodeAgentSfUtilityExitCodes.RetryableException;
                    }
                }
            }
            else if (currentState == RepairTaskState.Completed || currentState == RepairTaskState.Restoring)
            {
                wuState = NodeAgentSfUtilityExitCodes.OperationCompleted;
            }
            else
            {
                ServiceEventSource.Current.ErrorMessage(String.Format("Repair task for current node in unexpected state {0}", nodeTask.State));
                wuState = NodeAgentSfUtilityExitCodes.RepairTaskInvalidState;
            }

            ServiceEventSource.Current.InfoMessage("GetWuOperationStateAsync returned {0}", wuState);
            return wuState;
        }