/// <summary>
/// Stamps the supplied state, result and executor sub-state onto a repair task
/// and submits the modified task to the Repair Manager.
/// </summary>
/// <param name="fc">Fabric client whose RepairManager is used to submit the update</param>
/// <param name="task">Repair task to modify; it is mutated in place</param>
/// <param name="taskState">State to set on the task <see cref="RepairTaskState"/></param>
/// <param name="taskResultStatus">Result status to set on the task</param>
/// <param name="resultDetails">Result details to set on the task</param>
/// <param name="executorState">Sub-state written into the task's serialized executor data</param>
/// <param name="timeout">Timeout for the async operation</param>
/// <param name="cancellationToken">The cancellation token to cancel the async operation</param>
/// <param name="restartRequesteDateTime">
/// When it has a value, it is stored as the restart-requested time in the executor data;
/// when null, the previously stored value is left as it was
/// </param>
/// <returns>A Task representing the async operation</returns>
private static async Task UpdateRepairTask(
    FabricClient fc,
    RepairTask task,
    RepairTaskState taskState,
    RepairTaskResult taskResultStatus,
    string resultDetails,
    NodeAgentSfUtilityExitCodes executorState,
    TimeSpan timeout,
    CancellationToken cancellationToken,
    DateTime? restartRequesteDateTime)
{
    // Top-level task fields first.
    task.State = taskState;
    task.ResultStatus = taskResultStatus;
    task.ResultDetails = resultDetails;

    // Executor data is stored serialized on the task: unpack, patch, repack.
    ExecutorDataForRmTask rmTaskData = SerializationUtility.Deserialize<ExecutorDataForRmTask>(task.ExecutorData);
    rmTaskData.ExecutorSubState = executorState;
    if (restartRequesteDateTime != null)
    {
        rmTaskData.RestartRequestedTime = restartRequesteDateTime.Value;
    }

    task.ExecutorData = SerializationUtility.Serialize(rmTaskData);

    await fc.RepairManager.UpdateRepairExecutionStateAsync(task, timeout, cancellationToken);
}
/// <summary>
/// Fetches all the repair tasks which are approved or executing and checks
/// if any of them has exceeded its execution timeout (plus <c>GraceTimeForNtService</c>).
/// For each timed-out task:
/// the task is cancelled when its node is no longer returned by the node query;
/// a warning health report is posted when the executor sub-state is RestartRequested,
/// RestartCompleted or InstallationCompleted;
/// otherwise UpdateRepairTaskState is called with <see cref="RepairTaskState.Restoring"/>.
/// Does nothing when <c>ManageRepairTasksOnTimeout</c> is false.
/// </summary>
/// <param name="cancellationToken">The cancellation token passed to the Service Fabric calls made here</param>
/// <returns>A Task representing the async operation</returns>
internal async Task TimeoutRepairTasks(CancellationToken cancellationToken)
{
    if (!this.ManageRepairTasksOnTimeout)
    {
        return;
    }

    // Get repair tasks which have been approved and are still under execution by POA
    RepairTaskList repairTasks = await this.fabricClient.RepairManager.GetRepairTaskListAsync(
        TaskIdPrefix,
        RepairTaskStateFilter.Approved | RepairTaskStateFilter.Executing,
        ExecutorName,
        this.DefaultTimeoutForOperation,
        cancellationToken);

    foreach (var task in repairTasks)
    {
        ExecutorDataForRmTask executorData = SerializationUtility.Deserialize<ExecutorDataForRmTask>(task.ExecutorData);

        Debug.Assert(task.ApprovedTimestamp != null, "ApprovedTimestamp of an approved repair task can never be null");
        if (!task.ApprovedTimestamp.HasValue)
        {
            // Debug.Assert is compiled out of release builds; without this guard a single
            // malformed task would throw on .Value and abort the check for every remaining task.
            ServiceEventSource.Current.ErrorMessage(
                "Repair task {0} in state {1} has no ApprovedTimestamp, skipping timeout check.",
                task.TaskId,
                task.State);
            continue;
        }

        TimeSpan elapsedTime = DateTime.UtcNow.Subtract(task.ApprovedTimestamp.Value);
        TimeSpan allowedTime = TimeSpan.FromMinutes(executorData.ExecutorTimeoutInMinutes) + GraceTimeForNtService;
        if (elapsedTime <= allowedTime)
        {
            // Still within the allotted time, nothing to do for this task.
            continue;
        }

        // Check whether the node targeted by the task is still returned by the cluster.
        bool nodeExists = false;
        string nodeName = this.GetNodeNameFromRepairTask(task);
        NodeList nodeList = await this.fabricClient.QueryManager.GetNodeListAsync(
            nodeName,
            null,
            this.DefaultTimeoutForOperation,
            cancellationToken);
        foreach (var node in nodeList)
        {
            if (node.NodeName.Equals(nodeName))
            {
                nodeExists = true;
                break;
            }
        }

        if (!nodeExists)
        {
            // If node does not exist now, there is no point in waiting on the task.
            ServiceEventSource.Current.VerboseMessage(
                "Cancelling repair task {0} which is in {1} state as the node {2} does not exist anymore.",
                task.TaskId,
                task.State,
                nodeName);
            await this.CancelRepairTask(task);
            continue;
        }

        switch (executorData.ExecutorSubState)
        {
            // These are special states where its best if NodeAgentNtService should move the repair task,
            // just post warning in this case
            case NodeAgentSfUtilityExitCodes.RestartRequested:
            case NodeAgentSfUtilityExitCodes.RestartCompleted:
            case NodeAgentSfUtilityExitCodes.InstallationCompleted:
                {
                    string healthproperty = string.Format(NodeTimeoutStatusFormat, nodeName);

                    // The two literals are concatenated; the first one ends with ". " so the
                    // sentences do not run together in the health report.
                    string healthDescription = string.Format(
                        "Installation timeout {0} minutes alloted to repair task {1}, node {2} is over, however since node is in post-installation phase, wait for few more minutes for operation to complete. " +
                        "In case problem persists, please check if recent installations of updates has caused any problem on the node",
                        executorData.ExecutorTimeoutInMinutes,
                        task.TaskId,
                        nodeName);

                    ServiceEventSource.Current.ErrorMessage("Title = {0}, Description = {1}", healthproperty, healthDescription);
                    HealthManagerHelper.PostNodeHealthReport(
                        this.fabricClient,
                        this.context.ServiceName,
                        healthproperty,
                        healthDescription,
                        HealthState.Warning,
                        60);
                    break;
                }

            default:
                {
                    await UpdateRepairTaskState(
                        task,
                        nodeName,
                        RepairTaskState.Restoring,
                        executorData.ExecutorTimeoutInMinutes,
                        cancellationToken);
                    break;
                }
        }
    }
}
/// <summary>
/// Fetches all the repair tasks which are approved or executing and checks
/// if any of them has exceeded its execution timeout (plus <c>GraceTimeForNtService</c>).
/// For each timed-out task:
/// the task is cancelled when its node is no longer returned by the node query;
/// otherwise an informational message is logged and UpdateRepairTaskState is called
/// with <see cref="RepairTaskState.Restoring"/>.
/// Does nothing when <c>ManageRepairTasksOnTimeout</c> is false.
/// </summary>
/// <param name="cancellationToken">The cancellation token passed to the Service Fabric calls made here</param>
/// <returns>A Task representing the async operation</returns>
internal async Task TimeoutRepairTasks(CancellationToken cancellationToken)
{
    if (!this.ManageRepairTasksOnTimeout)
    {
        return;
    }

    // Get repair tasks which have been approved and are still under execution by POA
    RepairTaskList repairTasks = await this.fabricClient.RepairManager.GetRepairTaskListAsync(
        TaskIdPrefix,
        RepairTaskStateFilter.Approved | RepairTaskStateFilter.Executing,
        ExecutorName,
        this.DefaultTimeoutForOperation,
        cancellationToken);

    foreach (var task in repairTasks)
    {
        ExecutorDataForRmTask executorData = SerializationUtility.Deserialize<ExecutorDataForRmTask>(task.ExecutorData);

        Debug.Assert(task.ApprovedTimestamp != null, "ApprovedTimestamp of an approved repair task can never be null");
        if (!task.ApprovedTimestamp.HasValue)
        {
            // Debug.Assert is compiled out of release builds; without this guard a single
            // malformed task would throw on .Value and abort the check for every remaining task.
            ServiceEventSource.Current.ErrorMessage(
                "Repair task {0} in state {1} has no ApprovedTimestamp, skipping timeout check.",
                task.TaskId,
                task.State);
            continue;
        }

        TimeSpan elapsedTime = DateTime.UtcNow.Subtract(task.ApprovedTimestamp.Value);
        TimeSpan allowedTime = TimeSpan.FromMinutes(executorData.ExecutorTimeoutInMinutes) + GraceTimeForNtService;
        if (elapsedTime <= allowedTime)
        {
            // Still within the allotted time, nothing to do for this task.
            continue;
        }

        // Check whether the node targeted by the task is still returned by the cluster.
        bool nodeExists = false;
        string nodeName = this.GetNodeNameFromRepairTask(task);
        NodeList nodeList = await this.fabricClient.QueryManager.GetNodeListAsync(
            nodeName,
            null,
            this.DefaultTimeoutForOperation,
            cancellationToken);
        foreach (var node in nodeList)
        {
            if (node.NodeName.Equals(nodeName))
            {
                nodeExists = true;
                break;
            }
        }

        if (!nodeExists)
        {
            // If node does not exist now, there is no point in waiting on the task.
            ServiceEventSource.Current.VerboseMessage(
                "Cancelling repair task {0} which is in {1} state as the node {2} does not exist anymore.",
                task.TaskId,
                task.State,
                nodeName);
            await this.CancelRepairTask(task);
            continue;
        }

        // Both branches request the Restoring state; only the logged explanation differs.
        switch (executorData.ExecutorSubState)
        {
            // Sub-states reached only after installation has started.
            case NodeAgentSfUtilityExitCodes.RestartRequested:
            case NodeAgentSfUtilityExitCodes.RestartCompleted:
            case NodeAgentSfUtilityExitCodes.InstallationCompleted:
                {
                    string message = string.Format(
                        "Repair Task {0} did not complete within the Timeout period for node {1}. Since Installation was already started, updating Repair Task state to further proceed with Node enabling",
                        task.TaskId,
                        nodeName);
                    ServiceEventSource.Current.InfoMessage(message);
                    await UpdateRepairTaskState(
                        task,
                        nodeName,
                        RepairTaskState.Restoring,
                        executorData.ExecutorTimeoutInMinutes,
                        cancellationToken);
                    break;
                }

            default:
                {
                    // This branch is only reached after the timeout (plus grace period) has elapsed,
                    // so the task did NOT complete in time; the message states that explicitly.
                    string message = string.Format(
                        "Repair Task {0} did not complete within the Timeout period for node {1}. Updating Repair Task state to further proceed with Node enabling",
                        task.TaskId,
                        nodeName);
                    ServiceEventSource.Current.InfoMessage(message);
                    await UpdateRepairTaskState(
                        task,
                        nodeName,
                        RepairTaskState.Restoring,
                        executorData.ExecutorTimeoutInMinutes,
                        cancellationToken);
                    break;
                }
        }
    }
}
/// <summary>
/// Builds a new repair task in the Claimed state, targeted at the given node and owned by
/// <c>ExecutorName</c>, and submits it to the Repair Manager.
/// The task id has the form "TaskIdPrefix_nodeName_guid".
/// </summary>
/// <param name="fc">Fabric client whose RepairManager is used to create the task</param>
/// <param name="nodeName">Name of the node the repair task targets</param>
/// <param name="taskDescription">Description stored on the repair task</param>
/// <param name="resultDetails">Result details stored on the repair task</param>
/// <param name="executorData">Executor data; it is serialized and stored on the repair task</param>
/// <param name="timeout">Timeout for the async operation</param>
/// <param name="cancellationToken">The cancellation token to cancel the async operation</param>
/// <returns>
/// A Task whose result is <see cref="NodeAgentSfUtilityExitCodes.Success"/> when the task was created,
/// <see cref="NodeAgentSfUtilityExitCodes.RetryableException"/> when creation threw a FabricTransientException,
/// and <see cref="NodeAgentSfUtilityExitCodes.Failure"/> for any other exception
/// </returns>
internal static async Task<NodeAgentSfUtilityExitCodes> CreateRepairTaskForNode(
    FabricClient fc,
    string nodeName,
    string taskDescription,
    string resultDetails,
    ExecutorDataForRmTask executorData,
    TimeSpan timeout,
    CancellationToken cancellationToken)
{
    // A fresh GUID keeps ids unique across repeated tasks for the same node.
    string newTaskId = string.Format("{0}_{1}_{2}", TaskIdPrefix, nodeName, Guid.NewGuid());

    ClusterRepairTask newTask = new ClusterRepairTask(newTaskId, RepairAction)
    {
        Description = taskDescription,
        State = RepairTaskState.Claimed,
        Executor = ExecutorName,
        ExecutorData = SerializationUtility.Serialize(executorData),
        Target = new NodeRepairTargetDescription(nodeName),
        ResultDetails = resultDetails
    };

    try
    {
        await fc.RepairManager.CreateRepairTaskAsync(newTask, timeout, cancellationToken);
    }
    catch (Exception e)
    {
        ServiceEventSource.Current.ErrorMessage(
            String.Format("RepairManagerHelper.CreateRepairTaskForNode failed. Exception details {0}", e));

        // Transient fabric errors are reported as retryable; everything else is a hard failure.
        return e is FabricTransientException
            ? NodeAgentSfUtilityExitCodes.RetryableException
            : NodeAgentSfUtilityExitCodes.Failure;
    }

    return NodeAgentSfUtilityExitCodes.Success;
}
/// <summary>
/// Reports the outcome of a search-and-download operation.
/// When <paramref name="operationResult"/> is supplied it is first forwarded to
/// CoordinatorServiceHelper.UpdateWuOperationResult; if that call does not succeed its
/// result is returned immediately. Afterwards the repair task is created or updated
/// according to <paramref name="updateState"/>.
/// </summary>
/// <param name="nodeName">Name of current service fabric node</param>
/// <param name="applicationName">Uri of the Patch Orchestration Application</param>
/// <param name="updateState">
/// State of the Wu operation. DownloadCompleted creates a repair task for the node,
/// OperationCompleted returns Success without touching the repair task,
/// OperationAborted marks the node's repair task Completed/Failed,
/// any other value yields InvalidArgument
/// </param>
/// <param name="operationResult">
/// Result of the search and download operation; when null, the coordinator update is skipped
/// and the repair task description and result details stay null
/// </param>
/// <param name="installationTimeout">Stored as ExecutorTimeoutInMinutes in the executor data of the repair task</param>
/// <param name="timeout">Timeout for the async operation</param>
/// <param name="cancellationToken">The cancellation token to cancel the async operation</param>
/// <returns>
/// A Task representing the async operation, result of the task would be <see cref="NodeAgentSfUtilityExitCodes.Success"/> in case of success
/// Any other <see cref="NodeAgentSfUtilityExitCodes"/> in case of failure
/// </returns>
public async Task<NodeAgentSfUtilityExitCodes> UpdateSearchAndDownloadStatusAsync(
    String nodeName,
    Uri applicationName,
    NodeAgentSfUtilityExitCodes updateState,
    WindowsUpdateOperationResult operationResult,
    int installationTimeout,
    TimeSpan timeout,
    CancellationToken cancellationToken)
{
    NodeAgentSfUtilityExitCodes result;
    string taskDescription = null;
    string resultDetails = null;

    ExecutorDataForRmTask executorData = new ExecutorDataForRmTask();
    executorData.ExecutorSubState = updateState;
    executorData.ExecutorTimeoutInMinutes = installationTimeout;

    if (operationResult != null)
    {
        int totalOperations;
        int abortedOperations;
        int succeededOperations;
        this.GetWuOperationResultCount(operationResult, out totalOperations, out abortedOperations, out succeededOperations);

        // Persist the operation result with the coordinator before touching the repair task.
        result = await CoordinatorServiceHelper.UpdateWuOperationResult(
            this.fabricClient,
            applicationName,
            operationResult,
            timeout,
            cancellationToken);
        if (result != NodeAgentSfUtilityExitCodes.Success)
        {
            return result;
        }

        taskDescription = String.Format(
            "{0} updates successfully downloaded on {1}. Creating this repair task to install the downloaded updates",
            succeededOperations,
            operationResult.OperationTime);
        resultDetails = String.Format(
            "{0} updates searched. {1} downloaded successfully, {2} downloads were aborted",
            operationResult.UpdateDetails.Count,
            succeededOperations,
            abortedOperations);
    }

    switch (updateState)
    {
        case NodeAgentSfUtilityExitCodes.DownloadCompleted:
            result = await RepairManagerHelper.CreateRepairTaskForNode(
                this.fabricClient,
                nodeName,
                taskDescription,
                resultDetails,
                executorData,
                timeout,
                cancellationToken);
            break;

        case NodeAgentSfUtilityExitCodes.OperationCompleted:
            result = NodeAgentSfUtilityExitCodes.Success;
            break;

        case NodeAgentSfUtilityExitCodes.OperationAborted:
            ServiceEventSource.Current.InfoMessage("Operation aborted for a claimed task");
            result = await RepairManagerHelper.UpdateRepairTask(
                this.fabricClient,
                nodeName,
                RepairTaskState.Completed,
                RepairTaskResult.Failed,
                "Aborting the operation",
                updateState,
                timeout,
                cancellationToken);
            break;

        default:
            ServiceEventSource.Current.ErrorMessage(
                String.Format("UpdateSearchAndDownloadStatusAsync called with invalid state {0}", updateState));
            result = NodeAgentSfUtilityExitCodes.InvalidArgument;
            break;
    }

    ServiceEventSource.Current.InfoMessage("UpdateSearchAndDownloadStatusAsync result = {0}", result);
    return result;
}
/// <summary>
/// Derives the state of the Windows Update operation from the repair task of the given node.
/// As a side effect, the task's approved timestamp and executor timeout are serialized to the file
/// <c>ExecutorDataForNtServiceFileName</c> in the directory of the entry assembly.
/// </summary>
/// <param name="nodeName">Name of current Service Fabric node</param>
/// <param name="timeout">Timeout for the async operation</param>
/// <param name="cancellationToken">The cancellation token to cancel the async operation</param>
/// <returns>
/// A Task whose result is a <see cref="NodeAgentSfUtilityExitCodes"/>:
/// RetryableException or Failure when the repair task lookup throws,
/// None when the node has no repair task,
/// DownloadCompleted for Claimed/Preparing tasks,
/// InstallationApproved for Approved tasks,
/// the stored executor sub-state for Executing tasks (RestartRequested is promoted to RestartCompleted
/// once the restart is detected and the repair task was updated, or RetryableException if that update failed),
/// OperationCompleted for Completed/Restoring tasks,
/// and RepairTaskInvalidState for any other task state
/// </returns>
public async Task<NodeAgentSfUtilityExitCodes> GetWuOperationStateAsync(
    String nodeName,
    TimeSpan timeout,
    CancellationToken cancellationToken)
{
    RepairTask repairTask;
    try
    {
        repairTask = await RepairManagerHelper.GetRepairTaskForNode(
            this.fabricClient,
            nodeName,
            timeout,
            cancellationToken);
    }
    catch (Exception e)
    {
        ServiceEventSource.Current.ErrorMessage(
            String.Format("RepairManagerHelper.GetRepairTaskForNode failed. Exception details {0}", e));

        // Transient fabric errors are reported as retryable; everything else is a hard failure.
        return e is FabricTransientException
            ? NodeAgentSfUtilityExitCodes.RetryableException
            : NodeAgentSfUtilityExitCodes.Failure;
    }

    if (repairTask == null)
    {
        ServiceEventSource.Current.VerboseMessage(
            String.Format("No repair task found for this node, Operation State = {0}", NodeAgentSfUtilityExitCodes.None));
        return NodeAgentSfUtilityExitCodes.None;
    }

    ExecutorDataForRmTask executorData = SerializationUtility.Deserialize<ExecutorDataForRmTask>(repairTask.ExecutorData);

    // Write the approval time and timeout to a file beside the entry assembly.
    ExecutorDataForNtService ntServiceData = new ExecutorDataForNtService();
    ntServiceData.ApprovedDateTime = repairTask.ApprovedTimestamp;
    ntServiceData.ExecutorTimeoutInMinutes = executorData.ExecutorTimeoutInMinutes;

    string entryAssemblyDirectory = Path.GetDirectoryName(System.Reflection.Assembly.GetEntryAssembly().Location);
    string ntServiceDataPath = Path.Combine(entryAssemblyDirectory, ExecutorDataForNtServiceFileName);
    SerializationUtility.Serialize(ntServiceDataPath, ntServiceData);

    NodeAgentSfUtilityExitCodes resultState;
    switch (repairTask.State)
    {
        case RepairTaskState.Claimed:
        case RepairTaskState.Preparing:
            resultState = NodeAgentSfUtilityExitCodes.DownloadCompleted;
            break;

        case RepairTaskState.Approved:
            resultState = NodeAgentSfUtilityExitCodes.InstallationApproved;
            break;

        case RepairTaskState.Executing:
            resultState = executorData.ExecutorSubState;

            // A restart was requested earlier; if it has happened since, record that on the repair task.
            if (resultState == NodeAgentSfUtilityExitCodes.RestartRequested
                && this.GetRestartStatus(executorData.RestartRequestedTime))
            {
                string resultDetails = "Installation of the updates completed, Restart post installation completed successfully";
                NodeAgentSfUtilityExitCodes updateResult = await RepairManagerHelper.UpdateRepairTask(
                    this.fabricClient,
                    nodeName,
                    RepairTaskState.Executing,
                    RepairTaskResult.Pending,
                    resultDetails,
                    NodeAgentSfUtilityExitCodes.RestartCompleted,
                    timeout,
                    cancellationToken);

                if (updateResult == NodeAgentSfUtilityExitCodes.Success)
                {
                    resultState = NodeAgentSfUtilityExitCodes.RestartCompleted;
                }
                else
                {
                    ServiceEventSource.Current.ErrorMessage(
                        String.Format("Post restart, update of Repair task failed with {0}", updateResult));
                    resultState = NodeAgentSfUtilityExitCodes.RetryableException;
                }
            }

            break;

        case RepairTaskState.Completed:
        case RepairTaskState.Restoring:
            resultState = NodeAgentSfUtilityExitCodes.OperationCompleted;
            break;

        default:
            ServiceEventSource.Current.ErrorMessage(
                String.Format("Repair task for current node in unexpected state {0}", repairTask.State));
            resultState = NodeAgentSfUtilityExitCodes.RepairTaskInvalidState;
            break;
    }

    ServiceEventSource.Current.InfoMessage("GetWuOperationStateAsync returned {0}", resultState);
    return resultState;
}