/// <summary>
/// Determines if a notification qualifies for a re-ack (i.e. another SignalReady or SignalError being sent to FC).
/// This assessment could be ignored and <see cref="Update"/> could still be invoked successfully.
/// In what scenarios does this happen?
/// 1. Enough time hasn't elapsed to re-ack. (Post-UD notifications take a while to disappear on the FC side.)
/// 2. This is really an "unmatched post". A new post shows up out of nowhere.
/// 3. Somehow the job id/UDs don't match.
/// 4. Failover where we no longer have the record of the last notification.
/// 5. The ack is just not being accepted (node is in HI).
/// </summary>
/// <param name="other">The notification to compare against the job id and target UD recorded on this instance.</param>
/// <returns>
/// true when <paramref name="other"/> is a CompleteJobStep notification matching this instance's job id/UD
/// and this instance was not recently updated; false otherwise.
/// </returns>
public bool QualifiesForReAck(IManagementNotificationContext other)
{
    if (!other.Matches(this.JobId, this.JobStepTargetUD) || other.NotificationType != NotificationType.CompleteJobStep)
    {
        Trace.WriteInfo(
            TraceType,
            "Notification does not qualify for a CompleteJobStep re-ack since there is no existing match. JobId/UD/type: {0}/{1}/{2}",
            other.ActiveJobId,
            other.ActiveJobStepTargetUD,
            other.NotificationType);

        return false;
    }

    if (IsRecentlyUpdated())
    {
        Trace.WriteInfo(
            TraceType,
            "Notification does not qualify for CompleteJobStep re-ack since an update happened recently. Last update time: {0}, JobId/UD: {1}/{2}",
            this.UpdateTime,
            other.ActiveJobId,
            other.ActiveJobStepTargetUD);

        return false;
    }

    // The format string has one placeholder per argument so that the update time, job id and UD
    // are each labeled correctly in the trace.
    Trace.WriteInfo(
        TraceType,
        "Notification qualifies for CompleteJobStep re-ack. Last update time: {0}, JobId/UD: {1}/{2}",
        this.UpdateTime,
        other.ActiveJobId,
        other.ActiveJobStepTargetUD);

    return true;
}
/// <summary>
/// Runs one repair-processing pass: the new-task step followed by the ready-task step.
/// The pass is skipped entirely when the RepairManager service does not exist.
/// </summary>
/// <param name="executeRepairs">true if this call should execute the next ready repair task, false otherwise</param>
/// <param name="currentNotification">The current MR notification</param>
public void ProcessRepairs(bool executeRepairs, IManagementNotificationContext currentNotification)
{
    bool repairManagerAvailable = this.RepairManagerServiceExists();

    if (repairManagerAvailable)
    {
        // Each step is guarded on its own: a failure is traced as a warning and the
        // remaining step still runs.
        try
        {
            this.ProcessNewTasks();
        }
        catch (Exception newTasksError)
        {
            Trace.WriteWarning(TraceType, "ProcessNewTasks exception: {0}", newTasksError);
        }

        try
        {
            this.ProcessReadyTasks(executeRepairs, currentNotification);
        }
        catch (Exception readyTasksError)
        {
            Trace.WriteWarning(TraceType, "ProcessReadyTasks exception: {0}", readyTasksError);
        }
    }
    else
    {
        Trace.WriteInfo(TraceType, "RepairManager service not created; skipping repair execution");
    }
}
/// <summary>
/// Asks the management protocol client for the current notification and wraps it in a
/// <see cref="NotificationContext"/>.
/// </summary>
/// <returns>The wrapped notification, or null when the client returns no notification.</returns>
/// <exception cref="ManagementException">
/// Thrown when the client call fails; the original exception is traced as a warning and attached
/// as the inner exception.
/// </exception>
public IManagementNotificationContext GetCurrentNotification()
{
    ManagementNotification rawNotification;

    try
    {
        rawNotification = this.client.GetCurrentNotification();
    }
    catch (Exception ex)
    {
        string errorText = string.Format(
            CultureInfo.InvariantCulture,
            "Error getting current notification from management protocol. Exception: {0}",
            ex);

        Trace.WriteWarning(TraceType, errorText);
        throw new ManagementException(errorText, ex);
    }

    if (rawNotification == null)
    {
        return null;
    }

    return new NotificationContext(rawNotification);
}
/// <summary>
/// Checks whether a notification refers to the given job id and upgrade domain.
/// </summary>
/// <param name="notification">The notification to check; may be null.</param>
/// <param name="jobId">Job id compared (ordinal, case-insensitive) with the notification's active job id.</param>
/// <param name="upgradeDomain">Upgrade domain compared with the notification's active job step target UD.</param>
/// <returns>
/// true when <paramref name="notification"/> is non-null and both its job id and target UD match;
/// false otherwise.
/// </returns>
public static bool Matches(this IManagementNotificationContext notification, string jobId, int upgradeDomain)
{
    if (notification == null)
    {
        return false;
    }

    // The static string.Equals tolerates a null ActiveJobId, whereas calling Equals on the
    // instance would throw a NullReferenceException. Two null ids compare as equal.
    return string.Equals(notification.ActiveJobId, jobId, StringComparison.OrdinalIgnoreCase)
        && notification.ActiveJobStepTargetUD == upgradeDomain;
}
/// <summary>
/// Tells whether the stored job impact data carries the same job id (ordinal, case-insensitive)
/// and UD as the given notification.
/// </summary>
/// <param name="notification">The notification whose active job id and target UD are compared.</param>
/// <returns>false when no job impact data is stored or when either value differs; true otherwise.</returns>
private bool DoesAssessmentMatchNotification(IManagementNotificationContext notification)
{
    if (currentJobImpactData != null)
    {
        bool sameJob = string.Equals(
            currentJobImpactData.JobId,
            notification.ActiveJobId,
            StringComparison.OrdinalIgnoreCase);

        // The UD is only compared once the job ids agree.
        return sameJob && currentJobImpactData.UD == notification.ActiveJobStepTargetUD;
    }

    return false;
}
/// <summary>
/// Builds a compact, colon-separated description of a notification for tracing:
/// notification type, job type, job id and target UD, in that order.
/// </summary>
/// <param name="notification">The notification to describe; may be null.</param>
/// <returns>The formatted description, or the literal "&lt;null&gt;" for a null notification.</returns>
public static string ToShortDisplayString(this IManagementNotificationContext notification)
{
    return notification == null
        ? "<null>"
        : string.Format(
            CultureInfo.InvariantCulture,
            "{0}:{1}:{2}:{3}",
            notification.NotificationType,
            notification.ActiveJobType,
            notification.ActiveJobId,
            notification.ActiveJobStepTargetUD);
}
/// <summary>
/// Gets the nodes involved in the current notification.
/// </summary>
/// <param name="notification">The notification whose impacted instances are translated to node names.</param>
/// <param name="queriedNodes">Known nodes keyed by node name; only nodes present here are returned.</param>
/// <returns>
/// A new map (keyed case-insensitively by node name) of the impacted instances that were found in
/// <paramref name="queriedNodes"/>. Instances with no corresponding queried node are left out.
/// </returns>
private static Dictionary <string, INode> GetNodesInNotification(IManagementNotificationContext notification, Dictionary <string, INode> queriedNodes)
{
    var impactedNodeMap = new Dictionary <string, INode>(StringComparer.OrdinalIgnoreCase);

    foreach (var instance in notification.ImpactedInstances)
    {
        var name = instance.Id.TranslateRoleInstanceToNodeName();

        // A single TryGetValue replaces the ContainsKey + indexer pair (one lookup instead of two).
        INode node;
        if (queriedNodes.TryGetValue(name, out node))
        {
            impactedNodeMap[name] = node;
        }
    }

    return impactedNodeMap;
}
/// <summary>
/// Decides whether a notification belongs to the given repair task. Only notifications for a
/// DeploymentMaintenanceJob are considered. An exact job context id match wins; otherwise the
/// notification is treated as related when one of its impacted instances is the repair's target.
/// </summary>
/// <param name="repairTask">The repair task to match.</param>
/// <param name="notification">The notification to inspect; may be null.</param>
/// <returns>true when the notification is considered to belong to the repair task; false otherwise.</returns>
private static bool NotificationMatchesRepairTask(RepairTask repairTask, IManagementNotificationContext notification)
{
    if (notification == null || notification.ActiveJobType != JobType.DeploymentMaintenanceJob)
    {
        return false;
    }

    // Exact match of the context ID provided when RequestMaintenance was called
    bool exactContextMatch = string.Equals(notification.ActiveJobContextId, GenerateJobContextId(repairTask));
    if (exactContextMatch)
    {
        return true;
    }

    // TODO return false here if strict matching is enabled

    // No exact match; try to guess if the job is related to the repair task
    string targetNodeName = GetTargetNodeName(repairTask);
    if (targetNodeName == null)
    {
        return false;
    }

    string targetRoleInstance = targetNodeName.TranslateNodeNameToRoleInstance();

    // TODO compare impact reasons?
    return notification.ImpactedInstances.Any(instance => instance.Id == targetRoleInstance);
}
/// <summary>
/// Scans the impact reasons of every impacted role instance and reports whether any of them
/// is severe enough to need manual approval (a vendor repair begin or end).
/// </summary>
/// <param name="notification">The management notification provided by the management protocol (MR)</param>
/// <returns>True if manual approval is required. False otherwise, including for a null notification.</returns>
public static bool IsManualApprovalRequired(this IManagementNotificationContext notification)
{
    if (notification == null)
    {
        return false;
    }

    // Flatten all per-instance reasons and stop at the first vendor-repair reason found.
    return notification.ImpactedInstances
        .SelectMany(instance => instance.ImpactReasons)
        .Any(reason => reason == ImpactReason.VendorRepairBegin || reason == ImpactReason.VendorRepairEnd);
}
/// <summary>
/// Evaluates which job impact translation mode applies to a StartJobStep notification.
/// The stored assessment is reused when it was made for the same job id and UD; otherwise the
/// node list is queried and a new assessment is computed and, in most paths, stored.
/// </summary>
/// <param name="notification">The notification to evaluate. Its type must be StartJobStep.</param>
/// <returns>The assessed impact (either the stored one or the newly computed one).</returns>
/// <exception cref="ArgumentException">Thrown when the notification type is not StartJobStep.</exception>
public async Task <JobImpactTranslationMode> EvaluateJobImpactAsync(IManagementNotificationContext notification)
{
    // NOTE(review): Validate is defined elsewhere; presumably it rejects a null notification - confirm.
    notification.Validate("notification");

    if (notification.NotificationType != NotificationType.StartJobStep)
    {
        // this is a coding error
        throw new ArgumentException("Notification not relevant. Notification: {0}".ToString(notification.ToShortDisplayString()));
    }

    // Same job id and UD as the stored assessment: reuse it without querying nodes again.
    if (DoesAssessmentMatchNotification(notification))
    {
        return(currentJobImpactData.AssessedImpact);
    }

    traceType.WriteInfo(
        "JobImpactData doesn't match, starting new evaluation. Notification: {0}, Current JobImpactData: {1}",
        notification.ToShortDisplayString(),
        currentJobImpactData != null ? currentJobImpactData.ToString() : "<null>");

    // Single timestamp used for the node query, the new record and the expiry check below.
    var now = DateTimeOffset.UtcNow;

    // There is already retry built into the QueryClient wrapper. If it goes beyond retry boundaries,
    // we'll let the caller handle this
    IList <INode> nodeList = await QueryClient.GetNodeListAsync(now).ConfigureAwait(false);

    // Index the queried nodes by name (case-insensitive) and pick out those named in the notification.
    var queriedNodes = nodeList.ToDictionary(e => e.NodeName, StringComparer.OrdinalIgnoreCase);
    Dictionary <string, INode> nodesToBeImpacted = GetNodesInNotification(notification, queriedNodes);

    // Candidate record for this notification; the impact starts as Default and is only
    // changed in the final path below.
    var newJobImpactData = new JobImpactData
    {
        JobId = notification.ActiveJobId,
        JobType = notification.ActiveJobType,
        UD = notification.ActiveJobStepTargetUD,
        AssessedNodes = nodesToBeImpacted,
        Timestamp = now,
        AssessedImpact = JobImpactTranslationMode.Default,
    };

    // no previous data: store the candidate as-is (Default impact)
    if (currentJobImpactData == null)
    {
        currentJobImpactData = newJobImpactData;

        traceType.WriteInfo(
            "New assessed job impact stored. Returning {0}, JobImpactData: {1}, Notification: {2}",
            currentJobImpactData.AssessedImpact,
            currentJobImpactData,
            notification.ToShortDisplayString());

        return(currentJobImpactData.AssessedImpact);
    }

    // has too much time passed after assessment? If so, replace the stored data with the
    // candidate (Default impact) and trace a warning.
    bool expired = HasPreviousEvaluationExpired(now, currentJobImpactData);
    if (expired)
    {
        currentJobImpactData = newJobImpactData;

        traceType.WriteWarning(
            "New assessed job impact stored. Time since last assessment is either invalid or has exceeded expiration time. Returning {0}. JobImpactData: {1}, Notification: {2}",
            currentJobImpactData.AssessedImpact,
            currentJobImpactData,
            notification.ToShortDisplayString());

        return(currentJobImpactData.AssessedImpact);
    }

    // Tri-state result: null is handled below as "unable to assess", in which case the
    // previous assessment is kept and the candidate is discarded.
    bool?restarted = DidPreviouslyAssessedNodesRestart(queriedNodes, currentJobImpactData);
    if (restarted == null)
    {
        traceType.WriteInfo(
            "Unable to assess job impact, continuing to use previous assessment. Returning {0}, JobImpactData: {1}, Notification: {2}",
            currentJobImpactData.AssessedImpact,
            currentJobImpactData,
            notification.ToShortDisplayString());

        return(currentJobImpactData.AssessedImpact);
    }

    // Store the candidate; previously assessed nodes restarting yields Default, otherwise Optimized.
    currentJobImpactData = newJobImpactData;
    currentJobImpactData.AssessedImpact = restarted.Value ? JobImpactTranslationMode.Default : JobImpactTranslationMode.Optimized;

    traceType.WriteInfo(
        "New assessed job impact stored. Returning {0}, JobImpactData: {1}, Notification: {2}",
        currentJobImpactData.AssessedImpact,
        currentJobImpactData,
        notification.ToShortDisplayString());

    return(currentJobImpactData.AssessedImpact);
}
/// <summary>
/// Classifies the relationship between a repair task and the MR notification: unrelated,
/// related and complete, or related and still in progress.
/// </summary>
/// <param name="repairTask">The repair task to compare.</param>
/// <param name="notification">The MR notification to compare against.</param>
/// <returns>
/// <see cref="MatchResult.None"/> when the notification does not match the repair task;
/// <see cref="MatchResult.Complete"/> when it matches and is a CompleteJobStep notification;
/// <see cref="MatchResult.InProgress"/> for any other matching notification.
/// </returns>
private static MatchResult CompareRepairToNotification(RepairTask repairTask, IManagementNotificationContext notification)
{
    bool isRelated = NotificationMatchesRepairTask(repairTask, notification);
    if (!isRelated)
    {
        return MatchResult.None;
    }

    return notification.NotificationType == NotificationType.CompleteJobStep
        ? MatchResult.Complete
        : MatchResult.InProgress;
}
/// <summary>
/// Performs work on any repairs owned by this executor that are in the Approved or Executing states.
/// Executing tasks are moved to restoring when their job has completed or when they have timed out;
/// approved tasks are either cancelled (when cancellation was requested) or collected, and at most one
/// collected task (the oldest by creation time) is executed at the end of the pass.
/// </summary>
/// <param name="executeRepairs">
/// Whether a new approved repair may be started in this pass. It is forced to false locally when an
/// executing task is found that has neither completed nor timed out.
/// </param>
/// <param name="currentNotification">The current MR notification that executing tasks are compared against.</param>
private void ProcessReadyTasks(bool executeRepairs, IManagementNotificationContext currentNotification)
{
    TimeSpan executionTimeout = this.GetRepairExecutionTimeout();
    List <RepairTask> approvedTasks = new List <RepairTask>();

    foreach (var repairTask in this.GetReadyRepairTasks())
    {
        // Each task is processed in its own try block so that a failure on one task is traced
        // and the remaining tasks are still processed.
        try
        {
            if (repairTask.State == RepairTaskState.Executing)
            {
                MatchResult match = CompareRepairToNotification(repairTask, currentNotification);

                if (match == MatchResult.Complete)
                {
                    // The matching job step has completed: move the repair to restoring with a result
                    // translated from the job step status.
                    string details = string.Format(
                        CultureInfo.InvariantCulture,
                        "Completed Windows Azure job {0} ({1})",
                        currentNotification.ActiveJobId,
                        currentNotification.ActiveJobStepStatus);

                    this.UpdateRepairToRestoring(
                        repairTask,
                        TranslateJobStepStatusToRepairTaskResult(currentNotification.ActiveJobStepStatus),
                        details);
                }
                else if (match == MatchResult.InProgress)
                {
                    // Saw an executing task that is in progress, don't do any other work
                    executeRepairs = false;

                    Trace.WriteInfo(TraceType, "Repair task {0}: currently executing (job = {1}, notif status = {2}, job step status = {3}, job detailed status = {4})",
                        repairTask.TaskId,
                        currentNotification.ActiveJobId,
                        currentNotification.NotificationStatus,
                        currentNotification.ActiveJobStepStatus,
                        currentNotification.ActiveJobDetailedStatus);

                    string details = string.Format(
                        CultureInfo.InvariantCulture,
                        "Windows Azure job {0} ({1}/{2}/{3})",
                        currentNotification.ActiveJobId,
                        currentNotification.NotificationStatus,
                        currentNotification.ActiveJobStepStatus,
                        currentNotification.ActiveJobDetailedStatus);

                    // Publish details on the repair task, to aid troubleshooting stuck repairs.
                    // The update is only sent when the details text has changed; the async call is
                    // waited on synchronously.
                    if (!details.Equals(repairTask.ResultDetails))
                    {
                        repairTask.ResultDetails = details;
                        this.repairManager.UpdateRepairExecutionStateAsync(repairTask).Wait();
                    }
                }
                else if (DateTime.UtcNow - repairTask.ExecutingTimestamp > executionTimeout)
                {
                    // No matching notification and the task has been executing longer than the
                    // timeout: fail the repair.
                    string message = string.Format(
                        CultureInfo.InvariantCulture,
                        "Repair timed out after being sent to Windows Azure for execution (timeout = {0})",
                        executionTimeout);

                    this.UpdateRepairToRestoring(
                        repairTask,
                        RepairTaskResult.Failed,
                        message);
                }
                else
                {
                    // Saw an executing task that has not timed out, don't do any other work
                    executeRepairs = false;

                    Trace.WriteInfo(TraceType, "Repair task {0}: currently executing (unknown job)", repairTask.TaskId);
                }
            }
            else if (repairTask.State == RepairTaskState.Approved)
            {
                if (repairTask.Flags.HasFlag(RepairTaskFlags.CancelRequested))
                {
                    // Cancellation was requested before execution started: finish the task as cancelled.
                    this.UpdateRepairToRestoring(
                        repairTask,
                        RepairTaskResult.Cancelled,
                        "Cancelled by user request");
                }
                else
                {
                    // Candidate for execution; the choice among candidates is made after the loop.
                    approvedTasks.Add(repairTask);
                }
            }
        }
        catch (Exception e)
        {
            Trace.WriteWarning(TraceType, "Failed to process ready task {0}: {1}", repairTask.TaskId, e);
        }
    }

    if (executeRepairs)
    {
        // Start executing the approved repair (if any) that has the oldest creation time
        var oldestApprovedTask = approvedTasks.OrderBy(t => t.CreatedTimestamp).FirstOrDefault();
        if (oldestApprovedTask == null)
        {
            Trace.WriteInfo(TraceType, "No new repairs to send to Windows Azure");
        }
        else
        {
            Trace.WriteInfo(TraceType, "Executing repair with oldest approved task Id: {0}", oldestApprovedTask.TaskId);
            this.ExecuteRepair(oldestApprovedTask);
        }
    }
    else
    {
        // Either the caller disallowed execution or an executing task was seen above.
        Trace.WriteInfo(TraceType, "Not executing new repairs; approved count = {0}", approvedTasks.Count);
    }
}