/// <summary>
        /// Determines if a notification qualifies for a re-ack (i.e. another SignalReady or SignalError being sent to FC).
        /// This assessment could be ignored and <see cref="Update"/> could still be invoked successfully.
        /// In which scenarios does a re-ack come up?
        /// 1. Enough time hasn't elapsed to re-ack. (Post-UD notifications take a while to disappear on FC side)
        /// 2. This is really an "unmatched post". A new post shows up out of nowhere.
        /// 3. Somehow the job id/UDs don't match.
        /// 4. Failover where we no longer have the record of the last notification.
        /// 5. The ack is just not being accepted (node is in HI).
        /// </summary>
        /// <param name="other">The notification to compare against the recorded job id and target UD.</param>
        /// <returns>
        /// True only when <paramref name="other"/> is a CompleteJobStep notification that matches the recorded
        /// job id and target UD, and no update happened recently; false otherwise (including a null notification).
        /// </returns>
        public bool QualifiesForReAck(IManagementNotificationContext other)
        {
            // The traces below dereference the notification, so reject null up front
            // instead of failing with a NullReferenceException while logging.
            if (other == null)
            {
                Trace.WriteInfo(
                    TraceType,
                    "Notification does not qualify for a CompleteJobStep re-ack since there is no notification.");

                return false;
            }

            if (!other.Matches(this.JobId, this.JobStepTargetUD) || other.NotificationType != NotificationType.CompleteJobStep)
            {
                Trace.WriteInfo(
                    TraceType,
                    "Notification does not qualify for a CompleteJobStep re-ack since there is no existing match. JobId/UD/type: {0}/{1}/{2}",
                    other.ActiveJobId,
                    other.ActiveJobStepTargetUD,
                    other.NotificationType);

                return false;
            }

            if (IsRecentlyUpdated())
            {
                Trace.WriteInfo(
                    TraceType,
                    "Notification does not qualify for CompleteJobStep re-ack since an update happened recently. Last update time: {0}, JobId/UD: {1}/{2}",
                    this.UpdateTime,
                    other.ActiveJobId,
                    other.ActiveJobStepTargetUD);

                return false;
            }

            // The format string previously had only two placeholders for three arguments, so the
            // update time was logged as the job id and the UD was silently dropped.
            Trace.WriteInfo(
                TraceType,
                "Notification qualifies for CompleteJobStep re-ack. Last update time: {0}, JobId/UD: {1}/{2}",
                this.UpdateTime,
                other.ActiveJobId,
                other.ActiveJobStepTargetUD);

            return true;
        }
Ejemplo n.º 2
0
        /// <summary>
        /// Runs one repair-processing pass: new tasks are processed first, then ready tasks.
        /// The two phases are isolated from each other, so an exception in one is logged as a
        /// warning and does not stop the other from running.
        /// </summary>
        /// <param name="executeRepairs">Forwarded to the ready-task phase; true if this call should execute the next ready repair task, false otherwise</param>
        /// <param name="currentNotification">The current MR notification, forwarded to the ready-task phase</param>
        public void ProcessRepairs(bool executeRepairs, IManagementNotificationContext currentNotification)
        {
            bool repairManagerAvailable = this.RepairManagerServiceExists();

            if (!repairManagerAvailable)
            {
                // Nothing can be processed without the RepairManager service.
                Trace.WriteInfo(TraceType, "RepairManager service not created; skipping repair execution");
                return;
            }

            // Phase 1: new tasks. Failures are logged and deliberately not propagated.
            try
            {
                this.ProcessNewTasks();
            }
            catch (Exception newTasksError)
            {
                Trace.WriteWarning(TraceType, "ProcessNewTasks exception: {0}", newTasksError);
            }

            // Phase 2: ready tasks. Runs even when phase 1 failed.
            try
            {
                this.ProcessReadyTasks(executeRepairs, currentNotification);
            }
            catch (Exception readyTasksError)
            {
                Trace.WriteWarning(TraceType, "ProcessReadyTasks exception: {0}", readyTasksError);
            }
        }
Ejemplo n.º 3
0
        /// <summary>
        /// Fetches the current notification from the management protocol client and wraps it
        /// in a <see cref="NotificationContext"/>.
        /// </summary>
        /// <returns>A context wrapping the current notification, or null when the client returns null.</returns>
        /// <exception cref="ManagementException">
        /// Thrown when the client call fails; the original exception is attached as the inner exception.
        /// </exception>
        public IManagementNotificationContext GetCurrentNotification()
        {
            ManagementNotification current;

            try
            {
                current = this.client.GetCurrentNotification();
            }
            catch (Exception ex)
            {
                // Log the failure, then surface it to the caller as a ManagementException.
                string message = string.Format(CultureInfo.InvariantCulture, "Error getting current notification from management protocol. Exception: {0}", ex);
                Trace.WriteWarning(TraceType, message);

                throw new ManagementException(message, ex);
            }

            if (current == null)
            {
                return null;
            }

            return new NotificationContext(current);
        }
Ejemplo n.º 4
0
 /// <summary>
 /// Checks whether a notification refers to the given job id and upgrade domain.
 /// </summary>
 /// <param name="notification">The notification to inspect; may be null.</param>
 /// <param name="jobId">The job id to compare with the notification's active job id (case-insensitive, ordinal).</param>
 /// <param name="upgradeDomain">The upgrade domain to compare with the notification's active job step target UD.</param>
 /// <returns>
 /// False when <paramref name="notification"/> is null; otherwise true only when both the job id and
 /// the upgrade domain are equal. Two null job ids compare as equal.
 /// </returns>
 public static bool Matches(this IManagementNotificationContext notification, string jobId, int upgradeDomain)
 {
     // The static string.Equals is null-safe, so a notification without an active job id
     // results in "no match" instead of a NullReferenceException.
     return
         notification != null &&
         string.Equals(notification.ActiveJobId, jobId, StringComparison.OrdinalIgnoreCase) &&
         notification.ActiveJobStepTargetUD == upgradeDomain;
 }
Ejemplo n.º 5
0
        /// <summary>
        /// Checks whether the stored job impact data refers to the same job id (ordinal, case-insensitive)
        /// and upgrade domain as the given notification.
        /// </summary>
        /// <param name="notification">The notification to compare with the stored job impact data.</param>
        /// <returns>False when no job impact data is stored or when the job id or UD differ; true otherwise.</returns>
        private bool DoesAssessmentMatchNotification(IManagementNotificationContext notification)
        {
            // Short-circuiting keeps the notification untouched when there is no stored data.
            return currentJobImpactData != null
                && string.Equals(currentJobImpactData.JobId, notification.ActiveJobId, StringComparison.OrdinalIgnoreCase)
                && currentJobImpactData.UD == notification.ActiveJobStepTargetUD;
        }
Ejemplo n.º 6
0
        /// <summary>
        /// Builds a compact, colon-separated description of a notification in the form
        /// NotificationType:ActiveJobType:ActiveJobId:ActiveJobStepTargetUD.
        /// </summary>
        /// <param name="notification">The notification to describe; may be null.</param>
        /// <returns>The formatted description, or the literal "&lt;null&gt;" when the notification is null.</returns>
        public static string ToShortDisplayString(this IManagementNotificationContext notification)
        {
            return notification == null
                ? "<null>"
                : string.Format(
                    CultureInfo.InvariantCulture,
                    "{0}:{1}:{2}:{3}",
                    notification.NotificationType,
                    notification.ActiveJobType,
                    notification.ActiveJobId,
                    notification.ActiveJobStepTargetUD);
        }
Ejemplo n.º 7
0
        /// <summary>
        /// Gets the nodes involved in the current notification.
        /// </summary>
        /// <param name="notification">The notification whose impacted role instances are translated to node names.</param>
        /// <param name="queriedNodes">The known nodes, keyed by node name.</param>
        /// <returns>
        /// A map (ordinal, case-insensitive keys) from node name to node for every impacted instance
        /// whose translated node name is present in <paramref name="queriedNodes"/>. Instances without
        /// a corresponding queried node are skipped.
        /// </returns>
        private static Dictionary<string, INode> GetNodesInNotification(IManagementNotificationContext notification, Dictionary<string, INode> queriedNodes)
        {
            var impactedNodeMap = new Dictionary<string, INode>(StringComparer.OrdinalIgnoreCase);

            foreach (var instance in notification.ImpactedInstances)
            {
                var name = instance.Id.TranslateRoleInstanceToNodeName();

                // Single lookup instead of ContainsKey followed by the indexer.
                INode node;
                if (queriedNodes.TryGetValue(name, out node))
                {
                    impactedNodeMap[name] = node;
                }
            }

            return impactedNodeMap;
        }
Ejemplo n.º 8
0
        /// <summary>
        /// Decides whether a notification belongs to the given repair task. Only DeploymentMaintenanceJob
        /// notifications are considered. An exact match on the job context ID wins; otherwise the
        /// notification matches when the repair task's target node, translated to a role instance name,
        /// is among the notification's impacted instances.
        /// </summary>
        /// <param name="repairTask">The repair task to match.</param>
        /// <param name="notification">The notification to inspect; may be null.</param>
        /// <returns>True when the notification is considered related to the repair task; false otherwise.</returns>
        private static bool NotificationMatchesRepairTask(RepairTask repairTask, IManagementNotificationContext notification)
        {
            bool isMaintenanceJob =
                notification != null &&
                notification.ActiveJobType == JobType.DeploymentMaintenanceJob;

            if (!isMaintenanceJob)
            {
                return false;
            }

            // Exact match of the context ID provided when RequestMaintenance was called
            string actualContextId = notification.ActiveJobContextId;
            string expectedContextId = GenerateJobContextId(repairTask);

            if (string.Equals(actualContextId, expectedContextId))
            {
                return true;
            }

            // TODO return false here if strict matching is enabled

            // No exact match; try to guess if the job is related to the repair task
            string targetNodeName = GetTargetNodeName(repairTask);

            if (targetNodeName == null)
            {
                return false;
            }

            string expectedInstanceId = targetNodeName.TranslateNodeNameToRoleInstance();

            // TODO compare impact reasons?

            return notification.ImpactedInstances.Any(instance => instance.Id == expectedInstanceId);
        }
Ejemplo n.º 9
0
        /// <summary>
        /// Scans the impact reasons of every impacted role instance and reports whether any of them
        /// calls for manual approval (a vendor repair beginning or ending).
        /// </summary>
        /// <param name="notification">The management notification provided by the management protocol (MR); may be null.</param>
        /// <returns>
        /// True if any impacted instance lists VendorRepairBegin or VendorRepairEnd as an impact reason.
        /// False otherwise, including when the notification is null.
        /// </returns>
        public static bool IsManualApprovalRequired(this IManagementNotificationContext notification)
        {
            if (notification == null)
            {
                return false;
            }

            // Flatten all per-instance impact reasons and stop at the first vendor-repair reason.
            return notification.ImpactedInstances
                .SelectMany(instance => instance.ImpactReasons)
                .Any(reason =>
                    reason == ImpactReason.VendorRepairBegin ||
                    reason == ImpactReason.VendorRepairEnd);
        }
Ejemplo n.º 10
0
        /// <summary>
        /// Determines the job impact translation mode for a StartJobStep notification.
        /// If the stored assessment already matches the notification's job id and UD, its result is reused.
        /// Otherwise the current node list is queried and a new assessment is built. The new assessment
        /// replaces the stored one when there is no stored assessment, when the stored one has expired,
        /// or when it can be determined whether the previously assessed nodes restarted.
        /// </summary>
        /// <param name="notification">The notification to evaluate; must be of type StartJobStep.</param>
        /// <returns>
        /// The assessed impact: the stored value when it is reused, Default for a first or expired
        /// assessment, and Default or Optimized depending on whether the previously assessed nodes restarted.
        /// </returns>
        /// <exception cref="ArgumentException">Thrown when the notification is not a StartJobStep notification.</exception>
        public async Task<JobImpactTranslationMode> EvaluateJobImpactAsync(IManagementNotificationContext notification)
        {
            notification.Validate("notification");

            bool isStartJobStep = notification.NotificationType == NotificationType.StartJobStep;

            if (!isStartJobStep)
            {
                // this is a coding error
                throw new ArgumentException("Notification not relevant. Notification: {0}".ToString(notification.ToShortDisplayString()));
            }

            // Same job and UD as the stored assessment: reuse it without querying again.
            if (DoesAssessmentMatchNotification(notification))
            {
                return currentJobImpactData.AssessedImpact;
            }

            traceType.WriteInfo(
                "JobImpactData doesn't match, starting new evaluation. Notification: {0}, Current JobImpactData: {1}",
                notification.ToShortDisplayString(),
                currentJobImpactData == null ? "<null>" : currentJobImpactData.ToString());

            DateTimeOffset evaluationTime = DateTimeOffset.UtcNow;

            // There is already retry built into the QueryClient wrapper. If it goes beyond retry boundaries,
            // we'll let the caller handle this
            IList<INode> currentNodes = await QueryClient.GetNodeListAsync(evaluationTime).ConfigureAwait(false);

            var nodesByName = currentNodes.ToDictionary(node => node.NodeName, StringComparer.OrdinalIgnoreCase);

            Dictionary<string, INode> impactedNodes = GetNodesInNotification(notification, nodesByName);

            var candidate = new JobImpactData();
            candidate.JobId = notification.ActiveJobId;
            candidate.JobType = notification.ActiveJobType;
            candidate.UD = notification.ActiveJobStepTargetUD;
            candidate.AssessedNodes = impactedNodes;
            candidate.Timestamp = evaluationTime;
            candidate.AssessedImpact = JobImpactTranslationMode.Default;

            // no previous data
            if (currentJobImpactData == null)
            {
                currentJobImpactData = candidate;

                traceType.WriteInfo(
                    "New assessed job impact stored. Returning {0}, JobImpactData: {1}, Notification: {2}",
                    currentJobImpactData.AssessedImpact,
                    currentJobImpactData,
                    notification.ToShortDisplayString());

                return currentJobImpactData.AssessedImpact;
            }

            // has too much time passed after assessment?
            if (HasPreviousEvaluationExpired(evaluationTime, currentJobImpactData))
            {
                currentJobImpactData = candidate;

                traceType.WriteWarning(
                    "New assessed job impact stored. Time since last assessment is either invalid or has exceeded expiration time. Returning {0}. JobImpactData: {1}, Notification: {2}",
                    currentJobImpactData.AssessedImpact,
                    currentJobImpactData,
                    notification.ToShortDisplayString());

                return currentJobImpactData.AssessedImpact;
            }

            bool? nodesRestarted = DidPreviouslyAssessedNodesRestart(nodesByName, currentJobImpactData);

            if (!nodesRestarted.HasValue)
            {
                // Restart status is indeterminate: keep the stored assessment as it is.
                traceType.WriteInfo(
                    "Unable to assess job impact, continuing to use previous assessment. Returning {0}, JobImpactData: {1}, Notification: {2}",
                    currentJobImpactData.AssessedImpact,
                    currentJobImpactData,
                    notification.ToShortDisplayString());

                return currentJobImpactData.AssessedImpact;
            }

            currentJobImpactData = candidate;

            if (nodesRestarted.Value)
            {
                currentJobImpactData.AssessedImpact = JobImpactTranslationMode.Default;
            }
            else
            {
                currentJobImpactData.AssessedImpact = JobImpactTranslationMode.Optimized;
            }

            traceType.WriteInfo(
                "New assessed job impact stored. Returning {0}, JobImpactData: {1}, Notification: {2}",
                currentJobImpactData.AssessedImpact,
                currentJobImpactData,
                notification.ToShortDisplayString());

            return currentJobImpactData.AssessedImpact;
        }
Ejemplo n.º 11
0
        /// <summary>
        /// Classifies how the given repair task relates to the MR notification.
        /// </summary>
        /// <param name="repairTask">The repair task to compare.</param>
        /// <param name="notification">The MR notification to compare against.</param>
        /// <returns>
        /// None when the notification does not match the repair task; Complete when it matches and is a
        /// CompleteJobStep notification; InProgress for any other matching notification.
        /// </returns>
        private static MatchResult CompareRepairToNotification(RepairTask repairTask, IManagementNotificationContext notification)
        {
            bool matches = NotificationMatchesRepairTask(repairTask, notification);

            if (!matches)
            {
                return MatchResult.None;
            }

            return notification.NotificationType == NotificationType.CompleteJobStep
                ? MatchResult.Complete
                : MatchResult.InProgress;
        }
Ejemplo n.º 12
0
        /// <summary>
        /// Performs work on any repairs owned by this executor that are in the Approved or Executing states.
        /// Executing tasks are moved to restoring when their job completed or when they timed out; Approved
        /// tasks are cancelled on request or collected as candidates. Afterwards, at most one candidate
        /// (the one with the oldest creation time) is executed.
        /// </summary>
        /// <param name="executeRepairs">
        /// True if a new approved repair may be started by this call. It is forced to false inside the
        /// method when an executing task is found that is still in progress or has not yet timed out.
        /// </param>
        /// <param name="currentNotification">The current MR notification, compared against each executing task.</param>
        private void ProcessReadyTasks(bool executeRepairs, IManagementNotificationContext currentNotification)
        {
            TimeSpan          executionTimeout = this.GetRepairExecutionTimeout();
            List <RepairTask> approvedTasks    = new List <RepairTask>();

            foreach (var repairTask in this.GetReadyRepairTasks())
            {
                // Each task is handled in its own try/catch so one failing task does not block the rest.
                try
                {
                    if (repairTask.State == RepairTaskState.Executing)
                    {
                        MatchResult match = CompareRepairToNotification(repairTask, currentNotification);

                        if (match == MatchResult.Complete)
                        {
                            // The matching job step completed: record the outcome and move the task to restoring.
                            string details = string.Format(
                                CultureInfo.InvariantCulture,
                                "Completed Windows Azure job {0} ({1})",
                                currentNotification.ActiveJobId,
                                currentNotification.ActiveJobStepStatus);

                            this.UpdateRepairToRestoring(
                                repairTask,
                                TranslateJobStepStatusToRepairTaskResult(currentNotification.ActiveJobStepStatus),
                                details);
                        }
                        else if (match == MatchResult.InProgress)
                        {
                            // Saw an executing task that is in progress, don't do any other work
                            executeRepairs = false;

                            Trace.WriteInfo(TraceType,
                                            "Repair task {0}: currently executing (job = {1}, notif status = {2}, job step status = {3}, job detailed status = {4})",
                                            repairTask.TaskId,
                                            currentNotification.ActiveJobId,
                                            currentNotification.NotificationStatus,
                                            currentNotification.ActiveJobStepStatus,
                                            currentNotification.ActiveJobDetailedStatus);

                            string details = string.Format(
                                CultureInfo.InvariantCulture,
                                "Windows Azure job {0} ({1}/{2}/{3})",
                                currentNotification.ActiveJobId,
                                currentNotification.NotificationStatus,
                                currentNotification.ActiveJobStepStatus,
                                currentNotification.ActiveJobDetailedStatus);

                            // Publish details on the repair task, to aid troubleshooting stuck repairs
                            // Only written when the details text changed, to avoid redundant updates.
                            if (!details.Equals(repairTask.ResultDetails))
                            {
                                repairTask.ResultDetails = details;
                                // NOTE(review): blocks synchronously on the async update via Wait().
                                this.repairManager.UpdateRepairExecutionStateAsync(repairTask).Wait();
                            }
                        }
                        else if (DateTime.UtcNow - repairTask.ExecutingTimestamp > executionTimeout)
                        {
                            // No matching notification and the execution timeout has elapsed: fail the repair.
                            string message = string.Format(
                                CultureInfo.InvariantCulture,
                                "Repair timed out after being sent to Windows Azure for execution (timeout = {0})",
                                executionTimeout);

                            this.UpdateRepairToRestoring(
                                repairTask,
                                RepairTaskResult.Failed,
                                message);
                        }
                        else
                        {
                            // Saw an executing task that has not timed out, don't do any other work
                            executeRepairs = false;

                            Trace.WriteInfo(TraceType, "Repair task {0}: currently executing (unknown job)", repairTask.TaskId);
                        }
                    }
                    else if (repairTask.State == RepairTaskState.Approved)
                    {
                        if (repairTask.Flags.HasFlag(RepairTaskFlags.CancelRequested))
                        {
                            // Cancellation was requested before execution started: finish the task as cancelled.
                            this.UpdateRepairToRestoring(
                                repairTask,
                                RepairTaskResult.Cancelled,
                                "Cancelled by user request");
                        }
                        else
                        {
                            // Candidate for execution; the actual pick happens after the loop.
                            approvedTasks.Add(repairTask);
                        }
                    }
                }
                catch (Exception e)
                {
                    Trace.WriteWarning(TraceType, "Failed to process ready task {0}: {1}", repairTask.TaskId, e);
                }
            }

            if (executeRepairs)
            {
                // Start executing the approved repair (if any) that has the oldest creation time
                var oldestApprovedTask = approvedTasks.OrderBy(t => t.CreatedTimestamp).FirstOrDefault();
                if (oldestApprovedTask == null)
                {
                    Trace.WriteInfo(TraceType, "No new repairs to send to Windows Azure");
                }
                else
                {
                    Trace.WriteInfo(TraceType, "Executing repair with oldest approved task Id: {0}", oldestApprovedTask.TaskId);
                    this.ExecuteRepair(oldestApprovedTask);
                }
            }
            else
            {
                // Either the caller disabled execution or an executing task is still outstanding.
                Trace.WriteInfo(TraceType, "Not executing new repairs; approved count = {0}", approvedTasks.Count);
            }
        }