示例#1
0
 // Reports the given health property as OK through the watchdog and then blocks
 // long enough for the policy engine (PE) to consider the node Healthy again.
 //   watchdog - watchdog used to send the OK report.
 //   property - name of the health property to clear.
 //   executor - executor whose ProbationToHealthyPostRepair is added to the wait.
 private void ClearHealthError(MockWatchdog watchdog, string property, MockRepairExecutor executor)
 {
     watchdog.ReportOk(property);

     // First wait: minimum healthy duration plus one scheduling interval, so the
     // internal PE state has a chance to flip back to Healthy.
     var settleDelay = _minimumHealthyDuration + _actionSchedulingInterval;
     Thread.Sleep(settleDelay);

     // Second wait: the executor's post-repair probation-to-healthy period.
     var probationDelay = executor.ProbationToHealthyPostRepair;
     Thread.Sleep(probationDelay);
 }
示例#2
0
        // Verifies that a repair task created before KillPrimaryReplica() is still the
        // one returned afterwards, i.e. no second task is created for the same error.
        // NOTE(review): KillPrimaryReplica is presumably forcing a failover of the
        // policy engine's primary replica - confirm against its definition.
        public void Onebox_Successful_ReplicationAfterRepairCreated()
        {
            const string property = "Onebox_Successful_ReplicationAfterRepairCreated";

            MockRepairExecutor executor = Executors.First();

            LogHelper.Log("Starting test {0} for executor {1}", property, executor.Name);

            // Inject error
            Node         node     = GetHealthyNode(_fabricClient);
            MockWatchdog watchdog = new MockWatchdog(_fabricClient.HealthManager, node.NodeName);

            watchdog.ReportError(property);

            // Wait for PE to schedule the task
            RepairTask createdTask = PollingForCreatedTask(executor, node);

            Assert.IsNotNull(createdTask, "There must be a repair task created.");

            KillPrimaryReplica();

            RepairTask newTask = PollingForCreatedTask(executor, node);

            // PollingForCreatedTask returns null on timeout; fail with a clear assertion
            // instead of a NullReferenceException on the TaskId comparison below.
            Assert.IsNotNull(newTask, "There must still be a repair task after the primary replica is killed.");

            Assert.IsTrue(newTask.TaskId.Equals(createdTask.TaskId, StringComparison.OrdinalIgnoreCase),
                          "The repair task should be the same created before");
        }
示例#3
0
        // Verifies that after a created repair task is cancelled and deleted, clearing
        // and re-reporting the same error leads to a new repair task being created.
        public void Onebox_Successful_DeleteRepairInTheMiddle()
        {
            const string property = "Onebox_Successful_DeleteRepairInTheMiddle";

            MockRepairExecutor executor = Executors.First();

            LogHelper.Log("Starting test {0} for executor {1}", property, executor.Name);

            // Inject error
            Node         node     = GetHealthyNode(_fabricClient);
            MockWatchdog watchdog = new MockWatchdog(_fabricClient.HealthManager, node.NodeName);

            watchdog.ReportError(property);

            RepairTask createdTask = PollingForCreatedTask(executor, node);

            Assert.IsNotNull(createdTask, "There must be a repair task created.");

            // Cancel the task and block until the request completes. The value the
            // call returns is not needed by this test, so it is not stored.
            _fabricClient.RepairManager.CancelRepairTaskAsync(createdTask.TaskId, 0, true).Wait();

            // Delete the repair task
            MockRepairExecutor.DeleteTask(createdTask, _fabricClient.RepairManager);

            // Report OK and wait so the error can be injected again from a clean state
            ClearHealthError(watchdog, property, executor);

            // Report Error
            watchdog.ReportError(property);

            createdTask = PollingForCreatedTask(executor, node);
            Assert.IsNotNull(createdTask, "There must be a repair task created after deletion.");

            // Complete the repair task
            executor.Complete(createdTask);
        }
示例#4
0
        // Verifies that when the health error clears after the repair task has been
        // approved, the task ends up with the CancelRequested flag set.
        public void Onebox_Successful_SelfRepairAfterTaskIsApproved()
        {
            // Property name matches this test so the log line and the reported health
            // property identify the right test.
            const string property = "Onebox_Successful_SelfRepairAfterTaskIsApproved";

            MockRepairExecutor executor = Executors.First();

            LogHelper.Log("Starting test {0}", property);

            Node node = GetHealthyNode(_fabricClient);
            // Inject error
            MockWatchdog watchdog = new MockWatchdog(_fabricClient.HealthManager, node.NodeName);

            watchdog.ReportError(property);

            RepairTask createdTask = PollingForCreatedTask(executor, node);

            Assert.IsNotNull(createdTask, "There must be a repair task created.");

            executor.Approve(ref createdTask);

            ClearHealthError(watchdog, property, executor);

            // Make sure the task is cancelled
            Thread.Sleep(_repairManagerLatency);
            RepairTaskList repairTaskList = _fabricClient.RepairManager.GetRepairTaskListAsync(createdTask.TaskId, RepairTaskStateFilter.All, null).Result;

            // Count == 1 already implies the list is non-empty.
            Assert.IsTrue(repairTaskList.Count == 1, "There must be a unique repair task with task id = {0}", createdTask.TaskId);

            RepairTask cancelledTask = repairTaskList.First();

            Assert.IsTrue(cancelledTask.Flags == RepairTaskFlags.CancelRequested, "PE should request cancellation of the task");
        }
示例#5
0
        // Verifies that a completed repair shows up as a new entry in the node's
        // "RepairPolicyEngineService::RepairHistory" health event after
        // KillPrimaryReplica() has been called.
        // The history description is parsed as ';'-separated entries of the form
        // "<repair type>=<date time>", with a trailing ';'.
        public void Onebox_Successful_ReplicationAfterRepairCompleted()
        {
            // Property name matches this test; it was previously copy-pasted from the
            // "...Cancelled" test, so both tests reported the same health property.
            const string property              = "Onebox_Successful_ReplicationAfterRepairCompleted";
            const string repairHistoryProperty = "RepairPolicyEngineService::RepairHistory";

            MockRepairExecutor executor = Executors.First();

            LogHelper.Log("Starting test {0} for executor {1}", property, executor.Name);

            // Inject error
            Node         node     = GetHealthyNode(_fabricClient);
            MockWatchdog watchdog = new MockWatchdog(_fabricClient.HealthManager, node.NodeName);

            watchdog.ReportError(property);

            // Snapshot the history before the repair so a change can be detected later
            var nodeHealth       = _fabricClient.HealthManager.GetNodeHealthAsync(node.NodeName).Result;
            var oldRepairHistory = nodeHealth.HealthEvents.Single(e => e.HealthInformation.Property.Equals(repairHistoryProperty,
                                                                                                           StringComparison.OrdinalIgnoreCase));

            // Wait for PE to schedule the task
            RepairTask createdTask = PollingForCreatedTask(executor, node);

            Assert.IsNotNull(createdTask, "There must be a repair task created.");
            DateTime scheduledAt = DateTime.Now;

            executor.Complete(createdTask);

            KillPrimaryReplica();

            LogHelper.Log("Waiting for the RepairHistory in health store to have the latest repair");
            // Make sure the RepairHistory has the entry within 5 minutes
            string   newRepairDescription = null;
            TimeSpan timeout   = TimeSpan.FromMinutes(5);
            TimeSpan sleepTime = TimeSpan.FromSeconds(10);

            while (timeout >= TimeSpan.Zero)
            {
                nodeHealth = _fabricClient.HealthManager.GetNodeHealthAsync(node.NodeName).Result;
                var repairHistory = nodeHealth.HealthEvents.Single(e => e.HealthInformation.Property.Equals(repairHistoryProperty,
                                                                                                            StringComparison.OrdinalIgnoreCase));
                if (!repairHistory.HealthInformation.Description.Equals(oldRepairHistory.HealthInformation.Description, StringComparison.OrdinalIgnoreCase))
                {
                    string[] descriptions = repairHistory.HealthInformation.Description.Split(';');

                    // Without at least one ';' there is no "second to last" element;
                    // fail with a readable message instead of an index exception.
                    Assert.IsTrue(descriptions.Length >= 2, "Repair history description must contain at least one ';'-terminated entry");
                    newRepairDescription = descriptions[descriptions.Length - 2]; // Last element is empty
                    break;
                }
                Thread.Sleep(sleepTime);
                timeout -= sleepTime;
            }
            Assert.IsFalse(String.IsNullOrEmpty(newRepairDescription), "There must be a new repair description");

            // Entry format: "<repair type>=<date time>"
            string[] entryParts = newRepairDescription.Split('=');
            string   repairType = entryParts.First();

            Assert.IsTrue(repairType.Equals(executor.Name, StringComparison.OrdinalIgnoreCase), "Repair type is the same as executor name");
            DateTime repairScheduledAt = DateTime.Parse(entryParts.Last());

            Assert.IsTrue(repairScheduledAt >= scheduledAt, "There should be a new entry in repair history");
        }
示例#6
0
        // Per-test cleanup: clears the watchdog error on every node returned by the
        // node list query, then cancels the tasks of all known executors.
        public static void TestCleanup()
        {
            LogHelper.Log("TestCleanup()");

            // Clear health store, one watchdog per node
            foreach (var clusterNode in _fabricClient.QueryManager.GetNodeListAsync().Result)
            {
                new MockWatchdog(_fabricClient.HealthManager, clusterNode.NodeName).ClearError();
            }

            // Cancel existing tasks
            MockRepairExecutor.CancelAll(Executors, _fabricClient.RepairManager);
        }
示例#7
0
        // Verifies that a repair task is cancelled once the error is reported OK, and
        // that after KillPrimaryReplica() there is still exactly one cancelled task
        // for the node (no duplicated cancel request).
        public void Onebox_Successful_ReplicationAfterRepairCancelled()
        {
            const string property = "Onebox_Successful_ReplicationAfterRepairCancelled";

            MockRepairExecutor executor = Executors.First();

            LogHelper.Log("Starting test {0} for executor {1}", property, executor.Name);

            // Inject error
            Node         node     = GetHealthyNode(_fabricClient);
            MockWatchdog watchdog = new MockWatchdog(_fabricClient.HealthManager, node.NodeName);

            watchdog.ReportError(property);

            // Wait for PE to schedule the task
            RepairTask createdTask = PollingForCreatedTask(executor, node);

            Assert.IsNotNull(createdTask, "There must be a repair task created.");

            // Assert on the nullable itself. The previous form passed the boolean
            // result of "!= null" to IsNotNull, which can never fail.
            Assert.IsTrue(createdTask.CreatedTimestamp.HasValue, "CreatedTimestamp must not be null");
            DateTime firstCreated = createdTask.CreatedTimestamp.Value;

            watchdog.ReportOk(property);

            // Make sure the task is cancelled
            Thread.Sleep(_minimumHealthyDuration + _actionSchedulingInterval + _repairManagerLatency);
            RepairTask cancelledTask = _fabricClient.RepairManager.GetRepairTaskListAsync(createdTask.TaskId, RepairTaskStateFilter.All, null).Result.Single();

            Assert.IsTrue(cancelledTask.State == RepairTaskState.Completed && cancelledTask.ResultStatus == RepairTaskResult.Cancelled,
                          "PE should request to cancel the task. Current State = {0}. Result Status = {1}", cancelledTask.State, cancelledTask.ResultStatus);

            KillPrimaryReplica();

            // Give PE the same amount of time again before checking for duplicates
            Thread.Sleep(_minimumHealthyDuration + _actionSchedulingInterval + _repairManagerLatency);
            var repairList = _fabricClient.RepairManager.GetRepairTaskListAsync(createdTask.TaskId, RepairTaskStateFilter.Completed, null).Result;

            // Cancelled tasks targeting this node that were created no earlier than the original task
            var query = from repairTask in repairList
                        where repairTask.Target is NodeRepairTargetDescription &&
                        ((NodeRepairTargetDescription)repairTask.Target).Nodes.First() == node.NodeName &&
                        repairTask.State == RepairTaskState.Completed &&
                        repairTask.ResultStatus == RepairTaskResult.Cancelled &&
                        repairTask.CreatedTimestamp >= firstCreated
                        select repairTask;

            Assert.IsTrue(query.Count() == 1, "There must be no duplicated cancel request");
        }
示例#8
0
        // Reports an error on a healthy node and verifies that a repair task is
        // created for the first executor in the Executors list.
        public void Onebox_Successful_ScheduleSingleRepair()
        {
            const string testProperty = "Onebox_Successful_ScheduleSingleRepair";

            MockRepairExecutor firstExecutor = Executors.First();
            LogHelper.Log("Starting test {0} for executor {1}", testProperty, firstExecutor.Name);

            // Inject error
            Node         targetNode  = GetHealthyNode(_fabricClient);
            MockWatchdog errorSource = new MockWatchdog(_fabricClient.HealthManager, targetNode.NodeName);
            errorSource.ReportError(testProperty);

            // Poll until PE has created the task (or the poll times out and yields null)
            RepairTask scheduledTask = PollingForCreatedTask(firstExecutor, targetNode);
            Assert.IsNotNull(scheduledTask, "There must be a repair task created.");
        }
示例#9
0
        // Reports the "RequestRepair" property with the second executor's name as the
        // description and verifies that a repair task is created for that executor.
        public void Onebox_Successful_SingleFastTrackRepair()
        {
            const string test     = "Onebox_Successful_SingleFastTrackRepair";
            const string property = "RequestRepair";
            // Use reimage executor
            MockRepairExecutor executor = Executors.Skip(1).First();

            // Log through LogHelper like the other tests, not straight to the console
            LogHelper.Log("Starting test {0} for executor {1}", test, executor.Name);
            string description = executor.Name;

            // Inject error
            Node         node     = GetHealthyNode(_fabricClient);
            MockWatchdog watchdog = new MockWatchdog(_fabricClient.HealthManager, node.NodeName);

            watchdog.ReportError(property, description);

            RepairTask createdTask = PollingForCreatedTask(executor, node);

            Assert.IsNotNull(createdTask, "There must be a repair task created.");
        }
示例#10
0
        // Reports an error on a healthy node, waits for the repair task to be created
        // for the first executor and then hands the task to executor.Process.
        public void Onebox_Successful_CorrectRepairStateTransition()
        {
            const string testProperty = "Onebox_Successful_CorrectRepairStateTransition";
            LogHelper.Log("Starting test {0}", testProperty);

            Node targetNode = GetHealthyNode(_fabricClient);
            MockWatchdog errorSource = new MockWatchdog(_fabricClient.HealthManager, targetNode.NodeName);
            MockRepairExecutor firstExecutor = Executors.First();

            // Inject error
            errorSource.ReportError(testProperty);

            // Wait for PE to schedule the task
            RepairTask scheduledTask = PollingForCreatedTask(firstExecutor, targetNode);
            Assert.IsNotNull(scheduledTask, "There must be a repair task created.");

            firstExecutor.Process(ref scheduledTask);
        }
示例#11
0
        // Polls the executor for a created repair task on the given node until one is
        // found or the timeout elapses.
        //   executor - executor queried through GetCreatedTask; its ActionTime is part of the timeout.
        //   node     - node whose NodeName is passed to GetCreatedTask.
        // Returns the created task, or null if none appeared within the timeout.
        private static RepairTask PollingForCreatedTask(MockRepairExecutor executor, Node node)
        {
            LogHelper.Log("Polling for created task for executor '{0}' and node '{1}'", executor.Name, node.NodeName);
            TimeSpan timeout = executor.ActionTime + _probationToFailing + _actionSchedulingInterval + _repairManagerLatency;

            // Give a little extra time to account for software delays due to onebox environments
            timeout = timeout.Add(TimeSpan.FromMinutes(2));

            // Use UTC for elapsed-time measurement: local time can jump on DST or
            // time-zone changes and would stretch or shorten the timeout.
            DateTime startTime = DateTime.UtcNow;

            while (DateTime.UtcNow - startTime <= timeout)
            {
                RepairTask task = executor.GetCreatedTask(node.NodeName);
                if (task != null)
                {
                    return task;
                }
                Thread.Sleep(_actionSchedulingInterval);
            }

            return null;
        }
示例#12
0
        // Rebuilds the Executors list from the settings document.
        // Finds the first element with local name "Section" whose Name attribute is
        // "NodeRepairActionList"; every descendant of it describes one repair action:
        // its Name attribute is the executor name and its Value attribute names the
        // section the executor's parameters are read from. The list is finally sorted
        // by ascending ActionTime.
        private static void LoadExecutors(XDocument xmlSettings)
        {
            Executors = new List <MockRepairExecutor>();

            var actionListSection = xmlSettings.Descendants()
                                               .Where(e => e.Name.LocalName == "Section" && e.Attribute("Name").Value == "NodeRepairActionList")
                                               .First();

            foreach (var actionEntry in actionListSection.Descendants())
            {
                string parameterSection = actionEntry.Attribute("Value").Value;
                string executorName     = actionEntry.Attribute("Name").Value;

                TimeSpan probationToHealthy = GetParameterTimeSpan(xmlSettings, parameterSection, "ProbationToHealthyWaitDurationPostRepairInSeconds");
                TimeSpan actionTime         = GetParameterTimeSpan(xmlSettings, parameterSection, "PolicyActionTimeInSeconds");
                bool     enabled            = Convert.ToBoolean(GetParameter(xmlSettings, parameterSection, "IsEnabled"));

                Executors.Add(new MockRepairExecutor(executorName, actionTime, enabled, probationToHealthy, _fabricClient.RepairManager));
            }

            // Shortest action time first
            Executors.Sort((left, right) => left.ActionTime.CompareTo(right.ActionTime));
        }
示例#13
0
        // Verifies that repairs escalate through the enabled executors in list order.
        // Each created task must be executable by the current (or next enabled)
        // executor and must not appear before that executor's ActionTime has elapsed
        // since the error was first reported.
        //   property - health property used to report the error.
        //   reportOK - when true, the property is briefly reported OK and then flipped
        //              back to error after each completed repair (flip-flop case).
        private void Onebox_Successful_CorrectRepairOrder(string property, bool reportOK)
        {
            LogHelper.Log("Starting test {0}", property);

            // Inject error. UTC is used for all elapsed-time checks so local clock
            // shifts (DST, time-zone changes) cannot distort them.
            Node         node          = GetHealthyNode(_fabricClient);
            DateTime     lastHealthyAt = DateTime.UtcNow;
            MockWatchdog watchdog      = new MockWatchdog(_fabricClient.HealthManager, node.NodeName);

            watchdog.ReportError(property);

            int currentIndex            = -1;
            MockRepairExecutor executor = null;
            // timeout is 2 times the longest policy plus a little extra time to account for software delays due to onebox environments
            DateTime timeout = DateTime.UtcNow + Executors.Last().ActionTime + Executors.Last().ActionTime + TimeSpan.FromMinutes(3);

            while (DateTime.UtcNow < timeout)
            {
                // Wait for PE to schedule the task
                RepairTask createdTask = PollingForAnyCreatedTask(node);
                Assert.IsNotNull(createdTask, "There should be a new repair task");
                if (executor == null || executor.IsExecutable(createdTask) == false)
                {
                    // Advance to the next enabled executor. The index is bounds-checked
                    // before every access so running off the end of the list fails the
                    // test with a clear message instead of an index exception.
                    ++currentIndex;
                    while (currentIndex < Executors.Count && Executors[currentIndex].IsEnabled == false)
                    {
                        LogHelper.Log("Skipping disabled repair executor {0}", Executors[currentIndex].Name);
                        ++currentIndex;
                    }
                    if (currentIndex >= Executors.Count)
                    {
                        Assert.Fail("No subsequent enabled executor");
                    }
                }
                executor = Executors[currentIndex];

                Assert.IsTrue(executor.IsEnabled, "Current executor '{0}' should be enabled", executor.Name);

                Assert.IsTrue(executor.IsExecutable(createdTask), "Task should be executable by current repair executor {0}", executor.Name);

                Assert.IsTrue(DateTime.UtcNow - lastHealthyAt > executor.ActionTime,
                              "Processing time should be bigger than executor's action time ({0} seconds)",
                              executor.ActionTime.TotalSeconds);

                executor.Complete(createdTask);


                if (reportOK)
                {
                    // Report OK in the middle to make sure that the escalation happens even in the flip
                    // flopping case where after a repair a machine stays healthy for a short while and then reports error again.
                    Thread.Sleep(_actionSchedulingInterval);
                    watchdog.ReportOk(property);
                    // wait for the next action to happen
                    Thread.Sleep(_actionSchedulingInterval);
                    // give additional time to deal with any delays
                    Thread.Sleep(TimeSpan.FromMilliseconds(5));
                    // Flop it back to error
                    watchdog.ReportError(property);
                    Thread.Sleep(TimeSpan.FromMilliseconds(5));
                }
            }
            // By going through the loop we should have reached the last executor
            Assert.IsNotNull(executor, "At least one repair task should have been processed");
            Assert.IsTrue(executor.ActionTime == Executors.Last().ActionTime);
        }