Exemplo n.º 1
0
        /// <summary>
        /// Reports a health error on a healthy node, waits for a repair task to be created for the
        /// first executor, kills the primary replica and then verifies that polling again yields
        /// a repair task with the same task id as the one created before.
        /// </summary>
        public void Onebox_Successful_ReplicationAfterRepairCreated()
        {
            const string property = "Onebox_Successful_ReplicationAfterRepairCreated";

            MockRepairExecutor executor = Executors.First();

            LogHelper.Log("Starting test {0} for executor {1}", property, executor.Name);

            // Inject error
            Node         node     = GetHealthyNode(_fabricClient);
            MockWatchdog watchdog = new MockWatchdog(_fabricClient.HealthManager, node.NodeName);

            watchdog.ReportError(property);

            // Wait for PE to schedule the task
            RepairTask createdTask = PollingForCreatedTask(executor, node);

            Assert.IsNotNull(createdTask, "There must be a repair task created.");

            KillPrimaryReplica();

            RepairTask newTask = PollingForCreatedTask(executor, node);

            // Polling may come back empty; fail with a clear message instead of a NullReferenceException below.
            Assert.IsNotNull(newTask, "There must still be a repair task after the primary replica is killed.");

            Assert.IsTrue(newTask.TaskId.Equals(createdTask.TaskId, StringComparison.OrdinalIgnoreCase),
                          "The repair task should be the same created before");
        }
Exemplo n.º 2
0
 /// <summary>
 /// Reports OK for <paramref name="property"/> through the watchdog and then blocks the calling
 /// thread so the healthy state has time to settle.
 /// </summary>
 /// <param name="watchdog">Watchdog used to report the OK state.</param>
 /// <param name="property">Health property that is reported as OK.</param>
 /// <param name="executor">Executor whose post-repair probation period is waited out.</param>
 private void ClearHealthError(MockWatchdog watchdog, string property, MockRepairExecutor executor)
 {
     watchdog.ReportOk(property);

     // First wait: lets the internal PE state return to Healthy as well.
     var healthySettleTime = _minimumHealthyDuration + _actionSchedulingInterval;
     Thread.Sleep(healthySettleTime);

     // Second wait: the executor's probation-to-healthy period after a repair.
     var probationPeriod = executor.ProbationToHealthyPostRepair;
     Thread.Sleep(probationPeriod);
 }
Exemplo n.º 3
0
        /// <summary>
        /// Reports a health error, waits for the resulting repair task, cancels and deletes that task,
        /// clears the health error, reports the error again and verifies that a repair task is
        /// created once more. The new task is completed at the end.
        /// </summary>
        public void Onebox_Successful_DeleteRepairInTheMiddle()
        {
            const string property = "Onebox_Successful_DeleteRepairInTheMiddle";

            MockRepairExecutor executor = Executors.First();

            LogHelper.Log("Starting test {0} for executor {1}", property, executor.Name);

            // Inject error
            Node         node     = GetHealthyNode(_fabricClient);
            MockWatchdog watchdog = new MockWatchdog(_fabricClient.HealthManager, node.NodeName);

            watchdog.ReportError(property);

            RepairTask createdTask = PollingForCreatedTask(executor, node);

            Assert.IsNotNull(createdTask, "There must be a repair task created.");

            // Cancel the task; block until the call finishes (its return value is not needed here)
            _fabricClient.RepairManager.CancelRepairTaskAsync(createdTask.TaskId, 0, true).Wait();

            // Delete the repair task
            MockRepairExecutor.DeleteTask(createdTask, _fabricClient.RepairManager);

            ClearHealthError(watchdog, property, executor);

            // Report Error
            watchdog.ReportError(property);

            createdTask = PollingForCreatedTask(executor, node);
            Assert.IsNotNull(createdTask, "There must be a repair task created after deletion.");

            // Complete the repair task
            executor.Complete(createdTask);
        }
Exemplo n.º 4
0
        /// <summary>
        /// Reports a health error and clears it again via <c>ClearHealthError</c>, then checks that the
        /// node's aggregated health is Ok and that polling finds no repair task for any enabled executor.
        /// </summary>
        public void Onebox_Successful_SelfRepairNoCreatedTask()
        {
            const string property = "Onebox_Successful_SelfRepairNoCreatedTask";

            LogHelper.Log("Starting test {0}", property);

            Node targetNode = GetHealthyNode(_fabricClient);

            // Inject the error, then clear it again (the "self-repair" case)
            var selfHealingWatchdog = new MockWatchdog(_fabricClient.HealthManager, targetNode.NodeName);
            selfHealingWatchdog.ReportError(property);
            ClearHealthError(selfHealingWatchdog, property, Executors.First());

            var currentHealth = _fabricClient.HealthManager.GetNodeHealthAsync(targetNode.NodeName).Result;
            bool nodeIsOk = currentHealth.AggregatedHealthState == HealthState.Ok;

            Assert.IsTrue(nodeIsOk, "Node health must be Ok after health error event is cleared");

            // Disabled executors are skipped; every enabled executor is polled in parallel
            Parallel.ForEach(Executors, candidate =>
            {
                if (candidate.IsEnabled)
                {
                    RepairTask scheduledTask = PollingForCreatedTask(candidate, targetNode);
                    Assert.IsNull(scheduledTask, "There must be no scheduled task in case of self-repair");
                }
            });
        }
Exemplo n.º 5
0
        /// <summary>
        /// Reports a health error, waits for the repair task, approves it through the executor and then
        /// clears the health error. Afterwards the task is looked up by id and must carry the
        /// CancelRequested flag.
        /// </summary>
        public void Onebox_Successful_SelfRepairAfterTaskIsApproved()
        {
            // Health property / log name, kept in sync with the name of this test
            const string property = "Onebox_Successful_SelfRepairAfterTaskIsApproved";

            MockRepairExecutor executor = Executors.First();

            LogHelper.Log("Starting test {0}", property);

            Node node = GetHealthyNode(_fabricClient);
            // Inject error
            MockWatchdog watchdog = new MockWatchdog(_fabricClient.HealthManager, node.NodeName);

            watchdog.ReportError(property);

            RepairTask createdTask = PollingForCreatedTask(executor, node);

            Assert.IsNotNull(createdTask, "There must be a repair task created.");

            executor.Approve(ref createdTask);

            ClearHealthError(watchdog, property, executor);

            // Make sure the task is cancelled
            Thread.Sleep(_repairManagerLatency);
            RepairTaskList repairTaskList = _fabricClient.RepairManager.GetRepairTaskListAsync(createdTask.TaskId, RepairTaskStateFilter.All, null).Result;

            Assert.IsTrue(repairTaskList.Count == 1, "There must be a unique repair task with task id = {0}", createdTask.TaskId);

            RepairTask cancelledTask = repairTaskList.First();

            // Test the CancelRequested bit only, so the check still holds if other flags are set too
            bool cancelRequested = (cancelledTask.Flags & RepairTaskFlags.CancelRequested) == RepairTaskFlags.CancelRequested;

            Assert.IsTrue(cancelRequested, "PE should request cancellation of the task");
        }
Exemplo n.º 6
0
        /// <summary>
        /// Reports a health error, completes the resulting repair task, kills the primary replica and
        /// then polls the node's "RepairPolicyEngineService::RepairHistory" health event (for up to
        /// five minutes) until its description changes. The newest history entry must name the
        /// executor and carry a timestamp that is not older than the moment the task was observed.
        /// </summary>
        public void Onebox_Successful_ReplicationAfterRepairCompleted()
        {
            // Health property / log name, kept in sync with the name of this test
            const string property              = "Onebox_Successful_ReplicationAfterRepairCompleted";
            const string repairHistoryProperty = "RepairPolicyEngineService::RepairHistory";

            MockRepairExecutor executor = Executors.First();

            LogHelper.Log("Starting test {0} for executor {1}", property, executor.Name);

            // Inject error
            Node         node     = GetHealthyNode(_fabricClient);
            MockWatchdog watchdog = new MockWatchdog(_fabricClient.HealthManager, node.NodeName);

            watchdog.ReportError(property);

            // Snapshot of the repair history before the repair, used to detect a change later on
            var nodeHealth       = _fabricClient.HealthManager.GetNodeHealthAsync(node.NodeName).Result;
            var oldRepairHistory = nodeHealth.HealthEvents.Single(e => e.HealthInformation.Property.Equals(repairHistoryProperty,
                                                                                                           StringComparison.OrdinalIgnoreCase));

            // Wait for PE to schedule the task
            RepairTask createdTask = PollingForCreatedTask(executor, node);

            Assert.IsNotNull(createdTask, "There must be a repair task created.");
            DateTime scheduledAt = DateTime.Now;

            executor.Complete(createdTask);

            KillPrimaryReplica();

            LogHelper.Log("Waiting for the RepairHistory in health store to have the latest repair");
            // Make sure the RepairHistory has the entry within 5 minutes
            string   newRepairDescription = null;
            TimeSpan timeout              = TimeSpan.FromMinutes(5);
            TimeSpan sleepTime            = TimeSpan.FromSeconds(10);

            while (timeout >= TimeSpan.Zero)
            {
                nodeHealth = _fabricClient.HealthManager.GetNodeHealthAsync(node.NodeName).Result;
                var repairHistory = nodeHealth.HealthEvents.Single(e => e.HealthInformation.Property.Equals(repairHistoryProperty,
                                                                                                            StringComparison.OrdinalIgnoreCase));
                if (!repairHistory.HealthInformation.Description.Equals(oldRepairHistory.HealthInformation.Description, StringComparison.OrdinalIgnoreCase))
                {
                    // Entries are ';'-separated and the split leaves an empty last element; drop empty
                    // entries and take the last one so a short or unterminated description cannot
                    // cause an out-of-range index.
                    newRepairDescription = repairHistory.HealthInformation.Description
                                                        .Split(new[] { ';' }, StringSplitOptions.RemoveEmptyEntries)
                                                        .LastOrDefault();
                    break;
                }
                Thread.Sleep(sleepTime);
                timeout -= sleepTime;
            }
            Assert.IsFalse(String.IsNullOrEmpty(newRepairDescription), "There must be a new repair description");

            // An entry is split on '=': the first part is compared to the executor name,
            // the last part is parsed as a timestamp
            string[] entryParts = newRepairDescription.Split('=');
            string   repairType = entryParts.First();

            Assert.IsTrue(repairType.Equals(executor.Name, StringComparison.OrdinalIgnoreCase), "Repair type is the same as executor name");
            DateTime repairScheduledAt = DateTime.Parse(entryParts.Last());

            Assert.IsTrue(repairScheduledAt >= scheduledAt, "There should be a new entry in repair history");
        }
Exemplo n.º 7
0
        /// <summary>
        /// Cleans up after a test: clears the watchdog error on every node returned by the node list
        /// query and cancels the repair tasks of all executors.
        /// </summary>
        public static void TestCleanup()
        {
            LogHelper.Log("TestCleanup()");

            // Clear health store: one watchdog per node, each asked to clear its error
            foreach (var clusterNode in _fabricClient.QueryManager.GetNodeListAsync().Result)
            {
                new MockWatchdog(_fabricClient.HealthManager, clusterNode.NodeName).ClearError();
            }

            // Cancel existing tasks
            MockRepairExecutor.CancelAll(Executors, _fabricClient.RepairManager);
        }
Exemplo n.º 8
0
        /// <summary>
        /// Reports a health error, waits for the repair task, reports OK and verifies that the task
        /// ends up Completed with result Cancelled. The primary replica is then killed and, after
        /// another wait, the completed task list must contain exactly one matching cancelled task
        /// for the node (i.e. no duplicated cancel request).
        /// </summary>
        public void Onebox_Successful_ReplicationAfterRepairCancelled()
        {
            const string property = "Onebox_Successful_ReplicationAfterRepairCancelled";

            MockRepairExecutor executor = Executors.First();

            LogHelper.Log("Starting test {0} for executor {1}", property, executor.Name);

            // Inject error
            Node         node     = GetHealthyNode(_fabricClient);
            MockWatchdog watchdog = new MockWatchdog(_fabricClient.HealthManager, node.NodeName);

            watchdog.ReportError(property);

            // Wait for PE to schedule the task
            RepairTask createdTask = PollingForCreatedTask(executor, node);

            Assert.IsNotNull(createdTask, "There must be a repair task created.");

            // Check the nullable itself; asserting IsNotNull on a boolean expression can never fail
            Assert.IsTrue(createdTask.CreatedTimestamp.HasValue, "CreatedTimestamp must not be null");
            DateTime firstCreated = createdTask.CreatedTimestamp.Value;

            watchdog.ReportOk(property);

            // Make sure the task is cancelled
            Thread.Sleep(_minimumHealthyDuration + _actionSchedulingInterval + _repairManagerLatency);
            RepairTask cancelledTask = _fabricClient.RepairManager.GetRepairTaskListAsync(createdTask.TaskId, RepairTaskStateFilter.All, null).Result.Single();

            Assert.IsTrue(cancelledTask.State == RepairTaskState.Completed && cancelledTask.ResultStatus == RepairTaskResult.Cancelled,
                          "PE should request to cancel the task. Current State = {0}. Result Status = {1}", cancelledTask.State, cancelledTask.ResultStatus);

            KillPrimaryReplica();

            // Wait the same interval again before looking at the completed tasks
            Thread.Sleep(_minimumHealthyDuration + _actionSchedulingInterval + _repairManagerLatency);
            var repairList = _fabricClient.RepairManager.GetRepairTaskListAsync(createdTask.TaskId, RepairTaskStateFilter.Completed, null).Result;

            // Cancelled tasks targeting this node that were created at or after the first task
            var query = from repairTask in repairList
                        where repairTask.Target is NodeRepairTargetDescription &&
                        ((NodeRepairTargetDescription)repairTask.Target).Nodes.First() == node.NodeName &&
                        repairTask.State == RepairTaskState.Completed &&
                        repairTask.ResultStatus == RepairTaskResult.Cancelled &&
                        repairTask.CreatedTimestamp >= firstCreated
                        select repairTask;

            Assert.IsTrue(query.Count() == 1, "There must be no duplicated cancel request");
        }
Exemplo n.º 9
0
        /// <summary>
        /// Reports a health error on a healthy node and verifies that polling finds a repair task
        /// for the first executor.
        /// </summary>
        public void Onebox_Successful_ScheduleSingleRepair()
        {
            const string property = "Onebox_Successful_ScheduleSingleRepair";

            var firstExecutor = Executors.First();
            LogHelper.Log("Starting test {0} for executor {1}", property, firstExecutor.Name);

            // Inject error
            var targetNode    = GetHealthyNode(_fabricClient);
            var errorReporter = new MockWatchdog(_fabricClient.HealthManager, targetNode.NodeName);
            errorReporter.ReportError(property);

            // The error must result in a repair task for the first executor
            var scheduledTask = PollingForCreatedTask(firstExecutor, targetNode);
            Assert.IsNotNull(scheduledTask, "There must be a repair task created.");
        }
Exemplo n.º 10
0
        /// <summary>
        /// Reports a "RequestRepair" health error whose description is the name of the second executor
        /// and verifies that polling finds a repair task for that executor.
        /// </summary>
        public void Onebox_Successful_SingleFastTrackRepair()
        {
            const string test     = "Onebox_Successful_SingleFastTrackRepair";
            const string property = "RequestRepair";
            // Use reimage executor
            MockRepairExecutor executor = Executors.Skip(1).First();

            // Log through LogHelper like the other tests do, rather than writing to the console directly
            LogHelper.Log("Starting test {0} for executor {1}", test, executor.Name);
            string description = executor.Name;

            // Inject error
            Node         node     = GetHealthyNode(_fabricClient);
            MockWatchdog watchdog = new MockWatchdog(_fabricClient.HealthManager, node.NodeName);

            watchdog.ReportError(property, description);

            RepairTask createdTask = PollingForCreatedTask(executor, node);

            Assert.IsNotNull(createdTask, "There must be a repair task created.");
        }
Exemplo n.º 11
0
        /// <summary>
        /// Reports a health error, waits for the repair task of the first executor and hands the task
        /// to the executor's <c>Process</c> method.
        /// </summary>
        public void Onebox_Successful_CorrectRepairStateTransition()
        {
            const string property = "Onebox_Successful_CorrectRepairStateTransition";

            LogHelper.Log("Starting test {0}", property);

            var targetNode    = GetHealthyNode(_fabricClient);
            var errorReporter = new MockWatchdog(_fabricClient.HealthManager, targetNode.NodeName);
            var firstExecutor = Executors.First();

            // Inject error
            errorReporter.ReportError(property);

            // Wait for PE to schedule the task
            RepairTask pendingTask = PollingForCreatedTask(firstExecutor, targetNode);
            Assert.IsNotNull(pendingTask, "There must be a repair task created.");

            // The task is passed by reference to the executor for processing
            firstExecutor.Process(ref pendingTask);
        }
Exemplo n.º 12
0
        /// <summary>
        /// Reports a health error and repeatedly completes the repair tasks that get created, checking
        /// that each task is executable by the current (enabled) executor and that the executor's
        /// action time has elapsed. Whenever a task is not executable by the current executor, the
        /// test moves on to the next enabled executor in <c>Executors</c>. At the end the current
        /// executor must have the same action time as the last executor.
        /// </summary>
        /// <param name="property">Health property to report; also logged as the test name.</param>
        /// <param name="reportOK">
        /// When true, the property is flipped to OK and back to error after each completed repair.
        /// </param>
        private void Onebox_Successful_CorrectRepairOrder(string property, bool reportOK)
        {
            LogHelper.Log("Starting test {0}", property);

            // Inject error
            Node         node          = GetHealthyNode(_fabricClient);
            DateTime     lastHealthyAt = DateTime.Now;
            MockWatchdog watchdog      = new MockWatchdog(_fabricClient.HealthManager, node.NodeName);

            watchdog.ReportError(property);

            int currentIndex            = -1;
            MockRepairExecutor executor = null;
            // timeout is 2 times the longest policy plus a little extra time to account for software delays due to onebox environments
            DateTime timeout = DateTime.Now + Executors.Last().ActionTime + Executors.Last().ActionTime + TimeSpan.FromMinutes(3);

            while (DateTime.Now < timeout)
            {
                // Wait for PE to schedule the task
                RepairTask createdTask = PollingForAnyCreatedTask(node);
                Assert.IsNotNull(createdTask, "There should be a new repair task");
                if (executor == null || executor.IsExecutable(createdTask) == false)
                {
                    // Advance to the next enabled executor. The bounds check comes before every
                    // index access so that running out of executors fails the test cleanly
                    // instead of throwing an out-of-range exception.
                    ++currentIndex;
                    while (currentIndex < Executors.Count() && Executors[currentIndex].IsEnabled == false)
                    {
                        LogHelper.Log("Skipping disabled repair executor {0}", Executors[currentIndex].Name);
                        ++currentIndex;
                    }
                    if (currentIndex >= Executors.Count())
                    {
                        Assert.Fail("No subsequent enabled executor");
                    }
                }
                executor = Executors[currentIndex];

                Assert.IsTrue(executor.IsEnabled, "Current executor '{0}' should be enabled", executor.Name);

                Assert.IsTrue(executor.IsExecutable(createdTask), "Task should be executable by current repair executor {0}", executor.Name);

                Assert.IsTrue(DateTime.Now - lastHealthyAt > executor.ActionTime,
                              "Processing time should be bigger than executor's action time ({0} seconds)",
                              executor.ActionTime.TotalSeconds);

                executor.Complete(createdTask);


                if (reportOK)
                {
                    // Report OK in the middle to make sure that the escalation happens even in the flip
                    // flopping case where after a repair a machine stays healthy for a short while and then reports error again.
                    Thread.Sleep(_actionSchedulingInterval);
                    watchdog.ReportOk(property);
                    // wait for the next action to happen
                    Thread.Sleep(_actionSchedulingInterval);
                    // give additional time to deal with any delays
                    Thread.Sleep(TimeSpan.FromMilliseconds(5));
                    // Flop it back to error
                    watchdog.ReportError(property);
                    Thread.Sleep(TimeSpan.FromMilliseconds(5));
                }
            }

            // The loop body may never have run (e.g. the timeout already elapsed); fail clearly in that case
            Assert.IsNotNull(executor, "At least one repair task should have been processed");

            // By going thru the loop we should have reached the last executor
            Assert.IsTrue(executor.ActionTime == Executors.Last().ActionTime);
        }