Beispiel #1
0
        // Scenario: a repair task is created for a reported health error, then cancelled and
        // deleted while still pending. After the health error is cleared and reported again,
        // a fresh repair task must be created, which is then completed by the executor.
        public void Onebox_Successful_DeleteRepairInTheMiddle()
        {
            const string property = "Onebox_Successful_DeleteRepairInTheMiddle";

            MockRepairExecutor repairExecutor = Executors.First();
            LogHelper.Log("Starting test {0} for executor {1}", property, repairExecutor.Name);

            // Pick a healthy node and raise a health error against it
            Node targetNode = GetHealthyNode(_fabricClient);
            var healthReporter = new MockWatchdog(_fabricClient.HealthManager, targetNode.NodeName);
            healthReporter.ReportError(property);

            // The error should cause a repair task to be created for this executor/node
            RepairTask pendingTask = PollingForCreatedTask(repairExecutor, targetNode);
            Assert.IsNotNull(pendingTask, "There must be a repair task created.");

            // Cancel the task (blocking on the async call; the returned value is not used),
            // then remove it from the repair manager
            var repairManager = _fabricClient.RepairManager;
            long cancelResult = repairManager.CancelRepairTaskAsync(pendingTask.TaskId, 0, true).Result;
            MockRepairExecutor.DeleteTask(pendingTask, repairManager);

            ClearHealthError(healthReporter, property, repairExecutor);

            // Raise the same error again; a new task is expected even though the first one was deleted
            healthReporter.ReportError(property);

            RepairTask recreatedTask = PollingForCreatedTask(repairExecutor, targetNode);
            Assert.IsNotNull(recreatedTask, "There must be a repair task created after deletion.");

            // Finish the scenario by completing the new repair task
            repairExecutor.Complete(recreatedTask);
        }
Beispiel #2
0
        // Reports a health error on a healthy node, waits for the resulting repair task and
        // completes it, then kills the primary replica. Afterwards it polls the node's health
        // events (for up to about 5 minutes) until the RepairHistory event description changes,
        // and checks that the newest history entry names the executor and carries a timestamp
        // not earlier than the moment captured right after the task was observed.
        public void Onebox_Successful_ReplicationAfterRepairCompleted()
        {
            // NOTE(review): the literal says "...Cancelled" while this test is "...Completed".
            // It looks like a copy/paste leftover; left unchanged because it is the health
            // property actually reported - confirm before renaming.
            const string property = "Onebox_Successful_ReplicationAfterRepairCancelled";
            const string repairHistoryProperty = "RepairPolicyEngineService::RepairHistory";

            MockRepairExecutor executor = Executors.First();

            LogHelper.Log("Starting test {0} for executor {1}", property, executor.Name);

            // Inject error
            Node         node     = GetHealthyNode(_fabricClient);
            MockWatchdog watchdog = new MockWatchdog(_fabricClient.HealthManager, node.NodeName);

            watchdog.ReportError(property);

            // Snapshot the current RepairHistory event so a later change can be detected
            var nodeHealth       = _fabricClient.HealthManager.GetNodeHealthAsync(node.NodeName).Result;
            var oldRepairHistory = nodeHealth.HealthEvents.Single(e => e.HealthInformation.Property.Equals(repairHistoryProperty,
                                                                                                           StringComparison.OrdinalIgnoreCase));

            // Wait for PE to schedule the task
            RepairTask createdTask = PollingForCreatedTask(executor, node);

            Assert.IsNotNull(createdTask, "There must be a repair task created.");
            DateTime scheduledAt = DateTime.Now;

            executor.Complete(createdTask);

            KillPrimaryReplica();

            LogHelper.Log("Waiting for the RepairHistory in health store to have the latest repair");
            // Make sure the RepairHistory has the entry within 5 minutes
            string   newRepairDescription = null;
            TimeSpan timeout   = TimeSpan.FromMinutes(5);
            TimeSpan sleepTime = TimeSpan.FromSeconds(10);

            while (timeout >= TimeSpan.Zero)
            {
                nodeHealth = _fabricClient.HealthManager.GetNodeHealthAsync(node.NodeName).Result;
                var repairHistory = nodeHealth.HealthEvents.Single(e => e.HealthInformation.Property.Equals(repairHistoryProperty,
                                                                                                            StringComparison.OrdinalIgnoreCase));
                string currentDescription = repairHistory.HealthInformation.Description;
                if (!currentDescription.Equals(oldRepairHistory.HealthInformation.Description, StringComparison.OrdinalIgnoreCase))
                {
                    string[] descriptions = currentDescription.Split(';');

                    // The newest entry is read from the second-to-last segment. Fail with a
                    // readable message instead of an index exception when the description
                    // does not contain at least two ';'-separated segments.
                    Assert.IsTrue(descriptions.Length >= 2,
                                  "Unexpected RepairHistory description format: '{0}'",
                                  currentDescription);

                    newRepairDescription = descriptions[descriptions.Length - 2]; // Last element is empty
                    break;
                }

                Thread.Sleep(sleepTime);
                timeout -= sleepTime;
            }
            Assert.IsFalse(String.IsNullOrEmpty(newRepairDescription), "There must be a new repair description");

            // The entry is split on '=': the first part is compared with the executor name,
            // the last part is parsed as the repair timestamp.
            string[] entryParts = newRepairDescription.Split('=');
            string   repairType = entryParts[0];

            Assert.IsTrue(repairType.Equals(executor.Name, StringComparison.OrdinalIgnoreCase), "Repair type is the same as executor name");

            // NOTE(review): parsed with the current culture, as before - presumably matches
            // the format written by the policy engine; confirm.
            DateTime repairScheduledAt = DateTime.Parse(entryParts[entryParts.Length - 1]);

            Assert.IsTrue(repairScheduledAt >= scheduledAt, "There should be a new entry in repair history");
        }
Beispiel #3
0
        // Reports a health error on a healthy node and then, until a timeout expires, repeatedly
        // waits for any repair task created for that node, checks that it belongs to the expected
        // executor (walking through Executors in order and skipping disabled ones), and completes it.
        //
        // property: health property name used for the error/OK reports and for logging.
        // reportOK: when true, after each completed repair the property is briefly reported OK
        //           and then flipped back to error before the next iteration.
        private void Onebox_Successful_CorrectRepairOrder(string property, bool reportOK)
        {
            LogHelper.Log("Starting test {0}", property);

            // Inject error
            Node         node          = GetHealthyNode(_fabricClient);
            DateTime     lastHealthyAt = DateTime.Now;
            MockWatchdog watchdog      = new MockWatchdog(_fabricClient.HealthManager, node.NodeName);

            watchdog.ReportError(property);

            int currentIndex            = -1;
            MockRepairExecutor executor = null;
            // timeout is 2 times the action time of the last executor plus a little extra time
            // to account for software delays due to onebox environments
            DateTime timeout = DateTime.Now + Executors.Last().ActionTime + Executors.Last().ActionTime + TimeSpan.FromMinutes(3);

            while (DateTime.Now < timeout)
            {
                // Wait for PE to schedule the task
                RepairTask createdTask = PollingForAnyCreatedTask(node);
                Assert.IsNotNull(createdTask, "There should be a new repair task");
                if (executor == null || executor.IsExecutable(createdTask) == false)
                {
                    // The task does not belong to the current executor: advance to the next
                    // enabled one. The index is bounds-checked before every access so that
                    // running past the end fails the test with a clear message instead of
                    // an index exception.
                    ++currentIndex;
                    while (currentIndex < Executors.Count() && Executors[currentIndex].IsEnabled == false)
                    {
                        LogHelper.Log("Skipping disabled repair executor {0}", Executors[currentIndex].Name);
                        ++currentIndex;
                    }

                    if (currentIndex >= Executors.Count())
                    {
                        Assert.Fail("No subsequent enabled executor");
                    }
                }
                executor = Executors[currentIndex];

                Assert.IsTrue(executor.IsEnabled, "Current executor '{0}' should be enabled", executor.Name);

                Assert.IsTrue(executor.IsExecutable(createdTask), "Task should be executable by current repair executor {0}", executor.Name);

                Assert.IsTrue(DateTime.Now - lastHealthyAt > executor.ActionTime,
                              "Processing time should be bigger than executor's action time ({0} seconds)",
                              executor.ActionTime.TotalSeconds);

                executor.Complete(createdTask);

                if (reportOK)
                {
                    // Report OK in the middle to make sure that the escalation happens even in the flip
                    // flopping case where after a repair a machine stays healthy for a short while and
                    // then reports error again.
                    Thread.Sleep(_actionSchedulingInterval);
                    watchdog.ReportOk(property);
                    // wait for the next action to happen
                    Thread.Sleep(_actionSchedulingInterval);
                    // give additional time to deal with any delays
                    // NOTE(review): 5 ms is a very small margin - confirm this was not meant to be seconds
                    Thread.Sleep(TimeSpan.FromMilliseconds(5));
                    // Flop it back to error
                    watchdog.ReportError(property);
                    Thread.Sleep(TimeSpan.FromMilliseconds(5));
                }
            }

            // By going thru the loop we should have reached the last executor
            Assert.IsNotNull(executor, "At least one repair task should have been processed before the timeout");
            Assert.IsTrue(executor.ActionTime == Executors.Last().ActionTime);
        }