Beispiel #1
0
 /// <summary>
 /// Reports an OK health state for <paramref name="property"/> via the watchdog and then
 /// blocks so that the internal policy engine (PE) state has time to return to Healthy.
 /// </summary>
 /// <param name="watchdog">Watchdog used to report the OK state.</param>
 /// <param name="property">Health property whose error is being cleared.</param>
 /// <param name="executor">Executor whose <c>ProbationToHealthyPostRepair</c> delay is waited out.</param>
 private void ClearHealthError(MockWatchdog watchdog, string property, MockRepairExecutor executor)
 {
     watchdog.ReportOk(property);

     // First wait: minimum healthy duration plus one action scheduling interval.
     var healthySettleDelay = _minimumHealthyDuration + _actionSchedulingInterval;
     Thread.Sleep(healthySettleDelay);

     // Second wait: the executor's own probation-to-healthy period after a repair.
     var probationDelay = executor.ProbationToHealthyPostRepair;
     Thread.Sleep(probationDelay);
 }
Beispiel #2
0
        /// <summary>
        /// Injects a health error on a healthy node, waits for a repair task to be created,
        /// clears the error so the task gets cancelled, kills the primary replica, and then
        /// checks that exactly one cancelled task matching the original is reported
        /// (i.e. no duplicated cancel request after the replica change).
        /// </summary>
        public void Onebox_Successful_ReplicationAfterRepairCancelled()
        {
            const string property = "Onebox_Successful_ReplicationAfterRepairCancelled";

            MockRepairExecutor executor = Executors.First();

            LogHelper.Log("Starting test {0} for executor {1}", property, executor.Name);

            // Inject error
            Node         node     = GetHealthyNode(_fabricClient);
            MockWatchdog watchdog = new MockWatchdog(_fabricClient.HealthManager, node.NodeName);

            watchdog.ReportError(property);

            // Wait for PE to schedule the task
            RepairTask createdTask = PollingForCreatedTask(executor, node);

            Assert.IsNotNull(createdTask, "There must be a repair task created.");
            // Check the nullable itself: asserting IsNotNull on the boolean result of a
            // "!= null" comparison can never fail, because a boxed bool is never null.
            Assert.IsTrue(createdTask.CreatedTimestamp.HasValue, "CreatedTimestamp must not be null");
            DateTime firstCreated = createdTask.CreatedTimestamp.Value;

            watchdog.ReportOk(property);

            // Make sure the task is cancelled
            Thread.Sleep(_minimumHealthyDuration + _actionSchedulingInterval + _repairManagerLatency);
            RepairTask cancelledTask = _fabricClient.RepairManager.GetRepairTaskListAsync(createdTask.TaskId, RepairTaskStateFilter.All, null).Result.Single();

            Assert.IsTrue(cancelledTask.State == RepairTaskState.Completed && cancelledTask.ResultStatus == RepairTaskResult.Cancelled,
                          "PE should request to cancel the task. Current State = {0}. Result Status = {1}", cancelledTask.State, cancelledTask.ResultStatus);

            KillPrimaryReplica();

            // Give the system the same settling time again after the primary replica was killed
            Thread.Sleep(_minimumHealthyDuration + _actionSchedulingInterval + _repairManagerLatency);
            var repairList = _fabricClient.RepairManager.GetRepairTaskListAsync(createdTask.TaskId, RepairTaskStateFilter.Completed, null).Result;

            // Cancelled, completed tasks that target this node and were created no earlier than the original task
            var query = from repairTask in repairList
                        where repairTask.Target is NodeRepairTargetDescription &&
                        ((NodeRepairTargetDescription)repairTask.Target).Nodes.First() == node.NodeName &&
                        repairTask.State == RepairTaskState.Completed &&
                        repairTask.ResultStatus == RepairTaskResult.Cancelled &&
                        repairTask.CreatedTimestamp >= firstCreated
                        select repairTask;

            Assert.IsTrue(query.Count() == 1, "There must be no duplicated cancel request");
        }
Beispiel #3
0
        /// <summary>
        /// Injects a health error on a healthy node and, until a timeout elapses, repeatedly
        /// waits for a created repair task, checks that it is executable by the expected
        /// (enabled) executor in <c>Executors</c> order, and completes it. At the end the
        /// current executor must have the same action time as the last executor.
        /// </summary>
        /// <param name="property">Health property reported by the watchdog; also logged as the test name.</param>
        /// <param name="reportOK">
        /// When true, after each completed repair the watchdog reports OK and then error again,
        /// to exercise escalation in the flip-flopping case.
        /// </param>
        private void Onebox_Successful_CorrectRepairOrder(string property, bool reportOK)
        {
            LogHelper.Log("Starting test {0}", property);

            // Inject error
            Node         node          = GetHealthyNode(_fabricClient);
            DateTime     lastHealthyAt = DateTime.Now;
            MockWatchdog watchdog      = new MockWatchdog(_fabricClient.HealthManager, node.NodeName);

            watchdog.ReportError(property);

            int currentIndex            = -1;
            MockRepairExecutor executor = null;
            // timeout is 2 times the longest policy plus a little extra time to account for software delays due to onebox environments
            DateTime timeout = DateTime.Now + Executors.Last().ActionTime + Executors.Last().ActionTime + TimeSpan.FromMinutes(3);

            while (DateTime.Now < timeout)
            {
                // Wait for PE to schedule the task
                RepairTask createdTask = PollingForAnyCreatedTask(node);
                Assert.IsNotNull(createdTask, "There should be a new repair task");
                if (executor == null || executor.IsExecutable(createdTask) == false)
                {
                    // Advance to the next enabled executor. The bounds check is part of the loop
                    // condition so that running past the end produces the intended assertion
                    // failure instead of an index-out-of-range exception.
                    ++currentIndex;
                    while (currentIndex < Executors.Count() && Executors[currentIndex].IsEnabled == false)
                    {
                        LogHelper.Log("Skipping disabled repair executor {0}", Executors[currentIndex].Name);
                        ++currentIndex;
                    }

                    if (currentIndex >= Executors.Count())
                    {
                        Assert.Fail("No subsequent enabled executor");
                    }
                }
                executor = Executors[currentIndex];

                Assert.IsTrue(executor.IsEnabled, "Current executor '{0}' should be enabled", executor.Name);

                Assert.IsTrue(executor.IsExecutable(createdTask), "Task should be executable by current repair executor {0}", executor.Name);

                Assert.IsTrue(DateTime.Now - lastHealthyAt > executor.ActionTime,
                              "Processing time should be bigger than executor's action time ({0} seconds)",
                              executor.ActionTime.TotalSeconds);

                executor.Complete(createdTask);


                if (reportOK)
                {
                    // Report OK in the middle to make sure that the escalation happens even in the flip
                    // flopping case where after a repair a machine stays healthy for a short while and then reports error again.
                    Thread.Sleep(_actionSchedulingInterval);
                    watchdog.ReportOk(property);
                    // wait for the next action to happen
                    Thread.Sleep(_actionSchedulingInterval);
                    // give additional time to deal with any delays
                    Thread.Sleep(TimeSpan.FromMilliseconds(5));
                    // Flop it back to error
                    watchdog.ReportError(property);
                    Thread.Sleep(TimeSpan.FromMilliseconds(5));
                }
            }
            // By going through the loop we should have reached the last executor.
            // Guard against the loop body never having run, which would leave executor null.
            Assert.IsNotNull(executor, "No repair executor was exercised before the timeout");
            Assert.IsTrue(executor.ActionTime == Executors.Last().ActionTime);
        }