/// <summary>
/// Reports the given health property as OK through the watchdog and then blocks
/// long enough for the policy engine's internal state to settle back to healthy.
/// </summary>
/// <param name="watchdog">Watchdog used to report the OK health state.</param>
/// <param name="property">Name of the health property to clear.</param>
/// <param name="executor">Executor whose post-repair probation period is waited out.</param>
private void ClearHealthError(MockWatchdog watchdog, string property, MockRepairExecutor executor)
{
    watchdog.ReportOk(property);

    // Give the PE time to observe the healthy report and run a scheduling pass,
    // so that its internal state is also back to Healthy.
    var healthySettleTime = _minimumHealthyDuration + _actionSchedulingInterval;
    Thread.Sleep(healthySettleTime);

    // Then wait out the executor's probation-to-healthy period after a repair.
    var probationPeriod = executor.ProbationToHealthyPostRepair;
    Thread.Sleep(probationPeriod);
}
/// <summary>
/// Verifies that a repair task which was cancelled because the node reported healthy
/// again is not cancelled a second time after the primary replica is killed.
/// </summary>
/// <remarks>
/// Flow: report an error on a healthy node, wait for the repair task to be created,
/// report OK, check that the task ends up Completed/Cancelled, kill the primary
/// replica, wait again, and check that exactly one matching cancelled task exists.
/// </remarks>
public void Onebox_Successful_ReplicationAfterRepairCancelled()
{
    const string property = "Onebox_Successful_ReplicationAfterRepairCancelled";
    MockRepairExecutor executor = Executors.First();
    LogHelper.Log("Starting test {0} for executor {1}", property, executor.Name);

    // Inject error
    Node node = GetHealthyNode(_fabricClient);
    MockWatchdog watchdog = new MockWatchdog(_fabricClient.HealthManager, node.NodeName);
    watchdog.ReportError(property);

    // Wait for PE to schedule the task
    RepairTask createdTask = PollingForCreatedTask(executor, node);
    Assert.IsNotNull(createdTask, "There must be a repair task created.");

    // Check the nullable itself. Asserting IsNotNull on the boolean expression
    // "CreatedTimestamp != null" can never fail, because a boxed bool is never null.
    Assert.IsTrue(createdTask.CreatedTimestamp.HasValue, "CreatedTimestamp must not be null");
    DateTime firstCreated = createdTask.CreatedTimestamp.Value;

    watchdog.ReportOk(property);

    // Make sure the task is cancelled
    Thread.Sleep(_minimumHealthyDuration + _actionSchedulingInterval + _repairManagerLatency);
    RepairTask cancelledTask = _fabricClient.RepairManager
        .GetRepairTaskListAsync(createdTask.TaskId, RepairTaskStateFilter.All, null)
        .Result
        .Single();
    Assert.IsTrue(
        cancelledTask.State == RepairTaskState.Completed && cancelledTask.ResultStatus == RepairTaskResult.Cancelled,
        "PE should request to cancel the task. Current State = {0}. Result Status = {1}",
        cancelledTask.State,
        cancelledTask.ResultStatus);

    // Kill the primary replica, then give the system the same settle time as above
    // before looking at the repair task list again.
    KillPrimaryReplica();
    Thread.Sleep(_minimumHealthyDuration + _actionSchedulingInterval + _repairManagerLatency);

    var repairList = _fabricClient.RepairManager
        .GetRepairTaskListAsync(createdTask.TaskId, RepairTaskStateFilter.Completed, null)
        .Result;

    // Cancelled, completed node-repair tasks for this node created at or after the first task.
    var query = from repairTask in repairList
                where repairTask.Target is NodeRepairTargetDescription &&
                      ((NodeRepairTargetDescription)repairTask.Target).Nodes.First() == node.NodeName &&
                      repairTask.State == RepairTaskState.Completed &&
                      repairTask.ResultStatus == RepairTaskResult.Cancelled &&
                      repairTask.CreatedTimestamp >= firstCreated
                select repairTask;

    Assert.IsTrue(query.Count() == 1, "There must be no duplicated cancel request");
}
/// <summary>
/// Reports an error on a healthy node and repeatedly completes the repair tasks that get
/// created, checking that each task belongs to the current or the next enabled executor
/// in <c>Executors</c> order and that the last executor is eventually reached.
/// </summary>
/// <param name="property">Health property to report on; also logged as the test name.</param>
/// <param name="reportOK">
/// When true, reports OK and then error again after each completed repair, to exercise
/// escalation when health flip-flops between repairs.
/// </param>
private void Onebox_Successful_CorrectRepairOrder(string property, bool reportOK)
{
    LogHelper.Log("Starting test {0}", property);

    // Inject error
    Node node = GetHealthyNode(_fabricClient);

    // UTC is used for all elapsed-time arithmetic in this method so that local clock
    // adjustments (e.g. daylight saving transitions) cannot skew the checks.
    DateTime lastHealthyAt = DateTime.UtcNow;
    MockWatchdog watchdog = new MockWatchdog(_fabricClient.HealthManager, node.NodeName);
    watchdog.ReportError(property);

    int currentIndex = -1;
    MockRepairExecutor executor = null;

    // timeout is 2 times the longest policy plus a little extra time to account for
    // software delays due to onebox environments
    DateTime timeout = DateTime.UtcNow + Executors.Last().ActionTime + Executors.Last().ActionTime + TimeSpan.FromMinutes(3);
    while (DateTime.UtcNow < timeout)
    {
        // Wait for PE to schedule the task
        RepairTask createdTask = PollingForAnyCreatedTask(node);
        Assert.IsNotNull(createdTask, "There should be a new repair task");

        if (executor == null || executor.IsExecutable(createdTask) == false)
        {
            // Advance to the next enabled executor. The bounds check runs before every
            // index access so that running past the end fails the test with a clear
            // message instead of an out-of-range exception.
            ++currentIndex;
            while (currentIndex < Executors.Count() && Executors[currentIndex].IsEnabled == false)
            {
                LogHelper.Log("Skipping disabled repair executor {0}", Executors[currentIndex].Name);
                ++currentIndex;
            }

            if (currentIndex >= Executors.Count())
            {
                Assert.Fail("No subsequent enabled executor");
            }
        }

        executor = Executors[currentIndex];
        Assert.IsTrue(executor.IsEnabled, "Current executor '{0}' should be enabled", executor.Name);
        Assert.IsTrue(executor.IsExecutable(createdTask), "Task should be executable by current repair executor {0}", executor.Name);
        Assert.IsTrue(
            DateTime.UtcNow - lastHealthyAt > executor.ActionTime,
            "Processing time should be bigger than executor's action time ({0} seconds)",
            executor.ActionTime.TotalSeconds);

        executor.Complete(createdTask);

        if (reportOK)
        {
            // Report OK in the middle to make sure that the escalation happens even in the
            // flip-flopping case where after a repair a machine stays healthy for a short
            // while and then reports error again.
            Thread.Sleep(_actionSchedulingInterval);
            watchdog.ReportOk(property);

            // wait for the next action to happen
            Thread.Sleep(_actionSchedulingInterval);

            // give additional time to deal with any delays
            Thread.Sleep(TimeSpan.FromMilliseconds(5));

            // Flop it back to error
            watchdog.ReportError(property);
            Thread.Sleep(TimeSpan.FromMilliseconds(5));
        }
    }

    // By going thru the loop we should have reached the last executor
    Assert.IsNotNull(executor, "The polling loop should have processed at least one repair task");
    Assert.IsTrue(executor.ActionTime == Executors.Last().ActionTime);
}