// Scenario: a repair task is created for a reported health error, then cancelled and
// deleted. After ClearHealthError is called and the error is reported again, polling
// must yield a repair task once more, which is finally completed by the executor.
public void Onebox_Successful_DeleteRepairInTheMiddle()
{
    const string testName = "Onebox_Successful_DeleteRepairInTheMiddle";
    MockRepairExecutor repairExecutor = Executors.First();
    LogHelper.Log("Starting test {0} for executor {1}", testName, repairExecutor.Name);

    // Inject error
    Node targetNode = GetHealthyNode(_fabricClient);
    MockWatchdog watchdog = new MockWatchdog(_fabricClient.HealthManager, targetNode.NodeName);
    watchdog.ReportError(testName);

    RepairTask initialTask = PollingForCreatedTask(repairExecutor, targetNode);
    Assert.IsNotNull(initialTask, "There must be a repair task created.");

    // Cancel the task; reading .Result blocks until the call finishes (the value itself is not inspected)
    long cancelResult = _fabricClient.RepairManager
        .CancelRepairTaskAsync(initialTask.TaskId, 0, true)
        .Result;

    // Delete the repair task
    MockRepairExecutor.DeleteTask(initialTask, _fabricClient.RepairManager);

    ClearHealthError(watchdog, testName, repairExecutor);

    // Report the error a second time
    watchdog.ReportError(testName);

    RepairTask recreatedTask = PollingForCreatedTask(repairExecutor, targetNode);
    Assert.IsNotNull(recreatedTask, "There must be a repair task created after deletion.");

    // Complete the repair task
    repairExecutor.Complete(recreatedTask);
}
// Reports a health error on a healthy node, waits for a repair task to be created,
// kills the primary replica, and then checks that polling again yields a task with
// the same task id as the one seen before the kill.
public void Onebox_Successful_ReplicationAfterRepairCreated()
{
    const string property = "Onebox_Successful_ReplicationAfterRepairCreated";
    MockRepairExecutor executor = Executors.First();
    LogHelper.Log("Starting test {0} for executor {1}", property, executor.Name);

    // Inject error
    Node node = GetHealthyNode(_fabricClient);
    MockWatchdog watchdog = new MockWatchdog(_fabricClient.HealthManager, node.NodeName);
    watchdog.ReportError(property);

    // Wait for PE to schedule the task
    RepairTask createdTask = PollingForCreatedTask(executor, node);
    Assert.IsNotNull(createdTask, "There must be a repair task created.");

    KillPrimaryReplica();

    RepairTask newTask = PollingForCreatedTask(executor, node);

    // Polling can come back empty; fail with a readable assertion rather than a
    // NullReferenceException on the TaskId dereference below.
    Assert.IsNotNull(newTask, "There must be a repair task after the primary replica is killed.");
    Assert.IsTrue(
        newTask.TaskId.Equals(createdTask.TaskId, StringComparison.OrdinalIgnoreCase),
        "The repair task should be the same created before");
}
// Scenario: a health error is reported and then cleared via ClearHealthError.
// The node must aggregate to Ok afterwards, and polling for a created task must
// return null for every enabled executor.
public void Onebox_Successful_SelfRepairNoCreatedTask()
{
    const string testName = "Onebox_Successful_SelfRepairNoCreatedTask";
    LogHelper.Log("Starting test {0}", testName);

    Node targetNode = GetHealthyNode(_fabricClient);

    // Inject error
    MockWatchdog watchdog = new MockWatchdog(_fabricClient.HealthManager, targetNode.NodeName);
    watchdog.ReportError(testName);

    ClearHealthError(watchdog, testName, Executors.First());

    var healthAfterClear = _fabricClient.HealthManager.GetNodeHealthAsync(targetNode.NodeName).Result;
    bool nodeIsOk = healthAfterClear.AggregatedHealthState == HealthState.Ok;
    Assert.IsTrue(nodeIsOk, "Node health must be Ok after health error event is cleared");

    // Check all executors concurrently; disabled executors are not polled.
    Parallel.ForEach(Executors, candidate =>
    {
        if (candidate.IsEnabled)
        {
            RepairTask scheduledTask = PollingForCreatedTask(candidate, targetNode);
            Assert.IsNull(scheduledTask, "There must be no scheduled task in case of self-repair");
        }
    });
}
// Scenario: a repair task is created for a reported health error and approved by the
// executor; ClearHealthError is then called. After waiting _repairManagerLatency the
// task must still be uniquely retrievable by id and must carry the CancelRequested flag.
public void Onebox_Successful_SelfRepairAfterTaskIsApproved()
{
    // Property name matches the test name so logs and health reports identify this test.
    const string property = "Onebox_Successful_SelfRepairAfterTaskIsApproved";
    MockRepairExecutor executor = Executors.First();
    LogHelper.Log("Starting test {0}", property);

    Node node = GetHealthyNode(_fabricClient);

    // Inject error
    MockWatchdog watchdog = new MockWatchdog(_fabricClient.HealthManager, node.NodeName);
    watchdog.ReportError(property);

    RepairTask createdTask = PollingForCreatedTask(executor, node);
    Assert.IsNotNull(createdTask, "There must be a repair task created.");

    executor.Approve(ref createdTask);

    ClearHealthError(watchdog, property, executor);

    // Make sure the task is cancelled
    Thread.Sleep(_repairManagerLatency);
    RepairTaskList repairTaskList = _fabricClient.RepairManager
        .GetRepairTaskListAsync(createdTask.TaskId, RepairTaskStateFilter.All, null)
        .Result;
    Assert.IsTrue(
        repairTaskList.Count == 1,
        "There must be a unique repair task with task id = {0}",
        createdTask.TaskId);

    RepairTask cancelledTask = repairTaskList.First();

    // Flags is a bit field: test the CancelRequested bit instead of requiring that
    // no other flag is set alongside it.
    bool cancelRequested =
        (cancelledTask.Flags & RepairTaskFlags.CancelRequested) == RepairTaskFlags.CancelRequested;
    Assert.IsTrue(cancelRequested, "PE should request cancellation of the task");
}
// Scenario: a repair task is created and completed, then the primary replica is killed.
// The node's "RepairPolicyEngineService::RepairHistory" health event is polled for up to
// five minutes until its description changes; the newest entry must name the executor
// and carry a timestamp not earlier than the moment the task was observed.
public void Onebox_Successful_ReplicationAfterRepairCompleted()
{
    // Property name matches the test name so it does not collide with the
    // ReplicationAfterRepairCancelled test and logs identify this test correctly.
    const string property = "Onebox_Successful_ReplicationAfterRepairCompleted";
    const string repairHistoryProperty = "RepairPolicyEngineService::RepairHistory";

    MockRepairExecutor executor = Executors.First();
    LogHelper.Log("Starting test {0} for executor {1}", property, executor.Name);

    // Inject error
    Node node = GetHealthyNode(_fabricClient);
    MockWatchdog watchdog = new MockWatchdog(_fabricClient.HealthManager, node.NodeName);
    watchdog.ReportError(property);

    // Snapshot the repair history before the repair so a change can be detected later
    var nodeHealth = _fabricClient.HealthManager.GetNodeHealthAsync(node.NodeName).Result;
    var oldRepairHistory = nodeHealth.HealthEvents.Single(
        e => e.HealthInformation.Property.Equals(repairHistoryProperty, StringComparison.OrdinalIgnoreCase));

    // Wait for PE to schedule the task
    RepairTask createdTask = PollingForCreatedTask(executor, node);
    Assert.IsNotNull(createdTask, "There must be a repair task created.");

    DateTime scheduledAt = DateTime.Now;

    executor.Complete(createdTask);

    KillPrimaryReplica();

    LogHelper.Log("Waiting for the RepairHistory in health store to have the latest repair");

    // Make sure the RepairHistory has the entry within 5 minutes
    string newRepairDescription = null;
    TimeSpan timeout = TimeSpan.FromMinutes(5);
    TimeSpan sleepTime = TimeSpan.FromSeconds(10);
    while (timeout >= TimeSpan.Zero)
    {
        nodeHealth = _fabricClient.HealthManager.GetNodeHealthAsync(node.NodeName).Result;
        var repairHistory = nodeHealth.HealthEvents.Single(
            e => e.HealthInformation.Property.Equals(repairHistoryProperty, StringComparison.OrdinalIgnoreCase));

        bool historyChanged = !repairHistory.HealthInformation.Description.Equals(
            oldRepairHistory.HealthInformation.Description,
            StringComparison.OrdinalIgnoreCase);
        if (historyChanged)
        {
            string[] descriptions = repairHistory.HealthInformation.Description.Split(';');
            newRepairDescription = descriptions[descriptions.Length - 2]; // Last element is empty
            break;
        }

        Thread.Sleep(sleepTime);
        timeout -= sleepTime;
    }

    Assert.IsFalse(String.IsNullOrEmpty(newRepairDescription), "There must be a new repair description");

    // The entry is split on '=': first part is compared to the executor name,
    // last part is parsed as the scheduling time.
    string[] entryParts = newRepairDescription.Split('=');

    string repairType = entryParts.First();
    Assert.IsTrue(
        repairType.Equals(executor.Name, StringComparison.OrdinalIgnoreCase),
        "Repair type is the same as executor name");

    DateTime repairScheduledAt = DateTime.Parse(entryParts.Last());
    Assert.IsTrue(repairScheduledAt >= scheduledAt, "There should be a new entry in repair history");
}
// Scenario: a repair task is created for a reported error, then the watchdog reports Ok.
// After a settling delay the task must be Completed with result Cancelled. The primary
// replica is then killed and, after the same delay, exactly one completed-and-cancelled
// task for this node (created at or after the first task) must be listed for the task id.
public void Onebox_Successful_ReplicationAfterRepairCancelled()
{
    const string property = "Onebox_Successful_ReplicationAfterRepairCancelled";
    MockRepairExecutor executor = Executors.First();
    LogHelper.Log("Starting test {0} for executor {1}", property, executor.Name);

    // Inject error
    Node node = GetHealthyNode(_fabricClient);
    MockWatchdog watchdog = new MockWatchdog(_fabricClient.HealthManager, node.NodeName);
    watchdog.ReportError(property);

    // Wait for PE to schedule the task
    RepairTask createdTask = PollingForCreatedTask(executor, node);
    Assert.IsNotNull(createdTask, "There must be a repair task created.");

    // Check the nullable itself; asserting on the boxed result of "!= null" could never fail.
    Assert.IsTrue(createdTask.CreatedTimestamp.HasValue, "CreatedTimestamp must not be null");
    DateTime firstCreated = createdTask.CreatedTimestamp.Value;

    watchdog.ReportOk(property);

    // Delay used both before checking the cancellation and after killing the primary
    var settleTime = _minimumHealthyDuration + _actionSchedulingInterval + _repairManagerLatency;

    // Make sure the task is cancelled
    Thread.Sleep(settleTime);
    RepairTask cancelledTask = _fabricClient.RepairManager
        .GetRepairTaskListAsync(createdTask.TaskId, RepairTaskStateFilter.All, null)
        .Result
        .Single();
    Assert.IsTrue(
        cancelledTask.State == RepairTaskState.Completed && cancelledTask.ResultStatus == RepairTaskResult.Cancelled,
        "PE should request to cancel the task. Current State = {0}. Result Status = {1}",
        cancelledTask.State,
        cancelledTask.ResultStatus);

    KillPrimaryReplica();

    Thread.Sleep(settleTime);

    var repairList = _fabricClient.RepairManager
        .GetRepairTaskListAsync(createdTask.TaskId, RepairTaskStateFilter.Completed, null)
        .Result;

    var query = from repairTask in repairList
                where repairTask.Target is NodeRepairTargetDescription &&
                      ((NodeRepairTargetDescription)repairTask.Target).Nodes.First() == node.NodeName &&
                      repairTask.State == RepairTaskState.Completed &&
                      repairTask.ResultStatus == RepairTaskResult.Cancelled &&
                      repairTask.CreatedTimestamp >= firstCreated
                select repairTask;

    Assert.IsTrue(query.Count() == 1, "There must be no duplicated cancel request");
}
// Scenario: reporting a single health error on a healthy node must result in a
// repair task being found by polling for the first executor.
public void Onebox_Successful_ScheduleSingleRepair()
{
    const string testName = "Onebox_Successful_ScheduleSingleRepair";
    MockRepairExecutor repairExecutor = Executors.First();
    LogHelper.Log("Starting test {0} for executor {1}", testName, repairExecutor.Name);

    // Inject error
    Node targetNode = GetHealthyNode(_fabricClient);
    MockWatchdog watchdog = new MockWatchdog(_fabricClient.HealthManager, targetNode.NodeName);
    watchdog.ReportError(testName);

    RepairTask scheduledTask = PollingForCreatedTask(repairExecutor, targetNode);
    Assert.IsNotNull(scheduledTask, "There must be a repair task created.");
}
// Scenario: an error is reported under the "RequestRepair" property with the second
// executor's name as the description; polling must then find a repair task for
// that executor.
public void Onebox_Successful_SingleFastTrackRepair()
{
    const string test = "Onebox_Successful_SingleFastTrackRepair";
    const string property = "RequestRepair";

    // Use reimage executor
    MockRepairExecutor executor = Executors.Skip(1).First();

    // Logged through LogHelper like the other tests in this file, not the console.
    LogHelper.Log("Starting test {0} for executor {1}", test, executor.Name);

    string description = executor.Name;

    // Inject error
    Node node = GetHealthyNode(_fabricClient);
    MockWatchdog watchdog = new MockWatchdog(_fabricClient.HealthManager, node.NodeName);
    watchdog.ReportError(property, description);

    RepairTask createdTask = PollingForCreatedTask(executor, node);
    Assert.IsNotNull(createdTask, "There must be a repair task created.");
}
// Scenario: after a health error is reported and a repair task has been created,
// the first executor's Process method is invoked on that task.
public void Onebox_Successful_CorrectRepairStateTransition()
{
    const string testName = "Onebox_Successful_CorrectRepairStateTransition";
    LogHelper.Log("Starting test {0}", testName);

    Node targetNode = GetHealthyNode(_fabricClient);
    MockWatchdog watchdog = new MockWatchdog(_fabricClient.HealthManager, targetNode.NodeName);
    MockRepairExecutor repairExecutor = Executors.First();

    // Inject error
    watchdog.ReportError(testName);

    // Wait for PE to schedule the task
    RepairTask scheduledTask = PollingForCreatedTask(repairExecutor, targetNode);
    Assert.IsNotNull(scheduledTask, "There must be a repair task created.");

    repairExecutor.Process(ref scheduledTask);
}
// Shared body for the repair-order tests. Reports an error on a healthy node and, until
// the timeout expires, repeatedly waits for a repair task, advances through Executors
// (skipping disabled ones) whenever the current executor cannot execute the task, checks
// the elapsed time against the executor's ActionTime, and completes the task.
//
// property: health property name used for the watchdog reports and the start log line.
// reportOK: when true, the watchdog reports Ok and then Error again after each completed
//           repair to exercise the flip-flopping case.
private void Onebox_Successful_CorrectRepairOrder(string property, bool reportOK)
{
    LogHelper.Log("Starting test {0}", property);

    // Inject error
    Node node = GetHealthyNode(_fabricClient);

    // UTC is used for all clock reads in this method so a local-time shift during the
    // run cannot skew the elapsed-time checks; the values are only compared to each other.
    DateTime lastHealthyAt = DateTime.UtcNow;
    MockWatchdog watchdog = new MockWatchdog(_fabricClient.HealthManager, node.NodeName);
    watchdog.ReportError(property);

    int currentIndex = -1;
    MockRepairExecutor executor = null;

    // timeout is 2 times the longest policy plus a little extra time to account for software delays due to onebox environments
    DateTime timeout = DateTime.UtcNow + Executors.Last().ActionTime + Executors.Last().ActionTime + TimeSpan.FromMinutes(3);
    while (DateTime.UtcNow < timeout)
    {
        // Wait for PE to schedule the task
        RepairTask createdTask = PollingForAnyCreatedTask(node);
        Assert.IsNotNull(createdTask, "There should be a new repair task");

        if (executor == null || executor.IsExecutable(createdTask) == false)
        {
            // Move on to the next enabled executor. The bounds check runs before every
            // index access so running off the end fails the test with a clear message
            // instead of an out-of-range exception.
            ++currentIndex;
            while (currentIndex < Executors.Count() && Executors[currentIndex].IsEnabled == false)
            {
                LogHelper.Log("Skipping disabled repair executor {0}", Executors[currentIndex].Name);
                ++currentIndex;
            }

            if (currentIndex >= Executors.Count())
            {
                Assert.Fail("No subsequent enabled executor");
            }
        }

        executor = Executors[currentIndex];

        Assert.IsTrue(executor.IsEnabled, "Current executor '{0}' should be enabled", executor.Name);
        Assert.IsTrue(executor.IsExecutable(createdTask), "Task should be executable by current repair executor {0}", executor.Name);
        Assert.IsTrue(
            DateTime.UtcNow - lastHealthyAt > executor.ActionTime,
            "Processing time should be bigger than executor's action time ({0} seconds)",
            executor.ActionTime.TotalSeconds);

        executor.Complete(createdTask);

        if (reportOK)
        {
            // Report OK in the middle to make sure that the escalation happens even in the flip
            // flopping case where after a repair a machine stays healthy for a short while and then reports error again.
            Thread.Sleep(_actionSchedulingInterval);
            watchdog.ReportOk(property);

            // wait for the next action to happen
            Thread.Sleep(_actionSchedulingInterval);

            // give additional time to deal with any delays
            Thread.Sleep(TimeSpan.FromMilliseconds(5));

            // Flop it back to error
            watchdog.ReportError(property);
            Thread.Sleep(TimeSpan.FromMilliseconds(5));
        }
    }

    // By going thru the loop we should have reached the last executor
    Assert.IsNotNull(executor, "At least one repair task should have been processed before the timeout");
    Assert.IsTrue(executor.ActionTime == Executors.Last().ActionTime);
}