/// <summary>
/// Reports the given health property as OK through the watchdog, then blocks
/// so that the internal PE state has time to go back to Healthy.
/// </summary>
/// <param name="watchdog">Watchdog used to publish the OK report.</param>
/// <param name="property">Health property that is being cleared.</param>
/// <param name="executor">Executor whose ProbationToHealthyPostRepair wait is also honored.</param>
private void ClearHealthError(MockWatchdog watchdog, string property, MockRepairExecutor executor)
{
    watchdog.ReportOk(property);

    // Make sure that the internal PE state is also back to Healthy
    var healthyWait = _minimumHealthyDuration + _actionSchedulingInterval;
    Thread.Sleep(healthyWait);

    // Then wait out the executor's post-repair probation period
    var probationWait = executor.ProbationToHealthyPostRepair;
    Thread.Sleep(probationWait);
}
/// <summary>
/// Reports a health error on a healthy node, waits for a repair task to be created,
/// calls KillPrimaryReplica(), and then checks that the repair task found for the
/// node afterwards has the same task id as the one created before.
/// </summary>
public void Onebox_Successful_ReplicationAfterRepairCreated()
{
    const string property = "Onebox_Successful_ReplicationAfterRepairCreated";
    MockRepairExecutor executor = Executors.First();
    LogHelper.Log("Starting test {0} for executor {1}", property, executor.Name);

    // Inject error
    Node node = GetHealthyNode(_fabricClient);
    MockWatchdog watchdog = new MockWatchdog(_fabricClient.HealthManager, node.NodeName);
    watchdog.ReportError(property);

    // Wait for PE to schedule the task
    RepairTask createdTask = PollingForCreatedTask(executor, node);
    Assert.IsNotNull(createdTask, "There must be a repair task created.");

    KillPrimaryReplica();

    // PollingForCreatedTask returns null on timeout; fail with a clear message
    // instead of a NullReferenceException on newTask.TaskId below.
    RepairTask newTask = PollingForCreatedTask(executor, node);
    Assert.IsNotNull(newTask, "There must still be a repair task after the primary replica is killed.");
    Assert.IsTrue(
        newTask.TaskId.Equals(createdTask.TaskId, StringComparison.OrdinalIgnoreCase),
        "The repair task should be the same created before");
}
/// <summary>
/// Reports a health error, waits for the repair task, cancels and deletes that task,
/// clears the health error, reports the error again, and checks that a repair task
/// is created once more. The newly created task is then completed.
/// </summary>
public void Onebox_Successful_DeleteRepairInTheMiddle()
{
    const string property = "Onebox_Successful_DeleteRepairInTheMiddle";
    MockRepairExecutor executor = Executors.First();
    LogHelper.Log("Starting test {0} for executor {1}", property, executor.Name);

    // Inject error
    Node node = GetHealthyNode(_fabricClient);
    MockWatchdog watchdog = new MockWatchdog(_fabricClient.HealthManager, node.NodeName);
    watchdog.ReportError(property);

    RepairTask createdTask = PollingForCreatedTask(executor, node);
    Assert.IsNotNull(createdTask, "There must be a repair task created.");

    // Cancel the task; block until the request finishes (the returned value is not needed)
    _fabricClient.RepairManager.CancelRepairTaskAsync(createdTask.TaskId, 0, true).Wait();

    // Delete the repair task
    MockRepairExecutor.DeleteTask(createdTask, _fabricClient.RepairManager);

    ClearHealthError(watchdog, property, executor);

    // Report Error
    watchdog.ReportError(property);
    createdTask = PollingForCreatedTask(executor, node);
    Assert.IsNotNull(createdTask, "There must be a repair task created after deletion.");

    // Complete the repair task
    executor.Complete(createdTask);
}
/// <summary>
/// Reports a health error, waits for the repair task, approves it through the executor,
/// then clears the health error and checks that the task carries the CancelRequested flag.
/// </summary>
public void Onebox_Successful_SelfRepairAfterTaskIsApproved()
{
    // Property doubles as the logged test name, so it must match this method
    const string property = "Onebox_Successful_SelfRepairAfterTaskIsApproved";
    MockRepairExecutor executor = Executors.First();
    LogHelper.Log("Starting test {0}", property);

    Node node = GetHealthyNode(_fabricClient);

    // Inject error
    MockWatchdog watchdog = new MockWatchdog(_fabricClient.HealthManager, node.NodeName);
    watchdog.ReportError(property);

    RepairTask createdTask = PollingForCreatedTask(executor, node);
    Assert.IsNotNull(createdTask, "There must be a repair task created.");

    executor.Approve(ref createdTask);

    ClearHealthError(watchdog, property, executor);

    // Make sure the task is cancelled
    Thread.Sleep(_repairManagerLatency);
    RepairTaskList repairTaskList = _fabricClient.RepairManager
        .GetRepairTaskListAsync(createdTask.TaskId, RepairTaskStateFilter.All, null)
        .Result;
    Assert.IsTrue(
        repairTaskList.Count == 1,
        "There must be a unique repair task with task id = {0}",
        createdTask.TaskId);

    RepairTask cancelledTask = repairTaskList.First();
    Assert.IsTrue(
        cancelledTask.Flags == RepairTaskFlags.CancelRequested,
        "PE should request cancellation of the task");
}
/// <summary>
/// Reports a health error, waits for the repair task, completes it, calls
/// KillPrimaryReplica(), and then polls the node health for up to five minutes until
/// the "RepairPolicyEngineService::RepairHistory" event description changes. The newest
/// history entry must name the executor and carry a timestamp that is not earlier than
/// the moment recorded just before the task was completed.
/// </summary>
public void Onebox_Successful_ReplicationAfterRepairCompleted()
{
    // Property doubles as the logged test name, so it must match this method
    const string property = "Onebox_Successful_ReplicationAfterRepairCompleted";
    const string repairHistoryProperty = "RepairPolicyEngineService::RepairHistory";

    MockRepairExecutor executor = Executors.First();
    LogHelper.Log("Starting test {0} for executor {1}", property, executor.Name);

    // Inject error
    Node node = GetHealthyNode(_fabricClient);
    MockWatchdog watchdog = new MockWatchdog(_fabricClient.HealthManager, node.NodeName);
    watchdog.ReportError(property);

    // Snapshot the repair history so a later change can be detected
    var nodeHealth = _fabricClient.HealthManager.GetNodeHealthAsync(node.NodeName).Result;
    var oldRepairHistory = nodeHealth.HealthEvents.Single(
        e => e.HealthInformation.Property.Equals(repairHistoryProperty, StringComparison.OrdinalIgnoreCase));

    // Wait for PE to schedule the task
    RepairTask createdTask = PollingForCreatedTask(executor, node);
    Assert.IsNotNull(createdTask, "There must be a repair task created.");

    DateTime scheduledAt = DateTime.Now;
    executor.Complete(createdTask);

    KillPrimaryReplica();

    LogHelper.Log("Waiting for the RepairHistory in health store to have the latest repair");

    // Make sure the RepairHistory has the entry within 5 minutes
    string newRepairDescription = null;
    TimeSpan sleepTime = TimeSpan.FromSeconds(10);
    TimeSpan timeout = TimeSpan.FromMinutes(5);
    while (timeout >= TimeSpan.Zero)
    {
        nodeHealth = _fabricClient.HealthManager.GetNodeHealthAsync(node.NodeName).Result;
        var repairHistory = nodeHealth.HealthEvents.Single(
            e => e.HealthInformation.Property.Equals(repairHistoryProperty, StringComparison.OrdinalIgnoreCase));

        if (!repairHistory.HealthInformation.Description.Equals(
                oldRepairHistory.HealthInformation.Description,
                StringComparison.OrdinalIgnoreCase))
        {
            string[] descriptions = repairHistory.HealthInformation.Description.Split(';');

            // Last element is empty, so the newest entry is the second to last.
            // Guard the index so a malformed description fails the assertion below
            // rather than throwing IndexOutOfRangeException here.
            if (descriptions.Length >= 2)
            {
                newRepairDescription = descriptions[descriptions.Length - 2];
            }

            break;
        }

        Thread.Sleep(sleepTime);
        timeout -= sleepTime;
    }

    Assert.IsFalse(String.IsNullOrEmpty(newRepairDescription), "There must be a new repair description");

    // Entry is split on '=': first part is the repair type, last part is the timestamp
    string repairType = newRepairDescription.Split('=').First();
    Assert.IsTrue(
        repairType.Equals(executor.Name, StringComparison.OrdinalIgnoreCase),
        "Repair type is the same as executor name");

    DateTime repairScheduledAt = DateTime.Parse(newRepairDescription.Split('=').Last());
    Assert.IsTrue(repairScheduledAt >= scheduledAt, "There should be a new entry in repair history");
}
/// <summary>
/// Clears the watchdog error on every node returned by the query manager and then
/// cancels the repair tasks of all known executors.
/// </summary>
public static void TestCleanup()
{
    LogHelper.Log("TestCleanup()");

    // Clear health store
    var clusterNodes = _fabricClient.QueryManager.GetNodeListAsync().Result;
    foreach (var clusterNode in clusterNodes)
    {
        var nodeWatchdog = new MockWatchdog(_fabricClient.HealthManager, clusterNode.NodeName);
        nodeWatchdog.ClearError();
    }

    // Cancel existing tasks
    MockRepairExecutor.CancelAll(Executors, _fabricClient.RepairManager);
}
/// <summary>
/// Reports a health error, waits for the repair task, reports OK again and checks that
/// the task ends up Completed with result Cancelled. After calling KillPrimaryReplica()
/// and waiting again, checks that exactly one completed/cancelled task exists for the
/// node that was created at or after the first task's creation time.
/// </summary>
public void Onebox_Successful_ReplicationAfterRepairCancelled()
{
    const string property = "Onebox_Successful_ReplicationAfterRepairCancelled";
    MockRepairExecutor executor = Executors.First();
    LogHelper.Log("Starting test {0} for executor {1}", property, executor.Name);

    // Inject error
    Node node = GetHealthyNode(_fabricClient);
    MockWatchdog watchdog = new MockWatchdog(_fabricClient.HealthManager, node.NodeName);
    watchdog.ReportError(property);

    // Wait for PE to schedule the task
    RepairTask createdTask = PollingForCreatedTask(executor, node);
    Assert.IsNotNull(createdTask, "There must be a repair task created.");

    // Check the nullable itself; asserting IsNotNull on a boolean expression can never fail
    Assert.IsTrue(createdTask.CreatedTimestamp.HasValue, "CreatedTimestamp must not be null");
    DateTime firstCreated = createdTask.CreatedTimestamp.Value;

    watchdog.ReportOk(property);

    // Make sure the task is cancelled
    Thread.Sleep(_minimumHealthyDuration + _actionSchedulingInterval + _repairManagerLatency);
    RepairTask cancelledTask = _fabricClient.RepairManager
        .GetRepairTaskListAsync(createdTask.TaskId, RepairTaskStateFilter.All, null)
        .Result
        .Single();
    Assert.IsTrue(
        cancelledTask.State == RepairTaskState.Completed && cancelledTask.ResultStatus == RepairTaskResult.Cancelled,
        "PE should request to cancel the task. Current State = {0}. Result Status = {1}",
        cancelledTask.State,
        cancelledTask.ResultStatus);

    KillPrimaryReplica();

    Thread.Sleep(_minimumHealthyDuration + _actionSchedulingInterval + _repairManagerLatency);

    var repairList = _fabricClient.RepairManager
        .GetRepairTaskListAsync(createdTask.TaskId, RepairTaskStateFilter.Completed, null)
        .Result;

    // Cancelled tasks targeting this node that were created no earlier than the first one
    var query = from repairTask in repairList
                where repairTask.Target is NodeRepairTargetDescription &&
                      ((NodeRepairTargetDescription)repairTask.Target).Nodes.First() == node.NodeName &&
                      repairTask.State == RepairTaskState.Completed &&
                      repairTask.ResultStatus == RepairTaskResult.Cancelled &&
                      repairTask.CreatedTimestamp >= firstCreated
                select repairTask;

    Assert.IsTrue(query.Count() == 1, "There must be no duplicated cancel request");
}
/// <summary>
/// Reports a health error on a healthy node and checks that a repair task is
/// created for the first executor.
/// </summary>
public void Onebox_Successful_ScheduleSingleRepair()
{
    const string property = "Onebox_Successful_ScheduleSingleRepair";

    MockRepairExecutor executor = Executors.First();
    LogHelper.Log("Starting test {0} for executor {1}", property, executor.Name);

    // Inject error
    Node node = GetHealthyNode(_fabricClient);
    var watchdog = new MockWatchdog(_fabricClient.HealthManager, node.NodeName);
    watchdog.ReportError(property);

    // The polling helper yields null when nothing shows up before its timeout
    RepairTask scheduledTask = PollingForCreatedTask(executor, node);
    Assert.IsNotNull(scheduledTask, "There must be a repair task created.");
}
/// <summary>
/// Reports an error on the "RequestRepair" health property, using the second executor's
/// name as the description, and checks that a repair task is created for that executor.
/// </summary>
public void Onebox_Successful_SingleFastTrackRepair()
{
    const string test = "Onebox_Successful_SingleFastTrackRepair";
    const string property = "RequestRepair";

    // Use reimage executor
    MockRepairExecutor executor = Executors.Skip(1).First();

    // Log through LogHelper like every other test in this class
    LogHelper.Log("Starting test {0} for executor {1}", test, executor.Name);

    string description = executor.Name;

    // Inject error
    Node node = GetHealthyNode(_fabricClient);
    MockWatchdog watchdog = new MockWatchdog(_fabricClient.HealthManager, node.NodeName);
    watchdog.ReportError(property, description);

    RepairTask createdTask = PollingForCreatedTask(executor, node);
    Assert.IsNotNull(createdTask, "There must be a repair task created.");
}
/// <summary>
/// Reports a health error, waits for the repair task to be created for the first
/// executor, and then hands the task to the executor's Process method.
/// </summary>
public void Onebox_Successful_CorrectRepairStateTransition()
{
    const string property = "Onebox_Successful_CorrectRepairStateTransition";
    LogHelper.Log("Starting test {0}", property);

    Node node = GetHealthyNode(_fabricClient);
    var watchdog = new MockWatchdog(_fabricClient.HealthManager, node.NodeName);
    MockRepairExecutor executor = Executors.First();

    // Inject error
    watchdog.ReportError(property);

    // Wait for PE to schedule the task
    RepairTask pendingTask = PollingForCreatedTask(executor, node);
    Assert.IsNotNull(pendingTask, "There must be a repair task created.");

    executor.Process(ref pendingTask);
}
/// <summary>
/// Repeatedly asks the executor for a created repair task on the given node until one
/// is returned or the timeout elapses.
/// </summary>
/// <param name="executor">Executor that is queried for its created task.</param>
/// <param name="node">Node whose name is passed to the executor query.</param>
/// <returns>The created repair task, or null when none appeared before the timeout.</returns>
private static RepairTask PollingForCreatedTask(MockRepairExecutor executor, Node node)
{
    LogHelper.Log("Polling for created task for executor '{0}' and node '{1}'", executor.Name, node.NodeName);

    TimeSpan timeout = executor.ActionTime + _probationToFailing + _actionSchedulingInterval + _repairManagerLatency;

    // Give a little extra time to account for software delays due to onebox environments
    timeout = timeout.Add(TimeSpan.FromMinutes(2));

    // Use UTC so a DST or time-zone shift during the wait cannot stretch or cut the timeout
    DateTime deadline = DateTime.UtcNow + timeout;
    while (DateTime.UtcNow <= deadline)
    {
        RepairTask task = executor.GetCreatedTask(node.NodeName);
        if (task != null)
        {
            return task;
        }

        Thread.Sleep(_actionSchedulingInterval);
    }

    return null;
}
/// <summary>
/// Rebuilds the Executors list from the XML settings. The first "Section" element named
/// "NodeRepairActionList" is located, one MockRepairExecutor is created per descendant
/// element of that section, and the list is finally sorted by ascending ActionTime.
/// </summary>
/// <param name="xmlSettings">Settings document that holds the repair action sections.</param>
private static void LoadExecutors(XDocument xmlSettings)
{
    Executors = new List<MockRepairExecutor>();

    var repairActionSections = xmlSettings.Descendants()
        .Where(candidate => candidate.Name.LocalName == "Section"
                            && candidate.Attribute("Name").Value == "NodeRepairActionList");

    foreach (var actionEntry in repairActionSections.First().Descendants())
    {
        // The entry's Value attribute names the section holding this action's parameters
        string sectionName = actionEntry.Attribute("Value").Value;
        string executorName = actionEntry.Attribute("Name").Value;

        TimeSpan probationToHealthy = GetParameterTimeSpan(
            xmlSettings, sectionName, "ProbationToHealthyWaitDurationPostRepairInSeconds");
        TimeSpan actionTime = GetParameterTimeSpan(
            xmlSettings, sectionName, "PolicyActionTimeInSeconds");
        bool enabled = Convert.ToBoolean(GetParameter(xmlSettings, sectionName, "IsEnabled"));

        Executors.Add(new MockRepairExecutor(
            executorName,
            actionTime,
            enabled,
            probationToHealthy,
            _fabricClient.RepairManager));
    }

    Executors.Sort((left, right) => left.ActionTime.CompareTo(right.ActionTime));
}
/// <summary>
/// Reports a health error and then, until the timeout expires, keeps waiting for repair
/// tasks and completing them, checking that each task is executable by the current
/// enabled executor (walking the Executors list in order) and that more time than the
/// executor's ActionTime has passed since the start. At the end the current executor
/// must have the same ActionTime as the last executor in the list.
/// </summary>
/// <param name="property">Health property to report; also used as the logged test name.</param>
/// <param name="reportOK">
/// When true, the property is briefly reported OK and then as error again after each
/// completed repair, to exercise the flip-flopping case.
/// </param>
private void Onebox_Successful_CorrectRepairOrder(string property, bool reportOK)
{
    LogHelper.Log("Starting test {0}", property);

    // Inject error
    Node node = GetHealthyNode(_fabricClient);
    DateTime lastHealthyAt = DateTime.UtcNow;
    MockWatchdog watchdog = new MockWatchdog(_fabricClient.HealthManager, node.NodeName);
    watchdog.ReportError(property);

    int currentIndex = -1;
    MockRepairExecutor executor = null;

    // Timeout is 2 times the longest policy plus a little extra time to account for
    // software delays due to onebox environments. UTC keeps the measurement immune to
    // DST or time-zone shifts.
    DateTime timeout = DateTime.UtcNow + Executors.Last().ActionTime + Executors.Last().ActionTime + TimeSpan.FromMinutes(3);

    while (DateTime.UtcNow < timeout)
    {
        // Wait for PE to schedule the task
        RepairTask createdTask = PollingForAnyCreatedTask(node);
        Assert.IsNotNull(createdTask, "There should be a new repair task");

        if (executor == null || executor.IsExecutable(createdTask) == false)
        {
            // Move on to the next enabled executor. The bounds are checked before each
            // index access so that running out of executors fails the test with a clear
            // message instead of throwing ArgumentOutOfRangeException.
            ++currentIndex;
            while (currentIndex < Executors.Count && Executors[currentIndex].IsEnabled == false)
            {
                LogHelper.Log("Skipping disabled repair executor {0}", Executors[currentIndex].Name);
                ++currentIndex;
            }

            if (currentIndex >= Executors.Count)
            {
                Assert.Fail("No subsequent enabled executor");
            }
        }

        executor = Executors[currentIndex];
        Assert.IsTrue(executor.IsEnabled, "Current executor '{0}' should be enabled", executor.Name);
        Assert.IsTrue(executor.IsExecutable(createdTask), "Task should be executable by current repair executor {0}", executor.Name);
        Assert.IsTrue(
            DateTime.UtcNow - lastHealthyAt > executor.ActionTime,
            "Processing time should be bigger than executor's action time ({0} seconds)",
            executor.ActionTime.TotalSeconds);

        executor.Complete(createdTask);

        if (reportOK)
        {
            // Report OK in the middle to make sure that the escalation happens even in the
            // flip-flopping case where after a repair a machine stays healthy for a short
            // while and then reports error again.
            Thread.Sleep(_actionSchedulingInterval);
            watchdog.ReportOk(property);

            // wait for the next action to happen
            Thread.Sleep(_actionSchedulingInterval);

            // give additional time to deal with any delays
            Thread.Sleep(TimeSpan.FromMilliseconds(5));

            // Flop it back to error
            watchdog.ReportError(property);
            Thread.Sleep(TimeSpan.FromMilliseconds(5));
        }
    }

    // By going thru the loop we should have reached the last executor
    Assert.IsNotNull(executor, "At least one repair task should have been processed");
    Assert.IsTrue(executor.ActionTime == Executors.Last().ActionTime);
}