// Scenario: a repair task is cancelled and deleted while it is still in flight.
// After the health error is cleared and reported again, a fresh repair task must be
// created for the same node, and that task is then completed by the executor.
public void Onebox_Successful_DeleteRepairInTheMiddle()
{
    const string testName = "Onebox_Successful_DeleteRepairInTheMiddle";

    var repairExecutor = Executors.First();
    LogHelper.Log("Starting test {0} for executor {1}", testName, repairExecutor.Name);

    // Put a healthy node into an error state through the mock watchdog.
    var targetNode = GetHealthyNode(_fabricClient);
    var mockWatchdog = new MockWatchdog(_fabricClient.HealthManager, targetNode.NodeName);
    mockWatchdog.ReportError(testName);

    var firstTask = PollingForCreatedTask(repairExecutor, targetNode);
    Assert.IsNotNull(firstTask, "There must be a repair task created.");

    // Cancel the task and block until the cancellation call has finished; the returned
    // value is not needed.
    // NOTE(review): arguments are presumably (taskId, version = 0, requestAbort = true) - confirm against the RepairManager API.
    _fabricClient.RepairManager.CancelRepairTaskAsync(firstTask.TaskId, 0, true).Wait();

    // Remove the cancelled task, then clear the health error that triggered it.
    MockRepairExecutor.DeleteTask(firstTask, _fabricClient.RepairManager);
    ClearHealthError(mockWatchdog, testName, repairExecutor);

    // Raise the same error again and expect a replacement task.
    mockWatchdog.ReportError(testName);
    var replacementTask = PollingForCreatedTask(repairExecutor, targetNode);
    Assert.IsNotNull(replacementTask, "There must be a repair task created after deletion.");

    // Finish the replacement repair.
    repairExecutor.Complete(replacementTask);
}
// Verifies that completing a repair adds a new entry to the node's
// "RepairPolicyEngineService::RepairHistory" health event, and that the entry is
// observable after KillPrimaryReplica() has been called.
// The test polls the node health for up to 5 minutes (in 10 second steps) waiting for the
// history description to differ from the one captured before the repair.
public void Onebox_Successful_ReplicationAfterRepairCompleted()
{
    // Matches the test name so the log line and the reported health property identify this test.
    const string property = "Onebox_Successful_ReplicationAfterRepairCompleted";
    const string repairHistoryProperty = "RepairPolicyEngineService::RepairHistory";

    MockRepairExecutor executor = Executors.First();
    LogHelper.Log("Starting test {0} for executor {1}", property, executor.Name);

    // Inject error
    Node node = GetHealthyNode(_fabricClient);
    MockWatchdog watchdog = new MockWatchdog(_fabricClient.HealthManager, node.NodeName);
    watchdog.ReportError(property);

    // Reads the description of the node's repair-history health event.
    // Single() is intentional: exactly one such event is expected on the node.
    Func<string> readRepairHistory = () =>
    {
        var nodeHealth = _fabricClient.HealthManager.GetNodeHealthAsync(node.NodeName).Result;
        var historyEvent = nodeHealth.HealthEvents.Single(
            e => e.HealthInformation.Property.Equals(repairHistoryProperty, StringComparison.OrdinalIgnoreCase));
        return historyEvent.HealthInformation.Description;
    };

    string oldRepairHistory = readRepairHistory();

    // Wait for PE to schedule the task
    RepairTask createdTask = PollingForCreatedTask(executor, node);
    Assert.IsNotNull(createdTask, "There must be a repair task created.");

    // NOTE(review): compared below against a timestamp parsed from the history entry, which is
    // assumed to be local time in a format DateTime.Parse accepts - confirm against the service.
    DateTime scheduledAt = DateTime.Now;
    executor.Complete(createdTask);
    KillPrimaryReplica();

    LogHelper.Log("Waiting for the RepairHistory in health store to have the latest repair");

    // Make sure the RepairHistory has the entry within 5 minutes
    string newRepairDescription = null;
    TimeSpan sleepTime = TimeSpan.FromSeconds(10);
    TimeSpan timeout = TimeSpan.FromMinutes(5);
    while (timeout >= TimeSpan.Zero)
    {
        string currentHistory = readRepairHistory();

        // Static String.Equals tolerates a null description on either side.
        if (!String.Equals(currentHistory, oldRepairHistory, StringComparison.OrdinalIgnoreCase))
        {
            // Entries are ';'-separated. Dropping empty segments makes the last element the
            // newest entry whether or not the description ends with a trailing ';', and an
            // entry-less description yields null (reported by the assert below) instead of
            // an index-out-of-range exception.
            string[] entries = (currentHistory ?? String.Empty).Split(new[] { ';' }, StringSplitOptions.RemoveEmptyEntries);
            newRepairDescription = entries.LastOrDefault();
            break;
        }

        Thread.Sleep(sleepTime);
        timeout -= sleepTime;
    }

    Assert.IsFalse(String.IsNullOrEmpty(newRepairDescription), "There must be a new repair description");

    // An entry is read as "<repair type>=<timestamp>".
    string[] entryParts = newRepairDescription.Split('=');
    string repairType = entryParts.First();
    Assert.IsTrue(repairType.Equals(executor.Name, StringComparison.OrdinalIgnoreCase), "Repair type is the same as executor name");

    DateTime repairScheduledAt = DateTime.Parse(entryParts.Last());
    Assert.IsTrue(repairScheduledAt >= scheduledAt, "There should be a new entry in repair history");
}
// Verifies that repairs for a node that keeps reporting an error escalate through the
// enabled executors in the order they appear in Executors, ending at the last one.
//
// property: health property the mock watchdog reports on; also written to the start log line.
// reportOK: when true, after each completed repair the watchdog reports OK and then error
//           again (with _actionSchedulingInterval sleeps in between) to exercise escalation
//           in the flip-flopping case.
private void Onebox_Successful_CorrectRepairOrder(string property, bool reportOK)
{
    LogHelper.Log("Starting test {0}", property);

    // Inject error
    Node node = GetHealthyNode(_fabricClient);

    // UTC is used for all elapsed-time measurements in this method so that a local clock
    // shift (e.g. daylight saving) cannot distort them; the values never leave the method.
    DateTime lastHealthyAt = DateTime.UtcNow;
    MockWatchdog watchdog = new MockWatchdog(_fabricClient.HealthManager, node.NodeName);
    watchdog.ReportError(property);

    int currentIndex = -1;
    int executorCount = Executors.Count();
    MockRepairExecutor executor = null;

    // timeout is 2 times the longest policy plus a little extra time to account for
    // software delays due to onebox environments
    TimeSpan lastActionTime = Executors.Last().ActionTime;
    DateTime timeout = DateTime.UtcNow + lastActionTime + lastActionTime + TimeSpan.FromMinutes(3);

    while (DateTime.UtcNow < timeout)
    {
        // Wait for PE to schedule the task
        RepairTask createdTask = PollingForAnyCreatedTask(node);
        Assert.IsNotNull(createdTask, "There should be a new repair task");

        if (executor == null || executor.IsExecutable(createdTask) == false)
        {
            // Escalate to the next enabled executor. The bounds check comes before every
            // index access so that running past the end of the list fails the test with a
            // clear message instead of an index-out-of-range exception.
            ++currentIndex;
            while (currentIndex < executorCount && Executors[currentIndex].IsEnabled == false)
            {
                LogHelper.Log("Skipping disabled repair executor {0}", Executors[currentIndex].Name);
                ++currentIndex;
            }

            if (currentIndex >= executorCount)
            {
                Assert.Fail("No subsequent enabled executor");
            }
        }

        executor = Executors[currentIndex];
        Assert.IsTrue(executor.IsEnabled, "Current executor '{0}' should be enabled", executor.Name);
        Assert.IsTrue(executor.IsExecutable(createdTask), "Task should be executable by current repair executor {0}", executor.Name);
        Assert.IsTrue(DateTime.UtcNow - lastHealthyAt > executor.ActionTime, "Processing time should be bigger than executor's action time ({0} seconds)", executor.ActionTime.TotalSeconds);

        executor.Complete(createdTask);

        if (reportOK)
        {
            // Report OK in the middle to make sure that the escalation happens even in the
            // flip-flopping case where after a repair a machine stays healthy for a short
            // while and then reports error again.
            Thread.Sleep(_actionSchedulingInterval);
            watchdog.ReportOk(property);

            // wait for the next action to happen
            Thread.Sleep(_actionSchedulingInterval);

            // give additional time to deal with any delays
            Thread.Sleep(TimeSpan.FromMilliseconds(5));

            // Flop it back to error
            watchdog.ReportError(property);
            Thread.Sleep(TimeSpan.FromMilliseconds(5));
        }
    }

    // By going thru the loop we should have reached the last executor
    Assert.IsNotNull(executor, "At least one repair task should have been processed");
    Assert.IsTrue(executor.ActionTime == Executors.Last().ActionTime);
}