/// <summary>
/// Verifies that an exclude-hosts file configured after the ResourceManager has
/// started is honoured on the next node refresh: the decommissioned counter grows
/// by one, host1 keeps receiving Normal and host2 receives Shutdown.
/// </summary>
public virtual void TestAddNewExcludePathToConfiguration()
{
    Configuration conf = new Configuration();
    rm = new MockRM(conf);
    rm.Start();
    MockNM nm1 = rm.RegisterNode("host1:1234", 5120);
    MockNM nm2 = rm.RegisterNode("host2:5678", 10240);

    ClusterMetrics metrics = ClusterMetrics.GetMetrics();
    // A real assertion: Debug.Assert is compiled out of release builds.
    NUnit.Framework.Assert.IsNotNull(metrics);
    int initialMetricCount = metrics.GetNumDecommisionedNMs();

    // Both nodes heartbeat normally before any exclude file is configured.
    NodeHeartbeatResponse nodeHeartbeat = nm1.NodeHeartbeat(true);
    NUnit.Framework.Assert.AreEqual(NodeAction.Normal, nodeHeartbeat.GetNodeAction());
    nodeHeartbeat = nm2.NodeHeartbeat(true);
    NUnit.Framework.Assert.AreEqual(NodeAction.Normal, nodeHeartbeat.GetNodeAction());

    // Exclude host2, point the configuration at the file, and refresh.
    WriteToHostsFile("host2");
    conf.Set(YarnConfiguration.RmNodesExcludeFilePath, hostFile.GetAbsolutePath());
    rm.GetNodesListManager().RefreshNodes(conf);
    CheckDecommissionedNMCount(rm, initialMetricCount + 1);

    // NUnit takes (expected, actual, message); the message goes last.
    nodeHeartbeat = nm1.NodeHeartbeat(true);
    NUnit.Framework.Assert.AreEqual(
        NodeAction.Normal,
        nodeHeartbeat.GetNodeAction(),
        "Node should not have been decomissioned.");

    nodeHeartbeat = nm2.NodeHeartbeat(true);
    NUnit.Framework.Assert.AreEqual(
        NodeAction.Shutdown,
        nodeHeartbeat.GetNodeAction(),
        "Node should have been decomissioned but is in state " + nodeHeartbeat.GetNodeAction());
}
/// <summary>
/// Polls the root queue metrics and the cluster metrics until all of them match
/// the expected values, retrying up to five times with a one-second pause after
/// each failed attempt. Fails with the last assertion message if they never match.
/// </summary>
/// <param name="activeNodes">Expected ClusterMetrics.GetNumActiveNMs() value.</param>
/// <param name="appsSubmitted">Expected QueueMetrics.GetAppsSubmitted() value.</param>
/// <param name="appsPending">Expected QueueMetrics.GetAppsPending() value.</param>
/// <param name="containersPending">Expected QueueMetrics.GetPendingContainers() value.</param>
/// <param name="availableMB">Expected QueueMetrics.GetAvailableMB() value.</param>
/// <param name="activeApplications">Expected QueueMetrics.GetActiveApps() value.</param>
/// <exception cref="System.Exception"/>
private void VerifyClusterMetrics(int activeNodes, int appsSubmitted, int appsPending,
    int containersPending, int availableMB, int activeApplications)
{
    QueueMetrics metrics = rm.GetResourceScheduler().GetRootQueueMetrics();
    ClusterMetrics clusterMetrics = ClusterMetrics.GetMetrics();
    bool isAllMetricAssertionDone = false;
    string message = null;

    for (int attempt = 0; attempt < 5; attempt++)
    {
        try
        {
            // verify queue metrics
            AssertMetric("appsSubmitted", appsSubmitted, metrics.GetAppsSubmitted());
            AssertMetric("appsPending", appsPending, metrics.GetAppsPending());
            AssertMetric("containersPending", containersPending, metrics.GetPendingContainers());
            AssertMetric("availableMB", availableMB, metrics.GetAvailableMB());
            AssertMetric("activeApplications", activeApplications, metrics.GetActiveApps());

            // verify node metric
            AssertMetric("activeNodes", activeNodes, clusterMetrics.GetNumActiveNMs());

            isAllMetricAssertionDone = true;
            break;
        }
        catch (Exception e)
        {
            // Remember the most recent failure so it can be reported if every attempt fails.
            message = e.Message;
            System.Console.Out.WriteLine("Waiting for metrics assertion to complete");
            Sharpen.Thread.Sleep(1000);
        }
    }

    // NUnit takes (condition, message); the message goes last.
    NUnit.Framework.Assert.IsTrue(isAllMetricAssertionDone, message);
}
/// <summary>
/// Verifies that handling an RMNodeReconnectEvent on a running node leaves all
/// ClusterMetrics node counters unchanged, keeps the node in the Running state,
/// and leaves a NodeUsable event in nodesListManagerEvent.
/// </summary>
public virtual void TestReconnect()
{
    RMNodeImpl node = GetRunningNode();
    ClusterMetrics cm = ClusterMetrics.GetMetrics();

    // Snapshot every counter before the event is delivered.
    int initialActive = cm.GetNumActiveNMs();
    int initialLost = cm.GetNumLostNMs();
    int initialUnhealthy = cm.GetUnhealthyNMs();
    int initialDecommissioned = cm.GetNumDecommisionedNMs();
    int initialRebooted = cm.GetNumRebootedNMs();

    node.Handle(new RMNodeReconnectEvent(node.GetNodeID(), node, null, null));

    // NUnit takes (expected, actual, message); the label goes last.
    NUnit.Framework.Assert.AreEqual(initialActive, cm.GetNumActiveNMs(), "Active Nodes");
    NUnit.Framework.Assert.AreEqual(initialLost, cm.GetNumLostNMs(), "Lost Nodes");
    NUnit.Framework.Assert.AreEqual(initialUnhealthy, cm.GetUnhealthyNMs(), "Unhealthy Nodes");
    NUnit.Framework.Assert.AreEqual(
        initialDecommissioned, cm.GetNumDecommisionedNMs(), "Decommissioned Nodes");
    NUnit.Framework.Assert.AreEqual(initialRebooted, cm.GetNumRebootedNMs(), "Rebooted Nodes");
    NUnit.Framework.Assert.AreEqual(NodeState.Running, node.GetState());

    NUnit.Framework.Assert.IsNotNull(nodesListManagerEvent);
    // NOTE(review): GetType() must resolve to the event's own type accessor rather
    // than System.Object.GetType() for this comparison to be meaningful - confirm.
    NUnit.Framework.Assert.AreEqual(
        NodesListManagerEventType.NodeUsable, nodesListManagerEvent.GetType());
}
/// <summary>
/// Resets the singleton state: clears the initialized flag and drops the cached
/// instance while holding the lock on typeof(ClusterMetrics).
/// </summary>
internal static void Destroy()
{
    lock (typeof(ClusterMetrics))
    {
        // The flag is cleared before the instance reference is dropped.
        isInitialized.Set(false);
        Instance = null;
    }
}
/// <summary>
/// Resets the ClusterMetrics singleton, then shuts the default metrics system
/// down if it still has a source named "ClusterMetrics".
/// </summary>
public virtual void TearDown()
{
    ClusterMetrics.Destroy();

    MetricsSystem metricsSystem = DefaultMetricsSystem.Instance();
    bool clusterSourcePresent = metricsSystem.GetSource("ClusterMetrics") != null;
    if (clusterSourcePresent)
    {
        DefaultMetricsSystem.Shutdown();
    }
}
/// <summary>
/// Verifies decommissioning through the exclude-hosts file: after excluding
/// "host2" and the normalized address of "localhost" the decommissioned counter
/// grows by two and both nodes receive Shutdown; after clearing the file and
/// re-registering localhost the counter is expected to be baseline + 1.
/// </summary>
public virtual void TestDecommissionWithExcludeHosts()
{
    Configuration conf = new Configuration();
    conf.Set(YarnConfiguration.RmNodesExcludeFilePath, hostFile.GetAbsolutePath());
    WriteToHostsFile(string.Empty);

    DrainDispatcher dispatcher = new DrainDispatcher();
    rm = new _MockRM_162(dispatcher, conf);
    rm.Start();
    MockNM nm1 = rm.RegisterNode("host1:1234", 5120);
    MockNM nm2 = rm.RegisterNode("host2:5678", 10240);
    MockNM nm3 = rm.RegisterNode("localhost:4433", 1024);
    dispatcher.Await();

    int metricCount = ClusterMetrics.GetMetrics().GetNumDecommisionedNMs();

    NodeHeartbeatResponse nodeHeartbeat = nm1.NodeHeartbeat(true);
    NUnit.Framework.Assert.AreEqual(NodeAction.Normal, nodeHeartbeat.GetNodeAction());
    nodeHeartbeat = nm2.NodeHeartbeat(true);
    NUnit.Framework.Assert.AreEqual(NodeAction.Normal, nodeHeartbeat.GetNodeAction());
    dispatcher.Await();

    // To test that IPs also work
    string ip = NetUtils.NormalizeHostName("localhost");
    WriteToHostsFile("host2", ip);
    rm.GetNodesListManager().RefreshNodes(conf);
    CheckDecommissionedNMCount(rm, metricCount + 2);

    nodeHeartbeat = nm1.NodeHeartbeat(true);
    NUnit.Framework.Assert.AreEqual(NodeAction.Normal, nodeHeartbeat.GetNodeAction());

    // NUnit takes (expected, actual, message); the message goes last.
    nodeHeartbeat = nm2.NodeHeartbeat(true);
    NUnit.Framework.Assert.AreEqual(
        NodeAction.Shutdown,
        nodeHeartbeat.GetNodeAction(),
        "The decommisioned metrics are not updated");
    nodeHeartbeat = nm3.NodeHeartbeat(true);
    NUnit.Framework.Assert.AreEqual(
        NodeAction.Shutdown,
        nodeHeartbeat.GetNodeAction(),
        "The decommisioned metrics are not updated");
    dispatcher.Await();

    // Clear the exclude file and let localhost register again.
    WriteToHostsFile(string.Empty);
    rm.GetNodesListManager().RefreshNodes(conf);
    nm3 = rm.RegisterNode("localhost:4433", 1024);
    dispatcher.Await();
    nodeHeartbeat = nm3.NodeHeartbeat(true);
    dispatcher.Await();
    NUnit.Framework.Assert.AreEqual(NodeAction.Normal, nodeHeartbeat.GetNodeAction());

    // decommissioned node is 1 since 1 node is rejoined after updating exclude
    // file
    CheckDecommissionedNMCount(rm, metricCount + 1);
}
/// <summary>
/// Waits, in up to twenty 100 ms steps, for ClusterMetrics.GetNumRebootedNMs()
/// to reach <paramref name="count"/>, then asserts that it did.
/// </summary>
/// <param name="rm2">Not read by this method; kept for signature compatibility.</param>
/// <param name="count">Expected number of rebooted node managers.</param>
/// <exception cref="System.Exception"/>
private void CheckRebootedNMCount(MockRM rm2, int count)
{
    int waitCount = 0;
    while (ClusterMetrics.GetMetrics().GetNumRebootedNMs() != count && waitCount++ < 20)
    {
        lock (this)
        {
            Sharpen.Runtime.Wait(this, 100);
        }
    }

    // NUnit takes (expected, actual, message); the message goes last.
    NUnit.Framework.Assert.AreEqual(
        count,
        ClusterMetrics.GetMetrics().GetNumRebootedNMs(),
        "The rebooted metrics are not updated");
}
/// <summary>
/// Returns the shared ClusterMetrics instance. When the initialized flag is not
/// set, the instance is created, registered and the flag raised while holding
/// the lock on typeof(ClusterMetrics), unless an instance already exists.
/// </summary>
/// <returns>The current value of the Instance field.</returns>
public static ClusterMetrics GetMetrics()
{
    // Already initialized: no locking needed.
    if (isInitialized.Get())
    {
        return Instance;
    }

    lock (typeof(ClusterMetrics))
    {
        // Re-check under the lock before creating the instance.
        if (Instance == null)
        {
            Instance = new ClusterMetrics();
            RegisterMetrics();
            isInitialized.Set(true);
        }
    }

    return Instance;
}
/// <summary>
/// Verifies decommissioning through the include-hosts file: once the file lists
/// only "host1" and the normalized address of "localhost", the decommissioned
/// counter grows by one, host2 receives Shutdown, and host1 and localhost keep
/// receiving Normal.
/// </summary>
public virtual void TestDecommissionWithIncludeHosts()
{
    WriteToHostsFile("localhost", "host1", "host2");
    Configuration conf = new Configuration();
    conf.Set(YarnConfiguration.RmNodesIncludeFilePath, hostFile.GetAbsolutePath());
    rm = new MockRM(conf);
    rm.Start();
    MockNM nm1 = rm.RegisterNode("host1:1234", 5120);
    MockNM nm2 = rm.RegisterNode("host2:5678", 10240);
    MockNM nm3 = rm.RegisterNode("localhost:4433", 1024);

    ClusterMetrics metrics = ClusterMetrics.GetMetrics();
    // A real assertion: Debug.Assert is compiled out of release builds.
    NUnit.Framework.Assert.IsNotNull(metrics);
    int metricCount = metrics.GetNumDecommisionedNMs();

    // All three nodes are included at first and heartbeat normally.
    NodeHeartbeatResponse nodeHeartbeat = nm1.NodeHeartbeat(true);
    NUnit.Framework.Assert.AreEqual(NodeAction.Normal, nodeHeartbeat.GetNodeAction());
    nodeHeartbeat = nm2.NodeHeartbeat(true);
    NUnit.Framework.Assert.AreEqual(NodeAction.Normal, nodeHeartbeat.GetNodeAction());
    nodeHeartbeat = nm3.NodeHeartbeat(true);
    NUnit.Framework.Assert.AreEqual(NodeAction.Normal, nodeHeartbeat.GetNodeAction());

    // To test that IPs also work
    string ip = NetUtils.NormalizeHostName("localhost");
    WriteToHostsFile("host1", ip);
    rm.GetNodesListManager().RefreshNodes(conf);
    CheckDecommissionedNMCount(rm, ++metricCount);

    nodeHeartbeat = nm1.NodeHeartbeat(true);
    NUnit.Framework.Assert.AreEqual(NodeAction.Normal, nodeHeartbeat.GetNodeAction());
    // Compare against the tracked baseline rather than a literal 1, so the check
    // stays correct when the counter did not start at zero.
    NUnit.Framework.Assert.AreEqual(
        metricCount, ClusterMetrics.GetMetrics().GetNumDecommisionedNMs());

    // NUnit takes (expected, actual, message); the message goes last.
    nodeHeartbeat = nm2.NodeHeartbeat(true);
    NUnit.Framework.Assert.AreEqual(
        NodeAction.Shutdown, nodeHeartbeat.GetNodeAction(), "Node is not decommisioned.");

    nodeHeartbeat = nm3.NodeHeartbeat(true);
    NUnit.Framework.Assert.AreEqual(NodeAction.Normal, nodeHeartbeat.GetNodeAction());
    NUnit.Framework.Assert.AreEqual(
        metricCount, ClusterMetrics.GetMetrics().GetNumDecommisionedNMs());
}
/// <summary>
/// Cleans up after a test: deletes the hosts file if it exists, resets the
/// ClusterMetrics singleton, stops the ResourceManager if one was created, and
/// shuts the default metrics system down if it still has a source named
/// "ClusterMetrics".
/// </summary>
public virtual void TearDown()
{
    bool hostFilePresent = hostFile != null && hostFile.Exists();
    if (hostFilePresent)
    {
        hostFile.Delete();
    }

    ClusterMetrics.Destroy();

    if (rm != null)
    {
        rm.Stop();
    }

    MetricsSystem metricsSystem = DefaultMetricsSystem.Instance();
    if (metricsSystem.GetSource("ClusterMetrics") != null)
    {
        DefaultMetricsSystem.Shutdown();
    }
}
/// <summary>
/// Verifies that the unhealthy-node metric follows a node's heartbeat health:
/// zero after registration, one after an unhealthy heartbeat, and zero again
/// after the node reports healthy.
/// </summary>
public virtual void TestUnhealthyNodeStatus()
{
    Configuration configuration = new Configuration();
    configuration.Set(YarnConfiguration.RmNodesExcludeFilePath, hostFile.GetAbsolutePath());
    rm = new MockRM(configuration);
    rm.Start();

    MockNM node = rm.RegisterNode("host1:1234", 5120);
    int unhealthyAfterRegistration = ClusterMetrics.GetMetrics().GetUnhealthyNMs();
    NUnit.Framework.Assert.AreEqual(0, unhealthyAfterRegistration);

    // node healthy
    node.NodeHeartbeat(true);

    // node unhealthy
    node.NodeHeartbeat(false);
    CheckUnealthyNMCount(rm, node, true, 1);

    // node healthy again
    node.NodeHeartbeat(true);
    CheckUnealthyNMCount(rm, node, false, 0);
}
/// <summary>
/// Builds the HA test configuration and resets global metrics state: enables RM
/// HA with Rm1NodeId and Rm2NodeId as the HA ids, assigns a per-RM address to
/// every service-address key, enables the webapp and YARN ACLs, then clears
/// ClusterMetrics, QueueMetrics and the default metrics system.
/// </summary>
public virtual void SetUp()
{
    configuration = new Configuration();
    UserGroupInformation.SetConfiguration(configuration);

    configuration.SetBoolean(YarnConfiguration.RmHaEnabled, true);
    string haIds = Rm1NodeId + "," + Rm2NodeId;
    configuration.Set(YarnConfiguration.RmHaIds, haIds);

    // Addresses are also set for Rm3NodeId, although it is not listed in the HA ids.
    foreach (string addressKey in YarnConfiguration.GetServiceAddressConfKeys(configuration))
    {
        configuration.Set(HAUtil.AddSuffix(addressKey, Rm1NodeId), Rm1Address);
        configuration.Set(HAUtil.AddSuffix(addressKey, Rm2NodeId), Rm2Address);
        configuration.Set(HAUtil.AddSuffix(addressKey, Rm3NodeId), Rm3Address);
    }

    // Enable webapp to test web-services also
    configuration.SetBoolean(MockRM.EnableWebapp, true);
    configuration.SetBoolean(YarnConfiguration.YarnAclEnable, true);

    ClusterMetrics.Destroy();
    QueueMetrics.ClearQueueMetrics();
    DefaultMetricsSystem.Shutdown();
}
/// <summary>
/// Verifies that a heartbeat carrying response id -100 is answered with a Resync
/// action and the expected diagnostics message, and that the rebooted-node
/// counter grows by one.
/// </summary>
public virtual void TestReboot()
{
    Configuration conf = new Configuration();
    rm = new MockRM(conf);
    rm.Start();
    MockNM nm1 = rm.RegisterNode("host1:1234", 5120);
    MockNM nm2 = rm.RegisterNode("host2:1234", 2048);

    int initialMetricCount = ClusterMetrics.GetMetrics().GetNumRebootedNMs();

    // AreEqual reports the actual action on failure, unlike IsTrue(a.Equals(b)).
    NodeHeartbeatResponse nodeHeartbeat = nm1.NodeHeartbeat(true);
    NUnit.Framework.Assert.AreEqual(NodeAction.Normal, nodeHeartbeat.GetNodeAction());

    // Heartbeat with an out-of-range response id.
    nodeHeartbeat = nm2.NodeHeartbeat(
        new Dictionary<ApplicationId, IList<ContainerStatus>>(), true, -100);
    NUnit.Framework.Assert.AreEqual(NodeAction.Resync, nodeHeartbeat.GetNodeAction());
    NUnit.Framework.Assert.AreEqual(
        "Too far behind rm response id:0 nm response id:-100",
        nodeHeartbeat.GetDiagnosticsMessage());

    CheckRebootedNMCount(rm, initialMetricCount + 1);
}
/// <summary>
/// Verifies that a Rebooting event delivered to an unhealthy node moves it to the
/// Rebooted state, decrements the unhealthy counter, increments the rebooted
/// counter, and leaves the active, lost and decommissioned counters unchanged.
/// </summary>
public virtual void TestUnhealthyRebooting()
{
    RMNodeImpl node = GetUnhealthyNode();
    ClusterMetrics cm = ClusterMetrics.GetMetrics();

    // Snapshot every counter before the event is delivered.
    int initialActive = cm.GetNumActiveNMs();
    int initialLost = cm.GetNumLostNMs();
    int initialUnhealthy = cm.GetUnhealthyNMs();
    int initialDecommissioned = cm.GetNumDecommisionedNMs();
    int initialRebooted = cm.GetNumRebootedNMs();

    node.Handle(new RMNodeEvent(node.GetNodeID(), RMNodeEventType.Rebooting));

    // NUnit takes (expected, actual, message); the label goes last.
    NUnit.Framework.Assert.AreEqual(initialActive, cm.GetNumActiveNMs(), "Active Nodes");
    NUnit.Framework.Assert.AreEqual(initialLost, cm.GetNumLostNMs(), "Lost Nodes");
    NUnit.Framework.Assert.AreEqual(
        initialUnhealthy - 1, cm.GetUnhealthyNMs(), "Unhealthy Nodes");
    NUnit.Framework.Assert.AreEqual(
        initialDecommissioned, cm.GetNumDecommisionedNMs(), "Decommissioned Nodes");
    NUnit.Framework.Assert.AreEqual(
        initialRebooted + 1, cm.GetNumRebootedNMs(), "Rebooted Nodes");
    NUnit.Framework.Assert.AreEqual(NodeState.Rebooted, node.GetState());
}
/// <summary>
/// Exercises node re-registration: reconnect of a healthy node, of an unhealthy
/// node, of an unhealthy node that turns healthy, of a node with changed
/// capability (with and without running applications), and of a node whose HTTP
/// port changed. Checks the active-node count, the unhealthy count and the root
/// queue's available memory along the way.
/// </summary>
public virtual void TestReconnectNode()
{
    DrainDispatcher dispatcher = new DrainDispatcher();
    rm = new _MockRM_567(this, dispatcher);
    rm.Start();

    MockNM nm1 = rm.RegisterNode("host1:1234", 5120);
    MockNM nm2 = rm.RegisterNode("host2:5678", 5120);
    nm1.NodeHeartbeat(true);
    nm2.NodeHeartbeat(false);
    dispatcher.Await();
    CheckUnealthyNMCount(rm, nm2, true, 1);

    int expectedNMs = ClusterMetrics.GetMetrics().GetNumActiveNMs();
    QueueMetrics metrics = rm.GetResourceScheduler().GetRootQueueMetrics();
    // TODO Metrics incorrect in case of the FifoScheduler
    NUnit.Framework.Assert.AreEqual(5120, metrics.GetAvailableMB());

    // reconnect of healthy node
    nm1 = rm.RegisterNode("host1:1234", 5120);
    NodeHeartbeatResponse response = nm1.NodeHeartbeat(true);
    NUnit.Framework.Assert.AreEqual(NodeAction.Normal, response.GetNodeAction());
    dispatcher.Await();
    NUnit.Framework.Assert.AreEqual(
        expectedNMs, ClusterMetrics.GetMetrics().GetNumActiveNMs());
    CheckUnealthyNMCount(rm, nm2, true, 1);

    // reconnect of unhealthy node
    nm2 = rm.RegisterNode("host2:5678", 5120);
    response = nm2.NodeHeartbeat(false);
    NUnit.Framework.Assert.AreEqual(NodeAction.Normal, response.GetNodeAction());
    dispatcher.Await();
    NUnit.Framework.Assert.AreEqual(
        expectedNMs, ClusterMetrics.GetMetrics().GetNumActiveNMs());
    CheckUnealthyNMCount(rm, nm2, true, 1);

    // unhealthy node changed back to healthy
    nm2 = rm.RegisterNode("host2:5678", 5120);
    dispatcher.Await();
    // Two heartbeats are sent; their responses are not inspected.
    nm2.NodeHeartbeat(true);
    nm2.NodeHeartbeat(true);
    dispatcher.Await();
    NUnit.Framework.Assert.AreEqual(5120 + 5120, metrics.GetAvailableMB());

    // reconnect of node with changed capability
    nm1 = rm.RegisterNode("host2:5678", 10240);
    dispatcher.Await();
    response = nm1.NodeHeartbeat(true);
    dispatcher.Await();
    NUnit.Framework.Assert.AreEqual(NodeAction.Normal, response.GetNodeAction());
    NUnit.Framework.Assert.AreEqual(5120 + 10240, metrics.GetAvailableMB());

    // reconnect of node with changed capability and running applications
    IList<ApplicationId> runningApps = new AList<ApplicationId>();
    runningApps.AddItem(ApplicationId.NewInstance(1, 0));
    nm1 = rm.RegisterNode("host2:5678", 15360, 2, runningApps);
    dispatcher.Await();
    response = nm1.NodeHeartbeat(true);
    dispatcher.Await();
    NUnit.Framework.Assert.AreEqual(NodeAction.Normal, response.GetNodeAction());
    NUnit.Framework.Assert.AreEqual(5120 + 15360, metrics.GetAvailableMB());

    // reconnect healthy node changing http port
    nm1 = new MockNM("host1:1234", 5120, rm.GetResourceTrackerService());
    nm1.SetHttpPort(3);
    nm1.RegisterNode();
    dispatcher.Await();
    // Two heartbeats are sent; their responses are not inspected.
    nm1.NodeHeartbeat(true);
    nm1.NodeHeartbeat(true);
    dispatcher.Await();

    RMNode rmNode = rm.GetRMContext().GetRMNodes()[nm1.GetNodeId()];
    NUnit.Framework.Assert.AreEqual(3, rmNode.GetHttpPort());
    NUnit.Framework.Assert.AreEqual(5120, rmNode.GetTotalCapability().GetMemory());
    NUnit.Framework.Assert.AreEqual(5120 + 15360, metrics.GetAvailableMB());
}
/// <summary>
/// Waits, in up to twenty 100 ms steps, for the given node to reach the expected
/// health state, then asserts that state and the cluster-wide unhealthy count.
/// </summary>
/// <param name="rm">ResourceManager whose RM context holds the node.</param>
/// <param name="nm1">Node manager whose RMNode state is inspected.</param>
/// <param name="health">
/// When true the node is expected to be in NodeState.Unhealthy; when false it is
/// expected to be in any other state.
/// </param>
/// <param name="count">Expected ClusterMetrics.GetUnhealthyNMs() value.</param>
/// <exception cref="System.Exception"/>
private void CheckUnealthyNMCount(MockRM rm, MockNM nm1, bool health, int count)
{
    int waitCount = 0;
    // Keep waiting while "node is not Unhealthy" equals the flag, i.e. while the
    // node has not yet reached the expected state.
    while ((rm.GetRMContext().GetRMNodes()[nm1.GetNodeId()].GetState() != NodeState.Unhealthy) == health
        && waitCount++ < 20)
    {
        lock (this)
        {
            Sharpen.Runtime.Wait(this, 100);
        }
    }

    bool stillWaiting =
        (rm.GetRMContext().GetRMNodes()[nm1.GetNodeId()].GetState() != NodeState.Unhealthy) == health;
    NUnit.Framework.Assert.IsFalse(stillWaiting);

    // NUnit takes (expected, actual, message); the message goes last.
    NUnit.Framework.Assert.AreEqual(
        count,
        ClusterMetrics.GetMetrics().GetUnhealthyNMs(),
        "Unhealthy metrics not incremented");
}
/// <summary>
/// Resets global metrics state after a test: destroys the ClusterMetrics
/// singleton, clears the queue metrics, and shuts the default metrics system down.
/// </summary>
public virtual void TearDown()
{
    ClusterMetrics.Destroy();
    QueueMetrics.ClearQueueMetrics();
    DefaultMetricsSystem.Shutdown();
}
/// <summary>
/// Initializes the default metrics system under the name "ResourceManager" and
/// stores the ClusterMetrics singleton in the metrics field.
/// </summary>
public virtual void Setup()
{
    // The metrics system is initialized before the singleton is obtained.
    DefaultMetricsSystem.Initialize("ResourceManager");
    metrics = ClusterMetrics.GetMetrics();
}
/// <summary>
/// Sets the decommissioned-node metric to the number of hosts currently returned
/// by hostsReader.GetExcludedHosts().
/// </summary>
private void SetDecomissionedNMsMetrics()
{
    ICollection<string> excludedHosts = hostsReader.GetExcludedHosts();
    ClusterMetrics clusterMetrics = ClusterMetrics.GetMetrics();
    clusterMetrics.SetDecommisionedNMs(excludedHosts.Count);
}