Exemplo n.º 1
0
        public virtual void TestAddNewExcludePathToConfiguration()
        {
            Configuration conf = new Configuration();

            rm = new MockRM(conf);
            rm.Start();
            MockNM         nm1     = rm.RegisterNode("host1:1234", 5120);
            MockNM         nm2     = rm.RegisterNode("host2:5678", 10240);
            ClusterMetrics metrics = ClusterMetrics.GetMetrics();

            System.Diagnostics.Debug.Assert((metrics != null));
            int initialMetricCount = metrics.GetNumDecommisionedNMs();
            NodeHeartbeatResponse nodeHeartbeat = nm1.NodeHeartbeat(true);

            NUnit.Framework.Assert.AreEqual(NodeAction.Normal, nodeHeartbeat.GetNodeAction());
            nodeHeartbeat = nm2.NodeHeartbeat(true);
            NUnit.Framework.Assert.AreEqual(NodeAction.Normal, nodeHeartbeat.GetNodeAction());
            WriteToHostsFile("host2");
            conf.Set(YarnConfiguration.RmNodesExcludeFilePath, hostFile.GetAbsolutePath());
            rm.GetNodesListManager().RefreshNodes(conf);
            CheckDecommissionedNMCount(rm, ++initialMetricCount);
            nodeHeartbeat = nm1.NodeHeartbeat(true);
            NUnit.Framework.Assert.AreEqual("Node should not have been decomissioned.", NodeAction
                                            .Normal, nodeHeartbeat.GetNodeAction());
            nodeHeartbeat = nm2.NodeHeartbeat(true);
            NUnit.Framework.Assert.AreEqual("Node should have been decomissioned but is in state"
                                            + nodeHeartbeat.GetNodeAction(), NodeAction.Shutdown, nodeHeartbeat.GetNodeAction
                                                ());
        }
Exemplo n.º 2
0
        /// <exception cref="System.Exception"/>
        private void VerifyClusterMetrics(int activeNodes, int appsSubmitted, int appsPending
                                          , int containersPending, int availableMB, int activeApplications)
        {
            int            timeoutSecs              = 0;
            QueueMetrics   metrics                  = rm.GetResourceScheduler().GetRootQueueMetrics();
            ClusterMetrics clusterMetrics           = ClusterMetrics.GetMetrics();
            bool           isAllMetricAssertionDone = false;
            string         message                  = null;

            while (timeoutSecs++ < 5)
            {
                try
                {
                    // verify queue metrics
                    AssertMetric("appsSubmitted", appsSubmitted, metrics.GetAppsSubmitted());
                    AssertMetric("appsPending", appsPending, metrics.GetAppsPending());
                    AssertMetric("containersPending", containersPending, metrics.GetPendingContainers
                                     ());
                    AssertMetric("availableMB", availableMB, metrics.GetAvailableMB());
                    AssertMetric("activeApplications", activeApplications, metrics.GetActiveApps());
                    // verify node metric
                    AssertMetric("activeNodes", activeNodes, clusterMetrics.GetNumActiveNMs());
                    isAllMetricAssertionDone = true;
                    break;
                }
                catch (Exception e)
                {
                    message = e.Message;
                    System.Console.Out.WriteLine("Waiting for metrics assertion to complete");
                    Sharpen.Thread.Sleep(1000);
                }
            }
            NUnit.Framework.Assert.IsTrue(message, isAllMetricAssertionDone);
        }
Exemplo n.º 3
0
        public virtual void TestReconnect()
        {
            RMNodeImpl     node                  = GetRunningNode();
            ClusterMetrics cm                    = ClusterMetrics.GetMetrics();
            int            initialActive         = cm.GetNumActiveNMs();
            int            initialLost           = cm.GetNumLostNMs();
            int            initialUnhealthy      = cm.GetUnhealthyNMs();
            int            initialDecommissioned = cm.GetNumDecommisionedNMs();
            int            initialRebooted       = cm.GetNumRebootedNMs();

            node.Handle(new RMNodeReconnectEvent(node.GetNodeID(), node, null, null));
            NUnit.Framework.Assert.AreEqual("Active Nodes", initialActive, cm.GetNumActiveNMs
                                                ());
            NUnit.Framework.Assert.AreEqual("Lost Nodes", initialLost, cm.GetNumLostNMs());
            NUnit.Framework.Assert.AreEqual("Unhealthy Nodes", initialUnhealthy, cm.GetUnhealthyNMs
                                                ());
            NUnit.Framework.Assert.AreEqual("Decommissioned Nodes", initialDecommissioned, cm
                                            .GetNumDecommisionedNMs());
            NUnit.Framework.Assert.AreEqual("Rebooted Nodes", initialRebooted, cm.GetNumRebootedNMs
                                                ());
            NUnit.Framework.Assert.AreEqual(NodeState.Running, node.GetState());
            NUnit.Framework.Assert.IsNotNull(nodesListManagerEvent);
            NUnit.Framework.Assert.AreEqual(NodesListManagerEventType.NodeUsable, nodesListManagerEvent
                                            .GetType());
        }
Exemplo n.º 4
0
 internal static void Destroy()
 {
     lock (typeof(ClusterMetrics))
     {
         isInitialized.Set(false);
         Instance = null;
     }
 }
Exemplo n.º 5
0
        public virtual void TearDown()
        {
            ClusterMetrics.Destroy();
            MetricsSystem ms = DefaultMetricsSystem.Instance();

            if (ms.GetSource("ClusterMetrics") != null)
            {
                DefaultMetricsSystem.Shutdown();
            }
        }
Exemplo n.º 6
0
        public virtual void TestDecommissionWithExcludeHosts()
        {
            Configuration conf = new Configuration();

            conf.Set(YarnConfiguration.RmNodesExcludeFilePath, hostFile.GetAbsolutePath());
            WriteToHostsFile(string.Empty);
            DrainDispatcher dispatcher = new DrainDispatcher();

            rm = new _MockRM_162(dispatcher, conf);
            rm.Start();
            MockNM nm1 = rm.RegisterNode("host1:1234", 5120);
            MockNM nm2 = rm.RegisterNode("host2:5678", 10240);
            MockNM nm3 = rm.RegisterNode("localhost:4433", 1024);

            dispatcher.Await();
            int metricCount = ClusterMetrics.GetMetrics().GetNumDecommisionedNMs();
            NodeHeartbeatResponse nodeHeartbeat = nm1.NodeHeartbeat(true);

            NUnit.Framework.Assert.IsTrue(NodeAction.Normal.Equals(nodeHeartbeat.GetNodeAction
                                                                       ()));
            nodeHeartbeat = nm2.NodeHeartbeat(true);
            NUnit.Framework.Assert.IsTrue(NodeAction.Normal.Equals(nodeHeartbeat.GetNodeAction
                                                                       ()));
            dispatcher.Await();
            // To test that IPs also work
            string ip = NetUtils.NormalizeHostName("localhost");

            WriteToHostsFile("host2", ip);
            rm.GetNodesListManager().RefreshNodes(conf);
            CheckDecommissionedNMCount(rm, metricCount + 2);
            nodeHeartbeat = nm1.NodeHeartbeat(true);
            NUnit.Framework.Assert.IsTrue(NodeAction.Normal.Equals(nodeHeartbeat.GetNodeAction
                                                                       ()));
            nodeHeartbeat = nm2.NodeHeartbeat(true);
            NUnit.Framework.Assert.IsTrue("The decommisioned metrics are not updated", NodeAction
                                          .Shutdown.Equals(nodeHeartbeat.GetNodeAction()));
            nodeHeartbeat = nm3.NodeHeartbeat(true);
            NUnit.Framework.Assert.IsTrue("The decommisioned metrics are not updated", NodeAction
                                          .Shutdown.Equals(nodeHeartbeat.GetNodeAction()));
            dispatcher.Await();
            WriteToHostsFile(string.Empty);
            rm.GetNodesListManager().RefreshNodes(conf);
            nm3 = rm.RegisterNode("localhost:4433", 1024);
            dispatcher.Await();
            nodeHeartbeat = nm3.NodeHeartbeat(true);
            dispatcher.Await();
            NUnit.Framework.Assert.IsTrue(NodeAction.Normal.Equals(nodeHeartbeat.GetNodeAction
                                                                       ()));
            // decommissined node is 1 since 1 node is rejoined after updating exclude
            // file
            CheckDecommissionedNMCount(rm, metricCount + 1);
        }
Exemplo n.º 7
0
        /// <exception cref="System.Exception"/>
        private void CheckRebootedNMCount(MockRM rm2, int count)
        {
            int waitCount = 0;

            while (ClusterMetrics.GetMetrics().GetNumRebootedNMs() != count && waitCount++ <
                   20)
            {
                lock (this)
                {
                    Sharpen.Runtime.Wait(this, 100);
                }
            }
            NUnit.Framework.Assert.AreEqual("The rebooted metrics are not updated", count, ClusterMetrics
                                            .GetMetrics().GetNumRebootedNMs());
        }
Exemplo n.º 8
0
 public static ClusterMetrics GetMetrics()
 {
     if (!isInitialized.Get())
     {
         lock (typeof(ClusterMetrics))
         {
             if (Instance == null)
             {
                 Instance = new ClusterMetrics();
                 RegisterMetrics();
                 isInitialized.Set(true);
             }
         }
     }
     return(Instance);
 }
Exemplo n.º 9
0
        public virtual void TestDecommissionWithIncludeHosts()
        {
            WriteToHostsFile("localhost", "host1", "host2");
            Configuration conf = new Configuration();

            conf.Set(YarnConfiguration.RmNodesIncludeFilePath, hostFile.GetAbsolutePath());
            rm = new MockRM(conf);
            rm.Start();
            MockNM         nm1     = rm.RegisterNode("host1:1234", 5120);
            MockNM         nm2     = rm.RegisterNode("host2:5678", 10240);
            MockNM         nm3     = rm.RegisterNode("localhost:4433", 1024);
            ClusterMetrics metrics = ClusterMetrics.GetMetrics();

            System.Diagnostics.Debug.Assert((metrics != null));
            int metricCount = metrics.GetNumDecommisionedNMs();
            NodeHeartbeatResponse nodeHeartbeat = nm1.NodeHeartbeat(true);

            NUnit.Framework.Assert.IsTrue(NodeAction.Normal.Equals(nodeHeartbeat.GetNodeAction
                                                                       ()));
            nodeHeartbeat = nm2.NodeHeartbeat(true);
            NUnit.Framework.Assert.IsTrue(NodeAction.Normal.Equals(nodeHeartbeat.GetNodeAction
                                                                       ()));
            nodeHeartbeat = nm3.NodeHeartbeat(true);
            NUnit.Framework.Assert.IsTrue(NodeAction.Normal.Equals(nodeHeartbeat.GetNodeAction
                                                                       ()));
            // To test that IPs also work
            string ip = NetUtils.NormalizeHostName("localhost");

            WriteToHostsFile("host1", ip);
            rm.GetNodesListManager().RefreshNodes(conf);
            CheckDecommissionedNMCount(rm, ++metricCount);
            nodeHeartbeat = nm1.NodeHeartbeat(true);
            NUnit.Framework.Assert.IsTrue(NodeAction.Normal.Equals(nodeHeartbeat.GetNodeAction
                                                                       ()));
            NUnit.Framework.Assert.AreEqual(1, ClusterMetrics.GetMetrics().GetNumDecommisionedNMs
                                                ());
            nodeHeartbeat = nm2.NodeHeartbeat(true);
            NUnit.Framework.Assert.IsTrue("Node is not decommisioned.", NodeAction.Shutdown.Equals
                                              (nodeHeartbeat.GetNodeAction()));
            nodeHeartbeat = nm3.NodeHeartbeat(true);
            NUnit.Framework.Assert.IsTrue(NodeAction.Normal.Equals(nodeHeartbeat.GetNodeAction
                                                                       ()));
            NUnit.Framework.Assert.AreEqual(metricCount, ClusterMetrics.GetMetrics().GetNumDecommisionedNMs
                                                ());
        }
Exemplo n.º 10
0
        public virtual void TearDown()
        {
            if (hostFile != null && hostFile.Exists())
            {
                hostFile.Delete();
            }
            ClusterMetrics.Destroy();
            if (rm != null)
            {
                rm.Stop();
            }
            MetricsSystem ms = DefaultMetricsSystem.Instance();

            if (ms.GetSource("ClusterMetrics") != null)
            {
                DefaultMetricsSystem.Shutdown();
            }
        }
Exemplo n.º 11
0
        public virtual void TestUnhealthyNodeStatus()
        {
            Configuration conf = new Configuration();

            conf.Set(YarnConfiguration.RmNodesExcludeFilePath, hostFile.GetAbsolutePath());
            rm = new MockRM(conf);
            rm.Start();
            MockNM nm1 = rm.RegisterNode("host1:1234", 5120);

            NUnit.Framework.Assert.AreEqual(0, ClusterMetrics.GetMetrics().GetUnhealthyNMs());
            // node healthy
            nm1.NodeHeartbeat(true);
            // node unhealthy
            nm1.NodeHeartbeat(false);
            CheckUnealthyNMCount(rm, nm1, true, 1);
            // node healthy again
            nm1.NodeHeartbeat(true);
            CheckUnealthyNMCount(rm, nm1, false, 0);
        }
Exemplo n.º 12
0
 public virtual void SetUp()
 {
     configuration = new Configuration();
     UserGroupInformation.SetConfiguration(configuration);
     configuration.SetBoolean(YarnConfiguration.RmHaEnabled, true);
     configuration.Set(YarnConfiguration.RmHaIds, Rm1NodeId + "," + Rm2NodeId);
     foreach (string confKey in YarnConfiguration.GetServiceAddressConfKeys(configuration
                                                                            ))
     {
         configuration.Set(HAUtil.AddSuffix(confKey, Rm1NodeId), Rm1Address);
         configuration.Set(HAUtil.AddSuffix(confKey, Rm2NodeId), Rm2Address);
         configuration.Set(HAUtil.AddSuffix(confKey, Rm3NodeId), Rm3Address);
     }
     // Enable webapp to test web-services also
     configuration.SetBoolean(MockRM.EnableWebapp, true);
     configuration.SetBoolean(YarnConfiguration.YarnAclEnable, true);
     ClusterMetrics.Destroy();
     QueueMetrics.ClearQueueMetrics();
     DefaultMetricsSystem.Shutdown();
 }
Exemplo n.º 13
0
        public virtual void TestReboot()
        {
            Configuration conf = new Configuration();

            rm = new MockRM(conf);
            rm.Start();
            MockNM nm1 = rm.RegisterNode("host1:1234", 5120);
            MockNM nm2 = rm.RegisterNode("host2:1234", 2048);
            int    initialMetricCount           = ClusterMetrics.GetMetrics().GetNumRebootedNMs();
            NodeHeartbeatResponse nodeHeartbeat = nm1.NodeHeartbeat(true);

            NUnit.Framework.Assert.IsTrue(NodeAction.Normal.Equals(nodeHeartbeat.GetNodeAction
                                                                       ()));
            nodeHeartbeat = nm2.NodeHeartbeat(new Dictionary <ApplicationId, IList <ContainerStatus
                                                                                    > >(), true, -100);
            NUnit.Framework.Assert.IsTrue(NodeAction.Resync.Equals(nodeHeartbeat.GetNodeAction
                                                                       ()));
            NUnit.Framework.Assert.AreEqual("Too far behind rm response id:0 nm response id:-100"
                                            , nodeHeartbeat.GetDiagnosticsMessage());
            CheckRebootedNMCount(rm, ++initialMetricCount);
        }
Exemplo n.º 14
0
        public virtual void TestUnhealthyRebooting()
        {
            RMNodeImpl     node                  = GetUnhealthyNode();
            ClusterMetrics cm                    = ClusterMetrics.GetMetrics();
            int            initialActive         = cm.GetNumActiveNMs();
            int            initialLost           = cm.GetNumLostNMs();
            int            initialUnhealthy      = cm.GetUnhealthyNMs();
            int            initialDecommissioned = cm.GetNumDecommisionedNMs();
            int            initialRebooted       = cm.GetNumRebootedNMs();

            node.Handle(new RMNodeEvent(node.GetNodeID(), RMNodeEventType.Rebooting));
            NUnit.Framework.Assert.AreEqual("Active Nodes", initialActive, cm.GetNumActiveNMs
                                                ());
            NUnit.Framework.Assert.AreEqual("Lost Nodes", initialLost, cm.GetNumLostNMs());
            NUnit.Framework.Assert.AreEqual("Unhealthy Nodes", initialUnhealthy - 1, cm.GetUnhealthyNMs
                                                ());
            NUnit.Framework.Assert.AreEqual("Decommissioned Nodes", initialDecommissioned, cm
                                            .GetNumDecommisionedNMs());
            NUnit.Framework.Assert.AreEqual("Rebooted Nodes", initialRebooted + 1, cm.GetNumRebootedNMs
                                                ());
            NUnit.Framework.Assert.AreEqual(NodeState.Rebooted, node.GetState());
        }
Exemplo n.º 15
0
        public virtual void TestReconnectNode()
        {
            DrainDispatcher dispatcher = new DrainDispatcher();

            rm = new _MockRM_567(this, dispatcher);
            rm.Start();
            MockNM nm1 = rm.RegisterNode("host1:1234", 5120);
            MockNM nm2 = rm.RegisterNode("host2:5678", 5120);

            nm1.NodeHeartbeat(true);
            nm2.NodeHeartbeat(false);
            dispatcher.Await();
            CheckUnealthyNMCount(rm, nm2, true, 1);
            int          expectedNMs = ClusterMetrics.GetMetrics().GetNumActiveNMs();
            QueueMetrics metrics     = rm.GetResourceScheduler().GetRootQueueMetrics();

            // TODO Metrics incorrect in case of the FifoScheduler
            NUnit.Framework.Assert.AreEqual(5120, metrics.GetAvailableMB());
            // reconnect of healthy node
            nm1 = rm.RegisterNode("host1:1234", 5120);
            NodeHeartbeatResponse response = nm1.NodeHeartbeat(true);

            NUnit.Framework.Assert.IsTrue(NodeAction.Normal.Equals(response.GetNodeAction()));
            dispatcher.Await();
            NUnit.Framework.Assert.AreEqual(expectedNMs, ClusterMetrics.GetMetrics().GetNumActiveNMs
                                                ());
            CheckUnealthyNMCount(rm, nm2, true, 1);
            // reconnect of unhealthy node
            nm2      = rm.RegisterNode("host2:5678", 5120);
            response = nm2.NodeHeartbeat(false);
            NUnit.Framework.Assert.IsTrue(NodeAction.Normal.Equals(response.GetNodeAction()));
            dispatcher.Await();
            NUnit.Framework.Assert.AreEqual(expectedNMs, ClusterMetrics.GetMetrics().GetNumActiveNMs
                                                ());
            CheckUnealthyNMCount(rm, nm2, true, 1);
            // unhealthy node changed back to healthy
            nm2 = rm.RegisterNode("host2:5678", 5120);
            dispatcher.Await();
            response = nm2.NodeHeartbeat(true);
            response = nm2.NodeHeartbeat(true);
            dispatcher.Await();
            NUnit.Framework.Assert.AreEqual(5120 + 5120, metrics.GetAvailableMB());
            // reconnect of node with changed capability
            nm1 = rm.RegisterNode("host2:5678", 10240);
            dispatcher.Await();
            response = nm1.NodeHeartbeat(true);
            dispatcher.Await();
            NUnit.Framework.Assert.IsTrue(NodeAction.Normal.Equals(response.GetNodeAction()));
            NUnit.Framework.Assert.AreEqual(5120 + 10240, metrics.GetAvailableMB());
            // reconnect of node with changed capability and running applications
            IList <ApplicationId> runningApps = new AList <ApplicationId>();

            runningApps.AddItem(ApplicationId.NewInstance(1, 0));
            nm1 = rm.RegisterNode("host2:5678", 15360, 2, runningApps);
            dispatcher.Await();
            response = nm1.NodeHeartbeat(true);
            dispatcher.Await();
            NUnit.Framework.Assert.IsTrue(NodeAction.Normal.Equals(response.GetNodeAction()));
            NUnit.Framework.Assert.AreEqual(5120 + 15360, metrics.GetAvailableMB());
            // reconnect healthy node changing http port
            nm1 = new MockNM("host1:1234", 5120, rm.GetResourceTrackerService());
            nm1.SetHttpPort(3);
            nm1.RegisterNode();
            dispatcher.Await();
            response = nm1.NodeHeartbeat(true);
            response = nm1.NodeHeartbeat(true);
            dispatcher.Await();
            RMNode rmNode = rm.GetRMContext().GetRMNodes()[nm1.GetNodeId()];

            NUnit.Framework.Assert.AreEqual(3, rmNode.GetHttpPort());
            NUnit.Framework.Assert.AreEqual(5120, rmNode.GetTotalCapability().GetMemory());
            NUnit.Framework.Assert.AreEqual(5120 + 15360, metrics.GetAvailableMB());
        }
Exemplo n.º 16
0
        /// <exception cref="System.Exception"/>
        private void CheckUnealthyNMCount(MockRM rm, MockNM nm1, bool health, int count)
        {
            int waitCount = 0;

            while ((rm.GetRMContext().GetRMNodes()[nm1.GetNodeId()].GetState() != NodeState.Unhealthy
                    ) == health && waitCount++ < 20)
            {
                lock (this)
                {
                    Sharpen.Runtime.Wait(this, 100);
                }
            }
            NUnit.Framework.Assert.IsFalse((rm.GetRMContext().GetRMNodes()[nm1.GetNodeId()].GetState
                                                () != NodeState.Unhealthy) == health);
            NUnit.Framework.Assert.AreEqual("Unhealthy metrics not incremented", count, ClusterMetrics
                                            .GetMetrics().GetUnhealthyNMs());
        }
Exemplo n.º 17
0
 public virtual void TearDown()
 {
     ClusterMetrics.Destroy();
     QueueMetrics.ClearQueueMetrics();
     DefaultMetricsSystem.Shutdown();
 }
Exemplo n.º 18
0
 public virtual void Setup()
 {
     DefaultMetricsSystem.Initialize("ResourceManager");
     metrics = ClusterMetrics.GetMetrics();
 }
Exemplo n.º 19
0
        private void SetDecomissionedNMsMetrics()
        {
            ICollection <string> excludeList = hostsReader.GetExcludedHosts();

            ClusterMetrics.GetMetrics().SetDecommisionedNMs(excludeList.Count);
        }