/// <exception cref="System.Exception"/>
        /// <summary>
        /// Polls node heartbeats until a response lists at least one container to
        /// clean up (or the retry budget is spent), then asserts that exactly one
        /// container was reported in total.
        /// </summary>
        /// <param name="dispatcher">dispatcher awaited before each response is inspected</param>
        /// <param name="nm">mock node manager used to send the follow-up heartbeats</param>
        /// <param name="resp">heartbeat response to inspect first</param>
        protected internal virtual void WaitForContainerCleanup(DrainDispatcher dispatcher,
                                                                MockNM nm,
                                                                NodeHeartbeatResponse resp)
        {
            int attempts     = 0;
            int totalCleaned = 0;
            IList<ContainerId> pending;

            while (true)
            {
                dispatcher.Await();
                pending       = resp.GetContainersToCleanup();
                totalCleaned += pending.Count;
                if (totalCleaned >= 1)
                {
                    break;
                }
                Sharpen.Thread.Sleep(100);
                resp = nm.NodeHeartbeat(true);
                // Post-increment on purpose: the loop body may run up to 201 times.
                if (attempts++ >= 200)
                {
                    break;
                }
            }

            if (!pending.IsEmpty())
            {
                Log.Info("Got cleanup for " + pending[0]);
            }
            else
            {
                Log.Error("Failed to get any containers to cleanup");
            }
            NUnit.Framework.Assert.AreEqual(1, totalCleaned);
        }
Example #2
0
        /// <summary>
        /// Answers every heartbeat with a newly created response record; no field
        /// of the response is set here.
        /// </summary>
        /// <param name="request">incoming heartbeat; not read by this implementation</param>
        /// <returns>a new <c>NodeHeartbeatResponse</c> obtained from the record factory</returns>
        /// <exception cref="Org.Apache.Hadoop.Yarn.Exceptions.YarnException"/>
        /// <exception cref="System.IO.IOException"/>
        public virtual NodeHeartbeatResponse NodeHeartbeat(NodeHeartbeatRequest request)
        {
            return recordFactory.NewRecordInstance<NodeHeartbeatResponse>();
        }
Example #3
0
        /// <summary>
        /// Checks that the per-application system credentials set on a heartbeat
        /// response are still equal after the record is rebuilt from its proto.
        /// </summary>
        public virtual void TestNodeHeartBeatResponse()
        {
            // Two tokens of different kinds, stored in a single credentials bundle.
            Org.Apache.Hadoop.Security.Token.Token<DelegationTokenIdentifier> firstToken =
                new Org.Apache.Hadoop.Security.Token.Token<DelegationTokenIdentifier>();
            firstToken.SetKind(new Text("kind1"));
            Org.Apache.Hadoop.Security.Token.Token<DelegationTokenIdentifier> secondToken =
                new Org.Apache.Hadoop.Security.Token.Token<DelegationTokenIdentifier>();
            secondToken.SetKind(new Text("kind2"));

            Credentials credentials = new Credentials();
            credentials.AddToken(new Text("token1"), firstToken);
            credentials.AddToken(new Text("token2"), secondToken);

            // Serialize the bundle and wrap only the bytes that were actually written.
            DataOutputBuffer serialized = new DataOutputBuffer();
            credentials.WriteTokenStorageToStream(serialized);
            ByteBuffer credentialBytes = ByteBuffer.Wrap(serialized.GetData(), 0, serialized.GetLength());

            IDictionary<ApplicationId, ByteBuffer> expected = new Dictionary<ApplicationId, ByteBuffer>();
            expected[ApplicationId.NewInstance(1234, 1)] = credentialBytes;

            NodeHeartbeatResponse original =
                Org.Apache.Hadoop.Yarn.Util.Records.NewRecord<NodeHeartbeatResponse>();
            original.SetSystemCredentialsForApps(expected);

            // Rebuild the record from its proto form.
            NodeHeartbeatResponsePBImpl originalImpl = (NodeHeartbeatResponsePBImpl)original;
            NodeHeartbeatResponse       rebuilt      = new NodeHeartbeatResponsePBImpl(originalImpl.GetProto());

            NUnit.Framework.Assert.AreEqual(expected, rebuilt.GetSystemCredentialsForApps());
        }
Example #4
0
        /// <summary>
        /// Checks that containers and applications queued for cleanup on a node
        /// survive a status update and are only moved out of the node (into the
        /// heartbeat response) by <c>UpdateNodeHeartbeatResponseForCleanup</c>.
        /// </summary>
        public virtual void TestUpdateHeartbeatResponseForCleanup()
        {
            RMNodeImpl runningNode = GetRunningNode();
            NodeId     id          = runningNode.GetNodeID();

            // Queue one container for cleanup.
            ApplicationId containerAppId   = BuilderUtils.NewApplicationId(0, 0);
            ContainerId   expiredContainer = BuilderUtils.NewContainerId(
                BuilderUtils.NewApplicationAttemptId(containerAppId, 0), 0);
            runningNode.Handle(new RMNodeCleanContainerEvent(id, expiredContainer));
            NUnit.Framework.Assert.AreEqual(1, runningNode.GetContainersToCleanUp().Count);

            // Queue one application for cleanup.
            ApplicationId doneApp = BuilderUtils.NewApplicationId(0, 1);
            runningNode.Handle(new RMNodeCleanAppEvent(id, doneApp));
            NUnit.Framework.Assert.AreEqual(1, runningNode.GetAppsToCleanup().Count);

            // A plain status update must leave both queues untouched ...
            RMNodeStatusEvent update = GetMockRMNodeStatusEvent();
            runningNode.Handle(update);
            NUnit.Framework.Assert.AreEqual(1, runningNode.GetContainersToCleanUp().Count);
            NUnit.Framework.Assert.AreEqual(1, runningNode.GetAppsToCleanup().Count);

            // ... whereas filling in a heartbeat response drains them into it.
            NodeHeartbeatResponse heartbeat =
                Org.Apache.Hadoop.Yarn.Util.Records.NewRecord<NodeHeartbeatResponse>();
            runningNode.UpdateNodeHeartbeatResponseForCleanup(heartbeat);
            NUnit.Framework.Assert.AreEqual(0, runningNode.GetContainersToCleanUp().Count);
            NUnit.Framework.Assert.AreEqual(0, runningNode.GetAppsToCleanup().Count);
            NUnit.Framework.Assert.AreEqual(1, heartbeat.GetContainersToCleanup().Count);
            NUnit.Framework.Assert.AreEqual(expiredContainer, heartbeat.GetContainersToCleanup()[0]);
            NUnit.Framework.Assert.AreEqual(1, heartbeat.GetApplicationsToCleanup().Count);
            NUnit.Framework.Assert.AreEqual(doneApp, heartbeat.GetApplicationsToCleanup()[0]);
        }
Example #5
0
        /// <summary>
        /// Verifies that setting the exclude-file path on the configuration after
        /// the RM has started, followed by a node-list refresh, causes only the
        /// host listed in that file to be told to shut down.
        /// </summary>
        public virtual void TestAddNewExcludePathToConfiguration()
        {
            Configuration conf = new Configuration();

            rm = new MockRM(conf);
            rm.Start();
            MockNM         nm1     = rm.RegisterNode("host1:1234", 5120);
            MockNM         nm2     = rm.RegisterNode("host2:5678", 10240);
            ClusterMetrics metrics = ClusterMetrics.GetMetrics();

            // A test precondition must fail the test rather than be compiled out
            // of release builds, so use an NUnit assertion instead of Debug.Assert.
            NUnit.Framework.Assert.IsNotNull(metrics);
            int initialMetricCount = metrics.GetNumDecommisionedNMs();
            NodeHeartbeatResponse nodeHeartbeat = nm1.NodeHeartbeat(true);

            NUnit.Framework.Assert.AreEqual(NodeAction.Normal, nodeHeartbeat.GetNodeAction());
            nodeHeartbeat = nm2.NodeHeartbeat(true);
            NUnit.Framework.Assert.AreEqual(NodeAction.Normal, nodeHeartbeat.GetNodeAction());

            // Exclude host2 only now, after both nodes are registered and healthy.
            WriteToHostsFile("host2");
            conf.Set(YarnConfiguration.RmNodesExcludeFilePath, hostFile.GetAbsolutePath());
            rm.GetNodesListManager().RefreshNodes(conf);
            CheckDecommissionedNMCount(rm, ++initialMetricCount);

            // NUnit's AreEqual takes (expected, actual, message); the message goes last.
            nodeHeartbeat = nm1.NodeHeartbeat(true);
            NUnit.Framework.Assert.AreEqual(NodeAction.Normal, nodeHeartbeat.GetNodeAction(),
                                            "Node should not have been decomissioned.");
            nodeHeartbeat = nm2.NodeHeartbeat(true);
            NUnit.Framework.Assert.AreEqual(NodeAction.Shutdown, nodeHeartbeat.GetNodeAction(),
                                            "Node should have been decomissioned but is in state "
                                            + nodeHeartbeat.GetNodeAction());
        }
Example #6
0
 /// <summary>
 /// Creates a node event of type <c>StatusUpdate</c> that stores the supplied
 /// health status, container statuses, keep-alive application ids and latest
 /// heartbeat response.
 /// </summary>
 /// <param name="nodeId">id of the node, passed to the base event</param>
 /// <param name="nodeHealthStatus">health status stored on the event</param>
 /// <param name="collection">container statuses stored on the event</param>
 /// <param name="keepAliveAppIds">keep-alive application ids stored on the event</param>
 /// <param name="latestResponse">heartbeat response stored on the event</param>
 public RMNodeStatusEvent(NodeId nodeId,
                          NodeHealthStatus nodeHealthStatus,
                          IList<ContainerStatus> collection,
                          IList<ApplicationId> keepAliveAppIds,
                          NodeHeartbeatResponse latestResponse)
     : base(nodeId, RMNodeEventType.StatusUpdate)
 {
     this.latestResponse       = latestResponse;
     this.keepAliveAppIds      = keepAliveAppIds;
     this.containersCollection = collection;
     this.nodeHealthStatus     = nodeHealthStatus;
 }
            /// <summary>
            /// Logs the current heartbeat counter, stamps it on the request's node
            /// status, advances the counter and replies with a response that carries
            /// the advanced counter and a next-heartbeat interval of 1000.
            /// </summary>
            /// <param name="request">heartbeat whose node status receives the response id</param>
            /// <returns>a response built by <c>YarnServerBuilderUtils.NewNodeHeartbeatResponse</c></returns>
            /// <exception cref="Org.Apache.Hadoop.Yarn.Exceptions.YarnException"/>
            /// <exception cref="System.IO.IOException"/>
            public virtual NodeHeartbeatResponse NodeHeartbeat(NodeHeartbeatRequest request)
            {
                NodeStatus reportedStatus = request.GetNodeStatus();

                Log.Info("Got heartbeat number " + heartBeatID);
                // The status gets the pre-increment value; the response below gets the incremented one.
                reportedStatus.SetResponseId(heartBeatID++);
                return YarnServerBuilderUtils.NewNodeHeartbeatResponse(
                    heartBeatID, null, null, null, null, null, 1000L);
            }
Example #8
0
        /// <summary>
        /// Verifies that hosts added to the exclude file (by host name or by IP)
        /// are told to shut down after a node-list refresh, and that a node which
        /// re-registers once the exclude file is emptied is treated as normal again.
        /// </summary>
        public virtual void TestDecommissionWithExcludeHosts()
        {
            Configuration conf = new Configuration();

            conf.Set(YarnConfiguration.RmNodesExcludeFilePath, hostFile.GetAbsolutePath());
            WriteToHostsFile(string.Empty);
            DrainDispatcher dispatcher = new DrainDispatcher();

            rm = new _MockRM_162(dispatcher, conf);
            rm.Start();
            MockNM nm1 = rm.RegisterNode("host1:1234", 5120);
            MockNM nm2 = rm.RegisterNode("host2:5678", 10240);
            MockNM nm3 = rm.RegisterNode("localhost:4433", 1024);

            dispatcher.Await();
            int metricCount = ClusterMetrics.GetMetrics().GetNumDecommisionedNMs();
            NodeHeartbeatResponse nodeHeartbeat = nm1.NodeHeartbeat(true);

            NUnit.Framework.Assert.AreEqual(NodeAction.Normal, nodeHeartbeat.GetNodeAction());
            nodeHeartbeat = nm2.NodeHeartbeat(true);
            NUnit.Framework.Assert.AreEqual(NodeAction.Normal, nodeHeartbeat.GetNodeAction());
            dispatcher.Await();
            // To test that IPs also work
            string ip = NetUtils.NormalizeHostName("localhost");

            WriteToHostsFile("host2", ip);
            rm.GetNodesListManager().RefreshNodes(conf);
            CheckDecommissionedNMCount(rm, metricCount + 2);
            nodeHeartbeat = nm1.NodeHeartbeat(true);
            NUnit.Framework.Assert.AreEqual(NodeAction.Normal, nodeHeartbeat.GetNodeAction());
            // NUnit takes the failure message as the LAST argument (JUnit takes it first).
            nodeHeartbeat = nm2.NodeHeartbeat(true);
            NUnit.Framework.Assert.AreEqual(NodeAction.Shutdown, nodeHeartbeat.GetNodeAction(),
                                            "The decommisioned metrics are not updated");
            nodeHeartbeat = nm3.NodeHeartbeat(true);
            NUnit.Framework.Assert.AreEqual(NodeAction.Shutdown, nodeHeartbeat.GetNodeAction(),
                                            "The decommisioned metrics are not updated");
            dispatcher.Await();

            // Empty the exclude file again and let the localhost node rejoin.
            WriteToHostsFile(string.Empty);
            rm.GetNodesListManager().RefreshNodes(conf);
            nm3 = rm.RegisterNode("localhost:4433", 1024);
            dispatcher.Await();
            nodeHeartbeat = nm3.NodeHeartbeat(true);
            dispatcher.Await();
            NUnit.Framework.Assert.AreEqual(NodeAction.Normal, nodeHeartbeat.GetNodeAction());
            // Decommissioned node count is baseline + 1, since one node rejoined
            // after the exclude file was updated.
            CheckDecommissionedNMCount(rm, metricCount + 1);
        }
Example #9
0
        /// <summary>
        /// Sends one heartbeat carrying this node's current container statuses and
        /// the last known response id, then remembers the response id returned by
        /// the resource tracker service.
        /// </summary>
        /// <exception cref="System.IO.IOException"/>
        /// <exception cref="Org.Apache.Hadoop.Yarn.Exceptions.YarnException"/>
        public virtual void Heartbeat()
        {
            NodeStatus currentStatus =
                Org.Apache.Hadoop.Yarn.Server.Resourcemanager.NodeManager.CreateNodeStatus(
                    nodeId, GetContainerStatuses(containers));
            currentStatus.SetResponseId(responseID);

            NodeHeartbeatRequest heartbeat = recordFactory.NewRecordInstance<NodeHeartbeatRequest>();
            heartbeat.SetNodeStatus(currentStatus);

            // Keep the id from the reply so the next heartbeat can echo it back.
            responseID = resourceTrackerService.NodeHeartbeat(heartbeat).GetResponseId();
        }
        /// <summary>
        /// Drives one application through a request for 10000 containers on a
        /// single node, waits until all of them are allocated, finishes the
        /// attempt, and checks that as many containers are reported for cleanup
        /// as were allocated.
        /// </summary>
        /// <param name="rm">resource manager under test; started and stopped by this method</param>
        /// <exception cref="System.Exception"/>
        private void TestRMWritingMassiveHistory(MockRM rm)
        {
            rm.Start();
            MockNM node        = rm.RegisterNode("127.0.0.1:1234", 1024 * 10100);
            RMApp  application = rm.SubmitApp(1024);

            node.NodeHeartbeat(true);
            MockAM master = rm.SendAMLaunched(application.GetCurrentAppAttempt().GetAppAttemptId());
            master.RegisterAppAttempt();

            int wanted = 10000;
            master.Allocate("127.0.0.1", 1024, wanted, new AList<ContainerId>());
            node.NodeHeartbeat(true);

            // Accumulate allocations over repeated empty allocate calls (at most 200 retries).
            int granted = master.Allocate(new AList<ResourceRequest>(), new AList<ContainerId>())
                                .GetAllocatedContainers().Count;
            for (int polls = 0; granted < wanted && polls < 200; polls++)
            {
                Sharpen.Thread.Sleep(300);
                granted += master.Allocate(new AList<ResourceRequest>(), new AList<ContainerId>())
                                 .GetAllocatedContainers().Count;
                node.NodeHeartbeat(true);
            }
            NUnit.Framework.Assert.AreEqual(wanted, granted);

            master.UnregisterAppAttempt();
            master.WaitForState(RMAppAttemptState.Finishing);
            node.NodeHeartbeat(master.GetApplicationAttemptId(), 1, ContainerState.Complete);
            master.WaitForState(RMAppAttemptState.Finished);

            // Accumulate cleanup notifications the same way (at most 200 retries).
            int released = node.NodeHeartbeat(true).GetContainersToCleanup().Count;
            for (int polls = 0; released < granted && polls < 200; polls++)
            {
                Sharpen.Thread.Sleep(300);
                released += node.NodeHeartbeat(true).GetContainersToCleanup().Count;
            }
            NUnit.Framework.Assert.AreEqual(granted, released);

            rm.WaitForState(application.GetApplicationId(), RMAppState.Finished);
            rm.Stop();
        }
Example #11
0
        /// <summary>
        /// Builds a mocked status-update event whose health status reports the
        /// node as healthy and whose latest response is a mocked heartbeat response.
        /// </summary>
        /// <returns>the stubbed <c>RMNodeStatusEvent</c> mock</returns>
        private RMNodeStatusEvent GetMockRMNodeStatusEvent()
        {
            NodeHealthStatus healthy = Org.Mockito.Mockito.Mock<NodeHealthStatus>();
            Org.Mockito.Mockito.DoReturn(true).When(healthy).GetIsNodeHealthy();

            NodeHeartbeatResponse latest      = Org.Mockito.Mockito.Mock<NodeHeartbeatResponse>();
            RMNodeStatusEvent     statusEvent = Org.Mockito.Mockito.Mock<RMNodeStatusEvent>();

            Org.Mockito.Mockito.DoReturn(healthy).When(statusEvent).GetNodeHealthStatus();
            Org.Mockito.Mockito.DoReturn(latest).When(statusEvent).GetLatestResponse();
            Org.Mockito.Mockito.DoReturn(RMNodeEventType.StatusUpdate).When(statusEvent).GetType();
            return statusEvent;
        }
Example #12
0
            /// <summary>
            /// Installs any master keys carried by the heartbeat response into the
            /// matching secret managers of the enclosing context; keys that are
            /// absent (null) are ignored.
            /// </summary>
            /// <param name="response">heartbeat response to read the keys from</param>
            private void UpdateMasterKeys(NodeHeartbeatResponse response)
            {
                MasterKey containerTokenKey = response.GetContainerTokenMasterKey();
                if (containerTokenKey != null)
                {
                    this._enclosing.context.GetContainerTokenSecretManager().SetMasterKey(containerTokenKey);
                }

                MasterKey nmTokenKey = response.GetNMTokenMasterKey();
                if (nmTokenKey != null)
                {
                    this._enclosing.context.GetNMTokenSecretManager().SetMasterKey(nmTokenKey);
                }
            }
 /// <summary>
 /// Heartbeats once per second until a response's application cleanup list
 /// contains exactly the given application. There is no retry limit here.
 /// </summary>
 /// <param name="nm">mock node manager that sends the heartbeats</param>
 /// <param name="appId">application expected to be the only cleanup entry</param>
 /// <exception cref="System.Exception"/>
 private void WaitForAppCleanupMessageRecved(MockNM nm, ApplicationId appId)
 {
     for (;;)
     {
         NodeHeartbeatResponse heartbeat = nm.NodeHeartbeat(true);
         bool onlyTargetListed = heartbeat.GetApplicationsToCleanup() != null
                                 && heartbeat.GetApplicationsToCleanup().Count == 1
                                 && appId.Equals(heartbeat.GetApplicationsToCleanup()[0]);

         if (onlyTargetListed)
         {
             return;
         }
         Log.Info("Haven't got application=" + appId.ToString() + " in cleanup list from node heartbeat response, "
                  + "sleep for a while before next heartbeat");
         Sharpen.Thread.Sleep(1000);
     }
 }
Example #14
0
        /// <summary>
        /// Checks that the heartbeat interval configured on the RM (4000 here) is
        /// the next-heartbeat interval reported to every registered node.
        /// </summary>
        /// <exception cref="System.Exception"/>
        public virtual void TestGetNextHeartBeatInterval()
        {
            Configuration conf = new Configuration();

            conf.Set(YarnConfiguration.RmNmHeartbeatIntervalMs, "4000");
            rm = new MockRM(conf);
            rm.Start();

            MockNM[] nodes = new MockNM[]
            {
                rm.RegisterNode("host1:1234", 5120),
                rm.RegisterNode("host2:5678", 10240)
            };

            // Each node heartbeats in registration order and must see the configured interval.
            foreach (MockNM node in nodes)
            {
                NodeHeartbeatResponse heartbeat = node.NodeHeartbeat(true);
                NUnit.Framework.Assert.AreEqual(4000, heartbeat.GetNextHeartBeatInterval());
            }
        }
        /// <summary>
        /// Checks how the resource tracker service advances the heartbeat response
        /// id: it increases on each in-sequence heartbeat, is repeated when the
        /// node resends the previous id, and a node that falls too far behind is
        /// told to resync with a diagnostic message.
        /// </summary>
        public virtual void TestRPCResponseId()
        {
            string   node       = "localhost";
            Resource capability = BuilderUtils.NewResource(1024, 1);

            // Register the node. (A second, identical request object that was
            // populated but never sent has been removed.)
            nodeId = NodeId.NewInstance(node, 1234);
            RegisterNodeManagerRequest registration =
                recordFactory.NewRecordInstance<RegisterNodeManagerRequest>();

            registration.SetNodeId(nodeId);
            registration.SetHttpPort(0);
            registration.SetResource(capability);
            resourceTrackerService.RegisterNodeManager(registration);

            NodeStatus nodeStatus = recordFactory.NewRecordInstance<NodeStatus>();

            nodeStatus.SetNodeId(nodeId);
            NodeHealthStatus nodeHealthStatus = recordFactory.NewRecordInstance<NodeHealthStatus>();

            nodeHealthStatus.SetIsNodeHealthy(true);
            nodeStatus.SetNodeHealthStatus(nodeHealthStatus);
            NodeHeartbeatRequest nodeHeartBeatRequest =
                recordFactory.NewRecordInstance<NodeHeartbeatRequest>();

            nodeHeartBeatRequest.SetNodeStatus(nodeStatus);
            nodeStatus.SetResponseId(0);
            NodeHeartbeatResponse response = resourceTrackerService.NodeHeartbeat(nodeHeartBeatRequest);

            // AreEqual (rather than IsTrue(a == b)) reports the actual id on failure.
            NUnit.Framework.Assert.AreEqual(1, response.GetResponseId());
            nodeStatus.SetResponseId(response.GetResponseId());
            response = resourceTrackerService.NodeHeartbeat(nodeHeartBeatRequest);
            NUnit.Framework.Assert.AreEqual(2, response.GetResponseId());
            /* try calling with less response id */
            response = resourceTrackerService.NodeHeartbeat(nodeHeartBeatRequest);
            NUnit.Framework.Assert.AreEqual(2, response.GetResponseId());
            nodeStatus.SetResponseId(0);
            response = resourceTrackerService.NodeHeartbeat(nodeHeartBeatRequest);
            NUnit.Framework.Assert.AreEqual(NodeAction.Resync, response.GetNodeAction());
            NUnit.Framework.Assert.AreEqual("Too far behind rm response id:2 nm response id:0"
                                            , response.GetDiagnosticsMessage());
        }
Example #16
0
        /// <summary>
        /// Verifies that a node dropped from the include file is told to shut down
        /// after a node-list refresh, while nodes still listed (by host name or by
        /// IP) keep running, and that the decommissioned-node metric rises by one.
        /// </summary>
        public virtual void TestDecommissionWithIncludeHosts()
        {
            WriteToHostsFile("localhost", "host1", "host2");
            Configuration conf = new Configuration();

            conf.Set(YarnConfiguration.RmNodesIncludeFilePath, hostFile.GetAbsolutePath());
            rm = new MockRM(conf);
            rm.Start();
            MockNM         nm1     = rm.RegisterNode("host1:1234", 5120);
            MockNM         nm2     = rm.RegisterNode("host2:5678", 10240);
            MockNM         nm3     = rm.RegisterNode("localhost:4433", 1024);
            ClusterMetrics metrics = ClusterMetrics.GetMetrics();

            // A test precondition must fail the test rather than be compiled out
            // of release builds, so use an NUnit assertion instead of Debug.Assert.
            NUnit.Framework.Assert.IsNotNull(metrics);
            int metricCount = metrics.GetNumDecommisionedNMs();
            NodeHeartbeatResponse nodeHeartbeat = nm1.NodeHeartbeat(true);

            NUnit.Framework.Assert.AreEqual(NodeAction.Normal, nodeHeartbeat.GetNodeAction());
            nodeHeartbeat = nm2.NodeHeartbeat(true);
            NUnit.Framework.Assert.AreEqual(NodeAction.Normal, nodeHeartbeat.GetNodeAction());
            nodeHeartbeat = nm3.NodeHeartbeat(true);
            NUnit.Framework.Assert.AreEqual(NodeAction.Normal, nodeHeartbeat.GetNodeAction());
            // To test that IPs also work
            string ip = NetUtils.NormalizeHostName("localhost");

            WriteToHostsFile("host1", ip);
            rm.GetNodesListManager().RefreshNodes(conf);
            CheckDecommissionedNMCount(rm, ++metricCount);
            nodeHeartbeat = nm1.NodeHeartbeat(true);
            NUnit.Framework.Assert.AreEqual(NodeAction.Normal, nodeHeartbeat.GetNodeAction());
            // Compare against the baseline-adjusted count instead of a literal 1,
            // so the check does not assume the metric started at zero.
            NUnit.Framework.Assert.AreEqual(metricCount, ClusterMetrics.GetMetrics().GetNumDecommisionedNMs());
            // NUnit takes the failure message as the LAST argument (JUnit takes it first).
            nodeHeartbeat = nm2.NodeHeartbeat(true);
            NUnit.Framework.Assert.AreEqual(NodeAction.Shutdown, nodeHeartbeat.GetNodeAction(),
                                            "Node is not decommisioned.");
            nodeHeartbeat = nm3.NodeHeartbeat(true);
            NUnit.Framework.Assert.AreEqual(NodeAction.Normal, nodeHeartbeat.GetNodeAction());
            NUnit.Framework.Assert.AreEqual(metricCount, ClusterMetrics.GetMetrics().GetNumDecommisionedNMs());
        }
Example #17
0
        /// <summary>
        /// Builds a heartbeat request from the supplied container statuses, health
        /// flag and response id, sends it to the resource tracker, and adopts any
        /// master key in the reply whose key id differs from the one currently held.
        /// </summary>
        /// <param name="conts">
        /// container statuses per application; each entry overwrites the statuses
        /// set by the previous one, so only the last enumerated entry is sent
        /// </param>
        /// <param name="isHealthy">health flag placed in the node health status</param>
        /// <param name="resId">response id placed in the node status</param>
        /// <returns>the heartbeat response received from the resource tracker</returns>
        /// <exception cref="System.Exception"/>
        public virtual NodeHeartbeatResponse NodeHeartbeat(
            IDictionary<ApplicationId, IList<Org.Apache.Hadoop.Yarn.Api.Records.ContainerStatus>> conts,
            bool isHealthy,
            int resId)
        {
            // Health section of the report.
            NodeHealthStatus health = Org.Apache.Hadoop.Yarn.Util.Records.NewRecord<NodeHealthStatus>();
            health.SetHealthReport(string.Empty);
            health.SetIsNodeHealthy(isHealthy);
            health.SetLastHealthReportTime(1);

            // Node status: ids first, then container statuses, then health.
            NodeStatus report = Org.Apache.Hadoop.Yarn.Util.Records.NewRecord<NodeStatus>();
            report.SetResponseId(resId);
            report.SetNodeId(nodeId);
            foreach (KeyValuePair<ApplicationId, IList<Org.Apache.Hadoop.Yarn.Api.Records.ContainerStatus>> pair in conts)
            {
                Org.Mortbay.Log.Log.Info("entry.getValue() " + pair.Value);
                report.SetContainersStatuses(pair.Value);
            }
            report.SetNodeHealthStatus(health);

            NodeHeartbeatRequest heartbeat = Org.Apache.Hadoop.Yarn.Util.Records.NewRecord<NodeHeartbeatRequest>();
            heartbeat.SetNodeStatus(report);
            heartbeat.SetLastKnownContainerTokenMasterKey(this.currentContainerTokenMasterKey);
            heartbeat.SetLastKnownNMTokenMasterKey(this.currentNMTokenMasterKey);

            NodeHeartbeatResponse reply = resourceTracker.NodeHeartbeat(heartbeat);

            MasterKey containerTokenKey = reply.GetContainerTokenMasterKey();
            if (containerTokenKey != null)
            {
                if (containerTokenKey.GetKeyId() != this.currentContainerTokenMasterKey.GetKeyId())
                {
                    this.currentContainerTokenMasterKey = containerTokenKey;
                }
            }

            MasterKey nmTokenKey = reply.GetNMTokenMasterKey();
            if (nmTokenKey != null)
            {
                if (nmTokenKey.GetKeyId() != this.currentNMTokenMasterKey.GetKeyId())
                {
                    this.currentNMTokenMasterKey = nmTokenKey;
                }
            }
            return reply;
        }
Example #18
0
        /// <summary>
        /// Protobuf-facing adapter: wraps the request proto in a record, forwards
        /// it to the real implementation and returns the response's proto.
        /// </summary>
        /// <param name="controller">RPC controller; not used by this method</param>
        /// <param name="proto">heartbeat request in proto form</param>
        /// <returns>the heartbeat response in proto form</returns>
        /// <exception cref="Com.Google.Protobuf.ServiceException">
        /// wraps a <c>YarnException</c> or <c>IOException</c> raised by the real implementation
        /// </exception>
        public virtual YarnServerCommonServiceProtos.NodeHeartbeatResponseProto NodeHeartbeat(
            RpcController controller,
            YarnServerCommonServiceProtos.NodeHeartbeatRequestProto proto)
        {
            NodeHeartbeatRequestPBImpl heartbeat = new NodeHeartbeatRequestPBImpl(proto);

            try
            {
                NodeHeartbeatResponsePBImpl reply = (NodeHeartbeatResponsePBImpl)real.NodeHeartbeat(heartbeat);
                return reply.GetProto();
            }
            catch (YarnException yarnFailure)
            {
                throw new ServiceException(yarnFailure);
            }
            catch (IOException ioFailure)
            {
                throw new ServiceException(ioFailure);
            }
        }
        /// <summary>
        /// Copies the next container-token and NM-token master keys into the
        /// heartbeat response when a next key exists and its id differs from the
        /// last key id the node reported in its request.
        /// </summary>
        /// <param name="request">heartbeat request carrying the node's last known keys</param>
        /// <param name="nodeHeartBeatResponse">response that receives the keys to send across</param>
        private void PopulateKeys(NodeHeartbeatRequest request,
                                  NodeHeartbeatResponse nodeHeartBeatResponse)
        {
            // Container token master key: send it if the node does not have it yet.
            MasterKey nextContainerTokenKey = this.containerTokenSecretManager.GetNextKey();
            if (nextContainerTokenKey != null)
            {
                if (request.GetLastKnownContainerTokenMasterKey().GetKeyId() != nextContainerTokenKey.GetKeyId())
                {
                    nodeHeartBeatResponse.SetContainerTokenMasterKey(nextContainerTokenKey);
                }
            }

            // NM token master key: same rule.
            MasterKey nextNmTokenKey = this.nmTokenSecretManager.GetNextKey();
            if (nextNmTokenKey != null)
            {
                if (request.GetLastKnownNMTokenMasterKey().GetKeyId() != nextNmTokenKey.GetKeyId())
                {
                    nodeHeartBeatResponse.SetNMTokenMasterKey(nextNmTokenKey);
                }
            }
        }
Example #20
0
        /// <summary>
        /// Checks that a heartbeat carrying a response id far behind the RM's
        /// (-100 here) is answered with a resync action and a diagnostic message,
        /// and that the rebooted-node metric goes up by one.
        /// </summary>
        public virtual void TestReboot()
        {
            Configuration conf = new Configuration();

            rm = new MockRM(conf);
            rm.Start();
            MockNM upToDateNode = rm.RegisterNode("host1:1234", 5120);
            MockNM staleNode    = rm.RegisterNode("host2:1234", 2048);
            int    rebootedBefore = ClusterMetrics.GetMetrics().GetNumRebootedNMs();

            NodeHeartbeatResponse normalReply = upToDateNode.NodeHeartbeat(true);
            NUnit.Framework.Assert.IsTrue(NodeAction.Normal.Equals(normalReply.GetNodeAction()));

            // An empty status map with response id -100 simulates a node that is out of sync.
            IDictionary<ApplicationId, IList<ContainerStatus>> noContainers =
                new Dictionary<ApplicationId, IList<ContainerStatus>>();
            NodeHeartbeatResponse resyncReply = staleNode.NodeHeartbeat(noContainers, true, -100);

            NUnit.Framework.Assert.IsTrue(NodeAction.Resync.Equals(resyncReply.GetNodeAction()));
            NUnit.Framework.Assert.AreEqual("Too far behind rm response id:0 nm response id:-100"
                                            , resyncReply.GetDiagnosticsMessage());
            CheckRebootedNMCount(rm, rebootedBefore + 1);
        }
        /// <summary>
        /// Creates a heartbeat response record populated with the given values.
        /// </summary>
        /// <param name="responseId">response id to set</param>
        /// <param name="action">node action to set</param>
        /// <param name="containersToCleanUp">containers to add for cleanup; skipped when null</param>
        /// <param name="applicationsToCleanUp">applications to add for cleanup; skipped when null</param>
        /// <param name="containerTokenMasterKey">container token master key to set</param>
        /// <param name="nmTokenMasterKey">NM token master key to set</param>
        /// <param name="nextHeartbeatInterval">next heartbeat interval to set</param>
        /// <returns>the populated response record</returns>
        public static NodeHeartbeatResponse NewNodeHeartbeatResponse(
            int responseId,
            NodeAction action,
            IList<ContainerId> containersToCleanUp,
            IList<ApplicationId> applicationsToCleanUp,
            MasterKey containerTokenMasterKey,
            MasterKey nmTokenMasterKey,
            long nextHeartbeatInterval)
        {
            NodeHeartbeatResponse built = recordFactory.NewRecordInstance<NodeHeartbeatResponse>();

            // Scalar fields are always set, even when the supplied value is null.
            built.SetResponseId(responseId);
            built.SetNodeAction(action);
            built.SetContainerTokenMasterKey(containerTokenMasterKey);
            built.SetNMTokenMasterKey(nmTokenMasterKey);
            built.SetNextHeartBeatInterval(nextHeartbeatInterval);

            // The cleanup lists are optional.
            if (containersToCleanUp != null)
            {
                built.AddAllContainersToCleanup(containersToCleanUp);
            }
            if (applicationsToCleanUp != null)
            {
                built.AddAllApplicationsToCleanup(applicationsToCleanUp);
            }
            return built;
        }
        /// <summary>
        /// Starts an application on a first RM, brings up a second RM on the same
        /// state store, re-registers the node with it, and checks that a container
        /// the node then reports for that application is sent back for cleanup.
        /// </summary>
        /// <exception cref="System.Exception"/>
        public virtual void TestContainerCleanupWhenRMRestartedAppNotRegistered()
        {
            conf.SetInt(YarnConfiguration.RmAmMaxAttempts, 1);
            MemoryRMStateStore sharedStore = new MemoryRMStateStore();
            sharedStore.Init(conf);

            // First RM: register the node, submit an app and launch its AM.
            DrainDispatcher firstDispatcher = new DrainDispatcher();
            MockRM          firstRm         = new _MockRM_413(firstDispatcher, conf, sharedStore);
            firstRm.Start();

            MockNM node = new MockNM("127.0.0.1:1234", 15120, firstRm.GetResourceTrackerService());
            node.RegisterNode();

            RMApp  application = firstRm.SubmitApp(200);
            MockAM master      = LaunchAM(application, firstRm, node);
            node.NodeHeartbeat(master.GetApplicationAttemptId(), 1, ContainerState.Running);
            firstRm.WaitForState(application.GetApplicationId(), RMAppState.Running);

            // Second RM, created on the same state store.
            DrainDispatcher secondDispatcher = new DrainDispatcher();
            MockRM          secondRm         = new _MockRM_432(secondDispatcher, conf, sharedStore);
            secondRm.Start();

            // Point the node at the second RM and register it with its running app.
            node.SetResourceTrackerService(secondRm.GetResourceTrackerService());
            node.RegisterNode(Arrays.AsList(application.GetApplicationId()));
            secondRm.WaitForState(application.GetApplicationId(), RMAppState.Accepted);

            // Add unknown container for application unknown to scheduler
            NodeHeartbeatResponse firstReply =
                node.NodeHeartbeat(master.GetApplicationAttemptId(), 2, ContainerState.Running);
            WaitForContainerCleanup(secondDispatcher, node, firstReply);

            firstRm.Stop();
            secondRm.Stop();
        }
Example #23
0
 /// <summary>
 /// Updates a
 /// <see cref="Org.Apache.Hadoop.Yarn.Server.Api.Protocolrecords.NodeHeartbeatResponse"/>
 /// with the list of containers and applications to clean up for this node.
 /// </summary>
 /// <param name="response">
 /// The
 /// <see cref="Org.Apache.Hadoop.Yarn.Server.Api.Protocolrecords.NodeHeartbeatResponse"/>
 /// to update.
 /// </param>
 public abstract void UpdateNodeHeartbeatResponseForCleanup(NodeHeartbeatResponse
                                                            response);
        /// <summary>
        /// Handles a heartbeat sent by a node manager.
        /// <para>
        /// Returns the <c>shutDown</c> response when the node's host is not valid, the
        /// <c>resync</c> response when the node is not registered or when its response id
        /// lags the last response id by more than one, and the previously sent response
        /// when the heartbeat is a duplicate. Otherwise a new response carrying the next
        /// response id is built, the reported status is forwarded to the node through the
        /// dispatcher, and the new response is returned.
        /// </para>
        /// </summary>
        /// <param name="request">The heartbeat request holding the node's status.</param>
        /// <returns>The response to send back to the node manager.</returns>
        /// <exception cref="Org.Apache.Hadoop.Yarn.Exceptions.YarnException"/>
        /// <exception cref="System.IO.IOException"/>
        public virtual NodeHeartbeatResponse NodeHeartbeat(NodeHeartbeatRequest request)
        {
            NodeStatus reportedStatus = request.GetNodeStatus();
            NodeId sender = reportedStatus.GetNodeId();

            // Step 1: refuse nodes whose host is not valid (i.e. excluded).
            bool hostAllowed = this.nodesListManager.IsValidNode(sender.GetHost());

            if (!hostAllowed)
            {
                string refusal = "Disallowed NodeManager nodeId: " + sender + " hostname: " + sender.GetHost();

                Log.Info(refusal);
                shutDown.SetDiagnosticsMessage(refusal);
                return shutDown;
            }

            // Step 2: the node must already be registered.
            RMNode knownNode = this.rmContext.GetRMNodes()[sender];

            if (knownNode == null)
            {
                string notFound = "Node not found resyncing " + reportedStatus.GetNodeId();

                Log.Info(notFound);
                resync.SetDiagnosticsMessage(notFound);
                return resync;
            }

            // Record the ping for liveliness tracking.
            this.nmLivelinessMonitor.ReceivedPing(sender);

            // Step 3: compare response ids to spot duplicate or stale heartbeats.
            NodeHeartbeatResponse previousReply = knownNode.GetLastNodeHeartBeatResponse();

            if (reportedStatus.GetResponseId() + 1 == previousReply.GetResponseId())
            {
                Log.Info("Received duplicate heartbeat from node " + knownNode.GetNodeAddress() + " responseId="
                         + reportedStatus.GetResponseId());
                return previousReply;
            }

            if (reportedStatus.GetResponseId() + 1 < previousReply.GetResponseId())
            {
                string lagging = "Too far behind rm response id:" + previousReply.GetResponseId()
                                 + " nm response id:" + reportedStatus.GetResponseId();

                Log.Info(lagging);
                resync.SetDiagnosticsMessage(lagging);
                // TODO: Just sending reboot is not enough. Think more.
                this.rmContext.GetDispatcher().GetEventHandler().Handle(
                    new RMNodeEvent(sender, RMNodeEventType.Rebooting));
                return resync;
            }

            // Build the regular reply with the next response id.
            NodeHeartbeatResponse reply = YarnServerBuilderUtils.NewNodeHeartbeatResponse(
                previousReply.GetResponseId() + 1, NodeAction.Normal, null, null, null, null,
                nextHeartBeatInterval);

            knownNode.UpdateNodeHeartbeatResponseForCleanup(reply);
            PopulateKeys(request, reply);

            // Attach system credentials only when there are some.
            ConcurrentMap<ApplicationId, ByteBuffer> credentialsByApp = rmContext.GetSystemCredentialsForApps();

            if (!credentialsByApp.IsEmpty())
            {
                reply.SetSystemCredentialsForApps(credentialsByApp);
            }

            // Step 4: send the status to the RMNode, saving the latest response.
            this.rmContext.GetDispatcher().GetEventHandler().Handle(
                new RMNodeStatusEvent(
                    sender,
                    reportedStatus.GetNodeHealthStatus(),
                    reportedStatus.GetContainersStatuses(),
                    reportedStatus.GetKeepAliveApplications(),
                    reply));
            return reply;
        }
Example #25
0
 /// <summary>
 /// Leaves <paramref name="response"/> unchanged: this implementation has an empty
 /// body and adds no containers or applications to clean up.
 /// </summary>
 /// <param name="response">The heartbeat response; not modified here.</param>
 public override void UpdateNodeHeartbeatResponseForCleanup(NodeHeartbeatResponse
                                                            response)
 {
 }
Example #26
0
        /// <summary>
        /// Checks which heartbeats carry container-token and NM-token master keys.
        /// <para>
        /// Registration must return both master keys. Heartbeats must return no keys
        /// until the keys are rolled over; the first heartbeat after a roll-over must
        /// return keys whose ids are exactly one higher than the registration keys, and
        /// later heartbeats (including those after activation) must return no keys.
        /// </para>
        /// </summary>
        /// <param name="conf">Configuration used to initialise the resource manager.</param>
        /// <exception cref="System.Exception"/>
        private void ValidateRMNMKeyExchange(YarnConfiguration conf)
        {
            // Default rolling and activation intervals are large enough, no need to
            // intervene
            DrainDispatcher dispatcher = new DrainDispatcher();
            ResourceManager rm         = new _ResourceManager_56(dispatcher);

            rm.Init(conf);
            rm.Start();
            // Testing ContainerToken and NMToken; these prefixes label the assertion messages.
            string containerToken = "Container Token : ";
            string nmToken        = "NM Token : ";
            MockNM nm             = new MockNM("host:1234", 3072, rm.GetResourceTrackerService());
            RegisterNodeManagerResponse registrationResponse = nm.RegisterNode();
            MasterKey containerTokenMasterKey = registrationResponse.GetContainerTokenMasterKey
                                                    ();

            NUnit.Framework.Assert.IsNotNull(containerToken + "Registration should cause a key-update!"
                                             , containerTokenMasterKey);
            MasterKey nmTokenMasterKey = registrationResponse.GetNMTokenMasterKey();

            NUnit.Framework.Assert.IsNotNull(nmToken + "Registration should cause a key-update!"
                                             , nmTokenMasterKey);
            dispatcher.Await();
            NodeHeartbeatResponse response = nm.NodeHeartbeat(true);

            NUnit.Framework.Assert.IsNull(containerToken + "First heartbeat after registration shouldn't get any key updates!"
                                          , response.GetContainerTokenMasterKey());
            NUnit.Framework.Assert.IsNull(nmToken + "First heartbeat after registration shouldn't get any key updates!"
                                          , response.GetNMTokenMasterKey());
            dispatcher.Await();
            response = nm.NodeHeartbeat(true);
            NUnit.Framework.Assert.IsNull(containerToken + "Even second heartbeat after registration shouldn't get any key updates!"
                                          , response.GetContainerTokenMasterKey());
            // Check the NM token key here; the container token key is covered by the assertion above.
            NUnit.Framework.Assert.IsNull(nmToken + "Even second heartbeat after registration shouldn't get any key updates!"
                                          , response.GetNMTokenMasterKey());
            dispatcher.Await();
            // Let's force a roll-over
            rm.GetRMContext().GetContainerTokenSecretManager().RollMasterKey();
            rm.GetRMContext().GetNMTokenSecretManager().RollMasterKey();
            // Heartbeats after roll-over and before activation should be fine.
            response = nm.NodeHeartbeat(true);
            NUnit.Framework.Assert.IsNotNull(containerToken + "Heartbeats after roll-over and before activation should not err out."
                                             , response.GetContainerTokenMasterKey());
            NUnit.Framework.Assert.IsNotNull(nmToken + "Heartbeats after roll-over and before activation should not err out."
                                             , response.GetNMTokenMasterKey());
            NUnit.Framework.Assert.AreEqual(containerToken + "Roll-over should have incremented the key-id only by one!"
                                            , containerTokenMasterKey.GetKeyId() + 1, response.GetContainerTokenMasterKey().
                                            GetKeyId());
            NUnit.Framework.Assert.AreEqual(nmToken + "Roll-over should have incremented the key-id only by one!"
                                            , nmTokenMasterKey.GetKeyId() + 1, response.GetNMTokenMasterKey().GetKeyId());
            dispatcher.Await();
            response = nm.NodeHeartbeat(true);
            NUnit.Framework.Assert.IsNull(containerToken + "Second heartbeat after roll-over shouldn't get any key updates!"
                                          , response.GetContainerTokenMasterKey());
            NUnit.Framework.Assert.IsNull(nmToken + "Second heartbeat after roll-over shouldn't get any key updates!"
                                          , response.GetNMTokenMasterKey());
            dispatcher.Await();
            // Let's force activation
            rm.GetRMContext().GetContainerTokenSecretManager().ActivateNextMasterKey();
            rm.GetRMContext().GetNMTokenSecretManager().ActivateNextMasterKey();
            response = nm.NodeHeartbeat(true);
            NUnit.Framework.Assert.IsNull(containerToken + "Activation shouldn't cause any key updates!"
                                          , response.GetContainerTokenMasterKey());
            NUnit.Framework.Assert.IsNull(nmToken + "Activation shouldn't cause any key updates!"
                                          , response.GetNMTokenMasterKey());
            dispatcher.Await();
            response = nm.NodeHeartbeat(true);
            NUnit.Framework.Assert.IsNull(containerToken + "Even second heartbeat after activation shouldn't get any key updates!"
                                          , response.GetContainerTokenMasterKey());
            NUnit.Framework.Assert.IsNull(nmToken + "Even second heartbeat after activation shouldn't get any key updates!"
                                          , response.GetNMTokenMasterKey());
            dispatcher.Await();
            rm.Stop();
        }
        /// <summary>
        /// Runs an application that obtains two containers, unregisters its AM, and then
        /// polls node heartbeats until the responses have listed two containers and one
        /// application to clean up (or the poll limit is reached). Asserts that exactly
        /// this application and two containers were reported.
        /// </summary>
        public virtual void TestAppCleanup()
        {
            Logger log4jRoot = LogManager.GetRootLogger();

            log4jRoot.SetLevel(Level.Debug);
            MockRM resourceManager = new MockRM();

            resourceManager.Start();
            MockNM node = resourceManager.RegisterNode("127.0.0.1:1234", 5000);
            RMApp submittedApp = resourceManager.SubmitApp(2000);

            // Heartbeat once to kick the scheduling.
            node.NodeHeartbeat(true);
            RMAppAttempt currentAttempt = submittedApp.GetCurrentAppAttempt();
            MockAM appMaster = resourceManager.SendAMLaunched(currentAttempt.GetAppAttemptId());

            appMaster.RegisterAppAttempt();

            // Ask for containers.
            int wanted = 2;

            appMaster.Allocate("127.0.0.1", 1000, wanted, new AList<ContainerId>());

            // Heartbeat again to kick the scheduler.
            node.NodeHeartbeat(true);
            IList<Container> allocated = appMaster
                .Allocate(new AList<ResourceRequest>(), new AList<ContainerId>())
                .GetAllocatedContainers();
            int received = allocated.Count;

            // Poll at most 200 times until every requested container has arrived.
            for (int polls = 0; received < wanted && polls < 200; polls++)
            {
                Log.Info("Got " + received + " containers. Waiting to get " + wanted);
                Sharpen.Thread.Sleep(100);
                allocated = appMaster
                    .Allocate(new AList<ResourceRequest>(), new AList<ContainerId>())
                    .GetAllocatedContainers();
                received += allocated.Count;
                node.NodeHeartbeat(true);
            }
            NUnit.Framework.Assert.AreEqual(wanted, received);
            appMaster.UnregisterAppAttempt();
            node.NodeHeartbeat(currentAttempt.GetAppAttemptId(), 1, ContainerState.Complete);
            appMaster.WaitForState(RMAppAttemptState.Finished);

            //currently only containers are cleaned via this
            //AM container is cleaned via container launcher
            NodeHeartbeatResponse reply = node.NodeHeartbeat(true);
            IList<ContainerId> containersToCleanup = reply.GetContainersToCleanup();
            IList<ApplicationId> appsToCleanup = reply.GetApplicationsToCleanup();
            int cleanedContainers = containersToCleanup.Count;
            int cleanedApps = appsToCleanup.Count;

            // Poll at most 200 times, accumulating what each heartbeat asks to clean up.
            for (int polls = 0; (cleanedContainers < 2 || cleanedApps < 1) && polls < 200; polls++)
            {
                Log.Info("Waiting to get cleanup events.. cleanedConts: " + cleanedContainers
                         + " cleanedApps: " + cleanedApps);
                Sharpen.Thread.Sleep(100);
                reply = node.NodeHeartbeat(true);
                IList<ContainerId> moreContainers = reply.GetContainersToCleanup();
                IList<ApplicationId> moreApps = reply.GetApplicationsToCleanup();

                // Fold the deltas into the running lists, then refresh the counters.
                Sharpen.Collections.AddAll(containersToCleanup, moreContainers);
                Sharpen.Collections.AddAll(appsToCleanup, moreApps);
                cleanedContainers = containersToCleanup.Count;
                cleanedApps = appsToCleanup.Count;
            }
            NUnit.Framework.Assert.AreEqual(1, appsToCleanup.Count);
            NUnit.Framework.Assert.AreEqual(submittedApp.GetApplicationId(), appsToCleanup[0]);
            NUnit.Framework.Assert.AreEqual(1, cleanedApps);
            NUnit.Framework.Assert.AreEqual(2, cleanedContainers);
            resourceManager.Stop();
        }
Example #28
0
        /// <summary>
        /// Re-registers nodes in several situations (healthy, unhealthy, unhealthy turned
        /// healthy, changed capability, changed capability with running applications,
        /// changed http port), asserting along the way on the returned node action, the
        /// active and unhealthy NM counts, the root queue's available memory, and finally
        /// the re-registered node's http port and total capability.
        /// </summary>
        public virtual void TestReconnectNode()
        {
            DrainDispatcher drain = new DrainDispatcher();

            rm = new _MockRM_567(this, drain);
            rm.Start();
            MockNM nodeA = rm.RegisterNode("host1:1234", 5120);
            MockNM nodeB = rm.RegisterNode("host2:5678", 5120);

            nodeA.NodeHeartbeat(true);
            nodeB.NodeHeartbeat(false);
            drain.Await();
            CheckUnealthyNMCount(rm, nodeB, true, 1);
            int activeNodeCount = ClusterMetrics.GetMetrics().GetNumActiveNMs();
            QueueMetrics rootMetrics = rm.GetResourceScheduler().GetRootQueueMetrics();

            // TODO Metrics incorrect in case of the FifoScheduler
            NUnit.Framework.Assert.AreEqual(5120, rootMetrics.GetAvailableMB());

            // Healthy node reconnects.
            nodeA = rm.RegisterNode("host1:1234", 5120);
            NodeHeartbeatResponse reply = nodeA.NodeHeartbeat(true);

            NUnit.Framework.Assert.IsTrue(NodeAction.Normal.Equals(reply.GetNodeAction()));
            drain.Await();
            NUnit.Framework.Assert.AreEqual(activeNodeCount, ClusterMetrics.GetMetrics().GetNumActiveNMs());
            CheckUnealthyNMCount(rm, nodeB, true, 1);

            // Unhealthy node reconnects.
            nodeB = rm.RegisterNode("host2:5678", 5120);
            reply = nodeB.NodeHeartbeat(false);
            NUnit.Framework.Assert.IsTrue(NodeAction.Normal.Equals(reply.GetNodeAction()));
            drain.Await();
            NUnit.Framework.Assert.AreEqual(activeNodeCount, ClusterMetrics.GetMetrics().GetNumActiveNMs());
            CheckUnealthyNMCount(rm, nodeB, true, 1);

            // Unhealthy node reports healthy again (two heartbeats are sent).
            nodeB = rm.RegisterNode("host2:5678", 5120);
            drain.Await();
            nodeB.NodeHeartbeat(true);
            nodeB.NodeHeartbeat(true);
            drain.Await();
            NUnit.Framework.Assert.AreEqual(5120 + 5120, rootMetrics.GetAvailableMB());

            // Node reconnects with a changed capability.
            nodeA = rm.RegisterNode("host2:5678", 10240);
            drain.Await();
            reply = nodeA.NodeHeartbeat(true);
            drain.Await();
            NUnit.Framework.Assert.IsTrue(NodeAction.Normal.Equals(reply.GetNodeAction()));
            NUnit.Framework.Assert.AreEqual(5120 + 10240, rootMetrics.GetAvailableMB());

            // Node reconnects with a changed capability and running applications.
            IList<ApplicationId> appsOnNode = new AList<ApplicationId>();

            appsOnNode.AddItem(ApplicationId.NewInstance(1, 0));
            nodeA = rm.RegisterNode("host2:5678", 15360, 2, appsOnNode);
            drain.Await();
            reply = nodeA.NodeHeartbeat(true);
            drain.Await();
            NUnit.Framework.Assert.IsTrue(NodeAction.Normal.Equals(reply.GetNodeAction()));
            NUnit.Framework.Assert.AreEqual(5120 + 15360, rootMetrics.GetAvailableMB());

            // Healthy node reconnects with a different http port (two heartbeats are sent).
            nodeA = new MockNM("host1:1234", 5120, rm.GetResourceTrackerService());
            nodeA.SetHttpPort(3);
            nodeA.RegisterNode();
            drain.Await();
            nodeA.NodeHeartbeat(true);
            nodeA.NodeHeartbeat(true);
            drain.Await();
            RMNode registeredNode = rm.GetRMContext().GetRMNodes()[nodeA.GetNodeId()];

            NUnit.Framework.Assert.AreEqual(3, registeredNode.GetHttpPort());
            NUnit.Framework.Assert.AreEqual(5120, registeredNode.GetTotalCapability().GetMemory());
            NUnit.Framework.Assert.AreEqual(5120 + 15360, rootMetrics.GetAvailableMB());
        }
        /// <summary>
        /// Obtains two containers, releases one of them, then has the node report that
        /// released container as running and waits for a container cleanup. The same
        /// report is sent a second time, after the cleanup was already handed out, and the
        /// test waits for a cleanup again.
        /// </summary>
        public virtual void TestContainerCleanup()
        {
            Logger log4jRoot = LogManager.GetRootLogger();

            log4jRoot.SetLevel(Level.Debug);
            DrainDispatcher drain = new DrainDispatcher();
            MockRM resourceManager = new _MockRM_167(this, drain);

            resourceManager.Start();
            MockNM node = resourceManager.RegisterNode("127.0.0.1:1234", 5000);
            RMApp submittedApp = resourceManager.SubmitApp(2000);

            // Heartbeat once to kick the scheduling.
            node.NodeHeartbeat(true);
            RMAppAttempt currentAttempt = submittedApp.GetCurrentAppAttempt();
            MockAM appMaster = resourceManager.SendAMLaunched(currentAttempt.GetAppAttemptId());

            appMaster.RegisterAppAttempt();

            // Ask for containers.
            int wanted = 2;

            appMaster.Allocate("127.0.0.1", 1000, wanted, new AList<ContainerId>());
            drain.Await();

            // Heartbeat again to kick the scheduler.
            node.NodeHeartbeat(true);
            IList<Container> allocated = appMaster
                .Allocate(new AList<ResourceRequest>(), new AList<ContainerId>())
                .GetAllocatedContainers();
            int received = allocated.Count;

            // Poll at most 200 times until every requested container has arrived.
            for (int polls = 0; received < wanted && polls < 200; polls++)
            {
                Log.Info("Got " + received + " containers. Waiting to get " + wanted);
                Sharpen.Thread.Sleep(100);
                allocated = appMaster
                    .Allocate(new AList<ResourceRequest>(), new AList<ContainerId>())
                    .GetAllocatedContainers();
                drain.Await();
                received += allocated.Count;
                node.NodeHeartbeat(true);
            }
            NUnit.Framework.Assert.AreEqual(wanted, received);

            // Release a container.
            AList<ContainerId> toRelease = new AList<ContainerId>();

            toRelease.AddItem(allocated[0].GetId());
            appMaster.Allocate(new AList<ResourceRequest>(), toRelease);
            drain.Await();

            // Send one more heartbeat with a fake running container. This is to
            // simulate the situation that can happen if the NM reports that container
            // is running in the same heartbeat when the RM asks it to clean it up.
            IDictionary<ApplicationId, IList<ContainerStatus>> statusesByApp =
                new Dictionary<ApplicationId, IList<ContainerStatus>>();
            AList<ContainerStatus> reportedStatuses = new AList<ContainerStatus>();

            reportedStatuses.AddItem(
                BuilderUtils.NewContainerStatus(allocated[0].GetId(), ContainerState.Running, "nothing", 0));
            statusesByApp[submittedApp.GetApplicationId()] = reportedStatuses;
            NodeHeartbeatResponse reply = node.NodeHeartbeat(statusesByApp, true);

            WaitForContainerCleanup(drain, node, reply);

            // Now to test the case when RM already gave cleanup, and NM suddenly
            // realizes that the container is running.
            Log.Info("Testing container launch much after release and " + "NM getting cleanup");
            statusesByApp.Clear();
            reportedStatuses.Clear();
            reportedStatuses.AddItem(
                BuilderUtils.NewContainerStatus(allocated[0].GetId(), ContainerState.Running, "nothing", 0));
            statusesByApp[submittedApp.GetApplicationId()] = reportedStatuses;
            reply = node.NodeHeartbeat(statusesByApp, true);

            // The cleanup list won't be instantaneous as it is given out by scheduler
            // and not RMNodeImpl.
            WaitForContainerCleanup(drain, node, reply);
            resourceManager.Stop();
        }
Example #30
0
            /// <summary>
            /// Heartbeat loop: until the enclosing object is stopped, sends the node status
            /// to the resource tracker, acts on the response (shutdown, resync, container
            /// and application cleanup, system credentials), then waits on the heartbeat
            /// monitor for the next heartbeat interval.
            /// </summary>
            public void Run()
            {
                // Response id of the last processed heartbeat; passed to GetNodeStatus below.
                int lastHeartBeatID = 0;

                while (!this._enclosing.isStopped)
                {
                    try
                    {
                        NodeHeartbeatResponse response   = null;
                        NodeStatus            nodeStatus = this._enclosing.GetNodeStatus(lastHeartBeatID);
                        // The request carries the node status plus the current container-token and NM-token keys.
                        NodeHeartbeatRequest  request    = NodeHeartbeatRequest.NewInstance(nodeStatus, this.
                                                                                            _enclosing.context.GetContainerTokenSecretManager().GetCurrentKey(), this._enclosing
                                                                                            .context.GetNMTokenSecretManager().GetCurrentKey());
                        response = this._enclosing.resourceTracker.NodeHeartbeat(request);
                        // Remember the interval requested by the response; it is used by the wait in the finally block.
                        this._enclosing.nextHeartBeatInterval = response.GetNextHeartBeatInterval();
                        this.UpdateMasterKeys(response);
                        // SHUTDOWN: mark the context decommissioned, raise a Shutdown event and leave the loop.
                        // The finally block below still runs before the loop is exited.
                        if (response.GetNodeAction() == NodeAction.Shutdown)
                        {
                            Org.Apache.Hadoop.Yarn.Server.Nodemanager.NodeStatusUpdaterImpl.Log.Warn("Recieved SHUTDOWN signal from Resourcemanager as part of heartbeat,"
                                                                                                     + " hence shutting down.");
                            Org.Apache.Hadoop.Yarn.Server.Nodemanager.NodeStatusUpdaterImpl.Log.Warn("Message from ResourceManager: "
                                                                                                     + response.GetDiagnosticsMessage());
                            this._enclosing.context.SetDecommissioned(true);
                            this._enclosing.dispatcher.GetEventHandler().Handle(new NodeManagerEvent(NodeManagerEventType
                                                                                                     .Shutdown));
                            break;
                        }
                        // RESYNC: invalidate the RM identifier, raise a Resync event, drop the pending
                        // completed containers and leave the loop (again via the finally block).
                        if (response.GetNodeAction() == NodeAction.Resync)
                        {
                            Org.Apache.Hadoop.Yarn.Server.Nodemanager.NodeStatusUpdaterImpl.Log.Warn("Node is out of sync with ResourceManager,"
                                                                                                     + " hence resyncing.");
                            Org.Apache.Hadoop.Yarn.Server.Nodemanager.NodeStatusUpdaterImpl.Log.Warn("Message from ResourceManager: "
                                                                                                     + response.GetDiagnosticsMessage());
                            this._enclosing.rmIdentifier = ResourceManagerConstants.RmInvalidIdentifier;
                            this._enclosing.dispatcher.GetEventHandler().Handle(new NodeManagerEvent(NodeManagerEventType
                                                                                                     .Resync));
                            this._enclosing.pendingCompletedContainers.Clear();
                            break;
                        }
                        // Normal response: hand over the containers the response lists as removable from the NM.
                        this._enclosing.RemoveOrTrackCompletedContainersFromContext(response.GetContainersToBeRemovedFromNM
                                                                                        ());
                        lastHeartBeatID = response.GetResponseId();
                        // Raise a cleanup event only when the response lists containers to clean up.
                        IList <ContainerId> containersToCleanup = response.GetContainersToCleanup();
                        if (!containersToCleanup.IsEmpty())
                        {
                            this._enclosing.dispatcher.GetEventHandler().Handle(new CMgrCompletedContainersEvent
                                                                                    (containersToCleanup, CMgrCompletedContainersEvent.Reason.ByResourcemanager));
                        }
                        // TrackAppsForKeepAlive is called even for an empty list; the event is raised only when non-empty.
                        IList <ApplicationId> appsToCleanup = response.GetApplicationsToCleanup();
                        this._enclosing.TrackAppsForKeepAlive(appsToCleanup);
                        if (!appsToCleanup.IsEmpty())
                        {
                            this._enclosing.dispatcher.GetEventHandler().Handle(new CMgrCompletedAppsEvent(appsToCleanup
                                                                                                           , CMgrCompletedAppsEvent.Reason.ByResourcemanager));
                        }
                        // Parse and store system credentials when the response carries any.
                        IDictionary <ApplicationId, ByteBuffer> systemCredentials = response.GetSystemCredentialsForApps
                                                                                        ();
                        if (systemCredentials != null && !systemCredentials.IsEmpty())
                        {
                            ((NodeManager.NMContext) this._enclosing.context).SetSystemCrendentialsForApps(Org.Apache.Hadoop.Yarn.Server.Nodemanager.NodeStatusUpdaterImpl
                                                                                                           .ParseCredentials(systemCredentials));
                        }
                    }
                    catch (ConnectException e)
                    {
                        // Connection failure: raise a Shutdown event and rethrow wrapped, which ends this method.
                        this._enclosing.dispatcher.GetEventHandler().Handle(new NodeManagerEvent(NodeManagerEventType
                                                                                                 .Shutdown));
                        throw new YarnRuntimeException(e);
                    }
                    catch (Exception e)
                    {
                        // Any other failure is logged and the loop carries on with the next heartbeat.
                        Org.Apache.Hadoop.Yarn.Server.Nodemanager.NodeStatusUpdaterImpl.Log.Error("Caught exception in status-updater"
                                                                                                  , e);
                    }
                    finally
                    {
                        lock (this._enclosing.heartbeatMonitor)
                        {
                            // Fall back to the default interval when the stored one is zero or negative.
                            this._enclosing.nextHeartBeatInterval = this._enclosing.nextHeartBeatInterval <=
                                                                    0 ? YarnConfiguration.DefaultRmNmHeartbeatIntervalMs : this._enclosing.nextHeartBeatInterval;
                            try
                            {
                                // Wait on the monitor for up to the heartbeat interval.
                                Sharpen.Runtime.Wait(this._enclosing.heartbeatMonitor, this._enclosing.nextHeartBeatInterval
                                                     );
                            }
                            catch (Exception)
                            {
                                // Any exception from the wait is ignored here.
                            }
                        }
                    }
                }
            }