Beispiel #1
0
        /// <summary>
        /// Checks that containers and applications queued for cleanup on a node are still
        /// pending after a status event, and that
        /// <c>UpdateNodeHeartbeatResponseForCleanup</c> moves them from the node into the
        /// heartbeat response.
        /// </summary>
        public virtual void TestUpdateHeartbeatResponseForCleanup()
        {
            RMNodeImpl rmNode = GetRunningNode();
            NodeId rmNodeId = rmNode.GetNodeID();

            // Queue exactly one container for cleanup.
            ApplicationId ownerAppId = BuilderUtils.NewApplicationId(0, 0);
            ContainerId expiredContainerId = BuilderUtils.NewContainerId(
                BuilderUtils.NewApplicationAttemptId(ownerAppId, 0), 0);
            rmNode.Handle(new RMNodeCleanContainerEvent(rmNodeId, expiredContainerId));
            NUnit.Framework.Assert.AreEqual(1, rmNode.GetContainersToCleanUp().Count);

            // Queue exactly one finished application for cleanup.
            ApplicationId doneAppId = BuilderUtils.NewApplicationId(0, 1);
            rmNode.Handle(new RMNodeCleanAppEvent(rmNodeId, doneAppId));
            NUnit.Framework.Assert.AreEqual(1, rmNode.GetAppsToCleanup().Count);

            // A plain status event must leave both pending lists untouched ...
            RMNodeStatusEvent nodeStatusEvent = GetMockRMNodeStatusEvent();
            rmNode.Handle(nodeStatusEvent);
            NUnit.Framework.Assert.AreEqual(1, rmNode.GetContainersToCleanUp().Count);
            NUnit.Framework.Assert.AreEqual(1, rmNode.GetAppsToCleanup().Count);

            // ... whereas filling in a heartbeat response drains them.
            NodeHeartbeatResponse heartbeatResponse =
                Org.Apache.Hadoop.Yarn.Util.Records.NewRecord<NodeHeartbeatResponse>();
            rmNode.UpdateNodeHeartbeatResponseForCleanup(heartbeatResponse);

            // The node no longer tracks anything to clean up.
            NUnit.Framework.Assert.AreEqual(0, rmNode.GetContainersToCleanUp().Count);
            NUnit.Framework.Assert.AreEqual(0, rmNode.GetAppsToCleanup().Count);

            // The response carries the one container and the one application.
            NUnit.Framework.Assert.AreEqual(1, heartbeatResponse.GetContainersToCleanup().Count);
            NUnit.Framework.Assert.AreEqual(
                expiredContainerId, heartbeatResponse.GetContainersToCleanup()[0]);
            NUnit.Framework.Assert.AreEqual(1, heartbeatResponse.GetApplicationsToCleanup().Count);
            NUnit.Framework.Assert.AreEqual(
                doneAppId, heartbeatResponse.GetApplicationsToCleanup()[0]);
        }
 /// <summary>
 /// Sends heartbeats through <paramref name="nm"/> until a response lists exactly one
 /// application to clean up and that application equals <paramref name="appId"/>.
 /// Sleeps one second between attempts; there is no upper bound on the number of attempts.
 /// </summary>
 /// <exception cref="System.Exception"/>
 private void WaitForAppCleanupMessageRecved(MockNM nm, ApplicationId appId)
 {
     for (;;)
     {
         NodeHeartbeatResponse heartbeatReply = nm.NodeHeartbeat(true);

         // Done only when the cleanup list is present, has a single entry, and that entry is appId.
         bool cleanupArrived = heartbeatReply.GetApplicationsToCleanup() != null
             && heartbeatReply.GetApplicationsToCleanup().Count == 1
             && appId.Equals(heartbeatReply.GetApplicationsToCleanup()[0]);
         if (cleanupArrived)
         {
             return;
         }

         Log.Info("Haven't got application=" + appId.ToString() + " in cleanup list from node heartbeat response, "
                  + "sleep for a while before next heartbeat");
         Sharpen.Thread.Sleep(1000);
     }
 }
        /// <summary>
        /// Runs an application with two requested containers on a mock resource manager,
        /// unregisters its attempt, and checks that subsequent node heartbeats report two
        /// containers and the one application for cleanup.
        /// </summary>
        /// <remarks>
        /// The resource manager is stopped in a <c>finally</c> block so that a failing
        /// assertion or an exception does not leave a started <c>MockRM</c> behind.
        /// </remarks>
        public virtual void TestAppCleanup()
        {
            Logger rootLogger = LogManager.GetRootLogger();

            rootLogger.SetLevel(Level.Debug);
            MockRM rm = new MockRM();

            rm.Start();
            try
            {
                MockNM nm1 = rm.RegisterNode("127.0.0.1:1234", 5000);
                RMApp  app = rm.SubmitApp(2000);

                //kick the scheduling
                nm1.NodeHeartbeat(true);
                RMAppAttempt attempt = app.GetCurrentAppAttempt();
                MockAM       am      = rm.SendAMLaunched(attempt.GetAppAttemptId());

                am.RegisterAppAttempt();
                //request for containers
                int request = 2;

                am.Allocate("127.0.0.1", 1000, request, new AList <ContainerId>());
                //kick the scheduler
                nm1.NodeHeartbeat(true);
                IList <Container> conts = am.Allocate(new AList <ResourceRequest>(), new AList <ContainerId>())
                                          .GetAllocatedContainers();
                int contReceived = conts.Count;
                int waitCount    = 0;

                // Poll (at most 200 times, 100 ms apart) until all requested containers are allocated.
                while (contReceived < request && waitCount++ < 200)
                {
                    Log.Info("Got " + contReceived + " containers. Waiting to get " + request);
                    Sharpen.Thread.Sleep(100);
                    conts = am.Allocate(new AList <ResourceRequest>(), new AList <ContainerId>())
                            .GetAllocatedContainers();
                    contReceived += conts.Count;
                    nm1.NodeHeartbeat(true);
                }
                NUnit.Framework.Assert.AreEqual(request, contReceived);
                am.UnregisterAppAttempt();
                // The response of this heartbeat is not inspected; only the call itself matters here.
                nm1.NodeHeartbeat(attempt.GetAppAttemptId(), 1, ContainerState.Complete);

                am.WaitForState(RMAppAttemptState.Finished);
                //currently only containers are cleaned via this
                //AM container is cleaned via container launcher
                NodeHeartbeatResponse resp = nm1.NodeHeartbeat(true);
                IList <ContainerId>   containersToCleanup = resp.GetContainersToCleanup();
                IList <ApplicationId> appsToCleanup       = resp.GetApplicationsToCleanup();
                int numCleanedContainers = containersToCleanup.Count;
                int numCleanedApps       = appsToCleanup.Count;

                // Keep heartbeating (at most 200 times, 100 ms apart) and accumulate the
                // cleanup entries of each response until both expected counts are reached.
                waitCount = 0;
                while ((numCleanedContainers < 2 || numCleanedApps < 1) && waitCount++ < 200)
                {
                    Log.Info("Waiting to get cleanup events.. cleanedConts: " + numCleanedContainers
                             + " cleanedApps: " + numCleanedApps);
                    Sharpen.Thread.Sleep(100);
                    resp = nm1.NodeHeartbeat(true);
                    IList <ContainerId>   deltaContainersToCleanup = resp.GetContainersToCleanup();
                    IList <ApplicationId> deltaAppsToCleanup       = resp.GetApplicationsToCleanup();
                    // Add the deltas to the global list
                    Sharpen.Collections.AddAll(containersToCleanup, deltaContainersToCleanup);
                    Sharpen.Collections.AddAll(appsToCleanup, deltaAppsToCleanup);
                    // Update counts now
                    numCleanedContainers = containersToCleanup.Count;
                    numCleanedApps       = appsToCleanup.Count;
                }
                NUnit.Framework.Assert.AreEqual(1, appsToCleanup.Count);
                NUnit.Framework.Assert.AreEqual(app.GetApplicationId(), appsToCleanup[0]);
                NUnit.Framework.Assert.AreEqual(1, numCleanedApps);
                NUnit.Framework.Assert.AreEqual(2, numCleanedContainers);
            }
            finally
            {
                // Always stop the RM, including when an assertion above has failed.
                rm.Stop();
            }
        }
Beispiel #4
0
            /// <summary>
            /// Heartbeat loop: while the enclosing updater is not stopped, sends the current
            /// node status to the resource tracker and acts on the response.
            /// </summary>
            /// <remarks>
            /// A Shutdown or Resync node action raises the matching <c>NodeManagerEvent</c> and
            /// leaves the loop. A <c>ConnectException</c> raises a Shutdown event and is rethrown
            /// wrapped in a <c>YarnRuntimeException</c>; any other exception is logged and the
            /// loop carries on. The <c>finally</c> block waits on the heartbeat monitor on every
            /// way out of the <c>try</c>, including the paths that leave the loop.
            /// </remarks>
            public void Run()
            {
                // Response id of the previous heartbeat; passed to GetNodeStatus for the next one.
                int previousResponseId = 0;

                while (!this._enclosing.isStopped)
                {
                    try
                    {
                        NodeStatus currentStatus = this._enclosing.GetNodeStatus(previousResponseId);
                        NodeHeartbeatRequest heartbeat = NodeHeartbeatRequest.NewInstance(
                            currentStatus,
                            this._enclosing.context.GetContainerTokenSecretManager().GetCurrentKey(),
                            this._enclosing.context.GetNMTokenSecretManager().GetCurrentKey());
                        NodeHeartbeatResponse reply = this._enclosing.resourceTracker.NodeHeartbeat(heartbeat);

                        this._enclosing.nextHeartBeatInterval = reply.GetNextHeartBeatInterval();
                        this.UpdateMasterKeys(reply);

                        NodeAction requestedAction = reply.GetNodeAction();
                        if (requestedAction == NodeAction.Shutdown)
                        {
                            Org.Apache.Hadoop.Yarn.Server.Nodemanager.NodeStatusUpdaterImpl.Log.Warn(
                                "Recieved SHUTDOWN signal from Resourcemanager as part of heartbeat,"
                                + " hence shutting down.");
                            Org.Apache.Hadoop.Yarn.Server.Nodemanager.NodeStatusUpdaterImpl.Log.Warn(
                                "Message from ResourceManager: " + reply.GetDiagnosticsMessage());
                            this._enclosing.context.SetDecommissioned(true);
                            this._enclosing.dispatcher.GetEventHandler().Handle(
                                new NodeManagerEvent(NodeManagerEventType.Shutdown));
                            break;
                        }
                        if (requestedAction == NodeAction.Resync)
                        {
                            Org.Apache.Hadoop.Yarn.Server.Nodemanager.NodeStatusUpdaterImpl.Log.Warn(
                                "Node is out of sync with ResourceManager,"
                                + " hence resyncing.");
                            Org.Apache.Hadoop.Yarn.Server.Nodemanager.NodeStatusUpdaterImpl.Log.Warn(
                                "Message from ResourceManager: " + reply.GetDiagnosticsMessage());
                            this._enclosing.rmIdentifier = ResourceManagerConstants.RmInvalidIdentifier;
                            this._enclosing.dispatcher.GetEventHandler().Handle(
                                new NodeManagerEvent(NodeManagerEventType.Resync));
                            this._enclosing.pendingCompletedContainers.Clear();
                            break;
                        }

                        this._enclosing.RemoveOrTrackCompletedContainersFromContext(
                            reply.GetContainersToBeRemovedFromNM());
                        previousResponseId = reply.GetResponseId();

                        // Forward containers the response asks to clean up, if there are any.
                        IList<ContainerId> staleContainers = reply.GetContainersToCleanup();
                        if (!staleContainers.IsEmpty())
                        {
                            this._enclosing.dispatcher.GetEventHandler().Handle(
                                new CMgrCompletedContainersEvent(
                                    staleContainers, CMgrCompletedContainersEvent.Reason.ByResourcemanager));
                        }

                        // Applications are handed to keep-alive tracking first (even when the
                        // list is empty), then forwarded for cleanup when non-empty.
                        IList<ApplicationId> staleApps = reply.GetApplicationsToCleanup();
                        this._enclosing.TrackAppsForKeepAlive(staleApps);
                        if (!staleApps.IsEmpty())
                        {
                            this._enclosing.dispatcher.GetEventHandler().Handle(
                                new CMgrCompletedAppsEvent(
                                    staleApps, CMgrCompletedAppsEvent.Reason.ByResourcemanager));
                        }

                        // Store parsed system credentials only when the response carries some.
                        IDictionary<ApplicationId, ByteBuffer> credentialsByApp =
                            reply.GetSystemCredentialsForApps();
                        if (credentialsByApp != null && !credentialsByApp.IsEmpty())
                        {
                            ((NodeManager.NMContext) this._enclosing.context).SetSystemCrendentialsForApps(
                                Org.Apache.Hadoop.Yarn.Server.Nodemanager.NodeStatusUpdaterImpl
                                    .ParseCredentials(credentialsByApp));
                        }
                    }
                    catch (ConnectException connectFailure)
                    {
                        // Raise a Shutdown event, then propagate the failure.
                        this._enclosing.dispatcher.GetEventHandler().Handle(
                            new NodeManagerEvent(NodeManagerEventType.Shutdown));
                        throw new YarnRuntimeException(connectFailure);
                    }
                    catch (Exception failure)
                    {
                        // Any other failure is logged; the loop goes on to the next heartbeat.
                        Org.Apache.Hadoop.Yarn.Server.Nodemanager.NodeStatusUpdaterImpl.Log.Error(
                            "Caught exception in status-updater", failure);
                    }
                    finally
                    {
                        lock (this._enclosing.heartbeatMonitor)
                        {
                            // A non-positive interval is replaced by the configured default.
                            if (this._enclosing.nextHeartBeatInterval <= 0)
                            {
                                this._enclosing.nextHeartBeatInterval =
                                    YarnConfiguration.DefaultRmNmHeartbeatIntervalMs;
                            }
                            try
                            {
                                Sharpen.Runtime.Wait(
                                    this._enclosing.heartbeatMonitor,
                                    this._enclosing.nextHeartBeatInterval);
                            }
                            catch (Exception)
                            {
                                // Deliberately ignored: a failed or interrupted wait just ends the pause.
                            }
                        }
                    }
                }
            }