/// <summary>
/// Checks that containers and applications queued for cleanup on a node
/// survive a node status event, and are only drained from the node once
/// <c>UpdateNodeHeartbeatResponseForCleanup</c> copies them into a
/// heartbeat response.
/// </summary>
public virtual void TestUpdateHeartbeatResponseForCleanup()
{
    RMNodeImpl rmNode = GetRunningNode();
    NodeId id = rmNode.GetNodeID();

    // Queue a single container for cleanup.
    ApplicationId ownerApp = BuilderUtils.NewApplicationId(0, 0);
    ContainerId expiredContainer = BuilderUtils.NewContainerId(
        BuilderUtils.NewApplicationAttemptId(ownerApp, 0), 0);
    rmNode.Handle(new RMNodeCleanContainerEvent(id, expiredContainer));
    NUnit.Framework.Assert.AreEqual(1, rmNode.GetContainersToCleanUp().Count);

    // Queue a single application for cleanup.
    ApplicationId doneApp = BuilderUtils.NewApplicationId(0, 1);
    rmNode.Handle(new RMNodeCleanAppEvent(id, doneApp));
    NUnit.Framework.Assert.AreEqual(1, rmNode.GetAppsToCleanup().Count);

    // A plain status update must leave both pending lists untouched.
    RMNodeStatusEvent status = GetMockRMNodeStatusEvent();
    rmNode.Handle(status);
    NUnit.Framework.Assert.AreEqual(1, rmNode.GetContainersToCleanUp().Count);
    NUnit.Framework.Assert.AreEqual(1, rmNode.GetAppsToCleanup().Count);

    // Populating a heartbeat response is what empties the node-side lists.
    NodeHeartbeatResponse heartbeat =
        Org.Apache.Hadoop.Yarn.Util.Records.NewRecord<NodeHeartbeatResponse>();
    rmNode.UpdateNodeHeartbeatResponseForCleanup(heartbeat);
    NUnit.Framework.Assert.AreEqual(0, rmNode.GetContainersToCleanUp().Count);
    NUnit.Framework.Assert.AreEqual(0, rmNode.GetAppsToCleanup().Count);

    // The response now carries exactly the entries that were queued above.
    NUnit.Framework.Assert.AreEqual(1, heartbeat.GetContainersToCleanup().Count);
    NUnit.Framework.Assert.AreEqual(expiredContainer, heartbeat.GetContainersToCleanup()[0]);
    NUnit.Framework.Assert.AreEqual(1, heartbeat.GetApplicationsToCleanup().Count);
    NUnit.Framework.Assert.AreEqual(doneApp, heartbeat.GetApplicationsToCleanup()[0]);
}
/// <summary>
/// Keeps sending heartbeats from <paramref name="nm"/> until a response
/// lists exactly one application to clean up and that application equals
/// <paramref name="appId"/>. Sleeps one second between attempts.
/// </summary>
/// <remarks>There is no upper bound on the number of attempts.</remarks>
/// <param name="nm">Mock node manager used to send the heartbeats.</param>
/// <param name="appId">Application expected in the cleanup list.</param>
/// <exception cref="System.Exception"/>
private void WaitForAppCleanupMessageRecved(MockNM nm, ApplicationId appId)
{
    for (; ; )
    {
        NodeHeartbeatResponse heartbeat = nm.NodeHeartbeat(true);
        IList<ApplicationId> pendingApps = heartbeat.GetApplicationsToCleanup();
        bool found = pendingApps != null
            && pendingApps.Count == 1
            && appId.Equals(pendingApps[0]);
        if (found)
        {
            return;
        }

        string notYet = "Haven't got application=" + appId.ToString()
            + " in cleanup list from node heartbeat response, "
            + "sleep for a while before next heartbeat";
        Log.Info(notYet);
        Sharpen.Thread.Sleep(1000);
    }
}
/// <summary>
/// Runs one application through a mock ResourceManager and a single mock
/// node, then checks that after the application attempt finishes the node
/// heartbeat responses ask for cleanup of two containers and of exactly
/// that one application.
/// </summary>
/// <remarks>
/// The ResourceManager is stopped in a <c>finally</c> block so that a
/// failed assertion or an unexpected exception does not leave the started
/// <c>MockRM</c> running.
/// </remarks>
public virtual void TestAppCleanup()
{
    Logger rootLogger = LogManager.GetRootLogger();
    rootLogger.SetLevel(Level.Debug);
    MockRM rm = new MockRM();
    rm.Start();
    try
    {
        MockNM nm1 = rm.RegisterNode("127.0.0.1:1234", 5000);
        RMApp app = rm.SubmitApp(2000);
        //kick the scheduling
        nm1.NodeHeartbeat(true);
        RMAppAttempt attempt = app.GetCurrentAppAttempt();
        MockAM am = rm.SendAMLaunched(attempt.GetAppAttemptId());
        am.RegisterAppAttempt();

        //request for containers
        int request = 2;
        am.Allocate("127.0.0.1", 1000, request, new AList<ContainerId>());
        //kick the scheduler
        nm1.NodeHeartbeat(true);
        IList<Container> conts = am.Allocate(new AList<ResourceRequest>(), new AList<ContainerId>())
            .GetAllocatedContainers();
        int contReceived = conts.Count;
        int waitCount = 0;
        // Poll (at most 200 times, 100 ms apart) until all requested containers arrive.
        while (contReceived < request && waitCount++ < 200)
        {
            Log.Info("Got " + contReceived + " containers. Waiting to get " + request);
            Sharpen.Thread.Sleep(100);
            conts = am.Allocate(new AList<ResourceRequest>(), new AList<ContainerId>())
                .GetAllocatedContainers();
            contReceived += conts.Count;
            nm1.NodeHeartbeat(true);
        }
        NUnit.Framework.Assert.AreEqual(request, contReceived);

        am.UnregisterAppAttempt();
        // This heartbeat reports the container as complete; its response is
        // replaced by the next heartbeat before being inspected.
        NodeHeartbeatResponse resp = nm1.NodeHeartbeat(attempt.GetAppAttemptId(), 1, ContainerState.Complete);
        am.WaitForState(RMAppAttemptState.Finished);

        //currently only containers are cleaned via this
        //AM container is cleaned via container launcher
        resp = nm1.NodeHeartbeat(true);
        IList<ContainerId> containersToCleanup = resp.GetContainersToCleanup();
        IList<ApplicationId> appsToCleanup = resp.GetApplicationsToCleanup();
        int numCleanedContainers = containersToCleanup.Count;
        int numCleanedApps = appsToCleanup.Count;
        waitCount = 0;
        // Cleanup requests may be spread over several heartbeats, so accumulate them.
        while ((numCleanedContainers < 2 || numCleanedApps < 1) && waitCount++ < 200)
        {
            Log.Info("Waiting to get cleanup events.. cleanedConts: " + numCleanedContainers
                + " cleanedApps: " + numCleanedApps);
            Sharpen.Thread.Sleep(100);
            resp = nm1.NodeHeartbeat(true);
            IList<ContainerId> deltaContainersToCleanup = resp.GetContainersToCleanup();
            IList<ApplicationId> deltaAppsToCleanup = resp.GetApplicationsToCleanup();
            // Add the deltas to the global list
            Sharpen.Collections.AddAll(containersToCleanup, deltaContainersToCleanup);
            Sharpen.Collections.AddAll(appsToCleanup, deltaAppsToCleanup);
            // Update counts now
            numCleanedContainers = containersToCleanup.Count;
            numCleanedApps = appsToCleanup.Count;
        }

        NUnit.Framework.Assert.AreEqual(1, appsToCleanup.Count);
        NUnit.Framework.Assert.AreEqual(app.GetApplicationId(), appsToCleanup[0]);
        NUnit.Framework.Assert.AreEqual(1, numCleanedApps);
        NUnit.Framework.Assert.AreEqual(2, numCleanedContainers);
    }
    finally
    {
        // Always stop the ResourceManager, including when an assertion above fails.
        rm.Stop();
    }
}
/// <summary>
/// Heartbeat loop of the node status updater. While the enclosing updater
/// is not stopped, it sends the current node status to the resource
/// tracker, applies the response (master keys, shutdown/resync actions,
/// container and application cleanup, system credentials) and then waits
/// on the heartbeat monitor for the next heartbeat interval.
/// </summary>
/// <remarks>
/// The wait in the <c>finally</c> block also runs when the loop is left
/// through <c>break</c> or when the connect failure is rethrown.
/// </remarks>
public void Run()
{
    int previousResponseId = 0;
    while (!_enclosing.isStopped)
    {
        try
        {
            NodeStatus status = _enclosing.GetNodeStatus(previousResponseId);
            NodeHeartbeatRequest hbRequest = NodeHeartbeatRequest.NewInstance(
                status,
                _enclosing.context.GetContainerTokenSecretManager().GetCurrentKey(),
                _enclosing.context.GetNMTokenSecretManager().GetCurrentKey());
            NodeHeartbeatResponse hbResponse = _enclosing.resourceTracker.NodeHeartbeat(hbRequest);

            // Adopt the interval from the response before anything else can fail.
            _enclosing.nextHeartBeatInterval = hbResponse.GetNextHeartBeatInterval();
            this.UpdateMasterKeys(hbResponse);

            NodeAction action = hbResponse.GetNodeAction();
            if (action == NodeAction.Shutdown)
            {
                Org.Apache.Hadoop.Yarn.Server.Nodemanager.NodeStatusUpdaterImpl.Log.Warn(
                    "Recieved SHUTDOWN signal from Resourcemanager as part of heartbeat," + " hence shutting down.");
                Org.Apache.Hadoop.Yarn.Server.Nodemanager.NodeStatusUpdaterImpl.Log.Warn(
                    "Message from ResourceManager: " + hbResponse.GetDiagnosticsMessage());
                _enclosing.context.SetDecommissioned(true);
                _enclosing.dispatcher.GetEventHandler().Handle(
                    new NodeManagerEvent(NodeManagerEventType.Shutdown));
                break;
            }
            else if (action == NodeAction.Resync)
            {
                Org.Apache.Hadoop.Yarn.Server.Nodemanager.NodeStatusUpdaterImpl.Log.Warn(
                    "Node is out of sync with ResourceManager," + " hence resyncing.");
                Org.Apache.Hadoop.Yarn.Server.Nodemanager.NodeStatusUpdaterImpl.Log.Warn(
                    "Message from ResourceManager: " + hbResponse.GetDiagnosticsMessage());
                _enclosing.rmIdentifier = ResourceManagerConstants.RmInvalidIdentifier;
                _enclosing.dispatcher.GetEventHandler().Handle(
                    new NodeManagerEvent(NodeManagerEventType.Resync));
                _enclosing.pendingCompletedContainers.Clear();
                break;
            }

            _enclosing.RemoveOrTrackCompletedContainersFromContext(
                hbResponse.GetContainersToBeRemovedFromNM());
            previousResponseId = hbResponse.GetResponseId();

            // Containers the ResourceManager wants cleaned up.
            IList<ContainerId> finishedContainers = hbResponse.GetContainersToCleanup();
            if (finishedContainers.Count > 0)
            {
                _enclosing.dispatcher.GetEventHandler().Handle(
                    new CMgrCompletedContainersEvent(
                        finishedContainers,
                        CMgrCompletedContainersEvent.Reason.ByResourcemanager));
            }

            // Applications the ResourceManager wants cleaned up; keep-alive
            // tracking is invoked even when the list is empty.
            IList<ApplicationId> finishedApps = hbResponse.GetApplicationsToCleanup();
            _enclosing.TrackAppsForKeepAlive(finishedApps);
            if (finishedApps.Count > 0)
            {
                _enclosing.dispatcher.GetEventHandler().Handle(
                    new CMgrCompletedAppsEvent(
                        finishedApps,
                        CMgrCompletedAppsEvent.Reason.ByResourcemanager));
            }

            IDictionary<ApplicationId, ByteBuffer> appCredentials = hbResponse.GetSystemCredentialsForApps();
            if (appCredentials != null && appCredentials.Count > 0)
            {
                ((NodeManager.NMContext)_enclosing.context).SetSystemCrendentialsForApps(
                    Org.Apache.Hadoop.Yarn.Server.Nodemanager.NodeStatusUpdaterImpl.ParseCredentials(appCredentials));
            }
        }
        catch (ConnectException connectFailure)
        {
            // Request a NodeManager shutdown, then rethrow the failure wrapped.
            _enclosing.dispatcher.GetEventHandler().Handle(
                new NodeManagerEvent(NodeManagerEventType.Shutdown));
            throw new YarnRuntimeException(connectFailure);
        }
        catch (Exception unexpected)
        {
            // Any other failure is logged and the loop goes on to the next heartbeat.
            Org.Apache.Hadoop.Yarn.Server.Nodemanager.NodeStatusUpdaterImpl.Log.Error(
                "Caught exception in status-updater", unexpected);
        }
        finally
        {
            lock (_enclosing.heartbeatMonitor)
            {
                // Fall back to the configured default when the interval is not positive.
                _enclosing.nextHeartBeatInterval = _enclosing.nextHeartBeatInterval > 0
                    ? _enclosing.nextHeartBeatInterval
                    : YarnConfiguration.DefaultRmNmHeartbeatIntervalMs;
                try
                {
                    Sharpen.Runtime.Wait(_enclosing.heartbeatMonitor, _enclosing.nextHeartBeatInterval);
                }
                catch (Exception)
                {
                    // Deliberately ignored: a failed wait just leads to the next loop check.
                }
            }
        }
    }
}