/// <summary>
/// Polls the node manager until a heartbeat response lists containers to clean up,
/// then asserts that exactly one container was reported in total.
/// </summary>
/// <param name="dispatcher">dispatcher that is drained before each response is inspected</param>
/// <param name="nm">mock node manager used to send follow-up heartbeats</param>
/// <param name="resp">the first heartbeat response to inspect</param>
/// <exception cref="System.Exception"/>
protected internal virtual void WaitForContainerCleanup(DrainDispatcher dispatcher, MockNM nm, NodeHeartbeatResponse resp)
{
    int totalCleaned = 0;
    IList<ContainerId> pending;

    // One initial inspection plus at most 200 retries, 100 ms apart.
    for (int attempt = 0; ; attempt++)
    {
        dispatcher.Await();
        pending = resp.GetContainersToCleanup();
        totalCleaned += pending.Count;
        if (totalCleaned >= 1)
        {
            break;
        }

        Sharpen.Thread.Sleep(100);
        resp = nm.NodeHeartbeat(true);
        if (attempt >= 200)
        {
            break;
        }
    }

    if (!pending.IsEmpty())
    {
        Log.Info("Got cleanup for " + pending[0]);
    }
    else
    {
        Log.Error("Failed to get any containers to cleanup");
    }

    NUnit.Framework.Assert.AreEqual(1, totalCleaned);
}
/// <summary>
/// Answers any heartbeat with a freshly created, unpopulated response record.
/// </summary>
/// <param name="request">the incoming heartbeat; not read by this implementation</param>
/// <returns>a new <c>NodeHeartbeatResponse</c> obtained from the record factory</returns>
/// <exception cref="Org.Apache.Hadoop.Yarn.Exceptions.YarnException"/>
/// <exception cref="System.IO.IOException"/>
public virtual NodeHeartbeatResponse NodeHeartbeat(NodeHeartbeatRequest request)
{
    return recordFactory.NewRecordInstance<NodeHeartbeatResponse>();
}
/// <summary>
/// Stores per-application system credentials on a heartbeat response, rebuilds the
/// response from its protobuf form, and checks the credentials map is unchanged.
/// </summary>
public virtual void TestNodeHeartBeatResponse()
{
    // Two delegation tokens of different kinds bundled into one Credentials object.
    Org.Apache.Hadoop.Security.Token.Token<DelegationTokenIdentifier> firstToken =
        new Org.Apache.Hadoop.Security.Token.Token<DelegationTokenIdentifier>();
    firstToken.SetKind(new Text("kind1"));
    Org.Apache.Hadoop.Security.Token.Token<DelegationTokenIdentifier> secondToken =
        new Org.Apache.Hadoop.Security.Token.Token<DelegationTokenIdentifier>();
    secondToken.SetKind(new Text("kind2"));

    Credentials credentials = new Credentials();
    credentials.AddToken(new Text("token1"), firstToken);
    credentials.AddToken(new Text("token2"), secondToken);

    // Serialize the credentials and wrap only the bytes that were actually written.
    DataOutputBuffer buffer = new DataOutputBuffer();
    credentials.WriteTokenStorageToStream(buffer);
    ByteBuffer serialized = ByteBuffer.Wrap(buffer.GetData(), 0, buffer.GetLength());

    IDictionary<ApplicationId, ByteBuffer> expected = new Dictionary<ApplicationId, ByteBuffer>();
    expected[ApplicationId.NewInstance(1234, 1)] = serialized;

    NodeHeartbeatResponse original =
        Org.Apache.Hadoop.Yarn.Util.Records.NewRecord<NodeHeartbeatResponse>();
    original.SetSystemCredentialsForApps(expected);

    // Rebuild the record from its proto to exercise the PB conversion.
    NodeHeartbeatResponse rebuilt =
        new NodeHeartbeatResponsePBImpl(((NodeHeartbeatResponsePBImpl)original).GetProto());

    NUnit.Framework.Assert.AreEqual(expected, rebuilt.GetSystemCredentialsForApps());
}
/// <summary>
/// Checks that a status update leaves the node's pending container/application cleanup
/// lists untouched, while <c>UpdateNodeHeartbeatResponseForCleanup</c> drains them into
/// the heartbeat response.
/// </summary>
public virtual void TestUpdateHeartbeatResponseForCleanup()
{
    RMNodeImpl rmNode = GetRunningNode();
    NodeId id = rmNode.GetNodeID();

    // Expire a container
    ContainerId expiredContainer = BuilderUtils.NewContainerId(
        BuilderUtils.NewApplicationAttemptId(BuilderUtils.NewApplicationId(0, 0), 0), 0);
    rmNode.Handle(new RMNodeCleanContainerEvent(id, expiredContainer));
    NUnit.Framework.Assert.AreEqual(1, rmNode.GetContainersToCleanUp().Count);

    // Finish an application
    ApplicationId finishedApp = BuilderUtils.NewApplicationId(0, 1);
    rmNode.Handle(new RMNodeCleanAppEvent(id, finishedApp));
    NUnit.Framework.Assert.AreEqual(1, rmNode.GetAppsToCleanup().Count);

    // A plain status update must not clear the pending cleanup lists...
    rmNode.Handle(GetMockRMNodeStatusEvent());
    NUnit.Framework.Assert.AreEqual(1, rmNode.GetContainersToCleanUp().Count);
    NUnit.Framework.Assert.AreEqual(1, rmNode.GetAppsToCleanup().Count);

    // ...but updating a heartbeat response for cleanup does.
    NodeHeartbeatResponse heartbeat =
        Org.Apache.Hadoop.Yarn.Util.Records.NewRecord<NodeHeartbeatResponse>();
    rmNode.UpdateNodeHeartbeatResponseForCleanup(heartbeat);
    NUnit.Framework.Assert.AreEqual(0, rmNode.GetContainersToCleanUp().Count);
    NUnit.Framework.Assert.AreEqual(0, rmNode.GetAppsToCleanup().Count);

    // The drained entries must now be carried by the response.
    NUnit.Framework.Assert.AreEqual(1, heartbeat.GetContainersToCleanup().Count);
    NUnit.Framework.Assert.AreEqual(expiredContainer, heartbeat.GetContainersToCleanup()[0]);
    NUnit.Framework.Assert.AreEqual(1, heartbeat.GetApplicationsToCleanup().Count);
    NUnit.Framework.Assert.AreEqual(finishedApp, heartbeat.GetApplicationsToCleanup()[0]);
}
/// <summary>
/// Registers two nodes, then adds an exclude-hosts file listing "host2" to the
/// configuration and refreshes the node list. Expects host1 to keep receiving
/// <c>NodeAction.Normal</c> and host2 to receive <c>NodeAction.Shutdown</c>.
/// </summary>
public virtual void TestAddNewExcludePathToConfiguration()
{
    Configuration conf = new Configuration();
    rm = new MockRM(conf);
    rm.Start();
    MockNM nm1 = rm.RegisterNode("host1:1234", 5120);
    MockNM nm2 = rm.RegisterNode("host2:5678", 10240);

    ClusterMetrics metrics = ClusterMetrics.GetMetrics();
    // A real test assertion instead of Debug.Assert, which is compiled out of release builds.
    NUnit.Framework.Assert.IsNotNull(metrics);
    int initialMetricCount = metrics.GetNumDecommisionedNMs();

    // Before any exclusion both nodes heartbeat normally.
    NodeHeartbeatResponse nodeHeartbeat = nm1.NodeHeartbeat(true);
    NUnit.Framework.Assert.AreEqual(NodeAction.Normal, nodeHeartbeat.GetNodeAction());
    nodeHeartbeat = nm2.NodeHeartbeat(true);
    NUnit.Framework.Assert.AreEqual(NodeAction.Normal, nodeHeartbeat.GetNodeAction());

    // Exclude host2 and make the RM re-read its host lists.
    WriteToHostsFile("host2");
    conf.Set(YarnConfiguration.RmNodesExcludeFilePath, hostFile.GetAbsolutePath());
    rm.GetNodesListManager().RefreshNodes(conf);
    CheckDecommissionedNMCount(rm, ++initialMetricCount);

    // NUnit's classic asserts take (expected, actual, message); the message goes last.
    nodeHeartbeat = nm1.NodeHeartbeat(true);
    NUnit.Framework.Assert.AreEqual(NodeAction.Normal, nodeHeartbeat.GetNodeAction(),
        "Node should not have been decomissioned.");
    nodeHeartbeat = nm2.NodeHeartbeat(true);
    NUnit.Framework.Assert.AreEqual(NodeAction.Shutdown, nodeHeartbeat.GetNodeAction(),
        "Node should have been decomissioned but is in state" + nodeHeartbeat.GetNodeAction());
}
/// <summary>
/// Creates a status-update event (<c>RMNodeEventType.StatusUpdate</c>) for the given node.
/// </summary>
/// <param name="nodeId">node the event is addressed to; passed to the base event</param>
/// <param name="nodeHealthStatus">health status stored on the event</param>
/// <param name="collection">container statuses stored on the event</param>
/// <param name="keepAliveAppIds">keep-alive application ids stored on the event</param>
/// <param name="latestResponse">heartbeat response stored on the event</param>
public RMNodeStatusEvent(
    NodeId nodeId,
    NodeHealthStatus nodeHealthStatus,
    IList<ContainerStatus> collection,
    IList<ApplicationId> keepAliveAppIds,
    NodeHeartbeatResponse latestResponse)
    : base(nodeId, RMNodeEventType.StatusUpdate)
{
    // Plain field capture; no copying or validation is performed.
    this.latestResponse = latestResponse;
    this.keepAliveAppIds = keepAliveAppIds;
    containersCollection = collection;
    this.nodeHealthStatus = nodeHealthStatus;
}
/// <summary>
/// Stub heartbeat handler: stamps the request's node status with the current heartbeat
/// counter, advances the counter, and returns a response carrying the advanced counter
/// and a next-heartbeat interval of 1000.
/// </summary>
/// <param name="request">heartbeat whose node status receives the current counter value</param>
/// <returns>a response with no node action, no cleanup lists and no master keys</returns>
/// <exception cref="Org.Apache.Hadoop.Yarn.Exceptions.YarnException"/>
/// <exception cref="System.IO.IOException"/>
public virtual NodeHeartbeatResponse NodeHeartbeat(NodeHeartbeatRequest request)
{
    NodeStatus status = request.GetNodeStatus();
    Log.Info("Got heartbeat number " + heartBeatID);

    // The request is stamped with the pre-increment id; the response gets the next one.
    status.SetResponseId(heartBeatID);
    heartBeatID++;

    return YarnServerBuilderUtils.NewNodeHeartbeatResponse(
        heartBeatID, null, null, null, null, null, 1000L);
}
/// <summary>
/// Decommissions nodes through the exclude-hosts file: after "host2" and the normalized
/// "localhost" name are excluded, their heartbeats must be answered with
/// <c>NodeAction.Shutdown</c>; after the file is emptied and the localhost node
/// re-registers, it must heartbeat normally again.
/// </summary>
public virtual void TestDecommissionWithExcludeHosts()
{
    Configuration conf = new Configuration();
    conf.Set(YarnConfiguration.RmNodesExcludeFilePath, hostFile.GetAbsolutePath());
    WriteToHostsFile(string.Empty);

    DrainDispatcher dispatcher = new DrainDispatcher();
    rm = new _MockRM_162(dispatcher, conf);
    rm.Start();
    MockNM nm1 = rm.RegisterNode("host1:1234", 5120);
    MockNM nm2 = rm.RegisterNode("host2:5678", 10240);
    MockNM nm3 = rm.RegisterNode("localhost:4433", 1024);
    dispatcher.Await();

    int metricCount = ClusterMetrics.GetMetrics().GetNumDecommisionedNMs();

    NodeHeartbeatResponse nodeHeartbeat = nm1.NodeHeartbeat(true);
    NUnit.Framework.Assert.IsTrue(NodeAction.Normal.Equals(nodeHeartbeat.GetNodeAction()));
    nodeHeartbeat = nm2.NodeHeartbeat(true);
    NUnit.Framework.Assert.IsTrue(NodeAction.Normal.Equals(nodeHeartbeat.GetNodeAction()));
    dispatcher.Await();

    // To test that IPs also work
    string ip = NetUtils.NormalizeHostName("localhost");
    WriteToHostsFile("host2", ip);
    rm.GetNodesListManager().RefreshNodes(conf);
    CheckDecommissionedNMCount(rm, metricCount + 2);

    nodeHeartbeat = nm1.NodeHeartbeat(true);
    NUnit.Framework.Assert.IsTrue(NodeAction.Normal.Equals(nodeHeartbeat.GetNodeAction()));

    // NUnit's Assert.IsTrue takes (condition, message); the message must come last.
    nodeHeartbeat = nm2.NodeHeartbeat(true);
    NUnit.Framework.Assert.IsTrue(NodeAction.Shutdown.Equals(nodeHeartbeat.GetNodeAction()),
        "The decommisioned metrics are not updated");
    nodeHeartbeat = nm3.NodeHeartbeat(true);
    NUnit.Framework.Assert.IsTrue(NodeAction.Shutdown.Equals(nodeHeartbeat.GetNodeAction()),
        "The decommisioned metrics are not updated");
    dispatcher.Await();

    // Clear the exclude list and let the localhost node rejoin.
    WriteToHostsFile(string.Empty);
    rm.GetNodesListManager().RefreshNodes(conf);
    nm3 = rm.RegisterNode("localhost:4433", 1024);
    dispatcher.Await();
    nodeHeartbeat = nm3.NodeHeartbeat(true);
    dispatcher.Await();
    NUnit.Framework.Assert.IsTrue(NodeAction.Normal.Equals(nodeHeartbeat.GetNodeAction()));

    // decommissioned node count is 1 since 1 node has rejoined after updating the exclude file
    CheckDecommissionedNMCount(rm, metricCount + 1);
}
/// <summary>
/// Sends one heartbeat for this node to the resource tracker service, carrying the
/// current container statuses and the last response id, and remembers the response id
/// returned.
/// </summary>
/// <exception cref="System.IO.IOException"/>
/// <exception cref="Org.Apache.Hadoop.Yarn.Exceptions.YarnException"/>
public virtual void Heartbeat()
{
    NodeStatus status = Org.Apache.Hadoop.Yarn.Server.Resourcemanager.NodeManager
        .CreateNodeStatus(nodeId, GetContainerStatuses(containers));
    status.SetResponseId(responseID);

    NodeHeartbeatRequest heartbeat = recordFactory.NewRecordInstance<NodeHeartbeatRequest>();
    heartbeat.SetNodeStatus(status);

    // The id handed back here is the one sent with the next heartbeat.
    responseID = resourceTrackerService.NodeHeartbeat(heartbeat).GetResponseId();
}
/// <summary>
/// Runs one application that requests 10000 containers against the given RM, waits until
/// all are allocated, finishes the application, and then waits until the node has been
/// told to clean up as many containers as were allocated.
/// </summary>
/// <param name="rm">resource manager under test; started and stopped by this method</param>
/// <exception cref="System.Exception"/>
private void TestRMWritingMassiveHistory(MockRM rm)
{
    rm.Start();
    MockNM node = rm.RegisterNode("127.0.0.1:1234", 1024 * 10100);

    RMApp app = rm.SubmitApp(1024);
    node.NodeHeartbeat(true);
    RMAppAttempt appAttempt = app.GetCurrentAppAttempt();
    MockAM am = rm.SendAMLaunched(appAttempt.GetAppAttemptId());
    am.RegisterAppAttempt();

    int request = 10000;
    am.Allocate("127.0.0.1", 1024, request, new AList<ContainerId>());
    node.NodeHeartbeat(true);

    // Keep polling the AM (and heartbeating the node) until everything is allocated
    // or the retry budget of 200 rounds is exhausted.
    IList<Container> batch = am.Allocate(new AList<ResourceRequest>(), new AList<ContainerId>())
        .GetAllocatedContainers();
    int allocatedTotal = batch.Count;
    int retries = 0;
    while (allocatedTotal < request && retries++ < 200)
    {
        Sharpen.Thread.Sleep(300);
        batch = am.Allocate(new AList<ResourceRequest>(), new AList<ContainerId>())
            .GetAllocatedContainers();
        allocatedTotal += batch.Count;
        node.NodeHeartbeat(true);
    }
    NUnit.Framework.Assert.AreEqual(request, allocatedTotal);

    am.UnregisterAppAttempt();
    am.WaitForState(RMAppAttemptState.Finishing);
    node.NodeHeartbeat(am.GetApplicationAttemptId(), 1, ContainerState.Complete);
    am.WaitForState(RMAppAttemptState.Finished);

    // Accumulate the cleanup notifications delivered over successive heartbeats.
    NodeHeartbeatResponse heartbeat = node.NodeHeartbeat(true);
    IList<ContainerId> toClean = heartbeat.GetContainersToCleanup();
    int cleanedTotal = toClean.Count;
    retries = 0;
    while (cleanedTotal < allocatedTotal && retries++ < 200)
    {
        Sharpen.Thread.Sleep(300);
        heartbeat = node.NodeHeartbeat(true);
        toClean = heartbeat.GetContainersToCleanup();
        cleanedTotal += toClean.Count;
    }
    NUnit.Framework.Assert.AreEqual(allocatedTotal, cleanedTotal);

    rm.WaitForState(app.GetApplicationId(), RMAppState.Finished);
    rm.Stop();
}
/// <summary>
/// Builds a Mockito-mocked status event that reports a healthy node, a mocked latest
/// heartbeat response, and the <c>StatusUpdate</c> event type.
/// </summary>
/// <returns>the stubbed <c>RMNodeStatusEvent</c> mock</returns>
private RMNodeStatusEvent GetMockRMNodeStatusEvent()
{
    NodeHeartbeatResponse latest = Org.Mockito.Mockito.Mock<NodeHeartbeatResponse>();

    NodeHealthStatus health = Org.Mockito.Mockito.Mock<NodeHealthStatus>();
    Org.Mockito.Mockito.DoReturn(true).When(health).GetIsNodeHealthy();

    RMNodeStatusEvent statusEvent = Org.Mockito.Mockito.Mock<RMNodeStatusEvent>();
    Org.Mockito.Mockito.DoReturn(health).When(statusEvent).GetNodeHealthStatus();
    Org.Mockito.Mockito.DoReturn(latest).When(statusEvent).GetLatestResponse();
    Org.Mockito.Mockito.DoReturn(RMNodeEventType.StatusUpdate).When(statusEvent).GetType();
    return statusEvent;
}
/// <summary>
/// Applies any master keys carried by a heartbeat response to the enclosing context's
/// secret managers. A key that is absent (null) in the response is ignored.
/// </summary>
/// <param name="response">heartbeat response that may carry new master keys</param>
private void UpdateMasterKeys(NodeHeartbeatResponse response)
{
    MasterKey containerKey = response.GetContainerTokenMasterKey();
    if (containerKey != null)
    {
        this._enclosing.context.GetContainerTokenSecretManager().SetMasterKey(containerKey);
    }

    MasterKey nmKey = response.GetNMTokenMasterKey();
    if (nmKey != null)
    {
        this._enclosing.context.GetNMTokenSecretManager().SetMasterKey(nmKey);
    }
}
/// <summary>
/// Heartbeats the node once per second until a response lists exactly one application
/// to clean up and that application is <paramref name="appId"/>.
/// </summary>
/// <remarks>There is no retry limit: the loop only ends when the condition is met.</remarks>
/// <param name="nm">mock node manager used to send heartbeats</param>
/// <param name="appId">application expected in the cleanup list</param>
/// <exception cref="System.Exception"/>
private void WaitForAppCleanupMessageRecved(MockNM nm, ApplicationId appId)
{
    for (;;)
    {
        NodeHeartbeatResponse heartbeat = nm.NodeHeartbeat(true);

        bool onlyExpectedApp =
            heartbeat.GetApplicationsToCleanup() != null
            && heartbeat.GetApplicationsToCleanup().Count == 1
            && appId.Equals(heartbeat.GetApplicationsToCleanup()[0]);
        if (onlyExpectedApp)
        {
            break;
        }

        Log.Info("Haven't got application=" + appId.ToString() + " in cleanup list from node heartbeat response, "
            + "sleep for a while before next heartbeat");
        Sharpen.Thread.Sleep(1000);
    }
}
/// <summary>
/// Verifies that the RM reads the NM heartbeat interval from the configuration and that
/// each registered NM receives that interval in its heartbeat response.
/// </summary>
/// <exception cref="System.Exception"/>
public virtual void TestGetNextHeartBeatInterval()
{
    Configuration conf = new Configuration();
    conf.Set(YarnConfiguration.RmNmHeartbeatIntervalMs, "4000");

    rm = new MockRM(conf);
    rm.Start();

    MockNM firstNode = rm.RegisterNode("host1:1234", 5120);
    MockNM secondNode = rm.RegisterNode("host2:5678", 10240);

    // Both nodes must be told the configured interval.
    NodeHeartbeatResponse firstHeartbeat = firstNode.NodeHeartbeat(true);
    NUnit.Framework.Assert.AreEqual(4000, firstHeartbeat.GetNextHeartBeatInterval());

    NodeHeartbeatResponse secondHeartbeat = secondNode.NodeHeartbeat(true);
    NUnit.Framework.Assert.AreEqual(4000, secondHeartbeat.GetNextHeartBeatInterval());
}
/// <summary>
/// Exercises the response-id handshake of the resource tracker: consecutive heartbeats
/// advance the id, a repeated heartbeat is answered with the previous response, and a
/// heartbeat that is too far behind is answered with <c>NodeAction.Resync</c>.
/// </summary>
public virtual void TestRPCResponseId()
{
    string node = "localhost";
    Resource capability = BuilderUtils.NewResource(1024, 1);
    nodeId = NodeId.NewInstance(node, 1234);

    // Register the node. (A second, never-submitted registration request that the
    // original built here was dead code and has been removed.)
    RegisterNodeManagerRequest registration =
        recordFactory.NewRecordInstance<RegisterNodeManagerRequest>();
    registration.SetNodeId(nodeId);
    registration.SetHttpPort(0);
    registration.SetResource(capability);
    resourceTrackerService.RegisterNodeManager(registration);

    NodeStatus nodeStatus = recordFactory.NewRecordInstance<NodeStatus>();
    nodeStatus.SetNodeId(nodeId);
    NodeHealthStatus nodeHealthStatus = recordFactory.NewRecordInstance<NodeHealthStatus>();
    nodeHealthStatus.SetIsNodeHealthy(true);
    nodeStatus.SetNodeHealthStatus(nodeHealthStatus);

    NodeHeartbeatRequest nodeHeartBeatRequest =
        recordFactory.NewRecordInstance<NodeHeartbeatRequest>();
    nodeHeartBeatRequest.SetNodeStatus(nodeStatus);

    // AreEqual (rather than IsTrue(x == n)) reports the actual id on failure.
    nodeStatus.SetResponseId(0);
    NodeHeartbeatResponse response = resourceTrackerService.NodeHeartbeat(nodeHeartBeatRequest);
    NUnit.Framework.Assert.AreEqual(1, response.GetResponseId());

    nodeStatus.SetResponseId(response.GetResponseId());
    response = resourceTrackerService.NodeHeartbeat(nodeHeartBeatRequest);
    NUnit.Framework.Assert.AreEqual(2, response.GetResponseId());

    /* try calling with less response id: the node status still carries id 1 */
    response = resourceTrackerService.NodeHeartbeat(nodeHeartBeatRequest);
    NUnit.Framework.Assert.AreEqual(2, response.GetResponseId());

    // Falling back to id 0 is too far behind and must trigger a resync.
    nodeStatus.SetResponseId(0);
    response = resourceTrackerService.NodeHeartbeat(nodeHeartBeatRequest);
    NUnit.Framework.Assert.IsTrue(NodeAction.Resync.Equals(response.GetNodeAction()));
    NUnit.Framework.Assert.AreEqual("Too far behind rm response id:2 nm response id:0",
        response.GetDiagnosticsMessage());
}
/// <summary>
/// Decommissions a node through the include-hosts file: after the file is rewritten to
/// list only "host1" and the normalized "localhost" name, host2 must be answered with
/// <c>NodeAction.Shutdown</c> while the other two nodes keep heartbeating normally.
/// </summary>
public virtual void TestDecommissionWithIncludeHosts()
{
    WriteToHostsFile("localhost", "host1", "host2");
    Configuration conf = new Configuration();
    conf.Set(YarnConfiguration.RmNodesIncludeFilePath, hostFile.GetAbsolutePath());
    rm = new MockRM(conf);
    rm.Start();
    MockNM nm1 = rm.RegisterNode("host1:1234", 5120);
    MockNM nm2 = rm.RegisterNode("host2:5678", 10240);
    MockNM nm3 = rm.RegisterNode("localhost:4433", 1024);

    ClusterMetrics metrics = ClusterMetrics.GetMetrics();
    // A real test assertion instead of Debug.Assert, which is compiled out of release builds.
    NUnit.Framework.Assert.IsNotNull(metrics);
    int metricCount = metrics.GetNumDecommisionedNMs();

    NodeHeartbeatResponse nodeHeartbeat = nm1.NodeHeartbeat(true);
    NUnit.Framework.Assert.IsTrue(NodeAction.Normal.Equals(nodeHeartbeat.GetNodeAction()));
    nodeHeartbeat = nm2.NodeHeartbeat(true);
    NUnit.Framework.Assert.IsTrue(NodeAction.Normal.Equals(nodeHeartbeat.GetNodeAction()));
    nodeHeartbeat = nm3.NodeHeartbeat(true);
    NUnit.Framework.Assert.IsTrue(NodeAction.Normal.Equals(nodeHeartbeat.GetNodeAction()));

    // To test that IPs also work
    string ip = NetUtils.NormalizeHostName("localhost");
    WriteToHostsFile("host1", ip);
    rm.GetNodesListManager().RefreshNodes(conf);
    CheckDecommissionedNMCount(rm, ++metricCount);

    nodeHeartbeat = nm1.NodeHeartbeat(true);
    NUnit.Framework.Assert.IsTrue(NodeAction.Normal.Equals(nodeHeartbeat.GetNodeAction()));
    // Compare against the tracked count rather than a literal 1, so a non-zero
    // starting count does not break the test; this matches the final assertion below.
    NUnit.Framework.Assert.AreEqual(metricCount,
        ClusterMetrics.GetMetrics().GetNumDecommisionedNMs());

    // NUnit's Assert.IsTrue takes (condition, message); the message must come last.
    nodeHeartbeat = nm2.NodeHeartbeat(true);
    NUnit.Framework.Assert.IsTrue(NodeAction.Shutdown.Equals(nodeHeartbeat.GetNodeAction()),
        "Node is not decommisioned.");

    nodeHeartbeat = nm3.NodeHeartbeat(true);
    NUnit.Framework.Assert.IsTrue(NodeAction.Normal.Equals(nodeHeartbeat.GetNodeAction()));
    NUnit.Framework.Assert.AreEqual(metricCount,
        ClusterMetrics.GetMetrics().GetNumDecommisionedNMs());
}
/// <summary>
/// Sends a heartbeat to the resource tracker reporting the given container statuses and
/// health flag, then adopts any new master keys returned in the response.
/// </summary>
/// <param name="conts">
/// container statuses grouped by application; the statuses of ALL applications are
/// reported. If the map is empty, no container statuses are set on the node status.
/// </param>
/// <param name="isHealthy">health flag reported for the node</param>
/// <param name="resId">response id placed on the node status</param>
/// <returns>the heartbeat response returned by the resource tracker</returns>
/// <exception cref="System.Exception"/>
public virtual NodeHeartbeatResponse NodeHeartbeat(
    IDictionary<ApplicationId, IList<Org.Apache.Hadoop.Yarn.Api.Records.ContainerStatus>> conts,
    bool isHealthy,
    int resId)
{
    NodeHeartbeatRequest req =
        Org.Apache.Hadoop.Yarn.Util.Records.NewRecord<NodeHeartbeatRequest>();
    NodeStatus status = Org.Apache.Hadoop.Yarn.Util.Records.NewRecord<NodeStatus>();
    status.SetResponseId(resId);
    status.SetNodeId(nodeId);

    // Collect the statuses of every application into one list. Calling
    // SetContainersStatuses once per entry (as before) kept only the statuses of
    // whichever application happened to be enumerated last.
    List<Org.Apache.Hadoop.Yarn.Api.Records.ContainerStatus> allStatuses = null;
    foreach (KeyValuePair<ApplicationId, IList<Org.Apache.Hadoop.Yarn.Api.Records.ContainerStatus>> entry in conts)
    {
        Org.Mortbay.Log.Log.Info("entry.getValue() " + entry.Value);
        if (allStatuses == null)
        {
            allStatuses = new List<Org.Apache.Hadoop.Yarn.Api.Records.ContainerStatus>();
        }
        if (entry.Value != null)
        {
            allStatuses.AddRange(entry.Value);
        }
    }
    if (allStatuses != null)
    {
        status.SetContainersStatuses(allStatuses);
    }

    NodeHealthStatus healthStatus =
        Org.Apache.Hadoop.Yarn.Util.Records.NewRecord<NodeHealthStatus>();
    healthStatus.SetHealthReport(string.Empty);
    healthStatus.SetIsNodeHealthy(isHealthy);
    healthStatus.SetLastHealthReportTime(1);
    status.SetNodeHealthStatus(healthStatus);

    req.SetNodeStatus(status);
    req.SetLastKnownContainerTokenMasterKey(this.currentContainerTokenMasterKey);
    req.SetLastKnownNMTokenMasterKey(this.currentNMTokenMasterKey);

    NodeHeartbeatResponse heartbeatResponse = resourceTracker.NodeHeartbeat(req);

    // Adopt a master key from the RM only when its key id differs from the one held.
    MasterKey masterKeyFromRM = heartbeatResponse.GetContainerTokenMasterKey();
    if (masterKeyFromRM != null
        && masterKeyFromRM.GetKeyId() != this.currentContainerTokenMasterKey.GetKeyId())
    {
        this.currentContainerTokenMasterKey = masterKeyFromRM;
    }

    masterKeyFromRM = heartbeatResponse.GetNMTokenMasterKey();
    if (masterKeyFromRM != null
        && masterKeyFromRM.GetKeyId() != this.currentNMTokenMasterKey.GetKeyId())
    {
        this.currentNMTokenMasterKey = masterKeyFromRM;
    }

    return heartbeatResponse;
}
/// <summary>
/// Protobuf service entry point for node heartbeats: wraps the request proto, delegates
/// to the real implementation, and returns the response's proto.
/// </summary>
/// <param name="controller">RPC controller; not used by this method</param>
/// <param name="proto">heartbeat request in protobuf form</param>
/// <returns>the heartbeat response in protobuf form</returns>
/// <exception cref="Com.Google.Protobuf.ServiceException">
/// wraps any <c>YarnException</c> or <c>IOException</c> raised while handling the request
/// </exception>
public virtual YarnServerCommonServiceProtos.NodeHeartbeatResponseProto NodeHeartbeat(
    RpcController controller,
    YarnServerCommonServiceProtos.NodeHeartbeatRequestProto proto)
{
    NodeHeartbeatRequestPBImpl wrappedRequest = new NodeHeartbeatRequestPBImpl(proto);
    try
    {
        NodeHeartbeatResponse result = real.NodeHeartbeat(wrappedRequest);
        NodeHeartbeatResponsePBImpl pbResult = (NodeHeartbeatResponsePBImpl)result;
        return pbResult.GetProto();
    }
    catch (YarnException yarnFailure)
    {
        throw new ServiceException(yarnFailure);
    }
    catch (IOException ioFailure)
    {
        throw new ServiceException(ioFailure);
    }
}
/// <summary>
/// Adds the next container-token and NM-token master keys to the heartbeat response
/// when a next key exists and its id differs from the one the node last reported.
/// </summary>
/// <param name="request">heartbeat carrying the node's last known master keys</param>
/// <param name="nodeHeartBeatResponse">response that receives any key the node is missing</param>
private void PopulateKeys(NodeHeartbeatRequest request, NodeHeartbeatResponse nodeHeartBeatResponse)
{
    // ContainerTokenMasterKey: send the next key across if the node does not have it yet.
    MasterKey nextContainerKey = this.containerTokenSecretManager.GetNextKey();
    if (nextContainerKey != null)
    {
        if (request.GetLastKnownContainerTokenMasterKey().GetKeyId() != nextContainerKey.GetKeyId())
        {
            nodeHeartBeatResponse.SetContainerTokenMasterKey(nextContainerKey);
        }
    }

    // NMTokenMasterKey: same rule.
    MasterKey nextNmKey = this.nmTokenSecretManager.GetNextKey();
    if (nextNmKey != null)
    {
        if (request.GetLastKnownNMTokenMasterKey().GetKeyId() != nextNmKey.GetKeyId())
        {
            nodeHeartBeatResponse.SetNMTokenMasterKey(nextNmKey);
        }
    }
}
/// <summary>
/// Sends a heartbeat with a response id far behind the RM's (-100) and expects a
/// <c>NodeAction.Resync</c> answer, the matching diagnostics message, and an
/// incremented rebooted-NM metric.
/// </summary>
public virtual void TestReboot()
{
    Configuration conf = new Configuration();
    rm = new MockRM(conf);
    rm.Start();

    MockNM healthyNode = rm.RegisterNode("host1:1234", 5120);
    MockNM laggingNode = rm.RegisterNode("host2:1234", 2048);

    int rebootedBefore = ClusterMetrics.GetMetrics().GetNumRebootedNMs();

    // A regular heartbeat is answered normally.
    NodeHeartbeatResponse heartbeat = healthyNode.NodeHeartbeat(true);
    NUnit.Framework.Assert.IsTrue(NodeAction.Normal.Equals(heartbeat.GetNodeAction()));

    // A heartbeat with a stale response id is told to resync.
    IDictionary<ApplicationId, IList<ContainerStatus>> noContainers =
        new Dictionary<ApplicationId, IList<ContainerStatus>>();
    heartbeat = laggingNode.NodeHeartbeat(noContainers, true, -100);
    NUnit.Framework.Assert.IsTrue(NodeAction.Resync.Equals(heartbeat.GetNodeAction()));
    NUnit.Framework.Assert.AreEqual("Too far behind rm response id:0 nm response id:-100",
        heartbeat.GetDiagnosticsMessage());

    CheckRebootedNMCount(rm, rebootedBefore + 1);
}
/// <summary>
/// Creates a heartbeat response record and populates it from the arguments.
/// </summary>
/// <param name="responseId">response id to set</param>
/// <param name="action">node action to set</param>
/// <param name="containersToCleanUp">containers to add to the cleanup list; skipped when null</param>
/// <param name="applicationsToCleanUp">applications to add to the cleanup list; skipped when null</param>
/// <param name="containerTokenMasterKey">container-token master key to set</param>
/// <param name="nmTokenMasterKey">NM-token master key to set</param>
/// <param name="nextHeartbeatInterval">next heartbeat interval to set</param>
/// <returns>the populated response</returns>
public static NodeHeartbeatResponse NewNodeHeartbeatResponse(
    int responseId,
    NodeAction action,
    IList<ContainerId> containersToCleanUp,
    IList<ApplicationId> applicationsToCleanUp,
    MasterKey containerTokenMasterKey,
    MasterKey nmTokenMasterKey,
    long nextHeartbeatInterval)
{
    NodeHeartbeatResponse result = recordFactory.NewRecordInstance<NodeHeartbeatResponse>();

    result.SetResponseId(responseId);
    result.SetNodeAction(action);
    result.SetContainerTokenMasterKey(containerTokenMasterKey);
    result.SetNMTokenMasterKey(nmTokenMasterKey);
    result.SetNextHeartBeatInterval(nextHeartbeatInterval);

    // Cleanup lists are optional; a null list leaves the record's list untouched.
    bool hasContainers = containersToCleanUp != null;
    if (hasContainers)
    {
        result.AddAllContainersToCleanup(containersToCleanUp);
    }

    bool hasApplications = applicationsToCleanUp != null;
    if (hasApplications)
    {
        result.AddAllApplicationsToCleanup(applicationsToCleanUp);
    }

    return result;
}
/// <summary>
/// Starts an application on one RM, brings up a second RM over the same state store,
/// re-registers the node with it, and expects the second RM to ask the node to clean up
/// a container it reports for an application the scheduler does not know.
/// </summary>
/// <exception cref="System.Exception"/>
public virtual void TestContainerCleanupWhenRMRestartedAppNotRegistered()
{
    conf.SetInt(YarnConfiguration.RmAmMaxAttempts, 1);
    MemoryRMStateStore stateStore = new MemoryRMStateStore();
    stateStore.Init(conf);

    // start RM
    DrainDispatcher firstDispatcher = new DrainDispatcher();
    MockRM firstRm = new _MockRM_413(firstDispatcher, conf, stateStore);
    firstRm.Start();
    MockNM node = new MockNM("127.0.0.1:1234", 15120, firstRm.GetResourceTrackerService());
    node.RegisterNode();

    // create app and launch the AM
    RMApp app = firstRm.SubmitApp(200);
    MockAM am = LaunchAM(app, firstRm, node);
    node.NodeHeartbeat(am.GetApplicationAttemptId(), 1, ContainerState.Running);
    firstRm.WaitForState(app.GetApplicationId(), RMAppState.Running);

    // start new RM
    DrainDispatcher secondDispatcher = new DrainDispatcher();
    MockRM secondRm = new _MockRM_432(secondDispatcher, conf, stateStore);
    secondRm.Start();

    // the node registers with the new RM, and does a heartbeat
    node.SetResourceTrackerService(secondRm.GetResourceTrackerService());
    node.RegisterNode(Arrays.AsList(app.GetApplicationId()));
    secondRm.WaitForState(app.GetApplicationId(), RMAppState.Accepted);

    // Add unknown container for application unknown to scheduler
    NodeHeartbeatResponse heartbeat =
        node.NodeHeartbeat(am.GetApplicationAttemptId(), 2, ContainerState.Running);
    WaitForContainerCleanup(secondDispatcher, node, heartbeat);

    firstRm.Stop();
    secondRm.Stop();
}
/// <summary>
/// Update a
/// <see cref="Org.Apache.Hadoop.Yarn.Server.Api.Protocolrecords.NodeHeartbeatResponse"/>
/// with the list of containers and applications to clean up for this node.
/// </summary>
/// <param name="response">
/// the
/// <see cref="Org.Apache.Hadoop.Yarn.Server.Api.Protocolrecords.NodeHeartbeatResponse"/>
/// to update
/// </param>
public abstract void UpdateNodeHeartbeatResponseForCleanup(NodeHeartbeatResponse response);
/// <summary>
/// Handles a node heartbeat: rejects disallowed hosts, asks unknown or lagging nodes to
/// resync, answers a repeated heartbeat with the previous response, and otherwise builds
/// the next response and forwards the node status to the RM node.
/// </summary>
/// <param name="request">heartbeat carrying the node's status and last known master keys</param>
/// <returns>
/// the shared shutdown or resync response, the previous response for a duplicate
/// heartbeat, or a new <c>NodeAction.Normal</c> response
/// </returns>
/// <exception cref="Org.Apache.Hadoop.Yarn.Exceptions.YarnException"/>
/// <exception cref="System.IO.IOException"/>
public virtual NodeHeartbeatResponse NodeHeartbeat(NodeHeartbeatRequest request)
{
    NodeStatus reported = request.GetNodeStatus();
    NodeId nodeId = reported.GetNodeId();

    // 1. Check if it's a valid (i.e. not excluded) node
    if (!this.nodesListManager.IsValidNode(nodeId.GetHost()))
    {
        string disallowed = "Disallowed NodeManager nodeId: " + nodeId + " hostname: " + nodeId.GetHost();
        Log.Info(disallowed);
        shutDown.SetDiagnosticsMessage(disallowed);
        return shutDown;
    }

    // 2. Check if it's a registered node
    RMNode rmNode = this.rmContext.GetRMNodes()[nodeId];
    if (rmNode == null)
    {
        /* node does not exist */
        string unknownNode = "Node not found resyncing " + reported.GetNodeId();
        Log.Info(unknownNode);
        resync.SetDiagnosticsMessage(unknownNode);
        return resync;
    }

    // Send ping
    this.nmLivelinessMonitor.ReceivedPing(nodeId);

    // 3. Check if it's a 'fresh' heartbeat i.e. not duplicate heartbeat
    NodeHeartbeatResponse previous = rmNode.GetLastNodeHeartBeatResponse();
    if (reported.GetResponseId() + 1 == previous.GetResponseId())
    {
        Log.Info("Received duplicate heartbeat from node " + rmNode.GetNodeAddress()
            + " responseId=" + reported.GetResponseId());
        return previous;
    }

    if (reported.GetResponseId() + 1 < previous.GetResponseId())
    {
        string tooFarBehind = "Too far behind rm response id:" + previous.GetResponseId()
            + " nm response id:" + reported.GetResponseId();
        Log.Info(tooFarBehind);
        resync.SetDiagnosticsMessage(tooFarBehind);
        // TODO: Just sending reboot is not enough. Think more.
        this.rmContext.GetDispatcher().GetEventHandler().Handle(
            new RMNodeEvent(nodeId, RMNodeEventType.Rebooting));
        return resync;
    }

    // Heartbeat response
    NodeHeartbeatResponse next = YarnServerBuilderUtils.NewNodeHeartbeatResponse(
        previous.GetResponseId() + 1, NodeAction.Normal, null, null, null, null,
        nextHeartBeatInterval);
    rmNode.UpdateNodeHeartbeatResponseForCleanup(next);
    PopulateKeys(request, next);

    ConcurrentMap<ApplicationId, ByteBuffer> credentialsByApp =
        rmContext.GetSystemCredentialsForApps();
    if (!credentialsByApp.IsEmpty())
    {
        next.SetSystemCredentialsForApps(credentialsByApp);
    }

    // 4. Send status to RMNode, saving the latest response.
    RMNodeStatusEvent statusUpdate = new RMNodeStatusEvent(
        nodeId,
        reported.GetNodeHealthStatus(),
        reported.GetContainersStatuses(),
        reported.GetKeepAliveApplications(),
        next);
    this.rmContext.GetDispatcher().GetEventHandler().Handle(statusUpdate);

    return next;
}
/// <summary>
/// No-op override: the body is empty, so <paramref name="response"/> is left unmodified.
/// </summary>
/// <param name="response">the heartbeat response; not read or changed here</param>
public override void UpdateNodeHeartbeatResponseForCleanup(NodeHeartbeatResponse response) { }
/// <summary>
/// Validates the master-key exchange between RM and NM for both container tokens and
/// NM tokens: registration must deliver keys, ordinary heartbeats must not, a forced
/// roll-over must deliver the next key (id + 1) exactly once, and activation must not
/// deliver anything.
/// </summary>
/// <param name="conf">configuration used to initialize the resource manager</param>
/// <exception cref="System.Exception"/>
private void ValidateRMNMKeyExchange(YarnConfiguration conf)
{
    // Default rolling and activation intervals are large enough, no need to
    // intervene
    DrainDispatcher dispatcher = new DrainDispatcher();
    ResourceManager rm = new _ResourceManager_56(dispatcher);
    rm.Init(conf);
    rm.Start();

    // Testing ContainerToken and NMToken
    string containerToken = "Container Token : ";
    string nmToken = "NM Token : ";

    MockNM nm = new MockNM("host:1234", 3072, rm.GetResourceTrackerService());
    RegisterNodeManagerResponse registrationResponse = nm.RegisterNode();

    // NUnit's classic asserts take the failure message as the LAST argument.
    MasterKey containerTokenMasterKey = registrationResponse.GetContainerTokenMasterKey();
    NUnit.Framework.Assert.IsNotNull(containerTokenMasterKey,
        containerToken + "Registration should cause a key-update!");
    MasterKey nmTokenMasterKey = registrationResponse.GetNMTokenMasterKey();
    NUnit.Framework.Assert.IsNotNull(nmTokenMasterKey,
        nmToken + "Registration should cause a key-update!");
    dispatcher.Await();

    NodeHeartbeatResponse response = nm.NodeHeartbeat(true);
    NUnit.Framework.Assert.IsNull(response.GetContainerTokenMasterKey(),
        containerToken + "First heartbeat after registration shouldn't get any key updates!");
    NUnit.Framework.Assert.IsNull(response.GetNMTokenMasterKey(),
        nmToken + "First heartbeat after registration shouldn't get any key updates!");
    dispatcher.Await();

    response = nm.NodeHeartbeat(true);
    NUnit.Framework.Assert.IsNull(response.GetContainerTokenMasterKey(),
        containerToken + "Even second heartbeat after registration shouldn't get any key updates!");
    // Fixed: this NM-token assertion previously re-checked the container-token key.
    NUnit.Framework.Assert.IsNull(response.GetNMTokenMasterKey(),
        nmToken + "Even second heartbeat after registration shouldn't get any key updates!");
    dispatcher.Await();

    // Let's force a roll-over
    rm.GetRMContext().GetContainerTokenSecretManager().RollMasterKey();
    rm.GetRMContext().GetNMTokenSecretManager().RollMasterKey();

    // Heartbeats after roll-over and before activation should be fine.
    response = nm.NodeHeartbeat(true);
    NUnit.Framework.Assert.IsNotNull(response.GetContainerTokenMasterKey(),
        containerToken + "Heartbeats after roll-over and before activation should not err out.");
    NUnit.Framework.Assert.IsNotNull(response.GetNMTokenMasterKey(),
        nmToken + "Heartbeats after roll-over and before activation should not err out.");
    NUnit.Framework.Assert.AreEqual(
        containerTokenMasterKey.GetKeyId() + 1,
        response.GetContainerTokenMasterKey().GetKeyId(),
        containerToken + "Roll-over should have incremented the key-id only by one!");
    NUnit.Framework.Assert.AreEqual(
        nmTokenMasterKey.GetKeyId() + 1,
        response.GetNMTokenMasterKey().GetKeyId(),
        nmToken + "Roll-over should have incremented the key-id only by one!");
    dispatcher.Await();

    response = nm.NodeHeartbeat(true);
    NUnit.Framework.Assert.IsNull(response.GetContainerTokenMasterKey(),
        containerToken + "Second heartbeat after roll-over shouldn't get any key updates!");
    NUnit.Framework.Assert.IsNull(response.GetNMTokenMasterKey(),
        nmToken + "Second heartbeat after roll-over shouldn't get any key updates!");
    dispatcher.Await();

    // Let's force activation
    rm.GetRMContext().GetContainerTokenSecretManager().ActivateNextMasterKey();
    rm.GetRMContext().GetNMTokenSecretManager().ActivateNextMasterKey();

    response = nm.NodeHeartbeat(true);
    NUnit.Framework.Assert.IsNull(response.GetContainerTokenMasterKey(),
        containerToken + "Activation shouldn't cause any key updates!");
    NUnit.Framework.Assert.IsNull(response.GetNMTokenMasterKey(),
        nmToken + "Activation shouldn't cause any key updates!");
    dispatcher.Await();

    response = nm.NodeHeartbeat(true);
    NUnit.Framework.Assert.IsNull(response.GetContainerTokenMasterKey(),
        containerToken + "Even second heartbeat after activation shouldn't get any key updates!");
    NUnit.Framework.Assert.IsNull(response.GetNMTokenMasterKey(),
        nmToken + "Even second heartbeat after activation shouldn't get any key updates!");
    dispatcher.Await();

    rm.Stop();
}
/// <summary>
/// Runs an application that obtains two containers, finishes it, and checks that the
/// node's heartbeat responses eventually list two containers and the one application
/// for cleanup.
/// </summary>
public virtual void TestAppCleanup()
{
    Logger root = LogManager.GetRootLogger();
    root.SetLevel(Level.Debug);

    MockRM rm = new MockRM();
    rm.Start();
    MockNM nm1 = rm.RegisterNode("127.0.0.1:1234", 5000);

    RMApp app = rm.SubmitApp(2000);

    // kick the scheduling
    nm1.NodeHeartbeat(true);

    RMAppAttempt attempt = app.GetCurrentAppAttempt();
    MockAM am = rm.SendAMLaunched(attempt.GetAppAttemptId());
    am.RegisterAppAttempt();

    // request for containers
    int request = 2;
    am.Allocate("127.0.0.1", 1000, request, new AList<ContainerId>());

    // kick the scheduler
    nm1.NodeHeartbeat(true);

    IList<Container> allocated = am.Allocate(new AList<ResourceRequest>(), new AList<ContainerId>())
        .GetAllocatedContainers();
    int received = allocated.Count;
    int retries = 0;
    while (received < request && retries++ < 200)
    {
        Log.Info("Got " + received + " containers. Waiting to get " + request);
        Sharpen.Thread.Sleep(100);
        allocated = am.Allocate(new AList<ResourceRequest>(), new AList<ContainerId>())
            .GetAllocatedContainers();
        received += allocated.Count;
        nm1.NodeHeartbeat(true);
    }
    NUnit.Framework.Assert.AreEqual(request, received);

    am.UnregisterAppAttempt();
    // Report the AM container as complete; this response itself is not inspected.
    nm1.NodeHeartbeat(attempt.GetAppAttemptId(), 1, ContainerState.Complete);
    am.WaitForState(RMAppAttemptState.Finished);

    // currently only containers are cleaned via this
    // AM container is cleaned via container launcher
    NodeHeartbeatResponse resp = nm1.NodeHeartbeat(true);
    IList<ContainerId> containersToCleanup = resp.GetContainersToCleanup();
    IList<ApplicationId> appsToCleanup = resp.GetApplicationsToCleanup();
    int cleanedContainerCount = containersToCleanup.Count;
    int cleanedAppCount = appsToCleanup.Count;

    retries = 0;
    while ((cleanedContainerCount < 2 || cleanedAppCount < 1) && retries++ < 200)
    {
        Log.Info("Waiting to get cleanup events.. cleanedConts: " + cleanedContainerCount
            + " cleanedApps: " + cleanedAppCount);
        Sharpen.Thread.Sleep(100);
        resp = nm1.NodeHeartbeat(true);
        IList<ContainerId> newContainers = resp.GetContainersToCleanup();
        IList<ApplicationId> newApps = resp.GetApplicationsToCleanup();

        // Add the deltas to the global list
        Sharpen.Collections.AddAll(containersToCleanup, newContainers);
        Sharpen.Collections.AddAll(appsToCleanup, newApps);

        // Update counts now
        cleanedContainerCount = containersToCleanup.Count;
        cleanedAppCount = appsToCleanup.Count;
    }

    NUnit.Framework.Assert.AreEqual(1, appsToCleanup.Count);
    NUnit.Framework.Assert.AreEqual(app.GetApplicationId(), appsToCleanup[0]);
    NUnit.Framework.Assert.AreEqual(1, cleanedAppCount);
    NUnit.Framework.Assert.AreEqual(2, cleanedContainerCount);

    rm.Stop();
}
/// <summary>
/// Re-registers nodes with the resource manager in several situations (healthy,
/// unhealthy, unhealthy turned healthy, changed capability, changed capability with
/// running applications, changed HTTP port) and checks the node action, the node
/// counts and the root queue's available memory after each step.
/// </summary>
public virtual void TestReconnectNode()
{
    const int memSmall = 5120;
    const int memMedium = 10240;
    const int memLarge = 15360;

    DrainDispatcher drain = new DrainDispatcher();
    rm = new _MockRM_567(this, drain);
    rm.Start();

    MockNM nodeOne = rm.RegisterNode("host1:1234", memSmall);
    MockNM nodeTwo = rm.RegisterNode("host2:5678", memSmall);
    nodeOne.NodeHeartbeat(true);
    nodeTwo.NodeHeartbeat(false);
    drain.Await();
    CheckUnealthyNMCount(rm, nodeTwo, true, 1);
    int activeNodesBefore = ClusterMetrics.GetMetrics().GetNumActiveNMs();
    QueueMetrics rootMetrics = rm.GetResourceScheduler().GetRootQueueMetrics();
    // TODO Metrics incorrect in case of the FifoScheduler
    NUnit.Framework.Assert.AreEqual(memSmall, rootMetrics.GetAvailableMB());

    // Healthy node reconnects.
    nodeOne = rm.RegisterNode("host1:1234", memSmall);
    NodeHeartbeatResponse reply = nodeOne.NodeHeartbeat(true);
    NUnit.Framework.Assert.IsTrue(NodeAction.Normal.Equals(reply.GetNodeAction()));
    drain.Await();
    NUnit.Framework.Assert.AreEqual(activeNodesBefore, ClusterMetrics.GetMetrics().GetNumActiveNMs());
    CheckUnealthyNMCount(rm, nodeTwo, true, 1);

    // Unhealthy node reconnects.
    nodeTwo = rm.RegisterNode("host2:5678", memSmall);
    reply = nodeTwo.NodeHeartbeat(false);
    NUnit.Framework.Assert.IsTrue(NodeAction.Normal.Equals(reply.GetNodeAction()));
    drain.Await();
    NUnit.Framework.Assert.AreEqual(activeNodesBefore, ClusterMetrics.GetMetrics().GetNumActiveNMs());
    CheckUnealthyNMCount(rm, nodeTwo, true, 1);

    // Unhealthy node turns healthy again; two healthy heartbeats are sent and
    // their responses are not inspected.
    nodeTwo = rm.RegisterNode("host2:5678", memSmall);
    drain.Await();
    nodeTwo.NodeHeartbeat(true);
    nodeTwo.NodeHeartbeat(true);
    drain.Await();
    NUnit.Framework.Assert.AreEqual(memSmall + memSmall, rootMetrics.GetAvailableMB());

    // Node reconnects with a changed capability.
    nodeOne = rm.RegisterNode("host2:5678", memMedium);
    drain.Await();
    reply = nodeOne.NodeHeartbeat(true);
    drain.Await();
    NUnit.Framework.Assert.IsTrue(NodeAction.Normal.Equals(reply.GetNodeAction()));
    NUnit.Framework.Assert.AreEqual(memSmall + memMedium, rootMetrics.GetAvailableMB());

    // Node reconnects with a changed capability while reporting running applications.
    IList<ApplicationId> activeApps = new AList<ApplicationId>();
    activeApps.AddItem(ApplicationId.NewInstance(1, 0));
    nodeOne = rm.RegisterNode("host2:5678", memLarge, 2, activeApps);
    drain.Await();
    reply = nodeOne.NodeHeartbeat(true);
    drain.Await();
    NUnit.Framework.Assert.IsTrue(NodeAction.Normal.Equals(reply.GetNodeAction()));
    NUnit.Framework.Assert.AreEqual(memSmall + memLarge, rootMetrics.GetAvailableMB());

    // Healthy node reconnects with a different HTTP port.
    nodeOne = new MockNM("host1:1234", memSmall, rm.GetResourceTrackerService());
    nodeOne.SetHttpPort(3);
    nodeOne.RegisterNode();
    drain.Await();
    nodeOne.NodeHeartbeat(true);
    nodeOne.NodeHeartbeat(true);
    drain.Await();
    RMNode reconnected = rm.GetRMContext().GetRMNodes()[nodeOne.GetNodeId()];
    NUnit.Framework.Assert.AreEqual(3, reconnected.GetHttpPort());
    NUnit.Framework.Assert.AreEqual(memSmall, reconnected.GetTotalCapability().GetMemory());
    NUnit.Framework.Assert.AreEqual(memSmall + memLarge, rootMetrics.GetAvailableMB());
}
/// <summary>
/// Allocates two containers, releases one of them, and checks that the node is asked to
/// clean the released container up even though its heartbeats keep reporting that
/// container as running.
/// </summary>
/// <remarks>
/// Two cases are exercised: the node reports the container as running in the heartbeat
/// that follows the release, and again after a cleanup has already been handed out.
/// The resource manager is stopped in a <c>finally</c> block, so a failed assertion or an
/// exception part-way through the test does not leave a started MockRM behind.
/// </remarks>
public virtual void TestContainerCleanup()
{
    Logger rootLogger = LogManager.GetRootLogger();
    rootLogger.SetLevel(Level.Debug);
    DrainDispatcher dispatcher = new DrainDispatcher();
    MockRM rm = new _MockRM_167(this, dispatcher);
    rm.Start();
    try
    {
        MockNM nm1 = rm.RegisterNode("127.0.0.1:1234", 5000);
        RMApp app = rm.SubmitApp(2000);

        // Kick the scheduling.
        nm1.NodeHeartbeat(true);
        RMAppAttempt attempt = app.GetCurrentAppAttempt();
        MockAM am = rm.SendAMLaunched(attempt.GetAppAttemptId());
        am.RegisterAppAttempt();

        // Request containers.
        int request = 2;
        am.Allocate("127.0.0.1", 1000, request, new AList<ContainerId>());
        dispatcher.Await();

        // Kick the scheduler.
        nm1.NodeHeartbeat(true);
        IList<Container> conts = am.Allocate(new AList<ResourceRequest>(), new AList<ContainerId>())
            .GetAllocatedContainers();
        int contReceived = conts.Count;
        int waitCount = 0;
        // Keep asking (up to 200 more rounds, 100 ms apart) until every requested
        // container has been handed out.
        while (contReceived < request && waitCount++ < 200)
        {
            Log.Info("Got " + contReceived + " containers. Waiting to get " + request);
            Sharpen.Thread.Sleep(100);
            conts = am.Allocate(new AList<ResourceRequest>(), new AList<ContainerId>())
                .GetAllocatedContainers();
            dispatcher.Await();
            contReceived += conts.Count;
            nm1.NodeHeartbeat(true);
        }
        NUnit.Framework.Assert.AreEqual(request, contReceived);

        // Release a container.
        AList<ContainerId> release = new AList<ContainerId>();
        release.AddItem(conts[0].GetId());
        am.Allocate(new AList<ResourceRequest>(), release);
        dispatcher.Await();

        // Send one more heartbeat with a fake running container. This is to
        // simulate the situation that can happen if the NM reports that container
        // is running in the same heartbeat when the RM asks it to clean it up.
        IDictionary<ApplicationId, IList<ContainerStatus>> containerStatuses =
            new Dictionary<ApplicationId, IList<ContainerStatus>>();
        AList<ContainerStatus> containerStatusList = new AList<ContainerStatus>();
        containerStatusList.AddItem(BuilderUtils.NewContainerStatus(
            conts[0].GetId(), ContainerState.Running, "nothing", 0));
        containerStatuses[app.GetApplicationId()] = containerStatusList;
        NodeHeartbeatResponse resp = nm1.NodeHeartbeat(containerStatuses, true);
        WaitForContainerCleanup(dispatcher, nm1, resp);

        // Now to test the case when RM already gave cleanup, and NM suddenly
        // realizes that the container is running.
        Log.Info("Testing container launch much after release and " + "NM getting cleanup");
        containerStatuses.Clear();
        containerStatusList.Clear();
        containerStatusList.AddItem(BuilderUtils.NewContainerStatus(
            conts[0].GetId(), ContainerState.Running, "nothing", 0));
        containerStatuses[app.GetApplicationId()] = containerStatusList;
        resp = nm1.NodeHeartbeat(containerStatuses, true);

        // The cleanup list won't be instantaneous as it is given out by scheduler
        // and not RMNodeImpl.
        WaitForContainerCleanup(dispatcher, nm1, resp);
    }
    finally
    {
        // Always shut the resource manager down, even when an assertion above failed.
        rm.Stop();
    }
}
/// <summary>
/// Heartbeat loop: until the enclosing updater is flagged as stopped, sends the node
/// status to the resource tracker and acts on each response.
/// </summary>
/// <remarks>
/// A Shutdown or Resync node action ends the loop after the matching NodeManager event
/// has been dispatched. Otherwise the response's removed containers, containers to clean
/// up, applications to clean up and system credentials are handed on. Every pass,
/// including one that ends the loop or throws, finishes by waiting on the heartbeat
/// monitor for the next heartbeat interval.
/// </remarks>
public void Run()
{
    int previousResponseId = 0;
    while (!_enclosing.isStopped)
    {
        try
        {
            NodeStatus status = _enclosing.GetNodeStatus(previousResponseId);
            NodeHeartbeatRequest heartbeat = NodeHeartbeatRequest.NewInstance(
                status,
                _enclosing.context.GetContainerTokenSecretManager().GetCurrentKey(),
                _enclosing.context.GetNMTokenSecretManager().GetCurrentKey());
            NodeHeartbeatResponse reply = _enclosing.resourceTracker.NodeHeartbeat(heartbeat);

            _enclosing.nextHeartBeatInterval = reply.GetNextHeartBeatInterval();
            UpdateMasterKeys(reply);

            if (reply.GetNodeAction() == NodeAction.Shutdown)
            {
                Org.Apache.Hadoop.Yarn.Server.Nodemanager.NodeStatusUpdaterImpl.Log.Warn(
                    "Recieved SHUTDOWN signal from Resourcemanager as part of heartbeat," + " hence shutting down.");
                Org.Apache.Hadoop.Yarn.Server.Nodemanager.NodeStatusUpdaterImpl.Log.Warn(
                    "Message from ResourceManager: " + reply.GetDiagnosticsMessage());
                _enclosing.context.SetDecommissioned(true);
                _enclosing.dispatcher.GetEventHandler().Handle(
                    new NodeManagerEvent(NodeManagerEventType.Shutdown));
                break;
            }

            if (reply.GetNodeAction() == NodeAction.Resync)
            {
                Org.Apache.Hadoop.Yarn.Server.Nodemanager.NodeStatusUpdaterImpl.Log.Warn(
                    "Node is out of sync with ResourceManager," + " hence resyncing.");
                Org.Apache.Hadoop.Yarn.Server.Nodemanager.NodeStatusUpdaterImpl.Log.Warn(
                    "Message from ResourceManager: " + reply.GetDiagnosticsMessage());
                // Invalidate the RM identifier before the resync event is dispatched.
                _enclosing.rmIdentifier = ResourceManagerConstants.RmInvalidIdentifier;
                _enclosing.dispatcher.GetEventHandler().Handle(
                    new NodeManagerEvent(NodeManagerEventType.Resync));
                _enclosing.pendingCompletedContainers.Clear();
                break;
            }

            _enclosing.RemoveOrTrackCompletedContainersFromContext(
                reply.GetContainersToBeRemovedFromNM());
            // The response id is passed to GetNodeStatus on the next pass.
            previousResponseId = reply.GetResponseId();

            IList<ContainerId> finishedContainers = reply.GetContainersToCleanup();
            if (!finishedContainers.IsEmpty())
            {
                _enclosing.dispatcher.GetEventHandler().Handle(
                    new CMgrCompletedContainersEvent(
                        finishedContainers, CMgrCompletedContainersEvent.Reason.ByResourcemanager));
            }

            IList<ApplicationId> finishedApps = reply.GetApplicationsToCleanup();
            // Keep-alive tracking is updated even when the list is empty.
            _enclosing.TrackAppsForKeepAlive(finishedApps);
            if (!finishedApps.IsEmpty())
            {
                _enclosing.dispatcher.GetEventHandler().Handle(
                    new CMgrCompletedAppsEvent(
                        finishedApps, CMgrCompletedAppsEvent.Reason.ByResourcemanager));
            }

            IDictionary<ApplicationId, ByteBuffer> appCredentials = reply.GetSystemCredentialsForApps();
            if (appCredentials != null && !appCredentials.IsEmpty())
            {
                ((NodeManager.NMContext)_enclosing.context).SetSystemCrendentialsForApps(
                    Org.Apache.Hadoop.Yarn.Server.Nodemanager.NodeStatusUpdaterImpl.ParseCredentials(appCredentials));
            }
        }
        catch (ConnectException connectFailure)
        {
            // Request a NodeManager shutdown, then propagate the failure to the caller.
            _enclosing.dispatcher.GetEventHandler().Handle(
                new NodeManagerEvent(NodeManagerEventType.Shutdown));
            throw new YarnRuntimeException(connectFailure);
        }
        catch (Exception failure)
        {
            // Any other failure is logged and the loop carries on with the next heartbeat.
            Org.Apache.Hadoop.Yarn.Server.Nodemanager.NodeStatusUpdaterImpl.Log.Error(
                "Caught exception in status-updater", failure);
        }
        finally
        {
            lock (_enclosing.heartbeatMonitor)
            {
                // Fall back to the configured default when no positive interval is set.
                _enclosing.nextHeartBeatInterval = _enclosing.nextHeartBeatInterval <= 0
                    ? YarnConfiguration.DefaultRmNmHeartbeatIntervalMs
                    : _enclosing.nextHeartBeatInterval;
                try
                {
                    Sharpen.Runtime.Wait(_enclosing.heartbeatMonitor, _enclosing.nextHeartBeatInterval);
                }
                catch (Exception)
                {
                    // Deliberately ignored: a failed or interrupted wait just starts the next pass.
                }
            }
        }
    }
}