/// <summary>
/// Exercises the response-id handling of <c>resourceTrackerService.NodeHeartbeat</c>:
/// a heartbeat carrying the latest id is answered with the next id, a heartbeat
/// carrying the previous id is answered with the same response again, and a
/// heartbeat carrying an id further behind is answered with a resync action.
/// </summary>
public virtual void TestRPCResponseId()
{
    string node = "localhost";
    Resource capability = BuilderUtils.NewResource(1024, 1);
    // nodeId is a member of the enclosing fixture, not a local; keep assigning it.
    nodeId = NodeId.NewInstance(node, 1234);

    // Register the node before sending any heartbeat for it.
    RegisterNodeManagerRequest registerRequest = recordFactory.NewRecordInstance<RegisterNodeManagerRequest>();
    registerRequest.SetNodeId(nodeId);
    registerRequest.SetHttpPort(0);
    registerRequest.SetResource(capability);
    resourceTrackerService.RegisterNodeManager(registerRequest);

    // A healthy node status that is reused (and mutated) for every heartbeat below.
    NodeStatus nodeStatus = recordFactory.NewRecordInstance<NodeStatus>();
    nodeStatus.SetNodeId(nodeId);
    NodeHealthStatus nodeHealthStatus = recordFactory.NewRecordInstance<NodeHealthStatus>();
    nodeHealthStatus.SetIsNodeHealthy(true);
    nodeStatus.SetNodeHealthStatus(nodeHealthStatus);
    NodeHeartbeatRequest nodeHeartBeatRequest = recordFactory.NewRecordInstance<NodeHeartbeatRequest>();
    nodeHeartBeatRequest.SetNodeStatus(nodeStatus);

    // First heartbeat with id 0: expect response id 1.
    nodeStatus.SetResponseId(0);
    NodeHeartbeatResponse response = resourceTrackerService.NodeHeartbeat(nodeHeartBeatRequest);
    NUnit.Framework.Assert.AreEqual(1, response.GetResponseId());

    // Heartbeat echoing the latest id (1): expect response id 2.
    nodeStatus.SetResponseId(response.GetResponseId());
    response = resourceTrackerService.NodeHeartbeat(nodeHeartBeatRequest);
    NUnit.Framework.Assert.AreEqual(2, response.GetResponseId());

    // Same heartbeat again (status still carries id 1, one behind):
    // expect the response id to stay at 2.
    response = resourceTrackerService.NodeHeartbeat(nodeHeartBeatRequest);
    NUnit.Framework.Assert.AreEqual(2, response.GetResponseId());

    // Heartbeat with id 0, two behind: expect a resync with a diagnostic message.
    nodeStatus.SetResponseId(0);
    response = resourceTrackerService.NodeHeartbeat(nodeHeartBeatRequest);
    NUnit.Framework.Assert.IsTrue(NodeAction.Resync.Equals(response.GetNodeAction()));
    NUnit.Framework.Assert.AreEqual("Too far behind rm response id:2 nm response id:0",
        response.GetDiagnosticsMessage());
}
/// <summary>
/// Sends a single heartbeat to <c>resourceTrackerService</c>. The heartbeat carries a
/// node status built from <c>nodeId</c> and the statuses of <c>containers</c>, stamped
/// with the current <c>responseID</c>; the id returned by the service then replaces
/// <c>responseID</c>.
/// </summary>
/// <exception cref="System.IO.IOException"/>
/// <exception cref="Org.Apache.Hadoop.Yarn.Exceptions.YarnException"/>
public virtual void Heartbeat()
{
    // Build the status payload and tag it with the last response id we saw.
    NodeStatus currentStatus = Org.Apache.Hadoop.Yarn.Server.Resourcemanager.NodeManager
        .CreateNodeStatus(nodeId, GetContainerStatuses(containers));
    currentStatus.SetResponseId(responseID);

    // Wrap the status in a heartbeat request record.
    NodeHeartbeatRequest heartbeatRequest = recordFactory.NewRecordInstance<NodeHeartbeatRequest>();
    heartbeatRequest.SetNodeStatus(currentStatus);

    // Remember the id of the answer so the next heartbeat can echo it.
    responseID = resourceTrackerService.NodeHeartbeat(heartbeatRequest).GetResponseId();
}
/// <summary>
/// Heartbeat loop of the status updater. Until the enclosing object reports
/// <c>isStopped</c>, each iteration sends a heartbeat through <c>resourceTracker</c>,
/// reacts to the returned node action (shutdown / resync / normal processing) and then
/// waits on <c>heartbeatMonitor</c> for the next heartbeat interval.
/// </summary>
public void Run()
{
    int previousResponseId = 0;
    while (!this._enclosing.isStopped)
    {
        try
        {
            // Build and send the heartbeat, echoing the id of the last response.
            NodeStatus currentStatus = this._enclosing.GetNodeStatus(previousResponseId);
            NodeHeartbeatRequest heartbeat = NodeHeartbeatRequest.NewInstance(
                currentStatus,
                this._enclosing.context.GetContainerTokenSecretManager().GetCurrentKey(),
                this._enclosing.context.GetNMTokenSecretManager().GetCurrentKey());
            NodeHeartbeatResponse reply = this._enclosing.resourceTracker.NodeHeartbeat(heartbeat);

            // Take over the interval requested in the reply, then refresh master keys.
            this._enclosing.nextHeartBeatInterval = reply.GetNextHeartBeatInterval();
            this.UpdateMasterKeys(reply);

            if (reply.GetNodeAction() == NodeAction.Shutdown)
            {
                Org.Apache.Hadoop.Yarn.Server.Nodemanager.NodeStatusUpdaterImpl.Log.Warn(
                    "Recieved SHUTDOWN signal from Resourcemanager as part of heartbeat," + " hence shutting down.");
                Org.Apache.Hadoop.Yarn.Server.Nodemanager.NodeStatusUpdaterImpl.Log.Warn(
                    "Message from ResourceManager: " + reply.GetDiagnosticsMessage());
                this._enclosing.context.SetDecommissioned(true);
                this._enclosing.dispatcher.GetEventHandler().Handle(
                    new NodeManagerEvent(NodeManagerEventType.Shutdown));
                // Leaves the loop; the finally block below still runs once first.
                break;
            }

            if (reply.GetNodeAction() == NodeAction.Resync)
            {
                Org.Apache.Hadoop.Yarn.Server.Nodemanager.NodeStatusUpdaterImpl.Log.Warn(
                    "Node is out of sync with ResourceManager," + " hence resyncing.");
                Org.Apache.Hadoop.Yarn.Server.Nodemanager.NodeStatusUpdaterImpl.Log.Warn(
                    "Message from ResourceManager: " + reply.GetDiagnosticsMessage());
                // Invalidate the RM identifier before announcing the resync.
                this._enclosing.rmIdentifier = ResourceManagerConstants.RmInvalidIdentifier;
                this._enclosing.dispatcher.GetEventHandler().Handle(
                    new NodeManagerEvent(NodeManagerEventType.Resync));
                this._enclosing.pendingCompletedContainers.Clear();
                // Leaves the loop; the finally block below still runs once first.
                break;
            }

            // Normal reply: settle completed containers and remember the response id.
            this._enclosing.RemoveOrTrackCompletedContainersFromContext(
                reply.GetContainersToBeRemovedFromNM());
            previousResponseId = reply.GetResponseId();

            // Forward container clean-up requests, if any.
            IList<ContainerId> doomedContainers = reply.GetContainersToCleanup();
            if (!doomedContainers.IsEmpty())
            {
                this._enclosing.dispatcher.GetEventHandler().Handle(
                    new CMgrCompletedContainersEvent(doomedContainers,
                        CMgrCompletedContainersEvent.Reason.ByResourcemanager));
            }

            // Applications are tracked for keep-alive even when the list is empty.
            IList<ApplicationId> doomedApps = reply.GetApplicationsToCleanup();
            this._enclosing.TrackAppsForKeepAlive(doomedApps);
            if (!doomedApps.IsEmpty())
            {
                this._enclosing.dispatcher.GetEventHandler().Handle(
                    new CMgrCompletedAppsEvent(doomedApps,
                        CMgrCompletedAppsEvent.Reason.ByResourcemanager));
            }

            // Store system credentials only when the reply actually carries some.
            IDictionary<ApplicationId, ByteBuffer> appCredentials = reply.GetSystemCredentialsForApps();
            if (appCredentials != null)
            {
                if (!appCredentials.IsEmpty())
                {
                    ((NodeManager.NMContext)this._enclosing.context).SetSystemCrendentialsForApps(
                        Org.Apache.Hadoop.Yarn.Server.Nodemanager.NodeStatusUpdaterImpl
                            .ParseCredentials(appCredentials));
                }
            }
        }
        catch (ConnectException connectFailure)
        {
            // Connection failure: request a shutdown and abort the loop with an exception.
            this._enclosing.dispatcher.GetEventHandler().Handle(
                new NodeManagerEvent(NodeManagerEventType.Shutdown));
            throw new YarnRuntimeException(connectFailure);
        }
        catch (Exception failure)
        {
            // Any other failure is logged and the loop carries on with the next heartbeat.
            Org.Apache.Hadoop.Yarn.Server.Nodemanager.NodeStatusUpdaterImpl.Log.Error(
                "Caught exception in status-updater", failure);
        }
        finally
        {
            lock (this._enclosing.heartbeatMonitor)
            {
                // Fall back to the configured default when no positive interval is set.
                this._enclosing.nextHeartBeatInterval = this._enclosing.nextHeartBeatInterval > 0
                    ? this._enclosing.nextHeartBeatInterval
                    : YarnConfiguration.DefaultRmNmHeartbeatIntervalMs;
                try
                {
                    Sharpen.Runtime.Wait(this._enclosing.heartbeatMonitor,
                        this._enclosing.nextHeartBeatInterval);
                }
                catch (Exception)
                {
                    // Deliberately ignored: a failed or interrupted wait only ends the pause early.
                }
            }
        }
    }
}
/// <summary>
/// Handles one node heartbeat. In order, the request is rejected with <c>shutDown</c>
/// when the host is not valid, rejected with <c>resync</c> when the node is unknown,
/// answered with the previous response when it is a duplicate, rejected with
/// <c>resync</c> when its response id is too far behind, and otherwise answered with a
/// new response whose id is the previous response id plus one.
/// </summary>
/// <remarks>
/// NOTE(review): <c>shutDown</c> and <c>resync</c> are declared outside this block. If
/// they are shared instances, setting the diagnostics message here mutates state that
/// other callers may observe - confirm.
/// </remarks>
/// <param name="request">Heartbeat request carrying the node's status.</param>
/// <returns>The response to send back to the node.</returns>
/// <exception cref="Org.Apache.Hadoop.Yarn.Exceptions.YarnException"/>
/// <exception cref="System.IO.IOException"/>
public virtual NodeHeartbeatResponse NodeHeartbeat(NodeHeartbeatRequest request)
{
    NodeStatus reportedStatus = request.GetNodeStatus();
    NodeId nodeId = reportedStatus.GetNodeId();

    // 1. Check if it's a valid (i.e. not excluded) node
    if (!this.nodesListManager.IsValidNode(nodeId.GetHost()))
    {
        string rejection = "Disallowed NodeManager nodeId: " + nodeId + " hostname: " + nodeId.GetHost();
        Log.Info(rejection);
        shutDown.SetDiagnosticsMessage(rejection);
        return shutDown;
    }

    // 2. Check if it's a registered node
    RMNode knownNode = this.rmContext.GetRMNodes()[nodeId];
    if (knownNode == null)
    {
        /* node does not exist */
        string notFound = "Node not found resyncing " + reportedStatus.GetNodeId();
        Log.Info(notFound);
        resync.SetDiagnosticsMessage(notFound);
        return resync;
    }

    // Send ping
    this.nmLivelinessMonitor.ReceivedPing(nodeId);

    // 3. Check if it's a 'fresh' heartbeat i.e. not duplicate heartbeat
    NodeHeartbeatResponse previousResponse = knownNode.GetLastNodeHeartBeatResponse();
    if (reportedStatus.GetResponseId() + 1 == previousResponse.GetResponseId())
    {
        // The node is exactly one response behind: hand back the previous response.
        Log.Info("Received duplicate heartbeat from node " + knownNode.GetNodeAddress() + " responseId="
            + reportedStatus.GetResponseId());
        return previousResponse;
    }

    if (reportedStatus.GetResponseId() + 1 < previousResponse.GetResponseId())
    {
        // The node is more than one response behind: ask it to resync.
        string tooOld = "Too far behind rm response id:" + previousResponse.GetResponseId()
            + " nm response id:" + reportedStatus.GetResponseId();
        Log.Info(tooOld);
        resync.SetDiagnosticsMessage(tooOld);
        // TODO: Just sending reboot is not enough. Think more.
        this.rmContext.GetDispatcher().GetEventHandler().Handle(
            new RMNodeEvent(nodeId, RMNodeEventType.Rebooting));
        return resync;
    }

    // Heartbeat response
    NodeHeartbeatResponse freshResponse = YarnServerBuilderUtils.NewNodeHeartbeatResponse(
        previousResponse.GetResponseId() + 1, NodeAction.Normal, null, null, null, null,
        nextHeartBeatInterval);
    knownNode.UpdateNodeHeartbeatResponseForCleanup(freshResponse);
    PopulateKeys(request, freshResponse);

    // Attach system credentials only when there are some to send.
    ConcurrentMap<ApplicationId, ByteBuffer> appCredentials = rmContext.GetSystemCredentialsForApps();
    if (!appCredentials.IsEmpty())
    {
        freshResponse.SetSystemCredentialsForApps(appCredentials);
    }

    // 4. Send status to RMNode, saving the latest response.
    this.rmContext.GetDispatcher().GetEventHandler().Handle(
        new RMNodeStatusEvent(
            nodeId,
            reportedStatus.GetNodeHealthStatus(),
            reportedStatus.GetContainersStatuses(),
            reportedStatus.GetKeepAliveApplications(),
            freshResponse));
    return freshResponse;
}