/// <summary>
/// Verifies that an AM issued with a rolled-over AMRMToken is still able to
/// communicate with a restarted RM, and that the token created before the
/// roll-over is rejected by the restarted RM.
/// </summary>
/// <exception cref="System.Exception"/>
public virtual void TestAMRMClientOnAMRMTokenRollOverOnRMRestart()
{
    conf.SetLong(YarnConfiguration.RmAmrmTokenMasterKeyRollingIntervalSecs, rolling_interval_sec);
    conf.SetLong(YarnConfiguration.RmAmExpiryIntervalMs, am_expire_ms);
    MemoryRMStateStore memStore = new MemoryRMStateStore();
    memStore.Init(conf);

    // start first RM
    TestAMRMClientOnRMRestart.MyResourceManager2 rm1 =
        new TestAMRMClientOnRMRestart.MyResourceManager2(conf, memStore);
    rm1.Start();
    DrainDispatcher dispatcher = (DrainDispatcher)rm1.GetRMContext().GetDispatcher();
    // Reference point for the roll-over wait loop further down.
    long startTime = Runtime.CurrentTimeMillis();

    // Submit the application
    RMApp app = rm1.SubmitApp(1024);
    dispatcher.Await();
    MockNM nm1 = new MockNM("h1:1234", 15120, rm1.GetResourceTrackerService());
    nm1.RegisterNode();
    // Node heartbeat
    nm1.NodeHeartbeat(true);
    dispatcher.Await();
    ApplicationAttemptId appAttemptId = app.GetCurrentAppAttempt().GetAppAttemptId();
    rm1.SendAMLaunched(appAttemptId);
    dispatcher.Await();

    // Token created before the master key rolls over; referred to below as
    // the "old" token.
    AMRMTokenSecretManager amrmTokenSecretManagerForRM1 =
        rm1.GetRMContext().GetAMRMTokenSecretManager();
    Org.Apache.Hadoop.Security.Token.Token<AMRMTokenIdentifier> token =
        amrmTokenSecretManagerForRM1.CreateAndGetAMRMToken(appAttemptId);
    UserGroupInformation ugi = UserGroupInformation.GetCurrentUser();
    ugi.AddTokenIdentifier(token.DecodeIdentifier());

    AMRMClient<AMRMClient.ContainerRequest> amClient =
        new TestAMRMClientOnRMRestart.MyAMRMClientImpl(rm1);
    amClient.Init(conf);
    amClient.Start();
    amClient.RegisterApplicationMaster("h1", 10000, string.Empty);
    amClient.Allocate(0.1f);

    // Wait for enough time and make sure the roll_over happens
    // At mean time, the old AMRMToken should continue to work
    while (Runtime.CurrentTimeMillis() - startTime < rolling_interval_sec * 1000)
    {
        amClient.Allocate(0.1f);
        try
        {
            Sharpen.Thread.Sleep(1000);
        }
        catch (Exception)
        {
            // DO NOTHING: a failed sleep only shortens this polling interval.
        }
    }
    NUnit.Framework.Assert.IsTrue(
        amrmTokenSecretManagerForRM1.GetMasterKey().GetMasterKey().GetKeyId()
        != token.DecodeIdentifier().GetKeyId());
    amClient.Allocate(0.1f);

    // active the nextMasterKey, and replace the currentMasterKey
    Org.Apache.Hadoop.Security.Token.Token<AMRMTokenIdentifier> newToken =
        amrmTokenSecretManagerForRM1.CreateAndGetAMRMToken(appAttemptId);
    int waitCount = 0;
    while (waitCount++ <= 50)
    {
        // Stop polling once the current master key is no longer the one
        // the old token was created with.
        if (amrmTokenSecretManagerForRM1.GetCurrnetMasterKeyData().GetMasterKey().GetKeyId()
            != token.DecodeIdentifier().GetKeyId())
        {
            break;
        }
        try
        {
            amClient.Allocate(0.1f);
        }
        catch (Exception)
        {
            // An allocate failure ends the wait; the assertions below decide
            // whether the key state is acceptable.
            break;
        }
        Sharpen.Thread.Sleep(500);
    }
    NUnit.Framework.Assert.IsTrue(amrmTokenSecretManagerForRM1.GetNextMasterKeyData() == null);
    NUnit.Framework.Assert.IsTrue(
        amrmTokenSecretManagerForRM1.GetCurrnetMasterKeyData().GetMasterKey().GetKeyId()
        == newToken.DecodeIdentifier().GetKeyId());

    // start 2nd RM
    conf.Set(YarnConfiguration.RmSchedulerAddress, "0.0.0.0:9030");
    TestAMRMClientOnRMRestart.MyResourceManager2 rm2 =
        new TestAMRMClientOnRMRestart.MyResourceManager2(conf, memStore);
    rm2.Start();
    nm1.SetResourceTrackerService(rm2.GetResourceTrackerService());
    ((TestAMRMClientOnRMRestart.MyAMRMClientImpl)amClient).UpdateRMProxy(rm2);
    dispatcher = (DrainDispatcher)rm2.GetRMContext().GetDispatcher();

    // The second RM, built on the same state store, must expose the same
    // current master key and no pending next key.
    AMRMTokenSecretManager amrmTokenSecretManagerForRM2 =
        rm2.GetRMContext().GetAMRMTokenSecretManager();
    NUnit.Framework.Assert.IsTrue(
        amrmTokenSecretManagerForRM2.GetCurrnetMasterKeyData().GetMasterKey().GetKeyId()
        == newToken.DecodeIdentifier().GetKeyId());
    NUnit.Framework.Assert.IsTrue(amrmTokenSecretManagerForRM2.GetNextMasterKeyData() == null);

    // The old token must be rejected by the second RM. The "did not throw"
    // failure is raised after the try/catch: NUnit's Assert.Fail throws an
    // exception that catch (Exception) would otherwise intercept, replacing
    // the intended message with an unrelated type-check failure.
    bool oldTokenRejected = false;
    try
    {
        UserGroupInformation testUser = UserGroupInformation.CreateRemoteUser("testUser");
        SecurityUtil.SetTokenService(token, rm2.GetApplicationMasterService().GetBindAddress());
        testUser.AddToken(token);
        testUser.DoAs(new _PrivilegedAction_480(rm2)).Allocate(
            Org.Apache.Hadoop.Yarn.Util.Records.NewRecord<AllocateRequest>());
    }
    catch (Exception ex)
    {
        oldTokenRejected = true;
        NUnit.Framework.Assert.IsTrue(ex is SecretManager.InvalidToken);
        NUnit.Framework.Assert.IsTrue(ex.Message.Contains(
            "Invalid AMRMToken from " + token.DecodeIdentifier().GetApplicationAttemptId()));
    }
    if (!oldTokenRejected)
    {
        NUnit.Framework.Assert.Fail("The old Token should not work");
    }

    // make sure the recovered AMRMToken works for new RM
    amClient.Allocate(0.1f);
    amClient.UnregisterApplicationMaster(FinalApplicationStatus.Succeeded, null, null);
    amClient.Stop();
    rm1.Stop();
    rm2.Stop();
}
/// <summary>
/// Sends one heartbeat from <paramref name="nm"/> carrying the given health
/// flag, then calls <c>Await()</c> on the <c>dispatcher</c> member declared
/// outside this method.
/// </summary>
/// <param name="nm">Mock node manager that issues the heartbeat.</param>
/// <param name="health">Health flag forwarded to the heartbeat call.</param>
/// <exception cref="System.Exception"/>
private void SyncNodeHeartbeat(MockNM nm, bool health)
{
    nm.NodeHeartbeat(health);

    // Presumably blocks until the heartbeat's events are processed - confirm
    // against the dispatcher implementation.
    dispatcher.Await();
}
/// <summary>
/// Verifies that an AM which registered with the first RM can unregister
/// right after a second RM is started on the same state store, and that the
/// attempt and application then reach their finished states.
/// </summary>
/// <exception cref="System.Exception"/>
public virtual void TestAMRMClientForUnregisterAMOnRMRestart()
{
    MemoryRMStateStore stateStore = new MemoryRMStateStore();
    stateStore.Init(conf);

    // ---- Phase 1: bring up the first RM ----
    TestAMRMClientOnRMRestart.MyResourceManager firstRm =
        new TestAMRMClientOnRMRestart.MyResourceManager(conf, stateStore);
    firstRm.Start();
    DrainDispatcher drain = (DrainDispatcher)firstRm.GetRMContext().GetDispatcher();

    // Submit an application and let a mock NM heartbeat in.
    RMApp submittedApp = firstRm.SubmitApp(1024);
    drain.Await();
    MockNM nodeManager = new MockNM("h1:1234", 15120, firstRm.GetResourceTrackerService());
    nodeManager.RegisterNode();
    nodeManager.NodeHeartbeat(true);
    drain.Await();

    ApplicationAttemptId attemptId = submittedApp.GetCurrentAppAttempt().GetAppAttemptId();
    firstRm.SendAMLaunched(attemptId);
    drain.Await();

    // Make the attempt's AMRM token identifier available to the current user.
    Org.Apache.Hadoop.Security.Token.Token<AMRMTokenIdentifier> amrmToken =
        firstRm.GetRMContext()
            .GetRMApps()[attemptId.GetApplicationId()]
            .GetRMAppAttempt(attemptId)
            .GetAMRMToken();
    UserGroupInformation currentUser = UserGroupInformation.GetCurrentUser();
    currentUser.AddTokenIdentifier(amrmToken.DecodeIdentifier());

    // Register the AM against the first RM and issue one allocate call.
    AMRMClient<AMRMClient.ContainerRequest> client =
        new TestAMRMClientOnRMRestart.MyAMRMClientImpl(firstRm);
    client.Init(conf);
    client.Start();
    client.RegisterApplicationMaster("h1", 10000, string.Empty);
    client.Allocate(0.1f);

    // ---- Phase 2: bring up the second RM and repoint NM and client ----
    TestAMRMClientOnRMRestart.MyResourceManager secondRm =
        new TestAMRMClientOnRMRestart.MyResourceManager(conf, stateStore);
    secondRm.Start();
    nodeManager.SetResourceTrackerService(secondRm.GetResourceTrackerService());
    ((TestAMRMClientOnRMRestart.MyAMRMClientImpl)client).UpdateRMProxy(secondRm);
    drain = (DrainDispatcher)secondRm.GetRMContext().GetDispatcher();

    // The first heartbeat of the old NM to the second RM must be answered
    // with a resync action.
    NodeHeartbeatResponse heartbeatReply = nodeManager.NodeHeartbeat(true);
    NUnit.Framework.Assert.AreEqual(NodeAction.Resync, heartbeatReply.GetNodeAction());

    // A fresh MockNM stands in for the NM re-registering; it reports one
    // running container belonging to the attempt.
    nodeManager = new MockNM("h1:1234", 10240, secondRm.GetResourceTrackerService());
    ContainerId recoveredContainerId = ContainerId.NewContainerId(attemptId, 1);
    NMContainerStatus recoveredContainer = NMContainerStatus.NewInstance(
        recoveredContainerId,
        ContainerState.Running,
        Resource.NewInstance(1024, 1),
        "recover container",
        0,
        Priority.NewInstance(0),
        0);
    nodeManager.RegisterNode(Arrays.AsList(recoveredContainer), null);
    nodeManager.NodeHeartbeat(true);
    drain.Await();

    // Unregister against the second RM and follow the attempt to completion.
    client.UnregisterApplicationMaster(FinalApplicationStatus.Succeeded, null, null);
    secondRm.WaitForState(attemptId, RMAppAttemptState.Finishing);
    nodeManager.NodeHeartbeat(attemptId, 1, ContainerState.Complete);
    secondRm.WaitForState(attemptId, RMAppAttemptState.Finished);
    secondRm.WaitForState(submittedApp.GetApplicationId(), RMAppState.Finished);

    client.Stop();
    firstRm.Stop();
    secondRm.Stop();
}
/// <summary>
/// Verifies the updated-node reports that allocate calls return to
/// application masters as one node reports unhealthy, another is lost, and
/// the unhealthy node reports healthy again.
/// </summary>
public virtual void TestAMRMUnusableNodes()
{
    MockNM nm1 = rm.RegisterNode("127.0.0.1:1234", 10000);
    MockNM nm2 = rm.RegisterNode("127.0.0.2:1234", 10000);
    MockNM nm3 = rm.RegisterNode("127.0.0.3:1234", 10000);
    MockNM nm4 = rm.RegisterNode("127.0.0.4:1234", 10000);
    dispatcher.Await();
    RMApp app1 = rm.SubmitApp(2000);

    // Trigger the scheduling so the AM gets 'launched' on nm1
    nm1.NodeHeartbeat(true);
    RMAppAttempt attempt1 = app1.GetCurrentAppAttempt();
    MockAM am1 = rm.SendAMLaunched(attempt1.GetAppAttemptId());

    // register AM returns no unusable node
    am1.RegisterAppAttempt();

    // allocate request returns no updated node
    AllocateRequest allocateRequest1 = AllocateRequest.NewInstance(0, 0F, null, null, null);
    AllocateResponse response1 = Allocate(attempt1.GetAppAttemptId(), allocateRequest1);
    IList<NodeReport> updatedNodes = response1.GetUpdatedNodes();
    NUnit.Framework.Assert.AreEqual(0, updatedNodes.Count);

    // nm4 heartbeats with health == false
    SyncNodeHeartbeat(nm4, false);

    // allocate request returns updated node
    allocateRequest1 = AllocateRequest.NewInstance(response1.GetResponseId(), 0F, null, null, null);
    response1 = Allocate(attempt1.GetAppAttemptId(), allocateRequest1);
    AssertSingleUpdatedNode(response1, nm4, NodeState.Unhealthy);

    // resending the allocate request returns the same result
    response1 = Allocate(attempt1.GetAppAttemptId(), allocateRequest1);
    AssertSingleUpdatedNode(response1, nm4, NodeState.Unhealthy);

    SyncNodeLost(nm3);

    // subsequent allocate request returns delta
    allocateRequest1 = AllocateRequest.NewInstance(response1.GetResponseId(), 0F, null, null, null);
    response1 = Allocate(attempt1.GetAppAttemptId(), allocateRequest1);
    AssertSingleUpdatedNode(response1, nm3, NodeState.Lost);

    // registering another AM gives it the complete failed list
    RMApp app2 = rm.SubmitApp(2000);

    // Trigger nm2 heartbeat so that AM gets launched on it
    nm2.NodeHeartbeat(true);
    RMAppAttempt attempt2 = app2.GetCurrentAppAttempt();
    MockAM am2 = rm.SendAMLaunched(attempt2.GetAppAttemptId());

    // register AM returns all unusable nodes
    am2.RegisterAppAttempt();

    // allocate request returns no updated node
    AllocateRequest allocateRequest2 = AllocateRequest.NewInstance(0, 0F, null, null, null);
    AllocateResponse response2 = Allocate(attempt2.GetAppAttemptId(), allocateRequest2);
    updatedNodes = response2.GetUpdatedNodes();
    NUnit.Framework.Assert.AreEqual(0, updatedNodes.Count);

    // nm4 heartbeats with health == true
    SyncNodeHeartbeat(nm4, true);

    // both AM's should get delta updated nodes
    allocateRequest1 = AllocateRequest.NewInstance(response1.GetResponseId(), 0F, null, null, null);
    response1 = Allocate(attempt1.GetAppAttemptId(), allocateRequest1);
    AssertSingleUpdatedNode(response1, nm4, NodeState.Running);

    allocateRequest2 = AllocateRequest.NewInstance(response2.GetResponseId(), 0F, null, null, null);
    response2 = Allocate(attempt2.GetAppAttemptId(), allocateRequest2);
    AssertSingleUpdatedNode(response2, nm4, NodeState.Running);

    // subsequent allocate calls should return no updated nodes
    allocateRequest2 = AllocateRequest.NewInstance(response2.GetResponseId(), 0F, null, null, null);
    response2 = Allocate(attempt2.GetAppAttemptId(), allocateRequest2);
    updatedNodes = response2.GetUpdatedNodes();
    NUnit.Framework.Assert.AreEqual(0, updatedNodes.Count);
}

/// <summary>
/// Asserts that <paramref name="response"/> carries exactly one updated-node
/// report, and that the report names <paramref name="expectedNode"/> with
/// state <paramref name="expectedState"/>.
/// </summary>
/// <param name="response">Allocate response whose updated nodes are checked.</param>
/// <param name="expectedNode">Mock node whose id the single report must carry.</param>
/// <param name="expectedState">Node state the single report must carry.</param>
private static void AssertSingleUpdatedNode(AllocateResponse response, MockNM expectedNode, NodeState expectedState)
{
    IList<NodeReport> updatedNodes = response.GetUpdatedNodes();
    NUnit.Framework.Assert.AreEqual(1, updatedNodes.Count);

    // Count == 1 was just asserted, so index 0 is the only report. The
    // indexer is used because IEnumerator<T> has no Next() member.
    NodeReport report = updatedNodes[0];
    NUnit.Framework.Assert.AreEqual(expectedNode.GetNodeId(), report.GetNodeId());
    NUnit.Framework.Assert.AreEqual(expectedState, report.GetNodeState());
}