/// <summary>
/// Regular RM restart/failover: the new RM must not count the resulting AM
/// failure towards the max-attempt limit and must be able to re-launch the AM,
/// even though the limit is configured as 1.
/// </summary>
/// <exception cref="System.Exception"/>
public virtual void TestRMRestartOrFailoverNotCountedForAMFailures()
{
    YarnConfiguration yarnConf = new YarnConfiguration();
    yarnConf.SetClass(YarnConfiguration.RmScheduler, typeof(CapacityScheduler),
        typeof(ResourceScheduler));
    yarnConf.SetBoolean(YarnConfiguration.RecoveryEnabled, true);
    yarnConf.SetBoolean(YarnConfiguration.RmWorkPreservingRecoveryEnabled, false);
    yarnConf.Set(YarnConfiguration.RmStore, typeof(MemoryRMStateStore).FullName);
    // Only a single AM attempt is allowed.
    yarnConf.SetInt(YarnConfiguration.RmAmMaxAttempts, 1);

    MemoryRMStateStore stateStore = new MemoryRMStateStore();
    stateStore.Init(yarnConf);

    MockRM firstRm = new MockRM(yarnConf, stateStore);
    firstRm.Start();
    MockNM node = new MockNM("127.0.0.1:1234", 8000, firstRm.GetResourceTrackerService());
    node.RegisterNode();

    RMApp app = firstRm.SubmitApp(200);
    // The AM is expected to be restarted later even though max-am-attempt is 1.
    MockAM firstAm = MockRM.LaunchAndRegisterAM(app, firstRm, node);
    RMAppAttempt firstAttempt = app.GetCurrentAppAttempt();
    RMAppAttemptImpl firstAttemptImpl = (RMAppAttemptImpl)firstAttempt;
    NUnit.Framework.Assert.IsTrue(firstAttemptImpl.MayBeLastAttempt());

    // Bring up a second RM on the same state store to simulate the restart.
    MockRM secondRm = new MockRM(yarnConf, stateStore);
    secondRm.Start();
    ApplicationStateData storedApp =
        stateStore.GetState().GetApplicationState()[app.GetApplicationId()];

    // Re-register the NM, reporting the old AM container as killed by the RM.
    node.SetResourceTrackerService(secondRm.GetResourceTrackerService());
    NMContainerStatus amContainerStatus =
        Org.Apache.Hadoop.Yarn.Util.Records.NewRecord<NMContainerStatus>();
    amContainerStatus.SetContainerExitStatus(ContainerExitStatus.KilledByResourcemanager);
    amContainerStatus.SetContainerId(firstAttempt.GetMasterContainer().GetId());
    amContainerStatus.SetContainerState(ContainerState.Complete);
    amContainerStatus.SetDiagnostics(string.Empty);
    node.RegisterNode(Sharpen.Collections.SingletonList(amContainerStatus), null);

    secondRm.WaitForState(firstAttempt.GetAppAttemptId(), RMAppAttemptState.Failed);
    NUnit.Framework.Assert.AreEqual(
        ContainerExitStatus.KilledByResourcemanager,
        storedApp.GetAttempt(firstAm.GetApplicationAttemptId()).GetAMContainerExitStatus());

    // The second RM starts a new attempt on its own.
    secondRm.WaitForState(app.GetApplicationId(), RMAppState.Accepted);
    MockAM secondAm =
        secondRm.WaitForNewAMToLaunchAndRegister(app.GetApplicationId(), 2, node);
    MockRM.FinishAMAndVerifyAppState(app, secondRm, node, secondAm);

    RMAppAttempt latestAttempt =
        secondRm.GetRMContext().GetRMApps()[app.GetApplicationId()].GetCurrentAppAttempt();
    NUnit.Framework.Assert.IsTrue(latestAttempt.ShouldCountTowardsMaxAttemptRetry());
    NUnit.Framework.Assert.AreEqual(
        ContainerExitStatus.Invalid,
        storedApp.GetAttempt(secondAm.GetApplicationAttemptId()).GetAMContainerExitStatus());

    firstRm.Stop();
    secondRm.Stop();
}
/// <summary>
/// Tests the delegation-token master key kept in the state-store while the
/// master key is being rolled: every key still held in memory by the secret
/// manager must also be present in the state-store.
/// </summary>
/// <exception cref="System.Exception"/>
public virtual void TestRMDTMasterKeyStateOnRollingMasterKey()
{
    MemoryRMStateStore memStore = new MemoryRMStateStore();
    memStore.Init(conf);
    RMStateStore.RMState rmState = memStore.GetState();
    IDictionary<RMDelegationTokenIdentifier, long> rmDTState =
        rmState.GetRMDTSecretManagerState().GetTokenState();
    ICollection<DelegationKey> rmDTMasterKeyState =
        rmState.GetRMDTSecretManagerState().GetMasterKeyState();

    MockRM rm1 = new TestRMDelegationTokens.MyMockRM(this, conf, memStore);
    rm1.Start();

    // on rm start, two master keys are created.
    // One is created at RMDTSecretMgr.startThreads.updateCurrentKey();
    // the other is created on the first run of
    // tokenRemoverThread.rollMasterKey()
    RMDelegationTokenSecretManager dtSecretManager =
        rm1.GetRMContext().GetRMDelegationTokenSecretManager();

    // assert all master keys are saved
    NUnit.Framework.Assert.AreEqual(dtSecretManager.GetAllMasterKeys(), rmDTMasterKeyState);

    // request to generate a RMDelegationToken
    GetDelegationTokenRequest request =
        Org.Mockito.Mockito.Mock<GetDelegationTokenRequest>();
    Org.Mockito.Mockito.When(request.GetRenewer()).ThenReturn("renewer1");
    GetDelegationTokenResponse response =
        rm1.GetClientRMService().GetDelegationToken(request);
    Org.Apache.Hadoop.Yarn.Api.Records.Token delegationToken =
        response.GetRMDelegationToken();
    Org.Apache.Hadoop.Security.Token.Token<RMDelegationTokenIdentifier> token1 =
        ConverterUtils.ConvertFromYarn(delegationToken, (Text)null);
    RMDelegationTokenIdentifier dtId1 = token1.DecodeIdentifier();

    // For all keys that still remain in memory, we should have them stored
    // in state-store also. The downcast is done once instead of on every
    // loop iteration.
    TestRMDelegationTokens.TestRMDelegationTokenSecretManager testSecretManager =
        (TestRMDelegationTokens.TestRMDelegationTokenSecretManager)dtSecretManager;
    while (testSecretManager.numUpdatedKeys.Get() < 3)
    {
        testSecretManager.CheckCurrentKeyInStateStore(rmDTMasterKeyState);
        Sharpen.Thread.Sleep(100);
    }

    // wait for token to expire and remove from state-store
    // rollMasterKey is called every 1 second.
    // ContainsKey is the key lookup on IDictionary; Contains would expect a
    // KeyValuePair. The wait is bounded to 100 polls of 100 ms.
    int count = 0;
    while (rmDTState.ContainsKey(dtId1) && count < 100)
    {
        Sharpen.Thread.Sleep(100);
        count++;
    }

    rm1.Stop();
}
/// <summary>
/// Fails an application's only AM attempt, waits for the app cleanup message
/// on the NM, then re-registers the NM while still listing the application and
/// waits for the cleanup message again.
/// </summary>
/// <exception cref="System.Exception"/>
public virtual void TestAppCleanupWhenNMReconnects()
{
    conf.SetInt(YarnConfiguration.RmAmMaxAttempts, 1);
    MemoryRMStateStore stateStore = new MemoryRMStateStore();
    stateStore.Init(conf);

    // Bring up the RM and a single NM.
    MockRM resourceManager = new MockRM(conf, stateStore);
    resourceManager.Start();
    MockNM node = new MockNM("127.0.0.1:1234", 15120,
        resourceManager.GetResourceTrackerService());
    node.RegisterNode();

    // Submit an application and launch its AM.
    RMApp app = resourceManager.SubmitApp(200);
    MockAM am = LaunchAM(app, resourceManager, node);

    // Report container 1 as complete and wait for the app to fail.
    node.NodeHeartbeat(am.GetApplicationAttemptId(), 1, ContainerState.Complete);
    resourceManager.WaitForState(app.GetApplicationId(), RMAppState.Failed);

    // The NM should be told to clean up the application.
    WaitForAppCleanupMessageRecved(node, app.GetApplicationId());

    // Reconnect the NM while it still reports the application as active;
    // the cleanup message is expected once more.
    node.RegisterNode(Arrays.AsList(app.GetApplicationId()));
    WaitForAppCleanupMessageRecved(node, app.GetApplicationId());

    resourceManager.Stop();
}
/// <summary>
/// RM restart after the AM container was preempted: the new RM must not count
/// the preemption failure towards the max-attempt limit and must be able to
/// re-launch the AM.
/// </summary>
/// <exception cref="System.Exception"/>
public virtual void TestPreemptedAMRestartOnRMRestart()
{
    YarnConfiguration yarnConf = new YarnConfiguration();
    yarnConf.SetClass(YarnConfiguration.RmScheduler, typeof(CapacityScheduler),
        typeof(ResourceScheduler));
    yarnConf.SetBoolean(YarnConfiguration.RecoveryEnabled, true);
    yarnConf.SetBoolean(YarnConfiguration.RmWorkPreservingRecoveryEnabled, false);
    yarnConf.Set(YarnConfiguration.RmStore, typeof(MemoryRMStateStore).FullName);
    // Only a single AM attempt is allowed.
    yarnConf.SetInt(YarnConfiguration.RmAmMaxAttempts, 1);

    MemoryRMStateStore stateStore = new MemoryRMStateStore();
    stateStore.Init(yarnConf);

    MockRM firstRm = new MockRM(yarnConf, stateStore);
    firstRm.Start();
    MockNM node = new MockNM("127.0.0.1:1234", 8000, firstRm.GetResourceTrackerService());
    node.RegisterNode();

    RMApp app = firstRm.SubmitApp(200);
    RMAppAttempt firstAttempt = app.GetCurrentAppAttempt();
    MockAM firstAm = MockRM.LaunchAndRegisterAM(app, firstRm, node);

    CapacityScheduler capacityScheduler = (CapacityScheduler)firstRm.GetResourceScheduler();
    ContainerId amContainerId =
        ContainerId.NewContainerId(firstAm.GetApplicationAttemptId(), 1);

    // Forcibly preempt the AM container.
    RMContainer amRmContainer = capacityScheduler.GetRMContainer(amContainerId);
    capacityScheduler.KillContainer(amRmContainer);
    firstAm.WaitForState(RMAppAttemptState.Failed);
    NUnit.Framework.Assert.IsFalse(firstAttempt.ShouldCountTowardsMaxAttemptRetry());
    firstRm.WaitForState(app.GetApplicationId(), RMAppState.Accepted);

    // The state store holds exactly one attempt ...
    ApplicationStateData storedApp =
        stateStore.GetState().GetApplicationState()[app.GetApplicationId()];
    NUnit.Framework.Assert.AreEqual(1, storedApp.GetAttemptCount());
    // ... and that attempt carries the preempted exit status.
    NUnit.Framework.Assert.AreEqual(
        ContainerExitStatus.Preempted,
        storedApp.GetAttempt(firstAm.GetApplicationAttemptId()).GetAMContainerExitStatus());

    // Restart the RM. The NM is pointed at, and registered with, the new RM
    // before that RM is started; this order is deliberate and kept as-is.
    MockRM secondRm = new MockRM(yarnConf, stateStore);
    node.SetResourceTrackerService(secondRm.GetResourceTrackerService());
    node.RegisterNode();
    secondRm.Start();

    // The restarted RM should re-launch the AM.
    MockAM secondAm =
        secondRm.WaitForNewAMToLaunchAndRegister(app.GetApplicationId(), 2, node);
    MockRM.FinishAMAndVerifyAppState(app, secondRm, node, secondAm);

    RMAppAttempt secondAttempt =
        secondRm.GetRMContext().GetRMApps()[app.GetApplicationId()].GetCurrentAppAttempt();
    NUnit.Framework.Assert.IsTrue(secondAttempt.ShouldCountTowardsMaxAttemptRetry());
    NUnit.Framework.Assert.AreEqual(
        ContainerExitStatus.Invalid,
        storedApp.GetAttempt(secondAm.GetApplicationAttemptId()).GetAMContainerExitStatus());

    firstRm.Stop();
    secondRm.Stop();
}
/// <summary>
/// Restarts the RM while an application with containers on two NMs is still
/// running, re-registers both NMs with the new RM (reporting the AM container
/// as complete), and waits for the app cleanup message on each NM.
/// </summary>
/// <exception cref="System.Exception"/>
public virtual void TestAppCleanupWhenRMRestartedBeforeAppFinished()
{
    conf.SetInt(YarnConfiguration.RmAmMaxAttempts, 1);
    MemoryRMStateStore stateStore = new MemoryRMStateStore();
    stateStore.Init(conf);

    // Start the first RM with two NMs.
    MockRM firstRm = new MockRM(conf, stateStore);
    firstRm.Start();
    MockNM firstNode = new MockNM("127.0.0.1:1234", 1024,
        firstRm.GetResourceTrackerService());
    firstNode.RegisterNode();
    MockNM secondNode = new MockNM("127.0.0.1:5678", 1024,
        firstRm.GetResourceTrackerService());
    secondNode.RegisterNode();

    // Submit an application and launch its AM on the first NM.
    RMApp app = firstRm.SubmitApp(200);
    MockAM am = LaunchAM(app, firstRm, firstNode);

    // Ask for one more container and keep heartbeating the second NM until
    // the allocation shows up.
    AllocateResponse response = am.Allocate(
        Arrays.AsList(ResourceRequest.NewInstance(
            Priority.NewInstance(1), "*", Resource.NewInstance(1024, 0), 1)),
        null);
    while (true)
    {
        if (response.GetAllocatedContainers() != null
            && !response.GetAllocatedContainers().IsEmpty())
        {
            break;
        }
        secondNode.NodeHeartbeat(true);
        response = am.Allocate(null, null);
        Sharpen.Thread.Sleep(1000);
    }

    // Start the replacement RM on the same state store.
    MockRM secondRm = new MockRM(conf, stateStore);
    secondRm.Start();

    // Both NMs register with the new RM; the first one reports the AM
    // container (container 1) as complete.
    firstNode.SetResourceTrackerService(secondRm.GetResourceTrackerService());
    firstNode.RegisterNode(
        Arrays.AsList(NMContainerStatus.NewInstance(
            ContainerId.NewContainerId(am.GetApplicationAttemptId(), 1),
            ContainerState.Complete,
            Resource.NewInstance(1024, 1),
            string.Empty,
            0,
            Priority.NewInstance(0),
            1234)),
        Arrays.AsList(app.GetApplicationId()));
    secondNode.SetResourceTrackerService(secondRm.GetResourceTrackerService());
    secondNode.RegisterNode(Arrays.AsList(app.GetApplicationId()));

    // The application should end up failed on the new RM.
    secondRm.WaitForState(app.GetApplicationId(), RMAppState.Failed);

    // Each NM should receive the application cleanup message.
    WaitForAppCleanupMessageRecved(firstNode, app.GetApplicationId());
    WaitForAppCleanupMessageRecved(secondNode, app.GetApplicationId());

    firstRm.Stop();
    secondRm.Stop();
}
/// <summary>
/// Verifies that a failed transition to active on a freshly created RM leaves
/// the RM context cleared: after one RM has been made active (1 node, 1 app),
/// a second RM whose transition to active throws must report zeroed cluster
/// metrics and empty node/app maps.
/// </summary>
public virtual void TestFailoverClearsRMContext()
{
    configuration.SetBoolean(YarnConfiguration.AutoFailoverEnabled, false);
    configuration.SetBoolean(YarnConfiguration.RecoveryEnabled, true);
    Configuration conf = new YarnConfiguration(configuration);
    MemoryRMStateStore memStore = new MemoryRMStateStore();
    memStore.Init(conf);

    // 1. start RM
    rm = new MockRM(conf, memStore);
    rm.Init(conf);
    rm.Start();
    HAServiceProtocol.StateChangeRequestInfo requestInfo =
        new HAServiceProtocol.StateChangeRequestInfo(
            HAServiceProtocol.RequestSource.RequestByUser);
    CheckMonitorHealth();
    CheckStandbyRMFunctionality();

    // 2. Transition to active
    rm.adminService.TransitionToActive(requestInfo);
    CheckMonitorHealth();
    CheckActiveRMFunctionality();
    VerifyClusterMetrics(1, 1, 1, 1, 2048, 1);
    NUnit.Framework.Assert.AreEqual(1, rm.GetRMContext().GetRMNodes().Count);
    NUnit.Framework.Assert.AreEqual(1, rm.GetRMContext().GetRMApps().Count);

    // 3. Create new RM
    rm = new _MockRM_550(conf, memStore);
    rm.Init(conf);
    rm.Start();
    CheckMonitorHealth();
    CheckStandbyRMFunctionality();

    // 4. Try Transition to active, throw exception.
    // The exception is captured and checked outside the try block: calling
    // Assert.Fail inside it would be caught by the catch-all handler below
    // (NUnit reports failures via an exception) and hide the real reason.
    Exception transitionFailure = null;
    try
    {
        rm.adminService.TransitionToActive(requestInfo);
    }
    catch (Exception e)
    {
        transitionFailure = e;
    }
    NUnit.Framework.Assert.IsNotNull(transitionFailure,
        "Transitioned to Active should throw exception.");
    // The thrown message must contain the expected text (not the other way
    // round, which would also accept an empty message).
    NUnit.Framework.Assert.IsTrue(
        transitionFailure.Message.Contains("Error when transitioning to Active mode"));

    // 5. Clears the metrics
    VerifyClusterMetrics(0, 0, 0, 0, 0, 0);
    NUnit.Framework.Assert.AreEqual(0, rm.GetRMContext().GetRMNodes().Count);
    NUnit.Framework.Assert.AreEqual(0, rm.GetRMContext().GetRMApps().Count);
}
/// <summary>
/// Tests that all expired master keys are removed from the state-store: the
/// keys known right after RM start are snapshotted, then the test polls until
/// none of them is present in the stored master-key state any more.
/// </summary>
/// <exception cref="System.Exception"/>
public virtual void TestRemoveExpiredMasterKeyInRMStateStore()
{
    MemoryRMStateStore memStore = new MemoryRMStateStore();
    memStore.Init(conf);
    RMStateStore.RMState rmState = memStore.GetState();
    ICollection<DelegationKey> rmDTMasterKeyState =
        rmState.GetRMDTSecretManagerState().GetMasterKeyState();

    MockRM rm1 = new TestRMDelegationTokens.MyMockRM(this, conf, memStore);
    rm1.Start();
    try
    {
        RMDelegationTokenSecretManager dtSecretManager =
            rm1.GetRMContext().GetRMDelegationTokenSecretManager();

        // assert all master keys are saved
        NUnit.Framework.Assert.AreEqual(dtSecretManager.GetAllMasterKeys(),
            rmDTMasterKeyState);

        // Snapshot the keys that exist now; these are the ones expected to expire.
        ICollection<DelegationKey> expiringKeys = new HashSet<DelegationKey>();
        Sharpen.Collections.AddAll(expiringKeys, dtSecretManager.GetAllMasterKeys());

        // wait for expiringKeys to expire
        // NOTE(review): this wait has no upper bound; it relies on an external
        // test timeout if the keys never expire.
        while (true)
        {
            bool allExpired = true;
            foreach (DelegationKey key in expiringKeys)
            {
                if (rmDTMasterKeyState.Contains(key))
                {
                    // One live key is enough to know we must keep waiting.
                    allExpired = false;
                    break;
                }
            }
            if (allExpired)
            {
                break;
            }
            Sharpen.Thread.Sleep(500);
        }
    }
    finally
    {
        // Always stop the RM started above so it does not leak into other
        // tests, even when an assertion fails.
        rm1.Stop();
    }
}
/// <summary>
/// Restarts the RM while an application is running, re-registers the NM with
/// the new RM, then reports a container the new RM's scheduler does not know
/// about and waits for the corresponding container cleanup.
/// </summary>
/// <exception cref="System.Exception"/>
public virtual void TestContainerCleanupWhenRMRestartedAppNotRegistered()
{
    conf.SetInt(YarnConfiguration.RmAmMaxAttempts, 1);
    MemoryRMStateStore stateStore = new MemoryRMStateStore();
    stateStore.Init(conf);

    // Start the first RM with its own drain dispatcher.
    DrainDispatcher firstDispatcher = new DrainDispatcher();
    MockRM firstRm = new _MockRM_413(firstDispatcher, conf, stateStore);
    firstRm.Start();
    MockNM node = new MockNM("127.0.0.1:1234", 15120,
        firstRm.GetResourceTrackerService());
    node.RegisterNode();

    // Submit an application, launch its AM and get the app running.
    RMApp app = firstRm.SubmitApp(200);
    MockAM am = LaunchAM(app, firstRm, node);
    node.NodeHeartbeat(am.GetApplicationAttemptId(), 1, ContainerState.Running);
    firstRm.WaitForState(app.GetApplicationId(), RMAppState.Running);

    // Start the replacement RM on the same state store.
    DrainDispatcher secondDispatcher = new DrainDispatcher();
    MockRM secondRm = new _MockRM_432(secondDispatcher, conf, stateStore);
    secondRm.Start();

    // The NM registers with the new RM, still listing the application.
    node.SetResourceTrackerService(secondRm.GetResourceTrackerService());
    node.RegisterNode(Arrays.AsList(app.GetApplicationId()));
    secondRm.WaitForState(app.GetApplicationId(), RMAppState.Accepted);

    // Report a container (id 2) for an application unknown to the scheduler.
    NodeHeartbeatResponse heartbeatResponse =
        node.NodeHeartbeat(am.GetApplicationAttemptId(), 2, ContainerState.Running);
    WaitForContainerCleanup(secondDispatcher, node, heartbeatResponse);

    firstRm.Stop();
    secondRm.Stop();
}
/// <summary>
/// Exercises the attempt-failures validity interval with max attempts set to 2.
/// With a 20s window, two consecutive failures fail the application. With a 6s
/// window and a controlled clock advanced past the window between failures,
/// further attempts can still be launched, including across an RM restart;
/// a failure once the clock has been reset finally fails the application.
/// </summary>
/// <exception cref="System.Exception"/>
public virtual void TestRMAppAttemptFailuresValidityInterval()
{
    YarnConfiguration yarnConf = new YarnConfiguration();
    yarnConf.SetClass(YarnConfiguration.RmScheduler, typeof(CapacityScheduler),
        typeof(ResourceScheduler));
    yarnConf.SetBoolean(YarnConfiguration.RecoveryEnabled, true);
    yarnConf.SetBoolean(YarnConfiguration.RmWorkPreservingRecoveryEnabled, false);
    yarnConf.Set(YarnConfiguration.RmStore, typeof(MemoryRMStateStore).FullName);
    // Two AM attempts are allowed.
    yarnConf.SetInt(YarnConfiguration.RmAmMaxAttempts, 2);

    MemoryRMStateStore stateStore = new MemoryRMStateStore();
    stateStore.Init(yarnConf);
    MockRM firstRm = new MockRM(yarnConf, stateStore);
    firstRm.Start();
    MockNM node = new MockNM("127.0.0.1:1234", 8000, firstRm.GetResourceTrackerService());
    node.RegisterNode();

    // Part 1: a large window (20s). Two consecutive failures inside the
    // window must fail the application.
    RMApp wideWindowApp = firstRm.SubmitApp(200, 20000);
    MockAM wideAm1 = MockRM.LaunchAM(wideWindowApp, firstRm, node);
    // Fail the current attempt normally.
    node.NodeHeartbeat(wideAm1.GetApplicationAttemptId(), 1, ContainerState.Complete);
    wideAm1.WaitForState(RMAppAttemptState.Failed);

    // The second attempt gets launched.
    firstRm.WaitForState(wideWindowApp.GetApplicationId(), RMAppState.Accepted);
    NUnit.Framework.Assert.AreEqual(2, wideWindowApp.GetAppAttempts().Count);
    RMAppAttemptImpl wideSecondAttempt =
        (RMAppAttemptImpl)wideWindowApp.GetCurrentAppAttempt();
    NUnit.Framework.Assert.IsTrue(wideSecondAttempt.MayBeLastAttempt());
    MockAM wideAm2 = MockRM.LaunchAndRegisterAM(wideWindowApp, firstRm, node);
    wideAm2.WaitForState(RMAppAttemptState.Running);
    node.NodeHeartbeat(wideAm2.GetApplicationAttemptId(), 1, ContainerState.Complete);
    wideAm2.WaitForState(RMAppAttemptState.Failed);
    // The application itself should now be failed.
    firstRm.WaitForState(wideWindowApp.GetApplicationId(), RMAppState.Failed);

    // Part 2: a 6s window driven by a controlled clock.
    ControlledClock testClock = new ControlledClock(new SystemClock());
    RMAppImpl app = (RMAppImpl)firstRm.SubmitApp(200, 6000);
    app.SetSystemClock(testClock);
    MockAM am1 = MockRM.LaunchAndRegisterAM(app, firstRm, node);
    // Fail attempt 1 normally.
    node.NodeHeartbeat(am1.GetApplicationAttemptId(), 1, ContainerState.Complete);
    am1.WaitForState(RMAppAttemptState.Failed);

    // The second attempt gets launched.
    firstRm.WaitForState(app.GetApplicationId(), RMAppState.Accepted);
    NUnit.Framework.Assert.AreEqual(2, app.GetAppAttempts().Count);
    RMAppAttempt secondAttempt = app.GetCurrentAppAttempt();
    NUnit.Framework.Assert.IsTrue(((RMAppAttemptImpl)secondAttempt).MayBeLastAttempt());
    MockAM am2 = MockRM.LaunchAndRegisterAM(app, firstRm, node);
    am2.WaitForState(RMAppAttemptState.Running);

    // Move the clock 6 seconds ahead instead of actually waiting.
    testClock.SetTime(Runtime.CurrentTimeMillis() + 6 * 1000);
    // Fail attempt 2 normally.
    node.NodeHeartbeat(am2.GetApplicationAttemptId(), 1, ContainerState.Complete);
    am2.WaitForState(RMAppAttemptState.Failed);

    // A third attempt can be launched successfully.
    firstRm.WaitForState(app.GetApplicationId(), RMAppState.Accepted);
    NUnit.Framework.Assert.AreEqual(3, app.GetAppAttempts().Count);
    RMAppAttempt thirdAttempt = app.GetCurrentAppAttempt();
    testClock.Reset();
    MockAM am3 = MockRM.LaunchAndRegisterAM(app, firstRm, node);
    am3.WaitForState(RMAppAttemptState.Running);

    // Part 3: restart the RM.
    MockRM secondRm = new MockRM(yarnConf, stateStore);
    secondRm.Start();

    // Re-register the NM, reporting attempt 3's AM container as killed by the RM.
    node.SetResourceTrackerService(secondRm.GetResourceTrackerService());
    NMContainerStatus amContainerStatus =
        Org.Apache.Hadoop.Yarn.Util.Records.NewRecord<NMContainerStatus>();
    amContainerStatus.SetContainerExitStatus(ContainerExitStatus.KilledByResourcemanager);
    amContainerStatus.SetContainerId(thirdAttempt.GetMasterContainer().GetId());
    amContainerStatus.SetContainerState(ContainerState.Complete);
    amContainerStatus.SetDiagnostics(string.Empty);
    node.RegisterNode(Sharpen.Collections.SingletonList(amContainerStatus), null);

    secondRm.WaitForState(thirdAttempt.GetAppAttemptId(), RMAppAttemptState.Failed);
    secondRm.WaitForState(app.GetApplicationId(), RMAppState.Accepted);

    // Launch attempt 4.
    MockAM am4 = secondRm.WaitForNewAMToLaunchAndRegister(app.GetApplicationId(), 4, node);

    // Move the clock 6 seconds ahead again.
    testClock.SetTime(Runtime.CurrentTimeMillis() + 6 * 1000);
    // Fail attempt 4 normally.
    node.NodeHeartbeat(am4.GetApplicationAttemptId(), 1, ContainerState.Complete);
    am4.WaitForState(RMAppAttemptState.Failed);

    // A fifth attempt can be launched successfully.
    secondRm.WaitForState(app.GetApplicationId(), RMAppState.Accepted);
    MockAM am5 = secondRm.WaitForNewAMToLaunchAndRegister(app.GetApplicationId(), 5, node);
    testClock.Reset();
    am5.WaitForState(RMAppAttemptState.Running);

    // Fail attempt 5 normally; this time the application fails.
    node.NodeHeartbeat(am5.GetApplicationAttemptId(), 1, ContainerState.Complete);
    am5.WaitForState(RMAppAttemptState.Failed);
    secondRm.WaitForState(app.GetApplicationId(), RMAppState.Failed);

    firstRm.Stop();
    secondRm.Stop();
}
/// <summary>
/// AM failures caused by container preemption, NM disk failure or an
/// unhealthy NM (aborted container) must not be counted towards the AM max
/// retry count. With max attempts set to 1 the AM is relaunched after each
/// of those failures, and only a normal AM failure finally fails the app.
/// </summary>
/// <exception cref="System.Exception"/>
public virtual void TestShouldNotCountFailureToMaxAttemptRetry()
{
    YarnConfiguration yarnConf = new YarnConfiguration();
    yarnConf.SetClass(YarnConfiguration.RmScheduler, typeof(CapacityScheduler),
        typeof(ResourceScheduler));
    // Only a single AM attempt is allowed.
    yarnConf.SetInt(YarnConfiguration.RmAmMaxAttempts, 1);
    yarnConf.SetBoolean(YarnConfiguration.RecoveryEnabled, true);
    yarnConf.Set(YarnConfiguration.RmStore, typeof(MemoryRMStateStore).FullName);

    MemoryRMStateStore stateStore = new MemoryRMStateStore();
    stateStore.Init(yarnConf);
    MockRM resourceManager = new MockRM(yarnConf, stateStore);
    resourceManager.Start();
    MockNM firstNode = new MockNM("127.0.0.1:1234", 8000,
        resourceManager.GetResourceTrackerService());
    firstNode.RegisterNode();

    RMApp app = resourceManager.SubmitApp(200);
    RMAppAttempt firstAttempt = app.GetCurrentAppAttempt();
    MockAM firstAm = MockRM.LaunchAndRegisterAM(app, resourceManager, firstNode);
    CapacityScheduler capacityScheduler =
        (CapacityScheduler)resourceManager.GetResourceScheduler();

    // Attempt 1: preempt the AM container.
    ContainerId firstAmContainer =
        ContainerId.NewContainerId(firstAm.GetApplicationAttemptId(), 1);
    capacityScheduler.KillContainer(capacityScheduler.GetRMContainer(firstAmContainer));
    firstAm.WaitForState(RMAppAttemptState.Failed);
    NUnit.Framework.Assert.IsFalse(firstAttempt.ShouldCountTowardsMaxAttemptRetry());
    resourceManager.WaitForState(app.GetApplicationId(), RMAppState.Accepted);
    ApplicationStateData storedApp =
        stateStore.GetState().GetApplicationState()[app.GetApplicationId()];

    // The AM is restarted even though max-am-attempt is 1.
    MockAM secondAm =
        resourceManager.WaitForNewAMToLaunchAndRegister(app.GetApplicationId(), 2, firstNode);
    RMAppAttempt secondAttempt = app.GetCurrentAppAttempt();
    NUnit.Framework.Assert.IsTrue(((RMAppAttemptImpl)secondAttempt).MayBeLastAttempt());

    // Attempt 2: preempt the AM container again.
    ContainerId secondAmContainer =
        ContainerId.NewContainerId(secondAm.GetApplicationAttemptId(), 1);
    capacityScheduler.KillContainer(capacityScheduler.GetRMContainer(secondAmContainer));
    secondAm.WaitForState(RMAppAttemptState.Failed);
    NUnit.Framework.Assert.IsFalse(secondAttempt.ShouldCountTowardsMaxAttemptRetry());
    resourceManager.WaitForState(app.GetApplicationId(), RMAppState.Accepted);
    MockAM thirdAm =
        resourceManager.WaitForNewAMToLaunchAndRegister(app.GetApplicationId(), 3, firstNode);
    RMAppAttempt thirdAttempt = app.GetCurrentAppAttempt();
    NUnit.Framework.Assert.IsTrue(((RMAppAttemptImpl)thirdAttempt).MayBeLastAttempt());

    // Attempt 3: mimic an NM disk failure on the AM container.
    ContainerStatus diskFailureStatus =
        Org.Apache.Hadoop.Yarn.Util.Records.NewRecord<ContainerStatus>();
    diskFailureStatus.SetContainerId(thirdAttempt.GetMasterContainer().GetId());
    diskFailureStatus.SetDiagnostics("mimic NM disk_failure");
    diskFailureStatus.SetState(ContainerState.Complete);
    diskFailureStatus.SetExitStatus(ContainerExitStatus.DisksFailed);
    IDictionary<ApplicationId, IList<ContainerStatus>> statusesByApp =
        new Dictionary<ApplicationId, IList<ContainerStatus>>();
    statusesByApp[app.GetApplicationId()] =
        Sharpen.Collections.SingletonList(diskFailureStatus);
    firstNode.NodeHeartbeat(statusesByApp, true);
    thirdAm.WaitForState(RMAppAttemptState.Failed);
    NUnit.Framework.Assert.IsFalse(thirdAttempt.ShouldCountTowardsMaxAttemptRetry());
    NUnit.Framework.Assert.AreEqual(
        ContainerExitStatus.DisksFailed,
        storedApp.GetAttempt(thirdAm.GetApplicationAttemptId()).GetAMContainerExitStatus());
    resourceManager.WaitForState(app.GetApplicationId(), RMAppState.Accepted);
    MockAM fourthAm =
        resourceManager.WaitForNewAMToLaunchAndRegister(app.GetApplicationId(), 4, firstNode);
    RMAppAttempt fourthAttempt = app.GetCurrentAppAttempt();
    NUnit.Framework.Assert.IsTrue(((RMAppAttemptImpl)fourthAttempt).MayBeLastAttempt());

    // Attempt 4: register a second NM, then let the first NM report itself
    // unhealthy. This mimics ContainerExitStatus.ABORT.
    MockNM secondNode = new MockNM("127.0.0.1:2234", 8000,
        resourceManager.GetResourceTrackerService());
    secondNode.RegisterNode();
    firstNode.NodeHeartbeat(false);
    fourthAm.WaitForState(RMAppAttemptState.Failed);
    NUnit.Framework.Assert.IsFalse(fourthAttempt.ShouldCountTowardsMaxAttemptRetry());
    NUnit.Framework.Assert.AreEqual(
        ContainerExitStatus.Aborted,
        storedApp.GetAttempt(fourthAm.GetApplicationAttemptId()).GetAMContainerExitStatus());

    // Attempt 5: launch the next AM on the second NM ...
    secondNode.NodeHeartbeat(true);
    MockAM fifthAm =
        resourceManager.WaitForNewAMToLaunchAndRegister(app.GetApplicationId(), 5, secondNode);
    RMAppAttempt fifthAttempt = app.GetCurrentAppAttempt();
    NUnit.Framework.Assert.IsTrue(((RMAppAttemptImpl)fifthAttempt).MayBeLastAttempt());
    // ... and fail it normally; this failure does count.
    secondNode.NodeHeartbeat(fifthAm.GetApplicationAttemptId(), 1, ContainerState.Complete);
    fifthAm.WaitForState(RMAppAttemptState.Failed);
    NUnit.Framework.Assert.IsTrue(fifthAttempt.ShouldCountTowardsMaxAttemptRetry());

    // The AM must not be restarted any more.
    resourceManager.WaitForState(app.GetApplicationId(), RMAppState.Failed);
    NUnit.Framework.Assert.AreEqual(5, app.GetAppAttempts().Count);

    resourceManager.Stop();
}
/// <summary>
/// Runs an application with an AM and two additional containers, completes
/// all of them, and checks that the memory-seconds / vcore-seconds summed
/// over the containers match the application's metrics. It then starts a new
/// RM on the same state store and checks that the metrics are unchanged after
/// the restart.
/// </summary>
/// <exception cref="System.Exception"/>
public virtual void TestUsageWithMultipleContainersAndRMRestart()
{
    // Set max attempts to 1 so that when the first attempt fails, the app
    // won't try to start a new one.
    conf.SetInt(YarnConfiguration.RmAmMaxAttempts, 1);
    conf.SetBoolean(YarnConfiguration.RecoveryEnabled, true);
    conf.SetBoolean(YarnConfiguration.RmWorkPreservingRecoveryEnabled, false);
    MemoryRMStateStore memStore = new MemoryRMStateStore();
    memStore.Init(conf);

    MockRM rm0 = new MockRM(conf, memStore);
    rm0.Start();
    MockNM nm = new MockNM("127.0.0.1:1234", 65536, rm0.GetResourceTrackerService());
    nm.RegisterNode();

    RMApp app0 = rm0.SubmitApp(200);
    rm0.WaitForState(app0.GetApplicationId(), RMAppState.Accepted);
    RMAppAttempt attempt0 = app0.GetCurrentAppAttempt();
    ApplicationAttemptId attemptId0 = attempt0.GetAppAttemptId();
    rm0.WaitForState(attemptId0, RMAppAttemptState.Scheduled);
    nm.NodeHeartbeat(true);
    rm0.WaitForState(attemptId0, RMAppAttemptState.Allocated);
    MockAM am0 = rm0.SendAMLaunched(attempt0.GetAppAttemptId());
    am0.RegisterAppAttempt();

    // Request two more containers and poll until both have been allocated.
    int numContainers = 2;
    am0.Allocate("127.0.0.1", 1000, numContainers, new AList<ContainerId>());
    nm.NodeHeartbeat(true);
    IList<Container> conts = am0.Allocate(new AList<ResourceRequest>(),
        new AList<ContainerId>()).GetAllocatedContainers();
    while (conts.Count != numContainers)
    {
        nm.NodeHeartbeat(true);
        Sharpen.Collections.AddAll(conts, am0.Allocate(new AList<ResourceRequest>(),
            new AList<ContainerId>()).GetAllocatedContainers());
        Sharpen.Thread.Sleep(500);
    }

    // launch the 2nd and 3rd containers.
    foreach (Container allocated in conts)
    {
        nm.NodeHeartbeat(attempt0.GetAppAttemptId(), allocated.GetId().GetContainerId(),
            ContainerState.Running);
        rm0.WaitForState(nm, allocated.GetId(), RMContainerState.Running);
    }

    // Get the RMContainers for all of the live containers, to be used later
    // for metrics calculations and comparisons.
    ICollection<RMContainer> rmContainers = rm0.scheduler
        .GetSchedulerAppInfo(attempt0.GetAppAttemptId()).GetLiveContainers();

    // Allow metrics to accumulate (bounded to 5 seconds).
    int sleepInterval = 1000;
    int cumulativeSleepTime = 0;
    while (app0.GetRMAppMetrics().GetMemorySeconds() <= 0 && cumulativeSleepTime < 5000)
    {
        Sharpen.Thread.Sleep(sleepInterval);
        cumulativeSleepTime += sleepInterval;
    }

    // Stop all non-AM containers (container id 1 is the AM container).
    foreach (Container allocated in conts)
    {
        if (allocated.GetId().GetContainerId() == 1)
        {
            continue;
        }
        nm.NodeHeartbeat(attempt0.GetAppAttemptId(), allocated.GetId().GetContainerId(),
            ContainerState.Complete);
        rm0.WaitForState(nm, allocated.GetId(), RMContainerState.Completed);
    }

    // After all other containers have completed, manually complete the master
    // container in order to trigger a save to the state store of the resource
    // usage metrics. This will cause the attempt to fail, and, since the max
    // attempt retries is 1, the app will also fail. This is intentional so
    // that all containers will complete prior to saving.
    ContainerId cId = ContainerId.NewContainerId(attempt0.GetAppAttemptId(), 1);
    nm.NodeHeartbeat(attempt0.GetAppAttemptId(), cId.GetContainerId(),
        ContainerState.Complete);
    rm0.WaitForState(nm, cId, RMContainerState.Completed);

    // Check that the container metrics match those from the app usage report.
    long memorySeconds = 0;
    long vcoreSeconds = 0;
    foreach (RMContainer rmContainer in rmContainers)
    {
        AggregateAppResourceUsage ru = CalculateContainerResourceMetrics(rmContainer);
        memorySeconds += ru.GetMemorySeconds();
        vcoreSeconds += ru.GetVcoreSeconds();
    }
    RMAppMetrics metricsBefore = app0.GetRMAppMetrics();
    // NUnit's AreEqual takes (expected, actual, message); the message goes last.
    NUnit.Framework.Assert.AreEqual(memorySeconds, metricsBefore.GetMemorySeconds(),
        "Unexcpected MemorySeconds value");
    NUnit.Framework.Assert.AreEqual(vcoreSeconds, metricsBefore.GetVcoreSeconds(),
        "Unexpected VcoreSeconds value");

    // create new RM to represent RM restart. Load up the state store.
    MockRM rm1 = new MockRM(conf, memStore);
    rm1.Start();
    RMApp app0After = rm1.GetRMContext().GetRMApps()[app0.GetApplicationId()];

    // Compare container resource usage metrics from before and after restart.
    RMAppMetrics metricsAfter = app0After.GetRMAppMetrics();
    NUnit.Framework.Assert.AreEqual(metricsBefore.GetVcoreSeconds(),
        metricsAfter.GetVcoreSeconds(),
        "Vcore seconds were not the same after RM Restart");
    NUnit.Framework.Assert.AreEqual(metricsBefore.GetMemorySeconds(),
        metricsAfter.GetMemorySeconds(),
        "Memory seconds were not the same after RM Restart");

    rm0.Stop();
    rm0.Close();
    rm1.Stop();
    rm1.Close();
}
/// <summary>
/// Verifies processing of the NMContainerStatuses sent during NM registration.
/// Steps:
/// 1. Start the cluster (RM, NM), submit an app with 1024MB, launch and register the AM.
/// 2. The AM requests 1 container with 2048MB.
/// 3. Verify the number of containers allocated by the RM.
/// 4. Verify cluster memory usage is 3072 (AM 1024 + requested 2048).
/// 5. Re-register the NM, sending completed container statuses.
/// 6. Verify memory used drops to 1024.
/// 7. Send an AM heartbeat; the allocate response should contain the completed container.
/// </summary>
/// <exception cref="System.Exception"/>
public virtual void TestProcessingNMContainerStatusesOnNMRestart()
{
    conf.SetInt(YarnConfiguration.RmAmMaxAttempts, 1);
    MemoryRMStateStore stateStore = new MemoryRMStateStore();
    stateStore.Init(conf);

    // Step 1: bring up RM and NM, submit the app, launch and register the AM.
    MockRM resourceManager = new MockRM(conf, stateStore);
    resourceManager.Start();
    int nodeMemoryMb = 8192;
    int amMemoryMb = 1024;
    int containerMemoryMb = 2048;
    MockNM node = new MockNM("127.0.0.1:1234", nodeMemoryMb,
        resourceManager.GetResourceTrackerService());
    node.RegisterNode();
    RMApp app = resourceManager.SubmitApp(amMemoryMb);
    MockAM am = MockRM.LaunchAndRegisterAM(app, resourceManager, node);

    // Step 2: the AM asks for one container of 2048MB.
    int requestedContainers = 1;
    IList<Container> allocated =
        am.AllocateAndWaitForContainers(requestedContainers, containerMemoryMb, node);

    // Step 3: the RM must have handed out exactly the requested number.
    NUnit.Framework.Assert.AreEqual(requestedContainers, allocated.Count);
    Container workContainer = allocated[0];
    node.NodeHeartbeat(am.GetApplicationAttemptId(), 1, ContainerState.Running);
    node.NodeHeartbeat(am.GetApplicationAttemptId(),
        workContainer.GetId().GetContainerId(), ContainerState.Running);
    resourceManager.WaitForState(app.GetApplicationId(), RMAppState.Running);

    // Step 4: cluster usage is AM memory + requested memory, 1024 + 2048 = 3072.
    ResourceScheduler scheduler = resourceManager.GetRMContext().GetScheduler();
    int allocatedMb = scheduler.GetRootQueueMetrics().GetAllocatedMB();
    NUnit.Framework.Assert.AreEqual(amMemoryMb + containerMemoryMb, allocatedMb);

    // Step 5: re-register the NM, reporting the container statuses for the app.
    IList<NMContainerStatus> reportedStatuses = CreateNMContainerStatusForApp(am);
    node.RegisterNode(reportedStatuses, Arrays.AsList(app.GetApplicationId()));
    WaitForClusterMemory(node, scheduler, amMemoryMb);

    // Step 6: only the AM's 1024MB remains allocated.
    NUnit.Framework.Assert.AreEqual(amMemoryMb,
        scheduler.GetRootQueueMetrics().GetAllocatedMB());

    // Step 7: an AM heartbeat must report the completed container.
    AllocateRequest heartbeatRequest = AllocateRequest.NewInstance(
        0, 0F, new AList<ResourceRequest>(), new AList<ContainerId>(), null);
    AllocateResponse heartbeatResponse = am.Allocate(heartbeatRequest);
    IList<ContainerStatus> completedStatuses =
        heartbeatResponse.GetCompletedContainersStatuses();
    NUnit.Framework.Assert.AreEqual(requestedContainers, completedStatuses.Count);

    // Application clean-up: once the AM container completes, cluster memory
    // used should drop to 0.
    node.NodeHeartbeat(am.GetApplicationAttemptId(), 1, ContainerState.Complete);
    WaitForClusterMemory(node, scheduler, 0);

    resourceManager.Stop();
}
// Verifies that AMRMClient re-sends its outstanding asks, releases and
// blacklist changes after the RM is restarted. Six steps:
// Step-1 : AMRMClient sends an allocate request for 2 container requests
// Step-2 : 2 containers are allocated by the RM.
// Step-3 : AM sends 1 containerRequest (cRequest3) and 1 release request to
//          the RM
// Step-4 : After the RM restart, the AM (which does not know the RM was
//          restarted) sends an additional containerRequest (cRequest4) and
//          blacklisted nodes. In turn the RM sends a resync command.
// Step-5 : Allocate after the resync command plus a new containerRequest
//          (cRequest5)
// Step-6 : RM allocates containers for cRequest3, cRequest4 and cRequest5
/// <exception cref="System.Exception"/>
public virtual void TestAMRMClientResendsRequestsOnRMRestart()
{
    UserGroupInformation.SetLoginUser(null);
    MemoryRMStateStore memStore = new MemoryRMStateStore();
    memStore.Init(conf);

    // Phase-1 Start 1st RM
    TestAMRMClientOnRMRestart.MyResourceManager rm1 = new TestAMRMClientOnRMRestart.MyResourceManager
        (conf, memStore);
    rm1.Start();
    DrainDispatcher dispatcher = (DrainDispatcher)rm1.GetRMContext().GetDispatcher();

    // Submit the application
    RMApp app = rm1.SubmitApp(1024);
    dispatcher.Await();
    MockNM nm1 = new MockNM("h1:1234", 15120, rm1.GetResourceTrackerService());
    nm1.RegisterNode();
    // Node heartbeat
    nm1.NodeHeartbeat(true);
    dispatcher.Await();
    ApplicationAttemptId appAttemptId = app.GetCurrentAppAttempt().GetAppAttemptId();
    rm1.SendAMLaunched(appAttemptId);
    dispatcher.Await();

    // Hand the attempt's AMRMToken identifier to the current user.
    Org.Apache.Hadoop.Security.Token.Token<AMRMTokenIdentifier> token = rm1.GetRMContext
        ().GetRMApps()[appAttemptId.GetApplicationId()].GetRMAppAttempt(appAttemptId).GetAMRMToken
        ();
    UserGroupInformation ugi = UserGroupInformation.GetCurrentUser();
    ugi.AddTokenIdentifier(token.DecodeIdentifier());

    // Step-1 : AMRMClient sends an allocate request for 2 ContainerRequests
    // cRequest1 = h1 and cRequest2 = h1,h2
    // blacklisted nodes = h2
    AMRMClient<AMRMClient.ContainerRequest> amClient = new TestAMRMClientOnRMRestart.MyAMRMClientImpl
        (rm1);
    amClient.Init(conf);
    amClient.Start();
    amClient.RegisterApplicationMaster("Host", 10000, string.Empty);
    AMRMClient.ContainerRequest cRequest1 = CreateReq(1, 1024, new string[] { "h1" });
    amClient.AddContainerRequest(cRequest1);
    AMRMClient.ContainerRequest cRequest2 = CreateReq(1, 1024, new string[] { "h1", "h2" });
    amClient.AddContainerRequest(cRequest2);
    IList<string> blacklistAdditions = new AList<string>();
    IList<string> blacklistRemoval = new AList<string>();
    blacklistAdditions.AddItem("h2");
    blacklistRemoval.AddItem("h10");
    amClient.UpdateBlacklist(blacklistAdditions, blacklistRemoval);
    // remove from local list
    blacklistAdditions.Remove("h2");
    AllocateResponse allocateResponse = amClient.Allocate(0.1f);
    dispatcher.Await();
    NUnit.Framework.Assert.AreEqual("No of assignments must be 0", 0, allocateResponse
        .GetAllocatedContainers().Count);
    // Why 4 asks and not 3, even though h2 is blacklisted?
    // On blacklisting a host the application master has to remove the ask from
    // the remote request table itself. This test does not remove it explicitly.
    AssertAsksAndReleases(4, 0, rm1);
    AssertBlacklistAdditionsAndRemovals(1, 1, rm1);

    // Step-2 : NM heartbeat is sent.
    // On the 2nd AM allocate request, the RM allocates 2 containers to the AM
    nm1.NodeHeartbeat(true);
    dispatcher.Await();
    allocateResponse = amClient.Allocate(0.2f);
    dispatcher.Await();
    // 2 containers are allocated, i.e. for cRequest1 and cRequest2.
    // The failure message now matches the expected value of 2.
    NUnit.Framework.Assert.AreEqual("No of assignments must be 2", 2, allocateResponse
        .GetAllocatedContainers().Count);
    AssertAsksAndReleases(0, 0, rm1);
    AssertBlacklistAdditionsAndRemovals(0, 0, rm1);
    IList<Container> allocatedContainers = allocateResponse.GetAllocatedContainers();
    // remove the container requests that have been satisfied
    amClient.RemoveContainerRequest(cRequest1);
    amClient.RemoveContainerRequest(cRequest2);
    allocateResponse = amClient.Allocate(0.2f);
    dispatcher.Await();
    NUnit.Framework.Assert.AreEqual("No of assignments must be 0", 0, allocateResponse
        .GetAllocatedContainers().Count);
    AssertAsksAndReleases(4, 0, rm1);
    AssertBlacklistAdditionsAndRemovals(0, 0, rm1);

    // Step-3 : Send 1 containerRequest and 1 release request to the RM
    AMRMClient.ContainerRequest cRequest3 = CreateReq(1, 1024, new string[] { "h1" });
    amClient.AddContainerRequest(cRequest3);
    int pendingRelease = 0;
    IEnumerator<Container> it = allocatedContainers.GetEnumerator();
    // Release exactly one container (the first one, if any) and drop it from
    // the local list; the remaining one is released after the RM restart.
    if (it.HasNext())
    {
        amClient.ReleaseAssignedContainer(it.Next().GetId());
        pendingRelease++;
        it.Remove();
    }
    allocateResponse = amClient.Allocate(0.3f);
    dispatcher.Await();
    NUnit.Framework.Assert.AreEqual("No of assignments must be 0", 0, allocateResponse
        .GetAllocatedContainers().Count);
    AssertAsksAndReleases(3, pendingRelease, rm1);
    AssertBlacklistAdditionsAndRemovals(0, 0, rm1);
    // Releases already reported as completed are no longer pending.
    int completedContainer = allocateResponse.GetCompletedContainersStatuses().Count;
    pendingRelease -= completedContainer;

    // Phase-2 start 2nd RM (same state store) and point NM and AM client at it
    TestAMRMClientOnRMRestart.MyResourceManager rm2 = new TestAMRMClientOnRMRestart.MyResourceManager
        (conf, memStore);
    rm2.Start();
    nm1.SetResourceTrackerService(rm2.GetResourceTrackerService());
    ((TestAMRMClientOnRMRestart.MyAMRMClientImpl)amClient).UpdateRMProxy(rm2);
    dispatcher = (DrainDispatcher)rm2.GetRMContext().GetDispatcher();
    // The NM should be told to resync on its first heartbeat to the new RM
    NodeHeartbeatResponse hbResponse = nm1.NodeHeartbeat(true);
    NUnit.Framework.Assert.AreEqual(NodeAction.Resync, hbResponse.GetNodeAction());
    // new NM instance to represent the NM re-registering
    nm1 = new MockNM("h1:1234", 10240, rm2.GetResourceTrackerService());
    nm1.RegisterNode();
    nm1.NodeHeartbeat(true);
    dispatcher.Await();
    blacklistAdditions.AddItem("h3");
    amClient.UpdateBlacklist(blacklistAdditions, null);
    blacklistAdditions.Remove("h3");
    // Release every container still held locally.
    it = allocatedContainers.GetEnumerator();
    while (it.HasNext())
    {
        amClient.ReleaseAssignedContainer(it.Next().GetId());
        pendingRelease++;
        it.Remove();
    }
    AMRMClient.ContainerRequest cRequest4 = CreateReq(1, 1024, new string[] { "h1", "h2" });
    amClient.AddContainerRequest(cRequest4);

    // Step-4 : After the RM restart, the AM (which does not know the RM was
    // restarted) sends an additional containerRequest and blacklisted nodes.
    // In turn the RM sends a resync command and AMRMClient re-sends the
    // allocate request.
    allocateResponse = amClient.Allocate(0.3f);
    dispatcher.Await();
    completedContainer = allocateResponse.GetCompletedContainersStatuses().Count;
    pendingRelease -= completedContainer;
    AssertAsksAndReleases(4, pendingRelease, rm2);
    AssertBlacklistAdditionsAndRemovals(2, 0, rm2);
    AMRMClient.ContainerRequest cRequest5 = CreateReq(1, 1024, new string[] { "h1", "h2",
        "h3" });
    amClient.AddContainerRequest(cRequest5);

    // Step-5 : Allocate after the resync command
    allocateResponse = amClient.Allocate(0.5f);
    dispatcher.Await();
    NUnit.Framework.Assert.AreEqual("No of assignments must be 0", 0, allocateResponse
        .GetAllocatedContainers().Count);
    AssertAsksAndReleases(5, 0, rm2);
    AssertBlacklistAdditionsAndRemovals(0, 0, rm2);

    // Heartbeat/allocate up to 5 times, stopping early once 3 containers have
    // been assigned in total.
    int noAssignedContainer = 0;
    int count = 5;
    while (count-- > 0)
    {
        nm1.NodeHeartbeat(true);
        dispatcher.Await();
        allocateResponse = amClient.Allocate(0.5f);
        dispatcher.Await();
        noAssignedContainer += allocateResponse.GetAllocatedContainers().Count;
        if (noAssignedContainer == 3)
        {
            break;
        }
        Sharpen.Thread.Sleep(1000);
    }

    // Step-6 : RM allocates containers for cRequest3, cRequest4 and cRequest5
    NUnit.Framework.Assert.AreEqual("Number of container should be 3", 3, noAssignedContainer
        );
    amClient.Stop();
    rm1.Stop();
    rm2.Stop();
}
// Verifies that an AM which was issued a rolled-over AMRMToken is still able
// to communicate with a restarted RM, while the pre-roll-over token is
// rejected by the new RM.
/// <exception cref="System.Exception"/>
public virtual void TestAMRMClientOnAMRMTokenRollOverOnRMRestart()
{
    conf.SetLong(YarnConfiguration.RmAmrmTokenMasterKeyRollingIntervalSecs, rolling_interval_sec
        );
    conf.SetLong(YarnConfiguration.RmAmExpiryIntervalMs, am_expire_ms);
    MemoryRMStateStore memStore = new MemoryRMStateStore();
    memStore.Init(conf);

    // start first RM
    TestAMRMClientOnRMRestart.MyResourceManager2 rm1 = new TestAMRMClientOnRMRestart.MyResourceManager2
        (conf, memStore);
    rm1.Start();
    DrainDispatcher dispatcher = (DrainDispatcher)rm1.GetRMContext().GetDispatcher();
    long startTime = Runtime.CurrentTimeMillis();

    // Submit the application
    RMApp app = rm1.SubmitApp(1024);
    dispatcher.Await();
    MockNM nm1 = new MockNM("h1:1234", 15120, rm1.GetResourceTrackerService());
    nm1.RegisterNode();
    // Node heartbeat
    nm1.NodeHeartbeat(true);
    dispatcher.Await();
    ApplicationAttemptId appAttemptId = app.GetCurrentAppAttempt().GetAppAttemptId();
    rm1.SendAMLaunched(appAttemptId);
    dispatcher.Await();

    // Create the initial (pre-roll-over) token and hand its identifier to the
    // current user.
    AMRMTokenSecretManager amrmTokenSecretManagerForRM1 = rm1.GetRMContext().GetAMRMTokenSecretManager
        ();
    Org.Apache.Hadoop.Security.Token.Token<AMRMTokenIdentifier> token = amrmTokenSecretManagerForRM1
        .CreateAndGetAMRMToken(appAttemptId);
    UserGroupInformation ugi = UserGroupInformation.GetCurrentUser();
    ugi.AddTokenIdentifier(token.DecodeIdentifier());
    AMRMClient<AMRMClient.ContainerRequest> amClient = new TestAMRMClientOnRMRestart.MyAMRMClientImpl
        (rm1);
    amClient.Init(conf);
    amClient.Start();
    amClient.RegisterApplicationMaster("h1", 10000, string.Empty);
    amClient.Allocate(0.1f);

    // Wait long enough for the roll-over to happen.
    // In the meantime the old AMRMToken should continue to work.
    while (Runtime.CurrentTimeMillis() - startTime < rolling_interval_sec * 1000)
    {
        amClient.Allocate(0.1f);
        try
        {
            Sharpen.Thread.Sleep(1000);
        }
        catch (Exception)
        {
            // DO NOTHING: a failed/interrupted sleep only shortens the wait.
        }
    }
    // The master key must have rolled: its id differs from the old token's.
    NUnit.Framework.Assert.IsTrue(amrmTokenSecretManagerForRM1.GetMasterKey().GetMasterKey
        ().GetKeyId() != token.DecodeIdentifier().GetKeyId());
    amClient.Allocate(0.1f);

    // activate the nextMasterKey, and replace the currentMasterKey
    Org.Apache.Hadoop.Security.Token.Token<AMRMTokenIdentifier> newToken = amrmTokenSecretManagerForRM1
        .CreateAndGetAMRMToken(appAttemptId);
    // Poll until the current master key no longer matches the old token, or
    // until an allocate call throws.
    int waitCount = 0;
    while (waitCount++ <= 50)
    {
        if (amrmTokenSecretManagerForRM1.GetCurrnetMasterKeyData().GetMasterKey().GetKeyId
            () != token.DecodeIdentifier().GetKeyId())
        {
            break;
        }
        try
        {
            amClient.Allocate(0.1f);
        }
        catch (Exception)
        {
            break;
        }
        Sharpen.Thread.Sleep(500);
    }
    NUnit.Framework.Assert.IsTrue(amrmTokenSecretManagerForRM1.GetNextMasterKeyData()
         == null);
    NUnit.Framework.Assert.IsTrue(amrmTokenSecretManagerForRM1.GetCurrnetMasterKeyData
        ().GetMasterKey().GetKeyId() == newToken.DecodeIdentifier().GetKeyId());

    // start 2nd RM
    conf.Set(YarnConfiguration.RmSchedulerAddress, "0.0.0.0:9030");
    TestAMRMClientOnRMRestart.MyResourceManager2 rm2 = new TestAMRMClientOnRMRestart.MyResourceManager2
        (conf, memStore);
    rm2.Start();
    nm1.SetResourceTrackerService(rm2.GetResourceTrackerService());
    ((TestAMRMClientOnRMRestart.MyAMRMClientImpl)amClient).UpdateRMProxy(rm2);
    dispatcher = (DrainDispatcher)rm2.GetRMContext().GetDispatcher();

    // The restarted RM must report the rolled-over key as current, with no
    // next key pending.
    AMRMTokenSecretManager amrmTokenSecretManagerForRM2 = rm2.GetRMContext().GetAMRMTokenSecretManager
        ();
    NUnit.Framework.Assert.IsTrue(amrmTokenSecretManagerForRM2.GetCurrnetMasterKeyData
        ().GetMasterKey().GetKeyId() == newToken.DecodeIdentifier().GetKeyId());
    NUnit.Framework.Assert.IsTrue(amrmTokenSecretManagerForRM2.GetNextMasterKeyData()
         == null);

    // The old token must be rejected by the new RM. The "should not work"
    // failure is raised AFTER the try/catch: NUnit's Assert.Fail throws an
    // exception that `catch (Exception)` would otherwise swallow, replacing
    // the real failure message with an unrelated type-check failure.
    bool oldTokenRejected = false;
    try
    {
        UserGroupInformation testUser = UserGroupInformation.CreateRemoteUser("testUser");
        SecurityUtil.SetTokenService(token, rm2.GetApplicationMasterService().GetBindAddress
            ());
        testUser.AddToken(token);
        testUser.DoAs(new _PrivilegedAction_480(rm2)).Allocate(Org.Apache.Hadoop.Yarn.Util.Records
            .NewRecord<AllocateRequest>());
    }
    catch (Exception ex)
    {
        oldTokenRejected = true;
        NUnit.Framework.Assert.IsTrue(ex is SecretManager.InvalidToken);
        NUnit.Framework.Assert.IsTrue(ex.Message.Contains("Invalid AMRMToken from " + token
            .DecodeIdentifier().GetApplicationAttemptId()));
    }
    if (!oldTokenRejected)
    {
        NUnit.Framework.Assert.Fail("The old Token should not work");
    }

    // make sure the recovered AMRMToken works for the new RM
    amClient.Allocate(0.1f);
    amClient.UnregisterApplicationMaster(FinalApplicationStatus.Succeeded, null, null
        );
    amClient.Stop();
    rm1.Stop();
    rm2.Stop();
}
// Scenario: an AM registers with the first RM, the RM is replaced by a second
// one sharing the same state store, and the AM unregisters right after the
// restart. The attempt is expected to go through Finishing to Finished and
// the application to reach Finished on the second RM.
/// <exception cref="System.Exception"/>
public virtual void TestAMRMClientForUnregisterAMOnRMRestart()
{
    MemoryRMStateStore stateStore = new MemoryRMStateStore();
    stateStore.Init(conf);

    // Phase 1: first RM comes up.
    TestAMRMClientOnRMRestart.MyResourceManager firstRm =
        new TestAMRMClientOnRMRestart.MyResourceManager(conf, stateStore);
    firstRm.Start();
    DrainDispatcher firstDispatcher = (DrainDispatcher)firstRm.GetRMContext().GetDispatcher();

    // Submit the application and let the NM register and heartbeat.
    RMApp submittedApp = firstRm.SubmitApp(1024);
    firstDispatcher.Await();
    MockNM originalNode = new MockNM("h1:1234", 15120, firstRm.GetResourceTrackerService());
    originalNode.RegisterNode();
    originalNode.NodeHeartbeat(true);
    firstDispatcher.Await();

    // Mark the AM of the current attempt as launched.
    ApplicationAttemptId attemptId = submittedApp.GetCurrentAppAttempt().GetAppAttemptId();
    firstRm.SendAMLaunched(attemptId);
    firstDispatcher.Await();

    // Give the current user the attempt's AMRMToken identifier.
    Org.Apache.Hadoop.Security.Token.Token<AMRMTokenIdentifier> amrmToken = firstRm
        .GetRMContext()
        .GetRMApps()[attemptId.GetApplicationId()]
        .GetRMAppAttempt(attemptId)
        .GetAMRMToken();
    UserGroupInformation.GetCurrentUser().AddTokenIdentifier(amrmToken.DecodeIdentifier());

    // Register the AM with the first RM and send one allocate call.
    AMRMClient<AMRMClient.ContainerRequest> client =
        new TestAMRMClientOnRMRestart.MyAMRMClientImpl(firstRm);
    client.Init(conf);
    client.Start();
    client.RegisterApplicationMaster("h1", 10000, string.Empty);
    client.Allocate(0.1f);

    // Phase 2: second RM comes up; redirect the NM and the AM client to it.
    TestAMRMClientOnRMRestart.MyResourceManager secondRm =
        new TestAMRMClientOnRMRestart.MyResourceManager(conf, stateStore);
    secondRm.Start();
    originalNode.SetResourceTrackerService(secondRm.GetResourceTrackerService());
    ((TestAMRMClientOnRMRestart.MyAMRMClientImpl)client).UpdateRMProxy(secondRm);
    DrainDispatcher secondDispatcher = (DrainDispatcher)secondRm.GetRMContext().GetDispatcher();

    // The first heartbeat to the new RM must be answered with a resync.
    NUnit.Framework.Assert.AreEqual(NodeAction.Resync,
        originalNode.NodeHeartbeat(true).GetNodeAction());

    // A fresh MockNM stands in for the re-registering NM. It reports container 1
    // of the attempt as still running.
    MockNM reRegisteredNode = new MockNM("h1:1234", 10240, secondRm.GetResourceTrackerService());
    NMContainerStatus runningAmContainer = NMContainerStatus.NewInstance(
        ContainerId.NewContainerId(attemptId, 1),
        ContainerState.Running,
        Resource.NewInstance(1024, 1),
        "recover container",
        0,
        Priority.NewInstance(0),
        0);
    reRegisteredNode.RegisterNode(Arrays.AsList(runningAmContainer), null);
    reRegisteredNode.NodeHeartbeat(true);
    secondDispatcher.Await();

    // Unregister immediately after the restart.
    client.UnregisterApplicationMaster(FinalApplicationStatus.Succeeded, null, null);
    secondRm.WaitForState(attemptId, RMAppAttemptState.Finishing);

    // Completing container 1 lets the attempt and the application finish.
    reRegisteredNode.NodeHeartbeat(attemptId, 1, ContainerState.Complete);
    secondRm.WaitForState(attemptId, RMAppAttemptState.Finished);
    secondRm.WaitForState(submittedApp.GetApplicationId(), RMAppState.Finished);

    client.Stop();
    firstRm.Stop();
    secondRm.Stop();
}