// Test regular RM restart/failover, new RM should not count // AM failure towards the max-retry-account and should be able to // re-launch the AM. /// <exception cref="System.Exception"/> public virtual void TestRMRestartOrFailoverNotCountedForAMFailures() { YarnConfiguration conf = new YarnConfiguration(); conf.SetClass(YarnConfiguration.RmScheduler, typeof(CapacityScheduler), typeof(ResourceScheduler )); conf.SetBoolean(YarnConfiguration.RecoveryEnabled, true); conf.SetBoolean(YarnConfiguration.RmWorkPreservingRecoveryEnabled, false); conf.Set(YarnConfiguration.RmStore, typeof(MemoryRMStateStore).FullName); // explicitly set max-am-retry count as 1. conf.SetInt(YarnConfiguration.RmAmMaxAttempts, 1); MemoryRMStateStore memStore = new MemoryRMStateStore(); memStore.Init(conf); MockRM rm1 = new MockRM(conf, memStore); rm1.Start(); MockNM nm1 = new MockNM("127.0.0.1:1234", 8000, rm1.GetResourceTrackerService()); nm1.RegisterNode(); RMApp app1 = rm1.SubmitApp(200); // AM should be restarted even though max-am-attempt is 1. MockAM am1 = MockRM.LaunchAndRegisterAM(app1, rm1, nm1); RMAppAttempt attempt1 = app1.GetCurrentAppAttempt(); NUnit.Framework.Assert.IsTrue(((RMAppAttemptImpl)attempt1).MayBeLastAttempt()); // Restart rm. MockRM rm2 = new MockRM(conf, memStore); rm2.Start(); ApplicationStateData appState = memStore.GetState().GetApplicationState()[app1.GetApplicationId ()]; // re-register the NM nm1.SetResourceTrackerService(rm2.GetResourceTrackerService()); NMContainerStatus status = Org.Apache.Hadoop.Yarn.Util.Records.NewRecord <NMContainerStatus >(); status.SetContainerExitStatus(ContainerExitStatus.KilledByResourcemanager); status.SetContainerId(attempt1.GetMasterContainer().GetId()); status.SetContainerState(ContainerState.Complete); status.SetDiagnostics(string.Empty); nm1.RegisterNode(Sharpen.Collections.SingletonList(status), null); rm2.WaitForState(attempt1.GetAppAttemptId(), RMAppAttemptState.Failed); NUnit.Framework.Assert.AreEqual(ContainerExitStatus.KilledByResourcemanager, appState .GetAttempt(am1.GetApplicationAttemptId()).GetAMContainerExitStatus()); // Will automatically start a new AppAttempt in rm2 rm2.WaitForState(app1.GetApplicationId(), RMAppState.Accepted); MockAM am2 = rm2.WaitForNewAMToLaunchAndRegister(app1.GetApplicationId(), 2, nm1); MockRM.FinishAMAndVerifyAppState(app1, rm2, nm1, am2); RMAppAttempt attempt3 = rm2.GetRMContext().GetRMApps()[app1.GetApplicationId()].GetCurrentAppAttempt (); NUnit.Framework.Assert.IsTrue(attempt3.ShouldCountTowardsMaxAttemptRetry()); NUnit.Framework.Assert.AreEqual(ContainerExitStatus.Invalid, appState.GetAttempt( am2.GetApplicationAttemptId()).GetAMContainerExitStatus()); rm1.Stop(); rm2.Stop(); }
// Test the DT mast key in the state-store when the mast key is being rolled. /// <exception cref="System.Exception"/> public virtual void TestRMDTMasterKeyStateOnRollingMasterKey() { MemoryRMStateStore memStore = new MemoryRMStateStore(); memStore.Init(conf); RMStateStore.RMState rmState = memStore.GetState(); IDictionary <RMDelegationTokenIdentifier, long> rmDTState = rmState.GetRMDTSecretManagerState ().GetTokenState(); ICollection <DelegationKey> rmDTMasterKeyState = rmState.GetRMDTSecretManagerState ().GetMasterKeyState(); MockRM rm1 = new TestRMDelegationTokens.MyMockRM(this, conf, memStore); rm1.Start(); // on rm start, two master keys are created. // One is created at RMDTSecretMgr.startThreads.updateCurrentKey(); // the other is created on the first run of // tokenRemoverThread.rollMasterKey() RMDelegationTokenSecretManager dtSecretManager = rm1.GetRMContext().GetRMDelegationTokenSecretManager (); // assert all master keys are saved NUnit.Framework.Assert.AreEqual(dtSecretManager.GetAllMasterKeys(), rmDTMasterKeyState ); ICollection <DelegationKey> expiringKeys = new HashSet <DelegationKey>(); Sharpen.Collections.AddAll(expiringKeys, dtSecretManager.GetAllMasterKeys()); // request to generate a RMDelegationToken GetDelegationTokenRequest request = Org.Mockito.Mockito.Mock <GetDelegationTokenRequest >(); Org.Mockito.Mockito.When(request.GetRenewer()).ThenReturn("renewer1"); GetDelegationTokenResponse response = rm1.GetClientRMService().GetDelegationToken (request); Org.Apache.Hadoop.Yarn.Api.Records.Token delegationToken = response.GetRMDelegationToken (); Org.Apache.Hadoop.Security.Token.Token <RMDelegationTokenIdentifier> token1 = ConverterUtils .ConvertFromYarn(delegationToken, (Text)null); RMDelegationTokenIdentifier dtId1 = token1.DecodeIdentifier(); // For all keys that still remain in memory, we should have them stored // in state-store also. while (((TestRMDelegationTokens.TestRMDelegationTokenSecretManager)dtSecretManager ).numUpdatedKeys.Get() < 3) { ((TestRMDelegationTokens.TestRMDelegationTokenSecretManager)dtSecretManager).CheckCurrentKeyInStateStore (rmDTMasterKeyState); Sharpen.Thread.Sleep(100); } // wait for token to expire and remove from state-store // rollMasterKey is called every 1 second. int count = 0; while (rmDTState.Contains(dtId1) && count < 100) { Sharpen.Thread.Sleep(100); count++; } rm1.Stop(); }
// Test RM restarts after AM container is preempted, new RM should not count // AM preemption failure towards the max-retry-account and should be able to // re-launch the AM. /// <exception cref="System.Exception"/> public virtual void TestPreemptedAMRestartOnRMRestart() { YarnConfiguration conf = new YarnConfiguration(); conf.SetClass(YarnConfiguration.RmScheduler, typeof(CapacityScheduler), typeof(ResourceScheduler )); conf.SetBoolean(YarnConfiguration.RecoveryEnabled, true); conf.SetBoolean(YarnConfiguration.RmWorkPreservingRecoveryEnabled, false); conf.Set(YarnConfiguration.RmStore, typeof(MemoryRMStateStore).FullName); // explicitly set max-am-retry count as 1. conf.SetInt(YarnConfiguration.RmAmMaxAttempts, 1); MemoryRMStateStore memStore = new MemoryRMStateStore(); memStore.Init(conf); MockRM rm1 = new MockRM(conf, memStore); rm1.Start(); MockNM nm1 = new MockNM("127.0.0.1:1234", 8000, rm1.GetResourceTrackerService()); nm1.RegisterNode(); RMApp app1 = rm1.SubmitApp(200); RMAppAttempt attempt1 = app1.GetCurrentAppAttempt(); MockAM am1 = MockRM.LaunchAndRegisterAM(app1, rm1, nm1); CapacityScheduler scheduler = (CapacityScheduler)rm1.GetResourceScheduler(); ContainerId amContainer = ContainerId.NewContainerId(am1.GetApplicationAttemptId( ), 1); // Forcibly preempt the am container; scheduler.KillContainer(scheduler.GetRMContainer(amContainer)); am1.WaitForState(RMAppAttemptState.Failed); NUnit.Framework.Assert.IsTrue(!attempt1.ShouldCountTowardsMaxAttemptRetry()); rm1.WaitForState(app1.GetApplicationId(), RMAppState.Accepted); // state store has 1 attempt stored. ApplicationStateData appState = memStore.GetState().GetApplicationState()[app1.GetApplicationId ()]; NUnit.Framework.Assert.AreEqual(1, appState.GetAttemptCount()); // attempt stored has the preempted container exit status. NUnit.Framework.Assert.AreEqual(ContainerExitStatus.Preempted, appState.GetAttempt (am1.GetApplicationAttemptId()).GetAMContainerExitStatus()); // Restart rm. MockRM rm2 = new MockRM(conf, memStore); nm1.SetResourceTrackerService(rm2.GetResourceTrackerService()); nm1.RegisterNode(); rm2.Start(); // Restarted RM should re-launch the am. MockAM am2 = rm2.WaitForNewAMToLaunchAndRegister(app1.GetApplicationId(), 2, nm1); MockRM.FinishAMAndVerifyAppState(app1, rm2, nm1, am2); RMAppAttempt attempt2 = rm2.GetRMContext().GetRMApps()[app1.GetApplicationId()].GetCurrentAppAttempt (); NUnit.Framework.Assert.IsTrue(attempt2.ShouldCountTowardsMaxAttemptRetry()); NUnit.Framework.Assert.AreEqual(ContainerExitStatus.Invalid, appState.GetAttempt( am2.GetApplicationAttemptId()).GetAMContainerExitStatus()); rm1.Stop(); rm2.Stop(); }
// Test all expired keys are removed from state-store. /// <exception cref="System.Exception"/> public virtual void TestRemoveExpiredMasterKeyInRMStateStore() { MemoryRMStateStore memStore = new MemoryRMStateStore(); memStore.Init(conf); RMStateStore.RMState rmState = memStore.GetState(); ICollection <DelegationKey> rmDTMasterKeyState = rmState.GetRMDTSecretManagerState ().GetMasterKeyState(); MockRM rm1 = new TestRMDelegationTokens.MyMockRM(this, conf, memStore); rm1.Start(); RMDelegationTokenSecretManager dtSecretManager = rm1.GetRMContext().GetRMDelegationTokenSecretManager (); // assert all master keys are saved NUnit.Framework.Assert.AreEqual(dtSecretManager.GetAllMasterKeys(), rmDTMasterKeyState ); ICollection <DelegationKey> expiringKeys = new HashSet <DelegationKey>(); Sharpen.Collections.AddAll(expiringKeys, dtSecretManager.GetAllMasterKeys()); // wait for expiringKeys to expire while (true) { bool allExpired = true; foreach (DelegationKey key in expiringKeys) { if (rmDTMasterKeyState.Contains(key)) { allExpired = false; } } if (allExpired) { break; } Sharpen.Thread.Sleep(500); } }
// AM container preempted, nm disk failure // should not be counted towards AM max retry count. /// <exception cref="System.Exception"/> public virtual void TestShouldNotCountFailureToMaxAttemptRetry() { YarnConfiguration conf = new YarnConfiguration(); conf.SetClass(YarnConfiguration.RmScheduler, typeof(CapacityScheduler), typeof(ResourceScheduler )); // explicitly set max-am-retry count as 1. conf.SetInt(YarnConfiguration.RmAmMaxAttempts, 1); conf.SetBoolean(YarnConfiguration.RecoveryEnabled, true); conf.Set(YarnConfiguration.RmStore, typeof(MemoryRMStateStore).FullName); MemoryRMStateStore memStore = new MemoryRMStateStore(); memStore.Init(conf); MockRM rm1 = new MockRM(conf, memStore); rm1.Start(); MockNM nm1 = new MockNM("127.0.0.1:1234", 8000, rm1.GetResourceTrackerService()); nm1.RegisterNode(); RMApp app1 = rm1.SubmitApp(200); RMAppAttempt attempt1 = app1.GetCurrentAppAttempt(); MockAM am1 = MockRM.LaunchAndRegisterAM(app1, rm1, nm1); CapacityScheduler scheduler = (CapacityScheduler)rm1.GetResourceScheduler(); ContainerId amContainer = ContainerId.NewContainerId(am1.GetApplicationAttemptId( ), 1); // Preempt the first attempt; scheduler.KillContainer(scheduler.GetRMContainer(amContainer)); am1.WaitForState(RMAppAttemptState.Failed); NUnit.Framework.Assert.IsTrue(!attempt1.ShouldCountTowardsMaxAttemptRetry()); rm1.WaitForState(app1.GetApplicationId(), RMAppState.Accepted); ApplicationStateData appState = memStore.GetState().GetApplicationState()[app1.GetApplicationId ()]; // AM should be restarted even though max-am-attempt is 1. MockAM am2 = rm1.WaitForNewAMToLaunchAndRegister(app1.GetApplicationId(), 2, nm1); RMAppAttempt attempt2 = app1.GetCurrentAppAttempt(); NUnit.Framework.Assert.IsTrue(((RMAppAttemptImpl)attempt2).MayBeLastAttempt()); // Preempt the second attempt. ContainerId amContainer2 = ContainerId.NewContainerId(am2.GetApplicationAttemptId (), 1); scheduler.KillContainer(scheduler.GetRMContainer(amContainer2)); am2.WaitForState(RMAppAttemptState.Failed); NUnit.Framework.Assert.IsTrue(!attempt2.ShouldCountTowardsMaxAttemptRetry()); rm1.WaitForState(app1.GetApplicationId(), RMAppState.Accepted); MockAM am3 = rm1.WaitForNewAMToLaunchAndRegister(app1.GetApplicationId(), 3, nm1); RMAppAttempt attempt3 = app1.GetCurrentAppAttempt(); NUnit.Framework.Assert.IsTrue(((RMAppAttemptImpl)attempt3).MayBeLastAttempt()); // mimic NM disk_failure ContainerStatus containerStatus = Org.Apache.Hadoop.Yarn.Util.Records.NewRecord <ContainerStatus >(); containerStatus.SetContainerId(attempt3.GetMasterContainer().GetId()); containerStatus.SetDiagnostics("mimic NM disk_failure"); containerStatus.SetState(ContainerState.Complete); containerStatus.SetExitStatus(ContainerExitStatus.DisksFailed); IDictionary <ApplicationId, IList <ContainerStatus> > conts = new Dictionary <ApplicationId , IList <ContainerStatus> >(); conts[app1.GetApplicationId()] = Sharpen.Collections.SingletonList(containerStatus ); nm1.NodeHeartbeat(conts, true); am3.WaitForState(RMAppAttemptState.Failed); NUnit.Framework.Assert.IsTrue(!attempt3.ShouldCountTowardsMaxAttemptRetry()); NUnit.Framework.Assert.AreEqual(ContainerExitStatus.DisksFailed, appState.GetAttempt (am3.GetApplicationAttemptId()).GetAMContainerExitStatus()); rm1.WaitForState(app1.GetApplicationId(), RMAppState.Accepted); MockAM am4 = rm1.WaitForNewAMToLaunchAndRegister(app1.GetApplicationId(), 4, nm1); RMAppAttempt attempt4 = app1.GetCurrentAppAttempt(); NUnit.Framework.Assert.IsTrue(((RMAppAttemptImpl)attempt4).MayBeLastAttempt()); // create second NM, and register to rm1 MockNM nm2 = new MockNM("127.0.0.1:2234", 8000, rm1.GetResourceTrackerService()); nm2.RegisterNode(); // nm1 heartbeats to report unhealthy // This will mimic ContainerExitStatus.ABORT nm1.NodeHeartbeat(false); am4.WaitForState(RMAppAttemptState.Failed); NUnit.Framework.Assert.IsTrue(!attempt4.ShouldCountTowardsMaxAttemptRetry()); NUnit.Framework.Assert.AreEqual(ContainerExitStatus.Aborted, appState.GetAttempt( am4.GetApplicationAttemptId()).GetAMContainerExitStatus()); // launch next AM in nm2 nm2.NodeHeartbeat(true); MockAM am5 = rm1.WaitForNewAMToLaunchAndRegister(app1.GetApplicationId(), 5, nm2); RMAppAttempt attempt5 = app1.GetCurrentAppAttempt(); NUnit.Framework.Assert.IsTrue(((RMAppAttemptImpl)attempt5).MayBeLastAttempt()); // fail the AM normally nm2.NodeHeartbeat(am5.GetApplicationAttemptId(), 1, ContainerState.Complete); am5.WaitForState(RMAppAttemptState.Failed); NUnit.Framework.Assert.IsTrue(attempt5.ShouldCountTowardsMaxAttemptRetry()); // AM should not be restarted. rm1.WaitForState(app1.GetApplicationId(), RMAppState.Failed); NUnit.Framework.Assert.AreEqual(5, app1.GetAppAttempts().Count); rm1.Stop(); }