Beispiel #1
0
        // Test regular RM restart/failover, new RM should not count
        // AM failure towards the max-retry-account and should be able to
        // re-launch the AM.
        /// <exception cref="System.Exception"/>
        public virtual void TestRMRestartOrFailoverNotCountedForAMFailures()
        {
            YarnConfiguration conf = new YarnConfiguration();

            conf.SetClass(YarnConfiguration.RmScheduler, typeof(CapacityScheduler), typeof(ResourceScheduler
                                                                                           ));
            conf.SetBoolean(YarnConfiguration.RecoveryEnabled, true);
            conf.SetBoolean(YarnConfiguration.RmWorkPreservingRecoveryEnabled, false);
            conf.Set(YarnConfiguration.RmStore, typeof(MemoryRMStateStore).FullName);
            // explicitly set max-am-retry count as 1.
            conf.SetInt(YarnConfiguration.RmAmMaxAttempts, 1);
            MemoryRMStateStore memStore = new MemoryRMStateStore();

            memStore.Init(conf);
            MockRM rm1 = new MockRM(conf, memStore);

            rm1.Start();
            MockNM nm1 = new MockNM("127.0.0.1:1234", 8000, rm1.GetResourceTrackerService());

            nm1.RegisterNode();
            RMApp app1 = rm1.SubmitApp(200);
            // AM should be restarted even though max-am-attempt is 1.
            MockAM       am1      = MockRM.LaunchAndRegisterAM(app1, rm1, nm1);
            RMAppAttempt attempt1 = app1.GetCurrentAppAttempt();

            NUnit.Framework.Assert.IsTrue(((RMAppAttemptImpl)attempt1).MayBeLastAttempt());
            // Restart rm.
            MockRM rm2 = new MockRM(conf, memStore);

            rm2.Start();
            ApplicationStateData appState = memStore.GetState().GetApplicationState()[app1.GetApplicationId
                                                                                          ()];

            // re-register the NM
            nm1.SetResourceTrackerService(rm2.GetResourceTrackerService());
            NMContainerStatus status = Org.Apache.Hadoop.Yarn.Util.Records.NewRecord <NMContainerStatus
                                                                                      >();

            status.SetContainerExitStatus(ContainerExitStatus.KilledByResourcemanager);
            status.SetContainerId(attempt1.GetMasterContainer().GetId());
            status.SetContainerState(ContainerState.Complete);
            status.SetDiagnostics(string.Empty);
            nm1.RegisterNode(Sharpen.Collections.SingletonList(status), null);
            rm2.WaitForState(attempt1.GetAppAttemptId(), RMAppAttemptState.Failed);
            NUnit.Framework.Assert.AreEqual(ContainerExitStatus.KilledByResourcemanager, appState
                                            .GetAttempt(am1.GetApplicationAttemptId()).GetAMContainerExitStatus());
            // Will automatically start a new AppAttempt in rm2
            rm2.WaitForState(app1.GetApplicationId(), RMAppState.Accepted);
            MockAM am2 = rm2.WaitForNewAMToLaunchAndRegister(app1.GetApplicationId(), 2, nm1);

            MockRM.FinishAMAndVerifyAppState(app1, rm2, nm1, am2);
            RMAppAttempt attempt3 = rm2.GetRMContext().GetRMApps()[app1.GetApplicationId()].GetCurrentAppAttempt
                                        ();

            NUnit.Framework.Assert.IsTrue(attempt3.ShouldCountTowardsMaxAttemptRetry());
            NUnit.Framework.Assert.AreEqual(ContainerExitStatus.Invalid, appState.GetAttempt(
                                                am2.GetApplicationAttemptId()).GetAMContainerExitStatus());
            rm1.Stop();
            rm2.Stop();
        }
        // Test the DT mast key in the state-store when the mast key is being rolled.
        /// <exception cref="System.Exception"/>
        public virtual void TestRMDTMasterKeyStateOnRollingMasterKey()
        {
            MemoryRMStateStore memStore = new MemoryRMStateStore();

            memStore.Init(conf);
            RMStateStore.RMState rmState = memStore.GetState();
            IDictionary <RMDelegationTokenIdentifier, long> rmDTState = rmState.GetRMDTSecretManagerState
                                                                            ().GetTokenState();
            ICollection <DelegationKey> rmDTMasterKeyState = rmState.GetRMDTSecretManagerState
                                                                 ().GetMasterKeyState();
            MockRM rm1 = new TestRMDelegationTokens.MyMockRM(this, conf, memStore);

            rm1.Start();
            // on rm start, two master keys are created.
            // One is created at RMDTSecretMgr.startThreads.updateCurrentKey();
            // the other is created on the first run of
            // tokenRemoverThread.rollMasterKey()
            RMDelegationTokenSecretManager dtSecretManager = rm1.GetRMContext().GetRMDelegationTokenSecretManager
                                                                 ();

            // assert all master keys are saved
            NUnit.Framework.Assert.AreEqual(dtSecretManager.GetAllMasterKeys(), rmDTMasterKeyState
                                            );
            ICollection <DelegationKey> expiringKeys = new HashSet <DelegationKey>();

            Sharpen.Collections.AddAll(expiringKeys, dtSecretManager.GetAllMasterKeys());
            // request to generate a RMDelegationToken
            GetDelegationTokenRequest request = Org.Mockito.Mockito.Mock <GetDelegationTokenRequest
                                                                          >();

            Org.Mockito.Mockito.When(request.GetRenewer()).ThenReturn("renewer1");
            GetDelegationTokenResponse response = rm1.GetClientRMService().GetDelegationToken
                                                      (request);

            Org.Apache.Hadoop.Yarn.Api.Records.Token delegationToken = response.GetRMDelegationToken
                                                                           ();
            Org.Apache.Hadoop.Security.Token.Token <RMDelegationTokenIdentifier> token1 = ConverterUtils
                                                                                          .ConvertFromYarn(delegationToken, (Text)null);
            RMDelegationTokenIdentifier dtId1 = token1.DecodeIdentifier();

            // For all keys that still remain in memory, we should have them stored
            // in state-store also.
            while (((TestRMDelegationTokens.TestRMDelegationTokenSecretManager)dtSecretManager
                    ).numUpdatedKeys.Get() < 3)
            {
                ((TestRMDelegationTokens.TestRMDelegationTokenSecretManager)dtSecretManager).CheckCurrentKeyInStateStore
                    (rmDTMasterKeyState);
                Sharpen.Thread.Sleep(100);
            }
            // wait for token to expire and remove from state-store
            // rollMasterKey is called every 1 second.
            int count = 0;

            while (rmDTState.Contains(dtId1) && count < 100)
            {
                Sharpen.Thread.Sleep(100);
                count++;
            }
            rm1.Stop();
        }
Beispiel #3
0
        // Test RM restarts after AM container is preempted, new RM should not count
        // AM preemption failure towards the max-retry-account and should be able to
        // re-launch the AM.
        /// <exception cref="System.Exception"/>
        public virtual void TestPreemptedAMRestartOnRMRestart()
        {
            YarnConfiguration conf = new YarnConfiguration();

            conf.SetClass(YarnConfiguration.RmScheduler, typeof(CapacityScheduler), typeof(ResourceScheduler
                                                                                           ));
            conf.SetBoolean(YarnConfiguration.RecoveryEnabled, true);
            conf.SetBoolean(YarnConfiguration.RmWorkPreservingRecoveryEnabled, false);
            conf.Set(YarnConfiguration.RmStore, typeof(MemoryRMStateStore).FullName);
            // explicitly set max-am-retry count as 1.
            conf.SetInt(YarnConfiguration.RmAmMaxAttempts, 1);
            MemoryRMStateStore memStore = new MemoryRMStateStore();

            memStore.Init(conf);
            MockRM rm1 = new MockRM(conf, memStore);

            rm1.Start();
            MockNM nm1 = new MockNM("127.0.0.1:1234", 8000, rm1.GetResourceTrackerService());

            nm1.RegisterNode();
            RMApp             app1        = rm1.SubmitApp(200);
            RMAppAttempt      attempt1    = app1.GetCurrentAppAttempt();
            MockAM            am1         = MockRM.LaunchAndRegisterAM(app1, rm1, nm1);
            CapacityScheduler scheduler   = (CapacityScheduler)rm1.GetResourceScheduler();
            ContainerId       amContainer = ContainerId.NewContainerId(am1.GetApplicationAttemptId(
                                                                           ), 1);

            // Forcibly preempt the am container;
            scheduler.KillContainer(scheduler.GetRMContainer(amContainer));
            am1.WaitForState(RMAppAttemptState.Failed);
            NUnit.Framework.Assert.IsTrue(!attempt1.ShouldCountTowardsMaxAttemptRetry());
            rm1.WaitForState(app1.GetApplicationId(), RMAppState.Accepted);
            // state store has 1 attempt stored.
            ApplicationStateData appState = memStore.GetState().GetApplicationState()[app1.GetApplicationId
                                                                                          ()];

            NUnit.Framework.Assert.AreEqual(1, appState.GetAttemptCount());
            // attempt stored has the preempted container exit status.
            NUnit.Framework.Assert.AreEqual(ContainerExitStatus.Preempted, appState.GetAttempt
                                                (am1.GetApplicationAttemptId()).GetAMContainerExitStatus());
            // Restart rm.
            MockRM rm2 = new MockRM(conf, memStore);

            nm1.SetResourceTrackerService(rm2.GetResourceTrackerService());
            nm1.RegisterNode();
            rm2.Start();
            // Restarted RM should re-launch the am.
            MockAM am2 = rm2.WaitForNewAMToLaunchAndRegister(app1.GetApplicationId(), 2, nm1);

            MockRM.FinishAMAndVerifyAppState(app1, rm2, nm1, am2);
            RMAppAttempt attempt2 = rm2.GetRMContext().GetRMApps()[app1.GetApplicationId()].GetCurrentAppAttempt
                                        ();

            NUnit.Framework.Assert.IsTrue(attempt2.ShouldCountTowardsMaxAttemptRetry());
            NUnit.Framework.Assert.AreEqual(ContainerExitStatus.Invalid, appState.GetAttempt(
                                                am2.GetApplicationAttemptId()).GetAMContainerExitStatus());
            rm1.Stop();
            rm2.Stop();
        }
        // Test all expired keys are removed from state-store.
        /// <exception cref="System.Exception"/>
        public virtual void TestRemoveExpiredMasterKeyInRMStateStore()
        {
            MemoryRMStateStore memStore = new MemoryRMStateStore();

            memStore.Init(conf);
            RMStateStore.RMState        rmState            = memStore.GetState();
            ICollection <DelegationKey> rmDTMasterKeyState = rmState.GetRMDTSecretManagerState
                                                                 ().GetMasterKeyState();
            MockRM rm1 = new TestRMDelegationTokens.MyMockRM(this, conf, memStore);

            rm1.Start();
            RMDelegationTokenSecretManager dtSecretManager = rm1.GetRMContext().GetRMDelegationTokenSecretManager
                                                                 ();

            // assert all master keys are saved
            NUnit.Framework.Assert.AreEqual(dtSecretManager.GetAllMasterKeys(), rmDTMasterKeyState
                                            );
            ICollection <DelegationKey> expiringKeys = new HashSet <DelegationKey>();

            Sharpen.Collections.AddAll(expiringKeys, dtSecretManager.GetAllMasterKeys());
            // wait for expiringKeys to expire
            while (true)
            {
                bool allExpired = true;
                foreach (DelegationKey key in expiringKeys)
                {
                    if (rmDTMasterKeyState.Contains(key))
                    {
                        allExpired = false;
                    }
                }
                if (allExpired)
                {
                    break;
                }
                Sharpen.Thread.Sleep(500);
            }
        }
Beispiel #5
0
        // AM container preempted, nm disk failure
        // should not be counted towards AM max retry count.
        /// <exception cref="System.Exception"/>
        public virtual void TestShouldNotCountFailureToMaxAttemptRetry()
        {
            YarnConfiguration conf = new YarnConfiguration();

            conf.SetClass(YarnConfiguration.RmScheduler, typeof(CapacityScheduler), typeof(ResourceScheduler
                                                                                           ));
            // explicitly set max-am-retry count as 1.
            conf.SetInt(YarnConfiguration.RmAmMaxAttempts, 1);
            conf.SetBoolean(YarnConfiguration.RecoveryEnabled, true);
            conf.Set(YarnConfiguration.RmStore, typeof(MemoryRMStateStore).FullName);
            MemoryRMStateStore memStore = new MemoryRMStateStore();

            memStore.Init(conf);
            MockRM rm1 = new MockRM(conf, memStore);

            rm1.Start();
            MockNM nm1 = new MockNM("127.0.0.1:1234", 8000, rm1.GetResourceTrackerService());

            nm1.RegisterNode();
            RMApp             app1        = rm1.SubmitApp(200);
            RMAppAttempt      attempt1    = app1.GetCurrentAppAttempt();
            MockAM            am1         = MockRM.LaunchAndRegisterAM(app1, rm1, nm1);
            CapacityScheduler scheduler   = (CapacityScheduler)rm1.GetResourceScheduler();
            ContainerId       amContainer = ContainerId.NewContainerId(am1.GetApplicationAttemptId(
                                                                           ), 1);

            // Preempt the first attempt;
            scheduler.KillContainer(scheduler.GetRMContainer(amContainer));
            am1.WaitForState(RMAppAttemptState.Failed);
            NUnit.Framework.Assert.IsTrue(!attempt1.ShouldCountTowardsMaxAttemptRetry());
            rm1.WaitForState(app1.GetApplicationId(), RMAppState.Accepted);
            ApplicationStateData appState = memStore.GetState().GetApplicationState()[app1.GetApplicationId
                                                                                          ()];
            // AM should be restarted even though max-am-attempt is 1.
            MockAM       am2      = rm1.WaitForNewAMToLaunchAndRegister(app1.GetApplicationId(), 2, nm1);
            RMAppAttempt attempt2 = app1.GetCurrentAppAttempt();

            NUnit.Framework.Assert.IsTrue(((RMAppAttemptImpl)attempt2).MayBeLastAttempt());
            // Preempt the second attempt.
            ContainerId amContainer2 = ContainerId.NewContainerId(am2.GetApplicationAttemptId
                                                                      (), 1);

            scheduler.KillContainer(scheduler.GetRMContainer(amContainer2));
            am2.WaitForState(RMAppAttemptState.Failed);
            NUnit.Framework.Assert.IsTrue(!attempt2.ShouldCountTowardsMaxAttemptRetry());
            rm1.WaitForState(app1.GetApplicationId(), RMAppState.Accepted);
            MockAM       am3      = rm1.WaitForNewAMToLaunchAndRegister(app1.GetApplicationId(), 3, nm1);
            RMAppAttempt attempt3 = app1.GetCurrentAppAttempt();

            NUnit.Framework.Assert.IsTrue(((RMAppAttemptImpl)attempt3).MayBeLastAttempt());
            // mimic NM disk_failure
            ContainerStatus containerStatus = Org.Apache.Hadoop.Yarn.Util.Records.NewRecord <ContainerStatus
                                                                                             >();

            containerStatus.SetContainerId(attempt3.GetMasterContainer().GetId());
            containerStatus.SetDiagnostics("mimic NM disk_failure");
            containerStatus.SetState(ContainerState.Complete);
            containerStatus.SetExitStatus(ContainerExitStatus.DisksFailed);
            IDictionary <ApplicationId, IList <ContainerStatus> > conts = new Dictionary <ApplicationId
                                                                                          , IList <ContainerStatus> >();

            conts[app1.GetApplicationId()] = Sharpen.Collections.SingletonList(containerStatus
                                                                               );
            nm1.NodeHeartbeat(conts, true);
            am3.WaitForState(RMAppAttemptState.Failed);
            NUnit.Framework.Assert.IsTrue(!attempt3.ShouldCountTowardsMaxAttemptRetry());
            NUnit.Framework.Assert.AreEqual(ContainerExitStatus.DisksFailed, appState.GetAttempt
                                                (am3.GetApplicationAttemptId()).GetAMContainerExitStatus());
            rm1.WaitForState(app1.GetApplicationId(), RMAppState.Accepted);
            MockAM       am4      = rm1.WaitForNewAMToLaunchAndRegister(app1.GetApplicationId(), 4, nm1);
            RMAppAttempt attempt4 = app1.GetCurrentAppAttempt();

            NUnit.Framework.Assert.IsTrue(((RMAppAttemptImpl)attempt4).MayBeLastAttempt());
            // create second NM, and register to rm1
            MockNM nm2 = new MockNM("127.0.0.1:2234", 8000, rm1.GetResourceTrackerService());

            nm2.RegisterNode();
            // nm1 heartbeats to report unhealthy
            // This will mimic ContainerExitStatus.ABORT
            nm1.NodeHeartbeat(false);
            am4.WaitForState(RMAppAttemptState.Failed);
            NUnit.Framework.Assert.IsTrue(!attempt4.ShouldCountTowardsMaxAttemptRetry());
            NUnit.Framework.Assert.AreEqual(ContainerExitStatus.Aborted, appState.GetAttempt(
                                                am4.GetApplicationAttemptId()).GetAMContainerExitStatus());
            // launch next AM in nm2
            nm2.NodeHeartbeat(true);
            MockAM       am5      = rm1.WaitForNewAMToLaunchAndRegister(app1.GetApplicationId(), 5, nm2);
            RMAppAttempt attempt5 = app1.GetCurrentAppAttempt();

            NUnit.Framework.Assert.IsTrue(((RMAppAttemptImpl)attempt5).MayBeLastAttempt());
            // fail the AM normally
            nm2.NodeHeartbeat(am5.GetApplicationAttemptId(), 1, ContainerState.Complete);
            am5.WaitForState(RMAppAttemptState.Failed);
            NUnit.Framework.Assert.IsTrue(attempt5.ShouldCountTowardsMaxAttemptRetry());
            // AM should not be restarted.
            rm1.WaitForState(app1.GetApplicationId(), RMAppState.Failed);
            NUnit.Framework.Assert.AreEqual(5, app1.GetAppAttempts().Count);
            rm1.Stop();
        }