示例#1
0
        // Test regular RM restart/failover, new RM should not count
        // AM failure towards the max-retry-account and should be able to
        // re-launch the AM.
        /// <exception cref="System.Exception"/>
        public virtual void TestRMRestartOrFailoverNotCountedForAMFailures()
        {
            YarnConfiguration conf = new YarnConfiguration();

            conf.SetClass(YarnConfiguration.RmScheduler, typeof(CapacityScheduler), typeof(ResourceScheduler
                                                                                           ));
            conf.SetBoolean(YarnConfiguration.RecoveryEnabled, true);
            conf.SetBoolean(YarnConfiguration.RmWorkPreservingRecoveryEnabled, false);
            conf.Set(YarnConfiguration.RmStore, typeof(MemoryRMStateStore).FullName);
            // explicitly set max-am-retry count as 1.
            conf.SetInt(YarnConfiguration.RmAmMaxAttempts, 1);
            MemoryRMStateStore memStore = new MemoryRMStateStore();

            memStore.Init(conf);
            MockRM rm1 = new MockRM(conf, memStore);

            rm1.Start();
            MockNM nm1 = new MockNM("127.0.0.1:1234", 8000, rm1.GetResourceTrackerService());

            nm1.RegisterNode();
            RMApp app1 = rm1.SubmitApp(200);
            // AM should be restarted even though max-am-attempt is 1.
            MockAM       am1      = MockRM.LaunchAndRegisterAM(app1, rm1, nm1);
            RMAppAttempt attempt1 = app1.GetCurrentAppAttempt();

            NUnit.Framework.Assert.IsTrue(((RMAppAttemptImpl)attempt1).MayBeLastAttempt());
            // Restart rm.
            MockRM rm2 = new MockRM(conf, memStore);

            rm2.Start();
            ApplicationStateData appState = memStore.GetState().GetApplicationState()[app1.GetApplicationId
                                                                                          ()];

            // re-register the NM
            nm1.SetResourceTrackerService(rm2.GetResourceTrackerService());
            NMContainerStatus status = Org.Apache.Hadoop.Yarn.Util.Records.NewRecord <NMContainerStatus
                                                                                      >();

            status.SetContainerExitStatus(ContainerExitStatus.KilledByResourcemanager);
            status.SetContainerId(attempt1.GetMasterContainer().GetId());
            status.SetContainerState(ContainerState.Complete);
            status.SetDiagnostics(string.Empty);
            nm1.RegisterNode(Sharpen.Collections.SingletonList(status), null);
            rm2.WaitForState(attempt1.GetAppAttemptId(), RMAppAttemptState.Failed);
            NUnit.Framework.Assert.AreEqual(ContainerExitStatus.KilledByResourcemanager, appState
                                            .GetAttempt(am1.GetApplicationAttemptId()).GetAMContainerExitStatus());
            // Will automatically start a new AppAttempt in rm2
            rm2.WaitForState(app1.GetApplicationId(), RMAppState.Accepted);
            MockAM am2 = rm2.WaitForNewAMToLaunchAndRegister(app1.GetApplicationId(), 2, nm1);

            MockRM.FinishAMAndVerifyAppState(app1, rm2, nm1, am2);
            RMAppAttempt attempt3 = rm2.GetRMContext().GetRMApps()[app1.GetApplicationId()].GetCurrentAppAttempt
                                        ();

            NUnit.Framework.Assert.IsTrue(attempt3.ShouldCountTowardsMaxAttemptRetry());
            NUnit.Framework.Assert.AreEqual(ContainerExitStatus.Invalid, appState.GetAttempt(
                                                am2.GetApplicationAttemptId()).GetAMContainerExitStatus());
            rm1.Stop();
            rm2.Stop();
        }
示例#2
0
        // Test RM restarts after AM container is preempted, new RM should not count
        // AM preemption failure towards the max-retry-account and should be able to
        // re-launch the AM.
        /// <exception cref="System.Exception"/>
        public virtual void TestPreemptedAMRestartOnRMRestart()
        {
            YarnConfiguration conf = new YarnConfiguration();

            conf.SetClass(YarnConfiguration.RmScheduler, typeof(CapacityScheduler), typeof(ResourceScheduler
                                                                                           ));
            conf.SetBoolean(YarnConfiguration.RecoveryEnabled, true);
            conf.SetBoolean(YarnConfiguration.RmWorkPreservingRecoveryEnabled, false);
            conf.Set(YarnConfiguration.RmStore, typeof(MemoryRMStateStore).FullName);
            // explicitly set max-am-retry count as 1.
            conf.SetInt(YarnConfiguration.RmAmMaxAttempts, 1);
            MemoryRMStateStore memStore = new MemoryRMStateStore();

            memStore.Init(conf);
            MockRM rm1 = new MockRM(conf, memStore);

            rm1.Start();
            MockNM nm1 = new MockNM("127.0.0.1:1234", 8000, rm1.GetResourceTrackerService());

            nm1.RegisterNode();
            RMApp             app1        = rm1.SubmitApp(200);
            RMAppAttempt      attempt1    = app1.GetCurrentAppAttempt();
            MockAM            am1         = MockRM.LaunchAndRegisterAM(app1, rm1, nm1);
            CapacityScheduler scheduler   = (CapacityScheduler)rm1.GetResourceScheduler();
            ContainerId       amContainer = ContainerId.NewContainerId(am1.GetApplicationAttemptId(
                                                                           ), 1);

            // Forcibly preempt the am container;
            scheduler.KillContainer(scheduler.GetRMContainer(amContainer));
            am1.WaitForState(RMAppAttemptState.Failed);
            NUnit.Framework.Assert.IsTrue(!attempt1.ShouldCountTowardsMaxAttemptRetry());
            rm1.WaitForState(app1.GetApplicationId(), RMAppState.Accepted);
            // state store has 1 attempt stored.
            ApplicationStateData appState = memStore.GetState().GetApplicationState()[app1.GetApplicationId
                                                                                          ()];

            NUnit.Framework.Assert.AreEqual(1, appState.GetAttemptCount());
            // attempt stored has the preempted container exit status.
            NUnit.Framework.Assert.AreEqual(ContainerExitStatus.Preempted, appState.GetAttempt
                                                (am1.GetApplicationAttemptId()).GetAMContainerExitStatus());
            // Restart rm.
            MockRM rm2 = new MockRM(conf, memStore);

            nm1.SetResourceTrackerService(rm2.GetResourceTrackerService());
            nm1.RegisterNode();
            rm2.Start();
            // Restarted RM should re-launch the am.
            MockAM am2 = rm2.WaitForNewAMToLaunchAndRegister(app1.GetApplicationId(), 2, nm1);

            MockRM.FinishAMAndVerifyAppState(app1, rm2, nm1, am2);
            RMAppAttempt attempt2 = rm2.GetRMContext().GetRMApps()[app1.GetApplicationId()].GetCurrentAppAttempt
                                        ();

            NUnit.Framework.Assert.IsTrue(attempt2.ShouldCountTowardsMaxAttemptRetry());
            NUnit.Framework.Assert.AreEqual(ContainerExitStatus.Invalid, appState.GetAttempt(
                                                am2.GetApplicationAttemptId()).GetAMContainerExitStatus());
            rm1.Stop();
            rm2.Stop();
        }
示例#3
0
        /// <exception cref="System.Exception"/>
        public virtual void TestRMAppAttemptFailuresValidityInterval()
        {
            YarnConfiguration conf = new YarnConfiguration();

            conf.SetClass(YarnConfiguration.RmScheduler, typeof(CapacityScheduler), typeof(ResourceScheduler
                                                                                           ));
            conf.SetBoolean(YarnConfiguration.RecoveryEnabled, true);
            conf.SetBoolean(YarnConfiguration.RmWorkPreservingRecoveryEnabled, false);
            conf.Set(YarnConfiguration.RmStore, typeof(MemoryRMStateStore).FullName);
            // explicitly set max-am-retry count as 2.
            conf.SetInt(YarnConfiguration.RmAmMaxAttempts, 2);
            MemoryRMStateStore memStore = new MemoryRMStateStore();

            memStore.Init(conf);
            MockRM rm1 = new MockRM(conf, memStore);

            rm1.Start();
            MockNM nm1 = new MockNM("127.0.0.1:1234", 8000, rm1.GetResourceTrackerService());

            nm1.RegisterNode();
            // set window size to a larger number : 20s
            // we will verify the app should be failed if
            // two continuous attempts failed in 20s.
            RMApp  app = rm1.SubmitApp(200, 20000);
            MockAM am  = MockRM.LaunchAM(app, rm1, nm1);

            // Fail current attempt normally
            nm1.NodeHeartbeat(am.GetApplicationAttemptId(), 1, ContainerState.Complete);
            am.WaitForState(RMAppAttemptState.Failed);
            // launch the second attempt
            rm1.WaitForState(app.GetApplicationId(), RMAppState.Accepted);
            NUnit.Framework.Assert.AreEqual(2, app.GetAppAttempts().Count);
            NUnit.Framework.Assert.IsTrue(((RMAppAttemptImpl)app.GetCurrentAppAttempt()).MayBeLastAttempt
                                              ());
            MockAM am_2 = MockRM.LaunchAndRegisterAM(app, rm1, nm1);

            am_2.WaitForState(RMAppAttemptState.Running);
            nm1.NodeHeartbeat(am_2.GetApplicationAttemptId(), 1, ContainerState.Complete);
            am_2.WaitForState(RMAppAttemptState.Failed);
            // current app should be failed.
            rm1.WaitForState(app.GetApplicationId(), RMAppState.Failed);
            ControlledClock clock = new ControlledClock(new SystemClock());
            // set window size to 6s
            RMAppImpl app1 = (RMAppImpl)rm1.SubmitApp(200, 6000);

            app1.SetSystemClock(clock);
            MockAM am1 = MockRM.LaunchAndRegisterAM(app1, rm1, nm1);

            // Fail attempt1 normally
            nm1.NodeHeartbeat(am1.GetApplicationAttemptId(), 1, ContainerState.Complete);
            am1.WaitForState(RMAppAttemptState.Failed);
            // launch the second attempt
            rm1.WaitForState(app1.GetApplicationId(), RMAppState.Accepted);
            NUnit.Framework.Assert.AreEqual(2, app1.GetAppAttempts().Count);
            RMAppAttempt attempt2 = app1.GetCurrentAppAttempt();

            NUnit.Framework.Assert.IsTrue(((RMAppAttemptImpl)attempt2).MayBeLastAttempt());
            MockAM am2 = MockRM.LaunchAndRegisterAM(app1, rm1, nm1);

            am2.WaitForState(RMAppAttemptState.Running);
            // wait for 6 seconds
            clock.SetTime(Runtime.CurrentTimeMillis() + 6 * 1000);
            // Fail attempt2 normally
            nm1.NodeHeartbeat(am2.GetApplicationAttemptId(), 1, ContainerState.Complete);
            am2.WaitForState(RMAppAttemptState.Failed);
            // can launch the third attempt successfully
            rm1.WaitForState(app1.GetApplicationId(), RMAppState.Accepted);
            NUnit.Framework.Assert.AreEqual(3, app1.GetAppAttempts().Count);
            RMAppAttempt attempt3 = app1.GetCurrentAppAttempt();

            clock.Reset();
            MockAM am3 = MockRM.LaunchAndRegisterAM(app1, rm1, nm1);

            am3.WaitForState(RMAppAttemptState.Running);
            // Restart rm.
            MockRM rm2 = new MockRM(conf, memStore);

            rm2.Start();
            // re-register the NM
            nm1.SetResourceTrackerService(rm2.GetResourceTrackerService());
            NMContainerStatus status = Org.Apache.Hadoop.Yarn.Util.Records.NewRecord <NMContainerStatus
                                                                                      >();

            status.SetContainerExitStatus(ContainerExitStatus.KilledByResourcemanager);
            status.SetContainerId(attempt3.GetMasterContainer().GetId());
            status.SetContainerState(ContainerState.Complete);
            status.SetDiagnostics(string.Empty);
            nm1.RegisterNode(Sharpen.Collections.SingletonList(status), null);
            rm2.WaitForState(attempt3.GetAppAttemptId(), RMAppAttemptState.Failed);
            rm2.WaitForState(app1.GetApplicationId(), RMAppState.Accepted);
            // Lauch Attempt 4
            MockAM am4 = rm2.WaitForNewAMToLaunchAndRegister(app1.GetApplicationId(), 4, nm1);

            // wait for 6 seconds
            clock.SetTime(Runtime.CurrentTimeMillis() + 6 * 1000);
            // Fail attempt4 normally
            nm1.NodeHeartbeat(am4.GetApplicationAttemptId(), 1, ContainerState.Complete);
            am4.WaitForState(RMAppAttemptState.Failed);
            // can launch the 5th attempt successfully
            rm2.WaitForState(app1.GetApplicationId(), RMAppState.Accepted);
            MockAM am5 = rm2.WaitForNewAMToLaunchAndRegister(app1.GetApplicationId(), 5, nm1);

            clock.Reset();
            am5.WaitForState(RMAppAttemptState.Running);
            // Fail attempt5 normally
            nm1.NodeHeartbeat(am5.GetApplicationAttemptId(), 1, ContainerState.Complete);
            am5.WaitForState(RMAppAttemptState.Failed);
            rm2.WaitForState(app1.GetApplicationId(), RMAppState.Failed);
            rm1.Stop();
            rm2.Stop();
        }
示例#4
0
        // AM container preempted, nm disk failure
        // should not be counted towards AM max retry count.
        /// <exception cref="System.Exception"/>
        public virtual void TestShouldNotCountFailureToMaxAttemptRetry()
        {
            YarnConfiguration conf = new YarnConfiguration();

            conf.SetClass(YarnConfiguration.RmScheduler, typeof(CapacityScheduler), typeof(ResourceScheduler
                                                                                           ));
            // explicitly set max-am-retry count as 1.
            conf.SetInt(YarnConfiguration.RmAmMaxAttempts, 1);
            conf.SetBoolean(YarnConfiguration.RecoveryEnabled, true);
            conf.Set(YarnConfiguration.RmStore, typeof(MemoryRMStateStore).FullName);
            MemoryRMStateStore memStore = new MemoryRMStateStore();

            memStore.Init(conf);
            MockRM rm1 = new MockRM(conf, memStore);

            rm1.Start();
            MockNM nm1 = new MockNM("127.0.0.1:1234", 8000, rm1.GetResourceTrackerService());

            nm1.RegisterNode();
            RMApp             app1        = rm1.SubmitApp(200);
            RMAppAttempt      attempt1    = app1.GetCurrentAppAttempt();
            MockAM            am1         = MockRM.LaunchAndRegisterAM(app1, rm1, nm1);
            CapacityScheduler scheduler   = (CapacityScheduler)rm1.GetResourceScheduler();
            ContainerId       amContainer = ContainerId.NewContainerId(am1.GetApplicationAttemptId(
                                                                           ), 1);

            // Preempt the first attempt;
            scheduler.KillContainer(scheduler.GetRMContainer(amContainer));
            am1.WaitForState(RMAppAttemptState.Failed);
            NUnit.Framework.Assert.IsTrue(!attempt1.ShouldCountTowardsMaxAttemptRetry());
            rm1.WaitForState(app1.GetApplicationId(), RMAppState.Accepted);
            ApplicationStateData appState = memStore.GetState().GetApplicationState()[app1.GetApplicationId
                                                                                          ()];
            // AM should be restarted even though max-am-attempt is 1.
            MockAM       am2      = rm1.WaitForNewAMToLaunchAndRegister(app1.GetApplicationId(), 2, nm1);
            RMAppAttempt attempt2 = app1.GetCurrentAppAttempt();

            NUnit.Framework.Assert.IsTrue(((RMAppAttemptImpl)attempt2).MayBeLastAttempt());
            // Preempt the second attempt.
            ContainerId amContainer2 = ContainerId.NewContainerId(am2.GetApplicationAttemptId
                                                                      (), 1);

            scheduler.KillContainer(scheduler.GetRMContainer(amContainer2));
            am2.WaitForState(RMAppAttemptState.Failed);
            NUnit.Framework.Assert.IsTrue(!attempt2.ShouldCountTowardsMaxAttemptRetry());
            rm1.WaitForState(app1.GetApplicationId(), RMAppState.Accepted);
            MockAM       am3      = rm1.WaitForNewAMToLaunchAndRegister(app1.GetApplicationId(), 3, nm1);
            RMAppAttempt attempt3 = app1.GetCurrentAppAttempt();

            NUnit.Framework.Assert.IsTrue(((RMAppAttemptImpl)attempt3).MayBeLastAttempt());
            // mimic NM disk_failure
            ContainerStatus containerStatus = Org.Apache.Hadoop.Yarn.Util.Records.NewRecord <ContainerStatus
                                                                                             >();

            containerStatus.SetContainerId(attempt3.GetMasterContainer().GetId());
            containerStatus.SetDiagnostics("mimic NM disk_failure");
            containerStatus.SetState(ContainerState.Complete);
            containerStatus.SetExitStatus(ContainerExitStatus.DisksFailed);
            IDictionary <ApplicationId, IList <ContainerStatus> > conts = new Dictionary <ApplicationId
                                                                                          , IList <ContainerStatus> >();

            conts[app1.GetApplicationId()] = Sharpen.Collections.SingletonList(containerStatus
                                                                               );
            nm1.NodeHeartbeat(conts, true);
            am3.WaitForState(RMAppAttemptState.Failed);
            NUnit.Framework.Assert.IsTrue(!attempt3.ShouldCountTowardsMaxAttemptRetry());
            NUnit.Framework.Assert.AreEqual(ContainerExitStatus.DisksFailed, appState.GetAttempt
                                                (am3.GetApplicationAttemptId()).GetAMContainerExitStatus());
            rm1.WaitForState(app1.GetApplicationId(), RMAppState.Accepted);
            MockAM       am4      = rm1.WaitForNewAMToLaunchAndRegister(app1.GetApplicationId(), 4, nm1);
            RMAppAttempt attempt4 = app1.GetCurrentAppAttempt();

            NUnit.Framework.Assert.IsTrue(((RMAppAttemptImpl)attempt4).MayBeLastAttempt());
            // create second NM, and register to rm1
            MockNM nm2 = new MockNM("127.0.0.1:2234", 8000, rm1.GetResourceTrackerService());

            nm2.RegisterNode();
            // nm1 heartbeats to report unhealthy
            // This will mimic ContainerExitStatus.ABORT
            nm1.NodeHeartbeat(false);
            am4.WaitForState(RMAppAttemptState.Failed);
            NUnit.Framework.Assert.IsTrue(!attempt4.ShouldCountTowardsMaxAttemptRetry());
            NUnit.Framework.Assert.AreEqual(ContainerExitStatus.Aborted, appState.GetAttempt(
                                                am4.GetApplicationAttemptId()).GetAMContainerExitStatus());
            // launch next AM in nm2
            nm2.NodeHeartbeat(true);
            MockAM       am5      = rm1.WaitForNewAMToLaunchAndRegister(app1.GetApplicationId(), 5, nm2);
            RMAppAttempt attempt5 = app1.GetCurrentAppAttempt();

            NUnit.Framework.Assert.IsTrue(((RMAppAttemptImpl)attempt5).MayBeLastAttempt());
            // fail the AM normally
            nm2.NodeHeartbeat(am5.GetApplicationAttemptId(), 1, ContainerState.Complete);
            am5.WaitForState(RMAppAttemptState.Failed);
            NUnit.Framework.Assert.IsTrue(attempt5.ShouldCountTowardsMaxAttemptRetry());
            // AM should not be restarted.
            rm1.WaitForState(app1.GetApplicationId(), RMAppState.Failed);
            NUnit.Framework.Assert.AreEqual(5, app1.GetAppAttempts().Count);
            rm1.Stop();
        }
示例#5
0
        /// <exception cref="System.Exception"/>
        public virtual void TestNMTokensRebindOnAMRestart()
        {
            YarnConfiguration conf = new YarnConfiguration();

            conf.SetInt(YarnConfiguration.RmAmMaxAttempts, 3);
            MockRM rm1 = new MockRM(conf);

            rm1.Start();
            RMApp app1 = rm1.SubmitApp(200, "myname", "myuser", new Dictionary <ApplicationAccessType
                                                                                , string>(), false, "default", -1, null, "MAPREDUCE", false, true);
            MockNM nm1 = new MockNM("127.0.0.1:1234", 8000, rm1.GetResourceTrackerService());

            nm1.RegisterNode();
            MockNM nm2 = new MockNM("127.1.1.1:4321", 8000, rm1.GetResourceTrackerService());

            nm2.RegisterNode();
            MockAM            am1        = MockRM.LaunchAndRegisterAM(app1, rm1, nm1);
            IList <Container> containers = new AList <Container>();
            // nmTokens keeps track of all the nmTokens issued in the allocate call.
            IList <NMToken> expectedNMTokens = new AList <NMToken>();

            // am1 allocate 2 container on nm1.
            // first container
            while (true)
            {
                AllocateResponse response = am1.Allocate("127.0.0.1", 2000, 2, new AList <ContainerId
                                                                                          >());
                nm1.NodeHeartbeat(true);
                Sharpen.Collections.AddAll(containers, response.GetAllocatedContainers());
                Sharpen.Collections.AddAll(expectedNMTokens, response.GetNMTokens());
                if (containers.Count == 2)
                {
                    break;
                }
                Sharpen.Thread.Sleep(200);
                System.Console.Out.WriteLine("Waiting for container to be allocated.");
            }
            // launch the container-2
            nm1.NodeHeartbeat(am1.GetApplicationAttemptId(), 2, ContainerState.Running);
            ContainerId containerId2 = ContainerId.NewContainerId(am1.GetApplicationAttemptId
                                                                      (), 2);

            rm1.WaitForState(nm1, containerId2, RMContainerState.Running);
            // launch the container-3
            nm1.NodeHeartbeat(am1.GetApplicationAttemptId(), 3, ContainerState.Running);
            ContainerId containerId3 = ContainerId.NewContainerId(am1.GetApplicationAttemptId
                                                                      (), 3);

            rm1.WaitForState(nm1, containerId3, RMContainerState.Running);
            // fail am1
            nm1.NodeHeartbeat(am1.GetApplicationAttemptId(), 1, ContainerState.Complete);
            am1.WaitForState(RMAppAttemptState.Failed);
            rm1.WaitForState(app1.GetApplicationId(), RMAppState.Accepted);
            // restart the am
            MockAM am2 = MockRM.LaunchAM(app1, rm1, nm1);
            RegisterApplicationMasterResponse registerResponse = am2.RegisterAppAttempt();

            rm1.WaitForState(app1.GetApplicationId(), RMAppState.Running);
            // check am2 get the nm token from am1.
            NUnit.Framework.Assert.AreEqual(expectedNMTokens, registerResponse.GetNMTokensFromPreviousAttempts
                                                ());
            // am2 allocate 1 container on nm2
            containers = new AList <Container>();
            while (true)
            {
                AllocateResponse allocateResponse = am2.Allocate("127.1.1.1", 4000, 1, new AList <
                                                                     ContainerId>());
                nm2.NodeHeartbeat(true);
                Sharpen.Collections.AddAll(containers, allocateResponse.GetAllocatedContainers());
                Sharpen.Collections.AddAll(expectedNMTokens, allocateResponse.GetNMTokens());
                if (containers.Count == 1)
                {
                    break;
                }
                Sharpen.Thread.Sleep(200);
                System.Console.Out.WriteLine("Waiting for container to be allocated.");
            }
            nm1.NodeHeartbeat(am2.GetApplicationAttemptId(), 2, ContainerState.Running);
            ContainerId am2ContainerId2 = ContainerId.NewContainerId(am2.GetApplicationAttemptId
                                                                         (), 2);

            rm1.WaitForState(nm1, am2ContainerId2, RMContainerState.Running);
            // fail am2.
            nm1.NodeHeartbeat(am2.GetApplicationAttemptId(), 1, ContainerState.Complete);
            am2.WaitForState(RMAppAttemptState.Failed);
            rm1.WaitForState(app1.GetApplicationId(), RMAppState.Accepted);
            // restart am
            MockAM am3 = MockRM.LaunchAM(app1, rm1, nm1);

            registerResponse = am3.RegisterAppAttempt();
            rm1.WaitForState(app1.GetApplicationId(), RMAppState.Running);
            // check am3 get the NM token from both am1 and am2;
            IList <NMToken> transferredTokens = registerResponse.GetNMTokensFromPreviousAttempts
                                                    ();

            NUnit.Framework.Assert.AreEqual(2, transferredTokens.Count);
            NUnit.Framework.Assert.IsTrue(transferredTokens.ContainsAll(expectedNMTokens));
            rm1.Stop();
        }
示例#6
0
        /// <exception cref="System.Exception"/>
        public virtual void TestAMRestartWithExistingContainers()
        {
            YarnConfiguration conf = new YarnConfiguration();

            conf.SetInt(YarnConfiguration.RmAmMaxAttempts, 2);
            MockRM rm1 = new MockRM(conf);

            rm1.Start();
            RMApp app1 = rm1.SubmitApp(200, "name", "user", new Dictionary <ApplicationAccessType
                                                                            , string>(), false, "default", -1, null, "MAPREDUCE", false, true);
            MockNM nm1 = new MockNM("127.0.0.1:1234", 10240, rm1.GetResourceTrackerService());

            nm1.RegisterNode();
            MockNM nm2 = new MockNM("127.0.0.1:2351", 4089, rm1.GetResourceTrackerService());

            nm2.RegisterNode();
            MockAM am1           = MockRM.LaunchAndRegisterAM(app1, rm1, nm1);
            int    NumContainers = 3;

            // allocate NUM_CONTAINERS containers
            am1.Allocate("127.0.0.1", 1024, NumContainers, new AList <ContainerId>());
            nm1.NodeHeartbeat(true);
            // wait for containers to be allocated.
            IList <Container> containers = am1.Allocate(new AList <ResourceRequest>(), new AList
                                                        <ContainerId>()).GetAllocatedContainers();

            while (containers.Count != NumContainers)
            {
                nm1.NodeHeartbeat(true);
                Sharpen.Collections.AddAll(containers, am1.Allocate(new AList <ResourceRequest>(),
                                                                    new AList <ContainerId>()).GetAllocatedContainers());
                Sharpen.Thread.Sleep(200);
            }
            // launch the 2nd container, for testing running container transferred.
            nm1.NodeHeartbeat(am1.GetApplicationAttemptId(), 2, ContainerState.Running);
            ContainerId containerId2 = ContainerId.NewContainerId(am1.GetApplicationAttemptId
                                                                      (), 2);

            rm1.WaitForState(nm1, containerId2, RMContainerState.Running);
            // launch the 3rd container, for testing container allocated by previous
            // attempt is completed by the next new attempt/
            nm1.NodeHeartbeat(am1.GetApplicationAttemptId(), 3, ContainerState.Running);
            ContainerId containerId3 = ContainerId.NewContainerId(am1.GetApplicationAttemptId
                                                                      (), 3);

            rm1.WaitForState(nm1, containerId3, RMContainerState.Running);
            // 4th container still in AQUIRED state. for testing Acquired container is
            // always killed.
            ContainerId containerId4 = ContainerId.NewContainerId(am1.GetApplicationAttemptId
                                                                      (), 4);

            rm1.WaitForState(nm1, containerId4, RMContainerState.Acquired);
            // 5th container is in Allocated state. for testing allocated container is
            // always killed.
            am1.Allocate("127.0.0.1", 1024, 1, new AList <ContainerId>());
            nm1.NodeHeartbeat(true);
            ContainerId containerId5 = ContainerId.NewContainerId(am1.GetApplicationAttemptId
                                                                      (), 5);

            rm1.WaitForContainerAllocated(nm1, containerId5);
            rm1.WaitForState(nm1, containerId5, RMContainerState.Allocated);
            // 6th container is in Reserved state.
            am1.Allocate("127.0.0.1", 6000, 1, new AList <ContainerId>());
            ContainerId containerId6 = ContainerId.NewContainerId(am1.GetApplicationAttemptId
                                                                      (), 6);

            nm1.NodeHeartbeat(true);
            SchedulerApplicationAttempt schedulerAttempt = ((AbstractYarnScheduler)rm1.GetResourceScheduler
                                                                ()).GetCurrentAttemptForContainer(containerId6);

            while (schedulerAttempt.GetReservedContainers().IsEmpty())
            {
                System.Console.Out.WriteLine("Waiting for container " + containerId6 + " to be reserved."
                                             );
                nm1.NodeHeartbeat(true);
                Sharpen.Thread.Sleep(200);
            }
            // assert containerId6 is reserved.
            NUnit.Framework.Assert.AreEqual(containerId6, schedulerAttempt.GetReservedContainers
                                                ()[0].GetContainerId());
            // fail the AM by sending CONTAINER_FINISHED event without registering.
            nm1.NodeHeartbeat(am1.GetApplicationAttemptId(), 1, ContainerState.Complete);
            am1.WaitForState(RMAppAttemptState.Failed);
            // wait for some time. previous AM's running containers should still remain
            // in scheduler even though am failed
            Sharpen.Thread.Sleep(3000);
            rm1.WaitForState(nm1, containerId2, RMContainerState.Running);
            // acquired/allocated containers are cleaned up.
            NUnit.Framework.Assert.IsNull(rm1.GetResourceScheduler().GetRMContainer(containerId4
                                                                                    ));
            NUnit.Framework.Assert.IsNull(rm1.GetResourceScheduler().GetRMContainer(containerId5
                                                                                    ));
            // wait for app to start a new attempt.
            rm1.WaitForState(app1.GetApplicationId(), RMAppState.Accepted);
            // assert this is a new AM.
            ApplicationAttemptId newAttemptId = app1.GetCurrentAppAttempt().GetAppAttemptId();

            NUnit.Framework.Assert.IsFalse(newAttemptId.Equals(am1.GetApplicationAttemptId())
                                           );
            // launch the new AM
            RMAppAttempt attempt2 = app1.GetCurrentAppAttempt();

            nm1.NodeHeartbeat(true);
            MockAM am2 = rm1.SendAMLaunched(attempt2.GetAppAttemptId());
            RegisterApplicationMasterResponse registerResponse = am2.RegisterAppAttempt();

            // Assert two containers are running: container2 and container3;
            NUnit.Framework.Assert.AreEqual(2, registerResponse.GetContainersFromPreviousAttempts
                                                ().Count);
            bool containerId2Exists = false;
            bool containerId3Exists = false;

            foreach (Container container in registerResponse.GetContainersFromPreviousAttempts
                         ())
            {
                if (container.GetId().Equals(containerId2))
                {
                    containerId2Exists = true;
                }
                if (container.GetId().Equals(containerId3))
                {
                    containerId3Exists = true;
                }
            }
            NUnit.Framework.Assert.IsTrue(containerId2Exists && containerId3Exists);
            rm1.WaitForState(app1.GetApplicationId(), RMAppState.Running);
            // complete container by sending the container complete event which has earlier
            // attempt's attemptId
            nm1.NodeHeartbeat(am1.GetApplicationAttemptId(), 3, ContainerState.Complete);
            // Even though the completed container containerId3 event was sent to the
            // earlier failed attempt, new RMAppAttempt can also capture this container
            // info.
            // completed containerId4 is also transferred to the new attempt.
            RMAppAttempt newAttempt = app1.GetRMAppAttempt(am2.GetApplicationAttemptId());

            // 4 containers finished, acquired/allocated/reserved/completed.
            WaitForContainersToFinish(4, newAttempt);
            bool container3Exists = false;
            bool container4Exists = false;
            bool container5Exists = false;
            bool container6Exists = false;

            foreach (ContainerStatus status in newAttempt.GetJustFinishedContainers())
            {
                if (status.GetContainerId().Equals(containerId3))
                {
                    // containerId3 is the container ran by previous attempt but finished by the
                    // new attempt.
                    container3Exists = true;
                }
                if (status.GetContainerId().Equals(containerId4))
                {
                    // containerId4 is the Acquired Container killed by the previous attempt,
                    // it's now inside new attempt's finished container list.
                    container4Exists = true;
                }
                if (status.GetContainerId().Equals(containerId5))
                {
                    // containerId5 is the Allocated container killed by previous failed attempt.
                    container5Exists = true;
                }
                if (status.GetContainerId().Equals(containerId6))
                {
                    // containerId6 is the reserved container killed by previous failed attempt.
                    container6Exists = true;
                }
            }
            NUnit.Framework.Assert.IsTrue(container3Exists && container4Exists && container5Exists &&
                                          container6Exists);
            // New SchedulerApplicationAttempt also has the containers info.
            rm1.WaitForState(nm1, containerId2, RMContainerState.Running);
            // record the scheduler attempt for testing.
            SchedulerApplicationAttempt schedulerNewAttempt = ((AbstractYarnScheduler)rm1.GetResourceScheduler
                                                                   ()).GetCurrentAttemptForContainer(containerId2);

            // finish this application
            MockRM.FinishAMAndVerifyAppState(app1, rm1, nm1, am2);
            // the 2nd attempt released the 1st attempt's running container, when the
            // 2nd attempt finishes.
            NUnit.Framework.Assert.IsFalse(schedulerNewAttempt.GetLiveContainers().Contains(containerId2
                                                                                            ));
            // all 4 normal containers finished.
            System.Console.Out.WriteLine("New attempt's just finished containers: " + newAttempt
                                         .GetJustFinishedContainers());
            WaitForContainersToFinish(5, newAttempt);
            rm1.Stop();
        }
示例#7
0
        /// <exception cref="System.Exception"/>
        public virtual void TestResourceRequestRestoreWhenRMContainerIsAtAllocated()
        {
            ConfigureScheduler();
            YarnConfiguration conf = GetConf();
            MockRM            rm1  = new MockRM(conf);

            try
            {
                rm1.Start();
                RMApp app1 = rm1.SubmitApp(200, "name", "user", new Dictionary <ApplicationAccessType
                                                                                , string>(), false, "default", -1, null, "Test", false, true);
                MockNM nm1 = new MockNM("127.0.0.1:1234", 10240, rm1.GetResourceTrackerService());
                nm1.RegisterNode();
                MockNM nm2 = new MockNM("127.0.0.1:2351", 10240, rm1.GetResourceTrackerService());
                nm2.RegisterNode();
                MockAM am1           = MockRM.LaunchAndRegisterAM(app1, rm1, nm1);
                int    NumContainers = 1;
                // allocate NUM_CONTAINERS containers
                am1.Allocate("127.0.0.1", 1024, NumContainers, new AList <ContainerId>());
                nm1.NodeHeartbeat(true);
                // wait for containers to be allocated.
                IList <Container> containers = am1.Allocate(new AList <ResourceRequest>(), new AList
                                                            <ContainerId>()).GetAllocatedContainers();
                while (containers.Count != NumContainers)
                {
                    nm1.NodeHeartbeat(true);
                    Sharpen.Collections.AddAll(containers, am1.Allocate(new AList <ResourceRequest>(),
                                                                        new AList <ContainerId>()).GetAllocatedContainers());
                    Sharpen.Thread.Sleep(200);
                }
                // launch the 2nd container, for testing running container transferred.
                nm1.NodeHeartbeat(am1.GetApplicationAttemptId(), 2, ContainerState.Running);
                ContainerId containerId2 = ContainerId.NewContainerId(am1.GetApplicationAttemptId
                                                                          (), 2);
                rm1.WaitForState(nm1, containerId2, RMContainerState.Running);
                // 3rd container is in Allocated state.
                am1.Allocate("127.0.0.1", 1024, NumContainers, new AList <ContainerId>());
                nm2.NodeHeartbeat(true);
                ContainerId containerId3 = ContainerId.NewContainerId(am1.GetApplicationAttemptId
                                                                          (), 3);
                rm1.WaitForContainerAllocated(nm2, containerId3);
                rm1.WaitForState(nm2, containerId3, RMContainerState.Allocated);
                // NodeManager restart
                nm2.RegisterNode();
                // NM restart kills all allocated and running containers.
                rm1.WaitForState(nm2, containerId3, RMContainerState.Killed);
                // The killed RMContainer request should be restored. In successive
                // nodeHeartBeats AM should be able to get container allocated.
                containers = am1.Allocate(new AList <ResourceRequest>(), new AList <ContainerId>())
                             .GetAllocatedContainers();
                while (containers.Count != NumContainers)
                {
                    nm2.NodeHeartbeat(true);
                    Sharpen.Collections.AddAll(containers, am1.Allocate(new AList <ResourceRequest>(),
                                                                        new AList <ContainerId>()).GetAllocatedContainers());
                    Sharpen.Thread.Sleep(200);
                }
                nm2.NodeHeartbeat(am1.GetApplicationAttemptId(), 4, ContainerState.Running);
                ContainerId containerId4 = ContainerId.NewContainerId(am1.GetApplicationAttemptId
                                                                          (), 4);
                rm1.WaitForState(nm2, containerId4, RMContainerState.Running);
            }
            finally
            {
                rm1.Stop();
            }
        }