/// <exception cref="System.Exception"/> public virtual void TestAppCleanupWhenNMReconnects() { conf.SetInt(YarnConfiguration.RmAmMaxAttempts, 1); MemoryRMStateStore memStore = new MemoryRMStateStore(); memStore.Init(conf); // start RM MockRM rm1 = new MockRM(conf, memStore); rm1.Start(); MockNM nm1 = new MockNM("127.0.0.1:1234", 15120, rm1.GetResourceTrackerService()); nm1.RegisterNode(); // create app and launch the AM RMApp app0 = rm1.SubmitApp(200); MockAM am0 = LaunchAM(app0, rm1, nm1); nm1.NodeHeartbeat(am0.GetApplicationAttemptId(), 1, ContainerState.Complete); rm1.WaitForState(app0.GetApplicationId(), RMAppState.Failed); // wait for application cleanup message received WaitForAppCleanupMessageRecved(nm1, app0.GetApplicationId()); // reconnect NM with application still active nm1.RegisterNode(Arrays.AsList(app0.GetApplicationId())); WaitForAppCleanupMessageRecved(nm1, app0.GetApplicationId()); rm1.Stop(); }
/// <exception cref="System.Exception"/> public virtual void TestAppCleanupWhenRMRestartedBeforeAppFinished() { conf.SetInt(YarnConfiguration.RmAmMaxAttempts, 1); MemoryRMStateStore memStore = new MemoryRMStateStore(); memStore.Init(conf); // start RM MockRM rm1 = new MockRM(conf, memStore); rm1.Start(); MockNM nm1 = new MockNM("127.0.0.1:1234", 1024, rm1.GetResourceTrackerService()); nm1.RegisterNode(); MockNM nm2 = new MockNM("127.0.0.1:5678", 1024, rm1.GetResourceTrackerService()); nm2.RegisterNode(); // create app and launch the AM RMApp app0 = rm1.SubmitApp(200); MockAM am0 = LaunchAM(app0, rm1, nm1); // alloc another container on nm2 AllocateResponse allocResponse = am0.Allocate(Arrays.AsList(ResourceRequest.NewInstance (Priority.NewInstance(1), "*", Resource.NewInstance(1024, 0), 1)), null); while (null == allocResponse.GetAllocatedContainers() || allocResponse.GetAllocatedContainers ().IsEmpty()) { nm2.NodeHeartbeat(true); allocResponse = am0.Allocate(null, null); Sharpen.Thread.Sleep(1000); } // start new RM MockRM rm2 = new MockRM(conf, memStore); rm2.Start(); // nm1/nm2 register to rm2, and do a heartbeat nm1.SetResourceTrackerService(rm2.GetResourceTrackerService()); nm1.RegisterNode(Arrays.AsList(NMContainerStatus.NewInstance(ContainerId.NewContainerId (am0.GetApplicationAttemptId(), 1), ContainerState.Complete, Resource.NewInstance (1024, 1), string.Empty, 0, Priority.NewInstance(0), 1234)), Arrays.AsList(app0. GetApplicationId())); nm2.SetResourceTrackerService(rm2.GetResourceTrackerService()); nm2.RegisterNode(Arrays.AsList(app0.GetApplicationId())); // assert app state has been saved. rm2.WaitForState(app0.GetApplicationId(), RMAppState.Failed); // wait for application cleanup message received on NM1 WaitForAppCleanupMessageRecved(nm1, app0.GetApplicationId()); // wait for application cleanup message received on NM2 WaitForAppCleanupMessageRecved(nm2, app0.GetApplicationId()); rm1.Stop(); rm2.Stop(); }
/// <exception cref="System.Exception"/> public virtual void TestInvalidatedAMHostPortOnAMRestart() { MockRM rm1 = new MockRM(conf); rm1.Start(); MockNM nm1 = new MockNM("127.0.0.1:1234", 15120, rm1.GetResourceTrackerService()); nm1.RegisterNode(); // a failed app RMApp app2 = rm1.SubmitApp(200); MockAM am2 = MockRM.LaunchAndRegisterAM(app2, rm1, nm1); nm1.NodeHeartbeat(am2.GetApplicationAttemptId(), 1, ContainerState.Complete); am2.WaitForState(RMAppAttemptState.Failed); rm1.WaitForState(app2.GetApplicationId(), RMAppState.Accepted); // before new attempt is launched, the app report returns the invalid AM // host and port. GetApplicationReportRequest request1 = GetApplicationReportRequest.NewInstance(app2 .GetApplicationId()); ApplicationReport report1 = rm1.GetClientRMService().GetApplicationReport(request1 ).GetApplicationReport(); NUnit.Framework.Assert.AreEqual("N/A", report1.GetHost()); NUnit.Framework.Assert.AreEqual(-1, report1.GetRpcPort()); }
/// <exception cref="System.Exception"/> public virtual void TestKillAppWhenFailoverHappensAtNewState() { // create a customized RMAppManager // During the process of Application submission, // the RMAppState will always be NEW. // The ApplicationState will not be saved in RMStateStore. StartRMsWithCustomizedRMAppManager(); MockNM nm1 = new MockNM("127.0.0.1:1234", 15120, rm1.GetResourceTrackerService()); nm1.RegisterNode(); // Submit the application RMApp app0 = rm1.SubmitApp(200, string.Empty, UserGroupInformation.GetCurrentUser ().GetShortUserName(), null, false, null, configuration.GetInt(YarnConfiguration .RmAmMaxAttempts, YarnConfiguration.DefaultRmAmMaxAttempts), null, null, false, false); // failover and kill application // When FailOver happens, the state of this application is NEW, // and ApplicationState is not saved in RMStateStore. The active RM // can not load the ApplicationState of this application. // Expected to get ApplicationNotFoundException // when receives the KillApplicationRequest try { FailOverAndKillApp(app0.GetApplicationId(), RMAppState.New); NUnit.Framework.Assert.Fail("Should get an exception here"); } catch (ApplicationNotFoundException ex) { NUnit.Framework.Assert.IsTrue(ex.Message.Contains("Trying to kill an absent application " + app0.GetApplicationId())); } }
/// <exception cref="System.Exception"/> public virtual MockNM RegisterNode(string nodeIdStr, int memory, int vCores) { MockNM nm = new MockNM(nodeIdStr, memory, vCores, GetResourceTrackerService()); nm.RegisterNode(); return(nm); }
/// <exception cref="System.Exception"/> public virtual MockNM RegisterNode(string nodeIdStr, int memory, int vCores, IList <ApplicationId> runningApplications) { MockNM nm = new MockNM(nodeIdStr, memory, vCores, GetResourceTrackerService(), YarnVersionInfo .GetVersion()); nm.RegisterNode(runningApplications); return(nm); }
// This is to test AM Host and rpc port are invalidated after the am attempt // is killed or failed, so that client doesn't get the wrong information. /// <exception cref="System.Exception"/> public virtual void TestInvalidateAMHostPortWhenAMFailedOrKilled() { conf.SetInt(YarnConfiguration.RmAmMaxAttempts, 1); MockRM rm1 = new MockRM(conf); rm1.Start(); // a succeeded app RMApp app1 = rm1.SubmitApp(200); MockNM nm1 = new MockNM("127.0.0.1:1234", 15120, rm1.GetResourceTrackerService()); nm1.RegisterNode(); MockAM am1 = MockRM.LaunchAndRegisterAM(app1, rm1, nm1); MockRM.FinishAMAndVerifyAppState(app1, rm1, nm1, am1); // a failed app RMApp app2 = rm1.SubmitApp(200); MockAM am2 = MockRM.LaunchAndRegisterAM(app2, rm1, nm1); nm1.NodeHeartbeat(am2.GetApplicationAttemptId(), 1, ContainerState.Complete); am2.WaitForState(RMAppAttemptState.Failed); rm1.WaitForState(app2.GetApplicationId(), RMAppState.Failed); // a killed app RMApp app3 = rm1.SubmitApp(200); MockAM am3 = MockRM.LaunchAndRegisterAM(app3, rm1, nm1); rm1.KillApp(app3.GetApplicationId()); rm1.WaitForState(app3.GetApplicationId(), RMAppState.Killed); rm1.WaitForState(am3.GetApplicationAttemptId(), RMAppAttemptState.Killed); GetApplicationsRequest request1 = GetApplicationsRequest.NewInstance(EnumSet.Of(YarnApplicationState .Finished, YarnApplicationState.Killed, YarnApplicationState.Failed)); GetApplicationsResponse response1 = rm1.GetClientRMService().GetApplications(request1 ); IList <ApplicationReport> appList1 = response1.GetApplicationList(); NUnit.Framework.Assert.AreEqual(3, appList1.Count); foreach (ApplicationReport report in appList1) { // killed/failed apps host and rpc port are invalidated. if (report.GetApplicationId().Equals(app2.GetApplicationId()) || report.GetApplicationId ().Equals(app3.GetApplicationId())) { NUnit.Framework.Assert.AreEqual("N/A", report.GetHost()); NUnit.Framework.Assert.AreEqual(-1, report.GetRpcPort()); } // succeeded app's host and rpc port is not invalidated if (report.GetApplicationId().Equals(app1.GetApplicationId())) { NUnit.Framework.Assert.IsFalse(report.GetHost().Equals("N/A")); NUnit.Framework.Assert.IsTrue(report.GetRpcPort() != -1); } } }
public virtual void TestSetRMIdentifierInRegistration() { Configuration conf = new Configuration(); rm = new MockRM(conf); rm.Start(); MockNM nm = new MockNM("host1:1234", 5120, rm.GetResourceTrackerService()); RegisterNodeManagerResponse response = nm.RegisterNode(); // Verify the RMIdentifier is correctly set in RegisterNodeManagerResponse NUnit.Framework.Assert.AreEqual(ResourceManager.GetClusterTimeStamp(), response.GetRMIdentifier ()); }
/// <exception cref="System.Exception"/> public virtual void TestContainerCleanupWhenRMRestartedAppNotRegistered() { conf.SetInt(YarnConfiguration.RmAmMaxAttempts, 1); MemoryRMStateStore memStore = new MemoryRMStateStore(); memStore.Init(conf); // start RM DrainDispatcher dispatcher = new DrainDispatcher(); MockRM rm1 = new _MockRM_413(dispatcher, conf, memStore); rm1.Start(); MockNM nm1 = new MockNM("127.0.0.1:1234", 15120, rm1.GetResourceTrackerService()); nm1.RegisterNode(); // create app and launch the AM RMApp app0 = rm1.SubmitApp(200); MockAM am0 = LaunchAM(app0, rm1, nm1); nm1.NodeHeartbeat(am0.GetApplicationAttemptId(), 1, ContainerState.Running); rm1.WaitForState(app0.GetApplicationId(), RMAppState.Running); // start new RM DrainDispatcher dispatcher2 = new DrainDispatcher(); MockRM rm2 = new _MockRM_432(dispatcher2, conf, memStore); rm2.Start(); // nm1 register to rm2, and do a heartbeat nm1.SetResourceTrackerService(rm2.GetResourceTrackerService()); nm1.RegisterNode(Arrays.AsList(app0.GetApplicationId())); rm2.WaitForState(app0.GetApplicationId(), RMAppState.Accepted); // Add unknown container for application unknown to scheduler NodeHeartbeatResponse response = nm1.NodeHeartbeat(am0.GetApplicationAttemptId(), 2, ContainerState.Running); WaitForContainerCleanup(dispatcher2, nm1, response); rm1.Stop(); rm2.Stop(); }
/// <exception cref="System.Exception"/> public virtual void TestUsageWithOneAttemptAndOneContainer() { MockRM rm = new MockRM(conf); rm.Start(); MockNM nm = new MockNM("127.0.0.1:1234", 15120, rm.GetResourceTrackerService()); nm.RegisterNode(); RMApp app0 = rm.SubmitApp(200); RMAppMetrics rmAppMetrics = app0.GetRMAppMetrics(); NUnit.Framework.Assert.IsTrue("Before app submittion, memory seconds should have been 0 but was " + rmAppMetrics.GetMemorySeconds(), rmAppMetrics.GetMemorySeconds() == 0); NUnit.Framework.Assert.IsTrue("Before app submission, vcore seconds should have been 0 but was " + rmAppMetrics.GetVcoreSeconds(), rmAppMetrics.GetVcoreSeconds() == 0); RMAppAttempt attempt0 = app0.GetCurrentAppAttempt(); nm.NodeHeartbeat(true); MockAM am0 = rm.SendAMLaunched(attempt0.GetAppAttemptId()); am0.RegisterAppAttempt(); RMContainer rmContainer = rm.GetResourceScheduler().GetRMContainer(attempt0.GetMasterContainer ().GetId()); // Allow metrics to accumulate. int sleepInterval = 1000; int cumulativeSleepTime = 0; while (rmAppMetrics.GetMemorySeconds() <= 0 && cumulativeSleepTime < 5000) { Sharpen.Thread.Sleep(sleepInterval); cumulativeSleepTime += sleepInterval; } rmAppMetrics = app0.GetRMAppMetrics(); NUnit.Framework.Assert.IsTrue("While app is running, memory seconds should be >0 but is " + rmAppMetrics.GetMemorySeconds(), rmAppMetrics.GetMemorySeconds() > 0); NUnit.Framework.Assert.IsTrue("While app is running, vcore seconds should be >0 but is " + rmAppMetrics.GetVcoreSeconds(), rmAppMetrics.GetVcoreSeconds() > 0); MockRM.FinishAMAndVerifyAppState(app0, rm, nm, am0); AggregateAppResourceUsage ru = CalculateContainerResourceMetrics(rmContainer); rmAppMetrics = app0.GetRMAppMetrics(); NUnit.Framework.Assert.AreEqual("Unexcpected MemorySeconds value", ru.GetMemorySeconds (), rmAppMetrics.GetMemorySeconds()); NUnit.Framework.Assert.AreEqual("Unexpected VcoreSeconds value", ru.GetVcoreSeconds (), rmAppMetrics.GetVcoreSeconds()); rm.Stop(); }
/// <summary>Validate killing an application when it is at accepted state.</summary> /// <exception cref="System.Exception">exception</exception> public virtual void TestApplicationKillAtAcceptedState() { Dispatcher dispatcher = new _AsyncDispatcher_573(); MockRM rm = new _MockRM_596(dispatcher, conf); // test metrics QueueMetrics metrics = rm.GetResourceScheduler().GetRootQueueMetrics(); int appsKilled = metrics.GetAppsKilled(); int appsSubmitted = metrics.GetAppsSubmitted(); rm.Start(); MockNM nm1 = new MockNM("127.0.0.1:1234", 15120, rm.GetResourceTrackerService()); nm1.RegisterNode(); // a failed app RMApp application = rm.SubmitApp(200); MockAM am = MockRM.LaunchAM(application, rm, nm1); am.WaitForState(RMAppAttemptState.Launched); nm1.NodeHeartbeat(am.GetApplicationAttemptId(), 1, ContainerState.Running); rm.WaitForState(application.GetApplicationId(), RMAppState.Accepted); // Now kill the application before new attempt is launched, the app report // returns the invalid AM host and port. KillApplicationRequest request = KillApplicationRequest.NewInstance(application.GetApplicationId ()); rm.GetClientRMService().ForceKillApplication(request); // Specific test for YARN-1689 follows // Now let's say a race causes AM to register now. This should not crash RM. am.RegisterAppAttempt(false); // We explicitly intercepted the kill-event to RMAppAttempt, so app should // still be in KILLING state. rm.WaitForState(application.GetApplicationId(), RMAppState.Killing); // AM should now be in running rm.WaitForState(am.GetApplicationAttemptId(), RMAppAttemptState.Running); // Simulate that appAttempt is killed. rm.GetRMContext().GetDispatcher().GetEventHandler().Handle(new RMAppEvent(application .GetApplicationId(), RMAppEventType.AttemptKilled)); rm.WaitForState(application.GetApplicationId(), RMAppState.Killed); // test metrics metrics = rm.GetResourceScheduler().GetRootQueueMetrics(); NUnit.Framework.Assert.AreEqual(appsKilled + 1, metrics.GetAppsKilled()); NUnit.Framework.Assert.AreEqual(appsSubmitted + 1, metrics.GetAppsSubmitted()); }
/// <exception cref="System.Exception"/> public virtual void TestKillAppWhenFailoverHappensAtRunningState() { StartRMs(); MockNM nm1 = new MockNM("127.0.0.1:1234", 15120, rm1.GetResourceTrackerService()); nm1.RegisterNode(); // create app and launch the AM RMApp app0 = rm1.SubmitApp(200); MockAM am0 = LaunchAM(app0, rm1, nm1); // failover and kill application // The application is at RUNNING State when failOver happens. // Since RMStateStore has already saved ApplicationState, the active RM // will load the ApplicationState. After that, the application will be at // ACCEPTED State. Because the application is not at Final State, // KillApplicationResponse.getIsKillCompleted is expected to return false. FailOverAndKillApp(app0.GetApplicationId(), am0.GetApplicationAttemptId(), RMAppState .Running, RMAppAttemptState.Running, RMAppState.Accepted); }
// Test Kill an app while the app is failing /// <exception cref="System.Exception"/> public virtual void TestKillFailingApp() { // this dispatcher ignores RMAppAttemptEventType.KILL event Dispatcher dispatcher = new _AsyncDispatcher_708(); MockRM rm1 = new _MockRM_731(dispatcher, conf); rm1.Start(); MockNM nm1 = new MockNM("127.0.0.1:1234", 8192, rm1.GetResourceTrackerService()); nm1.RegisterNode(); RMApp app1 = rm1.SubmitApp(200); MockAM am1 = MockRM.LaunchAndRegisterAM(app1, rm1, nm1); rm1.KillApp(app1.GetApplicationId()); // fail the app by sending container_finished event. nm1.NodeHeartbeat(am1.GetApplicationAttemptId(), 1, ContainerState.Complete); rm1.WaitForState(am1.GetApplicationAttemptId(), RMAppAttemptState.Failed); // app is killed, not launching a new attempt rm1.WaitForState(app1.GetApplicationId(), RMAppState.Killed); }
/// <exception cref="System.Exception"/> public virtual void TestKillAppWhenFailOverHappensDuringApplicationKill() { // create a customized ClientRMService // When receives the killApplicationRequest, simply return the response // and make sure the application will not be KILLED State StartRMsWithCustomizedClientRMService(); MockNM nm1 = new MockNM("127.0.0.1:1234", 15120, rm1.GetResourceTrackerService()); nm1.RegisterNode(); // create app and launch the AM RMApp app0 = rm1.SubmitApp(200); MockAM am0 = LaunchAM(app0, rm1, nm1); // ensure that the app is in running state NUnit.Framework.Assert.AreEqual(app0.GetState(), RMAppState.Running); // kill the app. rm1.KillApp(app0.GetApplicationId()); // failover happens before this application goes to final state. // The RMAppState that will be loaded by the active rm // should be ACCEPTED. FailOverAndKillApp(app0.GetApplicationId(), am0.GetApplicationAttemptId(), RMAppState .Running, RMAppAttemptState.Running, RMAppState.Accepted); }
/// <exception cref="System.Exception"/> public virtual void TestKillAppWhenFailoverHappensAtFinalState() { StartRMs(); MockNM nm1 = new MockNM("127.0.0.1:1234", 15120, rm1.GetResourceTrackerService()); nm1.RegisterNode(); // create app and launch the AM RMApp app0 = rm1.SubmitApp(200); MockAM am0 = LaunchAM(app0, rm1, nm1); // kill the app. rm1.KillApp(app0.GetApplicationId()); rm1.WaitForState(app0.GetApplicationId(), RMAppState.Killed); rm1.WaitForState(am0.GetApplicationAttemptId(), RMAppAttemptState.Killed); // failover and kill application // The application is at Killed State and RMStateStore has already // saved this applicationState. After failover happens, the current // active RM will load the ApplicationState whose RMAppState is killed. // Because this application is at Final State, // KillApplicationResponse.getIsKillCompleted is expected to return true. FailOverAndKillApp(app0.GetApplicationId(), am0.GetApplicationAttemptId(), RMAppState .Killed, RMAppAttemptState.Killed, RMAppState.Killed); }
// Test Kill an app while the app is finishing in the meanwhile. /// <exception cref="System.Exception"/> public virtual void TestKillFinishingApp() { // this dispatcher ignores RMAppAttemptEventType.KILL event Dispatcher dispatcher = new _AsyncDispatcher_654(); MockRM rm1 = new _MockRM_677(dispatcher, conf); rm1.Start(); MockNM nm1 = new MockNM("127.0.0.1:1234", 8192, rm1.GetResourceTrackerService()); nm1.RegisterNode(); RMApp app1 = rm1.SubmitApp(200); MockAM am1 = MockRM.LaunchAndRegisterAM(app1, rm1, nm1); rm1.KillApp(app1.GetApplicationId()); FinishApplicationMasterRequest req = FinishApplicationMasterRequest.NewInstance(FinalApplicationStatus .Succeeded, string.Empty, string.Empty); am1.UnregisterAppAttempt(req, true); rm1.WaitForState(am1.GetApplicationAttemptId(), RMAppAttemptState.Finishing); nm1.NodeHeartbeat(am1.GetApplicationAttemptId(), 1, ContainerState.Complete); rm1.WaitForState(am1.GetApplicationAttemptId(), RMAppAttemptState.Finished); rm1.WaitForState(app1.GetApplicationId(), RMAppState.Finished); }
/// <exception cref="System.Exception"/> public virtual void TestActivatingApplicationAfterAddingNM() { MockRM rm1 = new MockRM(conf); // start like normal because state is empty rm1.Start(); // app that gets launched RMApp app1 = rm1.SubmitApp(200); // app that does not get launched RMApp app2 = rm1.SubmitApp(200); // app1 and app2 should be scheduled, but because no resource is available, // they are not activated. RMAppAttempt attempt1 = app1.GetCurrentAppAttempt(); ApplicationAttemptId attemptId1 = attempt1.GetAppAttemptId(); rm1.WaitForState(attemptId1, RMAppAttemptState.Scheduled); RMAppAttempt attempt2 = app2.GetCurrentAppAttempt(); ApplicationAttemptId attemptId2 = attempt2.GetAppAttemptId(); rm1.WaitForState(attemptId2, RMAppAttemptState.Scheduled); MockNM nm1 = new MockNM("h1:1234", 15120, rm1.GetResourceTrackerService()); MockNM nm2 = new MockNM("h2:5678", 15120, rm1.GetResourceTrackerService()); nm1.RegisterNode(); nm2.RegisterNode(); //kick the scheduling nm1.NodeHeartbeat(true); // app1 should be allocated now rm1.WaitForState(attemptId1, RMAppAttemptState.Allocated); rm1.WaitForState(attemptId2, RMAppAttemptState.Scheduled); nm2.NodeHeartbeat(true); // app2 should be allocated now rm1.WaitForState(attemptId1, RMAppAttemptState.Allocated); rm1.WaitForState(attemptId2, RMAppAttemptState.Allocated); rm1.Stop(); }
/// <exception cref="System.Exception"/> private void AmRestartTests(bool keepRunningContainers) { MockRM rm = new MockRM(conf); rm.Start(); RMApp app = rm.SubmitApp(200, "name", "user", new Dictionary <ApplicationAccessType , string>(), false, "default", -1, null, "MAPREDUCE", false, keepRunningContainers ); MockNM nm = new MockNM("127.0.0.1:1234", 10240, rm.GetResourceTrackerService()); nm.RegisterNode(); MockAM am0 = MockRM.LaunchAndRegisterAM(app, rm, nm); int NumContainers = 1; // allocate NUM_CONTAINERS containers am0.Allocate("127.0.0.1", 1024, NumContainers, new AList <ContainerId>()); nm.NodeHeartbeat(true); // wait for containers to be allocated. IList <Container> containers = am0.Allocate(new AList <ResourceRequest>(), new AList <ContainerId>()).GetAllocatedContainers(); while (containers.Count != NumContainers) { nm.NodeHeartbeat(true); Sharpen.Collections.AddAll(containers, am0.Allocate(new AList <ResourceRequest>(), new AList <ContainerId>()).GetAllocatedContainers()); Sharpen.Thread.Sleep(200); } // launch the 2nd container. ContainerId containerId2 = ContainerId.NewContainerId(am0.GetApplicationAttemptId (), 2); nm.NodeHeartbeat(am0.GetApplicationAttemptId(), containerId2.GetContainerId(), ContainerState .Running); rm.WaitForState(nm, containerId2, RMContainerState.Running); // Capture the containers here so the metrics can be calculated after the // app has completed. ICollection <RMContainer> rmContainers = rm.scheduler.GetSchedulerAppInfo(am0.GetApplicationAttemptId ()).GetLiveContainers(); // fail the first app attempt by sending CONTAINER_FINISHED event without // registering. ContainerId amContainerId = app.GetCurrentAppAttempt().GetMasterContainer().GetId (); nm.NodeHeartbeat(am0.GetApplicationAttemptId(), amContainerId.GetContainerId(), ContainerState .Complete); am0.WaitForState(RMAppAttemptState.Failed); long memorySeconds = 0; long vcoreSeconds = 0; // Calculate container usage metrics for first attempt. if (keepRunningContainers) { // Only calculate the usage for the one container that has completed. foreach (RMContainer c in rmContainers) { if (c.GetContainerId().Equals(amContainerId)) { AggregateAppResourceUsage ru = CalculateContainerResourceMetrics(c); memorySeconds += ru.GetMemorySeconds(); vcoreSeconds += ru.GetVcoreSeconds(); } else { // The remaining container should be RUNNING. NUnit.Framework.Assert.IsTrue("After first attempt failed, remaining container " + "should still be running. ", c.GetContainerState().Equals(ContainerState.Running )); } } } else { // If keepRunningContainers is false, all live containers should now // be completed. Calculate the resource usage metrics for all of them. foreach (RMContainer c in rmContainers) { AggregateAppResourceUsage ru = CalculateContainerResourceMetrics(c); memorySeconds += ru.GetMemorySeconds(); vcoreSeconds += ru.GetVcoreSeconds(); } } // wait for app to start a new attempt. rm.WaitForState(app.GetApplicationId(), RMAppState.Accepted); // assert this is a new AM. RMAppAttempt attempt2 = app.GetCurrentAppAttempt(); NUnit.Framework.Assert.IsFalse(attempt2.GetAppAttemptId().Equals(am0.GetApplicationAttemptId ())); // launch the new AM nm.NodeHeartbeat(true); MockAM am1 = rm.SendAMLaunched(attempt2.GetAppAttemptId()); am1.RegisterAppAttempt(); // allocate NUM_CONTAINERS containers am1.Allocate("127.0.0.1", 1024, NumContainers, new AList <ContainerId>()); nm.NodeHeartbeat(true); // wait for containers to be allocated. containers = am1.Allocate(new AList <ResourceRequest>(), new AList <ContainerId>()) .GetAllocatedContainers(); while (containers.Count != NumContainers) { nm.NodeHeartbeat(true); Sharpen.Collections.AddAll(containers, am1.Allocate(new AList <ResourceRequest>(), new AList <ContainerId>()).GetAllocatedContainers()); Sharpen.Thread.Sleep(200); } rm.WaitForState(app.GetApplicationId(), RMAppState.Running); // Capture running containers for later use by metrics calculations. rmContainers = rm.scheduler.GetSchedulerAppInfo(attempt2.GetAppAttemptId()).GetLiveContainers (); // complete container by sending the container complete event which has // earlier attempt's attemptId amContainerId = app.GetCurrentAppAttempt().GetMasterContainer().GetId(); nm.NodeHeartbeat(am0.GetApplicationAttemptId(), amContainerId.GetContainerId(), ContainerState .Complete); MockRM.FinishAMAndVerifyAppState(app, rm, nm, am1); // Calculate container usage metrics for second attempt. foreach (RMContainer c_1 in rmContainers) { AggregateAppResourceUsage ru = CalculateContainerResourceMetrics(c_1); memorySeconds += ru.GetMemorySeconds(); vcoreSeconds += ru.GetVcoreSeconds(); } RMAppMetrics rmAppMetrics = app.GetRMAppMetrics(); NUnit.Framework.Assert.AreEqual("Unexcpected MemorySeconds value", memorySeconds, rmAppMetrics.GetMemorySeconds()); NUnit.Framework.Assert.AreEqual("Unexpected VcoreSeconds value", vcoreSeconds, rmAppMetrics .GetVcoreSeconds()); rm.Stop(); return; }
/// <exception cref="System.Exception"/> public virtual void TestUsageWithMultipleContainersAndRMRestart() { // Set max attempts to 1 so that when the first attempt fails, the app // won't try to start a new one. conf.SetInt(YarnConfiguration.RmAmMaxAttempts, 1); conf.SetBoolean(YarnConfiguration.RecoveryEnabled, true); conf.SetBoolean(YarnConfiguration.RmWorkPreservingRecoveryEnabled, false); MemoryRMStateStore memStore = new MemoryRMStateStore(); memStore.Init(conf); MockRM rm0 = new MockRM(conf, memStore); rm0.Start(); MockNM nm = new MockNM("127.0.0.1:1234", 65536, rm0.GetResourceTrackerService()); nm.RegisterNode(); RMApp app0 = rm0.SubmitApp(200); rm0.WaitForState(app0.GetApplicationId(), RMAppState.Accepted); RMAppAttempt attempt0 = app0.GetCurrentAppAttempt(); ApplicationAttemptId attemptId0 = attempt0.GetAppAttemptId(); rm0.WaitForState(attemptId0, RMAppAttemptState.Scheduled); nm.NodeHeartbeat(true); rm0.WaitForState(attemptId0, RMAppAttemptState.Allocated); MockAM am0 = rm0.SendAMLaunched(attempt0.GetAppAttemptId()); am0.RegisterAppAttempt(); int NumContainers = 2; am0.Allocate("127.0.0.1", 1000, NumContainers, new AList <ContainerId>()); nm.NodeHeartbeat(true); IList <Container> conts = am0.Allocate(new AList <ResourceRequest>(), new AList <ContainerId >()).GetAllocatedContainers(); while (conts.Count != NumContainers) { nm.NodeHeartbeat(true); Sharpen.Collections.AddAll(conts, am0.Allocate(new AList <ResourceRequest>(), new AList <ContainerId>()).GetAllocatedContainers()); Sharpen.Thread.Sleep(500); } // launch the 2nd and 3rd containers. foreach (Container c in conts) { nm.NodeHeartbeat(attempt0.GetAppAttemptId(), c.GetId().GetContainerId(), ContainerState .Running); rm0.WaitForState(nm, c.GetId(), RMContainerState.Running); } // Get the RMContainers for all of the live containers, to be used later // for metrics calculations and comparisons. ICollection <RMContainer> rmContainers = rm0.scheduler.GetSchedulerAppInfo(attempt0 .GetAppAttemptId()).GetLiveContainers(); // Allow metrics to accumulate. int sleepInterval = 1000; int cumulativeSleepTime = 0; while (app0.GetRMAppMetrics().GetMemorySeconds() <= 0 && cumulativeSleepTime < 5000 ) { Sharpen.Thread.Sleep(sleepInterval); cumulativeSleepTime += sleepInterval; } // Stop all non-AM containers foreach (Container c_1 in conts) { if (c_1.GetId().GetContainerId() == 1) { continue; } nm.NodeHeartbeat(attempt0.GetAppAttemptId(), c_1.GetId().GetContainerId(), ContainerState .Complete); rm0.WaitForState(nm, c_1.GetId(), RMContainerState.Completed); } // After all other containers have completed, manually complete the master // container in order to trigger a save to the state store of the resource // usage metrics. This will cause the attempt to fail, and, since the max // attempt retries is 1, the app will also fail. This is intentional so // that all containers will complete prior to saving. ContainerId cId = ContainerId.NewContainerId(attempt0.GetAppAttemptId(), 1); nm.NodeHeartbeat(attempt0.GetAppAttemptId(), cId.GetContainerId(), ContainerState .Complete); rm0.WaitForState(nm, cId, RMContainerState.Completed); // Check that the container metrics match those from the app usage report. long memorySeconds = 0; long vcoreSeconds = 0; foreach (RMContainer c_2 in rmContainers) { AggregateAppResourceUsage ru = CalculateContainerResourceMetrics(c_2); memorySeconds += ru.GetMemorySeconds(); vcoreSeconds += ru.GetVcoreSeconds(); } RMAppMetrics metricsBefore = app0.GetRMAppMetrics(); NUnit.Framework.Assert.AreEqual("Unexcpected MemorySeconds value", memorySeconds, metricsBefore.GetMemorySeconds()); NUnit.Framework.Assert.AreEqual("Unexpected VcoreSeconds value", vcoreSeconds, metricsBefore .GetVcoreSeconds()); // create new RM to represent RM restart. Load up the state store. MockRM rm1 = new MockRM(conf, memStore); rm1.Start(); RMApp app0After = rm1.GetRMContext().GetRMApps()[app0.GetApplicationId()]; // Compare container resource usage metrics from before and after restart. RMAppMetrics metricsAfter = app0After.GetRMAppMetrics(); NUnit.Framework.Assert.AreEqual("Vcore seconds were not the same after RM Restart" , metricsBefore.GetVcoreSeconds(), metricsAfter.GetVcoreSeconds()); NUnit.Framework.Assert.AreEqual("Memory seconds were not the same after RM Restart" , metricsBefore.GetMemorySeconds(), metricsAfter.GetMemorySeconds()); rm0.Stop(); rm0.Close(); rm1.Stop(); rm1.Close(); }
// The test verifies processing of NMContainerStatuses which are sent during // NM registration. // 1. Start the cluster-RM,NM,Submit app with 1024MB,Launch & register AM // 2. AM sends ResourceRequest for 1 container with memory 2048MB. // 3. Verify for number of container allocated by RM // 4. Verify Memory Usage by cluster, it should be 3072. AM memory + requested // memory. 1024 + 2048=3072 // 5. Re-register NM by sending completed container status // 6. Verify for Memory Used, it should be 1024 // 7. Send AM heatbeat to RM. Allocated response should contain completed // container. /// <exception cref="System.Exception"/> public virtual void TestProcessingNMContainerStatusesOnNMRestart() { conf.SetInt(YarnConfiguration.RmAmMaxAttempts, 1); MemoryRMStateStore memStore = new MemoryRMStateStore(); memStore.Init(conf); // 1. Start the cluster-RM,NM,Submit app with 1024MB,Launch & register AM MockRM rm1 = new MockRM(conf, memStore); rm1.Start(); int nmMemory = 8192; int amMemory = 1024; int containerMemory = 2048; MockNM nm1 = new MockNM("127.0.0.1:1234", nmMemory, rm1.GetResourceTrackerService ()); nm1.RegisterNode(); RMApp app0 = rm1.SubmitApp(amMemory); MockAM am0 = MockRM.LaunchAndRegisterAM(app0, rm1, nm1); // 2. AM sends ResourceRequest for 1 container with memory 2048MB. int noOfContainers = 1; IList <Container> allocateContainers = am0.AllocateAndWaitForContainers(noOfContainers , containerMemory, nm1); // 3. Verify for number of container allocated by RM NUnit.Framework.Assert.AreEqual(noOfContainers, allocateContainers.Count); Container container = allocateContainers[0]; nm1.NodeHeartbeat(am0.GetApplicationAttemptId(), 1, ContainerState.Running); nm1.NodeHeartbeat(am0.GetApplicationAttemptId(), container.GetId().GetContainerId (), ContainerState.Running); rm1.WaitForState(app0.GetApplicationId(), RMAppState.Running); // 4. Verify Memory Usage by cluster, it should be 3072. AM memory + // requested memory. 1024 + 2048=3072 ResourceScheduler rs = rm1.GetRMContext().GetScheduler(); int allocatedMB = rs.GetRootQueueMetrics().GetAllocatedMB(); NUnit.Framework.Assert.AreEqual(amMemory + containerMemory, allocatedMB); // 5. Re-register NM by sending completed container status IList <NMContainerStatus> nMContainerStatusForApp = CreateNMContainerStatusForApp( am0); nm1.RegisterNode(nMContainerStatusForApp, Arrays.AsList(app0.GetApplicationId())); WaitForClusterMemory(nm1, rs, amMemory); // 6. Verify for Memory Used, it should be 1024 NUnit.Framework.Assert.AreEqual(amMemory, rs.GetRootQueueMetrics().GetAllocatedMB ()); // 7. Send AM heatbeat to RM. Allocated response should contain completed // container AllocateRequest req = AllocateRequest.NewInstance(0, 0F, new AList <ResourceRequest >(), new AList <ContainerId>(), null); AllocateResponse allocate = am0.Allocate(req); IList <ContainerStatus> completedContainersStatuses = allocate.GetCompletedContainersStatuses (); NUnit.Framework.Assert.AreEqual(noOfContainers, completedContainersStatuses.Count ); // Application clean up should happen Cluster memory used is 0 nm1.NodeHeartbeat(am0.GetApplicationAttemptId(), 1, ContainerState.Complete); WaitForClusterMemory(nm1, rs, 0); rm1.Stop(); }
public virtual void TestReconnectNode() { DrainDispatcher dispatcher = new DrainDispatcher(); rm = new _MockRM_567(this, dispatcher); rm.Start(); MockNM nm1 = rm.RegisterNode("host1:1234", 5120); MockNM nm2 = rm.RegisterNode("host2:5678", 5120); nm1.NodeHeartbeat(true); nm2.NodeHeartbeat(false); dispatcher.Await(); CheckUnealthyNMCount(rm, nm2, true, 1); int expectedNMs = ClusterMetrics.GetMetrics().GetNumActiveNMs(); QueueMetrics metrics = rm.GetResourceScheduler().GetRootQueueMetrics(); // TODO Metrics incorrect in case of the FifoScheduler NUnit.Framework.Assert.AreEqual(5120, metrics.GetAvailableMB()); // reconnect of healthy node nm1 = rm.RegisterNode("host1:1234", 5120); NodeHeartbeatResponse response = nm1.NodeHeartbeat(true); NUnit.Framework.Assert.IsTrue(NodeAction.Normal.Equals(response.GetNodeAction())); dispatcher.Await(); NUnit.Framework.Assert.AreEqual(expectedNMs, ClusterMetrics.GetMetrics().GetNumActiveNMs ()); CheckUnealthyNMCount(rm, nm2, true, 1); // reconnect of unhealthy node nm2 = rm.RegisterNode("host2:5678", 5120); response = nm2.NodeHeartbeat(false); NUnit.Framework.Assert.IsTrue(NodeAction.Normal.Equals(response.GetNodeAction())); dispatcher.Await(); NUnit.Framework.Assert.AreEqual(expectedNMs, ClusterMetrics.GetMetrics().GetNumActiveNMs ()); CheckUnealthyNMCount(rm, nm2, true, 1); // unhealthy node changed back to healthy nm2 = rm.RegisterNode("host2:5678", 5120); dispatcher.Await(); response = nm2.NodeHeartbeat(true); response = nm2.NodeHeartbeat(true); dispatcher.Await(); NUnit.Framework.Assert.AreEqual(5120 + 5120, metrics.GetAvailableMB()); // reconnect of node with changed capability nm1 = rm.RegisterNode("host2:5678", 10240); dispatcher.Await(); response = nm1.NodeHeartbeat(true); dispatcher.Await(); NUnit.Framework.Assert.IsTrue(NodeAction.Normal.Equals(response.GetNodeAction())); NUnit.Framework.Assert.AreEqual(5120 + 10240, metrics.GetAvailableMB()); // reconnect of node with changed capability and running applications IList <ApplicationId> runningApps = new AList <ApplicationId>(); runningApps.AddItem(ApplicationId.NewInstance(1, 0)); nm1 = rm.RegisterNode("host2:5678", 15360, 2, runningApps); dispatcher.Await(); response = nm1.NodeHeartbeat(true); dispatcher.Await(); NUnit.Framework.Assert.IsTrue(NodeAction.Normal.Equals(response.GetNodeAction())); NUnit.Framework.Assert.AreEqual(5120 + 15360, metrics.GetAvailableMB()); // reconnect healthy node changing http port nm1 = new MockNM("host1:1234", 5120, rm.GetResourceTrackerService()); nm1.SetHttpPort(3); nm1.RegisterNode(); dispatcher.Await(); response = nm1.NodeHeartbeat(true); response = nm1.NodeHeartbeat(true); dispatcher.Await(); RMNode rmNode = rm.GetRMContext().GetRMNodes()[nm1.GetNodeId()]; NUnit.Framework.Assert.AreEqual(3, rmNode.GetHttpPort()); NUnit.Framework.Assert.AreEqual(5120, rmNode.GetTotalCapability().GetMemory()); NUnit.Framework.Assert.AreEqual(5120 + 15360, metrics.GetAvailableMB()); }