/// <summary>
/// Verifies that the container resource increase requests attached to an
/// <c>AllocateRequest</c> are still present, in the same order, after the
/// request is converted to its protobuf message and rebuilt from it.
/// </summary>
public virtual void TestAllcoateRequestWithIncrease()
{
    IList<ContainerResourceIncreaseRequest> incRequests = new AList<ContainerResourceIncreaseRequest>();
    for (int i = 0; i < 3; i++)
    {
        // The loop index is stored in the Resource so that each entry is
        // distinguishable when compared after the round trip.
        incRequests.AddItem(ContainerResourceIncreaseRequest.NewInstance(null, Resource.NewInstance(0, i)));
    }
    AllocateRequest r = AllocateRequest.NewInstance(123, 0f, null, null, null, incRequests);

    // serde
    YarnServiceProtos.AllocateRequestProto p = ((AllocateRequestPBImpl)r).GetProto();
    r = new AllocateRequestPBImpl(p);

    // check value
    NUnit.Framework.Assert.AreEqual(123, r.GetResponseId());
    NUnit.Framework.Assert.AreEqual(incRequests.Count, r.GetIncreaseRequests().Count);
    for (int i_1 = 0; i_1 < incRequests.Count; i_1++)
    {
        // Expected value (what was put in) first, actual value (what came
        // back out of the protobuf round trip) second.
        NUnit.Framework.Assert.AreEqual(
            incRequests[i_1].GetCapability().GetVirtualCores(),
            r.GetIncreaseRequests()[i_1].GetCapability().GetVirtualCores());
    }
}
/// <summary>
/// Convenience overload: wraps the given resource requests and container
/// releases in an <c>AllocateRequest</c> (response id 0, progress 0, last
/// argument null) and passes it to the <c>Allocate</c> overload that takes
/// the request object.
/// </summary>
/// <param name="resourceRequest">Resource requests to place in the allocate request.</param>
/// <param name="releases">Container ids to place in the allocate request as releases.</param>
/// <returns>The response returned by the delegated <c>Allocate</c> call.</returns>
/// <exception cref="System.Exception"/>
public virtual AllocateResponse Allocate(IList<ResourceRequest> resourceRequest, IList<ContainerId> releases)
{
    AllocateRequest request = AllocateRequest.NewInstance(
        0,
        0F,
        resourceRequest,
        releases,
        null);
    return Allocate(request);
}
/// <summary>
/// Sends an empty allocate request to the scheduler as a heartbeat, then
/// records the returned response id and applies a new AMRM token when the
/// response carries one.
/// </summary>
/// <remarks>
/// Failure handling, all performed while holding the lock on <c>this</c>:
/// <list type="bullet">
/// <item>Attempt unknown to the RM: a <c>JobAmReboot</c> event is raised and a
/// <c>YarnRuntimeException</c> is thrown.</item>
/// <item>AM not registered: the response id is reset to 0 and <c>Register()</c>
/// is called; no exception propagates from this branch itself.</item>
/// <item>Any other exception: rethrown unchanged while less than
/// <c>retryInterval</c> milliseconds have passed since <c>retrystartTime</c>;
/// afterwards an <c>InternalError</c> event is raised and a
/// <c>YarnRuntimeException</c> is thrown instead.</item>
/// </list>
/// </remarks>
/// <exception cref="System.Exception"/>
protected internal override void Heartbeat()
{
    lock (this)
    {
        AllocateRequest heartbeatRequest = AllocateRequest.NewInstance(
            this.lastResponseID,
            base.GetApplicationProgress(),
            new AList<ResourceRequest>(),
            new AList<ContainerId>(),
            null);
        AllocateResponse heartbeatResponse = null;
        try
        {
            heartbeatResponse = scheduler.Allocate(heartbeatRequest);
            // A successful call restarts the retry window.
            retrystartTime = Runtime.CurrentTimeMillis();
        }
        catch (ApplicationAttemptNotFoundException e)
        {
            Log.Info("Event from RM: shutting down Application Master");
            // This can happen if the RM has been restarted. In that state the
            // application has to clean itself up.
            eventHandler.Handle(new JobEvent(this.GetJob().GetID(), JobEventType.JobAmReboot));
            throw new YarnRuntimeException(
                "Resource Manager doesn't recognize AttemptId: " + this.GetContext().GetApplicationID(), e);
        }
        catch (ApplicationMasterNotRegisteredException)
        {
            Log.Info("ApplicationMaster is out of sync with ResourceManager," + " hence resync and send outstanding requests.");
            this.lastResponseID = 0;
            Register();
        }
        catch (Exception)
        {
            // This can happen when the connection to the RM has gone down.
            if (Runtime.CurrentTimeMillis() - retrystartTime < retryInterval)
            {
                // Still inside the retry window: hand the original exception to
                // the caller, which may ignore it and try again.
                throw;
            }

            // The retry window has expired: give up.
            string giveUpMessage = "Could not contact RM after " + retryInterval + " milliseconds.";
            Log.Error(giveUpMessage);
            eventHandler.Handle(new JobEvent(this.GetJob().GetID(), JobEventType.InternalError));
            throw new YarnRuntimeException(giveUpMessage);
        }

        if (heartbeatResponse == null)
        {
            // Reached after the re-register branch, where no response was obtained.
            return;
        }

        this.lastResponseID = heartbeatResponse.GetResponseId();
        Token refreshedToken = heartbeatResponse.GetAMRMToken();
        if (refreshedToken != null)
        {
            UpdateAMRMToken(refreshedToken);
        }
    }
}
/// <summary>
/// Sends an allocate request with empty ask, release and blacklist lists
/// through <c>amClient</c> and checks that the response equals the fake
/// allocate response produced by the test cluster.
/// </summary>
/// <exception cref="Org.Apache.Hadoop.Yarn.Exceptions.YarnException"/>
/// <exception cref="System.IO.IOException"/>
public virtual void TestAllocateOnHA()
{
    ResourceBlacklistRequest emptyBlacklist = ResourceBlacklistRequest.NewInstance(
        new AList<string>(),
        new AList<string>());
    AllocateRequest allocateRequest = AllocateRequest.NewInstance(
        0,
        50f,
        new AList<ResourceRequest>(),
        new AList<ContainerId>(),
        emptyBlacklist);

    AllocateResponse received = amClient.Allocate(allocateRequest);

    NUnit.Framework.Assert.AreEqual(received, this.cluster.CreateFakeAllocateResponse());
}
/// <summary>
/// Verifies that an <c>AllocateRequest</c> created with a null increase-request
/// list reports an empty list (count 0) after a protobuf round trip.
/// </summary>
public virtual void TestAllcoateRequestWithoutIncrease()
{
    AllocateRequest original = AllocateRequest.NewInstance(123, 0f, null, null, null, null);

    // Round trip: record -> protobuf message -> record.
    YarnServiceProtos.AllocateRequestProto proto = ((AllocateRequestPBImpl)original).GetProto();
    AllocateRequest restored = new AllocateRequestPBImpl(proto);

    // The response id survives and the increase list is empty.
    NUnit.Framework.Assert.AreEqual(123, restored.GetResponseId());
    NUnit.Framework.Assert.AreEqual(0, restored.GetIncreaseRequests().Count);
}
/// <summary>
/// Registers an application master against a mock RM and sends an allocate
/// request whose blacklist additions contain <c>ResourceRequest.Any</c>; the
/// test passes only if the call throws
/// <c>InvalidResourceBlacklistRequestException</c>.
/// </summary>
/// <remarks>
/// The mock RM is stopped in a <c>finally</c> block so that it is shut down
/// even when setup or the allocate call fails with an unexpected exception.
/// </remarks>
public virtual void TestValidateResourceBlacklistRequest()
{
    TestAMAuthorization.MyContainerManager containerManager = new TestAMAuthorization.MyContainerManager();
    TestAMAuthorization.MockRMWithAMS rm = new TestAMAuthorization.MockRMWithAMS(
        new YarnConfiguration(), containerManager);
    rm.Start();

    bool error = false;
    try
    {
        MockNM nm1 = rm.RegisterNode("localhost:1234", 5120);
        IDictionary<ApplicationAccessType, string> acls = new Dictionary<ApplicationAccessType, string>(2);
        acls[ApplicationAccessType.ViewApp] = "*";
        RMApp app = rm.SubmitApp(1024, "appname", "appuser", acls);
        nm1.NodeHeartbeat(true);
        RMAppAttempt attempt = app.GetCurrentAppAttempt();
        ApplicationAttemptId applicationAttemptId = attempt.GetAppAttemptId();
        WaitForLaunchedState(attempt);

        // Create a client to the RM.
        Configuration conf = rm.GetConfig();
        YarnRPC rpc = YarnRPC.Create(conf);
        UserGroupInformation currentUser = UserGroupInformation.CreateRemoteUser(
            applicationAttemptId.ToString());
        Credentials credentials = containerManager.GetContainerCredentials();
        IPEndPoint rmBindAddress = rm.GetApplicationMasterService().GetBindAddress();
        Org.Apache.Hadoop.Security.Token.Token<TokenIdentifier> amRMToken =
            TestAMAuthorization.MockRMWithAMS.SetupAndReturnAMRMToken(rmBindAddress, credentials.GetAllTokens());
        currentUser.AddToken(amRMToken);
        ApplicationMasterProtocol client = currentUser.DoAs(new _PrivilegedAction_626(rpc, rmBindAddress, conf));

        RegisterApplicationMasterRequest request =
            Org.Apache.Hadoop.Yarn.Util.Records.NewRecord<RegisterApplicationMasterRequest>();
        client.RegisterApplicationMaster(request);

        // Blacklist additions contain ResourceRequest.Any; removals are null.
        ResourceBlacklistRequest blacklistRequest = ResourceBlacklistRequest.NewInstance(
            Sharpen.Collections.SingletonList(ResourceRequest.Any), null);
        AllocateRequest allocateRequest = AllocateRequest.NewInstance(0, 0.0f, null, null, blacklistRequest);

        try
        {
            client.Allocate(allocateRequest);
        }
        catch (InvalidResourceBlacklistRequestException)
        {
            error = true;
        }
    }
    finally
    {
        // Always release the mock RM, including on unexpected failures above.
        rm.Stop();
    }

    // NOTE(review): the (message, condition) argument order is kept from the
    // original; stock NUnit declares IsTrue(condition, message) - confirm which
    // Assert overload is actually in scope for this project.
    NUnit.Framework.Assert.IsTrue("Did not catch InvalidResourceBlacklistRequestException", error);
}
/// <summary>
/// Sends the currently pending asks, releases and blacklist changes to the
/// scheduler in a single allocate call, records the bookkeeping values from
/// the response, and clears the pending collections.
/// </summary>
/// <returns>The allocate response received from the scheduler.</returns>
/// <exception cref="Org.Apache.Hadoop.Yarn.Exceptions.YarnException"/>
/// <exception cref="System.IO.IOException"/>
protected internal virtual AllocateResponse MakeRemoteRequest()
{
    ApplyRequestLimits();

    // Copies of the pending collections are sent; the originals are cleared
    // further down, after the call has returned.
    ResourceBlacklistRequest blacklistDelta = ResourceBlacklistRequest.NewInstance(
        new AList<string>(blacklistAdditions),
        new AList<string>(blacklistRemovals));
    AllocateRequest outgoing = AllocateRequest.NewInstance(
        lastResponseID,
        base.GetApplicationProgress(),
        new AList<ResourceRequest>(ask),
        new AList<ContainerId>(release),
        blacklistDelta);

    AllocateResponse incoming = scheduler.Allocate(outgoing);

    // Record what the response says about ids, headroom and cluster size.
    lastResponseID = incoming.GetResponseId();
    availableResources = incoming.GetAvailableResources();
    lastClusterNmCount = clusterNmCount;
    clusterNmCount = incoming.GetNumClusterNodes();
    int finishedCount = incoming.GetCompletedContainersStatuses().Count;

    bool sentAsksOrReleases = ask.Count > 0 || release.Count > 0;
    if (sentAsksOrReleases)
    {
        Log.Info("getResources() for " + applicationId + ":" + " ask=" + ask.Count
            + " release= " + release.Count
            + " newContainers=" + incoming.GetAllocatedContainers().Count
            + " finishedContainers=" + finishedCount
            + " resourcelimit=" + availableResources
            + " knownNMs=" + clusterNmCount);
    }
    ask.Clear();
    release.Clear();

    if (finishedCount > 0)
    {
        // Re-send limited requests when a container completes, to trigger
        // asking for more containers.
        Sharpen.Collections.AddAll(requestLimitsToUpdate, requestLimits.Keys);
    }

    bool blacklistChanged = blacklistAdditions.Count > 0 || blacklistRemovals.Count > 0;
    if (blacklistChanged)
    {
        Log.Info("Update the blacklist for " + applicationId
            + ": blacklistAdditions=" + blacklistAdditions.Count
            + " blacklistRemovals=" + blacklistRemovals.Count);
    }
    blacklistAdditions.Clear();
    blacklistRemovals.Clear();

    return incoming;
}
/// <summary>
/// Exercises response-id handling of allocate calls: the id is 1 after the
/// first call and 2 after the second, a re-sent request yields the same id,
/// and a request carrying id 0 at that point fails with an exception whose
/// inner exception is <c>InvalidApplicationMasterRequestException</c>.
/// </summary>
public virtual void TestARRMResponseId()
{
    MockNM node = rm.RegisterNode("h1:1234", 5000);
    RMApp application = rm.SubmitApp(2000);

    // Trigger the scheduling so the AM gets 'launched'
    node.NodeHeartbeat(true);
    RMAppAttempt appAttempt = application.GetCurrentAppAttempt();
    MockAM appMaster = rm.SendAMLaunched(appAttempt.GetAppAttemptId());
    appMaster.RegisterAppAttempt();

    // First allocate: response id becomes 1 and no AM command is present.
    AllocateRequest currentRequest = AllocateRequest.NewInstance(0, 0F, null, null, null);
    AllocateResponse currentResponse = Allocate(appAttempt.GetAppAttemptId(), currentRequest);
    NUnit.Framework.Assert.AreEqual(1, currentResponse.GetResponseId());
    NUnit.Framework.Assert.IsTrue(currentResponse.GetAMCommand() == null);

    // Second allocate, using the id just received: response id becomes 2.
    currentRequest = AllocateRequest.NewInstance(currentResponse.GetResponseId(), 0F, null, null, null);
    currentResponse = Allocate(appAttempt.GetAppAttemptId(), currentRequest);
    NUnit.Framework.Assert.AreEqual(2, currentResponse.GetResponseId());

    /* try resending */
    currentResponse = Allocate(appAttempt.GetAppAttemptId(), currentRequest);
    NUnit.Framework.Assert.AreEqual(2, currentResponse.GetResponseId());

    // A request that goes back to id 0 must be rejected.
    currentRequest = AllocateRequest.NewInstance(0, 0F, null, null, null);
    try
    {
        Allocate(appAttempt.GetAppAttemptId(), currentRequest);
        NUnit.Framework.Assert.Fail();
    }
    catch (Exception e)
    {
        NUnit.Framework.Assert.IsTrue(e.InnerException is InvalidApplicationMasterRequestException);
    }
}
/// <summary>
/// Verifies processing of the NMContainerStatuses that are sent during NM
/// registration.
/// </summary>
/// <remarks>
/// 1. Start the cluster (RM, NM), submit an app with 1024MB, launch and register the AM.
/// 2. The AM sends a ResourceRequest for 1 container with 2048MB of memory.
/// 3. Verify the number of containers allocated by the RM.
/// 4. Verify the memory used by the cluster: AM memory + requested memory,
///    1024 + 2048 = 3072.
/// 5. Re-register the NM, sending a completed container status.
/// 6. Verify the memory used; it should be 1024.
/// 7. Send an AM heartbeat to the RM. The allocate response should contain the
///    completed container.
/// The RM is stopped in a <c>finally</c> block so that it is shut down even
/// when one of the assertions or waits fails.
/// </remarks>
/// <exception cref="System.Exception"/>
public virtual void TestProcessingNMContainerStatusesOnNMRestart()
{
    conf.SetInt(YarnConfiguration.RmAmMaxAttempts, 1);
    MemoryRMStateStore memStore = new MemoryRMStateStore();
    memStore.Init(conf);

    // 1. Start the cluster-RM,NM,Submit app with 1024MB,Launch & register AM
    MockRM rm1 = new MockRM(conf, memStore);
    rm1.Start();
    try
    {
        int nmMemory = 8192;
        int amMemory = 1024;
        int containerMemory = 2048;
        MockNM nm1 = new MockNM("127.0.0.1:1234", nmMemory, rm1.GetResourceTrackerService());
        nm1.RegisterNode();
        RMApp app0 = rm1.SubmitApp(amMemory);
        MockAM am0 = MockRM.LaunchAndRegisterAM(app0, rm1, nm1);

        // 2. AM sends ResourceRequest for 1 container with memory 2048MB.
        int noOfContainers = 1;
        IList<Container> allocateContainers = am0.AllocateAndWaitForContainers(
            noOfContainers, containerMemory, nm1);

        // 3. Verify for number of container allocated by RM
        NUnit.Framework.Assert.AreEqual(noOfContainers, allocateContainers.Count);
        Container container = allocateContainers[0];
        nm1.NodeHeartbeat(am0.GetApplicationAttemptId(), 1, ContainerState.Running);
        nm1.NodeHeartbeat(am0.GetApplicationAttemptId(), container.GetId().GetContainerId(),
            ContainerState.Running);
        rm1.WaitForState(app0.GetApplicationId(), RMAppState.Running);

        // 4. Verify Memory Usage by cluster, it should be 3072. AM memory +
        // requested memory. 1024 + 2048=3072
        ResourceScheduler rs = rm1.GetRMContext().GetScheduler();
        int allocatedMB = rs.GetRootQueueMetrics().GetAllocatedMB();
        NUnit.Framework.Assert.AreEqual(amMemory + containerMemory, allocatedMB);

        // 5. Re-register NM by sending completed container status
        IList<NMContainerStatus> nMContainerStatusForApp = CreateNMContainerStatusForApp(am0);
        nm1.RegisterNode(nMContainerStatusForApp, Arrays.AsList(app0.GetApplicationId()));
        WaitForClusterMemory(nm1, rs, amMemory);

        // 6. Verify for Memory Used, it should be 1024
        NUnit.Framework.Assert.AreEqual(amMemory, rs.GetRootQueueMetrics().GetAllocatedMB());

        // 7. Send AM heartbeat to RM. Allocated response should contain completed
        // container
        AllocateRequest req = AllocateRequest.NewInstance(
            0, 0F, new AList<ResourceRequest>(), new AList<ContainerId>(), null);
        AllocateResponse allocate = am0.Allocate(req);
        IList<ContainerStatus> completedContainersStatuses = allocate.GetCompletedContainersStatuses();
        NUnit.Framework.Assert.AreEqual(noOfContainers, completedContainersStatuses.Count);

        // Application clean up should happen Cluster memory used is 0
        nm1.NodeHeartbeat(am0.GetApplicationAttemptId(), 1, ContainerState.Complete);
        WaitForClusterMemory(nm1, rs, 0);
    }
    finally
    {
        // Always shut the RM down, including when a check above has failed.
        rm1.Stop();
    }
}
/// <summary>
/// Sends the pending asks, releases and blacklist changes to the RM together
/// with the given progress value, and updates the locally cached cluster
/// state from the response.
/// </summary>
/// <remarks>
/// The pending collections are snapshotted and cleared under the lock on
/// <c>this</c>; the RPC itself runs outside the lock. If the RM reports that
/// the AM is not registered, the pending state is rebuilt from
/// <c>pendingRelease</c>, <c>blacklistedNodes</c> and
/// <c>remoteRequestsTable</c>, the AM re-registers, and the call is retried by
/// recursion. If no response was obtained, the snapshots are merged back into
/// the pending collections in the <c>finally</c> block.
/// </remarks>
/// <param name="progressIndicator">Progress value to report; must not be negative.</param>
/// <returns>The allocate response from the RM.</returns>
/// <exception cref="Org.Apache.Hadoop.Yarn.Exceptions.YarnException"/>
/// <exception cref="System.IO.IOException"/>
public override AllocateResponse Allocate(float progressIndicator)
{
    Preconditions.CheckArgument(progressIndicator >= 0, "Progress indicator should not be negative");
    AllocateResponse rmResponse = null;
    IList<ResourceRequest> sentAsks = null;
    IList<ContainerId> sentReleases = null;
    AllocateRequest rmRequest = null;
    IList<string> sentBlacklistAdditions = new AList<string>();
    IList<string> sentBlacklistRemovals = new AList<string>();
    try
    {
        lock (this)
        {
            sentAsks = new AList<ResourceRequest>(ask.Count);
            foreach (ResourceRequest pendingAsk in ask)
            {
                // Send a copy of each ResourceRequest, because the original may be
                // modified while the RPC layer is still using it.
                sentAsks.AddItem(ResourceRequest.NewInstance(
                    pendingAsk.GetPriority(),
                    pendingAsk.GetResourceName(),
                    pendingAsk.GetCapability(),
                    pendingAsk.GetNumContainers(),
                    pendingAsk.GetRelaxLocality(),
                    pendingAsk.GetNodeLabelExpression()));
            }
            sentReleases = new AList<ContainerId>(release);

            // Optimistically clear the pending collections, assuming the RPC succeeds.
            ask.Clear();
            release.Clear();

            Sharpen.Collections.AddAll(sentBlacklistAdditions, blacklistAdditions);
            Sharpen.Collections.AddAll(sentBlacklistRemovals, blacklistRemovals);
            ResourceBlacklistRequest blacklistDelta = ResourceBlacklistRequest.NewInstance(
                sentBlacklistAdditions, sentBlacklistRemovals);
            rmRequest = AllocateRequest.NewInstance(
                lastResponseId, progressIndicator, sentAsks, sentReleases, blacklistDelta);

            // Clear the pending blacklist changes before leaving the locked section.
            blacklistAdditions.Clear();
            blacklistRemovals.Clear();
        }

        try
        {
            rmResponse = rmClient.Allocate(rmRequest);
        }
        catch (ApplicationMasterNotRegisteredException)
        {
            Log.Warn("ApplicationMaster is out of sync with ResourceManager," + " hence resyncing.");
            lock (this)
            {
                // Queue everything that is tracked locally so that it is sent again
                // after re-registration.
                Sharpen.Collections.AddAll(release, this.pendingRelease);
                Sharpen.Collections.AddAll(blacklistAdditions, this.blacklistedNodes);
                foreach (IDictionary<string, SortedDictionary<Resource, AMRMClientImpl.ResourceRequestInfo>> byLocation
                    in remoteRequestsTable.Values)
                {
                    foreach (IDictionary<Resource, AMRMClientImpl.ResourceRequestInfo> byCapability
                        in byLocation.Values)
                    {
                        foreach (AMRMClientImpl.ResourceRequestInfo tracked in byCapability.Values)
                        {
                            AddResourceRequestToAsk(tracked.remoteRequest);
                        }
                    }
                }
            }

            // Re-register with the RM, then retry the allocate call.
            RegisterApplicationMaster();
            rmResponse = Allocate(progressIndicator);
            return rmResponse;
        }

        lock (this)
        {
            // These are only updated after a successful RPC.
            clusterNodeCount = rmResponse.GetNumClusterNodes();
            lastResponseId = rmResponse.GetResponseId();
            clusterAvailableResources = rmResponse.GetAvailableResources();
            if (!rmResponse.GetNMTokens().IsEmpty())
            {
                PopulateNMTokens(rmResponse.GetNMTokens());
            }
            if (rmResponse.GetAMRMToken() != null)
            {
                UpdateAMRMToken(rmResponse.GetAMRMToken());
            }
            if (!pendingRelease.IsEmpty() && !rmResponse.GetCompletedContainersStatuses().IsEmpty())
            {
                RemovePendingReleaseRequests(rmResponse.GetCompletedContainersStatuses());
            }
        }
    }
    finally
    {
        // TODO how to differentiate remote yarn exception vs error in rpc
        if (rmResponse == null)
        {
            // An exception occurred in allocate(); keep the asks and releases for
            // the next call to allocate().
            lock (this)
            {
                Sharpen.Collections.AddAll(release, sentReleases);

                // Requests may have been added or removed while the call was in
                // flight. If so, nothing needs to be done for them, because the
                // ResourceRequest object in ask already holds the new value. A
                // request that is missing from ask was left unchanged, so it can be
                // added back safely.
                // This assumes there are no concurrent calls to allocate(), so ask
                // cannot have been changed by the locked section at the start of
                // this method.
                foreach (ResourceRequest unsentAsk in sentAsks)
                {
                    if (!ask.Contains(unsentAsk))
                    {
                        ask.AddItem(unsentAsk);
                    }
                }
                Sharpen.Collections.AddAll(blacklistAdditions, sentBlacklistAdditions);
                Sharpen.Collections.AddAll(blacklistRemovals, sentBlacklistRemovals);
            }
        }
    }
    return rmResponse;
}
/// <summary>
/// Checks the updated-node reports that application masters receive in
/// allocate responses: none initially, one report when a node becomes
/// unhealthy, the same report when the request is re-sent, one report when a
/// node is lost, none for a newly registered AM's first allocate, and one
/// report for each AM when the unhealthy node returns to the running state.
/// </summary>
/// <remarks>
/// The single expected report is read with the list indexer. The converted
/// code used <c>GetEnumerator().Next()</c>, a Java iterator idiom that
/// <c>IEnumerator&lt;T&gt;</c> does not provide.
/// </remarks>
public virtual void TestAMRMUnusableNodes()
{
    MockNM nm1 = rm.RegisterNode("127.0.0.1:1234", 10000);
    MockNM nm2 = rm.RegisterNode("127.0.0.2:1234", 10000);
    MockNM nm3 = rm.RegisterNode("127.0.0.3:1234", 10000);
    MockNM nm4 = rm.RegisterNode("127.0.0.4:1234", 10000);
    dispatcher.Await();

    RMApp app1 = rm.SubmitApp(2000);
    // Trigger the scheduling so the AM gets 'launched' on nm1
    nm1.NodeHeartbeat(true);
    RMAppAttempt attempt1 = app1.GetCurrentAppAttempt();
    MockAM am1 = rm.SendAMLaunched(attempt1.GetAppAttemptId());
    // register AM returns no unusable node
    am1.RegisterAppAttempt();

    // allocate request returns no updated node
    AllocateRequest allocateRequest1 = AllocateRequest.NewInstance(0, 0F, null, null, null);
    AllocateResponse response1 = Allocate(attempt1.GetAppAttemptId(), allocateRequest1);
    IList<NodeReport> updatedNodes = response1.GetUpdatedNodes();
    NUnit.Framework.Assert.AreEqual(0, updatedNodes.Count);

    SyncNodeHeartbeat(nm4, false);

    // allocate request returns updated node
    allocateRequest1 = AllocateRequest.NewInstance(response1.GetResponseId(), 0F, null, null, null);
    response1 = Allocate(attempt1.GetAppAttemptId(), allocateRequest1);
    updatedNodes = response1.GetUpdatedNodes();
    NUnit.Framework.Assert.AreEqual(1, updatedNodes.Count);
    NodeReport nr = updatedNodes[0];
    NUnit.Framework.Assert.AreEqual(nm4.GetNodeId(), nr.GetNodeId());
    NUnit.Framework.Assert.AreEqual(NodeState.Unhealthy, nr.GetNodeState());

    // resending the allocate request returns the same result
    response1 = Allocate(attempt1.GetAppAttemptId(), allocateRequest1);
    updatedNodes = response1.GetUpdatedNodes();
    NUnit.Framework.Assert.AreEqual(1, updatedNodes.Count);
    nr = updatedNodes[0];
    NUnit.Framework.Assert.AreEqual(nm4.GetNodeId(), nr.GetNodeId());
    NUnit.Framework.Assert.AreEqual(NodeState.Unhealthy, nr.GetNodeState());

    SyncNodeLost(nm3);

    // subsequent allocate request returns delta
    allocateRequest1 = AllocateRequest.NewInstance(response1.GetResponseId(), 0F, null, null, null);
    response1 = Allocate(attempt1.GetAppAttemptId(), allocateRequest1);
    updatedNodes = response1.GetUpdatedNodes();
    NUnit.Framework.Assert.AreEqual(1, updatedNodes.Count);
    nr = updatedNodes[0];
    NUnit.Framework.Assert.AreEqual(nm3.GetNodeId(), nr.GetNodeId());
    NUnit.Framework.Assert.AreEqual(NodeState.Lost, nr.GetNodeState());

    // registering another AM gives it the complete failed list
    RMApp app2 = rm.SubmitApp(2000);
    // Trigger nm2 heartbeat so that AM gets launched on it
    nm2.NodeHeartbeat(true);
    RMAppAttempt attempt2 = app2.GetCurrentAppAttempt();
    MockAM am2 = rm.SendAMLaunched(attempt2.GetAppAttemptId());
    // register AM returns all unusable nodes
    am2.RegisterAppAttempt();

    // allocate request returns no updated node
    AllocateRequest allocateRequest2 = AllocateRequest.NewInstance(0, 0F, null, null, null);
    AllocateResponse response2 = Allocate(attempt2.GetAppAttemptId(), allocateRequest2);
    updatedNodes = response2.GetUpdatedNodes();
    NUnit.Framework.Assert.AreEqual(0, updatedNodes.Count);

    SyncNodeHeartbeat(nm4, true);

    // both AM's should get delta updated nodes
    allocateRequest1 = AllocateRequest.NewInstance(response1.GetResponseId(), 0F, null, null, null);
    response1 = Allocate(attempt1.GetAppAttemptId(), allocateRequest1);
    updatedNodes = response1.GetUpdatedNodes();
    NUnit.Framework.Assert.AreEqual(1, updatedNodes.Count);
    nr = updatedNodes[0];
    NUnit.Framework.Assert.AreEqual(nm4.GetNodeId(), nr.GetNodeId());
    NUnit.Framework.Assert.AreEqual(NodeState.Running, nr.GetNodeState());

    allocateRequest2 = AllocateRequest.NewInstance(response2.GetResponseId(), 0F, null, null, null);
    response2 = Allocate(attempt2.GetAppAttemptId(), allocateRequest2);
    updatedNodes = response2.GetUpdatedNodes();
    NUnit.Framework.Assert.AreEqual(1, updatedNodes.Count);
    nr = updatedNodes[0];
    NUnit.Framework.Assert.AreEqual(nm4.GetNodeId(), nr.GetNodeId());
    NUnit.Framework.Assert.AreEqual(NodeState.Running, nr.GetNodeState());

    // subsequent allocate calls should return no updated nodes
    allocateRequest2 = AllocateRequest.NewInstance(response2.GetResponseId(), 0F, null, null, null);
    response2 = Allocate(attempt2.GetAppAttemptId(), allocateRequest2);
    updatedNodes = response2.GetUpdatedNodes();
    NUnit.Framework.Assert.AreEqual(0, updatedNodes.Count);
}