/// <summary>
/// Builds the web-service view of an application attempt.
/// </summary>
/// <remarks>
/// All string fields start out empty and <c>startTime</c> starts at 0. They are
/// only overwritten when <paramref name="attempt"/> is non-null, and the
/// container-related fields only when the attempt has a master container.
/// </remarks>
/// <param name="rm">Resource manager whose scheduler is queried for blacklisted nodes.</param>
/// <param name="attempt">Attempt to describe; may be null, in which case only defaults are set.</param>
/// <param name="user">User name passed through to the running-log URL builder.</param>
/// <param name="schemePrefix">Prefix prepended to the node HTTP address when building the log link.</param>
public AppAttemptInfo(ResourceManager rm, RMAppAttempt attempt, string user, string schemePrefix)
{
    // Defaults used when no attempt / master container is available.
    this.startTime = 0;
    this.containerId = string.Empty;
    this.nodeHttpAddress = string.Empty;
    this.nodeId = string.Empty;
    this.logsLink = string.Empty;
    this.blacklistedNodes = string.Empty;

    if (attempt == null)
    {
        return;
    }

    this.id = attempt.GetAppAttemptId().GetAttemptId();
    this.startTime = attempt.GetStartTime();

    Container amContainer = attempt.GetMasterContainer();
    if (amContainer == null)
    {
        return;
    }

    this.containerId = amContainer.GetId().ToString();
    this.nodeHttpAddress = amContainer.GetNodeHttpAddress();
    this.nodeId = amContainer.GetNodeId().ToString();

    // Arguments are computed in the same left-to-right order as the call site needs them.
    string nodeUrl = schemePrefix + amContainer.GetNodeHttpAddress();
    string containerIdText = ConverterUtils.ToString(amContainer.GetId());
    this.logsLink = WebAppUtils.GetRunningLogURL(nodeUrl, containerIdText, user);

    // Blacklist information is only looked up for AbstractYarnScheduler-based schedulers.
    if (!(rm.GetResourceScheduler() is AbstractYarnScheduler))
    {
        return;
    }

    AbstractYarnScheduler yarnScheduler = (AbstractYarnScheduler)rm.GetResourceScheduler();
    SchedulerApplicationAttempt schedulerAttempt = yarnScheduler.GetApplicationAttempt(attempt.GetAppAttemptId());
    if (schedulerAttempt != null)
    {
        blacklistedNodes = StringUtils.Join(schedulerAttempt.GetBlacklistedNodes(), ", ");
    }
}
/// <summary>
/// Carries state over from a previous attempt: runs the base-class transfer and
/// then adopts the previous attempt's headroom provider.
/// </summary>
/// <param name="appAttempt">
/// Previous attempt; it is cast to <c>FiCaSchedulerApp</c>, so any other runtime
/// type fails with an invalid-cast error after the base transfer has already run.
/// </param>
public override void TransferStateFromPreviousAttempt(SchedulerApplicationAttempt appAttempt)
{
    lock (this)
    {
        base.TransferStateFromPreviousAttempt(appAttempt);

        // The cast is deliberately performed after the base call, as before.
        var previousApp = (Org.Apache.Hadoop.Yarn.Server.Resourcemanager.Scheduler.Common.Fica.FiCaSchedulerApp)appAttempt;
        this.headroomProvider = previousApp.GetHeadroomProvider();
    }
}
/// <summary>
/// Re-accounts a recovered container on the enclosing instance: increases used
/// resources, refreshes the attempt's headroom and updates the
/// available-resources metrics.
/// </summary>
/// <param name="clusterResource">Not read by this implementation.</param>
/// <param name="schedulerAttempt">Attempt whose headroom is refreshed.</param>
/// <param name="rmContainer">Container being recovered; completed containers are ignored.</param>
public void RecoverContainer(Org.Apache.Hadoop.Yarn.Api.Records.Resource clusterResource, SchedulerApplicationAttempt schedulerAttempt, RMContainer rmContainer)
{
    bool alreadyCompleted = rmContainer.GetState().Equals(RMContainerState.Completed);
    if (!alreadyCompleted)
    {
        this._enclosing.IncreaseUsedResources(rmContainer);
        this._enclosing.UpdateAppHeadRoom(schedulerAttempt);
        this._enclosing.UpdateAvailableResourcesMetrics();
    }
}
/// <summary>
/// Looks up the blacklisted nodes recorded by the scheduler for an application attempt.
/// </summary>
/// <param name="rm">Resource manager whose scheduler is consulted.</param>
/// <param name="appid">Identifier of the application attempt to look up.</param>
/// <returns>
/// The attempt's blacklisted nodes, or <c>null</c> when the scheduler is not an
/// <c>AbstractYarnScheduler</c> or it has no attempt for <paramref name="appid"/>.
/// </returns>
public static ICollection<string> GetBlacklistedNodes(ResourceManager rm, ApplicationAttemptId appid)
{
    if (!(rm.GetResourceScheduler() is AbstractYarnScheduler))
    {
        return null;
    }

    AbstractYarnScheduler yarnScheduler = (AbstractYarnScheduler)rm.GetResourceScheduler();
    SchedulerApplicationAttempt schedulerAttempt = yarnScheduler.GetApplicationAttempt(appid);
    return schedulerAttempt == null ? null : schedulerAttempt.GetBlacklistedNodes();
}
/// <summary>
/// Clears this node's reservation on behalf of <paramref name="application"/>.
/// </summary>
/// <remarks>
/// The currently reserved container is dereferenced without a null check, so the
/// node is expected to hold a reservation when this is called.
/// </remarks>
/// <param name="application">Attempt requesting the unreserve; must own the current reservation.</param>
/// <exception cref="System.InvalidOperationException">
/// The reservation belongs to a different application attempt.
/// </exception>
public override void UnreserveResource(SchedulerApplicationAttempt application)
{
    lock (this)
    {
        // Cannot unreserve for wrong application...
        ApplicationAttemptId reservedApplication = GetReservedContainer().GetContainer().GetId().GetApplicationAttemptId();
        bool ownsReservation = reservedApplication.Equals(application.GetApplicationAttemptId());
        if (!ownsReservation)
        {
            throw new InvalidOperationException(
                "Trying to unreserve " + " for application " + application.GetApplicationId()
                + " when currently reserved " + " for application " + reservedApplication.GetApplicationId()
                + " on node " + this);
        }

        SetReservedContainer(null);
        this.reservedAppSchedulable = null;
    }
}
/// <summary>
/// Records <paramref name="container"/> as this node's reserved container.
/// </summary>
/// <remarks>
/// When a reservation already exists, the new container must target this node and
/// belong to the same application attempt as the existing reservation; the
/// reservation is then replaced. A debug line is logged in both cases.
/// </remarks>
/// <param name="application">Attempt making the reservation (used for messages only).</param>
/// <param name="priority">Not read by this implementation.</param>
/// <param name="container">Container to reserve.</param>
/// <exception cref="System.InvalidOperationException">
/// An existing reservation is present and the new container is for another node
/// or another application attempt.
/// </exception>
public override void ReserveResource(SchedulerApplicationAttempt application, Priority priority, RMContainer container)
{
    lock (this)
    {
        // Check if it's already reserved
        RMContainer existing = GetReservedContainer();
        bool replacing = existing != null;

        if (replacing)
        {
            // Sanity check
            bool targetsThisNode = container.GetContainer().GetNodeId().Equals(GetNodeID());
            if (!targetsThisNode)
            {
                throw new InvalidOperationException(
                    "Trying to reserve" + " container " + container
                    + " on node " + container.GetReservedNode()
                    + " when currently" + " reserved resource " + existing
                    + " on node " + existing.GetReservedNode());
            }

            // Cannot reserve more than one application attempt on a given node!
            // Reservation is still against attempt.
            bool sameAttempt = existing.GetContainer().GetId().GetApplicationAttemptId()
                .Equals(container.GetContainer().GetId().GetApplicationAttemptId());
            if (!sameAttempt)
            {
                throw new InvalidOperationException(
                    "Trying to reserve" + " container " + container
                    + " for application " + application.GetApplicationAttemptId()
                    + " when currently" + " reserved container " + existing
                    + " on node " + this);
            }
        }

        if (Log.IsDebugEnabled())
        {
            // Only the leading words differ between a fresh reservation and an update.
            string action = replacing ? "Updated reserved container " : "Reserved container ";
            Log.Debug(action + container.GetContainer().GetId() + " on node " + this
                + " for application attempt " + application.GetApplicationAttemptId());
        }

        SetReservedContainer(container);
    }
}
/// <summary>
/// Re-accounts a recovered container: allocates its resource against the labels
/// of the node it runs on, then forwards the recovery to the parent, if any.
/// </summary>
/// <param name="clusterResource">Cluster resource passed through to the allocation and to the parent.</param>
/// <param name="attempt">Attempt owning the container; only forwarded to the parent.</param>
/// <param name="rmContainer">Container being recovered; completed containers are ignored.</param>
public override void RecoverContainer(Org.Apache.Hadoop.Yarn.Api.Records.Resource clusterResource, SchedulerApplicationAttempt attempt, RMContainer rmContainer)
{
    if (rmContainer.GetState().Equals(RMContainerState.Completed))
    {
        return;
    }

    // Careful! Locking order is important!
    // The lock on this instance is released before the parent is called.
    lock (this)
    {
        var nodeId = rmContainer.GetContainer().GetNodeId();
        FiCaSchedulerNode node = scheduler.GetNode(nodeId);
        var recoveredResource = rmContainer.GetContainer().GetResource();
        base.AllocateResource(clusterResource, recoveredResource, node.GetLabels());
    }

    if (parent == null)
    {
        return;
    }

    parent.RecoverContainer(clusterResource, attempt, rmContainer);
}
/// <summary>
/// Clears this node's reservation on behalf of <paramref name="application"/>.
/// </summary>
/// <remarks>
/// The ownership check is skipped when any link of the chain reserved container,
/// its container, its id or its attempt id is null, because this method can also
/// be called for preemption. The reservation is cleared in every non-throwing case.
/// The chain is walked once with locals instead of re-invoking
/// <c>GetReservedContainer()</c> for every link.
/// </remarks>
/// <param name="application">Attempt requesting the unreserve.</param>
/// <exception cref="System.InvalidOperationException">
/// A reservation with a known attempt id exists and belongs to a different attempt.
/// </exception>
public override void UnreserveResource(SchedulerApplicationAttempt application)
{
    lock (this)
    {
        // adding NP checks as this can now be called for preemption
        var reserved = GetReservedContainer();
        var reservedContainer = reserved != null ? reserved.GetContainer() : null;
        var reservedContainerId = reservedContainer != null ? reservedContainer.GetId() : null;
        ApplicationAttemptId reservedApplication =
            reservedContainerId != null ? reservedContainerId.GetApplicationAttemptId() : null;

        // Cannot unreserve for wrong application...
        if (reservedApplication != null
            && !reservedApplication.Equals(application.GetApplicationAttemptId()))
        {
            throw new InvalidOperationException(
                "Trying to unreserve " + " for application " + application.GetApplicationAttemptId()
                + " when currently reserved " + " for application " + reservedApplication.GetApplicationId()
                + " on node " + this);
        }

        SetReservedContainer(null);
    }
}
/// <summary>
/// Records <paramref name="container"/> as this node's reserved container and
/// remembers the reserving application as an <c>FSAppAttempt</c>.
/// </summary>
/// <remarks>
/// When a reservation already exists, the new container must target this node and
/// belong to the same application attempt as the existing reservation. An info
/// line is logged in both the fresh and the update case.
/// </remarks>
/// <param name="application">Attempt making the reservation; cast to <c>FSAppAttempt</c> after the container is stored.</param>
/// <param name="priority">Not read by this implementation.</param>
/// <param name="container">Container to reserve.</param>
/// <exception cref="System.InvalidOperationException">
/// An existing reservation is present and the new container is for another node
/// or another application attempt.
/// </exception>
public override void ReserveResource(SchedulerApplicationAttempt application, Priority priority, RMContainer container)
{
    lock (this)
    {
        // Check if it's already reserved
        RMContainer existing = GetReservedContainer();
        bool replacing = existing != null;

        if (replacing)
        {
            // Sanity check
            bool targetsThisNode = container.GetContainer().GetNodeId().Equals(GetNodeID());
            if (!targetsThisNode)
            {
                throw new InvalidOperationException(
                    "Trying to reserve" + " container " + container
                    + " on node " + container.GetReservedNode()
                    + " when currently" + " reserved resource " + existing
                    + " on node " + existing.GetReservedNode());
            }

            // Cannot reserve more than one application on a given node!
            bool sameAttempt = existing.GetContainer().GetId().GetApplicationAttemptId()
                .Equals(container.GetContainer().GetId().GetApplicationAttemptId());
            if (!sameAttempt)
            {
                throw new InvalidOperationException(
                    "Trying to reserve" + " container " + container
                    + " for application " + application.GetApplicationId()
                    + " when currently" + " reserved container " + existing
                    + " on node " + this);
            }
        }

        // Only the leading words differ between a fresh reservation and an update.
        string action = replacing ? "Updated reserved container " : "Reserved container ";
        Log.Info(action + container.GetContainer().GetId() + " on node " + this
            + " for application " + application);

        SetReservedContainer(container);
        this.reservedAppSchedulable = (FSAppAttempt)application;
    }
}
/// <summary>
/// Fails the first AM of an application whose containers are in the Running,
/// Acquired, Allocated and Reserved states, then checks what the second attempt
/// sees: the running containers are reported on registration and the others
/// show up in its just-finished list.
/// </summary>
/// <exception cref="System.Exception"/>
public virtual void TestAMRestartWithExistingContainers()
{
    YarnConfiguration yarnConf = new YarnConfiguration();
    yarnConf.SetInt(YarnConfiguration.RmAmMaxAttempts, 2);
    MockRM rm1 = new MockRM(yarnConf);
    rm1.Start();

    RMApp app1 = rm1.SubmitApp(200, "name", "user", new Dictionary<ApplicationAccessType, string>(),
        false, "default", -1, null, "MAPREDUCE", false, true);
    MockNM nm1 = new MockNM("127.0.0.1:1234", 10240, rm1.GetResourceTrackerService());
    nm1.RegisterNode();
    MockNM nm2 = new MockNM("127.0.0.1:2351", 4089, rm1.GetResourceTrackerService());
    nm2.RegisterNode();
    MockAM am1 = MockRM.LaunchAndRegisterAM(app1, rm1, nm1);

    int requestedCount = 3;

    // Ask for requestedCount containers.
    am1.Allocate("127.0.0.1", 1024, requestedCount, new AList<ContainerId>());
    nm1.NodeHeartbeat(true);

    // Keep heartbeating and polling until all of them have been handed out.
    IList<Container> allocated = am1.Allocate(new AList<ResourceRequest>(), new AList<ContainerId>())
        .GetAllocatedContainers();
    while (allocated.Count != requestedCount)
    {
        nm1.NodeHeartbeat(true);
        Sharpen.Collections.AddAll(allocated,
            am1.Allocate(new AList<ResourceRequest>(), new AList<ContainerId>()).GetAllocatedContainers());
        Sharpen.Thread.Sleep(200);
    }

    // Launch the 2nd container, for testing that a running container is transferred.
    nm1.NodeHeartbeat(am1.GetApplicationAttemptId(), 2, ContainerState.Running);
    ContainerId containerId2 = ContainerId.NewContainerId(am1.GetApplicationAttemptId(), 2);
    rm1.WaitForState(nm1, containerId2, RMContainerState.Running);

    // Launch the 3rd container, for testing that a container allocated by the
    // previous attempt can be completed by the next attempt.
    nm1.NodeHeartbeat(am1.GetApplicationAttemptId(), 3, ContainerState.Running);
    ContainerId containerId3 = ContainerId.NewContainerId(am1.GetApplicationAttemptId(), 3);
    rm1.WaitForState(nm1, containerId3, RMContainerState.Running);

    // The 4th container stays in the Acquired state, for testing that an
    // acquired container is always killed.
    ContainerId containerId4 = ContainerId.NewContainerId(am1.GetApplicationAttemptId(), 4);
    rm1.WaitForState(nm1, containerId4, RMContainerState.Acquired);

    // The 5th container is in the Allocated state, for testing that an
    // allocated container is always killed.
    am1.Allocate("127.0.0.1", 1024, 1, new AList<ContainerId>());
    nm1.NodeHeartbeat(true);
    ContainerId containerId5 = ContainerId.NewContainerId(am1.GetApplicationAttemptId(), 5);
    rm1.WaitForContainerAllocated(nm1, containerId5);
    rm1.WaitForState(nm1, containerId5, RMContainerState.Allocated);

    // The 6th container is in the Reserved state.
    am1.Allocate("127.0.0.1", 6000, 1, new AList<ContainerId>());
    ContainerId containerId6 = ContainerId.NewContainerId(am1.GetApplicationAttemptId(), 6);
    nm1.NodeHeartbeat(true);
    SchedulerApplicationAttempt firstSchedulerAttempt =
        ((AbstractYarnScheduler)rm1.GetResourceScheduler()).GetCurrentAttemptForContainer(containerId6);
    while (firstSchedulerAttempt.GetReservedContainers().IsEmpty())
    {
        System.Console.Out.WriteLine("Waiting for container " + containerId6 + " to be reserved.");
        nm1.NodeHeartbeat(true);
        Sharpen.Thread.Sleep(200);
    }

    // containerId6 must be the reserved one.
    NUnit.Framework.Assert.AreEqual(containerId6,
        firstSchedulerAttempt.GetReservedContainers()[0].GetContainerId());

    // Fail the AM by sending a container-complete heartbeat for container 1
    // without unregistering.
    nm1.NodeHeartbeat(am1.GetApplicationAttemptId(), 1, ContainerState.Complete);
    am1.WaitForState(RMAppAttemptState.Failed);

    // Wait for some time. The previous AM's running containers should still
    // remain in the scheduler even though the AM failed.
    Sharpen.Thread.Sleep(3000);
    rm1.WaitForState(nm1, containerId2, RMContainerState.Running);

    // Acquired/allocated containers are cleaned up.
    NUnit.Framework.Assert.IsNull(rm1.GetResourceScheduler().GetRMContainer(containerId4));
    NUnit.Framework.Assert.IsNull(rm1.GetResourceScheduler().GetRMContainer(containerId5));

    // Wait for the app to start a new attempt.
    rm1.WaitForState(app1.GetApplicationId(), RMAppState.Accepted);

    // The current attempt must differ from the failed one.
    ApplicationAttemptId secondAttemptId = app1.GetCurrentAppAttempt().GetAppAttemptId();
    NUnit.Framework.Assert.IsFalse(secondAttemptId.Equals(am1.GetApplicationAttemptId()));

    // Launch the new AM.
    RMAppAttempt attempt2 = app1.GetCurrentAppAttempt();
    nm1.NodeHeartbeat(true);
    MockAM am2 = rm1.SendAMLaunched(attempt2.GetAppAttemptId());
    RegisterApplicationMasterResponse registration = am2.RegisterAppAttempt();

    // Exactly two containers are reported from the previous attempt:
    // container2 and container3.
    NUnit.Framework.Assert.AreEqual(2, registration.GetContainersFromPreviousAttempts().Count);
    bool reportedContainer2 = false;
    bool reportedContainer3 = false;
    foreach (Container previous in registration.GetContainersFromPreviousAttempts())
    {
        reportedContainer2 |= previous.GetId().Equals(containerId2);
        reportedContainer3 |= previous.GetId().Equals(containerId3);
    }
    NUnit.Framework.Assert.IsTrue(reportedContainer2 && reportedContainer3);
    rm1.WaitForState(app1.GetApplicationId(), RMAppState.Running);

    // Complete container 3 with a heartbeat that carries the earlier
    // attempt's attempt id.
    nm1.NodeHeartbeat(am1.GetApplicationAttemptId(), 3, ContainerState.Complete);

    // Even though the completion of containerId3 was addressed to the earlier
    // failed attempt, the new RMAppAttempt also captures it. The completed
    // containerId4 is transferred to the new attempt as well.
    RMAppAttempt newAttempt = app1.GetRMAppAttempt(am2.GetApplicationAttemptId());

    // 4 containers finished: acquired/allocated/reserved/completed.
    WaitForContainersToFinish(4, newAttempt);
    bool finishedContainer3 = false;
    bool finishedContainer4 = false;
    bool finishedContainer5 = false;
    bool finishedContainer6 = false;
    foreach (ContainerStatus finished in newAttempt.GetJustFinishedContainers())
    {
        // containerId3 was run by the previous attempt but finished under the new one.
        finishedContainer3 |= finished.GetContainerId().Equals(containerId3);
        // containerId4 is the Acquired container killed with the previous attempt;
        // it is now in the new attempt's finished-container list.
        finishedContainer4 |= finished.GetContainerId().Equals(containerId4);
        // containerId5 is the Allocated container killed with the previous attempt.
        finishedContainer5 |= finished.GetContainerId().Equals(containerId5);
        // containerId6 is the Reserved container killed with the previous attempt.
        finishedContainer6 |= finished.GetContainerId().Equals(containerId6);
    }
    NUnit.Framework.Assert.IsTrue(finishedContainer3 && finishedContainer4 && finishedContainer5
        && finishedContainer6);

    // The new SchedulerApplicationAttempt also has the containers' info.
    rm1.WaitForState(nm1, containerId2, RMContainerState.Running);

    // Record the scheduler attempt for the check after the app finishes.
    SchedulerApplicationAttempt secondSchedulerAttempt =
        ((AbstractYarnScheduler)rm1.GetResourceScheduler()).GetCurrentAttemptForContainer(containerId2);

    // Finish this application.
    MockRM.FinishAMAndVerifyAppState(app1, rm1, nm1, am2);

    // The 2nd attempt releases the 1st attempt's running container when the
    // 2nd attempt finishes.
    // NOTE(review): Contains is given a ContainerId; if GetLiveContainers() holds
    // RMContainer objects this assertion can never fail - confirm the element type.
    NUnit.Framework.Assert.IsFalse(secondSchedulerAttempt.GetLiveContainers().Contains(containerId2));

    // All 5 remaining containers (2 through 6) are eventually reported as finished.
    System.Console.Out.WriteLine("New attempt's just finished containers: "
        + newAttempt.GetJustFinishedContainers());
    WaitForContainersToFinish(5, newAttempt);
    rm1.Stop();
}
/// <summary>
/// Empty override: this implementation performs no work when a container is recovered.
/// </summary>
/// <param name="clusterResource">Unused.</param>
/// <param name="schedulerAttempt">Unused.</param>
/// <param name="rmContainer">Unused.</param>
public override void RecoverContainer(Org.Apache.Hadoop.Yarn.Api.Records.Resource clusterResource, SchedulerApplicationAttempt schedulerAttempt, RMContainer rmContainer ) { }
/// <summary>
/// Sets the attempt's headroom to <c>clusterResource</c> minus <c>usedResource</c>.
/// </summary>
/// <param name="schedulerAttempt">Attempt whose headroom is overwritten.</param>
private void UpdateAppHeadRoom(SchedulerApplicationAttempt schedulerAttempt)
{
    var remaining = Resources.Subtract(clusterResource, usedResource);
    schedulerAttempt.SetHeadroom(remaining);
}
/// <summary>
/// Abstract hook invoked with a resource, a scheduler application attempt and a
/// container; concrete subclasses supply the behaviour.
/// </summary>
/// <param name="arg1">Resource value; presumably the cluster resource, as the visible overrides name it <c>clusterResource</c> - confirm.</param>
/// <param name="arg2">Scheduler application attempt associated with the container.</param>
/// <param name="arg3">Container to recover.</param>
public abstract void RecoverContainer(Org.Apache.Hadoop.Yarn.Api.Records.Resource arg1, SchedulerApplicationAttempt arg2, RMContainer arg3);