/// <summary>
/// Verify that the NodeManager has identified disk failures and that the
/// ResourceManager's view of the node's health matches.
/// </summary>
/// <param name="localORLogDirs">
/// <em>true</em> to check nm-local-dirs, <em>false</em> to check nm-log-dirs
/// </param>
/// <param name="expectedDirs">
/// expected nm-local-dirs/nm-log-dirs as a comma-separated string
/// </param>
/// <param name="isHealthy"><em>true</em> if the overall node should be healthy</param>
private void VerifyDisksHealth(bool localORLogDirs, string expectedDirs, bool isHealthy)
{
    // Wait for the NodeManager to identify disk failures.
    WaitForDiskHealthCheck();

    IList<string> list = localORLogDirs
        ? dirsHandler.GetLocalDirs()
        : dirsHandler.GetLogDirs();
    string seenDirs = StringUtils.Join(",", list);
    Log.Info("ExpectedDirs=" + expectedDirs);
    Log.Info("SeenDirs=" + seenDirs);

    // NUnit takes (expected, actual, message); the message goes last.
    NUnit.Framework.Assert.AreEqual(expectedDirs, seenDirs,
        "NodeManager could not identify disk failure.");
    NUnit.Framework.Assert.AreEqual(isHealthy, dirsHandler.AreDisksHealthy(),
        "Node's health in terms of disks is wrong");

    // Poll the RM until its view of the node matches the expectation:
    // one immediate check followed by at most 10 one-second retries.
    bool rmSeesHealthy = IsFirstRMNodeHealthy();
    for (int attempt = 0; attempt < 10 && rmSeesHealthy != isHealthy; attempt++)
    {
        // wait for the node health info to go to RM
        try
        {
            Sharpen.Thread.Sleep(1000);
        }
        catch (Exception)
        {
            // Best effort: keep polling even if the sleep was cut short.
            Log.Error("Interrupted while waiting for NM->RM heartbeat.");
        }
        rmSeesHealthy = IsFirstRMNodeHealthy();
    }

    NUnit.Framework.Assert.AreEqual(isHealthy, rmSeesHealthy,
        "RM is not updated with the health status of a node");
}

/// <summary>
/// Reads the first node known to the ResourceManager and reports whether its
/// state is anything other than <c>NodeState.Unhealthy</c>.
/// </summary>
/// <returns><em>true</em> if the first RM node is not marked unhealthy</returns>
private bool IsFirstRMNodeHealthy()
{
    using (IEnumerator<RMNode> nodes = yarnCluster.GetResourceManager().GetRMContext()
        .GetRMNodes().Values.GetEnumerator())
    {
        // An enumerator starts before the first element; it must be advanced
        // before Current is read. Fail clearly if the RM has no nodes at all.
        NUnit.Framework.Assert.IsTrue(nodes.MoveNext(),
            "ResourceManager has no registered nodes.");
        return nodes.Current.GetState() != NodeState.Unhealthy;
    }
}
/// <summary>
/// Prepares and launches the container: expands variables in the launch
/// commands and environment, writes the launch script and token file into the
/// NM-private directories, runs the container through the executor, and
/// dispatches the matching container event for the outcome.
/// </summary>
/// <returns>
/// 0 when the container was already being killed before launch or exited
/// with code 0; otherwise the exit code recorded in <c>ret</c> (initially -1,
/// <c>ContainerExitStatus.DisksFailed</c> when the disks are unhealthy, or the
/// value reported by the executor).
/// </returns>
public virtual int Call()
{
    // dispatcher not typed
    ContainerLaunchContext launchContext = container.GetLaunchContext();
    IDictionary<Path, IList<string>> localResources = null;
    ContainerId containerID = container.GetContainerId();
    string containerIdStr = ConverterUtils.ToString(containerID);
    IList<string> command = launchContext.GetCommands();
    int ret = -1;

    // CONTAINER_KILLED_ON_REQUEST should not be missed if the container
    // is already at KILLING
    if (container.GetContainerState() == ContainerState.Killing)
    {
        dispatcher.GetEventHandler().Handle(new ContainerExitEvent(containerID,
            ContainerEventType.ContainerKilledOnRequest,
            Shell.Windows
                ? ContainerExecutor.ExitCode.ForceKilled.GetExitCode()
                : ContainerExecutor.ExitCode.Terminated.GetExitCode(),
            "Container terminated before launch."));
        return 0;
    }

    try
    {
        localResources = container.GetLocalizedResources();
        if (localResources == null)
        {
            throw RPCUtil.GetRemoteException("Unable to get local resources when Container "
                + containerID + " is at " + container.GetContainerState());
        }
        string user = container.GetUser();

        // /////////////////////////// Variable expansion
        // Before the container script gets written out.
        IList<string> newCmds = new AList<string>(command.Count);
        string appIdStr = app.GetAppId().ToString();
        string relativeContainerLogDir = Org.Apache.Hadoop.Yarn.Server.Nodemanager.Containermanager.Launcher.ContainerLaunch
            .GetRelativeContainerLogDir(appIdStr, containerIdStr);
        Path containerLogDir = dirsHandler.GetLogPathForWrite(relativeContainerLogDir, false);
        foreach (string str in command)
        {
            // TODO: Should we instead work via symlinks without this grammar?
            newCmds.AddItem(ExpandEnvironment(str, containerLogDir));
        }
        launchContext.SetCommands(newCmds);

        IDictionary<string, string> environment = launchContext.GetEnvironment();
        // Snapshot the keys before rewriting values: KeyValuePair is immutable
        // in .NET (there is no SetValue), and a dictionary must not be written
        // to while it is being enumerated.
        string[] envKeys = new string[environment.Count];
        environment.Keys.CopyTo(envKeys, 0);
        foreach (string envKey in envKeys)
        {
            environment[envKey] = ExpandEnvironment(environment[envKey], containerLogDir);
        }
        // /////////////////////////// End of variable expansion

        FileContext lfs = FileContext.GetLocalFSFileContext();
        Path nmPrivateContainerScriptPath = dirsHandler.GetLocalPathForWrite(
            GetContainerPrivateDir(appIdStr, containerIdStr) + Path.Separator + ContainerScript);
        Path nmPrivateTokensPath = dirsHandler.GetLocalPathForWrite(
            GetContainerPrivateDir(appIdStr, containerIdStr) + Path.Separator
            + string.Format(ContainerLocalizer.TokenFileNameFmt, containerIdStr));
        Path nmPrivateClasspathJarDir = dirsHandler.GetLocalPathForWrite(
            GetContainerPrivateDir(appIdStr, containerIdStr));
        DataOutputStream containerScriptOutStream = null;
        DataOutputStream tokensOutStream = null;

        // Select the working directory for the container
        Path containerWorkDir = dirsHandler.GetLocalPathForWrite(
            ContainerLocalizer.Usercache + Path.Separator + user + Path.Separator
            + ContainerLocalizer.Appcache + Path.Separator + appIdStr + Path.Separator
            + containerIdStr, LocalDirAllocator.SizeUnknown, false);
        string pidFileSubpath = GetPidFileSubpath(appIdStr, containerIdStr);
        // pid file should be in nm private dir so that it is not
        // accessible by users
        pidFilePath = dirsHandler.GetLocalPathForWrite(pidFileSubpath);

        IList<string> localDirs = dirsHandler.GetLocalDirs();
        IList<string> logDirs = dirsHandler.GetLogDirs();
        IList<string> containerLogDirs = new AList<string>();
        foreach (string logDir in logDirs)
        {
            containerLogDirs.AddItem(logDir + Path.Separator + relativeContainerLogDir);
        }

        if (!dirsHandler.AreDisksHealthy())
        {
            // Record the disk-failure status first so the catch block below
            // reports it as the container's exit code.
            ret = ContainerExitStatus.DisksFailed;
            throw new IOException("Most of the disks failed. "
                + dirsHandler.GetDisksHealthReport(false));
        }

        try
        {
            // /////////// Write out the container-script in the nmPrivate space.
            IList<Path> appDirs = new AList<Path>(localDirs.Count);
            foreach (string localDir in localDirs)
            {
                Path usersdir = new Path(localDir, ContainerLocalizer.Usercache);
                Path userdir = new Path(usersdir, user);
                Path appsdir = new Path(userdir, ContainerLocalizer.Appcache);
                appDirs.AddItem(new Path(appsdir, appIdStr));
            }
            containerScriptOutStream = lfs.Create(nmPrivateContainerScriptPath,
                EnumSet.Of(CreateFlag.Create, CreateFlag.Overwrite));

            // Set the token location too.
            environment[ApplicationConstants.ContainerTokenFileEnvName] =
                new Path(containerWorkDir, FinalContainerTokensFile).ToUri().GetPath();

            // Sanitize the container's environment
            SanitizeEnv(environment, containerWorkDir, appDirs, containerLogDirs,
                localResources, nmPrivateClasspathJarDir);

            // Write out the environment
            exec.WriteLaunchEnv(containerScriptOutStream, environment, localResources,
                launchContext.GetCommands());
            // /////////// End of writing out container-script

            // /////////// Write out the container-tokens in the nmPrivate space.
            tokensOutStream = lfs.Create(nmPrivateTokensPath,
                EnumSet.Of(CreateFlag.Create, CreateFlag.Overwrite));
            Credentials creds = container.GetCredentials();
            creds.WriteTokenStorageToStream(tokensOutStream);
            // /////////// End of writing out container-tokens
        }
        finally
        {
            // Hand both streams to IOUtils.Cleanup whether or not writing succeeded.
            IOUtils.Cleanup(Log, containerScriptOutStream, tokensOutStream);
        }

        // LaunchContainer is a blocking call. We are here almost means the
        // container is launched, so send out the event.
        dispatcher.GetEventHandler().Handle(new ContainerEvent(containerID,
            ContainerEventType.ContainerLaunched));
        context.GetNMStateStore().StoreContainerLaunched(containerID);

        // Check if the container is signalled to be killed.
        if (!shouldLaunchContainer.CompareAndSet(false, true))
        {
            Log.Info("Container " + containerIdStr + " not launched as " + "cleanup already called");
            ret = ContainerExecutor.ExitCode.Terminated.GetExitCode();
        }
        else
        {
            exec.ActivateContainer(containerID, pidFilePath);
            ret = exec.LaunchContainer(container, nmPrivateContainerScriptPath,
                nmPrivateTokensPath, user, appIdStr, containerWorkDir, localDirs, logDirs);
        }
    }
    catch (Exception e)
    {
        Log.Warn("Failed to launch container.", e);
        dispatcher.GetEventHandler().Handle(new ContainerExitEvent(containerID,
            ContainerEventType.ContainerExitedWithFailure, ret, e.Message));
        return ret;
    }
    finally
    {
        // Runs on every path out of the try, including the early return above.
        completed.Set(true);
        exec.DeactivateContainer(containerID);
        try
        {
            context.GetNMStateStore().StoreContainerCompleted(containerID, ret);
        }
        catch (IOException)
        {
            Log.Error("Unable to set exit code for container " + containerID);
        }
    }

    if (Log.IsDebugEnabled())
    {
        Log.Debug("Container " + containerIdStr + " completed with exit code " + ret);
    }

    if (ret == ContainerExecutor.ExitCode.ForceKilled.GetExitCode()
        || ret == ContainerExecutor.ExitCode.Terminated.GetExitCode())
    {
        // If the process was killed, Send container_cleanedup_after_kill and
        // just break out of this method.
        dispatcher.GetEventHandler().Handle(new ContainerExitEvent(containerID,
            ContainerEventType.ContainerKilledOnRequest, ret,
            "Container exited with a non-zero exit code " + ret));
        return ret;
    }

    if (ret != 0)
    {
        Log.Warn("Container exited with a non-zero exit code " + ret);
        this.dispatcher.GetEventHandler().Handle(new ContainerExitEvent(containerID,
            ContainerEventType.ContainerExitedWithFailure, ret,
            "Container exited with a non-zero exit code " + ret));
        return ret;
    }

    Log.Info("Container " + containerIdStr + " succeeded ");
    dispatcher.GetEventHandler().Handle(new ContainerEvent(containerID,
        ContainerEventType.ContainerExitedWithSuccess));
    return 0;
}