/// <summary>
/// Starts a fresh mini YARN cluster and then fails its nm-local-dirs or
/// nm-log-dirs step by step, verifying after each step the list of good
/// dirs reported by the NodeManager and the resulting node health.
/// </summary>
/// <param name="localORLogDirs">
/// <em>true</em> to exercise nm-local-dirs, <em>false</em> to exercise nm-log-dirs
/// </param>
/// <exception cref="System.IO.IOException"/>
private void TestDirsFailures(bool localORLogDirs)
{
    string dirType = localORLogDirs ? "local" : "log";
    string dirsProperty = localORLogDirs ? YarnConfiguration.NmLocalDirs : YarnConfiguration.NmLogDirs;
    // The cluster is created with separate counts for local-dirs and log-dirs,
    // so the expected count must follow the kind of dirs under test.
    int expectedDirCount = localORLogDirs ? numLocalDirs : numLogDirs;

    Configuration conf = new Configuration();
    // set disk health check interval to a small value (say 1 sec).
    conf.SetLong(YarnConfiguration.NmDiskHealthCheckIntervalMs, DiskHealthCheckInterval);
    // If 2 out of the total 4 local-dirs fail OR if 2 Out of the total 4
    // log-dirs fail, then the node's health status should become unhealthy.
    conf.SetFloat(YarnConfiguration.NmMinHealthyDisksFraction, 0.60F);

    // Tear down any cluster left over from a previous run and reset its base dir.
    if (yarnCluster != null)
    {
        yarnCluster.Stop();
        FileUtil.FullyDelete(localFSDirBase);
        localFSDirBase.Mkdirs();
    }

    Log.Info("Starting up YARN cluster");
    yarnCluster = new MiniYARNCluster(typeof(TestDiskFailures).FullName, 1, numLocalDirs, numLogDirs);
    yarnCluster.Init(conf);
    yarnCluster.Start();

    NodeManager nm = yarnCluster.GetNodeManager(0);
    Log.Info("Configured nm-" + dirType + "-dirs=" + nm.GetConfig().Get(dirsProperty));
    dirsHandler = nm.GetNodeHealthChecker().GetDiskHandler();
    IList<string> list = localORLogDirs ? dirsHandler.GetLocalDirs() : dirsHandler.GetLogDirs();
    string[] dirs = Sharpen.Collections.ToArray(list, new string[list.Count]);
    NUnit.Framework.Assert.AreEqual("Number of nm-" + dirType + "-dirs is wrong.", expectedDirCount, dirs.Length);
    string expectedDirs = StringUtils.Join(",", list);

    // validate the health of disks initially
    VerifyDisksHealth(localORLogDirs, expectedDirs, true);

    // Make 1 nm-local-dir fail and verify if "the nodemanager can identify
    // the disk failure(s) and can update the list of good nm-local-dirs.
    PrepareDirToFail(dirs[2]);
    expectedDirs = dirs[0] + "," + dirs[1] + "," + dirs[3];
    VerifyDisksHealth(localORLogDirs, expectedDirs, true);

    // Now, make 1 more nm-local-dir/nm-log-dir fail and verify if "the
    // nodemanager can identify the disk failures and can update the list of
    // good nm-local-dirs/nm-log-dirs and can update the overall health status
    // of the node to unhealthy".
    PrepareDirToFail(dirs[0]);
    expectedDirs = dirs[1] + "," + dirs[3];
    VerifyDisksHealth(localORLogDirs, expectedDirs, false);

    // Fail the remaining 2 local-dirs/log-dirs and verify if NM remains with
    // empty list of local-dirs/log-dirs and the overall health status is
    // unhealthy.
    PrepareDirToFail(dirs[1]);
    PrepareDirToFail(dirs[3]);
    expectedDirs = string.Empty;
    VerifyDisksHealth(localORLogDirs, expectedDirs, false);
}
/// <summary>
/// Checks that the NodeManager reports the expected set of good dirs and
/// the expected disk health, and that the ResourceManager's node state
/// agrees with that health.
/// </summary>
/// <param name="localORLogDirs">
/// <em>true</em> to inspect nm-local-dirs, <em>false</em> to inspect nm-log-dirs
/// </param>
/// <param name="expectedDirs">comma-joined nm-local-dirs/nm-log-dirs expected to be reported</param>
/// <param name="isHealthy"><em>true</em> if the node as a whole is expected to be healthy</param>
private void VerifyDisksHealth(bool localORLogDirs, string expectedDirs, bool isHealthy)
{
    // Wait for the NodeManager to identify disk failures.
    WaitForDiskHealthCheck();

    IList<string> reportedDirs;
    if (localORLogDirs)
    {
        reportedDirs = dirsHandler.GetLocalDirs();
    }
    else
    {
        reportedDirs = dirsHandler.GetLogDirs();
    }
    string seenDirs = StringUtils.Join(",", reportedDirs);
    Log.Info("ExpectedDirs=" + expectedDirs);
    Log.Info("SeenDirs=" + seenDirs);

    bool dirsMatch = expectedDirs.Equals(seenDirs);
    NUnit.Framework.Assert.IsTrue("NodeManager could not identify disk failure.", dirsMatch);
    bool disksHealthy = dirsHandler.AreDisksHealthy();
    NUnit.Framework.Assert.AreEqual("Node's health in terms of disks is wrong", isHealthy, disksHealthy);

    // Poll the RM up to 10 times, pausing between polls, until its view of
    // the first node matches the expected health.
    int attempt = 0;
    while (attempt < 10)
    {
        IEnumerator<RMNode> polledNodes = yarnCluster.GetResourceManager().GetRMContext().GetRMNodes().Values.GetEnumerator();
        bool polledHealthy = polledNodes.Next().GetState() != NodeState.Unhealthy;
        if (polledHealthy == isHealthy)
        {
            break;
        }
        // wait for the node health info to go to RM
        try
        {
            Sharpen.Thread.Sleep(1000);
        }
        catch (Exception)
        {
            Log.Error("Interrupted while waiting for NM->RM heartbeat.");
        }
        attempt++;
    }

    // Final check against a freshly fetched node state.
    IEnumerator<RMNode> finalNodes = yarnCluster.GetResourceManager().GetRMContext().GetRMNodes().Values.GetEnumerator();
    bool rmSeesHealthy = finalNodes.Next().GetState() != NodeState.Unhealthy;
    NUnit.Framework.Assert.AreEqual("RM is not updated with the health status of a node", isHealthy, rmSeesHealthy);
}
/// <summary>
/// Configures two local dirs and two log dirs, passes one of each to
/// PrepareDirToFail before the dirs handler is initialised, and asserts
/// that LocalDirsHandlerService then reports only the other dir of each kind.
/// </summary>
public virtual void TestDirFailuresOnStartup()
{
    Configuration conf = new YarnConfiguration();

    string failedLocalDir = new FilePath(testDir, "localDir1").GetPath();
    string goodLocalDir = new FilePath(testDir, "localDir2").GetPath();
    string goodLogDir = new FilePath(testDir, "logDir1").GetPath();
    string failedLogDir = new FilePath(testDir, "logDir2").GetPath();

    string configuredLocalDirs = failedLocalDir + "," + goodLocalDir;
    string configuredLogDirs = goodLogDir + "," + failedLogDir;
    conf.Set(YarnConfiguration.NmLocalDirs, configuredLocalDirs);
    conf.Set(YarnConfiguration.NmLogDirs, configuredLogDirs);

    PrepareDirToFail(failedLocalDir);
    PrepareDirToFail(failedLogDir);

    LocalDirsHandlerService handler = new LocalDirsHandlerService();
    handler.Init(conf);

    // Only the local dir that was not passed to PrepareDirToFail should be reported.
    IList<string> reportedLocalDirs = handler.GetLocalDirs();
    NUnit.Framework.Assert.AreEqual(1, reportedLocalDirs.Count);
    string expectedLocalDir = new Path(goodLocalDir).ToString();
    NUnit.Framework.Assert.AreEqual(expectedLocalDir, reportedLocalDirs[0]);

    // Likewise for the log dirs.
    IList<string> reportedLogDirs = handler.GetLogDirs();
    NUnit.Framework.Assert.AreEqual(1, reportedLogDirs.Count);
    string expectedLogDir = new Path(goodLogDir).ToString();
    NUnit.Framework.Assert.AreEqual(expectedLogDir, reportedLogDirs[0]);
}
/// <summary>
/// Prepares and launches the container: expands variables in the launch
/// commands and environment, writes the container script and token file
/// to NM-private paths, launches through the container executor, and
/// dispatches the container event that matches the outcome.
/// </summary>
/// <returns>
/// 0 when the container was already being killed before launch or when it
/// exited successfully; otherwise the value of <c>ret</c>, which is the
/// executor's exit code, <c>ContainerExitStatus.DisksFailed</c>, the
/// "terminated" exit code when cleanup was already requested, or the
/// initial -1 if the launch failed before an exit code was produced.
/// </returns>
public virtual int Call()
{
    // dispatcher not typed
    ContainerLaunchContext launchContext = container.GetLaunchContext();
    IDictionary<Path, IList<string>> localResources = null;
    ContainerId containerID = container.GetContainerId();
    string containerIdStr = ConverterUtils.ToString(containerID);
    IList<string> command = launchContext.GetCommands();
    int ret = -1;

    // CONTAINER_KILLED_ON_REQUEST should not be missed if the container
    // is already at KILLING
    if (container.GetContainerState() == ContainerState.Killing)
    {
        dispatcher.GetEventHandler().Handle(new ContainerExitEvent(containerID, ContainerEventType.ContainerKilledOnRequest,
            Shell.Windows ? ContainerExecutor.ExitCode.ForceKilled.GetExitCode() : ContainerExecutor.ExitCode.Terminated.GetExitCode(),
            "Container terminated before launch."));
        return 0;
    }

    try
    {
        localResources = container.GetLocalizedResources();
        if (localResources == null)
        {
            throw RPCUtil.GetRemoteException("Unable to get local resources when Container " + containerID + " is at " + container.GetContainerState());
        }
        string user = container.GetUser();

        // /////////////////////////// Variable expansion
        // Before the container script gets written out.
        IList<string> newCmds = new AList<string>(command.Count);
        string appIdStr = app.GetAppId().ToString();
        string relativeContainerLogDir = Org.Apache.Hadoop.Yarn.Server.Nodemanager.Containermanager.Launcher.ContainerLaunch.GetRelativeContainerLogDir(appIdStr, containerIdStr);
        Path containerLogDir = dirsHandler.GetLogPathForWrite(relativeContainerLogDir, false);
        foreach (string str in command)
        {
            // TODO: Should we instead work via symlinks without this grammar?
            newCmds.AddItem(ExpandEnvironment(str, containerLogDir));
        }
        launchContext.SetCommands(newCmds);

        IDictionary<string, string> environment = launchContext.GetEnvironment();
        // Iterate over a snapshot of the keys: KeyValuePair entries are
        // immutable, and writing to a dictionary while enumerating it
        // invalidates the enumerator.
        foreach (string envKey in new List<string>(environment.Keys))
        {
            environment[envKey] = ExpandEnvironment(environment[envKey], containerLogDir);
        }
        // /////////////////////////// End of variable expansion

        FileContext lfs = FileContext.GetLocalFSFileContext();
        Path nmPrivateContainerScriptPath = dirsHandler.GetLocalPathForWrite(GetContainerPrivateDir(appIdStr, containerIdStr) + Path.Separator + ContainerScript);
        Path nmPrivateTokensPath = dirsHandler.GetLocalPathForWrite(GetContainerPrivateDir(appIdStr, containerIdStr) + Path.Separator + string.Format(ContainerLocalizer.TokenFileNameFmt, containerIdStr));
        Path nmPrivateClasspathJarDir = dirsHandler.GetLocalPathForWrite(GetContainerPrivateDir(appIdStr, containerIdStr));
        DataOutputStream containerScriptOutStream = null;
        DataOutputStream tokensOutStream = null;

        // Select the working directory for the container
        Path containerWorkDir = dirsHandler.GetLocalPathForWrite(ContainerLocalizer.Usercache + Path.Separator + user + Path.Separator + ContainerLocalizer.Appcache + Path.Separator + appIdStr + Path.Separator + containerIdStr, LocalDirAllocator.SizeUnknown, false);
        string pidFileSubpath = GetPidFileSubpath(appIdStr, containerIdStr);
        // pid file should be in nm private dir so that it is not
        // accessible by users
        pidFilePath = dirsHandler.GetLocalPathForWrite(pidFileSubpath);
        IList<string> localDirs = dirsHandler.GetLocalDirs();
        IList<string> logDirs = dirsHandler.GetLogDirs();
        IList<string> containerLogDirs = new AList<string>();
        foreach (string logDir in logDirs)
        {
            containerLogDirs.AddItem(logDir + Path.Separator + relativeContainerLogDir);
        }

        // Refuse to launch when the dirs handler reports unhealthy disks;
        // ret is set first so the failure event carries DisksFailed.
        if (!dirsHandler.AreDisksHealthy())
        {
            ret = ContainerExitStatus.DisksFailed;
            throw new IOException("Most of the disks failed. " + dirsHandler.GetDisksHealthReport(false));
        }

        try
        {
            // /////////// Write out the container-script in the nmPrivate space.
            IList<Path> appDirs = new AList<Path>(localDirs.Count);
            foreach (string localDir in localDirs)
            {
                Path usersdir = new Path(localDir, ContainerLocalizer.Usercache);
                Path userdir = new Path(usersdir, user);
                Path appsdir = new Path(userdir, ContainerLocalizer.Appcache);
                appDirs.AddItem(new Path(appsdir, appIdStr));
            }
            containerScriptOutStream = lfs.Create(nmPrivateContainerScriptPath, EnumSet.Of(CreateFlag.Create, CreateFlag.Overwrite));
            // Set the token location too.
            environment[ApplicationConstants.ContainerTokenFileEnvName] = new Path(containerWorkDir, FinalContainerTokensFile).ToUri().GetPath();
            // Sanitize the container's environment
            SanitizeEnv(environment, containerWorkDir, appDirs, containerLogDirs, localResources, nmPrivateClasspathJarDir);
            // Write out the environment
            exec.WriteLaunchEnv(containerScriptOutStream, environment, localResources, launchContext.GetCommands());
            // /////////// End of writing out container-script

            // /////////// Write out the container-tokens in the nmPrivate space.
            tokensOutStream = lfs.Create(nmPrivateTokensPath, EnumSet.Of(CreateFlag.Create, CreateFlag.Overwrite));
            Credentials creds = container.GetCredentials();
            creds.WriteTokenStorageToStream(tokensOutStream);
        }
        finally
        {
            // /////////// End of writing out container-tokens
            IOUtils.Cleanup(Log, containerScriptOutStream, tokensOutStream);
        }

        // LaunchContainer is a blocking call. We are here almost means the
        // container is launched, so send out the event.
        dispatcher.GetEventHandler().Handle(new ContainerEvent(containerID, ContainerEventType.ContainerLaunched));
        context.GetNMStateStore().StoreContainerLaunched(containerID);

        // Check if the container is signalled to be killed.
        if (!shouldLaunchContainer.CompareAndSet(false, true))
        {
            Log.Info("Container " + containerIdStr + " not launched as " + "cleanup already called");
            ret = ContainerExecutor.ExitCode.Terminated.GetExitCode();
        }
        else
        {
            exec.ActivateContainer(containerID, pidFilePath);
            ret = exec.LaunchContainer(container, nmPrivateContainerScriptPath, nmPrivateTokensPath, user, appIdStr, containerWorkDir, localDirs, logDirs);
        }
    }
    catch (Exception e)
    {
        Log.Warn("Failed to launch container.", e);
        dispatcher.GetEventHandler().Handle(new ContainerExitEvent(containerID, ContainerEventType.ContainerExitedWithFailure, ret, e.Message));
        return ret;
    }
    finally
    {
        // Runs on every path out of the try block, including the failure return above.
        completed.Set(true);
        exec.DeactivateContainer(containerID);
        try
        {
            context.GetNMStateStore().StoreContainerCompleted(containerID, ret);
        }
        catch (IOException)
        {
            Log.Error("Unable to set exit code for container " + containerID);
        }
    }

    if (Log.IsDebugEnabled())
    {
        Log.Debug("Container " + containerIdStr + " completed with exit code " + ret);
    }

    if (ret == ContainerExecutor.ExitCode.ForceKilled.GetExitCode() || ret == ContainerExecutor.ExitCode.Terminated.GetExitCode())
    {
        // If the process was killed, Send container_cleanedup_after_kill and
        // just break out of this method.
        dispatcher.GetEventHandler().Handle(new ContainerExitEvent(containerID, ContainerEventType.ContainerKilledOnRequest, ret, "Container exited with a non-zero exit code " + ret));
        return ret;
    }

    if (ret != 0)
    {
        Log.Warn("Container exited with a non-zero exit code " + ret);
        this.dispatcher.GetEventHandler().Handle(new ContainerExitEvent(containerID, ContainerEventType.ContainerExitedWithFailure, ret, "Container exited with a non-zero exit code " + ret));
        return ret;
    }

    Log.Info("Container " + containerIdStr + " succeeded ");
    dispatcher.GetEventHandler().Handle(new ContainerEvent(containerID, ContainerEventType.ContainerExitedWithSuccess));
    return 0;
}