Пример #1
0
        /// <summary>
        /// Starts a one-node mini YARN cluster and then makes its nm-local-dirs or
        /// nm-log-dirs fail step by step, verifying after each step the list of good
        /// directories and the overall health of the node.
        /// </summary>
        /// <param name="localORLogDirs">
        /// <em>true</em> to exercise nm-local-dirs, <em>false</em> to exercise
        /// nm-log-dirs
        /// </param>
        /// <exception cref="System.IO.IOException"/>
        private void TestDirsFailures(bool localORLogDirs)
        {
            string dirType      = localORLogDirs ? "local" : "log";
            string dirsProperty = localORLogDirs
                                  ? YarnConfiguration.NmLocalDirs
                                  : YarnConfiguration.NmLogDirs;
            Configuration conf = new Configuration();

            // set disk health check interval to a small value (say 1 sec).
            conf.SetLong(YarnConfiguration.NmDiskHealthCheckIntervalMs, DiskHealthCheckInterval);
            // If 2 out of the total 4 local-dirs fail OR if 2 out of the total 4
            // log-dirs fail, then the node's health status should become unhealthy.
            conf.SetFloat(YarnConfiguration.NmMinHealthyDisksFraction, 0.60F);
            // If a cluster already exists, stop it and start again from an empty
            // base directory.
            if (yarnCluster != null)
            {
                yarnCluster.Stop();
                FileUtil.FullyDelete(localFSDirBase);
                localFSDirBase.Mkdirs();
            }
            Log.Info("Starting up YARN cluster");
            yarnCluster = new MiniYARNCluster(typeof(TestDiskFailures).FullName, 1, numLocalDirs,
                                              numLogDirs);
            yarnCluster.Init(conf);
            yarnCluster.Start();
            NodeManager nm = yarnCluster.GetNodeManager(0);

            Log.Info("Configured nm-" + dirType + "-dirs=" + nm.GetConfig().Get(dirsProperty));
            dirsHandler = nm.GetNodeHealthChecker().GetDiskHandler();
            IList <string> list = localORLogDirs ? dirsHandler.GetLocalDirs() : dirsHandler.GetLogDirs();

            string[] dirs = Sharpen.Collections.ToArray(list, new string[list.Count]);
            // Compare against the count that matches the directory type under test;
            // the cluster is created with separate local-dir and log-dir counts.
            NUnit.Framework.Assert.AreEqual("Number of nm-" + dirType + "-dirs is wrong.",
                                            localORLogDirs ? numLocalDirs : numLogDirs, dirs.Length);
            string expectedDirs = StringUtils.Join(",", list);

            // validate the health of disks initially
            VerifyDisksHealth(localORLogDirs, expectedDirs, true);
            // Make 1 nm-local-dir fail and verify if the nodemanager can identify
            // the disk failure(s) and can update the list of good nm-local-dirs.
            // NOTE: the indices used below assume that four directories are configured.
            PrepareDirToFail(dirs[2]);
            expectedDirs = dirs[0] + "," + dirs[1] + "," + dirs[3];
            VerifyDisksHealth(localORLogDirs, expectedDirs, true);
            // Now, make 1 more nm-local-dir/nm-log-dir fail and verify if "the
            // nodemanager can identify the disk failures and can update the list of
            // good nm-local-dirs/nm-log-dirs and can update the overall health status
            // of the node to unhealthy".
            PrepareDirToFail(dirs[0]);
            expectedDirs = dirs[1] + "," + dirs[3];
            VerifyDisksHealth(localORLogDirs, expectedDirs, false);
            // Fail the remaining 2 local-dirs/log-dirs and verify if NM remains with
            // empty list of local-dirs/log-dirs and the overall health status is
            // unhealthy.
            PrepareDirToFail(dirs[1]);
            PrepareDirToFail(dirs[3]);
            expectedDirs = string.Empty;
            VerifyDisksHealth(localORLogDirs, expectedDirs, false);
        }
Пример #2
0
        /// <summary>
        /// Checks that the good-directory list and disk health reported by the
        /// directories handler, and the node state held by the ResourceManager, match
        /// the expectation.
        /// </summary>
        /// <param name="localORLogDirs">
        /// <em>true</em> selects nm-local-dirs, <em>false</em> selects nm-log-dirs
        /// </param>
        /// <param name="expectedDirs">
        /// comma-separated nm-local-dirs/nm-log-dirs expected to be reported as good
        /// </param>
        /// <param name="isHealthy"><em>true</em> if the overall node should be healthy</param>
        private void VerifyDisksHealth(bool localORLogDirs, string expectedDirs, bool isHealthy)
        {
            // Give the NodeManager a chance to notice disk failures first.
            WaitForDiskHealthCheck();

            IList <string> goodDirs;
            if (localORLogDirs)
            {
                goodDirs = dirsHandler.GetLocalDirs();
            }
            else
            {
                goodDirs = dirsHandler.GetLogDirs();
            }
            string seenDirs = StringUtils.Join(",", goodDirs);

            Log.Info("ExpectedDirs=" + expectedDirs);
            Log.Info("SeenDirs=" + seenDirs);
            bool dirsMatch = expectedDirs.Equals(seenDirs);
            NUnit.Framework.Assert.IsTrue("NodeManager could not identify disk failure.", dirsMatch);
            NUnit.Framework.Assert.AreEqual("Node's health in terms of disks is wrong",
                                            isHealthy, dirsHandler.AreDisksHealthy());

            // Poll the ResourceManager up to 10 times, one second apart, until the
            // state of the first node it knows about agrees with the expectation.
            int attempt = 0;
            while (attempt < 10)
            {
                IEnumerator <RMNode> nodes = yarnCluster.GetResourceManager().GetRMContext()
                                             .GetRMNodes().Values.GetEnumerator();
                bool reportedHealthy = nodes.Next().GetState() != NodeState.Unhealthy;
                if (reportedHealthy == isHealthy)
                {
                    break;
                }
                // wait for the node health info to go to RM
                try
                {
                    Sharpen.Thread.Sleep(1000);
                }
                catch (Exception)
                {
                    Log.Error("Interrupted while waiting for NM->RM heartbeat.");
                }
                attempt++;
            }

            // Final check against a freshly obtained enumerator.
            IEnumerator <RMNode> finalNodes = yarnCluster.GetResourceManager().GetRMContext()
                                              .GetRMNodes().Values.GetEnumerator();
            NUnit.Framework.Assert.AreEqual("RM is not updated with the health status of a node",
                                            isHealthy, finalNodes.Next().GetState() != NodeState.Unhealthy);
        }
Пример #3
0
        /// <summary>
        /// Configures two local-dirs and two log-dirs, makes one of each fail before
        /// the LocalDirsHandlerService is initialised, and checks that only the
        /// remaining directory of each kind is reported afterwards.
        /// </summary>
        public virtual void TestDirFailuresOnStartup()
        {
            Configuration conf = new YarnConfiguration();
            string failedLocalDir  = new FilePath(testDir, "localDir1").GetPath();
            string healthyLocalDir = new FilePath(testDir, "localDir2").GetPath();
            string healthyLogDir   = new FilePath(testDir, "logDir1").GetPath();
            string failedLogDir    = new FilePath(testDir, "logDir2").GetPath();

            conf.Set(YarnConfiguration.NmLocalDirs, string.Join(",", failedLocalDir, healthyLocalDir));
            conf.Set(YarnConfiguration.NmLogDirs, string.Join(",", healthyLogDir, failedLogDir));

            // Break one directory of each kind before the service starts.
            PrepareDirToFail(failedLocalDir);
            PrepareDirToFail(failedLogDir);

            LocalDirsHandlerService handler = new LocalDirsHandlerService();
            handler.Init(conf);

            // Only the untouched local-dir should be reported.
            IList <string> reportedLocalDirs = handler.GetLocalDirs();
            NUnit.Framework.Assert.AreEqual(1, reportedLocalDirs.Count);
            string expectedLocalDir = new Path(healthyLocalDir).ToString();
            NUnit.Framework.Assert.AreEqual(expectedLocalDir, reportedLocalDirs[0]);

            // Likewise, only the untouched log-dir should be reported.
            IList <string> reportedLogDirs = handler.GetLogDirs();
            NUnit.Framework.Assert.AreEqual(1, reportedLogDirs.Count);
            string expectedLogDir = new Path(healthyLogDir).ToString();
            NUnit.Framework.Assert.AreEqual(expectedLogDir, reportedLogDirs[0]);
        }
Пример #4
0
        /// <summary>
        /// Expands variables in the container's commands and environment, writes the
        /// launch script and token file, launches the container through the executor
        /// and reports the outcome to the dispatcher.
        /// </summary>
        /// <returns>
        /// 0 when the container was already being killed before launch or exited
        /// successfully; otherwise the exit code recorded for the container (which
        /// stays -1 if the launch failed before an exit code was obtained).
        /// </returns>
        public virtual int Call()
        {
            // dispatcher not typed
            ContainerLaunchContext launchContext = container.GetLaunchContext();
            IDictionary <Path, IList <string> > localResources = null;
            ContainerId    containerID    = container.GetContainerId();
            string         containerIdStr = ConverterUtils.ToString(containerID);
            IList <string> command        = launchContext.GetCommands();
            int            ret            = -1;

            // CONTAINER_KILLED_ON_REQUEST should not be missed if the container
            // is already at KILLING
            if (container.GetContainerState() == ContainerState.Killing)
            {
                dispatcher.GetEventHandler().Handle(new ContainerExitEvent(
                    containerID,
                    ContainerEventType.ContainerKilledOnRequest,
                    Shell.Windows
                        ? ContainerExecutor.ExitCode.ForceKilled.GetExitCode()
                        : ContainerExecutor.ExitCode.Terminated.GetExitCode(),
                    "Container terminated before launch."));
                return(0);
            }
            try
            {
                localResources = container.GetLocalizedResources();
                if (localResources == null)
                {
                    throw RPCUtil.GetRemoteException("Unable to get local resources when Container "
                                                     + containerID + " is at " + container.GetContainerState());
                }
                string user = container.GetUser();
                // /////////////////////////// Variable expansion
                // Before the container script gets written out.
                IList <string> newCmds  = new AList <string>(command.Count);
                string         appIdStr = app.GetAppId().ToString();
                string         relativeContainerLogDir = Org.Apache.Hadoop.Yarn.Server.Nodemanager.Containermanager.Launcher.ContainerLaunch
                                                         .GetRelativeContainerLogDir(appIdStr, containerIdStr);
                Path containerLogDir = dirsHandler.GetLogPathForWrite(relativeContainerLogDir, false);
                foreach (string str in command)
                {
                    // TODO: Should we instead work via symlinks without this grammar?
                    newCmds.AddItem(ExpandEnvironment(str, containerLogDir));
                }
                launchContext.SetCommands(newCmds);
                IDictionary <string, string> environment = launchContext.GetEnvironment();
                // Iterate over a snapshot of the keys and write the expanded values
                // back through the indexer: a KeyValuePair is an immutable copy, and
                // a dictionary must not be modified while it is being enumerated.
                foreach (string envKey in new List <string>(environment.Keys))
                {
                    environment[envKey] = ExpandEnvironment(environment[envKey], containerLogDir);
                }
                // /////////////////////////// End of variable expansion
                FileContext lfs = FileContext.GetLocalFSFileContext();
                Path        nmPrivateContainerScriptPath = dirsHandler.GetLocalPathForWrite(
                    GetContainerPrivateDir(appIdStr, containerIdStr) + Path.Separator + ContainerScript);
                Path nmPrivateTokensPath = dirsHandler.GetLocalPathForWrite(
                    GetContainerPrivateDir(appIdStr, containerIdStr) + Path.Separator
                    + string.Format(ContainerLocalizer.TokenFileNameFmt, containerIdStr));
                Path nmPrivateClasspathJarDir = dirsHandler.GetLocalPathForWrite(
                    GetContainerPrivateDir(appIdStr, containerIdStr));
                DataOutputStream containerScriptOutStream = null;
                DataOutputStream tokensOutStream          = null;
                // Select the working directory for the container
                Path containerWorkDir = dirsHandler.GetLocalPathForWrite(
                    ContainerLocalizer.Usercache + Path.Separator + user + Path.Separator
                    + ContainerLocalizer.Appcache + Path.Separator + appIdStr + Path.Separator
                    + containerIdStr, LocalDirAllocator.SizeUnknown, false);
                string pidFileSubpath = GetPidFileSubpath(appIdStr, containerIdStr);
                // pid file should be in nm private dir so that it is not
                // accessible by users
                pidFilePath = dirsHandler.GetLocalPathForWrite(pidFileSubpath);
                IList <string> localDirs        = dirsHandler.GetLocalDirs();
                IList <string> logDirs          = dirsHandler.GetLogDirs();
                IList <string> containerLogDirs = new AList <string>();
                foreach (string logDir in logDirs)
                {
                    containerLogDirs.AddItem(logDir + Path.Separator + relativeContainerLogDir);
                }
                if (!dirsHandler.AreDisksHealthy())
                {
                    // Record the dedicated exit status before throwing so that the
                    // catch block below reports it with the failure event.
                    ret = ContainerExitStatus.DisksFailed;
                    throw new IOException("Most of the disks failed. " + dirsHandler.GetDisksHealthReport(false));
                }
                try
                {
                    // /////////// Write out the container-script in the nmPrivate space.
                    IList <Path> appDirs = new AList <Path>(localDirs.Count);
                    foreach (string localDir in localDirs)
                    {
                        Path usersdir = new Path(localDir, ContainerLocalizer.Usercache);
                        Path userdir  = new Path(usersdir, user);
                        Path appsdir  = new Path(userdir, ContainerLocalizer.Appcache);
                        appDirs.AddItem(new Path(appsdir, appIdStr));
                    }
                    containerScriptOutStream = lfs.Create(nmPrivateContainerScriptPath,
                                                          EnumSet.Of(CreateFlag.Create, CreateFlag.Overwrite));
                    // Set the token location too.
                    environment[ApplicationConstants.ContainerTokenFileEnvName] =
                        new Path(containerWorkDir, FinalContainerTokensFile).ToUri().GetPath();
                    // Sanitize the container's environment
                    SanitizeEnv(environment, containerWorkDir, appDirs, containerLogDirs, localResources,
                                nmPrivateClasspathJarDir);
                    // Write out the environment
                    exec.WriteLaunchEnv(containerScriptOutStream, environment, localResources,
                                        launchContext.GetCommands());
                    // /////////// End of writing out container-script
                    // /////////// Write out the container-tokens in the nmPrivate space.
                    tokensOutStream = lfs.Create(nmPrivateTokensPath,
                                                 EnumSet.Of(CreateFlag.Create, CreateFlag.Overwrite));
                    Credentials creds = container.GetCredentials();
                    creds.WriteTokenStorageToStream(tokensOutStream);
                }
                finally
                {
                    // /////////// End of writing out container-tokens
                    IOUtils.Cleanup(Log, containerScriptOutStream, tokensOutStream);
                }
                // LaunchContainer is a blocking call. We are here almost means the
                // container is launched, so send out the event.
                dispatcher.GetEventHandler().Handle(new ContainerEvent(containerID,
                                                                       ContainerEventType.ContainerLaunched));
                context.GetNMStateStore().StoreContainerLaunched(containerID);
                // Check if the container is signalled to be killed.
                if (!shouldLaunchContainer.CompareAndSet(false, true))
                {
                    Log.Info("Container " + containerIdStr + " not launched as " + "cleanup already called");
                    ret = ContainerExecutor.ExitCode.Terminated.GetExitCode();
                }
                else
                {
                    exec.ActivateContainer(containerID, pidFilePath);
                    ret = exec.LaunchContainer(container, nmPrivateContainerScriptPath, nmPrivateTokensPath,
                                               user, appIdStr, containerWorkDir, localDirs, logDirs);
                }
            }
            catch (Exception e)
            {
                Log.Warn("Failed to launch container.", e);
                dispatcher.GetEventHandler().Handle(new ContainerExitEvent(
                    containerID, ContainerEventType.ContainerExitedWithFailure, ret, e.Message));
                return(ret);
            }
            finally
            {
                // Runs on every path out of the try block, including the failure
                // return above.
                completed.Set(true);
                exec.DeactivateContainer(containerID);
                try
                {
                    context.GetNMStateStore().StoreContainerCompleted(containerID, ret);
                }
                catch (IOException)
                {
                    Log.Error("Unable to set exit code for container " + containerID);
                }
            }
            if (Log.IsDebugEnabled())
            {
                Log.Debug("Container " + containerIdStr + " completed with exit code " + ret);
            }
            if (ret == ContainerExecutor.ExitCode.ForceKilled.GetExitCode()
                || ret == ContainerExecutor.ExitCode.Terminated.GetExitCode())
            {
                // If the process was killed, Send container_cleanedup_after_kill and
                // just break out of this method.
                dispatcher.GetEventHandler().Handle(new ContainerExitEvent(
                    containerID, ContainerEventType.ContainerKilledOnRequest, ret,
                    "Container exited with a non-zero exit code " + ret));
                return(ret);
            }
            if (ret != 0)
            {
                Log.Warn("Container exited with a non-zero exit code " + ret);
                this.dispatcher.GetEventHandler().Handle(new ContainerExitEvent(
                    containerID, ContainerEventType.ContainerExitedWithFailure, ret,
                    "Container exited with a non-zero exit code " + ret));
                return(ret);
            }
            Log.Info("Container " + containerIdStr + " succeeded ");
            dispatcher.GetEventHandler().Handle(new ContainerEvent(containerID,
                                                                   ContainerEventType.ContainerExitedWithSuccess));
            return(0);
        }