/// <summary>
/// make the necessary datastructure for starting a new physical process
/// </summary>
/// <remarks>
/// Environment variables are written with indexer assignment rather than Add, so a variable
/// that is already present (inherited from this service's own environment, or set by the
/// config) is overwritten instead of causing an ArgumentException. The Peloponnese-specific
/// variables are written last and therefore take precedence over config-supplied values of
/// the same name.
/// </remarks>
/// <param name="wd">the new process' working directory</param>
/// <param name="details">a descriptor indicating how to start the process</param>
/// <param name="id">the process' unique identifier</param>
/// <returns>the datastructure needed to start the process</returns>
private ProcessStartInfo MakeStartInfo(string wd, ExeDetails details, string id)
{
    ProcessStartInfo startInfo = new ProcessStartInfo();
    startInfo.CreateNoWindow = true;
    startInfo.UseShellExecute = false;
    startInfo.WorkingDirectory = wd;

    // only redirect the streams that the descriptor asks to have captured
    if (details.stdoutFile != null)
    {
        startInfo.RedirectStandardOutput = true;
    }
    if (details.stderrFile != null)
    {
        startInfo.RedirectStandardError = true;
    }

    startInfo.Arguments = details.commandLineArguments;

    if (Path.IsPathRooted(details.commandLine))
    {
        // if the executable was specified as a full path, run it from there
        startInfo.FileName = details.commandLine;
    }
    else
    {
        // otherwise run it in the process' working directory
        startInfo.FileName = Path.Combine(wd, details.commandLine);
    }

    // add all the environment variables that were specified in the config, replacing
    // any inherited value of the same name
    foreach (var e in details.environment)
    {
        startInfo.EnvironmentVariables[e.Key] = e.Value;
    }

    // add the Peloponnese-specific environment variables that will let the process contact
    // the server and register itself. These are assigned rather than added: the collection
    // starts out as a copy of this service's environment, so any of these names may already
    // be present and Add would throw
    startInfo.EnvironmentVariables[Constants.EnvManagerServerUri] = parent.parent.ServerAddress;
    startInfo.EnvironmentVariables[Constants.EnvJobGuid] = parent.parent.JobGuid.ToString();
    startInfo.EnvironmentVariables[Constants.EnvProcessGroup] = parent.GroupName;
    startInfo.EnvironmentVariables[Constants.EnvProcessIdentifier] = id;
    startInfo.EnvironmentVariables[Constants.EnvProcessHostName] = Environment.MachineName;

    // for data locality purposes, every process in the group is assumed to be running on the
    // same rack
    startInfo.EnvironmentVariables[Constants.EnvProcessRackName] = "localrack";

    return startInfo;
}
/// <summary>
/// launch the next version of the physical process
/// </summary>
/// <param name="details">a descriptor of how to start the process</param>
/// <returns>true if and only if the process was successfully started</returns>
public async Task<bool> Start(ExeDetails details)
{
    // identifier chosen under the lock, then used after the lock is released
    string newIdentifier;

    lock (this)
    {
        // only one version may be in flight at any time
        Debug.Assert(systemProcess == null);
        Debug.Assert(identifier == null);

        // bump the version first so the identifier below reflects the version being started
        ++version;

        bool versionsAreLimited = parent.NumberOfVersions > 0;
        if (versionsAreLimited && version > parent.NumberOfVersions)
        {
            // stop-gap: give up rather than creating local process directories forever
            // when a broken config means no version will ever start
            logger.Log("Local process " + index + " failed too many times: exiting");
            parent.StartShuttingDown();
            return false;
        }

        // the identifier is derived from the process index and its version
        newIdentifier = String.Format("Process.{0,3:D3}.{1,3:D3}", index, version);
    }

    // announce the identifier to the parent before the process exists, so that the
    // registration can be accepted once the process comes up
    parent.OnRegisterProcess(newIdentifier);

    // now attempt the real start
    bool started = await StartInternal(details, newIdentifier);
    if (!started)
    {
        // the start failed: report it to the parent as an exit
        logger.Log("Start reporting process exit");
        parent.OnProcessExited(index, newIdentifier, 1, true);
    }

    return started;
}
/// <summary>
/// set up the group from its configuration element. Exceptions thrown here are expected
/// to be caught cleanly by the parent
/// </summary>
/// <param name="p">parent to use for callbacks</param>
/// <param name="name">name of this group in the service</param>
/// <param name="config">element describing configuration parameters</param>
public void Initialize(IServiceManager p, string name, XElement config)
{
    parent = p;
    logger = parent.Logger;
    groupName = name;
    shuttingDown = false;

    // target number of processes: 1 unless the "numberOfProcesses" attribute says otherwise.
    // A malformed value is allowed to throw
    int processCount = 1;
    var processCountAttribute = config.Attribute("numberOfProcesses");
    if (processCountAttribute != null)
    {
        processCount = int.Parse(processCountAttribute.Value);
    }

    // number of versions allowed per process: 5 unless the "numberOfVersions" attribute
    // says otherwise. A malformed value is allowed to throw
    numberOfVersions = 5;
    var versionCountAttribute = config.Attribute("numberOfVersions");
    if (versionCountAttribute != null)
    {
        numberOfVersions = int.Parse(versionCountAttribute.Value);
    }

    // one logical process object per managed process, each knowing its own slot index
    processes = new LocalProcess[processCount];
    for (int slot = 0; slot < processes.Length; ++slot)
    {
        processes[slot] = new LocalProcess(this, slot);
    }

    // the descriptor used to create physical processes; exactly one "Process" descendant
    // is required, and a missing or malformed one is allowed to throw
    var processConfig = config.Descendants("Process").Single();
    processDetails = new ExeDetails();
    processDetails.ReadFromConfig(processConfig, logger);
}
/// <summary>
/// do the work of copying resources to the physical process' working directory, and
/// starting it
/// </summary>
/// <remarks>
/// The steps are: recreate an empty working directory, fetch every resource group into it
/// in parallel, build the start info, publish the new Process object under the lock, start
/// it, and finally kick off background copies of any redirected output streams. Every
/// failure is logged and reported by returning false.
/// </remarks>
/// <param name="details">a descriptor of how to start the process</param>
/// <param name="id">unique identifier for the process</param>
/// <returns>true if and only if the process was started</returns>
private async Task<bool> StartInternal(ExeDetails details, string id)
{
    // make a working directory by combining the service's working directory
    // with the group name and unique process identifier
    var groupWd = Path.Combine(Directory.GetCurrentDirectory(), parent.GroupName);
    string wd = Path.Combine(groupWd, id);

    try
    {
        // if there was already a directory of that name, try to delete it
        Directory.Delete(wd, true);
    }
    catch (DirectoryNotFoundException)
    {
        // there was nothing to delete, which is fine
    }
    catch (Exception e)
    {
        // if there's a directory there that we can't delete, don't even try to
        // start the process because something bad is going on.
        logger.Log("Failed to delete existing directory " + wd + ": " + e.Message);
        return false;
    }

    try
    {
        // make the working directory for the new process
        Directory.CreateDirectory(wd);
        logger.Log("Created working directory " + wd);
    }
    catch (Exception e)
    {
        // if we can't make the working directory, don't try to start the process
        logger.Log("Failed to create working directory " + wd + ": " + e.Message);
        return false;
    }

    logger.Log("Copying resources to " + wd);

    // we will copy all the resource groups in parallel; this is the list of Tasks
    // to wait on
    var waiters = new List<Task<bool>>();
    foreach (var r in details.resources)
    {
        waiters.Add(r.FetchToLocalDirectoryAsync(wd));
    }

    // the return values are an array of bools indicating for each group whether it
    // copied successfully or not; every one of them must have succeeded
    var gotResourcesArray = await Task.WhenAll(waiters);
    if (!gotResourcesArray.All(copied => copied))
    {
        // at least one resource failed to copy: we can't start the process
        logger.Log("Failed to copy resources to working directory " + wd);
        return false;
    }

    ProcessStartInfo startInfo;
    try
    {
        // make the actual datastructure for starting the process
        startInfo = MakeStartInfo(wd, details, id);
    }
    catch (Exception e)
    {
        logger.Log("Failed to make process start info for " + id + ": " + e.ToString());
        return false;
    }

    Process newProcess;
    lock (this)
    {
        // make a new system process and copy it into a local variable to use outside
        // the lock. Once we exit the lock here, an asynchronous call to Stop() could try to kill
        // the process
        systemProcess = new Process();
        systemProcess.StartInfo = startInfo;
        systemProcess.EnableRaisingEvents = true;
        systemProcess.Exited += new EventHandler(ProcessExited);
        identifier = id;
        newProcess = systemProcess;
    }

    logger.Log("Trying to start process " + parent.GroupName + ":" + id + " -- " + startInfo.FileName + " " + startInfo.Arguments);

    try
    {
        // work out where any redirected output should go before launching anything, so that
        // a bad path fails the start cleanly instead of throwing once the process is running
        string stdOutDest = details.stdoutFile;
        if (stdOutDest != null && details.redirectDirectory != null)
        {
            stdOutDest = Path.Combine(details.redirectDirectory, stdOutDest);
        }

        string stdErrDest = details.stderrFile;
        if (stdErrDest != null && details.redirectDirectory != null)
        {
            stdErrDest = Path.Combine(details.redirectDirectory, stdErrDest);
        }

        newProcess.Start();
        logger.Log("Process " + newProcess.Id + " started for " + parent.GroupName + ":" + id);

        // the streams are read from the local newProcess reference, and read here rather than
        // inside the lambdas: the systemProcess field is no longer protected by the lock, so by
        // the time a thread-pool thread runs the lambda the field may have been changed
        if (stdOutDest != null)
        {
            var stdOutStream = newProcess.StandardOutput;
            Task stdOutCopyTask = Task.Run(() => CopyStreamWithCatch(stdOutStream, stdOutDest, wd));
        }

        if (stdErrDest != null)
        {
            var stdErrStream = newProcess.StandardError;
            Task stdErrCopyTask = Task.Run(() => CopyStreamWithCatch(stdErrStream, stdErrDest, wd));
        }

        return true;
    }
    catch (Exception e)
    {
        // if we didn't manage to start the process, get rid of the pointer to it; the caller
        // will call parent.OnProcessExited for us
        lock (this)
        {
            systemProcess = null;
            identifier = null;
        }
        logger.Log("Process start failed for " + parent.GroupName + ":" + id + ": " + e.ToString());
        return false;
    }
}
/// <summary>
/// set up the group from its configuration element. Exceptions thrown here are expected
/// to be caught cleanly by the parent
/// </summary>
/// <param name="p">parent to use for callbacks</param>
/// <param name="name">name of this group in the service</param>
/// <param name="config">element describing configuration parameters</param>
public void Initialize(IServiceManager p, string name, XElement config)
{
    parent = p;
    logger = parent.Logger;
    groupName = name;

    // target number of processes, from the "maxProcesses" attribute. When absent it stays
    // at -1, which means use all the machines in the cluster. A malformed value is
    // allowed to throw
    maxProcesses = -1;
    var maxProcessesAttribute = config.Attribute("maxProcesses");
    if (maxProcessesAttribute != null)
    {
        maxProcesses = int.Parse(maxProcessesAttribute.Value);
    }

    // failure limits, per node and in total. When absent each stays at -1, which means
    // tolerate arbitrary failures. Malformed values are allowed to throw
    maxFailuresPerNode = -1;
    var perNodeFailuresAttribute = config.Attribute("maxFailuresPerNode");
    if (perNodeFailuresAttribute != null)
    {
        maxFailuresPerNode = int.Parse(perNodeFailuresAttribute.Value);
    }

    maxTotalFailures = -1;
    var totalFailuresAttribute = config.Attribute("maxTotalFailures");
    if (totalFailuresAttribute != null)
    {
        maxTotalFailures = int.Parse(totalFailuresAttribute.Value);
    }

    // amount of memory to request per container, from "workerMemoryInMB"; -1 when absent
    workerMemoryInMB = -1;
    var workerMemoryAttribute = config.Attribute("workerMemoryInMB");
    if (workerMemoryAttribute != null)
    {
        workerMemoryInMB = int.Parse(workerMemoryAttribute.Value);
    }

    // the descriptor used to create physical processes; exactly one "Process" descendant
    // is required, and a missing or malformed one is allowed to throw
    var processConfig = config.Descendants("Process").Single();
    processDetails = new ExeDetails();
    processDetails.ReadFromConfig(processConfig, logger);

    // reject the configuration as soon as a resource group that is not HDFS-backed is seen
    foreach (var resourceGroup in processDetails.resources)
    {
        bool residesInHdfs = resourceGroup is HdfsResources;
        if (!residesInHdfs)
        {
            throw new ApplicationException("All YARN process resources must reside in HDFS: " + resourceGroup.ToString());
        }
    }
}