示例#1
0
        /// <summary>
        /// Submits the job and blocks until it is done.
        /// If the log entry file for this run (see HpcLibSettings.GetLogEntryFile) already exists, the job is assumed to have been
        /// submitted earlier: nothing is resubmitted and the cluster args stored in that file are used for waiting instead.
        /// Otherwise the job is submitted and a log entry file is written to the cluster directory.
        /// The log entry file is deleted once waiting returns, whatever the final job state.
        /// </summary>
        /// <param name="clusterArgs">Cluster submitter arguments; replaced by the ones loaded from the log entry file when that file exists.</param>
        /// <param name="distributableObj">The work to submit.</param>
        /// <param name="maxSubmitAfterTasksFail">Forwarded to the wait routine as the maximum number of times failed tasks are resubmitted.</param>
        /// <param name="OnSubmittedCallbackOrNull">Optional callback invoked once, after the submit-or-reuse step and before waiting.</param>
        public static void SubmitAndWait(ClusterSubmitterArgs clusterArgs, IDistributable distributableObj, int maxSubmitAfterTasksFail = 0, Action OnSubmittedCallbackOrNull = null)
        {
            using (ParallelOptionsScope.Suspend())
            {
                FileInfo entryFile = HpcLibSettings.GetLogEntryFile(clusterArgs);

                if (!entryFile.Exists)
                {
                    // First submission of this run: submit, then leave a marker behind.
                    Submit(clusterArgs, distributableObj);
                    Console.WriteLine(Resource.Wait_Writing_log);
                    HpcLibSettings.WriteLogEntryToClusterDirectory(clusterArgs);
                }
                else
                {
                    // A marker already exists: wait on the job it describes rather than submitting again.
                    Console.WriteLine(Resource.Job_already_exists, entryFile.FullName);
                    clusterArgs = HpcLibSettings.LoadLogEntryFile(entryFile).ClusterArgs;
                }

                if (OnSubmittedCallbackOrNull != null)
                {
                    OnSubmittedCallbackOrNull();
                }

                JobState finalState = WaitForJobInternal(clusterArgs, maxSubmitAfterTasksFail);

                // Removed regardless of outcome; a failed or canceled run is expected to be overwritten later.
                entryFile.Delete();

                if (finalState == JobState.Finished)
                {
                    return;
                }

                throw new Exception("Job " + finalState);
            }
        }
示例#2
0
        /// <summary>
        /// Submits command arguments to the cluster by wrapping them in a DistributableWrapper and delegating to the
        /// Submit overload that takes a distributable object.
        /// </summary>
        /// <param name="clusterArgs">Cluster submitter arguments.</param>
        /// <param name="applicationArgs">Application arguments; must be a CommandArguments instance.</param>
        public static void Submit(ClusterSubmitterArgs clusterArgs, ArgumentCollection applicationArgs)
        {
            CommandArguments commandArguments = applicationArgs as CommandArguments;
            bool isCommandArguments = commandArguments != null;

            // Anything other than CommandArguments (including null) is rejected here.
            Helper.CheckCondition <ArgumentException>(isCommandArguments, "Can only provide command arguments to the cluster submitter");

            DistributableWrapper wrapped = new DistributableWrapper(commandArguments);
            Submit(clusterArgs, wrapped);
        }
示例#3
0
        /// <summary>
        /// Builds the FileInfo for a run's log entry file: the name produced by LogEntryFileName for clusterArgs.Name,
        /// located inside clusterArgs.ExternalRemoteDirectoryName.
        /// </summary>
        /// <param name="clusterArgs">Cluster submitter arguments supplying the directory and run name.</param>
        /// <returns>FileInfo describing the log entry file.</returns>
        public static FileInfo GetLogEntryFile(ClusterSubmitterArgs clusterArgs)
        {
            string logEntryPath = Path.Combine(clusterArgs.ExternalRemoteDirectoryName, LogEntryFileName(clusterArgs.Name));

            return new FileInfo(logEntryPath);
        }
示例#4
0
        /// <summary>
        /// Serializes a new LogEntry for the given cluster args as XML into the run's log entry file
        /// (the file returned by GetLogEntryFile).
        /// </summary>
        /// <param name="clusterArgs">Cluster submitter arguments to record.</param>
        public static void WriteLogEntryToClusterDirectory(ClusterSubmitterArgs clusterArgs)
        {
            // Resolve the target file before constructing the entry; keep this order.
            FileInfo targetFile = GetLogEntryFile(clusterArgs);
            LogEntry entry      = new LogEntry(clusterArgs);

            var xmlSerializer = new System.Xml.Serialization.XmlSerializer(typeof(LogEntry));

            using (StreamWriter output = targetFile.CreateText())
            {
                xmlSerializer.Serialize(output, entry);
            }
        }
示例#5
0
 /// <summary>
 /// Creates a log entry stamped with the current local time and the current working directory.
 /// If the supplied args use a relative directory they are rewritten in place: Dir is set to
 /// ExternalRemoteDirectoryName and RelativeDir is cleared. This modifies the caller's object.
 /// </summary>
 /// <param name="clusterArgs">Cluster submitter args to store in the entry.</param>
 public LogEntry(ClusterSubmitterArgs clusterArgs)
 {
     Date        = DateTime.Now;
     LocalDir    = Environment.CurrentDirectory;
     ClusterArgs = clusterArgs;

     if (!clusterArgs.RelativeDir)
     {
         return;
     }

     // Dir must be assigned before RelativeDir is cleared; keep this order.
     clusterArgs.Dir         = clusterArgs.ExternalRemoteDirectoryName;
     clusterArgs.RelativeDir = false;
 }
示例#6
0
 /// <summary>
 /// Ensures clusterArgs.ExeRelativeDirectoryName is set. When it is null, the value returned by
 /// HpcLib.CopyExesToCluster is stored in it; otherwise the already-named directory is checked for existence
 /// under the external remote directory.
 /// </summary>
 /// <param name="clusterArgs">Cluster submitter arguments; ExeRelativeDirectoryName may be assigned.</param>
 private static void CopyExes(ClusterSubmitterArgs clusterArgs)
 {
     if (clusterArgs.ExeRelativeDirectoryName != null)
     {
         // Caller named an existing exe directory: report it and verify it is there.
         Console.WriteLine(Resource.Using_exe + clusterArgs.ExeRelativeDirectoryName);

         string exeDirOnCluster = clusterArgs.ExternalRemoteDirectoryName + "\\" + clusterArgs.ExeRelativeDirectoryName;
         bool   exeDirExists    = Directory.Exists(exeDirOnCluster);
         Helper.CheckCondition(exeDirExists, "Directory {0} does not exist!", exeDirOnCluster);
         return;
     }

     clusterArgs.ExeRelativeDirectoryName = HpcLib.CopyExesToCluster(clusterArgs.ExternalRemoteDirectoryName, clusterArgs.ExeName);
 }
示例#7
0
        /// <summary>
        /// Blocks until the job described by clusterArgs is done, then throws if it ended in the Canceled or Failed state.
        /// Any other final state returns normally.
        /// </summary>
        /// <param name="clusterArgs">Cluster args identifying the job.</param>
        /// <param name="maxNumTimesToResubmitFailedTasks">Maximum number of times failed tasks are resubmitted while waiting.</param>
        public static void WaitForJob(ClusterSubmitterArgs clusterArgs, int maxNumTimesToResubmitFailedTasks = 0)
        {
            JobState finalState = WaitForJobInternal(clusterArgs, maxNumTimesToResubmitFailedTasks);

            switch (finalState)
            {
            case Microsoft.Hpc.Scheduler.Properties.JobState.Canceled:
                throw new Exception("Job canceled.");

            case Microsoft.Hpc.Scheduler.Properties.JobState.Failed:
                throw new Exception("Job failed.");

            default:
                return;
            }
        }
示例#8
0
        /// <summary>
        /// Builds ClusterSubmitterArgs from the argument collection, then submits and waits via the corresponding SubmitAndWait overload.
        /// If the optional "cluster" argument is absent or equals "help" (case-insensitive), a help message for
        /// ClusterSubmitterArgs is printed instead and nothing is submitted.
        /// NOTE(review): the previous comment said this only works with V2 clusters — confirm against the supported cluster versions.
        /// </summary>
        /// <param name="argumentCollection">Arguments from which the cluster args are constructed; also passed on as the work to submit.</param>
        /// <param name="maxSubmitAfterTasksFail">Forwarded to the SubmitAndWait overload.</param>
        /// <returns>The cluster args that were used, or null when only help was printed.</returns>
        public static ClusterSubmitterArgs SubmitAndWait(ArgumentCollection argumentCollection, int maxSubmitAfterTasksFail = 0)
        {
            string clusterValue  = argumentCollection.PeekOptional <string>("cluster", "help");
            bool   helpRequested = clusterValue.Equals("help", StringComparison.CurrentCultureIgnoreCase);

            if (!helpRequested)
            {
                ClusterSubmitterArgs parsedArgs = new ClusterSubmitterArgs(argumentCollection);
                SubmitAndWait(parsedArgs, argumentCollection, maxSubmitAfterTasksFail);
                return parsedArgs;
            }

            Console.WriteLine("");
            Console.WriteLine(ArgumentCollection.CreateHelpMessage(typeof(ClusterSubmitterArgs), includeDateStamp: false));
            return null;
        }
示例#9
0
        /// <summary>
        /// Performs one submission attempt: fills in the job name if missing, prepares the exe directory,
        /// creates the Stderr/Stdout directories, copies input files when requested, submits through the
        /// version-specific API and finally tries to record the submission in the log.
        /// </summary>
        /// <param name="clusterArgs">Cluster submitter arguments; several of its fields are filled in here.</param>
        /// <param name="distributableObj">The work to submit.</param>
        private static void SubmitInternal(ClusterSubmitterArgs clusterArgs, IDistributable distributableObj)
        {
            // Submissions are serialized: one thread at a time.
            lock (_submitterLockObj)
            {
                bool nameMissing = string.IsNullOrEmpty(clusterArgs.Name);
                if (nameMissing)
                {
                    clusterArgs.Name = distributableObj.JobName;
                }

                CopyExes(clusterArgs);

                clusterArgs.StdErrDirName = CreateUniqueDirectory(clusterArgs.ExternalRemoteDirectoryName, "Stderr", distributableObj.JobName);
                clusterArgs.StdOutDirName = CreateUniqueDirectory(clusterArgs.ExternalRemoteDirectoryName, "Stdout", distributableObj.JobName);

                if (clusterArgs.CopyInputFiles != null)
                {
                    // For anything but a DistributableWrapper, input files found on the object are added to the copy list.
                    bool isWrapper = distributableObj is DistributableWrapper;
                    if (!isWrapper)
                    {
                        var discoveredInputs = ArgumentCollection.EnumerateValuesOfTypeFromParsable <InputFile>(distributableObj).Select(inputFile => inputFile.ToString());
                        clusterArgs.CopyInputFiles.AddRange(discoveredInputs);
                    }

                    if (clusterArgs.CopyInputFiles.Count > 0)
                    {
                        CopyInputFiles(clusterArgs.CopyInputFiles, clusterArgs.ExternalRemoteDirectoryName);
                    }
                }

                using (ParallelOptionsScope.Suspend())
                {
                    // Only API version 3 is handled.
                    if (clusterArgs.Version != 3)
                    {
                        throw new NotSupportedException(string.Format("Cluster version {0} is not supported.", clusterArgs.Version));
                    }

                    SubmitViaAPI3(clusterArgs, distributableObj);
                }

                Console.WriteLine(Resource.Processed_job, clusterArgs.Cluster, clusterArgs.ExternalRemoteDirectoryName);

                Console.WriteLine(Resource.Writing_log_file);
                HpcLibSettings.TryWriteToLog(clusterArgs);

                Console.WriteLine(Resource.Done);
            }
        }
示例#10
0
 /// <summary>
 /// Submits the distributable object to the cluster, retrying up to clusterArgs.MaxSubmitTries times.
 /// Each failed attempt is reported on the console; a 10 minute pause separates consecutive attempts.
 /// </summary>
 /// <param name="clusterArgs">Cluster args, including MaxSubmitTries.</param>
 /// <param name="distributableObj">The work to submit.</param>
 /// <exception cref="Exception">All attempts failed (or MaxSubmitTries is not positive). The last failure, if any, is attached as InnerException.</exception>
 public static void Submit(ClusterSubmitterArgs clusterArgs, IDistributable distributableObj)
 {
     Exception lastFailure = null;

     for (int numTries = 0; numTries < clusterArgs.MaxSubmitTries; numTries++)
     {
         try
         {
             SubmitInternal(clusterArgs, distributableObj);
             return;
         }
         catch (Exception exception)
         {
             lastFailure = exception;

             Console.WriteLine(Resource.Error_Submitting + clusterArgs.Cluster + ": " + exception.Message);
             Console.WriteLine(string.Format(CultureInfo.CurrentCulture, "numTry={0} of {1}", numTries, clusterArgs.MaxSubmitTries));
             Console.WriteLine(exception.StackTrace);
             Console.WriteLine(Resource.User_CluserHelp);

             // Pause only when another attempt will follow; waiting 10 minutes just to throw helps nobody.
             bool willRetry = numTries + 1 < clusterArgs.MaxSubmitTries;
             if (willRetry)
             {
                 Thread.Sleep(new TimeSpan(0, 10, 0));
             }
         }
     }

     // Keep the underlying cause attached so callers can diagnose the failure.
     throw new Exception("max number of cluster submitter tries (" + clusterArgs.MaxSubmitTries + ") exceeded", lastFailure);
 }
示例#11
0
        /// <summary>
        /// Creates the final cleanup task, gives it a command line that runs the distributable locally in cleanup mode,
        /// and adds it to the job. Unless clusterArgs.OnlyDoCleanup is set, the cleanup task is made dependent on
        /// every task already in the job.
        /// </summary>
        /// <param name="clusterArgs">Cluster submitter arguments.</param>
        /// <param name="scheduler">Scheduler used to create the filter, sort and string collections.</param>
        /// <param name="job">Job whose existing tasks are read and to which the cleanup task is added.</param>
        /// <param name="distributableJob">The work item the cleanup command line is built for.</param>
        /// <returns>The cleanup task that was added to the job.</returns>
        private static ISchedulerTask AddCleanupTaskToJob(ClusterSubmitterArgs clusterArgs, IScheduler scheduler, ISchedulerJob job, IDistributable distributableJob)
        {
            ISchedulerCollection taskList        = job.GetTaskList(scheduler.CreateFilterCollection(), scheduler.CreateSortCollection(), true);
            IStringCollection    dependencyTasks = scheduler.CreateStringCollection();

            if (!clusterArgs.OnlyDoCleanup)
            {
                // Depend on every distinct task name, not just the first one: when tasks are added individually
                // each one has its own name, and cleanup must not start until all of them are done.
                foreach (object existingTask in taskList)
                {
                    string taskName = ((ISchedulerTask)existingTask).Name;
                    if (!dependencyTasks.Contains(taskName))
                    {
                        dependencyTasks.Add(taskName);
                    }
                }
            }
            ISchedulerTask cleanupTask = CreateCleanupTask(job, clusterArgs.ExternalRemoteDirectoryName, clusterArgs.StdErrRelativeDirName, clusterArgs.StdOutRelativeDirName, "cleanup", isFinalCleanup: true);

            // Cleanup runs locally on a single thread with an empty task range.
            Locally local = new Locally()
            {
                Cleanup         = true,
                TaskCount       = clusterArgs.TaskCount,
                Tasks           = new RangeCollection(),
                ParallelOptions = new ParallelOptions()
                {
                    MaxDegreeOfParallelism = 1
                }
            };

            DistributeApp.Distribute distributeExe = new DistributeApp.Distribute()
            {
                Distributor   = local,
                Distributable = distributableJob
            };

            // A wrapped command runs the user's exe; anything else runs the Distribute assembly.
            string exeName = distributableJob is DistributableWrapper ? clusterArgs.ExeName : distributeExe.GetType().Assembly.GetName().Name;

            string taskCommandLine = string.Format("{0}\\{1} {2}", clusterArgs.ExeRelativeDirectoryName, exeName, CreateTaskString(distributeExe, clusterArgs.MinimalCommandLine));

            cleanupTask.CommandLine = taskCommandLine;

            if (!clusterArgs.OnlyDoCleanup)
            {
                cleanupTask.DependsOn = dependencyTasks;
            }
            job.AddTask(cleanupTask);
            return(cleanupTask);
        }
示例#12
0
        /// <summary>
        /// Tries to append a log entry for this submission to the file named by LogFile.
        /// The existing entries are loaded, the new entry is added, and the whole file is rewritten
        /// while it is opened with FileShare.None.
        /// </summary>
        /// <param name="clusterArgs">Cluster submitter arguments to record.</param>
        /// <returns>
        /// true if the entry was written; false if no log file is defined, its directory does not exist,
        /// the file could not be opened, or an XmlException was raised while processing it.
        /// </returns>
        public static bool TryWriteToLog(ClusterSubmitterArgs clusterArgs)
        {
            if (string.IsNullOrEmpty(LogFile))
            {
                return(false);
            }

            if (!Directory.Exists(Path.GetDirectoryName(LogFile)))
            {
                Console.WriteLine(Resource.Log_file_directory, Path.GetDirectoryName(LogFile));
                return(false);
            }

            FileStream filestream = null;
            try
            {
                // Checked before opening, because OpenOrCreate would create a missing file.
                bool exists = File.Exists(LogFile);
                if (!FileUtils.TryToOpenFile(LogFile, new TimeSpan(0, 3, 0), FileMode.OpenOrCreate, FileAccess.ReadWrite, FileShare.None, out filestream))
                {
                    return(false);
                }

                List <LogEntry> logEntries = exists && filestream.Length > 0 ? LogEntry.LoadEntries(new StreamReader(filestream)) : new List <LogEntry>();
                logEntries.Add(new LogEntry(clusterArgs));

                // Rewind and truncate before rewriting, so bytes of the previous content can never
                // survive past the end of the new content and corrupt the file.
                filestream.Position = 0;
                filestream.SetLength(0);
                LogEntry.SaveEntries(logEntries, new StreamWriter(filestream));

                return(true);
            }
            catch (System.Xml.XmlException exception)
            {
                Console.WriteLine(exception);
                return(false);
            }
            finally
            {
                // Single point of disposal for every exit path.
                if (filestream != null)
                {
                    filestream.Dispose();
                }
            }
        }
示例#13
0
        /// <summary>
        /// Connects a JobListener to the job identified by clusterArgs and blocks until the listener reports the job as done.
        /// When maxNumTimesToResubmitFailedTasks is positive, failed and canceled tasks are requeued on task state changes,
        /// at most that many times.
        /// </summary>
        /// <param name="clusterArgs">Supplies the Cluster, JobID and Username used to connect.</param>
        /// <param name="maxNumTimesToResubmitFailedTasks">Remaining requeue budget; no requeue handler is installed when it is not positive.</param>
        /// <returns>The job state reported by the listener once the job is done.</returns>
        private static JobState WaitForJobInternal(ClusterSubmitterArgs clusterArgs, int maxNumTimesToResubmitFailedTasks)
        {
            JobListener listener;

            var connected = JobListener.TryConnect(clusterArgs.Cluster, clusterArgs.JobID, clusterArgs.Username, out listener);
            connected.Enforce("Could not connect to scheduler {0} or find jobID {1} for user {2}.",
                              clusterArgs.Cluster, clusterArgs.JobID, clusterArgs.Username);

            ManualResetEvent jobFinished = new ManualResetEvent(false);

            // Handlers are attached before JobIsDone is polled below; keep this order.
            listener.OnJobStateChanged += (sender, args) =>
            {
                if (!listener.JobIsDone)
                {
                    return;
                }
                jobFinished.Set();
            };

            // Requeue handler: the budget is consumed only when failed tasks are present.
            if (maxNumTimesToResubmitFailedTasks > 0)
            {
                listener.OnTaskStateChanged += (sender, args) =>
                {
                    bool anyFailed = listener.JobCounters.FailedTaskCount > 0;
                    if (!anyFailed)
                    {
                        return;
                    }

                    bool budgetLeft = maxNumTimesToResubmitFailedTasks-- > 0;
                    if (!budgetLeft)
                    {
                        return;
                    }

                    Console.WriteLine(Resource.Tasks_failed);
                    RequeueFailedAndCanceledTasks(listener.Scheduler, listener.Job);
                };
            }

            if (listener.JobIsDone)
            {
                return listener.JobState;
            }

            jobFinished.WaitOne();
            return listener.JobState;
        }
示例#14
0
        /// <summary>
        /// Builds the collection of node names the job may use: every node returned by the scheduler
        /// except those named in clusterArgs.NodeExclusionList. Also turns off AutoCalculateMax and
        /// AutoCalculateMin on the job.
        /// </summary>
        /// <param name="clusterArgs">Cluster submitter arguments supplying the exclusion list.</param>
        /// <param name="scheduler">Scheduler queried for the node list.</param>
        /// <param name="job">Job whose auto-calculate flags are cleared.</param>
        /// <returns>Names of the nodes that are not excluded.</returns>
        private static IStringCollection GetNodesToUse(ClusterSubmitterArgs clusterArgs, IScheduler scheduler, ISchedulerJob job)
        {
            job.AutoCalculateMax = false;
            job.AutoCalculateMin = false;
            var availableNodes           = scheduler.GetNodeList(null, null);
            IStringCollection nodesToUse = scheduler.CreateStringCollection();
            List <string>     nodesFound = new List <string>();   // excluded nodes that actually exist on the cluster

            foreach (var node in availableNodes)
            {
                string nodeName = ((Microsoft.Hpc.Scheduler.SchedulerNode)node).Name;
                if (!clusterArgs.NodeExclusionList.Contains(nodeName))
                {
                    nodesToUse.Add(nodeName);
                }
                else
                {
                    nodesFound.Add(nodeName);
                }
            }

            // CheckCondition fails when its condition is false, so the condition must state the healthy case:
            // every excluded name matched a real node. A mismatch points to a typo in the exclusion list.
            Helper.CheckCondition(nodesFound.Count == clusterArgs.NodeExclusionList.Count, "not all nodes in exclusion list found: check for typo " + clusterArgs.NodeExclusionList);

            return(nodesToUse);
        }
示例#15
0
        /// <summary>
        /// Creates (but does not add) a scheduler task that runs the distributable object locally on a node.
        /// </summary>
        /// <param name="taskNumber">The single task index to run, or null to build a task whose index is the "*" placeholder.</param>
        /// <param name="clusterArgs">Cluster submitter arguments.</param>
        /// <param name="job">Job used to create the task.</param>
        /// <param name="distributableObj">The work to run.</param>
        /// <param name="nodesToUse">Nodes the task is restricted to, or null for no restriction.</param>
        /// <returns>The configured task.</returns>
        private static ISchedulerTask CreateTask(int? taskNumber, ClusterSubmitterArgs clusterArgs, ISchedulerJob job, IDistributable distributableObj, IStringCollection nodesToUse)
        {
            Locally local = new Locally()
            {
                Cleanup   = false,
                TaskCount = clusterArgs.TaskCount,
                Tasks     = taskNumber.HasValue ? new RangeCollection(taskNumber.Value) : null,
            };

            ISchedulerTask task = job.CreateTask();

            if (nodesToUse != null)
            {
                task.RequiredNodes = nodesToUse;
            }

            if (clusterArgs.NumCoresPerTask != null)
            {
                task.MinimumNumberOfCores = clusterArgs.NumCoresPerTask.Value;
                task.MaximumNumberOfCores = clusterArgs.NumCoresPerTask.Value;
                task.MaximumNumberOfNodes = 1;

                // An explicit core count decides the degree of parallelism. It must not be overridden by the
                // non-exclusive default below, and a fresh ParallelOptions avoids relying on Locally having one.
                local.ParallelOptions = new ParallelOptions()
                {
                    MaxDegreeOfParallelism = clusterArgs.NumCoresPerTask.Value
                };
            }
            else if (!clusterArgs.IsExclusive)
            {
                // Non-exclusive tasks without an explicit core count run single-threaded.
                local.ParallelOptions = new ParallelOptions()
                {
                    MaxDegreeOfParallelism = 1
                };
            }

            task.WorkDirectory = clusterArgs.ExternalRemoteDirectoryName;

            DistributeApp.Distribute distributeExe = new DistributeApp.Distribute()
            {
                Distributable = distributableObj,
                Distributor   = local
            };

            string taskArgString = CreateTaskString(distributeExe, clusterArgs.MinimalCommandLine);
            // A wrapped command runs the user's exe; anything else runs the Distribute assembly.
            string exeName       = distributeExe.Distributable is DistributableWrapper ? clusterArgs.ExeName : distributeExe.GetType().Assembly.GetName().Name;

            string taskCommandLine = null;

            if (clusterArgs.UseMPI)
            {
                taskCommandLine = string.Format("mpiexec -n {0} {1}\\{2} {3}", clusterArgs.NumCoresPerTask, clusterArgs.ExeRelativeDirectoryName, exeName, taskArgString);
            }
            else
            {
                taskCommandLine = string.Format("{0}\\{1} {2}", clusterArgs.ExeRelativeDirectoryName, exeName, taskArgString);
            }
            task.CommandLine = taskCommandLine;

            string taskNumberAsString = taskNumber.HasValue ? taskNumber.Value.ToString() : "*";

            task.Name = Helper.CreateDelimitedString(" ", distributableObj.JobName, taskNumberAsString);
            Console.WriteLine(Resource.StdOutRelativeDirName + clusterArgs.StdOutRelativeDirName);
            task.StdErrFilePath = string.Format(@"{0}\{1}.txt", clusterArgs.StdErrRelativeDirName, taskNumberAsString);
            task.StdOutFilePath = string.Format(@"{0}\{1}.txt", clusterArgs.StdOutRelativeDirName, taskNumberAsString);

            Console.WriteLine(Resource.CreateTask, task.CommandLine.Length, task.CommandLine);
            // Long stderr paths only trigger a warning on the console.
            if (task.StdErrFilePath.Length >= 160)
            {
                Console.WriteLine(Resource.Caution, task.StdErrFilePath.Length);
            }

            return(task);
        }
示例#16
0
        /// <summary>
        /// Connects to the cluster, builds a job from clusterArgs and distributableObj, and submits it.
        /// On return clusterArgs.JobID holds the ID of the submitted job. clusterArgs.IsExclusive,
        /// clusterArgs.TaskRange and clusterArgs.Cleanup may be modified along the way.
        /// </summary>
        /// <param name="clusterArgs">Cluster submitter arguments.</param>
        /// <param name="distributableObj">The work to submit.</param>
        private static void SubmitViaAPI3(ClusterSubmitterArgs clusterArgs, IDistributable distributableObj)
        {
            Console.WriteLine(string.Format("Connecting to cluster {0} using API version 3 .", clusterArgs.Cluster));

            using (IScheduler scheduler = new Scheduler())
            {
                scheduler.Connect(clusterArgs.Cluster);
                ISchedulerJob job = scheduler.CreateJob();

                job.Name     = distributableObj.JobName;
                job.Priority = clusterArgs.Priority;

                if (clusterArgs.JobTemplate != null)
                {
                    Microsoft.Hpc.Scheduler.IStringCollection jobTemplates = scheduler.GetJobTemplateList();
                    string decodedJobTemplate = System.Web.HttpUtility.UrlDecode(clusterArgs.JobTemplate);
                    if (jobTemplates.Contains(decodedJobTemplate))
                    {
                        job.SetJobTemplate(decodedJobTemplate);
                    }
                    else
                    {
                        // Unknown template: list the available ones and carry on without setting a template.
                        Console.WriteLine(string.Format(Resource.Job_template, decodedJobTemplate));
                        foreach (var template in jobTemplates)
                        {
                            Console.Write("'" + template + "' ");
                        }
                        Console.WriteLine(Resource.SubmitViaAPI3);
                    }
                }

                // An explicit per-task core count rules out exclusive node use.
                if (clusterArgs.NumCoresPerTask != null)
                {
                    clusterArgs.IsExclusive = false;
                }

                IStringCollection nodesToUse = null;

                // Resource selection: the first matching rule wins.
                if (clusterArgs.NodeExclusionList != null && clusterArgs.NodeExclusionList.Count > 0)
                {
                    nodesToUse = GetNodesToUse(clusterArgs, scheduler, job);
                }
                else if (clusterArgs.NodesToUseList != null && clusterArgs.NodesToUseList.Count > 0)
                {
                    nodesToUse = scheduler.CreateStringCollection();
                    foreach (string nodeName in clusterArgs.NodesToUseList)
                    {
                        nodesToUse.Add(nodeName);
                    }
                }
                else if (clusterArgs.NumCoresPerTask != null)
                {
                    job.AutoCalculateMax = true;
                    job.AutoCalculateMin = true;
                }
                else if (clusterArgs.IsExclusive)
                {
                    job.UnitType = Microsoft.Hpc.Scheduler.Properties.JobUnitType.Node;
                    if (clusterArgs.MinimumNumberOfNodes != null)
                    {
                        // The maximum is optional on its own: only read it when it has a value.
                        // It is still applied before the minimum, as before.
                        if (clusterArgs.MaximumNumberOfNodes != null)
                        {
                            job.MaximumNumberOfNodes = clusterArgs.MaximumNumberOfNodes.Value;
                        }
                        job.MinimumNumberOfNodes = clusterArgs.MinimumNumberOfNodes.Value;
                    }
                }
                else if (clusterArgs.MinimumNumberOfCores != null)
                {
                    if (clusterArgs.MaximumNumberOfCores == null)
                    {
                        job.AutoCalculateMax = true;
                    }
                    else
                    {
                        job.AutoCalculateMax     = false;
                        job.MaximumNumberOfCores = clusterArgs.MaximumNumberOfCores.Value;
                    }
                    // NOTE(review): the maximum is assigned again here even when AutoCalculateMax is true — confirm this is intended.
                    job.MaximumNumberOfCores = clusterArgs.MaximumNumberOfCores ?? Math.Max(clusterArgs.TaskCount, scheduler.GetCounters().TotalCores);
                    job.MinimumNumberOfCores = clusterArgs.MinimumNumberOfCores.Value;
                    job.AutoCalculateMin     = false;
                }
                else
                {
                    job.AutoCalculateMax = true;
                    job.AutoCalculateMin = true;
                }

                if (!clusterArgs.OnlyDoCleanup)
                {
                    if (clusterArgs.TaskRange.IsContiguous())
                    {
                        // Clamp the requested range to the last valid task index.
                        if (clusterArgs.TaskRange.LastElement > clusterArgs.TaskCount - 1)
                        {
                            clusterArgs.TaskRange = new RangeCollection(clusterArgs.TaskRange.FirstElement, clusterArgs.TaskCount - 1);
                        }
                        ISchedulerTask task = CreateTask(null, clusterArgs, job, distributableObj, nodesToUse);

                        task.Type = TaskType.ParametricSweep;

                        // Sweep exactly the requested range, so a contiguous sub-range is honored
                        // the same way the per-task branch below honors a non-contiguous one.
                        task.StartValue = (int)clusterArgs.TaskRange.FirstElement;
                        task.EndValue   = (int)clusterArgs.TaskRange.LastElement;

                        job.AddTask(task);
                    }
                    else
                    {
                        // Non-contiguous range: one individual task per requested index.
                        job.AddTasks(clusterArgs.TaskRange.Select(taskNum => CreateTask((int)taskNum, clusterArgs, job, distributableObj, nodesToUse)).ToArray());
                    }
                }
                else
                {
                    // A cleanup-only submission always needs the cleanup task.
                    clusterArgs.Cleanup = true;
                }

                if (clusterArgs.Cleanup)
                {
                    AddCleanupTaskToJob(clusterArgs, scheduler, job, distributableObj);
                }

                Console.WriteLine(Resource.Submitting_job);
                scheduler.SubmitJob(job, null, null);
                clusterArgs.JobID = job.Id;
                Console.WriteLine(job.Name + Resource.submitted);
            }
        }