public static void Submit(ClusterSubmitterArgs clusterArgs, ArgumentCollection applicationArgs)
{
    CommandArguments cmd = applicationArgs as CommandArguments;
    Helper.CheckCondition<ArgumentException>(cmd != null, "Can only provide command arguments to the cluster submitter");
    Submit(clusterArgs, new DistributableWrapper(cmd));
}
private static v2008R2.IStringCollection GetNodesToUse(ClusterSubmitterArgs clusterArgs, v2008R2.IScheduler scheduler, v2008R2.ISchedulerJob job)
{
    job.AutoCalculateMax = false;
    job.AutoCalculateMin = false;

    var availableNodes = scheduler.GetNodeList(null, null);
    v2008R2.IStringCollection nodesToUse = scheduler.CreateStringCollection();
    List<string> excludedNodesFound = new List<string>();
    foreach (var node in availableNodes)
    {
        string nodeName = ((Microsoft.Hpc.Scheduler.SchedulerNode)node).Name;
        if (!clusterArgs.NodeExclusionList.Contains(nodeName))
        {
            nodesToUse.Add(nodeName);
        }
        else
        {
            excludedNodesFound.Add(nodeName);
        }
    }
    // Every entry in the exclusion list must correspond to a real node; anything else is probably a typo.
    // (The original condition used !=, which would throw exactly when all excluded nodes *were* found.)
    Helper.CheckCondition(excludedNodesFound.Count == clusterArgs.NodeExclusionList.Count,
        "not all nodes in exclusion list found: check for typo " + clusterArgs.NodeExclusionList);

    return nodesToUse;
}
private static JobWaitingParams WaitForJobInternal(ClusterSubmitterArgs clusterArgs)
{
    v2008R2.ISchedulerJob job = clusterArgs.GetV2Job();
    var jobState = job.State;

    JobWaitingParams jobWaitingParams = new JobWaitingParams
    {
        Job = job,
        JobState = jobState,
        ManualResetEvent = new ManualResetEvent(false)
    };
    SetupJobEventHandler(jobWaitingParams);

    int heartBeatPeriod = 60 * 1000; // beat once a minute
    // Put the timer in a using statement to guarantee Dispose is called and the timer is shut down.
    using (Timer timer = HeartbeatTimer(clusterArgs.JobID, clusterArgs.Cluster, jobWaitingParams, heartBeatPeriod))
    {
        // Wait for the job to finish, unless it already has.
        jobWaitingParams.Job.Refresh();
        if (!JobIsFinished(jobWaitingParams.Job.State))
        {
            jobWaitingParams.ManualResetEvent.WaitOne();
        }
        timer.Change(Timeout.Infinite, Timeout.Infinite); // shut down the timer
    }
    return jobWaitingParams;
}
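// --- Illustrative sketch (not part of the original source). JobIsFinished and HeartbeatTimer are
// referenced above but defined elsewhere in this project; the versions below (with "Sketch" suffixes
// to avoid clashing with the real ones) show one plausible implementation. The decision not to use
// jobID/cluster inside the callback is an assumption; the real helper may reconnect on failure.
private static bool JobIsFinishedSketch(v2008R2.Properties.JobState jobState)
{
    return jobState == v2008R2.Properties.JobState.Finished ||
           jobState == v2008R2.Properties.JobState.Failed ||
           jobState == v2008R2.Properties.JobState.Canceled;
}

private static Timer HeartbeatTimerSketch(int jobID, string cluster, JobWaitingParams jobWaitingParams, int periodMilliseconds)
{
    // Periodically refresh the job so a missed job-state event cannot leave the waiting
    // thread blocked forever; signal the event once the job reaches a terminal state.
    return new Timer(_ =>
    {
        jobWaitingParams.Job.Refresh();
        jobWaitingParams.JobState = jobWaitingParams.Job.State;
        if (JobIsFinishedSketch(jobWaitingParams.JobState))
        {
            jobWaitingParams.ManualResetEvent.Set();
        }
    }, null, periodMilliseconds, periodMilliseconds);
}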
public static void SubmitAndWait(ClusterSubmitterArgs clusterArgs, IDistributable distributableObj, int maxSubmitAfterTasksFail = 0)
{
    using (ParallelOptionsScope.Suspend())
    {
        // Resubmit on failure up to maxSubmitAfterTasksFail times (replaces the original goto-based retry).
        for (int numberOfTries = 0; ; numberOfTries++)
        {
            Submit(clusterArgs, distributableObj);
            JobWaitingParams jobWaitingParams = WaitForJobInternal(clusterArgs);

            if (jobWaitingParams.JobState == v2008R2.Properties.JobState.Canceled)
            {
                throw new Exception("Job canceled.");
            }
            else if (jobWaitingParams.JobState == v2008R2.Properties.JobState.Failed)
            {
                if (numberOfTries < maxSubmitAfterTasksFail)
                {
                    Console.WriteLine("Job failed, trying again...");
                    continue;
                }
                throw new Exception("Job failed.");
            }
            return; // job finished
        }
        //HpcLib.HpcLib.CopyFiles(new List<String> { "" }, _remoteTaskOutDir, TASK_OUT_DIR);
    }
}
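// --- Illustrative usage (not part of the original source). A minimal sketch of calling
// SubmitAndWait; "MyDistributable" is a hypothetical IDistributable implementation, and the
// CommandArguments(string) constructor is an assumption based on the ArgumentCollection overload below.
//
//   IDistributable work = new MyDistributable();
//   var argumentCollection = new CommandArguments("-cluster MYCLUSTER -name myJob -taskCount 100");
//   var clusterArgs = new ClusterSubmitterArgs(argumentCollection);
//   SubmitAndWait(clusterArgs, work, maxSubmitAfterTasksFail: 2); // resubmits up to twice if the job fails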
private static void SubmitInternal(ClusterSubmitterArgs clusterArgs, IDistributable distributableObj)
{
    lock (_submitterLockObj) // for now, just let one thread submit at a time.
    {
        if (clusterArgs.Archive != null)
        {
            MBT.Escience.FileUtils.ArchiveExes(clusterArgs.Archive);
        }

        CopyExes(clusterArgs);

        clusterArgs.StdErrDirName = CreateUniqueDirectory(clusterArgs.ExternalRemoteDirectoryName, "Stderr", clusterArgs.Name);
        clusterArgs.StdOutDirName = CreateUniqueDirectory(clusterArgs.ExternalRemoteDirectoryName, "Stdout", clusterArgs.Name);

        if (clusterArgs.CopyInputFiles.Count > 0)
        {
            CopyInputFiles(clusterArgs.CopyInputFiles, clusterArgs.ExternalRemoteDirectoryName);
        }

        using (ParallelOptionsScope.Suspend())
        {
            switch (clusterArgs.Version)
            {
                case 1:
                    SubmitViaAPI1(clusterArgs, distributableObj);
                    break;
                case 2:
                    Console.Error.WriteLine("API versions 2 and 3 are the same. Submitting via API 3.");
                    SubmitViaAPI3(clusterArgs, distributableObj);
                    break;
                case 3:
                    SubmitViaAPI3(clusterArgs, distributableObj);
                    break;
                default:
                    throw new NotSupportedException(string.Format("Cluster version {0} is not supported.", clusterArgs.Version));
            }
        }
        Console.WriteLine("Processed job to cluster {0} with path {1}", clusterArgs.Cluster, clusterArgs.ExternalRemoteDirectoryName);

        Console.WriteLine("Writing log file");
        HpcLibSettings.TryWriteToLog(clusterArgs);
        Console.WriteLine("Writing log entry to cluster directory");
        HpcLibSettings.WriteLogEntryToClusterDirectory(clusterArgs);
        Console.WriteLine("Done");
    }
}
public LogEntry(ClusterSubmitterArgs clusterArgs)
{
    Date = DateTime.Now;
    LocalDir = Environment.CurrentDirectory;
    if (clusterArgs.RelativeDir)
    {
        // Store the fully resolved directory rather than the relative one, so the entry stays valid later.
        clusterArgs.Dir = clusterArgs.ExternalRemoteDirectoryName;
        clusterArgs.RelativeDir = false;
    }
    ClusterArgs = clusterArgs;
}
public static void WriteLogEntryToClusterDirectory(ClusterSubmitterArgs clusterArgs)
{
    string directory = clusterArgs.ExternalRemoteDirectoryName;
    string filename = LogEntryFileName(clusterArgs.Name);
    LogEntry logEntry = new LogEntry(clusterArgs);

    var serializer = new System.Xml.Serialization.XmlSerializer(typeof(LogEntry));
    using (var writer = File.Create(Path.Combine(directory, filename)))
    {
        serializer.Serialize(writer, logEntry);
    }
}
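// --- Illustrative sketch (not part of the original source). LogEntryFileName is called above but
// defined elsewhere; the naming convention below is purely an assumption.
private static string LogEntryFileNameSketch(string jobName)
{
    // One XML log entry per job, named after the job so it is easy to find in the cluster directory.
    return jobName + "_submissionLogEntry.xml";
}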
/// <summary>
/// Waits for the job specified by clusterArgs to finish. If the final state is Canceled or Failed,
/// throws an exception. Attempts to connect to the cluster if not already connected.
/// </summary>
/// <param name="clusterArgs">Identifies the job and the cluster to wait on.</param>
public static void WaitForJob(ClusterSubmitterArgs clusterArgs)
{
    JobWaitingParams jobWaitingParams = WaitForJobInternal(clusterArgs);
    if (jobWaitingParams.JobState == v2008R2.Properties.JobState.Canceled)
    {
        throw new Exception("Job canceled.");
    }
    else if (jobWaitingParams.JobState == v2008R2.Properties.JobState.Failed)
    {
        throw new Exception("Job failed.");
    }
}
/// <summary>
/// Calls the corresponding Submit function, but waits for the job to Finish, Fail, or be Canceled. If the final state is
/// Finished, returns silently; otherwise it throws an Exception. For a description of the other parameters, see Submit().
/// *** NOTE: ONLY WORKS WITH V2 CLUSTERS. ***
/// </summary>
public static ClusterSubmitterArgs SubmitAndWait(ArgumentCollection argumentCollection, int maxSubmitAfterTasksFail = 0)
{
    if (argumentCollection.PeekOptional<string>("cluster", "help").Equals("help", StringComparison.CurrentCultureIgnoreCase))
    {
        Console.WriteLine();
        Console.WriteLine(HelpMessage);
        return null;
    }

    ClusterSubmitterArgs clusterArgs = new ClusterSubmitterArgs(argumentCollection);
    SubmitAndWait(clusterArgs, argumentCollection, maxSubmitAfterTasksFail);
    return clusterArgs;
}
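// --- Illustrative usage (not part of the original source): a Main-style entry point that forwards
// raw command-line arguments. The CommandArguments(string[]) constructor is an assumption; passing
// "-cluster help" prints HelpMessage and returns null, as implemented above.
//
//   static void Main(string[] args)
//   {
//       ArgumentCollection argumentCollection = new CommandArguments(args);
//       ClusterSubmitterArgs clusterArgs = ClusterSubmitter.SubmitAndWait(argumentCollection, maxSubmitAfterTasksFail: 1);
//       if (clusterArgs != null) Console.WriteLine("Job {0} finished.", clusterArgs.JobID);
//   }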
private static void SubmitViaAPI1(ClusterSubmitterArgs clusterArgs, IDistributable distributableObj)
{
    Console.WriteLine("Submitting using API version 1");
    v1.ICluster cluster = new v1.Cluster();
    cluster.Connect(clusterArgs.Cluster);

    foreach (v1.ITask task in EnumerateTasks(clusterArgs, distributableObj))
    {
        v1.IJob job = CreateJobApi1(cluster, (v1.JobPriority)clusterArgs.ApiPriority, task, task.Name);
        cluster.QueueJob(job, clusterArgs.Username, clusterArgs.Password, true, 0);
    }
    Console.WriteLine();
}
private static IEnumerable<v1.ITask> EnumerateTasks(ClusterSubmitterArgs clusterArgs, IDistributable distributableObj)
{
    // Yield one task per piece index that falls inside the requested task range.
    for (int pieceIndex = 0; pieceIndex < clusterArgs.TaskCount; ++pieceIndex)
    {
        if (clusterArgs.TaskRange.Contains(pieceIndex))
        {
            v1.ITask task = CreateTask(clusterArgs, pieceIndex, distributableObj);
            yield return task;
        }
    }
}
private static v2008R2.ISchedulerTask AddCleanupTaskToJob(ClusterSubmitterArgs clusterArgs, v2008R2.IScheduler scheduler, v2008R2.ISchedulerJob job, IDistributable distributableJob)
{
    v2008R2.ISchedulerCollection taskList = job.GetTaskList(scheduler.CreateFilterCollection(), scheduler.CreateSortCollection(), true);
    v2008R2.IStringCollection dependencyTasks = scheduler.CreateStringCollection();

    if (!clusterArgs.OnlyDoCleanup)
    {
        dependencyTasks.Add(((v2008R2.ISchedulerTask)taskList[0]).Name);
    }
    v2008R2.ISchedulerTask cleanupTask = CreateCleanupTask(job, clusterArgs.ExternalRemoteDirectoryName, clusterArgs.StdErrDirName, clusterArgs.StdOutDirName, "cleanup", true);

    Distribute.Locally local = new Distribute.Locally()
    {
        Cleanup = true,
        TaskCount = clusterArgs.TaskCount,
        Tasks = new RangeCollection(),
        ParallelOptions = new ParallelOptions() { MaxDegreeOfParallelism = 1 }
    };
    Distribute.Distribute distributeExe = new Distribute.Distribute()
    {
        Distributor = local,
        Distributable = distributableJob
    };

    string exeName = distributableJob is DistributableWrapper ? clusterArgs.ExeName : distributeExe.GetType().Assembly.GetName().Name;

    string taskCommandLine = string.Format("{0}\\{1} {2}", clusterArgs.ExeRelativeDirectoryName, exeName, CreateTaskString(distributeExe, clusterArgs.MinimalCommandLine));
    cleanupTask.CommandLine = taskCommandLine;

    if (!clusterArgs.OnlyDoCleanup)
    {
        cleanupTask.DependsOn = dependencyTasks;
    }
    job.AddTask(cleanupTask);
    return cleanupTask;
}
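// --- Illustrative sketch (not part of the original source). CreateCleanupTask is called above but
// defined elsewhere; this sketch (suffixed to avoid clashing with the real helper) is inferred from
// the call site. The purpose of the final bool parameter is unknown and it is left unused here.
private static v2008R2.ISchedulerTask CreateCleanupTaskSketch(v2008R2.ISchedulerJob job, string workDirectory, string stdErrDirName, string stdOutDirName, string name, bool runLast)
{
    v2008R2.ISchedulerTask cleanupTask = job.CreateTask();
    cleanupTask.Name = name;
    cleanupTask.WorkDirectory = workDirectory;
    // Route the cleanup task's output alongside the worker tasks' per-task output files.
    cleanupTask.StdErrFilePath = string.Format(@"{0}\{1}.txt", stdErrDirName, name);
    cleanupTask.StdOutFilePath = string.Format(@"{0}\{1}.txt", stdOutDirName, name);
    return cleanupTask;
}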
private static void CopyExes(ClusterSubmitterArgs clusterArgs)
{
    //!! why was this removed??
    //var lockObj = CCSLibSettings.KnownClusters[clusterArgs.Cluster];  // use an object that is specific to this cluster.
    //lock (lockObj) // we should only have one thread trying to copy to the cluster at a time. Esp since copying is smart and will reuse identical exes if they're already there.
    {
        if (true || clusterArgs.ExeRelativeDirectoryName == null) // "true ||" forces a fresh copy; the user-specified branch below is currently dead code.
        {
            clusterArgs.ExeRelativeDirectoryName = HpcLib.CopyExesToCluster(clusterArgs.ExternalRemoteDirectoryName, clusterArgs.Name);
        }
        else
        {
            Console.WriteLine("Using exe directory specified by user: " + clusterArgs.ExeRelativeDirectoryName);
            string absoluteExeDir = clusterArgs.ExternalRemoteDirectoryName + "\\" + clusterArgs.ExeRelativeDirectoryName;
            Helper.CheckCondition(Directory.Exists(absoluteExeDir), "Directory {0} does not exist!", absoluteExeDir);
        }
    }
}
public static void Submit(ClusterSubmitterArgs clusterArgs, IDistributable distributableObj)
{
    for (int numTries = 0; numTries < clusterArgs.MaxSubmitTries; numTries++)
    {
        try
        {
            SubmitInternal(clusterArgs, distributableObj);
            return;
        }
        catch (Exception exception)
        {
            Console.WriteLine("\n\nError submitting to cluster " + clusterArgs.Cluster + ": " + exception.Message);
            Console.WriteLine("numTry=" + numTries + " of " + clusterArgs.MaxSubmitTries);
            Console.WriteLine(exception.StackTrace);
            Console.WriteLine("\n\nUse -cluster help to see usage.");
            Thread.Sleep(new TimeSpan(0, 10, 0)); // wait 10 minutes before retrying
        }
    }
    throw new Exception("max number of cluster submitter tries (" + clusterArgs.MaxSubmitTries + ") exceeded");
}
private static v1.ITask CreateTask(ClusterSubmitterArgs clusterArgs, int taskNum, IDistributable distributableObj)
{
    v1.ITask task = new v1.Task();
    task.WorkDirectory = clusterArgs.InternalRemoteDirectory;

    Distribute.Locally local = new Distribute.Locally()
    {
        Cleanup = false,
        TaskCount = clusterArgs.TaskCount,
        ParallelOptions = new ParallelOptions() { MaxDegreeOfParallelism = 1 },
        Tasks = new RangeCollection(taskNum)
    };
    Distribute.Distribute distributeExe = new Distribute.Distribute()
    {
        Distributable = distributableObj,
        Distributor = local
    };

    string taskString = CreateTaskString(distributeExe, clusterArgs.MinimalCommandLine);
    string exeName = distributableObj is DistributableWrapper ? clusterArgs.ExeName : distributeExe.GetType().Assembly.GetName().Name;

    string taskCommandLine = string.Format("{0}\\{1} {2}", clusterArgs.ExeRelativeDirectoryName, exeName, taskString);
    task.CommandLine = taskCommandLine;

    task.Name = Helper.CreateDelimitedString(" ", clusterArgs.Name, taskNum);
    task.IsExclusive = false;
    task.MinimumNumberOfProcessors = 1;
    task.MaximumNumberOfProcessors = 1;
    task.Stderr = string.Format(@"{0}\{1}.txt", clusterArgs.StdErrDirName, taskNum);
    task.Stdout = string.Format(@"{0}\{1}.txt", clusterArgs.StdOutDirName, taskNum);
    task.Runtime = "Infinite";
    return task;
}
private static void SubmitViaAPI3(ClusterSubmitterArgs clusterArgs, IDistributable distributableObj)
{
    Console.WriteLine(string.Format("Connecting to cluster {0} using API version 3.", clusterArgs.Cluster));

    using (v2008R2.IScheduler scheduler = new v2008R2.Scheduler())
    {
        scheduler.Connect(clusterArgs.Cluster);
        v2008R2.ISchedulerJob job = scheduler.CreateJob();
        job.Name = clusterArgs.Name;
        job.Priority = (v2008R2.Properties.JobPriority)clusterArgs.ApiPriority;

        if (clusterArgs.JobTemplate != null)
        {
            Microsoft.Hpc.Scheduler.IStringCollection jobTemplates = scheduler.GetJobTemplateList();
            string decodedJobTemplate = HttpUtility.UrlDecode(clusterArgs.JobTemplate);
            if (jobTemplates.Contains(decodedJobTemplate))
            {
                job.SetJobTemplate(decodedJobTemplate);
            }
            else
            {
                Console.WriteLine("Job template '" + decodedJobTemplate + "' does not exist at specified cluster. Existing templates are:");
                foreach (var template in jobTemplates)
                {
                    Console.Write("'" + template + "' ");
                }
                Console.WriteLine("\nUsing default job template...");
            }
        }

        if (clusterArgs.NumCoresPerTask != null)
        {
            clusterArgs.IsExclusive = false;
        }

        v2008R2.IStringCollection nodesToUse = null;
        if (clusterArgs.NodeExclusionList != null && clusterArgs.NodeExclusionList.Count > 0)
        {
            nodesToUse = GetNodesToUse(clusterArgs, scheduler, job);
        }
        else if (clusterArgs.NodesToUseList != null && clusterArgs.NodesToUseList.Count > 0)
        {
            nodesToUse = scheduler.CreateStringCollection();
            foreach (string nodeName in clusterArgs.NodesToUseList)
            {
                nodesToUse.Add(nodeName);
            }
        }
        else if (clusterArgs.NumCoresPerTask != null)
        {
            job.AutoCalculateMax = true;
            job.AutoCalculateMin = true;
        }
        else if (clusterArgs.IsExclusive)
        {
            job.UnitType = Microsoft.Hpc.Scheduler.Properties.JobUnitType.Node;
            if (clusterArgs.MinimumNumberOfNodes != null)
            {
                job.MaximumNumberOfNodes = clusterArgs.MaximumNumberOfNodes.Value;
                job.MinimumNumberOfNodes = clusterArgs.MinimumNumberOfNodes.Value;
            }
        }
        else if (clusterArgs.MinimumNumberOfCores != null)
        {
            // Check Max before dereferencing it; the original re-checked Min, which is always
            // non-null in this branch, and would have thrown NullReferenceException on a missing Max.
            Helper.CheckCondition(clusterArgs.MaximumNumberOfCores != null, "must provide both MinCores and MaxCores, not just one");
            job.MaximumNumberOfCores = clusterArgs.MaximumNumberOfCores.Value;
            job.MinimumNumberOfCores = clusterArgs.MinimumNumberOfCores.Value;
            job.AutoCalculateMax = false;
            job.AutoCalculateMin = false;
        }
        else
        {
            job.AutoCalculateMax = true;
            job.AutoCalculateMin = true;
        }

        if (!clusterArgs.OnlyDoCleanup)
        {
            if (clusterArgs.TaskRange.IsContiguous())
            {
                if (clusterArgs.TaskRange.LastElement > clusterArgs.TaskCount - 1)
                {
                    clusterArgs.TaskRange = new RangeCollection(clusterArgs.TaskRange.FirstElement, clusterArgs.TaskCount - 1);
                }
                v2008R2.ISchedulerTask task = CreateTask(null, clusterArgs, job, distributableObj, nodesToUse);

                task.IsParametric = true; // IsParametric is marked as obsolete, but is it still required when submitting to a v2 cluster?
                //task.Type = TaskType.ParametricSweep;

                task.StartValue = 0;
                task.EndValue = clusterArgs.TaskCount - 1;

                job.AddTask(task);
            }
            else
            {
                job.AddTasks(clusterArgs.TaskRange.Select(taskNum => CreateTask((int)taskNum, clusterArgs, job, distributableObj, nodesToUse)).ToArray());
            }
        }
        else
        {
            clusterArgs.Cleanup = true;
        }

        v2008R2.ISchedulerTask cleanupTask = null;
        if (clusterArgs.Cleanup)
        {
            cleanupTask = AddCleanupTaskToJob(clusterArgs, scheduler, job, distributableObj);
        }

        Console.WriteLine("Submitting job.");
        scheduler.SubmitJob(job, null, null);

        clusterArgs.JobID = job.Id;
        Console.WriteLine(job.Name + " submitted.");
    }
}
/// <summary>
/// Tries to write to the log file. If no log file is defined, returns false.
/// </summary>
public static bool TryWriteToLog(ClusterSubmitterArgs clusterArgs)
{
    if (string.IsNullOrEmpty(LogFile))
    {
        return false;
    }

    FileStream filestream = null;
    try
    {
        bool exists = File.Exists(LogFile);
        if (MBT.Escience.FileUtils.TryToOpenFile(LogFile, new TimeSpan(0, 3, 0), FileMode.OpenOrCreate, FileAccess.ReadWrite, FileShare.None, out filestream))
        {
            List<LogEntry> logEntries = exists && filestream.Length > 0
                ? LogEntry.LoadEntries(new StreamReader(filestream))
                : new List<LogEntry>();

            LogEntry newSubmission = new LogEntry(clusterArgs);
            logEntries.Add(newSubmission);

            // Rewrite the whole log from the beginning with the new entry appended.
            filestream.Position = 0;
            LogEntry.SaveEntries(logEntries, new StreamWriter(filestream));
            return true;
        }
    }
    catch (System.Xml.XmlException exception)
    {
        Console.WriteLine(exception);
        return false;
    }
    finally
    {
        if (filestream != null)
        {
            filestream.Dispose();
        }
    }
    return false;
}
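// --- Illustrative sketch (not part of the original source). LogEntry.LoadEntries and
// LogEntry.SaveEntries are defined elsewhere (on the LogEntry class); the versions below, suffixed
// to avoid clashing, show one plausible implementation using XmlSerializer over List<LogEntry>,
// consistent with the per-entry serialization in WriteLogEntryToClusterDirectory above.
public static List<LogEntry> LoadEntriesSketch(TextReader reader)
{
    var serializer = new System.Xml.Serialization.XmlSerializer(typeof(List<LogEntry>));
    return (List<LogEntry>)serializer.Deserialize(reader);
}

public static void SaveEntriesSketch(List<LogEntry> entries, TextWriter writer)
{
    var serializer = new System.Xml.Serialization.XmlSerializer(typeof(List<LogEntry>));
    serializer.Serialize(writer, entries);
    writer.Flush(); // flush so the underlying FileStream sees all bytes before it is disposed
}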
private static v2008R2.ISchedulerTask CreateTask(int? taskNumber, ClusterSubmitterArgs clusterArgs, v2008R2.ISchedulerJob job, IDistributable distributableObj, v2008R2.IStringCollection nodesToUse)
{
    Distribute.Locally local = new Distribute.Locally()
    {
        Cleanup = false,
        TaskCount = clusterArgs.TaskCount,
        Tasks = taskNumber.HasValue ? new RangeCollection(taskNumber.Value) : null,
        ParallelOptions = new ParallelOptions() { MaxDegreeOfParallelism = 1 }
    };

    v2008R2.ISchedulerTask task = job.CreateTask();
    if (nodesToUse != null)
    {
        task.RequiredNodes = nodesToUse;
    }
    if (clusterArgs.NumCoresPerTask != null)
    {
        task.MinimumNumberOfCores = clusterArgs.NumCoresPerTask.Value;
        task.MaximumNumberOfCores = clusterArgs.NumCoresPerTask.Value;
        task.MaximumNumberOfNodes = 1;
        local.ParallelOptions.MaxDegreeOfParallelism = clusterArgs.NumCoresPerTask.Value;
    }
    else if (clusterArgs.IsExclusive)
    {
        //task.MinimumNumberOfCores = 1;
        //task.MaximumNumberOfCores = 8;
        //task.MaximumNumberOfNodes = 1;
    }
    task.WorkDirectory = clusterArgs.ExternalRemoteDirectoryName;

    Distribute.Distribute distributeExe = new Distribute.Distribute()
    {
        Distributable = distributableObj,
        Distributor = local
    };

    string taskArgString = CreateTaskString(distributeExe, clusterArgs.MinimalCommandLine);
    string exeName = distributeExe.Distributable is DistributableWrapper ? clusterArgs.ExeName : distributeExe.GetType().Assembly.GetName().Name;

    string taskCommandLine;
    if (clusterArgs.UseMPI)
    {
        taskCommandLine = string.Format("mpiexec -n {0} {1}\\{2} {3}", clusterArgs.NumCoresPerTask, clusterArgs.ExeRelativeDirectoryName, exeName, taskArgString);
    }
    else
    {
        taskCommandLine = string.Format("{0}\\{1} {2}", clusterArgs.ExeRelativeDirectoryName, exeName, taskArgString);
    }
    task.CommandLine = taskCommandLine;

    string taskNumberAsString = taskNumber.HasValue ? taskNumber.Value.ToString() : "*";
    task.Name = Helper.CreateDelimitedString(" ", clusterArgs.Name, taskNumberAsString);
    task.StdErrFilePath = string.Format(@"{0}\{1}.txt", clusterArgs.StdErrDirName, taskNumberAsString);
    task.StdOutFilePath = string.Format(@"{0}\{1}.txt", clusterArgs.StdOutDirName, taskNumberAsString);

    if (task.StdErrFilePath.Length >= 160)
    {
        Console.WriteLine("Caution: std error file path is {0} characters, which will probably cause HPC to crash.", task.StdErrFilePath.Length);
    }

    return task;
}