/// <summary>
/// Submits the job and waits for it to complete.
/// When it submits, it creates a log entry file in the cluster working directory, named according
/// to the run name. This file is deleted when the job finishes successfully, so long as we're still
/// waiting for it to finish. If SubmitAndWait is called and this file already exists, then it is
/// assumed that the job we want to submit was already submitted, so we wait for it to finish rather
/// than submit again.
/// </summary>
/// <param name="clusterArgs">Submission settings. When an existing log entry file is found, this is
/// replaced by the arguments recorded in that file so we resume the earlier run.</param>
/// <param name="distributableObj">The distributable work item to run on the cluster.</param>
/// <param name="maxSubmitAfterTasksFail">Passed through to WaitForJobInternal; presumably the
/// maximum number of resubmissions after task failures — confirm against that method.</param>
/// <param name="OnSubmittedCallbackOrNull">Optional callback invoked once the job has been
/// submitted (or an existing submission has been resumed).</param>
/// <exception cref="InvalidOperationException">Thrown when the job ends in any state other than
/// <c>JobState.Finished</c> (e.g. failed or canceled).</exception>
public static void SubmitAndWait(ClusterSubmitterArgs clusterArgs, IDistributable distributableObj, int maxSubmitAfterTasksFail = 0, Action OnSubmittedCallbackOrNull = null)
{
    using (ParallelOptionsScope.Suspend())
    {
        FileInfo logEntryFile = HpcLibSettings.GetLogEntryFile(clusterArgs);
        if (logEntryFile.Exists)
        {
            // A previous call already submitted this job: reuse the recorded
            // arguments and just wait for that run instead of submitting again.
            Console.WriteLine(Resource.Job_already_exists, logEntryFile.FullName);
            clusterArgs = HpcLibSettings.LoadLogEntryFile(logEntryFile).ClusterArgs;
        }
        else
        {
            Submit(clusterArgs, distributableObj);
            Console.WriteLine(Resource.Wait_Writing_log);
            HpcLibSettings.WriteLogEntryToClusterDirectory(clusterArgs);
        }

        if (OnSubmittedCallbackOrNull != null)
        {
            OnSubmittedCallbackOrNull();
        }

        JobState jobState = WaitForJobInternal(clusterArgs, maxSubmitAfterTasksFail);

        // Job is done. Even if it failed or was canceled, delete the log entry so a
        // future SubmitAndWait call resubmits rather than resuming this run.
        logEntryFile.Delete();

        if (jobState != JobState.Finished)
        {
            // Fix: was `throw new Exception(...)`; a specific exception type lets callers
            // catch it deliberately. InvalidOperationException derives from Exception,
            // so existing catch blocks still work. Message kept identical.
            throw new InvalidOperationException("Job " + jobState);
        }
    }
}
/// <summary>
/// Performs the actual submission: names the job, copies the executables and any input files
/// to the cluster's remote directory, creates unique stderr/stdout directories, and submits
/// via the version-specific API. Writes progress and a log record to the console/log.
/// </summary>
/// <param name="clusterArgs">Submission settings; Name, StdErrDirName and StdOutDirName are
/// filled in by this method as a side effect.</param>
/// <param name="distributableObj">The work item being submitted; supplies the job name and,
/// unless it is a DistributableWrapper, the set of input files to copy.</param>
/// <exception cref="NotSupportedException">Thrown when clusterArgs.Version is not 3.</exception>
private static void SubmitInternal(ClusterSubmitterArgs clusterArgs, IDistributable distributableObj)
{
    // For now, only one thread may submit at a time.
    lock (_submitterLockObj)
    {
        if (string.IsNullOrEmpty(clusterArgs.Name))
        {
            clusterArgs.Name = distributableObj.JobName;
        }

        CopyExes(clusterArgs);

        // Give this run its own stderr/stdout directories on the remote share so
        // concurrent or repeated runs don't clobber each other's output.
        clusterArgs.StdErrDirName = CreateUniqueDirectory(clusterArgs.ExternalRemoteDirectoryName, "Stderr", distributableObj.JobName);
        clusterArgs.StdOutDirName = CreateUniqueDirectory(clusterArgs.ExternalRemoteDirectoryName, "Stdout", distributableObj.JobName);

        if (clusterArgs.CopyInputFiles != null)
        {
            // Wrapped jobs manage their own inputs; for everything else, discover the
            // InputFile-typed values on the parsable object and add them to the copy list.
            bool isWrappedJob = distributableObj is DistributableWrapper;
            if (!isWrappedJob)
            {
                var discoveredInputFiles = ArgumentCollection.EnumerateValuesOfTypeFromParsable<InputFile>(distributableObj).Select(file => file.ToString());
                clusterArgs.CopyInputFiles.AddRange(discoveredInputFiles);
            }

            if (clusterArgs.CopyInputFiles.Count > 0)
            {
                CopyInputFiles(clusterArgs.CopyInputFiles, clusterArgs.ExternalRemoteDirectoryName);
            }
        }

        using (ParallelOptionsScope.Suspend())
        {
            // Only the version-3 submission API is implemented.
            if (clusterArgs.Version == 3)
            {
                SubmitViaAPI3(clusterArgs, distributableObj);
            }
            else
            {
                throw new NotSupportedException(string.Format("Cluster version {0} is not supported.", clusterArgs.Version));
            }
        }

        Console.WriteLine(Resource.Processed_job, clusterArgs.Cluster, clusterArgs.ExternalRemoteDirectoryName);
        Console.WriteLine(Resource.Writing_log_file);
        HpcLibSettings.TryWriteToLog(clusterArgs);
        Console.WriteLine(Resource.Done);
    }
}