/// <summary>
/// Submits the job and waits for it to complete.
/// When it submits, it creates a log entry file in the cluster working directory, named according to the run name. This file is deleted
/// when the job finishes successfully, so long as we're still waiting for it to finish. If SubmitAndWait is called and this file already
/// exists, then it is assumed that the job we want to submit was already submitted, so we wait for it to finish rather than submit again.
/// </summary>
/// <param name="clusterArgs">The cluster submission settings.</param>
/// <param name="distributableObj">The distributable job to run.</param>
/// <param name="maxSubmitAfterTasksFail">The maximum number of times to resubmit the job after tasks fail.</param>
/// <param name="OnSubmittedCallbackOrNull">An optional callback invoked once the job has been submitted.</param>
public static void SubmitAndWait(ClusterSubmitterArgs clusterArgs, IDistributable distributableObj, int maxSubmitAfterTasksFail = 0, Action OnSubmittedCallbackOrNull = null)
{
    using (ParallelOptionsScope.Suspend())
    {
        FileInfo logEntryFile = HpcLibSettings.GetLogEntryFile(clusterArgs);
        if (logEntryFile.Exists)
        {
            // A log entry already exists: assume this job was already submitted and resume waiting for it.
            Console.WriteLine(Resource.Job_already_exists, logEntryFile.FullName);
            clusterArgs = HpcLibSettings.LoadLogEntryFile(logEntryFile).ClusterArgs;
        }
        else
        {
            Submit(clusterArgs, distributableObj);
            Console.WriteLine(Resource.Wait_Writing_log);
            HpcLibSettings.WriteLogEntryToClusterDirectory(clusterArgs);
        }

        if (OnSubmittedCallbackOrNull != null)
        {
            OnSubmittedCallbackOrNull();
        }

        JobState jobState = WaitForJobInternal(clusterArgs, maxSubmitAfterTasksFail);

        // The job is done, so delete the log entry. Even if the job failed or was canceled,
        // we assume we'll want to overwrite the entry on a future submission.
        logEntryFile.Delete();

        if (jobState != JobState.Finished)
        {
            throw new Exception("Job " + jobState);
        }
    }
}
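// Usage sketch (illustrative, not part of the library): one way a caller might
// drive SubmitAndWait. The argument values and the "Job submitted." message are
// assumptions; only SubmitAndWait's signature comes from the method above.
private static void ExampleSubmitAndWaitUsage(ClusterSubmitterArgs clusterArgs, IDistributable task)
{
    try
    {
        // Allow up to two automatic resubmits after task failures, and announce the submission.
        SubmitAndWait(clusterArgs, task,
                      maxSubmitAfterTasksFail: 2,
                      OnSubmittedCallbackOrNull: () => Console.WriteLine("Job submitted."));
    }
    catch (Exception e)
    {
        // SubmitAndWait throws when the final JobState is not Finished (e.g. Failed or Canceled).
        Console.Error.WriteLine(e.Message);
    }
}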
/// <summary>
/// Refreshes the cached core and task counters for this cluster.
/// </summary>
/// <returns>true if any counter changed; otherwise, false</returns>
public bool Refresh()
{
    bool changed = false;
    ISchedulerCounters counters;
    using (ParallelOptionsScope.Suspend())
    {
        if (Connect() && null != (counters = GetCounters()))
        {
            changed = BusyCores != counters.BusyCores ||
                      IdleCores != counters.IdleCores ||
                      QueuedTasks != counters.QueuedTasks;
            BusyCores = counters.BusyCores;
            IdleCores = counters.IdleCores;
            QueuedTasks = counters.QueuedTasks;
        }
        else
        {
            // Could not connect or read the counters: flag the cluster as unavailable with -1 sentinels.
            changed = BusyCores != -1;
            BusyCores = -1;
            IdleCores = -1;
            QueuedTasks = -1;
        }
    }
    return changed;
}
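// Polling sketch (illustrative only): report the counters periodically, but only
// when Refresh() says something changed. The cancellation token and the
// 30-second interval are assumptions, not part of this class.
private void ExampleRefreshLoop(System.Threading.CancellationToken cancel)
{
    while (!cancel.IsCancellationRequested)
    {
        if (Refresh())
        {
            Console.WriteLine("Busy={0} Idle={1} Queued={2}", BusyCores, IdleCores, QueuedTasks);
        }
        System.Threading.Thread.Sleep(TimeSpan.FromSeconds(30));
    }
}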
private static void SubmitInternal(ClusterSubmitterArgs clusterArgs, IDistributable distributableObj)
{
    lock (_submitterLockObj) // for now, just let one thread submit at a time.
    {
        if (string.IsNullOrEmpty(clusterArgs.Name))
        {
            clusterArgs.Name = distributableObj.JobName;
        }

        CopyExes(clusterArgs);

        clusterArgs.StdErrDirName = CreateUniqueDirectory(clusterArgs.ExternalRemoteDirectoryName, "Stderr", distributableObj.JobName);
        clusterArgs.StdOutDirName = CreateUniqueDirectory(clusterArgs.ExternalRemoteDirectoryName, "Stdout", distributableObj.JobName);

        if (clusterArgs.CopyInputFiles != null)
        {
            if (!(distributableObj is DistributableWrapper))
            {
                clusterArgs.CopyInputFiles.AddRange(ArgumentCollection.EnumerateValuesOfTypeFromParsable<InputFile>(distributableObj).Select(file => file.ToString()));
            }

            if (clusterArgs.CopyInputFiles.Count > 0)
            {
                CopyInputFiles(clusterArgs.CopyInputFiles, clusterArgs.ExternalRemoteDirectoryName);
            }
        }

        using (ParallelOptionsScope.Suspend())
        {
            switch (clusterArgs.Version)
            {
                case 3:
                    SubmitViaAPI3(clusterArgs, distributableObj);
                    break;
                default:
                    throw new NotSupportedException(string.Format("Cluster version {0} is not supported.", clusterArgs.Version));
            }
        }

        Console.WriteLine(Resource.Processed_job, clusterArgs.Cluster, clusterArgs.ExternalRemoteDirectoryName);
        Console.WriteLine(Resource.Writing_log_file);
        HpcLibSettings.TryWriteToLog(clusterArgs);
        Console.WriteLine(Resource.Done);
    }
}
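// Sketch (an assumption, not the library's actual helper): one plausible shape for
// the CreateUniqueDirectory call used above, probing numbered suffixes under the
// remote directory until an unused name can be claimed. Requires System.IO.
private static string CreateUniqueDirectorySketch(string parentDirectory, string prefix, string jobName)
{
    for (int i = 0; ; i++)
    {
        string suffix = i == 0 ? "" : i.ToString();
        string candidate = System.IO.Path.Combine(parentDirectory, prefix + "_" + jobName + suffix);
        if (!System.IO.Directory.Exists(candidate))
        {
            System.IO.Directory.CreateDirectory(candidate);
            return candidate;
        }
    }
}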
private bool Connect()
{
    // Memoize the connection: once _scheduler is set, reuse it.
    if (_scheduler != null)
    {
        return true;
    }

    bool connected = false;
    using (ParallelOptionsScope.Suspend())
    {
        connected = HpcLib.TryConnect(Cluster, out _scheduler);
    }
    return connected;
}