/// <summary>
/// Submits <paramref name="distributableObj"/> to the cluster and blocks until the resulting job
/// reaches a finished state. A failed job is resubmitted up to
/// <paramref name="maxSubmitAfterTasksFail"/> times before giving up.
/// </summary>
/// <param name="clusterArgs">Submission arguments; passed to Submit and WaitForJobInternal.</param>
/// <param name="distributableObj">The work item handed to Submit.</param>
/// <param name="maxSubmitAfterTasksFail">Maximum number of resubmissions after a failed job (0 = never resubmit).</param>
/// <exception cref="Exception">
/// Thrown with "Job canceled." when the job ends canceled, or with "Job failed." when the job
/// ends failed and no resubmissions remain.
/// </exception>
public static void SubmitAndWait(ClusterSubmitterArgs clusterArgs, IDistributable distributableObj, int maxSubmitAfterTasksFail = 0)
{
    // The whole submit/wait cycle runs inside the scope returned by ParallelOptionsScope.Suspend().
    using (ParallelOptionsScope.Suspend())
    {
        int resubmissions = 0;

        while (true)
        {
            Submit(clusterArgs, distributableObj);
            JobWaitingParams outcome = WaitForJobInternal(clusterArgs);

            // Cancellation is never retried.
            if (outcome.JobState == v2008R2.Properties.JobState.Canceled)
            {
                throw new Exception("Job canceled.");
            }

            // Any finished state other than Failed ends the loop normally.
            if (outcome.JobState != v2008R2.Properties.JobState.Failed)
            {
                break;
            }

            // Failed: give up once the resubmission budget is spent.
            if (resubmissions >= maxSubmitAfterTasksFail)
            {
                throw new Exception("Job failed.");
            }

            ++resubmissions;
            Console.WriteLine("Job failed, trying again...");
        }

        //HpcLib.HpcLib.CopyFiles(new List<String> { "" }, _remoteTaskOutDir, TASK_OUT_DIR);
    }
}
/// <summary>
/// Blocks until the job described by <paramref name="clusterArgs"/> reaches a finished state
/// (as decided by JobIsFinished) and returns the tracking object holding that state.
/// </summary>
/// <remarks>
/// Completion is detected in three ways: the OnJobState handler installed by
/// SetupJobEventHandler, the once-a-minute heartbeat timer, and an explicit refresh performed
/// here before waiting. The explicit refresh covers a job that finished before the event
/// handler was attached.
/// </remarks>
/// <param name="clusterArgs">Supplies the job (GetV2Job), its JobID and the Cluster name.</param>
/// <returns>The <c>JobWaitingParams</c> whose JobState holds the job's finished state.</returns>
private static JobWaitingParams WaitForJobInternal(ClusterSubmitterArgs clusterArgs)
{
    v2008R2.ISchedulerJob job = clusterArgs.GetV2Job();

    JobWaitingParams jobWaitingParams = new JobWaitingParams
    {
        Job = job,
        JobState = job.State,
        ManualResetEvent = new ManualResetEvent(false)
    };
    SetupJobEventHandler(jobWaitingParams);

    int heartBeatPeriod = 60 * 1000; // beat once a minute

    // The using statement guarantees the timer is disposed (and so shut down) on every exit path.
    using (Timer timer = HeartbeatTimer(clusterArgs.JobID, clusterArgs.Cluster, jobWaitingParams, heartBeatPeriod))
    {
        jobWaitingParams.Job.Refresh();
        var refreshedState = jobWaitingParams.Job.State;

        if (JobIsFinished(refreshedState))
        {
            // The job finished before (or while) the handler was attached, so no event will
            // deliver the final state. Record it here; otherwise callers would see the stale
            // pre-refresh value and could miss a Failed or Canceled job.
            jobWaitingParams.JobState = refreshedState;
        }
        else
        {
            // Released by the OnJobState handler or by the heartbeat callback.
            jobWaitingParams.ManualResetEvent.WaitOne();
        }

        timer.Change(Timeout.Infinite, Timeout.Infinite); // shutdown the timer
    }

    return jobWaitingParams;
}
/// <summary>
/// Subscribes to the OnJobState event of the job currently held by
/// <paramref name="jobWaitingParams"/>. When the job transitions to a state for which
/// JobIsFinished returns true, the handler records that state and signals the
/// ManualResetEvent; every other transition is ignored.
/// </summary>
/// <param name="jobWaitingParams">Tracking object whose Job is observed and whose JobState and ManualResetEvent are updated.</param>
private static void SetupJobEventHandler(JobWaitingParams jobWaitingParams)
{
    jobWaitingParams.Job.OnJobState += (sender, stateChange) =>
    {
        // Intermediate transitions are of no interest to the waiter.
        if (!JobIsFinished(stateChange.NewState))
        {
            return;
        }

        // Publish the final state first, then wake the waiter.
        jobWaitingParams.JobState = stateChange.NewState;
        jobWaitingParams.ManualResetEvent.Set();
    };
}
/// <summary>
/// Blocks until the job identified by <paramref name="clusterArgs"/> has finished, then
/// throws if it ended in the Canceled or Failed state. Any other finished state returns normally.
/// </summary>
/// <remarks>
/// The waiting itself is delegated to WaitForJobInternal. NOTE(review): earlier documentation
/// said a cluster connection is attempted if not already connected; that would happen inside
/// clusterArgs.GetV2Job(), which is not visible here - confirm.
/// </remarks>
/// <param name="clusterArgs">Identifies the job to wait for.</param>
/// <exception cref="Exception">Thrown with "Job canceled." or "Job failed." according to the final job state.</exception>
public static void WaitForJob(ClusterSubmitterArgs clusterArgs)
{
    var finalState = WaitForJobInternal(clusterArgs).JobState;

    if (finalState == v2008R2.Properties.JobState.Canceled)
    {
        throw new Exception("Job canceled.");
    }

    if (finalState == v2008R2.Properties.JobState.Failed)
    {
        throw new Exception("Job failed.");
    }
}
/// <summary>
/// Creates a timer that periodically refreshes the tracked job, reconnecting to the cluster
/// if the refresh fails, and signals the waiter once the job is finished.
/// </summary>
/// <remarks>
/// The callback is best-effort: if the cluster cannot be reached, or the job cannot be reopened,
/// it logs the problem and returns, so the next beat tries again. No exception from the
/// reconnect path is allowed to escape the callback, because it runs on a timer thread where
/// nothing would catch it.
/// </remarks>
/// <param name="jobID">ID used to reopen the job after a reconnect.</param>
/// <param name="clusterName">Cluster name passed to HpcLib.TryConnect when reconnecting.</param>
/// <param name="jobWaitingParams">Tracking object whose Job, JobState and ManualResetEvent are updated.</param>
/// <param name="heartBeatPeriod">Delay before the first beat and interval between beats, in milliseconds.</param>
/// <returns>The running timer; the caller is responsible for disposing it.</returns>
private static Timer HeartbeatTimer(int jobID, string clusterName, JobWaitingParams jobWaitingParams, int heartBeatPeriod)
{
    Timer timer = new Timer(state =>
    {
        try
        {
            jobWaitingParams.Job.Refresh();
        }
        catch
        {
            // Any refresh failure is treated as a lost connection.
            Console.WriteLine("Lost connection to job. Attempting reconnect.");

            v2008R2.IScheduler scheduler = null;
            for (int iTry = 0; iTry < 10 && scheduler == null; iTry++)
            {
                HpcLib.TryConnect(clusterName, out scheduler);
            }

            if (scheduler == null)
            {
                Console.WriteLine("Unable to reconnect to cluster. Going back to sleep.");
                // Nothing new can be learned from the dead job handle; wait for the next beat.
                return;
            }

            try
            {
                // Swap in the new handle only after it has been refreshed successfully.
                var reopenedJob = scheduler.OpenJob(jobID);
                reopenedJob.Refresh();
                jobWaitingParams.Job = reopenedJob;

                // The new job object needs its own state-change subscription.
                SetupJobEventHandler(jobWaitingParams);
                Console.WriteLine("Reconnect succeeded.");
            }
            catch (Exception ex)
            {
                // Connected to the scheduler but could not reopen the job; retry on the next beat.
                Console.WriteLine("Reconnect failed (" + ex.Message + "). Going back to sleep.");
                return;
            }
        }

        // Poll the state in case the finishing event was missed (e.g. while disconnected).
        jobWaitingParams.JobState = jobWaitingParams.Job.State;
        if (JobIsFinished(jobWaitingParams.Job.State))
        {
            jobWaitingParams.ManualResetEvent.Set();
        }
    }, null, heartBeatPeriod, heartBeatPeriod);

    return timer;
}