Exemplo n.º 1
0
        /// <summary>
        /// Submits <paramref name="distributableObj"/> and blocks until the wait on the resulting job returns.
        /// A job that ends in the Failed state is resubmitted up to
        /// <paramref name="maxSubmitAfterTasksFail"/> times before giving up.
        /// </summary>
        /// <param name="clusterArgs">Arguments passed to both the submission and the wait.</param>
        /// <param name="distributableObj">The object to submit.</param>
        /// <param name="maxSubmitAfterTasksFail">Maximum number of resubmissions after a failed job (default 0: no resubmission).</param>
        /// <exception cref="Exception">The job was canceled, or it failed and no resubmissions remain.</exception>
        public static void SubmitAndWait(ClusterSubmitterArgs clusterArgs, IDistributable distributableObj, int maxSubmitAfterTasksFail = 0)
        {
            using (ParallelOptionsScope.Suspend())
            {
                // Each pass through the loop is one submission; the counter tracks resubmissions already used.
                for (int resubmissionsUsed = 0; ; ++resubmissionsUsed)
                {
                    Submit(clusterArgs, distributableObj);

                    var outcome = WaitForJobInternal(clusterArgs).JobState;

                    if (outcome == v2008R2.Properties.JobState.Canceled)
                    {
                        throw new Exception("Job canceled.");
                    }

                    if (outcome != v2008R2.Properties.JobState.Failed)
                    {
                        // Neither canceled nor failed: nothing more to do.
                        break;
                    }

                    if (resubmissionsUsed >= maxSubmitAfterTasksFail)
                    {
                        throw new Exception("Job failed.");
                    }

                    Console.WriteLine("Job failed, trying again...");
                }
            }
        }
Exemplo n.º 2
0
        /// <summary>
        /// Blocks until the job obtained from <paramref name="clusterArgs"/> is reported as finished
        /// (per <c>JobIsFinished</c>) and returns the waiting parameters, whose <c>JobState</c>
        /// holds the state observed when the wait ended.
        /// </summary>
        /// <param name="clusterArgs">Supplies the job, its ID and the cluster name used by the heartbeat.</param>
        /// <returns>The <see cref="JobWaitingParams"/> used for the wait, including the last recorded job state.</returns>
        private static JobWaitingParams WaitForJobInternal(ClusterSubmitterArgs clusterArgs)
        {
            v2008R2.ISchedulerJob job = clusterArgs.GetV2Job();

            JobWaitingParams jobWaitingParams = new JobWaitingParams
            {
                Job              = job,
                JobState         = job.State,
                ManualResetEvent = new ManualResetEvent(false)
            };

            // Subscribe before refreshing so a transition that happens after the refresh is not missed.
            SetupJobEventHandler(jobWaitingParams);

            int heartBeatPeriod = 60 * 1000; // beat once a minute

            // put in a using statement to guarantee dispose will be called and the timer will be shutdown.
            using (Timer timer = HeartbeatTimer(clusterArgs.JobID, clusterArgs.Cluster, jobWaitingParams, heartBeatPeriod))
            {
                jobWaitingParams.Job.Refresh();
                var refreshedState = jobWaitingParams.Job.State;

                if (JobIsFinished(refreshedState))
                {
                    // The job finished before we started waiting, so neither the event handler nor the
                    // heartbeat will update JobState. Record the refreshed state here; otherwise the
                    // caller would see the stale pre-refresh value and could miss a Failed/Canceled job.
                    jobWaitingParams.JobState = refreshedState;
                }
                else
                {
                    // The event handler or the heartbeat sets JobState before signaling.
                    jobWaitingParams.ManualResetEvent.WaitOne();
                }

                timer.Change(Timeout.Infinite, Timeout.Infinite);   // shutdown the timer
            }

            // NOTE(review): the ManualResetEvent is intentionally not disposed here; the state-change
            // handler stays subscribed and may still call Set() on it.
            return jobWaitingParams;
        }
Exemplo n.º 3
0
 /// <summary>
 /// Subscribes to the job's <c>OnJobState</c> event. When the new state satisfies
 /// <c>JobIsFinished</c>, the state is stored on <paramref name="jobWaitingParams"/>
 /// and its wait handle is signaled.
 /// </summary>
 /// <param name="jobWaitingParams">Holds the job to observe, the recorded state and the wait handle.</param>
 private static void SetupJobEventHandler(JobWaitingParams jobWaitingParams)
 {
     jobWaitingParams.Job.OnJobState += (source, stateChange) =>
     {
         // Ignore transitions for which JobIsFinished returns false.
         if (!JobIsFinished(stateChange.NewState))
         {
             return;
         }

         // Record the state first so a waiter released by Set() sees it.
         jobWaitingParams.JobState = stateChange.NewState;
         jobWaitingParams.ManualResetEvent.Set();
     };
 }
Exemplo n.º 4
0
        /// <summary>
        /// Blocks until the job specified by <paramref name="clusterArgs"/> has finished, then throws
        /// if the recorded job state is Canceled or Failed.
        /// </summary>
        /// <remarks>
        /// NOTE(review): earlier documentation said this attempts to connect to the cluster when not
        /// already connected; that happens, if at all, in code not visible in this method.
        /// </remarks>
        /// <param name="clusterArgs">Identifies the job to wait for.</param>
        /// <exception cref="Exception">The job ended in the Canceled or Failed state.</exception>
        public static void WaitForJob(ClusterSubmitterArgs clusterArgs)
        {
            var finalState = WaitForJobInternal(clusterArgs).JobState;

            if (finalState == v2008R2.Properties.JobState.Canceled)
            {
                throw new Exception("Job canceled.");
            }

            if (finalState == v2008R2.Properties.JobState.Failed)
            {
                throw new Exception("Job failed.");
            }
        }
Exemplo n.º 5
0
        /// <summary>
        /// Creates a periodic timer that refreshes the job on every tick. If the refresh throws, it
        /// tries to reconnect to the cluster and reopen the job. After each tick the job's state is
        /// copied into <paramref name="jobWaitingParams"/> and, if <c>JobIsFinished</c> reports it
        /// finished, the wait handle is signaled.
        /// </summary>
        /// <param name="jobID">ID used to reopen the job after a reconnect.</param>
        /// <param name="clusterName">Cluster name passed to <c>HpcLib.TryConnect</c> when reconnecting.</param>
        /// <param name="jobWaitingParams">Shared state: the job, its last recorded state and the wait handle.</param>
        /// <param name="heartBeatPeriod">Delay before the first tick and interval between ticks, in milliseconds.</param>
        /// <returns>The started timer; the caller is responsible for disposing it.</returns>
        private static Timer HeartbeatTimer(int jobID, string clusterName, JobWaitingParams jobWaitingParams, int heartBeatPeriod)
        {
            Timer timer = new Timer(state =>
            {
                try
                {
                    jobWaitingParams.Job.Refresh();
                }
                catch
                {
                    // Deliberately broad: any failure to refresh is treated as a lost connection.
                    Console.WriteLine("Lost connection to job. Attempting reconnect.");
                    v2008R2.IScheduler scheduler = null;
                    for (int iTry = 0; iTry < 10 && scheduler == null; iTry++)
                    {
                        HpcLib.TryConnect(clusterName, out scheduler);
                    }
                    if (scheduler == null)
                    {
                        Console.WriteLine("Unable to reconnect to cluster. Going back to sleep.");
                    }
                    else
                    {
                        // This runs on a timer callback thread. An exception escaping from here would be
                        // unhandled and bring the process down, so a failed reopen is logged and retried
                        // on the next heartbeat instead.
                        try
                        {
                            var reopenedJob = scheduler.OpenJob(jobID);
                            reopenedJob.Refresh();

                            // Publish the new job only after it refreshed successfully, so a failure
                            // above leaves jobWaitingParams untouched.
                            jobWaitingParams.Job = reopenedJob;
                            SetupJobEventHandler(jobWaitingParams);
                            Console.WriteLine("Reconnect succeeded.");
                        }
                        catch (Exception ex)
                        {
                            Console.WriteLine("Reconnected to cluster but could not reopen job: " + ex.Message + " Going back to sleep.");
                        }
                    }
                }

                // Record the state before signaling so the released waiter observes it.
                jobWaitingParams.JobState = jobWaitingParams.Job.State;

                if (JobIsFinished(jobWaitingParams.Job.State))
                {
                    jobWaitingParams.ManualResetEvent.Set();
                }
            }, null, heartBeatPeriod, heartBeatPeriod);

            return timer;
        }