Ejemplo n.º 1
0
        /// <summary>
        /// Sends every failed or canceled task of <paramref name="job"/> back to the queue.
        /// Returns without requeueing anything when the job's counters report neither failed nor canceled tasks.
        /// The whole operation runs while holding a lock on <paramref name="job"/>.
        /// </summary>
        /// <param name="scheduler">Scheduler used to look up the tasks and, when needed, to configure and resubmit the job.</param>
        /// <param name="job">Job whose failed and canceled tasks are requeued; it also serves as the lock object.</param>
        public static void RequeueFailedAndCanceledTasks(IScheduler scheduler, ISchedulerJob job)
        {
            lock (job)
            {
                // Refresh first so the counters read below are not stale.
                job.Refresh();
                var taskCounts = job.GetCounters();

                bool nothingToRequeue = taskCounts.FailedTaskCount <= 0 && taskCounts.CanceledTaskCount <= 0;
                if (nothingToRequeue)
                {
                    return;
                }

                foreach (ISchedulerTask stalledTask in GetFailedAndCanceledTasks(scheduler, job))
                {
                    job.RequeueTask(stalledTask.TaskId);
                }

                // A job that is still running needs nothing more; any other state is
                // configured and submitted again after the requeue.
                if (job.State == Microsoft.Hpc.Scheduler.Properties.JobState.Running)
                {
                    return;
                }

                scheduler.ConfigureJob(job.Id);
                scheduler.SubmitJob(job, null, null);
            }
        }
Ejemplo n.º 2
0
        /// <summary>
        /// Builds one tab-separated status line describing <paramref name="job"/>.
        /// </summary>
        /// <remarks>
        /// Columns, in order: timestamp, job id, queued / running / finished / failed / total task counts,
        /// progress (percentage of finished tasks), job state, owner, node groups in square brackets,
        /// start time, end time (or the literal "NULL"), elapsed whole minutes, job name.
        /// The timestamp is the job's end time for finished and canceled jobs and the current time otherwise.
        /// </remarks>
        /// <param name="job">Job whose state and task counters are reported.</param>
        /// <returns>The formatted line with leading and trailing whitespace removed.</returns>
        public static string getLogLine(ISchedulerJob job)
        {
            DateTime now   = DateTime.Now;
            JobState state = job.State;

            DateTime dateStamp = (state == JobState.Finished || state == JobState.Canceled) ? job.EndTime : now;

            ISchedulerJobCounters counters = job.GetCounters();

            // Guard the division: a job without tasks would otherwise report "NaN" as its progress.
            float progress = 0f;
            if (counters.TaskCount > 0)
            {
                progress = 100 * ((float)counters.FinishedTaskCount / (float)counters.TaskCount);
            }

            string nodeGroups = "[" + string.Join(",", new List <string>(job.NodeGroups).ToArray()) + "]";

            // An end time before 1970 is treated as "not set yet" and printed as NULL,
            // except for finished and failed jobs, whose end time is always printed as-is.
            bool   endTimeUnset   = job.EndTime < new DateTime(1970, 1, 1);
            object endTimeColumn  = job.EndTime;
            long   elapsedMinutes = 0;

            // TotalMinutes (truncated) is used instead of TimeSpan.Minutes: the latter is only the
            // 0-59 minute component and wraps around for jobs that run for an hour or longer.
            switch (state)
            {
                case JobState.Running:
                    if (endTimeUnset)
                    {
                        endTimeColumn = "NULL";
                    }
                    elapsedMinutes = (long)(now - job.StartTime).TotalMinutes;
                    break;

                case JobState.Finished:
                case JobState.Failed:
                    elapsedMinutes = (long)(job.EndTime - job.StartTime).TotalMinutes;
                    break;

                default:
                    // Any other state reports zero elapsed minutes.
                    if (endTimeUnset)
                    {
                        endTimeColumn = "NULL";
                    }
                    break;
            }

            string line = string.Format("{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t{6}\t{7}\t{8}\t{9}\t{10}\t{11}\t{12}\t{13}\t{14}",
                                        dateStamp.ToString(hpcDateTimeFormat),
                                        job.Id,
                                        counters.QueuedTaskCount,
                                        counters.RunningTaskCount,
                                        counters.FinishedTaskCount,
                                        counters.FailedTaskCount,
                                        counters.TaskCount,
                                        progress,
                                        state,
                                        job.Owner,
                                        nodeGroups,
                                        job.StartTime,
                                        endTimeColumn,
                                        elapsedMinutes,
                                        job.Name);

            return(line.Trim());
        }
        private void update()
        {
retry:
            try
            {
                Mouse.OverrideCursor = System.Windows.Input.Cursors.Wait;

                lblID.Content = this.Title = "Experiment #" + id.ToString();

                SqlCommand cmd = new SqlCommand("SELECT SubmissionTime,Category," +
                                                "(SELECT COUNT(1) FROM JobQueue WHERE ExperimentID=" + id.ToString() + ") as Queued," +
                                                "(SELECT COUNT(1) FROM Data WHERE ExperimentID=" + id.ToString() + ") as Finished," +
                                                "(SELECT SUM(SAT) FROM Data WHERE ExperimentID=" + id.ToString() + " AND ResultCode=0) as SAT," +
                                                "(SELECT SUM(UNSAT) FROM Data WHERE ExperimentID=" + id.ToString() + " AND ResultCode=0) as UNSAT," +
                                                "(SELECT SUM(UNKNOWN) FROM Data WHERE ExperimentID=" + id.ToString() + " AND ResultCode=0) as UNKNOWN," +
                                                "(SELECT COUNT(1) FROM Data WHERE ExperimentID=" + id.ToString() + " AND ResultCode=0 AND SAT+UNSAT > TargetSAT+TargetUNSAT AND UNKNOWN < TargetUNKNOWN) as OVR," +
                                                "(SELECT COUNT(1) FROM Data WHERE ExperimentID=" + id.ToString() + " AND (SAT+UNSAT < TargetSAT+TargetUNSAT OR UNKNOWN > TargetUNKNOWN)) as UDR," +
                                                "(SELECT COUNT(1) FROM Data WHERE ExperimentID=" + id.ToString() + " AND ResultCode=3) as BUG," +
                                                "(SELECT COUNT(1) FROM Data WHERE ExperimentID=" + id.ToString() + " AND ResultCode=4) as ERROR," +
                                                "(SELECT COUNT(1) FROM Data WHERE ExperimentID=" + id.ToString() + " AND ResultCode=5) as TIMEOUT," +
                                                "(SELECT COUNT(1) FROM Data WHERE ExperimentID=" + id.ToString() + " AND ResultCode=6) as MEMORYOUT," +
                                                "Memout as MaxMem,Timeout as MaxTime,Parameters,Cluster,ClusterJobID,Nodegroup,Locality,Creator,Note,Longparams " +
                                                "FROM Experiments WHERE ID=" + id.ToString(), sql);
                cmd.CommandTimeout = 0;
                SqlDataReader r = cmd.ExecuteReader();
                if (!r.Read())
                {
                    throw new Exception("Error reading from SQL connection");
                }

                txtSubmissionTime.Text = ((DateTime)r["SubmissionTime"]).ToString();
                txtCategory.Text       = (string)r["Category"];

                int f = (int)r["Finished"];
                int q = (int)r["Queued"];
                lblTotal.Content    = f + q;
                lblFinished.Content = f;
                lblRunning.Content  = q;

                lblRunning.Foreground = (q == 0) ? System.Windows.Media.Brushes.Green :
                                        System.Windows.Media.Brushes.Red;

                lblSAT.Content     = (DBNull.Value.Equals(r["SAT"])) ? 0 : (int)r["SAT"];
                lblUNSAT.Content   = (DBNull.Value.Equals(r["UNSAT"])) ? 0 : (int)r["UNSAT"];
                lblUnknown.Content = (DBNull.Value.Equals(r["UNKNOWN"])) ? 0 : (int)r["UNKNOWN"];
                lblOver.Content    = (DBNull.Value.Equals(r["OVR"])) ? 0 : (int)r["OVR"];
                lblUnder.Content   = (DBNull.Value.Equals(r["UDR"])) ? 0 : (int)r["UDR"];

                int bugs   = (int)r["BUG"];;
                int prob   = (int)r["ERROR"];
                int toed   = (int)r["TIMEOUT"];
                int memoed = (int)r["MEMORYOUT"];

                lblBug.Content    = bugs;
                lblBug.Foreground = (bugs == 0) ? System.Windows.Media.Brushes.Black :
                                    System.Windows.Media.Brushes.Red;
                lblNonzero.Content    = prob;
                lblNonzero.Foreground = (prob == 0) ? System.Windows.Media.Brushes.Black :
                                        System.Windows.Media.Brushes.Red;

                lblMemdout.Content    = memoed;
                lblMemdout.Foreground = (memoed == 0) ? System.Windows.Media.Brushes.Black :
                                        System.Windows.Media.Brushes.Red;

                lblTimedout.Content    = toed;
                lblTimedout.Foreground = (toed == 0) ? System.Windows.Media.Brushes.Black :
                                         System.Windows.Media.Brushes.Red;


                txtTimeout.Text = (string)r["MaxTime"];
                txtMemout.Text  = (string)r["MaxMem"];
                if (r["Parameters"].Equals(DBNull.Value))
                {
                    txtParameters.Text = (string)r["Longparams"];
                }
                else
                {
                    txtParameters.Text = (string)r["Parameters"];
                }
                string cluster = (string)r["Cluster"];
                txtCluster.Text = cluster;
                int clusterJobID = (DBNull.Value.Equals(r["ClusterJobID"])) ? 0 : (int)r["ClusterJobID"];
                txtNodeGroup.Text = (string)r["Nodegroup"];
                txtLocality.Text  = (string)r["Locality"];
                txtCreator.Text   = (string)r["Creator"];
                txtNote.Text      = (DBNull.Value.Equals(r["Note"])) ? "" : (string)r["Note"];

                r.Close();

                if (cluster != "" && clusterJobID != 0)
                {
                    try
                    {
                        Scheduler scheduler = new Scheduler();
                        scheduler.Connect(cluster);
                        ISchedulerJob job   = scheduler.OpenJob(Convert.ToInt32(clusterJobID));
                        JobState      state = job.State;
                        lblClusterStatus.Content = state.ToString();
                        if (state == JobState.Running)
                        {
                            lblClusterStatus.Content += " (" + job.GetCounters().RunningTaskCount + " wrkrs)";
                        }
                        if (state == JobState.Running || state == JobState.Queued ||
                            state == JobState.Validating || state == JobState.Finished ||
                            state == JobState.Finishing || state == JobState.Submitted ||
                            state == JobState.ExternalValidation)
                        {
                            lblClusterStatus.Foreground = System.Windows.Media.Brushes.Green;
                        }
                        else
                        {
                            lblClusterStatus.Foreground = System.Windows.Media.Brushes.Red;
                        }
                    }
                    catch (SchedulerException) {
                        lblClusterStatus.Content    = "Job ID unknown to the cluster. Assume finished.";
                        lblClusterStatus.Foreground = System.Windows.Media.Brushes.Orange;
                    }
                    catch
                    {
                        lblClusterStatus.Content    = "Unable to retrieve status.";
                        lblClusterStatus.Foreground = System.Windows.Media.Brushes.Orange;
                    }
                }
                else
                {
                    lblClusterStatus.Content    = "Unable to retrieve status.";
                    lblClusterStatus.Foreground = System.Windows.Media.Brushes.Black;
                }

                cmd = new SqlCommand("SELECT COUNT(*) FROM Data WHERE ExperimentID=" + id.ToString() + " AND ResultCode=4 AND (stderr LIKE '%INFRASTRUCTURE ERROR%' OR ReturnValue=-1073741515);", sql);
                r   = cmd.ExecuteReader();
                if (r.Read())
                {
                    int ierrs = (int)r[0];
                    if (ierrs == 0)
                    {
                        lblInfrastructureErrors.Content = "";
                    }
                    else
                    {
                        lblInfrastructureErrors.Content = ierrs.ToString() + " infrastructure errors!";
                    }
                }
                r.Close();
            }
            catch (SqlException ex)
            {
                if (ex.Number == -2) /* timeout */ goto {
                    retry;
                }
Ejemplo n.º 4
0
        /// <summary>
        /// Re-reads the scheduler job behind this entry and updates the cached state:
        /// job state, task status string ("queued/running/failed/finished" counts),
        /// failed task count and failed task ids, and — once the job has finished —
        /// wall time and CPU time if they are still zero.
        /// Raises the task-state and/or job-state changed events when the new values
        /// differ from a snapshot taken before the refresh. Does nothing when no job is attached.
        /// </summary>
        public void RefreshJobStatus()
        {
            if (_job == null)
            {
                return;
            }

            // Shallow copy of the current values, used afterwards to detect what changed.
            LogEntry snapshot = (LogEntry)this.MemberwiseClone();

            _job.Refresh();
            JobState = _job.State;

            ISchedulerJobCounters taskCounts = _job.GetCounters();
            string countsSummary = string.Format("{0}/{1}/{2}/{3}",
                                                 taskCounts.QueuedTaskCount,
                                                 taskCounts.RunningTaskCount,
                                                 taskCounts.FailedTaskCount,
                                                 taskCounts.FinishedTaskCount);
            FailedTaskCount = taskCounts.FailedTaskCount;
            TaskStatus      = countsSummary;

            // Stays empty unless there are failed tasks with at least one id to report.
            string failedRange = "";
            if (FailedTaskCount > 0)
            {
                IEnumerable <ISchedulerTask> failed = GetFailedTasks(_job);
                string joinedIds = failed.Select(t => t.TaskId.JobTaskId).StringJoin(",");
                if (joinedIds != "")
                {
                    failedRange = RangeCollection.Parse(joinedIds).ToString();
                }
            }
            FailedTasks = failedRange;

            // Timings are filled in only once, after the job has finished.
            bool finished = JobState == JobState.Finished;
            if (finished && WallTime.Ticks == 0)
            {
                // Wall time is measured from submission, not from the start of execution.
                DateTime submitted = _job.SubmitTime;
                DateTime ended     = _job.EndTime;
                WallTime = ended - submitted;
            }
            if (finished && CpuTime.Ticks == 0)
            {
                var allTasks = _job.GetTaskList(null, null, true).Cast <ISchedulerTask>();
                CpuTime = new TimeSpan(allTasks.Sum(t => (t.EndTime - t.StartTime).Ticks));
            }

            // Both flags are computed before any event is raised.
            bool tasksDiffer = FailedTasks != snapshot.FailedTasks
                               || TaskStatus != snapshot.TaskStatus;
            bool jobDiffers  = JobState != snapshot.JobState
                               || (FailedTaskCount == 0) != (snapshot.FailedTaskCount == 0)
                               || string.IsNullOrEmpty(FailedTasks) != string.IsNullOrEmpty(snapshot.FailedTasks)
                               || CpuTime != snapshot.CpuTime
                               || WallTime != snapshot.WallTime;

            if (tasksDiffer)
            {
                RaiseTaskStateChangedEvent();
            }
            if (jobDiffers)
            {
                RaiseJobStateChangedEvent();
            }
        }