/// <summary>
/// Puts every failed or canceled task of a job back on the queue and, when the job is not
/// in the Running state afterwards, configures and resubmits the job through the scheduler.
/// Does nothing when the job's counters report neither failed nor canceled tasks.
/// The whole operation runs while holding a lock on <paramref name="job"/>.
/// </summary>
/// <param name="scheduler">Scheduler used to look up the tasks and to configure/submit the job.</param>
/// <param name="job">Job whose failed and canceled tasks are requeued; also the lock object.</param>
public static void RequeueFailedAndCanceledTasks(IScheduler scheduler, ISchedulerJob job)
{
    lock (job)
    {
        // Pull the latest state before reading the counters.
        job.Refresh();
        var taskCounters = job.GetCounters();

        bool nothingToRequeue = taskCounters.FailedTaskCount <= 0 && taskCounters.CanceledTaskCount <= 0;
        if (nothingToRequeue)
        {
            return;
        }

        var tasksToRequeue = GetFailedAndCanceledTasks(scheduler, job);
        foreach (ISchedulerTask pending in tasksToRequeue)
        {
            job.RequeueTask(pending.TaskId);
        }

        // A job that is already running needs no resubmission.
        if (job.State == Microsoft.Hpc.Scheduler.Properties.JobState.Running)
        {
            return;
        }

        scheduler.ConfigureJob(job.Id);
        scheduler.SubmitJob(job, null, null);
    }
}
/// <summary>
/// Builds one tab-separated status line describing <paramref name="job"/>.
/// </summary>
/// <remarks>
/// Columns, in order: timestamp, job id, queued tasks, running tasks, finished tasks,
/// failed tasks, total tasks, progress (finished tasks as a percentage of all tasks),
/// job state, owner, node groups in the form <c>[a,b]</c>, start time, end time,
/// elapsed whole minutes, job name.
/// <para>
/// The timestamp is the job's end time for Finished and Canceled jobs and the current
/// local time otherwise. For jobs that are neither Finished nor Failed, an end time
/// earlier than 1970-01-01 is written as the literal <c>NULL</c> (presumably the
/// scheduler's "not ended yet" placeholder; confirm against the HPC API).
/// Elapsed minutes are measured from start to now for Running jobs, from start to end
/// for Finished and Failed jobs, and are 0 for every other state.
/// </para>
/// </remarks>
/// <param name="job">The scheduler job to report on.</param>
/// <returns>The formatted line with leading and trailing whitespace removed.</returns>
public static string getLogLine(ISchedulerJob job)
{
    const string lineFormat = "{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t{6}\t{7}\t{8}\t{9}\t{10}\t{11}\t{12}\t{13}\t{14}";

    // Read each value once so every column of the line describes the same snapshot.
    DateTime now = DateTime.Now;
    JobState state = job.State;
    DateTime startTime = job.StartTime;
    DateTime endTime = job.EndTime;

    DateTime dateStamp = (state == JobState.Finished || state == JobState.Canceled) ? endTime : now;

    ISchedulerJobCounters counters = job.GetCounters();

    // Guard the division: a job without tasks would otherwise report NaN (0/0) as progress.
    float progress = counters.TaskCount == 0
        ? 0f
        : 100 * ((float)counters.FinishedTaskCount / (float)counters.TaskCount);

    string nodeGroups = "[" + string.Join(",", new List<string>(job.NodeGroups).ToArray()) + "]";

    object endTimeField = endTime;
    int elapsedMinutes = 0;
    if (state == JobState.Finished || state == JobState.Failed)
    {
        // TotalMinutes, not Minutes: Minutes is only the 0-59 component of the span and
        // silently wraps for any job that takes an hour or longer.
        elapsedMinutes = (int)(endTime - startTime).TotalMinutes;
    }
    else
    {
        if (endTime < new DateTime(1970, 1, 1))
        {
            endTimeField = "NULL";
        }

        if (state == JobState.Running)
        {
            elapsedMinutes = (int)(now - startTime).TotalMinutes;
        }
    }

    string line = string.Format(
        lineFormat,
        dateStamp.ToString(hpcDateTimeFormat),
        job.Id,
        counters.QueuedTaskCount,
        counters.RunningTaskCount,
        counters.FinishedTaskCount,
        counters.FailedTaskCount,
        counters.TaskCount,
        progress,
        state,
        job.Owner,
        nodeGroups,
        startTime,
        endTimeField,
        elapsedMinutes,
        job.Name);

    return line.Trim();
}
private void update() { retry: try { Mouse.OverrideCursor = System.Windows.Input.Cursors.Wait; lblID.Content = this.Title = "Experiment #" + id.ToString(); SqlCommand cmd = new SqlCommand("SELECT SubmissionTime,Category," + "(SELECT COUNT(1) FROM JobQueue WHERE ExperimentID=" + id.ToString() + ") as Queued," + "(SELECT COUNT(1) FROM Data WHERE ExperimentID=" + id.ToString() + ") as Finished," + "(SELECT SUM(SAT) FROM Data WHERE ExperimentID=" + id.ToString() + " AND ResultCode=0) as SAT," + "(SELECT SUM(UNSAT) FROM Data WHERE ExperimentID=" + id.ToString() + " AND ResultCode=0) as UNSAT," + "(SELECT SUM(UNKNOWN) FROM Data WHERE ExperimentID=" + id.ToString() + " AND ResultCode=0) as UNKNOWN," + "(SELECT COUNT(1) FROM Data WHERE ExperimentID=" + id.ToString() + " AND ResultCode=0 AND SAT+UNSAT > TargetSAT+TargetUNSAT AND UNKNOWN < TargetUNKNOWN) as OVR," + "(SELECT COUNT(1) FROM Data WHERE ExperimentID=" + id.ToString() + " AND (SAT+UNSAT < TargetSAT+TargetUNSAT OR UNKNOWN > TargetUNKNOWN)) as UDR," + "(SELECT COUNT(1) FROM Data WHERE ExperimentID=" + id.ToString() + " AND ResultCode=3) as BUG," + "(SELECT COUNT(1) FROM Data WHERE ExperimentID=" + id.ToString() + " AND ResultCode=4) as ERROR," + "(SELECT COUNT(1) FROM Data WHERE ExperimentID=" + id.ToString() + " AND ResultCode=5) as TIMEOUT," + "(SELECT COUNT(1) FROM Data WHERE ExperimentID=" + id.ToString() + " AND ResultCode=6) as MEMORYOUT," + "Memout as MaxMem,Timeout as MaxTime,Parameters,Cluster,ClusterJobID,Nodegroup,Locality,Creator,Note,Longparams " + "FROM Experiments WHERE ID=" + id.ToString(), sql); cmd.CommandTimeout = 0; SqlDataReader r = cmd.ExecuteReader(); if (!r.Read()) { throw new Exception("Error reading from SQL connection"); } txtSubmissionTime.Text = ((DateTime)r["SubmissionTime"]).ToString(); txtCategory.Text = (string)r["Category"]; int f = (int)r["Finished"]; int q = (int)r["Queued"]; lblTotal.Content = f + q; lblFinished.Content = f; lblRunning.Content = q; lblRunning.Foreground = (q == 
0) ? System.Windows.Media.Brushes.Green : System.Windows.Media.Brushes.Red; lblSAT.Content = (DBNull.Value.Equals(r["SAT"])) ? 0 : (int)r["SAT"]; lblUNSAT.Content = (DBNull.Value.Equals(r["UNSAT"])) ? 0 : (int)r["UNSAT"]; lblUnknown.Content = (DBNull.Value.Equals(r["UNKNOWN"])) ? 0 : (int)r["UNKNOWN"]; lblOver.Content = (DBNull.Value.Equals(r["OVR"])) ? 0 : (int)r["OVR"]; lblUnder.Content = (DBNull.Value.Equals(r["UDR"])) ? 0 : (int)r["UDR"]; int bugs = (int)r["BUG"];; int prob = (int)r["ERROR"]; int toed = (int)r["TIMEOUT"]; int memoed = (int)r["MEMORYOUT"]; lblBug.Content = bugs; lblBug.Foreground = (bugs == 0) ? System.Windows.Media.Brushes.Black : System.Windows.Media.Brushes.Red; lblNonzero.Content = prob; lblNonzero.Foreground = (prob == 0) ? System.Windows.Media.Brushes.Black : System.Windows.Media.Brushes.Red; lblMemdout.Content = memoed; lblMemdout.Foreground = (memoed == 0) ? System.Windows.Media.Brushes.Black : System.Windows.Media.Brushes.Red; lblTimedout.Content = toed; lblTimedout.Foreground = (toed == 0) ? System.Windows.Media.Brushes.Black : System.Windows.Media.Brushes.Red; txtTimeout.Text = (string)r["MaxTime"]; txtMemout.Text = (string)r["MaxMem"]; if (r["Parameters"].Equals(DBNull.Value)) { txtParameters.Text = (string)r["Longparams"]; } else { txtParameters.Text = (string)r["Parameters"]; } string cluster = (string)r["Cluster"]; txtCluster.Text = cluster; int clusterJobID = (DBNull.Value.Equals(r["ClusterJobID"])) ? 0 : (int)r["ClusterJobID"]; txtNodeGroup.Text = (string)r["Nodegroup"]; txtLocality.Text = (string)r["Locality"]; txtCreator.Text = (string)r["Creator"]; txtNote.Text = (DBNull.Value.Equals(r["Note"])) ? 
"" : (string)r["Note"]; r.Close(); if (cluster != "" && clusterJobID != 0) { try { Scheduler scheduler = new Scheduler(); scheduler.Connect(cluster); ISchedulerJob job = scheduler.OpenJob(Convert.ToInt32(clusterJobID)); JobState state = job.State; lblClusterStatus.Content = state.ToString(); if (state == JobState.Running) { lblClusterStatus.Content += " (" + job.GetCounters().RunningTaskCount + " wrkrs)"; } if (state == JobState.Running || state == JobState.Queued || state == JobState.Validating || state == JobState.Finished || state == JobState.Finishing || state == JobState.Submitted || state == JobState.ExternalValidation) { lblClusterStatus.Foreground = System.Windows.Media.Brushes.Green; } else { lblClusterStatus.Foreground = System.Windows.Media.Brushes.Red; } } catch (SchedulerException) { lblClusterStatus.Content = "Job ID unknown to the cluster. Assume finished."; lblClusterStatus.Foreground = System.Windows.Media.Brushes.Orange; } catch { lblClusterStatus.Content = "Unable to retrieve status."; lblClusterStatus.Foreground = System.Windows.Media.Brushes.Orange; } } else { lblClusterStatus.Content = "Unable to retrieve status."; lblClusterStatus.Foreground = System.Windows.Media.Brushes.Black; } cmd = new SqlCommand("SELECT COUNT(*) FROM Data WHERE ExperimentID=" + id.ToString() + " AND ResultCode=4 AND (stderr LIKE '%INFRASTRUCTURE ERROR%' OR ReturnValue=-1073741515);", sql); r = cmd.ExecuteReader(); if (r.Read()) { int ierrs = (int)r[0]; if (ierrs == 0) { lblInfrastructureErrors.Content = ""; } else { lblInfrastructureErrors.Content = ierrs.ToString() + " infrastructure errors!"; } } r.Close(); } catch (SqlException ex) { if (ex.Number == -2) /* timeout */ goto { retry; }
/// <summary>
/// Re-reads the underlying scheduler job and updates this entry's job state, task counters,
/// failed-task list and (once the job is finished) wall and CPU time. Raises the task-state
/// and/or job-state changed events when the refreshed values differ from the previous ones.
/// Does nothing when no job is attached.
/// </summary>
public void RefreshJobStatus()
{
    if (_job == null)
    {
        return;
    }

    // Shallow snapshot of the current values, used below to detect what changed.
    LogEntry before = (LogEntry)this.MemberwiseClone();

    _job.Refresh();
    JobState = _job.State;

    ISchedulerJobCounters taskCounters = _job.GetCounters();
    string statusText = string.Format(
        "{0}/{1}/{2}/{3}",
        taskCounters.QueuedTaskCount,
        taskCounters.RunningTaskCount,
        taskCounters.FailedTaskCount,
        taskCounters.FinishedTaskCount);
    FailedTaskCount = taskCounters.FailedTaskCount;
    TaskStatus = statusText;

    // Failed task ids rendered as a range string; empty when nothing failed.
    string failedTasksText = "";
    if (FailedTaskCount > 0)
    {
        IEnumerable<ISchedulerTask> failed = GetFailedTasks(_job);
        string joinedIds = failed.Select(t => t.TaskId.JobTaskId).StringJoin(",");
        if ("" != joinedIds)
        {
            failedTasksText = RangeCollection.Parse(joinedIds).ToString();
        }
    }
    FailedTasks = failedTasksText;

    if (JobState == JobState.Finished)
    {
        // Both durations are only filled in while they are still zero.
        if (WallTime.Ticks == 0)
        {
            DateTime submitted = _job.SubmitTime;
            DateTime ended = _job.EndTime;
            WallTime = ended - submitted;
        }

        if (CpuTime.Ticks == 0)
        {
            var allTasks = _job.GetTaskList(null, null, true).Cast<ISchedulerTask>();
            long cpuTicks = allTasks.Sum(t => (t.EndTime - t.StartTime).Ticks);
            CpuTime = TimeSpan.FromTicks(cpuTicks);
        }
    }

    // Evaluate both flags before raising anything so handlers cannot influence the comparison.
    bool taskStateChanged =
        FailedTasks != before.FailedTasks
        || TaskStatus != before.TaskStatus;

    bool jobStateChanged =
        JobState != before.JobState
        || (FailedTaskCount == 0) != (before.FailedTaskCount == 0)
        || string.IsNullOrEmpty(FailedTasks) != string.IsNullOrEmpty(before.FailedTasks)
        || CpuTime != before.CpuTime
        || WallTime != before.WallTime;

    if (taskStateChanged)
    {
        RaiseTaskStateChangedEvent();
    }

    if (jobStateChanged)
    {
        RaiseJobStateChangedEvent();
    }
}