/// <summary>
/// Run the failure diagnosis on every job in a list and collect the resulting logs.
/// Null entries in the list are skipped; they still count towards the total used to compute progress.
/// Cancellation is checked through manager.Token before each job is diagnosed.
/// </summary>
/// <param name="jobs">Jobs to diagnose.</param>
/// <param name="config">Cluster configuration.</param>
/// <param name="manager">Communication manager; receives status and progress (0-100) updates.</param>
/// <returns>One diagnosis log per non-null job, in input order.</returns>
public static List<DiagnosisLog> DiagnoseJobs(IEnumerable<DryadLinqJobSummary> jobs, ClusterConfiguration config, CommManager manager)
{
    // Materialize once so the sequence is not enumerated twice (count + loop).
    DryadLinqJobSummary[] summaries = jobs as DryadLinqJobSummary[];
    if (summaries == null)
    {
        summaries = jobs.ToArray();
    }

    int total = summaries.Length;
    int diagnosed = 0;
    var logs = new List<DiagnosisLog>();

    for (int index = 0; index < total; index++)
    {
        DryadLinqJobSummary summary = summaries[index];
        if (summary == null)
        {
            continue;
        }

        manager.Token.ThrowIfCancellationRequested();
        JobFailureDiagnosis diagnosis = JobFailureDiagnosis.CreateJobFailureDiagnosis(config, summary, manager);
        manager.Status("Diagnosing " + summary.ShortName(), StatusKind.LongOp);
        logs.Add(diagnosis.Diagnose());

        diagnosed++;
        manager.Progress(diagnosed * 100 / total);
    }

    manager.Status("Diagnosis complete", StatusKind.OK);
    return logs;
}
/// <summary>
/// Rebuild the clusterJobs dictionary from the job summaries stored as *.xml files
/// in the "jobs" folder under CachedClusterResidentObject.CacheDirectory.
/// When no cache directory is configured the dictionary is left empty and no progress is reported.
/// </summary>
/// <param name="virtualCluster">Unused.</param>
/// <param name="manager">Communication manager.</param>
protected override void RecomputeClusterJobList(string virtualCluster, CommManager manager)
{
    this.clusterJobs = new Dictionary<string, ClusterJobInformation>();

    if (string.IsNullOrEmpty(CachedClusterResidentObject.CacheDirectory))
    {
        return;
    }

    string jobsFolder = Path.Combine(CachedClusterResidentObject.CacheDirectory, "jobs");
    if (!Directory.Exists(jobsFolder))
    {
        Directory.CreateDirectory(jobsFolder);
    }

    foreach (string summaryFile in Directory.GetFiles(jobsFolder, "*.xml"))
    {
        manager.Token.ThrowIfCancellationRequested();
        DryadLinqJobSummary job = Utilities.LoadXml<DryadLinqJobSummary>(summaryFile);

        // The cluster name is part of the key: there may be two jobs with the same id from different clusters.
        string key = job.Cluster + "-" + job.ClusterJobId;
        ClusterJobInformation info = new ClusterJobInformation(
            this.Config.Name,
            job.Cluster,
            key,
            job.Name,
            job.User,
            job.Date,
            job.EndTime - job.Date,
            job.Status);
        info.SetAssociatedSummary(job);

        if (!this.clusterJobs.ContainsKey(key))
        {
            this.clusterJobs.Add(key, info);
        }
        else
        {
            // The first job seen for a key wins; later ones are only reported.
            manager.Status("Duplicate job id, cannot insert in cache " + job.AsIdentifyingString(), StatusKind.Error);
        }
    }

    manager.Progress(100);
}
/// <summary>
/// Try to find the job information from cluster and summary.
/// On success both Job and StaticPlan are set; on failure an error status is reported
/// and neither is modified.
/// </summary>
/// <param name="manager">Communication manager.</param>
protected void FindJobInfo(CommManager manager)
{
    // 'true' asks the factory to fill in the job information right away.
    DryadLinqJobInfo collected = DryadLinqJobInfo.CreateDryadLinqJobInfo(this.cluster, this.Summary, true, manager);

    if (collected == null)
    {
        manager.Status("Cannot collect information for " + Summary.ShortName() + " to diagnose", StatusKind.Error);
    }
    else
    {
        this.Job = collected;
        this.StaticPlan = JobObjectModel.DryadJobStaticPlan.CreatePlan(collected, manager);
    }
}
/// <summary>
/// Cancel a list of jobs. Every job is attempted, even when an earlier cancellation fails
/// or the job does not appear to be running; a failure is reported through the manager
/// and traced, and does not stop the loop.
/// </summary>
/// <param name="jobs">Jobs to cancel.</param>
/// <param name="cluster">Cluster where the jobs are running.</param>
/// <param name="manager">Communication manager.</param>
/// <returns>True if all cancellations succeed.</returns>
// ReSharper disable once UnusedParameter.Global
public static bool CancelJobs(IEnumerable<DryadLinqJobSummary> jobs, ClusterStatus cluster, CommManager manager)
{
    bool allCancelled = true;

    foreach (DryadLinqJobSummary job in jobs)
    {
        manager.Token.ThrowIfCancellationRequested();

        if (job.Status != ClusterJobInformation.ClusterJobStatus.Running)
        {
            manager.Status("Job " + job.Name + " does not appear to be running; will still try to cancel", StatusKind.Error);
        }

        // Stays false when CancelJob throws; the exception text is appended to the failure status.
        bool cancelled = false;
        string failureReason = "";
        try
        {
            cancelled = cluster.CancelJob(job);
        }
        catch (Exception ex)
        {
            failureReason = ex.Message;
            Trace.TraceInformation(ex.ToString());
        }

        if (!cancelled)
        {
            manager.Status("Cancellation of " + job.Name + " failed " + failureReason, StatusKind.Error);
            allCancelled = false;
        }
        else
        {
            manager.Status("Job " + job.Name + " cancelled", StatusKind.OK);
        }
    }

    return allCancelled;
}
/// <summary>
/// Run the failure diagnosis on every job in a list and collect the resulting logs.
/// Null entries in the list are skipped; they still count towards the total used to compute progress.
/// Cancellation is checked through manager.Token before each job is diagnosed.
/// </summary>
/// <param name="jobs">Jobs to diagnose.</param>
/// <param name="config">Cluster configuration.</param>
/// <param name="manager">Communication manager; receives status and progress (0-100) updates.</param>
/// <returns>One diagnosis log per non-null job, in input order.</returns>
public static List<DiagnosisLog> DiagnoseJobs(IEnumerable<DryadLinqJobSummary> jobs, ClusterConfiguration config, CommManager manager)
{
    // Materialize once so the sequence is not enumerated twice (count + loop).
    DryadLinqJobSummary[] summaries = jobs as DryadLinqJobSummary[];
    if (summaries == null)
    {
        summaries = jobs.ToArray();
    }

    int total = summaries.Length;
    int diagnosed = 0;
    var logs = new List<DiagnosisLog>();

    for (int index = 0; index < total; index++)
    {
        DryadLinqJobSummary summary = summaries[index];
        if (summary == null)
        {
            continue;
        }

        manager.Token.ThrowIfCancellationRequested();
        JobFailureDiagnosis diagnosis = JobFailureDiagnosis.CreateJobFailureDiagnosis(config, summary, manager);
        manager.Status("Diagnosing " + summary.ShortName(), StatusKind.LongOp);
        logs.Add(diagnosis.Diagnose());

        diagnosed++;
        manager.Progress(diagnosed * 100 / total);
    }

    manager.Status("Diagnosis complete", StatusKind.OK);
    return logs;
}
/// <summary>
/// Cancel a list of jobs. Every job is attempted, even when an earlier cancellation fails
/// or the job does not appear to be running; a failure is reported through the manager
/// and traced, and does not stop the loop.
/// </summary>
/// <param name="jobs">Jobs to cancel.</param>
/// <param name="cluster">Cluster where the jobs are running.</param>
/// <param name="manager">Communication manager.</param>
/// <returns>True if all cancellations succeed.</returns>
// ReSharper disable once UnusedParameter.Global
public static bool CancelJobs(IEnumerable<DryadLinqJobSummary> jobs, ClusterStatus cluster, CommManager manager)
{
    bool allCancelled = true;

    foreach (DryadLinqJobSummary job in jobs)
    {
        manager.Token.ThrowIfCancellationRequested();

        if (job.Status != ClusterJobInformation.ClusterJobStatus.Running)
        {
            manager.Status("Job " + job.Name + " does not appear to be running; will still try to cancel", StatusKind.Error);
        }

        // Stays false when CancelJob throws; the exception text is appended to the failure status.
        bool cancelled = false;
        string failureReason = "";
        try
        {
            cancelled = cluster.CancelJob(job);
        }
        catch (Exception ex)
        {
            failureReason = ex.Message;
            Trace.TraceInformation(ex.ToString());
        }

        if (!cancelled)
        {
            manager.Status("Cancellation of " + job.Name + " failed " + failureReason, StatusKind.Error);
            allCancelled = false;
        }
        else
        {
            manager.Status("Job " + job.Name + " cancelled", StatusKind.OK);
        }
    }

    return allCancelled;
}
/// <summary>
/// Rebuild the clusterJobs dictionary from the job summaries stored as *.xml files
/// in the "jobs" folder under CachedClusterResidentObject.CacheDirectory.
/// When no cache directory is configured the dictionary is left empty and no progress is reported.
/// </summary>
/// <param name="virtualCluster">Unused.</param>
/// <param name="manager">Communication manager.</param>
protected override void RecomputeClusterJobList(string virtualCluster, CommManager manager)
{
    this.clusterJobs = new Dictionary<string, ClusterJobInformation>();

    if (string.IsNullOrEmpty(CachedClusterResidentObject.CacheDirectory))
    {
        return;
    }

    string jobsFolder = Path.Combine(CachedClusterResidentObject.CacheDirectory, "jobs");
    if (!Directory.Exists(jobsFolder))
    {
        Directory.CreateDirectory(jobsFolder);
    }

    foreach (string summaryFile in Directory.GetFiles(jobsFolder, "*.xml"))
    {
        manager.Token.ThrowIfCancellationRequested();
        DryadLinqJobSummary job = Utilities.LoadXml<DryadLinqJobSummary>(summaryFile);

        // The cluster name is part of the key: there may be two jobs with the same id from different clusters.
        string key = job.Cluster + "-" + job.ClusterJobId;
        ClusterJobInformation info = new ClusterJobInformation(
            this.Config.Name,
            job.Cluster,
            key,
            job.Name,
            job.User,
            job.Date,
            job.EndTime - job.Date,
            job.Status);
        info.SetAssociatedSummary(job);

        if (!this.clusterJobs.ContainsKey(key))
        {
            this.clusterJobs.Add(key, info);
        }
        else
        {
            // The first job seen for a key wins; later ones are only reported.
            manager.Status("Duplicate job id, cannot insert in cache " + job.AsIdentifyingString(), StatusKind.Error);
        }
    }

    manager.Progress(100);
}
/// <summary>
/// Cache the vertices in the list; executed on the background thread.
/// Progress is reported as a percentage (0-100) after each vertex.
/// </summary>
/// <param name="config">Cluster configuration.</param>
/// <param name="summary">Job to cache.</param>
/// <param name="vertices">Vertices to cache.</param>
/// <param name="manager">Communication manager.</param>
/// <returns>True: success.</returns>
private static bool CacheAllVertices(
    ClusterConfiguration config,
    DryadLinqJobSummary summary,
    List<ExecutedVertexInstance> vertices,
    CommManager manager)
{
    int done = 0;
    int todo = vertices.Count;
    int files = 0;

    manager.Status("Caching data for " + todo + " vertices", StatusKind.LongOp);
    foreach (ExecutedVertexInstance v in vertices)
    {
        files += CacheVertexInfo(config, summary, v);
        done++;

        // Scale before dividing: plain done / todo is integer division and would
        // report 0 until the very last vertex. todo is never 0 here because the
        // loop body only runs when the list is non-empty.
        manager.Progress(done * 100 / todo);
    }

    manager.Progress(100);
    manager.Status("Cached " + files + " files", StatusKind.OK);
    return true;
}
/// <summary>
/// Get the contents of a specified cluster resident object.
/// For a folder the result is a listing with one line per entry returned by
/// GetFilesAndFolders(pattern); for any other object its stream is read to the end.
/// </summary>
/// <param name="manager">Communication manager; supplies the cancellation token and receives status messages.</param>
/// <param name="path">Cluster object whose contents is read.</param>
/// <param name="pattern">Pattern to filter contents, for folders.</param>
/// <returns>
/// The file contents. When path is null, or the object, one of its entries or its stream
/// reports an exception, the result is built from a description string only.
/// </returns>
private static FileContents GetContents(CommManager manager, IClusterResidentObject path, string pattern)
{
    if (path == null)
    {
        return new FileContents("Null path");
    }

    // The object itself is always the first entry in the link table.
    var links = new Dictionary<string, IClusterResidentObject>();
    links.Add(path.ToString(), path);

    string description = (path.RepresentsAFolder ? "Folder " : "") + path;
    if (path.Exception != null)
    {
        description += " [Error accessing: " + path.Exception.Message + "]";
        return new FileContents(description);
    }

    if (!path.RepresentsAFolder)
    {
        // Plain object: return the whole stream.
        manager.Status("Extracting contents of " + path, StatusKind.LongOp);
        ISharedStreamReader reader = path.GetStream();
        if (reader.Exception != null)
        {
            description += " [Error " + reader.Exception.Message + "]";
            return new FileContents(description);
        }

        if (path.Size == 0)
        {
            description += "[empty]";
        }

        var text = reader.ReadToEnd(manager.Token);
        return new FileContents(text, description, links);
    }

    // Folder: one line per entry, each entry also registered under its file:// link.
    var listing = new StringBuilder();
    int shown = 0;
    foreach (IClusterResidentObject entry in path.GetFilesAndFolders(pattern))
    {
        manager.Token.ThrowIfCancellationRequested();
        if (entry.Exception != null)
        {
            // A single failing entry aborts the listing.
            description += " [Error " + entry.Exception.Message + "]";
            return new FileContents(description);
        }

        // Sub-folders show "d" in the column where files show their size.
        string row = entry.RepresentsAFolder
            ? string.Format("{0:u} {1,16} file://{2}", entry.CreationTime, "d", entry.Name)
            : string.Format("{0:u} {1,16:N0} file://{2}", entry.CreationTime, entry.Size, entry.Name);
        listing.AppendLine(row);

        links.Add("file://" + entry.Name, entry);
        shown++;
    }

    if (shown == 0)
    {
        description += "[empty]";
    }

    return new FileContents(listing.ToString(), description, links);
}
/// <summary>
/// Fill the job info by parsing the stdout.txt.
/// Creates the job manager vertex on first use, parses the stdout, and then reconciles
/// the state of vertices still marked as started with the status of the job.
/// </summary>
/// <param name="manager">Communication manager.</param>
/// <returns>True if the stdout was parsed; false if there is no stdout path or parsing failed.</returns>
public bool CollectEssentialInformation(CommManager manager)
{
    this.RefreshJobStatus(manager);

    if (this.ManagerVertex == null)
    {
        // First call: synthesize a vertex standing for the job manager itself.
        this.ManagerVertex = new ExecutedVertexInstance(this, -1, 0, "JobManager", "", this.Summary.Date);
        this.ManagerVertex.IsManager = true;
        this.ManagerVertex.SetStartInformation(this, this.Summary.Machine, this.Summary.Date, this.Summary.ManagerProcessGuid, "");
        this.ManagerVertex.StartCommandTime = this.ManagerVertex.CreationTime = this.ManagerVertex.VertexScheduleTime = this.Summary.Date;

        // Only a failed job changes the initial state; every other status starts as Started.
        // (A Succeeded -> Successful mapping was present in the original but commented out.)
        ExecutedVertexInstance.VertexState initialState =
            this.Summary.Status == ClusterJobInformation.ClusterJobStatus.Failed
                ? ExecutedVertexInstance.VertexState.Failed
                : ExecutedVertexInstance.VertexState.Started;
        this.ManagerVertex.SetState(initialState);
        this.jobVertices.Add(this.ManagerVertex);
    }

    if (this.stdoutpath == null)
    {
        return false;
    }

    bool parsed = this.ParseStdout(this.stdoutpath, manager);
    manager.Progress(100);
    if (!parsed)
    {
        return false;
    }

    this.JobInfoCannotBeCollected = false;
    manager.Status("Stdout parsed", StatusKind.OK);
    this.LastUpdatetime = DateTime.Now;

    if (this.Summary.Status == ClusterJobInformation.ClusterJobStatus.Running)
    {
        // Job still running: started vertices were running as of this refresh.
        foreach (var vertex in this.Vertices)
        {
            if (vertex.State == ExecutedVertexInstance.VertexState.Started)
            {
                vertex.MarkVertexWasRunning(this.LastUpdatetime);
            }
        }

        this.ManagerVertex.MarkVertexWasRunning(this.LastUpdatetime);
    }
    else if (this.jobSummary.Status == ClusterJobInformation.ClusterJobStatus.Failed)
    {
        // Job failed: the manager cannot still be started, and started vertices
        // are marked as running up to the manager's end time.
        if (this.ManagerVertex.State == ExecutedVertexInstance.VertexState.Started)
        {
            this.ManagerVertex.SetState(ExecutedVertexInstance.VertexState.Failed);
        }

        foreach (var vertex in this.Vertices)
        {
            if (vertex.State == ExecutedVertexInstance.VertexState.Started)
            {
                vertex.MarkVertexWasRunning(this.ManagerVertex.End);
            }
        }
    }

    return true;
}
/// <summary>
/// Create information about a job run on the cluster.
/// Any exception raised while constructing or filling the job is traced, reported through
/// the manager as an error, and turned into a null result.
/// </summary>
/// <param name="cf">Cluster configuration.</param>
/// <param name="summary">Summary description of the job.</param>
/// <param name="fill">If true, CollectEssentialInformation is called on the new job before it is returned; its boolean result is ignored.</param>
/// <param name="manager">Communication manager.</param>
/// <returns>The Dryad job description, or null.</returns>
public static DryadLinqJobInfo CreateDryadLinqJobInfo(ClusterConfiguration cf, DryadLinqJobSummary summary, bool fill, CommManager manager)
{
    DryadLinqJobInfo created;
    try
    {
        DryadLinqJobInfo candidate = new DryadLinqJobInfo(cf, summary);
        if (fill)
        {
            candidate.CollectEssentialInformation(manager);
        }

        // Only publish the job once filling has completed without throwing.
        created = candidate;
    }
    catch (Exception failure)
    {
        Trace.TraceInformation(failure.ToString());
        manager.Status("Could not collect job information for " + summary.Name + ": " + failure.Message, StatusKind.Error);
        created = null;
    }

    return created;
}
/// <summary>
/// Discover the vertex channels in a Scope-generated vcmdStart*xml file.
/// The work directory must contain exactly one file matching vcmdStart*.xml.
/// Input channels are read from the children of the root's "inputs" element ("path" and
/// "length" attributes); output channels from the children of "outputs" ("path" only, size -1).
/// A channel set is only computed when the corresponding property is still null.
/// </summary>
/// <param name="inputs">If true discover the inputs.</param>
/// <param name="outputs">If true discover the outputs.</param>
/// <param name="fast">If true do not discover the channel sizes (much faster). Currently not consulted by this method.</param>
/// <param name="manager">Communication manager.</param>
/// <returns>True if the discovery was successful.</returns>
// ReSharper disable UnusedParameter.Global
public bool DiscoverScopeChannels(bool inputs, bool outputs, bool fast, CommManager manager)
// ReSharper restore UnusedParameter.Global
{
    // find the xml file
    var files = this.WorkDirectory.GetFilesAndFolders("vcmdStart*.xml").ToList();
    if (files.Count != 1)
    {
        manager.Status("Cannot locate vcmdStart*.xml file", StatusKind.Error);
        return false;
    }

    ISharedStreamReader sr = files.First().GetStream();
    if (sr.Exception != null)
    {
        manager.Status("Error reading vcmdStart*.xml file: " + sr.Exception.Message, StatusKind.Error);
        return false;
    }

    // The reader must be closed even if reading or parsing throws (malformed XML,
    // missing element or attribute, cancellation); exceptions still propagate to the caller.
    try
    {
        // ReSharper disable PossibleNullReferenceException
        XDocument plan = XDocument.Parse(sr.ReadToEnd(manager.Token));

        if (inputs && this.InputChannels == null)
        {
            var channels = new Dictionary<int, ChannelEndpointDescription>();
            IEnumerable<XElement> inputsData = plan.Root.Element("inputs").Elements();
            int chno = 0;
            foreach (var e in inputsData)
            {
                string chpath = e.Attribute("path").Value;
                long size = long.Parse(e.Attribute("length").Value);
                ChannelEndpointDescription desc = new ChannelEndpointDescription(true, chno, chpath, size);
                channels.Add(chno, desc);
                chno++;
            }

            this.InputChannels = channels;
        }

        if (outputs && this.OutputChannels == null)
        {
            var channels = new Dictionary<int, ChannelEndpointDescription>();
            IEnumerable<XElement> outputsData = plan.Root.Element("outputs").Elements();
            int chno = 0;
            foreach (var e in outputsData)
            {
                string chpath = e.Attribute("path").Value;

                // NOTE(review): the first argument is 'true' exactly as for input channels;
                // if it is an "is input" flag this looks like a copy/paste slip - confirm
                // against the ChannelEndpointDescription constructor before changing.
                ChannelEndpointDescription desc = new ChannelEndpointDescription(true, chno, chpath, -1);
                channels.Add(chno, desc);
                chno++;
            }

            this.OutputChannels = channels;
        }

        // ReSharper restore PossibleNullReferenceException
        return true;
    }
    finally
    {
        sr.Close();
    }
}
/// <summary>
/// Factory: create the plan for a given job.
/// The query plan file is located through the job's own cluster configuration; when that
/// configuration is a CacheClusterConfiguration, the plan object itself is built with the
/// configuration returned by ActualConfig for the job.
/// </summary>
/// <param name="dryadLinqJobInfo">Job to create plan for.</param>
/// <param name="manager">Communication manager.</param>
/// <returns>The plan or null.</returns>
public static DryadJobStaticPlan CreatePlan(DryadLinqJobInfo dryadLinqJobInfo, CommManager manager)
{
    manager.Status("Trying to build static plan", StatusKind.LongOp);

    ClusterConfiguration config = dryadLinqJobInfo.ClusterConfiguration;
    IClusterResidentObject planFile = config.JobQueryPlan(dryadLinqJobInfo.Summary);

    // The plan file is looked up above with the original configuration; only the
    // configuration handed to the plan is swapped for a cached cluster.
    CacheClusterConfiguration cacheConfig = config as CacheClusterConfiguration;
    if (cacheConfig != null)
    {
        config = cacheConfig.ActualConfig(dryadLinqJobInfo.Summary);
    }

    if (planFile.Exception != null)
    {
        manager.Status("Exception while looking for plan " + planFile.Exception.Message, StatusKind.Error);
        return null;
    }

    DryadJobStaticPlan plan = new DryadLinqJobStaticPlan(config, planFile.GetStream());
    plan.ParseQueryPlan(manager);
    return plan;
}
/// <summary>
/// Parse the stdout.txt file from the job manager.
/// The stream reader is cached between calls, so a later call continues reading where the
/// previous one stopped; it is closed and dropped once the manager vertex has reached a
/// final state (Failed or Successful).
/// A logical record may span several physical lines: while ParseStdoutLineNew reports the
/// line as incomplete, the next physical line is appended (separated by "\n") and the
/// combined text is parsed again.
/// </summary>
/// <param name="file">File to parse.</param>
/// <param name="manager">Communication manager.</param>
/// <returns>True if the parsing succeeds; false if the stream cannot be opened or any exception (including a cancellation) is raised while reading.</returns>
private bool ParseStdout(IClusterResidentObject file, CommManager manager)
{
    int currentLine = 0;
    if (this.stdoutLinesParsed == 0)
    {
        // don't lose it if we are only parsing the tail.
        this.lastTimestampSeen = this.Summary.Date; // start from the job submission timestamp
    }

    // we are reusing the stream
    this.stdoutLinesParsed = 0;

    try
    {
        long filesize = file.Size;
        long readbytes = 0;
        string message = "Scanning JM stdout " + file;
        if (filesize >= 0)
        {
            message += string.Format("({0:N0} bytes)", filesize);
        }

        manager.Status(message, StatusKind.LongOp);

        if (this.cachedStdoutReader == null)
        {
            this.cachedStdoutReader = file.GetStream();
        }

        if (this.cachedStdoutReader.Exception != null)
        {
            manager.Status("Exception while opening stdout " + this.cachedStdoutReader.Exception.Message, StatusKind.Error);
            return false;
        }

        while (!this.cachedStdoutReader.EndOfStream)
        {
            string line = this.cachedStdoutReader.ReadLine();
            readbytes += line.Length;

            if (currentLine >= this.stdoutLinesParsed)
            {
                // Remember where this record begins. This has to be captured once, before the
                // continuation loop: capturing it inside the loop would move it forward with
                // every appended line and make the error below point at the wrong line.
                int startLine = currentLine;

                while (true)
                {
                    manager.Token.ThrowIfCancellationRequested();

                    // A line that fails to parse is reported and then treated as complete,
                    // so that a single bad line does not stop the scan.
                    bool completeLine = true;
                    try
                    {
                        completeLine = this.ParseStdoutLineNew(line);
                    }
                    catch (Exception ex)
                    {
                        manager.Status(string.Format("Line {0}: Exception {1}", currentLine, ex.Message), StatusKind.Error);
                        Console.WriteLine("Line {0}: Exception {1}", currentLine, ex);
                    }

                    if (completeLine)
                    {
                        break;
                    }

                    if (this.cachedStdoutReader.EndOfStream)
                    {
                        throw new Exception("File ended while scanning for closing quote started on line " + startLine);
                    }

                    // Incomplete record: glue the next physical line on and parse again.
                    string extraline = this.cachedStdoutReader.ReadLine();
                    line += "\n" + extraline;
                    currentLine++;
                }
            }

            currentLine++;
            if (currentLine % 100 == 0 && filesize > 0)
            {
                // Report progress every 100 lines, clamped to 100%.
                manager.Progress(Math.Min(100, (int)(100 * readbytes / filesize)));
            }
        }

        this.stdoutLinesParsed = currentLine;

        if (this.ManagerVertex != null)
        {
            if (this.ManagerVertex.End == DateTime.MinValue)
            {
                // approximation
                this.ManagerVertex.End = this.lastTimestampSeen;
            }

            // we are done with this stream
            if (this.ManagerVertex.State == ExecutedVertexInstance.VertexState.Failed ||
                this.ManagerVertex.State == ExecutedVertexInstance.VertexState.Successful)
            {
                this.cachedStdoutReader.Close();
                this.cachedStdoutReader = null; // will force reopening if refreshed
            }
        }

        return true;
    }
    catch (Exception e)
    {
        manager.Status("Exception while reading stdout " + e.Message, StatusKind.Error);
        Trace.TraceInformation(e.ToString());
        return false;
    }
}