/// <summary>
/// Rebuild the cluster job list from the sub-directories of the configured jobs folder.
/// Nothing is listed (and no progress is reported) when the jobs folder does not exist.
/// </summary>
/// <param name="virtualCluster">Virtual cluster to use (defined only for some cluster types).</param>
/// <param name="manager">Communication manager.</param>
protected override void RecomputeClusterJobList(string virtualCluster, CommManager manager)
{
    this.clusterJobs = new Dictionary<string, ClusterJobInformation>();
    if (Directory.Exists(this.config.JobsFolder))
    {
        string[] jobFolders = Directory.GetDirectories(this.config.JobsFolder);
        for (int index = 0; index < jobFolders.Length; index++)
        {
            manager.Token.ThrowIfCancellationRequested();

            // The last path component of the folder doubles as the job identifier.
            string folder = jobFolders[index];
            string id = Path.GetFileName(folder);
            ClusterJobInformation jobInfo = this.GetJobInfo(folder, id);
            if (jobInfo != null)
            {
                // ReSharper disable once AssignNullToNotNullAttribute
                this.clusterJobs.Add(id, jobInfo);
            }

            // Progress reflects the folders completed before this one; 100% is sent after the loop.
            manager.Progress(index * 100 / jobFolders.Length);
        }

        manager.Progress(100);
    }
}
/// <summary>
/// Diagnose a list of jobs.
/// </summary>
/// <param name="jobs">Jobs to diagnose; null entries are ignored.</param>
/// <param name="config">Cluster configuration.</param>
/// <param name="manager">Communication manager.</param>
/// <returns>One diagnosis log per non-null job, in input order.</returns>
public static List<DiagnosisLog> DiagnoseJobs(IEnumerable<DryadLinqJobSummary> jobs, ClusterConfiguration config, CommManager manager)
{
    // Materialize once and drop null entries up front, so the progress denominator
    // matches the number of jobs that are actually diagnosed.
    DryadLinqJobSummary[] summaries = jobs.Where(candidate => candidate != null).ToArray();
    List<DiagnosisLog> result = new List<DiagnosisLog>(summaries.Length);
    int done = 0;

    foreach (DryadLinqJobSummary summary in summaries)
    {
        manager.Token.ThrowIfCancellationRequested();
        JobFailureDiagnosis diagnosis = JobFailureDiagnosis.CreateJobFailureDiagnosis(config, summary, manager);
        manager.Status("Diagnosing " + summary.ShortName(), StatusKind.LongOp);
        DiagnosisLog log = diagnosis.Diagnose();
        result.Add(log);

        done++;
        manager.Progress(done * 100 / summaries.Length);
    }

    // Also covers the empty-input case, where the loop reports nothing.
    manager.Progress(100);
    manager.Status("Diagnosis complete", StatusKind.OK);
    return result;
}
/// <summary>
/// Rebuild the clusterJobs field from the job summaries stored as "*.xml" files
/// in the "jobs" sub-folder of the cache directory.
/// </summary>
/// <param name="virtualCluster">Unused.</param>
/// <param name="manager">Communication manager.</param>
protected override void RecomputeClusterJobList(string virtualCluster, CommManager manager)
{
    this.clusterJobs = new Dictionary<string, ClusterJobInformation>();
    if (!string.IsNullOrEmpty(CachedClusterResidentObject.CacheDirectory))
    {
        string cachedJobsFolder = Path.Combine(CachedClusterResidentObject.CacheDirectory, "jobs");
        if (!Directory.Exists(cachedJobsFolder))
        {
            Directory.CreateDirectory(cachedJobsFolder);
        }

        foreach (string summaryFile in Directory.GetFiles(cachedJobsFolder, "*.xml"))
        {
            manager.Token.ThrowIfCancellationRequested();
            DryadLinqJobSummary summary = Utilities.LoadXml<DryadLinqJobSummary>(summaryFile);

            // Two jobs from different clusters may share an id, so the cluster is part of the key.
            string key = summary.Cluster + "-" + summary.ClusterJobId;
            ClusterJobInformation cached = new ClusterJobInformation(
                this.Config.Name,
                summary.Cluster,
                key,
                summary.Name,
                summary.User,
                summary.Date,
                summary.EndTime - summary.Date,
                summary.Status);
            cached.SetAssociatedSummary(summary);

            if (!this.clusterJobs.ContainsKey(key))
            {
                this.clusterJobs.Add(key, cached);
            }
            else
            {
                manager.Status("Duplicate job id, cannot insert in cache " + summary.AsIdentifyingString(), StatusKind.Error);
            }
        }

        manager.Progress(100);
    }
}
/// <summary>
/// Check whether the failed vertices were reading from some common set of machines.
/// This is incomplete: e.g., it does not work for tidyfs streams.
/// </summary>
/// <param name="manager">Communication manager.</param>
/// <returns>
/// Yes if some machine accounts for more than three failed file reads; No when there are
/// fewer than five failed vertices or fewer than five read failures; Dontknow otherwise.
/// </returns>
protected Decision LookForCorrelatedReadFailures(CommManager manager)
{
    // if we have more than this many failures we start to worry
    const int maxFailures = 5;
    // a machine is singled out once more than this many failed reads point at it
    const int perMachineLimit = 3;

    List<ExecutedVertexInstance> failedVertices = this.Job.Vertices
        .Where(v => v.State == ExecutedVertexInstance.VertexState.Failed && !v.IsManager)
        .ToList();
    int failedCount = failedVertices.Count;
    if (failedCount < maxFailures)
    {
        return Decision.No;
    }

    // Ask each failed vertex which channel (if any) it failed to read.
    List<ChannelEndpointDescription> unreadableChannels = new List<ChannelEndpointDescription>();
    for (int i = 0; i < failedCount; i++)
    {
        var unreadable = VertexFailureDiagnosis
            .CreateVertexFailureDiagnosis(this.Job, this.StaticPlan, failedVertices[i], manager)
            .ChannelReadFailure(manager);
        if (unreadable != null)
        {
            unreadableChannels.Add(unreadable);
        }

        manager.Progress((i + 1) * 100 / failedCount);
    }

    if (unreadableChannels.Count < maxFailures)
    {
        return Decision.No;
    }

    this.Log(DiagnosisMessage.Importance.Final, "There are " + unreadableChannels.Count + " read failures in the job", "");

    // Only channels with a "file" URI carry a machine name that can be correlated.
    List<ChannelEndpointDescription> fileChannels = unreadableChannels.Where(ced => ced.UriType == "file").ToList();
    if (fileChannels.Count == 0)
    {
        this.Log(DiagnosisMessage.Importance.Final, "All channels with failures are distributed files", "No further information is available");
        return Decision.Dontknow;
    }

    Decision verdict = Decision.Dontknow;
    var failuresPerMachine = fileChannels
        .Select(f => new UNCPathname(f.LocalPath).Machine)
        .GroupBy(machine => machine)
        .ToList();
    foreach (var machineGroup in failuresPerMachine)
    {
        int failuresOnMachine = machineGroup.Count();
        if (failuresOnMachine <= perMachineLimit)
        {
            continue;
        }

        this.Log(DiagnosisMessage.Importance.Final, "There are " + failuresOnMachine + " read failures reading from machine", machineGroup.Key);
        verdict = Decision.Yes;
    }

    return verdict;
}
/// <summary>
/// Rebuild the cluster job list from the entries the Azure client enumerates for the empty path.
/// </summary>
/// <param name="virtualCluster">Virtual cluster to use (defined only for some cluster types).</param>
/// <param name="manager">Communication manager.</param>
protected override void RecomputeClusterJobList(string virtualCluster, CommManager manager)
{
    this.clusterJobs = new Dictionary<string, ClusterJobInformation>();

    var jobFolders = this.config.AzureClient.EnumerateDirectory("").ToList();
    for (int index = 0; index < jobFolders.Count; index++)
    {
        manager.Token.ThrowIfCancellationRequested();

        var folder = jobFolders[index];
        ClusterJobInformation folderInfo = this.GetJobInfo(folder);
        if (folderInfo != null)
        {
            // ReSharper disable once AssignNullToNotNullAttribute
            this.clusterJobs.Add(folder, folderInfo);
        }

        // Reports the share of entries finished before this one; 100% follows the loop.
        manager.Progress(100 * index / jobFolders.Count);
    }

    manager.Progress(100);
}
/// <summary>
/// Rebuild the cluster job list from the sub-directories of the jobs folder on the DFS.
/// </summary>
/// <param name="virtualCluster">Virtual cluster to use (defined only for some cluster types).</param>
/// <param name="manager">Communication manager.</param>
// ReSharper disable once UnusedParameter.Global
protected override void RecomputeClusterJobList(string virtualCluster, CommManager manager)
{
    this.clusterJobs = new Dictionary<string, ClusterJobInformation>();

    var jobsRoot = DfsFile.UriFromPath(this.config.JobsFolderUri, "");
    var jobFolders = this.config.DfsClient.EnumerateSubdirectories(jobsRoot).ToList();
    for (int index = 0; index < jobFolders.Count; index++)
    {
        manager.Token.ThrowIfCancellationRequested();

        // GetJobInfo works on paths relative to the jobs folder, not on URIs.
        var folderPath = DfsFile.PathFromUri(this.config.JobsFolderUri, jobFolders[index]);
        ClusterJobInformation folderInfo = this.GetJobInfo(folderPath);
        if (folderInfo != null)
        {
            // ReSharper disable once AssignNullToNotNullAttribute
            this.clusterJobs.Add(folderInfo.ClusterJobID, folderInfo);
        }

        manager.Progress(100 * index / jobFolders.Count);
    }

    manager.Progress(100);
}
/// <summary>
/// Rebuild the cluster job list from the entries found at the root of the Azure DFS container.
/// </summary>
/// <param name="virtualCluster">Virtual cluster to use (defined only for some cluster types).</param>
/// <param name="manager">Communication manager.</param>
protected override void RecomputeClusterJobList(string virtualCluster, CommManager manager)
{
    this.clusterJobs = new Dictionary<string, ClusterJobInformation>();

    var rootUri = AzureDfsFile.UriFromPath(this.config, "");
    var jobUris = this.config.AzureClient.ExpandFileOrDirectory(rootUri).ToList();
    for (int index = 0; index < jobUris.Count; index++)
    {
        manager.Token.ThrowIfCancellationRequested();

        var jobUri = jobUris[index];
        string rootFolder = AzureDfsFile.PathFromUri(this.config, jobUri);
        ClusterJobInformation folderInfo = this.GetJobInfo(rootFolder);
        if (folderInfo != null)
        {
            // Jobs are keyed by the absolute path of their URI.
            // ReSharper disable once AssignNullToNotNullAttribute
            this.clusterJobs.Add(jobUri.AbsolutePath, folderInfo);
        }

        manager.Progress(100 * index / jobUris.Count);
    }

    manager.Progress(100);
}
/// <summary>
/// Recompute the cluster job list by expanding the root of the Azure DFS container
/// and querying the job information of every entry found there.
/// </summary>
/// <param name="virtualCluster">Virtual cluster to use (defined only for some cluster types).</param>
/// <param name="manager">Communication manager.</param>
protected override void RecomputeClusterJobList(string virtualCluster, CommManager manager)
{
    this.clusterJobs = new Dictionary<string, ClusterJobInformation>();

    var entries = this.config.AzureClient
        .ExpandFileOrDirectory(AzureDfsFile.UriFromPath(this.config, ""))
        .ToList();

    int processed = 0;
    foreach (var entry in entries)
    {
        manager.Token.ThrowIfCancellationRequested();

        ClusterJobInformation entryInfo = this.GetJobInfo(AzureDfsFile.PathFromUri(this.config, entry));
        if (entryInfo != null)
        {
            // ReSharper disable once AssignNullToNotNullAttribute
            this.clusterJobs.Add(entry.AbsolutePath, entryInfo);
        }

        // Progress is computed from the count before the current entry is included.
        manager.Progress(100 * processed / entries.Count);
        processed++;
    }

    manager.Progress(100);
}
/// <summary>
/// Parse the stdout.txt file from the job manager.
/// The stream reader is cached in <c>cachedStdoutReader</c> and reused by later calls
/// until the job manager vertex is seen to have failed or succeeded.
/// </summary>
/// <param name="file">File to parse.</param>
/// <param name="manager">Communication manager.</param>
/// <returns>
/// True if the parsing succeeds; false if the stream could not be opened or if any exception
/// (including a cancellation request) interrupted the scan.
/// </returns>
private bool ParseStdout(IClusterResidentObject file, CommManager manager)
{
    int currentLine = 0;
    if (this.stdoutLinesParsed == 0)
    {
        // don't lose it if we are only parsing the tail.
        this.lastTimestampSeen = this.Summary.Date; // start from the job submission timestamp
    }

    // we are reusing the stream
    this.stdoutLinesParsed = 0;

    try
    {
        long filesize = file.Size;
        long readbytes = 0;
        string message = "Scanning JM stdout " + file;
        if (filesize >= 0)
        {
            message += string.Format("({0:N0} bytes)", filesize);
        }

        manager.Status(message, StatusKind.LongOp);

        if (this.cachedStdoutReader == null)
        {
            this.cachedStdoutReader = file.GetStream();
        }

        if (this.cachedStdoutReader.Exception != null)
        {
            manager.Status("Exception while opening stdout " + this.cachedStdoutReader.Exception.Message, StatusKind.Error);
            return false;
        }

        while (!this.cachedStdoutReader.EndOfStream)
        {
            string line = this.cachedStdoutReader.ReadLine();
            readbytes += line.Length;
            if (currentLine >= this.stdoutLinesParsed)
            {
                // Captured once per logical record, so the error below names the line where
                // the record began even after several continuation lines have been appended.
                int startLine = currentLine;
                while (true)
                {
                    manager.Token.ThrowIfCancellationRequested();
                    bool completeLine = true;
                    try
                    {
                        completeLine = this.ParseStdoutLineNew(line);
                    }
                    catch (Exception ex)
                    {
                        // A line that fails to parse is reported and then treated as complete.
                        manager.Status(string.Format("Line {0}: Exception {1}", currentLine, ex.Message), StatusKind.Error);
                        Console.WriteLine("Line {0}: Exception {1}", currentLine, ex);
                    }

                    if (completeLine)
                    {
                        break;
                    }

                    if (this.cachedStdoutReader.EndOfStream)
                    {
                        throw new Exception("File ended while scanning for closing quote started on line " + startLine);
                    }

                    // The record continues on the next physical line: append it and parse again.
                    string extraline = this.cachedStdoutReader.ReadLine();
                    if (extraline != null)
                    {
                        // Count continuation lines too, so progress does not lag behind.
                        readbytes += extraline.Length;
                    }

                    line += "\n" + extraline;
                    currentLine++;
                }
            }

            currentLine++;
            if (currentLine % 100 == 0 && filesize > 0)
            {
                // readbytes counts characters without line terminators, so this is an estimate.
                manager.Progress(Math.Min(100, (int)(100 * readbytes / filesize)));
            }
        }

        this.stdoutLinesParsed = currentLine;
        if (this.ManagerVertex != null)
        {
            if (this.ManagerVertex.End == DateTime.MinValue)
            {
                // approximation
                this.ManagerVertex.End = this.lastTimestampSeen;
            }

            // we are done with this stream
            if (this.ManagerVertex.State == ExecutedVertexInstance.VertexState.Failed ||
                this.ManagerVertex.State == ExecutedVertexInstance.VertexState.Successful)
            {
                this.cachedStdoutReader.Close();
                this.cachedStdoutReader = null; // will force reopening if refreshed
            }
        }

        return true;
    }
    catch (Exception e)
    {
        manager.Status("Exception while reading stdout " + e.Message, StatusKind.Error);
        Trace.TraceInformation(e.ToString());
        return false;
    }
}
/// <summary>
/// Diagnose a list of jobs.
/// </summary>
/// <param name="jobs">Jobs to diagnose; null entries are ignored.</param>
/// <param name="config">Cluster configuration.</param>
/// <param name="manager">Communication manager.</param>
/// <returns>One diagnosis log per non-null job, in input order.</returns>
public static List<DiagnosisLog> DiagnoseJobs(IEnumerable<DryadLinqJobSummary> jobs, ClusterConfiguration config, CommManager manager)
{
    // Null entries are filtered out before counting; otherwise they would inflate the
    // denominator and the reported progress could never reach 100%.
    DryadLinqJobSummary[] jobsToDiagnose = jobs.Where(job => job != null).ToArray();
    int jobCount = jobsToDiagnose.Length;
    List<DiagnosisLog> result = new List<DiagnosisLog>(jobCount);

    for (int done = 0; done < jobCount; done++)
    {
        DryadLinqJobSummary summary = jobsToDiagnose[done];
        manager.Token.ThrowIfCancellationRequested();
        JobFailureDiagnosis diagnosis = JobFailureDiagnosis.CreateJobFailureDiagnosis(config, summary, manager);
        manager.Status("Diagnosing " + summary.ShortName(), StatusKind.LongOp);
        result.Add(diagnosis.Diagnose());
        manager.Progress((done + 1) * 100 / jobCount);
    }

    // Ensures completion is reported even when there was nothing to diagnose.
    manager.Progress(100);
    manager.Status("Diagnosis complete", StatusKind.OK);
    return result;
}
/// <summary>
/// Parse a part of the 'originalInfo.txt' file to discover a set of channel endpoints.
/// The first line read holds the channel count (optionally followed by a space and more text);
/// it is followed by one line per channel.
/// </summary>
/// <param name="sr">Stream reader which contains the channel information.</param>
/// <param name="uriprefix">If the channel is an output, prefix the path with this; this is null for inputs.</param>
/// <param name="skip">If true, do not return anything (still useful to advance the stream reader).</param>
/// <param name="fast">If true the channel sizes are not discovered; this is much faster, since no remote machines are queried for files.</param>
/// <param name="manager">Communication manager.</param>
/// <returns>The list of channels, or null on failure (or when <paramref name="skip"/> is true).</returns>
private Dictionary<int, ChannelEndpointDescription> DiscoverOriginalInfoChannels(ISharedStreamReader sr, string uriprefix, bool skip, bool fast, CommManager manager)
{
    bool isInput = uriprefix == null;

    string countline = sr.ReadLine();
    if (countline == null)
    {
        return null;
    }

    // Only the text before the first space is the count.
    int spaceIndex = countline.IndexOf(' ');
    if (spaceIndex > 0)
    {
        countline = countline.Substring(0, spaceIndex);
    }

    int channelCount;
    if (!int.TryParse(countline, out channelCount) || channelCount < 0)
    {
        // A negative count is malformed input; treat it as a parse failure instead of
        // letting the Dictionary constructor throw ArgumentOutOfRangeException.
        return null;
    }

    // No dictionary is needed when the caller only wants the reader advanced.
    Dictionary<int, ChannelEndpointDescription> channels =
        skip ? null : new Dictionary<int, ChannelEndpointDescription>(channelCount);

    for (int i = 0; i < channelCount; i++)
    {
        string channel = sr.ReadLine();
        if (channel == null)
        {
            // The stream ended before all announced channels were read.
            manager.Progress(100);
            return null;
        }

        if (!skip)
        {
            ChannelEndpointDescription desc = new ChannelEndpointDescription(isInput, i, channel, uriprefix, fast, manager.Status);
            channels.Add(i, desc);
            manager.Progress(i * 100 / channelCount);
        }
    }

    manager.Progress(100);
    return channels;
}
/// <summary>
/// Determine whether the failed vertices share a common set of machines they could not read from.
/// This is incomplete: e.g., it does not work for tidyfs streams.
/// </summary>
/// <param name="manager">Communication manager.</param>
/// <returns>
/// Yes if at least one machine is the source of more than three failed file reads; No if
/// there are too few failures to worry about; Dontknow in all remaining cases.
/// </returns>
protected Decision LookForCorrelatedReadFailures(CommManager manager)
{
    // if we have more than this many failures we start to worry
    const int maxFailures = 5;

    List<ExecutedVertexInstance> failed = new List<ExecutedVertexInstance>(
        this.Job.Vertices
            .Where(v => v.State == ExecutedVertexInstance.VertexState.Failed)
            .Where(v => !v.IsManager));
    if (failed.Count < maxFailures)
    {
        return Decision.No;
    }

    // Collect, for every failed vertex, the channel whose read failed (when there is one).
    List<ChannelEndpointDescription> badChannels = new List<ChannelEndpointDescription>();
    int examined = 0;
    foreach (ExecutedVertexInstance vertex in failed)
    {
        var diagnosis = VertexFailureDiagnosis.CreateVertexFailureDiagnosis(this.Job, this.StaticPlan, vertex, manager);
        var badChannel = diagnosis.ChannelReadFailure(manager);
        if (badChannel != null)
        {
            badChannels.Add(badChannel);
        }

        examined += 1;
        manager.Progress(examined * 100 / failed.Count);
    }

    if (badChannels.Count < maxFailures)
    {
        return Decision.No;
    }

    this.Log(DiagnosisMessage.Importance.Final, "There are " + badChannels.Count + " read failures in the job", "");

    // Machine names can only be extracted from channels whose URI type is "file".
    List<ChannelEndpointDescription> localFiles = badChannels.FindAll(ced => ced.UriType == "file");
    if (!localFiles.Any())
    {
        this.Log(DiagnosisMessage.Importance.Final, "All channels with failures are distributed files", "No further information is available");
        return Decision.Dontknow;
    }

    var byMachine = localFiles
        .Select(f => new UNCPathname(f.LocalPath).Machine)
        .GroupBy(name => name)
        .ToList();

    Decision outcome = Decision.Dontknow;
    foreach (var group in byMachine)
    {
        int count = group.Count();
        if (count > 3)
        {
            this.Log(DiagnosisMessage.Importance.Final, "There are " + count + " read failures reading from machine", group.Key);
            outcome = Decision.Yes;
        }
    }

    return outcome;
}
/// <summary>
/// Fill the job info by parsing the stdout.txt.
/// On the first call this also creates the vertex that represents the job manager.
/// </summary>
/// <param name="manager">Communication manager.</param>
/// <returns>True if the stdout was parsed; false if no stdout path is known or the parsing failed.</returns>
public bool CollectEssentialInformation(CommManager manager)
{
    this.RefreshJobStatus(manager);

    if (this.ManagerVertex == null)
    {
        // Synthesize the job manager vertex, with all its timestamps set to the job date.
        this.ManagerVertex = new ExecutedVertexInstance(this, -1, 0, "JobManager", "", this.Summary.Date);
        this.ManagerVertex.IsManager = true;
        this.ManagerVertex.SetStartInformation(this, this.Summary.Machine, this.Summary.Date, this.Summary.ManagerProcessGuid, "");
        this.ManagerVertex.StartCommandTime =
            this.ManagerVertex.CreationTime =
                this.ManagerVertex.VertexScheduleTime = this.Summary.Date;

        // Only a failed job changes the manager's initial state; every other status leaves it as started.
        ExecutedVertexInstance.VertexState initialState =
            this.Summary.Status == ClusterJobInformation.ClusterJobStatus.Failed
                ? ExecutedVertexInstance.VertexState.Failed
                : ExecutedVertexInstance.VertexState.Started;
        this.ManagerVertex.SetState(initialState);
        this.jobVertices.Add(this.ManagerVertex);
    }

    if (this.stdoutpath == null)
    {
        return false;
    }

    bool parsed = this.ParseStdout(this.stdoutpath, manager);
    manager.Progress(100);
    if (!parsed)
    {
        return false;
    }

    this.JobInfoCannotBeCollected = false;
    manager.Status("Stdout parsed", StatusKind.OK);
    this.LastUpdatetime = DateTime.Now;

    if (this.Summary.Status == ClusterJobInformation.ClusterJobStatus.Running)
    {
        // Job still running: vertices that have started are marked as running up to now.
        foreach (var vertex in this.Vertices)
        {
            if (vertex.State == ExecutedVertexInstance.VertexState.Started)
            {
                vertex.MarkVertexWasRunning(this.LastUpdatetime);
            }
        }

        this.ManagerVertex.MarkVertexWasRunning(this.LastUpdatetime);
    }
    else if (this.jobSummary.Status == ClusterJobInformation.ClusterJobStatus.Failed)
    {
        if (this.ManagerVertex.State == ExecutedVertexInstance.VertexState.Started)
        {
            this.ManagerVertex.SetState(ExecutedVertexInstance.VertexState.Failed);
        }

        // Job failed: vertices still in the started state are marked as running up to the manager's end time.
        foreach (var vertex in this.Vertices)
        {
            if (vertex.State == ExecutedVertexInstance.VertexState.Started)
            {
                vertex.MarkVertexWasRunning(this.ManagerVertex.End);
            }
        }
    }

    return true;
}
/// <summary>
/// Recompute the cluster job list by enumerating the sub-directories of the DFS jobs folder
/// and querying the job information of each one.
/// </summary>
/// <param name="virtualCluster">Virtual cluster to use (defined only for some cluster types).</param>
/// <param name="manager">Communication manager.</param>
// ReSharper disable once UnusedParameter.Global
protected override void RecomputeClusterJobList(string virtualCluster, CommManager manager)
{
    this.clusterJobs = new Dictionary<string, ClusterJobInformation>();

    var subdirectories = this.config.DfsClient
        .EnumerateSubdirectories(DfsFile.UriFromPath(this.config.JobsFolderUri, ""))
        .ToList();

    int processed = 0;
    foreach (var subdirectory in subdirectories)
    {
        manager.Token.ThrowIfCancellationRequested();

        var relativePath = DfsFile.PathFromUri(this.config.JobsFolderUri, subdirectory);
        ClusterJobInformation found = this.GetJobInfo(relativePath);
        if (found != null)
        {
            // The id reported by the job information, not the directory name, is the key.
            // ReSharper disable once AssignNullToNotNullAttribute
            this.clusterJobs.Add(found.ClusterJobID, found);
        }

        manager.Progress(100 * processed / subdirectories.Count);
        processed++;
    }

    manager.Progress(100);
}
/// <summary>
/// Recompute the cluster job list by asking the Azure client to enumerate the empty path
/// and querying the job information of every entry returned.
/// </summary>
/// <param name="virtualCluster">Virtual cluster to use (defined only for some cluster types).</param>
/// <param name="manager">Communication manager.</param>
protected override void RecomputeClusterJobList(string virtualCluster, CommManager manager)
{
    this.clusterJobs = new Dictionary<string, ClusterJobInformation>();

    var entries = this.config.AzureClient.EnumerateDirectory("").ToList();
    int total = entries.Count;
    int processed = 0;
    foreach (var entry in entries)
    {
        manager.Token.ThrowIfCancellationRequested();

        ClusterJobInformation entryInfo = this.GetJobInfo(entry);
        if (entryInfo != null)
        {
            // ReSharper disable once AssignNullToNotNullAttribute
            this.clusterJobs.Add(entry, entryInfo);
        }

        // Uses the count from before the current entry, so the loop itself never reports 100%.
        manager.Progress(100 * processed / total);
        processed++;
    }

    manager.Progress(100);
}
/// <summary>
/// Recompute the cluster job list by scanning the configured jobs folder:
/// each sub-directory is treated as one job whose id is the directory name.
/// </summary>
/// <param name="virtualCluster">Virtual cluster to use (defined only for some cluster types).</param>
/// <param name="manager">Communication manager.</param>
protected override void RecomputeClusterJobList(string virtualCluster, CommManager manager)
{
    this.clusterJobs = new Dictionary<string, ClusterJobInformation>();

    bool folderExists = Directory.Exists(this.config.JobsFolder);
    if (!folderExists)
    {
        // Without a jobs folder the list simply stays empty.
        return;
    }

    string[] directories = Directory.GetDirectories(this.config.JobsFolder);
    int total = directories.Length;
    int processed = 0;
    foreach (string directory in directories)
    {
        manager.Token.ThrowIfCancellationRequested();

        string directoryName = Path.GetFileName(directory);
        ClusterJobInformation found = this.GetJobInfo(directory, directoryName);
        if (found != null)
        {
            // ReSharper disable once AssignNullToNotNullAttribute
            this.clusterJobs.Add(directoryName, found);
        }

        manager.Progress(processed * 100 / total);
        processed++;
    }

    manager.Progress(100);
}
/// <summary>
/// Recompute the list of jobs and store them in the clusterJobs field, reading one job summary
/// from each "*.xml" file in the "jobs" folder under the cache directory.
/// </summary>
/// <param name="virtualCluster">Unused.</param>
/// <param name="manager">Communication manager.</param>
protected override void RecomputeClusterJobList(string virtualCluster, CommManager manager)
{
    this.clusterJobs = new Dictionary<string, ClusterJobInformation>();

    if (string.IsNullOrEmpty(CachedClusterResidentObject.CacheDirectory))
    {
        // No cache configured: leave the list empty.
        return;
    }

    string folder = Path.Combine(CachedClusterResidentObject.CacheDirectory, "jobs");
    if (!Directory.Exists(folder))
    {
        Directory.CreateDirectory(folder);
    }

    string[] xmlFiles = Directory.GetFiles(folder, "*.xml");
    for (int index = 0; index < xmlFiles.Length; index++)
    {
        manager.Token.ThrowIfCancellationRequested();

        DryadLinqJobSummary loaded = Utilities.LoadXml<DryadLinqJobSummary>(xmlFiles[index]);

        // there may be two jobs with same id from different clusters
        string uniqueId = loaded.Cluster + "-" + loaded.ClusterJobId;
        ClusterJobInformation entry = new ClusterJobInformation(
            this.Config.Name, loaded.Cluster, uniqueId, loaded.Name, loaded.User,
            loaded.Date, loaded.EndTime - loaded.Date, loaded.Status);
        entry.SetAssociatedSummary(loaded);

        bool alreadyListed = this.clusterJobs.ContainsKey(uniqueId);
        if (alreadyListed)
        {
            manager.Status("Duplicate job id, cannot insert in cache " + loaded.AsIdentifyingString(), StatusKind.Error);
        }
        else
        {
            this.clusterJobs.Add(uniqueId, entry);
        }
    }

    manager.Progress(100);
}
/// <summary>
/// Cache the vertices in the list; executed on the background thread.
/// </summary>
/// <param name="config">Cluster configuration.</param>
/// <param name="summary">Job to cache.</param>
/// <param name="vertices">Vertices to cache.</param>
/// <param name="manager">Communication manager.</param>
/// <returns>True: success.</returns>
private static bool CacheAllVertices(
    ClusterConfiguration config,
    DryadLinqJobSummary summary,
    List<ExecutedVertexInstance> vertices,
    CommManager manager)
{
    int done = 0;
    int todo = vertices.Count;
    int files = 0;

    manager.Status("Caching data for " + todo + " vertices", StatusKind.LongOp);
    foreach (ExecutedVertexInstance v in vertices)
    {
        files += CacheVertexInfo(config, summary, v);
        done++;

        // Progress is a percentage; the plain integer quotient done / todo is 0 until the
        // very last vertex. todo cannot be 0 here because the loop body ran at least once.
        manager.Progress(done * 100 / todo);
    }

    manager.Progress(100);
    manager.Status("Cached " + files + " files", StatusKind.OK);
    return true;
}