/// <summary>
/// Build a failure diagnosis for a job whose detailed information is not available yet.
/// </summary>
/// <param name="config">Configuration of the cluster hosting the job.</param>
/// <param name="summary">Summary of the job to diagnose.</param>
/// <param name="manager">Communication manager; stored and also handed to FindJobInfo.</param>
protected FailureDiagnosis(
    ClusterConfiguration config,
    DryadLinqJobSummary summary,
    CommManager manager)
{
    // Record the three inputs first; FindJobInfo is invoked only after all of them are set.
    this.cluster = config;
    this.Summary = summary;
    this.Manager = manager;

    this.FindJobInfo(manager);
}
/// <summary>
/// Set up the diagnosis of a whole job.
/// </summary>
/// <param name="config">Configuration of the cluster hosting the job.</param>
/// <param name="summary">Summary of the job to diagnose.</param>
/// <param name="manager">Communication manager.</param>
protected JobFailureDiagnosis(
    ClusterConfiguration config,
    DryadLinqJobSummary summary,
    CommManager manager)
    : base(config, summary, manager)
{
    // The log is created even when no job information is available.
    this.diagnosisLog = new DiagnosisLog(this.Job, summary);

    if (this.Job == null)
    {
        // Without job information there is no manager vertex to remember.
        return;
    }

    this.jobManager = this.Job.ManagerVertex;
}
/// <summary>
/// Create a form that displays the result of a diagnosis.
/// </summary>
/// <param name="job">Job diagnosed; may be null, in which case the job button is disabled.</param>
/// <param name="summary">Job summary; supplies the window title and the job description line.</param>
/// <param name="log">Diagnosis log whose messages are shown, one per line.</param>
public DiagnosisResult(DryadLinqJobInfo job, DryadLinqJobSummary summary, DiagnosisLog log)
{
    this.InitializeComponent();

    this.job = job;
    if (this.job == null)
    {
        // There is no job behind this result, so the job button cannot be used.
        this.button_job.Enabled = false;
    }

    // ReSharper disable once DoNotCallOverridableMethodsInConstructor
    this.Text = "Diagnosis results for " + summary.Name + " " + summary.Date;

    // User-visible label (spelling corrected from "diangosed").
    this.textBox_job.Text = "Job being diagnosed: " + summary.AsIdentifyingString();

    foreach (string s in log.Message())
    {
        this.textBox_message.AppendText(s);
        this.textBox_message.AppendText(Environment.NewLine);
    }
}
/// <summary>
/// Create a form that displays the result of a diagnosis.
/// </summary>
/// <param name="job">Job diagnosed; may be null, in which case the job button is disabled.</param>
/// <param name="summary">Job summary; supplies the window title and the job description line.</param>
/// <param name="log">Diagnosis log whose messages are shown, one per line.</param>
public DiagnosisResult(DryadLinqJobInfo job, DryadLinqJobSummary summary, DiagnosisLog log)
{
    this.InitializeComponent();

    this.job = job;
    if (this.job == null)
    {
        // There is no job behind this result, so the job button cannot be used.
        this.button_job.Enabled = false;
    }

    // ReSharper disable once DoNotCallOverridableMethodsInConstructor
    this.Text = "Diagnosis results for " + summary.Name + " " + summary.Date;

    // User-visible label (spelling corrected from "diangosed").
    this.textBox_job.Text = "Job being diagnosed: " + summary.AsIdentifyingString();

    foreach (string s in log.Message())
    {
        this.textBox_message.AppendText(s);
        this.textBox_message.AppendText(Environment.NewLine);
    }
}
/// <summary>
/// Handles a double-click on a row of the cluster job table by opening the job in the browser.
/// </summary>
/// <param name="sender">Unused.</param>
/// <param name="e">Event data; its RowIndex selects the row that was clicked.</param>
private void filteredDataGridView_CellMouseDoubleClick(object sender, DataGridViewCellMouseEventArgs e)
{
    // A negative row index does not identify a data row; nothing to browse.
    if (e.RowIndex < 0)
    {
        return;
    }

    var clickedRow = this.filteredDataGridView.DataGridView.Rows[e.RowIndex];
    ClusterJobInformation clusterJob = (ClusterJobInformation)clickedRow.DataBoundItem;

    DryadLinqJobSummary discovered = clusterJob.DiscoverDryadLinqJob(this.clusterStatus, this.Status);
    if (discovered != null)
    {
        this.Status("Starting job browser...", StatusKind.LongOp);
        this.browseFromJobSummary(discovered);
        return;
    }

    this.Status("Error discovering job information on cluster.", StatusKind.Error);
}
/// <summary>
/// Open a job browser window for the job described by a summary.
/// </summary>
/// <param name="js">Summary of the job to browse; a null summary is ignored.</param>
private void browseFromJobSummary(DryadLinqJobSummary js)
{
    if (js == null)
    {
        return;
    }

    // TODO: this should run in the background
    System.Threading.CancellationToken token = new System.Threading.CancellationTokenSource().Token;
    CommManager manager = new CommManager(this.Status, delegate { }, token);

    // The job info is created without being filled (third argument is false).
    DryadLinqJobInfo job = DryadLinqJobInfo.CreateDryadLinqJobInfo(this.clusterStatus.Config, js, false, manager);
    if (job == null)
    {
        this.Status("Could not find information about job", StatusKind.Error);
        return;
    }

    JobBrowser browser = new JobBrowser(job);
    browser.Show();
    this.Status("OK", StatusKind.OK);
}
/// <summary>
/// Menu handler: ask the user for a job URL and open the corresponding job in the browser.
/// </summary>
/// <remarks>
/// Each failure path reports exactly one status message and then stops. In particular, the
/// message carrying the exception text is no longer replaced by the generic "could not locate"
/// message, and the browser is only started for a summary that was actually found.
/// </remarks>
/// <param name="sender">Unused.</param>
/// <param name="e">Unused.</param>
private void openFromURLToolStripMenuItem_Click(object sender, EventArgs e)
{
    var dialog = new CustomDialog("Job URL:");

    // Anything other than OK (Cancel included) abandons the operation.
    if (dialog.ShowDialog() != DialogResult.OK)
    {
        return;
    }

    string url = dialog.UserInput;
    if (string.IsNullOrEmpty(url))
    {
        return;
    }

    DryadLinqJobSummary summary;
    try
    {
        summary = this.clusterStatus.DiscoverDryadLinqJobFromURL(url, this.Status);
    }
    catch (Exception ex)
    {
        this.Status("Could not find job associated to url " + url + ": exception " + ex.Message, StatusKind.Error);

        // Stop here so that the exception details stay visible to the user.
        return;
    }

    if (summary == null)
    {
        this.Status("Could not locate job associated to url " + url, StatusKind.Error);
        return;
    }

    this.browseFromJobSummary(summary);
}
/// <summary>
/// Create a DFS file object that only records an exception.
/// NOTE(review): presumably used to represent a file that could not be accessed; confirm against callers.
/// </summary>
/// <param name="config">Cluster configuration, forwarded to the base constructor.</param>
/// <param name="job">Job summary, forwarded to the base constructor.</param>
/// <param name="ex">Exception stored in the Exception property.</param>
private DfsFile(
    ClusterConfiguration config,
    DryadLinqJobSummary job,
    Exception ex)
    : base(config, job)
{
    this.Exception = ex;
}
/// <summary>
/// Get all the cached files recorded for a given job.
/// </summary>
/// <param name="job">Job whose cached files are requested.</param>
/// <returns>
/// The set of cached file paths recorded for this job, or an empty list when nothing
/// has been recorded for it.
/// </returns>
public static IEnumerable<string> CachedJobFiles(DryadLinqJobSummary job)
{
    // Single lookup instead of ContainsKey followed by the indexer.
    HashSet<string> files;
    if (perJobFiles.TryGetValue(job, out files))
    {
        return files;
    }

    return new List<string>();
}
/// <summary>
/// Factory for the job failure diagnosis object matching the cluster type of a job.
/// As written here, no cluster type is handled: after resolving a cache configuration
/// to the actual one, the method always throws.
/// </summary>
/// <param name="config">Cluster where the job resides; a cache configuration is replaced by the actual configuration of the job.</param>
/// <param name="summary">Job to diagnose.</param>
/// <param name="manager">Communication manager.</param>
/// <returns>A subclass of JobFailureDiagnosis appropriate for the job; never reached in this code.</returns>
/// <exception cref="InvalidOperationException">The cluster type is not supported for diagnosis.</exception>
public static JobFailureDiagnosis CreateJobFailureDiagnosis(ClusterConfiguration config, DryadLinqJobSummary summary, CommManager manager)
{
    CacheClusterConfiguration cached = config as CacheClusterConfiguration;
    if (cached != null)
    {
        config = cached.ActualConfig(summary);
    }

    string message = "Configuration of type " + config.TypeOfCluster + " not supported for diagnosis";
    throw new InvalidOperationException(message);
}
/// <summary>
/// Request cancellation of a job. This implementation performs no cancellation.
/// </summary>
/// <param name="job">Job whose execution should be cancelled; not used here.</param>
/// <returns>Always false.</returns>
public override bool CancelJob(DryadLinqJobSummary job)
{
    // Nothing is attempted, so the cancellation is reported as unsuccessful.
    return false;
}
/// <summary>
/// Cancel the specified job by issuing a kill request with the configured account,
/// key and container.
/// </summary>
/// <param name="job">Job whose execution is cancelled; its ClusterJobId identifies the job to kill.</param>
/// <returns>
/// True once the kill request has been issued without an exception. (Previously this
/// returned false even after the request went through, contradicting the contract.)
/// </returns>
public override bool CancelJob(DryadLinqJobSummary job)
{
    Microsoft.Research.Peloponnese.Azure.Utils.KillJob(
        this.config.AccountName,
        this.config.AccountKey,
        this.config.Container,
        job.ClusterJobId);

    // Reaching this point means KillJob did not throw.
    return true;
}
/// <summary>
/// Factory for the job failure diagnosis object matching the cluster type of a job.
/// As written here, no cluster type is handled: after resolving a cache configuration
/// to the actual one, the method always throws.
/// </summary>
/// <param name="config">Cluster where the job resides; a cache configuration is replaced by the actual configuration of the job.</param>
/// <param name="summary">Job to diagnose.</param>
/// <param name="manager">Communication manager.</param>
/// <returns>A subclass of JobFailureDiagnosis appropriate for the job; never reached in this code.</returns>
/// <exception cref="InvalidOperationException">The cluster type is not supported for diagnosis.</exception>
public static JobFailureDiagnosis CreateJobFailureDiagnosis(ClusterConfiguration config, DryadLinqJobSummary summary, CommManager manager)
{
    CacheClusterConfiguration cached = config as CacheClusterConfiguration;
    if (cached != null)
    {
        config = cached.ActualConfig(summary);
    }

    string message = "Configuration of type " + config.TypeOfCluster + " not supported for diagnosis";
    throw new InvalidOperationException(message);
}
/// <summary>
/// Request cancellation of a job. This implementation does not support cancellation.
/// </summary>
/// <param name="job">Job whose execution should be cancelled; not used here.</param>
/// <returns>Never returns normally.</returns>
/// <exception cref="InvalidOperationException">Always thrown.</exception>
public override bool CancelJob(DryadLinqJobSummary job)
{
    throw new InvalidOperationException();
}
/// <summary>
/// Set up a cached cluster-resident object with no cache writer and no temporary file.
/// </summary>
/// <param name="config">Cluster where the object resides.</param>
/// <param name="job">Job that owns the object.</param>
protected CachedClusterResidentObject(
    ClusterConfiguration config,
    DryadLinqJobSummary job)
{
    // Caching state starts out empty.
    this.cacheWriter = null;
    this.tempFileName = null;

    // Ownership information.
    this.Job = job;
    this.Config = config;
}
/// <summary>
/// Reset the state of this job info and locate the job manager's standard output file.
/// </summary>
/// <param name="summary">Summary of the job described by this object.</param>
/// <exception cref="ClusterException">No standard output file was found in the job manager directory.</exception>
private void Initialize(DryadLinqJobSummary summary)
{
    // Reset counters and bookkeeping.
    this.UsefulCPUTime = TimeSpan.Zero;
    this.WastedCPUTime = TimeSpan.Zero;
    this.LastUpdatetime = DateTime.Now;
    this.ManagerStdoutIncomplete = true; // until we've seen the end
    this.ManagerVertex = null;
    this.jobSummary = summary;
    this.ErrorCode = "";
    this.AbortingMsg = "";
    this.cachedStages = new Dictionary<string, DryadLinqJobStage>();
    this.jobVertices = new JobVertices();

    bool terminated = ClusterJobInformation.JobIsFinished(summary.Status);
    IClusterResidentObject managerstdoutfile =
        this.ClusterConfiguration.ProcessStdoutFile(summary.ManagerProcessGuid, terminated, summary.Machine, summary);

    // With a cache configuration the stdout object is used directly.
    if (this.ClusterConfiguration is CacheClusterConfiguration)
    {
        this.stdoutpath = managerstdoutfile;
        return;
    }

    IClusterResidentObject jmdir =
        this.ClusterConfiguration.ProcessDirectory(summary.ManagerProcessGuid, terminated, summary.Machine, summary);
    if (this.stdoutpath != null)
    {
        // A stdout path is already known; keep it.
        return;
    }

    // Find the file by scanning the folder rather than asking for it by name;
    // this can give additional information about the file size on some platforms.
    string filename = managerstdoutfile.Name;
    foreach (IClusterResidentObject candidate in jmdir.GetFilesAndFolders(filename))
    {
        if (candidate.Exception != null)
        {
            throw candidate.Exception;
        }

        if (!candidate.RepresentsAFolder)
        {
            // there should be exactly one match
            this.stdoutpath = candidate;
            break;
        }
    }

    if (this.stdoutpath == null)
    {
        throw new ClusterException("Could not locate JM standard output file in folder " + jmdir);
    }
}
/// <summary>
/// Create an empty diagnosis log for a job.
/// </summary>
/// <param name="job">Job information stored in the Job property.</param>
/// <param name="summary">Job summary stored in the Summary property.</param>
public DiagnosisLog(
    DryadLinqJobInfo job,
    DryadLinqJobSummary summary)
{
    // The log starts with no messages.
    this.messages = new List<DiagnosisMessage>();

    this.Summary = summary;
    this.Job = job;
}
/// <summary>
/// Describe a file or folder at the given path, accessed through an Azure DFS client.
/// </summary>
/// <param name="config">Cluster configuration.</param>
/// <param name="job">Job accessing this file.</param>
/// <param name="client">Azure client.</param>
/// <param name="path">Path to the file.</param>
/// <param name="canCache">True if the file can be cached (it is immutable for sure).</param>
/// <param name="isFolder">If true this must be a folder.</param>
public AzureDfsFile(
    ClusterConfiguration config,
    DryadLinqJobSummary job,
    AzureDfsClient client,
    string path,
    bool canCache,
    bool isFolder)
    : base(config, job)
{
    this.client = client;
    this.path = path;
    this.ShouldCacheLocally = canCache;
    this.RepresentsAFolder = isFolder;

    // Size starts at -1.
    this.size = -1;

    // A local cache path is only assigned when a cache directory is configured.
    // NOTE(review): Path.Combine returns its second argument unchanged when that argument
    // is rooted; confirm that 'path' is always relative here.
    string cacheRoot = CachedClusterResidentObject.CacheDirectory;
    if (!string.IsNullOrEmpty(cacheRoot))
    {
        this.LocalCachePath = Path.Combine(cacheRoot, this.path);
    }
}
/// <summary>
/// Describe a cluster-resident object identified by a UNC pathname.
/// </summary>
/// <param name="config">Cluster where the file resides.</param>
/// <param name="job">Job who owns this file.</param>
/// <param name="path">Path to the cluster-resident object.</param>
/// <param name="shouldCache">If true the file should be cached.</param>
public UNCFile(
    ClusterConfiguration config,
    DryadLinqJobSummary job,
    UNCPathname path,
    bool shouldCache)
    : base(config, job)
{
    this.Pathname = path;
    this.Exception = null;
    this.ShouldCacheLocally = shouldCache;

    // The cache path is computed unconditionally; an earlier "not a folder" check is disabled.
    //if (! this.RepresentsAFolder)
    this.LocalCachePath = this.CachePath(this.Pathname);
}
/// <summary>
/// Refresh the status of a job summary by consulting the job's actual cluster.
/// </summary>
/// <remarks>
/// The configuration is expected to be a <c>CacheClusterConfiguration</c>. A direct cast is used
/// so that an unexpected configuration type fails with an InvalidCastException at the cast,
/// rather than with a NullReferenceException on the following member access.
/// </remarks>
/// <param name="job">Summary to refresh; its Status is overwritten.</param>
/// <param name="manager">Communication manager.</param>
public override void RefreshStatus(DryadLinqJobSummary job, CommManager manager)
{
    CacheClusterConfiguration cacheConfig = (CacheClusterConfiguration)this.Config;
    ClusterConfiguration actual = cacheConfig.ActualConfig(job);
    ClusterStatus actualStatus = actual.CreateClusterStatus();

    // Let the actual cluster refresh the summary first, then read the job back from it.
    actualStatus.RefreshStatus(job, manager);
    ClusterJobInformation info = actualStatus.DiscoverClusterJob(job, manager);
    if (info == null)
    {
        job.Status = ClusterJobInformation.ClusterJobStatus.Unknown;
        return;
    }

    job.Status = info.Status;
}
/// <summary>
/// Cache the files of every vertex in the list; executed on the background thread.
/// </summary>
/// <param name="config">Cluster configuration.</param>
/// <param name="summary">Job to cache.</param>
/// <param name="vertices">Vertices to cache.</param>
/// <param name="manager">Communication manager used to report status and progress.</param>
/// <returns>True: success.</returns>
private static bool CacheAllVertices(
    ClusterConfiguration config,
    DryadLinqJobSummary summary,
    List<ExecutedVertexInstance> vertices,
    CommManager manager)
{
    int done = 0;
    int todo = vertices.Count;
    int files = 0;

    manager.Status("Caching data for " + todo + " vertices", StatusKind.LongOp);
    foreach (ExecutedVertexInstance v in vertices)
    {
        files += CacheVertexInfo(config, summary, v);
        done++;

        // Scale before dividing: 'done / todo' is integer division and would report 0 until
        // the very last vertex. The final call below passes 100, so progress is taken to be
        // a percentage. The loop only runs when todo > 0, so the division is safe.
        manager.Progress(done * 100 / todo);
    }

    manager.Progress(100);
    manager.Status("Cached " + files + " files", StatusKind.OK);
    return true;
}
/// <summary>
/// Discover the cluster job for a summary by delegating to the status object of the
/// job's actual cluster.
/// </summary>
/// <remarks>
/// The configuration is expected to be a <c>CacheClusterConfiguration</c>. A direct cast is used
/// so that an unexpected configuration type fails with an InvalidCastException at the cast,
/// rather than with a NullReferenceException on the following member access.
/// </remarks>
/// <param name="job">Job to discover.</param>
/// <param name="manager">Communication manager.</param>
/// <returns>Whatever the actual cluster's DiscoverClusterJob returns for this job.</returns>
public override ClusterJobInformation DiscoverClusterJob(DryadLinqJobSummary job, CommManager manager)
{
    CacheClusterConfiguration cacheConfig = (CacheClusterConfiguration)this.Config;
    ClusterConfiguration actual = cacheConfig.ActualConfig(job);
    ClusterStatus actualStatus = actual.CreateClusterStatus();
    return actualStatus.DiscoverClusterJob(job, manager);
}
/// <summary>
/// Read every cacheable text file in the work directory of a vertex so that it gets cached.
/// </summary>
/// <param name="config">Cluster configuration.</param>
/// <param name="summary">Job summary.</param>
/// <param name="v">Vertex whose files should be cached.</param>
/// <returns>Number of files read; 0 when the work directory is missing or carries an exception.</returns>
private static int CacheVertexInfo(ClusterConfiguration config, DryadLinqJobSummary summary, ExecutedVertexInstance v)
{
    IClusterResidentObject workDir = config.ProcessWorkDirectory(v.ProcessIdentifier, v.VertexIsCompleted, v.Machine, summary);
    if (workDir == null || workDir.Exception != null)
    {
        return 0;
    }

    int count = 0;
    foreach (IClusterResidentObject entry in workDir.GetFilesAndFolders("*"))
    {
        // Only plain files with a text-like name that are marked cacheable are of interest.
        bool skip = entry.RepresentsAFolder
            || !Utilities.FileNameIndicatesTextFile(entry.Name)
            || !entry.ShouldCacheLocally;
        if (skip)
        {
            continue;
        }

        ISharedStreamReader reader = entry.GetStream();
        // ReSharper disable once UnusedVariable
        foreach (string line in reader.ReadAllLines())
        {
            // discard; causes caching
        }

        count++;
    }

    return count;
}
/// <summary>
/// Build the DryadLINQ job summary corresponding to a cluster job, using only the
/// fields of the cluster job and the cluster type of this configuration.
/// </summary>
/// <param name="clusterJob">Cluster Job.</param>
/// <param name="reporter">Delegate used to report errors; not used by this implementation.</param>
/// <returns>The job description.</returns>
public override DryadLinqJobSummary DiscoverDryadLinqJobFromClusterJob(ClusterJobInformation clusterJob, StatusReporter reporter)
{
    return new DryadLinqJobSummary(
        clusterJob.Cluster,
        this.Config.TypeOfCluster,
        "",                                 // virtual cluster
        "",                                 // machine
        clusterJob.ClusterJobID,            // jobId
        clusterJob.ClusterJobID,            // clusterJobId
        new DryadProcessIdentifier("jm"),   // jmProcessGuid
        clusterJob.Name,
        clusterJob.User,
        clusterJob.Date,
        clusterJob.Date + clusterJob.EstimatedRunningTime,
        clusterJob.Status);
}
/// <summary>
/// When a cache directory is configured, save the job summary as an XML file under
/// its "jobs" subfolder and record that file as belonging to the job.
/// </summary>
/// <param name="job">Job that is being diagnosed.</param>
void LogJobInCache(DryadLinqJobSummary job)
{
    string cacheRoot = CachedClusterResidentObject.CacheDirectory;
    if (string.IsNullOrEmpty(cacheRoot))
    {
        // Caching is not enabled.
        return;
    }

    // The file name is derived from the MD5 of the job's identifying string.
    string fileName = Utilities.MD5(job.AsIdentifyingString()) + ".xml";
    string path = Utilities.PathCombine(cacheRoot, "jobs", fileName);

    Utilities.EnsureDirectoryExistsForFile(path);
    Utilities.SaveAsXml(path, job);
    CachedClusterResidentObject.RecordCachedFile(job, path);
}
/// <summary>
/// Update the status stored in a job summary from the job information looked up by job ID.
/// </summary>
/// <param name="summary">Summary to refresh; its Status is overwritten.</param>
/// <param name="manager">Communication manager; not used by this implementation.</param>
public override void RefreshStatus(DryadLinqJobSummary summary, CommManager manager)
{
    ClusterJobInformation current = this.GetJobInfo(summary.JobID);
    if (current != null)
    {
        summary.Status = current.Status;
    }
    else
    {
        // No information available for this job.
        summary.Status = ClusterJobInformation.ClusterJobStatus.Unknown;
    }
}
/// <summary>
/// Create the job information object for a job whose job manager ran on the cluster.
/// </summary>
/// <param name="cf">Configuration of the cluster.</param>
/// <param name="summary">Summary of the job.</param>
protected DryadLinqJobInfo(ClusterConfiguration cf, DryadLinqJobSummary summary)
{
    this.JobInfoCannotBeCollected = true;
    this.ClusterConfiguration = cf;

    // For a cache configuration, remember the actual configuration of the job as the original one.
    CacheClusterConfiguration cacheConfig = cf as CacheClusterConfiguration;
    if (cacheConfig != null)
    {
        this.OriginalClusterConfiguration = cacheConfig.ActualConfig(summary);
    }
    else
    {
        this.OriginalClusterConfiguration = cf;
    }

    this.Initialize(summary);
}
/// <summary>
/// Create an empty diagnosis log for a job.
/// </summary>
/// <param name="job">Job information stored in the Job property.</param>
/// <param name="summary">Job summary stored in the Summary property.</param>
public DiagnosisLog(
    DryadLinqJobInfo job,
    DryadLinqJobSummary summary)
{
    // The log starts with no messages.
    this.messages = new List<DiagnosisMessage>();

    this.Summary = summary;
    this.Job = job;
}
/// <summary>
/// Describe a file or folder located at a path relative to a job's base folder URI.
/// </summary>
/// <param name="config">Cluster configuration.</param>
/// <param name="jobFolderUri">Uri to base folder.</param>
/// <param name="job">Job accessing this file.</param>
/// <param name="client">DFS client.</param>
/// <param name="path">Path to the file.</param>
/// <param name="canCache">True if the file can be cached (it is immutable for sure).</param>
/// <param name="isFolder">If true this must be a folder.</param>
public DfsFile(
    ClusterConfiguration config,
    Uri jobFolderUri,
    DryadLinqJobSummary job,
    HdfsClientBase client,
    string path,
    bool canCache,
    bool isFolder)
    : base(config, job)
{
    this.client = client;
    this.Exception = null;

    // The file URI is derived from the base folder URI and the path.
    this.baseUri = jobFolderUri;
    this.uri = UriFromPath(jobFolderUri, path);

    this.ShouldCacheLocally = canCache;
    this.RepresentsAFolder = isFolder;
    this.size = -1;

    Console.WriteLine("DfsFile Uri={0}", this.uri);

    // A local cache path is only assigned when a cache directory is configured.
    string cacheRoot = CachedClusterResidentObject.CacheDirectory;
    if (!string.IsNullOrEmpty(cacheRoot))
    {
        this.LocalCachePath = Path.Combine(cacheRoot, PathFromUri(this.baseUri, this.uri));
    }
}
/// <summary>
/// Set up the diagnosis of a whole job.
/// </summary>
/// <param name="config">Configuration of the cluster hosting the job.</param>
/// <param name="summary">Summary of the job to diagnose.</param>
/// <param name="manager">Communication manager.</param>
protected JobFailureDiagnosis(
    ClusterConfiguration config,
    DryadLinqJobSummary summary,
    CommManager manager)
    : base(config, summary, manager)
{
    // The log is created even when no job information is available.
    this.diagnosisLog = new DiagnosisLog(this.Job, summary);

    if (this.Job == null)
    {
        // Without job information there is no manager vertex to remember.
        return;
    }

    this.jobManager = this.Job.ManagerVertex;
}
/// <summary>
/// Update the status stored in a job summary by rediscovering its cluster job.
/// </summary>
/// <param name="summary">Summary to refresh; its Status is overwritten.</param>
/// <param name="manager">Communication manager.</param>
public virtual void RefreshStatus(DryadLinqJobSummary summary, CommManager manager)
{
    // Only this one job is looked up; refreshing the whole list is too expensive:
    // this.RecomputeClusterJobList(summary.VirtualCluster, manager);
    ClusterJobInformation current = this.DiscoverClusterJob(summary, manager);
    if (current != null)
    {
        summary.Status = current.Status;
    }
    else
    {
        // The job could not be discovered.
        summary.Status = ClusterJobInformation.ClusterJobStatus.Unknown;
    }
}
/// <summary>
/// Build the job information object for a job run on the cluster. Any exception raised while
/// doing so is traced, reported through the manager, and turned into a null result.
/// </summary>
/// <param name="cf">Cluster configuration.</param>
/// <param name="summary">Summary description of the job.</param>
/// <param name="fill">If true, collect the essential information right away; otherwise the caller has to fill in the information on the result later.</param>
/// <param name="manager">Communication manager.</param>
/// <returns>The Dryad job description, or null.</returns>
public static DryadLinqJobInfo CreateDryadLinqJobInfo(ClusterConfiguration cf, DryadLinqJobSummary summary, bool fill, CommManager manager)
{
    DryadLinqJobInfo result;
    try
    {
        result = new DryadLinqJobInfo(cf, summary);
        if (fill)
        {
            result.CollectEssentialInformation(manager);
        }
    }
    catch (Exception e)
    {
        Trace.TraceInformation(e.ToString());
        manager.Status("Could not collect job information for " + summary.Name + ": " + e.Message, StatusKind.Error);
        result = null;
    }

    return result;
}
/// <summary>
/// Cancel the specified job. Each concrete cluster status class supplies its own implementation.
/// </summary>
/// <param name="job">Job whose execution is cancelled.</param>
/// <returns>True if the cancellation succeeded.</returns>
public abstract bool CancelJob(DryadLinqJobSummary job);
// ReSharper restore UnusedParameter.Global
/// <summary>
/// Discover a cluster job given its id.
/// </summary>
/// <remarks>
/// The job list is computed on first use. The lookup uses TryGetValue so that an unknown
/// job id yields null, as documented, instead of an exception from the indexer.
/// </remarks>
/// <param name="job">Job to discover; its ClusterJobId is the lookup key.</param>
/// <param name="manager">Communication manager.</param>
/// <returns>The cluster job, or null if not found.</returns>
public virtual ClusterJobInformation DiscoverClusterJob(DryadLinqJobSummary job, CommManager manager)
{
    if (this.clusterJobs == null)
    {
        this.RecomputeClusterJobList(job.VirtualCluster, manager);
    }

    if (this.clusterJobs == null)
    {
        // The list could not be computed; report "not found".
        return null;
    }

    ClusterJobInformation info;
    return this.clusterJobs.TryGetValue(job.ClusterJobId, out info) ? info : null;
}
/// <summary>
/// Record that the job owns this cached file.
/// </summary>
/// <param name="job">Job that owns the file.</param>
/// <param name="path">Cached file belonging to this job; recording the same path again has no further effect because paths are kept in a set.</param>
public static void RecordCachedFile(DryadLinqJobSummary job, string path)
{
    // Single lookup instead of ContainsKey followed by the indexer.
    HashSet<string> files;
    if (!perJobFiles.TryGetValue(job, out files))
    {
        // First file recorded for this job.
        files = new HashSet<string>();
        perJobFiles.Add(job, files);
    }

    files.Add(path);
}
/// <summary>
/// Open a job browser window for the job described by a summary.
/// </summary>
/// <param name="js">Summary of the job to browse; a null summary is ignored.</param>
private void browseFromJobSummary(DryadLinqJobSummary js)
{
    if (js == null)
    {
        return;
    }

    // TODO: this should run in the background
    System.Threading.CancellationToken token = new System.Threading.CancellationTokenSource().Token;
    CommManager manager = new CommManager(this.Status, delegate { }, token);

    // The job info is created without being filled (third argument is false).
    DryadLinqJobInfo job = DryadLinqJobInfo.CreateDryadLinqJobInfo(this.clusterStatus.Config, js, false, manager);
    if (job == null)
    {
        this.Status("Could not find information about job", StatusKind.Error);
        return;
    }

    JobBrowser browser = new JobBrowser(job);
    browser.Show();
    this.Status("OK", StatusKind.OK);
}
/// <summary>
/// Cancel a job by issuing a kill request with the configured account, key and container.
/// </summary>
/// <param name="job">Job whose execution is cancelled; its ClusterJobId identifies the job to kill.</param>
/// <returns>True whenever the kill request returns without throwing.</returns>
public override bool CancelJob(DryadLinqJobSummary job)
{
    AzureUtils.KillJob(
        this.config.AccountName,
        this.config.AccountKey,
        this.config.Container,
        job.ClusterJobId);

    return true;
}