/// <summary>
/// Associate a vertex with a machine, registering the machine the first time its name is seen.
/// </summary>
/// <param name="machine">Name of the machine.</param>
/// <param name="vertex">Vertex to attach to that machine's information record.</param>
internal void Add(string machine, ExecutedVertexInstance vertex)
{
    int slot;
    if (!this.machines.TryGetValue(machine, out slot))
    {
        // Unknown machine: its index is the value of MachineCount before the new record is appended.
        slot = this.MachineCount;
        this.machines.Add(machine, slot);
        this.machineInfo.Add(new MachineInformation(machine));
    }

    this.machineInfo[slot].AddVertex(vertex);
}
/// <summary>
/// Create a class to diagnose the problems of a job.
/// </summary>
/// <param name="config">Cluster where job resides.</param>
/// <param name="summary">Job summary.</param>
/// <param name="manager">Communication manager.</param>
protected JobFailureDiagnosis(ClusterConfiguration config, DryadLinqJobSummary summary, CommManager manager)
    : base(config, summary, manager)
{
    this.diagnosisLog = new DiagnosisLog(this.Job, summary);

    // Without a job there is no manager vertex to remember; jobManager keeps its default value.
    if (this.Job == null)
    {
        return;
    }

    this.jobManager = this.Job.ManagerVertex;
}
/// <summary>
/// Create a class to diagnose the problems of a job whose information is already available.
/// </summary>
/// <param name="job">Job to diagnose.</param>
/// <param name="plan">Plan of the diagnosed job.</param>
/// <param name="manager">Communication manager.</param>
protected JobFailureDiagnosis(DryadLinqJobInfo job, DryadJobStaticPlan plan, CommManager manager)
    : base(job, plan, manager)
{
    var jobSummary = job.Summary;
    this.diagnosisLog = new DiagnosisLog(job, jobSummary);

    // The manager vertex is read through the Job property rather than the parameter.
    this.jobManager = this.Job.ManagerVertex;
}
/// <summary>
/// Create a VertexFailureDiagnosis of the appropriate type.
/// </summary>
/// <param name="job">Job containing the vertex.</param>
/// <param name="plan">Plan of the executed job.</param>
/// <param name="vertex">Vertex to diagnose.</param>
/// <param name="manager">Communication manager.</param>
/// <returns>A subclass of VertexFailureDiagnosis.</returns>
/// <exception cref="InvalidOperationException">
/// The cluster type is not handled. As written, no cluster type is handled here, so every call ends in this exception.
/// </exception>
public static VertexFailureDiagnosis CreateVertexFailureDiagnosis(DryadLinqJobInfo job, DryadJobStaticPlan plan, ExecutedVertexInstance vertex, CommManager manager)
{
    ClusterConfiguration config = job.ClusterConfiguration;

    // A cache configuration stands in for another one; report the configuration it resolves to.
    CacheClusterConfiguration cacheConfig = config as CacheClusterConfiguration;
    if (cacheConfig != null)
    {
        config = cacheConfig.ActualConfig(job.Summary);
    }

    throw new InvalidOperationException("Config of type " + config.TypeOfCluster + " not handled");
}
/// <summary>
/// Create a class to diagnose the problems of a vertex.
/// </summary>
/// <param name="job">Job containing the vertex; forwarded to the base constructor and stored in Job.</param>
/// <param name="plan">Plan of the executed job; forwarded to the base constructor.</param>
/// <param name="vertex">Vertex to diagnose; stored in Vertex.</param>
/// <param name="manager">Communication manager; forwarded to the base constructor.</param>
protected VertexFailureDiagnosis(DryadLinqJobInfo job, DryadJobStaticPlan plan, ExecutedVertexInstance vertex, CommManager manager) : base(job, plan, manager)
{
    this.Job = job;
    this.Vertex = vertex;
    // ReSharper disable once DoNotCallOverridableMethodsInConstructor
    // NOTE(review): presumably the name of the file holding the vertex stack trace - confirm against its readers.
    this.stackTraceFile = "dryadLinqStackTrace.txt";
}
/// <summary>
/// Color representing the vertex state.
/// </summary>
/// <param name="state">Vertex state to translate.</param>
/// <returns>The color associated with the state.</returns>
/// <exception cref="DryadException">The state is not one of the values handled below.</exception>
private static Color VertexStateColor(ExecutedVertexInstance.VertexState state)
{
    Color shade;
    switch (state)
    {
        case ExecutedVertexInstance.VertexState.Successful:
            shade = Color.LightGreen;
            break;
        case ExecutedVertexInstance.VertexState.Failed:
            shade = Color.Tomato;
            break;
        case ExecutedVertexInstance.VertexState.Started:
            shade = Color.Cyan;
            break;
        case ExecutedVertexInstance.VertexState.Cancelled:
            shade = Color.Yellow;
            break;
        case ExecutedVertexInstance.VertexState.Invalidated:
            shade = Color.YellowGreen;
            break;
        case ExecutedVertexInstance.VertexState.Revoked:
            shade = Color.Brown;
            break;
        // States without a dedicated color all share white.
        case ExecutedVertexInstance.VertexState.Created:
        case ExecutedVertexInstance.VertexState.Abandoned:
        case ExecutedVertexInstance.VertexState.Unknown:
            shade = Color.White;
            break;
        default:
            throw new DryadException("Unexpected vertex state " + state);
    }

    return shade;
}
/// <summary>
/// Cache the interesting files of this vertex.
/// </summary>
/// <param name="config">Cluster configuration.</param>
/// <param name="summary">Job summary.</param>
/// <param name="v">Vertex whose files should be cached.</param>
/// <returns>Number of files read; 0 when the work directory is missing or reports an exception.</returns>
private static int CacheVertexInfo(ClusterConfiguration config, DryadLinqJobSummary summary, ExecutedVertexInstance v)
{
    IClusterResidentObject workDir = config.ProcessWorkDirectory(v.ProcessIdentifier, v.VertexIsCompleted, v.Machine, summary);
    if (workDir == null || workDir.Exception != null)
    {
        return 0;
    }

    int cachedFiles = 0;
    foreach (IClusterResidentObject entry in workDir.GetFilesAndFolders("*"))
    {
        // Only text files that are flagged for local caching are of interest; folders are skipped.
        bool skip = entry.RepresentsAFolder
                    || !Utilities.FileNameIndicatesTextFile(entry.Name)
                    || !entry.ShouldCacheLocally;
        if (skip)
        {
            continue;
        }

        // NOTE(review): the reader is not closed explicitly here, unlike in the log scanners;
        // confirm whether ReadAllLines releases it once enumeration completes.
        ISharedStreamReader contents = entry.GetStream();
        // ReSharper disable once UnusedVariable
        foreach (string line in contents.ReadAllLines())
        {
            // The lines themselves are discarded; reading them is what causes the caching.
        }

        cachedFiles++;
    }

    return cachedFiles;
}
/// <summary>
/// Scan the JM stdout looking for the specified vertex; display the lines in the file view.
/// Run in the background.
/// </summary>
/// <param name="vertex">Vertex to look for.</param>
/// <param name="stdout">Job standard output stream.</param>
/// <param name="logViewer">Viewer to use to display the logs.</param>
/// <returns>
/// false when there is nothing to scan (no vertex, the vertex is the job manager, empty stdout)
/// or the stream cannot be opened; true once the scan has run, whether or not any line matched.
/// </returns>
private static bool ScanJMStdout(ExecutedVertexInstance vertex, IClusterResidentObject stdout, LogViewer logViewer)
{
    if (vertex == null || vertex.IsManager)
        return false;

    string vertexId = vertex.UniqueID;

    // Two textual forms of "number.version"; the unescaped dots are deliberate (the dot could match a space too).
    string name = string.Format(@"\s{0}.{1}\s", vertex.Number, vertex.Version);
    string regexstring = string.Format(@"vertex\s{0}\s(.*)\sv.{1}\s", vertex.Number, vertex.Version) + "|" + name;

    // The id is added only when there is one: an empty alternative would make the regex match every line.
    // It is a literal identifier, so it is escaped.
    if (!string.IsNullOrEmpty(vertexId))
        regexstring += "|" + Regex.Escape(vertexId);

    Regex regex = new Regex(regexstring, RegexOptions.Compiled);
    Trace.TraceInformation(regex.ToString());

    long length = stdout.Size;
    logViewer.Status("Looking for " + vertex.Name, StatusKind.LongOp);
    if (length == 0)
    {
        logViewer.Status("JM stdout is empty.", StatusKind.Error);
        logViewer.Done();
        return false;
    }

    ISharedStreamReader sr = stdout.GetStream();
    if (sr.Exception != null)
    {
        logViewer.Status("Error opening JM stdout: " + sr.Exception.Message, StatusKind.Error);
        logViewer.Done();
        return false;
    }

    try
    {
        try
        {
            long read = 0;
            long lines = 0;
            while (!sr.EndOfStream)
            {
                string line = sr.ReadLine();
                if (line == null)
                    break;

                read += line.Length;
                if (regex.IsMatch(line))
                    logViewer.AddLine(stdout.ToString(), lines, line);
                lines++;

                // Cancellation and progress are only checked every 100 lines.
                if (lines % 100 == 0 && length > 0)
                {
                    if (logViewer.Cancelled)
                        break;
                    // the length can grow while the file is being read
                    logViewer.UpdateProgress(Math.Min((int)(read * 100 / length), 100));
                }
            }
        }
        finally
        {
            // Release the reader even if matching or the viewer throws.
            sr.Close();
        }
    }
    finally
    {
        logViewer.Done();
    }

    return true;
}
/// <summary>
/// Scan the JM logs looking for the specified vertex; display the lines in the file view.
/// Run in the background.
/// </summary>
/// <param name="vertex">Vertex to look for.</param>
/// <param name="logViewer">Viewer used to display the logs.</param>
/// <returns>
/// false when there is nothing to scan (no vertex, no job manager, the vertex is the job manager,
/// the log directory reports an exception, or no log files exist); true once the scan has run,
/// whether or not any line matched.
/// </returns>
private bool ScanJMLogs(ExecutedVertexInstance vertex, LogViewer logViewer)
{
    if (vertex == null || this.Job.ManagerVertex == null)
        return false;
    if (vertex == this.Job.ManagerVertex)
        return false;

    // The id is searched for literally, so it is escaped before being used as a pattern.
    string vertexId = vertex.UniqueID;
    Regex regex = new Regex(Regex.Escape(vertexId), RegexOptions.Compiled);
    Trace.TraceInformation(regex.ToString());

    IClusterResidentObject logdir = this.Job.ManagerVertex.LogDirectory;
    if (logdir.Exception != null)
    {
        this.Status(logdir.ToString(), StatusKind.Error);
        return false;
    }

    List<IClusterResidentObject> files = logdir.GetFilesAndFolders(this.Job.ManagerVertex.LogFilesPattern).ToList();
    if (files.Count == 0)
    {
        this.Status("No log files found", StatusKind.Error);
        return false;
    }

    try
    {
        // Total amount of work; files reporting a negative size are left out of the estimate.
        long totalWork = 0;
        foreach (var file in files)
        {
            long size = file.Size;
            if (size > 0)
                totalWork += size;
        }

        long done = 0;
        foreach (var file in files)
        {
            ISharedStreamReader sr = file.GetStream();
            if (sr.Exception != null)
            {
                logViewer.Status("Error opening file: " + sr.Exception.Message, StatusKind.Error);
                continue;
            }

            logViewer.Status("Scanning " + file, StatusKind.LongOp);
            try
            {
                long lineno = 0;
                while (!sr.EndOfStream)
                {
                    if (logViewer.Cancelled)
                        break;

                    string line = sr.ReadLine();
                    if (line == null)
                        break;

                    done += line.Length;
                    if (regex.IsMatch(line))
                    {
                        logViewer.AddLine(file.Name, lineno, line);
                    }
                    lineno++;

                    // When no file reported a positive size there is no meaningful percentage
                    // (and dividing would throw); the value is clamped because files can grow while read.
                    if (totalWork > 0)
                        logViewer.UpdateProgress((int)Math.Min(100 * done / totalWork, 100));
                }
            }
            finally
            {
                // Release the reader even if matching or the viewer throws.
                sr.Close();
            }

            if (logViewer.Cancelled)
                break;
        }
    }
    finally
    {
        logViewer.Done();
    }

    return true;
}
/// <summary>
/// Display information about a selected vertex in the vertex view panes.
/// </summary>
/// <param name="executedVertexInstance">Vertex to display; null clears the vertex panes.</param>
private void DisplayVertex(ExecutedVertexInstance executedVertexInstance)
{
    this.currentVertex = executedVertexInstance;
    this.vertexHeaderData.Clear();
    this.label_Vertex.BackColor = this.defaultBackColor;

    if (executedVertexInstance == null)
    {
        this.vertexToolStripMenuItem.Enabled = false;
        this.textBox_find.Enabled = false;
        this.label_Vertex.Text = "Vertex";
        this.comboBox_vertexInformation.Enabled = false;
    }
    else
    {
        // "XML Plan" and "Job log" are offered only for the job manager, "stdout" only for other vertices.
        var choices = this.comboBox_vertexInformation.Items;
        bool isManager = this.currentVertex.IsManager;
        foreach (string choice in new[] { "XML Plan", "Job log", "stdout" })
        {
            bool wanted = (choice == "stdout") != isManager;
            bool present = choices.Contains(choice);
            if (wanted && !present)
            {
                choices.Add(choice);
            }
            else if (!wanted && present)
            {
                choices.Remove(choice);
            }
        }

        this.Status("Loading vertex data...", StatusKind.LongOp);
        this.textBox_find.Enabled = true;
        this.vertexPropertyEnumerator.Data = executedVertexInstance;
        this.vertexPropertyEnumerator.PopulateWithProperties(this.vertexHeaderData);
        this.label_Vertex.Text = "Vertex: " + executedVertexInstance.Name;
        this.comboBox_vertexInformation.Enabled = true;
        this.label_Vertex.BackColor = VertexStateColor(executedVertexInstance.State);
        this.vertexToolStripMenuItem.Enabled = false; // was: true
    }

    this.vertexHeaderData.ResetBindings();
    this.ChooseVertexInformation();
    this.dataGridView_vertexHeader.ClearSelection();
    this.Status("OK", StatusKind.OK);
}
/// <summary>
/// Move the selection to the specified vertex.
/// </summary>
/// <param name="vertex">Vertex to select; null resets the stage/table view instead.</param>
private void SelectVertex(ExecutedVertexInstance vertex)
{
    if (vertex == null)
    {
        this.SetNoStageOrTable("", false);
        return;
    }

    // Switch to the vertex's stage, unless it is already shown or the "All vertices" view is active.
    DryadLinqJobStage targetStage = this.Job.GetStage(vertex.StageName);
    bool mustSwitchStage = targetStage != null
                           && this.currentStage != targetStage
                           && this.currentStage.Name != "All vertices";
    if (mustSwitchStage)
    {
        this.SetStage(targetStage);
    }

    // let us move selection to this vertex
    var rows = this.dataGridView_stageContents.Rows;
    for (int index = 0; index < rows.Count; index++)
    {
        DataGridViewRow row = rows[index];
        if (row.DataBoundItem != vertex)
        {
            continue;
        }

        row.Selected = true;
        this.dataGridView_stageContents.FirstDisplayedScrollingRowIndex = index;
        break;
    }
}
/// <summary>
/// Parse one line from the JM standard output.
/// </summary>
/// <remarks>
/// The line is dispatched on the first marker text it contains; the order of the tests matters because
/// some markers are substrings of others ("Timing Information Graph Start Time" is tested before
/// "Timing Information", and the generic "Vertex" marker comes last). Lines that contain a marker but
/// do not match the corresponding regular expression are ignored, except for per-vertex timing
/// information and "Process started" lines with an unrecognized vertex/vertices prefix, which throw.
/// When a branch extracted a timestamp from the line, it becomes the last timestamp seen.
/// </remarks>
/// <param name="line">The line to parse.</param>
/// <exception cref="DryadException">
/// A "Process started" line is inconsistent, or a "Timing Information" line does not match the expected format.
/// </exception>
private void ParseStdoutLine(string line)
{
    DateTime lineTimeStamp = DateTime.MinValue;

    if (line.Contains("Created process execution record"))
    {
        Match m = vertexCreatedRegex.Match(line);
        if (m.Success)
        {
            lineTimeStamp = ParseLineTimestamp(line);
            // Created process execution record for vertex (\d+) \((.*)\) v.(\d+) GUID \{?([-A-F0-9]+)\}?
            int number = Int32.Parse(m.Groups[1].Value);
            string name = m.Groups[2].Value;
            int version = Int32.Parse(m.Groups[3].Value);
            // on some platforms, e.g. HPC, this identifier is not yet assigned properly
            string guid = m.Groups[4].Value;

            // the vertex may be already there, sometimes numbers are reused...
            ExecutedVertexInstance vi = this.jobVertices.FindVertex(number, version);
            if (vi == null)
            {
                vi = new ExecutedVertexInstance(this, number, version, name, guid, lineTimeStamp);
                this.jobVertices.Add(vi);
            }
        }
        else
        {
            m = verticesCreatedRegex.Match(line);
            if (m.Success)
            {
                lineTimeStamp = ParseLineTimestamp(line);
                // Created process execution record for vertices (.*) v.(\d+) GUID \{?([-A-F0-9]+)\}?
                // Created process execution record for vertices 192 (Merge__41[0]) 223 (Union__45[0]) v.0 GUID {0297A91C-FFEA-42EA-94AF-CD0163A04D45}
                int version = Int32.Parse(m.Groups[2].Value);
                string vertices = m.Groups[1].Value;
                // on some platforms, e.g. HPC, this identifier is not yet assigned properly
                string guid = m.Groups[3].Value;

                IEnumerable<Tuple<string, int>> vertexList = DryadLinqJobInfo.ParseVertices(vertices);
                foreach (var p in vertexList)
                {
                    int number = p.Item2;
                    ExecutedVertexInstance vi = this.jobVertices.FindVertex(number, version);
                    if (vi == null)
                    {
                        vi = new ExecutedVertexInstance(this, number, version, p.Item1, guid, lineTimeStamp);
                        this.jobVertices.Add(vi);
                    }
                }
            }
        }
    }
    else if (line.StartsWith("Creating process"))
    {
        Match m = processCreatingRegex.Match(line);
        if (m.Success)
        {
            lineTimeStamp = ParseLineTimestamp(line);
            // Creating process for vertex (\d+) \((.*)\\) v.(\d+) GUID \{?([-A-F0-9]+)\}? machine (\w+)
            int number = Int32.Parse(m.Groups[1].Value);
            int version = Int32.Parse(m.Groups[3].Value);
            string guid = m.Groups[4].Value;

            ExecutedVertexInstance vi = this.jobVertices.FindVertex(number, version);
            if (vi != null)
            {
                this.jobVertices.Remap(vi, guid);
            }
        }
    }
    else if (line.StartsWith("Process was revoked"))
    {
        Match m = revokedRegex.Match(line);
        if (m.Success)
        {
            string oldGuid = m.Groups[1].Value;
            ExecutedVertexInstance vi = this.jobVertices.FindVertexByGuid(oldGuid);
            if (vi != null)
            {
                vi.SetState(ExecutedVertexInstance.VertexState.Revoked);
                string newGuid = m.Groups[2].Value;
                this.jobVertices.Remap(vi, newGuid);
            }
            else
            {
                Trace.TraceInformation("Could not find revoked vertex with guid " + oldGuid);
            }
        }
    }
    else if (line.StartsWith("---HiPriTime"))
    {
        // Scope-specific line which we use to get the i/o information
        // ---HiPriTime D7D51A1F-6693-4378-95FD-FC778A67C632,F52CA694-0202-411E-85E9-0C883E770A0E,SV4_Extract_Split[0],Completed,ch1sch010331112,2011-05-03 15:26:01.681 PDT,2011-05-03 15:26:01.696 PDT,2011-05-03 15:26:02.118 PDT,2011-05-03 15:26:04.286 PDT,2011-05-03 15:26:07.656 PDT,2011-05-03 15:26:01.696 PDT,97390825,1498630
        // The payload starts after the marker and one separator character; a line holding only the
        // marker has no payload and is ignored by the field-count test below.
        string info = line.Length > 13 ? line.Substring(13) : string.Empty;
        string[] parts = info.Split(',');
        if (parts.Length >= 13)
        {
            long read = long.Parse(parts[11]);
            long written = long.Parse(parts[12]);
            string guid = parts[1];

            ExecutedVertexInstance vi = this.jobVertices.FindVertexByGuid(guid);
            if (vi != null)
            {
                vi.DataRead = read;
                vi.DataWritten = written;
                this.TotalDataRead += read;
            }
        }
    }
    else if (line.Contains("Io information"))
    {
        // HPC-specific line
        Match m = ioRegex.Match(line);
        if (m.Success)
        {
            int number = Int32.Parse(m.Groups[1].Value);
            int version = Int32.Parse(m.Groups[2].Value);

            ExecutedVertexInstance vi = this.jobVertices.FindVertex(number, version);
            if (vi != null)
            {
                vi.DataRead = long.Parse(m.Groups[4].Value);
                vi.DataWritten = long.Parse(m.Groups[5].Value);
                this.TotalDataRead += vi.DataRead;
            }
        }
    }
    else if (line.Contains("Process started"))
    {
        // those vertices which are being canceled may not be here
        Match m = vertexStartRegex.Match(line);
        if (m.Success)
        {
            lineTimeStamp = ParseLineTimestamp(line);
            string version = m.Groups[3].Value;
            string guid = m.Groups[4].Value;
            string pid = this.ClusterConfiguration.ExtractPidFromGuid(guid, this.Summary);
            DryadProcessIdentifier identifier = new DryadProcessIdentifier(pid);
            string machine = m.Groups[5].Value;

            // Process started for vertex 4 (Super__0[0]) v.0 GUID {9DDD0B00-C93F-46D2-9073-1CFD27829300} machine sherwood-255
            // Process started for vertices 23 (Merge__29) 24 (Apply__33) v.0 GUID {E945DC5D-9AF6-4732-8770-2A6BF7FA3041} machine sherwood-237
            // This is a list of (number \(name\))* pairs
            // we will assume that the parentheses are matched, or we can't do much
            string vertices = m.Groups[2].Value;

            bool onevertex;
            if (m.Groups[1].Value == "ex") // one vertEX
                onevertex = true;
            else if (m.Groups[1].Value == "ices")
                onevertex = false;
            else
                throw new DryadException("Can't figure out if one or many vertices");

            IEnumerable<Tuple<string, int>> vertexList = DryadLinqJobInfo.ParseVertices(vertices);
            int vertexcount = 0;
            int iversion = int.Parse(version);

            if (lineTimeStamp > this.lastTimestampSeen)
                this.lastTimestampSeen = lineTimeStamp;

            foreach (var p in vertexList)
            {
                int number = p.Item2;
                ExecutedVertexInstance vi = this.jobVertices.FindVertex(number, iversion);
                if (vi == null)
                    Trace.TraceInformation("Could not find information for vertex {0}.{1}", number, version);
                else
                    vi.SetStartInformation(this, machine, this.lastTimestampSeen, identifier, guid);
                vertexcount++;
            }

            if (vertexcount > 1 && onevertex)
                throw new DryadException("Expected one vertex, found " + vertexcount);
        }
        else
        {
            Trace.TraceInformation("Unexpected parsing error on line {0}", line);
        }
    }
    else if (line.Contains("Abandoning"))
    {
        Match m = vertexAbandonedRegex.Match(line);
        if (m.Success)
        {
            int number = Int32.Parse(m.Groups[1].Value);
            int version = Int32.Parse(m.Groups[2].Value);

            ExecutedVertexInstance vi = this.jobVertices.FindVertex(number, version);
            if (vi != null)
                vi.SetState(ExecutedVertexInstance.VertexState.Abandoned);
        }
    }
    else if (line.Contains("Setting"))
    {
        Match m = setToFailedlRegex.Match(line);
        if (m.Success)
        {
            // Setting vertex 1461.0 (Merge__13[258]) to failed
            // Setting vertex (\d+)\.(\d+) \((.+)\) to failed(.*)
            int number = Int32.Parse(m.Groups[1].Value);
            int version = Int32.Parse(m.Groups[2].Value);

            ExecutedVertexInstance vi = this.jobVertices.FindVertex(number, version);
            if (vi != null)
            {
                vi.SetState(ExecutedVertexInstance.VertexState.Failed);
            }
        }
    }
    else if (line.Contains("Process was terminated"))
    {
        // terminatedRegex = new Regex(@"Process was terminated Vertex (\d+)\.(\d+) \((.+)\) GUID \{?([-A-F0-9]+)\}? machine (\S+) status (.*)",
        // Process was terminated Vertex 11.0 (Select__6[1]) GUID {C1E35A88-F5AD-4A26-BE5F-46B6D515623F} machine sherwood-118 status The operation succeeded
        Match m = terminatedRegex.Match(line);
        if (m.Success)
        {
            lineTimeStamp = ParseLineTimestamp(line);
            int number = Int32.Parse(m.Groups[1].Value);
            int version = Int32.Parse(m.Groups[2].Value);

            ExecutedVertexInstance vi = this.jobVertices.FindVertex(number, version);
            if (vi != null)
            {
                // sometimes successful processes are terminated, because they don't report quickly enough being done
                if (vi.State != ExecutedVertexInstance.VertexState.Successful)
                {
                    vi.SetState(ExecutedVertexInstance.VertexState.Cancelled);
                }

                vi.ErrorString = m.Groups[6].Value;
                if (lineTimeStamp != DateTime.MinValue)
                    vi.End = lineTimeStamp;
            }
        }
    }
    else if (line.Contains("Timing Information Graph Start Time"))
    {
        // Cosmos-specific line
        // Timing Information Graph Start Time 128654556581866096
        Match m = Regex.Match(line, @"Timing Information Graph Start Time (\d+)");
        if (m.Success)
        {
            DateTime createTime = Utilities.Convert64time(ClusterConfiguration.GetClusterTimeZone(this.Summary), m.Groups[1].Value);
            this.ManagerVertex.SetStartInformation(this, this.Summary.Machine, createTime, this.Summary.ManagerProcessGuid, "");
            this.ManagerVertex.StartCommandTime = this.ManagerVertex.CreationTime = this.ManagerVertex.VertexScheduleTime = createTime;
            this.lastTimestampSeen = createTime;
        }
        else
        {
            // The marker is present but no numeric time follows it; there is nothing to convert.
            Trace.TraceInformation("Unexpected parsing error on line {0}", line);
        }
    }
    else if (line.StartsWith("Start time: "))
    {
        // HPC L2H specific line
        // Start time: 04/05/2011 17:25:42.223
        DateTime createTime;
        bool parse = DateTime.TryParse(line.Substring("Start time: ".Length), out createTime);
        if (parse)
        {
            this.ManagerVertex.SetStartInformation(this, this.Summary.Machine, createTime, this.Summary.ManagerProcessGuid, "");
            this.ManagerVertex.StartCommandTime = this.ManagerVertex.CreationTime = this.ManagerVertex.VertexScheduleTime = createTime;
            this.lastTimestampSeen = createTime;
        }
    }
    else if (line.Contains("JM Finish time:"))
    {
        // Cosmos-specific line
        // JM Finish time: 129140295499437263 2010-03-25T22:25:49.943726Z
        Match m = Regex.Match(line, @"JM Finish time: (\d+)");
        if (m.Success)
        {
            DateTime time = Utilities.Convert64time(ClusterConfiguration.GetClusterTimeZone(this.Summary), m.Groups[1].Value);
            this.lastTimestampSeen = time;
            this.ManagerVertex.End = time;
        }
        else
        {
            // The marker is present but no numeric time follows it; there is nothing to convert.
            Trace.TraceInformation("Unexpected parsing error on line {0}", line);
        }
    }
    else if (line.StartsWith("Stop time "))
    {
        // HPC L2H specific line
        // Stop time (Exit code = 2148734208): 04/05/2011 17:25:46.614
        Match m = Regex.Match(line, @"Stop time \(Exit code = (.*)\): (.*)");
        if (m.Success)
        {
            this.ManagerStdoutIncomplete = false;

            DateTime time;
            bool parse = DateTime.TryParse(m.Groups[2].Value, out time);
            if (parse)
            {
                this.lastTimestampSeen = time;
                this.ManagerVertex.End = time;
            }

            this.ErrorCode = m.Groups[1].Value;
            if (this.ErrorCode == "0")
            {
                this.ManagerVertex.SetState(ExecutedVertexInstance.VertexState.Successful);
            }
            else
            {
                this.ManagerVertex.SetState(ExecutedVertexInstance.VertexState.Failed);
            }
        }
    }
    else if (line.Contains("Timing Information"))
    {
        // Timing Information 4 1 Super__0[0] 128654556603428182 0.0000 0.0000 0.0000 0.0000 0.2500
        Match m = timingInfoRegex.Match(line);
        if (m.Success)
        {
            int vertex = Int32.Parse(m.Groups[1].Value);
            int version = Int32.Parse(m.Groups[2].Value);
            DateTime createtime = Utilities.Convert64time(ClusterConfiguration.GetClusterTimeZone(this.Summary), m.Groups[4].Value);

            ExecutedVertexInstance vi = jobVertices.FindVertex(vertex, version);
            if (vi == null)
                return; // we do not keep track of vertices with duplicate scheduling, so these won't show up here

            if (vi.State == ExecutedVertexInstance.VertexState.Started)
            {
                Console.WriteLine("Timing information while vertex is still running " + vi);
            }

            DateTime last = vi.SetTiming(createtime, m.Groups[5].Value, m.Groups[6].Value, m.Groups[7].Value, m.Groups[8].Value, m.Groups[9].Value);
            if (last > this.lastTimestampSeen)
                this.lastTimestampSeen = last;
            this.ManagerVertex.MarkVertexWasRunning(last);

            try
            {
                if (vi.State == ExecutedVertexInstance.VertexState.Successful)
                    this.UsefulCPUTime += vi.RunningTime;
                else if (vi.RunningTime > TimeSpan.Zero)
                    this.WastedCPUTime += vi.RunningTime;
            }
            catch (Exception ex)
            {
                Console.WriteLine("Time value exception: " + ex.Message);
            }
        }
        else
            throw new DryadException("Unmatched timing information line " + line);
    }
    else if (line.Contains("Process has failed"))
    {
        // Process has failed Vertex 11.0 (Select__6[1]) GUID {C1E35A88-F5AD-4A26-BE5F-46B6D515623F} machine sherwood-118 Exitcode 0 status The operation succeeded
        // failedRegex = new Regex(@"Process has failed Vertex (\d+)\.(\d+) \((.+)\) GUID \{?([-A-F0-9]+)\}? machine (\S+) Exitcode (.*)",
        Match m = failedRegex.Match(line);
        if (m.Success)
        {
            lineTimeStamp = ParseLineTimestamp(line);
            int vertex = Int32.Parse(m.Groups[1].Value);
            int version = Int32.Parse(m.Groups[2].Value);
            string exitcode = m.Groups[6].Value;

            ExecutedVertexInstance vi = jobVertices.FindVertex(vertex, version);
            if (vi != null)
            {
                vi.SetState(ExecutedVertexInstance.VertexState.Failed);
                vi.ExitCode = exitcode;
                if (lineTimeStamp != DateTime.MinValue)
                    vi.End = lineTimeStamp;
            }
        }
    }
    else if (line.Contains("ABORTING:"))
    {
        // The message is whatever follows the marker, wherever the marker sits in the line.
        const string abortMarker = "ABORTING:";
        int markerAt = line.IndexOf(abortMarker, StringComparison.Ordinal);
        this.AbortingMsg = line.Substring(markerAt + abortMarker.Length).TrimStart();
        this.ManagerVertex.SetState(ExecutedVertexInstance.VertexState.Failed);
    }
    else if (line.Contains("Accurate read data"))
    {
        Match m = datareadRegex.Match(line);
        if (m.Success)
        {
            this.TotalDataRead = long.Parse(m.Groups[1].Value);
            this.LocalReadData = long.Parse(m.Groups[2].Value);
            this.IntraPodDataRead = long.Parse(m.Groups[3].Value);
            this.CrossPodDataRead = long.Parse(m.Groups[4].Value);
        }
    }
    else if (line.Contains("<ErrorString>"))
    {
        // some errors contain "Error returned from managed runtime invocation"
        // which shows the error is from application code
        Match m = Regex.Match(line, @"\<ErrorString\>(.*)\</ErrorString\>");
        if (m.Success && lastFailedVertex != null)
        {
            lastFailedVertex.AddErrorString(System.Web.HttpUtility.HtmlDecode(m.Groups[1].Value));
        }
    }
    else if (line.Contains("Canceling"))
    {
        // Canceling vertex 1461.0 (Merge__13[258]) due to dependent failure
        Match m = cancelRegex.Match(line);
        if (m.Success)
        {
            lineTimeStamp = ParseLineTimestamp(line);
            int vertex = Int32.Parse(m.Groups[1].Value);
            int version = Int32.Parse(m.Groups[2].Value);
            string name = m.Groups[3].Value;

            ExecutedVertexInstance vi = jobVertices.FindVertex(vertex, version);
            if (vi != null)
            {
                if (vi.State == ExecutedVertexInstance.VertexState.Successful)
                    vi.SetState(ExecutedVertexInstance.VertexState.Invalidated);
                else
                    vi.SetState(ExecutedVertexInstance.VertexState.Cancelled);

                if (lineTimeStamp != DateTime.MinValue)
                    vi.End = lineTimeStamp;
            }
            else
            {
                // TODO: this should not be needed, but this is a workaround for a bug in the HPC L2H software
                // Process wasn't even started, so there is nothing to cancel
                vi = new ExecutedVertexInstance(this, vertex, version, name, "", lineTimeStamp);
                vi.SetState(ExecutedVertexInstance.VertexState.Cancelled);
                this.jobVertices.Add(vi);
            }
        }
    }
    else if (line.Contains("Application"))
    {
        // the job failed
        Match m1 = Regex.Match(line, @"Application failed with error code (.*)");
        if (m1.Success)
        {
            this.ErrorCode = m1.Groups[1].Value;
            this.ManagerStdoutIncomplete = false;
            this.ManagerVertex.SetState(ExecutedVertexInstance.VertexState.Failed);
        }
        else
        {
            // the job ends successfully
            Match m2 = Regex.Match(line, @"Application completed successfully.");
            if (m2.Success)
            {
                this.ManagerVertex.SetState(ExecutedVertexInstance.VertexState.Successful);
                this.ManagerStdoutIncomplete = false;
            }
        }
    }
    else if (line.StartsWith("Input"))
    {
        // Input vertex %u (%s) had %u read failure%s\n
        Match m = inputFailureRegex.Match(line);
        if (m.Success)
        {
            this.AbortingMsg = line;
        }
    }
    else if (line.Contains("Vertex"))
    {
        // terminationRegex = new Regex(@"Vertex (\d+)\.(\d+) \((.+)\) machine (\S+) guid \{?([-0-9A-F]+)\}? status (.*)"
        Match m = terminationRegex.Match(line);
        if (m.Success)
        {
            lineTimeStamp = ParseLineTimestamp(line);
            int vertex = Int32.Parse(m.Groups[1].Value);
            int version = Int32.Parse(m.Groups[2].Value);

            ExecutedVertexInstance vi = this.jobVertices.FindVertex(vertex, version);
            if (vi == null)
            {
                Trace.TraceInformation("Could not find vertex {0}.{1} line {2}", vertex, version, line);
            }
            else
            {
                // Remember the vertex so that a following <ErrorString> line can be attached to it.
                bool failed = vi.SetTermination(m.Groups[6].Value, lineTimeStamp);
                if (failed)
                    this.lastFailedVertex = vi;
            }
        }
    }

    if (lineTimeStamp != DateTime.MinValue)
        this.lastTimestampSeen = lineTimeStamp;
}
/// <summary>
/// New JM stdout parsing code, for YARN-based DryadLINQ.
/// </summary>
/// <remarks>
/// Each line is a list of key/value pairs. Lines with a "job" key describe the job manager itself;
/// lines with a "vertex" key describe a vertex transition, a channel error or i/o statistics.
/// Lines that refer to a vertex which is not known are traced and skipped.
/// </remarks>
/// <param name="line">Line to parse.</param>
/// <returns>False if the line terminated in a quoted string and has to be combined with the next line.</returns>
private bool ParseStdoutLineNew(string line)
{
    if (string.IsNullOrWhiteSpace(line))
        return true;

    Dictionary<string, string> kvp = Utilities.ParseCSVKVP(line);
    if (kvp == null)
        return false;

    // Everything from "UTC" onwards is dropped from the timestamp text before it is parsed.
    var strTs = kvp["logtimelocal"];
    int cutOff = strTs.IndexOf("UTC");
    if (cutOff >= 0)
    {
        strTs = strTs.Substring(0, cutOff);
    }

    DateTime timeStamp = DateTime.Parse(strTs, CultureInfo.InvariantCulture);
    timeStamp = timeStamp.ToLocalTime();
    this.lastTimestampSeen = timeStamp;

    if (kvp.ContainsKey("job"))
    {
        string operation = kvp["job"];
        switch (operation)
        {
            case "start":
                this.ManagerVertex.SetStartInformation(this, this.Summary.Machine, timeStamp, this.Summary.ManagerProcessGuid, "");
                this.ManagerVertex.StartCommandTime = this.ManagerVertex.CreationTime = this.ManagerVertex.VertexScheduleTime = timeStamp;
                break;

            case "stop":
                this.ManagerVertex.End = timeStamp;

                string jobExitCode;
                if (kvp.TryGetValue("exitcode", out jobExitCode))
                {
                    this.ErrorCode = jobExitCode;
                    // The exit code is written in hexadecimal.
                    int numCode = Convert.ToInt32(jobExitCode, 16);
                    if (numCode == 0)
                    {
                        this.ManagerVertex.SetState(ExecutedVertexInstance.VertexState.Successful);
                    }
                    else
                    {
                        this.ManagerVertex.SetState(ExecutedVertexInstance.VertexState.Failed);
                    }
                }

                string jobError;
                if (kvp.TryGetValue("errorstring", out jobError))
                {
                    this.ManagerVertex.AddErrorString(jobError);
                    this.AbortingMsg = jobError;
                }
                break;
        }
    }
    else if (kvp.ContainsKey("vertex"))
    {
        // The vertex is either "number.version", or just "number" with a separate "version" key.
        string vertex = kvp["vertex"];
        int number;
        int version;
        int dot = vertex.IndexOf('.');
        if (dot < 0)
        {
            number = int.Parse(vertex);
            version = int.Parse(kvp["version"]);
        }
        else
        {
            number = int.Parse(vertex.Substring(0, dot));
            version = int.Parse(vertex.Substring(dot + 1));
        }

        if (kvp.ContainsKey("transition"))
        {
            string transition = kvp["transition"];
            switch (transition)
            {
                case "created":
                {
                    string name = kvp["name"];
                    ExecutedVertexInstance vi = new ExecutedVertexInstance(this, number, version, name, "", timeStamp);
                    this.jobVertices.Add(vi);
                    break;
                }

                case "starting":
                {
                    // not doing anything
                    break;
                }

                case "running":
                {
                    // The process identifier is stored under "id", or under "process" when "id" is absent.
                    string process;
                    kvp.TryGetValue("id", out process);
                    if (process == null)
                        kvp.TryGetValue("process", out process);
                    string machine = kvp["computer"];

                    ExecutedVertexInstance vi = this.jobVertices.FindVertex(number, version);
                    if (vi == null)
                    {
                        Trace.TraceInformation("Could not find information for vertex {0}.{1}", number, version);
                        break;
                    }

                    this.jobVertices.Remap(vi, process);
                    string pid = this.ClusterConfiguration.ExtractPidFromGuid(process, this.Summary);
                    DryadProcessIdentifier identifier = new DryadProcessIdentifier(pid);
                    vi.SetStartInformation(this, machine, timeStamp, identifier, process);
                    break;
                }

                case "completed":
                {
                    ExecutedVertexInstance vi = this.jobVertices.FindVertex(number, version);
                    if (vi == null)
                    {
                        Trace.TraceInformation("Could not find information for vertex {0}.{1}", number, version);
                        break;
                    }

                    vi.SetState(ExecutedVertexInstance.VertexState.Successful);
                    vi.End = timeStamp;
                    vi.ExitCode = "";
                    this.UsefulCPUTime += vi.RunningTime;
                    break;
                }

                case "failed":
                {
                    ExecutedVertexInstance vi = this.jobVertices.FindVertex(number, version);
                    if (vi == null)
                    {
                        Trace.TraceInformation("Could not find information for vertex {0}.{1}", number, version);
                        break;
                    }

                    // A vertex that fails without having been started is recorded as cancelled.
                    if (vi.State != ExecutedVertexInstance.VertexState.Started)
                        vi.SetState(ExecutedVertexInstance.VertexState.Cancelled);
                    else
                    {
                        vi.SetState(ExecutedVertexInstance.VertexState.Failed);
                        if (vi.RunningTime > TimeSpan.Zero)
                            this.WastedCPUTime += vi.RunningTime;
                    }

                    if (kvp.ContainsKey("errorstring"))
                        vi.AddErrorString(kvp["errorstring"]);

                    string vertexExitCode;
                    if (kvp.TryGetValue("errorcode", out vertexExitCode))
                        vi.ExitCode = vertexExitCode;
                    vi.End = timeStamp;
                    break;
                }
            }
        }
        else if (kvp.ContainsKey("outputChannel") || kvp.ContainsKey("inputChannel"))
        {
            // Channel lines only contribute their error string, for input and output channels alike.
            ExecutedVertexInstance vi = this.jobVertices.FindVertex(number, version);
            if (vi == null)
            {
                Trace.TraceInformation("Could not find information for vertex {0}.{1}", number, version);
                return true;
            }

            if (kvp.ContainsKey("errorstring"))
                vi.AddErrorString(kvp["errorstring"]);
        }
        else if (kvp.ContainsKey("io"))
        {
            string ioKind = kvp["io"];
            if (ioKind != "starting" && ioKind != "total" && ioKind != "running")
                return true;

            ExecutedVertexInstance vi = this.jobVertices.FindVertex(number, version);
            if (vi == null)
            {
                Trace.TraceInformation("Could not find information for vertex {0}.{1}", number, version);
                return true;
            }

            if (ioKind == "starting")
            {
                int numberOfInputs = (int)TryGetNumeric(kvp, "numberOfInputs");
                int numberOfOutputs = (int)TryGetNumeric(kvp, "numberOfOutputs");

                if (vi.InputChannels == null)
                    vi.InputChannels = new Dictionary<int, ChannelEndpointDescription>();
                for (int i = 0; i < numberOfInputs; i++)
                {
                    string uri;
                    if (kvp.TryGetValue("uriIn." + i, out uri))
                    {
                        var ched = new ChannelEndpointDescription(false, i, uri, 0);
                        vi.InputChannels[i] = ched;
                    }
                }

                if (vi.OutputChannels == null)
                    vi.OutputChannels = new Dictionary<int, ChannelEndpointDescription>();
                for (int i = 0; i < numberOfOutputs; i++)
                {
                    string uri;
                    if (kvp.TryGetValue("uriOut." + i, out uri))
                    {
                        var ched = new ChannelEndpointDescription(false, i, uri, 0);
                        vi.OutputChannels[i] = ched;
                    }
                }
            }
            else if (ioKind == "total")
            {
                long totalRead = TryGetNumeric(kvp, "totalRead");
                long tempReadInRack = TryGetNumeric(kvp, "tempReadInRack");
                long tempReadCrossRack = TryGetNumeric(kvp, "tempReadCrossRack");
                long localRead = TryGetNumeric(kvp, "localRead");
                long totalWritten = TryGetNumeric(kvp, "totalWritten");

                vi.DataRead = totalRead;
                vi.DataWritten = totalWritten;

                if (vi.InputChannels != null)
                {
                    foreach (int ch in vi.InputChannels.Keys)
                    {
                        long bytes = TryGetNumeric(kvp, "rb." + ch);
                        vi.InputChannels[ch].Size = bytes;
                    }
                }

                if (vi.OutputChannels != null)
                {
                    foreach (int ch in vi.OutputChannels.Keys)
                    {
                        long bytes = TryGetNumeric(kvp, "wb." + ch);
                        vi.OutputChannels[ch].Size = bytes;
                    }
                }

                this.TotalDataRead += totalRead;
                this.LocalReadData += localRead;
                this.CrossPodDataRead += tempReadCrossRack;
                this.IntraPodDataRead += tempReadInRack;
            }
            else
            {
                // ioKind == "running"
                if (vi.InputChannels != null)
                {
                    foreach (int ch in vi.InputChannels.Keys)
                    {
                        long bytes = TryGetNumeric(kvp, "rb." + ch);
                        vi.InputChannels[ch].Size = bytes;
                        bytes = TryGetNumeric(kvp, "tb." + ch);
                        vi.InputChannels[ch].TotalSize = bytes;
                    }
                }

                // The output channels are guarded by their own null test.
                if (vi.OutputChannels != null)
                {
                    foreach (int ch in vi.OutputChannels.Keys)
                    {
                        long bytes = TryGetNumeric(kvp, "wb." + ch);
                        vi.OutputChannels[ch].Size = bytes;
                    }
                }

                long totalRead = TryGetNumeric(kvp, "totalRead");
                long totalWritten = TryGetNumeric(kvp, "totalWritten");
                vi.DataRead = totalRead;
                vi.DataWritten = totalWritten;
            }
        }
    }

    return true;
}
/// <summary>
/// A vertex has received a new guid: make the vertex reachable through that guid as well.
/// A guid that is already mapped keeps its existing vertex.
/// </summary>
/// <param name="vi">Executed vertex instance.</param>
/// <param name="newGuid">New guid.</param>
internal void Remap(ExecutedVertexInstance vi, string newGuid)
{
    bool alreadyMapped = this.vertexByGuid.ContainsKey(newGuid);
    if (alreadyMapped)
    {
        return;
    }

    this.vertexByGuid.Add(newGuid, vi);
}
/// <summary>
/// Add a new vertex to this job: it is indexed by vertex number, by stage name, by unique id
/// (unless that id is already taken) and in the "All vertices" stage.
/// </summary>
/// <param name="vi">Vertex description to add.</param>
public void Add(ExecutedVertexInstance vi)
{
    // All instances sharing a vertex number live in the same list.
    int number = vi.Number;
    List<ExecutedVertexInstance> sameNumber;
    if (!vertices.TryGetValue(number, out sameNumber))
    {
        sameNumber = new List<ExecutedVertexInstance>();
        vertices.Add(number, sameNumber);
    }
    sameNumber.Add(vi);
    this.count++;

    // The stage membership list is created the first time a stage name is seen.
    string stageName = vi.StageName;
    List<ExecutedVertexInstance> stageMembers;
    if (!this.jobStages.TryGetValue(stageName, out stageMembers))
    {
        stageMembers = new List<ExecutedVertexInstance>();
        this.jobStages.Add(stageName, stageMembers);
    }
    stageMembers.Add(vi);

    // The first vertex registered under a given id keeps it.
    string uniqueId = vi.UniqueID;
    if (!this.vertexByGuid.ContainsKey(uniqueId))
    {
        this.vertexByGuid.Add(uniqueId, vi);
    }

    this.jobStages["All vertices"].Add(vi);
}
/// <summary>
/// Append a vertex to this object's vertex collection.
/// </summary>
/// <param name="e">Vertex to add.</param>
public void AddVertex(ExecutedVertexInstance e) { this.vertices.Add(e); }