Example #1
0
        /// <summary>
        /// Diagnose a list of jobs.
        /// </summary>
        /// <param name="jobs">Jobs to diagnose; null entries are skipped.</param>
        /// <param name="config">Cluster configuration.</param>
        /// <param name="manager">Communication manager.</param>
        /// <returns>The diagnosis logs of the non-null jobs, in input order.</returns>
        public static List<DiagnosisLog> DiagnoseJobs(IEnumerable<DryadLinqJobSummary> jobs, ClusterConfiguration config, CommManager manager)
        {
            // Materialize once so the sequence is enumerated a single time and its size is known.
            DryadLinqJobSummary[] summaries = jobs as DryadLinqJobSummary[] ?? jobs.ToArray();
            int jobCount = summaries.Length;

            List<DiagnosisLog> result = new List<DiagnosisLog>();
            int processed = 0;

            foreach (DryadLinqJobSummary summary in summaries)
            {
                if (summary != null)
                {
                    manager.Token.ThrowIfCancellationRequested();
                    JobFailureDiagnosis diagnosis = JobFailureDiagnosis.CreateJobFailureDiagnosis(config, summary, manager);
                    manager.Status("Diagnosing " + summary.ShortName(), StatusKind.LongOp);
                    DiagnosisLog log = diagnosis.Diagnose();
                    result.Add(log);
                }

                // Skipped (null) entries are part of jobCount, so they must also advance the
                // counter; otherwise the reported progress would never reach 100%.
                processed++;
                manager.Progress(processed * 100 / jobCount);
            }

            manager.Status("Diagnosis complete", StatusKind.OK);
            return result;
        }
Example #2
0
        /// <summary>
        /// Rebuild the clusterJobs field from the *.xml job summaries found in the
        /// "jobs" folder of the cache directory.
        /// </summary>
        /// <param name="virtualCluster">Unused.</param>
        /// <param name="manager">Communication manager.</param>
        protected override void RecomputeClusterJobList(string virtualCluster, CommManager manager)
        {
            this.clusterJobs = new Dictionary<string, ClusterJobInformation>();

            // Without a cache directory there is nothing to list.
            string cacheRoot = CachedClusterResidentObject.CacheDirectory;
            if (string.IsNullOrEmpty(cacheRoot))
            {
                return;
            }

            string jobFolder = Path.Combine(cacheRoot, "jobs");
            if (!Directory.Exists(jobFolder))
            {
                Directory.CreateDirectory(jobFolder);
            }

            foreach (string summaryFile in Directory.GetFiles(jobFolder, "*.xml"))
            {
                manager.Token.ThrowIfCancellationRequested();

                DryadLinqJobSummary summary = Utilities.LoadXml<DryadLinqJobSummary>(summaryFile);

                // Prefix with the cluster: there may be two jobs with the same id from different clusters.
                string key = summary.Cluster + "-" + summary.ClusterJobId;
                ClusterJobInformation info = new ClusterJobInformation(this.Config.Name, summary.Cluster, key, summary.Name, summary.User, summary.Date, summary.EndTime - summary.Date, summary.Status);
                info.SetAssociatedSummary(summary);

                if (!this.clusterJobs.ContainsKey(key))
                {
                    this.clusterJobs.Add(key, info);
                }
                else
                {
                    // The first summary seen for a key wins; later ones are only reported.
                    manager.Status("Duplicate job id, cannot insert in cache " + summary.AsIdentifyingString(), StatusKind.Error);
                }
            }

            manager.Progress(100);
        }
Example #3
0
        /// <summary>
        /// Try to build the job information (and its static plan) from the cluster and the job summary.
        /// On failure an error status is reported and Job / StaticPlan are left untouched.
        /// </summary>
        /// <param name="manager">Communication manager.</param>
        protected void FindJobInfo(CommManager manager)
        {
            DryadLinqJobInfo info = DryadLinqJobInfo.CreateDryadLinqJobInfo(this.cluster, this.Summary, true, manager);
            if (info != null)
            {
                this.Job = info;
                this.StaticPlan = JobObjectModel.DryadJobStaticPlan.CreatePlan(info, manager);
                return;
            }

            manager.Status("Cannot collect information for " + this.Summary.ShortName() + " to diagnose", StatusKind.Error);
        }
Example #4
0
        /// <summary>
        /// Try to cancel every job in a list, reporting the outcome of each attempt.
        /// </summary>
        /// <param name="jobs">Jobs to cancel.</param>
        /// <param name="cluster">Cluster where the jobs are running.</param>
        /// <param name="manager">Communication manager.</param>
        /// <returns>True if all cancellations succeed.</returns>
        // ReSharper disable once UnusedParameter.Global
        public static bool CancelJobs(IEnumerable<DryadLinqJobSummary> jobs, ClusterStatus cluster, CommManager manager)
        {
            bool allCancelled = true;

            foreach (DryadLinqJobSummary job in jobs)
            {
                manager.Token.ThrowIfCancellationRequested();

                bool running = job.Status == ClusterJobInformation.ClusterJobStatus.Running;
                if (!running)
                {
                    manager.Status("Job " + job.Name + " does not appear to be running; will still try to cancel", StatusKind.Error);
                }

                // A failure on one job must not prevent the attempt on the following ones,
                // so any exception is converted into a failed cancellation.
                bool cancelled = false;
                string failure = string.Empty;
                try
                {
                    cancelled = cluster.CancelJob(job);
                }
                catch (Exception ex)
                {
                    failure = ex.Message;
                    Trace.TraceInformation(ex.ToString());
                }

                if (!cancelled)
                {
                    manager.Status("Cancellation of " + job.Name + " failed " + failure, StatusKind.Error);
                    allCancelled = false;
                }
                else
                {
                    manager.Status("Job " + job.Name + " cancelled", StatusKind.OK);
                }
            }

            return allCancelled;
        }
Example #5
0
        /// <summary>
        /// Diagnose a list of jobs.
        /// </summary>
        /// <param name="jobs">Jobs to diagnose; null entries are skipped.</param>
        /// <param name="config">Cluster configuration.</param>
        /// <param name="manager">Communication manager.</param>
        /// <returns>The diagnosis logs of the non-null jobs, in input order.</returns>
        public static List<DiagnosisLog> DiagnoseJobs(IEnumerable<DryadLinqJobSummary> jobs, ClusterConfiguration config, CommManager manager)
        {
            // Materialize once so the sequence is enumerated a single time and its size is known.
            DryadLinqJobSummary[] summaries = jobs as DryadLinqJobSummary[] ?? jobs.ToArray();
            int jobCount = summaries.Length;

            List<DiagnosisLog> result = new List<DiagnosisLog>();
            int processed = 0;
            foreach (DryadLinqJobSummary summary in summaries)
            {
                if (summary != null)
                {
                    manager.Token.ThrowIfCancellationRequested();
                    JobFailureDiagnosis diagnosis = JobFailureDiagnosis.CreateJobFailureDiagnosis(config, summary, manager);
                    manager.Status("Diagnosing " + summary.ShortName(), StatusKind.LongOp);
                    DiagnosisLog log = diagnosis.Diagnose();
                    result.Add(log);
                }

                // Skipped (null) entries are part of jobCount, so they must also advance the
                // counter; otherwise the reported progress would never reach 100%.
                processed++;
                manager.Progress(processed * 100 / jobCount);
            }
            manager.Status("Diagnosis complete", StatusKind.OK);
            return result;
        }
Example #6
0
        /// <summary>
        /// Try to cancel every job in a list, reporting the outcome of each attempt.
        /// </summary>
        /// <param name="jobs">Jobs to cancel.</param>
        /// <param name="cluster">Cluster where the jobs are running.</param>
        /// <param name="manager">Communication manager.</param>
        /// <returns>True if all cancellations succeed.</returns>
        // ReSharper disable once UnusedParameter.Global
        public static bool CancelJobs(IEnumerable<DryadLinqJobSummary> jobs, ClusterStatus cluster, CommManager manager)
        {
            bool allCancelled = true;
            foreach (DryadLinqJobSummary job in jobs)
            {
                manager.Token.ThrowIfCancellationRequested();

                bool running = job.Status == ClusterJobInformation.ClusterJobStatus.Running;
                if (!running)
                {
                    manager.Status("Job " + job.Name + " does not appear to be running; will still try to cancel", StatusKind.Error);
                }

                // A failure on one job must not prevent the attempt on the following ones,
                // so any exception is converted into a failed cancellation.
                bool cancelled = false;
                string failure = string.Empty;
                try
                {
                    cancelled = cluster.CancelJob(job);
                }
                catch (Exception ex)
                {
                    failure = ex.Message;
                    Trace.TraceInformation(ex.ToString());
                }

                if (!cancelled)
                {
                    manager.Status("Cancellation of " + job.Name + " failed " + failure, StatusKind.Error);
                    allCancelled = false;
                }
                else
                {
                    manager.Status("Job " + job.Name + " cancelled", StatusKind.OK);
                }
            }
            return allCancelled;
        }
Example #7
0
        /// <summary>
        /// Try to build the job information (and its static plan) from the cluster and the job summary.
        /// On failure an error status is reported and Job / StaticPlan are left untouched.
        /// </summary>
        /// <param name="manager">Communication manager.</param>
        protected void FindJobInfo(CommManager manager)
        {
            DryadLinqJobInfo info = DryadLinqJobInfo.CreateDryadLinqJobInfo(this.cluster, this.Summary, true, manager);
            if (info != null)
            {
                this.Job = info;
                this.StaticPlan = JobObjectModel.DryadJobStaticPlan.CreatePlan(info, manager);
                return;
            }

            manager.Status("Cannot collect information for " + this.Summary.ShortName() + " to diagnose", StatusKind.Error);
        }
Example #8
0
        /// <summary>
        /// Rebuild the clusterJobs field from the *.xml job summaries found in the
        /// "jobs" folder of the cache directory.
        /// </summary>
        /// <param name="virtualCluster">Unused.</param>
        /// <param name="manager">Communication manager.</param>
        protected override void RecomputeClusterJobList(string virtualCluster, CommManager manager)
        {
            this.clusterJobs = new Dictionary<string, ClusterJobInformation>();

            // Without a cache directory there is nothing to list.
            string cacheRoot = CachedClusterResidentObject.CacheDirectory;
            if (string.IsNullOrEmpty(cacheRoot))
            {
                return;
            }

            string jobFolder = Path.Combine(cacheRoot, "jobs");
            if (!Directory.Exists(jobFolder))
            {
                Directory.CreateDirectory(jobFolder);
            }

            foreach (string summaryFile in Directory.GetFiles(jobFolder, "*.xml"))
            {
                manager.Token.ThrowIfCancellationRequested();

                DryadLinqJobSummary summary = Utilities.LoadXml<DryadLinqJobSummary>(summaryFile);

                // Prefix with the cluster: there may be two jobs with the same id from different clusters.
                string key = summary.Cluster + "-" + summary.ClusterJobId;
                ClusterJobInformation info = new ClusterJobInformation(this.Config.Name, summary.Cluster, key, summary.Name, summary.User, summary.Date, summary.EndTime - summary.Date, summary.Status);
                info.SetAssociatedSummary(summary);

                if (!this.clusterJobs.ContainsKey(key))
                {
                    this.clusterJobs.Add(key, info);
                }
                else
                {
                    // The first summary seen for a key wins; later ones are only reported.
                    manager.Status("Duplicate job id, cannot insert in cache " + summary.AsIdentifyingString(), StatusKind.Error);
                }
            }
            manager.Progress(100);
        }
Example #9
0
        /// <summary>
        /// Cache the vertices in the list; executed on the background thread.
        /// </summary>
        /// <param name="config">Cluster configuration.</param>
        /// <param name="summary">Job to cache.</param>
        /// <param name="vertices">Vertices to cache.</param>
        /// <param name="manager">Communication manager.</param>
        /// <returns>True: success (this method always returns true).</returns>
        private static bool CacheAllVertices(
            ClusterConfiguration config, DryadLinqJobSummary summary, List<ExecutedVertexInstance> vertices,
            CommManager manager)
        {
            int done = 0;
            int todo = vertices.Count;
            int files = 0;
            manager.Status("Caching data for " + todo + " vertices", StatusKind.LongOp);
            foreach (ExecutedVertexInstance v in vertices)
            {
                files += CacheVertexInfo(config, summary, v);
                done++;

                // Progress is a percentage. The plain integer ratio done / todo is 0 for every
                // vertex but the last, so scale before dividing. todo cannot be 0 inside the loop.
                manager.Progress(done * 100 / todo);
            }

            // Also covers the empty list, for which the loop reports nothing.
            manager.Progress(100);
            manager.Status("Cached " + files + " files", StatusKind.OK);
            return true;
        }
Example #10
0
        /// <summary>
        /// Get the contents of a specified cluster resident object.
        /// For a folder the result is a listing of its entries; for a file it is the file text.
        /// </summary>
        /// <param name="manager">Communication manager.</param>
        /// <param name="path">Cluster object whose contents is read.</param>
        /// <param name="pattern">Pattern to filter contents, for folders.</param>
        /// <returns>The file contents, or a FileContents that only carries an error description.</returns>
        private static FileContents GetContents(CommManager manager, IClusterResidentObject path, string pattern)
        {
            if (path == null)
            {
                return new FileContents("Null path");
            }

            StringBuilder output = new StringBuilder();

            // Maps the links shown in the output to the objects they denote.
            Dictionary<string, IClusterResidentObject> linkCache = new Dictionary<string, IClusterResidentObject>();
            linkCache.Add(path.ToString(), path);

            // Starts as a description of the object; error details are appended as they are found.
            string error = (path.RepresentsAFolder ? "Folder " : "") + path;
            if (path.Exception != null)
            {
                error += " [Error accessing: " + path.Exception.Message + "]";
                return new FileContents(error);
            }

            if (path.RepresentsAFolder)
            {
                IEnumerable<IClusterResidentObject> dirs = path.GetFilesAndFolders(pattern);
                int displayed = 0;
                foreach (IClusterResidentObject d in dirs)
                {
                    manager.Token.ThrowIfCancellationRequested();
                    if (d.Exception != null)
                    {
                        // A single broken entry aborts the whole listing.
                        error += " [Error " + d.Exception.Message + "]";
                        return new FileContents(error);
                    }

                    if (d.RepresentsAFolder)
                    {
                        // Folders show "d" in the column where files show their size.
                        string dirdisplay = string.Format("{0:u} {1,16} file://{2}", d.CreationTime, "d", d.Name);
                        output.AppendLine(dirdisplay);
                    }
                    else
                    {
                        string filedisplay = string.Format("{0:u} {1,16:N0} file://{2}", d.CreationTime, d.Size, d.Name);
                        output.AppendLine(filedisplay);
                    }

                    linkCache.Add("file://" + d.Name, d);
                    displayed++;
                }

                if (displayed == 0)
                {
                    error += "[empty]";
                }

                return new FileContents(output.ToString(), error, linkCache);
            }

            manager.Status("Extracting contents of " + path, StatusKind.LongOp);
            ISharedStreamReader sr = path.GetStream();
            if (sr.Exception != null)
            {
                error += " [Error " + sr.Exception.Message + "]";
                return new FileContents(error);
            }

            if (path.Size == 0)
            {
                error += "[empty]";
            }

            string contents;
            try
            {
                contents = sr.ReadToEnd(manager.Token);
            }
            finally
            {
                // The reader was previously left open; release it even when reading is
                // cancelled or fails.
                sr.Close();
            }

            return new FileContents(contents, error, linkCache);
        }
Example #11
0
        /// <summary>
        /// Fill the job info by parsing the stdout.txt.
        /// Creates the job manager vertex on first use.
        /// </summary>
        /// <param name="manager">Communication manager.</param>
        /// <returns>True if the stdout was parsed; false if there is no stdout path or the parsing failed.</returns>
        public bool CollectEssentialInformation(CommManager manager)
        {
            this.RefreshJobStatus(manager);

            if (this.ManagerVertex == null)
            {
                this.ManagerVertex = new ExecutedVertexInstance(this, -1, 0, "JobManager", "", this.Summary.Date);
                this.ManagerVertex.IsManager = true;
                this.ManagerVertex.SetStartInformation(this, this.Summary.Machine, this.Summary.Date, this.Summary.ManagerProcessGuid, "");
                this.ManagerVertex.StartCommandTime = this.ManagerVertex.CreationTime = this.ManagerVertex.VertexScheduleTime = this.Summary.Date;

                // Only a failed job status is mapped to a vertex state; for any other status the
                // manager vertex stays Started (a mapping for Succeeded used to be commented out here).
                bool jobFailed = this.Summary.Status == ClusterJobInformation.ClusterJobStatus.Failed;
                this.ManagerVertex.SetState(
                    jobFailed ? ExecutedVertexInstance.VertexState.Failed : ExecutedVertexInstance.VertexState.Started);
                this.jobVertices.Add(this.ManagerVertex);
            }

            if (this.stdoutpath == null)
            {
                return false;
            }

            bool parsed = this.ParseStdout(this.stdoutpath, manager);
            manager.Progress(100);
            if (!parsed)
            {
                return false;
            }

            this.JobInfoCannotBeCollected = false;
            manager.Status("Stdout parsed", StatusKind.OK);

            this.LastUpdatetime = DateTime.Now;
            if (this.Summary.Status == ClusterJobInformation.ClusterJobStatus.Running)
            {
                // Job still running: every started vertex, and the manager, was running at refresh time.
                foreach (var vertex in this.Vertices.Where(v => v.State == ExecutedVertexInstance.VertexState.Started))
                {
                    vertex.MarkVertexWasRunning(this.LastUpdatetime);
                }

                this.ManagerVertex.MarkVertexWasRunning(this.LastUpdatetime);
            }
            else if (this.jobSummary.Status == ClusterJobInformation.ClusterJobStatus.Failed)
            {
                // Job failed: the manager cannot still be Started, and vertices that never
                // finished are marked as running up to the manager's end time.
                if (this.ManagerVertex.State == ExecutedVertexInstance.VertexState.Started)
                {
                    this.ManagerVertex.SetState(ExecutedVertexInstance.VertexState.Failed);
                }

                foreach (var vertex in this.Vertices.Where(v => v.State == ExecutedVertexInstance.VertexState.Started))
                {
                    vertex.MarkVertexWasRunning(this.ManagerVertex.End);
                }
            }

            return true;
        }
Example #12
0
 /// <summary>
 /// Factory for the information about a job run on the cluster.
 /// Any exception raised while building it is traced and reported through the manager.
 /// </summary>
 /// <param name="cf">Cluster configuration.</param>
 /// <param name="summary">Summary description of the job.</param>
 /// <param name="fill">If true, fill all the information, otherwise the user will have to call FillInformation on the result later.</param>
 /// <param name="manager">Communication manager.</param>
 /// <returns>The Dryad job description, or null.</returns>
 public static DryadLinqJobInfo CreateDryadLinqJobInfo(ClusterConfiguration cf, DryadLinqJobSummary summary, bool fill, CommManager manager)
 {
     DryadLinqJobInfo created = null;
     try
     {
         DryadLinqJobInfo candidate = new DryadLinqJobInfo(cf, summary);
         if (fill)
         {
             // The boolean outcome is not inspected: the job is returned even when filling fails.
             candidate.CollectEssentialInformation(manager);
         }

         created = candidate;
     }
     catch (Exception e)
     {
         Trace.TraceInformation(e.ToString());
         manager.Status("Could not collect job information for " + summary.Name + ": " + e.Message, StatusKind.Error);
     }

     return created;
 }
Example #13
0
        /// <summary>
        /// Discover the vertex channels in a Scope-generated vcmdStart*xml file.
        /// Channels that were already discovered (non-null) are not recomputed.
        /// </summary>
        /// <param name="inputs">If true discover the inputs.</param>
        /// <param name="outputs">If true discover the outputs.</param>
        /// <param name="fast">If true do not discover the channel sizes (much faster). Not used by this method.</param>
        /// <param name="manager">Communication manager.</param>
        /// <returns>True if the discovery was successful.</returns>
        // ReSharper disable UnusedParameter.Global
        public bool DiscoverScopeChannels(bool inputs, bool outputs, bool fast, CommManager manager)
            // ReSharper restore UnusedParameter.Global
        {
            // find the xml file; exactly one match is expected
            var files = this.WorkDirectory.GetFilesAndFolders("vcmdStart*.xml").ToList();
            if (files.Count != 1)
            {
                manager.Status("Cannot locate vcmdStart*.xml file", StatusKind.Error);
                return false;
            }

            ISharedStreamReader sr = files[0].GetStream();
            if (sr.Exception != null)
            {
                manager.Status("Error reading vcmdStart*.xml file: " + sr.Exception.Message, StatusKind.Error);
                return false;
            }

            // The whole document is read into memory, so the reader can be released right away.
            // The finally block also closes it when reading or parsing throws.
            XDocument plan;
            try
            {
                plan = XDocument.Parse(sr.ReadToEnd(manager.Token));
            }
            finally
            {
                sr.Close();
            }

            // ReSharper disable PossibleNullReferenceException
            if (inputs && this.InputChannels == null)
            {
                var channels = new Dictionary<int, ChannelEndpointDescription>();
                int chno = 0;
                foreach (XElement e in plan.Root.Element("inputs").Elements())
                {
                    string chpath = e.Attribute("path").Value;
                    long size = long.Parse(e.Attribute("length").Value);
                    channels.Add(chno, new ChannelEndpointDescription(true, chno, chpath, size));
                    chno++;
                }

                this.InputChannels = channels;
            }

            if (outputs && this.OutputChannels == null)
            {
                var channels = new Dictionary<int, ChannelEndpointDescription>();
                int chno = 0;
                foreach (XElement e in plan.Root.Element("outputs").Elements())
                {
                    string chpath = e.Attribute("path").Value;

                    // Output sizes are not available in the file; -1 is passed as the size.
                    // NOTE(review): the first argument is 'true', exactly as for the inputs above.
                    // If that flag means "is input" this is probably a copy/paste slip - confirm
                    // against the ChannelEndpointDescription constructor before changing it.
                    channels.Add(chno, new ChannelEndpointDescription(true, chno, chpath, -1));
                    chno++;
                }

                this.OutputChannels = channels;
            }
            // ReSharper restore PossibleNullReferenceException

            return true;
        }
Example #14
0
        /// <summary>
        /// Factory: build and parse the static plan of a given job.
        /// </summary>
        /// <param name="dryadLinqJobInfo">Job to create plan for.</param>
        /// <param name="manager">Communication manager.</param>
        /// <returns>The plan or null.</returns>
        public static DryadJobStaticPlan CreatePlan(DryadLinqJobInfo dryadLinqJobInfo, CommManager manager)
        {
            manager.Status("Trying to build static plan", StatusKind.LongOp);

            ClusterConfiguration config = dryadLinqJobInfo.ClusterConfiguration;

            // The plan file is looked up through the job's own configuration, before a cache
            // configuration is replaced by the actual one below.
            IClusterResidentObject planFile = config.JobQueryPlan(dryadLinqJobInfo.Summary);

            CacheClusterConfiguration cached = config as CacheClusterConfiguration;
            if (cached != null)
            {
                config = cached.ActualConfig(dryadLinqJobInfo.Summary);
            }

            if (planFile.Exception != null)
            {
                manager.Status("Exception while looking for plan " + planFile.Exception.Message, StatusKind.Error);
                return null;
            }

            DryadJobStaticPlan plan = new DryadLinqJobStaticPlan(config, planFile.GetStream());
            plan.ParseQueryPlan(manager);
            return plan;
        }
Example #15
0
        /// <summary>
        /// Parse the stdout.txt file from the job manager.
        /// The stream reader is cached in this object between calls and is only closed once
        /// the manager vertex is in the Failed or Successful state.
        /// </summary>
        /// <param name="file">File to parse.</param>
        /// <param name="manager">Communication manager.</param>
        /// <returns>True if the parsing succeeds; false if the stream cannot be opened or an
        /// exception is raised while reading (the failure is reported through the manager).</returns>
        private bool ParseStdout(IClusterResidentObject file, CommManager manager)
        {
            int currentLine = 0;
            if (this.stdoutLinesParsed == 0)
                // don't lose it if we are only parsing the tail.
                this.lastTimestampSeen = this.Summary.Date; // start from the job submission timestamp

            // we are reusing the stream
            // (so the line counter restarts at 0 for the lines read during this call)
            this.stdoutLinesParsed = 0;

            try
            {
                long filesize = file.Size;
                // Despite the name this counts characters (line.Length) of the primary lines only:
                // no line terminators, no continuation lines. It only drives the progress estimate.
                long readbytes = 0;
                string message = "Scanning JM stdout " + file;
                // A negative size is left out of the message.
                if (filesize >= 0)
                    message += string.Format("({0:N0} bytes)", filesize);
                manager.Status(message, StatusKind.LongOp);

                // Open the stream only if no reader is cached from a previous call.
                if (this.cachedStdoutReader == null)
                    this.cachedStdoutReader = file.GetStream();
                if (this.cachedStdoutReader.Exception != null)
                {
                    manager.Status("Exception while opening stdout " + this.cachedStdoutReader.Exception.Message, StatusKind.Error);
                    return false;
                }

                while (!this.cachedStdoutReader.EndOfStream)
                {
                    string line = this.cachedStdoutReader.ReadLine();
                    readbytes += line.Length;
                    // stdoutLinesParsed was reset to 0 above, so this holds for every line
                    // (unless ParseStdoutLineNew modifies the counter - not visible here).
                    if (currentLine >= this.stdoutLinesParsed)
                    {
                        // Keep appending following lines until the parser reports a complete logical line.
                        while (true)
                        {
                            // Outside the inner try: whatever this throws is handled by the outer
                            // catch, which makes the method return false.
                            manager.Token.ThrowIfCancellationRequested();
                            int startLine = currentLine;
                            bool completeLine = true;
                            try
                            {
                                completeLine = this.ParseStdoutLineNew(line);
                            }
                            catch (Exception ex)
                            {
                                // completeLine is still true here, so a line that fails to parse
                                // is reported and then skipped.
                                manager.Status(string.Format("Line {0}: Exception {1}", currentLine, ex.Message), StatusKind.Error);
                                Console.WriteLine("Line {0}: Exception {1}", currentLine, ex);
                            }
                            if (!completeLine)
                            {
                                if (this.cachedStdoutReader.EndOfStream)
                                {
                                    // Caught by the outer catch below: reported, and the method returns false.
                                    throw new Exception("File ended while scanning for closing quote started on line " + startLine);
                                }

                                // Glue the next physical line to the current one and parse again.
                                string extraline = this.cachedStdoutReader.ReadLine();
                                line += "\n" + extraline;
                                currentLine++;
                            }
                            else break;
                        }
                    }
                    currentLine++;
                    // Report progress every 100 lines; clamped to 100 because readbytes is only an estimate.
                    if (currentLine % 100 == 0 && filesize > 0)
                    {
                        manager.Progress(Math.Min(100, (int)(100 * readbytes / filesize)));
                    }
                }

                this.stdoutLinesParsed = currentLine;

                if (this.ManagerVertex != null)
                {
                    if (this.ManagerVertex.End == DateTime.MinValue)
                        // approximation
                        this.ManagerVertex.End = this.lastTimestampSeen;

                    // we are done with this stream
                    // (only once the manager has reached a final state; otherwise the reader stays
                    // cached for the next call)
                    if (this.ManagerVertex.State == ExecutedVertexInstance.VertexState.Failed ||
                        this.ManagerVertex.State == ExecutedVertexInstance.VertexState.Successful)
                    {
                        this.cachedStdoutReader.Close();
                        this.cachedStdoutReader = null; // will force reopening if refreshed
                    }
                }
                return true;
            }
            catch (Exception e)
            {
                // Any failure, including the ones thrown deliberately above, becomes an error
                // status plus a false result instead of propagating to the caller.
                manager.Status("Exception while reading stdout " + e.Message, StatusKind.Error);
                Trace.TraceInformation(e.ToString());
                return false;
            }
        }