// Gathers one metrics snapshot from the Batch service in two phases:
//
//   1. Enumerate every job in the account.
//   2. Gather per-job metrics for each of those jobs (see CollectTaskMetricsAsync).
//
// Phase 2 runs one job at a time to keep the code simple. Running the
// CollectTaskMetricsAsync calls concurrently would lower latency, but access to the
// shared MetricEvent.Builder would then have to be synchronized.
//
// Any exception raised during collection is caught and returned inside the
// resulting MetricEvent rather than propagated to the caller.
private async Task<MetricEvent> CollectMetricsAsync()
{
    MetricEvent.Builder builder = new MetricEvent.Builder();
    builder.CollectionStarted = DateTime.UtcNow;

    try
    {
        // Covers the whole collection pass: job listing plus every per-job query.
        Stopwatch overallTimer = Stopwatch.StartNew();

        // Covers only the "list jobs" request.
        Stopwatch jobListingTimer = Stopwatch.StartNew();
        var jobQuery = this.batchClient.JobOperations.ListJobs(DetailLevels.IdAndState.AllEntities);
        var allJobs = await jobQuery.ToListAsync(this.runCancel.Token);
        jobListingTimer.Stop();
        builder.ListJobsLatency = jobListingTimer.Elapsed;

        foreach (var currentJob in allJobs)
        {
            await CollectTaskMetricsAsync(builder, currentJob);
        }

        overallTimer.Stop();
        builder.TotalLatency = overallTimer.Elapsed;
        builder.CollectionCompleted = DateTime.UtcNow;

        MetricEvent snapshot = builder.Build();
        return snapshot;
    }
    catch (Exception ex)
    {
        // Report the failure as an event stamped with the start time and the time of failure.
        MetricEvent failure = new MetricEvent(builder.CollectionStarted, DateTime.UtcNow, ex);
        return failure;
    }
}
// Calls the Batch service to get metrics for a single job and records them in
// metricsBuilder.JobStats under the job's id.
//
// The first time the monitor sees a job, it creates a TaskStateCache to hold task state
// information and queries the states of *all* tasks in the job. Subsequent times, it
// queries only for tasks whose states have changed recently -- this significantly
// reduces download volumes for large jobs. In either case, it then updates the cached
// task states and aggregates them into task state counts.
//
// metricsBuilder: accumulates results; a JobMetrics entry keyed by job.Id is added.
// job: the job whose tasks are listed.
private async Task CollectTaskMetricsAsync(MetricEvent.Builder metricsBuilder, CloudJob job)
{
    // Single lookup: TryGetValue tells us whether the job is already cached and, if so,
    // hands back its cache, instead of ContainsKey followed by a second indexer lookup.
    TaskStateCache taskStateCache;
    bool firstTime = !this.jobStateCache.TryGetValue(job.Id, out taskStateCache);
    if (firstTime)
    {
        taskStateCache = new TaskStateCache();
        this.jobStateCache.Add(job.Id, taskStateCache);
    }

    // If the monitor API is called for the first time, it has to issue a query to enumerate
    // all the tasks once to get their state. This is a relatively slow query.
    // Subsequent calls only look for changes to the task state since the last time the query
    // was issued, plus an allowance for clock skew (which is within 30 seconds approximately
    // for Azure). Thus if the monitoring periodicity is 1 minute, the query should look for
    // changes in the last minute and 30 seconds.
    // TODO: it would be better to record the time at which the last query was issued and use
    // that, rather than subtracting the monitor interval from the current time.
    DateTime since = DateTime.UtcNow - (this.monitorInterval + MaximumClockSkew);
    var tasksToList = firstTime
        ? DetailLevels.IdAndState.AllEntities
        : DetailLevels.IdAndState.OnlyChangedAfter(since);

    // Time only the list-tasks request itself.
    var listTasksTimer = Stopwatch.StartNew();
    var tasks = await job.ListTasks(tasksToList).ToListAsync(this.runCancel.Token);
    listTasksTimer.Stop();
    var listTasksLatency = listTasksTimer.Elapsed;

    // Fold the returned states into the cache (all tasks the first time, changed ones after).
    foreach (var task in tasks)
    {
        taskStateCache.UpdateTaskState(task.Id, task.State.Value);
    }

    var taskStateCounts = taskStateCache.GetTaskStateCounts();

    metricsBuilder.JobStats.Add(job.Id, new JobMetrics(listTasksLatency, taskStateCounts));
}
// Queries the Batch service and assembles a MetricEvent. The work has two steps:
//
//   1. Fetch the list of all jobs in the account.
//   2. For every job returned, collect that job's metrics (see CollectTaskMetricsAsync).
//
// Step 2 is intentionally sequential for simplicity. Issuing the
// CollectTaskMetricsAsync calls in parallel would cut latency, at the cost of having
// to synchronize access to the MetricEvent.Builder that accumulates the results.
//
// If anything throws, the exception is wrapped in the returned MetricEvent instead
// of escaping this method.
private async Task<MetricEvent> CollectMetricsAsync()
{
    DateTime startedAt = DateTime.UtcNow;
    MetricEvent.Builder eventBuilder = new MetricEvent.Builder { CollectionStarted = startedAt };

    try
    {
        // End-to-end timer for the entire collection pass.
        Stopwatch endToEndWatch = Stopwatch.StartNew();

        // Timer for the job listing request alone.
        Stopwatch listWatch = Stopwatch.StartNew();
        var pendingJobs = this.batchClient.JobOperations.ListJobs(DetailLevels.IdAndState.AllEntities);
        var jobList = await pendingJobs.ToListAsync(this.runCancel.Token);
        listWatch.Stop();
        eventBuilder.ListJobsLatency = listWatch.Elapsed;

        foreach (var batchJob in jobList)
        {
            await CollectTaskMetricsAsync(eventBuilder, batchJob);
        }

        endToEndWatch.Stop();
        eventBuilder.TotalLatency = endToEndWatch.Elapsed;
        eventBuilder.CollectionCompleted = DateTime.UtcNow;

        return eventBuilder.Build();
    }
    catch (Exception ex)
    {
        // Surface the failure as an event carrying the start time, the failure time and the error.
        return new MetricEvent(eventBuilder.CollectionStarted, DateTime.UtcNow, ex);
    }
}