// Calls the Batch service to get metrics for a single job. The first time the // MetricMonitor sees a job, it creates a TaskStateCache to hold task state information, // and queries the states of *all* tasks in the job. Subsequent times, it queries // only for tasks whose states have changed since the previous query -- this significant // reduces download volumes for large jobs. In either case, it then updates the // cached task states and aggregates them into a TaskStateCounts object. private async Task CollectTaskMetricsAsync(MetricEvent.Builder metricsBuilder, CloudJob job) { TaskStateCache taskStateCache; bool firstTime = !this.jobStateCache.ContainsKey(job.Id); if (firstTime) { taskStateCache = new TaskStateCache(); this.jobStateCache.Add(job.Id, taskStateCache); } else { taskStateCache = this.jobStateCache[job.Id]; } // If the monitor API is called for the first time, it has to issue a query to enumerate all the tasks once to get its state. // This is a relatively slow query. // Subsequent calls to the monitor API will only look for changes to the task state since the last time the query was issued and // a clock skew (which is within 30 seconds approximately for Azure). Thus if the monitoring API periodicity is 1 minute, then the query // should look for changes in the last minute and 30 seconds. // TODO: it would be better to record the time at which the last query was issued and use that, // rather than subtracting the monitor interval from the current time DateTime since = DateTime.UtcNow - (this.monitorInterval + MaximumClockSkew); var tasksToList = firstTime ? DetailLevels.IdAndState.AllEntities : DetailLevels.IdAndState.OnlyChangedAfter(since); var listTasksTimer = Stopwatch.StartNew(); var tasks = await job.ListTasks(tasksToList).ToListAsync(this.runCancel.Token); listTasksTimer.Stop(); var listTasksLatency = listTasksTimer.Elapsed; foreach (var task in tasks) { taskStateCache.UpdateTaskState(task.Id, task.State.Value); } var taskStateCounts = taskStateCache.GetTaskStateCounts(); metricsBuilder.JobStats.Add(job.Id, new JobMetrics(listTasksLatency, taskStateCounts)); }