/// <summary>
/// Start to subscribe the job and task event
/// </summary>
/// <param name="jobid">indicating the job id</param>
/// <returns>
/// A tuple holding the job state converted from the Azure Batch job by
/// <c>AzureBatchJobStateConverter.FromAzureBatchJobAsync</c>, followed by the
/// <c>MaxUnits</c> (autoMax) and <c>MinUnits</c> (autoMin) values read from the job monitor entry.
/// </returns>
/// <remarks>
/// Any exception raised while starting the monitor entry is traced and rethrown to the caller.
/// </remarks>
public async Task<(JobState jobState, int autoMax, int autoMin)> RegisterJobAsync(string jobid)
{
    Trace.TraceInformation($"[AzureBatchSchedulerDelegation] Begin: RegisterJob, job id is {jobid}...");

    //CheckBrokerAccess(jobid);
    int autoMax = 0, autoMin = 0;
    CloudJob batchJob;

    try
    {
        AzureBatchJobMonitorEntry jobMonitorEntry;

        // Reuse the entry already registered for this job, otherwise create a fresh one.
        lock (this.JobMonitors)
        {
            if (!this.JobMonitors.TryGetValue(jobid, out jobMonitorEntry))
            {
                jobMonitorEntry = new AzureBatchJobMonitorEntry(jobid);
                jobMonitorEntry.Exit += new EventHandler(this.JobMonitorEntry_Exit);
            }
        }

        batchJob = await jobMonitorEntry.StartAsync(System.ServiceModel.OperationContext.Current);

        // Bug 18050: Only add/update the instance if it succeeded to
        // open the job. A failed StartAsync throws above, so the dictionary is left untouched.
        lock (this.JobMonitors)
        {
            this.JobMonitors[jobid] = jobMonitorEntry;
        }

        autoMin = jobMonitorEntry.MinUnits;
        autoMax = jobMonitorEntry.MaxUnits;
    }
    catch (Exception e)
    {
        // Use the single-string overload with the exception interpolated into the message.
        // The (format, args) overload has no placeholder for the exception, so it would drop it,
        // and would throw FormatException if the job id ever contained '{' or '}'.
        Trace.TraceError($"[AzureBatchSchedulerDelegation] Exception thrown while registering job: {jobid}. Exception: {e}");
        throw;
    }

    Trace.TraceInformation($"[AzureBatchSchedulerDelegation] End: RegisterJob. Current job state = {batchJob.State}.");
    return (await AzureBatchJobStateConverter.FromAzureBatchJobAsync(batchJob), autoMax, autoMin);
}
/// <summary>
/// Query job info
/// </summary>
/// <remarks>
/// Polls the Azure Batch job in a loop. Each round reads the job state and the task state
/// changes, reports them through <c>ReportJobStateAction</c> when it is set, and then waits
/// <c>pullJobGap</c> milliseconds. The gap doubles every round until it reaches <c>PullJobMaxGap</c>.
/// The loop ends when the Batch job is Completed or Disabled, or when the service answers
/// 404 Not Found for the job.
/// </remarks>
/// <returns>A task that completes when the polling loop exits.</returns>
private async Task QueryJobChangeAsync()
{
    TraceHelper.TraceEvent(this.sessionid, TraceEventType.Verbose, "[AzureBatchJobMonitorEntry] Enters QueryTaskInfo method.");
    bool shouldExit = false;
    this.pullJobGap = PullJobMinGap;
    JobState state = JobState.Active;
    Session.Data.JobState currentJobState = Session.Data.JobState.Configuring;

    // Await the pool lookup instead of blocking a thread inside an async method.
    var pool = await this.batchClient.PoolOperations.GetPoolAsync(AzureBatchConfiguration.BatchPoolName);
    string skuName = pool.VirtualMachineSize;
    TraceHelper.TraceEvent(this.sessionid, TraceEventType.Information, "[AzureBatchJobMonitor] VMSize in pool is {0}", skuName);

    // NOTE(review): Array.Find returns default(SKU) when no entry matches the pool's VM size.
    // If SKU is a reference type the next line throws NullReferenceException; confirm that
    // this.skus always contains the pool's size.
    SKU targetSku = Array.Find(this.skus, sku => sku.Name.Equals(skuName, StringComparison.OrdinalIgnoreCase));
    this.nodeCapacity = targetSku.VCPUs;
    TraceHelper.TraceEvent(this.sessionid, TraceEventType.Information, "[AzureBatchJobMonitor] Node capacity in pool is {0}", this.nodeCapacity);

    // The node list is fetched once, before the loop, and reused for every polling round.
    ODATADetailLevel detailLevel = new ODATADetailLevel();
    detailLevel.SelectClause = "affinityId, ipAddress";
    var nodes = await pool.ListComputeNodes(detailLevel).ToListAsync();

    while (!shouldExit)
    {
        List<TaskInfo> stateChangedTaskList = new List<TaskInfo>();

        try
        {
            TraceHelper.TraceEvent(this.sessionid, TraceEventType.Verbose, "[AzureBatchJobMonitor] Starting get job state.");
            this.cloudJob = await this.batchClient.JobOperations.GetJobAsync(this.cloudJob.Id);

            // Keep the last known Batch state when the service does not return one.
            state = this.cloudJob.State.HasValue ? this.cloudJob.State.Value : state;
            currentJobState = await AzureBatchJobStateConverter.FromAzureBatchJobAsync(this.cloudJob);
            TraceHelper.TraceEvent(this.sessionid, TraceEventType.Information, "[AzureBatchJobMonitor] Current job state in AzureBatch: JobState = {0}\n", state);
            TraceHelper.TraceEvent(this.sessionid, TraceEventType.Information, "[AzureBatchJobMonitor] Current job state in Telepathy: JobState = {0}\n", currentJobState);

            stateChangedTaskList = await this.GetTaskStateChangeAsync(nodes);
            TraceHelper.TraceEvent(this.sessionid, TraceEventType.Information, "[AzureBatchJobMonitor] Previous job state report to AzureBatchJobMonitorEntry: JobState = {0}\n", this.previousJobState);

            if (state == JobState.Completed || state == JobState.Disabled)
            {
                if (this.previousJobState == Session.Data.JobState.Canceling)
                {
                    currentJobState = Session.Data.JobState.Canceled;
                }

                shouldExit = true;
            }
            else if (this.previousJobState == Session.Data.JobState.Canceling)
            {
                // Override current job state as Canceling, because when all tasks turn to be completed, the job state converter will make job state finishing.
                // If one job is cancelling in previous state and now is not in one terminated state, keep reporting cancelling state to job monitor entry.
                currentJobState = Session.Data.JobState.Canceling;
                TraceHelper.TraceEvent(this.sessionid, TraceEventType.Information, "[AzureBatchJobMonitor] Overwrite current job state as {0} in Telepathy according to previous job state {1}\n", currentJobState, this.previousJobState);
            }
        }
        catch (BatchException e)
        {
            TraceHelper.TraceEvent(this.sessionid, TraceEventType.Warning, "[AzureBatchJobMonitor] BatchException thrown when querying job info: {0}", e);

            // If the previous job state is canceling and current job is not found, then the job is deleted.
            // The null-conditional access keeps a BatchException without RequestInformation from
            // raising NullReferenceException inside this handler, which would escape the method
            // and stop the polling loop.
            if (e.RequestInformation?.HttpStatusCode == System.Net.HttpStatusCode.NotFound)
            {
                if (this.previousJobState == Session.Data.JobState.Canceling)
                {
                    TraceHelper.TraceEvent(this.sessionid, TraceEventType.Warning, "[AzureBatchJobMonitor] The queried job has been deleted.");
                }
                else
                {
                    TraceHelper.TraceEvent(this.sessionid, TraceEventType.Warning, "[AzureBatchJobMonitor] The queried job previous state is {0}, we make its state as canceled because it's no longer exist.", this.previousJobState);
                }

                shouldExit = true;
                currentJobState = Session.Data.JobState.Canceled;
            }
        }
        catch (Exception e)
        {
            // Best effort: any other failure is traced and the loop goes on to the next round.
            TraceHelper.TraceEvent(this.sessionid, TraceEventType.Warning, "[AzureBatchJobMonitor] Exception thrown when querying job info: {0}", e);
        }

        try
        {
            if (this.ReportJobStateAction != null)
            {
                TraceHelper.TraceEvent(this.sessionid, TraceEventType.Information, "[AzureBatchJobMonitor] Current job state report to AzureBatchJobMonitorEntry: JobState = {0}\n", currentJobState);
                this.ReportJobStateAction(currentJobState, stateChangedTaskList, shouldExit);
            }
        }
        catch (Exception e)
        {
            TraceHelper.TraceEvent(this.sessionid, TraceEventType.Warning, "[AzureBatchJobMonitor] Exception thrown when report job info: {0}", e);
        }

        this.previousJobState = currentJobState;

        if (!shouldExit)
        {
            TraceHelper.TraceEvent(this.sessionid, TraceEventType.Information, "[AzureBatchJobMonitor] Waiting {0} milliseconds and start another round of getting job state info.", this.pullJobGap);

            // Sleep and pull job again, clear the register pull job flag
            await Task.Delay(this.pullJobGap);

            // Double the gap for the next round, capped at PullJobMaxGap.
            if (this.pullJobGap < PullJobMaxGap)
            {
                this.pullJobGap *= 2;
                if (this.pullJobGap > PullJobMaxGap)
                {
                    this.pullJobGap = PullJobMaxGap;
                }
            }
        }
    }
}