/// <summary>
/// Queries Azure Batch for the tasks of the current job whose state changed since the last successful poll.
/// </summary>
/// <param name="nodes">
/// Compute nodes used to resolve the IP address of a running task by matching AffinityId.
/// </param>
/// <returns>
/// A list with one <see cref="TaskInfo"/> per returned task that is Running or Completed
/// (tasks in any other state are omitted), or null when the query returns no task or an exception is thrown.
/// </returns>
/// <remarks>
/// The query selects tasks whose stateTransitionTime is greater than or equal to this.lastChangeTime.
/// On success this.lastChangeTime is advanced to the newest StateTransitionTime among the returned tasks.
/// Because the filter is inclusive ("ge"), a task whose transition time equals this.lastChangeTime
/// is returned again by the next query.
/// </remarks>
private async Task<List<TaskInfo>> GetTaskStateChangeAsync(List<ComputeNode> nodes)
{
    try
    {
        TraceHelper.TraceEvent(this.sessionid, TraceEventType.Verbose, "[AzureBatchJobMonitor] Query task info...");

        // Only fetch the properties consumed below, and only tasks changed since the last poll.
        ODATADetailLevel detail = new ODATADetailLevel(
            filterClause: $"(stateTransitionTime ge datetime'{this.lastChangeTime:O}')",
            selectClause: "id,nodeInfo,state,stateTransitionTime");
        TraceHelper.TraceEvent(this.sessionid, TraceEventType.Information, "[AzureBatchJobMonitor] Query task info filter clause = {0}\n", detail.FilterClause);

        List<CloudTask> stateChangedTasks = await this.batchClient.JobOperations.ListTasks(this.cloudJob.Id, detail).ToListAsync();
        if (stateChangedTasks.Count == 0)
        {
            // No service task dispatched yet, or nothing changed since the last poll.
            TraceHelper.TraceEvent(this.sessionid, TraceEventType.Warning, "[AzureBatchJobMonitorEntry] Failed to get tasks or no task state change.");
            return null;
        }

        List<TaskInfo> results = new List<TaskInfo>(stateChangedTasks.Count);
        TraceHelper.TraceEvent(this.sessionid, TraceEventType.Information, "[AzureBatchJobMonitor] The number of changed state tasks is {0}", stateChangedTasks.Count);

        // Tracks the newest transition time seen; starts at DateTime.MinValue.
        DateTime lastStateTransitionTime = new DateTime();
        foreach (CloudTask task in stateChangedTasks)
        {
            TraceHelper.TraceEvent(this.sessionid, TraceEventType.Information, "[AzureBatchJobMonitor] task {0} state changed to {1}, at date time = {2}\n", task.Id, task.State, task.StateTransitionTime);

            TaskState state = task.State.Value;
            DateTime stateTransitionTime = task.StateTransitionTime.Value;

            if (state == TaskState.Running)
            {
                TaskInfo info = new TaskInfo();
                info.Id = task.Id;
                info.State = TaskStateConverter.FromAzureBatchTaskState(state);

                // First() throws when no node matches; the outer catch then returns null without
                // advancing lastChangeTime, so the same tasks are queried again on the next poll.
                info.MachineName = nodes.First(n => n.AffinityId == task.ComputeNodeInformation.AffinityId).IPAddress;
                info.Capacity = this.nodeCapacity;
                info.FirstCoreIndex = Int32.Parse(TelepathyConstants.FirstCoreIndex);

                // The format string previously had no placeholder, so the capacity was never written to the trace.
                TraceHelper.TraceEvent(this.sessionid, TraceEventType.Information, "[AzureBatchJobMonitor] Node capacity in pool is {0}\n", this.nodeCapacity);
                results.Add(info);
            }
            else if (state == TaskState.Completed)
            {
                TaskInfo info = new TaskInfo
                {
                    Id = task.Id,
                    State = TaskStateConverter.FromAzureBatchTaskState(state)
                };
                results.Add(info);
            }

            // Keep the latest transition time (ties move forward too, matching the original Compare(...) < 1).
            if (lastStateTransitionTime <= stateTransitionTime)
            {
                lastStateTransitionTime = stateTransitionTime;
            }
        }

        // Await the refresh instead of blocking a thread inside an async method.
        await this.cloudJob.RefreshAsync();
        this.lastChangeTime = lastStateTransitionTime;
        return results;
    }
    catch (Exception ex)
    {
        TraceHelper.TraceEvent(this.sessionid, TraceEventType.Warning, "[AzureBatchJobMonitor] Fail when get task info: {0}", ex);
        return null;
    }
    finally
    {
        TraceHelper.TraceEvent(this.sessionid, TraceEventType.Verbose, "[AzureBatchJobMonitor] Query task info finished.");
    }
}
/// <summary>
/// Queries the scheduler for service tasks of the current job whose state changed since the last successful poll.
/// </summary>
/// <returns>
/// A list with one <see cref="TaskInfo"/> per changed task that has an allocation history entry and is in a
/// reported state (Running/Dispatching, or Failed/Canceled/Canceling/Finished/Finishing),
/// or null when the query returns no rows or an exception is thrown.
/// </returns>
/// <remarks>
/// The query window is (this.lastChangeTime, changeTime], where changeTime is DateTime.UtcNow captured on entry.
/// this.lastChangeTime is set to changeTime only when the method returns a result list; it is left untouched
/// when the method returns null.
/// </remarks>
private List<TaskInfo> GetTaskInfo()
{
    // Upper bound of the query window; captured before any query is issued.
    DateTime changeTime = DateTime.UtcNow;

    try
    {
        TraceHelper.TraceEvent(this.sessionid, TraceEventType.Verbose, "[JobMonitorEntry] Query task info...");

        // Step 1: Query task allocation history to fetch node name and core id for tasks.
        Dictionary<int, TaskAllocationHistoryItem> taskInfoDic = new Dictionary<int, TaskAllocationHistoryItem>();
        PropertyIdCollection allocationPropertyCollection = new PropertyIdCollection();
        allocationPropertyCollection.AddPropertyId(AllocationProperties.TaskId);
        allocationPropertyCollection.AddPropertyId(AllocationProperties.CoreId);
        allocationPropertyCollection.AddPropertyId(AllocationProperties.NodeName);

        using (ISchedulerRowEnumerator rows = this.schedulerJob.OpenTaskAllocationHistoryEnumerator(allocationPropertyCollection))
        {
            foreach (PropertyRow row in rows)
            {
                // Note: Finished/Failed/Canceled tasks are also enumerated here.
                // They are added to the dictionary regardless of state; allocation details are
                // only consumed for Running/Dispatching tasks in step 2.
                int objectId = (int)row[AllocationProperties.TaskId].Value;

                TaskAllocationHistoryItem taskInfo;
                if (taskInfoDic.TryGetValue(objectId, out taskInfo))
                {
                    // For each task instance cache the assigned resource with the lowest coreId.
                    // This is needed when node or socket allocation is used
                    // in order to generate the correct port to connect to the service host.
                    int coreId = (int)row[AllocationProperties.CoreId].Value;
                    if (taskInfo.FirstCoreId > coreId)
                    {
                        taskInfo.FirstCoreId = coreId;
                    }

                    taskInfo.Capacity++;
                }
                else
                {
                    taskInfo = new TaskAllocationHistoryItem();
                    taskInfo.Capacity = 1;
                    taskInfo.FirstCoreId = (int)row[AllocationProperties.CoreId].Value;
                    taskInfo.NodeName = (string)row[AllocationProperties.NodeName].Value;
                    taskInfoDic.Add(objectId, taskInfo);
                }
            }
        }

        TraceHelper.TraceEvent(this.sessionid, TraceEventType.Verbose, "[JobMonitorEntry] Query task info (got task allocation history).");

        // Step 2: Get task states from the scheduler.
        // Only tasks whose change time is in (lastChangeTime, changeTime] are queried.
        // Only task id and state are requested; node name and core id come from the allocation history of step 1.
        IPropertyIdCollection collection = new PropertyIdCollection();
        collection.AddPropertyId(TaskPropertyIds.Id);
        collection.AddPropertyId(TaskPropertyIds.State);

        FilterCollection fc = new FilterCollection();
        fc.Add(FilterOperator.GreaterThan, TaskPropertyIds.ChangeTime, this.lastChangeTime);
        fc.Add(FilterOperator.LessThanOrEqual, TaskPropertyIds.ChangeTime, changeTime);

        // FIXME: There's performance impact on this query because we look for TaskPropertyIds.Type
        // which requires a table join. Need to have a better way to do so.
        fc.Add(FilterOperator.Equal, TaskPropertyIds.Type, TaskType.Service);

        List<PropertyRow> taskRows = new List<PropertyRow>();

        // Dispose the row enumerator like the allocation history enumerator above; foreach alone
        // only disposes the inner IEnumerator, not the enumerator object returned by the scheduler.
        using (ISchedulerRowEnumerator taskEnumerator = this.schedulerJob.OpenTaskEnumerator(collection, fc, null, true))
        {
            foreach (PropertyRow taskRow in taskEnumerator)
            {
                taskRows.Add(taskRow);
            }
        }

        TraceHelper.TraceEvent(this.sessionid, TraceEventType.Verbose, "[JobMonitorEntry] GetTaskInfo, got {0} rows.", taskRows.Count);

        if (taskRows.Count == 0)
        {
            // No service task dispatched yet, or nothing changed inside the query window.
            TraceHelper.TraceEvent(this.sessionid, TraceEventType.Warning, "[JobMonitorEntry] Failed to get task property rows.");
            return null;
        }

        TraceHelper.TraceEvent(this.sessionid, TraceEventType.Verbose, "[JobMonitorEntry] Query task info (got task info rows from scheduler).");

        // Refresh the job before reading its requeue count.
        this.schedulerJob.Refresh();
        int jobRequeueCount = this.schedulerJob.RequeueCount;
        TraceHelper.TraceEvent(this.sessionid, TraceEventType.Verbose, "[JobMonitorEntry] Job requeue count is {0}", jobRequeueCount);

        List<TaskInfo> results = new List<TaskInfo>(taskRows.Count);
        foreach (PropertyRow row in taskRows)
        {
            int objectId = (int)row[TaskPropertyIds.Id].Value;

            // Tasks without an allocation history entry are skipped, whatever their state.
            TaskAllocationHistoryItem taskInfo;
            if (!taskInfoDic.TryGetValue(objectId, out taskInfo))
            {
                continue;
            }

            TaskState state = (TaskState)row[TaskPropertyIds.State].Value;

            if (state == TaskState.Running || state == TaskState.Dispatching)
            {
                TraceHelper.TraceEvent(this.sessionid, TraceEventType.Information, "[JobMonitorEntry] Task {0} changed into Running", objectId);

                string machineName = taskInfo.NodeName;
                NodeLocation location = NodeLocation.OnPremise;
                string azureServiceName = null;
                string azureLoadBalancerAddress = null;

                try
                {
                    this.GetNodeInfo(machineName, out location, out azureServiceName, out azureLoadBalancerAddress);
                }
                catch (Exception e)
                {
                    // If an exception happens when querying node info, just skip this task temporarily.
                    TraceHelper.TraceEvent(this.sessionid, TraceEventType.Warning, "[JobMonitorEntry] -> Get node info for task {0} throws exception. Exception: {1}", objectId, e);
                    continue;
                }

                TraceHelper.TraceEvent(this.sessionid, TraceEventType.Verbose, "[JobMonitorEntry] ->Get machine name for task {0}: {1}", objectId, machineName);

                int capacity = taskInfo.Capacity;
                int coreindex = taskInfo.FirstCoreId;

                TraceHelper.TraceEvent(this.sessionid, TraceEventType.Verbose, "[JobMonitorEntry] ->Get coreid for task {0}: {1}", objectId, coreindex);
                TraceHelper.TraceEvent(this.sessionid, TraceEventType.Verbose, "[JobMonitorEntry] ->Get AzureLoadBalancerAddress for task {0}: {1}", objectId, azureLoadBalancerAddress);

                TaskInfo info = new TaskInfo();
                info.Id = objectId;
                info.Capacity = capacity;

                // The node name goes into MachineVirtualName on Azure and into MachineName otherwise.
                if (SoaHelper.IsOnAzure())
                {
                    info.MachineVirtualName = machineName;
                }
                else
                {
                    info.MachineName = machineName;
                }

                info.Location = NodeLocationConverter.FromHpcNodeLocation(location);
                info.ProxyServiceName = azureServiceName;
                info.AzureLoadBalancerAddress = azureLoadBalancerAddress;
                info.State = TaskStateConverter.FromHpcTaskState(state);
                info.FirstCoreIndex = coreindex;
                info.JobRequeueCount = jobRequeueCount;
                results.Add(info);
            }
            else if (state == TaskState.Failed || state == TaskState.Canceled || state == TaskState.Canceling || state == TaskState.Finished || state == TaskState.Finishing)
            {
                // Ended or ending tasks only carry id, state and requeue count.
                TaskInfo info = new TaskInfo();
                info.Id = objectId;
                info.State = TaskStateConverter.FromHpcTaskState(state);
                info.JobRequeueCount = jobRequeueCount;
                results.Add(info);
            }
        }

        // Advance the window only after the results were built successfully.
        this.lastChangeTime = changeTime;
        return results;
    }
    catch (Exception ex)
    {
        TraceHelper.TraceEvent(this.sessionid, TraceEventType.Warning, "[JobMonitorEntry] Fail when get task info: {0}", ex);
        return null;
    }
    finally
    {
        TraceHelper.TraceEvent(this.sessionid, TraceEventType.Verbose, "[JobMonitorEntry] Query task info finished.");
    }
}