Пример #1
0
        /// <summary>
        /// Queries Azure Batch for the tasks of the current job whose state changed since the last successful query.
        /// </summary>
        /// <param name="nodes">
        /// Compute nodes used to look up the IP address of the node a running task is on (matched by affinity id).
        /// </param>
        /// <returns>
        /// A list of task info for the changed tasks that are in the Running or Completed state;
        /// <c>null</c> when the query returns no task or when any exception is thrown.
        /// </returns>
        /// <remarks>
        /// The query selects tasks whose stateTransitionTime is greater than or equal to this.lastChangeTime.
        /// On success this.lastChangeTime is advanced to the latest state transition time found in the result;
        /// it is left untouched when the method returns <c>null</c>.
        /// </remarks>
        private async Task<List<TaskInfo>> GetTaskStateChangeAsync(List<ComputeNode> nodes)
        {
            try
            {
                TraceHelper.TraceEvent(this.sessionid, TraceEventType.Verbose, "[AzureBatchJobMonitor] Query task info...");

                // Only fetch the properties that are read below, and only for tasks changed since the last query.
                ODATADetailLevel detail = new ODATADetailLevel(filterClause: $"(stateTransitionTime ge datetime'{this.lastChangeTime:O}')", selectClause: "id,nodeInfo,state,stateTransitionTime");
                TraceHelper.TraceEvent(this.sessionid, TraceEventType.Information, "[AzureBatchJobMonitor] Query task info filter clause = {0}\n", detail.FilterClause);
                List<CloudTask> stateChangedTasks = await this.batchClient.JobOperations.ListTasks(this.cloudJob.Id, detail).ToListAsync();

                if (stateChangedTasks.Count == 0)
                {
                    // No service task dispatched yet, or nothing changed since the last query.
                    TraceHelper.TraceEvent(this.sessionid, TraceEventType.Warning,
                                           "[AzureBatchJobMonitorEntry] Failed to get tasks or no task state change.");
                    return null;
                }

                List<TaskInfo> results = new List<TaskInfo>(stateChangedTasks.Count);
                TraceHelper.TraceEvent(this.sessionid, TraceEventType.Information, "[AzureBatchJobMonitor] The number of changed state tasks is {0}", stateChangedTasks.Count);

                // Starts at DateTime.MinValue so that the first task always replaces it.
                DateTime lastStateTransitionTime = new DateTime();
                foreach (CloudTask task in stateChangedTasks)
                {
                    TraceHelper.TraceEvent(this.sessionid, TraceEventType.Information, "[AzureBatchJobMonitor] task {0} state changed to {1}, at date time = {2}\n", task.Id, task.State, task.StateTransitionTime);
                    TaskState state = task.State.Value;
                    DateTime stateTransitionTime = task.StateTransitionTime.Value;
                    if (state == TaskState.Running)
                    {
                        // First() throws when no node matches the task's affinity id; the outer catch then
                        // fails the whole query without advancing this.lastChangeTime.
                        TaskInfo info = new TaskInfo();
                        info.Id = task.Id;
                        info.State = TaskStateConverter.FromAzureBatchTaskState(state);
                        info.MachineName = nodes.First(n => n.AffinityId == task.ComputeNodeInformation.AffinityId)
                                           .IPAddress;
                        info.Capacity = this.nodeCapacity;
                        info.FirstCoreIndex = Int32.Parse(TelepathyConstants.FirstCoreIndex);

                        // The original format string had no placeholder, so the capacity was never written to the trace.
                        TraceHelper.TraceEvent(this.sessionid, TraceEventType.Information, "[AzureBatchJobMonitor] Node capacity in pool is {0}\n", this.nodeCapacity);
                        results.Add(info);
                    }
                    else if (state == TaskState.Completed)
                    {
                        TaskInfo info = new TaskInfo
                        {
                            Id = task.Id,
                            State = TaskStateConverter.FromAzureBatchTaskState(state)
                        };
                        results.Add(info);
                    }

                    // Track the most recent transition time across all returned tasks, whatever their state.
                    if (lastStateTransitionTime <= stateTransitionTime)
                    {
                        lastStateTransitionTime = stateTransitionTime;
                    }
                }

                // Refresh asynchronously instead of blocking the calling thread inside an async method.
                await this.cloudJob.RefreshAsync();
                this.lastChangeTime = lastStateTransitionTime;
                return results;
            }
            catch (Exception ex)
            {
                TraceHelper.TraceEvent(this.sessionid, TraceEventType.Warning, "[AzureBatchJobMonitor] Fail when get task info: {0}", ex);
                return null;
            }
            finally
            {
                TraceHelper.TraceEvent(this.sessionid, TraceEventType.Verbose, "[AzureBatchJobMonitor] Query task info finished.");
            }
        }
Пример #2
0
        /// <summary>
        /// Gets the info of the service tasks whose state changed since the last successful query.
        /// </summary>
        /// <returns>
        /// A list of task info: full node/core information for tasks in the Running or Dispatching state,
        /// and id/state/requeue count only for tasks in the Failed, Canceled, Canceling, Finished or Finishing state.
        /// Returns <c>null</c> when the scheduler returns no task row or when any exception is thrown.
        /// </returns>
        /// <remarks>
        /// The query covers tasks whose ChangeTime is in the range (this.lastChangeTime, UTC time at method entry].
        /// When a result list is returned, this.lastChangeTime is advanced to the upper bound of that range;
        /// it is left untouched when the method returns <c>null</c>.
        /// </remarks>
        private List<TaskInfo> GetTaskInfo()
        {
            DateTime changeTime = DateTime.UtcNow;

            try
            {
                TraceHelper.TraceEvent(this.sessionid, TraceEventType.Verbose, "[JobMonitorEntry] Query task info...");

                // Step 1: Query task allocation history to fetch node id and core id for tasks
                Dictionary<int, TaskAllocationHistoryItem> taskInfoDic = new Dictionary<int, TaskAllocationHistoryItem>();
                PropertyIdCollection allocationPropertyCollection = new PropertyIdCollection();
                allocationPropertyCollection.AddPropertyId(AllocationProperties.TaskId);
                allocationPropertyCollection.AddPropertyId(AllocationProperties.CoreId);
                allocationPropertyCollection.AddPropertyId(AllocationProperties.NodeName);
                using (ISchedulerRowEnumerator rows = this.schedulerJob.OpenTaskAllocationHistoryEnumerator(allocationPropertyCollection))
                {
                    foreach (PropertyRow row in rows)
                    {
                        // Note: Finished/Failed/Canceled tasks will also be enumerated here.
                        // We add them into the dictionary regardless of the state
                        // because only running tasks will be looked up in the following logic.
                        int objectId = (int)row[AllocationProperties.TaskId].Value;
                        int coreId = (int)row[AllocationProperties.CoreId].Value;
                        TaskAllocationHistoryItem taskInfo;
                        if (taskInfoDic.TryGetValue(objectId, out taskInfo))
                        {
                            // For each task instance cache the assigned resource with the lowest coreId. This is needed when node or socket allocation is used
                            //   in order to generate the correct port to connect to the service host
                            if (taskInfo.FirstCoreId > coreId)
                            {
                                taskInfo.FirstCoreId = coreId;
                            }

                            // One allocation row per core: count them to get the task capacity.
                            taskInfo.Capacity++;
                        }
                        else
                        {
                            taskInfo = new TaskAllocationHistoryItem();
                            taskInfo.Capacity = 1;
                            taskInfo.FirstCoreId = coreId;
                            taskInfo.NodeName = (string)row[AllocationProperties.NodeName].Value;
                            taskInfoDic.Add(objectId, taskInfo);
                        }
                    }
                }

                TraceHelper.TraceEvent(this.sessionid, TraceEventType.Verbose, "[JobMonitorEntry] Query task info (got task allocation history).");

                // Step 2: Get task states from scheduler
                // Only tasks whose change time is between lastChangeTime and changeTime will be queried
                // Only task id and state are required; the node name comes from the allocation history obtained in step 1.
                IPropertyIdCollection collection = new PropertyIdCollection();
                collection.AddPropertyId(TaskPropertyIds.Id);
                collection.AddPropertyId(TaskPropertyIds.State);
                FilterCollection fc = new FilterCollection();
                fc.Add(FilterOperator.GreaterThan, TaskPropertyIds.ChangeTime, this.lastChangeTime);
                fc.Add(FilterOperator.LessThanOrEqual, TaskPropertyIds.ChangeTime, changeTime);

                // FIXME: There's performance impact on this query because we look for TaskPropertyIds.Type
                // which requires a table join. Need to have a better way to do so.
                fc.Add(FilterOperator.Equal, TaskPropertyIds.Type, TaskType.Service);

                List<PropertyRow> taskRows = new List<PropertyRow>();

                // The ISchedulerRowSet object is a snapshot and is always a new object, so no lock is needed.
                // Copy the rows out and dispose the enumerator right away, the same way the allocation
                // history enumerator is handled in step 1.
                using (ISchedulerRowEnumerator taskEnumerator = this.schedulerJob.OpenTaskEnumerator(collection, fc, null, true))
                {
                    foreach (PropertyRow taskRow in taskEnumerator)
                    {
                        taskRows.Add(taskRow);
                    }
                }

                TraceHelper.TraceEvent(this.sessionid, TraceEventType.Verbose,
                                       "[JobMonitorEntry] GetTaskInfo, got {0} rows.", taskRows.Count);

                if (taskRows.Count == 0)
                {
                    // No service task dispatched yet.
                    TraceHelper.TraceEvent(this.sessionid, TraceEventType.Warning,
                                           "[JobMonitorEntry] Failed to get task property rows.");

                    return null;
                }

                TraceHelper.TraceEvent(this.sessionid, TraceEventType.Verbose, "[JobMonitorEntry] Query task info (got task info rows from scheduler).");

                this.schedulerJob.Refresh();
                int jobRequeueCount = this.schedulerJob.RequeueCount;
                TraceHelper.TraceEvent(this.sessionid, TraceEventType.Verbose, "[JobMonitorEntry] Job requeue count is {0}", jobRequeueCount);

                List<TaskInfo> results = new List<TaskInfo>(taskRows.Count);
                foreach (PropertyRow row in taskRows)
                {
                    int objectId = (int)row[TaskPropertyIds.Id].Value;

                    // Tasks without any allocation history entry are skipped.
                    TaskAllocationHistoryItem taskInfo;
                    if (!taskInfoDic.TryGetValue(objectId, out taskInfo))
                    {
                        continue;
                    }

                    TaskState state = (TaskState)row[TaskPropertyIds.State].Value;

                    if (state == TaskState.Running || state == TaskState.Dispatching)
                    {
                        TraceHelper.TraceEvent(this.sessionid, TraceEventType.Information, "[JobMonitorEntry] Task {0} changed into Running", objectId);

                        string machineName = taskInfo.NodeName;
                        NodeLocation location = NodeLocation.OnPremise;
                        string azureServiceName = null;
                        string azureLoadBalancerAddress = null;

                        try
                        {
                            this.GetNodeInfo(machineName, out location, out azureServiceName, out azureLoadBalancerAddress);
                        }
                        catch (Exception e)
                        {
                            // If an exception happens when querying node info, just skip this node temporarily.
                            TraceHelper.TraceEvent(this.sessionid, TraceEventType.Warning, "[JobMonitorEntry] -> Get node info for task {0} throws exception. Exception: {1}", objectId, e);
                            continue;
                        }

                        TraceHelper.TraceEvent(this.sessionid, TraceEventType.Verbose, "[JobMonitorEntry] ->Get machine name for task {0}: {1}", objectId, machineName);

                        int capacity = taskInfo.Capacity;
                        int coreindex = taskInfo.FirstCoreId;

                        TraceHelper.TraceEvent(this.sessionid, TraceEventType.Verbose, "[JobMonitorEntry] ->Get coreid for task {0}: {1}", objectId, coreindex);
                        TraceHelper.TraceEvent(this.sessionid, TraceEventType.Verbose, "[JobMonitorEntry] ->Get AzureLoadBalancerAddress for task {0}: {1}", objectId, azureLoadBalancerAddress);

                        TaskInfo info = new TaskInfo();
                        info.Id = objectId;
                        info.Capacity = capacity;

                        // The node name goes into a different property depending on SoaHelper.IsOnAzure().
                        if (SoaHelper.IsOnAzure())
                        {
                            info.MachineVirtualName = machineName;
                        }
                        else
                        {
                            info.MachineName = machineName;
                        }

                        info.Location = NodeLocationConverter.FromHpcNodeLocation(location);
                        info.ProxyServiceName = azureServiceName;
                        info.AzureLoadBalancerAddress = azureLoadBalancerAddress;
                        info.State = TaskStateConverter.FromHpcTaskState(state);
                        info.FirstCoreIndex = coreindex;
                        info.JobRequeueCount = jobRequeueCount;
                        results.Add(info);
                    }
                    else if (state == TaskState.Failed || state == TaskState.Canceled || state == TaskState.Canceling || state == TaskState.Finished || state == TaskState.Finishing)
                    {
                        // Ending/ended tasks only need their id, state and requeue count reported.
                        TaskInfo info = new TaskInfo();
                        info.Id = objectId;
                        info.State = TaskStateConverter.FromHpcTaskState(state);
                        info.JobRequeueCount = jobRequeueCount;
                        results.Add(info);
                    }
                }

                // Remember the upper bound of this query so the next one starts right after it.
                this.lastChangeTime = changeTime;
                return results;
            }
            catch (Exception ex)
            {
                TraceHelper.TraceEvent(this.sessionid, TraceEventType.Warning, "[JobMonitorEntry] Fail when get task info: {0}", ex);
                return null;
            }
            finally
            {
                TraceHelper.TraceEvent(this.sessionid, TraceEventType.Verbose, "[JobMonitorEntry] Query task info finished.");
            }
        }