예제 #1
0
        /// <summary>
        /// Counts the tasks of a job that are in the Failed state and whose failure reason
        /// is anything other than preemption.
        /// </summary>
        /// <param name="scheduler">Scheduler used to create the filter collection for the query.</param>
        /// <param name="schedulerJob">Job whose task row set is opened and counted.</param>
        /// <returns>
        /// The number of matching tasks. If any exception is thrown while building or running the
        /// query, it is traced as a warning and the count gathered so far (0 unless the row count
        /// had already been read) is returned.
        /// </returns>
        private int GetNonPreemptedFailedTaskCount(IScheduler scheduler, ISchedulerJob schedulerJob)
        {
            int failedCount = 0;

            try
            {
                // Keep only tasks that are Failed AND whose failure reason is not Preempted.
                IFilterCollection taskFilter = scheduler.CreateFilterCollection();
                taskFilter.Add(FilterOperator.NotEqual, TaskPropertyIds.FailureReason, FailureReason.Preempted);
                taskFilter.Add(FilterOperator.Equal, TaskPropertyIds.State, TaskState.Failed);

                // The rows are only counted, so requesting the task id alone is enough.
                IPropertyIdCollection requestedProps = new PropertyIdCollection();
                requestedProps.AddPropertyId(TaskPropertyIds.TaskId);

                using (ISchedulerRowSet matchingTasks = schedulerJob.OpenTaskRowSet(requestedProps, taskFilter, null, true))
                {
                    failedCount = matchingTasks.GetCount();
                }
            }
            catch (Exception ex)
            {
                // Best effort: a failed query is reported but never propagated to the caller.
                TraceHelper.TraceEvent(this.sessionid, TraceEventType.Warning, "[JobMonitorEntry] Failed to get non-preempted failed task count : {0}", ex);
            }

            return failedCount;
        }
예제 #2
0
        /// <summary>
        /// Queries the scheduler for service tasks of this job whose ChangeTime falls after
        /// <c>this.lastChangeTime</c> and converts them into <see cref="TaskInfo"/> entries.
        /// </summary>
        /// <returns>
        /// A list with one <see cref="TaskInfo"/> per reported task, or <c>null</c> when the query
        /// matched no task rows or when any exception was thrown (the exception is traced as a warning).
        /// </returns>
        /// <remarks>
        /// The query window is (this.lastChangeTime, changeTime], where changeTime is DateTime.UtcNow
        /// captured when the method is entered.
        /// When a result list is returned, this.lastChangeTime is advanced to changeTime; on every
        /// <c>null</c> return it is left unchanged, so the same window is queried again next time.
        /// Tasks are silently left out of the result when they have no allocation history entry,
        /// when looking up their node info throws, or when their state is none of
        /// Running/Dispatching/Failed/Canceled/Canceling/Finished/Finishing.
        /// </remarks>
        private List<TaskInfo> GetTaskInfo()
        {
            DateTime changeTime = DateTime.UtcNow;

            try
            {
                TraceHelper.TraceEvent(this.sessionid, TraceEventType.Verbose, "[JobMonitorEntry] Query task info...");

                // Step 1: Query task allocation history to fetch node name and core id for tasks
                Dictionary<int, TaskAllocationHistoryItem> taskInfoDic = new Dictionary<int, TaskAllocationHistoryItem>();
                PropertyIdCollection allocationPropertyCollection = new PropertyIdCollection();
                allocationPropertyCollection.AddPropertyId(AllocationProperties.TaskId);
                allocationPropertyCollection.AddPropertyId(AllocationProperties.CoreId);
                allocationPropertyCollection.AddPropertyId(AllocationProperties.NodeName);
                using (ISchedulerRowEnumerator rows = this.schedulerJob.OpenTaskAllocationHistoryEnumerator(allocationPropertyCollection))
                {
                    foreach (PropertyRow row in rows)
                    {
                        // Note: Finished/Failed/Canceled tasks will also be enumerated here.
                        // They are added to the dictionary regardless of state; the allocation data
                        // is only consumed for Running/Dispatching tasks in step 2.
                        int objectId = (int)row[AllocationProperties.TaskId].Value;
                        TaskAllocationHistoryItem taskInfo;
                        if (taskInfoDic.TryGetValue(objectId, out taskInfo))
                        {
                            // For each task instance cache the assigned resource with the lowest coreId.
                            // This is needed when node or socket allocation is used in order to generate
                            // the correct port to connect to the service host.
                            int coreId = (int)row[AllocationProperties.CoreId].Value;
                            if (taskInfo.FirstCoreId > coreId)
                            {
                                taskInfo.FirstCoreId = coreId;
                            }

                            // One history row per allocated resource: the row count is the task capacity.
                            taskInfo.Capacity++;
                        }
                        else
                        {
                            taskInfo = new TaskAllocationHistoryItem();
                            taskInfo.Capacity = 1;
                            taskInfo.FirstCoreId = (int)row[AllocationProperties.CoreId].Value;
                            taskInfo.NodeName = (string)row[AllocationProperties.NodeName].Value;
                            taskInfoDic.Add(objectId, taskInfo);
                        }
                    }
                }

                TraceHelper.TraceEvent(this.sessionid, TraceEventType.Verbose, "[JobMonitorEntry] Query task info (got task allocation history).");

                // Step 2: Get task states from scheduler
                // Only tasks whose change time is in (lastChangeTime, changeTime] are queried.
                // Only task id and state are requested; node name and core id come from the
                // allocation history collected in step 1.
                IPropertyIdCollection collection = new PropertyIdCollection();
                collection.AddPropertyId(TaskPropertyIds.Id);
                collection.AddPropertyId(TaskPropertyIds.State);
                FilterCollection fc = new FilterCollection();
                fc.Add(FilterOperator.GreaterThan, TaskPropertyIds.ChangeTime, this.lastChangeTime);
                fc.Add(FilterOperator.LessThanOrEqual, TaskPropertyIds.ChangeTime, changeTime);

                // FIXME: There's performance impact on this query because we look for TaskPropertyIds.Type
                // which requires a table join. Need to have a better way to do so.
                fc.Add(FilterOperator.Equal, TaskPropertyIds.Type, TaskType.Service);

                List<PropertyRow> taskRows = new List<PropertyRow>();

                // Copy the rows into a local list, then dispose the enumerator. It is disposable
                // (the allocation history query above wraps the same interface in a using block),
                // and a bare foreach would only dispose the inner IEnumerator, not this object.
                using (ISchedulerRowEnumerator taskEnumerator = this.schedulerJob.OpenTaskEnumerator(collection, fc, null, true))
                {
                    foreach (var taskRow in taskEnumerator)
                    {
                        taskRows.Add(taskRow);
                    }
                }

                TraceHelper.TraceEvent(this.sessionid, TraceEventType.Verbose,
                                       "[JobMonitorEntry] GetTaskInfo, got {0} rows.", taskRows.Count);

                if (taskRows.Count == 0)
                {
                    // No service task changed in the window (e.g. none dispatched yet).
                    // lastChangeTime is deliberately not advanced on this path.
                    TraceHelper.TraceEvent(this.sessionid, TraceEventType.Warning,
                                           "[JobMonitorEntry] Failed to get task property rows.");

                    return null;
                }

                TraceHelper.TraceEvent(this.sessionid, TraceEventType.Verbose, "[JobMonitorEntry] Query task info (got task info rows from scheduler).");

                // Refresh the job object before reading RequeueCount so the value is current.
                this.schedulerJob.Refresh();
                int jobRequeueCount = this.schedulerJob.RequeueCount;
                TraceHelper.TraceEvent(this.sessionid, TraceEventType.Verbose, "[JobMonitorEntry] Job requeue count is {0}", jobRequeueCount);

                List<TaskInfo> results = new List<TaskInfo>(taskRows.Count);
                foreach (PropertyRow row in taskRows)
                {
                    int objectId = (int)row[TaskPropertyIds.Id].Value;

                    // Tasks without any allocation history are not reported at all.
                    TaskAllocationHistoryItem allocation;
                    if (!taskInfoDic.TryGetValue(objectId, out allocation))
                    {
                        continue;
                    }

                    TaskState state = (TaskState)row[TaskPropertyIds.State].Value;

                    if (state == TaskState.Running || state == TaskState.Dispatching)
                    {
                        TraceHelper.TraceEvent(this.sessionid, TraceEventType.Information, "[JobMonitorEntry] Task {0} changed into Running", objectId);

                        string machineName = allocation.NodeName;
                        NodeLocation location = NodeLocation.OnPremise;
                        string azureServiceName = null;
                        string azureLoadBalancerAddress = null;

                        try
                        {
                            this.GetNodeInfo(machineName, out location, out azureServiceName, out azureLoadBalancerAddress);
                        }
                        catch (Exception e)
                        {
                            // If querying node info fails, leave this task out of the current result.
                            // NOTE(review): lastChangeTime is still advanced below, so this task is only
                            // picked up again if its ChangeTime moves into a later window - confirm intended.
                            TraceHelper.TraceEvent(this.sessionid, TraceEventType.Warning, "[JobMonitorEntry] -> Get node info for task {0} throws exception. Exception: {1}", objectId, e);
                            continue;
                        }

                        TraceHelper.TraceEvent(this.sessionid, TraceEventType.Verbose, "[JobMonitorEntry] ->Get machine name for task {0}: {1}", objectId, machineName);

                        int capacity = allocation.Capacity;
                        int coreindex = allocation.FirstCoreId;

                        TraceHelper.TraceEvent(this.sessionid, TraceEventType.Verbose, "[JobMonitorEntry] ->Get coreid for task {0}: {1}", objectId, coreindex);
                        TraceHelper.TraceEvent(this.sessionid, TraceEventType.Verbose, "[JobMonitorEntry] ->Get AzureLoadBalancerAddress for task {0}: {1}", objectId, azureLoadBalancerAddress);

                        TaskInfo info = new TaskInfo();
                        info.Id = objectId;
                        info.Capacity = capacity;

                        // The node name is stored in a different field depending on whether
                        // SoaHelper reports that we are running on Azure.
                        if (SoaHelper.IsOnAzure())
                        {
                            info.MachineVirtualName = machineName;
                        }
                        else
                        {
                            info.MachineName = machineName;
                        }

                        info.Location = NodeLocationConverter.FromHpcNodeLocation(location);
                        info.ProxyServiceName = azureServiceName;
                        info.AzureLoadBalancerAddress = azureLoadBalancerAddress;
                        info.State = TaskStateConverter.FromHpcTaskState(state);
                        info.FirstCoreIndex = coreindex;
                        info.JobRequeueCount = jobRequeueCount;
                        results.Add(info);
                    }
                    else if (state == TaskState.Failed || state == TaskState.Canceled || state == TaskState.Canceling || state == TaskState.Finished || state == TaskState.Finishing)
                    {
                        // Ending/ended tasks only carry id, state and requeue count; no node details.
                        TaskInfo info = new TaskInfo();
                        info.Id = objectId;
                        info.State = TaskStateConverter.FromHpcTaskState(state);
                        info.JobRequeueCount = jobRequeueCount;
                        results.Add(info);
                    }
                }

                // The window was processed; the next query starts from this timestamp.
                this.lastChangeTime = changeTime;
                return results;
            }
            catch (Exception ex)
            {
                TraceHelper.TraceEvent(this.sessionid, TraceEventType.Warning, "[JobMonitorEntry] Fail when get task info: {0}", ex);
                return null;
            }
            finally
            {
                TraceHelper.TraceEvent(this.sessionid, TraceEventType.Verbose, "[JobMonitorEntry] Query task info finished.");
            }
        }
예제 #3
0
        /// <summary>
        /// Recomputes <c>this.minUnits</c> and <c>this.maxUnits</c> for the service job.
        /// </summary>
        /// <remarks>
        /// The user-specified min/max and the scheduler-computed min/max are read for the job's
        /// unit type (node, socket, or cores for any other unit type). The computed value is used
        /// only when the job can shrink (for min) or grow (for max) and the matching auto-calculate
        /// flag is set; otherwise the user-specified value is used. The maximum is finally raised
        /// to the minimum if it would otherwise be smaller, and both values are traced.
        /// NOTE(review): an empty row set is only guarded by Debug.Assert, so it is not checked
        /// in builds where Debug.Assert is compiled out.
        /// </remarks>
        private void CalculateMinAndMax()
        {
            int requestedMax;
            int requestedMin;
            PropertyId[] computedPropIds;

            // Pick the user-specified limits and the matching computed property ids (max first,
            // min second) for the job's unit type.
            JobUnitType unitType = this.schedulerJob.UnitType;
            if (unitType == JobUnitType.Node)
            {
                requestedMax = this.schedulerJob.MaximumNumberOfNodes;
                requestedMin = this.schedulerJob.MinimumNumberOfNodes;
                computedPropIds = new PropertyId[] { JobPropertyIds.ComputedMaxNodes, JobPropertyIds.ComputedMinNodes };
            }
            else if (unitType == JobUnitType.Socket)
            {
                requestedMax = this.schedulerJob.MaximumNumberOfSockets;
                requestedMin = this.schedulerJob.MinimumNumberOfSockets;
                computedPropIds = new PropertyId[] { JobPropertyIds.ComputedMaxSockets, JobPropertyIds.ComputedMinSockets };
            }
            else
            {
                requestedMax = this.schedulerJob.MaximumNumberOfCores;
                requestedMin = this.schedulerJob.MinimumNumberOfCores;
                computedPropIds = new PropertyId[] { JobPropertyIds.ComputedMaxCores, JobPropertyIds.ComputedMinCores };
            }

            // Restrict the job query to this job only.
            IFilterCollection jobFilter = new FilterCollection();
            jobFilter.Add(FilterOperator.Equal, JobPropertyIds.Id, this.schedulerJob.Id);

            IPropertyIdCollection requestedProps = new PropertyIdCollection();
            for (int i = 0; i < computedPropIds.Length; i++)
            {
                requestedProps.AddPropertyId(computedPropIds[i]);
            }

            PropertyRow jobRow;
            using (ISchedulerRowSet rowSet = this.scheduler.OpenJobRowSet(requestedProps, jobFilter, null))
            {
                int lastIndex = rowSet.GetCount() - 1;
                PropertyRow[] fetchedRows = rowSet.GetRows(0, lastIndex).Rows;
                Debug.Assert(fetchedRows.Length > 0);
                jobRow = fetchedRows[0];
            }

            // Props follow the order of computedPropIds: index 0 is the max, index 1 the min.
            string callerName = "[JobMonitorEntry.GetMinAndMax]";
            int computedMax = (int)JobHelper.GetStorePropertyValue(jobRow.Props[0], computedPropIds[0], 0, callerName);
            int computedMin = (int)JobHelper.GetStorePropertyValue(jobRow.Props[1], computedPropIds[1], 0, callerName);

            bool useComputedMin = this.schedulerJob.CanShrink && this.schedulerJob.AutoCalculateMin;
            this.minUnits = useComputedMin ? computedMin : requestedMin;

            bool useComputedMax = this.schedulerJob.CanGrow && this.schedulerJob.AutoCalculateMax;
            this.maxUnits = useComputedMax ? computedMax : requestedMax;

            // Never let the maximum fall below the minimum.
            if (this.maxUnits < this.minUnits)
            {
                this.maxUnits = this.minUnits;
            }

            TraceHelper.TraceEvent(this.sessionid, TraceEventType.Information, "[JobMonitorEntry] MaxUnits = {0}, MinUnits = {1}", this.maxUnits, this.minUnits);
        }