/// <summary>
/// Counts the tasks of the given job that are in the Failed state and whose
/// failure reason is anything other than preemption.
/// </summary>
/// <param name="scheduler">Scheduler used to create the filter collection for the query.</param>
/// <param name="schedulerJob">Job whose task row set is opened and counted.</param>
/// <returns>
/// The number of rows matching the filter, or 0 if the query threw
/// (the exception is traced as a warning and not propagated).
/// </returns>
private int GetNonPreemptedFailedTaskCount(IScheduler scheduler, ISchedulerJob schedulerJob)
{
    // Stays 0 unless the row count is read successfully.
    int failedCount = 0;

    try
    {
        // Match tasks that are Failed AND whose failure reason is NOT preemption.
        IFilterCollection taskFilter = scheduler.CreateFilterCollection();
        taskFilter.Add(FilterOperator.NotEqual, TaskPropertyIds.FailureReason, FailureReason.Preempted);
        taskFilter.Add(FilterOperator.Equal, TaskPropertyIds.State, TaskState.Failed);

        // Only the count is needed, so request just the task id column.
        IPropertyIdCollection requestedProps = new PropertyIdCollection();
        requestedProps.AddPropertyId(TaskPropertyIds.TaskId);

        using (ISchedulerRowSet failedTaskRows = schedulerJob.OpenTaskRowSet(requestedProps, taskFilter, null, true))
        {
            failedCount = failedTaskRows.GetCount();
        }
    }
    catch (Exception ex)
    {
        // Best effort: report the failure and fall through with whatever count we have.
        TraceHelper.TraceEvent(
            this.sessionid,
            TraceEventType.Warning,
            "[JobMonitorEntry] Failed to get non-preempted failed task count : {0}",
            ex);
    }

    return failedCount;
}
/// <summary>
/// Queries the scheduler for the service tasks of the job whose state changed since the
/// last successful query and converts them into <see cref="TaskInfo"/> entries.
/// </summary>
/// <returns>
/// A list of task info entries (possibly empty), or null when the scheduler returned no
/// changed service task rows or when any exception was thrown during the query.
/// </returns>
/// <remarks>
/// Only tasks whose ChangeTime lies in the window (this.lastChangeTime, changeTime] are queried,
/// where changeTime is DateTime.UtcNow captured when the method is entered.
/// this.lastChangeTime is advanced to changeTime only when a result list is built successfully;
/// it is left untouched on every path that returns null.
/// </remarks>
private List<TaskInfo> GetTaskInfo()
{
    // Upper bound of the query window; also becomes the new lastChangeTime on success.
    DateTime changeTime = DateTime.UtcNow;

    try
    {
        TraceHelper.TraceEvent(this.sessionid, TraceEventType.Verbose, "[JobMonitorEntry] Query task info...");

        // Step 1: Query task allocation history to fetch node name and core id for tasks
        Dictionary<int, TaskAllocationHistoryItem> taskInfoDic = new Dictionary<int, TaskAllocationHistoryItem>();
        PropertyIdCollection allocationPropertyCollection = new PropertyIdCollection();
        allocationPropertyCollection.AddPropertyId(AllocationProperties.TaskId);
        allocationPropertyCollection.AddPropertyId(AllocationProperties.CoreId);
        allocationPropertyCollection.AddPropertyId(AllocationProperties.NodeName);

        using (ISchedulerRowEnumerator rows = this.schedulerJob.OpenTaskAllocationHistoryEnumerator(allocationPropertyCollection))
        {
            foreach (PropertyRow row in rows)
            {
                // Note: Finished/Failed/Canceled tasks will also be enumerated here.
                // They are added to the dictionary regardless of state; the state itself
                // comes from the task query in step 2.
                int objectId = (int)row[AllocationProperties.TaskId].Value;
                TaskAllocationHistoryItem taskInfo;
                if (taskInfoDic.TryGetValue(objectId, out taskInfo))
                {
                    // For each task instance cache the assigned resource with the lowest coreId.
                    // This is needed when node or socket allocation is used
                    // in order to generate the correct port to connect to the service host.
                    int coreId = (int)row[AllocationProperties.CoreId].Value;
                    if (taskInfo.FirstCoreId > coreId)
                    {
                        taskInfo.FirstCoreId = coreId;
                    }

                    // Every additional allocation row for the same task counts as one more unit of capacity.
                    taskInfo.Capacity++;
                }
                else
                {
                    taskInfo = new TaskAllocationHistoryItem();
                    taskInfo.Capacity = 1;
                    taskInfo.FirstCoreId = (int)row[AllocationProperties.CoreId].Value;
                    taskInfo.NodeName = (string)row[AllocationProperties.NodeName].Value;
                    taskInfoDic.Add(objectId, taskInfo);
                }
            }
        }

        TraceHelper.TraceEvent(this.sessionid, TraceEventType.Verbose, "[JobMonitorEntry] Query task info (got task allocation history).");

        // Step 2: Get task states from scheduler
        // Only tasks whose change time is in (lastChangeTime, changeTime] are queried.
        // Only task id and state are requested; node name and core id come from the
        // allocation history collected in step 1.
        IPropertyIdCollection collection = new PropertyIdCollection();
        collection.AddPropertyId(TaskPropertyIds.Id);
        collection.AddPropertyId(TaskPropertyIds.State);

        FilterCollection fc = new FilterCollection();
        fc.Add(FilterOperator.GreaterThan, TaskPropertyIds.ChangeTime, this.lastChangeTime);
        fc.Add(FilterOperator.LessThanOrEqual, TaskPropertyIds.ChangeTime, changeTime);

        // FIXME: There's performance impact on this query because we look for TaskPropertyIds.Type
        // which requires a table join. Need to have a better way to do so.
        fc.Add(FilterOperator.Equal, TaskPropertyIds.Type, TaskType.Service);

        // Materialize the rows so the enumerator can be disposed before the per-task work below.
        // NOTE: per the original author the returned row object is a fresh snapshot, so no lock is taken here.
        List<PropertyRow> taskRows = new List<PropertyRow>();
        using (var taskEnumerator = this.schedulerJob.OpenTaskEnumerator(collection, fc, null, true))
        {
            foreach (PropertyRow taskRow in taskEnumerator)
            {
                taskRows.Add(taskRow);
            }
        }

        TraceHelper.TraceEvent(this.sessionid, TraceEventType.Verbose, "[JobMonitorEntry] GetTaskInfo, got {0} rows.", taskRows.Count);
        if (taskRows.Count == 0)
        {
            // No service task changed in the window (e.g. none dispatched yet).
            // lastChangeTime is intentionally not advanced on this path.
            TraceHelper.TraceEvent(this.sessionid, TraceEventType.Warning, "[JobMonitorEntry] Failed to get task property rows.");
            return null;
        }

        TraceHelper.TraceEvent(this.sessionid, TraceEventType.Verbose, "[JobMonitorEntry] Query task info (got task info rows from scheduler).");

        // Refresh the job before reading RequeueCount so the value stamped on each entry is current.
        this.schedulerJob.Refresh();
        int jobRequeueCount = this.schedulerJob.RequeueCount;
        TraceHelper.TraceEvent(this.sessionid, TraceEventType.Verbose, "[JobMonitorEntry] Job requeue count is {0}", jobRequeueCount);

        List<TaskInfo> results = new List<TaskInfo>(taskRows.Count);
        foreach (PropertyRow row in taskRows)
        {
            int objectId = (int)row[TaskPropertyIds.Id].Value;

            // Tasks without any allocation history are skipped, whatever their state.
            TaskAllocationHistoryItem taskInfo;
            if (!taskInfoDic.TryGetValue(objectId, out taskInfo))
            {
                continue;
            }

            TaskState state = (TaskState)row[TaskPropertyIds.State].Value;
            if (state == TaskState.Running || state == TaskState.Dispatching)
            {
                TraceHelper.TraceEvent(this.sessionid, TraceEventType.Information, "[JobMonitorEntry] Task {0} changed into Running", objectId);

                string machineName = taskInfo.NodeName;
                NodeLocation location = NodeLocation.OnPremise;
                string azureServiceName = null;
                string azureLoadBalancerAddress = null;

                try
                {
                    this.GetNodeInfo(machineName, out location, out azureServiceName, out azureLoadBalancerAddress);
                }
                catch (Exception e)
                {
                    // If an exception happens when querying node info, skip this task for now.
                    // NOTE(review): lastChangeTime is still advanced below, so this task will only be
                    // picked up again if its ChangeTime moves into a later window - confirm this is intended.
                    TraceHelper.TraceEvent(this.sessionid, TraceEventType.Warning, "[JobMonitorEntry] -> Get node info for task {0} throws exception. Exception: {1}", objectId, e);
                    continue;
                }

                TraceHelper.TraceEvent(this.sessionid, TraceEventType.Verbose, "[JobMonitorEntry] ->Get machine name for task {0}: {1}", objectId, machineName);

                int capacity = taskInfo.Capacity;
                int coreindex = taskInfo.FirstCoreId;

                TraceHelper.TraceEvent(this.sessionid, TraceEventType.Verbose, "[JobMonitorEntry] ->Get coreid for task {0}: {1}", objectId, coreindex);
                TraceHelper.TraceEvent(this.sessionid, TraceEventType.Verbose, "[JobMonitorEntry] ->Get AzureLoadBalancerAddress for task {0}: {1}", objectId, azureLoadBalancerAddress);

                TaskInfo info = new TaskInfo();
                info.Id = objectId;
                info.Capacity = capacity;

                // The node name goes into a different field depending on whether we run on Azure.
                if (SoaHelper.IsOnAzure())
                {
                    info.MachineVirtualName = machineName;
                }
                else
                {
                    info.MachineName = machineName;
                }

                info.Location = NodeLocationConverter.FromHpcNodeLocation(location);
                info.ProxyServiceName = azureServiceName;
                info.AzureLoadBalancerAddress = azureLoadBalancerAddress;
                info.State = TaskStateConverter.FromHpcTaskState(state);
                info.FirstCoreIndex = coreindex;
                info.JobRequeueCount = jobRequeueCount;
                results.Add(info);
            }
            else if (state == TaskState.Failed || state == TaskState.Canceled || state == TaskState.Canceling || state == TaskState.Finished || state == TaskState.Finishing)
            {
                // For ended/ending tasks only id, state and requeue count are reported.
                TaskInfo info = new TaskInfo();
                info.Id = objectId;
                info.State = TaskStateConverter.FromHpcTaskState(state);
                info.JobRequeueCount = jobRequeueCount;
                results.Add(info);
            }

            // Any other state produces no entry.
        }

        // Success: the next query starts right after this window.
        this.lastChangeTime = changeTime;
        return results;
    }
    catch (Exception ex)
    {
        TraceHelper.TraceEvent(this.sessionid, TraceEventType.Warning, "[JobMonitorEntry] Fail when get task info: {0}", ex);
        return null;
    }
    finally
    {
        TraceHelper.TraceEvent(this.sessionid, TraceEventType.Verbose, "[JobMonitorEntry] Query task info finished.");
    }
}
/// <summary>
/// Works out the minimum and maximum number of resource units for the service job
/// and stores them in <c>minUnits</c> and <c>maxUnits</c>.
/// </summary>
/// <remarks>
/// The unit (node, socket, or core by default) follows the job's UnitType.
/// The scheduler-computed value is used only when the job can shrink/grow AND the
/// corresponding auto-calculate flag is set; otherwise the user-specified value is used.
/// The maximum is finally raised to the minimum if it would otherwise be smaller.
/// </remarks>
private void CalculateMinAndMax()
{
    int userMax;
    int userMin;
    PropertyId[] propIds;

    // Pick the user-specified limits and the matching computed-property ids for the job's unit type.
    // propIds[0] is always the computed max, propIds[1] the computed min.
    JobUnitType unitType = this.schedulerJob.UnitType;
    if (unitType == JobUnitType.Node)
    {
        userMax = this.schedulerJob.MaximumNumberOfNodes;
        userMin = this.schedulerJob.MinimumNumberOfNodes;
        propIds = new PropertyId[] { JobPropertyIds.ComputedMaxNodes, JobPropertyIds.ComputedMinNodes };
    }
    else if (unitType == JobUnitType.Socket)
    {
        userMax = this.schedulerJob.MaximumNumberOfSockets;
        userMin = this.schedulerJob.MinimumNumberOfSockets;
        propIds = new PropertyId[] { JobPropertyIds.ComputedMaxSockets, JobPropertyIds.ComputedMinSockets };
    }
    else
    {
        // Any other unit type is treated as cores.
        userMax = this.schedulerJob.MaximumNumberOfCores;
        userMin = this.schedulerJob.MinimumNumberOfCores;
        propIds = new PropertyId[] { JobPropertyIds.ComputedMaxCores, JobPropertyIds.ComputedMinCores };
    }

    // Query the scheduler for this job's row, requesting just the two computed properties.
    IFilterCollection jobFilter = new FilterCollection();
    jobFilter.Add(FilterOperator.Equal, JobPropertyIds.Id, this.schedulerJob.Id);

    IPropertyIdCollection computedProps = new PropertyIdCollection();
    for (int i = 0; i < propIds.Length; i++)
    {
        computedProps.AddPropertyId(propIds[i]);
    }

    PropertyRow jobRow;
    using (ISchedulerRowSet rowSet = this.scheduler.OpenJobRowSet(computedProps, jobFilter, null))
    {
        int lastIndex = rowSet.GetCount() - 1;
        PropertyRow[] jobRows = rowSet.GetRows(0, lastIndex).Rows;

        // Debug-only guard: the first row is used unconditionally below.
        Debug.Assert(jobRows.Length > 0);
        jobRow = jobRows[0];
    }

    // Props are read positionally, in the same order the property ids were requested.
    // NOTE(review): the third argument (0) is presumably a fallback value - confirm against JobHelper.
    string callerName = "[JobMonitorEntry.GetMinAndMax]";
    int computedMax = (int)JobHelper.GetStorePropertyValue(jobRow.Props[0], propIds[0], 0, callerName);
    int computedMin = (int)JobHelper.GetStorePropertyValue(jobRow.Props[1], propIds[1], 0, callerName);

    // The auto-calculate flag is only consulted when the job can shrink/grow (short-circuit).
    bool useComputedMin = this.schedulerJob.CanShrink && this.schedulerJob.AutoCalculateMin;
    this.minUnits = useComputedMin ? computedMin : userMin;

    bool useComputedMax = this.schedulerJob.CanGrow && this.schedulerJob.AutoCalculateMax;
    this.maxUnits = useComputedMax ? computedMax : userMax;

    // Never let the maximum fall below the minimum.
    if (this.maxUnits < this.minUnits)
    {
        this.maxUnits = this.minUnits;
    }

    TraceHelper.TraceEvent(
        this.sessionid,
        TraceEventType.Information,
        "[JobMonitorEntry] MaxUnits = {0}, MinUnits = {1}",
        this.maxUnits,
        this.minUnits);
}