/// <summary>
/// collectAllocatedInfo - Reads the allocation history of one job and forwards the rows
/// to the per-node or the per-core summary, depending on bNodesOnly.
/// </summary>
/// <param name="job">ISchedulerJob whose allocation history is opened</param>
private static void collectAllocatedInfo(ISchedulerJob job)
{
    if (bVerbose)
    {
        Console.WriteLine("Entering collectAllocatedInfo: job {0} project: {1}", job.Id, job.Project);
    }

    // Columns requested from the allocation history; the order they are added in is kept as-is
    // because the summary routines address the row columns through RowIndex.
    IPropertyIdCollection historyColumns = new PropertyIdCollection();
    historyColumns.Add(AllocationProperties.NodeName);
    historyColumns.Add(AllocationProperties.NodeId);
    historyColumns.Add(AllocationProperties.CoreId);
    historyColumns.Add(AllocationProperties.StartTime);
    historyColumns.Add(AllocationProperties.EndTime);

    // OpenJobAllocationHistory returns information sorted by ascending AllocationProperties.StartTime
    using (ISchedulerRowEnumerator history = job.OpenJobAllocationHistoryEnumerator(historyColumns))
    {
        if (!bNodesOnly)
        {
            CoreDuration(history);
        }
        else
        {
            NodeDuration(history);
        }
    }
}
/// <summary>
/// NodeDuration - Folds the per-core allocation rows returned from OpenJobAllocationHistoryEnumerator
/// into per-node usage intervals, prints them (when bDetailed is set) and reports the total node usage.
/// </summary>
/// <remarks>
/// Side effects: writes to the console, adds the number of node intervals to iAllJobThreads and the
/// unrounded total to tsAllJobUsage.
/// The merge only looks at the most recent interval of a node, so it assumes the rows arrive in
/// ascending start-time order (collectAllocatedInfo notes that the history enumerator is sorted that way).
/// </remarks>
/// <param name="rows">RowSet, allocated Core information from the job, to be resorted into node allocation</param>
private static void NodeDuration(ISchedulerRowEnumerator rows)
{
    if (bVerbose)
    {
        Console.WriteLine("Entering NodeDuration");
    }

    TimeSpan tsTotal = TimeSpan.Zero;
    List<NodeUse> nodeList = new List<NodeUse>();

    // Convert core rowset into node list
    foreach (PropertyRow row in rows)
    {
        int nodeId = (int)row[(int)RowIndex.NodeId].Value;
        DateTime coreStart = (DateTime)row[(int)RowIndex.StartTime].Value;

        // If the core is still running (the EndTime column does not carry the EndTime property id),
        // use DateTime.MaxValue as an open-ended marker so every later core on that node is swallowed.
        DateTime coreEnd = (row[(int)RowIndex.EndTime].Id == AllocationProperties.EndTime)
            ? (DateTime)row[(int)RowIndex.EndTime].Value
            : DateTime.MaxValue;

        // Find the last item in this list that uses this node
        int iIndex = nodeList.FindLastIndex(
            delegate(NodeUse n) { return n.NodeId == nodeId; });

        // If this node does not yet exist, or if the current start is beyond the endtime in the list, add a new list item
        if ((iIndex < 0) || (nodeList[iIndex].EndTime < coreStart))
        {
            if (bVerbose)
            {
                Console.WriteLine("Add item to Node List");
            }

            string nodeName = (string)row[(int)RowIndex.NodeName].Value;
            nodeList.Add(new NodeUse(nodeId, nodeName, coreStart, coreEnd));

            if (bVerbose)
            {
                Console.WriteLine("Added Node List item for: {0}", nodeName);
            }
        }
        else if (coreEnd > nodeList[iIndex].EndTime)
        {
            // The core overlaps the existing interval and ends later (or is still running):
            // extend the node's duration.
            nodeList[iIndex].EndTime = coreEnd;
        }
    }

    if (bVerbose)
    {
        Console.WriteLine("Node List created");
    }

    // Add all node duration and display information if appropriate
    foreach (NodeUse nodeUse in nodeList)
    {
        bool stillRunning = nodeUse.EndTime == DateTime.MaxValue;

        // Show each node only if /detailed was set
        if (bDetailed)
        {
            Console.Write("{0} {1} Start: {2} End: ", nodeUse.NodeName, nodeUse.NodeId, nodeUse.StartTime);
            if (stillRunning)
            {
                Console.WriteLine(CORERUNNING);
            }
            else
            {
                Console.WriteLine(nodeUse.EndTime);
            }
        }

        if (bVerbose)
        {
            Console.WriteLine("dtCurrent: {0}", dtCurrent);
        }

        // If the node still has a core running, measure its usage up to the current time
        DateTime effectiveEnd = stillRunning ? dtCurrent : nodeUse.EndTime;

        // Add the amount of time spent on using this node
        tsTotal += effectiveEnd - nodeUse.StartTime;
    }

    iAllJobThreads += nodeList.Count;
    tsAllJobUsage += tsTotal;

    // Round up/down to seconds
    if (tsTotal.TotalSeconds >= iRoundToSecondsMinimum)
    {
        if (tsTotal.Milliseconds >= 500)
        {
            tsTotal = tsTotal.Add(TimeSpan.FromSeconds(1));
        }

        // Drop the sub-second part using ticks. Casting TotalSeconds to int overflows once the
        // accumulated usage exceeds int.MaxValue seconds (about 68 years), which many nodes can reach.
        tsTotal = TimeSpan.FromTicks(tsTotal.Ticks - (tsTotal.Ticks % TimeSpan.TicksPerSecond));
    }

    Console.WriteLine("Total nodes: {0} Total node usage: {1}", nodeList.Count, tsTotal);
}
/// <summary>
/// CoreDuration - Sums the usage of every core allocation row returned from
/// OpenJobAllocationHistoryEnumerator, prints each row (when bDetailed is set) and reports the total core usage.
/// </summary>
/// <remarks>
/// Side effects: writes to the console, adds the number of rows to iAllJobThreads and the
/// unrounded total to tsAllJobUsage.
/// </remarks>
/// <param name="rows">RowSet, allocated Core information from the job</param>
private static void CoreDuration(ISchedulerRowEnumerator rows)
{
    TimeSpan tsTotal = TimeSpan.Zero;
    int iTotalThreads = 0;

    if (bVerbose)
    {
        Console.WriteLine("Entering CoreDuration");
    }

    foreach (PropertyRow row in rows)
    {
        // A row whose EndTime column does not carry the EndTime property id is treated as a core
        // that is still running; its usage is measured up to dtCurrent.
        bool hasEnded = row[(int)RowIndex.EndTime].Id == AllocationProperties.EndTime;
        DateTime dtEnd = hasEnded ? (DateTime)row[(int)RowIndex.EndTime].Value : dtCurrent;

        // Show each core only if /detailed was set
        if (bDetailed)
        {
            // Decide on the explicit flag rather than on dtEnd != dtCurrent, so a core that ended
            // exactly at dtCurrent is not reported as still running.
            Console.WriteLine(
                "{0} {1}.{2} Start: {3} End: {4}",
                row[(int)RowIndex.NodeName].Value,
                row[(int)RowIndex.NodeId].Value,
                row[(int)RowIndex.CoreId].Value,
                row[(int)RowIndex.StartTime].Value,
                hasEnded ? dtEnd.ToString() : CORERUNNING);
        }

        // Add the amount of time spent on using this core
        tsTotal += dtEnd - (DateTime)row[(int)RowIndex.StartTime].Value;

        // Increment the number of cores opened by the job
        // Note: The same core can be opened and closed multiple times and each duration will be incremented
        iTotalThreads++;
    }

    iAllJobThreads += iTotalThreads;
    tsAllJobUsage += tsTotal;

    // Round up/down to seconds
    if (bVerbose)
    {
        Console.WriteLine("Total Seconds: {0}", tsTotal.TotalSeconds);
    }

    if (tsTotal.TotalSeconds >= iRoundToSecondsMinimum)
    {
        if (tsTotal.Milliseconds >= 500)
        {
            tsTotal = tsTotal.Add(TimeSpan.FromSeconds(1));
        }

        // Drop the sub-second part using ticks. Casting TotalSeconds to int overflows once the
        // accumulated usage exceeds int.MaxValue seconds (about 68 years), which many cores can reach.
        tsTotal = TimeSpan.FromTicks(tsTotal.Ticks - (tsTotal.Ticks % TimeSpan.TicksPerSecond));
    }

    Console.WriteLine("Total cores: {0} Total core usage: {1}", iTotalThreads, tsTotal.ToString());
}
/// <summary>
/// Gets the task info
/// </summary>
/// <returns>
/// A list of task info for the service tasks returned by the change-time query, or null when the
/// query returned no rows or an exception was caught.
/// </returns>
/// <remarks>
/// The query covers tasks whose ChangeTime is greater than this.lastChangeTime and less than or equal to
/// the UTC time captured when the method was entered.
/// this.lastChangeTime is moved forward to that captured time only after the result list has been built;
/// it is left untouched on both null-returning paths, so the same window is queried again next time.
/// </remarks>
private List<TaskInfo> GetTaskInfo()
{
    DateTime changeTime = DateTime.UtcNow;

    try
    {
        TraceHelper.TraceEvent(this.sessionid, TraceEventType.Verbose, "[JobMonitorEntry] Query task info...");

        // Step 1: Query task allocation history to fetch node id and core id for tasks
        Dictionary<int, TaskAllocationHistoryItem> taskInfoDic = new Dictionary<int, TaskAllocationHistoryItem>();
        PropertyIdCollection allocationPropertyCollection = new PropertyIdCollection();
        allocationPropertyCollection.AddPropertyId(AllocationProperties.TaskId);
        allocationPropertyCollection.AddPropertyId(AllocationProperties.CoreId);
        allocationPropertyCollection.AddPropertyId(AllocationProperties.NodeName);

        using (ISchedulerRowEnumerator rows = this.schedulerJob.OpenTaskAllocationHistoryEnumerator(allocationPropertyCollection))
        {
            foreach (PropertyRow row in rows)
            {
                // Note: Finished/Failed/Canceled task will also be enumerated here
                // We are going to add them into the dic regardless of the state
                // because only running tasks will be queried in the following logic.
                int objectId = (int)row[AllocationProperties.TaskId].Value;
                int coreId = (int)row[AllocationProperties.CoreId].Value;

                TaskAllocationHistoryItem taskInfo;
                if (taskInfoDic.TryGetValue(objectId, out taskInfo))
                {
                    // For each task instance cache the assigned resource with the lowest coreId.
                    // This is needed when node or socket allocation is used
                    // in order to generate the correct port to connect to the service host
                    if (taskInfo.FirstCoreId > coreId)
                    {
                        taskInfo.FirstCoreId = coreId;
                    }

                    taskInfo.Capacity++;
                }
                else
                {
                    taskInfo = new TaskAllocationHistoryItem();
                    taskInfo.Capacity = 1;
                    taskInfo.FirstCoreId = coreId;
                    taskInfo.NodeName = (string)row[AllocationProperties.NodeName].Value;
                    taskInfoDic.Add(objectId, taskInfo);
                }
            }
        }

        TraceHelper.TraceEvent(this.sessionid, TraceEventType.Verbose, "[JobMonitorEntry] Query task info (got task allocation history).");

        // Step 2: Get task states from scheduler
        // Only task whose change time is between lastChangeTime and changeTime will be queried
        // Only task id and state are required; the node name comes from the allocation history got from step 1.
        IPropertyIdCollection collection = new PropertyIdCollection();
        collection.AddPropertyId(TaskPropertyIds.Id);
        collection.AddPropertyId(TaskPropertyIds.State);

        FilterCollection fc = new FilterCollection();
        fc.Add(FilterOperator.GreaterThan, TaskPropertyIds.ChangeTime, this.lastChangeTime);
        fc.Add(FilterOperator.LessThanOrEqual, TaskPropertyIds.ChangeTime, changeTime);

        // FIXME: There's performance impact on this query because we look for TaskPropertyIds.Type
        // which requires a table join. Need to have a better way to do so.
        fc.Add(FilterOperator.Equal, TaskPropertyIds.Type, TaskType.Service);

        List<PropertyRow> taskRows = new List<PropertyRow>();

        // The ISchedulerRowSet object is a snapshot and is always a new object, so no lock is needed.
        // The row enumerator is disposed once the rows are copied, the same way the allocation
        // history enumerator is handled in step 1.
        using (ISchedulerRowEnumerator taskEnumerator = this.schedulerJob.OpenTaskEnumerator(collection, fc, null, true))
        {
            foreach (PropertyRow taskRow in taskEnumerator)
            {
                taskRows.Add(taskRow);
            }
        }

        TraceHelper.TraceEvent(this.sessionid, TraceEventType.Verbose, "[JobMonitorEntry] GetTaskInfo, got {0} rows.", taskRows.Count);
        if (taskRows.Count == 0)
        {
            // no service task dispatched yet.
            TraceHelper.TraceEvent(this.sessionid, TraceEventType.Warning, "[JobMonitorEntry] Failed to get task property rows.");
            return null;
        }

        TraceHelper.TraceEvent(this.sessionid, TraceEventType.Verbose, "[JobMonitorEntry] Query task info (got task info rows from scheduler).");

        this.schedulerJob.Refresh();
        int jobRequeueCount = this.schedulerJob.RequeueCount;
        TraceHelper.TraceEvent(this.sessionid, TraceEventType.Verbose, "[JobMonitorEntry] Job requeue count is {0}", jobRequeueCount);

        List<TaskInfo> results = new List<TaskInfo>(taskRows.Count);
        foreach (PropertyRow row in taskRows)
        {
            int objectId = (int)row[TaskPropertyIds.Id].Value;

            // Tasks without any allocation history entry are ignored.
            TaskAllocationHistoryItem taskInfo;
            if (!taskInfoDic.TryGetValue(objectId, out taskInfo))
            {
                continue;
            }

            TaskState state = (TaskState)row[TaskPropertyIds.State].Value;
            if (state == TaskState.Running || state == TaskState.Dispatching)
            {
                TraceHelper.TraceEvent(this.sessionid, TraceEventType.Information, "[JobMonitorEntry] Task {0} changed into Running", objectId);

                string machineName = taskInfo.NodeName;
                NodeLocation location = NodeLocation.OnPremise;
                string azureServiceName = null;
                string azureLoadBalancerAddress = null;

                try
                {
                    this.GetNodeInfo(machineName, out location, out azureServiceName, out azureLoadBalancerAddress);
                }
                catch (Exception e)
                {
                    // if exception happens when querying node info, just skip this node temporarily.
                    // NOTE(review): lastChangeTime is still advanced at the end of this method, so a task
                    // skipped here is only returned again if its ChangeTime moves - confirm this is intended.
                    TraceHelper.TraceEvent(this.sessionid, TraceEventType.Warning, "[JobMonitorEntry] -> Get node info for task {0} throws exception. Exception: {1}", objectId, e);
                    continue;
                }

                TraceHelper.TraceEvent(this.sessionid, TraceEventType.Verbose, "[JobMonitorEntry] ->Get machine name for task {0}: {1}", objectId, machineName);

                int capacity = taskInfo.Capacity;
                int coreindex = taskInfo.FirstCoreId;

                TraceHelper.TraceEvent(this.sessionid, TraceEventType.Verbose, "[JobMonitorEntry] ->Get coreid for task {0}: {1}", objectId, coreindex);
                TraceHelper.TraceEvent(this.sessionid, TraceEventType.Verbose, "[JobMonitorEntry] ->Get AzureLoadBalancerAddress for task {0}: {1}", objectId, azureLoadBalancerAddress);

                TaskInfo info = new TaskInfo();
                info.Id = objectId;
                info.Capacity = capacity;

                if (SoaHelper.IsOnAzure())
                {
                    info.MachineVirtualName = machineName;
                }
                else
                {
                    info.MachineName = machineName;
                }

                info.Location = NodeLocationConverter.FromHpcNodeLocation(location);
                info.ProxyServiceName = azureServiceName;
                info.AzureLoadBalancerAddress = azureLoadBalancerAddress;
                info.State = TaskStateConverter.FromHpcTaskState(state);
                info.FirstCoreIndex = coreindex;
                info.JobRequeueCount = jobRequeueCount;
                results.Add(info);
            }
            else if (state == TaskState.Failed || state == TaskState.Canceled || state == TaskState.Canceling || state == TaskState.Finished || state == TaskState.Finishing)
            {
                // Tasks that are ending or have ended are reported with id, state and requeue count only.
                TaskInfo info = new TaskInfo();
                info.Id = objectId;
                info.State = TaskStateConverter.FromHpcTaskState(state);
                info.JobRequeueCount = jobRequeueCount;
                results.Add(info);
            }
        }

        // The window was processed; the next query starts after it.
        this.lastChangeTime = changeTime;
        return results;
    }
    catch (Exception ex)
    {
        // Best effort: any failure is traced and reported to the caller as null.
        TraceHelper.TraceEvent(this.sessionid, TraceEventType.Warning, "[JobMonitorEntry] Fail when get task info: {0}", ex);
        return null;
    }
    finally
    {
        TraceHelper.TraceEvent(this.sessionid, TraceEventType.Verbose, "[JobMonitorEntry] Query task info finished.");
    }
}