/// <summary>
/// Feeds the in-progress target state carried by each supplied node status
/// into the graph of the given cycle detector.
/// </summary>
/// <param name="nodeStatus">Node status objects to add; every entry is dereferenced, so none may be null.</param>
/// <param name="cycleDetector">Cycle detector whose graph receives the targets.</param>
private void AddTargetStatesToCycleDetector(NodeStatus[] nodeStatus, TargetCycleDetector cycleDetector)
{
    foreach (NodeStatus status in nodeStatus)
    {
        cycleDetector.AddTargetsToGraph(status.StateOfInProgressTargets);
    }
}
/// <summary>
/// Called when the parent engine has not seen activity for a preset time period, to
/// determine whether the whole system is still making forward progress. To do that, status
/// is collected from every node in the system. If no node is making forward progress, the
/// graph of all in-progress targets is analyzed for cycles. If a cycle is found, the
/// appropriate node is instructed to break it. If no cause for the deadlock can be
/// determined, the system is shut down.
/// </summary>
/// <param name="queueCounts">Number of items still waiting in the queue; any value above zero counts as progress.</param>
/// <param name="lastLoopActivity">Tick value of the last loop activity, compared against <c>DateTime.Now.Ticks</c>.</param>
/// <param name="currentTimeout">Current inactivity timeout, compared against elapsed milliseconds.</param>
/// <returns>
/// New inactivity timeout: <c>Timeout.Infinite</c> in child or single threaded mode, otherwise
/// the current timeout, an increased timeout, or the maximum loop timeout.
/// </returns>
internal int DetectDeadlock(int queueCounts, long lastLoopActivity, int currentTimeout)
{
    // Deadlock detection is never attempted in single threaded mode or on a child node
    if (parentEngine.Router.ChildMode || parentEngine.Router.SingleThreadedMode)
    {
        return Timeout.Infinite;
    }

    // Work still queued, or loop activity within the timeout window - nothing to investigate
    TimeSpan loopIdleTime = TimeSpan.FromTicks(DateTime.Now.Ticks - lastLoopActivity);
    bool loopWasRecentlyActive = loopIdleTime.TotalMilliseconds < currentTimeout;
    if (queueCounts > 0 || loopWasRecentlyActive)
    {
        return currentTimeout;
    }

    if (nodeManager.TaskExecutionModule == null)
    {
        return currentTimeout;
    }

    // Task activity within the window means tasks are simply slow, so back off
    // by increasing the timeout instead of querying the nodes
    long taskIdleTicks = DateTime.Now.Ticks - nodeManager.TaskExecutionModule.LastTaskActivity();
    if (TimeSpan.FromTicks(taskIdleTicks).TotalMilliseconds < currentTimeout)
    {
        return calculateNewLoopTimeout(currentTimeout);
    }

    // Still waiting on the outcome of an earlier operation (ignoreTimeout is set
    // further down after a cycle notification is posted)
    if (ignoreTimeout - DateTime.Now.Ticks > 0)
    {
        return currentTimeout;
    }

    // Ask every node for its status and remember how long the round trip took
    long statusRequestStart = DateTime.Now.Ticks;
    NodeStatus[] childStatuses = nodeManager.RequestStatusForNodes(nodeStatusReplyTimeout);
    long statusRequestDuration = DateTime.Now.Ticks - statusRequestStart;

    for (int nodeIndex = 0; nodeIndex < childStatuses.Length; nodeIndex++)
    {
        NodeStatus child = childStatuses[nodeIndex];

        if (child == null)
        {
            // The node never answered the status request. The only option is to
            // shut down the build and error out
            LogOrDumpError("FailedToReceiveChildStatus", nodeIndex + 1, nodeStatusReplyTimeout);
            SystemShutdown();
            return currentTimeout;
        }

        if (child.HasExited)
        {
            // The node exited prematurely. The only option is to shut down
            LogOrDumpError("ChildExitedPrematurely", nodeIndex + 1);
            SystemShutdown();
            return currentTimeout;
        }

        if (child.IsActive)
        {
            // How long the node has been without task / loop activity
            double nodeTaskIdleMs = TimeSpan.FromTicks(child.TimeSinceLastTaskActivity).TotalMilliseconds;
            double nodeLoopIdleMs = TimeSpan.FromTicks(child.TimeSinceLastLoopActivity).TotalMilliseconds;

            // The timeout elapsed while this node was still active, so increase the timeout
            if (child.QueueDepth > 0 || nodeTaskIdleMs < currentTimeout || nodeLoopIdleMs < currentTimeout)
            {
                return calculateNewLoopTimeout(currentTimeout);
            }
        }
        else if (child.IsLaunchInProgress)
        {
            // Only the NodeProvider knows how long a launch should take, so the
            // decision to error out is left to the node provider
            return currentTimeout;
        }
    }

    // No activity was detected anywhere in the system for the whole time period.
    // Look for a cycle among the in-progress targets of the child nodes and the local node
    TargetCycleDetector detector = new TargetCycleDetector(parentEngine.LoggingServices, parentEngine.EngineCallback);
    AddTargetStatesToCycleDetector(childStatuses, detector);
    NodeStatus parentStatus = parentEngine.RequestStatus(0);
    detector.AddTargetsToGraph(parentStatus.StateOfInProgressTargets);

    if (detector.FindCycles())
    {
        if (Engine.debugMode)
        {
            Console.WriteLine("Breaking cycle between " + detector.CycleEdgeChild.TargetId.name + " and " +
                              detector.CycleEdgeParent.TargetId.name);
        }

        // The cycle has to be broken for the build to continue
        nodeManager.PostCycleNotification(
            detector.CycleEdgeChild.TargetId.nodeId,
            detector.CycleEdgeChild,
            detector.CycleEdgeParent);

        // Use the time it took to receive the NodeStatus and buffer it a little,
        // because node status is sent via a faster code path
        ignoreTimeout = DateTime.Now.Ticks + statusRequestDuration + (cycleBreakTimeout * TimeSpan.TicksPerMillisecond);
        return currentTimeout;
    }

    // The system doesn't appear to be making progress. Switch to the largest sampling interval
    if (currentTimeout != maxLoopTimeout)
    {
        return maxLoopTimeout;
    }

    // At least two comparable observations are required before assuming that no forward
    // progress is being made; without them this observation becomes the new baseline
    bool keepObserving = previousStatus == null
        || previousLocalStatus == null
        || childStatuses.Length != previousStatus.Length;

    // Activity on the local node between the previous and the current status check
    keepObserving = keepObserving
        || parentStatus.LastLoopActivity != previousLocalStatus.LastLoopActivity
        || parentStatus.LastTaskActivity != previousLocalStatus.LastTaskActivity;

    // Activity on any child node between the previous and the current status check
    for (int k = 0; !keepObserving && k < childStatuses.Length; k++)
    {
        keepObserving = childStatuses[k].LastTaskActivity != previousStatus[k].LastTaskActivity
            || childStatuses[k].LastLoopActivity != previousStatus[k].LastLoopActivity;
    }

    if (keepObserving)
    {
        previousStatus = childStatuses;
        previousLocalStatus = parentStatus;
        return currentTimeout;
    }

    // The system is not making forward progress for an unknown reason. The only
    // recourse is to collect as much data as possible and shut down with an error message
    // UNDONE - using logging and resource string to output the state dump
    GatherNodeInformationForShutdown(childStatuses, parentStatus);
    SystemShutdown();
    return currentTimeout;
}
/// <summary>
/// Inactivity handler of the parent engine: decides whether the system as a whole is still
/// making forward progress. Status is gathered from every node; when none of them shows
/// progress, the graph of all in-progress targets is searched for cycles. A detected cycle
/// is reported to the appropriate node so that it can be broken. When no cause for the
/// deadlock can be determined, the system is shut down.
/// </summary>
/// <param name="queueCounts">Count of queued items still waiting to be processed.</param>
/// <param name="lastLoopActivity">Tick value of the most recent loop activity (same clock as <c>DateTime.Now.Ticks</c>).</param>
/// <param name="currentTimeout">Inactivity timeout currently in effect; elapsed milliseconds are compared against it.</param>
/// <returns>New inactivity timeout</returns>
internal int DetectDeadlock(int queueCounts, long lastLoopActivity, int currentTimeout)
{
    // Detection only applies to a parent node that is not running single threaded
    bool detectionApplies = !parentEngine.Router.ChildMode && !parentEngine.Router.SingleThreadedMode;
    if (!detectionApplies)
    {
        return Timeout.Infinite;
    }

    // Continue with the current timeout when items are queued, the loop was active
    // not so long ago, or there is no task execution module to consult
    double msSinceLoopActivity = new TimeSpan(DateTime.Now.Ticks - lastLoopActivity).TotalMilliseconds;
    if (queueCounts > 0
        || msSinceLoopActivity < currentTimeout
        || nodeManager.TaskExecutionModule == null)
    {
        return currentTimeout;
    }

    // Tasks that were active within the period are just taking a long time - increase the timeout
    double msSinceTaskActivity =
        new TimeSpan(DateTime.Now.Ticks - nodeManager.TaskExecutionModule.LastTaskActivity()).TotalMilliseconds;
    if (msSinceTaskActivity < currentTimeout)
    {
        return calculateNewLoopTimeout(currentTimeout);
    }

    // An earlier operation is still given time to show its outcome
    if ((ignoreTimeout - DateTime.Now.Ticks) > 0)
    {
        return currentTimeout;
    }

    // Collect status from the nodes, timing the request
    long ticksBeforeRequest = DateTime.Now.Ticks;
    NodeStatus[] reportedStatuses = nodeManager.RequestStatusForNodes(nodeStatusReplyTimeout);
    long requestTicks = DateTime.Now.Ticks - ticksBeforeRequest;

    // One-based position of the node, as passed to the error messages
    int nodeNumber = 0;
    foreach (NodeStatus reported in reportedStatuses)
    {
        nodeNumber++;

        if (reported == null)
        {
            // No reply to the status request: shut down the build and error out
            LogOrDumpError("FailedToReceiveChildStatus", nodeNumber, nodeStatusReplyTimeout);
            SystemShutdown();
            return currentTimeout;
        }

        if (reported.HasExited)
        {
            // Premature exit of a node: shut down
            LogOrDumpError("ChildExitedPrematurely", nodeNumber);
            SystemShutdown();
            return currentTimeout;
        }

        if (!reported.IsActive)
        {
            if (reported.IsLaunchInProgress)
            {
                // How long a launch should take is known only to the NodeProvider,
                // so the node provider makes the decision to error out
                return currentTimeout;
            }

            continue;
        }

        // Time since the last task and loop activity on this active node
        TimeSpan sinceNodeTaskActivity = new TimeSpan(reported.TimeSinceLastTaskActivity);
        TimeSpan sinceNodeLoopActivity = new TimeSpan(reported.TimeSinceLastLoopActivity);

        bool nodeWasBusy = reported.QueueDepth > 0
            || sinceNodeTaskActivity.TotalMilliseconds < currentTimeout
            || sinceNodeLoopActivity.TotalMilliseconds < currentTimeout;
        if (nodeWasBusy)
        {
            // The timeout was exceeded while a node was active, so increase it
            return calculateNewLoopTimeout(currentTimeout);
        }
    }

    // Nothing in the system showed activity for the whole period.
    // Check whether the in-progress targets form a cycle
    TargetCycleDetector cycles = new TargetCycleDetector(parentEngine.LoggingServices, parentEngine.EngineCallback);
    AddTargetStatesToCycleDetector(reportedStatuses, cycles);
    NodeStatus ownStatus = parentEngine.RequestStatus(0);
    cycles.AddTargetsToGraph(ownStatus.StateOfInProgressTargets);

    if (cycles.FindCycles())
    {
        if (Engine.debugMode)
        {
            Console.WriteLine("Breaking cycle between " + cycles.CycleEdgeChild.TargetId.name + " and " + cycles.CycleEdgeParent.TargetId.name);
        }

        // Notify the node so the cycle gets broken and the build can continue
        nodeManager.PostCycleNotification(cycles.CycleEdgeChild.TargetId.nodeId, cycles.CycleEdgeChild, cycles.CycleEdgeParent);

        // Ignore timeouts for the duration of the status request plus a small buffer,
        // because node status is sent via a faster code path
        ignoreTimeout = DateTime.Now.Ticks + requestTicks + (cycleBreakTimeout * TimeSpan.TicksPerMillisecond);
        return currentTimeout;
    }

    // No apparent progress: move to the largest sampling interval first
    if (currentTimeout != maxLoopTimeout)
    {
        return maxLoopTimeout;
    }

    // Record this observation and keep waiting when there is no comparable earlier
    // observation yet, or when the local node showed activity since the previous one
    if (previousStatus == null
        || previousLocalStatus == null
        || reportedStatuses.Length != previousStatus.Length
        || ownStatus.LastLoopActivity != previousLocalStatus.LastLoopActivity
        || ownStatus.LastTaskActivity != previousLocalStatus.LastTaskActivity)
    {
        previousStatus = reportedStatuses;
        previousLocalStatus = ownStatus;
        return currentTimeout;
    }

    // Same treatment when any child node showed activity since the previous observation
    int slot = 0;
    foreach (NodeStatus latest in reportedStatuses)
    {
        if (latest.LastTaskActivity != previousStatus[slot].LastTaskActivity ||
            latest.LastLoopActivity != previousStatus[slot].LastLoopActivity)
        {
            previousStatus = reportedStatuses;
            previousLocalStatus = ownStatus;
            return currentTimeout;
        }

        slot++;
    }

    // Forward progress has stopped for an unknown reason. The only recourse is to
    // collect as much data as possible and shut down with an error message
    // UNDONE - using logging and resource string to output the state dump
    GatherNodeInformationForShutdown(reportedStatuses, ownStatus);
    SystemShutdown();
    return currentTimeout;
}