/// <summary>
/// Carries out the recovery decision produced for a hung cluster API call.
/// </summary>
/// <remarks>
/// When the decision says to act, this first tries to have the cluster service
/// ("Clussvc") killed on the first target node via <c>RpcKillServiceImpl.SendKillRequest</c>,
/// provided <c>RegistryParameters.IsKillClusterServiceOnClusApiHang</c> is set. If the kill
/// request is disabled or did not succeed, a node restart is requested through
/// <c>TriggerNodeRestart</c>. When the kill switch is enabled but there are no target
/// nodes, nothing further is done.
/// </remarks>
/// <param name="action">
/// The decision to act on. If it is null, or its <c>HungInfo</c> is null, a generic
/// message is logged and nothing else happens. If <c>TakeAction</c> is false the
/// method returns without doing anything.
/// </param>
internal static void ActOnClusDbHang(LatencyChecker.ClusDbHungAction action)
{
    if (action == null || action.HungInfo == null)
    {
        ReplayCrimsonEvents.GenericMessage.Log<string>("ActOnClusDbHang: Action is null or action.HungInfo is null");
        return;
    }

    if (!action.TakeAction)
    {
        return;
    }

    ReplayCrimsonEvents.HungNodeRecoveryActionStart.Log<string>(LatencyChecker.ConvertAmServerNamesToString(action.TargetNodes));

    // True means no restart request is needed after this step.
    bool recoveryHandled;
    if (!RegistryParameters.IsKillClusterServiceOnClusApiHang)
    {
        recoveryHandled = false;
        ReplayCrimsonEvents.SkippedSendingClussvcKillRequest.LogPeriodic(action.HungInfo.ApiName, TimeSpan.FromMinutes(15.0));
    }
    else if (action.TargetNodes == null || action.TargetNodes.Length <= 0)
    {
        // Kill switch is on but there is nobody to send the request to.
        recoveryHandled = true;
    }
    else
    {
        // Only the first target node receives the kill request.
        AmServerName firstTarget = action.TargetNodes[0];
        RpcKillServiceImpl.Reply killReply = RpcKillServiceImpl.SendKillRequest(
            firstTarget.Fqdn,
            "Clussvc",
            action.HungInfo.ApiHungStartTime.LocalTime,
            false,
            RegistryParameters.RpcKillServiceTimeoutInMSec);

        // NOTE(review): IsSucceeded is tested twice; possibly a different property
        // was intended for one of the operands - confirm against the Reply type.
        recoveryHandled = killReply != null && killReply.IsSucceeded && killReply.IsSucceeded;
    }

    if (recoveryHandled)
    {
        return;
    }

    string hungNodeCsv = LatencyChecker.ConvertAmServerNamesToString(action.TargetNodes);
    ReplayCrimsonEvents.HungNodeRebootRequested.Log<string>(hungNodeCsv);

    string gumId = action.HungInfo.CurrentGumId.ToString();
    string lockOwner = "NULL";
    if (action.HungInfo.CurrentLockOwnerName != null)
    {
        lockOwner = action.HungInfo.CurrentLockOwnerName.NetbiosName;
    }

    LatencyChecker.TriggerNodeRestart(gumId, lockOwner, hungNodeCsv, action.HungInfo, action);
}
/// <summary>
/// Handles a detected cluster API hang: gathers information about the hung nodes,
/// records it in <c>LatencyChecker.LastKnownHungInfo</c>, derives a recovery
/// decision and acts on it.
/// </summary>
/// <param name="context">
/// The latency context for the hung call; it is cast directly to
/// <c>LatencyChecker.LatencyContext</c>, so an object of any other type makes the cast fail.
/// </param>
internal static void OnClusApiHang(object context)
{
    LatencyChecker.ClusDbHungInfo hungInfo =
        LatencyChecker.GatherHungNodesInformation((LatencyChecker.LatencyContext)context);

    // Published before analysis so the latest snapshot is stored even if a later step throws.
    LatencyChecker.LastKnownHungInfo = hungInfo;

    LatencyChecker.ActOnClusDbHang(LatencyChecker.AnalyzeAndSuggestActionForClusDbHang(hungInfo));
}
/// <summary>
/// Publishes a critical "ClusterNodeRestart" event notification describing the hang.
/// </summary>
/// <param name="currentGumId">GUM id text included in the notification message.</param>
/// <param name="currentLockOwnerName">Lock owner name included in the notification message.</param>
/// <param name="hungNodeCsv">
/// Hung node list as text; used in the message and also passed on its own to the notification item.
/// </param>
/// <param name="hungInfo">Hang details; its <c>ToString()</c> output is embedded in the message. Must not be null.</param>
/// <param name="hungAction">Recovery decision; its <c>ToString()</c> output is embedded in the message. Must not be null.</param>
private static void TriggerNodeRestart(string currentGumId, string currentLockOwnerName, string hungNodeCsv, LatencyChecker.ClusDbHungInfo hungInfo, LatencyChecker.ClusDbHungAction hungAction)
{
    string message = string.Format(
        "Cluster Hung detected. GumId={0}, LockOwner={1}, HungNodes={2}, HungInfo={3}, Decision={4}",
        currentGumId,
        currentLockOwnerName,
        hungNodeCsv,
        hungInfo.ToString(),
        hungAction.ToString());

    EventNotificationItem notification = new EventNotificationItem(
        "MSExchangeRepl",
        "Cluster",
        "ClusterNodeRestart",
        message,
        hungNodeCsv,
        ResultSeverityLevel.Critical);

    notification.Publish(false);
}
/// <summary>
/// Examines the gathered hang information and decides whether a recovery action
/// should be taken and against which nodes.
/// </summary>
/// <remarks>
/// The decision is made in two stages:
/// 1. The exception recorded in <c>hungInfo.HungNodeApiException</c> sets the initial
///    verdict: no exception or an <c>OpenClusterTimedoutException</c> means act;
///    a <c>HungDetectionGumIdChangedException</c> or <c>ClusterException</c> means do not act;
///    any other exception type leaves the default "do not act" verdict and placeholder reason.
/// 2. If no hung nodes were reported, the stage-1 verdict is discarded. The first
///    RPC-failed node whose cluster state is known and is neither Unknown nor Down is
///    chosen as the target; failing that, the current lock owner (when not empty)
///    becomes the target.
/// The resulting decision is logged before being returned.
/// </remarks>
/// <param name="hungInfo">Hang details to analyze. Must not be null.</param>
/// <returns>A new decision object that references <paramref name="hungInfo"/>.</returns>
internal static LatencyChecker.ClusDbHungAction AnalyzeAndSuggestActionForClusDbHang(LatencyChecker.ClusDbHungInfo hungInfo)
{
    LatencyChecker.ClusDbHungAction decision = new LatencyChecker.ClusDbHungAction
    {
        HungInfo = hungInfo,
        TakeAction = false,
        TargetNodes = hungInfo.HungNodes,
        Reason = "If you see this message, something is wrong..."
    };

    // Stage 1: initial verdict from the exception (if any) seen while probing.
    var caught = hungInfo.HungNodeApiException;
    if (caught == null)
    {
        decision.TakeAction = true;
        decision.Reason = "Hung node detected without any Exceptions caught.";
    }
    else if (caught is HungDetectionGumIdChangedException)
    {
        decision.TakeAction = false;
        decision.Reason = "GumId changed.";
    }
    else if (caught is OpenClusterTimedoutException)
    {
        decision.TakeAction = true;
        decision.Reason = string.Format(
            "OpenCluster timed-out for {0}",
            ((OpenClusterTimedoutException)caught).ServerName);
    }
    else if (caught is ClusterException)
    {
        decision.TakeAction = false;
        decision.Reason = "ClusterException was caught.";
    }

    // Stage 2: with no hung nodes reported, fall back to RPC-failed nodes, then the lock owner.
    bool hasHungNodes = decision.TargetNodes != null && decision.TargetNodes.Length >= 1;
    if (!hasHungNodes)
    {
        // The stage-1 verdict is overridden here regardless of what it was.
        decision.TakeAction = false;
        decision.Reason = "No hung node detected, and Rpc timeout did not catch anything.";

        var rpcFailedNodes = hungInfo.RpcFailedNodes;
        if (rpcFailedNodes != null && rpcFailedNodes.Length > 0)
        {
            // Pick the first RPC-failed node whose cluster state is known and not Unknown/Down.
            AmServerName chosenNode = null;
            foreach (AmServerName candidate in rpcFailedNodes)
            {
                AmNodeState state = AmNodeState.Unknown;
                bool stateKnown = hungInfo.ClusterNodesStatus.TryGetValue(candidate.NetbiosName, out state);
                if (!stateKnown || state == AmNodeState.Unknown || state == AmNodeState.Down)
                {
                    continue;
                }

                chosenNode = candidate;
                break;
            }

            if (chosenNode == null)
            {
                decision.TakeAction = false;
                decision.TargetNodes = null;
                decision.Reason = string.Format(
                    "No nodes in Rpc non-responsive list are UP according to cluster. Skipping reboot. Original list={0}",
                    LatencyChecker.ConvertAmServerNamesToString(hungInfo.RpcFailedNodes));
            }
            else
            {
                decision.TakeAction = true;
                decision.TargetNodes = new AmServerName[] { chosenNode };
                decision.Reason = string.Format(
                    "Hung nodes detected via Rpc timeout. Node '{0}' chosen for action. Original list={1}",
                    chosenNode.NetbiosName,
                    LatencyChecker.ConvertAmServerNamesToString(hungInfo.RpcFailedNodes));
            }
        }

        // Last resort: act on the current lock owner when nothing else was selected.
        if (!decision.TakeAction && !AmServerName.IsNullOrEmpty(hungInfo.CurrentLockOwnerName))
        {
            decision.TakeAction = true;
            decision.TargetNodes = new AmServerName[] { hungInfo.CurrentLockOwnerName };
            decision.Reason = string.Format(
                "Could not find any hung nodes, so taking restart/reboot action for the lock owner '{0}'",
                hungInfo.CurrentLockOwnerName.NetbiosName);
        }
    }

    ReplayCrimsonEvents.HungNodeAnalysisResult.Log<string>(decision.ToString());
    return decision;
}