/// <summary>
/// Carries out the recovery action that was chosen for a hung cluster database update.
/// </summary>
/// <remarks>
/// When <c>RegistryParameters.IsKillClusterServiceOnClusApiHang</c> is set, a kill request for the
/// "Clussvc" service is sent to the first target node. When that setting is off, or the kill request
/// does not report success, a node restart is requested through <c>LatencyChecker.TriggerNodeRestart</c>.
/// </remarks>
/// <param name="action">
/// The analysis result to act on. A null action, or an action without <c>HungInfo</c>, is only logged.
/// An action whose <c>TakeAction</c> is false is ignored.
/// </param>
internal static void ActOnClusDbHang(LatencyChecker.ClusDbHungAction action)
{
    if (action == null || action.HungInfo == null)
    {
        ReplayCrimsonEvents.GenericMessage.Log<string>("ActOnClusDbHang: Action is null or action.HungInfo is null");
        return;
    }

    if (!action.TakeAction)
    {
        return;
    }

    string targetNodeNames = LatencyChecker.ConvertAmServerNamesToString(action.TargetNodes);
    ReplayCrimsonEvents.HungNodeRecoveryActionStart.Log<string>(targetNodeNames);

    bool killHandled;
    if (RegistryParameters.IsKillClusterServiceOnClusApiHang)
    {
        // With no target node there is nothing to kill; this counts as handled,
        // so no restart is requested in that case either.
        killHandled = true;
        if (action.TargetNodes != null && action.TargetNodes.Length > 0)
        {
            // Only the first target node receives the kill request.
            AmServerName firstTarget = action.TargetNodes[0];
            RpcKillServiceImpl.Reply reply = RpcKillServiceImpl.SendKillRequest(
                firstTarget.Fqdn,
                "Clussvc",
                action.HungInfo.ApiHungStartTime.LocalTime,
                false,
                RegistryParameters.RpcKillServiceTimeoutInMSec);
            killHandled = reply != null && reply.IsSucceeded;
        }
    }
    else
    {
        killHandled = false;
        ReplayCrimsonEvents.SkippedSendingClussvcKillRequest.LogPeriodic(action.HungInfo.ApiName, TimeSpan.FromMinutes(15.0));
    }

    if (killHandled)
    {
        return;
    }

    // The service kill was skipped or failed: escalate to a node restart.
    ReplayCrimsonEvents.HungNodeRebootRequested.Log<string>(targetNodeNames);
    string lockOwnerName = (action.HungInfo.CurrentLockOwnerName != null)
        ? action.HungInfo.CurrentLockOwnerName.NetbiosName
        : "NULL";
    LatencyChecker.TriggerNodeRestart(
        action.HungInfo.CurrentGumId.ToString(),
        lockOwnerName,
        targetNodeNames,
        action.HungInfo,
        action);
}
/// <summary>
/// Publishes a critical "ClusterHung" event notification built from
/// <c>LatencyChecker.LastKnownHungInfo</c>.
/// </summary>
/// <remarks>
/// When no hang information is available, the node lists are reported as empty strings and the
/// hang description as "NULL".
/// </remarks>
private static void RaiseRedEvent()
{
    // Read the static once so the null check and every later use see the same instance.
    var lastHungInfo = LatencyChecker.LastKnownHungInfo;

    string hungNodes = string.Empty;
    string rpcFailedNodes = string.Empty;
    string hungInfoText = "NULL";
    if (lastHungInfo != null)
    {
        hungNodes = LatencyChecker.ConvertAmServerNamesToString(lastHungInfo.HungNodes);
        rpcFailedNodes = LatencyChecker.ConvertAmServerNamesToString(lastHungInfo.RpcFailedNodes);
        hungInfoText = lastHungInfo.ToString();
    }

    EventNotificationItem notification = new EventNotificationItem(
        "MSExchangeRepl",
        "Cluster",
        "ClusterHung",
        string.Format("ClusDb write timed out. HungNodeInfo={0}", hungInfoText),
        string.Format("HungNodeApi={0}", hungNodes),
        ResultSeverityLevel.Critical);
    notification.StateAttribute2 = string.Format("RpcHungNode={0}", rpcFailedNodes);
    notification.Publish(false);
}
/// <summary>
/// Renders the gathered hang information as a single line of quoted name/value pairs.
/// </summary>
/// <remarks>
/// A null lock owner, a null or empty API name, a null or empty cluster status map, a null
/// detection exception and a null or empty RPC exception list are each rendered as "NULL".
/// </remarks>
/// <returns>The formatted description.</returns>
public override string ToString()
{
    var gumId = this.CurrentGumId;

    string lockOwner = "NULL";
    if (this.CurrentLockOwnerName != null)
    {
        lockOwner = this.CurrentLockOwnerName.NetbiosName;
    }

    string apiName = string.IsNullOrEmpty(this.ApiName) ? "NULL" : this.ApiName;
    string hungSince = this.ApiHungStartTime.ToString("o");
    string hungNodes = LatencyChecker.ConvertAmServerNamesToString(this.HungNodes);
    string rpcFailedNodes = LatencyChecker.ConvertAmServerNamesToString(this.RpcFailedNodes);

    // One "name=state" entry per cluster node, comma separated.
    string clusterStatus = "NULL";
    if (this.ClusterNodesStatus != null && this.ClusterNodesStatus.Count > 0)
    {
        List<string> entries = new List<string>();
        foreach (KeyValuePair<string, AmNodeState> entry in this.ClusterNodesStatus)
        {
            entries.Add(string.Format("{0}={1}", entry.Key, entry.Value));
        }
        clusterStatus = string.Join(",", entries);
    }

    string detectionError = "NULL";
    if (this.HungNodeApiException != null)
    {
        detectionError = this.HungNodeApiException.Message;
    }

    string rpcErrors = "NULL";
    if (this.RpcExceptions != null && this.RpcExceptions.Length > 0)
    {
        rpcErrors = string.Join(",", this.RpcExceptions.Select(rpcException => rpcException.Message));
    }

    return string.Format(
        "CurrentGumId: '{0}' LockOwner: '{1}' ApiName: '{2}' ApiHungStartTime: '{3}' HungNodes: '{4}' RpcFailedNodes: '{5}' ClusterStatus: '{6}' HungNodeApiEx: '{7}' RpcExs: '{8}'",
        gumId,
        lockOwner,
        apiName,
        hungSince,
        hungNodes,
        rpcFailedNodes,
        clusterStatus,
        detectionError,
        rpcErrors);
}
/// <summary>
/// Renders the suggested action as a single line: whether action is to be taken, the target
/// nodes, and the reason. A null or empty reason is rendered as "NULL".
/// </summary>
/// <returns>The formatted description.</returns>
public override string ToString()
{
    bool takeAction = this.TakeAction;
    string targetNodes = LatencyChecker.ConvertAmServerNamesToString(this.TargetNodes);

    string reason = this.Reason;
    if (string.IsNullOrEmpty(reason))
    {
        reason = "NULL";
    }

    return string.Format("TakeAction: '{0}' TargetNodes: '{1}' Reason: '{2}'", takeAction, targetNodes, reason);
}
/// <summary>
/// Decides, from the gathered hang information, whether a recovery action should be taken and
/// against which node(s), and logs the decision.
/// </summary>
/// <remarks>
/// Decision steps, in order:
/// <list type="number">
/// <item>If hung-node detection raised an exception: a GumId change or a generic
/// <c>ClusterException</c> means no action; an OpenCluster timeout means action.</item>
/// <item>If detection raised no exception, action is taken against <c>hungInfo.HungNodes</c>.</item>
/// <item>If there are no target nodes, the decision so far is discarded. The first RPC-failed node
/// whose cluster state is neither Unknown nor Down is chosen; failing that, the current lock owner.</item>
/// </list>
/// </remarks>
/// <param name="hungInfo">The hang information to analyze.</param>
/// <returns>The suggested action, with <c>Reason</c> describing how it was reached.</returns>
internal static LatencyChecker.ClusDbHungAction AnalyzeAndSuggestActionForClusDbHang(LatencyChecker.ClusDbHungInfo hungInfo)
{
    LatencyChecker.ClusDbHungAction clusDbHungAction = new LatencyChecker.ClusDbHungAction();
    clusDbHungAction.HungInfo = hungInfo;
    clusDbHungAction.TakeAction = false;
    clusDbHungAction.TargetNodes = hungInfo.HungNodes;
    // Placeholder; it only survives if detection raised an exception type not handled below.
    clusDbHungAction.Reason = "If you see this message, something is wrong...";

    if (hungInfo.HungNodeApiException != null)
    {
        // Order matters: the specific exception types are tested before ClusterException.
        if (hungInfo.HungNodeApiException is HungDetectionGumIdChangedException)
        {
            clusDbHungAction.TakeAction = false;
            clusDbHungAction.Reason = "GumId changed.";
        }
        else if (hungInfo.HungNodeApiException is OpenClusterTimedoutException)
        {
            clusDbHungAction.TakeAction = true;
            OpenClusterTimedoutException ex = (OpenClusterTimedoutException)hungInfo.HungNodeApiException;
            clusDbHungAction.Reason = string.Format("OpenCluster timed-out for {0}", ex.ServerName);
        }
        else if (hungInfo.HungNodeApiException is ClusterException)
        {
            clusDbHungAction.TakeAction = false;
            clusDbHungAction.Reason = "ClusterException was caught.";
        }
    }
    else
    {
        clusDbHungAction.TakeAction = true;
        clusDbHungAction.Reason = "Hung node detected without any Exceptions caught.";
    }

    if (clusDbHungAction.TargetNodes == null || clusDbHungAction.TargetNodes.Length < 1)
    {
        // No hung node was identified: whatever was decided above is reset, and the RPC scan
        // results and then the lock owner are consulted instead.
        clusDbHungAction.TakeAction = false;
        clusDbHungAction.Reason = "No hung node detected, and Rpc timeout did not catch anything.";

        if (hungInfo.RpcFailedNodes != null && hungInfo.RpcFailedNodes.Length > 0)
        {
            // The node status map may be null; in that case no node can be confirmed as up,
            // which is handled the same way as "none of the failed nodes is up".
            var clusterNodesStatus = hungInfo.ClusterNodesStatus;
            AmServerName chosenNode = null;
            if (clusterNodesStatus != null)
            {
                foreach (AmServerName candidate in hungInfo.RpcFailedNodes)
                {
                    AmNodeState nodeState = AmNodeState.Unknown;
                    if (clusterNodesStatus.TryGetValue(candidate.NetbiosName, out nodeState)
                        && nodeState != AmNodeState.Unknown
                        && nodeState != AmNodeState.Down)
                    {
                        chosenNode = candidate;
                        break;
                    }
                }
            }

            if (chosenNode != null)
            {
                clusDbHungAction.TakeAction = true;
                clusDbHungAction.TargetNodes = new AmServerName[] { chosenNode };
                clusDbHungAction.Reason = string.Format("Hung nodes detected via Rpc timeout. Node '{0}' chosen for action. Original list={1}", chosenNode.NetbiosName, LatencyChecker.ConvertAmServerNamesToString(hungInfo.RpcFailedNodes));
            }
            else
            {
                clusDbHungAction.TakeAction = false;
                clusDbHungAction.TargetNodes = null;
                clusDbHungAction.Reason = string.Format("No nodes in Rpc non-responsive list are UP according to cluster. Skipping reboot. Original list={0}", LatencyChecker.ConvertAmServerNamesToString(hungInfo.RpcFailedNodes));
            }
        }

        // Last resort: act on the current lock owner when nothing else was selected.
        if (!clusDbHungAction.TakeAction && !AmServerName.IsNullOrEmpty(hungInfo.CurrentLockOwnerName))
        {
            clusDbHungAction.TakeAction = true;
            clusDbHungAction.TargetNodes = new AmServerName[] { hungInfo.CurrentLockOwnerName };
            clusDbHungAction.Reason = string.Format("Could not find any hung nodes, so taking restart/reboot action for the lock owner '{0}'", hungInfo.CurrentLockOwnerName.NetbiosName);
        }
    }

    ReplayCrimsonEvents.HungNodeAnalysisResult.Log<string>(clusDbHungAction.ToString());
    return clusDbHungAction;
}
/// <summary>
/// Collects diagnostic information about a cluster API call that appears to be hung.
/// </summary>
/// <remarks>
/// Two sources are combined:
/// <list type="bullet">
/// <item><c>HungNodesInfo.GetNodesHungInClusDbUpdate</c> supplies the current GumId, lock owner and
/// hung nodes. The three exception types caught here are recorded in <c>HungNodeApiException</c>
/// rather than propagated.</item>
/// <item>A copy-status fetch across the DAG servers supplies per-server exceptions. Servers that
/// fail with <c>ReplayServiceDownException</c> are reported as <c>RpcFailedNodes</c>; every
/// exception found is kept in <c>RpcExceptions</c>.</item>
/// </list>
/// The result is logged before returning.
/// </remarks>
/// <param name="latencyContext">Context of the API call being examined (name, start time, hint, allowed latency).</param>
/// <returns>The populated hang information.</returns>
internal static LatencyChecker.ClusDbHungInfo GatherHungNodesInformation(LatencyChecker.LatencyContext latencyContext)
{
    LatencyChecker.ClusDbHungInfo clusDbHungInfo = new LatencyChecker.ClusDbHungInfo();
    TimeSpan elapsed = ExDateTime.Now - latencyContext.StartTime;
    clusDbHungInfo.ApiName = latencyContext.ApiName;
    clusDbHungInfo.ApiHungStartTime = latencyContext.StartTime;
    ReplayCrimsonEvents.ClusApiOperationAppearsToBeHung.Log<string, ExDateTime, TimeSpan, string, TimeSpan>(latencyContext.ApiName, latencyContext.StartTime, elapsed, latencyContext.HintStr, latencyContext.MaxAllowedLatency);

    clusDbHungInfo.HungNodeApiException = null;
    try
    {
        ReplayCrimsonEvents.AttemptingToGetHungNodes.Log<string, ExDateTime, LatencyChecker.LatencyContext>(latencyContext.ApiName, latencyContext.StartTime, latencyContext);
        HungNodesInfo nodesHungInClusDbUpdate = HungNodesInfo.GetNodesHungInClusDbUpdate();
        if (nodesHungInClusDbUpdate != null)
        {
            ReplayCrimsonEvents.HungNodeDetectionCompleted.Log<int, AmServerName, HungNodesInfo>(nodesHungInClusDbUpdate.CurrentGumId, nodesHungInClusDbUpdate.CurrentLockOwnerName, nodesHungInClusDbUpdate);
            clusDbHungInfo.CurrentGumId = nodesHungInClusDbUpdate.CurrentGumId;
            clusDbHungInfo.CurrentLockOwnerName = nodesHungInClusDbUpdate.CurrentLockOwnerName;
            clusDbHungInfo.HungNodes = nodesHungInClusDbUpdate.NodeMap.Values.ToArray<AmServerName>();
        }
    }
    catch (HungDetectionGumIdChangedException ex)
    {
        clusDbHungInfo.HungNodeApiException = ex;
        ReplayCrimsonEvents.HungActionSkippedSinceGumIdChanged.Log<int, int, string, long>(ex.LocalGumId, ex.RemoteGumId, ex.LockOwnerName, ex.HungNodesMask);
    }
    catch (OpenClusterTimedoutException ex2)
    {
        // The server whose OpenCluster call timed out is itself reported as the hung node.
        clusDbHungInfo.HungNodeApiException = ex2;
        clusDbHungInfo.HungNodes = new AmServerName[] { new AmServerName(ex2.ServerName) };
        ReplayCrimsonEvents.OpenClusterCallHung.Log<string, string, string>(ex2.ServerName, ex2.Message, ex2.Context);
    }
    catch (ClusterException ex3)
    {
        clusDbHungInfo.HungNodeApiException = ex3;
        ReplayCrimsonEvents.HungNodeDetectionFailed.Log<string, string>(ex3.Message, ex3.ToString());
    }

    // Snapshot the server list once; the fetcher and the exception scan below must agree on it.
    List<AmServerName> dagServers = LatencyChecker.DagServers;
    ReplayCrimsonEvents.HungNodeRpcScanStart.Log<string>(LatencyChecker.ConvertAmServerNamesToString(dagServers));
    AmMultiNodeCopyStatusFetcher amMultiNodeCopyStatusFetcher = new AmMultiNodeCopyStatusFetcher(dagServers, LatencyChecker.DatabaseMap, RpcGetDatabaseCopyStatusFlags2.None, null, false, 60000);
    amMultiNodeCopyStatusFetcher.GetStatus();

    List<AmServerName> rpcFailedNodes = new List<AmServerName>();
    List<Exception> rpcExceptions = new List<Exception>();
    clusDbHungInfo.ClusterNodesStatus = LatencyChecker.QueryClusterNodeStatus(TimeSpan.FromSeconds(30.0), true);
    foreach (AmServerName server in dagServers)
    {
        Exception possibleExceptionForServer = amMultiNodeCopyStatusFetcher.GetPossibleExceptionForServer(server);
        if (possibleExceptionForServer != null)
        {
            // Only ReplayServiceDownException marks a node as RPC-failed; other exceptions are just recorded.
            if (possibleExceptionForServer is ReplayServiceDownException)
            {
                rpcFailedNodes.Add(server);
            }
            rpcExceptions.Add(possibleExceptionForServer);
        }
    }

    clusDbHungInfo.RpcFailedNodes = rpcFailedNodes.ToArray();
    clusDbHungInfo.RpcExceptions = rpcExceptions.ToArray();
    ReplayCrimsonEvents.HungNodeInformationLog.Log<string>(clusDbHungInfo.ToString());
    return clusDbHungInfo;
}