Ejemplo n.º 1
0
 /// <summary>
 /// Carries out the decision that was produced for a cluster database hang.
 /// </summary>
 /// <remarks>
 /// Nothing is done unless <c>TakeAction</c> is set on <paramref name="action"/>. When
 /// <c>RegistryParameters.IsKillClusterServiceOnClusApiHang</c> is on, a kill request for the
 /// "Clussvc" service is sent to the first target node only. A node restart notification is
 /// raised through <c>TriggerNodeRestart</c> when that switch is off or when the kill request
 /// does not report success.
 /// </remarks>
 /// <param name="action">
 /// Decision to carry out. A null action, or one whose <c>HungInfo</c> is null, is only logged.
 /// </param>
 internal static void ActOnClusDbHang(LatencyChecker.ClusDbHungAction action)
 {
     if (action == null || action.HungInfo == null)
     {
         ReplayCrimsonEvents.GenericMessage.Log<string>("ActOnClusDbHang: Action is null or action.HungInfo is null");
         return;
     }

     if (!action.TakeAction)
     {
         return;
     }

     // Built once and reused for the start event, the reboot event and the restart request.
     string targetNodeNames = LatencyChecker.ConvertAmServerNamesToString(action.TargetNodes);
     ReplayCrimsonEvents.HungNodeRecoveryActionStart.Log<string>(targetNodeNames);

     bool recoveredByServiceKill;
     if (RegistryParameters.IsKillClusterServiceOnClusApiHang)
     {
         // With no target node there is nothing to kill, and no restart is requested either.
         recoveredByServiceKill = true;
         if (action.TargetNodes != null && action.TargetNodes.Length > 0)
         {
             AmServerName firstTarget = action.TargetNodes[0];
             RpcKillServiceImpl.Reply reply = RpcKillServiceImpl.SendKillRequest(firstTarget.Fqdn, "Clussvc", action.HungInfo.ApiHungStartTime.LocalTime, false, RegistryParameters.RpcKillServiceTimeoutInMSec);
             recoveredByServiceKill = reply != null && reply.IsSucceeded;
         }
     }
     else
     {
         recoveredByServiceKill = false;
         ReplayCrimsonEvents.SkippedSendingClussvcKillRequest.LogPeriodic(action.HungInfo.ApiName, TimeSpan.FromMinutes(15.0));
     }

     if (!recoveredByServiceKill)
     {
         ReplayCrimsonEvents.HungNodeRebootRequested.Log<string>(targetNodeNames);
         string gumId = action.HungInfo.CurrentGumId.ToString();
         // "NULL" is the literal placeholder used when no lock owner is recorded.
         string lockOwner = (action.HungInfo.CurrentLockOwnerName != null) ? action.HungInfo.CurrentLockOwnerName.NetbiosName : "NULL";
         LatencyChecker.TriggerNodeRestart(gumId, lockOwner, targetNodeNames, action.HungInfo, action);
     }
 }
Ejemplo n.º 2
0
 /// <summary>
 /// Handles a cluster API hang: gathers the hang information, stores it in
 /// <c>LastKnownHungInfo</c>, derives an action from it and carries that action out.
 /// </summary>
 /// <param name="context">
 /// Callback state; it is cast to <c>LatencyChecker.LatencyContext</c>, so any other
 /// runtime type makes the cast throw.
 /// </param>
 internal static void OnClusApiHang(object context)
 {
     LatencyChecker.ClusDbHungInfo hungInfo = LatencyChecker.GatherHungNodesInformation((LatencyChecker.LatencyContext)context);

     // Stored before the analysis runs, so the latest information is kept even if a later step throws.
     LatencyChecker.LastKnownHungInfo = hungInfo;

     LatencyChecker.ActOnClusDbHang(LatencyChecker.AnalyzeAndSuggestActionForClusDbHang(hungInfo));
 }
Ejemplo n.º 3
0
        /// <summary>
        /// Publishes a critical "ClusterNodeRestart" event notification that describes the detected cluster hang.
        /// </summary>
        /// <param name="currentGumId">Text placed in the message as <c>GumId</c>.</param>
        /// <param name="currentLockOwnerName">Text placed in the message as <c>LockOwner</c>.</param>
        /// <param name="hungNodeCsv">
        /// Text placed in the message as <c>HungNodes</c>; also passed to the notification item as a separate argument.
        /// </param>
        /// <param name="hungInfo">Hang information; its <c>ToString()</c> output is embedded in the message, so it must not be null.</param>
        /// <param name="hungAction">Decision taken; its <c>ToString()</c> output is embedded in the message, so it must not be null.</param>
        private static void TriggerNodeRestart(string currentGumId, string currentLockOwnerName, string hungNodeCsv, LatencyChecker.ClusDbHungInfo hungInfo, LatencyChecker.ClusDbHungAction hungAction)
        {
            string description = string.Format(
                "Cluster Hung detected. GumId={0}, LockOwner={1}, HungNodes={2}, HungInfo={3}, Decision={4}",
                currentGumId,
                currentLockOwnerName,
                hungNodeCsv,
                hungInfo.ToString(),
                hungAction.ToString());

            EventNotificationItem notification = new EventNotificationItem("MSExchangeRepl", "Cluster", "ClusterNodeRestart", description, hungNodeCsv, ResultSeverityLevel.Critical);
            notification.Publish(false);
        }
Ejemplo n.º 4
0
 /// <summary>
 /// Builds the recovery decision for a cluster database hang from the gathered hang information.
 /// </summary>
 /// <remarks>
 /// The initial decision depends on the exception stored in <c>HungNodeApiException</c>, if any.
 /// When no hung nodes are listed, the method falls back first to the first RPC-failed node whose
 /// cluster state is known and is neither Unknown nor Down, and then to the current lock owner.
 /// The resulting decision is logged before it is returned.
 /// </remarks>
 /// <param name="hungInfo">Hang information to analyze; it is dereferenced without a null check.</param>
 /// <returns>The action stating whether to act, on which nodes, and the reason.</returns>
 internal static LatencyChecker.ClusDbHungAction AnalyzeAndSuggestActionForClusDbHang(LatencyChecker.ClusDbHungInfo hungInfo)
 {
     LatencyChecker.ClusDbHungAction decision = new LatencyChecker.ClusDbHungAction
     {
         HungInfo = hungInfo,
         TakeAction = false,
         TargetNodes = hungInfo.HungNodes,
         // Placeholder; it stays in place only if no branch below overwrites it.
         Reason = "If you see this message, something is wrong..."
     };

     var apiException = hungInfo.HungNodeApiException;
     if (apiException == null)
     {
         decision.TakeAction = true;
         decision.Reason = "Hung node detected without any Exceptions caught.";
     }
     else if (apiException is HungDetectionGumIdChangedException)
     {
         decision.TakeAction = false;
         decision.Reason = "GumId changed.";
     }
     else if (apiException is OpenClusterTimedoutException)
     {
         decision.TakeAction = true;
         decision.Reason = string.Format("OpenCluster timed-out for {0}", ((OpenClusterTimedoutException)apiException).ServerName);
     }
     else if (apiException is ClusterException)
     {
         decision.TakeAction = false;
         decision.Reason = "ClusterException was caught.";
     }

     bool hasTargets = decision.TargetNodes != null && decision.TargetNodes.Length >= 1;
     if (!hasTargets)
     {
         // No hung node is known, so whatever was decided above is discarded.
         decision.TakeAction = false;
         decision.Reason = "No hung node detected, and Rpc timeout did not catch anything.";

         var rpcFailedNodes = hungInfo.RpcFailedNodes;
         if (rpcFailedNodes != null && rpcFailedNodes.Length > 0)
         {
             // Pick the first RPC-failed node whose cluster state is known and is neither Unknown nor Down.
             AmServerName chosenNode = null;
             foreach (AmServerName candidate in rpcFailedNodes)
             {
                 AmNodeState state;
                 if (!hungInfo.ClusterNodesStatus.TryGetValue(candidate.NetbiosName, out state))
                 {
                     continue;
                 }

                 if (state == AmNodeState.Unknown || state == AmNodeState.Down)
                 {
                     continue;
                 }

                 chosenNode = candidate;
                 break;
             }

             if (chosenNode == null)
             {
                 decision.TakeAction = false;
                 decision.TargetNodes = null;
                 decision.Reason = string.Format("No nodes in Rpc non-responsive list are UP according to cluster. Skipping reboot. Original list={0}", LatencyChecker.ConvertAmServerNamesToString(hungInfo.RpcFailedNodes));
             }
             else
             {
                 decision.TakeAction = true;
                 decision.TargetNodes = new AmServerName[] { chosenNode };
                 decision.Reason = string.Format("Hung nodes detected via Rpc timeout. Node '{0}' chosen for action. Original list={1}", chosenNode.NetbiosName, LatencyChecker.ConvertAmServerNamesToString(hungInfo.RpcFailedNodes));
             }
         }

         // Last resort: target the current lock owner when nothing else was selected.
         if (!decision.TakeAction && !AmServerName.IsNullOrEmpty(hungInfo.CurrentLockOwnerName))
         {
             decision.TakeAction = true;
             decision.TargetNodes = new AmServerName[] { hungInfo.CurrentLockOwnerName };
             decision.Reason = string.Format("Could not find any hung nodes, so taking restart/reboot action for the lock owner '{0}'", hungInfo.CurrentLockOwnerName.NetbiosName);
         }
     }

     ReplayCrimsonEvents.HungNodeAnalysisResult.Log<string>(decision.ToString());
     return decision;
 }