/// <summary>
/// Validate that <paramref name="target"/> is an acceptable failover
/// destination before any failover is attempted.
/// </summary>
/// <remarks>
/// The checks run in this order, each one aborting the failover on failure:
/// 1) the source and the target must not share the same address;
/// 2) the target's service status must be obtainable over RPC;
/// 3) the target must currently be in the standby state;
/// 4) the target must report that it is ready to become active, unless
/// <paramref name="forceActive"/> is set, in which case the not-ready
/// reason is only logged as a warning;
/// 5) the target must pass a health check.
/// </remarks>
/// <param name="from">currently active service</param>
/// <param name="target">service to make active</param>
/// <param name="forceActive">
/// continue even if the target reports that it is not ready to become active
/// </param>
/// <exception cref="Org.Apache.Hadoop.HA.FailoverFailedException">
/// if any of the checks fails and failover should be avoided
/// </exception>
private void PreFailoverChecks(HAServiceTarget from, HAServiceTarget target, bool forceActive)
{
    // A service cannot fail over onto itself.
    if (from.GetAddress().Equals(target.GetAddress()))
    {
        throw new FailoverFailedException("Can't failover a service to itself");
    }

    // Fetch a proxy to the target and ask for its current status.
    HAServiceProtocol targetProxy;
    HAServiceStatus targetStatus;
    try
    {
        targetProxy = target.GetProxy(conf, rpcTimeoutToNewActive);
        targetStatus = targetProxy.GetServiceStatus();
    }
    catch (IOException e)
    {
        string msg = "Unable to get service state for " + target;
        Log.Error(msg + ": " + e.GetLocalizedMessage());
        throw new FailoverFailedException(msg, e);
    }

    // Only a standby service is a valid failover destination.
    bool targetIsStandby = targetStatus.GetState().Equals(HAServiceProtocol.HAServiceState.Standby);
    if (!targetIsStandby)
    {
        throw new FailoverFailedException("Can't failover to an active service");
    }

    // Readiness can be overridden by the caller; the reason is reported either way.
    if (!targetStatus.IsReadyToBecomeActive())
    {
        string reason = targetStatus.GetNotReadyReason();
        if (forceActive)
        {
            Log.Warn("Service is not ready to become active, but forcing: " + reason);
        }
        else
        {
            throw new FailoverFailedException(target + " is not ready to become active: " + reason);
        }
    }

    // Final gate: the target has to pass a health check.
    try
    {
        HAServiceProtocolHelper.MonitorHealth(targetProxy, CreateReqInfo());
    }
    catch (HealthCheckFailedException hce)
    {
        throw new FailoverFailedException("Can't failover to an unhealthy service", hce);
    }
    catch (IOException e)
    {
        throw new FailoverFailedException("Got an IO exception", e);
    }
}
/// <summary>
/// Attempt to fence <paramref name="target"/> over an SSH session to its host.
/// </summary>
/// <remarks>
/// An SSH session is created for the target's host name, connected using the
/// configured connect timeout, and then handed to <c>DoFence</c>. A
/// <c>JSchException</c> raised at any of these three stages is logged as a
/// warning and reported as a failed fencing attempt. Once the session has been
/// connected it is always disconnected after the fencing attempt.
/// </remarks>
/// <param name="target">service whose host should be fenced</param>
/// <param name="argsStr">argument string parsed into <c>SshFenceByTcpPort.Args</c></param>
/// <returns>
/// the result of <c>DoFence</c>, or <c>false</c> if creating the session,
/// connecting, or fencing threw a <c>JSchException</c>
/// </returns>
/// <exception cref="Org.Apache.Hadoop.HA.BadFencingConfigurationException"/>
public virtual bool TryFence(HAServiceTarget target, string argsStr)
{
    SshFenceByTcpPort.Args fenceArgs = new SshFenceByTcpPort.Args(argsStr);
    IPEndPoint targetAddr = target.GetAddress();
    string hostName = targetAddr.GetHostName();

    // Stage 1: build the SSH session.
    Session sshSession;
    try
    {
        sshSession = CreateSession(targetAddr.GetHostName(), fenceArgs);
    }
    catch (JSchException e)
    {
        Log.Warn("Unable to create SSH session", e);
        return false;
    }

    // Stage 2: connect to the remote host.
    Log.Info("Connecting to " + hostName + "...");
    try
    {
        sshSession.Connect(GetSshConnectTimeout());
    }
    catch (JSchException e)
    {
        Log.Warn("Unable to connect to " + hostName + " as user " + fenceArgs.user, e);
        return false;
    }
    Log.Info("Connected to " + hostName);

    // Stage 3: fence, and always tear the session down afterwards.
    bool fenced;
    try
    {
        fenced = DoFence(sshSession, targetAddr);
    }
    catch (JSchException e)
    {
        Log.Warn("Unable to achieve fencing on remote host", e);
        fenced = false;
    }
    finally
    {
        sshSession.Disconnect();
    }
    return fenced;
}
/// <summary>Coordinate a graceful failover to the local node.</summary>
/// <remarks>
/// The failover proceeds in phases:
/// 1) Pre-flight: check that the local node is eligible for failover.
/// 2) Look up the current active node. If there is none, fail; if it is
/// the local node, there is nothing to do and the method returns.
/// 3) Ask the current active node to cede its active state for the
/// timeout period (twice the configured graceful fence timeout).
/// 4) Wait for an attempt by the local node to become active. If no attempt
/// is recorded in time, fail, reporting an unhealthy local service if that
/// is the last recorded health state.
/// 5) Tell the old active it may rejoin the election, so that a quick
/// fail-back is possible, then report the outcome of the attempt.
/// </remarks>
/// <exception cref="Org.Apache.Hadoop.HA.ServiceFailedException">
/// if no other node is active, no attempt to become active was made in
/// time, or the attempt to become active failed
/// </exception>
/// <exception cref="System.IO.IOException"/>
/// <exception cref="System.Exception"/>
private void DoGracefulFailover()
{
    int timeout = FailoverController.GetGracefulFenceTimeout(conf) * 2;

    // Phase 1: pre-flight checks.
    CheckEligibleForFailover();

    // Phase 2: find out who is active right now.
    HAServiceTarget currentActive = GetCurrentActive();
    if (currentActive == null)
    {
        // Nobody is active. If a normal election has not already made us
        // active, something is probably preventing it.
        throw new ServiceFailedException("No other node is currently active.");
    }

    bool localAlreadyActive = currentActive.GetAddress().Equals(localTarget.GetAddress());
    if (localAlreadyActive)
    {
        Log.Info("Local node " + localTarget + " is already active. No need to failover. Returning success.");
        return;
    }

    // Phase 3: ask the current active to step out of the election.
    Log.Info("Asking " + currentActive + " to cede its active state for " + timeout + "ms");
    ZKFCProtocol activeZkfc = currentActive.GetZKFCProxy(conf, timeout);
    activeZkfc.CedeActive(timeout);

    // Phase 4: let the normal election run and wait for the local node's
    // attempt; the wait bound is the cede timeout plus an extra 60000.
    ZKFailoverController.ActiveAttemptRecord record = WaitForActiveAttempt(timeout + 60000);
    if (record == null)
    {
        // No attempt was made at all. Read the health state under the
        // instance lock to decide which failure to report.
        bool becameUnhealthy;
        lock (this)
        {
            becameUnhealthy = lastHealthState != HealthMonitor.State.ServiceHealthy;
        }

        if (becameUnhealthy)
        {
            throw new ServiceFailedException("Unable to become active. Service became unhealthy while trying to failover.");
        }
        throw new ServiceFailedException("Unable to become active. Local node did not get an opportunity to do so from ZooKeeper, or the local node took too long to transition to active.");
    }

    // Phase 5: some attempt was made, so the old active may rejoin the
    // election. This allows a quick fail-back if we immediately crash.
    activeZkfc.CedeActive(-1);

    if (!record.succeeded)
    {
        // Propagate the failure to the caller.
        string failure = "Failed to become active. " + record.status;
        throw new ServiceFailedException(failure);
    }
    Log.Info("Successfully became active. " + record.status);
}