/// <summary>
        /// Verifies that <paramref name="target"/> is an acceptable candidate to
        /// become the active service before a failover is attempted.
        /// </summary>
        /// <remarks>
        /// The checks run in the order below; the first one that fails aborts the
        /// failover by throwing:
        /// 1) the current and the target service must not share an address;
        /// 2) the target's proxy and service status must be obtainable;
        /// 3) the target must report the standby state;
        /// 4) the target must report that it is ready to become active, unless
        /// <paramref name="forceActive"/> is set, in which case the reported
        /// reason is only logged as a warning;
        /// 5) the health check issued against the target must succeed.
        /// Forcing past a "not ready" answer is offered because performing the
        /// failover may itself allow the target to become active, eg because it
        /// triggers a log roll so the standby can learn about new blocks and
        /// leave safemode.
        /// </remarks>
        /// <param name="from">currently active service</param>
        /// <param name="target">service to make active</param>
        /// <param name="forceActive">continue even if target reports that it is not ready</param>
        /// <exception cref="Org.Apache.Hadoop.HA.FailoverFailedException">if we should avoid failover</exception>
        private void PreFailoverChecks(HAServiceTarget from, HAServiceTarget target, bool forceActive)
        {
            // Check 1: refuse to fail a service over to itself.
            if (from.GetAddress().Equals(target.GetAddress()))
            {
                throw new FailoverFailedException("Can't failover a service to itself");
            }

            // Check 2: the target has to be reachable and able to report its status.
            HAServiceProtocol targetProxy;
            HAServiceStatus targetStatus;
            try
            {
                targetProxy = target.GetProxy(conf, rpcTimeoutToNewActive);
                targetStatus = targetProxy.GetServiceStatus();
            }
            catch (IOException ioe)
            {
                string failure = "Unable to get service state for " + target;
                Log.Error(failure + ": " + ioe.GetLocalizedMessage());
                throw new FailoverFailedException(failure, ioe);
            }

            // Check 3: only a standby service may be promoted.
            bool targetIsStandby = targetStatus.GetState().Equals(HAServiceProtocol.HAServiceState.Standby);
            if (!targetIsStandby)
            {
                throw new FailoverFailedException("Can't failover to an active service");
            }

            // Check 4: readiness, which the caller may choose to override.
            if (!targetStatus.IsReadyToBecomeActive())
            {
                string reason = targetStatus.GetNotReadyReason();
                if (forceActive)
                {
                    Log.Warn("Service is not ready to become active, but forcing: " + reason);
                }
                else
                {
                    throw new FailoverFailedException(target + " is not ready to become active: " + reason);
                }
            }

            // Check 5: health. The health-check failure is caught ahead of the
            // general IO failure so that it gets its own, more specific message.
            try
            {
                HAServiceProtocolHelper.MonitorHealth(targetProxy, CreateReqInfo());
            }
            catch (HealthCheckFailedException unhealthy)
            {
                throw new FailoverFailedException("Can't failover to an unhealthy service", unhealthy);
            }
            catch (IOException ioe)
            {
                throw new FailoverFailedException("Got an IO exception", ioe);
            }
        }
Example #2
0
        /// <summary>
        /// Attempts to fence <paramref name="target"/> through an SSH session
        /// opened to the host of its address.
        /// </summary>
        /// <remarks>
        /// The session is created from the parsed <paramref name="argsStr"/>,
        /// connected with the timeout returned by <c>GetSshConnectTimeout</c>, and
        /// then handed to <c>DoFence</c>. A <c>JSchException</c> raised by any of
        /// those three steps is logged as a warning and reported as a failed
        /// fence instead of being propagated. Once the connect step has
        /// succeeded, the session is disconnected before this method returns,
        /// whatever the outcome of the fencing step.
        /// </remarks>
        /// <param name="target">service whose address identifies the host to fence</param>
        /// <param name="argsStr">raw argument string, parsed by <c>SshFenceByTcpPort.Args</c></param>
        /// <returns>
        /// the result of <c>DoFence</c>, or false when the session could not be
        /// created or connected, or when fencing raised a <c>JSchException</c>
        /// </returns>
        /// <exception cref="Org.Apache.Hadoop.HA.BadFencingConfigurationException"/>
        public virtual bool TryFence(HAServiceTarget target, string argsStr)
        {
            SshFenceByTcpPort.Args fenceArgs = new SshFenceByTcpPort.Args(argsStr);
            IPEndPoint endpoint = target.GetAddress();
            string hostName = endpoint.GetHostName();

            // Step 1: build the (not yet connected) session.
            Session ssh;
            try
            {
                ssh = CreateSession(endpoint.GetHostName(), fenceArgs);
            }
            catch (JSchException createError)
            {
                Log.Warn("Unable to create SSH session", createError);
                return false;
            }

            // Step 2: connect it.
            Log.Info("Connecting to " + hostName + "...");
            try
            {
                ssh.Connect(GetSshConnectTimeout());
            }
            catch (JSchException connectError)
            {
                Log.Warn("Unable to connect to " + hostName + " as user " + fenceArgs.user, connectError);
                return false;
            }
            Log.Info("Connected to " + hostName);

            // Step 3: fence over the connected session, always disconnecting afterwards.
            try
            {
                return DoFence(ssh, endpoint);
            }
            catch (JSchException fenceError)
            {
                Log.Warn("Unable to achieve fencing on remote host", fenceError);
                return false;
            }
            finally
            {
                ssh.Disconnect();
            }
        }
Example #3
0
        /// <summary>Drives a graceful failover that makes the local node active.</summary>
        /// <remarks>
        /// The failover proceeds through these phases:
        /// 1) Pre-flight checks: confirm the local node is eligible for failover.
        /// 2) Look up the current active node. If there is none, fail; if it is
        /// the local node, there is nothing to do and the method returns.
        /// 3) Ask the current active node to cede its active state for a limited
        /// time (twice the graceful fence timeout).
        /// 4) Let the normal election path run in other threads and wait for it
        /// to record an attempt to become active. If no attempt is recorded,
        /// fail, reporting whether the service became unhealthy meanwhile.
        /// 5) Tell the old active node that it may rejoin the election, so that a
        /// future fail-back is possible, then report the outcome of the attempt.
        /// </remarks>
        /// <exception cref="Org.Apache.Hadoop.HA.ServiceFailedException"/>
        /// <exception cref="System.IO.IOException"/>
        /// <exception cref="System.Exception"/>
        private void DoGracefulFailover()
        {
            int cedeTimeout = FailoverController.GetGracefulFenceTimeout(conf) * 2;

            // Phase 1: pre-flight checks
            CheckEligibleForFailover();

            // Phase 2: find out who is active right now, and make sure it is
            // somebody other than ourselves.
            HAServiceTarget currentActive = GetCurrentActive();
            if (currentActive == null)
            {
                // Nobody is active. If a normal election has not already made
                // us active, something is probably preventing us from
                // becoming active at all.
                throw new ServiceFailedException("No other node is currently active.");
            }
            if (currentActive.GetAddress().Equals(localTarget.GetAddress()))
            {
                Log.Info("Local node " + localTarget + " is already active. No need to failover. Returning success.");
                return;
            }

            // Phase 3: ask the current active node to step out of the election.
            Log.Info("Asking " + currentActive + " to cede its active state for " + cedeTimeout + "ms");
            ZKFCProtocol activeZkfc = currentActive.GetZKFCProxy(conf, cedeTimeout);
            activeZkfc.CedeActive(cedeTimeout);

            // Phase 4: wait for the normal election to make the local node
            // active. The wait is the cede timeout plus a fixed extra 60000
            // (presumably milliseconds, matching the "ms" in the log line
            // above - confirm against WaitForActiveAttempt).
            int waitLimit = cedeTimeout + 60000;
            ZKFailoverController.ActiveAttemptRecord record = WaitForActiveAttempt(waitLimit);
            if (record == null)
            {
                // No attempt to become active was made at all. Read the health
                // state under the lock to decide which explanation to give.
                lock (this)
                {
                    if (lastHealthState != HealthMonitor.State.ServiceHealthy)
                    {
                        throw new ServiceFailedException("Unable to become active. Service became unhealthy while trying to failover.");
                    }
                }
                throw new ServiceFailedException("Unable to become active. Local node did not get an opportunity to do so from ZooKeeper, or the local node took too long to transition to active.");
            }

            // Phase 5: some attempt to become active was made, so the old
            // active node may rejoin the election if it wants. This allows a
            // quick fail-back if we immediately crash.
            activeZkfc.CedeActive(-1);

            if (!record.succeeded)
            {
                // Propagate failure
                throw new ServiceFailedException("Failed to become active. " + record.status);
            }
            Log.Info("Successfully became active. " + record.status);
        }