Exemplo n.º 1
0
        /// <summary>
        /// Fences <paramref name="target"/>. A graceful fence is attempted first through a
        /// ZKFC-sourced <c>FailoverController</c>; only if that does not work is the
        /// target's own fencer invoked.
        /// </summary>
        /// <param name="target">the node that should be fenced</param>
        /// <remarks>
        /// Throws <c>RuntimeException</c> when the target's fencing configuration is
        /// rejected (after recording an unsuccessful active attempt) or when the
        /// fencer reports that it could not fence the target.
        /// </remarks>
        private void DoFence(HAServiceTarget target)
        {
            Log.Info("Should fence: " + target);

            FailoverController gracefulFencer = new FailoverController(
                conf, HAServiceProtocol.RequestSource.RequestByZkfc);
            if (gracefulFencer.TryGracefulFence(target))
            {
                // Open question carried over from the original author: the target
                // could be in standby but just about to go active -- is there a
                // race here?
                Log.Info("Successfully transitioned " + target + " to standby " + "state without fencing");
                return;
            }

            // The graceful path did not work; make sure real fencing is usable.
            try
            {
                target.CheckFencingConfigured();
            }
            catch (BadFencingConfigurationException badConfig)
            {
                Log.Error("Couldn't fence old active " + target, badConfig);
                ZKFailoverController.ActiveAttemptRecord failedAttempt =
                    new ZKFailoverController.ActiveAttemptRecord(false, "Unable to fence old active");
                RecordActiveAttempt(failedAttempt);
                throw new RuntimeException(badConfig);
            }

            bool fenced = target.GetFencer().Fence(target);
            if (!fenced)
            {
                throw new RuntimeException("Unable to fence " + target);
            }
        }
Exemplo n.º 2
0
        /// <summary>
        /// Runs a failover through a freshly created, user-sourced
        /// <c>FailoverController</c>, forwarding all four arguments unchanged and in
        /// the same order.
        /// </summary>
        /// <param name="tgt1">first target handed to the controller</param>
        /// <param name="tgt2">second target handed to the controller</param>
        /// <param name="forceFence">forwarded as the controller's force-fence flag</param>
        /// <param name="forceActive">forwarded as the controller's force-active flag</param>
        /// <exception cref="Org.Apache.Hadoop.HA.FailoverFailedException"/>
        private void DoFailover(HAServiceTarget tgt1, HAServiceTarget tgt2, bool forceFence, bool forceActive)
        {
            new FailoverController(conf, HAServiceProtocol.RequestSource.RequestByUser)
                .Failover(tgt1, tgt2, forceFence, forceActive);
        }
Exemplo n.º 3
0
        /// <summary>
        /// Handles the <c>-failover</c> command: validates the command line, resolves
        /// the two node arguments, and then either delegates to the ZKFC-driven
        /// graceful failover (auto-failover enabled) or runs a manual failover
        /// through a <c>FailoverController</c>.
        /// </summary>
        /// <param name="cmd">parsed command line; exactly two positional arguments are expected</param>
        /// <returns>0 on success, -1 on bad arguments or a failed failover</returns>
        /// <exception cref="System.IO.IOException"/>
        /// <exception cref="Org.Apache.Hadoop.HA.ServiceFailedException"/>
        private int Failover(CommandLine cmd)
        {
            bool forceFence = cmd.HasOption(Forcefence);
            bool forceActive = cmd.HasOption(Forceactive);

            int optionCount = 0;
            if (cmd.GetOptions() != null)
            {
                optionCount = cmd.GetOptions().Length;
            }

            string[] nodeIds = cmd.GetArgs();
            bool wellFormed = optionCount <= 3 && nodeIds.Length == 2;
            if (!wellFormed)
            {
                errOut.WriteLine("failover: incorrect arguments");
                PrintUsage(errOut, "-failover");
                return -1;
            }

            HAServiceTarget source = ResolveTarget(nodeIds[0]);
            HAServiceTarget destination = ResolveTarget(nodeIds[1]);

            // Both nodes must agree on whether auto-failover is in use.
            bool consistentAutoFailover = source.IsAutoFailoverEnabled() == destination.IsAutoFailoverEnabled();
            Preconditions.CheckState(consistentAutoFailover,
                                     "Inconsistent auto-failover configs between %s and %s!", source, destination);

            if (source.IsAutoFailoverEnabled())
            {
                if (forceFence || forceActive)
                {
                    // -forceActive is meaningless with auto-HA: an unhealthy node's
                    // ZKFC would quit the election again at the next health check.
                    //
                    // -forceFence has no known real use case with auto-HA, so it
                    // is not implemented.
                    errOut.WriteLine(Forcefence + " and " + Forceactive + " flags not " + "supported with auto-failover enabled.");
                    return -1;
                }

                try
                {
                    return GracefulFailoverThroughZKFCs(destination);
                }
                catch (NotSupportedException unsupported)
                {
                    errOut.WriteLine("Failover command is not supported with " + "auto-failover enabled: "
                                     + unsupported.GetLocalizedMessage());
                    return -1;
                }
            }

            // Manual failover path.
            FailoverController controller = new FailoverController(GetConf(), requestSource);
            int exitCode = 0;
            try
            {
                controller.Failover(source, destination, forceFence, forceActive);
                @out.WriteLine("Failover from " + nodeIds[0] + " to " + nodeIds[1] + " successful");
            }
            catch (FailoverFailedException failure)
            {
                errOut.WriteLine("Failover failed: " + failure.GetLocalizedMessage());
                exitCode = -1;
            }
            return exitCode;
        }
Exemplo n.º 4
0
        /// <summary>
        /// Asks the ZKFC of <paramref name="toNode"/> to perform a graceful failover
        /// and reports the outcome on the command's output streams.
        /// </summary>
        /// <remarks>
        /// A ZKFC proxy for the node is obtained using the "RPC timeout to new active"
        /// value from the configuration, and <c>GracefulFailover()</c> is invoked on it.
        /// A <c>ServiceFailedException</c> is reported on the error stream and turned
        /// into a -1 return value; any other exception is not handled here.
        /// </remarks>
        /// <param name="toNode">the node to fail to</param>
        /// <returns>status code (0 for success, -1 if the ZKFC reported a service failure)</returns>
        /// <exception cref="System.IO.IOException">
        /// declared by the original API; presumably raised by the proxy/RPC calls - not caught here
        /// </exception>
        private int GracefulFailoverThroughZKFCs(HAServiceTarget toNode)
        {
            int rpcTimeout = FailoverController.GetRpcTimeoutToNewActive(GetConf());
            ZKFCProtocol zkfc = toNode.GetZKFCProxy(GetConf(), rpcTimeout);

            int exitCode = 0;
            try
            {
                zkfc.GracefulFailover();
                @out.WriteLine("Failover to " + toNode + " successful");
            }
            catch (ServiceFailedException failure)
            {
                errOut.WriteLine("Failover failed: " + failure.GetLocalizedMessage());
                exitCode = -1;
            }
            return exitCode;
        }
Exemplo n.º 5
0
 /// <summary>
 /// Asks the local target to transition to standby and then records the local
 /// service state as standby, whether or not the transition call succeeded.
 /// </summary>
 private void BecomeStandby()
 {
     lock (this)
     {
         Log.Info("ZK Election indicated that " + localTarget + " should become standby");
         try
         {
             int fenceTimeout = FailoverController.GetGracefulFenceTimeout(conf);
             var localProxy = localTarget.GetProxy(conf, fenceTimeout);
             localProxy.TransitionToStandby(CreateReqInfo());
             Log.Info("Successfully transitioned " + localTarget + " to standby state");
         }
         catch (Exception failure)
         {
             // Deliberately best-effort: the failure is logged and otherwise ignored.
             // TODO handle this. It's a likely case since we probably got fenced
             // at the same time.
             Log.Error("Couldn't transition " + localTarget + " to standby state", failure);
         }

         // The state is recorded as standby even when the transition call failed.
         serviceState = HAServiceProtocol.HAServiceState.Standby;
     }
 }
Exemplo n.º 6
0
        /// <summary>
        /// Makes the local node give up the active role for a period of time, or
        /// cancels a previously requested delay.
        /// </summary>
        /// <remarks>
        /// When <paramref name="millisToCede"/> is zero or negative, the join delay is
        /// cleared and electability is rechecked immediately. Otherwise the local
        /// target is asked to transition to standby, the join delay is set to
        /// "now + millisToCede", the elector quits the election (requesting fencing if
        /// the standby transition failed with an IOException), the service state is
        /// set to Initializing, and electability is rechecked after both locks have
        /// been released.
        /// </remarks>
        /// <param name="millisToCede">
        /// how long, in milliseconds, to stay out of the election; a value &lt;= 0
        /// clears any existing delay
        /// </param>
        /// <exception cref="Org.Apache.Hadoop.Security.AccessControlException"/>
        /// <exception cref="Org.Apache.Hadoop.HA.ServiceFailedException"/>
        /// <exception cref="System.IO.IOException"/>
        private void DoCedeActive(int millisToCede)
        {
            // Read the timeout before taking any locks.
            int timeout = FailoverController.GetGracefulFenceTimeout(conf);

            // Lock elector to maintain lock ordering of elector -> ZKFC
            lock (elector)
            {
                lock (this)
                {
                    if (millisToCede <= 0)
                    {
                        // Cancel any pending delay and recheck right away (still
                        // holding both locks on this path).
                        delayJoiningUntilNanotime = 0;
                        RecheckElectability();
                        return;
                    }
                    Log.Info("Requested by " + UserGroupInformation.GetCurrentUser() + " at " + Server
                             .GetRemoteAddress() + " to cede active role.");
                    // Set to true only if the local standby transition fails below.
                    bool needFence = false;
                    try
                    {
                        localTarget.GetProxy(conf, timeout).TransitionToStandby(CreateReqInfo());
                        Log.Info("Successfully ensured local node is in standby mode");
                    }
                    catch (IOException ioe)
                    {
                        // The standby transition failed: still quit the election,
                        // but pass needFence = true to QuitElection.
                        Log.Warn("Unable to transition local node to standby: " + ioe.GetLocalizedMessage
                                     ());
                        Log.Warn("Quitting election but indicating that fencing is " + "necessary");
                        needFence = true;
                    }
                    // Record the earliest nanotime at which joining is allowed again.
                    delayJoiningUntilNanotime = Runtime.NanoTime() + TimeUnit.Milliseconds.ToNanos(millisToCede
                                                                                                   );
                    elector.QuitElection(needFence);
                    serviceState = HAServiceProtocol.HAServiceState.Initializing;
                }
            }
            // On the cede path the recheck runs after both locks are released.
            RecheckElectability();
        }
Exemplo n.º 7
0
        /// <summary>Coordinate a graceful failover to the local node.</summary>
        /// <remarks>
        /// The failover proceeds in five phases:
        /// 1) Pre-flight checks via <c>CheckEligibleForFailover</c>.
        /// 2) Look up the current active node. If there is none, fail; if it is the
        /// local node, there is nothing to do and the method returns.
        /// 3) Ask the current active's ZKFC to cede the active role for twice the
        /// graceful-fence timeout (the value is logged in milliseconds).
        /// 4) Let the normal election path run in other threads and wait, for the
        /// cede period plus 60000, for an active attempt to be recorded.
        /// 5) Tell the old active it may rejoin the election (<c>CedeActive(-1)</c>),
        /// so that a quick fail-back is possible, then report the attempt's outcome.
        /// </remarks>
        /// <exception cref="Org.Apache.Hadoop.HA.ServiceFailedException">
        /// if no node is active, no active attempt was recorded in time, or the
        /// recorded attempt did not succeed
        /// </exception>
        /// <exception cref="System.IO.IOException"/>
        /// <exception cref="System.Exception"/>
        private void DoGracefulFailover()
        {
            int cedeMillis = FailoverController.GetGracefulFenceTimeout(conf) * 2;

            // Phase 1: pre-flight checks.
            CheckEligibleForFailover();

            // Phase 2: find the current active and rule out the trivial cases.
            HAServiceTarget currentActive = GetCurrentActive();
            if (currentActive == null)
            {
                // Nobody is active. If a normal election has not already made us
                // active, something is probably preventing it.
                throw new ServiceFailedException("No other node is currently active.");
            }
            if (currentActive.GetAddress().Equals(localTarget.GetAddress()))
            {
                Log.Info("Local node " + localTarget + " is already active. " + "No need to failover. Returning success.");
                return;
            }

            // Phase 3: ask the current active to step out of the election.
            Log.Info("Asking " + currentActive + " to cede its active state for " + cedeMillis + "ms");
            ZKFCProtocol activeZkfc = currentActive.GetZKFCProxy(conf, cedeMillis);
            activeZkfc.CedeActive(cedeMillis);

            // Phase 4: wait for the regular election path to try making us active.
            ZKFailoverController.ActiveAttemptRecord outcome = WaitForActiveAttempt(cedeMillis + 60000);
            if (outcome == null)
            {
                // No attempt to become active was recorded at all.
                lock (this)
                {
                    if (lastHealthState != HealthMonitor.State.ServiceHealthy)
                    {
                        throw new ServiceFailedException("Unable to become active. " + "Service became unhealthy while trying to failover.");
                    }
                }
                throw new ServiceFailedException("Unable to become active. " + "Local node did not get an opportunity to do so from ZooKeeper, "
                                                 + "or the local node took too long to transition to active.");
            }

            // Phase 5: some attempt was made, so the old active may rejoin. This
            // permits a quick fail-back should we crash immediately.
            activeZkfc.CedeActive(-1);
            if (!outcome.succeeded)
            {
                // Propagate the failure recorded by the election path.
                string failureMessage = "Failed to become active. " + outcome.status;
                throw new ServiceFailedException(failureMessage);
            }
            Log.Info("Successfully became active. " + outcome.status);
        }
Exemplo n.º 8
0
 /// <summary>
 /// Attempts to transition the local target to the active state and records
 /// the outcome as an active attempt.
 /// </summary>
 /// <remarks>
 /// On success the service state is set to Active and a successful attempt is
 /// recorded. On any failure the error is logged at fatal level, an
 /// unsuccessful attempt (including the stringified exception) is recorded,
 /// and the failure is surfaced to the caller as a
 /// <c>ServiceFailedException</c>.
 /// </remarks>
 /// <exception cref="Org.Apache.Hadoop.HA.ServiceFailedException">
 /// if the transition fails; an original <c>ServiceFailedException</c> is
 /// rethrown as-is, any other exception is wrapped
 /// </exception>
 private void BecomeActive()
 {
     lock (this)
     {
         Log.Info("Trying to make " + localTarget + " active...");
         try
         {
             HAServiceProtocolHelper.TransitionToActive(localTarget.GetProxy(conf, FailoverController
                                                                             .GetRpcTimeoutToNewActive(conf)), CreateReqInfo());
             string msg = "Successfully transitioned " + localTarget + " to active state";
             Log.Info(msg);
             serviceState = HAServiceProtocol.HAServiceState.Active;
             RecordActiveAttempt(new ZKFailoverController.ActiveAttemptRecord(true, msg));
         }
         catch (Exception t)
         {
             string msg = "Couldn't make " + localTarget + " active";
             Log.Fatal(msg, t);
             RecordActiveAttempt(new ZKFailoverController.ActiveAttemptRecord(false, msg + "\n"
                                                                              + StringUtils.StringifyException(t)));
             if (t is ServiceFailedException)
             {
                 // Bare rethrow: "throw (ServiceFailedException)t;" would reset the
                 // stack trace to this line and hide where the failure originated.
                 throw;
             }

             // Anything else is wrapped so callers only see ServiceFailedException.
             throw new ServiceFailedException("Couldn't transition to active", t);
         }
     }
 }