/// <summary>
/// Takes <paramref name="target"/> out of service: first tries a graceful fence
/// through a ZKFC-sourced <c>FailoverController</c>, and only if that does not
/// work falls back to the fencer configured on the target.
/// </summary>
/// <remarks>
/// When the target's fencing configuration is rejected, a failed active attempt
/// is recorded before the error is rethrown wrapped in a <c>RuntimeException</c>.
/// A fencer that reports failure also results in a <c>RuntimeException</c>.
/// </remarks>
/// <param name="target">the node to fence</param>
private void DoFence(HAServiceTarget target)
{
    Log.Info("Should fence: " + target);

    FailoverController gracefulFencer = new FailoverController(conf, HAServiceProtocol.RequestSource.RequestByZkfc);
    if (gracefulFencer.TryGracefulFence(target))
    {
        // Open question carried over from the original author: the target may be
        // in standby yet about to become active, so there may be a race here.
        Log.Info("Successfully transitioned " + target + " to standby " + "state without fencing");
        return;
    }

    // Graceful fencing did not work; make sure a fencer is usable before invoking it.
    try
    {
        target.CheckFencingConfigured();
    }
    catch (BadFencingConfigurationException e)
    {
        Log.Error("Couldn't fence old active " + target, e);
        RecordActiveAttempt(new ZKFailoverController.ActiveAttemptRecord(false, "Unable to fence old active"));
        throw new RuntimeException(e);
    }

    bool fenced = target.GetFencer().Fence(target);
    if (!fenced)
    {
        throw new RuntimeException("Unable to fence " + target);
    }
}
/// <summary>
/// Performs a failover by delegating to a <c>FailoverController</c> created with
/// the <c>RequestByUser</c> request source.
/// </summary>
/// <param name="tgt1">first target handed to <c>FailoverController.Failover</c></param>
/// <param name="tgt2">second target handed to <c>FailoverController.Failover</c></param>
/// <param name="forceFence">forwarded unchanged to the controller</param>
/// <param name="forceActive">forwarded unchanged to the controller</param>
/// <exception cref="Org.Apache.Hadoop.HA.FailoverFailedException"/>
private void DoFailover(HAServiceTarget tgt1, HAServiceTarget tgt2, bool forceFence, bool forceActive)
{
    new FailoverController(conf, HAServiceProtocol.RequestSource.RequestByUser)
        .Failover(tgt1, tgt2, forceFence, forceActive);
}
/// <summary>
/// Handles the <c>-failover</c> command: validates the command line, resolves
/// the two named targets and either runs a manual failover or, when
/// auto-failover is enabled, asks the destination's ZKFC to do it gracefully.
/// </summary>
/// <param name="cmd">parsed command line; exactly two positional arguments are required</param>
/// <returns>0 when the failover succeeded, -1 on bad arguments or failure</returns>
/// <exception cref="System.IO.IOException"/>
/// <exception cref="Org.Apache.Hadoop.HA.ServiceFailedException"/>
private int Failover(CommandLine cmd)
{
    bool forceFence = cmd.HasOption(Forcefence);
    bool forceActive = cmd.HasOption(Forceactive);

    int optionCount = 0;
    if (cmd.GetOptions() != null)
    {
        optionCount = cmd.GetOptions().Length;
    }
    string[] positional = cmd.GetArgs();

    bool argumentsOk = optionCount <= 3 && positional.Length == 2;
    if (!argumentsOk)
    {
        errOut.WriteLine("failover: incorrect arguments");
        PrintUsage(errOut, "-failover");
        return -1;
    }

    HAServiceTarget fromNode = ResolveTarget(positional[0]);
    HAServiceTarget toNode = ResolveTarget(positional[1]);

    // Both nodes must agree on whether auto-failover is in use.
    Preconditions.CheckState(fromNode.IsAutoFailoverEnabled() == toNode.IsAutoFailoverEnabled(),
        "Inconsistent auto-failover configs between %s and %s!", fromNode, toNode);

    if (!fromNode.IsAutoFailoverEnabled())
    {
        // Manual HA: drive the failover directly from this process.
        FailoverController controller = new FailoverController(GetConf(), requestSource);
        try
        {
            controller.Failover(fromNode, toNode, forceFence, forceActive);
            @out.WriteLine("Failover from " + positional[0] + " to " + positional[1] + " successful");
            return 0;
        }
        catch (FailoverFailedException ffe)
        {
            errOut.WriteLine("Failover failed: " + ffe.GetLocalizedMessage());
            return -1;
        }
    }

    if (forceFence || forceActive)
    {
        // -forceActive makes no sense with auto-HA: an unhealthy node's ZKFC
        // would quit the election again at the next health check.
        // -forceFence has no known use case with auto-HA, so it is not implemented.
        errOut.WriteLine(Forcefence + " and " + Forceactive + " flags not " + "supported with auto-failover enabled.");
        return -1;
    }

    try
    {
        return GracefulFailoverThroughZKFCs(toNode);
    }
    catch (NotSupportedException e)
    {
        errOut.WriteLine("Failover command is not supported with " + "auto-failover enabled: " + e.GetLocalizedMessage());
        return -1;
    }
}
/// <summary>Requests a graceful failover from the destination node's ZKFC.</summary>
/// <remarks>
/// Obtains a ZKFC proxy for <paramref name="toNode"/> using the
/// "RPC timeout to new active" value from the configuration and invokes
/// <c>GracefulFailover</c> on it. A <c>ServiceFailedException</c> is reported on
/// the error stream and turned into a -1 status rather than propagated.
/// </remarks>
/// <param name="toNode">the node to fail to</param>
/// <returns>0 on success, -1 if the ZKFC reported a service failure</returns>
/// <exception cref="System.IO.IOException">if failover does not succeed</exception>
private int GracefulFailoverThroughZKFCs(HAServiceTarget toNode)
{
    int rpcTimeout = FailoverController.GetRpcTimeoutToNewActive(GetConf());
    ZKFCProtocol zkfc = toNode.GetZKFCProxy(GetConf(), rpcTimeout);

    try
    {
        zkfc.GracefulFailover();
        @out.WriteLine("Failover to " + toNode + " successful");
        return 0;
    }
    catch (ServiceFailedException sfe)
    {
        errOut.WriteLine("Failover failed: " + sfe.GetLocalizedMessage());
        return -1;
    }
}
/// <summary>
/// Asks the local service to transition to standby and marks the tracked
/// service state as <c>Standby</c>.
/// </summary>
/// <remarks>
/// Runs while holding the lock on this instance. A failure of the transition
/// RPC is logged and otherwise ignored; the tracked state is set to
/// <c>Standby</c> whether or not the RPC succeeded.
/// </remarks>
private void BecomeStandby()
{
    lock (this)
    {
        Log.Info("ZK Election indicated that " + localTarget + " should become standby");

        try
        {
            int fenceTimeout = FailoverController.GetGracefulFenceTimeout(conf);
            localTarget.GetProxy(conf, fenceTimeout).TransitionToStandby(CreateReqInfo());
            Log.Info("Successfully transitioned " + localTarget + " to standby state");
        }
        catch (Exception e)
        {
            // TODO: handle this failure properly. It is a likely case, since we
            // were probably fenced at the same time.
            Log.Error("Couldn't transition " + localTarget + " to standby state", e);
        }

        serviceState = HAServiceProtocol.HAServiceState.Standby;
    }
}
/// <summary>
/// Gives up the active role for <paramref name="millisToCede"/> milliseconds,
/// or cancels a previous cede when the value is zero or negative.
/// </summary>
/// <remarks>
/// For a positive value: the local node is asked to go to standby, the join
/// delay is set, the elector is told to quit the election (flagging that
/// fencing is needed if the standby transition failed with an
/// <c>IOException</c>), and the tracked state becomes <c>Initializing</c>.
/// Electability is rechecked afterwards, outside both locks.
/// For a non-positive value: the join delay is cleared and electability is
/// rechecked while still holding both locks.
/// </remarks>
/// <param name="millisToCede">how long to stay out of the election, in milliseconds</param>
/// <exception cref="Org.Apache.Hadoop.Security.AccessControlException"/>
/// <exception cref="Org.Apache.Hadoop.HA.ServiceFailedException"/>
/// <exception cref="System.IO.IOException"/>
private void DoCedeActive(int millisToCede)
{
    int fenceTimeout = FailoverController.GetGracefulFenceTimeout(conf);

    // The elector is locked first to keep the lock ordering elector -> ZKFC.
    lock (elector)
    {
        lock (this)
        {
            if (millisToCede <= 0)
            {
                delayJoiningUntilNanotime = 0;
                RecheckElectability();
                return;
            }

            Log.Info("Requested by " + UserGroupInformation.GetCurrentUser() + " at " + Server.GetRemoteAddress() + " to cede active role.");

            bool fencingRequired = false;
            try
            {
                localTarget.GetProxy(conf, fenceTimeout).TransitionToStandby(CreateReqInfo());
                Log.Info("Successfully ensured local node is in standby mode");
            }
            catch (IOException ioe)
            {
                Log.Warn("Unable to transition local node to standby: " + ioe.GetLocalizedMessage());
                Log.Warn("Quitting election but indicating that fencing is " + "necessary");
                fencingRequired = true;
            }

            long startNanos = Runtime.NanoTime();
            delayJoiningUntilNanotime = startNanos + TimeUnit.Milliseconds.ToNanos(millisToCede);
            elector.QuitElection(fencingRequired);
            serviceState = HAServiceProtocol.HAServiceState.Initializing;
        }
    }

    RecheckElectability();
}
/// <summary>Coordinates a graceful failover to the local node.</summary>
/// <remarks>
/// The failover runs in five phases:
/// 1) Pre-flight checks via <c>CheckEligibleForFailover</c>.
/// 2) Look up the current active node. If there is none, fail; if it is the
///    local node, return immediately as success.
/// 3) Ask the current active to cede its active state for a timeout of twice
///    the graceful-fence timeout.
/// 4) Wait (timeout plus 60000) for the normal election path, running in
///    other threads, to record an attempt to become active.
/// 5) Tell the old active it may rejoin the election, so that a quick
///    fail-back is possible, then report the outcome of the attempt.
/// </remarks>
/// <exception cref="Org.Apache.Hadoop.HA.ServiceFailedException"/>
/// <exception cref="System.IO.IOException"/>
/// <exception cref="System.Exception"/>
private void DoGracefulFailover()
{
    int timeout = FailoverController.GetGracefulFenceTimeout(conf) * 2;

    // Phase 1: pre-flight checks.
    CheckEligibleForFailover();

    // Phase 2: find the current active and make sure it is not us.
    HAServiceTarget currentActive = GetCurrentActive();
    if (currentActive == null)
    {
        // Nobody is active. If a normal election has not already made us
        // active, something is probably preventing that.
        throw new ServiceFailedException("No other node is currently active.");
    }

    bool localIsActive = currentActive.GetAddress().Equals(localTarget.GetAddress());
    if (localIsActive)
    {
        Log.Info("Local node " + localTarget + " is already active. " + "No need to failover. Returning success.");
        return;
    }

    // Phase 3: ask the current active to yield from the election.
    Log.Info("Asking " + currentActive + " to cede its active state for " + timeout + "ms");
    ZKFCProtocol oldZkfc = currentActive.GetZKFCProxy(conf, timeout);
    oldZkfc.CedeActive(timeout);

    // Phase 4: let the normal election make the local node active.
    ZKFailoverController.ActiveAttemptRecord attempt = WaitForActiveAttempt(timeout + 60000);
    if (attempt == null)
    {
        // No attempt to become active was recorded at all.
        lock (this)
        {
            if (lastHealthState != HealthMonitor.State.ServiceHealthy)
            {
                throw new ServiceFailedException("Unable to become active. " + "Service became unhealthy while trying to failover.");
            }
        }

        throw new ServiceFailedException("Unable to become active. " + "Local node did not get an opportunity to do so from ZooKeeper, " + "or the local node took too long to transition to active.");
    }

    // Phase 5: some attempt was made, so the old active may rejoin if it
    // wants to. This allows a quick fail-back if we crash right away.
    oldZkfc.CedeActive(-1);

    if (!attempt.succeeded)
    {
        // Propagate the recorded failure to the caller.
        string failure = "Failed to become active. " + attempt.status;
        throw new ServiceFailedException(failure);
    }

    Log.Info("Successfully became active. " + attempt.status);
}
/// <summary>
/// Transitions the local service to active and records the outcome of the
/// attempt.
/// </summary>
/// <remarks>
/// Runs while holding the lock on this instance. On success the tracked
/// service state becomes <c>Active</c> and a successful attempt is recorded.
/// On any failure the error is logged at fatal level and a failed attempt
/// (including the stringified exception) is recorded before the error is
/// propagated.
/// </remarks>
/// <exception cref="Org.Apache.Hadoop.HA.ServiceFailedException">
/// if the transition fails; a failure of any other exception type is wrapped
/// in a new <c>ServiceFailedException</c> with the original as its cause.
/// </exception>
private void BecomeActive()
{
    lock (this)
    {
        Log.Info("Trying to make " + localTarget + " active...");
        try
        {
            HAServiceProtocolHelper.TransitionToActive(
                localTarget.GetProxy(conf, FailoverController.GetRpcTimeoutToNewActive(conf)),
                CreateReqInfo());
            string msg = "Successfully transitioned " + localTarget + " to active state";
            Log.Info(msg);
            serviceState = HAServiceProtocol.HAServiceState.Active;
            RecordActiveAttempt(new ZKFailoverController.ActiveAttemptRecord(true, msg));
        }
        catch (Exception t)
        {
            string msg = "Couldn't make " + localTarget + " active";
            Log.Fatal(msg, t);
            RecordActiveAttempt(new ZKFailoverController.ActiveAttemptRecord(false, msg + "\n" + StringUtils.StringifyException(t)));

            if (t is ServiceFailedException)
            {
                // Bare rethrow: propagates the same exception object without
                // resetting its stack trace, unlike "throw (ServiceFailedException)t;".
                throw;
            }

            throw new ServiceFailedException("Couldn't transition to active", t);
        }
    }
}