/// <summary>
/// Fencing method used by tests: records the call and always reports failure.
/// </summary>
/// <param name="target">service the caller asked to fence; stored in <c>fencedSvc</c></param>
/// <param name="args">argument string for this method; appended to <c>callArgs</c></param>
/// <returns>always <c>false</c></returns>
public virtual bool TryFence(HAServiceTarget target, string args)
{
    // Remember what we were asked to do so the caller can inspect it afterwards.
    fencedSvc = target;
    callArgs.AddItem(args);
    fenceCalled += 1;

    return false;
}
/// <summary>
/// Fences the given target. A graceful transition to standby is attempted
/// first; only if that fails is the target's configured fencer invoked.
/// </summary>
/// <param name="target">the service to fence</param>
/// <exception cref="RuntimeException">
/// if the target's fencing configuration is rejected by
/// <c>CheckFencingConfigured</c>, or if the fencer reports failure
/// </exception>
private void DoFence(HAServiceTarget target)
{
    Log.Info("Should fence: " + target);

    FailoverController controller = new FailoverController(
        conf, HAServiceProtocol.RequestSource.RequestByZkfc);
    if (controller.TryGracefulFence(target))
    {
        // It's possible that it's in standby but just about to go into active,
        // no? Is there some race here?
        Log.Info("Successfully transitioned " + target + " to standby " + "state without fencing");
        return;
    }

    // Graceful path failed: make sure a fencer is actually configured before using it.
    try
    {
        target.CheckFencingConfigured();
    }
    catch (BadFencingConfigurationException badConfig)
    {
        Log.Error("Couldn't fence old active " + target, badConfig);
        RecordActiveAttempt(new ZKFailoverController.ActiveAttemptRecord(
            false, "Unable to fence old active"));
        throw new RuntimeException(badConfig);
    }

    bool fenced = target.GetFencer().Fence(target);
    if (!fenced)
    {
        throw new RuntimeException("Unable to fence " + target);
    }
}
// Nothing else we can really check without actually running the command
/// <summary>
/// Runs <paramref name="cmd"/> as a fencing action in a child process and
/// reports whether it exited with status 0.
/// </summary>
/// <remarks>
/// The command is launched through <c>bash -e -c</c>, or through
/// <c>cmd.exe /c</c> when <c>Shell.Windows</c> is set. The child's environment
/// is populated by <c>SetConfAsEnvVars</c> and <c>AddTargetInfoAsEnvVars</c>,
/// and its stdout/stderr are forwarded to the log through
/// <c>StreamPumper</c> instances.
/// </remarks>
/// <param name="target">service being fenced; its fencing parameters are exported to the child</param>
/// <param name="cmd">the command line to execute</param>
/// <returns>
/// <c>true</c> if the process exit code was 0; <c>false</c> if the process could
/// not be started, waiting for it failed, or it exited non-zero
/// </returns>
public virtual bool TryFence(HAServiceTarget target, string cmd)
{
    ProcessStartInfo startInfo;
    if (Shell.Windows)
    {
        startInfo = new ProcessStartInfo("cmd.exe", "/c", cmd);
    }
    else
    {
        startInfo = new ProcessStartInfo("bash", "-e", "-c", cmd);
    }
    SetConfAsEnvVars(startInfo.EnvironmentVariables);
    AddTargetInfoAsEnvVars(target, startInfo.EnvironmentVariables);

    SystemProcess process;
    try
    {
        process = startInfo.Start();
        // Nothing is written to the child; close the stream we hold to it right away.
        process.GetOutputStream().Close();
    }
    catch (IOException e)
    {
        Log.Warn("Unable to execute " + cmd, e);
        return false;
    }

    string pid = TryGetPid(process);
    string pidDescription = (pid != null) ? ("pid " + pid) : "unknown pid";
    Log.Info("Launched fencing command '" + cmd + "' with " + pidDescription);

    string logPrefix = Abbreviate(cmd, AbbrevLength);
    if (pid != null)
    {
        logPrefix = "[PID " + pid + "] " + logPrefix;
    }

    // Pump logs to stderr
    StreamPumper errPumper = new StreamPumper(
        Log, logPrefix, process.GetErrorStream(), StreamPumper.StreamType.Stderr);
    errPumper.Start();
    StreamPumper outPumper = new StreamPumper(
        Log, logPrefix, process.GetInputStream(), StreamPumper.StreamType.Stdout);
    outPumper.Start();

    int exitCode;
    try
    {
        exitCode = process.WaitFor();
        // Let both pumpers drain before reporting the result.
        errPumper.Join();
        outPumper.Join();
    }
    catch (Exception)
    {
        Log.Warn("Interrupted while waiting for fencing command: " + cmd);
        return false;
    }

    return exitCode == 0;
}
/// <summary>
/// Handles the <c>-transitionToActive</c> command: validates the arguments,
/// optionally checks that no other target is already active, and then asks
/// the named target to become active.
/// </summary>
/// <param name="cmd">parsed command line; must carry exactly one positional argument (the target id)</param>
/// <returns>0 on success, -1 if the request was rejected</returns>
/// <exception cref="System.IO.IOException"/>
/// <exception cref="Org.Apache.Hadoop.HA.ServiceFailedException"/>
private int TransitionToActive(CommandLine cmd)
{
    string[] argv = cmd.GetArgs();
    if (argv.Length != 1)
    {
        errOut.WriteLine("transitionToActive: incorrect number of arguments");
        PrintUsage(errOut, "-transitionToActive");
        return -1;
    }

    // Unless the force-active option was given, refuse when the helper reports
    // that another target is active (or that it hit an error while checking).
    bool forceActive = cmd.HasOption(Forceactive);
    if (!forceActive && IsOtherTargetNodeActive(argv[0], forceActive))
    {
        return -1;
    }

    HAServiceTarget target = ResolveTarget(argv[0]);
    if (!CheckManualStateManagementOK(target))
    {
        return -1;
    }

    HAServiceProtocol proto = target.GetProxy(GetConf(), 0);
    HAServiceProtocolHelper.TransitionToActive(proto, CreateReqInfo());
    return 0;
}
/// <summary>Try to gracefully move the given service to standby.</summary>
/// <remarks>
/// Opens a proxy to the service using <c>gracefulFenceConf</c> and
/// <c>gracefulFenceTimeout</c> and asks it to transition to standby. The proxy
/// is always stopped before returning. Failures are logged as warnings and
/// reported through the return value rather than thrown.
/// NOTE(review): the previous comment described this call as "quick" (short
/// timeout, no retries); that depends on how the two graceful-fence fields
/// are initialised, which is not visible here - confirm.
/// </remarks>
/// <param name="svc">the service to transition to standby</param>
/// <returns>
/// <c>true</c> if the transition request completed; <c>false</c> if it raised a
/// <c>ServiceFailedException</c> or an <c>IOException</c>
/// </returns>
internal virtual bool TryGracefulFence(HAServiceTarget svc)
{
    HAServiceProtocol standbyProxy = null;
    bool madeStandby = false;
    try
    {
        standbyProxy = svc.GetProxy(gracefulFenceConf, gracefulFenceTimeout);
        standbyProxy.TransitionToStandby(CreateReqInfo());
        madeStandby = true;
    }
    catch (ServiceFailedException sfe)
    {
        Log.Warn("Unable to gracefully make " + svc + " standby (" + sfe.Message + ")");
    }
    catch (IOException ioe)
    {
        Log.Warn("Unable to gracefully make " + svc + " standby (unable to connect)", ioe);
    }
    finally
    {
        if (standbyProxy != null)
        {
            RPC.StopProxy(standbyProxy);
        }
    }
    return madeStandby;
}
/// <summary>
/// Tries each configured fencing method in order until one succeeds.
/// </summary>
/// <remarks>
/// A method that throws (misconfiguration or any other exception) is logged
/// and skipped; the next method is then tried.
/// </remarks>
/// <param name="fromSvc">the service to fence</param>
/// <returns><c>true</c> as soon as one method reports success; <c>false</c> if none did</returns>
public virtual bool Fence(HAServiceTarget fromSvc)
{
    Log.Info("====== Beginning Service Fencing Process... ======");

    int attempt = 0;
    foreach (NodeFencer.FenceMethodWithArg candidate in methods)
    {
        attempt++;
        Log.Info("Trying method " + attempt + "/" + methods.Count + ": " + candidate);
        try
        {
            bool fenced = candidate.method.TryFence(fromSvc, candidate.arg);
            if (fenced)
            {
                Log.Info("====== Fencing successful by method " + candidate + " ======");
                return true;
            }
        }
        catch (BadFencingConfigurationException e)
        {
            Log.Error("Fencing method " + candidate + " misconfigured", e);
            continue;
        }
        catch (Exception t)
        {
            Log.Error("Fencing method " + candidate + " failed with an unexpected error.", t);
            continue;
        }

        // Reached only when the method ran to completion and returned false.
        Log.Warn("Fencing method " + candidate + " was unsuccessful.");
    }

    Log.Error("Unable to fence service by any configured method.");
    return false;
}
/// <summary>
/// Runs a failover from <paramref name="tgt1"/> to <paramref name="tgt2"/> using a
/// fresh <c>FailoverController</c> whose request source is <c>RequestByUser</c>.
/// </summary>
/// <param name="tgt1">service to fail over from</param>
/// <param name="tgt2">service to fail over to</param>
/// <param name="forceFence">passed through to <c>FailoverController.Failover</c></param>
/// <param name="forceActive">passed through to <c>FailoverController.Failover</c></param>
/// <exception cref="Org.Apache.Hadoop.HA.FailoverFailedException"/>
private void DoFailover(HAServiceTarget tgt1, HAServiceTarget tgt2, bool forceFence, bool forceActive)
{
    new FailoverController(conf, HAServiceProtocol.RequestSource.RequestByUser)
        .Failover(tgt1, tgt2, forceFence, forceActive);
}
/// <summary>
/// Handles the <c>-failover</c> command: validates the arguments, resolves the
/// two targets and performs the failover, either through the ZKFCs (when
/// auto-failover is enabled) or directly through a <c>FailoverController</c>.
/// </summary>
/// <param name="cmd">parsed command line; expects two positional arguments (from, to) and at most three options</param>
/// <returns>0 on success, -1 on bad arguments or a failed failover</returns>
/// <exception cref="System.IO.IOException"/>
/// <exception cref="Org.Apache.Hadoop.HA.ServiceFailedException"/>
private int Failover(CommandLine cmd)
{
    bool forceFence = cmd.HasOption(Forcefence);
    bool forceActive = cmd.HasOption(Forceactive);

    int numOpts = 0;
    if (cmd.GetOptions() != null)
    {
        numOpts = cmd.GetOptions().Length;
    }
    string[] args = cmd.GetArgs();

    if (numOpts > 3 || args.Length != 2)
    {
        errOut.WriteLine("failover: incorrect arguments");
        PrintUsage(errOut, "-failover");
        return -1;
    }

    HAServiceTarget fromNode = ResolveTarget(args[0]);
    HAServiceTarget toNode = ResolveTarget(args[1]);

    // Check that auto-failover is consistently configured for both nodes.
    Preconditions.CheckState(
        fromNode.IsAutoFailoverEnabled() == toNode.IsAutoFailoverEnabled(),
        "Inconsistent auto-failover configs between %s and %s!", fromNode, toNode);

    if (fromNode.IsAutoFailoverEnabled())
    {
        if (forceFence || forceActive)
        {
            // -forceActive doesn't make sense with auto-HA, since, if the node
            // is not healthy, then its ZKFC will immediately quit the election
            // again the next time a health check runs.
            //
            // -forceFence doesn't seem to have any real use cases with auto-HA
            // so it isn't implemented.
            errOut.WriteLine(Forcefence + " and " + Forceactive + " flags not " + "supported with auto-failover enabled.");
            return -1;
        }

        try
        {
            return GracefulFailoverThroughZKFCs(toNode);
        }
        catch (NotSupportedException e)
        {
            errOut.WriteLine("Failover command is not supported with " + "auto-failover enabled: " + e.GetLocalizedMessage());
            return -1;
        }
    }

    // Manual path: drive the failover ourselves.
    FailoverController controller = new FailoverController(GetConf(), requestSource);
    try
    {
        controller.Failover(fromNode, toNode, forceFence, forceActive);
        @out.WriteLine("Failover from " + args[0] + " to " + args[1] + " successful");
    }
    catch (FailoverFailedException ffe)
    {
        errOut.WriteLine("Failover failed: " + ffe.GetLocalizedMessage());
        return -1;
    }
    return 0;
}
/// <summary>
/// Add information about the target to the environment of the subprocess.
/// </summary>
/// <remarks>
/// Every fencing parameter of the target is written to
/// <paramref name="environment"/> under the key <c>TargetPrefix</c> + parameter
/// name, with each '.' in the resulting key replaced by '_'.
/// </remarks>
/// <param name="target">service whose fencing parameters are exported</param>
/// <param name="environment">environment map to populate; existing entries with the same key are overwritten</param>
private void AddTargetInfoAsEnvVars(HAServiceTarget target, IDictionary<string, string> environment)
{
    foreach (KeyValuePair<string, string> parameter in target.GetFencingParameters())
    {
        string variableName = (TargetPrefix + parameter.Key).Replace('.', '_');
        environment[variableName] = parameter.Value;
    }
}
/// <summary>
/// Resets the recorded state of both test fencers and installs a fresh mock
/// <c>HAServiceTarget</c> in <c>MockTarget</c>.
/// </summary>
/// <remarks>
/// The mock is stubbed so that <c>ToString()</c> returns "my mock" and
/// <c>GetAddress()</c> returns an endpoint for host "host", port 1234.
/// </remarks>
public virtual void ClearMockState()
{
    // Reset call counters and captured arguments on both fencers.
    TestNodeFencer.AlwaysSucceedFencer.fenceCalled = 0;
    TestNodeFencer.AlwaysSucceedFencer.callArgs.Clear();
    TestNodeFencer.AlwaysFailFencer.fenceCalled = 0;
    TestNodeFencer.AlwaysFailFencer.callArgs.Clear();

    // Build a new mock target and stub the two members that are read from it.
    HAServiceTarget freshMock = Org.Mockito.Mockito.Mock<HAServiceTarget>();
    MockTarget = freshMock;
    Org.Mockito.Mockito.DoReturn("my mock").When(freshMock).ToString();
    Org.Mockito.Mockito.DoReturn(new IPEndPoint("host", 1234)).When(freshMock).GetAddress();
}
/// <summary>
/// Creates a monitor for the given target, reading its timing settings from
/// the configuration and creating (but not starting) the monitor daemon.
/// </summary>
/// <param name="conf">
/// configuration supplying the sleep-after-disconnect, check-interval,
/// connect-retry-interval and RPC-timeout values (each with a default)
/// </param>
/// <param name="target">the service to monitor</param>
internal HealthMonitor(Configuration conf, HAServiceTarget target)
{
    this.targetToMonitor = target;
    this.conf = conf;

    this.sleepAfterDisconnectMillis = conf.GetLong(
        HaHmSleepAfterDisconnectKey, HaHmSleepAfterDisconnectDefault);
    this.checkIntervalMillis = conf.GetLong(
        HaHmCheckIntervalKey, HaHmCheckIntervalDefault);
    this.connectRetryInterval = conf.GetLong(
        HaHmConnectRetryIntervalKey, HaHmConnectRetryIntervalDefault);
    this.rpcTimeout = conf.GetInt(
        HaHmRpcTimeoutKey, HaHmRpcTimeoutDefault);

    this.daemon = new HealthMonitor.MonitorDaemon(this);
}
/// <summary>
/// Perform pre-failover checks on the given service we plan to
/// failover to, eg to prevent failing over to a service (eg due
/// to it being inaccessible, already active, not healthy, etc).
/// </summary>
/// <remarks>
/// An option to ignore toSvc if it claims it is not ready to
/// become active is provided in case performing a failover will
/// allow it to become active, eg because it triggers a log roll
/// so the standby can learn about new blocks and leave safemode.
/// </remarks>
/// <param name="from">currently active service</param>
/// <param name="target">service to make active</param>
/// <param name="forceActive">ignore toSvc if it reports that it is not ready</param>
/// <exception cref="Org.Apache.Hadoop.HA.FailoverFailedException">if we should avoid failover</exception>
private void PreFailoverChecks(HAServiceTarget from, HAServiceTarget target, bool forceActive)
{
    if (from.GetAddress().Equals(target.GetAddress()))
    {
        throw new FailoverFailedException("Can't failover a service to itself");
    }

    // Fetch the candidate's current status; being unreachable is a failure.
    HAServiceProtocol toSvc;
    HAServiceStatus toSvcStatus;
    try
    {
        toSvc = target.GetProxy(conf, rpcTimeoutToNewActive);
        toSvcStatus = toSvc.GetServiceStatus();
    }
    catch (IOException e)
    {
        string msg = "Unable to get service state for " + target;
        Log.Error(msg + ": " + e.GetLocalizedMessage());
        throw new FailoverFailedException(msg, e);
    }

    // Any state other than standby is rejected.
    if (!toSvcStatus.GetState().Equals(HAServiceProtocol.HAServiceState.Standby))
    {
        throw new FailoverFailedException("Can't failover to an active service");
    }

    if (!toSvcStatus.IsReadyToBecomeActive())
    {
        string notReadyReason = toSvcStatus.GetNotReadyReason();
        if (forceActive)
        {
            Log.Warn("Service is not ready to become active, but forcing: " + notReadyReason);
        }
        else
        {
            throw new FailoverFailedException(target + " is not ready to become active: " + notReadyReason);
        }
    }

    try
    {
        HAServiceProtocolHelper.MonitorHealth(toSvc, CreateReqInfo());
    }
    catch (HealthCheckFailedException hce)
    {
        throw new FailoverFailedException("Can't failover to an unhealthy service", hce);
    }
    catch (IOException e)
    {
        throw new FailoverFailedException("Got an IO exception", e);
    }
}
/// <summary>Initiate a graceful failover by talking to the target node's ZKFC.</summary>
/// <remarks>
/// Obtains a ZKFC proxy for <paramref name="toNode"/> (using the
/// RPC-timeout-to-new-active setting from the configuration) and calls
/// <c>GracefulFailover()</c> on it. A <c>ServiceFailedException</c> is reported
/// on the error stream and turned into a -1 return value.
/// </remarks>
/// <param name="toNode">the node to fail to</param>
/// <returns>status code (0 for success, -1 if the ZKFC reported a service failure)</returns>
/// <exception cref="System.IO.IOException"/>
private int GracefulFailoverThroughZKFCs(HAServiceTarget toNode)
{
    int rpcTimeout = FailoverController.GetRpcTimeoutToNewActive(GetConf());
    ZKFCProtocol zkfc = toNode.GetZKFCProxy(GetConf(), rpcTimeout);
    try
    {
        zkfc.GracefulFailover();
        @out.WriteLine("Failover to " + toNode + " successful");
        return 0;
    }
    catch (ServiceFailedException sfe)
    {
        errOut.WriteLine("Failover failed: " + sfe.GetLocalizedMessage());
        return -1;
    }
}
/// <summary>
/// Decodes the previous active node from <paramref name="data"/> and fences it.
/// </summary>
/// <remarks>
/// Runs while holding the lock on this instance. If fencing throws, a failed
/// <c>ActiveAttemptRecord</c> containing the stringified exception is recorded
/// and the exception is handed to <c>Throwables.Propagate</c>.
/// </remarks>
/// <param name="data">serialized form of the old active target, as understood by <c>DataToTarget</c></param>
private void FenceOldActive(byte[] data)
{
    lock (this)
    {
        HAServiceTarget oldActive = DataToTarget(data);
        try
        {
            DoFence(oldActive);
        }
        catch (Exception failure)
        {
            string status = "Unable to fence old active: " + StringUtils.StringifyException(failure);
            RecordActiveAttempt(new ZKFailoverController.ActiveAttemptRecord(false, status));
            Throwables.Propagate(failure);
        }
    }
}
/// <summary>
/// Fencing method for <c>DummyHAService</c> targets: counts the call and either
/// simulates a failure or releases the service's shared resource.
/// </summary>
/// <param name="target">service to fence; must be a <c>DummyHAService</c> (it is cast unconditionally)</param>
/// <param name="args">unused</param>
/// <returns><c>false</c> when the service has <c>failToFence</c> set, otherwise <c>true</c></returns>
/// <exception cref="Org.Apache.Hadoop.HA.BadFencingConfigurationException"/>
public virtual bool TryFence(HAServiceTarget target, string args)
{
    Log.Info("tryFence(" + target + ")");

    DummyHAService dummy = (DummyHAService)target;
    lock (dummy)
    {
        dummy.fenceCount += 1;
    }

    if (!dummy.failToFence)
    {
        dummy.sharedResource.Release(dummy);
        return true;
    }

    Log.Info("Injected failure to fence");
    return false;
}
/// <summary>
/// Ensure that we are allowed to manually manage the HA state of the target
/// service.
/// </summary>
/// <remarks>
/// If automatic failover is configured, then the automatic
/// failover controllers should be doing state management, and it is generally
/// an error to use the HAAdmin command line to do so. In that case the request
/// is refused unless the request source is <c>RequestByUserForced</c>, in which
/// case a warning is logged and the request is allowed.
/// </remarks>
/// <param name="target">the target to check</param>
/// <returns>true if manual state management is allowed</returns>
private bool CheckManualStateManagementOK(HAServiceTarget target)
{
    // Nothing to guard against when auto-failover is off.
    if (!target.IsAutoFailoverEnabled())
    {
        return true;
    }

    if (requestSource == HAServiceProtocol.RequestSource.RequestByUserForced)
    {
        Log.Warn("Proceeding with manual HA state management even though\n" + "automatic failover is enabled for " + target);
        return true;
    }

    errOut.WriteLine("Automatic failover is enabled for " + target + "\n" + "Refusing to manually manage HA state, since it may cause\n" + "a split-brain scenario or other incorrect state.\n" + "If you are very sure you know what you are doing, please \n" + "specify the --" + Forcemanual + " flag.");
    return false;
}
/// <summary>
/// Handles the <c>-transitionToStandby</c> command: validates the arguments,
/// checks that manual state management is permitted, and asks the named
/// target to become standby.
/// </summary>
/// <param name="cmd">parsed command line; must carry exactly one positional argument (the target id)</param>
/// <returns>0 on success, -1 if the request was rejected</returns>
/// <exception cref="System.IO.IOException"/>
/// <exception cref="Org.Apache.Hadoop.HA.ServiceFailedException"/>
private int TransitionToStandby(CommandLine cmd)
{
    string[] argv = cmd.GetArgs();
    bool hasSingleTarget = argv.Length == 1;
    if (!hasSingleTarget)
    {
        errOut.WriteLine("transitionToStandby: incorrect number of arguments");
        PrintUsage(errOut, "-transitionToStandby");
        return -1;
    }

    HAServiceTarget target = ResolveTarget(argv[0]);
    if (!CheckManualStateManagementOK(target))
    {
        return -1;
    }

    HAServiceProtocol proto = target.GetProxy(GetConf(), 0);
    HAServiceProtocolHelper.TransitionToStandby(proto, CreateReqInfo());
    return 0;
}
/// <summary>
/// Fences the target by opening an SSH session to the host of its address and
/// running the fencing procedure (<c>DoFence</c>) over that session.
/// </summary>
/// <param name="target">service to fence; its address supplies the SSH host</param>
/// <param name="argsStr">argument string parsed into <c>SshFenceByTcpPort.Args</c></param>
/// <returns>
/// the result of <c>DoFence</c>; <c>false</c> if the session could not be created
/// or connected, or if fencing raised a <c>JSchException</c>
/// </returns>
/// <exception cref="Org.Apache.Hadoop.HA.BadFencingConfigurationException"/>
public virtual bool TryFence(HAServiceTarget target, string argsStr)
{
    SshFenceByTcpPort.Args args = new SshFenceByTcpPort.Args(argsStr);
    IPEndPoint serviceAddr = target.GetAddress();
    string host = serviceAddr.GetHostName();

    Session session;
    try
    {
        session = CreateSession(serviceAddr.GetHostName(), args);
    }
    catch (JSchException e)
    {
        Log.Warn("Unable to create SSH session", e);
        return false;
    }

    Log.Info("Connecting to " + host + "...");
    try
    {
        session.Connect(GetSshConnectTimeout());
    }
    catch (JSchException e)
    {
        Log.Warn("Unable to connect to " + host + " as user " + args.user, e);
        return false;
    }
    Log.Info("Connected to " + host);

    // From here on the session is connected, so always disconnect on the way out.
    try
    {
        bool fenced = DoFence(session, serviceAddr);
        return fenced;
    }
    catch (JSchException e)
    {
        Log.Warn("Unable to achieve fencing on remote host", e);
        return false;
    }
    finally
    {
        session.Disconnect();
    }
}
/// <summary>Checks whether any other target node is active.</summary>
/// <remarks>
/// Every target id returned by <c>GetTargetIds</c> other than
/// <paramref name="targetNodeToActivate"/> is resolved and queried with a 5000
/// timeout value. Exceptions raised while querying a node are ignored when
/// <paramref name="forceActive"/> is set.
/// </remarks>
/// <param name="targetNodeToActivate">id of the node that is about to be made active; excluded from the check</param>
/// <param name="forceActive">when <c>true</c>, errors while querying another node do not block the transition</param>
/// <returns>
/// <c>true</c> if manual state management is not allowed for another node,
/// if another node is active, or if querying a node failed while
/// <paramref name="forceActive"/> was not set; otherwise <c>false</c>
/// </returns>
/// <exception cref="System.IO.IOException"/>
private bool IsOtherTargetNodeActive(string targetNodeToActivate, bool forceActive)
{
    ICollection<string> otherIds = GetTargetIds(targetNodeToActivate);
    otherIds.Remove(targetNodeToActivate);

    foreach (string otherId in otherIds)
    {
        HAServiceTarget other = ResolveTarget(otherId);
        if (!CheckManualStateManagementOK(other))
        {
            return true;
        }

        try
        {
            HAServiceProtocol proto = other.GetProxy(GetConf(), 5000);
            bool isActive = proto.GetServiceStatus().GetState() == HAServiceProtocol.HAServiceState.Active;
            if (isActive)
            {
                errOut.WriteLine("transitionToActive: Node " + otherId + " is already active");
                PrintUsage(errOut, "-transitionToActive");
                return true;
            }
        }
        catch (Exception e)
        {
            // With forceActive set, a node we cannot query is simply skipped.
            if (forceActive)
            {
                continue;
            }
            errOut.WriteLine("Unexpected error occurred " + e.Message);
            PrintUsage(errOut, "-transitionToActive");
            return true;
        }
    }
    return false;
}
/// <summary>
/// Looks up the currently active node from the elector's active data.
/// </summary>
/// <remarks>
/// Holds the lock on <c>elector</c> and then on this instance for the duration
/// of the lookup.
/// </remarks>
/// <returns>
/// an <see cref="HAServiceTarget"/> for the current active node
/// in the cluster, or null if no node is active.
/// </returns>
/// <exception cref="System.IO.IOException">if a ZK-related issue occurs</exception>
/// <exception cref="System.Exception">if thread is interrupted</exception>
private HAServiceTarget GetCurrentActive()
{
    lock (elector)
    {
        lock (this)
        {
            byte[] activeData;
            try
            {
                activeData = elector.GetActiveData();
            }
            catch (ActiveStandbyElector.ActiveNotFoundException)
            {
                // No active node recorded.
                return null;
            }
            catch (KeeperException ke)
            {
                throw new IOException("Unexpected ZooKeeper issue fetching active node info", ke);
            }

            return DataToTarget(activeData);
        }
    }
}
/// <summary>
/// Encodes a <c>DummyHAService</c> target as the byte-array form of its
/// <c>index</c> field.
/// </summary>
/// <param name="target">the target to encode; must be a <c>DummyHAService</c> (it is cast unconditionally)</param>
/// <returns>the bytes produced by <c>Ints.ToByteArray</c> for the service's index</returns>
protected internal override byte[] TargetToData(HAServiceTarget target)
{
    DummyHAService dummy = (DummyHAService)target;
    return Ints.ToByteArray(dummy.index);
}
/// <summary>
/// Creates the generated monitor subclass, forwarding the two base arguments
/// to the base-class constructor and keeping a reference to the enclosing
/// <c>TestHealthMonitor</c> instance.
/// </summary>
/// <param name="_enclosing">the enclosing test instance, stored in <c>_enclosing</c></param>
/// <param name="baseArg1">configuration passed unchanged to the base constructor</param>
/// <param name="baseArg2">target passed unchanged to the base constructor</param>
public _HealthMonitor_60(TestHealthMonitor _enclosing, Configuration baseArg1, HAServiceTarget baseArg2) : base(baseArg1, baseArg2) { this._enclosing = _enclosing; }
/// <summary>
/// Converts the given target into a byte array. Implemented by subclasses.
/// </summary>
/// <remarks>
/// NOTE(review): presumably the inverse of <c>DataToTarget</c>, which other
/// methods in this file use to decode a byte array back into a target -
/// confirm against the implementations.
/// </remarks>
/// <param name="target">the target to encode</param>
/// <returns>the byte-array representation of <paramref name="target"/></returns>
protected internal abstract byte[] TargetToData(HAServiceTarget target);
/// <summary>Coordinate a graceful failover.</summary>
/// <remarks>
/// This proceeds in several phases:
/// 1) Pre-flight checks: ensure that the local node is healthy, and
/// thus a candidate for failover.
/// 2) Determine the current active node. If it is the local node, no
/// need to failover - return success.
/// 3) Ask that node to yield from the election for a number of seconds.
/// 4) Allow the normal election path to run in other threads. Wait until
/// we either become unhealthy or we see an election attempt recorded by
/// the normal code path.
/// 5) Allow the old active to rejoin the election, so a future
/// failback is possible.
/// </remarks>
/// <exception cref="Org.Apache.Hadoop.HA.ServiceFailedException">
/// if no node is currently active, if no attempt to become active was
/// observed in time, or if the attempt failed
/// </exception>
/// <exception cref="System.IO.IOException"/>
/// <exception cref="System.Exception"/>
private void DoGracefulFailover()
{
    // The cede period is twice the configured graceful-fence timeout.
    int cedeTimeout = FailoverController.GetGracefulFenceTimeout(conf) * 2;

    // Phase 1: pre-flight checks
    CheckEligibleForFailover();

    // Phase 2: determine old/current active node. Check that we're not
    // ourselves active, etc.
    HAServiceTarget oldActive = GetCurrentActive();
    if (oldActive == null)
    {
        // No node is currently active. So, if we aren't already
        // active ourselves by means of a normal election, then there's
        // probably something preventing us from becoming active.
        throw new ServiceFailedException("No other node is currently active.");
    }

    bool localIsActive = oldActive.GetAddress().Equals(localTarget.GetAddress());
    if (localIsActive)
    {
        Log.Info("Local node " + localTarget + " is already active. " + "No need to failover. Returning success.");
        return;
    }

    // Phase 3: ask the old active to yield from the election.
    Log.Info("Asking " + oldActive + " to cede its active state for " + cedeTimeout + "ms");
    ZKFCProtocol oldZkfc = oldActive.GetZKFCProxy(conf, cedeTimeout);
    oldZkfc.CedeActive(cedeTimeout);

    // Phase 4: wait for the normal election to make the local node
    // active.
    ZKFailoverController.ActiveAttemptRecord attempt = WaitForActiveAttempt(cedeTimeout + 60000);
    if (attempt == null)
    {
        // We didn't even make an attempt to become active.
        lock (this)
        {
            if (lastHealthState != HealthMonitor.State.ServiceHealthy)
            {
                throw new ServiceFailedException("Unable to become active. " + "Service became unhealthy while trying to failover.");
            }
        }
        throw new ServiceFailedException("Unable to become active. " + "Local node did not get an opportunity to do so from ZooKeeper, " + "or the local node took too long to transition to active.");
    }

    // Phase 5. At this point, we made some attempt to become active. So we
    // can tell the old active to rejoin if it wants. This allows a quick
    // fail-back if we immediately crash.
    oldZkfc.CedeActive(-1);

    if (!attempt.succeeded)
    {
        // Propagate failure
        string msg = "Failed to become active. " + attempt.status;
        throw new ServiceFailedException(msg);
    }
    Log.Info("Successfully became active. " + attempt.status);
}
/// <summary>
/// Creates a failover controller for the given local target, storing the
/// supplied configuration and target for later use.
/// </summary>
/// <param name="conf">configuration, stored in <c>conf</c></param>
/// <param name="localTarget">the local service target, stored in <c>localTarget</c></param>
protected internal ZKFailoverController(Configuration conf, HAServiceTarget localTarget ) { this.localTarget = localTarget; this.conf = conf; }
/// <summary>Failover from service 1 to service 2.</summary>
/// <remarks>
/// If making <paramref name="toSvc"/> active fails and
/// <paramref name="fromSvc"/> was not fenced, a failback to
/// <paramref name="fromSvc"/> is attempted before the failure is reported.
/// </remarks>
/// <param name="fromSvc">currently active service</param>
/// <param name="toSvc">service to make active</param>
/// <param name="forceFence">to fence fromSvc even if not strictly necessary</param>
/// <param name="forceActive">try to make toSvc active even if it is not ready</param>
/// <exception cref="Org.Apache.Hadoop.HA.FailoverFailedException">if the failover fails</exception>
public virtual void Failover(HAServiceTarget fromSvc, HAServiceTarget toSvc, bool forceFence, bool forceActive)
{
    Preconditions.CheckArgument(fromSvc.GetFencer() != null, "failover requires a fencer");
    PreFailoverChecks(fromSvc, toSvc, forceActive);

    // Try to make fromSvc standby. Fencing is needed when that fails, or
    // when the user forced it.
    bool tryFence = !TryGracefulFence(fromSvc) || forceFence;

    // Fence fromSvc if it's required or forced by the user
    if (tryFence && !fromSvc.GetFencer().Fence(fromSvc))
    {
        throw new FailoverFailedException("Unable to fence " + fromSvc + ". Fencing failed.");
    }

    // Try to make toSvc active
    Exception cause = null;
    try
    {
        HAServiceProtocolHelper.TransitionToActive(
            toSvc.GetProxy(conf, rpcTimeoutToNewActive), CreateReqInfo());
    }
    catch (ServiceFailedException sfe)
    {
        Log.Error("Unable to make " + toSvc + " active (" + sfe.Message + "). Failing back.");
        cause = sfe;
    }
    catch (IOException ioe)
    {
        Log.Error("Unable to make " + toSvc + " active (unable to connect). Failing back.", ioe);
        cause = ioe;
    }

    if (cause == null)
    {
        // toSvc is now active; nothing more to do.
        return;
    }

    // We failed to make toSvc active
    string msg = "Unable to failover to " + toSvc;

    // Only try to failback if we didn't fence fromSvc
    if (!tryFence)
    {
        try
        {
            // Unconditionally fence toSvc in case it is still trying to
            // become active, eg we timed out waiting for its response.
            // Unconditionally force fromSvc to become active since it
            // was previously active when we initiated failover.
            Failover(toSvc, fromSvc, true, true);
        }
        catch (FailoverFailedException ffe)
        {
            msg += ". Failback to " + fromSvc + " failed (" + ffe.Message + ")";
            Log.Fatal(msg);
        }
    }

    throw new FailoverFailedException(msg, cause);
}