/// <summary>
/// Asks the local target to transition to the active state and records the
/// outcome of the attempt.
/// </summary>
/// <remarks>
/// On success, <c>serviceState</c> is set to Active and a successful
/// <c>ActiveAttemptRecord</c> is recorded. On any failure, the error is logged
/// at fatal level, a failed <c>ActiveAttemptRecord</c> (including the
/// stringified exception) is recorded, and the failure is propagated.
/// </remarks>
/// <exception cref="Org.Apache.Hadoop.HA.ServiceFailedException">
/// Thrown when the transition fails. A <c>ServiceFailedException</c> raised by
/// the transition is rethrown as-is; any other exception is wrapped in a new
/// <c>ServiceFailedException</c>.
/// </exception>
private void BecomeActive()
{
    lock (this)
    {
        Log.Info("Trying to make " + localTarget + " active...");
        try
        {
            HAServiceProtocolHelper.TransitionToActive(
                localTarget.GetProxy(conf, FailoverController.GetRpcTimeoutToNewActive(conf)),
                CreateReqInfo());
            string msg = "Successfully transitioned " + localTarget + " to active state";
            Log.Info(msg);
            serviceState = HAServiceProtocol.HAServiceState.Active;
            RecordActiveAttempt(new ZKFailoverController.ActiveAttemptRecord(true, msg));
        }
        catch (Exception t)
        {
            string msg = "Couldn't make " + localTarget + " active";
            Log.Fatal(msg, t);
            RecordActiveAttempt(new ZKFailoverController.ActiveAttemptRecord(
                false, msg + "\n" + StringUtils.StringifyException(t)));
            if (t is ServiceFailedException)
            {
                // Bare rethrow: propagates the same exception object without
                // resetting its stack trace (which "throw t;" would do).
                throw;
            }
            throw new ServiceFailedException("Couldn't transition to active", t);
        }
    }
}
/// <summary>
/// Creates a dummy HA service with the given initial state and address, and
/// registers it in the shared <c>instances</c> list.
/// </summary>
/// <param name="state">Initial HA state stored on this instance.</param>
/// <param name="address">
/// Address for the service. When <paramref name="testWithProtoBufRPC"/> is
/// true it is passed to <c>StartAndGetRPCServerAddress</c> and the returned
/// address is stored instead.
/// </param>
/// <param name="testWithProtoBufRPC">
/// Whether to obtain the address via <c>StartAndGetRPCServerAddress</c>.
/// </param>
internal DummyHAService(HAServiceProtocol.HAServiceState state, IPEndPoint address, bool testWithProtoBufRPC)
{
    this.state = state;
    this.testWithProtoBufRPC = testWithProtoBufRPC;
    this.address = testWithProtoBufRPC ? StartAndGetRPCServerAddress(address) : address;

    Configuration fencingConf = new Configuration();
    this.proxy = MakeMock(fencingConf, CommonConfigurationKeys.HaHmRpcTimeoutDefault);
    try
    {
        // Point the fence key at the dummy fencer, then wrap the resulting
        // NodeFencer in a Mockito spy.
        fencingConf.Set(DummyFenceKey, typeof(DummyHAService.DummyFencer).FullName);
        this.fencer = Org.Mockito.Mockito.Spy(NodeFencer.Create(fencingConf, DummyFenceKey));
    }
    catch (BadFencingConfigurationException badConfig)
    {
        throw new RuntimeException(badConfig);
    }

    lock (instances)
    {
        instances.AddItem(this);
        // The index is the list size measured after this instance was added.
        this.index = instances.Count;
    }
}
/// <summary>Wait for the given HA service to enter the given HA state.</summary>
/// <remarks>
/// The state that is polled is the one tracked by the ZKFC, not the state of
/// the HA service itself, and the two can differ. For example, when the
/// service becomes unhealthy, ZKFC will quit the ZK election, transition to
/// HAServiceState.INITIALIZING, and remain in that state until the service
/// becomes healthy again.
/// </remarks>
/// <param name="idx">Index passed to <c>GetZkfc</c> to select the ZKFC to poll.</param>
/// <param name="state">The state to wait for.</param>
/// <exception cref="System.Exception"/>
public virtual void WaitForHAState(int idx, HAServiceProtocol.HAServiceState state)
{
    MiniZKFCCluster.DummyZKFC zkfc = GetZkfc(idx);
    while (true)
    {
        if (zkfc.GetServiceState() == state)
        {
            return;
        }

        // Between polls, run the test context's exception check, then back
        // off for 50 ms.
        ctx.CheckException();
        Thread.Sleep(50);
    }
}
/// <summary>
/// Asks the local target to transition to standby and then marks the locally
/// tracked service state as Standby.
/// </summary>
/// <remarks>
/// A failure of the transition call is logged and otherwise ignored; the
/// tracked state is set to Standby whether or not the call succeeded.
/// </remarks>
private void BecomeStandby()
{
    lock (this)
    {
        Log.Info("ZK Election indicated that " + localTarget + " should become standby");
        try
        {
            int fenceTimeout = FailoverController.GetGracefulFenceTimeout(conf);
            localTarget.GetProxy(conf, fenceTimeout).TransitionToStandby(CreateReqInfo());
            Log.Info("Successfully transitioned " + localTarget + " to standby state");
        }
        catch (Exception failure)
        {
            Log.Error("Couldn't transition " + localTarget + " to standby state", failure);
            // TODO handle this. It's a likely case since we probably got fenced
            // at the same time.
        }

        serviceState = HAServiceProtocol.HAServiceState.Standby;
    }
}
/// <summary>
/// Compares a state reported for the local service with the state this
/// controller expects, and quits the election when they disagree on two
/// consecutive checks.
/// </summary>
/// <remarks>
/// <para>
/// While the tracked state is Initializing no comparison is made; if the
/// election was previously quit because of a bad state, electability is
/// rechecked instead.
/// </para>
/// <para>
/// The first mismatch only increments the mismatch counter, since it may be
/// due to a parallel transition. A second consecutive mismatch delays
/// re-joining by 1000 ms, quits the election via
/// <c>elector.QuitElection(true)</c>, and resets the tracked state to
/// Initializing.
/// </para>
/// </remarks>
/// <param name="changedState">The state reported for the local service.</param>
internal virtual void VerifyChangedServiceState(HAServiceProtocol.HAServiceState changedState)
{
    // Locks are taken in the order elector, then this.
    lock (elector)
    {
        lock (this)
        {
            bool stillInitializing = serviceState == HAServiceProtocol.HAServiceState.Initializing;
            if (stillInitializing)
            {
                if (quitElectionOnBadState)
                {
                    Log.Debug("rechecking for electability from bad state");
                    RecheckElectability();
                }
                return;
            }

            bool statesAgree = changedState == serviceState;
            if (statesAgree)
            {
                serviceStateMismatchCount = 0;
                return;
            }

            if (serviceStateMismatchCount == 0)
            {
                // recheck one more time. As this might be due to parallel transition.
                serviceStateMismatchCount = 1;
                return;
            }

            // quit the election as the expected state and reported state
            // mismatches.
            Log.Error("Local service " + localTarget + " has changed the serviceState to "
                + changedState + ". Expected was " + serviceState
                + ". Quitting election marking fencing necessary.");
            long rejoinDelayNanos = TimeUnit.Milliseconds.ToNanos(1000);
            delayJoiningUntilNanotime = Runtime.NanoTime() + rejoinDelayNanos;
            elector.QuitElection(true);
            quitElectionOnBadState = true;
            serviceStateMismatchCount = 0;
            serviceState = HAServiceProtocol.HAServiceState.Initializing;
        }
    }
}
/// <summary>
/// Gives up the active role for the requested time: moves the local node to
/// standby, quits the election, and delays re-joining.
/// </summary>
/// <remarks>
/// When <paramref name="millisToCede"/> is zero or negative, the join delay is
/// cleared and electability is rechecked immediately. Otherwise the local
/// target is asked to transition to standby; if that call throws an
/// <c>IOException</c>, the election is still quit but with the fencing flag
/// set. Electability is rechecked once more after the locks are released.
/// </remarks>
/// <param name="millisToCede">Number of milliseconds to stay out of the election.</param>
/// <exception cref="Org.Apache.Hadoop.Security.AccessControlException"/>
/// <exception cref="Org.Apache.Hadoop.HA.ServiceFailedException"/>
/// <exception cref="System.IO.IOException"/>
private void DoCedeActive(int millisToCede)
{
    int timeout = FailoverController.GetGracefulFenceTimeout(conf);

    // Lock elector to maintain lock ordering of elector -> ZKFC
    lock (elector)
    {
        lock (this)
        {
            if (millisToCede <= 0)
            {
                delayJoiningUntilNanotime = 0;
                RecheckElectability();
                return;
            }

            Log.Info("Requested by " + UserGroupInformation.GetCurrentUser() + " at "
                + Server.GetRemoteAddress() + " to cede active role.");

            bool fencingRequired = false;
            try
            {
                localTarget.GetProxy(conf, timeout).TransitionToStandby(CreateReqInfo());
                Log.Info("Successfully ensured local node is in standby mode");
            }
            catch (IOException standbyFailure)
            {
                Log.Warn("Unable to transition local node to standby: "
                    + standbyFailure.GetLocalizedMessage());
                Log.Warn("Quitting election but indicating that fencing is necessary");
                fencingRequired = true;
            }

            long cedeNanos = TimeUnit.Milliseconds.ToNanos(millisToCede);
            delayJoiningUntilNanotime = Runtime.NanoTime() + cedeNanos;
            elector.QuitElection(fencingRequired);
            serviceState = HAServiceProtocol.HAServiceState.Initializing;
        }
    }

    RecheckElectability();
}
/// <summary>
/// Creates a dummy HA service with the given initial state and address by
/// delegating to the three-argument constructor with
/// <c>testWithProtoBufRPC</c> set to false.
/// </summary>
/// <param name="state">Initial HA state.</param>
/// <param name="address">Address for the service.</param>
internal DummyHAService(HAServiceProtocol.HAServiceState state, IPEndPoint address)
    : this(state, address, testWithProtoBufRPC: false)
{
}
/// <summary>
/// Check the current state of the service, and join the election
/// if it should be in the election.
/// </summary>
/// <remarks>
/// If the join delay has not yet elapsed, a later recheck is scheduled and
/// nothing else happens. Otherwise the action depends on the last observed
/// health state: a healthy service joins the election; an initializing,
/// unhealthy or non-responding service quits it (the latter two with the
/// fencing flag set) and the tracked state becomes Initializing; a failed
/// health monitor is reported through <c>FatalError</c>.
/// </remarks>
/// <exception cref="System.ArgumentException">
/// Thrown when the last health state is not one of the handled values.
/// </exception>
private void RecheckElectability()
{
    // Maintain lock ordering of elector -> ZKFC
    lock (elector)
    {
        lock (this)
        {
            bool healthy = lastHealthState == HealthMonitor.State.ServiceHealthy;
            long nanosUntilAllowed = delayJoiningUntilNanotime - Runtime.NanoTime();
            if (nanosUntilAllowed > 0)
            {
                if (healthy)
                {
                    Log.Info("Would have joined master election, but this node is prohibited from doing so for "
                        + TimeUnit.Nanoseconds.ToMillis(nanosUntilAllowed) + " more ms");
                }
                ScheduleRecheck(nanosUntilAllowed);
                return;
            }

            switch (lastHealthState)
            {
                case HealthMonitor.State.ServiceHealthy:
                    elector.JoinElection(TargetToData(localTarget));
                    // Clear the flag unconditionally; same end state as
                    // clearing it only when it was set.
                    quitElectionOnBadState = false;
                    break;

                case HealthMonitor.State.Initializing:
                    Log.Info("Ensuring that " + localTarget
                        + " does not participate in active master election");
                    elector.QuitElection(false);
                    serviceState = HAServiceProtocol.HAServiceState.Initializing;
                    break;

                case HealthMonitor.State.ServiceUnhealthy:
                case HealthMonitor.State.ServiceNotResponding:
                    Log.Info("Quitting master election for " + localTarget
                        + " and marking that fencing is necessary");
                    elector.QuitElection(true);
                    serviceState = HAServiceProtocol.HAServiceState.Initializing;
                    break;

                case HealthMonitor.State.HealthMonitorFailed:
                    FatalError("Health monitor failed!");
                    break;

                default:
                    throw new ArgumentException("Unhandled state:" + lastHealthState);
            }
        }
    }
}