/// <summary>
/// Rebuilds the voter / promotable / watcher ambassador sets so that they match
/// <paramref name="clusterTopology"/>. Ambassadors whose node is still part of the topology are
/// reused (and moved to the set that matches their current role), nodes without an ambassador get
/// a new, started one, and ambassadors that are no longer referenced are disposed on a
/// thread-pool thread.
/// </summary>
/// <param name="clusterTopology">Topology whose Members, Promotables and Watchers are walked.</param>
/// <param name="connections">
/// Optional connections keyed by node tag. When a new ambassador is created for a tag present in
/// this dictionary, that connection is handed to the ambassador's constructor; otherwise null is passed.
/// </param>
private void RefreshAmbassadors(ClusterTopology clusterTopology, Dictionary<string, RemoteConnection> connections = null)
{
    bool lockTaken = false;
    Monitor.TryEnter(this, ref lockTaken);
    try
    {
        // Failing to take the lock only means we are being disposed, so we can quit now
        if (lockTaken == false)
        {
            if (_engine.Log.IsInfoEnabled)
            {
                _engine.Log.Info($"{ToString()}: Skipping refreshing ambassadors because we are been disposed of");
            }
            return;
        }

        if (Term != _engine.CurrentTerm)
        {
            if (_engine.Log.IsInfoEnabled)
            {
                _engine.Log.Info($"{ToString()}: We are no longer the actual leader, since the current term is {_engine.CurrentTerm}");
            }
            return;
        }

        if (_engine.Log.IsInfoEnabled)
        {
            _engine.Log.Info($"{ToString()}: Refreshing ambassadors");
        }

        // Pull every existing ambassador into a single lookup and empty the three role sets;
        // whatever is still in 'old' after the three passes below is no longer in the topology.
        var old = new Dictionary<string, FollowerAmbassador>(StringComparer.OrdinalIgnoreCase);
        foreach (var peers in new[] { _voters, _promotables, _nonVoters })
        {
            foreach (var peer in peers)
            {
                old[peer.Key] = peer.Value;
            }
            peers.Clear();
        }

        foreach (var voter in clusterTopology.Members)
        {
            if (voter.Key == _engine.Tag)
            {
                continue; // we obviously won't be applying to ourselves
            }

            if (old.TryGetValue(voter.Key, out FollowerAmbassador existingInstance))
            {
                existingInstance.UpdateLeaderWake(_voterResponded);
                _voters.Add(voter.Key, existingInstance);
                old.Remove(voter.Key);
                continue; // already here
            }

            RemoteConnection connection = null;
            connections?.TryGetValue(voter.Key, out connection);
            var ambassador = new FollowerAmbassador(_engine, this, _voterResponded, voter.Key, voter.Value, _engine.ClusterCertificate, connection);
            _voters.Add(voter.Key, ambassador);
            _engine.AppendStateDisposable(this, ambassador);
            if (_engine.Log.IsInfoEnabled)
            {
                _engine.Log.Info($"{ToString()}: starting ambassador for voter {voter.Key} {voter.Value}");
            }
            ambassador.Start();
        }

        foreach (var promotable in clusterTopology.Promotables)
        {
            if (old.TryGetValue(promotable.Key, out FollowerAmbassador existingInstance))
            {
                existingInstance.UpdateLeaderWake(_promotableUpdated);
                _promotables.Add(promotable.Key, existingInstance);
                old.Remove(promotable.Key);
                continue; // already here
            }

            RemoteConnection connection = null;
            connections?.TryGetValue(promotable.Key, out connection);
            var ambassador = new FollowerAmbassador(_engine, this, _promotableUpdated, promotable.Key, promotable.Value, _engine.ClusterCertificate, connection);
            _promotables.Add(promotable.Key, ambassador);
            _engine.AppendStateDisposable(this, ambassador);
            if (_engine.Log.IsInfoEnabled)
            {
                _engine.Log.Info($"{ToString()}: starting ambassador for promotable {promotable.Key} {promotable.Value}");
            }
            ambassador.Start();
        }

        foreach (var nonVoter in clusterTopology.Watchers)
        {
            if (old.TryGetValue(nonVoter.Key, out FollowerAmbassador existingInstance))
            {
                existingInstance.UpdateLeaderWake(_noop);
                _nonVoters.Add(nonVoter.Key, existingInstance);
                old.Remove(nonVoter.Key);
                continue; // already here
            }

            RemoteConnection connection = null;
            connections?.TryGetValue(nonVoter.Key, out connection);
            var ambassador = new FollowerAmbassador(_engine, this, _noop, nonVoter.Key, nonVoter.Value, _engine.ClusterCertificate, connection);
            _nonVoters.Add(nonVoter.Key, ambassador);
            _engine.AppendStateDisposable(this, ambassador);
            if (_engine.Log.IsInfoEnabled)
            {
                _engine.Log.Info($"{ToString()}: starting ambassador for watcher {nonVoter.Key} {nonVoter.Value}");
            }
            ambassador.Start();
        }

        if (old.Count > 0)
        {
            // The counter is raised before queuing and lowered once the background disposal is done.
            Interlocked.Increment(ref _previousPeersWereDisposed);
            System.Threading.ThreadPool.QueueUserWorkItem(_ =>
            {
                try
                {
                    foreach (var removed in old)
                    {
                        try
                        {
                            // it is not used by anything else, so we can close it
                            removed.Value.Dispose();
                        }
                        catch (Exception e)
                        {
                            // One failing Dispose must not prevent the remaining ambassadors from being
                            // disposed, and must not escape the work item as an unhandled exception.
                            if (_engine.Log.IsInfoEnabled)
                            {
                                _engine.Log.Info($"{ToString()}: Failed to dispose the ambassador for {removed.Key}", e);
                            }
                        }
                    }
                }
                finally
                {
                    // Always balance the increment above, even if disposal failed part-way.
                    Interlocked.Decrement(ref _previousPeersWereDisposed);
                }
            }, null);
        }
    }
    finally
    {
        if (lockTaken)
        {
            Monitor.Exit(this);
        }
    }
}
/// <summary>
/// Ambassador loop towards a single peer (<c>_tag</c> at <c>_url</c>) on behalf of <c>_candidate</c>.
/// This method may run for a long while, as we are trying to get agreement
/// from a majority of the cluster.
/// </summary>
/// <remarks>
/// While the candidate is running and this instance is not disposed, the loop connects to the peer and
/// then repeatedly sends a <c>RachisHello</c> followed by a trial <c>RequestVote</c> (unless skipped, see below)
/// and a real <c>RequestVote</c>. Granted votes are recorded in <c>TrialElectionWonAtTerm</c> and
/// <c>ReadlElectionWonAtTerm</c>; progress is reported through <c>Status</c> / <c>StatusMessage</c>.
/// Cancellation and disposal exceptions end the loop with <c>AmbassadorStatus.Closed</c>.
/// </remarks>
private void Run()
{
    try
    {
        while (_candidate.Running && _disposed == false)
        {
            _conenctToPeer = null;
            try
            {
                try
                {
                    using (_engine.ContextPool.AllocateOperationContext(out TransactionOperationContext context))
                    {
                        // Blocks this thread until the connection attempt completes
                        _conenctToPeer = _engine.ConnectToPeer(_url, _certificate, context).Result;
                    }
                    // The candidate may have stopped while we were connecting; the finally below disposes the stream
                    if (_candidate.Running == false)
                    {
                        break;
                    }
                }
                catch (Exception e)
                {
                    Status = AmbassadorStatus.FailedToConnect;
                    StatusMessage = $"Failed to connect with {_tag}.{Environment.NewLine} " + e.Message;
                    if (_engine.Log.IsInfoEnabled)
                    {
                        _engine.Log.Info($"CandidateAmbassador {_engine.Tag}: Failed to connect to remote peer: " + _url, e);
                    }
                    // wait a bit (until the candidate signals a state change) before trying again
                    _candidate.WaitForChangeInState();
                    continue; // we'll retry connecting
                }
                Status = AmbassadorStatus.Connected;
                StatusMessage = $"Connected to {_tag}";
                using (var connection = new RemoteConnection(_tag, _engine.Tag, _conenctToPeer))
                {
                    try
                    {
                        _engine.AppendStateDisposable(_candidate, connection);
                    }
                    catch (ConcurrencyException)
                    {
                        // we probably lost the election, because someone else changed our state to follower
                        // we'll still return to the top of the (outer) loop to ensure that this is the case
                        continue;
                    }
                    // One iteration = one hello + (optional) trial vote + real vote exchange with the peer
                    while (_candidate.Running)
                    {
                        using (_engine.ContextPool.AllocateOperationContext(out TransactionOperationContext context))
                        {
                            ClusterTopology topology;
                            long lastLogIndex;
                            long lastLogTerm;
                            // Snapshot the topology and our last log position under a read transaction
                            using (context.OpenReadTransaction())
                            {
                                topology = _engine.GetTopology(context);
                                lastLogIndex = _engine.GetLastEntryIndex(context);
                                lastLogTerm = _engine.GetTermForKnownExisting(context, lastLogIndex);
                            }
                            Debug.Assert(topology.TopologyId != null);
                            connection.Send(context, new RachisHello
                            {
                                TopologyId = topology.TopologyId,
                                DebugSourceIdentifier = _engine.Tag,
                                DebugDestinationIdentifier = _tag,
                                InitialMessageType = InitialMessageType.RequestVote
                            });
                            RequestVoteResponse rvr;
                            // Capture both terms once so the whole exchange below uses consistent values
                            var currentElectionTerm = _candidate.ElectionTerm;
                            var engineCurrentTerm = _engine.CurrentTerm;
                            // The trial vote is skipped only when this is a forced election AND the candidate
                            // has already decided to run the real election at this term
                            if (_candidate.IsForcedElection == false || _candidate.RunRealElectionAtTerm != currentElectionTerm)
                            {
                                connection.Send(context, new RequestVote
                                {
                                    Source = _engine.Tag,
                                    Term = currentElectionTerm,
                                    IsForcedElection = false,
                                    IsTrialElection = true,
                                    LastLogIndex = lastLogIndex,
                                    LastLogTerm = lastLogTerm
                                });
                                rvr = connection.Read <RequestVoteResponse>(context);
                                if (rvr.Term > currentElectionTerm)
                                {
                                    var message = "Found election term " + rvr.Term + " that is higher than ours " + currentElectionTerm;
                                    // we need to abort the current elections
                                    _engine.SetNewState(RachisConsensus.State.Follower, null, engineCurrentTerm, message);
                                    if (_engine.Log.IsInfoEnabled)
                                    {
                                        _engine.Log.Info($"CandidateAmbassador {_engine.Tag}: {message}");
                                    }
                                    _engine.FoundAboutHigherTerm(rvr.Term);
                                    // Caught by the catch (Exception e) of the enclosing per-connection try
                                    throw new InvalidOperationException(message);
                                }
                                NotInTopology = rvr.NotInTopology;
                                if (rvr.VoteGranted == false)
                                {
                                    if (_engine.Log.IsInfoEnabled)
                                    {
                                        _engine.Log.Info($"CandidateAmbassador {_engine.Tag}: Got a negative response from {_tag} reseason:{rvr.Message}");
                                    }
                                    // we got a negative response here, so we can't proceed
                                    // we'll need to wait until the candidate has done something, like
                                    // change term or given up
                                    _candidate.WaitForChangeInState();
                                    continue;
                                }
                                TrialElectionWonAtTerm = rvr.Term;
                                // Trial vote granted: wait for the candidate to signal a state change before
                                // falling through to the real vote request below
                                _candidate.WaitForChangeInState();
                            }
                            // Real (non-trial) vote request, sent with the term captured above
                            connection.Send(context, new RequestVote
                            {
                                Source = _engine.Tag,
                                Term = currentElectionTerm,
                                IsForcedElection = _candidate.IsForcedElection,
                                IsTrialElection = false,
                                LastLogIndex = lastLogIndex,
                                LastLogTerm = lastLogTerm
                            });
                            rvr = connection.Read <RequestVoteResponse>(context);
                            if (rvr.Term > currentElectionTerm)
                            {
                                var message = "Found election term " + rvr.Term + " that is higher than ours " + currentElectionTerm;
                                if (_engine.Log.IsInfoEnabled)
                                {
                                    _engine.Log.Info($"CandidateAmbassador {_engine.Tag}: {message}");
                                }
                                // we need to abort the current elections
                                _engine.SetNewState(RachisConsensus.State.Follower, null, engineCurrentTerm, message);
                                _engine.FoundAboutHigherTerm(rvr.Term);
                                // Caught by the catch (Exception e) of the enclosing per-connection try
                                throw new InvalidOperationException(message);
                            }
                            NotInTopology = rvr.NotInTopology;
                            if (rvr.VoteGranted == false)
                            {
                                if (_engine.Log.IsInfoEnabled)
                                {
                                    _engine.Log.Info($"CandidateAmbassador {_engine.Tag}: Got a negative response from {_tag} reseason:{rvr.Message}");
                                }
                                // we got a negative response here, so we can't proceed
                                // we'll need to wait until the candidate has done something, like
                                // change term or given up
                                _candidate.WaitForChangeInState();
                                continue;
                            }
                            // NOTE(review): "Readl" looks like a misspelling of "Real" - confirm against the
                            // property declaration (and its readers) before renaming
                            ReadlElectionWonAtTerm = rvr.Term;
                            _candidate.WaitForChangeInState();
                        }
                    }
                }
            }
            catch (Exception e)
            {
                // Any failure while talking to the peer: record it, wait for the candidate to signal,
                // then the outer loop reconnects (if the candidate is still running)
                Status = AmbassadorStatus.FailedToConnect;
                StatusMessage = $"Failed to get vote from {_tag}.{Environment.NewLine}" + e.Message;
                if (_engine.Log.IsInfoEnabled)
                {
                    _engine.Log.Info($"CandidateAmbassador {_engine.Tag}: Failed to get vote from remote peer url={_url} tag={_tag}", e);
                }
                _candidate.WaitForChangeInState();
            }
            finally
            {
                // Runs on every exit from this iteration, including the break / continue paths above
                _conenctToPeer?.Dispose();
            }
        }
    }
    // The handlers below only see exceptions that escape the per-iteration handling above
    catch (OperationCanceledException)
    {
        Status = AmbassadorStatus.Closed;
        StatusMessage = "Closed";
    }
    catch (ObjectDisposedException)
    {
        Status = AmbassadorStatus.Closed;
        StatusMessage = "Closed";
    }
    catch (AggregateException ae) when(ae.InnerException is OperationCanceledException || ae.InnerException is ObjectDisposedException)
    {
        Status = AmbassadorStatus.Closed;
        StatusMessage = "Closed";
    }
    catch (Exception e)
    {
        Status = AmbassadorStatus.FailedToConnect;
        StatusMessage = $"Failed to talk to {_url}.{Environment.NewLine}" + e;
        if (_engine.Log.IsInfoEnabled)
        {
            _engine.Log.Info("Failed to talk to remote peer: " + _url, e);
        }
    }
}
/// <summary>
/// Runs one candidacy for this node. Starts a <c>CandidateAmbassador</c> for every other member of
/// the topology, then keeps tallying trial and real votes each time the ambassadors signal (or the
/// election timeout elapses) until the node wins, stops running or is no longer a candidate.
/// The candidate is always disposed when the method exits.
/// </summary>
private void Run()
{
    var mismatchedAmbassadors = new List<CandidateAmbassador>();
    try
    {
        try
        {
            // Raising the flag is allowed to fail, hence Raise() rather than RaiseOrDie
            _running.Raise();
            ElectionTerm = _engine.CurrentTerm;
            if (_engine.Log.IsInfoEnabled)
            {
                _engine.Log.Info($"Candidate {_engine.Tag}: Starting elections");
            }

            ClusterTopology topology;
            using (_engine.ContextPool.AllocateOperationContext(out ClusterOperationContext ctx))
            using (ctx.OpenReadTransaction())
            {
                topology = _engine.GetTopology(ctx);
            }

            // A cluster of one needs no election at all
            if (topology.Members.Count == 1)
            {
                CastVoteForSelf(ElectionTerm + 1, "Single member cluster, natural leader");
                _engine.SwitchToLeaderState(ElectionTerm, ClusterCommandsVersionManager.CurrentClusterMinimalVersion, "I'm the only one in the cluster, so no need for elections, I rule.");
                return;
            }

            if (_engine.RequestSnapshot)
            {
                // a node that asked for a snapshot must not become the leader
                if (_engine.Log.IsOperationsEnabled)
                {
                    _engine.Log.Operations("we aren't allowed to be elected for leadership if we requested a snapshot");
                }
                return;
            }

            if (IsForcedElection == false)
            {
                ElectionTerm += 1;
            }
            else
            {
                CastVoteForSelf(ElectionTerm + 1, "Voting for self in forced elections");
            }

            foreach (var member in topology.Members)
            {
                if (member.Key == _engine.Tag)
                {
                    // our own vote is already accounted for
                    continue;
                }

                var envoy = new CandidateAmbassador(_engine, this, member.Key, member.Value);

                // replace the list instead of mutating the current one
                var expanded = new List<CandidateAmbassador>(_voters);
                expanded.Add(envoy);
                _voters = expanded;

                // a concurrency exception here will dispose the current candidate and its ambassadors
                _engine.AppendStateDisposable(this, envoy);
                envoy.Start();
            }

            while (_running && _engine.CurrentState == RachisState.Candidate)
            {
                var signaled = _peersWaiting.WaitOne(_engine.Timeout.TimeoutPeriod);
                if (signaled == false)
                {
                    // timed out: move to the next term and retry
                    ElectionTerm = _engine.CurrentTerm + 1;
                    _engine.RandomizeTimeout(extend: true);
                    // wakes the ambassadors so that they ping the peers again
                    StateChange();
                    continue;
                }

                if (_running == false)
                {
                    return;
                }

                _peersWaiting.Reset();

                // both tallies start at one: our own vote
                var trialVotes = 1;
                var realVotes = 1;
                foreach (var envoy in _voters)
                {
                    if (envoy.NotInTopology)
                    {
                        MoveCandidateToPassive("A leader node has indicated that I'm not in their topology, I was probably kicked out.");
                        return;
                    }

                    if (envoy.TopologyMismatch)
                    {
                        mismatchedAmbassadors.Add(envoy);
                        continue;
                    }

                    if (envoy.RealElectionWonAtTerm == ElectionTerm)
                    {
                        realVotes++;
                    }

                    if (envoy.TrialElectionWonAtTerm == ElectionTerm)
                    {
                        trialVotes++;
                    }
                }

                if (StillHavePeers(mismatchedAmbassadors) == false)
                {
                    MoveCandidateToPassive("I'm left alone in the cluster.");
                    return;
                }

                var quorum = ((_voters.Count + 1) / 2) + 1;
                if (realVotes >= quorum)
                {
                    ElectionResult = ElectionResult.Won;
                    _running.Lower();

                    // gather the published connections and command versions before waking the ambassadors
                    var publishedConnections = new Dictionary<string, RemoteConnection>();
                    var commandVersions = new List<int>
                    {
                        ClusterCommandsVersionManager.MyCommandsVersion
                    };
                    foreach (var envoy in _voters)
                    {
                        if (envoy.ClusterCommandsVersion > 0)
                        {
                            commandVersions.Add(envoy.ClusterCommandsVersion);
                        }

                        if (envoy.TryGetPublishedConnection(out var published))
                        {
                            publishedConnections[envoy.Tag] = published;
                        }
                    }

                    StateChange();

                    var agreedVersion = ClusterCommandsVersionManager.GetClusterMinimalVersion(commandVersions, _engine.MaximalVersion);
                    string electedReason = $"Was elected by {realVotes} nodes for leadership in term {ElectionTerm} with cluster version of {agreedVersion}";
                    _engine.SwitchToLeaderState(ElectionTerm, agreedVersion, electedReason, publishedConnections);
                    break;
                }

                // a won trial round is promoted to a real election, once per term
                if (RunRealElectionAtTerm != ElectionTerm && trialVotes >= quorum)
                {
                    CastVoteForSelf(ElectionTerm, "Won in the trial elections");
                }
            }
        }
        catch (Exception failure)
        {
            if (_engine.Log.IsInfoEnabled)
            {
                _engine.Log.Info($"Candidate {_engine.Tag}: Failure during candidacy run with current state of {_engine.CurrentState}", failure);
            }

            if (_engine.CurrentState == RachisState.Candidate)
            {
                // still a candidate, so run the candidacy again
                _engine.SwitchToCandidateState("An error occurred during the last candidacy: " + failure);
            }
            else if (_engine.CurrentState != RachisState.Passive)
            {
                _engine.Timeout.Start(_engine.SwitchToCandidateStateOnTimeout);
            }
        }
    }
    finally
    {
        try
        {
            Dispose();
        }
        catch (Exception)
        {
            // best effort, there is nothing useful left to do here
        }
    }
}
/// <summary>
/// Runs one candidacy for this node. Starts a <c>CandidateAmbassador</c> for every other member of
/// the topology, then tallies trial and real votes whenever the ambassadors signal (or the election
/// timeout elapses) until the node wins, is told it is not in the topology, stops running or is no
/// longer a candidate. The candidate is always disposed when the method exits.
/// </summary>
private void Run()
{
    try
    {
        try
        {
            // Raising the flag is allowed to fail, hence Raise() rather than RaiseOrDie
            _running.Raise();
            ElectionTerm = _engine.CurrentTerm;
            if (_engine.Log.IsInfoEnabled)
            {
                _engine.Log.Info($"Candidate {_engine.Tag}: Starting elections");
            }

            ClusterTopology topology;
            using (_engine.ContextPool.AllocateOperationContext(out TransactionOperationContext ctx))
            using (ctx.OpenReadTransaction())
            {
                topology = _engine.GetTopology(ctx);
            }

            // A cluster of one needs no election at all
            if (topology.Members.Count == 1)
            {
                CastVoteForSelf(ElectionTerm + 1, "Single member cluster, natural leader");
                _engine.SwitchToLeaderState(ElectionTerm, "I'm the only one in the cluster, so no need for elections, I rule.");
                return;
            }

            if (IsForcedElection == false)
            {
                ElectionTerm += 1;
            }
            else
            {
                CastVoteForSelf(ElectionTerm + 1, "Voting for self in forced elections");
            }

            foreach (var member in topology.Members)
            {
                if (member.Key == _engine.Tag)
                {
                    // our own vote is already accounted for
                    continue;
                }

                var envoy = new CandidateAmbassador(_engine, this, member.Key, member.Value, _engine.ClusterCertificate);
                _voters.Add(envoy);
                try
                {
                    _engine.AppendStateDisposable(this, envoy);
                }
                catch (ConcurrencyException)
                {
                    // we lost the election, because someone else changed our state to follower
                    foreach (var created in _voters)
                    {
                        created.Dispose();
                    }
                    return;
                }

                envoy.Start();
            }

            while (_running && _engine.CurrentState == RachisState.Candidate)
            {
                var signaled = _peersWaiting.WaitOne(_engine.Timeout.TimeoutPeriod);
                if (signaled == false)
                {
                    // timed out: restart from the engine's current term
                    ElectionTerm = _engine.CurrentTerm;
                    if (IsForcedElection == false)
                    {
                        ElectionTerm += 1;
                    }
                    else
                    {
                        CastVoteForSelf(ElectionTerm + 1, "Timeout during forced elections");
                    }

                    _engine.RandomizeTimeout(extend: true);
                    // wakes the ambassadors so that they ping the peers again
                    StateChange();
                    continue;
                }

                if (_running == false)
                {
                    return;
                }

                _peersWaiting.Reset();

                // both tallies start at one: our own vote
                var kickedOut = false;
                var trialVotes = 1;
                var realVotes = 1;
                foreach (var envoy in _voters)
                {
                    if (envoy.NotInTopology)
                    {
                        kickedOut = true;
                        break;
                    }

                    if (envoy.RealElectionWonAtTerm == ElectionTerm)
                    {
                        realVotes++;
                    }

                    if (envoy.TrialElectionWonAtTerm == ElectionTerm)
                    {
                        trialVotes++;
                    }
                }

                var quorum = ((_voters.Count + 1) / 2) + 1;

                if (kickedOut)
                {
                    if (_engine.Log.IsInfoEnabled)
                    {
                        _engine.Log.Info(
                            $"Candidate {_engine.Tag}: A leader node has indicated that I'm not in their topology, I was probably kicked out. Moving to passive mode");
                    }

                    _engine.SetNewState(RachisState.Passive, this, _engine.CurrentTerm,
                        "I just learned from the leader that I'm not in their topology, moving to passive state");
                    break;
                }

                if (realVotes >= quorum)
                {
                    ElectionResult = ElectionResult.Won;
                    _running.Lower();
                    StateChange();

                    // hand the ambassadors' connections over to the leader state
                    var handedOver = new Dictionary<string, RemoteConnection>();
                    foreach (var envoy in _voters)
                    {
                        handedOver[envoy.Tag] = envoy.Connection;
                    }

                    _engine.SwitchToLeaderState(ElectionTerm, $"Was elected by {realVotes} nodes to leadership in {ElectionTerm}", handedOver);
                    break;
                }

                // a won trial round is promoted to a real election, once per term
                if (RunRealElectionAtTerm != ElectionTerm && trialVotes >= quorum)
                {
                    CastVoteForSelf(ElectionTerm, "Won in the trial elections");
                }
            }
        }
        catch (Exception failure)
        {
            if (_engine.Log.IsInfoEnabled)
            {
                _engine.Log.Info($"Candidate {_engine.Tag}: Failure during candidacy run with current state of {_engine.CurrentState}", failure);
            }

            if (_engine.CurrentState == RachisState.Candidate)
            {
                // still a candidate, so run the candidacy again
                _engine.SwitchToCandidateState("An error occured during the last candidacy: " + failure);
            }
            else if (_engine.CurrentState != RachisState.Passive)
            {
                _engine.Timeout.Start(_engine.SwitchToCandidateStateOnTimeout);
            }
        }
    }
    finally
    {
        try
        {
            Dispose();
        }
        catch (Exception)
        {
            // best effort, there is nothing useful left to do here
        }
    }
}