private HandleVoteResult ShouldGrantVote(TransactionOperationContext context, long lastIndex, RequestVote rv) { var result = new HandleVoteResult(); var lastLogIndexUnderWriteLock = _engine.GetLastEntryIndex(context); var lastLogTermUnderWriteLock = _engine.GetTermFor(context, lastLogIndexUnderWriteLock); if (lastLogIndexUnderWriteLock != lastIndex) { result.DeclineVote = true; result.DeclineReason = "Log was changed"; return(result); } if (lastLogTermUnderWriteLock > rv.LastLogTerm) { result.DeclineVote = true; result.DeclineReason = $"My last log term {lastLogTermUnderWriteLock}, is higher than yours {rv.LastLogTerm}."; return(result); } if (lastLogIndexUnderWriteLock > rv.LastLogIndex) { result.DeclineVote = true; result.DeclineReason = $"Vote declined because my last log index {lastLogIndexUnderWriteLock} is more up to date than yours {rv.LastLogIndex}"; return(result); } var(whoGotMyVoteIn, votedTerm) = _engine.GetWhoGotMyVoteIn(context, rv.Term); result.VotedTerm = votedTerm; if (whoGotMyVoteIn != null && whoGotMyVoteIn != rv.Source) { result.DeclineVote = true; result.DeclineReason = $"Already voted in {rv.LastLogTerm}, for {whoGotMyVoteIn}"; return(result); } if (votedTerm >= rv.Term) { result.DeclineVote = true; result.DeclineReason = $"Already voted in {rv.LastLogTerm}, for another node in higher term: {votedTerm}"; return(result); } return(result); }
private (bool HasRemovedFromTopology, long LastAcknowledgedIndex, long LastTruncate, long LastCommit) ApplyLeaderStateToLocalState(Stopwatch sp, ClusterOperationContext context, List <RachisEntry> entries, AppendEntries appendEntries) { long lastTruncate; long lastCommit; bool removedFromTopology = false; // we start the tx after we finished reading from the network if (_engine.Log.IsInfoEnabled) { _engine.Log.Info($"{ToString()}: Ready to start tx in {sp.Elapsed}"); } using (var tx = context.OpenWriteTransaction()) { _engine.ValidateTerm(_term); if (_engine.Log.IsInfoEnabled) { _engine.Log.Info($"{ToString()}: Tx running in {sp.Elapsed}"); } if (entries.Count > 0) { var(lastTopology, lastTopologyIndex) = _engine.AppendToLog(context, entries); using (lastTopology) { if (lastTopology != null) { if (_engine.Log.IsInfoEnabled) { _engine.Log.Info($"Topology changed to {lastTopology}"); } var topology = JsonDeserializationRachis <ClusterTopology> .Deserialize(lastTopology); if (topology.Members.ContainsKey(_engine.Tag) || topology.Promotables.ContainsKey(_engine.Tag) || topology.Watchers.ContainsKey(_engine.Tag)) { RachisConsensus.SetTopology(_engine, context, topology); } else { removedFromTopology = true; _engine.ClearAppendedEntriesAfter(context, lastTopologyIndex); } } } } var lastEntryIndexToCommit = Math.Min( _engine.GetLastEntryIndex(context), appendEntries.LeaderCommit); var lastAppliedIndex = _engine.GetLastCommitIndex(context); var lastAppliedTerm = _engine.GetTermFor(context, lastEntryIndexToCommit); // we start to commit only after we have any log with a term of the current leader if (lastEntryIndexToCommit > lastAppliedIndex && lastAppliedTerm == appendEntries.Term) { lastAppliedIndex = _engine.Apply(context, lastEntryIndexToCommit, null, sp); } lastTruncate = Math.Min(appendEntries.TruncateLogBefore, lastAppliedIndex); _engine.TruncateLogBefore(context, lastTruncate); lastCommit = lastAppliedIndex; if (_engine.Log.IsInfoEnabled) { _engine.Log.Info($"{ToString()}: Ready to commit in {sp.Elapsed}"); } tx.Commit(); } if (_engine.Log.IsInfoEnabled) { _engine.Log.Info($"{ToString()}: Processing entries request with {entries.Count} entries took {sp.Elapsed}"); } var lastAcknowledgedIndex = entries.Count == 0 ? appendEntries.PrevLogIndex : entries[entries.Count - 1].Index; return(HasRemovedFromTopology : removedFromTopology, LastAcknowledgedIndex : lastAcknowledgedIndex, LastTruncate : lastTruncate, LastCommit : lastCommit); }
private void NegotiateWithLeader(TransactionOperationContext context, LogLengthNegotiation negotiation) { _debugRecorder.Start(); // only the leader can send append entries, so if we accepted it, it's the leader if (_engine.Log.IsInfoEnabled) { _engine.Log.Info($"{ToString()}: Got a negotiation request for term {negotiation.Term} where our term is {_term}"); } if (negotiation.Term != _term) { // Our leader is no longer a valid one var msg = $"The term was changed after leader validation from {_term} to {negotiation.Term}, so you are no longer a valid leader"; _connection.Send(context, new LogLengthNegotiationResponse { Status = LogLengthNegotiationResponse.ResponseStatus.Rejected, Message = msg, CurrentTerm = _term, LastLogIndex = negotiation.PrevLogIndex }); throw new InvalidOperationException($"We close this follower because: {msg}"); } long prevTerm; using (context.OpenReadTransaction()) { prevTerm = _engine.GetTermFor(context, negotiation.PrevLogIndex) ?? 0; } if (prevTerm != negotiation.PrevLogTerm) { if (_engine.Log.IsInfoEnabled) { _engine.Log.Info($"{ToString()}: Got a negotiation request with PrevLogTerm={negotiation.PrevLogTerm} while our PrevLogTerm={prevTerm}" + " will negotiate to find next matched index"); } // we now have a mismatch with the log position, and need to negotiate it with // the leader NegotiateMatchEntryWithLeaderAndApplyEntries(context, _connection, negotiation); } else { if (_engine.Log.IsInfoEnabled) { _engine.Log.Info($"{ToString()}: Got a negotiation request with identical PrevLogTerm will continue to steady state"); } // this (or the negotiation above) completes the negotiation process _connection.Send(context, new LogLengthNegotiationResponse { Status = LogLengthNegotiationResponse.ResponseStatus.Acceptable, Message = $"Found a log index / term match at {negotiation.PrevLogIndex} with term {prevTerm}", CurrentTerm = _term, LastLogIndex = negotiation.PrevLogIndex }); } _debugRecorder.Record("Matching Negotiation is over, waiting for snapshot"); _engine.Timeout.Defer(_connection.Source); // at this point, the leader will send us a snapshot message // in most cases, it is an empty snapshot, then start regular append entries // the reason we send this is to simplify the # of states in the protocol var snapshot = _connection.ReadInstallSnapshot(context); _debugRecorder.Record("Snapshot received"); using (context.OpenWriteTransaction()) { var lastTerm = _engine.GetTermFor(context, snapshot.LastIncludedIndex); var lastCommitIndex = _engine.GetLastEntryIndex(context); if (snapshot.LastIncludedTerm == lastTerm && snapshot.LastIncludedIndex < lastCommitIndex) { if (_engine.Log.IsInfoEnabled) { _engine.Log.Info( $"{ToString()}: Got installed snapshot with last index={snapshot.LastIncludedIndex} while our lastCommitIndex={lastCommitIndex}, will just ignore it"); } //This is okay to ignore because we will just get the committed entries again and skip them ReadInstallSnapshotAndIgnoreContent(context); } else if (InstallSnapshot(context)) { if (_engine.Log.IsInfoEnabled) { _engine.Log.Info( $"{ToString()}: Installed snapshot with last index={snapshot.LastIncludedIndex} with LastIncludedTerm={snapshot.LastIncludedTerm} "); } _engine.SetLastCommitIndex(context, snapshot.LastIncludedIndex, snapshot.LastIncludedTerm); _engine.ClearLogEntriesAndSetLastTruncate(context, snapshot.LastIncludedIndex, snapshot.LastIncludedTerm); } else { var lastEntryIndex = _engine.GetLastEntryIndex(context); if (lastEntryIndex < snapshot.LastIncludedIndex) { var message = $"The snapshot installation had failed because the last included index {snapshot.LastIncludedIndex} in term {snapshot.LastIncludedTerm} doesn't match the last entry {lastEntryIndex}"; if (_engine.Log.IsInfoEnabled) { _engine.Log.Info($"{ToString()}: {message}"); } throw new InvalidOperationException(message); } } // snapshot always has the latest topology if (snapshot.Topology == null) { const string message = "Expected to get topology on snapshot"; if (_engine.Log.IsInfoEnabled) { _engine.Log.Info($"{ToString()}: {message}"); } throw new InvalidOperationException(message); } using (var topologyJson = context.ReadObject(snapshot.Topology, "topology")) { if (_engine.Log.IsInfoEnabled) { _engine.Log.Info($"{ToString()}: topology on install snapshot: {topologyJson}"); } var topology = JsonDeserializationRachis <ClusterTopology> .Deserialize(topologyJson); RachisConsensus.SetTopology(_engine, context, topology); } context.Transaction.Commit(); } _engine.Timeout.Defer(_connection.Source); _debugRecorder.Record("Invoking StateMachine.SnapshotInstalled"); // notify the state machine, we do this in an async manner, and start // the operator in a separate thread to avoid timeouts while this is // going on var task = Task.Run(() => _engine.SnapshotInstalledAsync(snapshot.LastIncludedIndex)); var sp = Stopwatch.StartNew(); var timeToWait = (int)(_engine.ElectionTimeout.TotalMilliseconds / 4); while (task.Wait(timeToWait) == false) { // this may take a while, so we let the other side know that // we are still processing, and we reset our own timer while // this is happening MaybeNotifyLeaderThatWeAreStillAlive(context, sp); } _debugRecorder.Record("Done with StateMachine.SnapshotInstalled"); // we might have moved from passive node, so we need to start the timeout clock _engine.Timeout.Start(_engine.SwitchToCandidateStateOnTimeout); _debugRecorder.Record("Snapshot installed"); //Here we send the LastIncludedIndex as our matched index even for the case where our lastCommitIndex is greater //So we could validate that the entries sent by the leader are indeed the same as the ones we have. _connection.Send(context, new InstallSnapshotResponse { Done = true, CurrentTerm = _term, LastLogIndex = snapshot.LastIncludedIndex }); _engine.Timeout.Defer(_connection.Source); }
private void NegotiateWithLeader(TransactionOperationContext context, LogLengthNegotiation negotiation) { _debugRecorder.Start(); // only the leader can send append entries, so if we accepted it, it's the leader if (_engine.Log.IsInfoEnabled) { _engine.Log.Info($"{ToString()}: Got a negotiation request for term {negotiation.Term} where our term is {_term}."); } if (negotiation.Term != _term) { // Our leader is no longer a valid one var msg = $"The term was changed after leader validation from {_term} to {negotiation.Term}, so you are no longer a valid leader"; _connection.Send(context, new LogLengthNegotiationResponse { Status = LogLengthNegotiationResponse.ResponseStatus.Rejected, Message = msg, CurrentTerm = _term, LastLogIndex = negotiation.PrevLogIndex }); throw new InvalidOperationException($"We close this follower because: {msg}"); } long prevTerm; bool requestSnapshot; using (context.OpenReadTransaction()) { prevTerm = _engine.GetTermFor(context, negotiation.PrevLogIndex) ?? 0; requestSnapshot = _engine.GetSnapshotRequest(context); } if (requestSnapshot) { if (_engine.Log.IsInfoEnabled) { _engine.Log.Info($"{ToString()}: Request snapshot by the admin"); } _connection.Send(context, new LogLengthNegotiationResponse { Status = LogLengthNegotiationResponse.ResponseStatus.Acceptable, Message = "Request snapshot by the admin", CurrentTerm = _term, LastLogIndex = 0 }); } else if (prevTerm != negotiation.PrevLogTerm) { if (_engine.Log.IsInfoEnabled) { _engine.Log.Info($"{ToString()}: Got a negotiation request with PrevLogTerm={negotiation.PrevLogTerm} while our PrevLogTerm={prevTerm}" + " will negotiate to find next matched index"); } // we now have a mismatch with the log position, and need to negotiate it with // the leader NegotiateMatchEntryWithLeaderAndApplyEntries(context, _connection, negotiation); } else { if (_engine.Log.IsInfoEnabled) { _engine.Log.Info($"{ToString()}: Got a negotiation request with identical PrevLogTerm will continue to steady state"); } // this (or the negotiation above) completes the negotiation process _connection.Send(context, new LogLengthNegotiationResponse { Status = LogLengthNegotiationResponse.ResponseStatus.Acceptable, Message = $"Found a log index / term match at {negotiation.PrevLogIndex} with term {prevTerm}", CurrentTerm = _term, LastLogIndex = negotiation.PrevLogIndex }); } _debugRecorder.Record("Matching Negotiation is over, waiting for snapshot"); _engine.Timeout.Defer(_connection.Source); // at this point, the leader will send us a snapshot message // in most cases, it is an empty snapshot, then start regular append entries // the reason we send this is to simplify the # of states in the protocol var snapshot = _connection.ReadInstallSnapshot(context); _debugRecorder.Record("Start receiving snapshot"); // reading the snapshot from network and committing it to the disk might take a long time. using (var cts = new CancellationTokenSource()) { KeepAliveAndExecuteAction(() => ReadAndCommitSnapshot(context, snapshot, cts.Token), cts, "ReadAndCommitSnapshot"); } _debugRecorder.Record("Snapshot was received and committed"); // notify the state machine, we do this in an async manner, and start // the operator in a separate thread to avoid timeouts while this is // going on using (var cts = new CancellationTokenSource()) { KeepAliveAndExecuteAction(() => _engine.SnapshotInstalledAsync(snapshot.LastIncludedIndex, cts.Token), cts, "SnapshotInstalledAsync"); } _debugRecorder.Record("Done with StateMachine.SnapshotInstalled"); _debugRecorder.Record("Snapshot installed"); //Here we send the LastIncludedIndex as our matched index even for the case where our lastCommitIndex is greater //So we could validate that the entries sent by the leader are indeed the same as the ones we have. _connection.Send(context, new InstallSnapshotResponse { Done = true, CurrentTerm = _term, LastLogIndex = snapshot.LastIncludedIndex }); _engine.Timeout.Defer(_connection.Source); }
private void NegotiateWithLeader(TransactionOperationContext context, LogLengthNegotiation negotiation) { // only the leader can send append entries, so if we accepted it, it's the leader if (_engine.Log.IsInfoEnabled) { _engine.Log.Info($"Follower {_engine.Tag}: Got a negotiation request for term {negotiation.Term} where our term is {_engine.CurrentTerm}"); } if (negotiation.Term > _engine.CurrentTerm) { _engine.FoundAboutHigherTerm(negotiation.Term); } long prevTerm; using (context.OpenReadTransaction()) { prevTerm = _engine.GetTermFor(context, negotiation.PrevLogIndex) ?? 0; } if (prevTerm != negotiation.PrevLogTerm) { if (_engine.Log.IsInfoEnabled) { _engine.Log.Info($"Follower {_engine.Tag}: Got a negotiation request with PrevLogTerm={negotiation.PrevLogTerm} while our PrevLogTerm={prevTerm}" + " will negotiate to find next matched index"); } // we now have a mismatch with the log position, and need to negotiate it with // the leader NegotiateMatchEntryWithLeaderAndApplyEntries(context, _connection, negotiation); } else { if (_engine.Log.IsInfoEnabled) { _engine.Log.Info($"Follower {_engine.Tag}: Got a negotiation request with identical PrevLogTerm will continue to steady state"); } // this (or the negotiation above) completes the negotiation process _connection.Send(context, new LogLengthNegotiationResponse { Status = LogLengthNegotiationResponse.ResponseStatus.Acceptable, Message = $"Found a log index / term match at {negotiation.PrevLogIndex} with term {prevTerm}", CurrentTerm = _engine.CurrentTerm, LastLogIndex = negotiation.PrevLogIndex }); } _engine.Timeout.Defer(_connection.Source); // at this point, the leader will send us a snapshot message // in most cases, it is an empty snapshot, then start regular append entries // the reason we send this is to simplify the # of states in the protocol var snapshot = _connection.ReadInstallSnapshot(context); using (context.OpenWriteTransaction()) { var lastCommitIndex = _engine.GetLastCommitIndex(context); if (snapshot.LastIncludedIndex < lastCommitIndex) { if (_engine.Log.IsInfoEnabled) { _engine.Log.Info( $"Follower {_engine.Tag}: Got installed snapshot with last index={snapshot.LastIncludedIndex} while our lastCommitIndex={lastCommitIndex}, will just ignore it"); } //This is okay to ignore because we will just get the commited entries again and skip them ReadInstallSnapshotAndIgnoreContent(context); } else if (InstallSnapshot(context)) { if (_engine.Log.IsInfoEnabled) { _engine.Log.Info( $"Follower {_engine.Tag}: Installed snapshot with last index={snapshot.LastIncludedIndex} with LastIncludedTerm={snapshot.LastIncludedTerm} "); } _engine.SetLastCommitIndex(context, snapshot.LastIncludedIndex, snapshot.LastIncludedTerm); _engine.ClearLogEntriesAndSetLastTruncate(context, snapshot.LastIncludedIndex, snapshot.LastIncludedTerm); } else { var lastEntryIndex = _engine.GetLastEntryIndex(context); if (lastEntryIndex < snapshot.LastIncludedIndex) { var message = $"The snapshot installation had failed because the last included index {snapshot.LastIncludedIndex} in term {snapshot.LastIncludedTerm} doesn't match the last entry {lastEntryIndex}"; if (_engine.Log.IsInfoEnabled) { _engine.Log.Info($"Follower {_engine.Tag}: {message}"); } throw new InvalidOperationException(message); } } // snapshot always has the latest topology if (snapshot.Topology == null) { const string message = "Expected to get topology on snapshot"; if (_engine.Log.IsInfoEnabled) { _engine.Log.Info($"Follower {_engine.Tag}: {message}"); } throw new InvalidOperationException(message); } using (var topologyJson = context.ReadObject(snapshot.Topology, "topology")) { if (_engine.Log.IsInfoEnabled) { _engine.Log.Info($"Follower {_engine.Tag}: topology on install snapshot: {topologyJson}"); } var topology = JsonDeserializationRachis <ClusterTopology> .Deserialize(topologyJson); RachisConsensus.SetTopology(_engine, context, topology); } context.Transaction.Commit(); } //Here we send the LastIncludedIndex as our matched index even for the case where our lastCommitIndex is greater //So we could validate that the entries sent by the leader are indeed the same as the ones we have. _connection.Send(context, new InstallSnapshotResponse { Done = true, CurrentTerm = _engine.CurrentTerm, LastLogIndex = snapshot.LastIncludedIndex }); _engine.Timeout.Defer(_connection.Source); // notify the state machine _engine.SnapshotInstalled(context, snapshot.LastIncludedIndex); _engine.Timeout.Defer(_connection.Source); }