/// <summary>
/// Finishes installing a snapshot received from the leader
/// (InstallSnapshot receiver steps 6-8 of the Raft paper).
/// </summary>
/// <param name="s">Snapshot file; its Name is passed to the state machine for loading.</param>
/// <param name="r">
/// The InstallSnapshot request. Its argument supplies LastIncludedIndex,
/// LastIncludedTerm and the encoded LastIncludedLog.
/// </param>
public void EndReceiveInstallSnapshot(FileStream s, InstallSnapshot r)
{
    lock (Raft)
    {
        // 6. If existing log entry has same index and term as snapshot's
        //    last included entry, retain log entries following it and reply.
        var last = ReadLog(r.Argument.LastIncludedIndex);
        if (null != last && last.Term == r.Argument.LastIncludedTerm)
        {
            // Keeping the whole log is the simpler choice here: trimming the
            // prefix could drop entries that have not been applied yet.
            return;
        }

        // 7. Discard the entire log.
        // Deleting everything would leave the next AppendEntries without a
        // prev-log to match. So the last chunk of InstallSnapshot carries
        // LastIncludedLog: the receiver clears its log and keeps only that
        // entry (similar to the Index=0 entry inserted at initialization).
        var lastIncludedLog = RaftLog.Decode(r.Argument.LastIncludedLog, Raft.StateMachine.LogFactory);

        // SaveLog overwrites LastIndex with the saved entry's index, so the old
        // tail must be captured first. Otherwise the suffix range below would be
        // (Index + 1 .. Index) and stale entries after the snapshot would survive.
        var oldLastIndex = LastIndex;
        SaveLog(lastIncludedLog);

        // A follower has no concurrent requests to serve, so delete inside the lock.
        RemoveLogReverse(lastIncludedLog.Index - 1, FirstIndex);
        // NOTE(review): assumes RemoveLogAndCancelStart treats (start, end) as an
        // inclusive range and is a no-op when end < start — confirm.
        RemoveLogAndCancelStart(lastIncludedLog.Index + 1, oldLastIndex);

        // The snapshot entry is now the only log entry, and it is committed and applied.
        LastIndex = lastIncludedLog.Index;
        FirstIndex = lastIncludedLog.Index;
        CommitIndex = FirstIndex;
        LastApplied = FirstIndex;

        // 8. Reset state machine using snapshot contents (and load
        //    snapshot's cluster configuration).
        Raft.StateMachine.LoadFromSnapshot(s.Name);
        logger.Debug("{0} EndReceiveInstallSnapshot Path={1}", Raft.Name, s.Name);
    }
}
/// <summary>
/// Builds a RaftLog bound to <paramref name="logFactory"/> and fills it by
/// decoding <paramref name="data"/>.
/// </summary>
/// <param name="data">Encoded form of a RaftLog.</param>
/// <param name="logFactory">Factory handed to the RaftLog constructor.</param>
/// <returns>The decoded RaftLog.</returns>
public static RaftLog Decode(Binary data, Func<int, Log> logFactory)
{
    var decoded = new RaftLog(logFactory);
    data.Decode(decoded);
    return decoded;
}
/// <summary>
/// Appends a new log entry at the next index under the current term, then
/// triggers replication to every connector.
/// </summary>
/// <param name="log">The application log to wrap in a RaftLog and persist.</param>
/// <param name="WaitApply">
/// When true, a completion source is registered under the new index and this
/// call blocks until it is completed.
/// </param>
/// <exception cref="TaskCanceledException">Raft.IsLeader is false (fail fast).</exception>
public void AppendLog(Log log, bool WaitApply = true)
{
    if (!Raft.IsLeader)
    {
        throw new TaskCanceledException(); // fail fast
    }

    TaskCompletionSource<int> applied = null;
    lock (Raft)
    {
        LastIndex += 1;
        var entry = new RaftLog(Term, LastIndex, log);
        if (WaitApply)
        {
            applied = new TaskCompletionSource<int>();
            bool registered = WaitApplyFutures.TryAdd(entry.Index, applied);
            if (!registered)
            {
                throw new Exception("Impossible");
            }
        }
        SaveLog(entry);
    }

    // Broadcast to the followers; majority confirmation is awaited asynchronously.
    Raft.Server.Config.ForEachConnector(
        (connector) => TrySendAppendEntries(connector as Server.ConnectorEx, null));

    if (WaitApply)
    {
        applied.Task.Wait();
    }
}
/// <summary>
/// Follower-side handling of an AppendEntries request
/// (receiver rules 1-5 of the Raft paper).
/// </summary>
/// <param name="r">The AppendEntries request; its Result is filled in and sent back.</param>
/// <returns>
/// Procedure.Success when the entries were accepted; Procedure.LogicError when
/// the request was rejected by the term check or the prev-log check.
/// </returns>
internal int FollowerOnAppendEntries(AppendEntries r)
{
    LeaderActiveTime = Zeze.Util.Time.NowUnixMillis;
    r.Result.Term = Term;
    r.Result.Success = false; // rejected unless every check below passes

    if (r.Argument.Term < Term)
    {
        // 1. Reply false if term < currentTerm (§5.1)
        r.SendResult();
        return Procedure.LogicError;
    }

    var prevLog = ReadLog(r.Argument.PrevLogIndex);
    if (prevLog == null || prevLog.Term != r.Argument.PrevLogTerm)
    {
        // 2. Reply false if log doesn't contain an entry
        //    at prevLogIndex whose term matches prevLogTerm (§5.3)
        r.SendResult();
        return Procedure.LogicError;
    }

    foreach (var raftLogData in r.Argument.Entries)
    {
        var copyLog = RaftLog.Decode(raftLogData, Raft.StateMachine.LogFactory);
        var conflictCheck = ReadLog(copyLog.Index);
        if (null != conflictCheck)
        {
            if (conflictCheck.Term != copyLog.Term)
            {
                // 3. If an existing entry conflicts with a new one
                //    (same index but different terms), delete the
                //    existing entry and all that follow it (§5.3).
                RemoveLogAndCancelStart(conflictCheck.Index, LastIndex);
                LastIndex = prevLog.Index;

                // The conflicting slot is now empty, so the leader's entry must be
                // stored. Without this the log would have a hole at this index
                // while the follower still reports success.
                SaveLog(copyLog);
            }
            // Same index and same term: the entry is already present.
        }
        else
        {
            // 4. Append any new entries not already in the log
            SaveLog(copyLog);
        }

        // Reuse this variable so that, when a later entry conflicts,
        // it points exactly at the entry preceding the conflict.
        prevLog = copyLog;
    }

    // 5. If leaderCommit > commitIndex,
    //    set commitIndex = min(leaderCommit, index of last new entry)
    if (r.Argument.LeaderCommit > CommitIndex)
    {
        CommitIndex = Math.Min(r.Argument.LeaderCommit, LastRaftLog().Index);
        TryApply(ReadLog(CommitIndex));
    }

    r.Result.Success = true;
    logger.Debug("{0}: {1}", Raft.Name, r);
    r.SendResultCode(0);
    return Procedure.Success;
}
/// <summary>
/// Looks up the log entry stored under exactly <paramref name="index"/>.
/// </summary>
/// <param name="index">Log index; encoded with WriteLong8 to form the storage key.</param>
/// <returns>The decoded entry, or null when nothing is stored under that key.</returns>
private RaftLog ReadLog(long index)
{
    var keyBuffer = ByteBuffer.Allocate();
    keyBuffer.WriteLong8(index);

    var stored = Logs.Get(keyBuffer.Bytes, keyBuffer.Size);
    return stored == null
        ? null
        : RaftLog.Decode(new Binary(stored), Raft.StateMachine.LogFactory);
}
/// <summary>
/// Persists <paramref name="log"/> under its index and records that index as LastIndex.
/// </summary>
/// <param name="log">Entry to store; the key is its Index encoded with WriteLong8.</param>
private void SaveLog(RaftLog log)
{
    // Remember the last index; the next index is generated from it.
    LastIndex = log.Index;

    var keyBuffer = ByteBuffer.Allocate();
    keyBuffer.WriteLong8(log.Index);
    var payload = log.Encode();

    // The write is issued with the Sync option enabled.
    var syncWrite = new WriteOptions().SetSync(true);

    // Key and value offsets must be 0.
    Logs.Put(keyBuffer.Bytes, keyBuffer.Size, payload.Bytes, payload.Size, null, syncWrite);
}
/// <summary>
/// Sends the next batch of log entries to one follower, unless a different
/// request is already outstanding for it or it is not ready to receive logs.
/// </summary>
/// <param name="connector">The follower connection to replicate to.</param>
/// <param name="pending">
/// The request the caller believes is outstanding on the connector (null when
/// none). The call does nothing if it differs from connector.Pending.
/// </param>
private void TrySendAppendEntries(Server.ConnectorEx connector, AppendEntries pending)
{
    lock (Raft)
    {
        // Strictly one update for all followers would be enough; not optimized here.
        AppendLogActiveTime = Util.Time.NowUnixMillis;

        if (connector.Pending != pending)
        {
            return;
        }

        // Clear first, so the early returns below need not clear it themselves.
        connector.Pending = null;

        // Skip when:
        //  - the handshake is not done (heartbeat will retry);
        //  - a snapshot is being installed: replicating now would certainly fail
        //    (an optimization, it also works without this check);
        //  - the follower already has everything up to LastIndex.
        if (!connector.IsHandshakeDone
            || connector.InstallSnapshotting
            || connector.NextIndex > LastIndex)
        {
            return;
        }

        var nextLog = ReadLogReverse(connector.NextIndex);
        if (nextLog.Index == FirstIndex)
        {
            // Reached the head of the log: there is no prev-log, so entries cannot
            // be replicated. This usually happens when the leader took a snapshot
            // while the follower's log is older; a fresh follower is the same.
            StartInstallSnapshot(connector);
            return;
        }

        // Indexes always increase, but a step of exactly 1 is not guaranteed;
        // taking the index from the entry itself handles other steps too.
        connector.NextIndex = nextLog.Index;

        var request = new AppendEntries();
        connector.Pending = request;
        request.Argument.Term = Term;
        request.Argument.LeaderId = Raft.Name;
        request.Argument.LeaderCommit = CommitIndex;

        // Expected to always be found.
        var prevLog = ReadLogReverse(nextLog.Index - 1);
        request.Argument.PrevLogIndex = prevLog.Index;
        request.Argument.PrevLogTerm = prevLog.Term;

        // Cap the number of entries per request. NOTE: not required by raft.
        int remaining = Raft.RaftConfig.MaxAppendEntiresCount;
        RaftLog lastCopyLog = nextLog;
        var cursor = nextLog;
        while (remaining > 0 && null != cursor && cursor.Index <= LastIndex)
        {
            lastCopyLog = cursor;
            request.Argument.Entries.Add(new Binary(cursor.Encode()));
            cursor = ReadLogStart(cursor.Index + 1);
            --remaining;
        }
        request.Argument.LastEntryIndex = lastCopyLog.Index;

        bool sent = request.Send(
            connector.Socket,
            (p) => ProcessAppendEntriesResult(connector, p),
            Raft.RaftConfig.AppendEntriesTimeout);
        if (!sent)
        {
            connector.Pending = null;
            // Heartbeat will retry.
        }
    }
}
/// <summary>
/// Scans forward from <paramref name="startIndex"/> and returns the last
/// consecutive entry that enough connectors have matched.
/// </summary>
/// <param name="startIndex">First index to examine.</param>
/// <returns>
/// The highest consecutive entry whose match count reached
/// Raft.RaftConfig.HalfCount, or null when not even the first entry did.
/// </returns>
private RaftLog FindMaxMajorityLog(long startIndex)
{
    RaftLog lastMajorityLog = null;
    for (long index = startIndex; index <= LastIndex; /**/)
    {
        var raftLog = ReadLogStart(index);
        if (null == raftLog)
        {
            break;
        }
        index = raftLog.Index + 1;

        int matchedCount = 0;
        Raft.Server.Config.ForEachConnector(
            (c) =>
            {
                var cex = c as Server.ConnectorEx;
                if (cex.MatchIndex >= raftLog.Index)
                {
                    ++matchedCount;
                }
            });

        // No majority: stop and return the previous majority entry (may be null).
        // When the count equals HalfCount, adding ourselves makes it a majority.
        if (matchedCount < Raft.RaftConfig.HalfCount)
        {
            break;
        }

        // Record the entry only after it passed the majority check. Recording it
        // earlier would return an entry that failed the check.
        lastMajorityLog = raftLog;
    }
    return lastMajorityLog;
}

/// <summary>
/// Handles a successful AppendEntries result: advances the follower's
/// NextIndex/MatchIndex, then tries to advance CommitIndex and apply.
/// </summary>
/// <param name="rpc">The completed request; LastEntryIndex is the last entry it carried.</param>
/// <param name="connector">The follower connection the request was sent on.</param>
private void TryCommit(AppendEntries rpc, Server.ConnectorEx connector)
{
    connector.NextIndex = rpc.Argument.LastEntryIndex + 1;
    connector.MatchIndex = rpc.Argument.LastEntryIndex;

    // Result of an old AppendEntries whose entries are already committed:
    // nothing more to do. NOTE: not required, just a small optimization.
    if (rpc.Argument.LastEntryIndex <= CommitIndex)
    {
        return;
    }

    // Rules for Servers
    // If there exists an N such that N > commitIndex, a majority
    // of matchIndex[i] >= N, and log[N].term == currentTerm:
    // set commitIndex = N (§5.3, §5.4).
    // TODO: leader CommitIndex initialization issue.
    var raftLog = FindMaxMajorityLog(CommitIndex + 1);
    if (null == raftLog)
    {
        return; // no entry has reached a majority
    }
    if (raftLog.Term != Term)
    {
        // An uncommitted entry from a previous term reached a majority now.
        // Do not commit it directly; it is committed along the way once an
        // entry of the current term advances.
        return;
    }
    CommitIndex = raftLog.Index;
    TryApply(raftLog);
}

/// <summary>
/// Applies every entry after LastApplied up to and including
/// <paramref name="lastApplyableLog"/> to the state machine, completing the
/// waiter registered for each applied index.
/// </summary>
/// <param name="lastApplyableLog">Last entry to apply; null is logged as an error and ignored.</param>
private void TryApply(RaftLog lastApplyableLog)
{
    if (null == lastApplyableLog)
    {
        logger.Error("lastApplyableLog is null.");
        return;
    }
    for (long index = LastApplied + 1; index <= lastApplyableLog.Index; /**/)
    {
        var raftLog = ReadLogStart(index);
        if (null == raftLog)
        {
            return; // no further entry found
        }
        index = raftLog.Index + 1;

        if (Raft.RaftConfig.AutoKeyLocalStep > 0 && raftLog.Log.UniqueRequestId > 0)
        {
            // Used to prevent a request from being executed twice.
            // Requests of each Raft.Agent must be processed in order.
            // see Net.cs Server.DispatchProtocol
            var appInstance = raftLog.Log.UniqueRequestId % Raft.RaftConfig.AutoKeyLocalStep;
            LastAppliedAppRpcSessionId[appInstance] = raftLog.Log.UniqueRequestId;
        }
        raftLog.Log.Apply(Raft.StateMachine);
        LastApplied = raftLog.Index; // updated here because the loop may exit early

        if (WaitApplyFutures.TryRemove(raftLog.Index, out var future))
        {
            future.SetResult(0);
        }
    }
}
/// <summary>
/// Opens the "rafts" and "logs" databases under Raft.RaftConfig.DbHome and
/// restores Term, VoteFor, LastIndex, FirstIndex, LastApplied and CommitIndex.
/// </summary>
/// <param name="raft">The owning Raft instance; supplies configuration and the log factory.</param>
public LogSequence(Raft raft)
{
    Raft = raft;
    var dbOptions = new DbOptions().SetCreateIfMissing(true);

    Rafts = RocksDb.Open(dbOptions, Path.Combine(Raft.RaftConfig.DbHome, "rafts"));

    // Term: stored under the key encoded from int 0; 0 when absent.
    var termKeyBuffer = ByteBuffer.Allocate();
    termKeyBuffer.WriteInt(0);
    RaftsTermKey = termKeyBuffer.Copy();
    var storedTerm = Rafts.Get(RaftsTermKey);
    Term = storedTerm != null ? ByteBuffer.Wrap(storedTerm).ReadLong() : 0;

    // VoteFor: stored under the key encoded from int 1; empty when absent.
    var voteKeyBuffer = ByteBuffer.Allocate();
    voteKeyBuffer.WriteInt(1);
    RaftsVoteForKey = voteKeyBuffer.Copy();
    var storedVote = Rafts.Get(RaftsVoteForKey);
    VoteFor = storedVote != null ? ByteBuffer.Wrap(storedVote).ReadString() : string.Empty;

    Logs = RocksDb.Open(dbOptions, Path.Combine(Raft.RaftConfig.DbHome, "logs"));
    {
        // Last log index comes from the last stored entry.
        using var tail = Logs.NewIterator();
        tail.SeekToLast();
        if (!tail.Valid())
        {
            // Empty log: insert one entry at index 0 to serve as prev-log.
            SaveLog(new RaftLog(Term, 0, new HeartbeatLog()));
            LastIndex = 0;
        }
        else
        {
            var tailLog = RaftLog.Decode(new Binary(tail.Value()), Raft.StateMachine.LogFactory);
            LastIndex = tailLog.Index;
        }

        // First log index comes from the first stored entry.
        using var head = Logs.NewIterator();
        head.SeekToFirst();
        var headLog = RaftLog.Decode(new Binary(head.Value()), Raft.StateMachine.LogFactory);
        FirstIndex = headLog.Index;

        // NOTE: after a snapshot FirstIndex advances and no longer starts from 0.
        LastApplied = FirstIndex;
        CommitIndex = FirstIndex;
    }
}