/// <summary>
/// Tells every node that was present in the previous topology but is missing from the
/// requested one that it has been removed from the cluster.
/// </summary>
/// <param name="tcc">The topology change command that was just applied.</param>
/// <remarks>
/// This is a courtesy rather than a requirement: the cluster rejects messages from nodes
/// that are not considered to be part of it anyway.
/// </remarks>
private void OnTopologyChanged(TopologyChangeCommand tcc)
{
    // Without a previous topology there is nothing that could have been removed.
    if (tcc.Previous == null)
        return;

    var droppedNames = tcc.Previous.AllNodeNames
        .Except(tcc.Requested.AllNodeNames)
        .ToList();

    foreach (var droppedName in droppedNames)
    {
        var droppedNode = tcc.Previous.GetNodeByName(droppedName);
        if (droppedNode != null)
        {
            // First push the latest entries (which include the topology removal entry),
            // so the node can learn about its own removal from the log.
            SendEntriesToPeer(droppedNode);

            // Regardless of the above, also send an explicit disconnection notice so the
            // node can shut down gracefully if it is able to.
            var farewell = new DisconnectedFromCluster
            {
                From = Engine.Name,
                ClusterTopologyId = Engine.CurrentTopology.TopologyId,
                Term = Engine.PersistentState.CurrentTerm
            };
            Engine.Transport.Send(droppedNode, farewell);
        }
    }
}
/// <summary>
/// Decides whether a message belongs to this node's cluster topology.
/// </summary>
/// <param name="msg">The incoming message carrying the sender's cluster topology id.</param>
/// <returns>
/// <c>true</c> when the topology ids match, or when this node has no topology yet
/// (empty topology id and no voters) and therefore adopts the topology id of the message;
/// otherwise <c>false</c>.
/// </returns>
protected bool FromOurTopology(BaseMessage msg)
{
    if (msg.ClusterTopologyId == Engine.CurrentTopology.TopologyId)
        return true;

    // The ids differ. That is still acceptable if we have _no_ topology at all,
    // in which case the sender's topology id is accepted immediately.
    var hasNoTopology = Engine.CurrentTopology.TopologyId == Guid.Empty
                        && Engine.CurrentTopology.HasVoters == false;
    if (hasNoTopology == false)
        return false;

    var adoptCommand = new TopologyChangeCommand
    {
        Requested = new Topology(msg.ClusterTopologyId)
    };
    Engine.StartTopologyChange(adoptCommand);
    Engine.CommitTopologyChange(adoptCommand);
    return true;
}
/// <summary>
/// Begins a topology change: makes the requested topology the current one, marks a
/// topology change as being in progress and raises the <c>TopologyChanging</c> event.
/// </summary>
/// <param name="tcc">The topology change command whose <c>Requested</c> topology is applied.</param>
internal void StartTopologyChange(TopologyChangeCommand tcc)
{
    var requestedTopology = tcc.Requested;
    Interlocked.Exchange(ref _currentTopology, requestedTopology);

    // The completion source is not kept anywhere; the task only serves as a non-null
    // "change in progress" marker until CommitTopologyChange clears the field.
    var inProgressMarker = new TaskCompletionSource<object>().Task;
    Interlocked.Exchange(ref _changingTopology, inProgressMarker);

    OnTopologyChanging(tcc);
}
/// <summary>
/// Raises the <c>TopologyChanging</c> event. Exceptions thrown by subscribers are
/// logged and swallowed so they cannot interrupt the topology change.
/// </summary>
/// <param name="tcc">The topology change command being started (not passed to subscribers).</param>
private void OnTopologyChanging(TopologyChangeCommand tcc)
{
    // Take a snapshot of the delegate so a concurrent unsubscribe cannot null it under us.
    var subscribers = TopologyChanging;
    if (subscribers == null)
        return;

    try
    {
        subscribers();
    }
    catch (Exception e)
    {
        _log.Error("Error on raising TopologyChanging event", e);
    }
}
/// <summary>
/// Logs the notification and raises the <c>TopologyChanged</c> event with the given command.
/// Exceptions thrown by subscribers are logged and swallowed.
/// </summary>
/// <param name="cmd">The topology change command that was committed.</param>
protected virtual void OnTopologyChanged(TopologyChangeCommand cmd)
{
    _log.Info("OnTopologyChanged() - " + this.Name);

    // Take a snapshot of the delegate so a concurrent unsubscribe cannot null it under us.
    var subscribers = TopologyChanged;
    if (subscribers == null)
        return;

    try
    {
        subscribers(cmd);
    }
    catch (Exception e)
    {
        _log.Error("Error on raising TopologyChanged event", e);
    }
}
/// <summary>
/// Finishes a topology change: clears the "change in progress" marker and then either
/// parks this node as a leaderless follower (when it was removed from the topology) or
/// raises the topology-changed notification.
/// </summary>
/// <param name="tcc">The topology change command being committed.</param>
public void CommitTopologyChange(TopologyChangeCommand tcc)
{
    // The in-progress task is used to track topology changes from the outside, so it is
    // cleared _before_ OnTopologyChanged fires.
    Interlocked.Exchange(ref _changingTopology, null);

    // When no topology existed before and the command merely accepts a new topology id,
    // tcc.Previous is null; that is not a removal, so the node stays in the topology.
    var absentFromRequested = tcc.Requested.Contains(Name) == false;
    var removedFromTopology = absentFromRequested
                              && tcc.Previous != null
                              && tcc.Previous.Contains(Name);

    if (removedFromTopology)
    {
        _log.Debug("This node is being removed from topology, setting its state to follower, it will be idle until a leader will join it to the cluster again");
        CurrentLeader = null;
        SetState(RaftEngineState.Follower);
        // Note: the topology-changed notification is not raised on this path.
        return;
    }

    if (_log.IsInfoEnabled)
    {
        string previousNote;
        if (tcc.Previous == null)
            previousNote = ", Previous topology was null - perhaps it is setting topology for the first time?";
        else
            previousNote = String.Empty;

        _log.Info("Finished applying new topology: {0}{1}", _currentTopology, previousNote);
    }

    OnTopologyChanged(tcc);
}
/// <summary>
/// Starts a cluster topology change on the leader and appends the corresponding command.
/// </summary>
/// <param name="requested">The topology the cluster should move to.</param>
/// <returns>The task of the command's completion source.</returns>
/// <exception cref="InvalidOperationException">
/// Thrown when this node is not the leader, or when another topology change is already in progress.
/// </exception>
internal Task ModifyTopology(Topology requested)
{
    if (State != RaftEngineState.Leader)
        throw new InvalidOperationException("Cannot modify topology from a non leader node, current leader is: " + (CurrentLeader ?? "no leader"));

    var changeCommand = new TopologyChangeCommand
    {
        Completion = new TaskCompletionSource<object>(),
        Requested = requested,
        Previous = _currentTopology,
        BufferCommand = false,
    };

    // Atomically claim the "change in progress" slot; a non-null previous value means
    // somebody else already holds it.
    var previousMarker = Interlocked.CompareExchange(ref _changingTopology, changeCommand.Completion.Task, null);
    var anotherChangeInProgress = previousMarker != null;
    if (anotherChangeInProgress)
        throw new InvalidOperationException("Cannot change the cluster topology while another topology change is in progress");

    try
    {
        _log.Debug("Topology change started on leader");
        StartTopologyChange(changeCommand);
        AppendCommand(changeCommand);
        return changeCommand.Completion.Task;
    }
    catch (Exception)
    {
        // Release the slot so a later topology change is not blocked by this failure.
        Interlocked.Exchange(ref _changingTopology, null);
        throw;
    }
}
/// <summary>
/// Handles a notification that this node was disconnected from the cluster.
/// The notification is honoured only when it comes from our topology, is not from an older
/// term and was sent by the current leader; in that case the topology is reset to an empty
/// one with the same id and the node moves to the follower state.
/// </summary>
/// <param name="req">The disconnection notification.</param>
public void Handle(DisconnectedFromCluster req)
{
    if (FromOurTopology(req) == false)
    {
        _log.Info("Got a disconnection notification message outside my cluster topology (id: {0}), ignoring", req.ClusterTopologyId);
        return;
    }

    var currentTerm = Engine.PersistentState.CurrentTerm;
    if (req.Term < currentTerm)
    {
        _log.Info("Got disconnection notification from an older term, ignoring");
        return;
    }

    var sender = req.From;
    if (sender != Engine.CurrentLeader)
    {
        _log.Info("Got disconnection notification from {0}, who isn't the current leader, ignoring.", sender);
        return;
    }

    _log.Warn("Got disconnection notification from the leader, clearing topology and moving to idle follower state");

    // Replace the topology with a fresh one that only carries the cluster topology id.
    var resetCommand = new TopologyChangeCommand
    {
        Requested = new Topology(req.ClusterTopologyId)
    };
    Engine.PersistentState.SetCurrentTopology(resetCommand.Requested, 0L);
    Engine.StartTopologyChange(resetCommand);
    Engine.CommitTopologyChange(resetCommand);
    Engine.SetState(RaftEngineState.Follower);
}
/// <summary>
/// Handles a request to install a snapshot sent by another node.
/// </summary>
/// <param name="context">Message context used to schedule work on the event loop and to send the final reply.</param>
/// <param name="req">The install snapshot request (term, last included index/term, topology).</param>
/// <param name="stream">The stream the snapshot is read from.</param>
/// <returns>
/// A failure response when a snapshot installation is already running, when the request does
/// not belong to our cluster topology, or when the snapshot is older than our log.
/// Otherwise <c>null</c>: the installation runs in the background and the actual response is
/// sent later through <c>context.Reply</c>.
/// </returns>
public override InstallSnapshotResponse Handle(MessageContext context, InstallSnapshotRequest req, Stream stream)
{
    // NOTE(review): only the "snapshot is too old" branch disposes the stream; the other
    // early returns and the installation path leave it to someone else. Stream ownership
    // is not visible from here - confirm whether those paths should dispose it as well.
    if (_installingSnapshot != null)
    {
        return new InstallSnapshotResponse
        {
            Success = false,
            Message = "Cannot install snapshot because we are already installing a snapshot",
            CurrentTerm = Engine.PersistentState.CurrentTerm,
            From = Engine.Name,
            ClusterTopologyId = Engine.CurrentTopology.TopologyId,
            LastLogIndex = Engine.PersistentState.LastLogEntry().Index
        };
    }

    if (FromOurTopology(req) == false)
    {
        _log.Info("Got an install snapshot message outside my cluster topology (id: {0}), ignoring", req.ClusterTopologyId);

        return new InstallSnapshotResponse
        {
            Success = false,
            Message = "Cannot install snapshot because the cluster topology id doesn't match, mine is: " + Engine.CurrentTopology.TopologyId,
            CurrentTerm = Engine.PersistentState.CurrentTerm,
            From = Engine.Name,
            ClusterTopologyId = Engine.CurrentTopology.TopologyId,
            LastLogIndex = Engine.PersistentState.LastLogEntry().Index
        };
    }

    var lastLogEntry = Engine.PersistentState.LastLogEntry();
    if (req.Term < lastLogEntry.Term || req.LastIncludedIndex < lastLogEntry.Index)
    {
        stream.Dispose();

        return new InstallSnapshotResponse
        {
            From = Engine.Name,
            ClusterTopologyId = Engine.CurrentTopology.TopologyId,
            CurrentTerm = lastLogEntry.Term,
            LastLogIndex = lastLogEntry.Index,
            Message = string.Format("Snapshot is too old (term {0} index {1}) while we have (term {2} index {3})",
                req.Term, req.LastIncludedIndex, lastLogEntry.Term, lastLogEntry.Index),
            Success = false
        };
    }

    _log.Info("Received InstallSnapshotRequest from {0} until term {1} / {2}", req.From, req.LastIncludedTerm, req.LastIncludedIndex);

    Engine.OnSnapshotInstallationStarted();

    // this can be a long running task
    _installingSnapshot = Task.Run(() =>
    {
        try
        {
            Engine.StateMachine.ApplySnapshot(req.LastIncludedTerm, req.LastIncludedIndex, stream);
            Engine.PersistentState.MarkSnapshotFor(req.LastIncludedIndex, req.LastIncludedTerm, int.MaxValue);
            Engine.PersistentState.SetCurrentTopology(req.Topology, req.LastIncludedIndex);

            var tcc = new TopologyChangeCommand { Requested = req.Topology };
            Engine.StartTopologyChange(tcc);
            Engine.CommitTopologyChange(tcc);
        }
        catch (Exception e)
        {
            // Fixed: the first placeholder is the snapshot term, not the index a second time.
            _log.Warn(string.Format("Failed to install snapshot term {0} index {1}", req.LastIncludedTerm, req.LastIncludedIndex), e);

            context.ExecuteInEventLoop(() =>
            {
                // Allow a later install snapshot request to be attempted.
                _installingSnapshot = null;

                // Fixed: report the failure to the sender. Previously execution fell through
                // to the success path below and replied Success = true for a failed install.
                context.Reply(new InstallSnapshotResponse
                {
                    Success = false,
                    Message = "Failed to install snapshot: " + e.Message,
                    CurrentTerm = Engine.PersistentState.CurrentTerm,
                    From = Engine.Name,
                    ClusterTopologyId = Engine.CurrentTopology.TopologyId,
                    LastLogIndex = Engine.PersistentState.LastLogEntry().Index
                });
            });

            // NOTE(review): Engine.OnSnapshotInstallationEnded is intentionally not called on
            // failure so that a retry can be handled here - confirm this matches the engine's
            // expectations after OnSnapshotInstallationStarted.
            return;
        }

        // we are doing it this way to ensure that we are single threaded
        context.ExecuteInEventLoop(() =>
        {
            Engine.UpdateCurrentTerm(req.Term, req.From); // implicitly put us in follower state
            _log.Info("Updating the commit index to the snapshot last included index of {0}", req.LastIncludedIndex);
            Engine.OnSnapshotInstallationEnded(req.Term);

            context.Reply(new InstallSnapshotResponse
            {
                From = Engine.Name,
                ClusterTopologyId = Engine.CurrentTopology.TopologyId,
                CurrentTerm = req.Term,
                LastLogIndex = req.LastIncludedIndex,
                Success = true
            });
        });
    });

    return null;
}