/// <summary>
/// Decides whether the coordinator has to compute a new set of resource assignments.
/// </summary>
/// <param name="clients">The currently active clients.</param>
/// <param name="resources">The resources znode, including the existing assignments.</param>
/// <returns>
/// <c>true</c> when any of the checks below finds that the existing assignments cannot be kept;
/// <c>false</c> when all existing assignments are still valid.
/// </returns>
private bool IsRebalancingRequired(ClientsZnode clients, ResourcesZnode resources)
{
    // if this is the first rebalancing as coordinator or the last one was not successful
    // then rebalancing is required
    if (this.store.GetAssignmentStatus() == AssignmentStatus.NoAssignmentYet
        || !lastRebalancingResult.HasValue
        || lastRebalancingResult.Value != RebalancingResult.Complete)
    {
        return true;
    }

    // any change to resources requires a rebalancing
    if (resources.HasResourceChange())
    {
        return true;
    }

    // given a client was either added or removed:
    // if there are less clients than resources then we require a rebalancing
    if (clients.ClientPaths.Count < resources.Resources.Count)
    {
        return true;
    }

    // given we have an equal or greater number of clients than resources:
    // if an existing client is currently assigned more than one resource we require a rebalancing
    if (resources.ResourceAssignments.Assignments.GroupBy(x => x.ClientId).Any(x => x.Count() > 1))
    {
        return true;
    }

    // given all existing assignments are one client to one resource:
    // if any client for the existing assignments is no longer around then we require a rebalancing.
    // A hash set keeps the membership test constant-time instead of scanning the client list
    // once per assignment.
    var activeClientIds = new HashSet<string>(clients.ClientPaths.Select(GetClientId), StringComparer.Ordinal);
    foreach (var assignment in resources.ResourceAssignments.Assignments)
    {
        if (!activeClientIds.Contains(assignment.ClientId))
        {
            return true;
        }
    }

    // otherwise no rebalancing is required
    return false;
}
/// <summary>
/// Stops the follower's current activity, reads the latest assignments and starts the
/// follower again with the resources assigned to this client.
/// </summary>
/// <param name="rebalancingToken">Token checked before the start actions are invoked.</param>
/// <returns>
/// <see cref="RebalancingResult.Cancelled"/> when the token is cancelled before the start
/// actions run; otherwise <see cref="RebalancingResult.Complete"/>.
/// </returns>
private async Task<RebalancingResult> ProcessStatusChangeAsync(CancellationToken rebalancingToken)
{
    await store.InvokeOnStopActionsAsync(clientId, "Follower");

    ResourcesZnode znode = await zooKeeperService.GetResourcesAsync(null, null);

    // Collect the resources whose assignment names this client.
    List<string> ownResources =
        (from assignment in znode.ResourceAssignments.Assignments
         where assignment.ClientId.Equals(clientId)
         select assignment.Resource).ToList();

    bool delayConfigured = onStartDelay.Ticks > 0;
    if (delayConfigured)
    {
        logger.Info(clientId, $"Follower - Delaying on start for {(int)onStartDelay.TotalMilliseconds}ms");
        await WaitFor(onStartDelay, rebalancingToken);
    }

    // Do not start anything if the rebalancing was cancelled in the meantime.
    if (rebalancingToken.IsCancellationRequested)
    {
        return RebalancingResult.Cancelled;
    }

    await store.InvokeOnStartActionsAsync(clientId, "Follower", ownResources, rebalancingToken, followerToken);
    return RebalancingResult.Complete;
}
/// <summary>
/// Attempts to take over the coordinator role: increments and watches the epoch, watches
/// the nodes, records the current resources znode version and queues a rebalancing event.
/// </summary>
/// <param name="currentEpoch">The epoch value passed to the epoch increment.</param>
/// <returns>
/// <see cref="BecomeCoordinatorResult.Ok"/> on success,
/// <see cref="BecomeCoordinatorResult.StaleEpoch"/> when a <c>ZkStaleVersionException</c> is caught,
/// <see cref="BecomeCoordinatorResult.Error"/> when a <c>ZkInvalidOperationException</c> is caught.
/// </returns>
public async Task<BecomeCoordinatorResult> BecomeCoordinatorAsync(int currentEpoch)
{
    try
    {
        ignoreWatches = false;

        await zooKeeperService.IncrementAndWatchEpochAsync(currentEpoch, this);
        await zooKeeperService.WatchNodesAsync(this);

        // Remember the resources version seen at takeover.
        ResourcesZnode currentResources = await zooKeeperService.GetResourcesAsync(this, null);
        resourcesVersion = currentResources.Version;
    }
    catch (ZkStaleVersionException staleEx)
    {
        logger.Error(clientId, "Could not become coordinator as a stale version number was used", staleEx);
        return BecomeCoordinatorResult.StaleEpoch;
    }
    catch (ZkInvalidOperationException invalidOpEx)
    {
        logger.Error(clientId, "Could not become coordinator as an invalid ZooKeeper operation occurred", invalidOpEx);
        return BecomeCoordinatorResult.Error;
    }

    // Taking over as coordinator always queues a rebalancing.
    events.Add(CoordinatorEvent.RebalancingTriggered);

    return BecomeCoordinatorResult.Ok;
}
/// <summary>
/// Distributes the resources over the clients in round-robin order, writes the assignments
/// to the resources znode, sets the status to resources granted and starts the coordinator's
/// own share of the resources.
/// </summary>
/// <param name="rebalancingToken">Token checked between the steps of this phase.</param>
/// <param name="resources">Resources znode whose assignments are replaced and written back.</param>
/// <param name="clients">Clients that receive the resources.</param>
/// <returns>
/// <see cref="RebalancingResult.Cancelled"/> when the token is found cancelled at one of the
/// checkpoints; otherwise <see cref="RebalancingResult.Complete"/>.
/// </returns>
private async Task<RebalancingResult> AssignResourcesPhaseAsync(CancellationToken rebalancingToken, ResourcesZnode resources, ClientsZnode clients)
{
    logger.Info(clientId, "Coordinator - Assign resources to clients");

    // Hand out resources one at a time, cycling through the client list.
    var pending = new Queue<string>(resources.Resources);
    var assignments = new List<ResourceAssignment>();
    int nextClient = 0;
    while (pending.Count > 0)
    {
        var assignment = new ResourceAssignment
        {
            ClientId = GetClientId(clients.ClientPaths[nextClient]),
            Resource = pending.Dequeue()
        };
        assignments.Add(assignment);

        // wrap around to the first client after the last one
        nextClient = (nextClient + 1) % clients.ClientPaths.Count;
    }

    // write assignments back to resources znode
    resources.ResourceAssignments.Assignments = assignments;
    resourcesVersion = await zooKeeperService.SetResourcesAsync(resources);

    if (rebalancingToken.IsCancellationRequested)
    {
        return RebalancingResult.Cancelled;
    }

    // Publish the resources-granted status and keep the returned version.
    status.RebalancingStatus = RebalancingStatus.ResourcesGranted;
    status.Version = await zooKeeperService.SetStatus(status);

    if (onStartDelay.Ticks > 0)
    {
        logger.Info(clientId, $"Coordinator - Delaying on start for {(int)this.onStartDelay.TotalMilliseconds}ms");
        await WaitFor(onStartDelay, rebalancingToken);
    }

    if (rebalancingToken.IsCancellationRequested)
    {
        return RebalancingResult.Cancelled;
    }

    // The coordinator starts the resources that were assigned to itself.
    List<string> ownResources =
        (from assignment in assignments
         where assignment.ClientId == clientId
         select assignment.Resource).ToList();

    await store.InvokeOnStartActionsAsync(clientId, "Coordinator", ownResources, rebalancingToken, coordinatorToken);

    return rebalancingToken.IsCancellationRequested
        ? RebalancingResult.Cancelled
        : RebalancingResult.Complete;
}
/// <summary>
/// Reads the current resource assignments and queues a follower rebalancing event when at
/// least one resource is assigned to this client.
/// </summary>
private async Task CheckForRebalancingAsync()
{
    ResourcesZnode znode = await zooKeeperService.GetResourcesAsync(null, null);

    // Resources whose assignment names this client.
    List<string> ownResources =
        (from assignment in znode.ResourceAssignments.Assignments
         where assignment.ClientId.Equals(clientId)
         select assignment.Resource).ToList();

    if (ownResources.Count == 0)
    {
        return;
    }

    events.Add(FollowerEvent.RebalancingTriggered);
}
/// <summary>
/// Runs one coordinator rebalancing: reads the clients and resources, skips the work when
/// the existing assignments are still valid, otherwise assigns the resources round-robin,
/// writes them back, stops the coordinator's current activity and starts its new share.
/// </summary>
/// <param name="rebalancingToken">Token checked between the steps of the rebalancing.</param>
/// <returns>
/// <see cref="RebalancingResult.Cancelled"/> when the token is found cancelled at one of the
/// checkpoints; <see cref="RebalancingResult.Complete"/> when no rebalancing was required or
/// all steps were executed.
/// </returns>
/// <exception cref="ZkStaleVersionException">
/// The resources znode version differs from the version this coordinator last recorded.
/// </exception>
private async Task<RebalancingResult> RebalanceAsync(CancellationToken rebalancingToken)
{
    logger.Info(clientId, "Coordinator - Get clients and resources list");
    ClientsZnode clients = await zooKeeperService.GetActiveClientsAsync();
    ResourcesZnode resources = await zooKeeperService.GetResourcesAsync(null, null);

    if (resources.Version != resourcesVersion)
    {
        throw new ZkStaleVersionException(
            "Resources znode version does not match expected value, indicates another client has been made coordinator and is executing a rebalancing.");
    }

    if (rebalancingToken.IsCancellationRequested)
    {
        return RebalancingResult.Cancelled;
    }

    // if no resources were changed and there are more clients than resources then check
    // to see if rebalancing is necessary. If existing assignments are still valid then
    // a new client or the loss of a client with no assignments need not trigger a rebalancing
    if (!IsRebalancingRequired(clients, resources))
    {
        logger.Info(clientId,
            "Coordinator - No rebalancing required. No resource change. No change to existing clients. More clients than resources.");
        return RebalancingResult.Complete;
    }

    logger.Info(clientId,
        $"Coordinator - Assign resources ({string.Join(",", resources.Resources)}) to clients ({string.Join(",", clients.ClientPaths.Select(GetClientId))})");

    // Hand out resources one at a time, cycling through the client list.
    Queue<string> resourcesToAssign = new(resources.Resources);
    List<ResourceAssignment> resourceAssignments = new();
    int clientIndex = 0;
    while (resourcesToAssign.Count > 0)
    {
        resourceAssignments.Add(new ResourceAssignment
        {
            ClientId = GetClientId(clients.ClientPaths[clientIndex]),
            Resource = resourcesToAssign.Dequeue()
        });

        clientIndex++;
        if (clientIndex >= clients.ClientPaths.Count)
        {
            clientIndex = 0;
        }
    }

    // write assignments back to resources znode
    resources.ResourceAssignments.Assignments = resourceAssignments;
    resourcesVersion = await zooKeeperService.SetResourcesAsync(resources);

    if (rebalancingToken.IsCancellationRequested)
    {
        return RebalancingResult.Cancelled;
    }

    await store.InvokeOnStopActionsAsync(clientId, "Coordinator");

    if (rebalancingToken.IsCancellationRequested)
    {
        return RebalancingResult.Cancelled;
    }

    if (onStartDelay.Ticks > 0)
    {
        logger.Info(clientId, $"Coordinator - Delaying on start for {(int)onStartDelay.TotalMilliseconds}ms");
        await WaitFor(onStartDelay, rebalancingToken);
    }

    if (rebalancingToken.IsCancellationRequested)
    {
        return RebalancingResult.Cancelled;
    }

    // The coordinator starts the resources that were assigned to itself.
    List<string> leaderAssignments = resourceAssignments
        .Where(x => x.ClientId == clientId)
        .Select(x => x.Resource)
        .ToList();

    await store.InvokeOnStartActionsAsync(clientId, "Coordinator", leaderAssignments, rebalancingToken, coordinatorToken);

    if (rebalancingToken.IsCancellationRequested)
    {
        return RebalancingResult.Cancelled;
    }

    return RebalancingResult.Complete;
}
/// <summary>
/// Reacts to a change of the status znode: stops activity on a stop command, or (re)starts
/// with the resources assigned to this client when resources have been granted.
/// </summary>
/// <param name="rebalancingToken">Token checked between the steps of the handling.</param>
/// <returns>
/// <see cref="RebalancingResult.Cancelled"/> when the token is found cancelled at one of the
/// checkpoints; otherwise <see cref="RebalancingResult.Complete"/>, including for statuses
/// that are logged and ignored.
/// </returns>
private async Task<RebalancingResult> ProcessStatusChangeAsync(CancellationToken rebalancingToken)
{
    // Reading the status also re-registers this instance as watcher.
    StatusZnode status = await zooKeeperService.WatchStatusAsync(this);
    if (status.Version != statusVersion)
    {
        logger.Warn(clientId, "Follower - The status has changed between the notification and response");
    }

    if (rebalancingToken.IsCancellationRequested)
    {
        return RebalancingResult.Cancelled;
    }

    if (status.RebalancingStatus == RebalancingStatus.StopActivity)
    {
        logger.Info(clientId, "Follower - Status change received - stop activity");
        await store.InvokeOnStopActionsAsync(clientId, "Follower");

        if (rebalancingToken.IsCancellationRequested)
        {
            return RebalancingResult.Cancelled;
        }

        await zooKeeperService.SetFollowerAsStopped(clientId);
        logger.Info(clientId, "Follower - Created follower stopped node");
    }
    else if (status.RebalancingStatus == RebalancingStatus.ResourcesGranted)
    {
        logger.Info(clientId, "Follower - Status change received - resources granted");
        ResourcesZnode resources = await zooKeeperService.GetResourcesAsync(null, null);

        // Resources whose assignment names this client.
        List<string> assignedResources = resources.ResourceAssignments.Assignments
            .Where(x => x.ClientId.Equals(clientId))
            .Select(x => x.Resource)
            .ToList();

        logger.Info(clientId, $"Follower - {assignedResources.Count} resources granted");

        // A grant received while still started means the stop command was not processed,
        // so stop first before starting with the new assignments.
        if (store.IsInStartedState())
        {
            logger.Warn(clientId,
                "Follower - The resources granted status change has been received while already in the started state. Stopped all activity first");
            await store.InvokeOnStopActionsAsync(clientId, "Follower");
        }

        if (onStartDelay.Ticks > 0)
        {
            logger.Info(clientId, $"Follower - Delaying on start for {(int)onStartDelay.TotalMilliseconds}ms");
            await WaitFor(onStartDelay, rebalancingToken);
        }

        if (rebalancingToken.IsCancellationRequested)
        {
            return RebalancingResult.Cancelled;
        }

        await store.InvokeOnStartActionsAsync(clientId, "Follower", assignedResources, rebalancingToken, followerToken);

        if (rebalancingToken.IsCancellationRequested)
        {
            return RebalancingResult.Cancelled;
        }

        await zooKeeperService.SetFollowerAsStarted(clientId);
        logger.Info(clientId, "Follower - Removed follower stopped node");
    }
    else if (status.RebalancingStatus == RebalancingStatus.StartConfirmed)
    {
        // no longer used
        logger.Info(clientId, "Follower - All followers confirm started");
    }
    else
    {
        logger.Error(clientId, "Follower - Non-supported status received - ignoring");
    }

    return RebalancingResult.Complete;
}
/// <summary>
/// First phase of a coordinator rebalancing: decides whether a rebalancing is needed,
/// commands the followers to stop, stops the coordinator's own activity and polls until
/// every in-scope follower is reported as stopped.
/// </summary>
/// <param name="rebalancingToken">Token checked between the steps and on every poll iteration.</param>
/// <returns>
/// A <see cref="StopPhaseResult"/> carrying <see cref="RebalancingResult.NotRequired"/> when the
/// existing assignments are still valid, <see cref="RebalancingResult.Cancelled"/> when the token
/// is cancelled or a follower disappears while waiting, or <see cref="RebalancingResult.Complete"/>
/// together with the resources, clients and follower ids read at the start of the phase.
/// </returns>
/// <exception cref="ZkStaleVersionException">
/// The resources znode version differs from the version this coordinator last recorded.
/// </exception>
private async Task<StopPhaseResult> StopActivityPhaseAsync(CancellationToken rebalancingToken)
{
    logger.Info(clientId, "Coordinator - Get active clients and resources");
    ClientsZnode clients = await zooKeeperService.GetActiveClientsAsync();

    // Followers are all active clients except this coordinator.
    List<string> followerIds = clients.ClientPaths.Select(GetClientId).Where(x => x != clientId).ToList();
    ResourcesZnode resources = await zooKeeperService.GetResourcesAsync(null, null);

    logger.Info(clientId,
        $"Coordinator - {followerIds.Count} followers in scope and {resources.Resources.Count} resources in scope");
    logger.Info(clientId,
        $"Coordinator - Assign resources ({string.Join(",", resources.Resources)}) to clients ({string.Join(",", clients.ClientPaths.Select(GetClientId))})");

    if (resources.Version != resourcesVersion)
    {
        throw new ZkStaleVersionException(
            "Resources znode version does not match expected value, indicates another client has been made coordinator and is executing a rebalancing.");
    }

    if (rebalancingToken.IsCancellationRequested)
    {
        return new StopPhaseResult(RebalancingResult.Cancelled);
    }

    // if no resources were changed and there are more clients than resources then check
    // to see if rebalancing is necessary. If existing assignments are still valid then
    // a new client or the loss of a client with no assignments need not trigger a rebalancing
    if (!IsRebalancingRequired(clients, resources))
    {
        logger.Info(clientId,
            "Coordinator - No rebalancing required. No resource change. No change to existing assigned clients. More clients than resources.");
        return new StopPhaseResult(RebalancingResult.NotRequired);
    }

    logger.Info(clientId, "Coordinator - Command followers to stop");
    status.RebalancingStatus = RebalancingStatus.StopActivity;
    status.Version = await zooKeeperService.SetStatus(status);

    if (rebalancingToken.IsCancellationRequested)
    {
        return new StopPhaseResult(RebalancingResult.Cancelled);
    }

    await store.InvokeOnStopActionsAsync(clientId, "Coordinator");

    // wait for confirmation that all followers have stopped or for cancellation
    while (!rebalancingToken.IsCancellationRequested)
    {
        List<string> stopped = await zooKeeperService.GetStoppedAsync();

        if (AreClientsStopped(followerIds, stopped))
        {
            logger.Info(clientId, $"Coordinator - All {stopped.Count} in scope followers have stopped");
            break;
        }

        // check that a client hasn't died mid-rebalancing, if so, trigger a new rebalancing and abort this one.
        // else wait and check again
        ClientsZnode latestClients = await zooKeeperService.GetActiveClientsAsync();
        List<string> missingClients = GetMissing(followerIds, latestClients.ClientPaths);
        if (missingClients.Any())
        {
            logger.Info(clientId,
                $"Coordinator - {missingClients.Count} followers have disappeared. Missing: {string.Join(",", missingClients)}. Triggering new rebalancing.");
            events.Add(CoordinatorEvent.RebalancingTriggered);
            return new StopPhaseResult(RebalancingResult.Cancelled);
        }

        List<string> pendingClientIds = GetMissing(followerIds, stopped);
        logger.Info(clientId,
            $"Coordinator - waiting for followers to stop: {string.Join(",", pendingClientIds)}");

        // try again in 2s
        // NOTE(review): this wait is not given rebalancingToken, so a cancellation is presumably
        // only observed after the delay has elapsed - confirm this is intended.
        await WaitFor(TimeSpan.FromSeconds(2));
    }

    if (rebalancingToken.IsCancellationRequested)
    {
        return new StopPhaseResult(RebalancingResult.Cancelled);
    }

    // Hand the snapshot taken at the start of this phase to the next phase.
    return new StopPhaseResult(RebalancingResult.Complete)
    {
        ResourcesZnode = resources,
        ClientsZnode = clients,
        FollowerIds = followerIds
    };
}