/// <summary>
/// Attempts to take on the coordinator role for the supplied epoch.
/// </summary>
/// <remarks>
/// Clears the <c>ignoreWatches</c> flag, then asks the ZooKeeper service to increment and watch the
/// epoch, watch the nodes, and read the resources znode. The version of the resources znode is
/// stored in <c>resourcesVersion</c>. On success a
/// <see cref="CoordinatorEvent.RebalancingTriggered"/> event is queued.
/// Exceptions other than the two handled below propagate to the caller.
/// </remarks>
/// <param name="currentEpoch">Epoch value handed to <c>IncrementAndWatchEpochAsync</c>.</param>
/// <returns>
/// <see cref="BecomeCoordinatorResult.Ok"/> on success,
/// <see cref="BecomeCoordinatorResult.StaleEpoch"/> when a <see cref="ZkStaleVersionException"/> is raised,
/// <see cref="BecomeCoordinatorResult.Error"/> when a <see cref="ZkInvalidOperationException"/> is raised.
/// </returns>
public async Task<BecomeCoordinatorResult> BecomeCoordinatorAsync(int currentEpoch)
{
    BecomeCoordinatorResult outcome;

    try
    {
        ignoreWatches = false;

        await zooKeeperService.IncrementAndWatchEpochAsync(currentEpoch, this);
        await zooKeeperService.WatchNodesAsync(this);

        ResourcesZnode resourcesZnode = await zooKeeperService.GetResourcesAsync(this, null);
        resourcesVersion = resourcesZnode.Version;

        outcome = BecomeCoordinatorResult.Ok;
    }
    catch (ZkStaleVersionException staleVersion)
    {
        logger.Error(clientId, "Could not become coordinator as a stale version number was used", staleVersion);
        outcome = BecomeCoordinatorResult.StaleEpoch;
    }
    catch (ZkInvalidOperationException invalidOperation)
    {
        logger.Error(clientId, "Could not become coordinator as an invalid ZooKeeper operation occurred", invalidOperation);
        outcome = BecomeCoordinatorResult.Error;
    }

    // Only a fully successful hand-over queues the rebalancing event.
    if (outcome == BecomeCoordinatorResult.Ok)
    {
        events.Add(CoordinatorEvent.RebalancingTriggered);
    }

    return outcome;
}
/// <summary>
/// Attempts to take on the follower role by setting watches on the sibling node and on the
/// resources node.
/// </summary>
/// <remarks>
/// Clears the <c>ignoreWatches</c> flag before setting the watches. Every failure is converted
/// into a result value; this method does not let exceptions escape.
/// </remarks>
/// <returns>
/// <see cref="BecomeFollowerResult.Ok"/> when both watches were set,
/// <see cref="BecomeFollowerResult.WatchSiblingGone"/> when a
/// <see cref="ZkNoEphemeralNodeWatchException"/> is raised,
/// <see cref="BecomeFollowerResult.Error"/> for any other exception.
/// </returns>
public async Task<BecomeFollowerResult> BecomeFollowerAsync()
{
    try
    {
        ignoreWatches = false;

        await zooKeeperService.WatchSiblingNodeAsync(watchSiblingPath, this);
        logger.Info(clientId, $"Follower - Set a watch on sibling node {watchSiblingPath}");

        await zooKeeperService.WatchResourcesDataAsync(this);
        logger.Info(clientId, "Follower - Set a watch on resources node");
    }
    catch (ZkNoEphemeralNodeWatchException)
    {
        logger.Info(clientId, "Follower - Could not set a watch on the sibling node as it has gone");
        return BecomeFollowerResult.WatchSiblingGone;
    }
    catch (Exception e)
    {
        // Pass clientId first, like every other logger call here, so the message text
        // is not taken for the client id and the entry stays attributable to this client.
        logger.Error(clientId, "Follower - Could not become a follower due to an error", e);
        return BecomeFollowerResult.Error;
    }

    return BecomeFollowerResult.Ok;
}
/// <summary>
/// Runs one rebalancing round as coordinator: requests that the given clients stop, runs the
/// local stop actions, polls until every live client reports <see cref="ClientStatus.Waiting"/>,
/// then splits <paramref name="resources"/> between the coordinator and the live clients and
/// starts them.
/// </summary>
/// <param name="coordinatorClientId">Id of the coordinator; also used as the logging id.</param>
/// <param name="clientEvent">Supplies the fencing token and the coordinator token that is flagged on a fencing violation.</param>
/// <param name="clients">Clients that are asked to stop; the stop request is skipped when empty.</param>
/// <param name="resources">Resources to distribute, assigned in list order.</param>
/// <param name="onChangeActions">Local stop/start callbacks invoked for the coordinator's own client.</param>
/// <param name="token">Cancels the round; checked while waiting and before each later phase.</param>
/// <returns>A task that completes when the round has finished or was abandoned.</returns>
private async Task TriggerRebalancingAsync(Guid coordinatorClientId, ClientEvent clientEvent, List<Client> clients, List<string> resources, OnChangeActions onChangeActions, CancellationToken token)
{
    string logId = coordinatorClientId.ToString();
    logger.Info(logId, "---------- Rebalancing triggered -----------");

    // request stop of all clients
    logger.Info(logId, "COORDINATOR: Requested stop");
    if (clients.Any())
    {
        ModifyClientResult stopResult = await clientService.StopActivityAsync(clientEvent.FencingToken, clients);
        if (stopResult == ModifyClientResult.FencingTokenViolation)
        {
            clientEvent.CoordinatorToken.FencingTokenViolation = true;
            return;
        }

        if (stopResult == ModifyClientResult.Error)
        {
            logger.Error(logId, "COORDINATOR: Rebalancing error");
            return;
        }
    }

    // stop all resource activity in local coordinator client
    foreach (Action onStopAction in onChangeActions.OnStopActions)
    {
        onStopAction.Invoke();
    }

    // wait for all live clients to confirm stopped
    bool allClientsWaiting = false;
    List<Client> clientsNow = null;
    while (!allClientsWaiting && !token.IsCancellationRequested)
    {
        await WaitFor(TimeSpan.FromSeconds(5), token);
        clientsNow = await GetLiveClientsAsync(clientEvent, coordinatorClientId);

        // All() is true for an empty list, so "no live clients" also counts as confirmed.
        allClientsWaiting = clientsNow.All(x => x.ClientStatus == ClientStatus.Waiting);
    }

    // The loop exits either because every live client is waiting or because cancellation was
    // requested. Check cancellation first so that "Stop confirmed" is only logged when the stop
    // really was confirmed; past this point allClientsWaiting is necessarily true.
    if (token.IsCancellationRequested)
    {
        return;
    }

    logger.Info(logId, "COORDINATOR: Stop confirmed");

    // assign resources first to coordinator then to other live clients
    Queue<string> resourcesToAssign = new(resources);
    int remainingClients = clientsNow.Count + 1;

    // Builds the start request for one client, handing it an even share (at least one resource
    // while any are left) of what is still unassigned, then counts that client as served.
    ClientStartRequest TakeShare(Guid id)
    {
        int share = Math.Max(1, resourcesToAssign.Count / remainingClients);
        ClientStartRequest request = new() { ClientId = id };
        while (request.AssignedResources.Count < share && resourcesToAssign.Count > 0)
        {
            request.AssignedResources.Add(resourcesToAssign.Dequeue());
        }

        remainingClients--;
        return request;
    }

    ClientStartRequest coordinatorRequest = TakeShare(coordinatorClientId);
    List<ClientStartRequest> clientStartRequests = new() { coordinatorRequest };
    foreach (Client client in clientsNow)
    {
        clientStartRequests.Add(TakeShare(client.ClientId));
    }

    if (token.IsCancellationRequested)
    {
        return;
    }

    logger.Info(logId, "COORDINATOR: Resources assigned");

    ModifyClientResult startResult = await clientService.StartActivityAsync(clientEvent.FencingToken, clientStartRequests);
    if (startResult == ModifyClientResult.FencingTokenViolation)
    {
        clientEvent.CoordinatorToken.FencingTokenViolation = true;
        return;
    }

    if (startResult == ModifyClientResult.Error)
    {
        logger.Error(logId, "COORDINATOR: Rebalancing error");
        return;
    }

    // start the coordinator's own client with its share
    store.SetResources(new SetResourcesRequest
    {
        AssignmentStatus = AssignmentStatus.ResourcesAssigned,
        Resources = coordinatorRequest.AssignedResources
    });

    foreach (Action<IList<string>> onStartAction in onChangeActions.OnStartActions)
    {
        onStartAction.Invoke(coordinatorRequest.AssignedResources);
    }

    logger.Debug(logId, "COORDINATOR: Local client started");

    // remember the membership and resource list this round was computed from
    List<Guid> clientIds = clientsNow.Select(x => x.ClientId).ToList();
    clientIds.Add(coordinatorClientId);
    this.clients = clientIds;
    this.resources = resources;

    logger.Info(logId, "---------- Activity Started -----------");
}
/// <summary>
/// Tries to acquire the coordinator lease of a resource group inside a single serializable
/// SQL transaction.
/// </summary>
/// <remarks>
/// The lease is granted when the stored <c>CoordinatorId</c> is empty or when the time since
/// <c>LastCoordinatorRenewal</c> (measured with the database's <c>GETUTCDATE()</c>) exceeds
/// <c>LeaseExpirySeconds</c>. Granting increments the fencing token and records this client and
/// machine as coordinator. Exceptions raised while executing the commands are logged, the
/// transaction is rolled back, and an error response is returned instead of throwing.
/// </remarks>
/// <param name="acquireLeaseRequest">Supplies the requesting client id and the resource group name.</param>
/// <returns>
/// A <see cref="LeaseResponse"/> whose <c>Result</c> is <see cref="LeaseResult.Granted"/>,
/// <see cref="LeaseResult.Denied"/>, <see cref="LeaseResult.NoLease"/> (no row for the resource
/// group), or <see cref="LeaseResult.TransientError"/> / <see cref="LeaseResult.Error"/> on failure.
/// </returns>
public async Task<LeaseResponse> TryAcquireLeaseAsync(AcquireLeaseRequest acquireLeaseRequest)
{
    // The transaction and command are disposable too; release them on every exit path,
    // including the early NoLease return that neither commits nor rolls back.
    using (SqlConnection conn = await ConnectionHelper.GetOpenConnectionAsync(connectionString))
    using (SqlTransaction transaction = conn.BeginTransaction(IsolationLevel.Serializable))
    using (SqlCommand command = conn.CreateCommand())
    {
        command.Transaction = transaction;

        try
        {
            // obtain lock on the record blocking other nodes until the transaction is committed
            command.CommandText = "UPDATE [RBR].[ResourceGroups] SET LockedByClient = @ClientId WHERE ResourceGroup = @ResourceGroup";
            command.Parameters.AddWithValue("@ClientId", acquireLeaseRequest.ClientId);
            command.Parameters.Add("@ResourceGroup", SqlDbType.VarChar, 100).Value = acquireLeaseRequest.ResourceGroup;
            await command.ExecuteNonQueryAsync();

            // get the resource group (TODO, use OUTPUT on UPDATE query instead of another query)
            command.Parameters.Clear();
            command.CommandText = @"SELECT [ResourceGroup] ,[CoordinatorId] ,[LastCoordinatorRenewal] ,[CoordinatorServer] ,[LockedByClient] ,[FencingToken] ,[LeaseExpirySeconds] ,[HeartbeatSeconds] ,GETUTCDATE() AS [TimeNow] FROM [RBR].[ResourceGroups] WHERE ResourceGroup = @ResourceGroup";
            command.Parameters.Add("@ResourceGroup", SqlDbType.VarChar, 100).Value = acquireLeaseRequest.ResourceGroup;

            ResourceGroup rg = null;
            using (SqlDataReader reader = await command.ExecuteReaderAsync())
            {
                if (await reader.ReadAsync())
                {
                    rg = new ResourceGroup
                    {
                        Name = acquireLeaseRequest.ResourceGroup,
                        CoordinatorId = GetGuidFromNullableGuid(reader, "CoordinatorId"),
                        CoordinatorServer = GetStringFromNullableGuid(reader, "CoordinatorServer"),
                        LastCoordinatorRenewal = GetDateTimeFromNullable(reader, "LastCoordinatorRenewal"),
                        TimeNow = (DateTime)reader["TimeNow"],
                        LockedByClientId = GetGuidFromNullableGuid(reader, "LockedByClient"),
                        FencingToken = (int)reader["FencingToken"],
                        LeaseExpirySeconds = (int)reader["LeaseExpirySeconds"],
                        HeartbeatSeconds = (int)reader["HeartbeatSeconds"]
                    };
                }
            }

            if (rg == null)
            {
                // No row for this resource group: answer with fixed fallback periods.
                return new LeaseResponse
                {
                    Result = LeaseResult.NoLease,
                    Lease = new Lease
                    {
                        ExpiryPeriod = TimeSpan.FromMinutes(1),
                        HeartbeatPeriod = TimeSpan.FromSeconds(25)
                    }
                };
            }

            // determine the response, if the CoordinatorId is empty or expired then grant, else deny
            LeaseResponse response = new() { Lease = new Lease() };
            if (rg.CoordinatorId == Guid.Empty
                || (rg.TimeNow - rg.LastCoordinatorRenewal).TotalSeconds > rg.LeaseExpirySeconds)
            {
                response.Lease.ResourceGroup = acquireLeaseRequest.ResourceGroup;
                response.Lease.ClientId = acquireLeaseRequest.ClientId;
                response.Lease.ExpiryPeriod = TimeSpan.FromSeconds(rg.LeaseExpirySeconds);
                response.Lease.HeartbeatPeriod = TimeSpan.FromSeconds(rg.HeartbeatSeconds);
                response.Lease.FencingToken = ++rg.FencingToken;
                response.Result = LeaseResult.Granted;

                // record the new coordinator together with the incremented fencing token
                command.Parameters.Clear();
                command.CommandText = @"UPDATE [RBR].[ResourceGroups] SET [CoordinatorId] = @ClientId ,[LastCoordinatorRenewal] = GETUTCDATE() ,[CoordinatorServer] = @Server ,[FencingToken] = @FencingToken WHERE ResourceGroup = @ResourceGroup";
                command.Parameters.AddWithValue("@ClientId", acquireLeaseRequest.ClientId);
                command.Parameters.AddWithValue("@FencingToken", response.Lease.FencingToken);
                command.Parameters.Add("@Server", SqlDbType.NVarChar, 500).Value = Environment.MachineName;
                command.Parameters.Add("@ResourceGroup", SqlDbType.VarChar, 100).Value = acquireLeaseRequest.ResourceGroup;
                await command.ExecuteNonQueryAsync();
            }
            else
            {
                response.Lease.ExpiryPeriod = TimeSpan.FromSeconds(rg.LeaseExpirySeconds);
                response.Lease.HeartbeatPeriod = TimeSpan.FromSeconds(rg.HeartbeatSeconds);
                response.Result = LeaseResult.Denied;
            }

            transaction.Commit();
            return response;
        }
        catch (Exception ex)
        {
            try
            {
                logger.Error("Rolling back lease acquisition: ", ex);
                transaction.Rollback();
            }
            catch (Exception rex)
            {
                // A failed rollback is only logged; the original failure is what gets reported.
                logger.Error("Rollback of lease acquisition failed: ", rex);
            }

            return new LeaseResponse
            {
                Result = TransientErrorDetector.IsTransient(ex) ? LeaseResult.TransientError : LeaseResult.Error,
                Message = "Lease acquisition failure",
                Exception = ex
            };
        }
    }
}