/// <summary>
/// Background loop that, while this node is the in-cluster leader, scans every shard known to
/// the state machine and submits the metadata updates and <c>ResyncShard</c> tasks needed to
/// bring allocations back in sync.
/// </summary>
/// <remarks>
/// Each pass does the following per shard:
/// 1. Registers new allocation candidates as stale allocations and queues a resync for each.
/// 2. Queues a resync for stale allocations whose node is still known, and removes stale
///    allocations whose node is no longer known to the state machine.
/// 3. Demotes in-sync allocations that are no longer contactable to stale, keeping the current
///    primary when it is still contactable and otherwise picking the first contactable in-sync
///    allocation; logs an error when no in-sync allocation is contactable.
/// All commands gathered during the pass are sent in a single <c>ExecuteCommands</c> request.
/// The loop has no exit condition; exceptions raised during a pass are logged and the loop
/// continues after a pause.
/// </remarks>
/// <returns>A task representing the loop; the loop itself never breaks or returns.</returns>
public async Task AllocateShards()
{
    // Pause after a pass executed as leader. Also used after a failed pass so that a
    // persistent error cannot turn this loop into a busy spin.
    const int LeaderPollDelayMs = 3000;
    // Pause when this node is not the in-cluster leader and has nothing to do.
    const int IdlePollDelayMs = 5000;

    while (true)
    {
        var delayMs = LeaderPollDelayMs;

        try
        {
            if (_nodeStateService.Role == NodeState.Leader && _nodeStateService.InCluster)
            {
                _logger.LogDebug("Allocating shards...");
                var updates = new List<BaseCommand>();
                var newTasks = new List<BaseTask>();
                var allShards = _stateMachine.GetShards();

                foreach (var shard in allShards)
                {
                    // Queues a resync of this shard onto the given node unless a task with the
                    // same unique running id is already reported as running.
                    Action<Guid> queueResync = nodeId =>
                    {
                        var taskId = ResyncShard.GetTaskUniqueId(shard.Id, nodeId);
                        BaseTask runningTask = _stateMachine.GetRunningTask(taskId);
                        if (runningTask == null)
                        {
                            newTasks.Add(new ResyncShard()
                            {
                                Id = Guid.NewGuid(),
                                ShardId = shard.Id,
                                NodeId = nodeId,
                                Type = shard.Type,
                                UniqueRunningId = taskId,
                                CreatedOn = DateTime.UtcNow
                            });
                        }
                    };

                    // Scan for new allocations first. Materialized once so the candidate set
                    // used for the metadata update and for the resync tasks is the same.
                    var newAllocations = Allocator.GetAllocationCandidates(shard.Id, shard.Type).ToList();
                    if (newAllocations.Count > 0)
                    {
                        _logger.LogInformation("Found new allocations for shard " + shard.Id);
                        updates.Add(new UpdateShardMetadataAllocations()
                        {
                            StaleAllocationsToAdd = newAllocations.Select(na => na.NodeId).ToHashSet<Guid>(),
                            ShardId = shard.Id,
                            Type = shard.Type
                        });

                        foreach (var candidate in newAllocations)
                        {
                            queueResync(candidate.NodeId);
                        }
                    }

                    // Scan for stale allocations.
                    var staleAllocationsToRemove = new List<Guid>();
                    foreach (var staleAllocation in shard.StaleAllocations)
                    {
                        if (_stateMachine.GetNode(staleAllocation) != null)
                        {
                            // The node is still known, so try to resync it.
                            _logger.LogInformation("Found stale allocation " + staleAllocation + " for shard " + shard.Id);
                            queueResync(staleAllocation);
                        }
                        else
                        {
                            // The node is no longer known to the state machine; drop the allocation.
                            staleAllocationsToRemove.Add(staleAllocation);
                        }
                    }

                    if (staleAllocationsToRemove.Count > 0)
                    {
                        updates.Add(new UpdateShardMetadataAllocations()
                        {
                            ShardId = shard.Id,
                            Type = shard.Type,
                            StaleAllocationsToRemove = staleAllocationsToRemove.ToHashSet()
                        });
                    }

                    // Scan for primary or in-sync allocations becoming unavailable.
                    // Both lists are materialized so contactability is probed exactly once per
                    // node and every decision below is based on the same snapshot.
                    var stillInsync = shard.InsyncAllocations
                        .Where(insync => _stateMachine.IsNodeContactable(insync))
                        .ToList();
                    var staleAllocations = shard.InsyncAllocations
                        .Where(ia => !stillInsync.Contains(ia))
                        .ToList();

                    if (staleAllocations.Count > 0)
                    {
                        if (stillInsync.Count > 0)
                        {
                            updates.Add(new UpdateShardMetadataAllocations()
                            {
                                ShardId = shard.Id,
                                Type = shard.Type,
                                // Keep the current primary if it is still contactable,
                                // otherwise promote the first contactable in-sync allocation.
                                PrimaryAllocation = stillInsync.Contains(shard.PrimaryAllocation)
                                    ? shard.PrimaryAllocation
                                    : stillInsync.First(),
                                StaleAllocationsToAdd = staleAllocations.ToHashSet(),
                                InsyncAllocationsToRemove = staleAllocations.ToHashSet()
                            });

                            foreach (var staleAllocation in staleAllocations)
                            {
                                _logger.LogInformation("Found stale allocation " + staleAllocation + " for shard " + shard.Id);
                                queueResync(staleAllocation);
                            }
                        }
                        else
                        {
                            _logger.LogError("Shard " + shard.Id + " has no primaries available, shard is unavailable...");
                        }
                    }

                    // Get the latest position of the shard.
                    // NOTE(review): the response is not used anywhere in this method. The request
                    // is retained because removing it would change when this pass can throw;
                    // confirm whether it is still needed.
                    await _clusterClient.Send(new RequestShardWriteOperations()
                    {
                        From = 0,
                        To = 0,
                        ShardId = shard.Id,
                        Type = shard.Type
                    });
                }

                if (newTasks.Count > 0)
                {
                    updates.Add(new UpdateClusterTasks()
                    {
                        TasksToAdd = newTasks
                    });
                }

                if (updates.Count > 0)
                {
                    await _clusterClient.Send(new ExecuteCommands()
                    {
                        Commands = updates,
                        WaitForCommits = true
                    });
                }
            }
            else
            {
                delayMs = IdlePollDelayMs;
            }
        }
        catch (Exception e)
        {
            _logger.LogError("Failed to allocate shards with error " + e.Message + Environment.NewLine + e.StackTrace);
        }

        // Deliberately outside the try/catch: the pause must also happen after a failed pass.
        await Task.Delay(delayMs);
    }
}