/// <summary>
/// Reconciles the outgoing internal replication connections with the destinations
/// computed from the new database record: connections to removed destinations are dropped,
/// connections to added destinations are started, and the cached destination list is replaced.
/// </summary>
/// <param name="newRecord">The updated database record; its <c>Topology</c> may be null.</param>
/// <param name="instancesToDispose">Forwarded by reference to <c>DropOutgoingConnections</c> together with the removed destinations.</param>
private void HandleInternalReplication(DatabaseRecord newRecord, ref List<OutgoingReplicationHandler> instancesToDispose)
{
    var clusterTopology = GetClusterTopology();

    // Null when the record has no topology (hence the null-conditional access).
    var newInternalDestinations = newRecord.Topology?.GetDestinations(_server.NodeTag, Database.Name, clusterTopology, _server.Engine.CurrentState);
    var internalConnections = DatabaseTopology.FindChanges(_internalDestinations, newInternalDestinations);

    if (internalConnections.RemovedDestiantions.Count > 0)
    {
        var removed = internalConnections.RemovedDestiantions.Select(r => new InternalReplication
        {
            NodeTag = clusterTopology.TryGetNodeTagByUrl(r).NodeTag,
            Url = r,
            Database = Database.Name
        });

        DropOutgoingConnections(removed, ref instancesToDispose);
    }

    if (internalConnections.AddedDestinations.Count > 0)
    {
        var added = internalConnections.AddedDestinations.Select(r => new InternalReplication
        {
            NodeTag = clusterTopology.TryGetNodeTagByUrl(r).NodeTag,
            Url = r,
            Database = Database.Name
        });

        StartOutgoingConnections(added.ToList());
    }

    _internalDestinations.Clear();

    // AddRange rejects a null collection; without a topology there is simply nothing to cache.
    if (newInternalDestinations != null)
    {
        _internalDestinations.AddRange(newInternalDestinations);
    }
}
/// <summary>
/// Runs <c>WaitForValueAsync</c> against the <c>ServerStore</c> of every node in the topology
/// and returns <paramref name="expected"/> when all of them produced it.
/// </summary>
/// <param name="topology">Topology whose <c>AllNodes</c> tags select the servers to query.</param>
/// <param name="func">Reads the value of interest from a single server store.</param>
/// <param name="expected">The value every node is expected to report.</param>
/// <param name="timeout">Forwarded to <c>WaitForValueAsync</c> for each node.</param>
/// <returns><paramref name="expected"/> when every node reported it.</returns>
/// <exception cref="Exception">At least one node reported a different value; the message lists which value was seen on which node.</exception>
protected async Task<T> WaitForValueOnGroupAsync<T>(DatabaseTopology topology, Func<ServerStore, T> func, T expected, int timeout = 15000)
{
    // Resolve all stores up front so an unknown node tag fails before any wait is started.
    var groupStores = topology.AllNodes
        .Select(tag => Servers.Single(s => s.ServerStore.NodeTag == tag).ServerStore)
        .ToList();

    var pendingByTag = new Dictionary<string, Task<T>>();
    foreach (var serverStore in groupStores)
    {
        pendingByTag.Add(serverStore.NodeTag, WaitForValueAsync(() => func(serverStore), expected, timeout));
    }

    var results = await Task.WhenAll(pendingByTag.Values);
    var matching = results.Count(value => value?.Equals(expected) ?? false);
    if (matching == groupStores.Count)
    {
        return expected;
    }

    // Group the node tags by the value each one ended up with, for the failure message.
    var otherValues = "";
    foreach (var tagsByValue in pendingByTag.ToLookup(pair => pair.Value.Result, pair => pair.Key))
    {
        otherValues += $"\n the value {tagsByValue.Key} appears on ";
        foreach (string nodeTag in tagsByValue)
        {
            otherValues += nodeTag + ", ";
        }
    }

    throw new Exception($"Not all node in the group have the expected value of {expected}. {otherValues}");
}
/// <summary>
/// When the number of members has reached the replication factor, queues a single
/// delete command covering every promotable and rehab node of the database.
/// </summary>
/// <param name="dbName">Name of the database the command targets.</param>
/// <param name="topology">Topology providing the members, promotables and rehabs.</param>
/// <param name="deletions">Receives the command; the list is created here when it is null.</param>
private void RemoveOtherNodesIfNeeded(string dbName, DatabaseTopology topology, ref List<DeleteDatabaseCommand> deletions)
{
    var nothingToDo = topology.Members.Count < topology.ReplicationFactor
                      || (topology.Promotables.Count == 0 && topology.Rehabs.Count == 0);
    if (nothingToDo)
    {
        return;
    }

    if (_logger.IsOperationsEnabled)
    {
        _logger.Operations("We reached the replication factor, so we remove all other rehab/promotable nodes.");
    }

    var redundantNodes = topology.Promotables.Concat(topology.Rehabs).ToArray();
    var command = new DeleteDatabaseCommand
    {
        ErrorOnDatabaseDoesNotExists = false,
        DatabaseName = dbName,
        FromNodes = redundantNodes,
        HardDelete = _hardDeleteOnReplacement,
        UpdateReplicationFactor = false
    };

    if (deletions == null)
    {
        deletions = new List<DeleteDatabaseCommand>();
    }

    deletions.Add(command);
}
/// <summary>
/// Builds the status entry for <paramref name="nodeTag"/> and stores it in
/// <c>nodesTopology.Status</c>. The promotion status and demotion reason come from the
/// database topology; a node that is otherwise Ok but reported as disconnected in
/// <paramref name="nodeStatuses"/> is marked NotResponding with the server's error details.
/// </summary>
private static void SetNodeStatus(
    DatabaseTopology topology,
    string nodeTag,
    NodesTopology nodesTopology,
    Dictionary<string, NodeStatus> nodeStatuses)
{
    // Ok is the default when the topology holds no explicit promotion status for the node.
    if (topology.PromotablesStatus.TryGetValue(nodeTag, out var lastStatus) == false)
    {
        lastStatus = DatabasePromotionStatus.Ok;
    }

    var entry = new DatabaseGroupNodeStatus
    {
        LastStatus = lastStatus
    };

    if (topology.DemotionReasons.TryGetValue(nodeTag, out var demotionReason))
    {
        entry.LastError = demotionReason;
    }

    if (lastStatus == DatabasePromotionStatus.Ok)
    {
        if (nodeStatuses.TryGetValue(nodeTag, out var serverStatus) && serverStatus.Connected == false)
        {
            entry.LastError = serverStatus.ErrorDetails;
            entry.LastStatus = DatabasePromotionStatus.NotResponding;
        }
    }

    nodesTopology.Status[nodeTag] = entry;
}
/// <summary>
/// Looks for a cluster node outside the database group that can take over for
/// <paramref name="badNode"/>. Among nodes with a status report the one with the fewest
/// entries in its report wins; a node without a report is used only when nothing was picked yet.
/// </summary>
/// <param name="bestNode">The selected node tag, or null when none was found.</param>
/// <returns>True when a replacement node was selected.</returns>
private bool TryFindFitNode(string badNode, string db, DatabaseTopology topology, ClusterTopology clusterTopology, Dictionary<string, ClusterNodeStatusReport> current, out string bestNode)
{
    bestNode = null;
    var fewestDatabases = int.MaxValue;
    var nodesInGroup = topology.AllNodes.ToList();

    if (topology.Members.Count == 0)
    {
        // no one can be used as mentor
        return false;
    }

    foreach (var candidate in clusterTopology.AllNodes.Keys)
    {
        if (nodesInGroup.Contains(candidate)
            || FailedDatabaseInstanceOrNode(clusterTopology, candidate, db, current) == DatabaseHealth.Bad)
        {
            continue;
        }

        if (current.TryGetValue(candidate, out var candidateReport))
        {
            if (candidateReport.Report.Count < fewestDatabases)
            {
                fewestDatabases = candidateReport.Report.Count;
                bestNode = candidate;
            }
        }
        else if (bestNode == null)
        {
            // No report for this node: keep it only as a fallback.
            bestNode = candidate;
        }
    }

    var found = bestNode != null;
    if (_logger.IsOperationsEnabled)
    {
        if (found)
        {
            _logger.Operations($"The database '{db}' on {badNode} has not responded for a long time, so we reassign it to {bestNode}.");
        }
        else
        {
            _logger.Operations($"The database '{db}' on {badNode} has not responded for a long time, but there is no free node to reassign it.");
        }
    }

    return found;
}
/// <summary>
/// Moves <paramref name="member"/> to the rehab list unless the current cluster stats hold an
/// Ok node report with a non-faulted status for the database. Records the demotion reason and
/// marks the node as NotResponding in the topology.
/// </summary>
/// <returns>False when the member is healthy and nothing was changed; true otherwise.</returns>
private bool TryMoveToRehab(string dbName, DatabaseTopology topology, Dictionary<string, ClusterNodeStatusReport> current, string member)
{
    DatabaseStatusReport dbStats = null;
    var healthy = current.TryGetValue(member, out var nodeStats)
                  && nodeStats.Status == ClusterNodeStatusReport.ReportStatus.Ok
                  && nodeStats.Report.TryGetValue(dbName, out dbStats)
                  && dbStats.Status != Faulted;
    if (healthy)
    {
        return false;
    }

    string reason;
    if (nodeStats == null)
    {
        reason = "In rehabilitation because it had no status report in the latest cluster stats";
    }
    else if (nodeStats.Status != ClusterNodeStatusReport.ReportStatus.Ok)
    {
        reason = $"In rehabilitation because the last report status was \"{nodeStats.Status}\"";
    }
    else if (nodeStats.Report.TryGetValue(dbName, out var faultedCandidate) && faultedCandidate.Status == Faulted)
    {
        reason = "In rehabilitation because the DatabaseStatus for this node is Faulted";
    }
    else
    {
        reason = "In rehabilitation because the node is reachable but had no report about the database";
    }

    // Append whatever error details the node and database reports carry.
    if (nodeStats?.Error != null)
    {
        reason = $"{reason}. {nodeStats.Error}";
    }

    if (dbStats?.Error != null)
    {
        reason = $"{reason}. {dbStats.Error}";
    }

    var alreadyInRehab = topology.Rehabs.Contains(member);
    if (!alreadyInRehab)
    {
        topology.Members.Remove(member);
        topology.Rehabs.Add(member);
    }

    topology.DemotionReasons[member] = reason;
    topology.PromotablesStatus[member] = DatabasePromotionStatus.NotResponding;

    if (_logger.IsOperationsEnabled)
    {
        _logger.Operations(reason);
    }

    return true;
}
/// <summary>
/// Opens document stores against the servers whose node tag is listed in
/// <c>topology.Members</c> and delegates to <c>WaitForDocumentInClusterAsyncInternal</c>.
/// Only members are considered, not promotables or rehabs.
/// </summary>
/// <param name="topology">Topology whose members select the servers to query.</param>
/// <param name="db">Database name used for every server node.</param>
/// <param name="docId">Id of the document to wait for.</param>
/// <param name="predicate">Forwarded to the internal wait.</param>
/// <param name="timeout">Forwarded to the internal wait.</param>
/// <param name="certificate">Optional certificate passed to <c>GetDocumentStores</c>.</param>
protected async Task<bool> WaitForDocumentInClusterAsync<T>(DatabaseTopology topology, string db, string docId, Func<T, bool> predicate, TimeSpan timeout, X509Certificate2 certificate = null)
{
    var memberTags = topology.Members;
    var memberNodes = Servers
        .Where(server => memberTags.Contains(server.ServerStore.NodeTag))
        .Select(server => new ServerNode
        {
            Url = server.WebUrl,
            Database = db
        });

    var stores = GetDocumentStores(memberNodes, disableTopologyUpdates: true, certificate: certificate);
    return await WaitForDocumentInClusterAsyncInternal(docId, predicate, timeout, stores);
}
/// <summary>
/// Lazily yields one <c>OngoingTaskBackup</c> per periodic backup configured in the database
/// record. Yields nothing when there is no topology or no periodic backup configuration.
/// </summary>
/// <remarks>
/// The database instance is obtained by blocking on the task returned from
/// <c>TryGetOrCreateResourceStore</c> (via <c>.Result</c>).
/// </remarks>
private static IEnumerable<OngoingTask> CollectBackupTasks(
    DatabaseRecord databaseRecord,
    DatabaseTopology dbTopology,
    ClusterTopology clusterTopology,
    ServerStore store)
{
    if (dbTopology == null
        || databaseRecord.PeriodicBackups == null
        || databaseRecord.PeriodicBackups.Count == 0)
    {
        yield break;
    }

    var database = store.DatabasesLandlord.TryGetOrCreateResourceStore(databaseRecord.DatabaseName).Result;

    foreach (var backup in databaseRecord.PeriodicBackups)
    {
        var responsibleTag = dbTopology.WhoseTaskIsIt(backup, store.IsPassive());
        var destinations = GetBackupDestinations(backup);
        var status = database.PeriodicBackupRunner.GetBackupStatus(backup.TaskId);
        var next = database.PeriodicBackupRunner.GetNextBackupDetails(databaseRecord, backup, status);

        yield return new OngoingTaskBackup
        {
            TaskId = backup.TaskId,
            BackupType = backup.BackupType,
            TaskName = backup.Name,
            TaskState = backup.Disabled ? OngoingTaskState.Disabled : OngoingTaskState.Enabled,
            LastFullBackup = status.LastFullBackup,
            LastIncrementalBackup = status.LastIncrementalBackup,
            NextBackup = next,
            ResponsibleNode = new NodeId
            {
                NodeTag = responsibleTag,
                NodeUrl = clusterTopology.GetUrlFromTag(responsibleTag)
            },
            BackupDestinations = destinations
        };
    }
}
/// <summary>
/// Creates a database on nodes A, B and C of a five node cluster with dynamic node distribution,
/// disposes two servers and expects the document to become available on A, D and E with
/// three members in the database group.
/// </summary>
public async Task RedistributeDatabaseOnMultiFailure()
{
    DebuggerAttachedTimeout.DisableLongTimespan = true;

    const int clusterSize = 5;
    const int dbGroupSize = 3;
    var databaseName = GetDatabaseName();
    var leader = await CreateRaftClusterAndGetLeader(clusterSize, false, 0);

    using (var store = new DocumentStore
    {
        Urls = new[] { leader.WebUrl },
        Database = databaseName
    }.Initialize())
    {
        var record = new DatabaseRecord(databaseName)
        {
            Topology = new DatabaseTopology
            {
                DynamicNodesDistribution = true,
                Members = { "A", "B", "C" }
            }
        };

        var created = await store.Maintenance.Server.SendAsync(new CreateDatabaseOperation(record, dbGroupSize));
        Assert.Equal(dbGroupSize, created.Topology.Members.Count);
        await WaitForRaftIndexToBeAppliedInCluster(created.RaftCommandIndex, TimeSpan.FromSeconds(10));

        using (var session = store.OpenAsyncSession())
        {
            await session.StoreAsync(new User { Name = "Karmel" }, "users/1");
            await session.SaveChangesAsync();
        }

        var replicatedToOriginalGroup = await WaitForDocumentInClusterAsync<User>(
            record.Topology, databaseName, "users/1", u => u.Name == "Karmel", TimeSpan.FromSeconds(5));
        Assert.True(replicatedToOriginalGroup);

        DisposeServerAndWaitForFinishOfDisposal(Servers[1]);
        DisposeServerAndWaitForFinishOfDisposal(Servers[2]);

        // the db should move to D & E
        var expectedTopology = new DatabaseTopology
        {
            Members = { "A", "D", "E" }
        };

        var replicatedToNewGroup = await WaitForDocumentInClusterAsync<User>(
            expectedTopology, databaseName, "users/1", u => u.Name == "Karmel", TimeSpan.FromSeconds(60));
        Assert.True(replicatedToNewGroup);

        var membersCount = await WaitForValueAsync(async () => await GetMembersCount(store, databaseName), 3, 30_000);
        Assert.Equal(3, membersCount);
    }
}
/// <summary>
/// Asks the database topology which node is responsible for mentoring the given promotable.
/// </summary>
/// <param name="mentorNode">The tag returned by <c>WhoseTaskIsIt</c>; null when none was resolved.</param>
/// <returns>True when a mentor node was resolved.</returns>
private bool TryGetMentorNode(string dbName, DatabaseTopology topology, ClusterTopology clusterTopology, string promotable, out string mentorNode)
{
    var promotableUrl = clusterTopology.GetUrlFromTag(promotable);
    var mentoringTask = new PromotableTask(promotable, promotableUrl, dbName);

    mentorNode = topology.WhoseTaskIsIt(mentoringTask, _server.IsPassive());

    // A null mentor means we are in passive mode and were kicked out of the cluster.
    return mentorNode != null;
}
/// <summary>
/// Determines whether the periodic backup task is disabled, owned by this node,
/// or owned by another node of the database group.
/// </summary>
/// <param name="topology">Topology used to resolve the responsible node.</param>
/// <param name="configuration">The periodic backup configuration being evaluated.</param>
/// <param name="skipErrorLog">When true, no notification is raised for a configuration without any backup destination.</param>
private TaskStatus GetTaskStatus(
    DatabaseTopology topology,
    PeriodicBackupConfiguration configuration,
    bool skipErrorLog = false)
{
    if (configuration.Disabled)
    {
        return TaskStatus.Disabled;
    }

    if (!configuration.HasBackup())
    {
        if (!skipErrorLog)
        {
            var message = $"All backup destinations are disabled for backup task id: {configuration.TaskId}";
            var alert = AlertRaised.Create(
                _database.Name,
                "Periodic Backup",
                message,
                AlertType.PeriodicBackup,
                NotificationSeverity.Info);
            _database.NotificationCenter.Add(alert);
        }

        return TaskStatus.Disabled;
    }

    var backupStatus = GetBackupStatus(configuration.TaskId);
    var responsibleNode = _database.WhoseTaskIsIt(topology, configuration, backupStatus, keepTaskOnOriginalMemberNode: true);

    if (responsibleNode == null)
    {
        return TaskStatus.Disabled;
    }

    if (responsibleNode != _serverStore.NodeTag)
    {
        if (_logger.IsInfoEnabled)
        {
            _logger.Info($"Backup job is skipped at {SystemTime.UtcNow}, because it is managed " +
                         $"by '{responsibleNode}' node and not the current node ({_serverStore.NodeTag})");
        }

        return TaskStatus.ActiveByOtherNode;
    }

    return TaskStatus.ActiveByCurrentNode;
}
/// <summary>
/// Builds the status entry for <paramref name="nodeTag"/> from the topology's promotion
/// status (Ok when none is recorded) and demotion reason, and stores it in
/// <c>nodesTopology.Status</c>.
/// </summary>
private static void SetNodeStatus(DatabaseTopology topology, string nodeTag, NodesTopology nodesTopology)
{
    if (topology.PromotablesStatus.TryGetValue(nodeTag, out var lastStatus) == false)
    {
        lastStatus = DatabasePromotionStatus.Ok;
    }

    var entry = new DbGroupNodeStatus
    {
        LastStatus = lastStatus
    };

    if (topology.DemotionReasons.TryGetValue(nodeTag, out var demotionReason))
    {
        entry.LastError = demotionReason;
    }

    nodesTopology.Status[nodeTag] = entry;
}
/// <summary>
/// Asks the database topology which node is responsible for mentoring the given promotable,
/// passing along the predefined mentor for that promotable when the topology has one.
/// </summary>
/// <param name="mentorNode">The tag returned by <c>WhoseTaskIsIt</c>; null when none was resolved.</param>
/// <returns>True when a mentor node was resolved.</returns>
private bool TryGetMentorNode(string dbName, DatabaseTopology topology, ClusterTopology clusterTopology, string promotable, out string mentorNode)
{
    var promotableUrl = clusterTopology.GetUrlFromTag(promotable);

    // The lookup result is ignored on purpose: a missing entry simply leaves the mentor at its default.
    topology.PredefinedMentors.TryGetValue(promotable, out var predefinedMentor);

    var mentoringTask = new PromotableTask(promotable, promotableUrl, dbName, predefinedMentor);
    mentorNode = topology.WhoseTaskIsIt(_server.Engine.CurrentState, mentoringTask, null);

    // A null mentor means we are in passive mode and were kicked out of the cluster.
    return mentorNode != null;
}
/// <summary>
/// Generating a static topology of the requested size.
/// </summary>
/// <param name="options">Contains replication factor.</param>
/// <param name="mainServer">The main server for which we generate the database, must be contained in the topology.</param>
/// <returns>
/// A topology whose members are the main server's node tag followed by
/// <c>ReplicationFactor - 1</c> randomly chosen, distinct tags of the other servers.
/// </returns>
/// <exception cref="ArgumentOutOfRangeException">
/// The replication factor asks for more servers than are available besides the main one.
/// </exception>
private DatabaseTopology GenerateStaticTopology(Options options, RavenServer mainServer)
{
    var serverTags = Servers.Where(s => s != mainServer).Select(s => s.ServerStore.NodeTag).ToList();

    // Validate before picking: otherwise the loop below would index into an empty list
    // and fail with an index error that does not mention the replication factor.
    var additionalMembers = options.ReplicationFactor - 1;
    if (additionalMembers > serverTags.Count)
    {
        throw new ArgumentOutOfRangeException(
            nameof(options),
            options.ReplicationFactor,
            $"Replication factor requires {additionalMembers} servers besides the main one, but only {serverTags.Count} are available.");
    }

    DatabaseTopology topology = new DatabaseTopology();
    var mainTag = mainServer.ServerStore.NodeTag;
    topology.Members.Add(mainTag);

    var rand = new Random();
    for (var i = 0; i < additionalMembers; i++)
    {
        // Remove each picked tag so the same server is never added twice.
        var position = rand.Next(0, serverTags.Count);
        topology.Members.Add(serverTags[position]);
        serverTags.RemoveAt(position);
    }

    return topology;
}
/// <summary>
/// Creates a delegate that, when invoked, returns <paramref name="nodeTag"/> only if
/// <paramref name="hasHighlyAvailableTasks"/> is false and the tag is currently listed in
/// <c>topology.Members</c>; otherwise it returns null.
/// </summary>
/// <remarks>The membership check runs each time the delegate is invoked, not when it is created.</remarks>
public static Func<string> GetLastResponsibleNode(
    bool hasHighlyAvailableTasks,
    DatabaseTopology topology,
    string nodeTag)
{
    return () =>
    {
        var keepOnLastNode = hasHighlyAvailableTasks == false && topology.Members.Contains(nodeTag);
        return keepOnLastNode ? nodeTag : null;
    };
}
/// <summary>
/// Produces the ordered list of member nodes responsible for the pull replication by
/// repeatedly asking the topology for the responsible node and removing it from the members.
/// </summary>
/// <remarks>
/// This mutates <paramref name="topology"/>: every node added to the result is removed
/// from <c>topology.Members</c>.
/// </remarks>
/// <param name="topology">The database topology; its members list is consumed.</param>
/// <param name="databaseGroupId">Database group id assigned to the dummy task.</param>
/// <param name="pullReplication">Definition supplying the mentor node for the dummy task.</param>
/// <returns>The responsible nodes in the order they were selected.</returns>
private List<string> GetResponsibleNodes(DatabaseTopology topology, string databaseGroupId, PullReplicationDefinition pullReplication)
{
    var list = new List<string>();

    // we distribute connections to have load balancing when many sinks are connected.
    // this is the hub cluster, so we make the decision which node will do the pull replication only once and only here,
    // for that we create a dummy IDatabaseTask.
    var mentorNodeTask = new PullNodeTask
    {
        Mentor = pullReplication.MentorNode,
        DatabaseGroupId = databaseGroupId
    };

    while (topology.Members.Count > 0)
    {
        var next = topology.WhoseTaskIsIt(ServerStore.CurrentRachisState, mentorNodeTask, null);

        // WhoseTaskIsIt may return null, or a tag that is not a member. In both cases nothing
        // would be removed, so stop instead of looping forever while the list grows.
        if (next == null || topology.Members.Remove(next) == false)
        {
            break;
        }

        list.Add(next);
    }

    return list;
}
/// <summary>
/// Builds the ongoing-task description for an external replication watcher: the responsible
/// node (when one is resolved), and the destination url and connection status, which are
/// read from the replication loader only when this node is the responsible one.
/// </summary>
private OngoingTaskReplication GetExternalReplicationInfo(DatabaseTopology dbTopology, ClusterTopology clusterTopology, ExternalReplication watcher)
{
    var tag = dbTopology.WhoseTaskIsIt(watcher, ServerStore.Engine.CurrentState);

    NodeId responsibleNode = null;
    if (tag != null)
    {
        responsibleNode = new NodeId
        {
            NodeTag = tag,
            NodeUrl = clusterTopology.GetUrlFromTag(tag)
        };
    }

    (string Url, OngoingTaskConnectionStatus Status) destination;
    if (tag == ServerStore.NodeTag)
    {
        destination = Database.ReplicationLoader.GetExternalReplicationDestination(watcher.TaskId);
    }
    else
    {
        // The task runs elsewhere, so no url is known on this node.
        destination = (null, OngoingTaskConnectionStatus.NotOnThisNode);
    }

    return new OngoingTaskReplication
    {
        TaskId = watcher.TaskId,
        TaskName = watcher.Name,
        ResponsibleNode = responsibleNode,
        DestinationDatabase = watcher.Database,
        TaskState = watcher.Disabled ? OngoingTaskState.Disabled : OngoingTaskState.Enabled,
        DestinationUrl = destination.Url,
        TaskConnectionStatus = destination.Status,
    };
}
/// <summary>
/// Resolves the node responsible for the given task through the database topology, supplying
/// the license manager's last responsible node as a lazily evaluated callback.
/// </summary>
/// <param name="databaseTopology">Topology that performs the resolution.</param>
/// <param name="configuration">The task being resolved.</param>
/// <param name="taskStatus">Status of the task; its <c>NodeTag</c> is the fallback result.</param>
/// <param name="useLastResponsibleNodeIfNoAvailableNodes">
/// When true and no node was resolved, <c>taskStatus.NodeTag</c> is returned instead of null.
/// </param>
public string WhoseTaskIsIt(
    DatabaseTopology databaseTopology,
    IDatabaseTask configuration,
    IDatabaseTaskStatus taskStatus,
    bool useLastResponsibleNodeIfNoAvailableNodes = false)
{
    var responsibleNode = databaseTopology.WhoseTaskIsIt(
        ServerStore.Engine.CurrentState,
        configuration,
        getLastReponsibleNode: () => ServerStore.LicenseManager.GetLastResponsibleNodeForTask(
            taskStatus,
            databaseTopology,
            configuration,
            NotificationCenter));

    if (responsibleNode != null || useLastResponsibleNodeIfNoAvailableNodes == false)
    {
        return responsibleNode;
    }

    return taskStatus.NodeTag;
}
/// <summary>
/// Decides whether <paramref name="promotable"/> can become a full member of the database group,
/// based on the previous and current cluster reports of the mentor and the promotable.
/// </summary>
/// <param name="dbName">Name of the database being evaluated.</param>
/// <param name="topology">Topology whose promotion status and demotion reasons are updated.</param>
/// <param name="current">Latest status reports, keyed by node tag.</param>
/// <param name="previous">Previous status reports, keyed by node tag.</param>
/// <param name="mentorNode">Tag of the node mentoring the promotable.</param>
/// <param name="promotable">Tag of the node considered for promotion.</param>
/// <returns>
/// <c>Promote</c> is true when the node should be promoted. <c>UpdateTopologyReason</c> is
/// non-null only when the topology was modified and therefore needs to be saved.
/// </returns>
private (bool Promote, string UpdateTopologyReason) TryPromote(string dbName, DatabaseTopology topology, Dictionary<string, ClusterNodeStatusReport> current, Dictionary<string, ClusterNodeStatusReport> previous, string mentorNode, string promotable)
{
    // Reports for the database from both nodes, in both samples, are required to decide anything.
    if (previous.TryGetValue(mentorNode, out var mentorPrevClusterStats) == false ||
        mentorPrevClusterStats.Report.TryGetValue(dbName, out var mentorPrevDbStats) == false)
    {
        return (false, null);
    }

    if (previous.TryGetValue(promotable, out var promotablePrevClusterStats) == false ||
        promotablePrevClusterStats.Report.TryGetValue(dbName, out var promotablePrevDbStats) == false)
    {
        return (false, null);
    }

    if (current.TryGetValue(mentorNode, out var mentorCurrClusterStats) == false ||
        mentorCurrClusterStats.Report.TryGetValue(dbName, out var mentorCurrDbStats) == false)
    {
        return (false, null);
    }

    if (current.TryGetValue(promotable, out var promotableClusterStats) == false ||
        promotableClusterStats.Report.TryGetValue(dbName, out var promotableDbStats) == false)
    {
        return (false, null);
    }

    if (topology.Members.Count == topology.ReplicationFactor)
    {
        return (false, null);
    }

    var mentorsEtag = mentorPrevDbStats.LastEtag;
    if (mentorCurrDbStats.LastSentEtag.TryGetValue(promotable, out var lastSentEtag) == false)
    {
        return (false, null);
    }

    var timeDiff = mentorCurrClusterStats.LastSuccessfulUpdateDateTime - mentorPrevClusterStats.LastSuccessfulUpdateDateTime;

    if (lastSentEtag < mentorsEtag || timeDiff > 3 * SupervisorSamplePeriod)
    {
        var msg = $"The database '{dbName}' on {promotable} not ready to be promoted, because the mentor hasn't sent all of the documents yet."
                  + Environment.NewLine + $"Last sent Etag: {lastSentEtag:#,#;;0}"
                  + Environment.NewLine + $"Mentor's Etag: {mentorsEtag:#,#;;0}";

        if (_logger.IsInfoEnabled)
        {
            _logger.Info(msg);
        }

        // TryGetValue instead of the indexer: a promotable without a recorded reason must not
        // throw KeyNotFoundException; it just counts as "reason changed".
        if (topology.DemotionReasons.TryGetValue(promotable, out var recordedReason) == false ||
            msg.Equals(recordedReason) == false)
        {
            topology.DemotionReasons[promotable] = msg;
            topology.PromotablesStatus[promotable] = DatabasePromotionStatus.ChangeVectorNotMerged;
            return (false, msg);
        }

        return (false, null);
    }

    var indexesCatchedUp = CheckIndexProgress(promotablePrevDbStats.LastEtag, promotablePrevDbStats.LastIndexStats, promotableDbStats.LastIndexStats, mentorCurrDbStats.LastIndexStats);
    if (indexesCatchedUp)
    {
        if (_logger.IsOperationsEnabled)
        {
            _logger.Operations($"We try to promoted the database '{dbName}' on {promotable} to be a full member");
        }

        topology.PromotablesStatus.Remove(promotable);
        topology.DemotionReasons.Remove(promotable);

        return (true, $"Node {promotable} is up-to-date so promoting it to be member");
    }

    if (_logger.IsInfoEnabled)
    {
        _logger.Info($"The database '{dbName}' on {promotable} is not ready to be promoted, because the indexes are not up-to-date." + Environment.NewLine);
    }

    // Report a topology change only when the status actually changes.
    if (topology.PromotablesStatus.TryGetValue(promotable, out var currentStatus) == false ||
        currentStatus != DatabasePromotionStatus.IndexNotUpToDate)
    {
        topology.PromotablesStatus[promotable] = DatabasePromotionStatus.IndexNotUpToDate;
        return (false, $"Node {promotable} not ready to be a member, because the indexes are not up-to-date");
    }

    return (false, null);
}
/// <summary>
/// Lazily yields the ongoing-task description of every external replication watcher.
/// Yields nothing when there is no database topology.
/// </summary>
private IEnumerable<OngoingTask> CollectExternalReplicationTasks(List<ExternalReplication> watchers, DatabaseTopology dbTopology, ClusterTopology clusterTopology)
{
    if (dbTopology != null)
    {
        foreach (var watcher in watchers)
        {
            yield return GetExternalReplicationInfo(dbTopology, clusterTopology, watcher);
        }
    }
}
/// <summary>
/// For every node in <paramref name="stateGroup"/>, records an explanation of why the node
/// cannot run the subscription, with the topology's demotion reason appended when one exists.
/// </summary>
/// <param name="subscription">Subscription whose mentor node gets a dedicated message.</param>
/// <param name="topology">Source of the demotion reasons.</param>
/// <param name="databaseTopologyAvailabilityExplenation">Receives one message per node tag.</param>
/// <param name="stateGroup">Node tags sharing the given state.</param>
/// <param name="stateName">State name embedded in the message.</param>
static void FillNodesAvailabilityReportForState(SubscriptionGeneralDataAndStats subscription, DatabaseTopology topology, Dictionary<string, string> databaseTopologyAvailabilityExplenation, List<string> stateGroup, string stateName)
{
    foreach (var node in stateGroup)
    {
        var explanation = subscription.MentorNode == node
            ? $"Although this node is a mentor, it's state is {stateName} and can't run the subscription"
            : $"Node's state is {stateName}, can't run subscription";

        if (topology.DemotionReasons.TryGetValue(node, out var demotionReason))
        {
            explanation += ". Reason:" + demotionReason;
        }

        databaseTopologyAvailabilityExplenation[node] = explanation;
    }
}
// Test flow, as visible in the body below:
//  1. Creates a three node cluster (not in memory) and a database whose topology lists B and C as
//     members with dynamic node distribution; the 2 passed to CreateDatabaseInCluster is
//     presumably the replication factor (TODO confirm).
//  2. Stores users/1, asserts the database record has 2 nodes and no promotables, and waits for
//     users/1 on the topology members.
//  3. Disposes servers A and B, stores users/2 (so only C is still running), then disposes C.
//  4. Starts new servers in slots 0 and 1 with the old urls of A and B and the last segment of
//     their data directories, waits 10 seconds and expects 2 members and 1 rehab.
//  5. Stores users/3 and waits for it on A and B.
//  6. Starts a new server in slot 2 with C's url and data directory, expects 2 members and
//     0 rehabs, and waits for users/1, users/3 and users/2 using the database topology.
//  7. Asserts the final database record has 2 nodes, 2 members and 0 rehabs.
public async Task DontRemoveNodeWhileItHasNotReplicatedDocs() { var databaseName = "DontRemoveNodeWhileItHasNotReplicatedDocs" + Guid.NewGuid(); var leader = await CreateRaftClusterAndGetLeader(3, shouldRunInMemory : false); using (var leaderStore = new DocumentStore { Urls = new[] { leader.WebUrl }, Database = databaseName, }) { leaderStore.Initialize(); var topology = new DatabaseTopology { Members = new List <string> { "B", "C" }, DynamicNodesDistribution = true }; var(index, dbGroupNodes) = await CreateDatabaseInCluster(new DatabaseRecord { DatabaseName = databaseName, Topology = topology }, 2, leader.WebUrl); await WaitForRaftIndexToBeAppliedInCluster(index, TimeSpan.FromSeconds(30)); using (var session = leaderStore.OpenSession()) { session.Store(new User(), "users/1"); session.SaveChanges(); } var dbToplogy = (await leaderStore.Maintenance.Server.SendAsync(new GetDatabaseRecordOperation(databaseName))).Topology; Assert.Equal(2, dbToplogy.AllNodes.Count()); Assert.Equal(0, dbToplogy.Promotables.Count); Assert.True(await WaitForDocumentInClusterAsync <User>(topology, databaseName, "users/1", null, TimeSpan.FromSeconds(30))); var serverA = Servers.Single(s => s.ServerStore.NodeTag == "A"); var urlsA = new[] { serverA.WebUrl }; var dataDirA = serverA.Configuration.Core.DataDirectory.FullPath.Split('/').Last(); DisposeServerAndWaitForFinishOfDisposal(serverA); var serverB = Servers.Single(s => s.ServerStore.NodeTag == "B"); var urlsB = new[] { serverB.WebUrl }; var dataDirB = serverB.Configuration.Core.DataDirectory.FullPath.Split('/').Last(); DisposeServerAndWaitForFinishOfDisposal(serverB); // write doc only to C using (var session = leaderStore.OpenSession()) { session.Store(new User(), "users/2"); session.SaveChanges(); } var serverC = Servers.Single(s => s.ServerStore.NodeTag == "C"); var urlsC = new[] { serverC.WebUrl }; var dataDirC = serverC.Configuration.Core.DataDirectory.FullPath.Split('/').Last(); DisposeServerAndWaitForFinishOfDisposal(serverC); 
Servers[0] = GetNewServer(new Dictionary <string, string> { { RavenConfiguration.GetKey(x => x.Core.ServerUrls), urlsA[0] } }, runInMemory: false, deletePrevious: false, partialPath: dataDirA); Servers[1] = GetNewServer(new Dictionary <string, string> { { RavenConfiguration.GetKey(x => x.Core.ServerUrls), urlsB[0] } }, runInMemory: false, deletePrevious: false, partialPath: dataDirB); await Task.Delay(TimeSpan.FromSeconds(10)); Assert.Equal(2, await WaitForValueAsync(async() => await GetMembersCount(leaderStore, databaseName), 2)); Assert.Equal(1, await WaitForValueAsync(async() => await GetRehabCount(leaderStore, databaseName), 1)); using (var session = leaderStore.OpenSession()) { session.Store(new User(), "users/3"); session.SaveChanges(); } Assert.True(await WaitForDocumentInClusterAsync <User>(new DatabaseTopology { Members = new List <string> { "A", "B" } }, databaseName, "users/3", null, TimeSpan.FromSeconds(10))); Servers[2] = GetNewServer(new Dictionary <string, string> { { RavenConfiguration.GetKey(x => x.Core.ServerUrls), urlsC[0] } }, runInMemory: false, deletePrevious: false, partialPath: dataDirC); Assert.Equal(2, await WaitForValueAsync(async() => await GetMembersCount(leaderStore, databaseName), 2)); Assert.Equal(0, await WaitForValueAsync(async() => await GetRehabCount(leaderStore, databaseName), 0, 30_000)); Assert.True(await WaitForDocumentInClusterAsync <User>(dbToplogy, databaseName, "users/3", null, TimeSpan.FromSeconds(10))); dbToplogy = (await leaderStore.Maintenance.Server.SendAsync(new GetDatabaseRecordOperation(databaseName))).Topology; Assert.Equal(2, dbToplogy.AllNodes.Count()); Assert.Equal(2, dbToplogy.Members.Count); Assert.Equal(0, dbToplogy.Rehabs.Count); Assert.True(await WaitForDocumentInClusterAsync <User>(dbToplogy, databaseName, "users/1", null, TimeSpan.FromSeconds(10))); Assert.True(await WaitForDocumentInClusterAsync <User>(dbToplogy, databaseName, "users/3", null, TimeSpan.FromSeconds(10))); Assert.True(await 
WaitForDocumentInClusterAsync <User>(dbToplogy, databaseName, "users/2", null, TimeSpan.FromSeconds(30))); dbToplogy = (await leaderStore.Maintenance.Server.SendAsync(new GetDatabaseRecordOperation(databaseName))).Topology; Assert.Equal(2, dbToplogy.AllNodes.Count()); Assert.Equal(2, dbToplogy.Members.Count); Assert.Equal(0, dbToplogy.Rehabs.Count); } }
// Test flow, as visible in the body below:
//  1. Creates a three node cluster with custom cluster settings (AddReplicaTimeout,
//     MoveToRehabGraceTime, StabilizationTime, ElectionTimeout) and a database named "Test"
//     with dynamic node distribution.
//  2. Connects the changes API, subscribes to changes of document users/1 and collects every
//     notification in a BlockingCollection.
//  3. Stores users/1 and expects one notification.
//  4. Disposes the server behind the request executor's current url, re-ensures the changes
//     connection, waits for topology stabilization, stores users/1 again and expects two
//     notifications.
//  5. Repeats the dispose/store cycle once more and expects three notifications.
public async Task ChangesApiFailOver() { var db = "Test"; var topology = new DatabaseTopology { DynamicNodesDistribution = true }; var leader = await CreateRaftClusterAndGetLeader(3, customSettings : new Dictionary <string, string>() { [RavenConfiguration.GetKey(x => x.Cluster.AddReplicaTimeout)] = "1", [RavenConfiguration.GetKey(x => x.Cluster.MoveToRehabGraceTime)] = "0", [RavenConfiguration.GetKey(x => x.Cluster.StabilizationTime)] = "1", [RavenConfiguration.GetKey(x => x.Cluster.ElectionTimeout)] = "50" }); await CreateDatabaseInCluster(new DatabaseRecord { DatabaseName = db, Topology = topology }, 2, leader.WebUrl); using (var store = new DocumentStore { Database = db, Urls = new[] { leader.WebUrl } }.Initialize()) { var list = new BlockingCollection <DocumentChange>(); var taskObservable = store.Changes(); await taskObservable.EnsureConnectedNow(); var observableWithTask = taskObservable.ForDocument("users/1"); observableWithTask.Subscribe(list.Add); await observableWithTask.EnsureSubscribedNow(); using (var session = store.OpenSession()) { session.Store(new User(), "users/1"); session.SaveChanges(); } WaitForDocument(store, "users/1"); var value = WaitForValue(() => list.Count, 1); Assert.Equal(1, value); var currentUrl = store.GetRequestExecutor().Url; RavenServer toDispose = null; RavenServer workingServer = null; DisposeCurrentServer(currentUrl, ref toDispose, ref workingServer); await taskObservable.EnsureConnectedNow(); WaitForTopologyStabilization(db, workingServer, 1, 2); using (var session = store.OpenAsyncSession()) { await session.StoreAsync(new User(), "users/1"); await session.SaveChangesAsync(); } value = WaitForValue(() => list.Count, 2); Assert.Equal(2, value); currentUrl = store.GetRequestExecutor().Url; DisposeCurrentServer(currentUrl, ref toDispose, ref workingServer); await taskObservable.EnsureConnectedNow(); WaitForTopologyStabilization(db, workingServer, 2, 1); using (var session = store.OpenSession()) { session.Store(new User(), 
"users/1"); session.SaveChanges(); } value = WaitForValue(() => list.Count, 3); Assert.Equal(3, value); } }
/// <summary>
/// Lazily yields one ongoing-task description per Raven ETL and per SQL ETL configured in the
/// database record. Yields nothing when there is no database topology.
/// </summary>
/// <exception cref="InvalidOperationException">
/// An ETL refers to a connection string name that is missing from the database record
/// (raised while enumerating).
/// </exception>
private static IEnumerable<OngoingTask> CollectEtlTasks(DatabaseRecord databaseRecord, DatabaseTopology dbTopology, ClusterTopology clusterTopology, ServerStore store)
{
    if (dbTopology == null)
    {
        yield break;
    }

    if (databaseRecord.RavenEtls != null)
    {
        foreach (var ravenEtl in databaseRecord.RavenEtls)
        {
            var responsibleTag = dbTopology.WhoseTaskIsIt(ravenEtl, store.IsPassive());

            // Disabled when the ETL or all its transforms are disabled; partially enabled when only some are.
            var state = ravenEtl.Disabled || ravenEtl.Transforms.All(x => x.Disabled)
                ? OngoingTaskState.Disabled
                : ravenEtl.Transforms.Any(x => x.Disabled)
                    ? OngoingTaskState.PartiallyEnabled
                    : OngoingTaskState.Enabled;

            if (!databaseRecord.RavenConnectionStrings.TryGetValue(ravenEtl.ConnectionStringName, out var ravenConnection))
            {
                throw new InvalidOperationException(
                    $"Could not find connection string named '{ravenEtl.ConnectionStringName}' in the database record for '{ravenEtl.Name}' ETL");
            }

            yield return new OngoingTaskRavenEtl
            {
                TaskId = ravenEtl.TaskId,
                TaskName = ravenEtl.Name,
                // TODO arek TaskConnectionStatus =
                TaskState = state,
                ResponsibleNode = new NodeId
                {
                    NodeTag = responsibleTag,
                    NodeUrl = clusterTopology.GetUrlFromTag(responsibleTag)
                },
                DestinationUrl = ravenConnection.Url,
                DestinationDatabase = ravenConnection.Database
            };
        }
    }

    if (databaseRecord.SqlEtls != null)
    {
        foreach (var sqlEtl in databaseRecord.SqlEtls)
        {
            var responsibleTag = dbTopology.WhoseTaskIsIt(sqlEtl, store.IsPassive());

            var state = sqlEtl.Disabled || sqlEtl.Transforms.All(x => x.Disabled)
                ? OngoingTaskState.Disabled
                : sqlEtl.Transforms.Any(x => x.Disabled)
                    ? OngoingTaskState.PartiallyEnabled
                    : OngoingTaskState.Enabled;

            if (!databaseRecord.SqlConnectionStrings.TryGetValue(sqlEtl.ConnectionStringName, out var sqlConnection))
            {
                throw new InvalidOperationException(
                    $"Could not find connection string named '{sqlEtl.ConnectionStringName}' in the database record for '{sqlEtl.Name}' ETL");
            }

            var (destinationDatabase, destinationServer) =
                SqlConnectionStringParser.GetDatabaseAndServerFromConnectionString(sqlEtl.FactoryName, sqlConnection.ConnectionString);

            yield return new OngoingTaskSqlEtl
            {
                TaskId = sqlEtl.TaskId,
                TaskName = sqlEtl.Name,
                // TODO arek TaskConnectionStatus =
                TaskState = state,
                ResponsibleNode = new NodeId
                {
                    NodeTag = responsibleTag,
                    NodeUrl = clusterTopology.GetUrlFromTag(responsibleTag)
                },
                DestinationServer = destinationServer,
                DestinationDatabase = destinationDatabase
            };
        }
    }
}
/// <summary>
/// Decides whether <paramref name="promotable"/> can become a full member of the database group,
/// based on the previous and current cluster reports of the mentor and the promotable.
/// </summary>
/// <param name="dbName">Name of the database being evaluated.</param>
/// <param name="topology">Topology whose promotion status and demotion reasons are updated.</param>
/// <param name="current">Latest status reports, keyed by node tag.</param>
/// <param name="previous">Previous status reports, keyed by node tag.</param>
/// <param name="mentorNode">Tag of the node mentoring the promotable.</param>
/// <param name="promotable">Tag of the node considered for promotion.</param>
/// <returns>
/// <c>Promote</c> is true when the node should be promoted. <c>UpdateTopologyReason</c> is
/// non-null only when the topology was modified and therefore needs to be saved.
/// </returns>
private (bool Promote, string UpdateTopologyReason) TryPromote(string dbName, DatabaseTopology topology, Dictionary<string, ClusterNodeStatusReport> current, Dictionary<string, ClusterNodeStatusReport> previous, string mentorNode, string promotable)
{
    // Reports for the database from both nodes, in both samples, are required to decide anything.
    if (previous.TryGetValue(mentorNode, out var mentorPrevClusterStats) == false ||
        mentorPrevClusterStats.Report.TryGetValue(dbName, out var mentorPrevDbStats) == false)
    {
        return (false, null);
    }

    if (previous.TryGetValue(promotable, out var promotablePrevClusterStats) == false ||
        promotablePrevClusterStats.Report.TryGetValue(dbName, out var promotablePrevDbStats) == false)
    {
        return (false, null);
    }

    if (current.TryGetValue(mentorNode, out var mentorCurrClusterStats) == false ||
        mentorCurrClusterStats.Report.TryGetValue(dbName, out var mentorCurrDbStats) == false)
    {
        return (false, null);
    }

    if (current.TryGetValue(promotable, out var promotableClusterStats) == false ||
        promotableClusterStats.Report.TryGetValue(dbName, out var promotableDbStats) == false)
    {
        return (false, null);
    }

    var mentorsEtag = mentorPrevDbStats.LastEtag;

    // When the mentor has no "last sent" entry for the promotable, the etag check is skipped
    // and the decision falls through to the index progress check.
    if (mentorCurrDbStats.LastSentEtag.TryGetValue(promotable, out var lastSentEtag))
    {
        if (lastSentEtag < mentorsEtag)
        {
            var msg = $"The database {dbName} on {promotable} not ready to be promoted, because the mentor hasn't sent all his documents.\n" +
                      $"Last sent Etag: {lastSentEtag}, Mentor's Etag: {mentorsEtag}";

            if (_logger.IsInfoEnabled)
            {
                _logger.Info(msg);
            }

            // TryGetValue instead of the indexer: a promotable without a recorded reason must not
            // throw KeyNotFoundException; it just counts as "reason changed".
            if (topology.DemotionReasons.TryGetValue(promotable, out var recordedReason) == false ||
                msg.Equals(recordedReason) == false)
            {
                topology.DemotionReasons[promotable] = msg;
                topology.PromotablesStatus[promotable] = DatabasePromotionStatus.ChangeVectorNotMerged;
                return (false, msg);
            }

            return (false, null);
        }
    }

    var indexesCatchedUp = CheckIndexProgress(promotablePrevDbStats.LastEtag, promotablePrevDbStats.LastIndexStats, promotableDbStats.LastIndexStats);
    if (indexesCatchedUp)
    {
        if (_logger.IsOperationsEnabled)
        {
            _logger.Operations($"We try to promoted the database {dbName} on {promotable} to be a full member");
        }

        topology.PromotablesStatus.Remove(promotable);
        topology.DemotionReasons.Remove(promotable);

        return (true, $"Node {promotable} is up-to-date so promoting it to be member");
    }

    if (_logger.IsInfoEnabled)
    {
        _logger.Info($"The database {dbName} on {promotable} not ready to be promoted, because the indexes are not up-to-date.\n");
    }

    // Report a topology change only when the status actually changes.
    if (topology.PromotablesStatus.TryGetValue(promotable, out var currentStatus) == false ||
        currentStatus != DatabasePromotionStatus.IndexNotUpToDate)
    {
        topology.PromotablesStatus[promotable] = DatabasePromotionStatus.IndexNotUpToDate;
        return (false, $"Node {promotable} not ready to be a member, because the indexes are not up-to-date");
    }

    return (false, null);
}
/// <summary>
/// Lazily yields one list-view entry per Raven ETL task, followed by one per SQL ETL task,
/// as defined in <paramref name="databaseRecord"/>. Yields nothing when
/// <paramref name="dbTopology"/> is null.
/// </summary>
/// <param name="databaseRecord">Record holding the ETL definitions and their connection strings.</param>
/// <param name="dbTopology">Topology used to determine which node is responsible for each task.</param>
/// <param name="clusterTopology">Used to resolve the responsible node's URL from its tag.</param>
/// <exception cref="InvalidOperationException">
/// Thrown during enumeration when an ETL refers to a connection string that is not in the record.
/// </exception>
private IEnumerable<OngoingTask> CollectEtlTasks(DatabaseRecord databaseRecord, DatabaseTopology dbTopology, ClusterTopology clusterTopology)
{
    if (dbTopology == null)
    {
        yield break;
    }

    if (databaseRecord.RavenEtls != null)
    {
        foreach (var ravenEtl in databaseRecord.RavenEtls)
        {
            var responsibleTag = dbTopology.WhoseTaskIsIt(ravenEtl, ServerStore.Engine.CurrentState);
            var state = GetEtlTaskState(ravenEtl);

            if (databaseRecord.RavenConnectionStrings.TryGetValue(ravenEtl.ConnectionStringName, out var connection) == false)
            {
                throw new InvalidOperationException(
                    $"Could not find connection string named '{ravenEtl.ConnectionStringName}' in the database record for '{ravenEtl.Name}' ETL");
            }

            string destinationUrl = null;
            var connectionStatus = OngoingTaskConnectionStatus.None;
            string error = null;

            if (responsibleTag != ServerStore.NodeTag)
            {
                connectionStatus = OngoingTaskConnectionStatus.NotOnThisNode;
            }
            else
            {
                // This node owns the task: look for the loaded Raven ETL process with the same name.
                foreach (var process in Database.EtlLoader.Processes)
                {
                    if (process is RavenEtl runningEtl && runningEtl.Name == ravenEtl.Name)
                    {
                        destinationUrl = runningEtl.Url;
                        connectionStatus = OngoingTaskConnectionStatus.Active;
                        break;
                    }
                }

                if (connectionStatus == OngoingTaskConnectionStatus.None)
                {
                    error = $"The raven etl process'{ravenEtl.Name}' was not found.";
                }
            }

            yield return new OngoingTaskRavenEtlListView
            {
                TaskId = ravenEtl.TaskId,
                TaskName = ravenEtl.Name,
                // TODO arek TaskConnectionStatus =
                TaskState = state,
                ResponsibleNode = new NodeId
                {
                    NodeTag = responsibleTag,
                    NodeUrl = clusterTopology.GetUrlFromTag(responsibleTag)
                },
                DestinationUrl = destinationUrl,
                TaskConnectionStatus = connectionStatus,
                DestinationDatabase = connection.Database,
                ConnectionStringName = ravenEtl.ConnectionStringName,
                Error = error
            };
        }
    }

    if (databaseRecord.SqlEtls != null)
    {
        foreach (var sqlEtl in databaseRecord.SqlEtls)
        {
            var responsibleTag = dbTopology.WhoseTaskIsIt(sqlEtl, ServerStore.Engine.CurrentState);
            var state = GetEtlTaskState(sqlEtl);

            if (databaseRecord.SqlConnectionStrings.TryGetValue(sqlEtl.ConnectionStringName, out var sqlConnection) == false)
            {
                throw new InvalidOperationException(
                    $"Could not find connection string named '{sqlEtl.ConnectionStringName}' in the database record for '{sqlEtl.Name}' ETL");
            }

            // The parser returns the pair in (database, server) order.
            var (destinationDatabase, destinationServer) =
                SqlConnectionStringParser.GetDatabaseAndServerFromConnectionString(sqlEtl.FactoryName, sqlConnection.ConnectionString);

            yield return new OngoingTaskSqlEtlListView
            {
                TaskId = sqlEtl.TaskId,
                TaskName = sqlEtl.Name,
                // TODO arek TaskConnectionStatus = (not populated for SQL ETL entries here)
                TaskState = state,
                ResponsibleNode = new NodeId
                {
                    NodeTag = responsibleTag,
                    NodeUrl = clusterTopology.GetUrlFromTag(responsibleTag)
                },
                DestinationServer = destinationServer,
                DestinationDatabase = destinationDatabase,
                ConnectionStringName = sqlEtl.ConnectionStringName
            };
        }
    }
}
/// <summary>
/// Counts the nodes of the database group that are considered responding: every member,
/// plus each promotable and rehab node whose health, as reported by
/// <c>FailedDatabaseInstanceOrNode</c>, is not <c>DatabaseHealth.Bad</c>.
/// </summary>
/// <param name="clusterTopology">Cluster topology passed through to the health check.</param>
/// <param name="dbName">Name of the database being evaluated.</param>
/// <param name="topology">Database topology providing the members, promotables and rehabs.</param>
/// <param name="current">Latest status reports, keyed by node tag.</param>
/// <returns>The number of responding nodes.</returns>
private int GetNumberOfRespondingNodes(ClusterTopology clusterTopology, string dbName, DatabaseTopology topology, Dictionary<string, ClusterNodeStatusReport> current)
{
    // Members are counted as-is; only promotables and rehabs are health-checked here.
    var responding = topology.Members.Count;

    foreach (var node in topology.Promotables)
    {
        if (FailedDatabaseInstanceOrNode(clusterTopology, node, dbName, current) == DatabaseHealth.Bad)
        {
            continue;
        }

        responding++;
    }

    foreach (var node in topology.Rehabs)
    {
        if (FailedDatabaseInstanceOrNode(clusterTopology, node, dbName, current) == DatabaseHealth.Bad)
        {
            continue;
        }

        responding++;
    }

    return responding;
}
/// <summary>
/// Moves <paramref name="member"/> into the rehab list of <paramref name="topology"/> unless its
/// latest report is Ok and contains a non-faulted status for <paramref name="dbName"/>.
/// </summary>
/// <param name="dbName">Name of the database being evaluated.</param>
/// <param name="topology">Database topology; members, rehabs, demotion reasons and promotable status are updated in place.</param>
/// <param name="current">Latest status reports, keyed by node tag.</param>
/// <param name="member">Tag of the member node to evaluate.</param>
/// <returns>
/// False when the node is healthy and nothing was changed; true when the demotion reason and
/// status were recorded (and the node was moved to rehabs if it was not already there).
/// </returns>
private bool TryMoveToRehab(string dbName, DatabaseTopology topology, Dictionary<string, ClusterNodeStatusReport> current, string member)
{
    DatabaseStatusReport dbReport = null;
    var healthy =
        current.TryGetValue(member, out var nodeReport) &&
        nodeReport.Status == ClusterNodeStatusReport.ReportStatus.Ok &&
        nodeReport.Report.TryGetValue(dbName, out dbReport) &&
        dbReport.Status != Faulted;

    if (healthy)
    {
        return false;
    }

    // Work out why the node is unhealthy, from the most general cause to the most specific.
    string reason;
    if (nodeReport == null)
    {
        reason = "Node in rehabilitation due to no status report in the latest cluster stats";
    }
    else if (nodeReport.Status == ClusterNodeStatusReport.ReportStatus.Timeout)
    {
        reason = $"Node in rehabilitation due to timeout reached trying to get stats from node.{Environment.NewLine}";
    }
    else if (nodeReport.Status != ClusterNodeStatusReport.ReportStatus.Ok)
    {
        reason = $"Node in rehabilitation due to last report status being '{nodeReport.Status}'.{Environment.NewLine}";
    }
    else if (nodeReport.Report.TryGetValue(dbName, out var faultedCandidate) && faultedCandidate.Status == Faulted)
    {
        reason = $"In rehabilitation because the DatabaseStatus for this node is {nameof(Faulted)}.{Environment.NewLine}";
    }
    else
    {
        reason = $"In rehabilitation because the node is reachable but had no report about the database.{Environment.NewLine}";
    }

    // Append any error details carried by the node-level and database-level reports.
    if (nodeReport?.Error != null)
    {
        reason += $". {nodeReport.Error}";
    }

    if (dbReport?.Error != null)
    {
        reason += $". {dbReport.Error}";
    }

    var alreadyInRehab = topology.Rehabs.Contains(member);
    if (alreadyInRehab == false)
    {
        topology.Members.Remove(member);
        topology.Rehabs.Add(member);
    }

    topology.DemotionReasons[member] = reason;
    topology.PromotablesStatus[member] = DatabasePromotionStatus.NotResponding;

    if (_logger.IsOperationsEnabled)
    {
        _logger.Operations($"Node {member} of database '{dbName}': {reason}");
    }

    return true;
}
/// <summary>
/// Builds a cluster transaction command for <paramref name="databaseName"/> from the parsed batch commands.
/// Each parsed command is converted and validated, then routed by type: PUT/DELETE go to
/// <c>DatabaseCommands</c>, CompareExchangePUT/CompareExchangeDELETE go to <c>ClusterCommands</c>.
/// </summary>
/// <param name="databaseName">Target database name.</param>
/// <param name="identityPartsSeparator">Separator character passed to the command validation.</param>
/// <param name="topology">Supplies the database topology id and cluster transaction id; a new GUID is generated for either one that is null.</param>
/// <param name="commandParsedCommands">The parsed batch commands to include.</param>
/// <param name="options">Cluster transaction options stored on the command.</param>
/// <param name="uniqueRequestId">Request id forwarded to the base constructor.</param>
/// <exception cref="RachisApplyException">Thrown when a command has a type other than the four supported ones.</exception>
public ClusterTransactionCommand(string databaseName, char identityPartsSeparator, DatabaseTopology topology, ArraySegment<BatchRequestParser.CommandData> commandParsedCommands, ClusterTransactionOptions options, string uniqueRequestId)
    : base(uniqueRequestId)
{
    DatabaseName = databaseName;
    DatabaseRecordId = topology.DatabaseTopologyIdBase64 ?? Guid.NewGuid().ToBase64Unpadded();
    ClusterTransactionId = topology.ClusterTransactionIdBase64 ?? Guid.NewGuid().ToBase64Unpadded();
    Options = options;
    CommandCreationTicks = SystemTime.UtcNow.Ticks;

    foreach (var parsed in commandParsedCommands)
    {
        // Conversion and validation run before the type is inspected, for every command.
        var command = ClusterTransactionDataCommand.FromCommandData(parsed);
        ClusterCommandValidation(command, identityPartsSeparator);

        var type = parsed.Type;
        if (type == CommandType.PUT || type == CommandType.DELETE)
        {
            DatabaseCommands.Add(command);
        }
        else if (type == CommandType.CompareExchangePUT || type == CommandType.CompareExchangeDELETE)
        {
            ClusterCommands.Add(command);
        }
        else
        {
            throw new RachisApplyException($"The type '{type}' is not supported in '{nameof(ClusterTransactionCommand)}.'");
        }
    }

    DatabaseCommandsCount = DatabaseCommands.Count;
}
/// <summary>
/// Splits <paramref name="order"/> into new member, promotable and rehab lists, keeping each
/// node in the role it has in <paramref name="topology"/> and in the sequence given by <paramref name="order"/>.
/// </summary>
/// <param name="topology">The existing database topology.</param>
/// <param name="order">The requested node order; must have the same count as the topology and contain all of its nodes.</param>
/// <returns>The reordered members, promotables and rehabs.</returns>
/// <exception cref="ArgumentException">
/// Thrown when <paramref name="order"/> does not correspond to the topology's nodes, or when a node
/// in <paramref name="order"/> is in none of the three role lists.
/// </exception>
public static (List<string> Members, List<string> Promotables, List<string> Rehabs) Reorder(DatabaseTopology topology, List<string> order)
{
    var matchesTopology = topology.Count == order.Count && topology.AllNodes.All(order.Contains);
    if (matchesTopology == false)
    {
        throw new ArgumentException("The reordered list doesn't correspond to the existing nodes of the database group.");
    }

    var members = new List<string>();
    var promotables = new List<string>();
    var rehabs = new List<string>();

    foreach (var tag in order)
    {
        // Pick the output list that matches the node's current role, then append.
        List<string> bucket;
        if (topology.Members.Contains(tag))
        {
            bucket = members;
        }
        else if (topology.Promotables.Contains(tag))
        {
            bucket = promotables;
        }
        else if (topology.Rehabs.Contains(tag))
        {
            bucket = rehabs;
        }
        else
        {
            throw new ArgumentException($"Can't find node {tag} in the topology");
        }

        bucket.Add(tag);
    }

    return (members, promotables, rehabs);
}