/// <summary>
/// Resolves the nodes that are relevant for the given database.
/// </summary>
/// <param name="databaseName">Database to look up; when <c>null</c>, the keys of
/// <c>clusterTopology.AllNodes</c> are returned instead.</param>
/// <param name="clusterTopology">Cluster topology used when no database name is supplied.</param>
/// <returns>
/// The nodes of the database topology, or an empty list when either the raw database
/// record or its topology is missing.
/// </returns>
private List<string> GetRelevantNodes(string databaseName, ClusterTopology clusterTopology)
{
    if (databaseName == null)
    {
        return clusterTopology.AllNodes.Keys.ToList();
    }

    using (_serverStore.ContextPool.AllocateOperationContext(out TransactionOperationContext ctx))
    using (ctx.OpenReadTransaction())
    using (var record = _serverStore.Cluster.ReadRawDatabaseRecord(ctx, databaseName))
    {
        // A missing record and a missing topology are treated the same way: no nodes.
        var dbTopology = record?.Topology;

        return dbTopology == null
            ? new List<string>()
            : dbTopology.AllNodes.ToList();
    }
}
/// <summary>
/// Rebuilds the internal and external replication destinations from a new database record.
/// </summary>
/// <param name="newRecord">
/// The updated database record. When it is <c>null</c>, or when the server reports itself as
/// passive, all outgoing connections are dropped and every destination collection is cleared.
/// </param>
private void HandleTopologyChange(DatabaseRecord newRecord)
{
    var handlersToDispose = new List<OutgoingReplicationHandler>();

    var nothingToReplicateTo = newRecord == null || _server.IsPassive();
    if (nothingToReplicateTo)
    {
        DropOutgoingConnections(Destinations, handlersToDispose);
        _internalDestinations.Clear();
        _externalDestinations.Clear();
        _destinations.Clear();
        DisposeConnections(handlersToDispose);
        return;
    }

    _clusterTopology = GetClusterTopology();

    HandleInternalReplication(newRecord, handlersToDispose);
    HandleExternalReplication(newRecord, handlersToDispose);

    // Publish the merged destination list in a single assignment.
    var merged = new List<ReplicationNode>();
    merged.AddRange(_internalDestinations);
    merged.AddRange(_externalDestinations);
    _destinations = merged;

    // Siblings are the destinations whose URL also appears among the cluster nodes' URLs.
    var destinationUrls = _destinations.Select(x => x.Url);
    var clusterUrls = _clusterTopology.AllNodes.Select(x => x.Value);
    _numberOfSiblings = destinationUrls.Intersect(clusterUrls).Count();

    DisposeConnections(handlersToDispose);
}
/// <summary>
/// Once the database has at least <c>ReplicationFactor</c> members, schedules the deletion of
/// the remaining promotable and rehab nodes for which a mentor report is available.
/// </summary>
/// <param name="dbName">Name of the database being inspected.</param>
/// <param name="record">Database record whose topology and pending deletions are consulted.</param>
/// <param name="clusterTopology">Cluster topology, forwarded to <c>TryGetMentorNode</c>.</param>
/// <param name="current">Latest status report per node tag.</param>
/// <param name="deletions">Receives the delete command; the list is created when it is <c>null</c>.</param>
private void RemoveOtherNodesIfNeeded(string dbName, DatabaseRecord record, ClusterTopology clusterTopology,
    Dictionary<string, ClusterNodeStatusReport> current, ref List<DeleteDatabaseCommand> deletions)
{
    var topology = record.Topology;

    var reachedReplicationFactor = topology.Members.Count >= topology.ReplicationFactor;
    if (reachedReplicationFactor == false)
    {
        return;
    }

    var hasCandidates = topology.Promotables.Count != 0 || topology.Rehabs.Count != 0;
    if (hasCandidates == false)
    {
        return;
    }

    if (_logger.IsOperationsEnabled)
    {
        _logger.Operations("We reached the replication factor, so we try to remove redundant nodes.");
    }

    var redundantNodes = new List<string>();
    var changeVectorByNode = new Dictionary<string, string>();

    foreach (var candidate in topology.Promotables.Concat(topology.Rehabs))
    {
        // A candidate is removed only if its mentor is known and has reported on this database.
        if (TryGetMentorNode(dbName, topology, clusterTopology, candidate, out var mentorTag) == false)
        {
            continue;
        }

        if (current.TryGetValue(mentorTag, out var mentorStats) == false)
        {
            continue;
        }

        if (mentorStats.Report.TryGetValue(dbName, out var mentorDbReport) == false)
        {
            continue;
        }

        // Skip nodes that already have a deletion in progress.
        if (record.DeletionInProgress?.ContainsKey(candidate) == true)
        {
            continue;
        }

        redundantNodes.Add(candidate);
        changeVectorByNode.Add(candidate, mentorDbReport.DatabaseChangeVector);
    }

    if (redundantNodes.Count == 0)
    {
        return;
    }

    var command = new DeleteDatabaseCommand
    {
        ErrorOnDatabaseDoesNotExists = false,
        DatabaseName = dbName,
        FromNodes = redundantNodes.ToArray(),
        HardDelete = _hardDeleteOnReplacement,
        UpdateReplicationFactor = false,
        MentorChangeVector = changeVectorByNode
    };

    if (deletions == null)
    {
        deletions = new List<DeleteDatabaseCommand>();
    }

    deletions.Add(command);
}
/// <summary>
/// Writes the cluster topology, together with leader, state, term, license details, cluster
/// errors and node statuses, as JSON to the HTTP response body.
/// </summary>
/// <remarks>
/// When the stored topology has no members, a single-node topology with the id "dummy" is
/// reported instead, using this node's tag (or "A" when the tag is <c>null</c>).
/// </remarks>
/// <returns>An already completed task.</returns>
public Task GetClusterTopology()
{
    using (ServerStore.ContextPool.AllocateOperationContext(out TransactionOperationContext ctx))
    using (ctx.OpenReadTransaction())
    {
        var clusterTopology = ServerStore.GetClusterTopology(ctx);
        var reportedTag = ServerStore.NodeTag;

        if (clusterTopology.Members.Count == 0)
        {
            var fallbackTag = ServerStore.NodeTag ?? "A";
            var selfUrl = ServerStore.NodeHttpServerUrl;
            var members = new Dictionary<string, string>
            {
                [fallbackTag] = selfUrl
            };

            clusterTopology = new ClusterTopology(
                "dummy",
                members,
                new Dictionary<string, string>(),
                new Dictionary<string, string>(),
                fallbackTag
            );
            reportedTag = fallbackTag;
        }

        HttpContext.Response.StatusCode = (int)HttpStatusCode.OK;

        using (var writer = new BlittableJsonTextWriter(ctx, ResponseBodyStream()))
        {
            var licenseLimits = ServerStore.LoadLicenseLimits();
            var nodeLicenseDetails = licenseLimits == null
                ? null
                : DynamicJsonValue.Convert(licenseLimits.NodeLicenseDetails);

            var response = new DynamicJsonValue();
            response["Topology"] = clusterTopology.ToSortedJson();
            response["Leader"] = ServerStore.LeaderTag;
            response["CurrentState"] = ServerStore.CurrentRachisState;
            response["NodeTag"] = reportedTag;
            response["CurrentTerm"] = ServerStore.Engine.CurrentTerm;
            response["NodeLicenseDetails"] = nodeLicenseDetails;
            response[nameof(ServerStore.Engine.LastStateChangeReason)] = ServerStore.LastStateChangeReason();

            // The "Errors" property is emitted only when there is something to report.
            var clusterErrors = ServerStore.GetClusterErrors();
            if (clusterErrors.Count > 0)
            {
                response["Errors"] = clusterErrors;
            }

            var nodeStatuses = ServerStore.GetNodesStatuses();
            response["Status"] = DynamicJsonValue.Convert(nodeStatuses);

            ctx.Write(writer, response);
            writer.Flush();
        }
    }

    return Task.CompletedTask;
}
/// <summary>
/// Builds the list of internal replication destinations for the node <paramref name="myTag"/>.
/// </summary>
/// <param name="myTag">Tag of the node asking for its destinations.</param>
/// <param name="databaseName">Database name stamped on every returned destination.</param>
/// <param name="deletionInProgress">Nodes that are being deleted and must be skipped; may be <c>null</c>.</param>
/// <param name="clusterTopology">Used to translate between node tags and URLs.</param>
/// <param name="state">Passed to <c>WhoseTaskIsIt</c> when deciding who mentors a promotable.</param>
/// <returns>
/// An empty list when <paramref name="myTag"/> is a promotable; otherwise the other members and
/// rehabs, plus the promotables whose task is assigned to <paramref name="myTag"/>, restricted
/// to URLs that the cluster topology can map back to a node.
/// </returns>
public List<ReplicationNode> GetDestinations(string myTag, string databaseName,
    Dictionary<string, DeletionInProgressStatus> deletionInProgress, ClusterTopology clusterTopology, RachisState state)
{
    var candidateUrls = new List<string>();
    var destinations = new List<ReplicationNode>();

    // if we are a promotable we can't have any destinations
    if (Promotables.Contains(myTag))
    {
        return destinations;
    }

    bool IsBeingDeleted(string tag) => deletionInProgress != null && deletionInProgress.ContainsKey(tag);

    foreach (var tag in Members.Concat(Rehabs))
    {
        // skip me, and skip nodes that are on their way out
        if (tag == myTag || IsBeingDeleted(tag))
        {
            continue;
        }

        candidateUrls.Add(clusterTopology.GetUrlFromTag(tag));
    }

    foreach (var promotable in Promotables)
    {
        if (IsBeingDeleted(promotable))
        {
            continue;
        }

        var promotableUrl = clusterTopology.GetUrlFromTag(promotable);
        PredefinedMentors.TryGetValue(promotable, out var mentor);

        var promotableTask = new PromotableTask(promotable, promotableUrl, databaseName, mentor);
        if (WhoseTaskIsIt(state, promotableTask, null) == myTag)
        {
            candidateUrls.Add(promotableUrl);
        }
    }

    // remove nodes that are not in the raft cluster topology
    candidateUrls.RemoveAll(candidate => clusterTopology.TryGetNodeTagByUrl(candidate).HasUrl == false);

    destinations.AddRange(candidateUrls.Select(candidate => new InternalReplication
    {
        NodeTag = clusterTopology.TryGetNodeTagByUrl(candidate).NodeTag,
        Url = candidate,
        Database = databaseName
    }));

    return destinations;
}
/// <summary>
/// Creates a <see cref="ClusterTopologyChanged"/> instance populated from the given values.
/// </summary>
/// <param name="clusterTopology">Value for <c>Topology</c>.</param>
/// <param name="leaderTag">Value for <c>Leader</c>.</param>
/// <param name="nodeTag">Value for <c>NodeTag</c>.</param>
/// <param name="term">Value for <c>CurrentTerm</c>.</param>
/// <param name="status">Value for <c>Status</c>.</param>
/// <returns>The populated instance.</returns>
public static ClusterTopologyChanged Create(ClusterTopology clusterTopology, string leaderTag, string nodeTag, long term,
    Dictionary<string, NodeStatus> status)
{
    var changed = new ClusterTopologyChanged();

    changed.Topology = clusterTopology;
    changed.Leader = leaderTag;
    changed.NodeTag = nodeTag;
    changed.Status = status;
    changed.CurrentTerm = term;

    return changed;
}
/// <summary>
/// Creates the internal replication destination for a rehab node, along with the matching
/// promotable task.
/// </summary>
/// <param name="databaseName">Database name set on both the destination and the task.</param>
/// <param name="clusterTopology">Used to resolve the URL of <paramref name="rehab"/>.</param>
/// <param name="rehab">Tag of the rehab node.</param>
/// <param name="mentor">Mentor passed to the <c>PromotableTask</c> constructor.</param>
/// <param name="promotableTask">Receives the task created for the rehab node.</param>
/// <returns>The replication destination pointing at the rehab node.</returns>
private static InternalReplication GetNode(string databaseName, ClusterTopology clusterTopology, string rehab, string mentor,
    out PromotableTask promotableTask)
{
    var rehabUrl = clusterTopology.GetUrlFromTag(rehab);

    var destination = new InternalReplication
    {
        Database = databaseName,
        NodeTag = rehab,
        Url = rehabUrl
    };

    promotableTask = new PromotableTask(rehab, rehabUrl, databaseName, mentor);

    return destination;
}
/// <summary>
/// Determines which node is responsible for the promotable task of <paramref name="promotable"/>.
/// </summary>
/// <param name="dbName">Database the promotable task belongs to.</param>
/// <param name="topology">Database topology that decides whose task it is.</param>
/// <param name="clusterTopology">Used to resolve the promotable's URL.</param>
/// <param name="promotable">Tag of the promotable node.</param>
/// <param name="mentorNode">Receives the result of <c>WhoseTaskIsIt</c>; may be <c>null</c>.</param>
/// <returns><c>true</c> when a mentor was found; otherwise <c>false</c>.</returns>
private bool TryGetMentorNode(string dbName, DatabaseTopology topology, ClusterTopology clusterTopology, string promotable,
    out string mentorNode)
{
    var promotableUrl = clusterTopology.GetUrlFromTag(promotable);
    var promotableTask = new PromotableTask(promotable, promotableUrl, dbName);

    mentorNode = topology.WhoseTaskIsIt(promotableTask, _server.IsPassive());

    // A null mentor means we are in passive mode and were kicked out of the cluster.
    return mentorNode != null;
}
/// <summary>
/// Lazily yields one <c>OngoingTaskBackup</c> per periodic backup configured on the database.
/// </summary>
/// <param name="databaseRecord">Record holding the periodic backup configurations.</param>
/// <param name="dbTopology">Decides the responsible node; nothing is yielded when it is <c>null</c>.</param>
/// <param name="clusterTopology">Used to resolve the responsible node's URL.</param>
/// <param name="store">Provides the database instance and the passive flag.</param>
/// <returns>The backup tasks; empty when there is no topology or no backup configuration.</returns>
/// <remarks>
/// The database instance is obtained by blocking on <c>TryGetOrCreateResourceStore(...).Result</c>
/// on the first enumeration step.
/// </remarks>
private static IEnumerable<OngoingTask> CollectBackupTasks(
    DatabaseRecord databaseRecord,
    DatabaseTopology dbTopology,
    ClusterTopology clusterTopology,
    ServerStore store)
{
    var nothingToCollect = dbTopology == null
                           || databaseRecord.PeriodicBackups == null
                           || databaseRecord.PeriodicBackups.Count == 0;
    if (nothingToCollect)
    {
        yield break;
    }

    var database = store.DatabasesLandlord.TryGetOrCreateResourceStore(databaseRecord.DatabaseName).Result;
    var backupRunner = database.PeriodicBackupRunner;

    foreach (var backup in databaseRecord.PeriodicBackups)
    {
        var responsibleTag = dbTopology.WhoseTaskIsIt(backup, store.IsPassive());
        var destinations = GetBackupDestinations(backup);
        var status = backupRunner.GetBackupStatus(backup.TaskId);
        var nextBackup = backupRunner.GetNextBackupDetails(databaseRecord, backup, status);

        var responsibleNode = new NodeId
        {
            NodeTag = responsibleTag,
            NodeUrl = clusterTopology.GetUrlFromTag(responsibleTag)
        };

        yield return new OngoingTaskBackup
        {
            TaskId = backup.TaskId,
            BackupType = backup.BackupType,
            TaskName = backup.Name,
            TaskState = backup.Disabled ? OngoingTaskState.Disabled : OngoingTaskState.Enabled,
            LastFullBackup = status.LastFullBackup,
            LastIncrementalBackup = status.LastIncrementalBackup,
            NextBackup = nextBackup,
            ResponsibleNode = responsibleNode,
            BackupDestinations = destinations
        };
    }
}
/// <summary>
/// Resolves the URL to report for the node with the given tag.
/// </summary>
/// <param name="tag">Tag of the node.</param>
/// <param name="clusterTopology">Fallback source for the URL.</param>
/// <returns>
/// For this node, the URL returned by <c>GetNodeHttpServerUrl</c> for the client-requested node
/// URL; for any other node, or when that URL is <c>null</c>, the URL from the cluster topology.
/// </returns>
private string GetUrl(string tag, ClusterTopology clusterTopology)
{
    var isThisNode = Server.ServerStore.NodeTag == tag;

    var ownUrl = isThisNode
        ? ServerStore.GetNodeHttpServerUrl(HttpContext.Request.GetClientRequestedNodeUrl())
        : null;

    return ownUrl ?? clusterTopology.GetUrlFromTag(tag);
}
/// <summary>
/// Resolves the URL to report for the node with the given tag.
/// </summary>
/// <param name="tag">Tag of the node.</param>
/// <param name="clusterTopology">Fallback source for the URL.</param>
/// <returns>
/// For this node, <c>ServerStore.NodeHttpServerUrl</c>; for any other node, or when that URL is
/// <c>null</c>, the URL from the cluster topology.
/// </returns>
private string GetUrl(string tag, ClusterTopology clusterTopology)
{
    var isThisNode = Server.ServerStore.NodeTag == tag;

    var ownUrl = isThisNode
        ? ServerStore.NodeHttpServerUrl
        : null;

    return ownUrl ?? clusterTopology.GetUrlFromTag(tag);
}
/// <summary>
/// Determines which node is responsible for the promotable task of <paramref name="promotable"/>,
/// passing along the predefined mentor registered for it in the topology, if any.
/// </summary>
/// <param name="dbName">Database the promotable task belongs to.</param>
/// <param name="topology">Database topology that decides whose task it is.</param>
/// <param name="clusterTopology">Used to resolve the promotable's URL.</param>
/// <param name="promotable">Tag of the promotable node.</param>
/// <param name="mentorNode">Receives the result of <c>WhoseTaskIsIt</c>; may be <c>null</c>.</param>
/// <returns><c>true</c> when a mentor was found; otherwise <c>false</c>.</returns>
private bool TryGetMentorNode(string dbName, DatabaseTopology topology, ClusterTopology clusterTopology, string promotable,
    out string mentorNode)
{
    var promotableUrl = clusterTopology.GetUrlFromTag(promotable);

    // The predefined mentor stays null when none is registered for this promotable.
    topology.PredefinedMentors.TryGetValue(promotable, out var predefinedMentor);

    var promotableTask = new PromotableTask(promotable, promotableUrl, dbName, predefinedMentor);
    mentorNode = topology.WhoseTaskIsIt(_server.Engine.CurrentState, promotableTask, null);

    // A null mentor means we are in passive mode and were kicked out of the cluster.
    return mentorNode != null;
}
/// <summary>
/// Creates an informational <see cref="ClusterTopologyChanged"/> notification titled
/// "Cluster topology was changed", populated from the given values.
/// </summary>
/// <param name="clusterTopology">Value for <c>Topology</c>.</param>
/// <param name="leaderTag">Value for <c>Leader</c>.</param>
/// <param name="nodeTag">Value for <c>NodeTag</c>.</param>
/// <param name="term">Value for <c>CurrentTerm</c>.</param>
/// <param name="status">Value for <c>Status</c>.</param>
/// <param name="nodeLicenseDetails">Value for <c>NodeLicenseDetails</c>.</param>
/// <returns>The populated notification.</returns>
public static ClusterTopologyChanged Create(ClusterTopology clusterTopology, string leaderTag, string nodeTag, long term,
    Dictionary<string, NodeStatus> status, Dictionary<string, DetailsPerNode> nodeLicenseDetails)
{
    var notification = new ClusterTopologyChanged();

    notification.Severity = NotificationSeverity.Info;
    notification.Title = "Cluster topology was changed";
    notification.Topology = clusterTopology;
    notification.Leader = leaderTag;
    notification.NodeTag = nodeTag;
    notification.Status = status;
    notification.CurrentTerm = term;
    notification.NodeLicenseDetails = nodeLicenseDetails;

    return notification;
}
/// <summary>
/// Queries every other node in <paramref name="nodes"/> for its data directory information, in
/// parallel, and appends each successful, non-null answer to <c>dataDirectoryResult.List</c>.
/// </summary>
/// <param name="nodes">Tags of the nodes to query; this node's own tag is skipped (case-insensitive).</param>
/// <param name="clusterTopology">Used to resolve node URLs; nodes without a URL are skipped.</param>
/// <param name="dataDirectoryResult">Aggregate that receives the per-node results.</param>
/// <exception cref="OperationCanceledException">The server is shutting down.</exception>
/// <remarks>
/// A node whose request fails is skipped and does not fail the whole operation. Previously
/// <c>await Task.WhenAll</c> rethrew the first failure, which made the per-task
/// <c>IsCompletedSuccessfully</c> check below unreachable for failed tasks.
/// </remarks>
private async Task UpdateNodesDirectoryResult(IEnumerable<string> nodes, ClusterTopology clusterTopology, DataDirectoryResult dataDirectoryResult)
{
    var tasks = new List<Task<SingleNodeDataDirectoryResult>>();

    foreach (var nodeTag in nodes)
    {
        _serverStore.ServerShutdown.ThrowIfCancellationRequested();

        if (nodeTag.Equals(_serverStore.NodeTag, StringComparison.OrdinalIgnoreCase))
        {
            continue;
        }

        var serverUrl = clusterTopology.GetUrlFromTag(nodeTag);
        if (serverUrl == null)
        {
            continue;
        }

        tasks.Add(Task.Run(async () => await GetSingleNodeDataDirectoryInfo(serverUrl), _serverStore.ServerShutdown));
    }

    try
    {
        await Task.WhenAll(tasks);
    }
    catch (Exception) when (_serverStore.ServerShutdown.IsCancellationRequested == false)
    {
        // One or more nodes failed to answer. Every task has still run to completion at this
        // point, so the loop below can keep the successful results and skip the failed ones.
        // A shutdown-triggered cancellation is not caught here and propagates to the caller.
    }

    foreach (var task in tasks)
    {
        if (task.IsCompletedSuccessfully == false)
        {
            continue;
        }

        // The task has already completed, so this await returns immediately.
        var singleNodeResult = await task;
        if (singleNodeResult == null)
        {
            continue;
        }

        dataDirectoryResult.List.Add(singleNodeResult);
    }
}
/// <summary>
/// Counts the nodes of the database that are considered responding: all members, plus every
/// promotable and rehab whose health is not <c>DatabaseHealth.Bad</c>.
/// </summary>
/// <param name="clusterTopology">Forwarded to <c>FailedDatabaseInstanceOrNode</c>.</param>
/// <param name="dbName">Name of the database being inspected.</param>
/// <param name="topology">Database topology providing members, promotables and rehabs.</param>
/// <param name="current">Latest status report per node tag.</param>
/// <returns>The number of responding nodes.</returns>
private int GetNumberOfRespondingNodes(ClusterTopology clusterTopology, string dbName, DatabaseTopology topology,
    Dictionary<string, ClusterNodeStatusReport> current)
{
    // Members are counted unconditionally.
    var responding = topology.Members.Count;

    foreach (var nonMember in topology.Promotables.Concat(topology.Rehabs))
    {
        var health = FailedDatabaseInstanceOrNode(clusterTopology, nonMember, dbName, current);
        if (health == DatabaseHealth.Bad)
        {
            continue;
        }

        responding++;
    }

    return responding;
}
/// <summary>
/// Classifies the health of database <paramref name="db"/> on node <paramref name="node"/>
/// from the latest node reports.
/// </summary>
/// <param name="clusterTopology">Cluster topology; a node it does not contain is <c>Bad</c>.</param>
/// <param name="node">Tag of the node to evaluate.</param>
/// <param name="db">Name of the database to evaluate.</param>
/// <param name="current">Latest status report per node tag.</param>
/// <returns>
/// <c>Bad</c> when the node left the cluster, or when the node's last successful update or the
/// database's last good time is older than <c>_breakdownTimeout</c>; <c>NotEnoughInfo</c> when
/// there is no report for the node or no usable last good time for the database; otherwise
/// <c>Good</c>.
/// </returns>
private DatabaseHealth FailedDatabaseInstanceOrNode(
    ClusterTopology clusterTopology,
    string node,
    string db,
    Dictionary<string, ClusterNodeStatusReport> current)
{
    // this node is no longer part of the *Cluster* topology and needs to be replaced.
    if (clusterTopology.Contains(node) == false)
    {
        return DatabaseHealth.Bad;
    }

    // Wait until we have more info
    if (current.TryGetValue(node, out var nodeReport) == false)
    {
        return DatabaseHealth.NotEnoughInfo;
    }

    // if the server is down we should reassign
    var sinceLastUpdate = DateTime.UtcNow - nodeReport.LastSuccessfulUpdateDateTime;
    if (sinceLastUpdate > _breakdownTimeout)
    {
        return DatabaseHealth.Bad;
    }

    // The topology says the db should be on this node, but the node's report has no entry for
    // it. That probably indicates a problem; this method only reports it as NotEnoughInfo.
    if (nodeReport.LastGoodDatabaseStatus.TryGetValue(db, out var lastGoodTime) == false)
    {
        return DatabaseHealth.NotEnoughInfo;
    }

    // default(DateTime) and DateTime.MinValue are the same value, so one comparison covers both.
    if (lastGoodTime == default(DateTime))
    {
        return DatabaseHealth.NotEnoughInfo;
    }

    var sinceLastGood = DateTime.UtcNow - lastGoodTime;
    return sinceLastGood > _breakdownTimeout
        ? DatabaseHealth.Bad
        : DatabaseHealth.Good;
}
/// <summary>
/// Validates that a node which already carries a topology id may be added to this cluster.
/// </summary>
/// <param name="clusterTopology">Topology of the cluster the node is being added to.</param>
/// <param name="nodeInfo">Information reported by the node being added.</param>
/// <param name="nodeUrl">URL of the node being added; used in the error message only.</param>
/// <exception cref="TopologyMismatchException">The node's topology id differs from the cluster's.</exception>
/// <exception cref="InvalidOperationException">
/// The node has the initial tag, or a tag contained in the cluster topology, and is not passive.
/// </exception>
private static void AssertCanAddNodeWithTopologyId(ClusterTopology clusterTopology, NodeInfo nodeInfo, string nodeUrl)
{
    var sameTopology = clusterTopology.TopologyId == nodeInfo.TopologyId;
    if (sameTopology == false)
    {
        throw new TopologyMismatchException(
            $"Adding a new node to cluster failed. The new node is already in another cluster. " +
            $"Expected topology id: {clusterTopology.TopologyId}, but we get {nodeInfo.TopologyId}");
    }

    var hasInitialTagOrIsInTopology = nodeInfo.NodeTag == RachisConsensus.InitialTag
                                      || clusterTopology.Contains(nodeInfo.NodeTag);
    if (hasInitialTagOrIsInTopology == false)
    {
        // this is fine, since we are probably adding back a node that we just removed
        return;
    }

    if (nodeInfo.CurrentState == RachisState.Passive)
    {
        return;
    }

    throw new InvalidOperationException($"Can't add a new node on {nodeUrl} to cluster " +
                                        $"because it's already in the cluster under tag :{nodeInfo.NodeTag} " +
                                        $"and URL: {clusterTopology.GetUrlFromTag(nodeInfo.NodeTag)}");
}
/// <summary>
/// Lazily yields one ongoing-task entry per Raven ETL and per SQL ETL configured on the database.
/// </summary>
/// <param name="databaseRecord">Record holding the ETL configurations and connection strings.</param>
/// <param name="dbTopology">Decides the responsible node; nothing is yielded when it is <c>null</c>.</param>
/// <param name="clusterTopology">Used to resolve the responsible node's URL.</param>
/// <param name="store">Provides the passive flag passed to <c>WhoseTaskIsIt</c>.</param>
/// <returns>Raven ETL tasks first, followed by SQL ETL tasks.</returns>
/// <exception cref="InvalidOperationException">
/// Thrown during enumeration when an ETL refers to a connection string that is not in the record.
/// </exception>
private static IEnumerable<OngoingTask> CollectEtlTasks(DatabaseRecord databaseRecord, DatabaseTopology dbTopology,
    ClusterTopology clusterTopology, ServerStore store)
{
    if (dbTopology == null)
    {
        yield break;
    }

    if (databaseRecord.RavenEtls != null)
    {
        foreach (var ravenEtl in databaseRecord.RavenEtls)
        {
            var responsibleTag = dbTopology.WhoseTaskIsIt(ravenEtl, store.IsPassive());

            // Disabled when the ETL or all of its transforms are disabled,
            // partially enabled when only some transforms are disabled.
            var state = (ravenEtl.Disabled || ravenEtl.Transforms.All(x => x.Disabled))
                ? OngoingTaskState.Disabled
                : ravenEtl.Transforms.Any(x => x.Disabled)
                    ? OngoingTaskState.PartiallyEnabled
                    : OngoingTaskState.Enabled;

            var found = databaseRecord.RavenConnectionStrings.TryGetValue(ravenEtl.ConnectionStringName, out var connection);
            if (found == false)
            {
                throw new InvalidOperationException(
                    $"Could not find connection string named '{ravenEtl.ConnectionStringName}' in the database record for '{ravenEtl.Name}' ETL");
            }

            yield return new OngoingTaskRavenEtl
            {
                TaskId = ravenEtl.TaskId,
                TaskName = ravenEtl.Name,
                // TODO arek TaskConnectionStatus =
                TaskState = state,
                ResponsibleNode = new NodeId
                {
                    NodeTag = responsibleTag,
                    NodeUrl = clusterTopology.GetUrlFromTag(responsibleTag)
                },
                DestinationUrl = connection.Url,
                DestinationDatabase = connection.Database
            };
        }
    }

    if (databaseRecord.SqlEtls != null)
    {
        foreach (var sqlEtl in databaseRecord.SqlEtls)
        {
            var responsibleTag = dbTopology.WhoseTaskIsIt(sqlEtl, store.IsPassive());

            var state = (sqlEtl.Disabled || sqlEtl.Transforms.All(x => x.Disabled))
                ? OngoingTaskState.Disabled
                : sqlEtl.Transforms.Any(x => x.Disabled)
                    ? OngoingTaskState.PartiallyEnabled
                    : OngoingTaskState.Enabled;

            var found = databaseRecord.SqlConnectionStrings.TryGetValue(sqlEtl.ConnectionStringName, out var sqlConnection);
            if (found == false)
            {
                throw new InvalidOperationException(
                    $"Could not find connection string named '{sqlEtl.ConnectionStringName}' in the database record for '{sqlEtl.Name}' ETL");
            }

            var (database, server) =
                SqlConnectionStringParser.GetDatabaseAndServerFromConnectionString(sqlEtl.FactoryName, sqlConnection.ConnectionString);

            yield return new OngoingTaskSqlEtl
            {
                TaskId = sqlEtl.TaskId,
                TaskName = sqlEtl.Name,
                // TODO arek TaskConnectionStatus =
                TaskState = state,
                ResponsibleNode = new NodeId
                {
                    NodeTag = responsibleTag,
                    NodeUrl = clusterTopology.GetUrlFromTag(responsibleTag)
                },
                DestinationServer = server,
                DestinationDatabase = database
            };
        }
    }
}
/// <summary>
/// Builds the list of internal replication destinations for the node <paramref name="nodeTag"/>.
/// </summary>
/// <param name="nodeTag">Tag of the node asking for its destinations.</param>
/// <param name="databaseName">Database name stamped on every returned destination.</param>
/// <param name="clusterTopology">Used to translate between node tags and URLs.</param>
/// <param name="state">Passed to <c>WhoseTaskIsIt</c> when deciding who mentors a promotable or rehab.</param>
/// <returns>
/// An empty list when <paramref name="nodeTag"/> is not a member; otherwise the other members,
/// plus the promotables and rehabs whose task is assigned to <paramref name="nodeTag"/>,
/// restricted to URLs that the cluster topology can map back to a node.
/// </returns>
public List<ReplicationNode> GetDestinations(string nodeTag, string databaseName, ClusterTopology clusterTopology, RachisState state)
{
    var candidateUrls = new List<string>();
    var destinations = new List<ReplicationNode>();

    // if we are not a member we can't have any destinations
    if (Members.Contains(nodeTag) == false)
    {
        return destinations;
    }

    foreach (var member in Members)
    {
        // everyone but me
        if (member != nodeTag)
        {
            candidateUrls.Add(clusterTopology.GetUrlFromTag(member));
        }
    }

    foreach (var nonMember in Promotables.Concat(Rehabs))
    {
        var nonMemberUrl = clusterTopology.GetUrlFromTag(nonMember);
        PredefinedMentors.TryGetValue(nonMember, out var mentor);

        var promotableTask = new PromotableTask(nonMember, nonMemberUrl, databaseName, mentor);
        if (WhoseTaskIsIt(promotableTask, state) == nodeTag)
        {
            candidateUrls.Add(nonMemberUrl);
        }
    }

    // remove nodes that are not in the raft cluster topology
    candidateUrls.RemoveAll(candidate => clusterTopology.TryGetNodeTagByUrl(candidate).HasUrl == false);

    destinations.AddRange(candidateUrls.Select(candidate => new InternalReplication
    {
        NodeTag = clusterTopology.TryGetNodeTagByUrl(candidate).NodeTag,
        Url = candidate,
        Database = databaseName
    }));

    return destinations;
}
/// <summary>
/// Lazily yields one <c>OngoingTaskSubscription</c> per subscription state stored under the
/// database's subscription prefix.
/// </summary>
/// <param name="context">Context used to read the subscription values.</param>
/// <param name="databaseRecord">Record providing the database name and topology.</param>
/// <param name="clusterTopology">Used to resolve the responsible node's URL.</param>
/// <returns>The subscription tasks, carrying only the fields needed by the list view.</returns>
private IEnumerable<OngoingTask> CollectSubscriptionTasks(TransactionOperationContext context, DatabaseRecord databaseRecord,
    ClusterTopology clusterTopology)
{
    var prefix = SubscriptionState.SubscriptionPrefix(databaseRecord.DatabaseName);

    foreach (var entry in ClusterStateMachine.ReadValuesStartingWith(context, prefix))
    {
        var subscription = JsonDeserializationClient.SubscriptionState(entry.Value);
        var responsibleTag = databaseRecord.Topology.WhoseTaskIsIt(subscription, ServerStore.Engine.CurrentState);

        var responsibleNode = new NodeId
        {
            NodeTag = responsibleTag,
            NodeUrl = clusterTopology.GetUrlFromTag(responsibleTag)
        };

        // Supply only needed fields for List View
        yield return new OngoingTaskSubscription
        {
            ResponsibleNode = responsibleNode,
            TaskName = subscription.SubscriptionName,
            TaskState = subscription.Disabled ? OngoingTaskState.Disabled : OngoingTaskState.Enabled,
            TaskId = subscription.SubscriptionId,
            Query = subscription.Query
        };
    }
}
/// <summary>
/// Lazily yields one list-view entry per Raven ETL and per SQL ETL configured on the database.
/// </summary>
/// <param name="databaseRecord">Record holding the ETL configurations and connection strings.</param>
/// <param name="dbTopology">Decides the responsible node; nothing is yielded when it is <c>null</c>.</param>
/// <param name="clusterTopology">Used to resolve the responsible node's URL.</param>
/// <returns>Raven ETL entries first, followed by SQL ETL entries.</returns>
/// <exception cref="InvalidOperationException">
/// Thrown during enumeration when an ETL refers to a connection string that is not in the record.
/// </exception>
private IEnumerable<OngoingTask> CollectEtlTasks(DatabaseRecord databaseRecord, DatabaseTopology dbTopology, ClusterTopology clusterTopology)
{
    if (dbTopology == null)
    {
        yield break;
    }

    if (databaseRecord.RavenEtls != null)
    {
        foreach (var ravenEtl in databaseRecord.RavenEtls)
        {
            var responsibleTag = dbTopology.WhoseTaskIsIt(ravenEtl, ServerStore.Engine.CurrentState);
            var state = GetEtlTaskState(ravenEtl);

            var found = databaseRecord.RavenConnectionStrings.TryGetValue(ravenEtl.ConnectionStringName, out var connection);
            if (found == false)
            {
                throw new InvalidOperationException(
                    $"Could not find connection string named '{ravenEtl.ConnectionStringName}' in the database record for '{ravenEtl.Name}' ETL");
            }

            string destinationUrl = null;
            var connectionStatus = OngoingTaskConnectionStatus.None;
            string error = null;

            if (responsibleTag == ServerStore.NodeTag)
            {
                // The task runs on this node: look for the running process with the same name.
                foreach (var process in Database.EtlLoader.Processes)
                {
                    if (process is RavenEtl running && running.Name == ravenEtl.Name)
                    {
                        destinationUrl = running.Url;
                        connectionStatus = OngoingTaskConnectionStatus.Active;
                        break;
                    }
                }

                if (connectionStatus == OngoingTaskConnectionStatus.None)
                {
                    error = $"The raven etl process'{ravenEtl.Name}' was not found.";
                }
            }
            else
            {
                connectionStatus = OngoingTaskConnectionStatus.NotOnThisNode;
            }

            yield return new OngoingTaskRavenEtlListView()
            {
                TaskId = ravenEtl.TaskId,
                TaskName = ravenEtl.Name,
                // TODO arek TaskConnectionStatus =
                TaskState = state,
                ResponsibleNode = new NodeId
                {
                    NodeTag = responsibleTag,
                    NodeUrl = clusterTopology.GetUrlFromTag(responsibleTag)
                },
                DestinationUrl = destinationUrl,
                TaskConnectionStatus = connectionStatus,
                DestinationDatabase = connection.Database,
                ConnectionStringName = ravenEtl.ConnectionStringName,
                Error = error
            };
        }
    }

    if (databaseRecord.SqlEtls != null)
    {
        foreach (var sqlEtl in databaseRecord.SqlEtls)
        {
            var responsibleTag = dbTopology.WhoseTaskIsIt(sqlEtl, ServerStore.Engine.CurrentState);
            var state = GetEtlTaskState(sqlEtl);

            var found = databaseRecord.SqlConnectionStrings.TryGetValue(sqlEtl.ConnectionStringName, out var sqlConnection);
            if (found == false)
            {
                throw new InvalidOperationException(
                    $"Could not find connection string named '{sqlEtl.ConnectionStringName}' in the database record for '{sqlEtl.Name}' ETL");
            }

            var (database, server) =
                SqlConnectionStringParser.GetDatabaseAndServerFromConnectionString(sqlEtl.FactoryName, sqlConnection.ConnectionString);

            yield return new OngoingTaskSqlEtlListView()
            {
                TaskId = sqlEtl.TaskId,
                TaskName = sqlEtl.Name,
                // TODO arek TaskConnectionStatus =
                TaskState = state,
                ResponsibleNode = new NodeId
                {
                    NodeTag = responsibleTag,
                    NodeUrl = clusterTopology.GetUrlFromTag(responsibleTag)
                },
                DestinationServer = server,
                DestinationDatabase = database,
                ConnectionStringName = sqlEtl.ConnectionStringName
            };
        }
    }
}
/// <summary>
/// Builds the ongoing-task description of a single external replication.
/// </summary>
/// <param name="dbTopology">Decides which node is responsible for the task.</param>
/// <param name="clusterTopology">Used to resolve the responsible node's URL.</param>
/// <param name="watcher">The external replication definition.</param>
/// <returns>
/// The task info. <c>ResponsibleNode</c> is <c>null</c> when no node is responsible. The
/// destination URL and connection status come from the replication loader when this node is the
/// responsible one; otherwise the status is <c>NotOnThisNode</c> and the URL is <c>null</c>.
/// </returns>
private OngoingTaskReplication GetExternalReplicationInfo(DatabaseTopology dbTopology, ClusterTopology clusterTopology,
    ExternalReplication watcher)
{
    var responsibleTag = dbTopology.WhoseTaskIsIt(watcher, ServerStore.Engine.CurrentState);

    NodeId responsibleNode = responsibleTag == null
        ? null
        : new NodeId
        {
            NodeTag = responsibleTag,
            NodeUrl = clusterTopology.GetUrlFromTag(responsibleTag)
        };

    (string Url, OngoingTaskConnectionStatus Status) connection;
    if (responsibleTag == ServerStore.NodeTag)
    {
        connection = Database.ReplicationLoader.GetExternalReplicationDestination(watcher.TaskId);
    }
    else
    {
        connection = (null, OngoingTaskConnectionStatus.NotOnThisNode);
    }

    return new OngoingTaskReplication
    {
        TaskId = watcher.TaskId,
        TaskName = watcher.Name,
        ResponsibleNode = responsibleNode,
        DestinationDatabase = watcher.Database,
        TaskState = watcher.Disabled ? OngoingTaskState.Disabled : OngoingTaskState.Enabled,
        DestinationUrl = connection.Url,
        TaskConnectionStatus = connection.Status,
    };
}
/// <summary>
/// Lazily yields the ongoing-task description of every external replication in
/// <paramref name="watchers"/>.
/// </summary>
/// <param name="watchers">External replication definitions to describe.</param>
/// <param name="dbTopology">Database topology; nothing is yielded when it is <c>null</c>.</param>
/// <param name="clusterTopology">Forwarded to <c>GetExternalReplicationInfo</c>.</param>
/// <returns>One task per watcher, in the order of <paramref name="watchers"/>.</returns>
private IEnumerable<OngoingTask> CollectExternalReplicationTasks(List<ExternalReplication> watchers, DatabaseTopology dbTopology,
    ClusterTopology clusterTopology)
{
    if (dbTopology != null)
    {
        foreach (var watcher in watchers)
        {
            yield return GetExternalReplicationInfo(dbTopology, clusterTopology, watcher);
        }
    }
}
/// <summary>
/// Writes the cluster topology, together with its etag, leader, leadership duration, state,
/// term, license details, cluster errors and node statuses, as JSON to the HTTP response body.
/// </summary>
/// <remarks>
/// When the stored topology has no nodes at all, a single-node topology is reported instead,
/// using this node's tag (or "A" when the tag is <c>null</c>) and the stored topology id (or
/// "dummy" when that is <c>null</c>). Otherwise, unless the "clientIndependent" query string
/// flag is set or the request carries the debug-package item, the current node's URL may be
/// replaced via <c>ReplaceCurrentNodeUrlWithClientRequestedNodeUrlIfNecessary</c>.
/// </remarks>
/// <returns>An already completed task.</returns>
public Task GetClusterTopology()
{
    using (ServerStore.ContextPool.AllocateOperationContext(out TransactionOperationContext ctx))
    using (ctx.OpenReadTransaction())
    {
        var clusterTopology = ServerStore.GetClusterTopology(ctx);
        var reportedTag = ServerStore.NodeTag;

        if (clusterTopology.AllNodes.Count == 0)
        {
            var fallbackTag = ServerStore.NodeTag ?? "A";
            var selfUrl = ServerStore.GetNodeHttpServerUrl(HttpContext.Request.GetClientRequestedNodeUrl());
            var topologyId = clusterTopology.TopologyId ?? "dummy";
            var members = new Dictionary<string, string>
            {
                [fallbackTag] = selfUrl
            };

            clusterTopology = new ClusterTopology(
                topologyId,
                members,
                new Dictionary<string, string>(),
                new Dictionary<string, string>(),
                fallbackTag,
                -1L
            );
            reportedTag = fallbackTag;
        }
        else
        {
            var isClientIndependent = GetBoolValueQueryString("clientIndependent", false) ?? false;
            if (isClientIndependent == false
                && HttpContext.Items.ContainsKey(nameof(LocalEndpointClient.DebugPackage)) == false)
            {
                clusterTopology.ReplaceCurrentNodeUrlWithClientRequestedNodeUrlIfNecessary(ServerStore, HttpContext);
            }
        }

        HttpContext.Response.StatusCode = (int)HttpStatusCode.OK;

        using (var writer = new BlittableJsonTextWriter(ctx, ResponseBodyStream()))
        {
            var licenseLimits = ServerStore.LoadLicenseLimits();
            var nodeLicenseDetails = licenseLimits == null
                ? null
                : DynamicJsonValue.Convert(licenseLimits.NodeLicenseDetails);

            var response = new DynamicJsonValue();
            response[nameof(ClusterTopologyResponse.Topology)] = clusterTopology.ToSortedJson();
            response[nameof(ClusterTopologyResponse.Etag)] = clusterTopology.Etag;
            response[nameof(ClusterTopologyResponse.Leader)] = ServerStore.LeaderTag;
            response["LeaderShipDuration"] = ServerStore.Engine.CurrentLeader?.LeaderShipDuration;
            response["CurrentState"] = ServerStore.CurrentRachisState;
            response[nameof(ClusterTopologyResponse.NodeTag)] = reportedTag;
            response["CurrentTerm"] = ServerStore.Engine.CurrentTerm;
            response["NodeLicenseDetails"] = nodeLicenseDetails;
            response[nameof(ServerStore.Engine.LastStateChangeReason)] = ServerStore.LastStateChangeReason();

            // The "Errors" property is emitted only when there is something to report.
            var clusterErrors = ServerStore.GetClusterErrors();
            if (clusterErrors.Count > 0)
            {
                response["Errors"] = clusterErrors;
            }

            var nodeStatuses = ServerStore.GetNodesStatuses();
            response["Status"] = DynamicJsonValue.Convert(nodeStatuses);

            ctx.Write(writer, response);
            writer.Flush();
        }
    }

    return Task.CompletedTask;
}
/// <summary>
/// Inspects the latest node reports for one database and applies at most one topology change
/// per call: flagging or moving unresponsive nodes, promoting promotables and rehabs, or
/// replacing nodes that are down.
/// </summary>
/// <param name="dbName">Name of the database being inspected.</param>
/// <param name="record">Database record whose topology is modified in place.</param>
/// <param name="clusterTopology">Cluster topology, forwarded to the health and mentor helpers.</param>
/// <param name="current">Latest status report per node tag.</param>
/// <param name="previous">Previous status report per node tag.</param>
/// <param name="deletions">Receives delete commands for replaced nodes; created when <c>null</c>.</param>
/// <returns>
/// A human-readable reason when the topology was changed or its status needs updating;
/// <c>null</c> when the database is disabled or nothing changed.
/// </returns>
private string UpdateDatabaseTopology(string dbName, DatabaseRecord record, ClusterTopology clusterTopology,
    Dictionary<string, ClusterNodeStatusReport> current,
    Dictionary<string, ClusterNodeStatusReport> previous,
    ref List<DeleteDatabaseCommand> deletions)
{
    if (record.Disabled)
    {
        return null;
    }

    var topology = record.Topology;

    // Flags the node as not responding, unless it is already flagged that way.
    // Returns true when the flag was newly set.
    bool MarkAsNotResponding(string tag)
    {
        if (topology.PromotablesStatus.TryGetValue(tag, out var promotionStatus)
            && promotionStatus == DatabasePromotionStatus.NotResponding)
        {
            return false;
        }

        topology.DemotionReasons[tag] = "Not responding";
        topology.PromotablesStatus[tag] = DatabasePromotionStatus.NotResponding;
        return true;
    }

    // ---- members ----
    var hasLivingNodes = false;
    foreach (var member in topology.Members)
    {
        var dbStatus = None;

        if (current.TryGetValue(member, out var memberReport) == false)
        {
            // there isn't much we can do here, except for logging it.
            if (previous.TryGetValue(member, out _))
            {
                // if we found this node in the previous report, we will ignore it this time and wait for the next report.
                continue;
            }

            var msg = $"The member node {member} was not found in both current and previous reports of the cluster observer. " +
                      $"If this error continue to raise, check the latency between the cluster nodes.";
            if (_logger.IsInfoEnabled)
            {
                _logger.Info(msg);
            }

            RaiseNodeNotFoundAlert(msg, member);
            continue;
        }

        if (memberReport.Status == ClusterNodeStatusReport.ReportStatus.Ok &&
            memberReport.Report.TryGetValue(dbName, out var dbStats))
        {
            dbStatus = dbStats.Status;
            if (dbStatus == Loaded || dbStatus == Loading || dbStatus == Unloaded)
            {
                hasLivingNodes = true;

                // The member is alive again: clear a previously recorded promotion status.
                if (topology.PromotablesStatus.ContainsKey(member))
                {
                    topology.DemotionReasons.Remove(member);
                    topology.PromotablesStatus.Remove(member);
                    return $"Node {member} is online";
                }

                continue;
            }
        }

        // Give a grace period (_moveToRehabTime milliseconds) before we move the node to a rehab
        if (DateTime.UtcNow.AddMilliseconds(-_moveToRehabTime) < memberReport?.LastSuccessfulUpdateDateTime)
        {
            continue;
        }

        if (TryMoveToRehab(dbName, topology, current, member))
        {
            return $"Node {member} is currently not responding (with status: {dbStatus}) and moved to rehab";
        }

        // database distribution is off and the node is down
        if (topology.DynamicNodesDistribution == false && MarkAsNotResponding(member))
        {
            return $"Node {member} is currently not responding with the status '{dbStatus}'";
        }
    }

    // ---- no member is alive ----
    if (hasLivingNodes == false)
    {
        var recoverable = new List<string>();
        foreach (var rehab in topology.Rehabs)
        {
            if (FailedDatabaseInstanceOrNode(clusterTopology, rehab, dbName, current) == DatabaseHealth.Good)
            {
                recoverable.Add(rehab);
            }
        }

        if (recoverable.Count > 0)
        {
            var mostUpToDate = FindMostUpToDateNode(recoverable, dbName, current);
            topology.Rehabs.Remove(mostUpToDate);
            topology.Members.Add(mostUpToDate);

            RaiseNoLivingNodesAlert(
                $"None of '{dbName}' database nodes are responding to the supervisor, promoting {mostUpToDate} from rehab to avoid making the database completely unreachable.",
                dbName);
            return $"None of '{dbName}' nodes are responding, promoting {mostUpToDate} from rehab";
        }

        if (topology.Members.Count == 0 && record.DeletionInProgress?.Count > 0)
        {
            return null; // We delete the whole database.
        }

        RaiseNoLivingNodesAlert($"None of '{dbName}' database nodes are responding to the supervisor, the database is unreachable.", dbName);
    }

    var shouldUpdateTopologyStatus = false;
    var updateTopologyStatusReason = new StringBuilder();

    // ---- promotables ----
    foreach (var promotable in topology.Promotables)
    {
        if (FailedDatabaseInstanceOrNode(clusterTopology, promotable, dbName, current) == DatabaseHealth.Bad)
        {
            // database distribution is off and the node is down
            if (topology.DynamicNodesDistribution == false)
            {
                if (MarkAsNotResponding(promotable))
                {
                    return $"Node {promotable} is currently not responding";
                }

                continue;
            }

            if (TryFindFitNode(promotable, dbName, topology, clusterTopology, current, out var replacement) == false)
            {
                if (MarkAsNotResponding(promotable))
                {
                    return $"Node {promotable} is currently not responding";
                }

                continue;
            }

            if (_server.LicenseManager.CanDynamicallyDistributeNodes(out _) == false)
            {
                continue;
            }

            // replace the bad promotable otherwise we will continue to add more and more nodes.
            topology.Promotables.Add(replacement);
            topology.DemotionReasons[replacement] = $"Just replaced the promotable node {promotable}";
            topology.PromotablesStatus[replacement] = DatabasePromotionStatus.WaitingForFirstPromotion;

            var deletionCmd = new DeleteDatabaseCommand
            {
                ErrorOnDatabaseDoesNotExists = false,
                DatabaseName = dbName,
                FromNodes = new[] { promotable },
                HardDelete = _hardDeleteOnReplacement,
                UpdateReplicationFactor = false
            };

            if (deletions == null)
            {
                deletions = new List<DeleteDatabaseCommand>();
            }

            deletions.Add(deletionCmd);
            return $"The promotable {promotable} is not responsive, replace it with a node {replacement}";
        }

        if (TryGetMentorNode(dbName, topology, clusterTopology, promotable, out var promotableMentor) == false)
        {
            continue;
        }

        var promotion = TryPromote(dbName, topology, current, previous, promotableMentor, promotable);
        if (promotion.Promote)
        {
            topology.Promotables.Remove(promotable);
            topology.Members.Add(promotable);
            topology.PredefinedMentors.Remove(promotable);
            RemoveOtherNodesIfNeeded(dbName, record, clusterTopology, current, ref deletions);
            return $"Promoting node {promotable} to member";
        }

        if (promotion.UpdateTopologyReason != null)
        {
            shouldUpdateTopologyStatus = true;
            updateTopologyStatusReason.AppendLine(promotion.UpdateTopologyReason);
        }
    }

    // ---- rehabs ----
    var goodMembers = GetNumberOfRespondingNodes(clusterTopology, dbName, topology, current);
    var pendingDelete = GetPendingDeleteNodes(record);

    foreach (var rehab in topology.Rehabs)
    {
        var health = FailedDatabaseInstanceOrNode(clusterTopology, rehab, dbName, current);

        if (health == DatabaseHealth.Bad)
        {
            if (topology.DynamicNodesDistribution == false)
            {
                continue;
            }

            if (goodMembers < topology.ReplicationFactor &&
                TryFindFitNode(rehab, dbName, topology, clusterTopology, current, out var replacement))
            {
                if (_server.LicenseManager.CanDynamicallyDistributeNodes(out _) == false)
                {
                    continue;
                }

                topology.Promotables.Add(replacement);
                topology.DemotionReasons[replacement] = $"Maintain the replication factor and create new replica instead of node {rehab}";
                topology.PromotablesStatus[replacement] = DatabasePromotionStatus.WaitingForFirstPromotion;
                return $"The rehab node {rehab} was too long in rehabilitation, create node {replacement} to replace it";
            }

            if (topology.PromotablesStatus.TryGetValue(rehab, out var rehabStatus) == false ||
                rehabStatus != DatabasePromotionStatus.NotResponding)
            {
                // was already online, but now we lost the connection again
                if (TryMoveToRehab(dbName, topology, current, rehab))
                {
                    return $"Node {rehab} is currently not responding";
                }
            }
        }
        else if (health == DatabaseHealth.Good)
        {
            if (pendingDelete.Contains(rehab) && topology.PromotablesStatus.ContainsKey(rehab) == false)
            {
                // already tried to promote, so we just ignore and continue
                continue;
            }

            if (TryGetMentorNode(dbName, topology, clusterTopology, rehab, out var rehabMentor) == false)
            {
                continue;
            }

            var promotion = TryPromote(dbName, topology, current, previous, rehabMentor, rehab);
            if (promotion.Promote)
            {
                if (_logger.IsOperationsEnabled)
                {
                    _logger.Operations($"The database {dbName} on {rehab} is reachable and up to date, so we promote it back to member.");
                }

                topology.Members.Add(rehab);
                topology.Rehabs.Remove(rehab);
                RemoveOtherNodesIfNeeded(dbName, record, clusterTopology, current, ref deletions);
                return $"Node {rehab} was recovered from rehabilitation and promoted back to member";
            }

            if (promotion.UpdateTopologyReason != null)
            {
                shouldUpdateTopologyStatus = true;
                updateTopologyStatusReason.AppendLine(promotion.UpdateTopologyReason);
            }
        }
    }

    RemoveOtherNodesIfNeeded(dbName, record, clusterTopology, current, ref deletions);

    if (shouldUpdateTopologyStatus)
    {
        return updateTopologyStatusReason.ToString();
    }

    return null;
}
/// <summary>
/// Rebuilds the voter, promotable and watcher ambassador maps so that they mirror
/// <paramref name="clusterTopology"/>. Ambassadors that already exist for a node are
/// re-registered (with their leader-wake handle updated for the node's current role);
/// new ones are created and started; ambassadors for nodes that are no longer listed
/// are disposed on a thread-pool thread.
/// </summary>
/// <param name="clusterTopology">Topology whose Members, Promotables and Watchers drive the refresh.</param>
/// <param name="connections">
/// Optional map from node tag to a <see cref="RemoteConnection"/>; when an entry exists for a
/// node that needs a new ambassador, it is handed to that ambassador's constructor.
/// </param>
private void RefreshAmbassadors(ClusterTopology clusterTopology, Dictionary<string, RemoteConnection> connections = null)
{
    bool lockTaken = false;
    Monitor.TryEnter(this, ref lockTaken);
    try
    {
        if (lockTaken == false)
        {
            // Not getting the lock only means we are being disposed, so we can quit now.
            if (_engine.Log.IsInfoEnabled)
            {
                _engine.Log.Info($"{ToString()}: Skipping refreshing ambassadors because we are been disposed of");
            }

            return;
        }

        if (Term != _engine.CurrentTerm)
        {
            if (_engine.Log.IsInfoEnabled)
            {
                _engine.Log.Info($"{ToString()}: We are no longer the actual leader, since the current term is {_engine.CurrentTerm}");
            }

            return;
        }

        if (_engine.Log.IsInfoEnabled)
        {
            _engine.Log.Info($"{ToString()}: Refreshing ambassadors");
        }

        // Move every ambassador we currently track into a single lookup and empty the role maps;
        // whatever is still in this lookup at the end is no longer part of the topology.
        var previous = new Dictionary<string, FollowerAmbassador>(StringComparer.OrdinalIgnoreCase);
        foreach (var roleMap in new[] { _voters, _promotables, _nonVoters })
        {
            foreach (var entry in roleMap)
            {
                previous[entry.Key] = entry.Value;
            }

            roleMap.Clear();
        }

        foreach (var voter in clusterTopology.Members)
        {
            if (voter.Key == _engine.Tag)
            {
                // our own node never gets an ambassador
                continue;
            }

            if (previous.TryGetValue(voter.Key, out FollowerAmbassador reused))
            {
                // already running: just re-register it as a voter
                reused.UpdateLeaderWake(_voterResponded);
                _voters.Add(voter.Key, reused);
                previous.Remove(voter.Key);
            }
            else
            {
                RemoteConnection existingConnection = null;
                connections?.TryGetValue(voter.Key, out existingConnection);

                var created = new FollowerAmbassador(_engine, this, _voterResponded, voter.Key, voter.Value, _engine.ClusterCertificate, existingConnection);
                _voters.Add(voter.Key, created);
                _engine.AppendStateDisposable(this, created);
                if (_engine.Log.IsInfoEnabled)
                {
                    _engine.Log.Info($"{ToString()}: starting ambassador for voter {voter.Key} {voter.Value}");
                }

                created.Start();
            }
        }

        foreach (var promotable in clusterTopology.Promotables)
        {
            if (previous.TryGetValue(promotable.Key, out FollowerAmbassador reused))
            {
                // already running: just re-register it as a promotable
                reused.UpdateLeaderWake(_promotableUpdated);
                _promotables.Add(promotable.Key, reused);
                previous.Remove(promotable.Key);
            }
            else
            {
                RemoteConnection existingConnection = null;
                connections?.TryGetValue(promotable.Key, out existingConnection);

                var created = new FollowerAmbassador(_engine, this, _promotableUpdated, promotable.Key, promotable.Value, _engine.ClusterCertificate, existingConnection);
                _promotables.Add(promotable.Key, created);
                _engine.AppendStateDisposable(this, created);
                if (_engine.Log.IsInfoEnabled)
                {
                    _engine.Log.Info($"{ToString()}: starting ambassador for promotable {promotable.Key} {promotable.Value}");
                }

                created.Start();
            }
        }

        foreach (var nonVoter in clusterTopology.Watchers)
        {
            if (previous.TryGetValue(nonVoter.Key, out FollowerAmbassador reused))
            {
                // already running: just re-register it as a watcher
                reused.UpdateLeaderWake(_noop);
                _nonVoters.Add(nonVoter.Key, reused);
                previous.Remove(nonVoter.Key);
            }
            else
            {
                RemoteConnection existingConnection = null;
                connections?.TryGetValue(nonVoter.Key, out existingConnection);

                var created = new FollowerAmbassador(_engine, this, _noop, nonVoter.Key, nonVoter.Value, _engine.ClusterCertificate, existingConnection);
                _nonVoters.Add(nonVoter.Key, created);
                _engine.AppendStateDisposable(this, created);
                if (_engine.Log.IsInfoEnabled)
                {
                    _engine.Log.Info($"{ToString()}: starting ambassador for watcher {nonVoter.Key} {nonVoter.Value}");
                }

                created.Start();
            }
        }

        if (previous.Count == 0)
        {
            return;
        }

        // The counter is raised before the work item is queued and lowered once every
        // leftover ambassador has been disposed.
        Interlocked.Increment(ref _previousPeersWereDisposed);
        System.Threading.ThreadPool.QueueUserWorkItem(_ =>
        {
            foreach (var leftover in previous)
            {
                // it is not used by anything else, so we can close it
                leftover.Value.Dispose();
            }

            Interlocked.Decrement(ref _previousPeersWereDisposed);
        }, null);
    }
    finally
    {
        if (lockTaken)
        {
            Monitor.Exit(this);
        }
    }
}
/// <summary>
/// Waits for the raft command at <paramref name="index"/> to be notified locally and then asks
/// every node in <paramref name="members"/> to wait for that same index, collecting failures.
/// </summary>
/// <param name="context">Context passed to each request executor call.</param>
/// <param name="database">Database name, used only in the failure message.</param>
/// <param name="clusterTopology">Used to resolve each member tag to a URL.</param>
/// <param name="members">Tags of the nodes to wait on; must not be empty.</param>
/// <param name="index">Raft index to wait for.</param>
/// <exception cref="InvalidOperationException"><paramref name="members"/> is empty.</exception>
/// <exception cref="TimeoutException">Every collected failure is an <see cref="OperationCanceledException"/>.</exception>
/// <exception cref="InvalidDataException">At least one collected failure is not a cancellation.</exception>
protected async Task WaitForExecutionOnRelevantNodes(JsonOperationContext context, string database, ClusterTopology clusterTopology, List<string> members, long index)
{
    // first let's see if we commit this in the leader
    await ServerStore.Cluster.WaitForIndexNotification(index);

    if (members.Count == 0)
    {
        throw new InvalidOperationException("Cannot wait for execution when there are no nodes to execute ON.");
    }

    var executors = new List<ClusterRequestExecutor>();
    try
    {
        using (var cts = CancellationTokenSource.CreateLinkedTokenSource(ServerStore.ServerShutdown))
        {
            // Cancelled either on server shutdown or once the configured operation timeout elapses.
            cts.CancelAfter(ServerStore.Configuration.Cluster.OperationTimeout.AsTimeSpan);

            var pending = new List<Task<Exception>>();
            List<Exception> failures = null;

            foreach (var member in members)
            {
                var memberUrl = clusterTopology.GetUrlFromTag(member);
                var executor = ClusterRequestExecutor.CreateForSingleNode(memberUrl, ServerStore.Server.Certificate.Certificate);
                executors.Add(executor);
                pending.Add(ExecuteTask(executor, member, cts.Token));
            }

            // Drain the tasks in completion order; each one yields null on success or the failure it caught.
            while (pending.Count > 0)
            {
                var finished = await Task.WhenAny(pending);
                pending.Remove(finished);

                if (finished.Result != null)
                {
                    var failure = finished.Result.ExtractSingleInnerException();
                    if (failures == null)
                    {
                        failures = new List<Exception>();
                    }

                    failures.Add(failure);
                }
            }

            if (failures == null)
            {
                return;
            }

            var allTimeouts = failures.TrueForAll(failure => failure is OperationCanceledException);
            var aggregateException = new AggregateException(failures);

            if (allTimeouts)
            {
                throw new TimeoutException($"Waited too long for the raft command (number {index}) to be executed on any of the relevant nodes to this command.", aggregateException);
            }

            throw new InvalidDataException($"The database '{database}' was created but is not accessible, because all of the nodes on which this database was supposed to reside on, threw an exception.", aggregateException);
        }
    }
    finally
    {
        foreach (var executor in executors)
        {
            executor.Dispose();
        }
    }

    // Runs the wait command against one node and reports the outcome as a value instead of throwing.
    async Task<Exception> ExecuteTask(RequestExecutor executor, string nodeTag, CancellationToken token)
    {
        try
        {
            await executor.ExecuteAsync(new WaitForRaftIndexCommand(index), context, token: token);
            return null;
        }
        catch (RavenException re) when (re.InnerException is HttpRequestException)
        {
            // A connection failure is reported only for a self-check; for any other node it is
            // tolerated (test: AddDatabaseOnDisconnectedNode).
            return nodeTag == ServerStore.NodeTag ? re : null;
        }
        catch (Exception e)
        {
            return e;
        }
    }
}
/// <summary>
/// Waits for the raft command at <paramref name="index"/> to be notified locally, then sends a
/// <c>WaitForRaftIndexCommand</c> for that index to the single node identified by <paramref name="node"/>.
/// </summary>
/// <param name="context">Context passed to the request executor call.</param>
/// <param name="clusterTopology">Used to resolve <paramref name="node"/> to a URL.</param>
/// <param name="node">Tag of the node to wait on.</param>
/// <param name="index">Raft index to wait for.</param>
protected async Task WaitForExecutionOnSpecificNode(TransactionOperationContext context, ClusterTopology clusterTopology, string node, long index)
{
    // first let's see if we commit this in the leader
    await ServerStore.Cluster.WaitForIndexNotification(index);

    var nodeUrl = clusterTopology.GetUrlFromTag(node);
    var certificate = ServerStore.Server.Certificate.Certificate;

    // The executor is created for this one call and disposed as soon as the command completes.
    using (var requester = ClusterRequestExecutor.CreateForSingleNode(nodeUrl, certificate))
    {
        var waitCommand = new WaitForRaftIndexCommand(index);
        await requester.ExecuteAsync(waitCommand, context);
    }
}
/// <summary>
/// Takes the node URL the client requested (read from <paramref name="httpContext"/>), maps it
/// through <c>serverStore.GetNodeHttpServerUrl</c>, and hands the result together with this
/// server's node tag to <c>topology.ReplaceCurrentNodeUrlWithClientRequestedUrl</c>.
/// </summary>
/// <param name="topology">Topology on which the replacement is requested.</param>
/// <param name="serverStore">Supplies the node tag and the URL mapping.</param>
/// <param name="httpContext">Current HTTP context; its request supplies the client-requested URL.</param>
public static void ReplaceCurrentNodeUrlWithClientRequestedNodeUrlIfNecessary(this ClusterTopology topology, ServerStore serverStore, HttpContext httpContext)
{
    var clientRequestedUrl = httpContext.Request.GetClientRequestedNodeUrl();
    var nodeUrlForClient = serverStore.GetNodeHttpServerUrl(clientRequestedUrl);

    topology.ReplaceCurrentNodeUrlWithClientRequestedUrl(serverStore.NodeTag, nodeUrlForClient);
}
/// <summary>
/// Looks for a cluster node that does not yet hold the database and is not in bad health, so it
/// can take over for <paramref name="badNode"/>. Among nodes that have a status report, the one
/// whose report contains the fewest entries wins; a node without a report is only chosen when
/// nothing else has been picked at the time it is visited.
/// </summary>
/// <param name="badNode">Tag of the node being replaced; used only in log messages.</param>
/// <param name="db">Database name.</param>
/// <param name="topology">Database topology; its nodes are excluded from the search.</param>
/// <param name="clusterTopology">Cluster topology whose nodes are the candidates.</param>
/// <param name="current">Latest status report per node tag.</param>
/// <param name="bestNode">The chosen node tag, or <c>null</c> when none was found.</param>
/// <returns><c>true</c> when a node was chosen; otherwise <c>false</c>.</returns>
private bool TryFindFitNode(string badNode, string db, DatabaseTopology topology, ClusterTopology clusterTopology, Dictionary<string, ClusterNodeStatusReport> current, out string bestNode)
{
    bestNode = null;
    var smallestReportCount = int.MaxValue;
    var nodesHoldingDatabase = topology.AllNodes.ToList();

    if (topology.Members.Count == 0)
    {
        // no one can be used as mentor
        return false;
    }

    foreach (var candidate in clusterTopology.AllNodes.Keys)
    {
        // Skip nodes that already hold the database and nodes reported as bad.
        if (nodesHoldingDatabase.Contains(candidate) ||
            FailedDatabaseInstanceOrNode(clusterTopology, candidate, db, current) == DatabaseHealth.Bad)
        {
            continue;
        }

        if (current.TryGetValue(candidate, out var candidateReport))
        {
            var reportCount = candidateReport.Report.Count;
            if (smallestReportCount > reportCount)
            {
                smallestReportCount = reportCount;
                bestNode = candidate;
            }
        }
        else if (bestNode == null)
        {
            // No report for this node: use it only as a fallback when nothing is selected yet.
            bestNode = candidate;
        }
    }

    var found = bestNode != null;
    if (_logger.IsOperationsEnabled)
    {
        _logger.Operations(found
            ? $"The database '{db}' on {badNode} has not responded for a long time, so we reassign it to {bestNode}."
            : $"The database '{db}' on {badNode} has not responded for a long time, but there is no free node to reassign it.");
    }

    return found;
}