private async Task<T> HandleWithFailovers<T>(Func<OperationMetadata, IRequestTimeMetric, Task<T>> operation, CancellationToken token, bool withClusterFailoverHeader)
{
    var nodes = NodeUrls;
    for (var i = 0; i < nodes.Count; i++)
    {
        var n = nodes[i];

        // Has to be set here, per node, to be more thread safe.
        n.ClusterInformation.WithClusterFailoverHeader = withClusterFailoverHeader;

        if (ShouldExecuteUsing(n) == false)
            continue;

        var hasMoreNodes = nodes.Count > i + 1;
        var result = await TryClusterOperationAsync(n, operation, hasMoreNodes, token).ConfigureAwait(false);
        if (result.Success)
            return result.Result;

        if (Log.IsDebugEnabled)
            Log.Debug($"Tried executing operation on failover server {n.Url} with no success.");

        FailureCounters.IncrementFailureCount(n.Url);
    }

    throw new InvalidOperationException("Cluster is not reachable. Executing operation on any of the nodes failed, aborting.");
}
private bool ShouldExecuteUsing(OperationMetadata operationMetadata)
{
    var failureCounter = FailureCounters.GetHolder(operationMetadata.Url);
    if (failureCounter.Value <= 1) // can fail once
        return true;

    return false;
}
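// A minimal, self-contained sketch (not part of the client) of how the failure-count gate above
// interacts with the failover loop in HandleWithFailovers: a node that has already failed more than
// once is skipped, the remaining candidates are tried in order, and only the last candidate is allowed
// to surface its exception (loosely mirroring the hasMoreNodes/avoidThrowing flag). The names
// FailoverSketch and failureCounts are hypothetical.
using System;
using System.Collections.Generic;

class FailoverSketch
{
    private readonly Dictionary<string, int> failureCounts = new Dictionary<string, int>();

    public T ExecuteWithFailover<T>(IReadOnlyList<string> nodeUrls, Func<string, T> operation)
    {
        for (var i = 0; i < nodeUrls.Count; i++)
        {
            var url = nodeUrls[i];

            int failures;
            failureCounts.TryGetValue(url, out failures);
            if (failures > 1) // mirrors ShouldExecuteUsing: a node may fail once before being skipped
                continue;

            try
            {
                var result = operation(url);
                failureCounts[url] = 0; // success resets the counter, like FailureCounters.ResetFailureCount
                return result;
            }
            catch
            {
                failureCounts[url] = failures + 1;
                if (i == nodeUrls.Count - 1) // no more nodes left: let the failure propagate
                    throw;
            }
        }

        throw new InvalidOperationException("Cluster is not reachable. Executing operation on any of the nodes failed, aborting.");
    }
}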
private async Task<T> ExecuteWithinClusterInternalAsync<T>(AsyncServerClient serverClient, HttpMethod method, Func<OperationMetadata, Task<T>> operation, CancellationToken token, int numberOfRetries = 2)
{
    token.ThrowIfCancellationRequested();

    if (numberOfRetries < 0)
        throw new InvalidOperationException("Cluster is not reachable. Out of retries, aborting.");

    var node = LeaderNode;
    if (node == null)
    {
#pragma warning disable 4014
        UpdateReplicationInformationIfNeededAsync(serverClient); // maybe start refresh task
#pragma warning restore 4014

        switch (serverClient.ClusterBehavior)
        {
            case ClusterBehavior.ReadFromAllWriteToLeaderWithFailovers:
            case ClusterBehavior.ReadFromLeaderWriteToLeaderWithFailovers:
                if (Nodes.Count == 0)
                    leaderNodeSelected.Wait(TimeSpan.FromSeconds(WaitForLeaderTimeoutInSeconds));
                break;
            default:
                if (leaderNodeSelected.Wait(TimeSpan.FromSeconds(WaitForLeaderTimeoutInSeconds)) == false)
                    throw new InvalidOperationException("Cluster is not reachable. No leader was selected, aborting.");
                break;
        }

        node = LeaderNode;
    }

    switch (serverClient.ClusterBehavior)
    {
        case ClusterBehavior.ReadFromAllWriteToLeader:
            if (method == HttpMethods.Get)
                node = GetNodeForReadOperation(node);
            break;
        case ClusterBehavior.ReadFromAllWriteToLeaderWithFailovers:
            if (node == null)
                return await HandleWithFailovers(operation, token).ConfigureAwait(false);
            if (method == HttpMethods.Get)
                node = GetNodeForReadOperation(node);
            break;
        case ClusterBehavior.ReadFromLeaderWriteToLeaderWithFailovers:
            if (node == null)
                return await HandleWithFailovers(operation, token).ConfigureAwait(false);
            break;
    }

    var operationResult = await TryClusterOperationAsync(node, operation, false, token).ConfigureAwait(false);
    if (operationResult.Success)
        return operationResult.Result;

    LeaderNode = null;
    FailureCounters.IncrementFailureCount(node.Url);
    return await ExecuteWithinClusterInternalAsync(serverClient, method, operation, token, numberOfRetries - 1).ConfigureAwait(false);
}
private async Task<AsyncOperationResult<T>> TryClusterOperationAsync<T>(OperationMetadata node, Func<OperationMetadata, Task<T>> operation, bool avoidThrowing, CancellationToken token)
{
    Debug.Assert(node != null);
    token.ThrowIfCancellationRequested();

    var shouldRetry = false;
    var operationResult = new AsyncOperationResult<T>();
    try
    {
        operationResult.Result = await operation(node).ConfigureAwait(false);
        operationResult.Success = true;
    }
    catch (Exception e)
    {
        bool wasTimeout;
        if (HttpConnectionHelper.IsServerDown(e, out wasTimeout))
        {
            shouldRetry = true;
            operationResult.WasTimeout = wasTimeout;
        }
        else
        {
            var ae = e as AggregateException;
            ErrorResponseException errorResponseException;
            if (ae != null)
                errorResponseException = ae.ExtractSingleInnerException() as ErrorResponseException;
            else
                errorResponseException = e as ErrorResponseException;

            if (errorResponseException != null && (errorResponseException.StatusCode == HttpStatusCode.Redirect || errorResponseException.StatusCode == HttpStatusCode.ExpectationFailed))
                shouldRetry = true;
        }

        if (shouldRetry == false && avoidThrowing == false)
            throw;
    }

    if (operationResult.Success)
        FailureCounters.ResetFailureCount(node.Url);

    return operationResult;
}
private async Task<T> HandleWithFailovers<T>(Func<OperationMetadata, Task<T>> operation, CancellationToken token)
{
    var nodes = NodeUrls;
    for (var i = 0; i < nodes.Count; i++)
    {
        var n = nodes[i];
        if (ShouldExecuteUsing(n) == false)
            continue;

        var hasMoreNodes = nodes.Count > i + 1;
        var result = await TryClusterOperationAsync(n, operation, hasMoreNodes, token).ConfigureAwait(false);
        if (result.Success)
            return result.Result;

        FailureCounters.IncrementFailureCount(n.Url);
    }

    throw new InvalidOperationException("Cluster is not reachable. Executing operation on any of the nodes failed, aborting.");
}
private Task UpdateReplicationInformationForCluster(AsyncServerClient serverClient, OperationMetadata primaryNode, Func<OperationMetadata, Task<ReplicationDocumentWithClusterInformation>> getReplicationDestinationsTask)
{
    lock (this)
    {
        var serverHash = ServerHash.GetServerHash(primaryNode.Url);

        var taskCopy = refreshReplicationInformationTask;
        if (taskCopy != null)
            return taskCopy;

        if (firstTime)
        {
            firstTime = false;

            var document = ReplicationInformerLocalCache.TryLoadReplicationInformationFromLocalCache(serverHash);
            var nodes = GetNodes(primaryNode, document?.DataAsJson.JsonDeserialization<ReplicationDocumentWithClusterInformation>());
            if (nodes != null)
            {
                Nodes = nodes;
                var newLeaderNode = GetLeaderNode(Nodes);
                if (newLeaderNode != null)
                {
                    if (Log.IsDebugEnabled)
                        Log.Debug($"Fetched topology from cache, Leader is {LeaderNode}\n Nodes:" + string.Join(",", Nodes.Select(n => n.Url)));

                    SetLeaderNodeToKnownLeader(newLeaderNode);
                    return new CompletedTask();
                }

                if (Log.IsDebugEnabled)
                    Log.Debug($"Fetched topology from cache, no leader found.\n Nodes:" + string.Join(",", Nodes.Select(n => n.Url)));

                SetLeaderNodeToNull();
            }
        }

        return refreshReplicationInformationTask = Task.Factory.StartNew(() =>
        {
            var tryFailoverServers = false;
            var triedFailoverServers = FailoverServers == null || FailoverServers.Length == 0;
            for (;;)
            {
                // Taking a snapshot so we can tell if the value changed while we fetch the topology.
                var prevLeader = LeaderNode;
                var nodes = NodeUrls.ToHashSet();

                if (tryFailoverServers == false)
                {
                    if (nodes.Count == 0)
                        nodes.Add(primaryNode);
                }
                else
                {
                    nodes.Add(primaryNode); // always check the primary node during a failover check

                    foreach (var failoverServer in FailoverServers)
                    {
                        var node = ConvertReplicationDestinationToOperationMetadata(failoverServer, ClusterInformation.NotInCluster);
                        if (node != null)
                            nodes.Add(node);
                    }

                    triedFailoverServers = true;
                }

                var replicationDocuments = nodes
                    .Select(operationMetadata => new
                    {
                        Node = operationMetadata,
                        Task = getReplicationDestinationsTask(operationMetadata)
                    })
                    .ToArray();

                var tasks = replicationDocuments
                    .Select(x => (Task)x.Task)
                    .ToArray();

                var tasksCompleted = Task.WaitAll(tasks, ReplicationDestinationsTopologyTimeout);
                if (Log.IsDebugEnabled && tasksCompleted == false)
                    Log.Debug($"During topology fetch {tasks.Count(t => t.IsCompleted)} servers have responded out of {tasks.Length}");

                replicationDocuments.ForEach(x =>
                {
                    if (x.Task.IsCompleted && x.Task.Result != null)
                        FailureCounters.ResetFailureCount(x.Node.Url);
                });

                var newestTopology = replicationDocuments
                    .Where(x => x.Task.IsCompleted && x.Task.Result != null)
                    .OrderByDescending(x => x.Task.Result.Term)
                    .ThenByDescending(x =>
                    {
                        var index = x.Task.Result.ClusterCommitIndex;
                        return x.Task.Result.ClusterInformation.IsLeader ? index + 1 : index;
                    })
                    .FirstOrDefault();

                if (newestTopology == null && FailoverServers != null && FailoverServers.Length > 0 && tryFailoverServers == false)
                    tryFailoverServers = true;

                if (newestTopology == null && triedFailoverServers)
                {
                    if (Log.IsDebugEnabled)
                        Log.Debug($"Fetching topology resulted in no topology, tried failover servers, setting leader node to primary node ({primaryNode}).");

                    // If the leader node is not null it means that somebody updated it; we don't want to overwrite it with the primary.
                    // I'm raising the leader-changed event although we don't have a real leader because some tests don't wait for the leader but actually for any node.
                    // TODO: change back to: if (SetLeaderNodeIfLeaderIsNull(primaryNode, false) == false)
                    if (SetLeaderNodeIfLeaderIsNull(primaryNode) == false)
                        return;

                    if (Nodes.Count == 0)
                        Nodes = new List<OperationMetadata> { primaryNode };

                    return;
                }

                if (newestTopology != null)
                {
                    var replicationDocument = newestTopology.Task.Result;
                    var node = newestTopology.Node;

                    if (UpdateTopology(serverClient, node, replicationDocument, serverHash, prevLeader))
                        return;
                }

                Thread.Sleep(500);
            }
        }).ContinueWith(t =>
        {
            lastUpdate = SystemTime.UtcNow;
            refreshReplicationInformationTask = null;
        });
    }
}
private async Task<AsyncOperationResult<T>> TryClusterOperationAsync<T>(OperationMetadata node, Func<OperationMetadata, IRequestTimeMetric, Task<T>> operation, bool avoidThrowing, CancellationToken token)
{
    Debug.Assert(node != null);
    token.ThrowIfCancellationRequested();

    var shouldRetry = false;
    var operationResult = new AsyncOperationResult<T>();
    try
    {
        operationResult.Result = await operation(node, null).ConfigureAwait(false);
        operationResult.Success = true;
    }
    catch (Exception e)
    {
        bool wasTimeout;
        if (HttpConnectionHelper.IsServerDown(e, out wasTimeout))
        {
            shouldRetry = true;
            operationResult.WasTimeout = wasTimeout;

            if (Log.IsDebugEnabled)
                Log.Debug($"Operation failed because server {node.Url} is down.");
        }
        else
        {
            var ae = e as AggregateException;
            ErrorResponseException errorResponseException;
            if (ae != null)
                errorResponseException = ae.ExtractSingleInnerException() as ErrorResponseException;
            else
                errorResponseException = e as ErrorResponseException;

            if (errorResponseException != null)
            {
                if (errorResponseException.StatusCode == HttpStatusCode.Redirect)
                {
                    IEnumerable<string> values;
                    if (errorResponseException.Response.Headers.TryGetValues("Raven-Leader-Redirect", out values) == false || values.Contains("true") == false)
                        throw new InvalidOperationException("Got 302 Redirect, but without Raven-Leader-Redirect: true header, maybe there is a proxy in the middle", e);

                    var redirectUrl = errorResponseException.Response.Headers.Location.ToString();
                    var newLeaderNode = Nodes.FirstOrDefault(n => n.Url.Equals(redirectUrl)) ?? new OperationMetadata(redirectUrl, node.Credentials, node.ClusterInformation);
                    SetLeaderNodeToKnownLeader(newLeaderNode);

                    if (Log.IsDebugEnabled)
                        Log.Debug($"Redirecting to {redirectUrl} because {node.Url} responded with 302-redirect.");

                    return await TryClusterOperationAsync(newLeaderNode, operation, avoidThrowing, token).ConfigureAwait(false);
                }

                if (errorResponseException.StatusCode == HttpStatusCode.ExpectationFailed)
                {
                    if (Log.IsDebugEnabled)
                        Log.Debug($"Operation failed with status code {HttpStatusCode.ExpectationFailed}, will retry.");

                    shouldRetry = true;
                }
            }
        }

        if (shouldRetry == false && avoidThrowing == false)
            throw;

        operationResult.Error = e;
    }

    if (operationResult.Success)
        FailureCounters.ResetFailureCount(node.Url);

    return operationResult;
}
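// A standalone sketch (hypothetical types, not the client's API) of the decision TryClusterOperationAsync
// makes when a call fails: a "server down" failure or a 417 (ExpectationFailed) is retryable, a 302 whose
// Raven-Leader-Redirect header is "true" means "follow the new leader named in the Location header", and
// anything else is surfaced unless the caller asked to avoid throwing because more failover nodes remain.
using System.Net;

enum FailureAction
{
    FollowRedirect, // 302 + Raven-Leader-Redirect: true -> retry against the redirect target
    Retry,          // server down / timeout / 417 -> record the failure and let the caller try another node
    Rethrow         // unknown error on the last candidate -> surface the exception
}

static class FailureClassifierSketch
{
    public static FailureAction Classify(bool serverIsDown, HttpStatusCode? statusCode, bool leaderRedirectHeaderIsTrue, bool avoidThrowing)
    {
        if (serverIsDown)
            return FailureAction.Retry;

        if (statusCode == HttpStatusCode.Redirect && leaderRedirectHeaderIsTrue)
            return FailureAction.FollowRedirect;

        if (statusCode == HttpStatusCode.ExpectationFailed)
            return FailureAction.Retry;

        // avoidThrowing: swallow the error so the caller can move on to the next failover node.
        return avoidThrowing ? FailureAction.Retry : FailureAction.Rethrow;
    }
}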
private async Task<T> ExecuteWithinClusterInternalAsync<T>(AsyncServerClient serverClient, HttpMethod method, Func<OperationMetadata, IRequestTimeMetric, Task<T>> operation, CancellationToken token, int numberOfRetries = 2, bool withClusterFailoverHeader = false)
{
    token.ThrowIfCancellationRequested();

    var node = LeaderNode;
    if (node == null)
    {
#pragma warning disable 4014
        // If withClusterFailoverHeader is set to true we will need to force the update and choose another leader.
        UpdateReplicationInformationIfNeededAsync(serverClient, force: withClusterFailoverHeader); // maybe start refresh task
#pragma warning restore 4014

        switch (serverClient.convention.FailoverBehavior)
        {
            case FailoverBehavior.ReadFromAllWriteToLeaderWithFailovers:
            case FailoverBehavior.ReadFromLeaderWriteToLeaderWithFailovers:
                var waitResult = leaderNodeSelected.Wait(WaitForLeaderTimeout);
                if (Log.IsDebugEnabled && waitResult == false)
                    Log.Debug($"Failover behavior is {serverClient.convention.FailoverBehavior}, waited for {WaitForLeaderTimeout.TotalSeconds} seconds and no leader was selected.");
                break;
            default:
                if (leaderNodeSelected.Wait(WaitForLeaderTimeout) == false)
                {
                    if (Log.IsDebugEnabled)
                        Log.Debug($"Failover behavior is {serverClient.convention.FailoverBehavior}, waited for {WaitForLeaderTimeout.TotalSeconds} seconds and no leader was selected.");

                    throw new InvalidOperationException($"Cluster is not in a stable state. No leader was selected, but we require one for making a request using {serverClient.convention.FailoverBehavior}.");
                }
                break;
        }

        node = LeaderNode;
    }

    switch (serverClient.convention.FailoverBehavior)
    {
        case FailoverBehavior.ReadFromAllWriteToLeader:
            if (method == HttpMethods.Get)
                node = GetNodeForReadOperation(node) ?? node;
            break;
        case FailoverBehavior.ReadFromAllWriteToLeaderWithFailovers:
            if (node == null)
                return await HandleWithFailovers(operation, token, withClusterFailoverHeader).ConfigureAwait(false);
            if (method == HttpMethods.Get)
                node = GetNodeForReadOperation(node) ?? node;
            break;
        case FailoverBehavior.ReadFromLeaderWriteToLeaderWithFailovers:
            if (node == null)
                return await HandleWithFailovers(operation, token, withClusterFailoverHeader).ConfigureAwait(false);
            break;
    }

    var operationResult = await TryClusterOperationAsync(node, operation, false, token).ConfigureAwait(false);
    if (operationResult.Success)
        return operationResult.Result;

    if (Log.IsDebugEnabled)
        Log.Debug($"Failed executing operation on node {node.Url}, number of remaining retries: {numberOfRetries}.");

    // The value of the leader was changed since we took a snapshot of it and it is not null,
    // so we will try to run again without considering this a failure.
    if (SetLeaderNodeToNullIfPrevIsTheSame(node) == false)
        return await ExecuteWithinClusterInternalAsync(serverClient, method, operation, token, numberOfRetries, withClusterFailoverHeader).ConfigureAwait(false);

    FailureCounters.IncrementFailureCount(node.Url);

    if (serverClient.convention.FailoverBehavior == FailoverBehavior.ReadFromLeaderWriteToLeaderWithFailovers ||
        serverClient.convention.FailoverBehavior == FailoverBehavior.ReadFromAllWriteToLeaderWithFailovers)
        withClusterFailoverHeader = true;

    if (numberOfRetries <= 0)
        throw new InvalidOperationException("Cluster is not reachable. Out of retries, aborting.", operationResult.Error);

    return await ExecuteWithinClusterInternalAsync(serverClient, method, operation, token, numberOfRetries - 1, withClusterFailoverHeader).ConfigureAwait(false);
}
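// A minimal sketch of the compare-and-clear idea behind SetLeaderNodeToNullIfPrevIsTheSame. The helper's
// real implementation is not shown in this listing, so this is an assumption about its intent: the leader
// reference is cleared only if it is still the node we just failed against; if another thread already
// installed a different leader, the retry should use that leader instead of counting this as a failure.
// LeaderSlotSketch is a hypothetical name.
using System.Threading;

class LeaderSlotSketch<TNode> where TNode : class
{
    private TNode leader;

    public TNode Leader => Volatile.Read(ref leader);

    public void SetLeader(TNode newLeader) => Volatile.Write(ref leader, newLeader);

    // Returns true when we actually cleared the slot, i.e. the failed node was still the current leader.
    public bool ClearIfStillCurrent(TNode failedNode) =>
        Interlocked.CompareExchange(ref leader, null, failedNode) == failedNode;
}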
public ClusterAwareRequestExecuter()
{
    Nodes = new List<OperationMetadata>();
    FailureCounters = new FailureCounters();
}
private Task UpdateReplicationInformationForCluster(AsyncServerClient serverClient, OperationMetadata primaryNode, Func<OperationMetadata, Task<ReplicationDocumentWithClusterInformation>> getReplicationDestinationsTask)
{
    lock (this)
    {
        var serverHash = ServerHash.GetServerHash(primaryNode.Url);

        var taskCopy = refreshReplicationInformationTask;
        if (taskCopy != null)
            return taskCopy;

        if (firstTime)
        {
            firstTime = false;

            var document = ReplicationInformerLocalCache.TryLoadReplicationInformationFromLocalCache(serverHash);
            var nodes = GetNodes(primaryNode, document?.DataAsJson.JsonDeserialization<ReplicationDocumentWithClusterInformation>());
            if (nodes != null)
            {
                Nodes = nodes;
                var newLeaderNode = GetLeaderNode(Nodes);
                if (newLeaderNode != null)
                {
                    if (Log.IsDebugEnabled)
                        Log.Debug($"Fetched topology from cache, Leader is {LeaderNode}\n Nodes:" + string.Join(",", Nodes.Select(n => n.Url)));

                    SetLeaderNodeToKnownLeader(newLeaderNode);
                    return new CompletedTask();
                }

                if (Log.IsDebugEnabled)
                    Log.Debug($"Fetched topology from cache, no leader found.\n Nodes:" + string.Join(",", Nodes.Select(n => n.Url)));

                SetLeaderNodeToNull();
            }
        }

        return refreshReplicationInformationTask = Task.Factory.StartNew(async () =>
        {
            var tryFailoverServers = false;
            var triedFailoverServers = FailoverServers == null || FailoverServers.Length == 0;
            for (;;)
            {
                // Taking a snapshot so we can tell if the value changed while we fetch the topology.
                var prevLeader = LeaderNode;
                var nodes = NodeUrls.ToHashSet();

                if (tryFailoverServers == false)
                {
                    if (nodes.Count == 0)
                        nodes.Add(primaryNode);
                }
                else
                {
                    nodes.Add(primaryNode); // always check the primary node during a failover check

                    foreach (var failoverServer in FailoverServers)
                    {
                        var node = ConvertReplicationDestinationToOperationMetadata(failoverServer, ClusterInformation.NotInCluster);
                        if (node != null)
                            nodes.Add(node);
                    }

                    triedFailoverServers = true;
                }

                var replicationDocuments = nodes
                    .Select(operationMetadata => new
                    {
                        Node = operationMetadata,
                        Task = getReplicationDestinationsTask(operationMetadata),
                    })
                    .ToArray();

                var tasks = replicationDocuments
                    .Select(x => (Task)x.Task)
                    .ToArray();

                var tasksCompleted = Task.WaitAll(tasks, ReplicationDestinationsTopologyTimeout);
                if (Log.IsDebugEnabled && tasksCompleted == false)
                    Log.Debug($"During topology fetch {tasks.Count(t => t.IsCompleted)} servers have responded out of {tasks.Length}");

                replicationDocuments.ForEach(x =>
                {
                    if (x.Task.IsCompleted && x.Task.Result != null)
                        FailureCounters.ResetFailureCount(x.Node.Url);
                });

                var newestTopologies = replicationDocuments
                    .Where(x => x.Task.IsCompleted && x.Task.Result != null)
                    .OrderByDescending(x => x.Task.Result.Term)
                    .ThenByDescending(x =>
                    {
                        var index = x.Task.Result.ClusterCommitIndex;
                        return x.Task.Result.ClusterInformation.IsLeader ? index + 1 : index;
                    }).ToList();

                var newestTopology = newestTopologies.FirstOrDefault();

                var hasLeaderCount = replicationDocuments
                    .Count(x => x.Task.IsCompleted && x.Task.Result != null && x.Task.Result.HasLeader);

                if (newestTopology == null && FailoverServers != null && FailoverServers.Length > 0 && tryFailoverServers == false)
                    tryFailoverServers = true;

                if (newestTopology == null && triedFailoverServers)
                {
                    if (Log.IsDebugEnabled)
                        Log.Debug($"Fetching topology resulted in no topology, tried failover servers, setting leader node to primary node ({primaryNode}).");

                    // If the leader node is not null it means that somebody updated it; we don't want to overwrite it with the primary.
                    // I'm raising the leader-changed event although we don't have a real leader because some tests don't wait for the leader but actually for any node.
                    // TODO: change back to: if (SetLeaderNodeIfLeaderIsNull(primaryNode, false) == false)
                    if (SetLeaderNodeIfLeaderIsNull(primaryNode) == false)
                        return;

                    if (Nodes.Count == 0)
                        Nodes = new List<OperationMetadata> { primaryNode };

                    return;
                }

                if (Log.IsDebugEnabled)
                {
                    foreach (var x in replicationDocuments)
                    {
                        Log.Debug($"Topology fetched from {x.Node.Url}");
                        Log.Debug($"{JsonConvert.SerializeObject(x.Task?.Result)}");
                    }
                }

                var majorityOfNodesAgreeThereIsLeader = Nodes.Count == 1 || hasLeaderCount > (newestTopology?.Task.Result.Destinations.Count + 1) / 2;
                if (newestTopology != null && majorityOfNodesAgreeThereIsLeader)
                {
                    var replicationDocument = newestTopology.Task.Result;
                    var node = newestTopology.Node;

                    if (newestTopologies.Count > 1 && node.Url.Equals(serverClient.Url) == false)
                    {
                        // We got the replication document from a node other than the primary url.
                        // We need to add the node url as a destination
                        // (we know it exists since we have a majority of nodes that agree on the leader)
                        // and remove the primary url destination from the destinations.
                        var sourceNode = node;

                        var destination = replicationDocument.Destinations
                            .FirstOrDefault(x => DestinationUrl(x.Url, x.Database).Equals(serverClient.Url, StringComparison.OrdinalIgnoreCase));
                        if (destination != null)
                        {
                            replicationDocument.Destinations.Remove(destination);
                            // We need to update the cluster information of the primary url for this node.
                            replicationDocument.ClusterInformation = destination.ClusterInformation;
                            node = ConvertReplicationDestinationToOperationMetadata(destination, destination.ClusterInformation);
                        }

                        destination = destination ?? replicationDocument.Destinations.FirstOrDefault();
                        if (destination != null)
                        {
                            var database = destination.Database;
                            var networkCredentials = sourceNode.Credentials?.Credentials as NetworkCredential;
                            replicationDocument.Destinations.Add(new ReplicationDestination.ReplicationDestinationWithClusterInformation
                            {
                                Url = sourceNode.Url,
                                Database = database,
                                ApiKey = sourceNode.Credentials?.ApiKey,
                                Username = networkCredentials?.UserName,
                                Password = networkCredentials?.Password,
                                Domain = networkCredentials?.Domain,
                                ClusterInformation = sourceNode.ClusterInformation
                            });
                        }
                    }

                    if (UpdateTopology(serverClient, node, replicationDocument, serverHash, prevLeader))
                        return;
                }

                await Task.Delay(3000).ConfigureAwait(false);
            }
        }).ContinueWith(t =>
        {
            lastUpdate = SystemTime.UtcNow;
            refreshReplicationInformationTask = null;
        });
    }
}
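// A small standalone sketch (toy data, hypothetical TopologyReply type) of the selection rule used above:
// replies are ordered by Raft term first, then by cluster commit index, with a reply that comes from the
// leader itself winning ties, and the winner is only trusted when a majority of responders agree that a
// leader exists. The cluster size here is approximated as "destinations plus the responder".
using System.Collections.Generic;
using System.Linq;

class TopologyReply
{
    public string Url;
    public long Term;
    public long ClusterCommitIndex;
    public bool IsLeader;
    public bool HasLeader;
    public int DestinationsCount; // other nodes listed in the reply, excluding the responder
}

static class TopologySelectionSketch
{
    public static TopologyReply PickNewest(IReadOnlyList<TopologyReply> replies)
    {
        var newest = replies
            .OrderByDescending(r => r.Term)
            .ThenByDescending(r => r.IsLeader ? r.ClusterCommitIndex + 1 : r.ClusterCommitIndex)
            .FirstOrDefault();

        if (newest == null)
            return null;

        var hasLeaderCount = replies.Count(r => r.HasLeader);
        var clusterSize = newest.DestinationsCount + 1;
        var majorityAgrees = clusterSize == 1 || hasLeaderCount > clusterSize / 2;

        return majorityAgrees ? newest : null;
    }
}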
private async Task<T> ExecuteWithinClusterInternalAsync<T>(AsyncServerClient serverClient, HttpMethod method, Func<OperationMetadata, IRequestTimeMetric, Task<T>> operation, CancellationToken token, int numberOfRetries = 3, bool withClusterFailoverHeader = false)
{
    token.ThrowIfCancellationRequested();

    bool isFaultedNode = false;
    var node = LeaderNode;
    if (node == null)
    {
        if (Log.IsDebugEnabled)
            Log.Debug($"Fetching topology, {serverClient.Url}: Retries={numberOfRetries} When={DateTime.UtcNow}");

#pragma warning disable 4014
        // We always want to fetch a new topology if we don't know who the leader is.
        UpdateReplicationInformationIfNeededAsync(serverClient, force: true);
#pragma warning restore 4014

        // There is no reason for us to throw "cluster not reachable" for a read operation when we can read from all nodes.
        if (method == HttpMethod.Get &&
            (serverClient.convention.FailoverBehavior == FailoverBehavior.ReadFromAllWriteToLeader ||
             serverClient.convention.FailoverBehavior == FailoverBehavior.ReadFromAllWriteToLeaderWithFailovers))
        {
            var primaryNode = new OperationMetadata(serverClient.Url, serverClient.PrimaryCredentials, null);
            node = GetNodeForReadOperation(primaryNode, out isFaultedNode);
        }
        else
        {
            switch (serverClient.convention.FailoverBehavior)
            {
                case FailoverBehavior.ReadFromAllWriteToLeaderWithFailovers:
                case FailoverBehavior.ReadFromLeaderWriteToLeaderWithFailovers:
                    var waitResult = leaderNodeSelected.Wait(WaitForLeaderTimeout);
                    if (Log.IsDebugEnabled && waitResult == false)
                        Log.Debug($"Failover behavior is {serverClient.convention.FailoverBehavior}, waited for {WaitForLeaderTimeout.TotalSeconds} seconds and no leader was selected.");
                    break;
                default:
                    if (leaderNodeSelected.Wait(WaitForLeaderTimeout) == false)
                    {
                        if (Log.IsDebugEnabled)
                            Log.Debug($"Failover behavior is {serverClient.convention.FailoverBehavior}, waited for {WaitForLeaderTimeout.TotalSeconds} seconds and no leader was selected.");

                        throw new InvalidOperationException($"Cluster is not in a stable state. No leader was selected, but we require one for making a request using {serverClient.convention.FailoverBehavior}.");
                    }
                    break;
            }

            node = LeaderNode;
        }
    }

    switch (serverClient.convention.FailoverBehavior)
    {
        case FailoverBehavior.ReadFromAllWriteToLeader:
            if (method == HttpMethods.Get)
                node = GetNodeForReadOperation(node, out isFaultedNode);
            break;
        case FailoverBehavior.ReadFromAllWriteToLeaderWithFailovers:
            if (node == null)
                return await HandleWithFailovers(operation, token, withClusterFailoverHeader).ConfigureAwait(false);
            if (method == HttpMethods.Get)
                node = GetNodeForReadOperation(node, out isFaultedNode);
            break;
        case FailoverBehavior.ReadFromLeaderWriteToLeaderWithFailovers:
            if (node == null)
                return await HandleWithFailovers(operation, token, withClusterFailoverHeader).ConfigureAwait(false);
            break;
    }

    var operationResult = await TryClusterOperationAsync(node, operation, false, token).ConfigureAwait(false);
    if (operationResult.Success)
        return operationResult.Result;

    if (isFaultedNode) // the node had more than one failure, but we tried it anyway
    {
        if (Log.IsDebugEnabled)
            Log.Debug($"Failed executing operation on node {node.Url}. Connecting to this node has failed already at least once, but we tried again anyway and failed. Got the following result: {operationResult.Result}. (Timeout = {operationResult.WasTimeout})");

        throw operationResult.Error;
    }

    if (Log.IsDebugEnabled)
        Log.Debug($"Failed executing operation on node {node.Url}, number of remaining retries: {numberOfRetries}.");

    SetLeaderNodeToNullIfPrevIsTheSame(node);
    FailureCounters.IncrementFailureCount(node.Url);

    if (serverClient.convention.FailoverBehavior == FailoverBehavior.ReadFromLeaderWriteToLeaderWithFailovers ||
        serverClient.convention.FailoverBehavior == FailoverBehavior.ReadFromAllWriteToLeaderWithFailovers)
        withClusterFailoverHeader = true;

    if (numberOfRetries <= 0)
        throw new InvalidOperationException("Cluster is not reachable. Out of retries, aborting.", operationResult.Error);

    return await ExecuteWithinClusterInternalAsync(serverClient, method, operation, token, numberOfRetries - 1, withClusterFailoverHeader).ConfigureAwait(false);
}
private Task UpdateReplicationInformationForCluster(OperationMetadata primaryNode, Func<OperationMetadata, Task<ReplicationDocumentWithClusterInformation>> getReplicationDestinationsTask)
{
    lock (this)
    {
        var serverHash = ServerHash.GetServerHash(primaryNode.Url);

        var taskCopy = refreshReplicationInformationTask;
        if (taskCopy != null)
            return taskCopy;

        if (firstTime)
        {
            firstTime = false;

            var nodes = ReplicationInformerLocalCache.TryLoadClusterNodesFromLocalCache(serverHash);
            if (nodes != null)
            {
                Nodes = nodes;
                LeaderNode = GetLeaderNode(Nodes);
                if (LeaderNode != null)
                    return new CompletedTask();
            }
        }

        return refreshReplicationInformationTask = Task.Factory.StartNew(() =>
        {
            var tryFailoverServers = false;
            var triedFailoverServers = FailoverServers == null || FailoverServers.Length == 0;
            for (;;)
            {
                var nodes = NodeUrls.ToHashSet();

                if (tryFailoverServers == false)
                {
                    if (nodes.Count == 0)
                        nodes.Add(primaryNode);
                }
                else
                {
                    nodes.Add(primaryNode); // always check primary node during failover check

                    foreach (var failoverServer in FailoverServers)
                    {
                        var node = ConvertReplicationDestinationToOperationMetadata(failoverServer, ClusterInformation.NotInCluster);
                        if (node != null)
                            nodes.Add(node);
                    }

                    triedFailoverServers = true;
                }

                var replicationDocuments = nodes
                    .Select(operationMetadata => new
                    {
                        Node = operationMetadata,
                        Task = getReplicationDestinationsTask(operationMetadata)
                    })
                    .ToArray();

                var tasks = replicationDocuments
                    .Select(x => x.Task)
                    .ToArray();

                Task.WaitAll(tasks);

                replicationDocuments.ForEach(x =>
                {
                    if (x.Task.Result == null)
                        return;

                    FailureCounters.ResetFailureCount(x.Node.Url);
                });

                var newestTopology = replicationDocuments
                    .Where(x => x.Task.Result != null)
                    .OrderByDescending(x => x.Task.Result.Term)
                    .ThenByDescending(x =>
                    {
                        var index = x.Task.Result.ClusterCommitIndex;
                        return x.Task.Result.ClusterInformation.IsLeader ? index + 1 : index;
                    })
                    .FirstOrDefault();

                if (newestTopology == null && FailoverServers != null && FailoverServers.Length > 0 && tryFailoverServers == false)
                    tryFailoverServers = true;

                if (newestTopology == null && triedFailoverServers)
                {
                    LeaderNode = primaryNode;
                    Nodes = new List<OperationMetadata> { primaryNode };
                    return;
                }

                if (newestTopology != null)
                {
                    Nodes = GetNodes(newestTopology.Node, newestTopology.Task.Result);
                    LeaderNode = newestTopology.Task.Result.ClusterInformation.IsLeader
                        ? Nodes.FirstOrDefault(n => n.Url == newestTopology.Node.Url)
                        : null;

                    ReplicationInformerLocalCache.TrySavingClusterNodesToLocalCache(serverHash, Nodes);

                    if (LeaderNode != null)
                        return;
                }

                Thread.Sleep(500);
            }
        }).ContinueWith(t =>
        {
            lastUpdate = SystemTime.UtcNow;
            refreshReplicationInformationTask = null;
        });
    }
}
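// A condensed sketch of the refresh pattern shared by the UpdateReplicationInformationForCluster variants
// above (hypothetical names, no RavenDB types): return the in-flight refresh task if one exists, serve the
// first call from a local cache when possible, and otherwise start a single background task that keeps
// polling until a leader is installed, clearing the shared task reference when it completes.
using System;
using System.Threading.Tasks;

class TopologyRefresherSketch
{
    private readonly object syncRoot = new object();
    private Task refreshTask;
    private bool firstTime = true;

    public Task UpdateAsync(Func<bool> tryLoadFromCache, Func<Task<bool>> fetchOnceAndApply)
    {
        lock (syncRoot)
        {
            if (refreshTask != null)
                return refreshTask; // a refresh is already running; piggyback on it

            if (firstTime)
            {
                firstTime = false;
                if (tryLoadFromCache())
                    return Task.CompletedTask; // the cached topology already named a leader
            }

            refreshTask = Task.Run(async () =>
            {
                // Keep polling until some fetch succeeds in installing a leader.
                while (await fetchOnceAndApply().ConfigureAwait(false) == false)
                    await Task.Delay(500).ConfigureAwait(false);
            }).ContinueWith(_ =>
            {
                lock (syncRoot)
                {
                    refreshTask = null; // allow the next caller to start a fresh refresh
                }
            });

            return refreshTask;
        }
    }
}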