/// <summary>
/// Runs <paramref name="operation"/> against the entries of <c>NodeUrls</c>, in order, and returns the
/// result of the first attempt that reports success.
/// </summary>
/// <typeparam name="T">Type of the value produced by the operation.</typeparam>
/// <param name="operation">Operation handed to <c>TryClusterOperationAsync</c> for each candidate node.</param>
/// <param name="token">Cancellation token forwarded to every attempt.</param>
/// <param name="withClusterFailoverHeader">
/// Value written to each visited node's <c>ClusterInformation.WithClusterFailoverHeader</c> before the
/// node is checked with <c>ShouldExecuteUsing</c>.
/// </param>
/// <returns>The result of the first successful attempt.</returns>
/// <exception cref="InvalidOperationException">
/// No node produced a successful result. The error reported by the last failed attempt (if any) is
/// attached as the inner exception.
/// </exception>
private async Task<T> HandleWithFailovers<T>(Func<OperationMetadata, IRequestTimeMetric, Task<T>> operation, CancellationToken token, bool withClusterFailoverHeader)
{
    var nodes = NodeUrls;

    // Remembered so the final exception can explain why the cluster was considered unreachable.
    Exception lastError = null;

    for (var i = 0; i < nodes.Count; i++)
    {
        var n = nodes[i];

        // Original note: "Have to be here more thread safe" - the flag is assigned on every visited
        // node, including nodes that are skipped by the check below.
        n.ClusterInformation.WithClusterFailoverHeader = withClusterFailoverHeader;

        if (ShouldExecuteUsing(n) == false)
        {
            continue;
        }

        // True while at least one more entry follows this one in the list.
        var hasMoreNodes = nodes.Count > i + 1;

        var result = await TryClusterOperationAsync(n, operation, hasMoreNodes, token).ConfigureAwait(false);
        if (result.Success)
        {
            return result.Result;
        }

        if (result.Error != null)
        {
            lastError = result.Error;
        }

        if (Log.IsDebugEnabled)
        {
            Log.Debug($"Tried executing operation on failover server {n.Url} with no success.");
        }

        FailureCounters.IncrementFailureCount(n.Url);
    }

    throw new InvalidOperationException("Cluster is not reachable. Executing operation on any of the nodes failed, aborting.", lastError);
}
/// <summary>
/// Executes <paramref name="operation"/> against the current leader (or a node picked for reads),
/// calling itself again with one retry less whenever the attempt fails.
/// </summary>
/// <typeparam name="T">Type of the value produced by the operation.</typeparam>
/// <param name="serverClient">Client whose <c>ClusterBehavior</c> drives node selection.</param>
/// <param name="method">HTTP method of the request; GET requests may be redirected to a read node.</param>
/// <param name="operation">Operation to run against the selected node.</param>
/// <param name="token">Cancellation token, checked on entry and forwarded to the attempt.</param>
/// <param name="numberOfRetries">Retries still available; a negative value aborts with an exception.</param>
/// <returns>The result of the first successful attempt.</returns>
/// <exception cref="InvalidOperationException">
/// Retries are exhausted, or no leader was selected in time for a behavior without failovers.
/// </exception>
private async Task<T> ExecuteWithinClusterInternalAsync<T>(AsyncServerClient serverClient, HttpMethod method, Func<OperationMetadata, Task<T>> operation, CancellationToken token, int numberOfRetries = 2)
{
    token.ThrowIfCancellationRequested();

    if (numberOfRetries < 0)
    {
        throw new InvalidOperationException("Cluster is not reachable. Out of retries, aborting.");
    }

    var target = LeaderNode;
    if (target == null)
    {
        // The call is intentionally not awaited (hence the suppressed CS4014 warning).
#pragma warning disable 4014
        UpdateReplicationInformationIfNeededAsync(serverClient); // maybe start refresh task
#pragma warning restore 4014

        var behaviorWhileWaiting = serverClient.ClusterBehavior;
        var mayFailoverWhileWaiting =
            behaviorWhileWaiting == ClusterBehavior.ReadFromAllWriteToLeaderWithFailovers ||
            behaviorWhileWaiting == ClusterBehavior.ReadFromLeaderWriteToLeaderWithFailovers;

        if (mayFailoverWhileWaiting)
        {
            // With failovers the wait only happens when no nodes are known, and its outcome is ignored.
            if (Nodes.Count == 0)
            {
                leaderNodeSelected.Wait(TimeSpan.FromSeconds(WaitForLeaderTimeoutInSeconds));
            }
        }
        else if (leaderNodeSelected.Wait(TimeSpan.FromSeconds(WaitForLeaderTimeoutInSeconds)) == false)
        {
            throw new InvalidOperationException("Cluster is not reachable. No leader was selected, aborting.");
        }

        target = LeaderNode;
    }

    var behavior = serverClient.ClusterBehavior;

    var failoversAllowed =
        behavior == ClusterBehavior.ReadFromAllWriteToLeaderWithFailovers ||
        behavior == ClusterBehavior.ReadFromLeaderWriteToLeaderWithFailovers;
    if (failoversAllowed && target == null)
    {
        // Still no leader: walk the node list instead.
        return await HandleWithFailovers(operation, token).ConfigureAwait(false);
    }

    var readsFromAllNodes =
        behavior == ClusterBehavior.ReadFromAllWriteToLeader ||
        behavior == ClusterBehavior.ReadFromAllWriteToLeaderWithFailovers;
    if (readsFromAllNodes && method == HttpMethods.Get)
    {
        target = GetNodeForReadOperation(target);
    }

    var attempt = await TryClusterOperationAsync(target, operation, false, token).ConfigureAwait(false);
    if (attempt.Success)
    {
        return attempt.Result;
    }

    // The attempt failed: forget the leader, record the failure and retry with one retry less.
    LeaderNode = null;
    FailureCounters.IncrementFailureCount(target.Url);

    var retriesLeft = numberOfRetries - 1;
    return await ExecuteWithinClusterInternalAsync(serverClient, method, operation, token, retriesLeft).ConfigureAwait(false);
}
/// <summary>
/// Tries <paramref name="operation"/> on each entry of <c>NodeUrls</c>, in order, and returns the
/// result of the first attempt that reports success.
/// </summary>
/// <typeparam name="T">Type of the value produced by the operation.</typeparam>
/// <param name="operation">Operation handed to <c>TryClusterOperationAsync</c> for each candidate node.</param>
/// <param name="token">Cancellation token forwarded to every attempt.</param>
/// <returns>The result of the first successful attempt.</returns>
/// <exception cref="InvalidOperationException">No node produced a successful result.</exception>
private async Task<T> HandleWithFailovers<T>(Func<OperationMetadata, Task<T>> operation, CancellationToken token)
{
    var candidates = NodeUrls;

    for (var index = 0; index < candidates.Count; index++)
    {
        var candidate = candidates[index];

        // Nodes rejected by ShouldExecuteUsing are skipped without counting as a failure.
        if (ShouldExecuteUsing(candidate) == false)
        {
            continue;
        }

        // True while at least one more entry follows this one in the list.
        var othersRemain = candidates.Count > index + 1;

        var attempt = await TryClusterOperationAsync(candidate, operation, othersRemain, token).ConfigureAwait(false);
        if (attempt.Success)
        {
            return attempt.Result;
        }

        FailureCounters.IncrementFailureCount(candidate.Url);
    }

    throw new InvalidOperationException("Cluster is not reachable. Executing operation on any of the nodes failed, aborting.");
}
/// <summary>
/// Executes <paramref name="operation"/> against the current leader (or a node picked for reads),
/// calling itself again when the attempt fails.
/// </summary>
/// <typeparam name="T">Type of the value produced by the operation.</typeparam>
/// <param name="serverClient">Client whose <c>convention.FailoverBehavior</c> drives node selection.</param>
/// <param name="method">HTTP method of the request; GET requests may be redirected to a read node.</param>
/// <param name="operation">Operation to run against the selected node.</param>
/// <param name="token">Cancellation token, checked on entry and forwarded to the attempt.</param>
/// <param name="numberOfRetries">Retries still available after this attempt.</param>
/// <param name="withClusterFailoverHeader">
/// Forwarded to <c>HandleWithFailovers</c> and used as the <c>force</c> argument of the topology refresh;
/// switched to <c>true</c> after a failure when the failover behavior is one of the "WithFailovers" modes.
/// </param>
/// <returns>The result of the first successful attempt.</returns>
/// <exception cref="InvalidOperationException">
/// No leader was selected in time for a behavior that requires one, or the retries are exhausted
/// (the last error is attached as the inner exception).
/// </exception>
private async Task<T> ExecuteWithinClusterInternalAsync<T>(AsyncServerClient serverClient, HttpMethod method, Func<OperationMetadata, IRequestTimeMetric, Task<T>> operation, CancellationToken token, int numberOfRetries = 2, bool withClusterFailoverHeader = false)
{
    token.ThrowIfCancellationRequested();

    var node = LeaderNode;
    if (node == null)
    {
        // The refresh call is intentionally not awaited (hence the suppressed CS4014 warning).
#pragma warning disable 4014
        // If withClusterFailover set to true we will need to force the update and choose another leader.
        UpdateReplicationInformationIfNeededAsync(serverClient, force: withClusterFailoverHeader); // maybe start refresh task
#pragma warning restore 4014

        switch (serverClient.convention.FailoverBehavior)
        {
            case FailoverBehavior.ReadFromAllWriteToLeaderWithFailovers:
            case FailoverBehavior.ReadFromLeaderWriteToLeaderWithFailovers:
                // With failovers a missing leader is tolerated: the timeout is only logged.
                var waitResult = leaderNodeSelected.Wait(WaitForLeaderTimeout);
                if (Log.IsDebugEnabled && waitResult == false)
                {
                    Log.Debug($"Failover behavior is {serverClient.convention.FailoverBehavior}, waited for {WaitForLeaderTimeout.TotalSeconds} seconds and no leader was selected.");
                }
                break;

            default:
                // Without failovers a leader is mandatory.
                if (leaderNodeSelected.Wait(WaitForLeaderTimeout) == false)
                {
                    if (Log.IsDebugEnabled)
                    {
                        Log.Debug($"Failover behavior is {serverClient.convention.FailoverBehavior}, waited for {WaitForLeaderTimeout.TotalSeconds} seconds and no leader was selected.");
                    }

                    throw new InvalidOperationException($"Cluster is not in a stable state. No leader was selected, but we require one for making a request using {serverClient.convention.FailoverBehavior}.");
                }
                break;
        }

        node = LeaderNode;
    }

    switch (serverClient.convention.FailoverBehavior)
    {
        case FailoverBehavior.ReadFromAllWriteToLeader:
            if (method == HttpMethods.Get)
            {
                // Keep the current node when no read node is returned.
                node = GetNodeForReadOperation(node) ?? node;
            }
            break;

        case FailoverBehavior.ReadFromAllWriteToLeaderWithFailovers:
            if (node == null)
            {
                return await HandleWithFailovers(operation, token, withClusterFailoverHeader).ConfigureAwait(false);
            }

            if (method == HttpMethods.Get)
            {
                node = GetNodeForReadOperation(node) ?? node;
            }
            break;

        case FailoverBehavior.ReadFromLeaderWriteToLeaderWithFailovers:
            if (node == null)
            {
                return await HandleWithFailovers(operation, token, withClusterFailoverHeader).ConfigureAwait(false);
            }
            break;
    }

    var operationResult = await TryClusterOperationAsync(node, operation, false, token).ConfigureAwait(false);
    if (operationResult.Success)
    {
        return operationResult.Result;
    }

    if (Log.IsDebugEnabled)
    {
        Log.Debug($"Failed executing operation on node {node.Url} number of remaining retries: {numberOfRetries}.");
    }

    // The value of the leader was changed since we took a snapshot of it and it is not null, so we
    // try to run again without considering this a failure.
    // NOTE(review): this path does not consume a retry, so termination relies on the leader value
    // eventually settling - confirm that is acceptable.
    if (SetLeaderNodeToNullIfPrevIsTheSame(node) == false)
    {
        return await ExecuteWithinClusterInternalAsync(serverClient, method, operation, token, numberOfRetries, withClusterFailoverHeader).ConfigureAwait(false);
    }

    FailureCounters.IncrementFailureCount(node.Url);

    if (serverClient.convention.FailoverBehavior == FailoverBehavior.ReadFromLeaderWriteToLeaderWithFailovers ||
        serverClient.convention.FailoverBehavior == FailoverBehavior.ReadFromAllWriteToLeaderWithFailovers)
    {
        withClusterFailoverHeader = true;
    }

    if (numberOfRetries <= 0)
    {
        throw new InvalidOperationException("Cluster is not reachable. Out of retries, aborting.", operationResult.Error);
    }

    return await ExecuteWithinClusterInternalAsync(serverClient, method, operation, token, numberOfRetries - 1, withClusterFailoverHeader).ConfigureAwait(false);
}
/// <summary>
/// Executes <paramref name="operation"/> against the current leader (or a node picked for reads),
/// calling itself again with one retry less when the attempt fails.
/// </summary>
/// <typeparam name="T">Type of the value produced by the operation.</typeparam>
/// <param name="serverClient">Client whose <c>convention.FailoverBehavior</c> drives node selection.</param>
/// <param name="method">HTTP method of the request; GET requests may be served by a read node.</param>
/// <param name="operation">Operation to run against the selected node.</param>
/// <param name="token">Cancellation token, checked on entry and forwarded to the attempt.</param>
/// <param name="numberOfRetries">Retries still available after this attempt.</param>
/// <param name="withClusterFailoverHeader">
/// Forwarded to <c>HandleWithFailovers</c>; switched to <c>true</c> after a failure when the failover
/// behavior is one of the "WithFailovers" modes.
/// </param>
/// <returns>The result of the first successful attempt.</returns>
/// <exception cref="InvalidOperationException">
/// No leader was selected in time for a behavior that requires one, or the retries are exhausted
/// (the last error is attached as the inner exception).
/// </exception>
/// <remarks>
/// When the node chosen for a read was flagged as faulted by <c>GetNodeForReadOperation</c> and the
/// attempt fails, the error stored in the operation result is rethrown with its original stack trace.
/// </remarks>
private async Task<T> ExecuteWithinClusterInternalAsync<T>(AsyncServerClient serverClient, HttpMethod method, Func<OperationMetadata, IRequestTimeMetric, Task<T>> operation, CancellationToken token, int numberOfRetries = 3, bool withClusterFailoverHeader = false)
{
    token.ThrowIfCancellationRequested();

    bool isFaultedNode = false;
    var node = LeaderNode;
    if (node == null)
    {
        if (Log.IsDebugEnabled)
        {
            Log.Debug($"Fetching topology, {serverClient.Url}: Retries={numberOfRetries} When={DateTime.UtcNow}");
        }

        // The refresh call is intentionally not awaited (hence the suppressed CS4014 warning).
#pragma warning disable 4014
        // We always want to fetch a new topology if we don't know who the leader is.
        UpdateReplicationInformationIfNeededAsync(serverClient, force: true);
#pragma warning restore 4014

        // There is no reason for us to throw cluster not reachable for a read operation when we can
        // read from all nodes.
        if (method == HttpMethod.Get &&
            (serverClient.convention.FailoverBehavior == FailoverBehavior.ReadFromAllWriteToLeader ||
             serverClient.convention.FailoverBehavior == FailoverBehavior.ReadFromAllWriteToLeaderWithFailovers))
        {
            var primaryNode = new OperationMetadata(serverClient.Url, serverClient.PrimaryCredentials, null);
            node = GetNodeForReadOperation(primaryNode, out isFaultedNode);
        }
        else
        {
            switch (serverClient.convention.FailoverBehavior)
            {
                case FailoverBehavior.ReadFromAllWriteToLeaderWithFailovers:
                case FailoverBehavior.ReadFromLeaderWriteToLeaderWithFailovers:
                    // With failovers a missing leader is tolerated: the timeout is only logged.
                    var waitResult = leaderNodeSelected.Wait(WaitForLeaderTimeout);
                    if (Log.IsDebugEnabled && waitResult == false)
                    {
                        Log.Debug($"Failover behavior is {serverClient.convention.FailoverBehavior}, waited for {WaitForLeaderTimeout.TotalSeconds} seconds and no leader was selected.");
                    }
                    break;

                default:
                    // Without failovers a leader is mandatory.
                    if (leaderNodeSelected.Wait(WaitForLeaderTimeout) == false)
                    {
                        if (Log.IsDebugEnabled)
                        {
                            Log.Debug($"Failover behavior is {serverClient.convention.FailoverBehavior}, waited for {WaitForLeaderTimeout.TotalSeconds} seconds and no leader was selected.");
                        }

                        throw new InvalidOperationException($"Cluster is not in a stable state. No leader was selected, but we require one for making a request using {serverClient.convention.FailoverBehavior}.");
                    }
                    break;
            }

            node = LeaderNode;
        }
    }

    switch (serverClient.convention.FailoverBehavior)
    {
        case FailoverBehavior.ReadFromAllWriteToLeader:
            if (method == HttpMethods.Get)
            {
                node = GetNodeForReadOperation(node, out isFaultedNode);
            }
            break;

        case FailoverBehavior.ReadFromAllWriteToLeaderWithFailovers:
            if (node == null)
            {
                return await HandleWithFailovers(operation, token, withClusterFailoverHeader).ConfigureAwait(false);
            }

            if (method == HttpMethods.Get)
            {
                node = GetNodeForReadOperation(node, out isFaultedNode);
            }
            break;

        case FailoverBehavior.ReadFromLeaderWriteToLeaderWithFailovers:
            if (node == null)
            {
                return await HandleWithFailovers(operation, token, withClusterFailoverHeader).ConfigureAwait(false);
            }
            break;
    }

    var operationResult = await TryClusterOperationAsync(node, operation, false, token).ConfigureAwait(false);
    if (operationResult.Success)
    {
        return operationResult.Result;
    }

    if (isFaultedNode) // the node had more than one failure, but we tried it anyway.
    {
        if (Log.IsDebugEnabled)
        {
            Log.Debug($"Failed executing operation on node {node.Url}. Connecting to this node has failed already at least once, but we tried again anyway and failed. Got the following result: {operationResult.Result}. (Timeout = {operationResult.WasTimeout})");
        }

        var failure = operationResult.Error;
        if (failure == null)
        {
            // Presumably a failed result always carries an error (TODO confirm); guard anyway so the
            // caller never sees a bare NullReferenceException from throwing null.
            throw new InvalidOperationException($"Executing operation on node {node.Url} failed, but no error was reported.");
        }

        // Rethrow the stored exception without overwriting the stack trace it was captured with.
        // Throw() never returns.
        System.Runtime.ExceptionServices.ExceptionDispatchInfo.Capture(failure).Throw();
    }

    if (Log.IsDebugEnabled)
    {
        Log.Debug($"Failed executing operation on node {node.Url} number of remaining retries: {numberOfRetries}.");
    }

    SetLeaderNodeToNullIfPrevIsTheSame(node);
    FailureCounters.IncrementFailureCount(node.Url);

    if (serverClient.convention.FailoverBehavior == FailoverBehavior.ReadFromLeaderWriteToLeaderWithFailovers ||
        serverClient.convention.FailoverBehavior == FailoverBehavior.ReadFromAllWriteToLeaderWithFailovers)
    {
        withClusterFailoverHeader = true;
    }

    if (numberOfRetries <= 0)
    {
        throw new InvalidOperationException("Cluster is not reachable. Out of retries, aborting.", operationResult.Error);
    }

    return await ExecuteWithinClusterInternalAsync(serverClient, method, operation, token, numberOfRetries - 1, withClusterFailoverHeader).ConfigureAwait(false);
}