Example #1
        private async Task <T> HandleWithFailovers <T>(Func <OperationMetadata, IRequestTimeMetric, Task <T> > operation, CancellationToken token, bool withClusterFailoverHeader)
        {
            var nodes = NodeUrls;

            for (var i = 0; i < nodes.Count; i++)
            {
                var n = nodes[i];

                // Has to be set here, right before the check, to be more thread safe
                n.ClusterInformation.WithClusterFailoverHeader = withClusterFailoverHeader;
                if (ShouldExecuteUsing(n) == false)
                {
                    continue;
                }

                var hasMoreNodes = nodes.Count > i + 1;
                var result       = await TryClusterOperationAsync(n, operation, hasMoreNodes, token).ConfigureAwait(false);

                if (result.Success)
                {
                    return(result.Result);
                }
                if (Log.IsDebugEnabled)
                {
                    Log.Debug($"Tried executing operation on failover server {n.Url} with no success.");
                }
                FailureCounters.IncrementFailureCount(n.Url);
            }

            throw new InvalidOperationException("Cluster is not reachable. Executing operation on any of the nodes failed, aborting.");
        }
        private bool ShouldExecuteUsing(OperationMetadata operationMetadata)
        {
            var failureCounter = FailureCounters.GetHolder(operationMetadata.Url);
            if (failureCounter.Value <= 1) // can fail once
                return true;

            return false;
        }
        private async Task<T> ExecuteWithinClusterInternalAsync<T>(AsyncServerClient serverClient, HttpMethod method, Func<OperationMetadata, Task<T>> operation, CancellationToken token, int numberOfRetries = 2)
        {
            token.ThrowIfCancellationRequested();

            if (numberOfRetries < 0)
                throw new InvalidOperationException("Cluster is not reachable. Out of retries, aborting.");

            var node = LeaderNode;
            if (node == null)
            {
#pragma warning disable 4014
                UpdateReplicationInformationIfNeededAsync(serverClient); // maybe start refresh task
#pragma warning restore 4014

                switch (serverClient.ClusterBehavior)
                {
                    case ClusterBehavior.ReadFromAllWriteToLeaderWithFailovers:
                    case ClusterBehavior.ReadFromLeaderWriteToLeaderWithFailovers:
                        if (Nodes.Count == 0)
                            leaderNodeSelected.Wait(TimeSpan.FromSeconds(WaitForLeaderTimeoutInSeconds));
                        break;
                    default:
                        if (leaderNodeSelected.Wait(TimeSpan.FromSeconds(WaitForLeaderTimeoutInSeconds)) == false)
                            throw new InvalidOperationException("Cluster is not reachable. No leader was selected, aborting.");
                        break;
                }

                node = LeaderNode;
            }

            switch (serverClient.ClusterBehavior)
            {
                case ClusterBehavior.ReadFromAllWriteToLeader:
                    if (method == HttpMethods.Get)
                        node = GetNodeForReadOperation(node);
                    break;
                case ClusterBehavior.ReadFromAllWriteToLeaderWithFailovers:
                    if (node == null)
                        return await HandleWithFailovers(operation, token).ConfigureAwait(false);

                    if (method == HttpMethods.Get)
                        node = GetNodeForReadOperation(node);
                    break;
                case ClusterBehavior.ReadFromLeaderWriteToLeaderWithFailovers:
                    if (node == null)
                        return await HandleWithFailovers(operation, token).ConfigureAwait(false);
                    break;
            }

            var operationResult = await TryClusterOperationAsync(node, operation, false, token).ConfigureAwait(false);
            if (operationResult.Success)
                return operationResult.Result;

            LeaderNode = null;
            FailureCounters.IncrementFailureCount(node.Url);
            return await ExecuteWithinClusterInternalAsync(serverClient, method, operation, token, numberOfRetries - 1).ConfigureAwait(false);
        }
        private async Task<AsyncOperationResult<T>> TryClusterOperationAsync<T>(OperationMetadata node, Func<OperationMetadata, Task<T>> operation, bool avoidThrowing, CancellationToken token)
        {
            Debug.Assert(node != null);

            token.ThrowIfCancellationRequested();
            var shouldRetry = false;

            var operationResult = new AsyncOperationResult<T>();
            try
            {
                operationResult.Result = await operation(node).ConfigureAwait(false);
                operationResult.Success = true;
            }
            catch (Exception e)
            {
                bool wasTimeout;
                if (HttpConnectionHelper.IsServerDown(e, out wasTimeout))
                {
                    shouldRetry = true;
                    operationResult.WasTimeout = wasTimeout;
                }
                else
                {
                    var ae = e as AggregateException;
                    ErrorResponseException errorResponseException;
                    if (ae != null)
                        errorResponseException = ae.ExtractSingleInnerException() as ErrorResponseException;
                    else
                        errorResponseException = e as ErrorResponseException;

                    if (errorResponseException != null && (errorResponseException.StatusCode == HttpStatusCode.Redirect || errorResponseException.StatusCode == HttpStatusCode.ExpectationFailed))
                        shouldRetry = true;
                }

                if (shouldRetry == false && avoidThrowing == false)
                    throw;
            }

            if (operationResult.Success)
                FailureCounters.ResetFailureCount(node.Url);

            return operationResult;
        }
        private async Task<T> HandleWithFailovers<T>(Func<OperationMetadata, Task<T>> operation, CancellationToken token)
        {
            var nodes = NodeUrls;
            for (var i = 0; i < nodes.Count; i++)
            {
                var n = nodes[i];
                if (ShouldExecuteUsing(n) == false)
                    continue;

                var hasMoreNodes = nodes.Count > i + 1;
                var result = await TryClusterOperationAsync(n, operation, hasMoreNodes, token).ConfigureAwait(false);
                if (result.Success)
                    return result.Result;

                FailureCounters.IncrementFailureCount(n.Url);
            }

            throw new InvalidOperationException("Cluster is not reachable. Executing operation on any of the nodes failed, aborting.");
        }
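
The loop above captures the failover pattern these examples share: walk the known node URLs in order, skip nodes whose failure counter says they have already failed more than once, run the operation against the first healthy candidate, reset the counter on success, and bump it on failure before moving on. A minimal, self-contained sketch of that pattern follows; the plain HttpClient, the GET request, and the dictionary of failure counts are illustrative stand-ins, not the RavenDB OperationMetadata and FailureCounters types.

using System;
using System.Collections.Generic;
using System.Net.Http;
using System.Threading;
using System.Threading.Tasks;

// Minimal sketch of the failover loop above. Not thread safe; for illustration only.
public static class FailoverLoopSketch
{
    private static readonly Dictionary<string, int> FailureCounts = new Dictionary<string, int>();
    private static readonly HttpClient Client = new HttpClient();

    public static async Task<string> ExecuteWithFailoversAsync(IReadOnlyList<string> nodeUrls, CancellationToken token)
    {
        for (var i = 0; i < nodeUrls.Count; i++)
        {
            token.ThrowIfCancellationRequested();
            var url = nodeUrls[i];

            // Skip nodes that have already failed more than once (mirrors ShouldExecuteUsing).
            int failures;
            FailureCounts.TryGetValue(url, out failures);
            if (failures > 1)
                continue;

            try
            {
                // Any per-node operation goes here; a plain GET stands in for it.
                var response = await Client.GetAsync(url, token).ConfigureAwait(false);
                response.EnsureSuccessStatusCode();

                FailureCounts[url] = 0; // reset the failure count on success
                return await response.Content.ReadAsStringAsync().ConfigureAwait(false);
            }
            catch (HttpRequestException)
            {
                // Connection-level failures mean "try the next node"; other exceptions propagate,
                // much like the real code rethrows non-retryable errors.
                FailureCounts[url] = failures + 1;
            }
        }

        throw new InvalidOperationException("Cluster is not reachable. Executing operation on any of the nodes failed, aborting.");
    }
}

The real client additionally passes a hasMoreNodes flag into TryClusterOperationAsync so only the last candidate is allowed to throw; the sketch simply throws once the loop is exhausted.
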
Example #6
        private Task UpdateReplicationInformationForCluster(AsyncServerClient serverClient, OperationMetadata primaryNode, Func <OperationMetadata, Task <ReplicationDocumentWithClusterInformation> > getReplicationDestinationsTask)
        {
            lock (this)
            {
                var serverHash = ServerHash.GetServerHash(primaryNode.Url);

                var taskCopy = refreshReplicationInformationTask;
                if (taskCopy != null)
                {
                    return(taskCopy);
                }

                if (firstTime)
                {
                    firstTime = false;
                    var document = ReplicationInformerLocalCache.TryLoadReplicationInformationFromLocalCache(serverHash);
                    var nodes    = GetNodes(primaryNode, document?.DataAsJson.JsonDeserialization <ReplicationDocumentWithClusterInformation>());
                    if (nodes != null)
                    {
                        Nodes = nodes;
                        var newLeaderNode = GetLeaderNode(Nodes);
                        if (newLeaderNode != null)
                        {
                            if (Log.IsDebugEnabled)
                            {
                                Log.Debug($"Fetched topology from cache, Leader is {LeaderNode}\n Nodes:" + string.Join(",", Nodes.Select(n => n.Url)));
                            }
                            SetLeaderNodeToKnownLeader(newLeaderNode);
                            return(new CompletedTask());
                        }
                        if (Log.IsDebugEnabled)
                        {
                            Log.Debug($"Fetched topology from cache, no leader found.\n Nodes:" + string.Join(",", Nodes.Select(n => n.Url)));
                        }
                        SetLeaderNodeToNull();
                    }
                }

                return(refreshReplicationInformationTask = Task.Factory.StartNew(() =>
                {
                    var tryFailoverServers = false;
                    var triedFailoverServers = FailoverServers == null || FailoverServers.Length == 0;
                    for (;;)
                    {
                        //taking a snapshot so we could tell if the value changed while we fetch the topology
                        var prevLeader = LeaderNode;
                        var nodes = NodeUrls.ToHashSet();

                        if (tryFailoverServers == false)
                        {
                            if (nodes.Count == 0)
                            {
                                nodes.Add(primaryNode);
                            }
                        }
                        else
                        {
                            nodes.Add(primaryNode); // always check primary node during failover check

                            foreach (var failoverServer in FailoverServers)
                            {
                                var node = ConvertReplicationDestinationToOperationMetadata(failoverServer, ClusterInformation.NotInCluster);
                                if (node != null)
                                {
                                    nodes.Add(node);
                                }
                            }

                            triedFailoverServers = true;
                        }

                        var replicationDocuments = nodes
                                                   .Select(operationMetadata => new
                        {
                            Node = operationMetadata,
                            Task = getReplicationDestinationsTask(operationMetadata)
                        })
                                                   .ToArray();

                        var tasks = replicationDocuments
                                    .Select(x => (Task)x.Task)
                                    .ToArray();

                        var tasksCompleted = Task.WaitAll(tasks, ReplicationDestinationsTopologyTimeout);
                        if (Log.IsDebugEnabled && tasksCompleted == false)
                        {
                            Log.Debug($"During fetch topology {tasks.Count(t=>t.IsCompleted)} servers have responded out of {tasks.Length}");
                        }
                        replicationDocuments.ForEach(x =>
                        {
                            if (x.Task.IsCompleted && x.Task.Result != null)
                            {
                                FailureCounters.ResetFailureCount(x.Node.Url);
                            }
                        });

                        var newestTopology = replicationDocuments
                                             .Where(x => x.Task.IsCompleted && x.Task.Result != null)
                                             .OrderByDescending(x => x.Task.Result.Term)
                                             .ThenByDescending(x =>
                        {
                            var index = x.Task.Result.ClusterCommitIndex;
                            return x.Task.Result.ClusterInformation.IsLeader ? index + 1 : index;
                        })
                                             .FirstOrDefault();


                        if (newestTopology == null && FailoverServers != null && FailoverServers.Length > 0 && tryFailoverServers == false)
                        {
                            tryFailoverServers = true;
                        }

                        if (newestTopology == null && triedFailoverServers)
                        {
                            if (Log.IsDebugEnabled)
                            {
                                Log.Debug($"Fetching topology resulted with no topology, tried failoever servers, setting leader node to primary node ({primaryNode}).");
                            }
                            // If the leader node is not null, somebody already updated it and we don't want to overwrite it with the primary.
                            // The leader-changed event is raised even though we don't have a real leader, because some tests wait for any node rather than for a leader.
                            //Todo: change back to: if (SetLeaderNodeIfLeaderIsNull(primaryNode, false) == false)
                            if (SetLeaderNodeIfLeaderIsNull(primaryNode) == false)
                            {
                                return;
                            }

                            if (Nodes.Count == 0)
                            {
                                Nodes = new List <OperationMetadata>
                                {
                                    primaryNode
                                };
                            }
                            return;
                        }

                        if (newestTopology != null)
                        {
                            var replicationDocument = newestTopology.Task.Result;
                            var node = newestTopology.Node;
                            if (UpdateTopology(serverClient, node, replicationDocument, serverHash, prevLeader))
                            {
                                return;
                            }
                        }

                        Thread.Sleep(500);
                    }
                }).ContinueWith(t =>
                {
                    lastUpdate = SystemTime.UtcNow;
                    refreshReplicationInformationTask = null;
                }));
            }
        }
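
UpdateReplicationInformationForCluster guards the refresh with a lock and an in-flight task: callers that arrive while a refresh is already running just get the existing task back, and a continuation records lastUpdate and clears the field once the refresh finishes. A stripped-down sketch of that single-flight pattern, with illustrative names rather than the real fields, could look like this:

using System;
using System.Threading.Tasks;

// Sketch of the single in-flight refresh used above: concurrent callers share
// one refresh task, and a continuation clears it once it completes.
public class TopologyRefresher
{
    private readonly object syncRoot = new object();
    private Task refreshTask;
    private DateTime lastUpdate = DateTime.MinValue;

    public Task RefreshIfNeeded(Func<Task> fetchTopology)
    {
        lock (syncRoot)
        {
            // A refresh is already running: piggyback on it instead of starting another.
            var inFlight = refreshTask;
            if (inFlight != null)
                return inFlight;

            // Start a new refresh and clear the field when it is done.
            refreshTask = fetchTopology().ContinueWith(t =>
            {
                lock (syncRoot)
                {
                    lastUpdate = DateTime.UtcNow;
                    refreshTask = null;
                }
            });

            return refreshTask;
        }
    }
}
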
Example #7
        private async Task <AsyncOperationResult <T> > TryClusterOperationAsync <T>(OperationMetadata node, Func <OperationMetadata, IRequestTimeMetric, Task <T> > operation, bool avoidThrowing, CancellationToken token)
        {
            Debug.Assert(node != null);

            token.ThrowIfCancellationRequested();
            var shouldRetry = false;

            var operationResult = new AsyncOperationResult <T>();

            try
            {
                operationResult.Result = await operation(node, null).ConfigureAwait(false);

                operationResult.Success = true;
            }
            catch (Exception e)
            {
                bool wasTimeout;
                if (HttpConnectionHelper.IsServerDown(e, out wasTimeout))
                {
                    shouldRetry = true;
                    operationResult.WasTimeout = wasTimeout;
                    if (Log.IsDebugEnabled)
                    {
                        Log.Debug($"Operation failed because server {node.Url} is down.");
                    }
                }
                else
                {
                    var ae = e as AggregateException;
                    ErrorResponseException errorResponseException;
                    if (ae != null)
                    {
                        errorResponseException = ae.ExtractSingleInnerException() as ErrorResponseException;
                    }
                    else
                    {
                        errorResponseException = e as ErrorResponseException;
                    }

                    if (errorResponseException != null)
                    {
                        if (errorResponseException.StatusCode == HttpStatusCode.Redirect)
                        {
                            IEnumerable <string> values;
                            // Throw unless the server explicitly marked this as a leader redirect.
                            if (errorResponseException.Response.Headers.TryGetValues("Raven-Leader-Redirect", out values) == false ||
                                values.Contains("true") == false)
                            {
                                throw new InvalidOperationException("Got 302 Redirect, but without Raven-Leader-Redirect: true header, maybe there is a proxy in the middle", e);
                            }
                            var redirectUrl   = errorResponseException.Response.Headers.Location.ToString();
                            var newLeaderNode = Nodes.FirstOrDefault(n => n.Url.Equals(redirectUrl)) ?? new OperationMetadata(redirectUrl, node.Credentials, node.ClusterInformation);
                            SetLeaderNodeToKnownLeader(newLeaderNode);
                            if (Log.IsDebugEnabled)
                            {
                                Log.Debug($"Redirecting to {redirectUrl} because {node.Url} responded with 302-redirect.");
                            }
                            return(await TryClusterOperationAsync(newLeaderNode, operation, avoidThrowing, token).ConfigureAwait(false));
                        }

                        if (errorResponseException.StatusCode == HttpStatusCode.ExpectationFailed)
                        {
                            if (Log.IsDebugEnabled)
                            {
                                Log.Debug($"Operation failed with status code {HttpStatusCode.ExpectationFailed}, will retry.");
                            }
                            shouldRetry = true;
                        }
                    }
                }

                if (shouldRetry == false && avoidThrowing == false)
                {
                    throw;
                }

                operationResult.Error = e;
            }

            if (operationResult.Success)
            {
                FailureCounters.ResetFailureCount(node.Url);
            }

            return(operationResult);
        }
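
The 302 branch above only follows the redirect when the response carries a Raven-Leader-Redirect: true header, then retries the operation against the node named by the Location header. A self-contained sketch of that check with a plain HttpClient (automatic redirects disabled so the 302 is visible; the GET request and class are illustrative stand-ins, not the client's API) might be:

using System;
using System.Collections.Generic;
using System.Linq;
using System.Net;
using System.Net.Http;
using System.Threading.Tasks;

// Illustrative sketch of following a leader redirect by hand. The header name
// matches the code above; everything else is an assumption for the example.
public static class LeaderRedirectSketch
{
    private static readonly HttpClient Client =
        new HttpClient(new HttpClientHandler { AllowAutoRedirect = false });

    public static async Task<HttpResponseMessage> GetFollowingLeaderAsync(string url)
    {
        var response = await Client.GetAsync(url).ConfigureAwait(false);
        if (response.StatusCode != HttpStatusCode.Redirect)
            return response;

        // Only trust the redirect if the server marked it as a leader redirect.
        IEnumerable<string> values;
        if (response.Headers.TryGetValues("Raven-Leader-Redirect", out values) == false ||
            values.Contains("true") == false)
        {
            throw new InvalidOperationException(
                "Got 302 Redirect, but without Raven-Leader-Redirect: true header, maybe there is a proxy in the middle");
        }

        var leaderUrl = response.Headers.Location.ToString();
        return await Client.GetAsync(leaderUrl).ConfigureAwait(false);
    }
}
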
Example #8
        private async Task <T> ExecuteWithinClusterInternalAsync <T>(AsyncServerClient serverClient, HttpMethod method, Func <OperationMetadata, IRequestTimeMetric, Task <T> > operation, CancellationToken token, int numberOfRetries = 2, bool withClusterFailoverHeader = false)
        {
            token.ThrowIfCancellationRequested();

            var node = LeaderNode;

            if (node == null)
            {
#pragma warning disable 4014
                // If withClusterFailoverHeader is set to true, we need to force the update and choose another leader.
                UpdateReplicationInformationIfNeededAsync(serverClient, force: withClusterFailoverHeader); // maybe start refresh task
#pragma warning restore 4014
                switch (serverClient.convention.FailoverBehavior)
                {
                case FailoverBehavior.ReadFromAllWriteToLeaderWithFailovers:
                case FailoverBehavior.ReadFromLeaderWriteToLeaderWithFailovers:
                    var waitResult = leaderNodeSelected.Wait(WaitForLeaderTimeout);
                    if (Log.IsDebugEnabled && waitResult == false)
                    {
                        Log.Debug($"Failover behavior is {serverClient.convention.FailoverBehavior}, waited for {WaitForLeaderTimeout.TotalSeconds} seconds and no leader was selected.");
                    }
                    break;

                default:
                    if (leaderNodeSelected.Wait(WaitForLeaderTimeout) == false)
                    {
                        if (Log.IsDebugEnabled)
                        {
                            Log.Debug($"Failover behavior is {serverClient.convention.FailoverBehavior}, waited for {WaitForLeaderTimeout.TotalSeconds} seconds and no leader was selected.");
                        }
                        throw new InvalidOperationException($"Cluster is not in a stable state. No leader was selected, but we require one for making a request using {serverClient.convention.FailoverBehavior}.");
                    }
                    break;
                }

                node = LeaderNode;
            }
            switch (serverClient.convention.FailoverBehavior)
            {
            case FailoverBehavior.ReadFromAllWriteToLeader:
                if (method == HttpMethods.Get)
                {
                    node = GetNodeForReadOperation(node) ?? node;
                }
                break;

            case FailoverBehavior.ReadFromAllWriteToLeaderWithFailovers:
                if (node == null)
                {
                    return(await HandleWithFailovers(operation, token, withClusterFailoverHeader).ConfigureAwait(false));
                }

                if (method == HttpMethods.Get)
                {
                    node = GetNodeForReadOperation(node) ?? node;
                }
                break;

            case FailoverBehavior.ReadFromLeaderWriteToLeaderWithFailovers:
                if (node == null)
                {
                    return(await HandleWithFailovers(operation, token, withClusterFailoverHeader).ConfigureAwait(false));
                }
                break;
            }

            var operationResult = await TryClusterOperationAsync(node, operation, false, token).ConfigureAwait(false);

            if (operationResult.Success)
            {
                return(operationResult.Result);
            }
            if (Log.IsDebugEnabled)
            {
                Log.Debug($"Faield executing operation on node {node.Url} number of remaining retries: {numberOfRetries}.");
            }

            // The leader changed since we took a snapshot of it and is not null, so we try again
            // without considering this a failure.
            if (SetLeaderNodeToNullIfPrevIsTheSame(node) == false)
            {
                return(await ExecuteWithinClusterInternalAsync(serverClient, method, operation, token, numberOfRetries, withClusterFailoverHeader).ConfigureAwait(false));
            }
            FailureCounters.IncrementFailureCount(node.Url);

            if (serverClient.convention.FailoverBehavior == FailoverBehavior.ReadFromLeaderWriteToLeaderWithFailovers ||
                serverClient.convention.FailoverBehavior == FailoverBehavior.ReadFromAllWriteToLeaderWithFailovers)
            {
                withClusterFailoverHeader = true;
            }

            if (numberOfRetries <= 0)
            {
                throw new InvalidOperationException("Cluster is not reachable. Out of retries, aborting.", operationResult.Error);
            }

            return(await ExecuteWithinClusterInternalAsync(serverClient, method, operation, token, numberOfRetries - 1, withClusterFailoverHeader).ConfigureAwait(false));
        }
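
Both switch statements above lean on leaderNodeSelected.Wait(WaitForLeaderTimeout): the with-failover behaviors log and fall through to the failover path when the wait times out, while the stricter behaviors throw. A minimal sketch of such a gate, assuming a ManualResetEventSlim as the field name and Wait call suggest (the member names here are assumptions, not the actual RavenDB fields), could be:

using System;
using System.Threading;

// Sketch of a bounded "wait for a leader" gate. OnLeaderChanged is assumed to be
// called by the topology refresh when a leader is chosen or lost.
public class LeaderGate
{
    private readonly ManualResetEventSlim leaderSelected = new ManualResetEventSlim(false);
    private volatile string leaderUrl;

    public void OnLeaderChanged(string url)
    {
        leaderUrl = url;
        if (url != null)
            leaderSelected.Set();   // unblock waiters: a leader is known
        else
            leaderSelected.Reset(); // leader lost: future waiters block again
    }

    public string WaitForLeader(TimeSpan timeout)
    {
        if (leaderSelected.Wait(timeout) == false)
            throw new InvalidOperationException(
                "Cluster is not in a stable state. No leader was selected within the timeout.");

        return leaderUrl;
    }
}
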
Example #9
 public ClusterAwareRequestExecuter()
 {
     Nodes           = new List <OperationMetadata>();
     FailureCounters = new FailureCounters();
 }
        private Task UpdateReplicationInformationForCluster(AsyncServerClient serverClient, OperationMetadata primaryNode, Func <OperationMetadata, Task <ReplicationDocumentWithClusterInformation> > getReplicationDestinationsTask)
        {
            lock (this)
            {
                var serverHash = ServerHash.GetServerHash(primaryNode.Url);

                var taskCopy = refreshReplicationInformationTask;
                if (taskCopy != null)
                {
                    return(taskCopy);
                }

                if (firstTime)
                {
                    firstTime = false;
                    var document = ReplicationInformerLocalCache.TryLoadReplicationInformationFromLocalCache(serverHash);
                    var nodes    = GetNodes(primaryNode, document?.DataAsJson.JsonDeserialization <ReplicationDocumentWithClusterInformation>());
                    if (nodes != null)
                    {
                        Nodes = nodes;
                        var newLeaderNode = GetLeaderNode(Nodes);
                        if (newLeaderNode != null)
                        {
                            if (Log.IsDebugEnabled)
                            {
                                Log.Debug($"Fetched topology from cache, Leader is {LeaderNode}\n Nodes:" + string.Join(",", Nodes.Select(n => n.Url)));
                            }
                            SetLeaderNodeToKnownLeader(newLeaderNode);
                            return(new CompletedTask());
                        }
                        if (Log.IsDebugEnabled)
                        {
                            Log.Debug($"Fetched topology from cache, no leader found.\n Nodes:" + string.Join(",", Nodes.Select(n => n.Url)));
                        }
                        SetLeaderNodeToNull();
                    }
                }

                return(refreshReplicationInformationTask = Task.Factory.StartNew(async() =>
                {
                    var tryFailoverServers = false;
                    var triedFailoverServers = FailoverServers == null || FailoverServers.Length == 0;
                    for (; ;)
                    {
                        //taking a snapshot so we could tell if the value changed while we fetch the topology
                        var prevLeader = LeaderNode;
                        var nodes = NodeUrls.ToHashSet();

                        if (tryFailoverServers == false)
                        {
                            if (nodes.Count == 0)
                            {
                                nodes.Add(primaryNode);
                            }
                        }
                        else
                        {
                            nodes.Add(primaryNode); // always check primary node during failover check

                            foreach (var failoverServer in FailoverServers)
                            {
                                var node = ConvertReplicationDestinationToOperationMetadata(failoverServer, ClusterInformation.NotInCluster);
                                if (node != null)
                                {
                                    nodes.Add(node);
                                }
                            }

                            triedFailoverServers = true;
                        }

                        var replicationDocuments = nodes
                                                   .Select(operationMetadata => new
                        {
                            Node = operationMetadata,
                            Task = getReplicationDestinationsTask(operationMetadata),
                        })
                                                   .ToArray();

                        var tasks = replicationDocuments
                                    .Select(x => (Task)x.Task)
                                    .ToArray();

                        var tasksCompleted = Task.WaitAll(tasks, ReplicationDestinationsTopologyTimeout);
                        if (Log.IsDebugEnabled && tasksCompleted == false)
                        {
                            Log.Debug($"During fetch topology {tasks.Count(t => t.IsCompleted)} servers have responded out of {tasks.Length}");
                        }
                        replicationDocuments.ForEach(x =>
                        {
                            if (x.Task.IsCompleted && x.Task.Result != null)
                            {
                                FailureCounters.ResetFailureCount(x.Node.Url);
                            }
                        });

                        var newestTopologies = replicationDocuments
                                               .Where(x => x.Task.IsCompleted && x.Task.Result != null)
                                               .OrderByDescending(x => x.Task.Result.Term)
                                               .ThenByDescending(x =>
                        {
                            var index = x.Task.Result.ClusterCommitIndex;
                            return x.Task.Result.ClusterInformation.IsLeader ? index + 1 : index;
                        }).ToList();

                        var newestTopology = newestTopologies.FirstOrDefault();

                        var hasLeaderCount = replicationDocuments
                                             .Count(x => x.Task.IsCompleted && x.Task.Result != null && x.Task.Result.HasLeader);

                        if (newestTopology == null && FailoverServers != null && FailoverServers.Length > 0 && tryFailoverServers == false)
                        {
                            tryFailoverServers = true;
                        }

                        if (newestTopology == null && triedFailoverServers)
                        {
                            if (Log.IsDebugEnabled)
                            {
                                Log.Debug($"Fetching topology resulted with no topology, tried failoever servers, setting leader node to primary node ({primaryNode}).");
                            }
                            // If the leader node is not null, somebody already updated it and we don't want to overwrite it with the primary.
                            // The leader-changed event is raised even though we don't have a real leader, because some tests wait for any node rather than for a leader.
                            //Todo: change back to: if (SetLeaderNodeIfLeaderIsNull(primaryNode, false) == false)
                            if (SetLeaderNodeIfLeaderIsNull(primaryNode) == false)
                            {
                                return;
                            }

                            if (Nodes.Count == 0)
                            {
                                Nodes = new List <OperationMetadata>
                                {
                                    primaryNode
                                };
                            }
                            return;
                        }
                        if (Log.IsDebugEnabled)
                        {
                            foreach (var x in replicationDocuments)
                            {
                                Log.Debug($"Topology fetched from {x.Node.Url}");
                                Log.Debug($"{JsonConvert.SerializeObject(x.Task?.Result)}");
                            }
                        }
                        var majorityOfNodesAgreeThereIsLeader = Nodes.Count == 1 || hasLeaderCount > (newestTopology?.Task.Result.Destinations.Count + 1) / 2;
                        if (newestTopology != null && majorityOfNodesAgreeThereIsLeader)
                        {
                            var replicationDocument = newestTopology.Task.Result;
                            var node = newestTopology.Node;

                            if (newestTopologies.Count > 1 && node.Url.Equals(serverClient.Url) == false)
                            {
                                // We got the replication document from a node other than the primary url,
                                // so we need to add the source node url to the destinations
                                // (we know it exists since a majority of nodes agree on the leader)
                                // and remove the primary url destination from the destinations.

                                var sourceNode = node;
                                var destination = replicationDocument.Destinations
                                                  .FirstOrDefault(x => DestinationUrl(x.Url, x.Database).Equals(serverClient.Url, StringComparison.OrdinalIgnoreCase));
                                if (destination != null)
                                {
                                    replicationDocument.Destinations.Remove(destination);
                                    // we need to update the cluster information of the primary url for this node
                                    replicationDocument.ClusterInformation = destination.ClusterInformation;
                                    node = ConvertReplicationDestinationToOperationMetadata(destination, destination.ClusterInformation);
                                }

                                destination = destination ?? replicationDocument.Destinations.FirstOrDefault();
                                if (destination != null)
                                {
                                    var database = destination.Database;
                                    var networkCredentials = sourceNode.Credentials?.Credentials as NetworkCredential;
                                    replicationDocument.Destinations.Add(new ReplicationDestination.ReplicationDestinationWithClusterInformation
                                    {
                                        Url = sourceNode.Url,
                                        Database = database,
                                        ApiKey = sourceNode.Credentials?.ApiKey,
                                        Username = networkCredentials?.UserName,
                                        Password = networkCredentials?.Password,
                                        Domain = networkCredentials?.Domain,
                                        ClusterInformation = sourceNode.ClusterInformation
                                    });
                                }
                            }

                            if (UpdateTopology(serverClient, node, replicationDocument, serverHash, prevLeader))
                            {
                                return;
                            }
                        }
                        await Task.Delay(3000).ConfigureAwait(false);
                    }
                }).ContinueWith(t =>
                {
                    lastUpdate = SystemTime.UtcNow;
                    refreshReplicationInformationTask = null;
                }));
            }
        }
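
The majorityOfNodesAgreeThereIsLeader check above accepts a leader only when more than half of the cluster (the replying node plus its destinations) reports that one exists, with a single-node cluster passing trivially. As a small worked sketch of that arithmetic (an illustrative helper, not part of the client):

// Sketch of the quorum check above: clusterSize counts the replying node plus
// its destinations, so 2 of 3 or 3 of 5 nodes reporting a leader is enough.
// e.g. MajorityAgreesThereIsLeader(3, 2) == true, MajorityAgreesThereIsLeader(4, 2) == false
public static class LeaderQuorumSketch
{
    public static bool MajorityAgreesThereIsLeader(int clusterSize, int nodesReportingLeader)
    {
        return clusterSize == 1 || nodesReportingLeader > clusterSize / 2;
    }
}
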
        private async Task <T> ExecuteWithinClusterInternalAsync <T>(AsyncServerClient serverClient, HttpMethod method, Func <OperationMetadata, IRequestTimeMetric, Task <T> > operation, CancellationToken token, int numberOfRetries = 3, bool withClusterFailoverHeader = false)
        {
            token.ThrowIfCancellationRequested();
            bool isFaultedNode = false;
            var  node          = LeaderNode;

            if (node == null)
            {
                if (Log.IsDebugEnabled)
                {
                    Log.Debug($"Fetching topology, {serverClient.Url}: Retries={numberOfRetries} When={DateTime.UtcNow}");
                }
#pragma warning disable 4014
                //We always want to fetch a new topology if we don't know who the leader is.
                UpdateReplicationInformationIfNeededAsync(serverClient, force: true);
#pragma warning restore 4014
                //there is no reason for us to throw cluster not reachable for a read operation when we can read from all nodes.
                if (method == HttpMethod.Get &&
                    (serverClient.convention.FailoverBehavior == FailoverBehavior.ReadFromAllWriteToLeader ||
                     serverClient.convention.FailoverBehavior == FailoverBehavior.ReadFromAllWriteToLeaderWithFailovers))
                {
                    var primaryNode = new OperationMetadata(serverClient.Url, serverClient.PrimaryCredentials, null);
                    node = GetNodeForReadOperation(primaryNode, out isFaultedNode);
                }
                else
                {
                    switch (serverClient.convention.FailoverBehavior)
                    {
                    case FailoverBehavior.ReadFromAllWriteToLeaderWithFailovers:
                    case FailoverBehavior.ReadFromLeaderWriteToLeaderWithFailovers:
                        var waitResult = leaderNodeSelected.Wait(WaitForLeaderTimeout);
                        if (Log.IsDebugEnabled && waitResult == false)
                        {
                            Log.Debug($"Failover behavior is {serverClient.convention.FailoverBehavior}, waited for {WaitForLeaderTimeout.TotalSeconds} seconds and no leader was selected.");
                        }
                        break;

                    default:
                        if (leaderNodeSelected.Wait(WaitForLeaderTimeout) == false)
                        {
                            if (Log.IsDebugEnabled)
                            {
                                Log.Debug($"Failover behavior is {serverClient.convention.FailoverBehavior}, waited for {WaitForLeaderTimeout.TotalSeconds} seconds and no leader was selected.");
                            }
                            throw new InvalidOperationException($"Cluster is not in a stable state. No leader was selected, but we require one for making a request using {serverClient.convention.FailoverBehavior}.");
                        }
                        break;
                    }

                    node = LeaderNode;
                }
            }

            switch (serverClient.convention.FailoverBehavior)
            {
            case FailoverBehavior.ReadFromAllWriteToLeader:
                if (method == HttpMethods.Get)
                {
                    node = GetNodeForReadOperation(node, out isFaultedNode);
                }
                break;

            case FailoverBehavior.ReadFromAllWriteToLeaderWithFailovers:
                if (node == null)
                {
                    return(await HandleWithFailovers(operation, token, withClusterFailoverHeader).ConfigureAwait(false));
                }

                if (method == HttpMethods.Get)
                {
                    node = GetNodeForReadOperation(node, out isFaultedNode);
                }
                break;

            case FailoverBehavior.ReadFromLeaderWriteToLeaderWithFailovers:
                if (node == null)
                {
                    return(await HandleWithFailovers(operation, token, withClusterFailoverHeader).ConfigureAwait(false));
                }
                break;
            }

            var operationResult = await TryClusterOperationAsync(node, operation, false, token).ConfigureAwait(false);

            if (operationResult.Success)
            {
                return(operationResult.Result);
            }

            if (isFaultedNode) //the node had more than one failure, but we tried it anyway.
            {
                if (Log.IsDebugEnabled)
                {
                    Log.Debug($"Failed executing operation on node {node.Url}. Connecting to this node has failed already at least once, but we tried again anyway and failed. Got the following result: {operationResult.Result}. (Timeout = {operationResult.WasTimeout})");
                }

                throw operationResult.Error;
            }

            if (Log.IsDebugEnabled)
            {
                Log.Debug($"Failed executing operation on node {node.Url} number of remaining retries: {numberOfRetries}.");
            }

            SetLeaderNodeToNullIfPrevIsTheSame(node);
            FailureCounters.IncrementFailureCount(node.Url);

            if (serverClient.convention.FailoverBehavior == FailoverBehavior.ReadFromLeaderWriteToLeaderWithFailovers ||
                serverClient.convention.FailoverBehavior == FailoverBehavior.ReadFromAllWriteToLeaderWithFailovers)
            {
                withClusterFailoverHeader = true;
            }

            if (numberOfRetries <= 0)
            {
                throw new InvalidOperationException("Cluster is not reachable. Out of retries, aborting.", operationResult.Error);
            }

            return(await ExecuteWithinClusterInternalAsync(serverClient, method, operation, token, numberOfRetries - 1, withClusterFailoverHeader).ConfigureAwait(false));
        }
        private Task UpdateReplicationInformationForCluster(OperationMetadata primaryNode, Func<OperationMetadata, Task<ReplicationDocumentWithClusterInformation>> getReplicationDestinationsTask)
        {
            lock (this)
            {
                var serverHash = ServerHash.GetServerHash(primaryNode.Url);

                var taskCopy = refreshReplicationInformationTask;
                if (taskCopy != null)
                    return taskCopy;

                if (firstTime)
                {
                    firstTime = false;

                    var nodes = ReplicationInformerLocalCache.TryLoadClusterNodesFromLocalCache(serverHash);
                    if (nodes != null)
                    {
                        Nodes = nodes;
                        LeaderNode = GetLeaderNode(Nodes);

                        if (LeaderNode != null)
                            return new CompletedTask();
                    }
                }

                return refreshReplicationInformationTask = Task.Factory.StartNew(() =>
                {
                    var tryFailoverServers = false;
                    var triedFailoverServers = FailoverServers == null || FailoverServers.Length == 0;
                    for (; ; )
                    {
                        var nodes = NodeUrls.ToHashSet();

                        if (tryFailoverServers == false)
                        {
                            if (nodes.Count == 0)
                                nodes.Add(primaryNode);
                        }
                        else
                        {
                            nodes.Add(primaryNode); // always check primary node during failover check

                            foreach (var failoverServer in FailoverServers)
                            {
                                var node = ConvertReplicationDestinationToOperationMetadata(failoverServer, ClusterInformation.NotInCluster);
                                if (node != null)
                                    nodes.Add(node);
                            }

                            triedFailoverServers = true;
                        }

                        var replicationDocuments = nodes
                            .Select(operationMetadata => new
                            {
                                Node = operationMetadata,
                                Task = getReplicationDestinationsTask(operationMetadata)
                            })
                            .ToArray();

                        var tasks = replicationDocuments
                            .Select(x => x.Task)
                            .ToArray();

                        Task.WaitAll(tasks);

                        replicationDocuments.ForEach(x =>
                        {
                            if (x.Task.Result == null)
                                return;

                            FailureCounters.ResetFailureCount(x.Node.Url);
                        });

                        var newestTopology = replicationDocuments
                            .Where(x => x.Task.Result != null)
                            .OrderByDescending(x => x.Task.Result.Term)
                            .ThenByDescending(x =>
                             {
                                var index = x.Task.Result.ClusterCommitIndex;
                                return x.Task.Result.ClusterInformation.IsLeader ? index + 1 : index;
                             })
                            .FirstOrDefault();

                        if (newestTopology == null && FailoverServers != null && FailoverServers.Length > 0 && tryFailoverServers == false)
                            tryFailoverServers = true;

                        if (newestTopology == null && triedFailoverServers)
                        {
                            LeaderNode = primaryNode;
                            Nodes = new List<OperationMetadata>
                            {
                                primaryNode
                            };
                            return;
                        }

                        if (newestTopology != null)
                        {
                            Nodes = GetNodes(newestTopology.Node, newestTopology.Task.Result);
                            LeaderNode = newestTopology.Task.Result.ClusterInformation.IsLeader ?
                                Nodes.FirstOrDefault(n => n.Url == newestTopology.Node.Url) : null;

                            ReplicationInformerLocalCache.TrySavingClusterNodesToLocalCache(serverHash, Nodes);

                            if (LeaderNode != null)
                                return;
                        }

                        Thread.Sleep(500);
                    }
                }).ContinueWith(t =>
                {
                    lastUpdate = SystemTime.UtcNow;
                    refreshReplicationInformationTask = null;
                });
            }
        }
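
All three UpdateReplicationInformationForCluster variants pick the newest topology the same way: order the candidate responses by term first, then by cluster commit index, giving a node that claims to be leader a +1 tie-break on the index. A compact sketch of that selection over a plain DTO (TopologyReply is an illustrative stand-in for ReplicationDocumentWithClusterInformation) might be:

using System.Collections.Generic;
using System.Linq;

// Illustrative stand-in for the fields the ordering above actually uses.
public class TopologyReply
{
    public string SourceUrl;
    public long Term;
    public long ClusterCommitIndex;
    public bool IsLeader;
}

public static class TopologySelectionSketch
{
    // Order by term, then by commit index with a +1 bonus for self-reported leaders,
    // and take the first candidate, exactly as the OrderByDescending chain above does.
    public static TopologyReply PickNewest(IEnumerable<TopologyReply> replies)
    {
        return replies
            .Where(r => r != null)
            .OrderByDescending(r => r.Term)
            .ThenByDescending(r => r.IsLeader ? r.ClusterCommitIndex + 1 : r.ClusterCommitIndex)
            .FirstOrDefault();
    }
}

The +1 tie-break keeps a node that believes it is the leader ahead of a follower that reports the same term and commit index.
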