private async Task<AsyncOperationResult<T>> TryClusterOperationAsync<T>(OperationMetadata node, Func<OperationMetadata, Task<T>> operation, bool avoidThrowing, CancellationToken token)
{
    Debug.Assert(node != null);

    token.ThrowIfCancellationRequested();
    var shouldRetry = false;

    var operationResult = new AsyncOperationResult<T>();
    try
    {
        operationResult.Result = await operation(node).ConfigureAwait(false);
        operationResult.Success = true;
    }
    catch (Exception e)
    {
        bool wasTimeout;
        if (HttpConnectionHelper.IsServerDown(e, out wasTimeout))
        {
            shouldRetry = true;
            operationResult.WasTimeout = wasTimeout;
        }
        else
        {
            var ae = e as AggregateException;
            ErrorResponseException errorResponseException;
            if (ae != null)
                errorResponseException = ae.ExtractSingleInnerException() as ErrorResponseException;
            else
                errorResponseException = e as ErrorResponseException;

            if (errorResponseException != null &&
                (errorResponseException.StatusCode == HttpStatusCode.Redirect ||
                 errorResponseException.StatusCode == HttpStatusCode.ExpectationFailed))
                shouldRetry = true;
        }

        if (shouldRetry == false && avoidThrowing == false)
            throw;
    }

    if (operationResult.Success)
        FailureCounters.ResetFailureCount(node.Url);

    return operationResult;
}
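// Hedged usage sketch, not part of the original source: it shows one way a caller might drive
// TryClusterOperationAsync with avoidThrowing = true and fall back to the next candidate node when
// the current one is down or times out. 'ExecuteOnFirstRespondingNodeSketchAsync' and 'candidateNodes'
// are hypothetical names introduced only for illustration.
private async Task<T> ExecuteOnFirstRespondingNodeSketchAsync<T>(IEnumerable<OperationMetadata> candidateNodes, Func<OperationMetadata, Task<T>> operation, CancellationToken token)
{
    foreach (var node in candidateNodes)
    {
        var result = await TryClusterOperationAsync(node, operation, avoidThrowing: true, token).ConfigureAwait(false);
        if (result.Success)
            return result.Result;

        // Success == false here means the node was down, timed out, or returned a retryable status;
        // non-retryable errors would have been rethrown had avoidThrowing been false.
    }

    throw new InvalidOperationException("None of the candidate nodes responded successfully.");
}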
private Task UpdateReplicationInformationForCluster(AsyncServerClient serverClient, OperationMetadata primaryNode, Func<OperationMetadata, Task<ReplicationDocumentWithClusterInformation>> getReplicationDestinationsTask)
{
    lock (this)
    {
        var serverHash = ServerHash.GetServerHash(primaryNode.Url);

        var taskCopy = refreshReplicationInformationTask;
        if (taskCopy != null)
            return taskCopy;

        if (firstTime)
        {
            firstTime = false;

            var document = ReplicationInformerLocalCache.TryLoadReplicationInformationFromLocalCache(serverHash);
            var nodes = GetNodes(primaryNode, document?.DataAsJson.JsonDeserialization<ReplicationDocumentWithClusterInformation>());
            if (nodes != null)
            {
                Nodes = nodes;
                var newLeaderNode = GetLeaderNode(Nodes);
                if (newLeaderNode != null)
                {
                    if (Log.IsDebugEnabled)
                    {
                        Log.Debug($"Fetched topology from cache, Leader is {LeaderNode}\n Nodes:" + string.Join(",", Nodes.Select(n => n.Url)));
                    }
                    SetLeaderNodeToKnownLeader(newLeaderNode);
                    return new CompletedTask();
                }

                if (Log.IsDebugEnabled)
                {
                    Log.Debug("Fetched topology from cache, no leader found.\n Nodes:" + string.Join(",", Nodes.Select(n => n.Url)));
                }
                SetLeaderNodeToNull();
            }
        }

        return refreshReplicationInformationTask = Task.Factory.StartNew(() =>
        {
            var tryFailoverServers = false;
            var triedFailoverServers = FailoverServers == null || FailoverServers.Length == 0;
            for (;;)
            {
                // taking a snapshot so we can tell if the value changed while we fetch the topology
                var prevLeader = LeaderNode;
                var nodes = NodeUrls.ToHashSet();

                if (tryFailoverServers == false)
                {
                    if (nodes.Count == 0)
                        nodes.Add(primaryNode);
                }
                else
                {
                    nodes.Add(primaryNode); // always check the primary node during a failover check

                    foreach (var failoverServer in FailoverServers)
                    {
                        var node = ConvertReplicationDestinationToOperationMetadata(failoverServer, ClusterInformation.NotInCluster);
                        if (node != null)
                            nodes.Add(node);
                    }

                    triedFailoverServers = true;
                }

                var replicationDocuments = nodes
                    .Select(operationMetadata => new
                    {
                        Node = operationMetadata,
                        Task = getReplicationDestinationsTask(operationMetadata)
                    })
                    .ToArray();

                var tasks = replicationDocuments
                    .Select(x => (Task)x.Task)
                    .ToArray();

                var tasksCompleted = Task.WaitAll(tasks, ReplicationDestinationsTopologyTimeout);
                if (Log.IsDebugEnabled && tasksCompleted == false)
                {
                    Log.Debug($"During topology fetch {tasks.Count(t => t.IsCompleted)} servers responded out of {tasks.Length}");
                }

                replicationDocuments.ForEach(x =>
                {
                    if (x.Task.IsCompleted && x.Task.Result != null)
                        FailureCounters.ResetFailureCount(x.Node.Url);
                });

                var newestTopology = replicationDocuments
                    .Where(x => x.Task.IsCompleted && x.Task.Result != null)
                    .OrderByDescending(x => x.Task.Result.Term)
                    .ThenByDescending(x =>
                    {
                        var index = x.Task.Result.ClusterCommitIndex;
                        return x.Task.Result.ClusterInformation.IsLeader ? index + 1 : index;
                    })
                    .FirstOrDefault();

                if (newestTopology == null && FailoverServers != null && FailoverServers.Length > 0 && tryFailoverServers == false)
                    tryFailoverServers = true;

                if (newestTopology == null && triedFailoverServers)
                {
                    if (Log.IsDebugEnabled)
                    {
                        Log.Debug($"Fetching topology resulted in no topology; tried failover servers, setting leader node to primary node ({primaryNode}).");
                    }

                    // if the leader node is not null, somebody already updated it and we don't want to overwrite it with the primary.
                    // I'm raising the leader-changed event although we don't have a real leader, because some tests wait for any node rather than for the leader.
                    // TODO: change back to: if (SetLeaderNodeIfLeaderIsNull(primaryNode, false) == false)
                    if (SetLeaderNodeIfLeaderIsNull(primaryNode) == false)
                        return;

                    if (Nodes.Count == 0)
                        Nodes = new List<OperationMetadata> { primaryNode };

                    return;
                }

                if (newestTopology != null)
                {
                    var replicationDocument = newestTopology.Task.Result;
                    var node = newestTopology.Node;

                    if (UpdateTopology(serverClient, node, replicationDocument, serverHash, prevLeader))
                        return;
                }

                Thread.Sleep(500);
            }
        }).ContinueWith(t =>
        {
            lastUpdate = SystemTime.UtcNow;
            refreshReplicationInformationTask = null;
        });
    }
}
private async Task<AsyncOperationResult<T>> TryClusterOperationAsync<T>(OperationMetadata node, Func<OperationMetadata, IRequestTimeMetric, Task<T>> operation, bool avoidThrowing, CancellationToken token)
{
    Debug.Assert(node != null);

    token.ThrowIfCancellationRequested();
    var shouldRetry = false;

    var operationResult = new AsyncOperationResult<T>();
    try
    {
        operationResult.Result = await operation(node, null).ConfigureAwait(false);
        operationResult.Success = true;
    }
    catch (Exception e)
    {
        bool wasTimeout;
        if (HttpConnectionHelper.IsServerDown(e, out wasTimeout))
        {
            shouldRetry = true;
            operationResult.WasTimeout = wasTimeout;

            if (Log.IsDebugEnabled)
            {
                Log.Debug($"Operation failed because server {node.Url} is down.");
            }
        }
        else
        {
            var ae = e as AggregateException;
            ErrorResponseException errorResponseException;
            if (ae != null)
            {
                errorResponseException = ae.ExtractSingleInnerException() as ErrorResponseException;
            }
            else
            {
                errorResponseException = e as ErrorResponseException;
            }

            if (errorResponseException != null)
            {
                if (errorResponseException.StatusCode == HttpStatusCode.Redirect)
                {
                    IEnumerable<string> values;
                    // using || here (rather than &&) so a missing header throws instead of dereferencing a null 'values'
                    if (errorResponseException.Response.Headers.TryGetValues("Raven-Leader-Redirect", out values) == false ||
                        values.Contains("true") == false)
                    {
                        throw new InvalidOperationException("Got 302 Redirect, but without Raven-Leader-Redirect: true header, maybe there is a proxy in the middle", e);
                    }

                    var redirectUrl = errorResponseException.Response.Headers.Location.ToString();
                    var newLeaderNode = Nodes.FirstOrDefault(n => n.Url.Equals(redirectUrl)) ?? new OperationMetadata(redirectUrl, node.Credentials, node.ClusterInformation);
                    SetLeaderNodeToKnownLeader(newLeaderNode);

                    if (Log.IsDebugEnabled)
                    {
                        Log.Debug($"Redirecting to {redirectUrl} because {node.Url} responded with 302-redirect.");
                    }

                    return await TryClusterOperationAsync(newLeaderNode, operation, avoidThrowing, token).ConfigureAwait(false);
                }

                if (errorResponseException.StatusCode == HttpStatusCode.ExpectationFailed)
                {
                    if (Log.IsDebugEnabled)
                    {
                        Log.Debug($"Operation failed with status code {HttpStatusCode.ExpectationFailed}, will retry.");
                    }

                    shouldRetry = true;
                }
            }
        }

        if (shouldRetry == false && avoidThrowing == false)
            throw;

        operationResult.Error = e;
    }

    if (operationResult.Success)
        FailureCounters.ResetFailureCount(node.Url);

    return operationResult;
}
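// Isolated sketch, not part of the original source, of the leader-redirect rule used above, written
// against plain System.Net.Http types: a 302 is followed only when the server marks it with a
// Raven-Leader-Redirect: true header and supplies a Location. 'TryGetLeaderRedirectUrl' is a
// hypothetical helper name introduced only for illustration.
private static bool TryGetLeaderRedirectUrl(HttpResponseMessage response, out string redirectUrl)
{
    redirectUrl = null;

    if (response.StatusCode != HttpStatusCode.Redirect)
        return false;

    IEnumerable<string> values;
    if (response.Headers.TryGetValues("Raven-Leader-Redirect", out values) == false || values.Contains("true") == false)
        return false; // a plain 302, probably a proxy in the middle rather than a cluster leader redirect

    redirectUrl = response.Headers.Location?.ToString();
    return redirectUrl != null;
}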
private Task UpdateReplicationInformationForCluster(AsyncServerClient serverClient, OperationMetadata primaryNode, Func<OperationMetadata, Task<ReplicationDocumentWithClusterInformation>> getReplicationDestinationsTask)
{
    lock (this)
    {
        var serverHash = ServerHash.GetServerHash(primaryNode.Url);

        var taskCopy = refreshReplicationInformationTask;
        if (taskCopy != null)
            return taskCopy;

        if (firstTime)
        {
            firstTime = false;

            var document = ReplicationInformerLocalCache.TryLoadReplicationInformationFromLocalCache(serverHash);
            var nodes = GetNodes(primaryNode, document?.DataAsJson.JsonDeserialization<ReplicationDocumentWithClusterInformation>());
            if (nodes != null)
            {
                Nodes = nodes;
                var newLeaderNode = GetLeaderNode(Nodes);
                if (newLeaderNode != null)
                {
                    if (Log.IsDebugEnabled)
                    {
                        Log.Debug($"Fetched topology from cache, Leader is {LeaderNode}\n Nodes:" + string.Join(",", Nodes.Select(n => n.Url)));
                    }
                    SetLeaderNodeToKnownLeader(newLeaderNode);
                    return new CompletedTask();
                }

                if (Log.IsDebugEnabled)
                {
                    Log.Debug("Fetched topology from cache, no leader found.\n Nodes:" + string.Join(",", Nodes.Select(n => n.Url)));
                }
                SetLeaderNodeToNull();
            }
        }

        // Unwrap() makes the continuation that clears refreshReplicationInformationTask wait for the inner
        // async lambda to finish; without it, StartNew returns Task<Task> and the continuation would run as
        // soon as the lambda hit its first await.
        return refreshReplicationInformationTask = Task.Factory.StartNew(async () =>
        {
            var tryFailoverServers = false;
            var triedFailoverServers = FailoverServers == null || FailoverServers.Length == 0;
            for (;;)
            {
                // taking a snapshot so we can tell if the value changed while we fetch the topology
                var prevLeader = LeaderNode;
                var nodes = NodeUrls.ToHashSet();

                if (tryFailoverServers == false)
                {
                    if (nodes.Count == 0)
                        nodes.Add(primaryNode);
                }
                else
                {
                    nodes.Add(primaryNode); // always check the primary node during a failover check

                    foreach (var failoverServer in FailoverServers)
                    {
                        var node = ConvertReplicationDestinationToOperationMetadata(failoverServer, ClusterInformation.NotInCluster);
                        if (node != null)
                            nodes.Add(node);
                    }

                    triedFailoverServers = true;
                }

                var replicationDocuments = nodes
                    .Select(operationMetadata => new
                    {
                        Node = operationMetadata,
                        Task = getReplicationDestinationsTask(operationMetadata)
                    })
                    .ToArray();

                var tasks = replicationDocuments
                    .Select(x => (Task)x.Task)
                    .ToArray();

                var tasksCompleted = Task.WaitAll(tasks, ReplicationDestinationsTopologyTimeout);
                if (Log.IsDebugEnabled && tasksCompleted == false)
                {
                    Log.Debug($"During topology fetch {tasks.Count(t => t.IsCompleted)} servers responded out of {tasks.Length}");
                }

                replicationDocuments.ForEach(x =>
                {
                    if (x.Task.IsCompleted && x.Task.Result != null)
                        FailureCounters.ResetFailureCount(x.Node.Url);
                });

                var newestTopologies = replicationDocuments
                    .Where(x => x.Task.IsCompleted && x.Task.Result != null)
                    .OrderByDescending(x => x.Task.Result.Term)
                    .ThenByDescending(x =>
                    {
                        var index = x.Task.Result.ClusterCommitIndex;
                        return x.Task.Result.ClusterInformation.IsLeader ? index + 1 : index;
                    })
                    .ToList();

                var newestTopology = newestTopologies.FirstOrDefault();

                var hasLeaderCount = replicationDocuments
                    .Count(x => x.Task.IsCompleted && x.Task.Result != null && x.Task.Result.HasLeader);

                if (newestTopology == null && FailoverServers != null && FailoverServers.Length > 0 && tryFailoverServers == false)
                    tryFailoverServers = true;

                if (newestTopology == null && triedFailoverServers)
                {
                    if (Log.IsDebugEnabled)
                    {
                        Log.Debug($"Fetching topology resulted in no topology; tried failover servers, setting leader node to primary node ({primaryNode}).");
                    }

                    // if the leader node is not null, somebody already updated it and we don't want to overwrite it with the primary.
                    // I'm raising the leader-changed event although we don't have a real leader, because some tests wait for any node rather than for the leader.
                    // TODO: change back to: if (SetLeaderNodeIfLeaderIsNull(primaryNode, false) == false)
                    if (SetLeaderNodeIfLeaderIsNull(primaryNode) == false)
                        return;

                    if (Nodes.Count == 0)
                        Nodes = new List<OperationMetadata> { primaryNode };

                    return;
                }

                if (Log.IsDebugEnabled)
                {
                    foreach (var x in replicationDocuments)
                    {
                        Log.Debug($"Topology fetched from {x.Node.Url}");
                        Log.Debug($"{JsonConvert.SerializeObject(x.Task?.Result)}");
                    }
                }

                var majorityOfNodesAgreeThereIsLeader = Nodes.Count == 1 || hasLeaderCount > (newestTopology?.Task.Result.Destinations.Count + 1) / 2;

                if (newestTopology != null && majorityOfNodesAgreeThereIsLeader)
                {
                    var replicationDocument = newestTopology.Task.Result;
                    var node = newestTopology.Node;

                    if (newestTopologies.Count > 1 && node.Url.Equals(serverClient.Url) == false)
                    {
                        // we got the replication document from a node other than the primary url,
                        // so we need to add that node's url to the destinations
                        // (we know it exists since a majority of nodes agree on the leader)
                        // and remove the primary url destination from the destinations
                        var sourceNode = node;
                        var destination = replicationDocument.Destinations
                            .FirstOrDefault(x => DestinationUrl(x.Url, x.Database).Equals(serverClient.Url, StringComparison.OrdinalIgnoreCase));
                        if (destination != null)
                        {
                            replicationDocument.Destinations.Remove(destination);
                            // we need to update the cluster information of the primary url for this node
                            replicationDocument.ClusterInformation = destination.ClusterInformation;
                            node = ConvertReplicationDestinationToOperationMetadata(destination, destination.ClusterInformation);
                        }

                        destination = destination ?? replicationDocument.Destinations.FirstOrDefault();
                        if (destination != null)
                        {
                            var database = destination.Database;
                            var networkCredentials = sourceNode.Credentials?.Credentials as NetworkCredential;

                            replicationDocument.Destinations.Add(new ReplicationDestination.ReplicationDestinationWithClusterInformation
                            {
                                Url = sourceNode.Url,
                                Database = database,
                                ApiKey = sourceNode.Credentials?.ApiKey,
                                Username = networkCredentials?.UserName,
                                Password = networkCredentials?.Password,
                                Domain = networkCredentials?.Domain,
                                ClusterInformation = sourceNode.ClusterInformation
                            });
                        }
                    }

                    if (UpdateTopology(serverClient, node, replicationDocument, serverHash, prevLeader))
                        return;
                }

                await Task.Delay(3000).ConfigureAwait(false);
            }
        }).Unwrap().ContinueWith(t =>
        {
            lastUpdate = SystemTime.UtcNow;
            refreshReplicationInformationTask = null;
        });
    }
}
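// Small sketch, not part of the original source, of the majority rule applied above: a reported leader
// is trusted only when more than half of the cluster (the responding node plus its destinations) agrees
// that a leader exists, unless the known topology contains a single node. 'MajorityAgreesThereIsLeader'
// is a hypothetical helper name; the arithmetic mirrors the hasLeaderCount check in the method above.
private static bool MajorityAgreesThereIsLeader(int knownNodeCount, int hasLeaderCount, int destinationCount)
{
    if (knownNodeCount == 1)
        return true;

    var clusterSize = destinationCount + 1; // destinations do not include the node that answered
    return hasLeaderCount > clusterSize / 2;
}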
private Task UpdateReplicationInformationForCluster(OperationMetadata primaryNode, Func<OperationMetadata, Task<ReplicationDocumentWithClusterInformation>> getReplicationDestinationsTask)
{
    lock (this)
    {
        var serverHash = ServerHash.GetServerHash(primaryNode.Url);

        var taskCopy = refreshReplicationInformationTask;
        if (taskCopy != null)
            return taskCopy;

        if (firstTime)
        {
            firstTime = false;

            var nodes = ReplicationInformerLocalCache.TryLoadClusterNodesFromLocalCache(serverHash);
            if (nodes != null)
            {
                Nodes = nodes;
                LeaderNode = GetLeaderNode(Nodes);
                if (LeaderNode != null)
                    return new CompletedTask();
            }
        }

        return refreshReplicationInformationTask = Task.Factory.StartNew(() =>
        {
            var tryFailoverServers = false;
            var triedFailoverServers = FailoverServers == null || FailoverServers.Length == 0;
            for (;;)
            {
                var nodes = NodeUrls.ToHashSet();

                if (tryFailoverServers == false)
                {
                    if (nodes.Count == 0)
                        nodes.Add(primaryNode);
                }
                else
                {
                    nodes.Add(primaryNode); // always check the primary node during a failover check

                    foreach (var failoverServer in FailoverServers)
                    {
                        var node = ConvertReplicationDestinationToOperationMetadata(failoverServer, ClusterInformation.NotInCluster);
                        if (node != null)
                            nodes.Add(node);
                    }

                    triedFailoverServers = true;
                }

                var replicationDocuments = nodes
                    .Select(operationMetadata => new
                    {
                        Node = operationMetadata,
                        Task = getReplicationDestinationsTask(operationMetadata)
                    })
                    .ToArray();

                var tasks = replicationDocuments
                    .Select(x => x.Task)
                    .ToArray();

                Task.WaitAll(tasks);

                replicationDocuments.ForEach(x =>
                {
                    if (x.Task.Result == null)
                        return;
                    FailureCounters.ResetFailureCount(x.Node.Url);
                });

                var newestTopology = replicationDocuments
                    .Where(x => x.Task.Result != null)
                    .OrderByDescending(x => x.Task.Result.Term)
                    .ThenByDescending(x =>
                    {
                        var index = x.Task.Result.ClusterCommitIndex;
                        return x.Task.Result.ClusterInformation.IsLeader ? index + 1 : index;
                    })
                    .FirstOrDefault();

                if (newestTopology == null && FailoverServers != null && FailoverServers.Length > 0 && tryFailoverServers == false)
                    tryFailoverServers = true;

                if (newestTopology == null && triedFailoverServers)
                {
                    LeaderNode = primaryNode;
                    Nodes = new List<OperationMetadata> { primaryNode };
                    return;
                }

                if (newestTopology != null)
                {
                    Nodes = GetNodes(newestTopology.Node, newestTopology.Task.Result);
                    LeaderNode = newestTopology.Task.Result.ClusterInformation.IsLeader
                        ? Nodes.FirstOrDefault(n => n.Url == newestTopology.Node.Url)
                        : null;

                    ReplicationInformerLocalCache.TrySavingClusterNodesToLocalCache(serverHash, Nodes);

                    if (LeaderNode != null)
                        return;
                }

                Thread.Sleep(500);
            }
        }).ContinueWith(t =>
        {
            lastUpdate = SystemTime.UtcNow;
            refreshReplicationInformationTask = null;
        });
    }
}
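// Sketch, not part of the original source, isolating the ordering all three variants above use to pick
// the newest topology: the highest Raft term wins, then the highest cluster commit index, with the
// leader's own report given a +1 edge so it wins ties at the same index. 'PickNewestTopology' is a
// hypothetical helper name; the type and property names mirror the code above.
private static ReplicationDocumentWithClusterInformation PickNewestTopology(IEnumerable<ReplicationDocumentWithClusterInformation> responses)
{
    return responses
        .OrderByDescending(r => r.Term)
        .ThenByDescending(r => r.ClusterInformation.IsLeader ? r.ClusterCommitIndex + 1 : r.ClusterCommitIndex)
        .FirstOrDefault();
}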