/// <summary>
/// Registers the cluster view listener (member and partition views) on a connection.
/// </summary>
/// <param name="connection">The connection to subscribe on.</param>
/// <param name="correlationId">The correlation identifier used for the subscription request.</param>
/// <param name="cancellationToken">A cancellation token.</param>
/// <returns>A task whose result is <c>true</c> when the subscription request completed, and
/// <c>false</c> when anything failed (the failure is logged and the correlated subscription removed).</returns>
private async Task<bool> SubscribeToClusterEventsAsync(MemberConnection connection, long correlationId, CancellationToken cancellationToken)
{
    // aka subscribe to member/partition view events
    HConsole.TraceLine(this, "subscribe");

    // forwards an event message to the member-view and partition-view codec handlers
    ValueTask OnClusterViewEventAsync(ClientMessage eventMessage, object state)
    {
        return ClientAddClusterViewListenerCodec.HandleEventAsync(
            eventMessage,
            HandleCodecMemberViewEvent,
            HandleCodecPartitionViewEvent,
            connection.Id,
            _clusterState.LoggerFactory);
    }

    try
    {
        var request = ClientAddClusterViewListenerCodec.EncodeRequest();

        // the correlated subscription is registered before the request is sent
        _correlatedSubscriptions[correlationId] = new ClusterSubscription(OnClusterViewEventAsync);

        _ = await _clusterMessaging.SendToMemberAsync(request, connection, correlationId, cancellationToken).CfAwait();

        HConsole.WriteLine(this, "subscribed");
        return true;
    }
    catch (Exception exception)
    {
        HConsole.WriteLine(this, "failed " + exception);

        // undo the registration made above, then report the failure through the return value
        _correlatedSubscriptions.TryRemove(correlationId, out _);
        _logger.LogWarning(exception, "Failed to subscribe to cluster events, may retry.");
        return false;
    }
}
/// <summary>
/// (thread-unsafe) Notifies that a connection has been opened.
/// </summary>
/// <param name="connection">The connection.</param>
/// <returns><c>true</c> if the connection is the first one to be established; otherwise <c>false</c>.</returns>
/// <exception cref="HazelcastException">A connection is already registered for the member identifier
/// of <paramref name="connection"/>; the registered connection is left untouched.</exception>
/// <remarks>
/// <para>This method is not thread-safe; the caller has to lock the
/// <see cref="Mutex"/> object to ensure thread-safety.</para>
/// </remarks>
public bool NotifyConnectionOpened(MemberConnection connection)
{
    var isFirst = _connections.Count == 0;

    // reject a duplicate *before* writing to the table: the former NETSTANDARD2_0 branch assigned
    // through the indexer first, and therefore replaced the existing connection even though it
    // then threw. ContainsKey + indexer behaves the same on every target framework.
    if (_connections.ContainsKey(connection.MemberId))
    {
        throw new HazelcastException("Failed to add a connection (duplicate memberId).");
    }

    _connections[connection.MemberId] = connection;

    if (_clusterId == default)
    {
        _clusterId = connection.ClusterId; // first cluster
    }
    else if (_clusterId != connection.ClusterId)
    {
        // see TcpClientConnectionManager java class handleSuccessfulAuth method
        // does not even consider the cluster identifier when !isFirst
        if (isFirst)
        {
            _clusterId = connection.ClusterId; // new cluster
            _memberTable = new MemberTable();
        }
    }

    return isFirst;
}
/// <summary>
/// Creates a <see cref="MemberSubscription"/> that ties a cluster subscription to one member connection.
/// </summary>
/// <param name="clusterSubscription">The cluster subscription this member subscription belongs to.</param>
/// <param name="serverSubscriptionId">The unique identifier assigned by the server to the client subscription.</param>
/// <param name="correlationId">The correlation identifier.</param>
/// <param name="connection">The connection to the member.</param>
public MemberSubscription(ClusterSubscription clusterSubscription, Guid serverSubscriptionId, long correlationId, MemberConnection connection)
{
    // plain capture of the four constructor arguments, assigned left to right
    (ClusterSubscription, ServerSubscriptionId, CorrelationId, Connection)
        = (clusterSubscription, serverSubscriptionId, correlationId, connection);
}
/// <summary>
/// Adds a connection, provided that it is active.
/// </summary>
/// <param name="connection">The connection to add.</param>
public void AddConnection(MemberConnection connection)
{
    lock (_mutex)
    {
        // the activity check is made while holding the mutex; inactive connections are ignored
        if (!connection.Active)
        {
            return;
        }

        _connections.Add(connection);
    }
}
/// <summary>
/// Notifies the member service of a terminated connection.
/// </summary>
/// <param name="connection">The connection.</param>
/// <returns><c>true</c> if the connection was registered, has been removed, and was the last one;
/// otherwise <c>false</c>.</returns>
/// <remarks>
/// <para>This method should be invoked within the global cluster lock.</para>
/// </remarks>
public bool NotifyTerminatedConnection(MemberConnection connection)
{
    // a connection that was never registered cannot have been "the last one": only report
    // true when something was actually removed and nothing remains afterwards
    var removed = _connections.TryRemove(connection.MemberId, out _);
    return removed && _connections.IsEmpty;
}
// performs one heartbeat pass on a connection to a member:
// terminates the connection on read timeout, pings it when nothing was written for a period
private async Task RunAsync(MemberConnection connection, DateTime now, CancellationToken cancellationToken)
{
    // must ensure that timeout > interval ?!

    var sinceLastRead = now - connection.LastReadTime;
    var sinceLastWrite = now - connection.LastWriteTime;

    HConsole.WriteLine(this, $"Heartbeat on {connection.Id.ToShortString()}, written {(long)(now - connection.LastWriteTime).TotalMilliseconds}ms ago, read {(long)(now - connection.LastReadTime).TotalMilliseconds}ms ago");

    // terminates the connection, if it still is active
    async Task TerminateIfActiveAsync()
    {
        if (connection.Active)
        {
            await connection.TerminateAsync().CfAwait(); // does not throw;
        }
    }

    // we wrote recently (within a period) yet have read nothing for longer than the
    // timeout: the other end is considered not responsive
    var readTimedOut = sinceLastRead > _timeout && sinceLastWrite < _period;
    if (readTimedOut)
    {
        _logger.LogWarning("Heartbeat timeout for connection {ConnectionId}.", connection.Id);
        await TerminateIfActiveAsync().CfAwait();
        return;
    }

    // nothing to do unless the last write is older than a period
    if (!(sinceLastWrite > _period))
    {
        return;
    }

    // write a ping, which should trigger a read when the response arrives
    _logger.LogDebug("Ping client {ClientId}", connection.Id);
    var ping = ClientPingCodec.EncodeRequest();

    try
    {
        // ping should complete within the default invocation timeout
        var pong = await _clusterMessaging
            .SendToMemberAsync(ping, connection, cancellationToken)
            .CfAwait();

        // just to be sure everything is ok
        _ = ClientPingCodec.DecodeResponse(pong);
    }
    catch (TaskTimeoutException)
    {
        _logger.LogWarning("Heartbeat ping timeout for connection {ConnectionId}.", connection.Id);
        await TerminateIfActiveAsync().CfAwait();
    }
    catch (Exception exception)
    {
        // unexpected
        _logger.LogWarning(exception, "Heartbeat has thrown an exception.");
    }
}
/// <summary>
/// Registers a connection, and moves the client to the connected state when it is the
/// first connection to a known member.
/// </summary>
/// <param name="connection">The connection.</param>
/// <param name="isNewCluster">Whether the connection is the first connection to a new cluster.</param>
public void AddConnection(MemberConnection connection, bool isNewCluster)
{
    // every connection is accepted, whether or not a corresponding member is known yet,
    // because the first connection comes before the first members view.

    lock (_mutex)
    {
        // a connection that is not active is not added - an active one may still turn
        // not-active anytime, but thanks to _mutex that will only happen after it has been added
        if (!connection.Active)
        {
            return;
        }

        if (_connections.ContainsKey(connection.MemberId))
        {
            // duplicate (internal error?): refuse it and hand it over for termination
            _logger.LogWarning($"Cannot accept connection {connection.Id.ToShortString()} to member {connection.MemberId.ToShortString()}, a connection to that member already exists.");
            _terminateConnections.Add(connection); // kill.kill.kill
            return;
        }

        _connections[connection.MemberId] = connection;

        if (isNewCluster)
        {
            // reset members
            // this is safe because... isNewCluster means that this is the very first connection and there are
            // no other connections yet and therefore we should not receive events and therefore no one
            // should invoke SetMembers.
            // TODO: what if an "old" membersUpdated event is processed?
            _members = new MemberTable();
        }

        // the rest only applies to a connection to a member that is in the members table
        if (!_members.ContainsMember(connection.MemberId))
        {
            return;
        }

        if (_connected)
        {
            _logger.LogDebug($"Added connection {connection.Id.ToShortString()} to member {connection.MemberId.ToShortString()}.");
            return;
        }

        // first connection to an actual member: change state & trigger event
        // change Started | Disconnected -> Connected, ignore otherwise, it could be ShuttingDown or Shutdown
        _logger.LogDebug($"Added connection {connection.Id.ToShortString()} to member {connection.MemberId.ToShortString()}, now connected.");
        _clusterState.ChangeState(ClientState.Connected, ClientState.Started, ClientState.Disconnected);
        _connected = true;
    }
}
/// <summary>
/// Offers a connection as a candidate to support the cluster view event.
/// </summary>
/// <param name="connection">The candidate connection.</param>
/// <remarks>
/// <para>Nothing happens when a connection already supports the cluster view event, or when an
/// assignment task already exists. Otherwise an assignment task is started, with
/// <paramref name="connection"/> as its candidate.</para>
/// </remarks>
private void ProposeClusterViewsConnection(MemberConnection connection)
{
    lock (_clusterViewsMutex)
    {
        // already supported by a connection
        if (_clusterViewsConnection != null)
        {
            return;
        }

        // start the assignment task, unless there is one already
        if (_clusterViewsTask is null)
        {
            _clusterViewsTask = AssignClusterViewsConnectionAsync(connection, _cancel.Token);
        }
    }
}
/// <summary>
/// Starts the task that assigns a connection to handle cluster events, unless such a task already exists.
/// </summary>
/// <param name="connection">A candidate connection.</param>
/// <param name="cancellationToken">A cancellation token.</param>
private void StartSetClusterEventsConnectionWithLock(MemberConnection connection, CancellationToken cancellationToken)
{
    // only one instance of that task can run at a time - it runs in the background, and at
    // any time any connection could shutdown, which might clear the current cluster event
    // connection
    //
    // the task clears _clusterEventsTask itself when it succeeds
    if (_clusterEventsTask is null)
    {
        _clusterEventsTask = SetClusterEventsConnectionAsync(connection, cancellationToken);
    }
}
/// <summary>
/// (thread-unsafe) Notifies that a connection has been closed.
/// </summary>
/// <param name="connection">The connection.</param>
/// <returns><c>true</c> if the connection was registered, has been removed, and no connection
/// remains; otherwise <c>false</c>.</returns>
/// <remarks>
/// <para>This method is not thread-safe; the caller has to lock the
/// <see cref="Mutex"/> object to ensure thread-safety.</para>
/// </remarks>
public bool NotifyConnectionClosed(MemberConnection connection)
{
    // Remove reports whether the key was present, which is all the former ContainsKey test provided
    if (!_connections.Remove(connection.MemberId))
    {
        return false;
    }

    return _connections.Count == 0;
}
/// <summary>
/// Hands over a connection for termination.
/// </summary>
/// <param name="connection">The connection.</param>
/// <remarks>
/// <para>Invokes <c>ThrowIfDisposed</c> first. A warning is logged when the connection
/// could not be written to the internal collection.</para>
/// </remarks>
public void Add(MemberConnection connection)
{
    ThrowIfDisposed();

    var accepted = _connections.TryWrite(connection);
    if (!accepted)
    {
        // that should not happen, but log to be sure
        _logger.LogWarning($"Failed to add a connection ({connection}).");
    }
}
// performs one heartbeat pass on a connection to a member:
// terminates it on read timeout, pings it (with a ping timeout) when nothing was written for a period
private async Task RunAsync(MemberConnection connection, DateTime now, CancellationToken cancellationToken)
{
    // must ensure that timeout > interval ?!

    // we expect to read at least every 'timeout', which is greater than the interval, so
    // we *should* have read from the last ping, if nothing else - no read means that the
    // other end is not responsive, and the connection is terminated
    var sinceLastRead = now - connection.LastReadTime;
    if (sinceLastRead > _timeout)
    {
        await TerminateConnection(connection).CAF();
        return;
    }

    // nothing more to do unless the last write is older than a period
    var sinceLastWrite = now - connection.LastWriteTime;
    if (!(sinceLastWrite > _period))
    {
        return;
    }

    // write a ping, which should trigger a read when the response arrives
    _logger.LogDebug("Ping client {ClientId}", connection.Id);
    var ping = ClientPingCodec.EncodeRequest();

    // disposed when leaving the method - if .SendToMemberAsync() throws before awaiting,
    // .TimeoutAfter() is never invoked and therefore cannot dispose the source
    using var linked = CancellationTokenSource.CreateLinkedTokenSource(cancellationToken);

    try
    {
        // cannot wait forever on a ping
        var pong = await _clusterMessaging
            .SendToMemberAsync(ping, connection, linked.Token)
            .TimeoutAfter(_pingTimeout, linked, true)
            .CAF();

        // just to be sure everything is ok
        _ = ClientPingCodec.DecodeResponse(pong);
    }
    catch (TaskTimeoutException)
    {
        await TerminateConnection(connection).CAF();
    }
    catch (Exception exception)
    {
        // unexpected
        _logger.LogWarning(exception, "Heartbeat has thrown an exception.");
    }
}
// tries to authenticate // returns a result if successful // returns null if failed due to credentials (may want to retry) // throws if anything else went wrong private async ValueTask <AuthenticationResult> TryAuthenticateAsync(MemberConnection client, string clusterName, Guid clusterClientId, string clusterClientName, ISet <string> labels, ICredentialsFactory credentialsFactory, CancellationToken cancellationToken) { const string clientType = "CSP"; // CSharp var serializationVersion = _serializationService.GetVersion(); var clientVersion = ClientVersion; var credentials = credentialsFactory.NewCredentials(); ClientMessage requestMessage; switch (credentials) { case IPasswordCredentials passwordCredentials: requestMessage = ClientAuthenticationCodec.EncodeRequest(clusterName, passwordCredentials.Name, passwordCredentials.Password, clusterClientId, clientType, serializationVersion, clientVersion, clusterClientName, labels); break; case ITokenCredentials tokenCredentials: requestMessage = ClientAuthenticationCustomCodec.EncodeRequest(clusterName, tokenCredentials.GetToken(), clusterClientId, clientType, serializationVersion, clientVersion, clusterClientName, labels); break; default: var bytes = _serializationService.ToData(credentials).ToByteArray(); requestMessage = ClientAuthenticationCustomCodec.EncodeRequest(clusterName, bytes, clusterClientId, clientType, serializationVersion, clientVersion, clusterClientName, labels); break; } cancellationToken.ThrowIfCancellationRequested(); HConsole.WriteLine(this, "Send auth request"); var responseMessage = await client.SendAsync(requestMessage).CfAwait(); HConsole.WriteLine(this, "Rcvd auth response"); var response = ClientAuthenticationCodec.DecodeResponse(responseMessage); HConsole.WriteLine(this, "Auth response is: " + (AuthenticationStatus)response.Status); return((AuthenticationStatus)response.Status switch { AuthenticationStatus.Authenticated => new AuthenticationResult(response.ClusterId, response.MemberUuid, 
response.Address, response.ServerHazelcastVersion, response.FailoverSupported, response.PartitionCount, response.SerializationVersion, credentials.Name), AuthenticationStatus.CredentialsFailed => null, // could want to retry AuthenticationStatus.NotAllowedInCluster => throw new AuthenticationException("Client is not allowed in cluster."), AuthenticationStatus.SerializationVersionMismatch => throw new AuthenticationException("Serialization mismatch."), _ => throw new AuthenticationException($"Received unsupported status code {response.Status}.") });
// logs a heartbeat timeout and terminates the connection, if it still is active
private async Task TerminateConnection(MemberConnection connection)
{
    if (connection.Active)
    {
        _logger.LogWarning("Heartbeat timeout for connection {ConnectionId}, terminating.", connection.Id);
        await connection.TerminateAsync().CAF(); // does not throw

        // TODO: original code has reasons for closing connections
        //connection.Close(reason, new TargetDisconnectedException($"Heartbeat timed out to connection {connection}"));
    }
}
/// <summary>
/// Registers a new connection with the members service.
/// </summary>
/// <param name="memberId">The identifier of the member.</param>
/// <param name="connection">The connection.</param>
/// <returns><c>true</c> if no connection was registered before this one; otherwise <c>false</c>.</returns>
/// <exception cref="HazelcastException">A connection is already registered for <paramref name="memberId"/>.</exception>
/// <remarks>
/// <para>This method should be invoked within the global cluster lock.</para>
/// </remarks>
public bool NotifyNewConnection(Guid memberId, MemberConnection connection)
{
    // captured before adding
    var wasEmpty = _connections.IsEmpty;

    // TryAdd fails when the key exists already, and leaves the existing entry in place
    if (!_connections.TryAdd(memberId, connection))
    {
        throw new HazelcastException("Duplicate client.");
    }

    return wasEmpty;
}
/// <summary>
/// Forgets the connection that currently handles cluster events, provided that it is the specified <paramref name="connection"/>.
/// </summary>
/// <param name="connection">A connection.</param>
/// <returns><c>true</c> if <paramref name="connection"/> was the cluster events connection, and has been cleared; otherwise <c>false</c>.</returns>
private bool ClearClusterEventsConnectionWithLock(MemberConnection connection)
{
    var isClusterEventsConnection = _clusterEventsConnection == connection;

    if (isClusterEventsConnection)
    {
        // clear the connection, its correlated subscription, and its correlation identifier
        _clusterEventsConnection = null;
        _correlatedSubscriptions.TryRemove(_clusterEventsCorrelationId, out _);
        _clusterEventsCorrelationId = 0;
    }

    // any other connection is ignored
    return isClusterEventsConnection;
}
/// <summary>
/// Installs the supplied subscriptions on the connection to a new member.
/// </summary>
/// <param name="connection">The connection to the new member.</param>
/// <param name="subscriptions">The subscriptions to install.</param>
/// <param name="cancellationToken">A cancellation token.</param>
/// <returns>A task that will complete when all active subscriptions have been processed.</returns>
/// <exception cref="HazelcastException">Installing a subscription reported that the connection is
/// not active, that the server is confused, or that the installation failed.</exception>
/// <exception cref="NotSupportedException">Installing a subscription reported an unknown result.</exception>
/// <exception cref="OperationCanceledException">Cancellation was requested.</exception>
private async Task InstallSubscriptionsOnNewMember(MemberConnection connection, IReadOnlyCollection<ClusterSubscription> subscriptions, CancellationToken cancellationToken)
{
    // the client has been added to _clients, and subscriptions have been
    // captured already, all within a _clientsLock, but the caller...?

    foreach (var candidate in subscriptions)
    {
        cancellationToken.ThrowIfCancellationRequested();

        // only subscriptions that still are active are installed
        if (candidate.Active)
        {
            // this never throws
            var attempt = await InstallSubscriptionAsync(candidate, connection, cancellationToken).CfAwait();

            switch (attempt.Value)
            {
                case InstallResult.Success:
                case InstallResult.SubscriptionNotActive:
                    // move on to the next subscription
                    break;

                case InstallResult.ClientNotActive:
                    // not active: some other code must have:
                    // - removed the client from _clients
                    // - dealt with its existing subscriptions
                    // nothing left to do here
                    throw new HazelcastException("Failed to install the new connection (removed).");

                case InstallResult.ConfusedServer:
                    // same as subscription not active, but we failed to remove the subscription
                    // on the server side - the client is dirty - just kill it entirely
                    ClearMemberSubscriptions(subscriptions, connection);
                    throw new HazelcastException("Failed to install the new connection (confused).");

                case InstallResult.Failed:
                    // failed to talk to the client - nothing works - kill it entirely
                    ClearMemberSubscriptions(subscriptions, connection);
                    throw new HazelcastException("Failed to install the new connection (failed).");

                default:
                    throw new NotSupportedException();
            }
        }
    }
}
/// <summary>
/// Records a client subscription for a connection, provided that this subscription is active.
/// </summary>
/// <param name="message">The subscription response message.</param>
/// <param name="client">The client.</param>
/// <returns>Whether the client subscription was added, and the server identifier read from <paramref name="message"/>.</returns>
public (bool, Guid) TryAddClientSubscription(ClientMessage message, MemberConnection client)
{
    // the server identifier is read from the response whether or not the subscription is added
    var id = _subscribeResponseReader(message, State);

    lock (_activeLock)
    {
        if (!_active)
        {
            return (false, id);
        }

        _clientSubscriptions[client] = new MemberSubscription(this, id, message.CorrelationId, client);
        return (true, id);
    }
}
/// <inheritdoc />
public async ValueTask<AuthenticationResult> AuthenticateAsync(MemberConnection client, string clusterName, Guid clusterClientId, string clusterClientName, ISet<string> labels, ISerializationService serializationService, CancellationToken cancellationToken)
{
    if (client is null)
    {
        throw new ArgumentNullException(nameof(client));
    }

    if (serializationService is null)
    {
        throw new ArgumentNullException(nameof(serializationService));
    }

    // the configured credentials factory, if any, is used and is not disposed here -
    // when there is none, a default one is created, and disposed when leaving the method
    var credentialsFactory = _options.CredentialsFactory.Service;
    using var fallback = credentialsFactory != null ? null : new DefaultCredentialsFactory();
    credentialsFactory ??= fallback;

    var result = await TryAuthenticateAsync(client, clusterName, clusterClientId, clusterClientName, labels, credentialsFactory, serializationService, cancellationToken).CAF();

    // a null result means credentials failed - one more attempt is made after resetting
    // the factory, when the factory supports being reset
    if (result == null && credentialsFactory is IResettableCredentialsFactory resettable)
    {
        resettable.Reset();
        result = await TryAuthenticateAsync(client, clusterName, clusterClientId, clusterClientName, labels, credentialsFactory, serializationService, cancellationToken).CAF();
    }

    if (result == null)
    {
        // nah, no chance
        throw new AuthenticationException("Invalid credentials.");
    }

    return result;
}
/// <summary>
/// Finds a connection that accepts the cluster events subscription, and assigns it to handle cluster events.
/// </summary>
/// <param name="connection">An optional candidate connection, tried first.</param>
/// <param name="cancellationToken">A cancellation token.</param>
/// <returns>A task that will complete when a connection has been assigned to handle cluster events,
/// or when cancellation has been requested.</returns>
private async Task SetClusterEventsConnectionAsync(MemberConnection connection, CancellationToken cancellationToken)
{
    // loops until a connection is assigned, or the task is cancelled,
    // when the cluster goes down (and never up again)
    while (!cancellationToken.IsCancellationRequested)
    {
        // no candidate (anymore): pick a random connection
        if (connection is null)
        {
            connection = _clusterMembers.GetRandomConnection(false);
        }

        if (connection == null)
        {
            // no clients => wait for clients
            // TODO: consider IRetryStrategy?
            await Task.Delay(_clusterState.Options.Networking.WaitForClientMilliseconds, cancellationToken).CAF();
            continue;
        }

        // try to subscribe, relying on the default invocation timeout,
        // so this is not going to last forever - we know it will end
        var correlationId = _clusterState.GetNextCorrelationId();
        var subscribed = await SubscribeToClusterEventsAsync(connection, correlationId, cancellationToken).CAF(); // does not throw

        if (subscribed)
        {
            // success!
            using (await _clusterState.ClusterLock.AcquireAsync(CancellationToken.None).CAF())
            {
                _clusterEventsConnection = connection;
                _clusterEventsCorrelationId = correlationId;

                // avoid race conditions, this task is going to end, and if the
                // client dies we want to be sure we restart the task
                _clusterEventsTask = null;
            }

            return;
        }

        // failed => try another client
        connection = null;
    }
}
/// <summary>
/// Finds a connection that accepts the cluster events subscription, and assigns it to handle cluster events.
/// </summary>
/// <param name="connection">An optional candidate connection, tried first.</param>
/// <param name="cancellationToken">A cancellation token.</param>
/// <returns>A task that will complete when a connection has been assigned to handle cluster events,
/// or when cancellation has been requested.</returns>
private async Task SetClusterEventsConnectionAsync(MemberConnection connection, CancellationToken cancellationToken)
{
    // TODO: throttle

    // loops until a connection is assigned, or the task is cancelled,
    // when the cluster goes down (and never up again)
    while (!cancellationToken.IsCancellationRequested)
    {
        // no candidate (anymore): wait for a random connection
        if (connection is null)
        {
            connection = await _clusterMembers.WaitRandomConnection(cancellationToken).CfAwait();
        }

        // try to subscribe, relying on the default invocation timeout,
        // so this is not going to last forever - we know it will end
        var correlationId = _clusterState.GetNextCorrelationId();
        var subscribed = await SubscribeToClusterEventsAsync(connection, correlationId, cancellationToken).CfAwait(); // does not throw

        if (subscribed)
        {
            // success!
            lock (_clusterState.Mutex)
            {
                _clusterEventsConnection = connection;
                _clusterEventsCorrelationId = correlationId;

                // avoid race conditions, this task is going to end, and if the
                // client dies we want to be sure we restart the task
                _clusterEventsTask = null;
            }

            return;
        }

        // FIXME tests fail
        // because of ClientOfflineException in ClusterMessaging.SendToMember
        // ClusterState.GetLinkedCancellation -> ClientOfflineException
        // but, if it's offline, how come we have a connection from WaitRandomConnection?!

        // failed => try another client
        connection = null;
    }
}
/// <summary>
/// Forgets the connection that currently supports the cluster view event, provided that it is the specified <paramref name="connection"/>.
/// </summary>
/// <param name="connection">A connection.</param>
/// <remarks>
/// <para>If <paramref name="connection"/> was supporting the cluster view event, it is cleared along with
/// its correlated subscription, and a background task is started (unless one exists already) to assign
/// another connection to support the cluster view event.</para>
/// </remarks>
private void ClearClusterViewsConnection(MemberConnection connection)
{
    // note: we do not "unsubscribe" - if we come here, the connection is gone

    lock (_clusterViewsMutex)
    {
        var isClusterViewsConnection = _clusterViewsConnection == connection;

        if (isClusterViewsConnection)
        {
            // clear the connection, its correlated subscription, and its correlation identifier
            _clusterViewsConnection = null;
            _correlatedSubscriptions.TryRemove(_clusterViewsCorrelationId, out _);
            _clusterViewsCorrelationId = 0;

            HConsole.WriteLine(this, "ClusterViews: no connection.");

            // assign another connection (async), unless an assignment task exists already
            if (_clusterViewsTask is null)
            {
                _clusterViewsTask = AssignClusterViewsConnectionAsync(null, _cancel.Token);
            }
        }

        // any other connection is ignored
    }
}
// (background) adds subscriptions on one member - when a connection is added
//
// stops at the first subscription for which the connection is reported not active, or for
// which talking to the server failed (the connection is then handed over for termination),
// and in all cases but cancellation removes the connection from _subscribeTasks when done
private async Task AddSubscriptionsAsync(MemberConnection connection, IReadOnlyCollection<ClusterSubscription> subscriptions, CancellationToken cancellationToken)
{
    // this is a background task and therefore should never throw!

    foreach (var subscription in subscriptions)
    {
        if (cancellationToken.IsCancellationRequested)
        {
            return;
        }

        // this never throws
        var attempt = await AddSubscriptionAsync(subscription, connection, cancellationToken).CfAwait();
        var result = attempt.Value;

        // note: these tests are deliberately *not* written as a switch statement - inside a
        // switch, 'break' only leaves the switch and the loop would go on with the remaining
        // subscriptions, trying a dead connection again and again

        if (result == InstallResult.ConnectionNotActive)
        {
            // not active = has been removed = what has been done already has been undone
            break; // leave the loop
        }

        if (result == InstallResult.Failed)
        {
            // failed to talk to the server - this connection is not working
            _terminateConnections.Add(connection);
            break; // leave the loop
        }

        // Success (ok), SubscriptionNotActive (ignore it), anything else: next subscription
    }

    // we are done now
    lock (_subscribeTasksMutex)
    {
        _subscribeTasks.Remove(connection);
    }
}
/// <summary>
/// Removes, for each supplied cluster subscription, the client subscription of a connection
/// along with its correlated subscription.
/// </summary>
/// <param name="subscriptions">Cluster subscriptions.</param>
/// <param name="connection">The connection to the member.</param>
private void ClearMemberSubscriptions(IEnumerable<ClusterSubscription> subscriptions, MemberConnection connection)
{
    foreach (var clusterSubscription in subscriptions)
    {
        // remove the client subscription - nothing more to do when there was none
        var removed = clusterSubscription.TryRemove(connection, out var memberSubscription);
        if (!removed)
        {
            continue;
        }

        // remove the correlated subscription
        _correlatedSubscriptions.TryRemove(memberSubscription.CorrelationId, out _);
    }
}
/// <summary>
/// Creates an <see cref="Invocation"/> that is bound to a client connection.
/// </summary>
/// <param name="requestMessage">The request message.</param>
/// <param name="messagingOptions">Messaging options.</param>
/// <param name="targetClientConnection">The client connection that the invocation is bound to.</param>
/// <param name="cancellationToken">A cancellation token.</param>
/// <exception cref="ArgumentNullException"><paramref name="targetClientConnection"/> is <c>null</c>.</exception>
/// <remarks>
/// <para>When an invocation is bound to a client, it will only be sent to that client,
/// and it cannot and will not be retried if the client dies.</para>
/// </remarks>
public Invocation(ClientMessage requestMessage, MessagingOptions messagingOptions, MemberConnection targetClientConnection, CancellationToken cancellationToken)
    : this(requestMessage, messagingOptions, cancellationToken)
{
    // validated after the chained constructor has run
    if (targetClientConnection is null)
    {
        throw new ArgumentNullException(nameof(targetClientConnection));
    }

    TargetClientConnection = targetClientConnection;
}
/// <summary>
/// Installs a subscription on one member.
/// </summary>
/// <param name="subscription">The subscription.</param>
/// <param name="connection">The connection to the member.</param>
/// <param name="cancellationToken">A cancellation token.</param>
/// <returns>A task that will complete when the installation has been attempted, and whose result is:
/// <see cref="InstallResult.Success"/> when the subscription is installed;
/// <see cref="InstallResult.ClientNotActive"/> when the connection is not active;
/// <see cref="InstallResult.Failed"/> when the subscribe request failed on an active connection;
/// <see cref="InstallResult.SubscriptionNotActive"/> when the cluster subscription is not active anymore;
/// <see cref="InstallResult.ConfusedServer"/> when the server-side subscription could not be undone.</returns>
private async ValueTask<Attempt<InstallResult>> InstallSubscriptionAsync(ClusterSubscription subscription, MemberConnection connection, CancellationToken cancellationToken)
{
    // if we already know the client is not active anymore, ignore it
    // otherwise, install on this client - may throw if the client goes away in the meantime
    if (!connection.Active)
    {
        return Attempt.Fail(InstallResult.ClientNotActive);
    }

    // add immediately, we don't know when the events will start to come
    var correlationId = _clusterState.GetNextCorrelationId();
    _correlatedSubscriptions[correlationId] = subscription;

    // we do not control the original subscription.SubscribeRequest message and it may
    // be used concurrently, and so it is not safe to alter its correlation identifier.
    // instead, we use a safe clone of the original message
    var subscribeRequest = subscription.SubscribeRequest.CloneWithNewCorrelationId(correlationId);

    ClientMessage response;
    try
    {
        // hopefully the client is still active, else this will throw
        response = await _clusterMessaging.SendToMemberAsync(subscribeRequest, connection, correlationId, cancellationToken).CfAwait();
    }
    catch (Exception e)
    {
        _correlatedSubscriptions.TryRemove(correlationId, out _);

        // a connection that became inactive is an expected cause and is not logged
        if (!connection.Active)
        {
            return Attempt.Fail(InstallResult.ClientNotActive);
        }

        // the exception comes from the subscribe request itself, not from any cleanup
        _logger.LogError(e, "Caught exception while installing a subscription.");
        return Attempt.Fail(InstallResult.Failed, e);
    }

    // try to add the client subscription
    var (added, id) = subscription.TryAddClientSubscription(response, connection);
    if (added)
    {
        return InstallResult.Success;
    }

    // otherwise, the client subscription could not be added, which means that the
    // cluster subscription is not active anymore, and so we need to undo the
    // server-side subscription

    // if the client is gone already it may be that the subscription has been
    // removed already, in which case... just give up now
    if (!_correlatedSubscriptions.TryRemove(correlationId, out _))
    {
        return Attempt.Fail(InstallResult.SubscriptionNotActive);
    }

    var unsubscribeRequest = subscription.CreateUnsubscribeRequest(id);

    try
    {
        var unsubscribeResponse = await _clusterMessaging.SendToMemberAsync(unsubscribeRequest, connection, cancellationToken).CfAwait();
        var unsubscribed = subscription.ReadUnsubscribeResponse(unsubscribeResponse);
        return unsubscribed
            ? Attempt.Fail(InstallResult.SubscriptionNotActive)
            : Attempt.Fail(InstallResult.ConfusedServer);
    }
    catch (Exception e)
    {
        // otherwise, we failed to undo the server-side subscription - end result is that
        // the client is fine (won't handle events, we've removed the correlated subscription
        // etc) but the server maybe confused.
        _logger.LogError(e, "Caught exception while cleaning up after failing to install a subscription.");
        return Attempt.Fail(InstallResult.ConfusedServer, e);
    }
}
// performs one heartbeat pass on a connection to a member:
// hands it over for termination on read timeout, pings it when nothing was written for a period
private async Task RunAsync(MemberConnection connection, DateTime now, CancellationToken cancellationToken)
{
    var readElapsed = now - connection.LastReadTime;
    var writeElapsed = now - connection.LastWriteTime;

    HConsole.WriteLine(this, $"Heartbeat {_clusterState.ClientName} on {connection.Id.ToShortString()} to {connection.MemberId.ToShortString()} at {connection.Address}, " +
                             $"written {(int)writeElapsed.TotalSeconds}s ago, " +
                             $"read {(int)readElapsed.TotalSeconds}s ago");

    // hands the connection over for termination, if it still is active
    void TerminateIfActive()
    {
        if (connection.Active)
        {
            _terminateConnections.Add(connection);
        }
    }

    // we wrote recently (within a period) yet have read nothing for longer than the
    // timeout: the other end is considered not responsive
    var readTimedOut = readElapsed > _timeout && writeElapsed < _period;
    if (readTimedOut)
    {
        _logger.LogWarning("Heartbeat timeout for connection {ConnectionId}, terminating.", connection.Id.ToShortString());
        TerminateIfActive();
        return;
    }

    // nothing to do unless the last write is older than a period
    if (!(writeElapsed > _period))
    {
        return;
    }

    // write a ping, which should trigger a read when the response arrives
    _logger.LogDebug("Ping client {ClientId}", connection.Id.ToShortString());
    var ping = ClientPingCodec.EncodeRequest();

    try
    {
        // ping should complete within the default invocation timeout
        var pong = await _clusterMessaging
            .SendToMemberAsync(ping, connection, cancellationToken)
            .CfAwait();

        // just to be sure everything is ok
        _ = ClientPingCodec.DecodeResponse(pong);
    }
    catch (ClientOfflineException)
    {
        // down
    }
    catch (TaskTimeoutException)
    {
        _logger.LogWarning("Heartbeat ping timeout for connection {ConnectionId}, terminating.", connection.Id.ToShortString());
        TerminateIfActive();
    }
    catch (Exception exception)
    {
        // unexpected
        _logger.LogWarning(exception, "Heartbeat has thrown an exception, but will continue.");
    }
}
/// <summary>
/// Removes a connection, while holding the mutex.
/// </summary>
/// <param name="connection">The connection to remove.</param>
public void RemoveConnection(MemberConnection connection)
{
    lock (_mutex)
    {
        _connections.Remove(connection);
    }
}
/// <summary>
/// Tries to remove the client subscription of a connection.
/// </summary>
/// <param name="client">The client.</param>
/// <param name="memberSubscription">The client subscription that was removed, if any.</param>
/// <returns>Whether a client subscription was removed.</returns>
public bool TryRemove(MemberConnection client, out MemberSubscription memberSubscription)
{
    return _clientSubscriptions.TryRemove(client, out memberSubscription);
}
/// <summary>
/// Looks up the connection to a member, while holding the mutex.
/// </summary>
/// <param name="memberId">The identifier of the member.</param>
/// <param name="connection">The connection, if one was found.</param>
/// <returns><c>true</c> if a connection to the specified member was found; otherwise <c>false</c>.</returns>
public bool TryGetConnection(Guid memberId, out MemberConnection connection)
{
    lock (Mutex)
    {
        var found = _connections.TryGetValue(memberId, out connection);
        return found;
    }
}