/// <summary>
/// Subscribes a connection to cluster events (member view and partition view events).
/// </summary>
/// <param name="connection">The connection.</param>
/// <param name="correlationId">The correlation identifier.</param>
/// <param name="cancellationToken">A cancellation token.</param>
/// <returns>A task that completes once the subscription request has been processed; its result is
/// <c>true</c> when the request succeeded, and <c>false</c> when any exception was caught.</returns>
private async Task<bool> SubscribeToClusterEventsAsync(MemberConnection connection, long correlationId, CancellationToken cancellationToken)
{
    HConsole.TraceLine(this, "subscribe");

    // forwards a cluster view event message to the member view / partition view handlers
    ValueTask OnClusterViewEventAsync(ClientMessage eventMessage, object state)
    {
        return ClientAddClusterViewListenerCodec.HandleEventAsync(
            eventMessage,
            HandleCodecMemberViewEvent,
            HandleCodecPartitionViewEvent,
            connection.Id,
            _clusterState.LoggerFactory);
    }

    var subscribed = false;

    try
    {
        var request = ClientAddClusterViewListenerCodec.EncodeRequest();

        // the correlated subscription is registered before the request is sent
        _correlatedSubscriptions[correlationId] = new ClusterSubscription(OnClusterViewEventAsync);

        // the response itself is not used
        _ = await _clusterMessaging.SendToMemberAsync(request, connection, correlationId, cancellationToken).CfAwait();

        HConsole.WriteLine(this, "subscribed");
        subscribed = true;
    }
    catch (Exception e)
    {
        HConsole.WriteLine(this, "failed " + e);

        // undo the registration and report the failure through the return value
        _correlatedSubscriptions.TryRemove(correlationId, out _);
        _logger.LogWarning(e, "Failed to subscribe to cluster events, may retry.");
    }

    return subscribed;
}
/// <summary>
/// Initializes a new instance of the <see cref="MemberSubscription"/> class.
/// </summary>
/// <param name="clusterSubscription">The cluster subscription that this member subscription belongs to.</param>
/// <param name="serverSubscriptionId">The unique identifier assigned by the server to the client subscription.</param>
/// <param name="correlationId">The correlation identifier.</param>
/// <param name="connection">The connection to the member.</param>
public MemberSubscription(ClusterSubscription clusterSubscription, Guid serverSubscriptionId, long correlationId, MemberConnection connection)
{
    // plain capture of the four constructor arguments, assigned in declaration order
    (ClusterSubscription, ServerSubscriptionId, CorrelationId, Connection) =
        (clusterSubscription, serverSubscriptionId, correlationId, connection);
}
/// <summary>
/// Removes a subscription from the cluster, i.e. from each member.
/// </summary>
/// <param name="subscription">The subscription to remove.</param>
/// <param name="cancellationToken">A cancellation token.</param>
/// <returns>A task that completes when the subscription has been removed from the known
/// subscriptions and, if some member-level subscriptions could not be removed, has been
/// added to the ghost subscriptions.</returns>
private async ValueTask RemoveSubscriptionAsync(ClusterSubscription subscription, CancellationToken cancellationToken = default)
{
    // de-activate first: from now on, events received from members for this
    // subscription will *not* be processed anymore, even if servers keep sending them
    subscription.Deactivate();

    var fullyRemoved = await RemoveMemberSubscriptionsAsync(subscription, cancellationToken).CfAwait();

    // the subscription is always dropped from the known subscriptions
    lock (_clusterState.Mutex)
    {
        _subscriptions.TryRemove(subscription.Id, out _);
    }

    if (!fullyRemoved)
    {
        // some member-level subscriptions are left over: remember the subscription
        // as a ghost, and create the collect task if none is assigned yet
        lock (_ghostLock)
        {
            _ghostSubscriptions.Add(subscription);

            if (_ghostTask == null)
            {
                _ghostTask = CollectSubscriptionsAsync(_clusterState.CancellationToken);
            }
        }
    }
}
// removes a subscription
//
// de-activates the cluster subscription, then for each of its member subscriptions:
// clears the correlated subscription, tries to remove the member subscription from
// the server and, if that is reported as failed, queues it for collection.
private async ValueTask RemoveSubscriptionAsync(ClusterSubscription subscription, CancellationToken cancellationToken)
{
    // de-activate the subscription: all further events will be ignored
    subscription.Deactivate();

    // capture the member subscriptions before processing them: the loop below removes
    // entries from the subscription, and we do not want to rely on the subscription's
    // enumerator tolerating removals while it is being enumerated
    var memberSubscriptions = new List<MemberSubscription>();
    foreach (var memberSubscription in subscription)
    {
        memberSubscriptions.Add(memberSubscription);
    }

    // for each member subscription
    foreach (var memberSubscription in memberSubscriptions)
    {
        // no cancellation check here: every member subscription is processed,
        // the token is only handed over to the server-side removal

        // remove the correlated subscription
        _correlatedSubscriptions.TryRemove(memberSubscription.CorrelationId, out _);

        // remove from the server
        // and, if it fails, enqueue for collection
        if (await RemoveSubscriptionAsync(memberSubscription, cancellationToken).CfAwait())
        {
            subscription.Remove(memberSubscription);
        }
        else
        {
            CollectSubscription(memberSubscription);
        }
    }
}
// queues every member subscription of a cluster subscription for collection,
// and creates the collect task if none is assigned yet
private void CollectSubscription(ClusterSubscription subscription)
{
    lock (_collectMutex)
    {
        foreach (var pending in subscription)
        {
            _collectSubscriptions.Add(pending);
        }

        // entries are queued before the task is created
        if (_collectTask == null)
        {
            _collectTask = CollectSubscriptionsAsync(_cancel.Token);
        }
    }
}
// _connections is the list of known member connections
// connections are added & removed by handling the ConnectionOpened and ConnectionClosed events
// note: a connection may be opened yet not correspond to any member
//
// _subscriptions is the list of known cluster subscriptions
// subscriptions are added & removed by invoking Add/RemoveSubscriptionAsync
// each subscription in _subscriptions must be added to each connection in _connections
//
// when a subscription is added,
// - (mutex): capture _connections connections, add the subscription to _subscriptions
// - for each connection
//   - add a correlated subscription (before adding on server!)
//   - add the subscription to the connection on server
//     - fails
//       - remove the correlated subscription
//       - because
//         - the connection is not active anymore = skip & continue with other connections
//         - any other reason = queue all member connections for collection
//       - fail
//   - try-add a member connection to subscription
//     - fails (because the subscription is not active anymore)
//       - remove the correlated subscription
//       - nothing else to do: the subscription has been de-activated = clean
//       - fail
//
// when a connection is added
// - (mutex): capture _subscriptions subscriptions, add the connection to _connections
// - for each subscription
//   - add a correlated subscription (before adding on server!)
//   - add the subscription to the connection on server
//     - fails
//       - remove the correlated subscription
//       - because
//         - the connection is not active anymore = queue all created member subscriptions for collection
//         - for any other reason = terminate the connection
//       - exit
//   - try-add the corresponding member connection to the subscription
//     - fails (because the subscription is not active anymore)
//       - remove the correlated subscription
//       - queue the member connection for collection
//       - skip & continue with other subscriptions
//
//
// when a subscription is removed
// - (mutex): remove the subscription from _subscriptions
// - de-activate the subscription (cannot add member subscriptions anymore)
// - for each member connection in the subscription,
//   - clear the correlated subscription
//   - remove from server
//     - fails because the connection is not active anymore = consider it a success
//     - fails for any other reason = queue the member subscription for collection
//
// note: meanwhile, if a connection is
// - added: it will not see the subscription, or see it de-activated
// - removed: removing from server will be considered a success
//
//
// when a connection is removed
// - (mutex): capture _subscriptions subscriptions, remove the connection from _connections
// - for each subscription
//   - remove the member subscription for the removed connection (cannot remove from server, connection is down)
//   - remove the corresponding correlated subscription
// - if it is the cluster views connection
//   - clear
//   - remove the corresponding correlated subscription
//   - start assigning another connection
//
// note: meanwhile, if a subscription is
// - added: it will not see the connection
// - removed: never mind, we just have nothing to remove

/// <summary>
/// Adds a subscription.
/// </summary>
/// <param name="subscription">The subscription.</param>
/// <param name="cancellationToken">A cancellation token.</param>
/// <returns>A task that will complete when the subscription has been added.</returns>
/// <exception cref="ArgumentNullException">The <paramref name="subscription"/> is <c>null</c>.</exception>
/// <exception cref="InvalidOperationException">A subscription with the same identifier is already registered.</exception>
/// <exception cref="OperationCanceledException">Cancellation was requested before a captured connection was processed.</exception>
/// <exception cref="HazelcastException">The subscription was de-activated in the meantime, or adding it on a connection failed.</exception>
public async Task AddSubscriptionAsync(ClusterSubscription subscription, CancellationToken cancellationToken = default)
{
    if (subscription == null)
    {
        throw new ArgumentNullException(nameof(subscription));
    }

    // under the mutex: snapshot the connections, then register the subscription
    List<MemberConnection> captured;
    lock (_mutex)
    {
        captured = _connections.ToList();

        // a duplicate identifier would be a nasty internal error, but better report it
        var registered = _subscriptions.TryAdd(subscription.Id, subscription);
        if (!registered)
        {
            throw new InvalidOperationException("A subscription with the same identifier already exists.");
        }
    }

    // add the subscription to each captured connection, one after the other
    // TODO: consider adding in parallel
    foreach (var connection in captured)
    {
        if (cancellationToken.IsCancellationRequested)
        {
            CollectSubscription(subscription); // undo what has been done already
            cancellationToken.ThrowIfCancellationRequested(); // and throw
        }

        // this never throws, the outcome is carried by the attempt
        var attempt = await AddSubscriptionAsync(subscription, connection, cancellationToken).CfAwait();
        var outcome = attempt.Value;

        // installed, or the connection went away: either way, move on to the next connection
        if (outcome == InstallResult.Success || outcome == InstallResult.ConnectionNotActive)
        {
            continue;
        }

        if (outcome == InstallResult.SubscriptionNotActive)
        {
            // not active = has been de-activated = what has been done already has been undone
            throw new HazelcastException("Failed to add the subscription because it was removed.");
        }

        if (outcome == InstallResult.Failed) // also if canceled
        {
            CollectSubscription(subscription); // undo what has been done already
            throw new HazelcastException("Failed to add subscription (see inner exception).", attempt.Exception);
        }

        // any other value is unexpected
        throw new NotSupportedException();
    }
}
// adds a subscription on one member
//
// returns an attempt carrying
// - Success: the member subscription was added to the cluster subscription
// - ConnectionNotActive: the connection was (or became) inactive
// - Failed: sending the request threw while the connection was still active (the exception is attached)
// - SubscriptionNotActive: the cluster subscription refused the member subscription
private async ValueTask<Attempt<InstallResult>> AddSubscriptionAsync(ClusterSubscription subscription, MemberConnection connection, CancellationToken cancellationToken)
{
    // a connection already known to be inactive is ignored; an active one may
    // still go away while we are talking to it, which is handled below
    if (!connection.Active)
    {
        return Attempt.Fail(InstallResult.ConnectionNotActive);
    }

    // register the correlated subscription now so it is ready when the first events come
    var correlationId = _clusterState.GetNextCorrelationId();
    _correlatedSubscriptions[correlationId] = subscription;

    // subscription.SubscribeRequest may be used concurrently: work on a clone
    // that carries our own correlation identifier
    var request = subscription.SubscribeRequest.CloneWithNewCorrelationId(correlationId);

    // talk to the server
    ClientMessage response;
    try
    {
        response = await _clusterMessaging.SendToMemberAsync(request, connection, correlationId, cancellationToken).CfAwait();
    }
    catch (Exception e)
    {
        _correlatedSubscriptions.TryRemove(correlationId, out _);

        if (connection.Active)
        {
            return Attempt.Fail(InstallResult.Failed, e); // also if canceled
        }

        return Attempt.Fail(InstallResult.ConnectionNotActive);
    }

    // try to attach the member subscription to the cluster subscription,
    // which is refused when the cluster subscription is not active anymore
    var memberSubscription = subscription.ReadSubscriptionResponse(response, connection);
    if (subscription.TryAddMemberSubscription(memberSubscription))
    {
        return InstallResult.Success;
    }

    // refused: drop the correlated subscription and queue the member subscription for collection
    _correlatedSubscriptions.TryRemove(correlationId, out _);
    CollectSubscription(memberSubscription);
    return Attempt.Fail(InstallResult.SubscriptionNotActive);
}
// removes the member subscriptions of a cluster subscription, one member after the other
//
// returns true if every member subscription was reported as removed; false if at
// least one removal returned false or threw (the exception is logged, not rethrown)
private async ValueTask<bool> RemoveMemberSubscriptionsAsync(ClusterSubscription subscription, CancellationToken cancellationToken = default)
{
    var success = true;
    var confirmed = new List<MemberSubscription>();

    // un-subscribe each client
    foreach (var memberSubscription in subscription)
    {
        bool removed;

        // if one client fails, log the exception but continue with other clients
        try
        {
            // this does
            // - remove the correlated subscription
            // - tries to properly unsubscribe from the server
            removed = await RemoveMemberSubscriptionAsync(memberSubscription, cancellationToken).CfAwait();
        }
        catch (Exception e)
        {
            _logger.LogError(e, "Caught an exception while unsubscribing to events.");
            removed = false;
        }

        if (removed)
        {
            confirmed.Add(memberSubscription);
        }
        else
        {
            success = false;
        }
    }

    // only now drop those that have effectively been removed
    foreach (var memberSubscription in confirmed)
    {
        subscription.Remove(memberSubscription);
    }

    return success;
}
/// <summary>
/// Adds an event.
/// </summary>
/// <param name="subscription">The event subscription.</param>
/// <param name="eventMessage">The event message.</param>
/// <returns><c>true</c> if the event has been added successfully; otherwise (if the scheduler
/// does not accept events anymore, because it has been disposed), <c>false</c>.</returns>
public bool Add(ClusterSubscription subscription, ClientMessage eventMessage)
{
    var partitionId = eventMessage.PartitionId;
    var entry = new EventData { PartitionId = partitionId, Subscription = subscription, Message = eventMessage };

    Queue queue;
    bool created;

    lock (_mutex)
    {
        if (_disposed)
        {
            HConsole.WriteLine(this, $"Discard event, correlation:{eventMessage.CorrelationId}");
            return false;
        }

        HConsole.WriteLine(this, $"Enqueue event, correlation:{eventMessage.CorrelationId} queue:{partitionId}");

        // events are queued per partition identifier; a missing queue is taken from the pool
        created = !_queues.TryGetValue(partitionId, out queue);
        if (created)
        {
            HConsole.WriteLine(this, $"Create queue:{partitionId}");
            queue = _pool.Get();
            _queues[partitionId] = queue;
        }

        queue.Enqueue(entry);
    }

    // for a queue that was just created, invoke Handle (outside of the lock)
    // and keep what it returns on the queue
    if (created)
    {
        queue.Task = Handle(partitionId, queue);
    }

    return true;
}
/// <summary>
/// Adds an event.
/// </summary>
/// <param name="subscription">The event subscription.</param>
/// <param name="eventMessage">The event message.</param>
/// <returns><c>true</c> if the event has been added successfully; otherwise (if the scheduler
/// does not accept events anymore, because it has been disposed), <c>false</c>.</returns>
public bool Add(ClusterSubscription subscription, ClientMessage eventMessage)
{
    var partitionId = eventMessage.PartitionId;
    var pending = new EventData { PartitionId = partitionId, Subscription = subscription, Message = eventMessage };

    Queue partitionQueue;
    bool isNewQueue;

    lock (_mutex)
    {
        // a disposed scheduler refuses the event
        if (_disposed)
        {
            return false;
        }

        // one queue per partition identifier; a missing queue is taken from the pool
        isNewQueue = !_queues.TryGetValue(partitionId, out partitionQueue);
        if (isNewQueue)
        {
            partitionQueue = _pool.Get();
            _queues[partitionId] = partitionQueue;
        }

        partitionQueue.Enqueue(pending);
    }

    // Handle is invoked outside of the lock, and only for a queue that was just created
    if (isNewQueue)
    {
        partitionQueue.Task = Handle(partitionId, partitionQueue);
    }

    return true;
}
/// <summary> /// Adds an event. /// </summary> /// <param name="subscription">The event subscription.</param> /// <param name="eventMessage">The event message.</param> /// <returns><c>true</c> if the even has been added successfully; otherwise (if the scheduler /// does not accept events anymore, because it has been disposed), <c>false</c>.</returns> public bool Add(ClusterSubscription subscription, ClientMessage eventMessage) { if (subscription == null) { throw new ArgumentNullException(nameof(subscription)); } if (eventMessage == null) { throw new ArgumentNullException(nameof(eventMessage)); } var partitionId = eventMessage.PartitionId; var state = new State { Subscription = subscription, Message = eventMessage, PartitionId = partitionId }; // the factories in ConcurrentDictionary.AddOrUpdate are *not* thread-safe, i.e. in order // to run with minimal locking, the ConcurrentDictionary may run the two of them, or one // of them multiple times, and only guarantees that one single unique value ends up in the // dictionary - in our case, that would be a problem, since the factories spawn tasks. // // a traditional way around this consists in having the factories return Lazy<Task> so // that AddOrUpdate returns a Lazy<Task> and one single unique task is created when getting // the .Value of that lazy. however this (a) adds another layer of locking, (b) implies // captures since Lazy<T> does not have a constructor that accept factory arguments, etc. // // this is annoying - so we are going with a normal dictionary and a global lock for now. // // ideas: // avoid creating a Lazy per task, but manage a per-partition lock (so we only lock the // partition, not the whole dictionary) - yet that would mean a concurrent dictionary of // locks, etc? 
/* * _lock.EnterReadLock(); * try * { * if (_disposed) return false; * * _ = _partitionTasks * .AddOrUpdate(partitionId, CreateFirstTask, AppendNextTask, state) * .ContinueWith(_clearAfterUse, state, default, TaskContinuationOptions.None, TaskScheduler.Current); * * return true; * } * finally * { * _lock.ExitReadLock(); * } */ Task task; lock (_mutex) { if (_disposed) { return(false); } if (!_partitionTasks.TryGetValue(partitionId, out task)) { task = Task.CompletedTask; } task = AddContinuation(task, state); _partitionTasks[partitionId] = task; } task.ContinueWith(_removeAfterUse, state, default, TaskContinuationOptions.None, TaskScheduler.Current);
/// <summary>
/// Installs a subscription on one member.
/// </summary>
/// <param name="subscription">The subscription.</param>
/// <param name="connection">The connection to the member.</param>
/// <param name="cancellationToken">A cancellation token.</param>
/// <returns>A task that will complete when the client has subscribed to the server event; its result
/// is an attempt carrying the <see cref="InstallResult"/> and, for some failures, the exception.</returns>
private async ValueTask<Attempt<InstallResult>> InstallSubscriptionAsync(ClusterSubscription subscription, MemberConnection connection, CancellationToken cancellationToken)
{
    // if we already know the client is not active anymore, ignore it
    // otherwise, install on this client - may throw if the client goes away in the meantime
    if (!connection.Active)
    {
        return Attempt.Fail(InstallResult.ClientNotActive);
    }

    // add immediately, we don't know when the events will start to come
    var correlationId = _clusterState.GetNextCorrelationId();
    _correlatedSubscriptions[correlationId] = subscription;

    // we do not control the original subscription.SubscribeRequest message and it may
    // be used concurrently, and so it is not safe to alter its correlation identifier.
    // instead, we use a safe clone of the original message
    var subscribeRequest = subscription.SubscribeRequest.CloneWithNewCorrelationId(correlationId);

    ClientMessage response;
    try
    {
        // hopefully the client is still active, else this will throw
        response = await _clusterMessaging.SendToMemberAsync(subscribeRequest, connection, correlationId, cancellationToken).CfAwait();
    }
    catch (Exception e)
    {
        _correlatedSubscriptions.TryRemove(correlationId, out _);

        // a client that went away is not an error worth logging
        if (!connection.Active)
        {
            return Attempt.Fail(InstallResult.ClientNotActive);
        }

        // this is the install request itself failing, not the cleanup further down,
        // so the log entry must say so
        _logger.LogError(e, "Caught exception while installing a subscription.");
        return Attempt.Fail(InstallResult.Failed, e);
    }

    // try to add the client subscription
    var (added, id) = subscription.TryAddClientSubscription(response, connection);
    if (added)
    {
        return InstallResult.Success;
    }

    // otherwise, the client subscription could not be added, which means that the
    // cluster subscription is not active anymore, and so we need to undo the
    // server-side subscription

    // if the client is gone already it may be that the subscription has been
    // removed already, in which case... just give up now
    if (!_correlatedSubscriptions.TryRemove(correlationId, out _))
    {
        return Attempt.Fail(InstallResult.SubscriptionNotActive);
    }

    var unsubscribeRequest = subscription.CreateUnsubscribeRequest(id);

    try
    {
        var unsubscribeResponse = await _clusterMessaging.SendToMemberAsync(unsubscribeRequest, connection, cancellationToken).CfAwait();
        var unsubscribed = subscription.ReadUnsubscribeResponse(unsubscribeResponse);
        return unsubscribed
            ? Attempt.Fail(InstallResult.SubscriptionNotActive)
            : Attempt.Fail(InstallResult.ConfusedServer);
    }
    catch (Exception e)
    {
        // otherwise, we failed to undo the server-side subscription - end result is that
        // the client is fine (won't handle events, we've removed the correlated subscription
        // etc) but the server maybe confused.
        _logger.LogError(e, "Caught exception while cleaning up after failing to install a subscription.");
        return Attempt.Fail(InstallResult.ConfusedServer, e);
    }
}
/// <summary>
/// Installs a subscription on the cluster, i.e. on each member.
/// </summary>
/// <param name="subscription">The subscription.</param>
/// <param name="cancellationToken">A cancellation token.</param>
/// <returns>A task that will complete when the subscription has been installed.</returns>
/// <exception cref="ArgumentNullException">The <paramref name="subscription"/> is <c>null</c>.</exception>
/// <exception cref="InvalidOperationException">A subscription with the same identifier is already registered.</exception>
/// <exception cref="HazelcastException">The subscription was removed in the meantime, or installing it on a member failed.</exception>
public async Task InstallSubscriptionAsync(ClusterSubscription subscription, CancellationToken cancellationToken = default)
{
    if (subscription == null)
    {
        throw new ArgumentNullException(nameof(subscription));
    }

    // atomically: snapshot the connections, then register the subscription
    List<MemberConnection> captured;
    lock (_clusterState.Mutex)
    {
        captured = _clusterMembers.SnapshotConnections(true);

        var registered = _subscriptions.TryAdd(subscription.Id, subscription);
        if (!registered)
        {
            throw new InvalidOperationException("A subscription with the same identifier already exists.");
        }
    }

    // from now on,
    // - if new clients are added, we won't deal with them here, but they will
    //   subscribe on their own since the subscription is now listed.
    // - if a captured client goes away while we install subscriptions, we
    //   will just ignore the associated errors and skip it entirely.

    // subscribe each captured client, one after the other
    // TODO: could we install in parallel?
    foreach (var connection in captured)
    {
        // don't even try clients that became inactive
        if (!connection.Active)
        {
            continue;
        }

        // this never throws, the outcome is carried by the attempt
        var attempt = await InstallSubscriptionAsync(subscription, connection, cancellationToken).CfAwait();

        switch (attempt.Value)
        {
            case InstallResult.Success:
            case InstallResult.ClientNotActive:
                // installed, or the client went away: move on to the next one
                continue;

            // not active: some other code must have
            // - removed the subscriptions from _subscriptions
            // - dealt with its existing clients
            // nothing left to do here
            case InstallResult.SubscriptionNotActive:
                throw new HazelcastException(
                    "Failed to install the subscription because it was removed.",
                    attempt.Exception);

            case InstallResult.ConfusedServer:
                throw new HazelcastException(
                    "Failed to install the subscription because it was removed (and the server may be confused).",
                    attempt.Exception);

            case InstallResult.Failed:
                // failed: client is active but installing the subscription failed
                // however, we might have installed it on other clients
                await RemoveSubscriptionAsync(subscription, cancellationToken).CfAwait();
                throw new HazelcastException("Failed to install subscription (see inner exception).", attempt.Exception);

            default:
                throw new NotSupportedException();
        }
    }
}
/// <summary>
/// Removes a subscription from the cluster, i.e. from each member.
/// </summary>
/// <param name="subscription">The subscription to remove.</param>
/// <param name="throwOnError">Whether to throw on error (or return <c>false</c>).</param>
/// <param name="cancellationToken">A cancellation token.</param>
/// <returns><c>true</c> if every member-level removal reported success; otherwise <c>false</c>
/// (only when <paramref name="throwOnError"/> is <c>false</c>).</returns>
/// <exception cref="AggregateException">At least one member-level removal threw, and <paramref name="throwOnError"/> is <c>true</c>.</exception>
/// <exception cref="HazelcastException">At least one member-level removal reported a failure without throwing, and <paramref name="throwOnError"/> is <c>true</c>.</exception>
private async ValueTask<bool> RemoveSubscriptionAsync(ClusterSubscription subscription, bool throwOnError, CancellationToken cancellationToken = default)
{
    // de-activate the subscription: events received from members will *not* be processed anymore,
    // even if we receive more event messages from the servers
    subscription.Deactivate();

    List<Exception> errors = null;
    var allRemoved = true;
    var processed = new List<MemberSubscription>();

    // un-subscribe each client
    foreach (var memberSubscription in subscription)
    {
        // if one client fails, keep the exception but continue with other clients
        try
        {
            // this does
            // - remove the correlated subscription
            // - tries to properly unsubscribe from the server
            var removed = await RemoveSubscriptionAsync(memberSubscription, cancellationToken).CAF();
            if (!removed)
            {
                allRemoved = false;
            }

            // note: listed whenever the call returned without throwing, whatever it returned
            processed.Add(memberSubscription);
        }
        catch (Exception e)
        {
            if (errors == null)
            {
                errors = new List<Exception>();
            }

            errors.Add(e);
            allRemoved = false;
        }
    }

    // drop the listed member subscriptions from the cluster subscription
    foreach (var memberSubscription in processed)
    {
        subscription.Remove(memberSubscription);
    }

    // if all went well, remove the subscription, otherwise keep it around
    // so one can try again to unsubscribe - note that the subscription is
    // de-activated, so it will not trigger events anymore.
    using (await _clusterState.ClusterLock.AcquireAsync(CancellationToken.None).CAF())
    {
        if (allRemoved)
        {
            _subscriptions.TryRemove(subscription.Id, out _);
        }
    }

    if (throwOnError)
    {
        // if at least an exception was thrown, rethrow them all together
        if (errors != null)
        {
            throw new AggregateException("Failed to fully remove the subscription (and the server may be confused).", errors.ToArray());
        }

        // if !allRemoved, everything has been removed from the client side,
        // but the server may still think it needs to send events, so it's kinda dirty.
        if (!allRemoved)
        {
            throw new HazelcastException("Failed to fully remove the subscription (and the server may be confused).");
        }
    }

    return allRemoved;
}