/// <summary>
/// Consumes membership table updates until the stream ends, converting each table snapshot
/// into a cluster membership update and passing it to <c>NotifyObservers</c>.
/// </summary>
/// <remarks>
/// Exceptions which the fatal error handler reports as unexpected are logged and forwarded to
/// <c>OnFatalException</c>; all other exceptions propagate to the caller.
/// </remarks>
private async Task ProcessMembershipUpdates()
{
    ClusterMembershipSnapshot lastSnapshot = default;
    try
    {
        if (this.log.IsEnabled(LogLevel.Debug))
        {
            this.log.LogDebug("Starting to process membership updates");
        }

        var tableUpdates = this.membershipTableManager.MembershipTableUpdates.WithCancellation(this.cancellation.Token);
        await foreach (var table in tableUpdates)
        {
            var current = table.CreateClusterMembershipSnapshot();

            // Without a prior snapshot, or when the version is MinValue, there is nothing to diff
            // against, so the whole snapshot is sent as the update.
            var sendWholeSnapshot = lastSnapshot is null || current.Version == MembershipVersion.MinValue;
            var change = sendWholeSnapshot ? current.AsUpdate() : current.CreateUpdate(lastSnapshot);

            this.NotifyObservers(change);
            lastSnapshot = current;
        }
    }
    catch (Exception error) when (this.fatalErrorHandler.IsUnexpected(error))
    {
        this.log.LogError("Error processing membership updates: {Exception}", error);
        this.fatalErrorHandler.OnFatalException(this, nameof(ProcessMembershipUpdates), error);
    }
    finally
    {
        if (this.log.IsEnabled(LogLevel.Debug))
        {
            this.log.LogDebug("Stopping membership update processor");
        }
    }
}
/// <summary>
/// Initializes the mock membership service from an optional set of silo statuses, builds the
/// initial snapshot from them, and creates the update stream with that snapshot as its initial value.
/// </summary>
/// <param name="initialStatuses">
/// The initial silo statuses. The dictionary is stored as-is (not copied); when <see langword="null"/>,
/// a new empty dictionary is used.
/// </param>
public MockClusterMembershipService(Dictionary<SiloAddress, SiloStatus> initialStatuses = null)
{
    this.statuses = initialStatuses ?? new Dictionary<SiloAddress, SiloStatus>();
    this.snapshot = ToSnapshot(this.statuses, ++version);

    // A proposed snapshot is accepted only if its version is MinValue or greater than the previous one.
    this.updates = new AsyncEnumerable<ClusterMembershipSnapshot>(
        (previous, proposed) => proposed.Version == MembershipVersion.MinValue || proposed.Version > previous.Version,
        this.snapshot)
    {
        // Keep the current-snapshot field in step with whatever was most recently published.
        OnPublished = update => Interlocked.Exchange(ref this.snapshot, update)
    };
}
/// <summary>
/// Finds the successor of the local silo on the consistent hash ring formed by the active
/// members of <paramref name="snapshot"/>.
/// </summary>
/// <param name="snapshot">The cluster membership snapshot to search.</param>
/// <returns>
/// The previously recorded successor if it was recorded at a version greater than or equal to the
/// snapshot's version; otherwise the active silo (other than the local silo) with the smallest hash
/// code that is not less than the local silo's, falling back to the active silo with the smallest
/// hash code overall. Returns <see langword="null"/> when there are no other active silos.
/// </returns>
/// <remarks>
/// This method reads <c>_successor</c> but does not write it; updating that field is left to callers.
/// </remarks>
private SiloAddress FindSuccessor(ClusterMembershipSnapshot snapshot)
{
    var (successorVersion, successor) = _successor;
    if (successorVersion >= snapshot.Version)
    {
        return successor;
    }

    // Find the silo with the smallest hashcode which is larger than this silo's.
    // A null Silo (not the HashCode value) marks "nothing found yet", so that a member whose
    // hash code happens to equal int.MaxValue can still be selected.
    (SiloAddress Silo, int HashCode) firstInRing = (default(SiloAddress), int.MaxValue);
    (SiloAddress Silo, int HashCode) candidate = (default(SiloAddress), int.MaxValue);
    var localSiloHashCode = _localSilo.GetConsistentHashCode();
    foreach (var member in snapshot.Members.Values)
    {
        if (member.SiloAddress.Equals(_localSilo))
        {
            continue;
        }

        if (member.Status != SiloStatus.Active)
        {
            continue;
        }

        var memberHashCode = member.SiloAddress.GetConsistentHashCode();

        // It is possible that the local silo is last in the ring, therefore we also find the first silo in the ring,
        // which would be the local silo's successor in that case.
        if (firstInRing.Silo is null || memberHashCode < firstInRing.HashCode)
        {
            firstInRing = (member.SiloAddress, memberHashCode);
        }

        // This member comes before this silo in the ring, but is not the first in the ring.
        if (memberHashCode < localSiloHashCode)
        {
            continue;
        }

        // This member comes after this silo in the ring, but before the current candidate.
        // Therefore, this member is the new candidate.
        if (candidate.Silo is null || memberHashCode < candidate.HashCode)
        {
            candidate = (member.SiloAddress, memberHashCode);
        }
    }

    // The result is either the silo with the smallest hashcode that is larger than this silo's,
    // or the first silo in the ring, or null in the case that there are no other active silos.
    successor = candidate.Silo ?? firstInRing.Silo;
    return successor;
}
/// <summary>
/// Pulls membership table updates one at a time from an explicitly managed async enumerator,
/// converts each table snapshot into a cluster membership update, and passes it to
/// <c>NotifyObservers</c>.
/// </summary>
/// <remarks>
/// Exceptions which the fatal error handler reports as unexpected are logged and forwarded to
/// <c>OnFatalException</c>; all other exceptions propagate to the caller. The enumerator, if one
/// was obtained, is always disposed before the method completes.
/// </remarks>
private async Task ProcessMembershipUpdates()
{
    ClusterMembershipSnapshot lastSnapshot = default;
    IAsyncEnumerator<MembershipTableSnapshot> tableUpdates = default;
    try
    {
        if (this.log.IsEnabled(LogLevel.Debug))
        {
            this.log.LogDebug("Starting to process membership updates");
        }

        tableUpdates = this.membershipTableManager.MembershipTableUpdates.GetAsyncEnumerator(this.cancellation.Token);
        while (await tableUpdates.MoveNextAsync())
        {
            var current = tableUpdates.Current.CreateClusterMembershipSnapshot();

            // Without a prior snapshot, or when the version is MinValue, there is nothing to diff
            // against, so the whole snapshot is sent as the update.
            var sendWholeSnapshot = lastSnapshot is null || current.Version == MembershipVersion.MinValue;
            var change = sendWholeSnapshot ? current.AsUpdate() : current.CreateUpdate(lastSnapshot);

            this.NotifyObservers(change);
            lastSnapshot = current;
        }
    }
    catch (Exception error) when (this.fatalErrorHandler.IsUnexpected(error))
    {
        this.log.LogError("Error processing membership updates: {Exception}", error);
        this.fatalErrorHandler.OnFatalException(this, nameof(ProcessMembershipUpdates), error);
    }
    finally
    {
        // The enumerator is still unset if the failure happened before it was obtained.
        if (tableUpdates != null)
        {
            await tableUpdates.DisposeAsync();
        }

        if (this.log.IsEnabled(LogLevel.Debug))
        {
            this.log.LogDebug("Stopping membership update processor");
        }
    }
}
/// <summary>
/// Creates a directory membership snapshot whose ring consists of the active members of
/// <paramref name="clusterMembership"/>, ordered by <c>RingComparer</c>.
/// </summary>
/// <param name="log">The logger stored on this instance.</param>
/// <param name="siloAddress">The silo address stored on this instance.</param>
/// <param name="clusterMembership">The cluster membership snapshot the ring is built from.</param>
/// <exception cref="ArgumentNullException">Any argument is <see langword="null"/>.</exception>
public DirectoryMembershipSnapshot(
    ILogger log,
    SiloAddress siloAddress,
    ClusterMembershipSnapshot clusterMembership)
{
    if (log is null)
    {
        throw new ArgumentNullException(nameof(log));
    }

    this.log = log;

    if (siloAddress is null)
    {
        throw new ArgumentNullException(nameof(siloAddress));
    }

    this.siloAddress = siloAddress;

    if (clusterMembership is null)
    {
        throw new ArgumentNullException(nameof(clusterMembership));
    }

    this.ClusterMembership = clusterMembership;

    // Collect only the active silos, then order them to form the ring.
    var ringBuilder = ImmutableList.CreateBuilder<SiloAddress>();
    foreach (var pair in clusterMembership.Members)
    {
        var clusterMember = pair.Value;
        if (clusterMember.Status != SiloStatus.Active)
        {
            continue;
        }

        ringBuilder.Add(clusterMember.SiloAddress);
    }

    ringBuilder.Sort(RingComparer);
    this.ring = ringBuilder.ToImmutable();
}
/// <summary>
/// Probe loop for the monitored silo. On each timer tick the silo is probed either directly or
/// indirectly via a randomly chosen intermediary, and the result is passed to <c>_onProbeResult</c>
/// unless stopping has been requested.
/// </summary>
/// <remarks>
/// The loop ends when <c>_pingTimer.NextTick</c> returns <see langword="false"/>. Exceptions thrown
/// while probing or reporting are logged and the loop continues with the next tick.
/// </remarks>
private async Task Run()
{
    ClusterMembershipSnapshot activeMembersSnapshot = default;
    SiloAddress[] otherNodes = default;

    // The first tick uses a random delay bounded by the probe timeout.
    TimeSpan? overrideDelay = ThreadSafeRandom.NextTimeSpan(_clusterMembershipOptions.CurrentValue.ProbeTimeout);
    while (await _pingTimer.NextTick(overrideDelay))
    {
        ProbeResult probeResult;
        overrideDelay = default;

        try
        {
            // Discover the other active nodes in the cluster, if there are any.
            var membershipSnapshot = _membershipService.CurrentSnapshot;
            if (otherNodes is null || !object.ReferenceEquals(activeMembersSnapshot, membershipSnapshot))
            {
                activeMembersSnapshot = membershipSnapshot;
                otherNodes = membershipSnapshot.Members.Values
                    .Where(v => v.Status == SiloStatus.Active && v.SiloAddress != this.SiloAddress && v.SiloAddress != _localSiloDetails.SiloAddress)
                    .Select(s => s.SiloAddress)
                    .ToArray();
            }

            // Probe directly unless indirect probes are enabled, the failed-probe count has reached
            // (limit - 1), and there is at least one other node to act as an intermediary.
            var isDirectProbe = !_clusterMembershipOptions.CurrentValue.EnableIndirectProbes
                || _failedProbes < _clusterMembershipOptions.CurrentValue.NumMissedProbesLimit - 1
                || otherNodes.Length == 0;
            var timeout = GetTimeout(isDirectProbe);

            // Dispose the token source as soon as the probe has completed so that the timer backing
            // its timeout is released instead of lingering until the timeout elapses.
            using (var cancellation = new CancellationTokenSource(timeout))
            {
                if (isDirectProbe)
                {
                    // Probe the silo directly.
                    probeResult = await this.ProbeDirectly(cancellation.Token).ConfigureAwait(false);
                }
                else
                {
                    // Pick a random other node and probe the target indirectly, using the selected node as an intermediary.
                    var intermediary = otherNodes[ThreadSafeRandom.Next(otherNodes.Length)];

                    // Select a timeout which will allow the intermediary node to attempt to probe the target node and still respond to this node
                    // if the remote node does not respond in time.
                    // Attempt to account for local health degradation by extending the timeout period.
                    probeResult = await this.ProbeIndirectly(intermediary, timeout, cancellation.Token).ConfigureAwait(false);

                    // If the intermediary is not entirely healthy, remove it from consideration and continue to probe.
                    // Note that all recused silos will be included in the consideration set the next time cluster membership changes.
                    if (probeResult.Status != ProbeResultStatus.Succeeded && probeResult.IntermediaryHealthDegradationScore > 0)
                    {
                        _log.LogInformation("Recusing unhealthy intermediary {Intermediary} and trying again with remaining nodes", intermediary);
                        otherNodes = otherNodes.Where(node => !node.Equals(intermediary)).ToArray();
                        overrideDelay = TimeSpan.FromMilliseconds(250);
                    }
                }
            }

            if (!_stoppingCancellation.IsCancellationRequested)
            {
                await _onProbeResult(this, probeResult).ConfigureAwait(false);
            }
        }
        catch (Exception exception)
        {
            _log.LogError(exception, "Exception monitoring silo {SiloAddress}", SiloAddress);
        }
    }

    // Computes the probe timeout as the configured ProbeTimeout multiplied by (1 + extra units).
    TimeSpan GetTimeout(bool isDirectProbe)
    {
        var additionalTimeout = 0;

        if (_clusterMembershipOptions.CurrentValue.ExtendProbeTimeoutDuringDegradation)
        {
            // Attempt to account for local health degradation by extending the timeout period.
            var localDegradationScore = _localSiloHealthMonitor.GetLocalHealthDegradationScore(DateTime.UtcNow);
            additionalTimeout += localDegradationScore;
        }

        if (!isDirectProbe)
        {
            // Indirect probes need extra time to account for the additional hop.
            additionalTimeout += 1;
        }

        return _clusterMembershipOptions.CurrentValue.ProbeTimeout.Multiply(1 + additionalTimeout);
    }
}
/// <summary>
/// Reconciles the current cluster manifest with <paramref name="clusterMembership"/>: removes
/// entries for silos whose status is <c>None</c> or <c>Dead</c>, fetches manifests for active
/// silos which are not yet present, and publishes a new manifest if anything changed.
/// </summary>
/// <param name="clusterMembership">The cluster membership snapshot to reconcile against.</param>
/// <returns>
/// <see langword="true"/> if every manifest fetch succeeded and, when the manifest was modified,
/// the publish attempt also succeeded; otherwise <see langword="false"/>.
/// </returns>
private async Task<bool> UpdateManifest(ClusterMembershipSnapshot clusterMembership)
{
    var existingManifest = _current;
    var builder = existingManifest.Silos.ToBuilder();
    var modified = false;

    // First, remove defunct entries.
    foreach (var entry in existingManifest.Silos)
    {
        var address = entry.Key;
        var status = clusterMembership.GetSiloStatus(address);

        if (address.Equals(_localSiloAddress))
        {
            // The local silo is always present in the manifest.
            continue;
        }

        if (status == SiloStatus.None || status == SiloStatus.Dead)
        {
            builder.Remove(address);
            modified = true;
        }
    }

    // Next, fill missing entries.
    var tasks = new List<Task<(SiloAddress Key, GrainManifest Value, Exception Exception)>>();
    foreach (var entry in clusterMembership.Members)
    {
        var member = entry.Value;

        if (member.SiloAddress.Equals(_localSiloAddress))
        {
            // The local silo is always present in the manifest.
            continue;
        }

        if (existingManifest.Silos.ContainsKey(member.SiloAddress))
        {
            // Manifest has already been retrieved for the cluster member.
            continue;
        }

        if (member.Status != SiloStatus.Active)
        {
            // If the member is not yet active, it may not be ready to process requests.
            continue;
        }

        tasks.Add(GetManifest(member.SiloAddress));
    }

    var fetchSuccess = true;
    await Task.WhenAll(tasks);
    foreach (var task in tasks)
    {
        var result = await task;
        if (result.Exception is Exception exception)
        {
            fetchSuccess = false;
            _logger.LogWarning(exception, "Error retrieving silo manifest for silo {SiloAddress}", result.Key);
        }
        else
        {
            modified = true;
            builder[result.Key] = result.Value;
        }
    }

    // Regardless of success or failure, update the manifest if it has been modified.
    if (modified)
    {
        var version = new MajorMinorVersion(clusterMembership.Version.Value, existingManifest.Version.Minor + 1);
        return _updates.TryPublish(new ClusterManifest(version, builder.ToImmutable(), builder.Values.ToImmutableArray())) && fetchSuccess;
    }

    return fetchSuccess;

    // Fetches the manifest from the given silo. Failures are captured in the result tuple rather
    // than thrown, so that Task.WhenAll above never faults and each failure can be logged per silo.
    async Task<(SiloAddress, GrainManifest, Exception)> GetManifest(SiloAddress siloAddress)
    {
        try
        {
            // Get the manifest from the remote silo.
            var grainFactory = _services.GetRequiredService<IInternalGrainFactory>();
            var remoteManifestProvider = grainFactory.GetSystemTarget<ISiloManifestSystemTarget>(Constants.ManifestProviderType, siloAddress);
            var manifest = await remoteManifestProvider.GetSiloManifest();
            return (siloAddress, manifest, null);
        }
        catch (Exception exception)
        {
            return (siloAddress, null, exception);
        }
    }
}