Esempio n. 1
0
        /// <summary>
        /// Consumes membership table updates until the stream ends or is cancelled, converting each table
        /// snapshot into a cluster membership update and handing it to <c>NotifyObservers</c>.
        /// </summary>
        /// <remarks>
        /// Exceptions for which <c>fatalErrorHandler.IsUnexpected</c> returns <c>true</c> are logged and reported
        /// through <c>fatalErrorHandler.OnFatalException</c>; any other exception is not caught here and propagates
        /// to the caller.
        /// </remarks>
        /// <returns>A task which completes when update processing has stopped.</returns>
        private async Task ProcessMembershipUpdates()
        {
            ClusterMembershipSnapshot previous = default;

            try
            {
                if (this.log.IsEnabled(LogLevel.Debug))
                {
                    this.log.LogDebug("Starting to process membership updates");
                }

                await foreach (var tableSnapshot in this.membershipTableManager.MembershipTableUpdates.WithCancellation(this.cancellation.Token))
                {
                    var snapshot = tableSnapshot.CreateClusterMembershipSnapshot();

                    // The first snapshot, or one carrying MembershipVersion.MinValue, is published as a complete
                    // update; every other snapshot is published as a delta against the previous snapshot.
                    var update = (previous is null || snapshot.Version == MembershipVersion.MinValue)
                        ? snapshot.AsUpdate()
                        : snapshot.CreateUpdate(previous);
                    this.NotifyObservers(update);
                    previous = snapshot;
                }
            }
            catch (Exception exception) when (this.fatalErrorHandler.IsUnexpected(exception))
            {
                // Pass the exception through the dedicated exception parameter so that logging providers receive
                // the exception object itself rather than only its string form inside the message.
                this.log.LogError(exception, "Error processing membership updates");
                this.fatalErrorHandler.OnFatalException(this, nameof(ProcessMembershipUpdates), exception);
            }
            finally
            {
                if (this.log.IsEnabled(LogLevel.Debug))
                {
                    this.log.LogDebug("Stopping membership update processor");
                }
            }
        }
 /// <summary>
 /// Creates the mock membership service, seeding it with the provided silo statuses.
 /// </summary>
 /// <param name="initialStatuses">
 /// The initial status of each silo. When <c>null</c>, an empty dictionary is used.
 /// </param>
 public MockClusterMembershipService(Dictionary<SiloAddress, SiloStatus> initialStatuses = null)
 {
     this.statuses = initialStatuses ?? new Dictionary<SiloAddress, SiloStatus>();

     // The initial snapshot is built from the statuses using the incremented version counter.
     this.snapshot = ToSnapshot(this.statuses, ++version);

     // The predicate accepts a proposed snapshot when its version is MinValue or is greater than the previous
     // snapshot's version. NOTE(review): presumably AsyncEnumerable uses it to decide whether to publish - confirm.
     this.updates = new AsyncEnumerable<ClusterMembershipSnapshot>(
         (previous, proposed) => proposed.Version == MembershipVersion.MinValue || proposed.Version > previous.Version,
         this.snapshot)
     {
         // Keep the current snapshot field in step with whatever was last published.
         OnPublished = update => Interlocked.Exchange(ref this.snapshot, update)
     };
 }
Esempio n. 3
0
        /// <summary>
        /// Finds the active silo which follows the local silo on the consistent hash ring.
        /// </summary>
        /// <param name="snapshot">The cluster membership snapshot to search.</param>
        /// <returns>
        /// The cached successor when the cached version is at least <paramref name="snapshot"/>'s version;
        /// otherwise the active silo with the smallest hash code that is not below the local silo's hash code,
        /// falling back to the active silo with the smallest hash code overall, or <c>null</c> when no other
        /// active silo was selected.
        /// </returns>
        private SiloAddress FindSuccessor(ClusterMembershipSnapshot snapshot)
        {
            var (cachedVersion, result) = _successor;
            if (cachedVersion >= snapshot.Version)
            {
                // The cached value was computed against this snapshot or a newer one.
                return result;
            }

            var localHash = _localSilo.GetConsistentHashCode();

            // Lowest hash code among the other active silos: the wrap-around choice when the local silo is last in the ring.
            SiloAddress lowestSilo = null;
            int lowestHash = int.MaxValue;

            // Lowest hash code which is not below the local silo's hash code: the direct successor.
            SiloAddress nextSilo = null;
            int nextHash = int.MaxValue;

            foreach (var member in snapshot.Members.Values)
            {
                var address = member.SiloAddress;

                // Only other, active silos are eligible.
                if (address.Equals(_localSilo) || member.Status != SiloStatus.Active)
                {
                    continue;
                }

                var hash = address.GetConsistentHashCode();

                if (hash < lowestHash)
                {
                    lowestSilo = address;
                    lowestHash = hash;
                }

                // Silos positioned before the local silo are never direct successors.
                if (hash >= localHash && hash < nextHash)
                {
                    nextSilo = address;
                    nextHash = hash;
                }
            }

            // Prefer the direct successor; otherwise wrap around to the start of the ring.
            result = nextSilo ?? lowestSilo;
            return result;
        }
        /// <summary>
        /// Consumes membership table updates until the stream ends or is cancelled, converting each table
        /// snapshot into a cluster membership update and handing it to <c>NotifyObservers</c>.
        /// </summary>
        /// <remarks>
        /// The enumerator is driven manually and disposed in the <c>finally</c> block. Exceptions for which
        /// <c>fatalErrorHandler.IsUnexpected</c> returns <c>true</c> are logged and reported through
        /// <c>fatalErrorHandler.OnFatalException</c>; any other exception is not caught here and propagates
        /// to the caller.
        /// </remarks>
        /// <returns>A task which completes when update processing has stopped.</returns>
        private async Task ProcessMembershipUpdates()
        {
            ClusterMembershipSnapshot previous = default;
            IAsyncEnumerator<MembershipTableSnapshot> enumerator = default;

            try
            {
                if (this.log.IsEnabled(LogLevel.Debug))
                {
                    this.log.LogDebug("Starting to process membership updates");
                }

                enumerator = this.membershipTableManager.MembershipTableUpdates.GetAsyncEnumerator(this.cancellation.Token);
                while (await enumerator.MoveNextAsync())
                {
                    var snapshot = enumerator.Current.CreateClusterMembershipSnapshot();

                    // The first snapshot, or one carrying MembershipVersion.MinValue, is published as a complete
                    // update; every other snapshot is published as a delta against the previous snapshot.
                    var update = (previous is null || snapshot.Version == MembershipVersion.MinValue)
                        ? snapshot.AsUpdate()
                        : snapshot.CreateUpdate(previous);
                    this.NotifyObservers(update);
                    previous = snapshot;
                }
            }
            catch (Exception exception) when (this.fatalErrorHandler.IsUnexpected(exception))
            {
                // Pass the exception through the dedicated exception parameter so that logging providers receive
                // the exception object itself rather than only its string form inside the message.
                this.log.LogError(exception, "Error processing membership updates");
                this.fatalErrorHandler.OnFatalException(this, nameof(ProcessMembershipUpdates), exception);
            }
            finally
            {
                // The enumerator is still null if obtaining it threw, so only dispose it when it was created.
                if (enumerator is object)
                {
                    await enumerator.DisposeAsync();
                }

                if (this.log.IsEnabled(LogLevel.Debug))
                {
                    this.log.LogDebug("Stopping membership update processor");
                }
            }
        }
        /// <summary>
        /// Creates a directory view over a cluster membership snapshot, building a ring of the active silos
        /// ordered by <c>RingComparer</c>.
        /// </summary>
        /// <param name="log">The logger.</param>
        /// <param name="siloAddress">The silo address stored on this instance.</param>
        /// <param name="clusterMembership">The cluster membership snapshot which the ring is built from.</param>
        /// <exception cref="ArgumentNullException">Any argument is <c>null</c>.</exception>
        public DirectoryMembershipSnapshot(
            ILogger log,
            SiloAddress siloAddress,
            ClusterMembershipSnapshot clusterMembership)
        {
            if (log is null)
            {
                throw new ArgumentNullException(nameof(log));
            }

            if (siloAddress is null)
            {
                throw new ArgumentNullException(nameof(siloAddress));
            }

            if (clusterMembership is null)
            {
                throw new ArgumentNullException(nameof(clusterMembership));
            }

            this.log = log;
            this.siloAddress = siloAddress;
            this.ClusterMembership = clusterMembership;

            // Only silos whose status is Active take part in the ring.
            var ringBuilder = ImmutableList.CreateBuilder<SiloAddress>();
            foreach (var member in clusterMembership.Members.Values)
            {
                if (member.Status != SiloStatus.Active)
                {
                    continue;
                }

                ringBuilder.Add(member.SiloAddress);
            }

            ringBuilder.Sort(RingComparer);
            this.ring = ringBuilder.ToImmutable();
        }
Esempio n. 6
0
        /// <summary>
        /// Runs the probe loop for the monitored silo: on every timer tick the silo is probed, either directly
        /// or indirectly through a randomly selected intermediary, and the result is passed to <c>_onProbeResult</c>.
        /// </summary>
        /// <remarks>
        /// Exceptions thrown while probing or reporting are logged and do not terminate the loop. The loop ends
        /// when <c>_pingTimer.NextTick</c> returns <c>false</c>.
        /// </remarks>
        /// <returns>A task which completes when the loop ends.</returns>
        private async Task Run()
        {
            ClusterMembershipSnapshot activeMembersSnapshot = default;

            SiloAddress[] otherNodes = default;

            // The first tick uses a delay derived from ThreadSafeRandom.NextTimeSpan(ProbeTimeout).
            // NOTE(review): presumably a random jitter bounded by ProbeTimeout - confirm against ThreadSafeRandom.
            TimeSpan? overrideDelay = ThreadSafeRandom.NextTimeSpan(_clusterMembershipOptions.CurrentValue.ProbeTimeout);

            while (await _pingTimer.NextTick(overrideDelay))
            {
                ProbeResult probeResult;
                overrideDelay = default;

                try
                {
                    // Discover the other active nodes in the cluster, if there are any.
                    // The list is rebuilt only when the membership snapshot instance has changed.
                    var membershipSnapshot = _membershipService.CurrentSnapshot;
                    if (otherNodes is null || !object.ReferenceEquals(activeMembersSnapshot, membershipSnapshot))
                    {
                        activeMembersSnapshot = membershipSnapshot;

                        // NOTE(review): this filter compares addresses with '!=' while the recusal below uses
                        // Equals - confirm that SiloAddress's '!=' has the intended equality semantics.
                        otherNodes = membershipSnapshot.Members.Values
                            .Where(v => v.Status == SiloStatus.Active && v.SiloAddress != this.SiloAddress && v.SiloAddress != _localSiloDetails.SiloAddress)
                            .Select(s => s.SiloAddress)
                            .ToArray();
                    }

                    // Probe directly when indirect probes are disabled, when the failed probe count is still below
                    // NumMissedProbesLimit - 1, or when there is no other node to act as an intermediary.
                    var isDirectProbe = !_clusterMembershipOptions.CurrentValue.EnableIndirectProbes
                        || _failedProbes < _clusterMembershipOptions.CurrentValue.NumMissedProbesLimit - 1
                        || otherNodes.Length == 0;
                    var timeout = GetTimeout(isDirectProbe);

                    // Dispose the timeout source as soon as the probe has completed so that its underlying timer
                    // is released immediately instead of lingering until the timeout elapses.
                    using (var cancellation = new CancellationTokenSource(timeout))
                    {
                        if (isDirectProbe)
                        {
                            // Probe the silo directly.
                            probeResult = await this.ProbeDirectly(cancellation.Token).ConfigureAwait(false);
                        }
                        else
                        {
                            // Pick a random other node and probe the target indirectly, using the selected node as an intermediary.
                            var intermediary = otherNodes[ThreadSafeRandom.Next(otherNodes.Length)];

                            // The timeout (see GetTimeout) allows the intermediary node to attempt to probe the target node
                            // and still respond to this node if the remote node does not respond in time.
                            probeResult = await this.ProbeIndirectly(intermediary, timeout, cancellation.Token).ConfigureAwait(false);

                            // If the intermediary is not entirely healthy, remove it from consideration and continue to probe.
                            // Note that all recused silos will be included in the consideration set the next time cluster membership changes.
                            if (probeResult.Status != ProbeResultStatus.Succeeded && probeResult.IntermediaryHealthDegradationScore > 0)
                            {
                                _log.LogInformation("Recusing unhealthy intermediary {Intermediary} and trying again with remaining nodes", intermediary);
                                otherNodes = otherNodes.Where(node => !node.Equals(intermediary)).ToArray();
                                overrideDelay = TimeSpan.FromMilliseconds(250);
                            }
                        }
                    }

                    if (!_stoppingCancellation.IsCancellationRequested)
                    {
                        await _onProbeResult(this, probeResult).ConfigureAwait(false);
                    }
                }
                catch (Exception exception)
                {
                    _log.LogError(exception, "Exception monitoring silo {SiloAddress}", SiloAddress);
                }
            }

            // Computes the probe timeout: ProbeTimeout multiplied by (1 + additional units), where the additional
            // units are the local health degradation score (when enabled) plus one for an indirect probe.
            TimeSpan GetTimeout(bool isDirectProbe)
            {
                var additionalTimeout = 0;

                if (_clusterMembershipOptions.CurrentValue.ExtendProbeTimeoutDuringDegradation)
                {
                    // Attempt to account for local health degradation by extending the timeout period.
                    var localDegradationScore = _localSiloHealthMonitor.GetLocalHealthDegradationScore(DateTime.UtcNow);
                    additionalTimeout += localDegradationScore;
                }

                if (!isDirectProbe)
                {
                    // Indirect probes need extra time to account for the additional hop.
                    additionalTimeout += 1;
                }

                return _clusterMembershipOptions.CurrentValue.ProbeTimeout.Multiply(1 + additionalTimeout);
            }
        }
Esempio n. 7
0
        /// <summary>
        /// Brings the cluster manifest in line with the provided membership snapshot: entries for silos which are
        /// dead or unknown are removed, and manifests are fetched for active silos which are not yet present.
        /// </summary>
        /// <param name="clusterMembership">The cluster membership snapshot to reconcile against.</param>
        /// <returns>
        /// <c>true</c> when every manifest fetch succeeded and, if the manifest was modified, the updated manifest
        /// was published successfully; otherwise <c>false</c>.
        /// </returns>
        private async Task<bool> UpdateManifest(ClusterMembershipSnapshot clusterMembership)
        {
            var existingManifest = _current;
            var builder = existingManifest.Silos.ToBuilder();
            var modified = false;

            // First, remove defunct entries.
            foreach (var entry in existingManifest.Silos)
            {
                var address = entry.Key;
                var status = clusterMembership.GetSiloStatus(address);

                if (address.Equals(_localSiloAddress))
                {
                    // The local silo is always present in the manifest.
                    continue;
                }

                if (status == SiloStatus.None || status == SiloStatus.Dead)
                {
                    builder.Remove(address);
                    modified = true;
                }
            }

            // Next, fill missing entries. All fetches are started before any of them is awaited.
            var tasks = new List<Task<(SiloAddress Key, GrainManifest Value, Exception Exception)>>();

            foreach (var entry in clusterMembership.Members)
            {
                var member = entry.Value;

                if (member.SiloAddress.Equals(_localSiloAddress))
                {
                    // The local silo is always present in the manifest.
                    continue;
                }

                if (existingManifest.Silos.ContainsKey(member.SiloAddress))
                {
                    // Manifest has already been retrieved for the cluster member.
                    continue;
                }

                if (member.Status != SiloStatus.Active)
                {
                    // If the member is not yet active, it may not be ready to process requests.
                    continue;
                }

                tasks.Add(GetManifest(member.SiloAddress));
            }

            var fetchSuccess = true;
            await Task.WhenAll(tasks);

            foreach (var task in tasks)
            {
                var result = await task;
                if (result.Exception is Exception exception)
                {
                    fetchSuccess = false;
                    _logger.LogWarning(exception, "Error retrieving silo manifest for silo {SiloAddress}", result.Key);
                }
                else
                {
                    modified = true;
                    builder[result.Key] = result.Value;
                }
            }

            // Regardless of success or failure, update the manifest if it has been modified.
            if (modified)
            {
                var version = new MajorMinorVersion(clusterMembership.Version.Value, existingManifest.Version.Minor + 1);
                return _updates.TryPublish(new ClusterManifest(version, builder.ToImmutable(), builder.Values.ToImmutableArray())) && fetchSuccess;
            }

            return fetchSuccess;

            // Fetches the manifest from the given silo. Failures are returned as part of the result rather than
            // thrown, so that one failing silo does not fault the combined wait above.
            async Task<(SiloAddress, GrainManifest, Exception)> GetManifest(SiloAddress siloAddress)
            {
                try
                {
                    // Get the manifest from the remote silo.
                    var grainFactory = _services.GetRequiredService<IInternalGrainFactory>();
                    var remoteManifestProvider = grainFactory.GetSystemTarget<ISiloManifestSystemTarget>(Constants.ManifestProviderType, siloAddress);
                    var manifest = await remoteManifestProvider.GetSiloManifest();

                    return (siloAddress, manifest, null);
                }
                catch (Exception exception)
                {
                    return (siloAddress, null, exception);
                }
            }
        }