Beispiel #1
0
        private void InitializeInternal(IQueueAdapter qAdapter, IQueueAdapterCache queueAdapterCache, IStreamFailureHandler failureHandler)
        {
            logger.Info(ErrorCode.PersistentStreamPullingAgent_02, "Init of {0} {1} on silo {2} for queue {3}.",
                        GetType().Name, ((ISystemTargetBase)this).GrainId.ToDetailedString(), Silo, QueueId.ToStringWithHashCode());

            // Remove cast once we cleanup
            queueAdapter               = qAdapter;
            streamFailureHandler       = failureHandler;
            lastTimeCleanedPubSubCache = DateTime.UtcNow;

            try
            {
                receiver = queueAdapter.CreateReceiver(QueueId);
            }
            catch (Exception exc)
            {
                logger.Error(ErrorCode.PersistentStreamPullingAgent_02, "Exception while calling IQueueAdapter.CreateNewReceiver.", exc);
                throw;
            }

            try
            {
                if (queueAdapterCache != null)
                {
                    queueCache = queueAdapterCache.CreateQueueCache(QueueId);
                }
            }
            catch (Exception exc)
            {
                logger.Error(ErrorCode.PersistentStreamPullingAgent_23, "Exception while calling IQueueAdapterCache.CreateQueueCache.", exc);
                throw;
            }

            try
            {
                receiverInitTask = OrleansTaskExtentions.SafeExecute(() => receiver.Initialize(config.InitQueueTimeout))
                                   .LogException(logger, ErrorCode.PersistentStreamPullingAgent_03, $"QueueAdapterReceiver {QueueId.ToStringWithHashCode()} failed to Initialize.");
                receiverInitTask.Ignore();
            }
            catch
            {
                // Just ignore this exception and proceed as if Initialize has succeeded.
                // We already logged individual exceptions for individual calls to Initialize. No need to log again.
            }
            // Setup a reader for a new receiver.
            // Even if the receiver failed to initialise, treat it as OK and start pumping it. It's receiver responsibility to retry initialization.
            var randomTimerOffset = safeRandom.NextTimeSpan(config.GetQueueMsgsTimerPeriod);

            timer = RegisterTimer(AsyncTimerCallback, QueueId, randomTimerOffset, config.GetQueueMsgsTimerPeriod);

            IntValueStatistic.FindOrCreate(new StatisticName(StatisticNames.STREAMS_PERSISTENT_STREAM_PUBSUB_CACHE_SIZE, StatisticUniquePostfix), () => pubSubCache.Count);

            logger.Info((int)ErrorCode.PersistentStreamPullingAgent_04, "Taking queue {0} under my responsibility.", QueueId.ToStringWithHashCode());
        }
        private async Task PeriodicallyRefreshMembershipTable()
        {
            if (this.log.IsEnabled(LogLevel.Debug))
            {
                this.log.LogDebug("Starting periodic membership table refreshes");
            }
            try
            {
                var targetMilliseconds = (int)this.clusterMembershipOptions.TableRefreshTimeout.TotalMilliseconds;
                var random             = new SafeRandom();

                TimeSpan?onceOffDelay = random.NextTimeSpan(this.clusterMembershipOptions.TableRefreshTimeout);
                while (await this.membershipUpdateTimer.NextTick(onceOffDelay))
                {
                    onceOffDelay = default;

                    try
                    {
                        var stopwatch = ValueStopwatch.StartNew();
                        await this.Refresh();

                        if (this.log.IsEnabled(LogLevel.Trace))
                        {
                            this.log.LogTrace("Refreshing membership table took {Elapsed}", stopwatch.Elapsed);
                        }
                    }
                    catch (Exception exception)
                    {
                        this.log.LogError(
                            (int)ErrorCode.MembershipUpdateIAmAliveFailure,
                            "Failed to refresh membership table, will retry shortly: {Exception}",
                            exception);

                        // Retry quickly
                        onceOffDelay = TimeSpan.FromMilliseconds(200);
                    }
                }
            }
            catch (Exception exception) when(this.fatalErrorHandler.IsUnexpected(exception))
            {
                this.log.LogError("Error refreshing membership table: {Exception}", exception);
                this.fatalErrorHandler.OnFatalException(this, nameof(PeriodicallyRefreshMembershipTable), exception);
            }
            finally
            {
                if (this.log.IsEnabled(LogLevel.Debug))
                {
                    this.log.LogDebug("Stopping periodic membership table refreshes");
                }
            }
        }
Beispiel #3
0
        public static TimeSpan NextTimeSpan(this SafeRandom random, TimeSpan minValue, TimeSpan maxValue)
        {
            if (minValue <= TimeSpan.Zero)
            {
                throw new ArgumentOutOfRangeException(nameof(minValue), minValue, "SafeRandom.NextTimeSpan minValue must be a positive number.");
            }
            if (minValue >= maxValue)
            {
                throw new ArgumentOutOfRangeException(nameof(minValue), minValue, "SafeRandom.NextTimeSpan minValue must be greater than maxValue.");
            }
            TimeSpan span = maxValue - minValue;

            return(minValue + random.NextTimeSpan(span));
        }
Beispiel #4
0
        protected override async Task StartInBackground()
        {
            await DoInitialReadAndUpdateReminders();

            if (Status == GrainServiceStatus.Booting)
            {
                var random = new SafeRandom();
                listRefreshTimer = GrainTimer.FromTaskCallback(
                    _ => DoInitialReadAndUpdateReminders(),
                    null,
                    random.NextTimeSpan(InitialReadRetryPeriod),
                    InitialReadRetryPeriod,
                    name: "ReminderService.ReminderListInitialRead");
                listRefreshTimer.Start();
            }
        }
        /// <summary>
        /// Take responsibility for a new queues that was assigned to me via a new range.
        /// We first store the new queue in our internal data structure, try to initialize it and start a pumping timer.
        /// ERROR HANDLING:
        ///     The resposibility to handle initializatoion and shutdown failures is inside the INewQueueAdapterReceiver code.
        ///     The agent will call Initialize once and log an error. It will not call initiliaze again.
        ///     The receiver itself may attempt later to recover from this error and do initialization again.
        ///     The agent will assume initialization has succeeded and will subsequently start calling pumping receive.
        ///     Same applies to shutdown.
        /// </summary>
        /// <param name="qAdapter"></param>
        /// <returns></returns>
        public async Task Initialize(Immutable <IQueueAdapter> qAdapter)
        {
            if (qAdapter.Value == null)
            {
                throw new ArgumentNullException("qAdapter", "Init: queueAdapter should not be null");
            }

            logger.Info((int)ErrorCode.PersistentStreamPullingAgent_02, "Init of {0} {1} on silo {2} for queue {3}.",
                        this.GetType().Name, this.GrainId.ToDetailedString(), base.Silo, QueueId.ToStringWithHashCode());

            // Remove cast once we cleanup
            queueAdapter = qAdapter.Value;

            try
            {
                receiver = queueAdapter.CreateReceiver(QueueId);
            }
            catch (Exception exc)
            {
                logger.Error((int)ErrorCode.PersistentStreamPullingAgent_02, String.Format("Exception while calling INewQueueAdapter.CreateNewReceiver."), exc);
                return;
            }

            try
            {
                var task = OrleansTaskExtentions.SafeExecute(() => receiver.Initialize(initQueueTimeout));
                task = task.LogException(logger, ErrorCode.PersistentStreamPullingAgent_03, String.Format("QueueAdapterReceiver {0} failed to Initialize.", QueueId.ToStringWithHashCode()));
                await task;
            }
            catch (Exception)
            {
                // Just ignore this exception and proceed as if Initialize has succeeded.
                // We already logged individual exceptions for individual calls to Initialize. No need to log again.
            }
            // Setup a reader for a new receiver.
            // Even if the receiver failed to initialise, treat it as OK and start pumping it. It's receiver responsibility to retry initialization.
            var randomTimerOffset = safeRandom.NextTimeSpan(queueGetPeriod);

            timer = providerRuntime.RegisterTimer(AsyncTimerCallback, QueueId, randomTimerOffset, queueGetPeriod);

            logger.Info((int)ErrorCode.PersistentStreamPullingAgent_04, "Taking queue {0} under my responsibility.", QueueId.ToStringWithHashCode());
        }
Beispiel #6
0
        private async Task Start()
        {
            try
            {
                this.log.LogInformation(
                    (int)ErrorCode.MembershipStarting,
                    "MembershipOracle starting on host {HostName} with SiloAddress {SiloAddress} at {StartTime}",
                    this.localSiloDetails.DnsHostName,
                    this.myAddress,
                    LogFormatter.PrintDate(this.siloStartTime));

                // Init the membership table.
                await this.membershipTableProvider.InitializeMembershipTable(true);

                if (this.clusterMembershipOptions.ExpectedClusterSize > 1)
                {
                    // randomly delay the startup, so not all silos write to the table at once.
                    // Use random time not larger than MaxJoinAttemptTime, one minute and 0.5sec*ExpectedClusterSize;
                    // Skip waiting if we expect only one member for the cluster.
                    var random   = new SafeRandom();
                    var maxDelay = TimeSpan.FromMilliseconds(500).Multiply(this.clusterMembershipOptions.ExpectedClusterSize);
                    maxDelay = StandardExtensions.Min(maxDelay, StandardExtensions.Min(this.clusterMembershipOptions.MaxJoinAttemptTime, TimeSpan.FromMinutes(1)));
                    var randomDelay = random.NextTimeSpan(maxDelay);
                    await Task.Delay(randomDelay);
                }

                var table = await this.RefreshInternal();

                LogMissedIAmAlives(table);

                // read the table and look for my node migration occurrences
                DetectNodeMigration(table, this.localSiloDetails.DnsHostName);
            }
            catch (Exception exception)
            {
                this.log.LogError((int)ErrorCode.MembershipFailedToStart, "Membership failed to start: {Exception}", exception);
                throw;
            }
        }
Beispiel #7
0
        private async Task Run()
        {
            var random = new SafeRandom();
            ClusterMembershipSnapshot activeMembersSnapshot = default;

            SiloAddress[] otherNodes    = default;
            TimeSpan?     overrideDelay = random.NextTimeSpan(_clusterMembershipOptions.CurrentValue.ProbeTimeout);

            while (await _pingTimer.NextTick(overrideDelay))
            {
                ProbeResult probeResult;
                overrideDelay = default;

                try
                {
                    // Discover the other active nodes in the cluster, if there are any.
                    var membershipSnapshot = _membershipService.CurrentSnapshot;
                    if (otherNodes is null || !object.ReferenceEquals(activeMembersSnapshot, membershipSnapshot))
                    {
                        activeMembersSnapshot = membershipSnapshot;
                        otherNodes            = membershipSnapshot.Members.Values
                                                .Where(v => v.Status == SiloStatus.Active && v.SiloAddress != this.SiloAddress && v.SiloAddress != _localSiloDetails.SiloAddress)
                                                .Select(s => s.SiloAddress)
                                                .ToArray();
                    }

                    var isDirectProbe = !_clusterMembershipOptions.CurrentValue.EnableIndirectProbes || _failedProbes < _clusterMembershipOptions.CurrentValue.NumMissedProbesLimit - 1 || otherNodes.Length == 0;
                    var timeout       = GetTimeout(isDirectProbe);
                    var cancellation  = new CancellationTokenSource(timeout);

                    if (isDirectProbe)
                    {
                        // Probe the silo directly.
                        probeResult = await this.ProbeDirectly(cancellation.Token).ConfigureAwait(false);
                    }
                    else
                    {
                        // Pick a random other node and probe the target indirectly, using the selected node as an intermediary.
                        var intermediary = otherNodes[random.Next(0, otherNodes.Length - 1)];

                        // Select a timeout which will allow the intermediary node to attempt to probe the target node and still respond to this node
                        // if the remote node does not respond in time.
                        // Attempt to account for local health degradation by extending the timeout period.
                        probeResult = await this.ProbeIndirectly(intermediary, timeout, cancellation.Token).ConfigureAwait(false);

                        // If the intermediary is not entirely healthy, remove it from consideration and continue to probe.
                        // Note that all recused silos will be included in the consideration set the next time cluster membership changes.
                        if (probeResult.Status != ProbeResultStatus.Succeeded && probeResult.IntermediaryHealthDegradationScore > 0)
                        {
                            _log.LogInformation("Recusing unhealthy intermediary {Intermediary} and trying again with remaining nodes", intermediary);
                            otherNodes    = otherNodes.Where(node => !node.Equals(intermediary)).ToArray();
                            overrideDelay = TimeSpan.FromMilliseconds(250);
                        }
                    }

                    if (!_stoppingCancellation.IsCancellationRequested)
                    {
                        await _onProbeResult(this, probeResult).ConfigureAwait(false);
                    }
                }
                catch (Exception exception)
                {
                    _log.LogError(exception, "Exception monitoring silo {SiloAddress}", SiloAddress);
                }
            }

            TimeSpan GetTimeout(bool isDirectProbe)
            {
                var additionalTimeout = 0;

                if (_clusterMembershipOptions.CurrentValue.ExtendProbeTimeoutDuringDegradation)
                {
                    // Attempt to account for local health degradation by extending the timeout period.
                    var localDegradationScore = _localSiloHealthMonitor.GetLocalHealthDegradationScore(DateTime.UtcNow);
                    additionalTimeout += localDegradationScore;
                }

                if (!isDirectProbe)
                {
                    // Indirect probes need extra time to account for the additional hop.
                    additionalTimeout += 1;
                }

                return(_clusterMembershipOptions.CurrentValue.ProbeTimeout.Multiply(1 + additionalTimeout));
            }
        }