/// <summary>
/// Kicks off receiver initialization without waiting for it, registers the periodic
/// queue-pumping timer with a randomized first due time, and registers the pub-sub
/// cache size statistic for this agent.
/// </summary>
private void InitializeInternal()
        {
            logger.Info(
                ErrorCode.PersistentStreamPullingAgent_02,
                "Init of {0} {1} on silo {2} for queue {3}.",
                GetType().Name,
                ((ISystemTargetBase)this).GrainId.ToString(),
                Silo,
                QueueId.ToStringWithHashCode());

            lastTimeCleanedPubSubCache = DateTime.UtcNow;

            try
            {
                // Start initialization and attach failure logging; the resulting task is stored, not awaited here.
                var initAttempt = OrleansTaskExtentions.SafeExecute(() => receiver.Initialize(this.options.InitQueueTimeout));
                receiverInitTask = initAttempt.LogException(
                    logger,
                    ErrorCode.PersistentStreamPullingAgent_03,
                    $"QueueAdapterReceiver {QueueId.ToStringWithHashCode()} failed to Initialize.");
                receiverInitTask.Ignore();
            }
            catch
            {
                // Deliberately swallowed: proceed as though Initialize succeeded.
                // Failures of individual Initialize calls are already logged above, so nothing is logged again here.
            }

            // Start pumping the receiver regardless of the initialization outcome;
            // retrying initialization is the receiver's own responsibility.
            var pumpPeriod = this.options.GetQueueMsgsTimerPeriod;
            var firstTickDelay = ThreadSafeRandom.NextTimeSpan(pumpPeriod);
            timer = RegisterTimer(AsyncTimerCallback, QueueId, firstTickDelay, pumpPeriod);

            var cacheSizeStatisticName = new StatisticName(StatisticNames.STREAMS_PERSISTENT_STREAM_PUBSUB_CACHE_SIZE, StatisticUniquePostfix);
            IntValueStatistic.FindOrCreate(cacheSizeStatisticName, () => pubSubCache.Count);

            logger.Info(
                (int)ErrorCode.PersistentStreamPullingAgent_04,
                "Taking queue {0} under my responsibility.",
                QueueId.ToStringWithHashCode());
        }
Пример #2
0
        /// <summary>
        /// Runs one simulated operation and asserts at three checkpoints that it is the only
        /// operation in progress: on entry, after the first delay and after the second delay.
        /// </summary>
        /// <param name="opNumber">Identifier of this operation, used in output and failure messages.</param>
        private async Task Operation(int opNumber)
        {
            // Checkpoint 1: nothing else may be running when this operation begins.
            Assert.True(operationsInProgress <= 0, $"1: Operation {opNumber} found {operationsInProgress} operationsInProgress.");

            operationsInProgress++;

            // The same randomly chosen pause is used for both awaits below.
            var pause = ThreadSafeRandom.NextTimeSpan(TimeSpan.FromSeconds(2));

            output.WriteLine("Task {0} Staring", opNumber);
            await Task.Delay(pause);

            // Checkpoint 2: still the sole operation after yielding once.
            Assert.True(operationsInProgress == 1, $"2: Operation {opNumber} found {operationsInProgress} operationsInProgress.");

            output.WriteLine("Task {0} after first delay", opNumber);
            await Task.Delay(pause);

            // Checkpoint 3: still the sole operation after yielding twice.
            Assert.True(operationsInProgress == 1, $"3: Operation {opNumber} found {operationsInProgress} operationsInProgress.");

            operationsInProgress--;
            output.WriteLine("Task {0} Done", opNumber);
        }
Пример #3
0
        /// <summary>
        /// Starts <paramref name="numItems"/> interleaved asynchronous writers that populate a shared
        /// dictionary (even ids only), then starts one asynchronous reader per stored entry, and finally
        /// collects the stored values in ascending id order.
        /// </summary>
        /// <param name="numItems">Number of writer tasks to start; ids run from 0 to numItems - 1.</param>
        public async Task InterleavingConsistencyTest(int numItems)
        {
            var maxDelay = TimeSpan.FromMilliseconds(1);
            var writerTasks = new List<Task>(numItems * 2);
            var metadataById = new Dictionary<int, string>(numItems * 2);

            // Writer: after a random pause, store the id's string form when the id is even.
            async Task StoreIfEven(int id)
            {
                await Task.Delay(ThreadSafeRandom.NextTimeSpan(maxDelay));
                if (id % 2 == 0)
                {
                    metadataById.Add(id, id.ToString());
                }
            }

            // Reader: after a random pause, look the id up again (throws if the entry is missing).
            async Task ReadBack(int id)
            {
                await Task.Delay(ThreadSafeRandom.NextTimeSpan(maxDelay));
                _ = metadataById[id];
            }

            for (var id = 0; id < numItems; id++)
            {
                writerTasks.Add(StoreIfEven(id));
            }

            await Task.WhenAll(writerTasks.ToArray());

            var readerTasks = new List<Task>(metadataById.Count);
            foreach (var storedId in metadataById.Keys)
            {
                readerTasks.Add(ReadBack(storedId));
            }

            await Task.WhenAll(readerTasks);

            // Gather the stored values ordered by id.
            var orderedMetadata = new List<string>(metadataById.Count);
            for (var id = 0; id < numItems; id++)
            {
                if (metadataById.TryGetValue(id, out var metadata))
                {
                    orderedMetadata.Add(metadata);
                }
            }

            if (numItems != orderedMetadata.Count)
            {
                // Intentionally a no-op: the count check that used to throw here is disabled.
                //throw new OrleansException(String.Format("numItems != results.Count, {0} != {1}", numItems, results.Count));
            }
        }
Пример #4
0
        /// <summary>
        /// Background loop that periodically asks the membership table provider to remove entries
        /// older than the configured expiration. Does nothing when no cleanup period is configured,
        /// and stops permanently if the provider does not support the cleanup operation.
        /// </summary>
        private async Task CleanupDefunctSilos()
        {
            if (this.clusterMembershipOptions.DefunctSiloCleanupPeriod is null)
            {
                if (this.log.IsEnabled(LogLevel.Debug))
                {
                    this.log.LogDebug($"Membership table cleanup is disabled due to {nameof(ClusterMembershipOptions)}.{nameof(ClusterMembershipOptions.DefunctSiloCleanupPeriod)} not being specified");
                }

                return;
            }

            if (this.log.IsEnabled(LogLevel.Debug))
            {
                this.log.LogDebug("Starting membership table cleanup agent");
            }

            try
            {
                var cleanupPeriod = this.clusterMembershipOptions.DefunctSiloCleanupPeriod.Value;

                // Schedule the first cleanup for shortly after silo startup.
                var nextDelay = ThreadSafeRandom.NextTimeSpan(TimeSpan.FromMinutes(2), TimeSpan.FromMinutes(10));
                while (true)
                {
                    var ticked = await this.cleanupDefunctSilosTimer.NextTick(nextDelay);
                    if (!ticked)
                    {
                        break;
                    }

                    // Pick a random time within the next window: the jitter reduces
                    // contention with other silos running the same process.
                    nextDelay = ThreadSafeRandom.NextTimeSpan(cleanupPeriod, cleanupPeriod + TimeSpan.FromMinutes(5));
                    try
                    {
                        var expirationCutoff = DateTime.UtcNow - this.clusterMembershipOptions.DefunctSiloExpiration;
                        await this.membershipTableProvider.CleanupDefunctSiloEntries(expirationCutoff);
                    }
                    catch (Exception unsupported) when (unsupported is NotImplementedException || unsupported is MissingMethodException)
                    {
                        // The table implementation cannot clean up at all, so stop trying.
                        this.cleanupDefunctSilosTimer.Dispose();
                        this.log.LogWarning(
                            (int)ErrorCode.MembershipCleanDeadEntriesFailure,
                            $"{nameof(IMembershipTable.CleanupDefunctSiloEntries)} operation is not supported by the current implementation of {nameof(IMembershipTable)}. Disabling the timer now.");
                        return;
                    }
                    catch (Exception failure)
                    {
                        // Any other failure is logged and the loop continues on the next tick.
                        this.log.LogError((int)ErrorCode.MembershipCleanDeadEntriesFailure, "Failed to clean up defunct membership table entries: {Exception}", failure);
                    }
                }
            }
            finally
            {
                if (this.log.IsEnabled(LogLevel.Debug))
                {
                    this.log.LogDebug("Stopped membership table cleanup agent");
                }
            }
        }
        /// <summary>
        /// Background loop that refreshes the membership table on every tick of the membership
        /// update timer. The first tick is delayed by a random offset, and a failed refresh is
        /// retried after a short fixed delay instead of waiting for the regular period.
        /// </summary>
        private async Task PeriodicallyRefreshMembershipTable()
        {
            if (this.log.IsEnabled(LogLevel.Debug))
            {
                this.log.LogDebug("Starting periodic membership table refreshes");
            }

            try
            {
                // Randomize the first refresh; later ticks use the timer's own period unless overridden below.
                TimeSpan? onceOffDelay = ThreadSafeRandom.NextTimeSpan(this.clusterMembershipOptions.TableRefreshTimeout);
                while (await this.membershipUpdateTimer.NextTick(onceOffDelay))
                {
                    // The override applies to a single tick only.
                    onceOffDelay = default;

                    try
                    {
                        var stopwatch = ValueStopwatch.StartNew();
                        await this.Refresh();

                        if (this.log.IsEnabled(LogLevel.Trace))
                        {
                            this.log.LogTrace("Refreshing membership table took {Elapsed}", stopwatch.Elapsed);
                        }
                    }
                    catch (Exception exception)
                    {
                        this.log.LogError(
                            (int)ErrorCode.MembershipUpdateIAmAliveFailure,
                            "Failed to refresh membership table, will retry shortly: {Exception}",
                            exception);

                        // Retry quickly
                        onceOffDelay = TimeSpan.FromMilliseconds(200);
                    }
                }
            }
            catch (Exception exception) when (this.fatalErrorHandler.IsUnexpected(exception))
            {
                // Failures escaping the loop itself (not an individual refresh) are escalated as fatal.
                this.log.LogError("Error refreshing membership table: {Exception}", exception);
                this.fatalErrorHandler.OnFatalException(this, nameof(PeriodicallyRefreshMembershipTable), exception);
            }
            finally
            {
                if (this.log.IsEnabled(LogLevel.Debug))
                {
                    this.log.LogDebug("Stopping periodic membership table refreshes");
                }
            }
        }
Пример #6
0
        /// <summary>
        /// Starts the publisher: registers the periodic statistics timer when a positive refresh
        /// time is configured, then immediately refreshes and publishes statistics once.
        /// </summary>
        public async Task Start()
        {
            logger.Info("Starting DeploymentLoadPublisher.");

            var periodicPublishingEnabled = statisticsRefreshTime > TimeSpan.Zero;
            if (periodicPublishingEnabled)
            {
                // The first timer tick is offset randomly; the initial exchange of statistics
                // does not wait for it and happens right below.
                this.publishTimer = this.RegisterTimer(
                    PublishStatistics,
                    null,
                    ThreadSafeRandom.NextTimeSpan(statisticsRefreshTime),
                    statisticsRefreshTime,
                    "DeploymentLoadPublisher.PublishStatisticsTimer");
            }

            await RefreshStatistics();
            await PublishStatistics(null);

            logger.Info("Started DeploymentLoadPublisher.");
        }
Пример #7
0
        /// <summary>
        /// Monitoring loop: on every tick of the ping timer, probes the monitored silo either
        /// directly or indirectly through a randomly chosen active intermediary, and reports the
        /// result through the probe-result callback unless stopping has been requested.
        /// </summary>
        private async Task Run()
        {
            ClusterMembershipSnapshot activeMembersSnapshot = default;

            SiloAddress[] otherNodes = default;

            // Randomize the first probe; later ticks use the timer's own period unless overridden below.
            TimeSpan? overrideDelay = ThreadSafeRandom.NextTimeSpan(_clusterMembershipOptions.CurrentValue.ProbeTimeout);

            while (await _pingTimer.NextTick(overrideDelay))
            {
                ProbeResult probeResult;
                overrideDelay = default;

                try
                {
                    // Discover the other active nodes in the cluster, if there are any.
                    // The candidate list is rebuilt only when the membership snapshot instance changes.
                    var membershipSnapshot = _membershipService.CurrentSnapshot;
                    if (otherNodes is null || !object.ReferenceEquals(activeMembersSnapshot, membershipSnapshot))
                    {
                        activeMembersSnapshot = membershipSnapshot;
                        otherNodes = membershipSnapshot.Members.Values
                            .Where(v => v.Status == SiloStatus.Active && v.SiloAddress != this.SiloAddress && v.SiloAddress != _localSiloDetails.SiloAddress)
                            .Select(s => s.SiloAddress)
                            .ToArray();
                    }

                    // Probe directly when indirect probes are disabled, when the failed-probe count is
                    // still below (limit - 1), or when there is no other node to act as an intermediary.
                    var isDirectProbe = !_clusterMembershipOptions.CurrentValue.EnableIndirectProbes || _failedProbes < _clusterMembershipOptions.CurrentValue.NumMissedProbesLimit - 1 || otherNodes.Length == 0;
                    var timeout = GetTimeout(isDirectProbe);

                    // A new CancellationTokenSource is created on every tick; it is IDisposable, so
                    // release it as soon as the probe has completed instead of leaving it undisposed.
                    using (var cancellation = new CancellationTokenSource(timeout))
                    {
                        if (isDirectProbe)
                        {
                            // Probe the silo directly.
                            probeResult = await this.ProbeDirectly(cancellation.Token).ConfigureAwait(false);
                        }
                        else
                        {
                            // Pick a random other node and probe the target indirectly, using the selected node as an intermediary.
                            var intermediary = otherNodes[ThreadSafeRandom.Next(otherNodes.Length)];

                            // Select a timeout which will allow the intermediary node to attempt to probe the target node and still respond to this node
                            // if the remote node does not respond in time.
                            // Attempt to account for local health degradation by extending the timeout period.
                            probeResult = await this.ProbeIndirectly(intermediary, timeout, cancellation.Token).ConfigureAwait(false);

                            // If the intermediary is not entirely healthy, remove it from consideration and continue to probe.
                            // Note that all recused silos will be included in the consideration set the next time cluster membership changes.
                            if (probeResult.Status != ProbeResultStatus.Succeeded && probeResult.IntermediaryHealthDegradationScore > 0)
                            {
                                _log.LogInformation("Recusing unhealthy intermediary {Intermediary} and trying again with remaining nodes", intermediary);
                                otherNodes = otherNodes.Where(node => !node.Equals(intermediary)).ToArray();
                                overrideDelay = TimeSpan.FromMilliseconds(250);
                            }
                        }
                    }

                    if (!_stoppingCancellation.IsCancellationRequested)
                    {
                        await _onProbeResult(this, probeResult).ConfigureAwait(false);
                    }
                }
                catch (Exception exception)
                {
                    // A failure in one iteration is logged and the loop continues with the next tick.
                    _log.LogError(exception, "Exception monitoring silo {SiloAddress}", SiloAddress);
                }
            }

            // Computes the probe timeout as ProbeTimeout multiplied by (1 + extra), where extra is
            // the local health degradation score (when that option is enabled) plus one for indirect probes.
            TimeSpan GetTimeout(bool isDirectProbe)
            {
                var additionalTimeout = 0;

                if (_clusterMembershipOptions.CurrentValue.ExtendProbeTimeoutDuringDegradation)
                {
                    // Attempt to account for local health degradation by extending the timeout period.
                    var localDegradationScore = _localSiloHealthMonitor.GetLocalHealthDegradationScore(DateTime.UtcNow);
                    additionalTimeout += localDegradationScore;
                }

                if (!isDirectProbe)
                {
                    // Indirect probes need extra time to account for the additional hop.
                    additionalTimeout += 1;
                }

                return _clusterMembershipOptions.CurrentValue.ProbeTimeout.Multiply(1 + additionalTimeout);
            }
        }