/// <summary>
/// Binds this agent to its queue: records the adapter and failure handler, creates the queue
/// receiver and (when a cache factory is supplied) the queue cache, starts receiver
/// initialization without waiting for it, and registers the timer that pumps the queue.
/// </summary>
/// <param name="qAdapter">Adapter used to create the receiver for <c>QueueId</c>.</param>
/// <param name="queueAdapterCache">Optional cache factory; when null no queue cache is created.</param>
/// <param name="failureHandler">Handler stored in <c>streamFailureHandler</c>.</param>
/// <remarks>
/// Exceptions thrown while creating the receiver or the queue cache are logged and rethrown.
/// Exceptions thrown while starting receiver initialization are deliberately swallowed.
/// </remarks>
private void InitializeInternal(IQueueAdapter qAdapter, IQueueAdapterCache queueAdapterCache, IStreamFailureHandler failureHandler)
{
    // Remove cast once we cleanup
    logger.Info(
        ErrorCode.PersistentStreamPullingAgent_02,
        "Init of {0} {1} on silo {2} for queue {3}.",
        GetType().Name,
        ((ISystemTargetBase)this).GrainId.ToDetailedString(),
        Silo,
        QueueId.ToStringWithHashCode());

    queueAdapter = qAdapter;
    streamFailureHandler = failureHandler;
    lastTimeCleanedPubSubCache = DateTime.UtcNow;

    // Step 1: the receiver is mandatory; a failure here aborts initialization.
    try
    {
        receiver = queueAdapter.CreateReceiver(QueueId);
    }
    catch (Exception createReceiverError)
    {
        logger.Error(ErrorCode.PersistentStreamPullingAgent_02, "Exception while calling IQueueAdapter.CreateNewReceiver.", createReceiverError);
        throw;
    }

    // Step 2: the cache is optional; it is only created when a cache factory was provided.
    if (queueAdapterCache != null)
    {
        try
        {
            queueCache = queueAdapterCache.CreateQueueCache(QueueId);
        }
        catch (Exception createCacheError)
        {
            logger.Error(ErrorCode.PersistentStreamPullingAgent_23, "Exception while calling IQueueAdapterCache.CreateQueueCache.", createCacheError);
            throw;
        }
    }

    // Step 3: start receiver initialization in the background; the task is kept but not awaited.
    try
    {
        var pendingInit = OrleansTaskExtentions.SafeExecute(() => receiver.Initialize(config.InitQueueTimeout));
        receiverInitTask = pendingInit.LogException(
            logger,
            ErrorCode.PersistentStreamPullingAgent_03,
            $"QueueAdapterReceiver {QueueId.ToStringWithHashCode()} failed to Initialize.");
        receiverInitTask.Ignore();
    }
    catch
    {
        // Just ignore this exception and proceed as if Initialize has succeeded.
        // We already logged individual exceptions for individual calls to Initialize. No need to log again.
    }

    // Setup a reader for a new receiver.
    // Even if the receiver failed to initialise, treat it as OK and start pumping it.
    // It's receiver responsibility to retry initialization.
    var firstTickOffset = safeRandom.NextTimeSpan(config.GetQueueMsgsTimerPeriod);
    var pumpPeriod = config.GetQueueMsgsTimerPeriod;
    timer = RegisterTimer(AsyncTimerCallback, QueueId, firstTickOffset, pumpPeriod);

    // Publish the current pub-sub cache size as a statistic.
    IntValueStatistic.FindOrCreate(
        new StatisticName(StatisticNames.STREAMS_PERSISTENT_STREAM_PUBSUB_CACHE_SIZE, StatisticUniquePostfix),
        () => pubSubCache.Count);

    logger.Info((int)ErrorCode.PersistentStreamPullingAgent_04, "Taking queue {0} under my responsibility.", QueueId.ToStringWithHashCode());
}
/// <summary>
/// Background loop that refreshes the membership table each time <c>membershipUpdateTimer</c> ticks.
/// </summary>
/// <remarks>
/// The first tick is delayed by a random span bounded by <c>TableRefreshTimeout</c>.
/// A failed refresh is logged and retried after 200 ms. Exceptions escaping the loop that the
/// fatal error handler classifies as unexpected are logged and reported to that handler.
/// </remarks>
/// <returns>A task that completes when the timer stops ticking or the loop terminates with an error.</returns>
private async Task PeriodicallyRefreshMembershipTable()
{
    if (this.log.IsEnabled(LogLevel.Debug))
    {
        this.log.LogDebug("Starting periodic membership table refreshes");
    }

    try
    {
        var random = new SafeRandom();

        // One-shot override for the next tick only; it is cleared at the top of every iteration.
        TimeSpan? onceOffDelay = random.NextTimeSpan(this.clusterMembershipOptions.TableRefreshTimeout);
        while (await this.membershipUpdateTimer.NextTick(onceOffDelay))
        {
            onceOffDelay = default;

            try
            {
                var stopwatch = ValueStopwatch.StartNew();
                await this.Refresh();
                if (this.log.IsEnabled(LogLevel.Trace))
                {
                    this.log.LogTrace("Refreshing membership table took {Elapsed}", stopwatch.Elapsed);
                }
            }
            catch (Exception exception)
            {
                this.log.LogError(
                    (int)ErrorCode.MembershipUpdateIAmAliveFailure,
                    "Failed to refresh membership table, will retry shortly: {Exception}",
                    exception);

                // Retry quickly
                onceOffDelay = TimeSpan.FromMilliseconds(200);
            }
        }
    }
    catch (Exception exception) when (this.fatalErrorHandler.IsUnexpected(exception))
    {
        this.log.LogError("Error refreshing membership table: {Exception}", exception);
        this.fatalErrorHandler.OnFatalException(this, nameof(PeriodicallyRefreshMembershipTable), exception);
    }
    finally
    {
        if (this.log.IsEnabled(LogLevel.Debug))
        {
            this.log.LogDebug("Stopping periodic membership table refreshes");
        }
    }
}
/// <summary>
/// Returns <paramref name="minValue"/> plus a random span obtained from the single-argument
/// <c>NextTimeSpan</c> overload called with <c>maxValue - minValue</c>.
/// </summary>
/// <param name="random">Random source used to draw the offset.</param>
/// <param name="minValue">Lower bound; must be strictly greater than <see cref="TimeSpan.Zero"/>.</param>
/// <param name="maxValue">Upper bound; must be strictly greater than <paramref name="minValue"/>.</param>
/// <returns>A time span that is at least <paramref name="minValue"/> (assuming the single-argument overload never returns a negative span).</returns>
/// <exception cref="ArgumentOutOfRangeException">
/// <paramref name="minValue"/> is zero or negative, or is not less than <paramref name="maxValue"/>.
/// </exception>
public static TimeSpan NextTimeSpan(this SafeRandom random, TimeSpan minValue, TimeSpan maxValue)
{
    if (minValue <= TimeSpan.Zero)
    {
        throw new ArgumentOutOfRangeException(nameof(minValue), minValue, "SafeRandom.NextTimeSpan minValue must be a positive number.");
    }

    if (minValue >= maxValue)
    {
        // The message previously said "greater than", which contradicted the check above.
        throw new ArgumentOutOfRangeException(nameof(minValue), minValue, "SafeRandom.NextTimeSpan minValue must be less than maxValue.");
    }

    TimeSpan span = maxValue - minValue;
    return minValue + random.NextTimeSpan(span);
}
/// <summary>
/// Performs the initial reminder read and, if the service status is still
/// <c>GrainServiceStatus.Booting</c> afterwards, starts a timer that repeats the read.
/// </summary>
/// <returns>A task that completes once the first read has finished and the retry timer (if needed) has been started.</returns>
protected override async Task StartInBackground()
{
    await DoInitialReadAndUpdateReminders();

    // Nothing more to do unless the status is still Booting after the first read.
    if (Status != GrainServiceStatus.Booting)
    {
        return;
    }

    // Randomize the first retry; subsequent retries use InitialReadRetryPeriod.
    var jitterSource = new SafeRandom();
    var firstRetryDelay = jitterSource.NextTimeSpan(InitialReadRetryPeriod);

    listRefreshTimer = GrainTimer.FromTaskCallback(
        _ => DoInitialReadAndUpdateReminders(),
        null,
        firstRetryDelay,
        InitialReadRetryPeriod,
        name: "ReminderService.ReminderListInitialRead");

    listRefreshTimer.Start();
}
/// <summary>
/// Takes responsibility for a new queue that was assigned to this agent via a new range.
/// The agent stores the queue adapter, creates a receiver, tries to initialize it and starts a pumping timer.
/// ERROR HANDLING:
/// The responsibility to handle initialization and shutdown failures lies inside the queue adapter receiver code.
/// The agent will call Initialize once and log an error. It will not call Initialize again.
/// The receiver itself may attempt later to recover from this error and do initialization again.
/// The agent will assume initialization has succeeded and will subsequently start calling pumping receive.
/// Same applies to shutdown.
/// </summary>
/// <param name="qAdapter">Immutable wrapper around the queue adapter; its <c>Value</c> must not be null.</param>
/// <returns>
/// A task that completes once the pumping timer is registered, or earlier, without a timer,
/// if creating the receiver throws (that failure is logged and not rethrown).
/// </returns>
/// <exception cref="ArgumentNullException">
/// The wrapped adapter is null. Because this method is async, the exception is surfaced through the returned task.
/// </exception>
public async Task Initialize(Immutable<IQueueAdapter> qAdapter)
{
    if (qAdapter.Value == null)
    {
        throw new ArgumentNullException(nameof(qAdapter), "Init: queueAdapter should not be null");
    }

    logger.Info((int)ErrorCode.PersistentStreamPullingAgent_02, "Init of {0} {1} on silo {2} for queue {3}.",
        this.GetType().Name, this.GrainId.ToDetailedString(), base.Silo, QueueId.ToStringWithHashCode());

    queueAdapter = qAdapter.Value;

    try
    {
        receiver = queueAdapter.CreateReceiver(QueueId);
    }
    catch (Exception exc)
    {
        // Without a receiver there is nothing to pump, so log and give up on this queue.
        logger.Error((int)ErrorCode.PersistentStreamPullingAgent_02, "Exception while calling INewQueueAdapter.CreateNewReceiver.", exc);
        return;
    }

    try
    {
        var task = OrleansTaskExtentions.SafeExecute(() => receiver.Initialize(initQueueTimeout));
        task = task.LogException(logger, ErrorCode.PersistentStreamPullingAgent_03,
            $"QueueAdapterReceiver {QueueId.ToStringWithHashCode()} failed to Initialize.");
        await task;
    }
    catch (Exception)
    {
        // Just ignore this exception and proceed as if Initialize has succeeded.
        // We already logged individual exceptions for individual calls to Initialize. No need to log again.
    }

    // Setup a reader for a new receiver.
    // Even if the receiver failed to initialise, treat it as OK and start pumping it.
    // It's receiver responsibility to retry initialization.
    var randomTimerOffset = safeRandom.NextTimeSpan(queueGetPeriod);
    timer = providerRuntime.RegisterTimer(AsyncTimerCallback, QueueId, randomTimerOffset, queueGetPeriod);

    logger.Info((int)ErrorCode.PersistentStreamPullingAgent_04, "Taking queue {0} under my responsibility.", QueueId.ToStringWithHashCode());
}
/// <summary>
/// Starts membership: initializes the membership table, optionally waits a random delay so that
/// silos do not all write to the table at once, then refreshes the table, logs missed
/// I-am-alive entries and checks for node migration.
/// </summary>
/// <returns>A task that completes when startup has finished.</returns>
/// <remarks>Any exception is logged with <c>ErrorCode.MembershipFailedToStart</c> and rethrown.</remarks>
private async Task Start()
{
    try
    {
        this.log.LogInformation(
            (int)ErrorCode.MembershipStarting,
            "MembershipOracle starting on host {HostName} with SiloAddress {SiloAddress} at {StartTime}",
            this.localSiloDetails.DnsHostName,
            this.myAddress,
            LogFormatter.PrintDate(this.siloStartTime));

        // Init the membership table.
        await this.membershipTableProvider.InitializeMembershipTable(true);

        // Skip waiting if we expect only one member for the cluster.
        var expectedSilos = this.clusterMembershipOptions.ExpectedClusterSize;
        if (expectedSilos > 1)
        {
            // Randomly delay the startup, so not all silos write to the table at once.
            // The delay is bounded by the smallest of: 0.5sec * ExpectedClusterSize,
            // MaxJoinAttemptTime, and one minute.
            var jitterSource = new SafeRandom();
            var sizeScaledDelay = TimeSpan.FromMilliseconds(500).Multiply(expectedSilos);
            var hardCap = StandardExtensions.Min(this.clusterMembershipOptions.MaxJoinAttemptTime, TimeSpan.FromMinutes(1));
            var upperBound = StandardExtensions.Min(sizeScaledDelay, hardCap);

            await Task.Delay(jitterSource.NextTimeSpan(upperBound));
        }

        var snapshot = await this.RefreshInternal();

        LogMissedIAmAlives(snapshot);

        // read the table and look for my node migration occurrences
        DetectNodeMigration(snapshot, this.localSiloDetails.DnsHostName);
    }
    catch (Exception startupError)
    {
        this.log.LogError((int)ErrorCode.MembershipFailedToStart, "Membership failed to start: {Exception}", startupError);
        throw;
    }
}
/// <summary>
/// Probe loop for the monitored silo. On every tick of <c>_pingTimer</c> the silo is probed
/// either directly or indirectly through a randomly chosen active silo, and the result is
/// passed to <c>_onProbeResult</c> unless stopping has been requested.
/// </summary>
/// <remarks>
/// A probe is direct when indirect probes are disabled, when the number of failed probes is
/// below <c>NumMissedProbesLimit - 1</c>, or when no other active silo is known.
/// The first tick is delayed by a random span bounded by <c>ProbeTimeout</c>.
/// Exceptions raised inside an iteration are logged and the loop continues.
/// </remarks>
/// <returns>A task that completes when the ping timer stops ticking.</returns>
private async Task Run()
{
    var random = new SafeRandom();
    ClusterMembershipSnapshot activeMembersSnapshot = default;
    SiloAddress[] otherNodes = default;

    // One-shot override for the next tick only; it is cleared at the top of every iteration.
    TimeSpan? overrideDelay = random.NextTimeSpan(_clusterMembershipOptions.CurrentValue.ProbeTimeout);
    while (await _pingTimer.NextTick(overrideDelay))
    {
        ProbeResult probeResult;
        overrideDelay = default;

        try
        {
            // Discover the other active nodes in the cluster, if there are any.
            // The candidate list is rebuilt only when the membership snapshot instance changes.
            var membershipSnapshot = _membershipService.CurrentSnapshot;
            if (otherNodes is null || !object.ReferenceEquals(activeMembersSnapshot, membershipSnapshot))
            {
                activeMembersSnapshot = membershipSnapshot;
                otherNodes = membershipSnapshot.Members.Values
                    .Where(v => v.Status == SiloStatus.Active && v.SiloAddress != this.SiloAddress && v.SiloAddress != _localSiloDetails.SiloAddress)
                    .Select(s => s.SiloAddress)
                    .ToArray();
            }

            var isDirectProbe = !_clusterMembershipOptions.CurrentValue.EnableIndirectProbes
                || _failedProbes < _clusterMembershipOptions.CurrentValue.NumMissedProbesLimit - 1
                || otherNodes.Length == 0;
            var timeout = GetTimeout(isDirectProbe);

            // Dispose the source once the probe completes so its internal timer is released
            // immediately instead of lingering until the timeout elapses.
            using (var cancellation = new CancellationTokenSource(timeout))
            {
                if (isDirectProbe)
                {
                    // Probe the silo directly.
                    probeResult = await this.ProbeDirectly(cancellation.Token).ConfigureAwait(false);
                }
                else
                {
                    // Pick a random other node and probe the target indirectly, using the selected node as an intermediary.
                    // The upper bound is otherNodes.Length so that every node, including the last one, can be chosen.
                    // NOTE(review): assumes SafeRandom.Next(min, max) treats max as exclusive, like System.Random.Next - confirm.
                    var intermediary = otherNodes[random.Next(0, otherNodes.Length)];

                    // Select a timeout which will allow the intermediary node to attempt to probe the target node and still respond to this node
                    // if the remote node does not respond in time.
                    // Attempt to account for local health degradation by extending the timeout period.
                    probeResult = await this.ProbeIndirectly(intermediary, timeout, cancellation.Token).ConfigureAwait(false);

                    // If the intermediary is not entirely healthy, remove it from consideration and continue to probe.
                    // Note that all recused silos will be included in the consideration set the next time cluster membership changes.
                    if (probeResult.Status != ProbeResultStatus.Succeeded && probeResult.IntermediaryHealthDegradationScore > 0)
                    {
                        _log.LogInformation("Recusing unhealthy intermediary {Intermediary} and trying again with remaining nodes", intermediary);
                        otherNodes = otherNodes.Where(node => !node.Equals(intermediary)).ToArray();
                        overrideDelay = TimeSpan.FromMilliseconds(250);
                    }
                }
            }

            if (!_stoppingCancellation.IsCancellationRequested)
            {
                await _onProbeResult(this, probeResult).ConfigureAwait(false);
            }
        }
        catch (Exception exception)
        {
            _log.LogError(exception, "Exception monitoring silo {SiloAddress}", SiloAddress);
        }
    }

    // Computes the probe timeout as ProbeTimeout * (1 + extra), where extra is the local health
    // degradation score (when ExtendProbeTimeoutDuringDegradation is enabled) plus one for indirect probes.
    TimeSpan GetTimeout(bool isDirectProbe)
    {
        var additionalTimeout = 0;

        if (_clusterMembershipOptions.CurrentValue.ExtendProbeTimeoutDuringDegradation)
        {
            // Attempt to account for local health degradation by extending the timeout period.
            var localDegradationScore = _localSiloHealthMonitor.GetLocalHealthDegradationScore(DateTime.UtcNow);
            additionalTimeout += localDegradationScore;
        }

        if (!isDirectProbe)
        {
            // Indirect probes need extra time to account for the additional hop.
            additionalTimeout += 1;
        }

        return _clusterMembershipOptions.CurrentValue.ProbeTimeout.Multiply(1 + additionalTimeout);
    }
}