/// <summary>
/// Performs the agent's start-up work for its queue: logs the initialization, records the current
/// UTC time as the last pub-sub cache cleanup time, starts receiver initialization, registers the
/// message-pulling timer, and registers a statistic that reports the pub-sub cache size.
/// </summary>
private void InitializeInternal()
{
    logger.Info(
        ErrorCode.PersistentStreamPullingAgent_02,
        "Init of {0} {1} on silo {2} for queue {3}.",
        GetType().Name,
        ((ISystemTargetBase)this).GrainId.ToString(),
        Silo,
        QueueId.ToStringWithHashCode());

    lastTimeCleanedPubSubCache = DateTime.UtcNow;

    try
    {
        // Start receiver initialization; failures are logged by LogException and the resulting task is
        // intentionally not awaited here.
        var initialization = OrleansTaskExtentions.SafeExecute(() => receiver.Initialize(this.options.InitQueueTimeout));
        receiverInitTask = initialization.LogException(
            logger,
            ErrorCode.PersistentStreamPullingAgent_03,
            $"QueueAdapterReceiver {QueueId.ToStringWithHashCode()} failed to Initialize.");
        receiverInitTask.Ignore();
    }
    catch
    {
        // Deliberately swallowed: proceed as though Initialize succeeded.
        // Individual Initialize failures have already been logged, so logging again adds nothing.
    }

    // Set up the reader for the new receiver.
    // A receiver that failed to initialize is still pumped; retrying initialization is the receiver's job.
    var pullPeriod = this.options.GetQueueMsgsTimerPeriod;
    var firstTickOffset = ThreadSafeRandom.NextTimeSpan(pullPeriod);
    timer = RegisterTimer(AsyncTimerCallback, QueueId, firstTickOffset, pullPeriod);

    // The statistic's value is read on demand from the current pub-sub cache count.
    var cacheSizeStatisticName = new StatisticName(
        StatisticNames.STREAMS_PERSISTENT_STREAM_PUBSUB_CACHE_SIZE,
        StatisticUniquePostfix);
    IntValueStatistic.FindOrCreate(cacheSizeStatisticName, () => pubSubCache.Count);

    logger.Info(
        (int)ErrorCode.PersistentStreamPullingAgent_04,
        "Taking queue {0} under my responsibility.",
        QueueId.ToStringWithHashCode());
}
/// <summary>
/// Runs one simulated operation made of two equal random delays (each under the 2-second bound passed
/// to the random generator) and asserts that no other operation is in progress at the same time.
/// </summary>
/// <param name="opNumber">Identifier of this operation, used only in output and failure messages.</param>
private async Task Operation(int opNumber)
{
    // On entry no operation may already be running.
    Assert.True(
        operationsInProgress <= 0,
        $"1: Operation {opNumber} found {operationsInProgress} operationsInProgress.");

    operationsInProgress++;

    var pause = ThreadSafeRandom.NextTimeSpan(TimeSpan.FromSeconds(2));

    output.WriteLine("Task {0} Staring", opNumber);
    await Task.Delay(pause);

    // After resuming, this operation must still be the only one counted.
    Assert.True(
        operationsInProgress == 1,
        $"2: Operation {opNumber} found {operationsInProgress} operationsInProgress.");

    output.WriteLine("Task {0} after first delay", opNumber);
    await Task.Delay(pause);

    Assert.True(
        operationsInProgress == 1,
        $"3: Operation {opNumber} found {operationsInProgress} operationsInProgress.");

    operationsInProgress--;
    output.WriteLine("Task {0} Done", opNumber);
}
/// <summary>
/// Starts <paramref name="numItems"/> asynchronous writers that, after a short random delay, add an
/// entry to a shared dictionary for every even index; then starts one asynchronous reader per stored
/// entry that reads it back after another short random delay; finally collects the stored values in
/// ascending key order.
/// </summary>
/// <param name="numItems">Number of writer tasks to start (indices 0 to numItems - 1).</param>
public async Task InterleavingConsistencyTest(int numItems)
{
    var maxDelay = TimeSpan.FromMilliseconds(1);
    var writers = new List<Task>(numItems * 2);
    var fileMetadatas = new Dictionary<int, string>(numItems * 2);

    // Phase 1: start every writer, then wait for all of them.
    for (var index = 0; index < numItems; index++)
    {
        writers.Add(StoreIfEven(index));
    }

    await Task.WhenAll(writers.ToArray());

    // Phase 2: start one reader per stored entry, then wait for all of them.
    var readers = new List<Task>(fileMetadatas.Count);
    foreach (var entry in fileMetadatas)
    {
        readers.Add(ReadBack(entry.Key));
    }

    await Task.WhenAll(readers);

    // Collect the stored values ordered by file id.
    var results = new List<string>(fileMetadatas.Count);
    for (var fileId = 0; fileId < numItems; fileId++)
    {
        if (fileMetadatas.TryGetValue(fileId, out var metadata))
        {
            results.Add(metadata);
        }
    }

    if (numItems != results.Count)
    {
        // A count mismatch previously raised an OrleansException; that check is disabled,
        // so a mismatch currently has no effect.
    }

    // Waits a random short time, then records the id (as text) when it is even.
    async Task StoreIfEven(int id)
    {
        await Task.Delay(ThreadSafeRandom.NextTimeSpan(maxDelay));
        if ((id % 2) == 0)
        {
            fileMetadatas.Add(id, id.ToString());
        }
    }

    // Waits a random short time, then looks the entry up again (throws if it is missing).
    async Task ReadBack(int id)
    {
        await Task.Delay(ThreadSafeRandom.NextTimeSpan(maxDelay));
        _ = fileMetadatas[id];
    }
}
/// <summary>
/// Loop that periodically asks the membership table provider to remove defunct silo entries.
/// Does nothing when no cleanup period is configured, and stops permanently when the provider
/// reports that the cleanup operation is not supported.
/// </summary>
private async Task CleanupDefunctSilos()
{
    // Without a configured period the cleanup agent is disabled.
    if (!(this.clusterMembershipOptions.DefunctSiloCleanupPeriod is TimeSpan period))
    {
        if (this.log.IsEnabled(LogLevel.Debug))
        {
            this.log.LogDebug($"Membership table cleanup is disabled due to {nameof(ClusterMembershipOptions)}.{nameof(ClusterMembershipOptions.DefunctSiloCleanupPeriod)} not being specified");
        }

        return;
    }

    if (this.log.IsEnabled(LogLevel.Debug))
    {
        this.log.LogDebug("Starting membership table cleanup agent");
    }

    try
    {
        // The first cleanup is scheduled for a random point between 2 and 10 minutes from now.
        var nextTickDelay = ThreadSafeRandom.NextTimeSpan(TimeSpan.FromMinutes(2), TimeSpan.FromMinutes(10));

        while (await this.cleanupDefunctSilosTimer.NextTick(nextTickDelay))
        {
            // Pick a random time within the next window (period .. period + 5 minutes).
            // The jitter is there because this process could contend with other silos.
            nextTickDelay = ThreadSafeRandom.NextTimeSpan(period, period + TimeSpan.FromMinutes(5));

            try
            {
                var expirationCutoff = DateTime.UtcNow - this.clusterMembershipOptions.DefunctSiloExpiration;
                await this.membershipTableProvider.CleanupDefunctSiloEntries(expirationCutoff);
            }
            catch (Exception unsupported) when (unsupported is NotImplementedException || unsupported is MissingMethodException)
            {
                // The table implementation cannot do this cleanup: stop the timer and exit the loop for good.
                this.cleanupDefunctSilosTimer.Dispose();
                this.log.LogWarning(
                    (int)ErrorCode.MembershipCleanDeadEntriesFailure,
                    $"{nameof(IMembershipTable.CleanupDefunctSiloEntries)} operation is not supported by the current implementation of {nameof(IMembershipTable)}. Disabling the timer now.");
                return;
            }
            catch (Exception failure)
            {
                // Any other failure is logged and the loop continues with the next tick.
                this.log.LogError(
                    (int)ErrorCode.MembershipCleanDeadEntriesFailure,
                    "Failed to clean up defunct membership table entries: {Exception}",
                    failure);
            }
        }
    }
    finally
    {
        if (this.log.IsEnabled(LogLevel.Debug))
        {
            this.log.LogDebug("Stopped membership table cleanup agent");
        }
    }
}
/// <summary>
/// Loop that refreshes the membership table on every tick of the membership update timer.
/// A failed refresh is logged and retried after 200 ms; an exception that escapes the loop and is
/// classified as unexpected by the fatal error handler is logged and reported to that handler.
/// </summary>
private async Task PeriodicallyRefreshMembershipTable()
{
    if (this.log.IsEnabled(LogLevel.Debug))
    {
        this.log.LogDebug("Starting periodic membership table refreshes");
    }

    try
    {
        // The first tick uses a random delay bounded by the configured table refresh timeout.
        TimeSpan? onceOffDelay = ThreadSafeRandom.NextTimeSpan(this.clusterMembershipOptions.TableRefreshTimeout);

        while (await this.membershipUpdateTimer.NextTick(onceOffDelay))
        {
            // Clear the one-off override so the next tick is not given an explicit delay
            // unless this iteration fails.
            onceOffDelay = default;

            try
            {
                var stopwatch = ValueStopwatch.StartNew();
                await this.Refresh();

                if (this.log.IsEnabled(LogLevel.Trace))
                {
                    this.log.LogTrace("Refreshing membership table took {Elapsed}", stopwatch.Elapsed);
                }
            }
            catch (Exception exception)
            {
                this.log.LogError(
                    (int)ErrorCode.MembershipUpdateIAmAliveFailure,
                    "Failed to refresh membership table, will retry shortly: {Exception}",
                    exception);

                // Retry quickly
                onceOffDelay = TimeSpan.FromMilliseconds(200);
            }
        }
    }
    catch (Exception exception) when (this.fatalErrorHandler.IsUnexpected(exception))
    {
        this.log.LogError("Error refreshing membership table: {Exception}", exception);
        this.fatalErrorHandler.OnFatalException(this, nameof(PeriodicallyRefreshMembershipTable), exception);
    }
    finally
    {
        if (this.log.IsEnabled(LogLevel.Debug))
        {
            this.log.LogDebug("Stopping periodic membership table refreshes");
        }
    }
}
/// <summary>
/// Starts the publisher: registers the periodic publish timer when a positive refresh time is
/// configured, then refreshes statistics and publishes them once before returning.
/// </summary>
public async Task Start()
{
    logger.Info("Starting DeploymentLoadPublisher.");

    var periodicPublishingEnabled = statisticsRefreshTime > TimeSpan.Zero;
    if (periodicPublishingEnabled)
    {
        // The first timer tick is offset by a random amount bounded by the refresh time.
        var initialDueTime = ThreadSafeRandom.NextTimeSpan(statisticsRefreshTime);
        this.publishTimer = this.RegisterTimer(
            PublishStatistics,
            null,
            initialDueTime,
            statisticsRefreshTime,
            "DeploymentLoadPublisher.PublishStatisticsTimer");
    }

    // Regardless of the timer, do one refresh and one publish right away so there is
    // something to start with.
    await RefreshStatistics();
    await PublishStatistics(null);

    logger.Info("Started DeploymentLoadPublisher.");
}
/// <summary>
/// Probe loop for the monitored silo. On each tick of the ping timer it probes the silo either
/// directly or indirectly through a randomly chosen active intermediary, and passes the result to
/// the probe-result callback unless stopping has been requested. Exceptions raised inside an
/// iteration are logged and the loop continues.
/// </summary>
private async Task Run()
{
    ClusterMembershipSnapshot activeMembersSnapshot = default;
    SiloAddress[] otherNodes = default;

    // The first tick is delayed by a random amount bounded by the probe timeout.
    TimeSpan? overrideDelay = ThreadSafeRandom.NextTimeSpan(_clusterMembershipOptions.CurrentValue.ProbeTimeout);
    while (await _pingTimer.NextTick(overrideDelay))
    {
        ProbeResult probeResult;
        overrideDelay = default;

        try
        {
            // Discover the other active nodes in the cluster, if there are any.
            // The list is rebuilt only when the membership snapshot instance has changed.
            var membershipSnapshot = _membershipService.CurrentSnapshot;
            if (otherNodes is null || !object.ReferenceEquals(activeMembersSnapshot, membershipSnapshot))
            {
                activeMembersSnapshot = membershipSnapshot;
                otherNodes = membershipSnapshot.Members.Values
                    .Where(v => v.Status == SiloStatus.Active && v.SiloAddress != this.SiloAddress && v.SiloAddress != _localSiloDetails.SiloAddress)
                    .Select(s => s.SiloAddress)
                    .ToArray();
            }

            // Probe directly when indirect probes are disabled, when the failed-probe count is still
            // below (limit - 1), or when there is no other node to act as an intermediary.
            var isDirectProbe = !_clusterMembershipOptions.CurrentValue.EnableIndirectProbes
                || _failedProbes < _clusterMembershipOptions.CurrentValue.NumMissedProbesLimit - 1
                || otherNodes.Length == 0;
            var timeout = GetTimeout(isDirectProbe);

            // The token source is created with a timeout, so it is disposed as soon as the probe
            // has completed (or thrown) rather than being left for the garbage collector.
            using (var cancellation = new CancellationTokenSource(timeout))
            {
                if (isDirectProbe)
                {
                    // Probe the silo directly.
                    probeResult = await this.ProbeDirectly(cancellation.Token).ConfigureAwait(false);
                }
                else
                {
                    // Pick a random other node and probe the target indirectly, using the selected node as an intermediary.
                    var intermediary = otherNodes[ThreadSafeRandom.Next(otherNodes.Length)];

                    // The timeout (see GetTimeout) includes extra time for the additional hop so the intermediary
                    // can attempt to probe the target and still respond to this node if the target does not respond in time.
                    probeResult = await this.ProbeIndirectly(intermediary, timeout, cancellation.Token).ConfigureAwait(false);

                    // If the intermediary is not entirely healthy, remove it from consideration and continue to probe.
                    // Note that all recused silos will be included in the consideration set the next time cluster membership changes.
                    if (probeResult.Status != ProbeResultStatus.Succeeded && probeResult.IntermediaryHealthDegradationScore > 0)
                    {
                        _log.LogInformation("Recusing unhealthy intermediary {Intermediary} and trying again with remaining nodes", intermediary);
                        otherNodes = otherNodes.Where(node => !node.Equals(intermediary)).ToArray();
                        overrideDelay = TimeSpan.FromMilliseconds(250);
                    }
                }
            }

            if (!_stoppingCancellation.IsCancellationRequested)
            {
                await _onProbeResult(this, probeResult).ConfigureAwait(false);
            }
        }
        catch (Exception exception)
        {
            _log.LogError(exception, "Exception monitoring silo {SiloAddress}", SiloAddress);
        }
    }

    // Returns ProbeTimeout multiplied by (1 + extra), where extra is the local health degradation
    // score (when ExtendProbeTimeoutDuringDegradation is enabled) plus one for indirect probes.
    TimeSpan GetTimeout(bool isDirectProbe)
    {
        var additionalTimeout = 0;

        if (_clusterMembershipOptions.CurrentValue.ExtendProbeTimeoutDuringDegradation)
        {
            // Attempt to account for local health degradation by extending the timeout period.
            var localDegradationScore = _localSiloHealthMonitor.GetLocalHealthDegradationScore(DateTime.UtcNow);
            additionalTimeout += localDegradationScore;
        }

        if (!isDirectProbe)
        {
            // Indirect probes need extra time to account for the additional hop.
            additionalTimeout += 1;
        }

        return _clusterMembershipOptions.CurrentValue.ProbeTimeout.Multiply(1 + additionalTimeout);
    }
}