/// <summary>
/// Starts the core runtime components of the silo: scheduler, message center, incoming
/// message agents and local grain directory, then initializes the implicit stream
/// subscriber table, the statistics table data managers and the system targets.
/// Every step is executed through the *WithPerfAnalysis helpers using one shared stopwatch.
/// </summary>
/// <param name="ct">Cancellation token supplied by the caller; this method body does not observe it.</param>
/// <returns>A task that completes when all of the steps above have finished.</returns>
private async Task OnRuntimeServicesStart(CancellationToken ct)
{
    // TODO: Set up all (or as many as possible) of the classes started in this call to work directly with the lifecycle.
    var timer = Stopwatch.StartNew();

    // The relative order of the next four steps is pretty much arbitrary.
    StartTaskWithPerfAnalysis("Start Scheduler", scheduler.Start, timer);
    StartTaskWithPerfAnalysis("Start Message center", messageCenter.Start, timer);
    StartTaskWithPerfAnalysis(
        "Start Incoming message agents",
        () =>
        {
            incomingPingAgent.Start();
            incomingSystemAgent.Start();
            incomingAgent.Start();
        },
        timer);
    StartTaskWithPerfAnalysis("Start local grain directory", LocalGrainDirectory.Start, timer);

    // Set up an execution context for this thread so that the target creation steps can use async values.
    RuntimeContext.InitializeMainThread();

    StartTaskWithPerfAnalysis(
        "Init implicit stream subscribe table",
        () =>
        {
            // Feed every known grain class type into the implicit stream subscribers table.
            var subscriberTable = Services.GetRequiredService<ImplicitStreamSubscriberTable>();
            var typeManager = Services.GetRequiredService<GrainTypeManager>();
            var grainClassTypes = typeManager.GrainClassTypeData
                .Select(entry => entry.Value.Type)
                .ToArray();
            subscriberTable.InitImplicitStreamSubscribers(grainClassTypes);
        },
        timer);

    var providerRuntime = Services.GetRequiredService<SiloProviderRuntime>();
    SiloStatisticsOptions statsOptions = Services.GetRequiredService<IOptions<SiloStatisticsOptions>>().Value;
    runtimeClient.CurrentStreamProviderRuntime = providerRuntime;

    await StartAsyncTaskWithPerfAnalysis(
        "Load StatisticProviders",
        async () =>
        {
            // SetSiloMetricsTableDataManager can only be called after MessageCenter is created
            // (dependency on this.SiloAddress).
            await siloStatistics
                .SetSiloStatsTableDataManager(this, statsOptions)
                .WithTimeout(initTimeout, $"SiloStatistics Setting SiloStatsTableDataManager failed due to timeout {initTimeout}");
            await siloStatistics
                .SetSiloMetricsTableDataManager(this, statsOptions)
                .WithTimeout(initTimeout, $"SiloStatistics Setting SiloMetricsTableDataManager failed due to timeout {initTimeout}");
        },
        timer);

    // This has to follow the steps above, which start the runtime components.
    await StartAsyncTaskWithPerfAnalysis(
        "Create system targets and inject dependencies",
        () =>
        {
            CreateSystemTargets();
            return InjectDependencies();
        },
        timer);

    // Validate the configuration.
    // TODO - refactor validation - jbragg
    //GlobalConfig.Application.ValidateConfiguration(logger);
}
/// <summary>
/// Initializes a flow controller with the given load shedding limit.
/// </summary>
/// <param name="loadSheddingLimit">
/// Limit in the inclusive range 0-100. A value of 0 is stored as <see cref="int.MaxValue"/>
/// (presumably meaning "never trigger" - confirm against the code that reads the field).
/// </param>
/// <param name="options">The silo statistics options retained by this controller.</param>
/// <exception cref="ArgumentOutOfRangeException"><paramref name="loadSheddingLimit"/> is outside 0-100.</exception>
/// <exception cref="ArgumentNullException"><paramref name="options"/> is null.</exception>
private LoadShedQueueFlowController(int loadSheddingLimit, SiloStatisticsOptions options)
{
    // Range check stays first so callers that previously got ArgumentOutOfRangeException still do.
    if (loadSheddingLimit < 0 || loadSheddingLimit > 100)
    {
        throw new ArgumentOutOfRangeException(nameof(loadSheddingLimit), "Value must be between 0-100");
    }

    // Fail fast instead of storing a null reference that would only surface on later use.
    if (options == null)
    {
        throw new ArgumentNullException(nameof(options));
    }

    // Assign state only after every argument has been validated.
    this.options = options;
    this.loadSheddingLimit = loadSheddingLimit != 0 ? loadSheddingLimit : int.MaxValue;
}
/// <summary>
/// Creates a flow controller triggered when the CPU reaches a percentage of the cluster load shedding limit.
/// This is intended to reduce queue read rate prior to causing the silo to shed load.
/// Note: Triggered only when load shedding is enabled.
/// </summary>
/// <param name="options">The silo statistics options; its <c>LoadSheddingLimit</c> is the base value that is scaled.</param>
/// <param name="percentOfSiloSheddingLimit">
/// Percentage (0-100) of the load shedding limit which triggers a reduction of queue read rate.
/// The scaled limit is truncated to an integer; a result of 0 is treated by the constructor as
/// <see cref="int.MaxValue"/>.
/// </param>
/// <returns>A new <see cref="LoadShedQueueFlowController"/> configured with the scaled limit.</returns>
/// <exception cref="ArgumentOutOfRangeException"><paramref name="percentOfSiloSheddingLimit"/> is outside 0-100.</exception>
/// <exception cref="ArgumentNullException"><paramref name="options"/> is null.</exception>
public static IQueueFlowController CreateAsPercentOfLoadSheddingLimit(SiloStatisticsOptions options, int percentOfSiloSheddingLimit = SiloStatisticsOptions.DEFAULT_LOAD_SHEDDING_LIMIT)
{
    // Range check stays first so the exception for an invalid percentage is unchanged.
    if (percentOfSiloSheddingLimit < 0 || percentOfSiloSheddingLimit > 100)
    {
        throw new ArgumentOutOfRangeException(nameof(percentOfSiloSheddingLimit), "Percent value must be between 0-100");
    }

    // Previously a null options surfaced as a NullReferenceException on the dereference below.
    if (options == null)
    {
        throw new ArgumentNullException(nameof(options));
    }

    // Start shedding before silo reaches shedding limit.
    int scaledLimit = (int)(options.LoadSheddingLimit * (percentOfSiloSheddingLimit / 100.0));
    return new LoadShedQueueFlowController(scaledLimit, options);
}
/// <summary>
/// Creates a receiver for one EventHub partition. All required collaborators are validated,
/// stored, and a logger named after the hub path and partition is created.
/// </summary>
/// <param name="settings">Partition settings; <c>Hub.Path</c> and <c>Partition</c> are used in the logger name.</param>
/// <param name="cacheFactory">Factory producing the <see cref="IEventHubQueueCache"/>.</param>
/// <param name="checkpointerFactory">Factory producing the stream queue checkpointer.</param>
/// <param name="loggerFactory">Factory used to create this receiver's logger.</param>
/// <param name="monitor">Receiver monitor stored by this instance.</param>
/// <param name="statisticsOptions">Silo statistics options stored by this instance.</param>
/// <param name="telemetryProducer">Telemetry producer stored by this instance.</param>
/// <param name="eventHubReceiverFactory">
/// Optional factory for the underlying receiver; when null, <c>EventHubAdapterReceiver.CreateReceiver</c> is used.
/// </param>
/// <exception cref="ArgumentNullException">Any argument other than <paramref name="eventHubReceiverFactory"/> is null.</exception>
public EventHubAdapterReceiver(
    EventHubPartitionSettings settings,
    Func<string, IStreamQueueCheckpointer<string>, ILoggerFactory, ITelemetryProducer, IEventHubQueueCache> cacheFactory,
    Func<string, Task<IStreamQueueCheckpointer<string>>> checkpointerFactory,
    ILoggerFactory loggerFactory,
    IQueueAdapterReceiverMonitor monitor,
    SiloStatisticsOptions statisticsOptions,
    ITelemetryProducer telemetryProducer,
    Func<EventHubPartitionSettings, string, ILogger, ITelemetryProducer, Task<IEventHubReceiver>> eventHubReceiverFactory = null)
{
    // Validation runs in parameter order, and nothing else executes until every check has passed.
    this.settings = settings ?? throw new ArgumentNullException(nameof(settings));
    this.cacheFactory = cacheFactory ?? throw new ArgumentNullException(nameof(cacheFactory));
    this.checkpointerFactory = checkpointerFactory ?? throw new ArgumentNullException(nameof(checkpointerFactory));
    this.loggerFactory = loggerFactory ?? throw new ArgumentNullException(nameof(loggerFactory));
    this.monitor = monitor ?? throw new ArgumentNullException(nameof(monitor));
    this.statisticsOptions = statisticsOptions ?? throw new ArgumentNullException(nameof(statisticsOptions));
    this.telemetryProducer = telemetryProducer ?? throw new ArgumentNullException(nameof(telemetryProducer));

    string loggerName = $"{this.GetType().FullName}.{settings.Hub.Path}.{settings.Partition}";
    this.logger = this.loggerFactory.CreateLogger(loggerName);

    // Fall back to the built-in receiver factory when the caller did not supply one.
    this.eventHubReceiverFactory = eventHubReceiverFactory ?? EventHubAdapterReceiver.CreateReceiver;
}
/// <summary>
/// Creates the silo performance metrics object: stores the supplied statistics sources,
/// resets the reporting frequency and overload flags, creates its logger, and registers
/// the <c>RUNTIME_IS_OVERLOADED</c> string statistic backed by <c>IsOverloaded</c>.
/// </summary>
/// <param name="hostEnvironmentStatistics">Host environment statistics source stored by this instance.</param>
/// <param name="appEnvironmentStatistics">Application environment statistics source stored by this instance.</param>
/// <param name="loggerFactory">Factory stored by this instance and used to create its logger.</param>
/// <param name="statisticsOptions">Options wrapper; its <c>Value</c> is stored.</param>
internal SiloPerformanceMetrics(
    IHostEnvironmentStatistics hostEnvironmentStatistics,
    IAppEnvironmentStatistics appEnvironmentStatistics,
    ILoggerFactory loggerFactory,
    IOptions<SiloStatisticsOptions> statisticsOptions)
{
    // Collaborators.
    this.hostEnvironmentStatistics = hostEnvironmentStatistics;
    this.appEnvironmentStatistics = appEnvironmentStatistics;
    this.loggerFactory = loggerFactory;

    // Initial state: no reporting interval and no overload recorded.
    overloadValue = false;
    overloadLatched = false;
    reportFrequency = TimeSpan.Zero;

    this.logger = loggerFactory.CreateLogger<SiloPerformanceMetrics>();
    this.statisticsOptions = statisticsOptions.Value;

    // The statistic evaluates IsOverloaded each time its value delegate is invoked.
    StringValueStatistic.FindOrCreate(
        StatisticNames.RUNTIME_IS_OVERLOADED,
        () => IsOverloaded.ToString());
}
/// <summary>
/// Starts the grain-facing runtime services in order: transaction agent, grain services,
/// membership oracle, type manager, the multicluster oracle (only when one is configured),
/// silo statistics, the deployment load publisher and finally the platform watchdog.
/// Every step is executed through the *WithPerfAnalysis helpers using one shared stopwatch.
/// </summary>
/// <param name="ct">Cancellation token supplied by the caller; this method body does not observe it.</param>
/// <returns>A task that completes when all steps have finished.</returns>
/// <remarks>
/// A failure in the statistics / load publisher / watchdog section is logged with
/// <c>ErrorCode.Runtime_Error_100330</c> and then rethrown.
/// </remarks>
private async Task OnRuntimeGrainServicesStart(CancellationToken ct)
{
    var timer = Stopwatch.StartNew();

    await StartAsyncTaskWithPerfAnalysis(
        "Init transaction agent",
        async () =>
        {
            ITransactionAgent agent = this.Services.GetRequiredService<ITransactionAgent>();
            // The context stays null when the agent is not a SystemTarget.
            ISchedulingContext agentContext = (agent as SystemTarget)?.SchedulingContext;
            await scheduler
                .QueueTask(agent.Start, agentContext)
                .WithTimeout(initTimeout, $"Starting TransactionAgent failed due to timeout {initTimeout}");
        },
        timer);

    // Load and init grain services before silo becomes active.
    await StartAsyncTaskWithPerfAnalysis("Init grain services", () => CreateGrainServices(), timer);

    // Use the oracle's own scheduling context when it is a SystemTarget, otherwise the fallback scheduler's.
    var membershipTarget = this.membershipOracle as SystemTarget;
    this.membershipOracleContext = membershipTarget?.SchedulingContext ?? this.fallbackScheduler.SchedulingContext;

    await StartAsyncTaskWithPerfAnalysis(
        "Starting local silo status oracle",
        async () =>
        {
            await scheduler
                .QueueTask(() => this.membershipOracle.Start(), this.membershipOracleContext)
                .WithTimeout(initTimeout, $"Starting MembershipOracle failed due to timeout {initTimeout}");
            logger.Debug("Local silo status oracle created successfully.");
        },
        timer);

    var versionStore = Services.GetService<IVersionStore>();
    await StartAsyncTaskWithPerfAnalysis(
        "Init type manager",
        () => scheduler
            .QueueTask(() => this.typeManager.Initialize(versionStore), this.typeManager.SchedulingContext)
            .WithTimeout(this.initTimeout, $"TypeManager Initializing failed due to timeout {initTimeout}"),
        timer);

    // When running in a multi-cluster scenario, start the MultiClusterNetwork oracle.
    if (this.multiClusterOracle != null)
    {
        await StartAsyncTaskWithPerfAnalysis(
            "Start multicluster oracle",
            async () =>
            {
                logger.Info(
                    "Starting multicluster oracle with my ServiceId={0} and ClusterId={1}.",
                    this.clusterOptions.ServiceId,
                    this.clusterOptions.ClusterId);

                var multiClusterTarget = multiClusterOracle as SystemTarget;
                this.multiClusterOracleContext = multiClusterTarget?.SchedulingContext ?? this.fallbackScheduler.SchedulingContext;

                await scheduler
                    .QueueTask(() => multiClusterOracle.Start(), multiClusterOracleContext)
                    .WithTimeout(initTimeout, $"Starting MultiClusterOracle failed due to timeout {initTimeout}");
                logger.Debug("multicluster oracle created successfully.");
            },
            timer);
    }

    try
    {
        SiloStatisticsOptions statsOptions = Services.GetRequiredService<IOptions<SiloStatisticsOptions>>().Value;
        StartTaskWithPerfAnalysis("Start silo statistics", () => this.siloStatistics.Start(statsOptions), timer);
        logger.Debug("Silo statistics manager started successfully.");

        // Finally, initialize the deployment load collector, for grains with load-based placement.
        await StartAsyncTaskWithPerfAnalysis(
            "Start deployment load collector",
            async () =>
            {
                var loadPublisher = Services.GetRequiredService<DeploymentLoadPublisher>();
                await this.scheduler
                    .QueueTask(loadPublisher.Start, loadPublisher.SchedulingContext)
                    .WithTimeout(this.initTimeout, $"Starting DeploymentLoadPublisher failed due to timeout {initTimeout}");
                logger.Debug("Silo deployment load publisher started successfully.");
            },
            timer);

        // Start background timer tick to watch for platform execution stalls, such as when GC kicks in.
        this.platformWatchdog = new Watchdog(
            statsOptions.LogWriteInterval,
            this.healthCheckParticipants,
            this.executorService,
            this.loggerFactory);
        this.platformWatchdog.Start();

        if (this.logger.IsEnabled(LogLevel.Debug))
        {
            logger.Debug("Silo platform watchdog started successfully.");
        }
    }
    catch (Exception exc)
    {
        // Logging goes through SafeExecute; the original exception is always rethrown.
        this.SafeExecute(() => this.logger.Error(
            ErrorCode.Runtime_Error_100330,
            String.Format("Error starting silo {0}. Going to FastKill().", this.SiloAddress),
            exc));
        throw;
    }

    if (logger.IsEnabled(LogLevel.Debug))
    {
        logger.Debug("Silo.Start complete: System status = {0}", this.SystemStatus);
    }
}
/// <summary>
/// Creates a flow controller triggered when the CPU reaches the specified limit.
/// Note: Triggered only when load shedding is enabled.
/// </summary>
/// <param name="loadSheddingLimit">Percentage of CPU (0-100) which triggers queue read rate reduction.</param>
/// <param name="options">The silo statistics options passed through to the controller.</param>
/// <returns>A new <see cref="LoadShedQueueFlowController"/> using the given limit.</returns>
/// <exception cref="ArgumentOutOfRangeException"><paramref name="loadSheddingLimit"/> is outside 0-100.</exception>
public static IQueueFlowController CreateAsPercentageOfCPU(int loadSheddingLimit, SiloStatisticsOptions options)
{
    bool limitIsValid = loadSheddingLimit >= 0 && loadSheddingLimit <= 100;
    if (!limitIsValid)
    {
        throw new ArgumentOutOfRangeException(nameof(loadSheddingLimit), "Value must be between 0-100");
    }

    IQueueFlowController controller = new LoadShedQueueFlowController(loadSheddingLimit, options);
    return controller;
}