/// <summary>
/// Renews the lock on a job request in a loop while the job is running.
/// The loop ends when <paramref name="token"/> is cancelled, when the server reports that
/// the job or its token is no longer valid, or when the retry budget is used up.
/// </summary>
/// <param name="poolId">Pool id forwarded to the renew call.</param>
/// <param name="requestId">Id of the job request whose lock is renewed.</param>
/// <param name="lockToken">Lock token forwarded to the renew call.</param>
/// <param name="firstJobRequestRenewed">Completed with 0 once the first renew succeeds.</param>
/// <param name="token">Cancellation token that stops the renew loop.</param>
public async Task RenewJobRequestAsync(int poolId, long requestId, Guid lockToken, TaskCompletionSource<int> firstJobRequestRenewed, CancellationToken token)
{
    var server = HostContext.GetService<IAgentServer>();
    TaskAgentJobRequest lastRenewed = null;
    int firstRenewAttemptsLeft = 5;
    int failureStreak = 0;

    while (!token.IsCancellationRequested)
    {
        try
        {
            lastRenewed = await server.RenewAgentRequestAsync(poolId, requestId, lockToken, token);
            Trace.Info($"Successfully renew job request {requestId}, job is valid till {lastRenewed.LockedUntil.Value}");

            // Signal the first successful renew exactly once.
            if (!firstJobRequestRenewed.Task.IsCompleted)
            {
                firstJobRequestRenewed.TrySetResult(0);
            }

            // Coming back from a failure streak: restore the 60s connection timeout.
            if (failureStreak > 0)
            {
                failureStreak = 0;
                server.SetConnectionTimeout(AgentConnectionType.JobRequest, TimeSpan.FromSeconds(60));
                HostContext.WritePerfCounter("JobRenewRecovered");
            }

            // Wait 60 seconds before the next renew.
            await HostContext.Delay(TimeSpan.FromSeconds(60), token);
        }
        catch (TaskAgentJobNotFoundException)
        {
            // The job is gone; retrying cannot help.
            Trace.Info($"TaskAgentJobNotFoundException received when renew job request {requestId}, job is no longer valid, stop renew job request.");
            return;
        }
        catch (TaskAgentJobTokenExpiredException)
        {
            // The job token expired; retrying cannot help.
            Trace.Info($"TaskAgentJobTokenExpiredException received renew job request {requestId}, job is no longer valid, stop renew job request.");
            return;
        }
        catch (OperationCanceledException) when (token.IsCancellationRequested)
        {
            // An OperationCanceledException can also come from an http timeout;
            // only stop when our own token was actually cancelled.
            Trace.Info($"job renew has been canceled, stop renew job request {requestId}.");
            return;
        }
        catch (Exception ex)
        {
            Trace.Error($"Catch exception during renew agent jobrequest {requestId}.");
            Trace.Error(ex);
            failureStreak++;

            // Work out whether any retry budget is left.
            TimeSpan remainingTime = TimeSpan.Zero;
            if (firstJobRequestRenewed.Task.IsCompleted)
            {
                // After the first renew: keep retrying until LockedUntil plus a 5 minute buffer.
                remainingTime = lastRenewed.LockedUntil.Value + TimeSpan.FromMinutes(5) - DateTime.UtcNow;
            }
            else if (firstRenewAttemptsLeft-- > 0)
            {
                // Before the first renew: at most 5 retries.
                remainingTime = TimeSpan.FromSeconds(10);
            }

            if (remainingTime <= TimeSpan.Zero)
            {
                Trace.Info($"Lock renewal has run out of retry, stop renew lock for jobrequest {requestId}.");
                HostContext.WritePerfCounter("JobRenewReachLimit");
                return;
            }

            TimeSpan delayTime;
            if (firstJobRequestRenewed.Task.IsCompleted)
            {
                Trace.Info($"Retrying lock renewal for jobrequest {requestId}. Job is valid until {lastRenewed.LockedUntil.Value}.");
                // Back off harder once more than 5 consecutive failures were seen.
                delayTime = failureStreak > 5
                    ? BackoffTimerHelper.GetRandomBackoff(TimeSpan.FromSeconds(15), TimeSpan.FromSeconds(30))
                    : BackoffTimerHelper.GetRandomBackoff(TimeSpan.FromSeconds(5), TimeSpan.FromSeconds(15));
            }
            else
            {
                Trace.Info($"Retrying lock renewal for jobrequest {requestId}. The first job renew request has failed.");
                delayTime = BackoffTimerHelper.GetRandomBackoff(TimeSpan.FromSeconds(1), TimeSpan.FromSeconds(10));
            }

            // Re-establish the connection (avoids affinity with one server) using a
            // shorter 30 second timeout while we are in the failing state.
            HostContext.WritePerfCounter("ResetJobRenewConnection");
            await server.RefreshConnectionAsync(AgentConnectionType.JobRequest, TimeSpan.FromSeconds(30));

            try
            {
                // Back off before the next attempt.
                await HostContext.Delay(delayTime, token);
            }
            catch (OperationCanceledException) when (token.IsCancellationRequested)
            {
                // Swallowed on purpose: the loop condition ends the method on the next check.
                Trace.Info($"job renew has been canceled, stop renew job request {requestId}.");
            }
        }
    }
}
/// <summary>
/// Polls the server for the next message of the current session and returns the first
/// non-null message. Retriable failures are retried with a random backoff; an expired
/// session is re-created unless <c>_settings.SkipSessionRecover</c> is set.
/// </summary>
/// <param name="token">Cancels the polling loop; cancellation is rethrown to the caller.</param>
/// <returns>The next message, after it has been passed through <c>DecryptMessage</c>.</returns>
public async Task<TaskAgentMessage> GetNextMessageAsync(CancellationToken token)
{
    Trace.Entering();
    ArgUtil.NotNull(_session, nameof(_session));
    ArgUtil.NotNull(_settings, nameof(_settings));

    bool encounteringError = false;
    int continuousError = 0;
    Stopwatch heartbeat = Stopwatch.StartNew();

    while (true)
    {
        token.ThrowIfCancellationRequested();
        TaskAgentMessage message = null;
        try
        {
            message = await _runnerServer.GetAgentMessageAsync(_settings.PoolId,
                                                               _session.SessionId,
                                                               _lastMessageId,
                                                               token);

            // Decrypt the message body if the session is using encryption
            message = DecryptMessage(message);

            if (message != null)
            {
                _lastMessageId = message.MessageId;
            }

            if (encounteringError) //print the message once only if there was an error
            {
                _term.WriteLine($"{DateTime.UtcNow:u}: Runner reconnected.");
                encounteringError = false;
                continuousError = 0;
            }
        }
        catch (OperationCanceledException) when (token.IsCancellationRequested)
        {
            Trace.Info("Get next message has been cancelled.");
            throw;
        }
        catch (TaskAgentAccessTokenExpiredException)
        {
            Trace.Info("Runner OAuth token has been revoked. Unable to pull message.");
            throw;
        }
        catch (Exception ex)
        {
            Trace.Error("Catch exception during get next message.");
            Trace.Error(ex);

            // don't retry if SkipSessionRecover = true, DT service will delete agent session to stop agent from taking more jobs.
            if (ex is TaskAgentSessionExpiredException && !_settings.SkipSessionRecover && await CreateSessionAsync(token))
            {
                Trace.Info($"{nameof(TaskAgentSessionExpiredException)} received, recovered by recreate session.");
            }
            else if (!IsGetNextMessageExceptionRetriable(ex))
            {
                throw;
            }
            else
            {
                continuousError++;

                // Retry after a random backoff to avoid service throttling when a service
                // error kicks every agent off the long poll and they all reconnect at once:
                // [15, 30] seconds for the first 5 consecutive errors, [30, 60] afterwards.
                bool earlyRetry = continuousError <= 5;
                TimeSpan minBackoff = TimeSpan.FromSeconds(earlyRetry ? 15 : 30);
                TimeSpan maxBackoff = TimeSpan.FromSeconds(earlyRetry ? 30 : 60);
                _getNextMessageRetryInterval = BackoffTimerHelper.GetRandomBackoff(minBackoff, maxBackoff, _getNextMessageRetryInterval);

                if (!encounteringError)
                {
                    //print error only on the first consecutive error
                    _term.WriteError($"{DateTime.UtcNow:u}: Runner connect error: {ex.Message}. Retrying until reconnected.");
                    encounteringError = true;
                }

                // re-create VssConnection before next retry
                await _runnerServer.RefreshConnectionAsync(RunnerConnectionType.MessageQueue, TimeSpan.FromSeconds(60));

                Trace.Info("Sleeping for {0} seconds before retrying.", _getNextMessageRetryInterval.TotalSeconds);
                await HostContext.Delay(_getNextMessageRetryInterval, token);
            }
        }

        if (message == null)
        {
            // Log at Info level at most once every 30 minutes; otherwise stay at Verbose.
            if (heartbeat.Elapsed > TimeSpan.FromMinutes(30))
            {
                Trace.Info($"No message retrieved from session '{_session.SessionId}' within last 30 minutes.");
                heartbeat.Restart();
            }
            else
            {
                Trace.Verbose($"No message retrieved from session '{_session.SessionId}'.");
            }

            continue;
        }

        Trace.Info($"Message '{message.MessageId}' received from session '{_session.SessionId}'.");
        return message;
    }
}
/// <summary>
/// Loads settings and credentials, connects to the runner server and creates an agent
/// session, retrying retriable failures after <c>_sessionCreationRetryInterval</c>.
/// </summary>
/// <param name="token">Cancels the attempt loop; cancellation is rethrown to the caller.</param>
/// <returns>
/// <c>true</c> once a session was created; <c>false</c> when the failure is not retriable
/// or the OAuth validation reports <c>invalid_client</c>.
/// </returns>
public async Task<Boolean> CreateSessionAsync(CancellationToken token)
{
    Trace.Entering();

    // Settings
    var configManager = HostContext.GetService<IConfigurationManager>();
    _settings = configManager.LoadSettings();
    var serverUrl = _settings.ServerUrl;
    Trace.Info(_settings);

    // Create connection.
    Trace.Info("Loading Credentials");
    var credMgr = HostContext.GetService<ICredentialManager>();
    VssCredentials creds = credMgr.LoadCredentials();

    var agent = new TaskAgentReference
    {
        Id = _settings.AgentId,
        Name = _settings.AgentName,
        Version = BuildConstants.RunnerPackage.Version,
        OSDescription = RuntimeInformation.OSDescription,
    };
    string sessionName = $"{Environment.MachineName ?? "RUNNER"}";
    var taskAgentSession = new TaskAgentSession(sessionName, agent);

    bool encounteringError = false;

    while (true)
    {
        token.ThrowIfCancellationRequested();
        Trace.Info("Attempt to create session.");
        try
        {
            Trace.Info("Connecting to the Runner Server...");
            await _runnerServer.ConnectAsync(new Uri(serverUrl), creds);
            Trace.Info("VssConnection created");

            _term.WriteLine();
            _term.WriteSuccessMessage("Connected to GitHub");
            _term.WriteLine();

            _session = await _runnerServer.CreateAgentSessionAsync(
                                                _settings.PoolId,
                                                taskAgentSession,
                                                token);

            Trace.Info("Session created.");
            if (encounteringError)
            {
                _term.WriteLine($"{DateTime.UtcNow:u}: Runner reconnected.");
                _sessionCreationExceptionTracker.Clear();
                encounteringError = false;
            }

            return true;
        }
        catch (OperationCanceledException) when (token.IsCancellationRequested)
        {
            Trace.Info("Session creation has been cancelled.");
            throw;
        }
        catch (TaskAgentAccessTokenExpiredException)
        {
            Trace.Info("Runner OAuth token has been revoked. Session creation failed.");
            throw;
        }
        catch (Exception ex)
        {
            Trace.Error("Catch exception during create session.");
            Trace.Error(ex);

            if (ex is VssOAuthTokenRequestException && creds.Federated is VssOAuthCredential vssOAuthCred)
            {
                // Check whether we get 401 because the runner registration already removed by the service.
                // If the runner registration get deleted, we can't exchange oauth token.
                Trace.Error("Test oauth app registration.");
                var oauthTokenProvider = new VssOAuthTokenProvider(vssOAuthCred, new Uri(serverUrl));
                var authError = await oauthTokenProvider.ValidateCredentialAsync(token);
                if (string.Equals(authError, "invalid_client", StringComparison.OrdinalIgnoreCase))
                {
                    _term.WriteError("Failed to create a session. The runner registration has been deleted from the server, please re-configure.");
                    return false;
                }
            }

            if (!IsSessionCreationExceptionRetriable(ex))
            {
                _term.WriteError($"Failed to create session. {ex.Message}");
                return false;
            }

            if (!encounteringError) //print the message only on the first error
            {
                _term.WriteError($"{DateTime.UtcNow:u}: Runner connect error: {ex.Message}. Retrying until reconnected.");
                encounteringError = true;
            }

            Trace.Info("Sleeping for {0} seconds before retrying.", _sessionCreationRetryInterval.TotalSeconds);
            await HostContext.Delay(_sessionCreationRetryInterval, token);
        }
    }
}
/// <summary>
/// Polls the agent server for the next message of the current session and returns the
/// first non-null message. An expired session is re-created; other retriable failures
/// are retried after <c>_getNextMessageRetryInterval</c>.
/// </summary>
/// <param name="token">Cancels the polling loop; cancellation is rethrown to the caller.</param>
/// <returns>The next message, after it has been passed through <c>DecryptMessage</c>.</returns>
public async Task<TaskAgentMessage> GetNextMessageAsync(CancellationToken token)
{
    Trace.Entering();
    ArgUtil.NotNull(_session, nameof(_session));
    ArgUtil.NotNull(_settings, nameof(_settings));

    bool encounteringError = false;
    Stopwatch heartbeat = new Stopwatch();
    heartbeat.Restart();

    while (true)
    {
        token.ThrowIfCancellationRequested();
        TaskAgentMessage message = null;
        try
        {
            message = await _agentServer.GetAgentMessageAsync(_settings.PoolId,
                                                              _session.SessionId,
                                                              _lastMessageId,
                                                              token);

            // Decrypt the message body if the session is using encryption
            message = DecryptMessage(message);

            if (message != null)
            {
                _lastMessageId = message.MessageId;
            }

            if (encounteringError) //print the message once only if there was an error
            {
                _term.WriteLine(StringUtil.Loc("QueueConnected", DateTime.UtcNow));
                encounteringError = false;
            }
        }
        catch (OperationCanceledException) when (token.IsCancellationRequested)
        {
            Trace.Info("Get next message has been cancelled.");
            throw;
        }
        catch (Exception ex)
        {
            Trace.Error("Catch exception during get next message.");
            Trace.Error(ex);

            if (ex is TaskAgentSessionExpiredException && await CreateSessionAsync(token))
            {
                // Session was re-created; fall through and poll again without a delay.
                Trace.Info($"{nameof(TaskAgentSessionExpiredException)} received, recovered by recreate session.");
            }
            else if (!IsGetNextMessageExceptionRetriable(ex))
            {
                throw;
            }
            else
            {
                //retry after a delay
                if (!encounteringError)
                {
                    //print error only on the first consecutive error
                    _term.WriteError(StringUtil.Loc("QueueConError", DateTime.UtcNow, ex.Message, _getNextMessageRetryInterval.TotalSeconds));
                    encounteringError = true;
                }

                Trace.Info("Sleeping for {0} seconds before retrying.", _getNextMessageRetryInterval.TotalSeconds);
                await HostContext.Delay(_getNextMessageRetryInterval, token);
            }
        }

        if (message == null)
        {
            // Log at Info level at most once every 30 minutes; otherwise stay at Verbose.
            if (heartbeat.Elapsed > TimeSpan.FromMinutes(30))
            {
                Trace.Info($"No message retrieved from session '{_session.SessionId}' within last 30 minutes.");
                heartbeat.Restart();
            }
            else
            {
                Trace.Verbose($"No message retrieved from session '{_session.SessionId}'.");
            }

            continue;
        }

        Trace.Info($"Message '{message.MessageId}' received from session '{_session.SessionId}'.");
        return message;
    }
}
/// <summary>
/// Loads settings, scans capabilities, loads credentials, then connects to the agent
/// server and creates an agent session, retrying retriable failures after
/// <c>_sessionCreationRetryInterval</c>.
/// </summary>
/// <param name="token">Cancels the attempt loop; cancellation is rethrown to the caller.</param>
/// <returns><c>true</c> once a session was created; <c>false</c> when the failure is not retriable.</returns>
public async Task<Boolean> CreateSessionAsync(CancellationToken token)
{
    Trace.Entering();

    // Settings
    var configManager = HostContext.GetService<IConfigurationManager>();
    _settings = configManager.LoadSettings();
    var serverUrl = _settings.ServerUrl;
    Trace.Info(_settings);

    // Capabilities.
    _term.WriteLine(StringUtil.Loc("ScanToolCapabilities"));
    Dictionary<string, string> systemCapabilities = await HostContext.GetService<ICapabilitiesManager>().GetCapabilitiesAsync(_settings, token);

    // Create connection.
    Trace.Verbose("Loading Credentials");
    var credMgr = HostContext.GetService<ICredentialManager>();
    VssCredentials creds = credMgr.LoadCredentials();
    Uri uri = new Uri(serverUrl);
    VssConnection conn = ApiUtil.CreateConnection(uri, creds);

    var agent = new TaskAgentReference
    {
        Id = _settings.AgentId,
        Name = _settings.AgentName,
        Version = Constants.Agent.Version,
    };
    string sessionName = $"{Environment.MachineName ?? "AGENT"}";
    var taskAgentSession = new TaskAgentSession(sessionName, agent, systemCapabilities);

    bool encounteringError = false;
    _term.WriteLine(StringUtil.Loc("ConnectToServer"));

    while (true)
    {
        token.ThrowIfCancellationRequested();
        Trace.Info("Attempt to create session.");
        try
        {
            Trace.Info("Connecting to the Agent Server...");
            // The same connection object is reused on every attempt.
            await _agentServer.ConnectAsync(conn);

            _session = await _agentServer.CreateAgentSessionAsync(
                                                _settings.PoolId,
                                                taskAgentSession,
                                                token);

            Trace.Info("Session created.");
            if (encounteringError)
            {
                _term.WriteLine(StringUtil.Loc("QueueConnected", DateTime.UtcNow));
                _sessionCreationExceptionTracker.Clear();
                encounteringError = false;
            }

            return true;
        }
        catch (OperationCanceledException) when (token.IsCancellationRequested)
        {
            Trace.Info("Session creation has been cancelled.");
            throw;
        }
        catch (Exception ex)
        {
            Trace.Error("Catch exception during create session.");
            Trace.Error(ex);

            if (!IsSessionCreationExceptionRetriable(ex))
            {
                _term.WriteError(StringUtil.Loc("SessionCreateFailed", ex.Message));
                return false;
            }

            if (!encounteringError) //print the message only on the first error
            {
                _term.WriteError(StringUtil.Loc("QueueConError", DateTime.UtcNow, ex.Message, _sessionCreationRetryInterval.TotalSeconds));
                encounteringError = true;
            }

            Trace.Info("Sleeping for {0} seconds before retrying.", _sessionCreationRetryInterval.TotalSeconds);
            await HostContext.Delay(_sessionCreationRetryInterval, token);
        }
    }
}
/// <summary>
/// Background loop that, after a random 30–45 minute delay per round, asks the server for
/// an updated OAuth authorization url. When a different url is returned it builds a
/// credential for it, verifies it by connecting a fresh <c>IRunnerServer</c>, saves it as
/// the migrated credential, and returns it.
/// </summary>
/// <param name="token">Cancels the loop; cancellation surfaces as an exception.</param>
/// <returns>The credential built for the new authorization url.</returns>
private async Task<VssCredentials> GetNewOAuthAuthorizationSetting(CancellationToken token)
{
    Trace.Info("Start checking oauth authorization url update.");
    while (true)
    {
        var backoff = BackoffTimerHelper.GetRandomBackoff(TimeSpan.FromMinutes(30), TimeSpan.FromMinutes(45));
        await HostContext.Delay(backoff, token);

        try
        {
            var migratedAuthorizationUrl = await _runnerServer.GetRunnerAuthUrlAsync(_settings.PoolId, _settings.AgentId);
            if (!string.IsNullOrEmpty(migratedAuthorizationUrl))
            {
                var credData = _configStore.GetCredentials();
                var clientId = credData.Data.GetValueOrDefault("clientId", null);
                var currentAuthorizationUrl = credData.Data.GetValueOrDefault("authorizationUrl", null);
                Trace.Info($"Current authorization url: {currentAuthorizationUrl}, new authorization url: {migratedAuthorizationUrl}");

                if (string.Equals(currentAuthorizationUrl, migratedAuthorizationUrl, StringComparison.OrdinalIgnoreCase))
                {
                    // We don't need to update credentials.
                    // Park here until the token is cancelled; the wait never completes on its own.
                    Trace.Info("No needs to update authorization url");
                    await Task.Delay(Timeout.InfiniteTimeSpan, token);
                }

                var keyManager = HostContext.GetService<IRSAKeyManager>();
                var signingCredentials = VssSigningCredentials.Create(() => keyManager.GetKey());

                var migratedClientCredential = new VssOAuthJwtBearerClientCredential(clientId, migratedAuthorizationUrl, signingCredentials);
                var migratedRunnerCredential = new VssOAuthCredential(new Uri(migratedAuthorizationUrl, UriKind.Absolute), VssOAuthGrant.ClientCredentials, migratedClientCredential);

                // Verify the new credential on a separate server instance before saving it.
                Trace.Info("Try connect service with Token Service OAuth endpoint.");
                var runnerServer = HostContext.CreateService<IRunnerServer>();
                await runnerServer.ConnectAsync(new Uri(_settings.ServerUrl), migratedRunnerCredential);
                await runnerServer.GetAgentPoolsAsync();
                Trace.Info("Successfully connected service with new authorization url.");

                var migratedCredData = new CredentialData
                {
                    Scheme = Constants.Configuration.OAuth,
                    Data =
                    {
                        { "clientId", clientId },
                        { "authorizationUrl", migratedAuthorizationUrl },
                        { "oauthEndpointUrl", migratedAuthorizationUrl },
                    },
                };

                _configStore.SaveMigratedCredential(migratedCredData);
                return migratedRunnerCredential;
            }
            else
            {
                Trace.Verbose("No authorization url updates");
            }
        }
        catch (OperationCanceledException) when (token.IsCancellationRequested)
        {
            // Cancellation is not a migration failure: do not log it as an error or
            // report it to the server, just propagate it.
            Trace.Info("Checking oauth authorization url update has been cancelled.");
            throw;
        }
        catch (Exception ex)
        {
            Trace.Error("Fail to get/test new authorization url.");
            Trace.Error(ex);

            try
            {
                await _runnerServer.ReportRunnerAuthUrlErrorAsync(_settings.PoolId, _settings.AgentId, ex.ToString());
            }
            catch (Exception e)
            {
                // best effort
                Trace.Error("Fail to report the migration error");
                Trace.Error(e);
            }
        }
    }
}
/// <summary>
/// Loads settings and credentials (migrated credentials unless the
/// <c>GITHUB_ACTIONS_RUNNER_SPSAUTHURL</c> environment variable converts to true),
/// connects to the runner server and creates an agent session. Retriable failures are
/// retried after <c>_sessionCreationRetryInterval</c>; a non-retriable failure while on
/// migrated credentials falls back to the original credentials and retries.
/// </summary>
/// <param name="token">Cancels the attempt loop; cancellation is rethrown to the caller.</param>
/// <returns>
/// <c>true</c> once a session was created; <c>false</c> on a non-retriable failure while
/// already using the original credentials.
/// </returns>
public async Task<Boolean> CreateSessionAsync(CancellationToken token)
{
    Trace.Entering();

    // Settings
    var configManager = HostContext.GetService<IConfigurationManager>();
    _settings = configManager.LoadSettings();
    var serverUrl = _settings.ServerUrl;
    Trace.Info(_settings);

    // Create connection.
    Trace.Info("Loading Credentials");
    _useMigratedCredentials = !StringUtil.ConvertToBoolean(Environment.GetEnvironmentVariable("GITHUB_ACTIONS_RUNNER_SPSAUTHURL"));
    VssCredentials creds = _credMgr.LoadCredentials(_useMigratedCredentials);

    var agent = new TaskAgentReference
    {
        Id = _settings.AgentId,
        Name = _settings.AgentName,
        Version = BuildConstants.RunnerPackage.Version,
        OSDescription = RuntimeInformation.OSDescription,
    };
    string sessionName = $"{Environment.MachineName ?? "RUNNER"}";
    var taskAgentSession = new TaskAgentSession(sessionName, agent);

    bool encounteringError = false;

    var originalCreds = _configStore.GetCredentials();
    var migratedCreds = _configStore.GetMigratedCredentials();
    if (migratedCreds == null)
    {
        // Nothing migrated yet: stay on the original credentials and, for OAuth,
        // remember that the authorization url still has to be checked for updates.
        _useMigratedCredentials = false;
        if (originalCreds.Scheme == Constants.Configuration.OAuth)
        {
            _needToCheckAuthorizationUrlUpdate = true;
        }
    }

    while (true)
    {
        token.ThrowIfCancellationRequested();
        Trace.Info("Attempt to create session.");
        try
        {
            Trace.Info("Connecting to the Runner Server...");
            await _runnerServer.ConnectAsync(new Uri(serverUrl), creds);
            Trace.Info("VssConnection created");

            _term.WriteLine();
            _term.WriteSuccessMessage("Connected to GitHub");
            _term.WriteLine();

            _session = await _runnerServer.CreateAgentSessionAsync(
                                                _settings.PoolId,
                                                taskAgentSession,
                                                token);

            Trace.Info("Session created.");
            if (encounteringError)
            {
                _term.WriteLine($"{DateTime.UtcNow:u}: Runner reconnected.");
                _sessionCreationExceptionTracker.Clear();
                encounteringError = false;
            }

            if (_needToCheckAuthorizationUrlUpdate)
            {
                // Start the background task that tries to get a new authorization url.
                // This method can run again while a previous background task is still
                // polling (or holds a result that has not been consumed yet), so only
                // start a new one when there is none or the previous one failed or
                // was cancelled.
                var existingCheck = _authorizationUrlMigrationBackgroundTask;
                if (existingCheck == null || existingCheck.IsFaulted || existingCheck.IsCanceled)
                {
                    _authorizationUrlMigrationBackgroundTask = GetNewOAuthAuthorizationSetting(token);
                }
            }

            return true;
        }
        catch (OperationCanceledException) when (token.IsCancellationRequested)
        {
            Trace.Info("Session creation has been cancelled.");
            throw;
        }
        catch (TaskAgentAccessTokenExpiredException)
        {
            Trace.Info("Runner OAuth token has been revoked. Session creation failed.");
            throw;
        }
        catch (Exception ex)
        {
            Trace.Error("Catch exception during create session.");
            Trace.Error(ex);

            if (!IsSessionCreationExceptionRetriable(ex))
            {
                if (_useMigratedCredentials)
                {
                    // migrated credentials might cause lose permission during permission check,
                    // we will force to use original credential and try again
                    _useMigratedCredentials = false;
                    var reattemptBackoff = BackoffTimerHelper.GetRandomBackoff(TimeSpan.FromHours(24), TimeSpan.FromHours(36));
                    _authorizationUrlRollbackReattemptDelayBackgroundTask = HostContext.Delay(reattemptBackoff, token); // retry migrated creds in 24-36 hours.
                    creds = _credMgr.LoadCredentials(false);
                    Trace.Error("Fallback to original credentials and try again.");
                }
                else
                {
                    _term.WriteError($"Failed to create session. {ex.Message}");
                    return false;
                }
            }

            if (!encounteringError) //print the message only on the first error
            {
                _term.WriteError($"{DateTime.UtcNow:u}: Runner connect error: {ex.Message}. Retrying until reconnected.");
                encounteringError = true;
            }

            Trace.Info("Sleeping for {0} seconds before retrying.", _sessionCreationRetryInterval.TotalSeconds);
            await HostContext.Delay(_sessionCreationRetryInterval, token);
        }
    }
}
/// <summary>
/// Polls the server for the next message of the current session and returns the first
/// non-null message. After each successful poll it also switches the connection to
/// migrated credentials once the background url check has completed, and re-attempts
/// migrated credentials once a rollback delay has elapsed. Retriable failures are retried
/// with a random backoff; a non-retriable failure while on migrated credentials falls
/// back to the original credentials.
/// </summary>
/// <param name="token">Cancels the polling loop; cancellation is rethrown to the caller.</param>
/// <returns>The next message, after it has been passed through <c>DecryptMessage</c>.</returns>
public async Task<TaskAgentMessage> GetNextMessageAsync(CancellationToken token)
{
    Trace.Entering();
    ArgUtil.NotNull(_session, nameof(_session));
    ArgUtil.NotNull(_settings, nameof(_settings));

    bool encounteringError = false;
    int continuousError = 0;
    Stopwatch heartbeat = new Stopwatch();
    heartbeat.Restart();

    while (true)
    {
        token.ThrowIfCancellationRequested();
        TaskAgentMessage message = null;
        try
        {
            message = await _runnerServer.GetAgentMessageAsync(_settings.PoolId,
                                                               _session.SessionId,
                                                               _lastMessageId,
                                                               token);

            // Decrypt the message body if the session is using encryption
            message = DecryptMessage(message);

            if (message != null)
            {
                _lastMessageId = message.MessageId;
            }

            if (encounteringError) //print the message once only if there was an error
            {
                _term.WriteLine($"{DateTime.UtcNow:u}: Runner reconnected.");
                encounteringError = false;
                continuousError = 0;
            }

            if (_needToCheckAuthorizationUrlUpdate &&
                _authorizationUrlMigrationBackgroundTask?.IsCompleted == true)
            {
                if (HostContext.GetService<IJobDispatcher>().Busy ||
                    HostContext.GetService<ISelfUpdater>().Busy)
                {
                    Trace.Info("Job or runner updates in progress, update credentials next time.");
                }
                else
                {
                    try
                    {
                        // Awaiting the completed task yields its result or rethrows its failure.
                        var newCred = await _authorizationUrlMigrationBackgroundTask;
                        await _runnerServer.ConnectAsync(new Uri(_settings.ServerUrl), newCred);
                        Trace.Info("Updated connection to use migrated credential for next GetMessage call.");
                        _useMigratedCredentials = true;
                        _authorizationUrlMigrationBackgroundTask = null;
                        _needToCheckAuthorizationUrlUpdate = false;
                    }
                    catch (Exception ex)
                    {
                        Trace.Error("Fail to refresh connection with new authorization url.");
                        Trace.Error(ex);
                    }
                }
            }

            if (_authorizationUrlRollbackReattemptDelayBackgroundTask?.IsCompleted == true)
            {
                try
                {
                    // We rolled back to the original creds earlier and the rollback delay
                    // (started with a 24-36 hour backoff) has completed; try migrated creds again.
                    Trace.Info("Re-attempt to use migrated credential");
                    var migratedCreds = _credMgr.LoadCredentials();
                    await _runnerServer.ConnectAsync(new Uri(_settings.ServerUrl), migratedCreds);
                    _useMigratedCredentials = true;
                    _authorizationUrlRollbackReattemptDelayBackgroundTask = null;
                }
                catch (Exception ex)
                {
                    Trace.Error("Fail to refresh connection with new authorization url on rollback reattempt.");
                    Trace.Error(ex);
                }
            }
        }
        catch (OperationCanceledException) when (token.IsCancellationRequested)
        {
            Trace.Info("Get next message has been cancelled.");
            throw;
        }
        catch (TaskAgentAccessTokenExpiredException)
        {
            Trace.Info("Runner OAuth token has been revoked. Unable to pull message.");
            throw;
        }
        catch (Exception ex)
        {
            Trace.Error("Catch exception during get next message.");
            Trace.Error(ex);

            // don't retry if SkipSessionRecover = true, DT service will delete agent session to stop agent from taking more jobs.
            if (ex is TaskAgentSessionExpiredException && !_settings.SkipSessionRecover && await CreateSessionAsync(token))
            {
                Trace.Info($"{nameof(TaskAgentSessionExpiredException)} received, recovered by recreate session.");
            }
            else if (!IsGetNextMessageExceptionRetriable(ex))
            {
                if (_useMigratedCredentials)
                {
                    // migrated credentials might cause lose permission during permission check,
                    // we will force to use original credential and try again
                    _useMigratedCredentials = false;
                    var reattemptBackoff = BackoffTimerHelper.GetRandomBackoff(TimeSpan.FromHours(24), TimeSpan.FromHours(36));
                    _authorizationUrlRollbackReattemptDelayBackgroundTask = HostContext.Delay(reattemptBackoff, token); // retry migrated creds in 24-36 hours.
                    var originalCreds = _credMgr.LoadCredentials(false);
                    await _runnerServer.ConnectAsync(new Uri(_settings.ServerUrl), originalCreds);
                    Trace.Error("Fallback to original credentials and try again.");
                }
                else
                {
                    throw;
                }
            }
            else
            {
                continuousError++;

                //retry after a random backoff to avoid service throttling
                //in case of there is a service error happened and all agents get kicked off of the long poll and all agent try to reconnect back at the same time.
                if (continuousError <= 5)
                {
                    // random backoff [15, 30]
                    _getNextMessageRetryInterval = BackoffTimerHelper.GetRandomBackoff(TimeSpan.FromSeconds(15), TimeSpan.FromSeconds(30), _getNextMessageRetryInterval);
                }
                else
                {
                    // more aggressive backoff [30, 60]
                    _getNextMessageRetryInterval = BackoffTimerHelper.GetRandomBackoff(TimeSpan.FromSeconds(30), TimeSpan.FromSeconds(60), _getNextMessageRetryInterval);
                }

                if (!encounteringError)
                {
                    //print error only on the first consecutive error
                    _term.WriteError($"{DateTime.UtcNow:u}: Runner connect error: {ex.Message}. Retrying until reconnected.");
                    encounteringError = true;
                }

                // re-create VssConnection before next retry
                await _runnerServer.RefreshConnectionAsync(RunnerConnectionType.MessageQueue, TimeSpan.FromSeconds(60));

                Trace.Info("Sleeping for {0} seconds before retrying.", _getNextMessageRetryInterval.TotalSeconds);
                await HostContext.Delay(_getNextMessageRetryInterval, token);
            }
        }

        if (message == null)
        {
            // Log at Info level at most once every 30 minutes; otherwise stay at Verbose.
            if (heartbeat.Elapsed > TimeSpan.FromMinutes(30))
            {
                Trace.Info($"No message retrieved from session '{_session.SessionId}' within last 30 minutes.");
                heartbeat.Restart();
            }
            else
            {
                Trace.Verbose($"No message retrieved from session '{_session.SessionId}'.");
            }

            continue;
        }

        Trace.Info($"Message '{message.MessageId}' received from session '{_session.SessionId}'.");
        return message;
    }
}
/// <summary>
/// Loads settings and credentials, connects to the runner server and creates an agent
/// session, retrying retriable failures after <c>_sessionCreationRetryInterval</c>.
/// </summary>
/// <param name="token">Cancels the attempt loop; cancellation is rethrown to the caller.</param>
/// <returns><c>true</c> once a session was created; <c>false</c> when the failure is not retriable.</returns>
public async Task<Boolean> CreateSessionAsync(CancellationToken token)
{
    Trace.Entering();

    // Settings
    var configManager = HostContext.GetService<IConfigurationManager>();
    _settings = configManager.LoadSettings();
    var serverUrl = _settings.ServerUrl;
    Trace.Info(_settings);

    // Create connection.
    Trace.Info("Loading Credentials");
    var credMgr = HostContext.GetService<ICredentialManager>();
    VssCredentials creds = credMgr.LoadCredentials();

    var agent = new TaskAgentReference
    {
        Id = _settings.AgentId,
        Name = _settings.AgentName,
        Version = BuildConstants.RunnerPackage.Version,
        OSDescription = RuntimeInformation.OSDescription,
    };
    string sessionName = $"{Environment.MachineName ?? "RUNNER"}";
    var taskAgentSession = new TaskAgentSession(sessionName, agent);

    bool encounteringError = false;

    while (true)
    {
        token.ThrowIfCancellationRequested();
        Trace.Info("Attempt to create session.");
        try
        {
            Trace.Info("Connecting to the Runner Server...");
            await _runnerServer.ConnectAsync(new Uri(serverUrl), creds);
            Trace.Info("VssConnection created");

            _term.WriteLine();
            _term.WriteSuccessMessage("Connected to GitHub");
            _term.WriteLine();

            _session = await _runnerServer.CreateAgentSessionAsync(
                                                _settings.PoolId,
                                                taskAgentSession,
                                                token);

            Trace.Info("Session created.");
            if (encounteringError)
            {
                _term.WriteLine($"{DateTime.UtcNow:u}: Runner reconnected.");
                _sessionCreationExceptionTracker.Clear();
                encounteringError = false;
            }

            return true;
        }
        catch (OperationCanceledException) when (token.IsCancellationRequested)
        {
            Trace.Info("Session creation has been cancelled.");
            throw;
        }
        catch (TaskAgentAccessTokenExpiredException)
        {
            Trace.Info("Runner OAuth token has been revoked. Session creation failed.");
            throw;
        }
        catch (Exception ex)
        {
            Trace.Error("Catch exception during create session.");
            Trace.Error(ex);

            if (!IsSessionCreationExceptionRetriable(ex))
            {
                _term.WriteError($"Failed to create session. {ex.Message}");
                return false;
            }

            if (!encounteringError) //print the message only on the first error
            {
                _term.WriteError($"{DateTime.UtcNow:u}: Runner connect error: {ex.Message}. Retrying until reconnected.");
                encounteringError = true;
            }

            Trace.Info("Sleeping for {0} seconds before retrying.", _sessionCreationRetryInterval.TotalSeconds);
            await HostContext.Delay(_sessionCreationRetryInterval, token);
        }
    }
}
/// <summary>
/// Loads settings, credentials and capabilities, then tries up to 10 times (30 seconds
/// apart) to connect to the agent server and create an agent session.
/// </summary>
/// <param name="token">Cancels the attempt loop; cancellation is rethrown to the caller.</param>
/// <returns>
/// <c>true</c> once a session was created; <c>false</c> when the agent no longer exists
/// on the server or all attempts were used up.
/// </returns>
public async Task<Boolean> CreateSessionAsync(CancellationToken token)
{
    Trace.Entering();
    const int MaxAttempts = 10;
    int attempt = 0;

    // Settings
    var configManager = HostContext.GetService<IConfigurationManager>();
    _settings = configManager.LoadSettings();
    var serverUrl = _settings.ServerUrl;
    Trace.Info(_settings);

    // Load Credentials
    Trace.Verbose("Loading Credentials");
    var credMgr = HostContext.GetService<ICredentialManager>();
    VssCredentials creds = credMgr.LoadCredentials();
    Uri uri = new Uri(serverUrl);
    VssConnection conn = ApiUtil.CreateConnection(uri, creds);

    // The session name carries a fresh GUID on every call.
    string sessionName = $"{Environment.MachineName ?? string.Empty}_{Guid.NewGuid().ToString()}";
    var capProvider = HostContext.GetService<ICapabilitiesProvider>();
    Dictionary<string, string> agentSystemCapabilities = await capProvider.GetCapabilitiesAsync(_settings.AgentName, token);

    var agent = new TaskAgentReference
    {
        Id = _settings.AgentId,
        Name = _settings.AgentName,
        Version = Constants.Agent.Version,
        Enabled = true
    };
    var taskAgentSession = new TaskAgentSession(sessionName, agent, agentSystemCapabilities);

    var agentSvr = HostContext.GetService<IAgentServer>();

    while (++attempt <= MaxAttempts)
    {
        Trace.Info("Create session attempt {0} of {1}.", attempt, MaxAttempts);
        try
        {
            Trace.Info("Connecting to the Agent Server...");
            await agentSvr.ConnectAsync(conn);

            Session = await agentSvr.CreateAgentSessionAsync(
                                        _settings.PoolId,
                                        taskAgentSession,
                                        token);
            return true;
        }
        catch (OperationCanceledException) when (token.IsCancellationRequested)
        {
            // Only a cancellation of our own token aborts; an OperationCanceledException
            // raised for any other reason (e.g. an http timeout) is handled by the
            // general handler below and retried.
            Trace.Info("Cancelled");
            throw;
        }
        catch (Exception ex)
        {
            Trace.Error("Failed to create session.");
            if (ex is TaskAgentNotFoundException)
            {
                Trace.Error("The agent no longer exists on the server. Stopping the agent.");
                Trace.Error(ex);
                return false;
            }
            else if (ex is TaskAgentSessionConflictException)
            {
                Trace.Error("The session for this agent already exists.");
            }
            else
            {
                Trace.Error(ex);
            }

            if (attempt >= MaxAttempts)
            {
                Trace.Error("Retries exhausted. Terminating the agent.");
                return false;
            }

            TimeSpan interval = TimeSpan.FromSeconds(30);
            Trace.Info("Sleeping for {0} seconds before retrying.", interval.TotalSeconds);
            await HostContext.Delay(interval, token);
        }
    }

    // Not reached in practice (the last attempt returns from inside the loop), but
    // required so every code path returns a value.
    return false;
}
/// <summary>
/// Loads settings, scans capabilities, loads credentials, then keeps trying (30 seconds
/// apart, with no attempt limit) to connect to the agent server and create an agent session.
/// </summary>
/// <param name="token">Cancels the attempt loop; cancellation is rethrown to the caller.</param>
/// <returns>
/// <c>true</c> once a session was created; <c>false</c> when <c>IsFatalException</c>
/// classifies the failure as fatal.
/// </returns>
public async Task<Boolean> CreateSessionAsync(CancellationToken token)
{
    Trace.Entering();
    int attempt = 0;

    // Settings
    var configManager = HostContext.GetService<IConfigurationManager>();
    _settings = configManager.LoadSettings();
    var serverUrl = _settings.ServerUrl;
    Trace.Info(_settings);

    // Capabilities.
    // TODO: LOC
    _term.WriteLine("Scanning for tool capabilities.");
    Dictionary<string, string> systemCapabilities = await HostContext.GetService<ICapabilitiesManager>().GetCapabilitiesAsync(_settings, token);

    // Create connection.
    Trace.Verbose("Loading Credentials");
    var credMgr = HostContext.GetService<ICredentialManager>();
    VssCredentials creds = credMgr.LoadCredentials();
    Uri uri = new Uri(serverUrl);
    VssConnection conn = ApiUtil.CreateConnection(uri, creds);

    var agent = new TaskAgentReference
    {
        Id = _settings.AgentId,
        Name = _settings.AgentName,
        Version = Constants.Agent.Version,
        Enabled = true
    };
    string sessionName = $"{Environment.MachineName ?? "AGENT"}";
    var taskAgentSession = new TaskAgentSession(sessionName, agent, systemCapabilities);

    var agentSvr = HostContext.GetService<IAgentServer>();
    string errorMessage = string.Empty;
    bool firstAttempt = true; //tells us if this is the first time we try to connect

    // TODO: LOC
    _term.WriteLine("Connecting to the server.");
    while (true)
    {
        attempt++;
        Trace.Info($"Create session attempt {attempt}.");
        try
        {
            Trace.Info("Connecting to the Agent Server...");
            await agentSvr.ConnectAsync(conn);

            Session = await agentSvr.CreateAgentSessionAsync(
                                        _settings.PoolId,
                                        taskAgentSession,
                                        token);

            if (!firstAttempt)
            {
                // Only announce the connection when an error was printed before.
                _term.WriteLine(StringUtil.Loc("QueueConnected", DateTime.UtcNow));
            }

            return true;
        }
        catch (OperationCanceledException ex)
        {
            if (token.IsCancellationRequested) //Distinguish timeout from user cancellation
            {
                Trace.Info("Cancelled");
                throw;
            }

            errorMessage = ex.Message;
        }
        catch (Exception ex)
        {
            Trace.Error("Failed to create session.");
            if (ex is TaskAgentNotFoundException)
            {
                Trace.Error("The agent no longer exists on the server. Stopping the agent.");
                _term.WriteError(StringUtil.Loc("MissingAgent"));
            }
            else if (ex is TaskAgentSessionConflictException)
            {
                Trace.Error("The session for this agent already exists.");
                _term.WriteError(StringUtil.Loc("SessionExist"));
            }

            Trace.Error(ex);
            if (IsFatalException(ex))
            {
                _term.WriteError(StringUtil.Loc("SessionCreateFailed"));
                return false;
            }

            errorMessage = ex.Message;
        }

        // Reached only after a non-fatal failure: report once, then wait and retry.
        TimeSpan interval = TimeSpan.FromSeconds(30);
        if (firstAttempt) //print the message only on the first error
        {
            _term.WriteError(StringUtil.Loc("QueueConError", DateTime.UtcNow, errorMessage, interval.TotalSeconds));
            firstAttempt = false;
        }

        Trace.Info("Sleeping for {0} seconds before retrying.", interval.TotalSeconds);
        await HostContext.Delay(interval, token);
    }
}
/// <summary>
/// Polls the agent server for the next message of the current session and returns the
/// first non-null message. Non-fatal failures are retried after a 15 second pause; an
/// expired session is re-created through <c>CreateSessionAsync</c>.
/// </summary>
/// <param name="token">Cancels the polling loop; cancellation is rethrown to the caller.</param>
/// <returns>The next message received from the server.</returns>
public async Task<TaskAgentMessage> GetNextMessageAsync(CancellationToken token)
{
    Trace.Entering();
    ArgUtil.NotNull(Session, nameof(Session));
    ArgUtil.NotNull(_settings, nameof(_settings));

    var server = HostContext.GetService<IAgentServer>();

    // Number of consecutive failed polls. It is bumped before each poll and reset after
    // a successful one, so a value above 1 right after a success means "just recovered".
    int failureStreak = 0;
    string lastError = string.Empty;

    while (true)
    {
        token.ThrowIfCancellationRequested();
        TaskAgentMessage received = null;

        failureStreak++;
        try
        {
            received = await server.GetAgentMessageAsync(_settings.PoolId,
                                                         Session.SessionId,
                                                         _lastMessageId,
                                                         token);
            if (received != null)
            {
                _lastMessageId = received.MessageId;
            }

            // Announce the reconnect once, and only if an error preceded this success.
            if (failureStreak > 1)
            {
                _term.WriteLine(StringUtil.Loc("QueueConnected", DateTime.UtcNow));
            }

            failureStreak = 0;
        }
        catch (TimeoutException ex)
        {
            Trace.Verbose($"{nameof(TimeoutException)} received.");
            // Retried after the pause below.
            lastError = ex.Message;
        }
        catch (TaskAgentSessionExpiredException)
        {
            Trace.Verbose($"{nameof(TaskAgentSessionExpiredException)} received.");
            bool recreated = await CreateSessionAsync(token);
            if (!recreated)
            {
                throw;
            }

            // A fresh session exists: poll again immediately, without the pause.
            failureStreak = 0;
        }
        catch (OperationCanceledException ex)
        {
            Trace.Verbose($"{nameof(OperationCanceledException)} received.");
            // Raised when the agent is stopped (CTRL-C / service stop) or when the
            // HttpClient timed out; only the former, signalled by the token, is rethrown.
            if (token.IsCancellationRequested)
            {
                throw;
            }

            lastError = ex.Message;
        }
        catch (Exception ex)
        {
            Trace.Error(ex);
            if (IsFatalException(ex))
            {
                throw;
            }

            lastError = ex.Message;
        }

        // A non-zero streak here means this poll failed: report once, then pause.
        if (failureStreak > 0)
        {
            TimeSpan pause = TimeSpan.FromSeconds(15);
            if (failureStreak == 1)
            {
                _term.WriteError(StringUtil.Loc("QueueConError", DateTime.UtcNow, lastError, pause.TotalSeconds));
            }

            Trace.Info("Sleeping for {0} seconds before retrying.", pause.TotalSeconds);
            await HostContext.Delay(pause, token);
        }

        if (received != null)
        {
            Trace.Verbose($"Message '{received.MessageId}' received from session '{Session.SessionId}'.");
            return received;
        }

        Trace.Verbose($"No message retrieved from session '{Session.SessionId}'.");
    }
}