/// <summary>
///   Connects the shared <c>sshAccess</c> object to the given server's public address, using the SSH key file
///   name stored on the server, and returns that access object.
/// </summary>
/// <param name="server">
///   Server to connect to. It is cast to <see cref="ExternalServer"/> in order to read the SSH key file name.
/// </param>
/// <returns>The <c>sshAccess</c> object after <c>ConnectTo</c> has been called on it</returns>
/// <exception cref="InvalidOperationException">If the server has no public address</exception>
protected override IBaseSSHAccess ConnectWithSSH(BaseServer server)
{
    var address = server.PublicAddress;

    if (address == null)
    {
        throw new InvalidOperationException("Can't connect to a server with no public address");
    }

    var host = address.ToString();
    var keyFile = ((ExternalServer)server).SSHKeyFileName;

    sshAccess.ConnectTo(host, keyFile);

    return sshAccess;
}
public async Task <IActionResult> Create([Required][FromBody] ExternalServerDTO request) { FailIfNotConfigured(); if (request.PublicAddress == null) { return(BadRequest("Missing address")); } if (string.IsNullOrEmpty(request.SSHKeyFileName) || request.SSHKeyFileName.Contains("..") || request.SSHKeyFileName.Contains("/")) { return(BadRequest("Invalid SSH key name format")); } if (!serverSSHAccess.IsValidKey(request.SSHKeyFileName)) { return(BadRequest("Invalid SSH key specified")); } // Test connection try { serverSSHAccess.ConnectTo(request.PublicAddress.ToString(), request.SSHKeyFileName); } catch (Exception e) { logger.LogWarning("Failing to add a new external server due to connect failure: {@E}", e); return(BadRequest("Can't access the specified IP address with the specified key")); } // Don't allow duplicate IPs if (await database.ExternalServers.FirstOrDefaultAsync(s => s.PublicAddress != null && s.PublicAddress.Equals(request.PublicAddress)) != null) { return(BadRequest("There is already a server configured with that IP address")); } var server = new ExternalServer() { PublicAddress = request.PublicAddress, SSHKeyFileName = request.SSHKeyFileName, }; await database.ExternalServers.AddAsync(server); await database.AdminActions.AddAsync(new AdminAction() { Message = $"New external server with IP {request.PublicAddress} added", PerformedById = HttpContext.AuthenticatedUser() !.Id, });
/// <summary>
///   Checks whether an external server is reachable by attempting an SSH connection to it. When the connection
///   succeeds the server is marked as running; otherwise this job is scheduled to run again after 30 seconds.
///   The status check time is updated and saved in both cases.
/// </summary>
/// <param name="id">Database id of the external server to check</param>
/// <param name="cancellationToken">Token used to cancel the database lookup and save</param>
/// <exception cref="Exception">
///   If the server has been in stopping status for less than 15 seconds, or if it has no usable public address
/// </exception>
public async Task Execute(long id, CancellationToken cancellationToken)
{
    // Pass the token along to the lookup so that a cancelled job stops before doing any work
    var server = await database.ExternalServers.FindAsync(new object[] { id }, cancellationToken);

    if (server == null)
    {
        logger.LogWarning("Server {Id} (external) not found for startup check", id);
        return;
    }

    if (server.Status == ServerStatus.Running || server.Status == ServerStatus.Provisioning)
    {
        logger.LogInformation("External server {Id} is already up, skipping check job", id);
        return;
    }

    // Too soon after the status was last touched while stopping, fail this run instead of checking.
    // NOTE(review): presumably this relies on the job runner retrying failed jobs later - confirm
    if (server.Status == ServerStatus.Stopping &&
        DateTime.UtcNow - server.StatusLastChecked < TimeSpan.FromSeconds(15))
    {
        throw new Exception($"External server {id} has been in stopping status too short time");
    }

    if (server.PublicAddress == null || server.PublicAddress.Equals(IPAddress.None))
    {
        throw new Exception($"External server {id} doesn't have a public address set");
    }

    bool up = false;

    try
    {
        sshAccess.ConnectTo(server.PublicAddress.ToString(), server.SSHKeyFileName);
        up = true;
    }
    catch (SocketException)
    {
        logger.LogInformation("Connection failed (socket exception), server is probably not up yet");
    }
    catch (SshOperationTimeoutException)
    {
        logger.LogInformation("Connection failed (ssh timed out), server is probably not up yet");
    }

    if (up)
    {
        server.Status = ServerStatus.Running;
    }

    server.StatusLastChecked = DateTime.UtcNow;
    server.BumpUpdatedAt();
    await database.SaveChangesAsync(cancellationToken);

    if (!up)
    {
        logger.LogTrace("External server {Id} is not up currently", id);

        // Call through the lambda parameter (like the other job scheduling calls do) instead of capturing
        // this job instance in the scheduled expression
        jobClient.Schedule<WaitForExternalServerStartUpJob>(x => x.Execute(id, CancellationToken.None),
            TimeSpan.FromSeconds(30));
    }
    else
    {
        logger.LogInformation("External server {Id} is now up", id);
    }
}
/// <summary>
///   Tries to start a CI job on the server that was reserved for it. The job and the server are loaded and
///   checked against each other, the CI image is resolved, an SSH connection is opened and then the CI executor
///   is downloaded and started on the server with the build's environment variables.
/// </summary>
/// <remarks>
///   <para>
///     If the job is missing, is not waiting for a server, or the server reservation doesn't match the job, the
///     server reservation is released and nothing is started. Image problems mark the job as failed. If the SSH
///     connection fails with a socket exception or an SSH timeout, the job is requeued with one less retry.
///   </para>
/// </remarks>
/// <param name="ciProjectId">Project part of the CI job key</param>
/// <param name="ciBuildId">Build part of the CI job key</param>
/// <param name="ciJobId">Job part of the CI job key</param>
/// <param name="serverId">Id of the server the job should run on</param>
/// <param name="serverIsExternal">Selects whether the server is looked up from the external or controlled servers</param>
/// <param name="retries">Retry count; this minus one is passed on when requeuing after a connection failure</param>
/// <param name="cancellationToken">Token passed to the database operations that accept one</param>
/// <exception cref="ArgumentException">If the server can't be found</exception>
/// <exception cref="NotLoadedModelNavigationException">If the job's build or the build's project is not loaded</exception>
/// <exception cref="Exception">
///   If the server has no public address, or if one of the commands run over SSH reports a failure
/// </exception>
public async Task Execute(long ciProjectId, long ciBuildId, long ciJobId, long serverId, bool serverIsExternal,
    int retries, CancellationToken cancellationToken)
{
    // Includes are needed here to provide fully populated data for update notifications
    var job = await Database.CiJobs
        .Include(j => j.Build!)
        .ThenInclude(b => b.CiProject)
        .FirstOrDefaultAsync(
            j => j.CiProjectId == ciProjectId && j.CiBuildId == ciBuildId && j.CiJobId == ciJobId,
            cancellationToken);

    var serverKey = new object[] { serverId };

    BaseServer? server;
    if (!serverIsExternal)
    {
        server = await Database.ControlledServers.FindAsync(serverKey, cancellationToken);
    }
    else
    {
        server = await Database.ExternalServers.FindAsync(serverKey, cancellationToken);
    }

    if (server == null)
    {
        throw new ArgumentException(
            $"Could not find server ({serverId}, external: {serverIsExternal}) to run build on");
    }

    // From here on every early exit that doesn't run the job must give up the reservation on the server
    if (job == null)
    {
        Logger.LogWarning("Skipping CI job as it doesn't exist");
        ReleaseServerReservation(server);
        return;
    }

    if (job.State != CIJobState.WaitingForServer)
    {
        Logger.LogWarning(
            "CI job is not in waiting for server status, refusing to start running it on server: {ServerId}",
            serverId);
        ReleaseServerReservation(server);
        return;
    }

    if (server.ReservedFor != job.CiJobId)
    {
        Logger.LogWarning(
            "CI job id doesn't match reservation on server, refusing to start it on server: {ServerId}",
            serverId);
        ReleaseServerReservation(server);
        return;
    }

    // Get the CI image for the job
    var imageFile = job.GetImageFileName();
    var imagePath = Path.Join("CI/Images", imageFile);

    StorageItem? imageItem;
    try
    {
        imageItem = await StorageItem.FindByPath(Database, imagePath);
    }
    catch (Exception e)
    {
        // ReSharper disable once ExceptionPassedAsTemplateArgumentProblem
        Logger.LogError("Invalid image specified for CI job: {Image}, path parse exception: {@E}", job.Image, e);

        job.SetFinishSuccess(false);
        await job.CreateFailureSection(Database, "Invalid image specified for job (invalid path)");
        await OnJobEnded(server, job);
        return;
    }

    if (string.IsNullOrEmpty(job.Image) || imageItem == null)
    {
        Logger.LogError("Invalid image specified for CI job: {Image}", job.Image);

        job.SetFinishSuccess(false);
        await job.CreateFailureSection(Database, "Invalid image specified for job (not found)");
        await OnJobEnded(server, job);
        return;
    }

    // The CI system uses the first valid image version. For future updates a different file name is needed
    // For example bumping the ":v1" to a ":v2" suffix
    var imageVersion = await imageItem.GetLowestUploadedVersion(Database);

    if (imageVersion == null || imageVersion.StorageFile == null)
    {
        Logger.LogError("Image with no uploaded version specified for CI job: {Image}", job.Image);

        job.SetFinishSuccess(false);
        await job.CreateFailureSection(Database, "Invalid image specified for job (not uploaded version)");
        await OnJobEnded(server, job);
        return;
    }

    // Queue a job to lock writing to the CI image if it isn't write protected yet
    if (imageItem.WriteAccess != FileAccess.Nobody)
    {
        Logger.LogInformation(
            "Storage item {Id} used as CI image is not write locked, queuing a job to lock it",
            imageItem.Id);

        // To ensure the upload time is expired, this is upload time + 5 minutes
        var lockDelay = AppInfo.RemoteStorageUploadExpireTime + TimeSpan.FromMinutes(5);

        JobClient.Schedule<LockCIImageItemJob>(x => x.Execute(imageItem.Id, CancellationToken.None),
            lockDelay);
    }

    Logger.LogInformation(
        "Trying to start job {CIProjectId}-{CIBuildId}-{CIJobId} on reserved server ({Id}, {ServerIsExternal})",
        ciProjectId, ciBuildId, ciJobId, server.Id, serverIsExternal);

    if (server.PublicAddress == null || server.PublicAddress.Equals(IPAddress.None))
    {
        throw new Exception($"Server ({server.Id}, {serverIsExternal}) doesn't have a public address set");
    }

    var serverAddress = server.PublicAddress.ToString();

    // Try to start running the job, this can fail if the server is not actually really up yet
    IBaseSSHAccess sshAccess;
    try
    {
        if (serverIsExternal)
        {
            var keyFile = ((ExternalServer)server).SSHKeyFileName;
            externalSSHAccess.ConnectTo(serverAddress, keyFile);
            sshAccess = externalSSHAccess;
        }
        else
        {
            controlledSSHAccess.ConnectTo(serverAddress);
            sshAccess = controlledSSHAccess;
        }
    }
    catch (SocketException)
    {
        Logger.LogInformation("Connection failed (socket exception), server is probably not up (yet)");
        await Requeue(job, retries - 1, server, serverIsExternal);
        return;
    }
    catch (SshOperationTimeoutException)
    {
        Logger.LogInformation("Connection failed (ssh timed out), server is probably not up (yet)");
        await Requeue(job, retries - 1, server, serverIsExternal);
        return;
    }

    var imageDownloadUrl = remoteDownloadUrls.CreateDownloadFor(imageVersion.StorageFile,
        AppInfo.RemoteStorageDownloadExpireTime);

    // Connection success, so now we can run the job starting on the server
    job.RunningOnServerId = serverId;
    job.RunningOnServerIsExternal = server.IsExternal;

    // TODO: permanently store on which server this job was ran on and how long since creation it took to get
    // here

    var build = job.Build;

    if (build == null)
    {
        throw new NotLoadedModelNavigationException();
    }

    CISecretType buildSecretType;
    if (build.IsSafe)
    {
        buildSecretType = CISecretType.SafeOnly;
    }
    else
    {
        buildSecretType = CISecretType.UnsafeOnly;
    }

    var projectSecrets = await Database.CiSecrets
        .Where(s => s.CiProjectId == job.CiProjectId &&
            (s.UsedForBuildTypes == buildSecretType || s.UsedForBuildTypes == CISecretType.All))
        .ToListAsync(cancellationToken);

    await PerformServerCleanUpIfNeeded(server, sshAccess);

    // Then move on to the build starting, first thing is to download the CI executor script
    // TODO: is there a possibility that this is not secure? Someone would need to do HTTPS MItM attack...
    var executorDownload = GetUrlToDownloadCIExecutor();
    var executorResourceDownload = GetUrlToDownloadCIExecutorResource();

    var executorHash = await hashCalculator.Sha256(executorDownload, cancellationToken);
    var posixHelperHash = await hashCalculator.Sha256(executorResourceDownload, cancellationToken);

    // TODO: using async would be nice for the run commands when supported
    var downloadScript = "set -e\n" +
        CreateDownloadCommand("~/CIExecutor", executorHash, executorDownload) +
        CreateDownloadCommand("~/libMonoPosixHelper.so", posixHelperHash, executorResourceDownload) +
        "chmod +x ~/CIExecutor";

    var downloadResult = sshAccess.RunCommand(downloadScript);

    if (!downloadResult.Success)
    {
        throw new Exception(
            $"Failed to run executor download step: {downloadResult.Result}, error: {downloadResult.Error}");
    }

    // This save is done here as the build status might get reported back to us before we finish with the ssh
    // commands
    job.State = CIJobState.Running;

    if (serverIsExternal)
    {
        job.RanOnServer = $"External server {serverId}";
    }
    else
    {
        job.RanOnServer = $"Controlled server {serverId}";
    }

    job.TimeWaitingForServer = DateTime.UtcNow - job.CreatedAt;
    await Database.SaveChangesAsync(cancellationToken);

    // and then run it with environment variables for this build

    // Remove all type secrets if there is one with the same name that is build specific
    var effectiveSecrets = projectSecrets
        .Where(s => !(s.UsedForBuildTypes == CISecretType.All &&
            projectSecrets.Any(other =>
                other.SecretName == s.SecretName && other.UsedForBuildTypes != s.UsedForBuildTypes)))
        .Select(s => s.ToExecutorData());

    var project = build.CiProject;

    if (project == null)
    {
        throw new NotLoadedModelNavigationException();
    }

    // Each value is bash escaped and wrapped in double quotes, the last export is terminated with a semicolon
    var environment = new StringBuilder(250);

    environment.Append("export CI_REF=\"")
        .Append(BashEscape.EscapeForBash(build.RemoteRef))
        .Append("\"; export CI_COMMIT_HASH=\"")
        .Append(BashEscape.EscapeForBash(build.CommitHash))
        .Append("\"; export CI_EARLIER_COMMIT=\"")
        .Append(BashEscape.EscapeForBash(build.PreviousCommit ?? AppInfo.NoCommitHash))
        .Append("\"; export CI_BRANCH=\"")
        .Append(BashEscape.EscapeForBash(build.Branch ?? "unknown_branch"))
        .Append("\"; export CI_DEFAULT_BRANCH=\"")
        .Append(BashEscape.EscapeForBash(project.DefaultBranch))
        .Append("\"; export CI_TRUSTED=\"")
        .Append(build.IsSafe)
        .Append("\"; export CI_ORIGIN=\"")
        .Append(BashEscape.EscapeForBash(project.RepositoryCloneUrl))
        .Append("\"; export CI_IMAGE_DL_URL=\"")
        .Append(BashEscape.EscapeForBash(imageDownloadUrl))
        .Append("\"; export CI_IMAGE_NAME=\"")
        .Append(BashEscape.EscapeForBash(job.Image))
        .Append("\"; export CI_IMAGE_FILENAME=\"")
        .Append(BashEscape.EscapeForBash(imageFile))
        .Append("\"; export CI_CACHE_OPTIONS=\"")
        .Append(BashEscape.EscapeForBash(job.CacheSettingsJson ?? "{}"))
        .Append("\"; export CI_SECRETS=\"")
        .Append(BashEscape.EscapeForBash(JsonSerializer.Serialize(effectiveSecrets)))
        .Append("\"; export CI_JOB_NAME=\"")
        .Append(BashEscape.EscapeForBash(job.JobName))
        .Append("\";");

    // The executor is started in the background with its output redirected to a file on the server
    var startResult = sshAccess.RunCommand(
        $"{environment} nohup ~/CIExecutor {GetConnectToUrl(job)} > build_script_output.txt 2>&1 &");

    if (!startResult.Success)
    {
        throw new Exception(
            $"Failed to start running CI executor: {startResult.Result}, error: {startResult.Error}");
    }

    JobClient.Schedule<CheckCIJobOutputHasConnectedJob>(
        x => x.Execute(ciProjectId, ciBuildId, ciJobId, serverId, CancellationToken.None),
        TimeSpan.FromMinutes(5));

    JobClient.Schedule<CancelCIBuildIfStuckJob>(
        x => x.Execute(ciProjectId, ciBuildId, ciJobId, serverId, server.IsExternal, CancellationToken.None),
        TimeSpan.FromMinutes(61));

    Logger.LogInformation(
        "CI job startup succeeded, now it's up to the executor to contact us with updates");
}
/// <summary>
///   Handles a CI job that has not reached the finished state by the time this job runs. The server the job
///   was started on is force rebooted over SSH (external server) or its instance is stopped through
///   <c>ec2Controller</c> (controlled server), and after saving a job is queued to set the CI job as finished
///   with a failure status.
/// </summary>
/// <param name="ciProjectId">Project part of the CI job key</param>
/// <param name="ciBuildId">Build part of the CI job key</param>
/// <param name="ciJobId">Job part of the CI job key</param>
/// <param name="serverId">Id of the server the job was started on</param>
/// <param name="externalServer">Selects whether the server is looked up from the external or controlled servers</param>
/// <param name="cancellationToken">Token checked before the server is touched; the final save ignores it</param>
/// <exception cref="ArgumentException">If the server can't be found or a controlled server has no instance id</exception>
/// <exception cref="InvalidOperationException">If an external server has no public address</exception>
public async Task Execute(long ciProjectId, long ciBuildId, long ciJobId, long serverId, bool externalServer,
    CancellationToken cancellationToken)
{
    var job = await database.CiJobs.FirstOrDefaultAsync(
        j => j.CiProjectId == ciProjectId && j.CiBuildId == ciBuildId && j.CiJobId == ciJobId,
        cancellationToken);

    if (job == null)
    {
        logger.LogWarning("Failed to check if a CI job is stuck, can't find the job");
        return;
    }

    // A job that already finished needs nothing done to it
    if (job.State == CIJobState.Finished)
    {
        return;
    }

    logger.LogError(
        "Detected CI job {CIProjectId}-{CIBuildId}-{CIJobId} as stuck running (total build time limit reached)",
        ciProjectId, ciBuildId, ciJobId);

    var serverKey = new object[] { serverId };

    if (!externalServer)
    {
        var controlled = await database.ControlledServers.FindAsync(serverKey, cancellationToken);

        if (controlled == null)
        {
            throw new ArgumentException("Could not find server to release for a stuck build");
        }

        cancellationToken.ThrowIfCancellationRequested();

        var stopMessage =
            $"Server {controlled.Id} ({controlled.InstanceId}) timed out running CI job, stopping it, " +
            $"running since {controlled.UpdatedAt}";

        await database.LogEntries.AddAsync(new LogEntry() { Message = stopMessage }, cancellationToken);

        if (string.IsNullOrEmpty(controlled.InstanceId))
        {
            throw new ArgumentException("Stuck server has no InstanceId, can't stop it");
        }

        await ec2Controller.StopInstance(controlled.InstanceId, false);
        controlled.Status = ServerStatus.Stopping;

        // Add the time since the server was marked as running to its total before clearing the start time
        if (controlled.RunningSince.HasValue)
        {
            controlled.TotalRuntime += (DateTime.UtcNow - controlled.RunningSince.Value).TotalSeconds;
        }

        controlled.RunningSince = null;

        logger.LogInformation("Successfully signaled stop on: {InstanceId}", controlled.InstanceId);
    }
    else
    {
        var external = await database.ExternalServers.FindAsync(serverKey, cancellationToken);

        if (external == null)
        {
            throw new ArgumentException("Could not find server to release for a stuck build");
        }

        cancellationToken.ThrowIfCancellationRequested();

        if (external.PublicAddress == null)
        {
            throw new InvalidOperationException("Can't connect to a server with no public address");
        }

        externalServerSSHAccess.ConnectTo(external.PublicAddress.ToString(), external.SSHKeyFileName);
        externalServerSSHAccess.Reboot();

        var rebootMessage = $"External server {external.Id} timed out running CI job, force rebooting it";

        await database.LogEntries.AddAsync(new LogEntry() { Message = rebootMessage }, cancellationToken);

        external.StatusLastChecked = DateTime.UtcNow;
        external.ReservationType = ServerReservationType.None;
        external.Status = ServerStatus.Stopping;
        external.BumpUpdatedAt();

        // Queue a check job to run 20 seconds from now for the rebooted server
        jobClient.Schedule<WaitForExternalServerStartUpJob>(
            x => x.Execute(external.Id, CancellationToken.None), TimeSpan.FromSeconds(20));

        logger.LogInformation("Successfully commanded reboot on: {ServerId}", external.Id);
    }

    if (job.RunningOnServerId != serverId)
    {
        logger.LogError("Wrong RunningOnServerId in job (total runtime limit exceeded)");
        job.RunningOnServerId = serverId;
    }

    // Not cancellable done as the state to terminated is very important to save
    // ReSharper disable once MethodSupportsCancellation
    await database.SaveChangesAsync();

    jobClient.Enqueue<SetFinishedCIJobStatusJob>(
        x => x.Execute(ciProjectId, ciBuildId, ciJobId, false, CancellationToken.None));
}