/// <summary>
/// Simulates the arrival of a signal telling the service to shut down.  This is
/// mainly intended for unit tests that exercise services.
/// </summary>
/// <exception cref="TimeoutException">
/// Thrown when the service did not exit gracefully within the configured timeout,
/// i.e. before it would have been killed (e.g. by Kubernetes or Docker).
/// </exception>
public void Signal()
{
    // Nothing to do when the application has already reported that it terminated.

    if (readyToExit)
    {
        return;
    }

    // Remember the previous state before raising the flag: a caller that finds
    // the flag already raised returns without repeating the shutdown sequence.

    var wasAlreadyTerminating = terminating;

    terminating = true;

    if (wasAlreadyTerminating)
    {
        return;
    }

    log?.LogInfo(() => $"Emulated stop request: [timeout={Timeout}]");
    cts.Cancel();

    // Run every registered handler on its own thread.

    lock (handlers)
    {
        foreach (var callback in handlers)
        {
            var worker = new Thread(new ThreadStart(callback));

            worker.Start();
        }
    }

    StopEvent.Set();

    // Wait for the application to indicate that it's ready to exit, rethrowing
    // the timeout to the caller after logging it.

    try
    {
        NeonHelper.WaitFor(() => readyToExit, Timeout);
        log?.LogInfo(() => "Process stopped gracefully.");
    }
    catch (TimeoutException)
    {
        log?.LogWarn(() => $"Process did not stop within [{Timeout}].");
        throw;
    }
}
/// <summary>
/// Shuts the current process down cleanly (for internal use): cancels the
/// cancellation token source, starts the registered handlers, waits for the
/// application to report that it's ready to exit and then exits the process.
/// </summary>
/// <param name="exitCode">Optional process exit code (defaults to <b>0</b>).</param>
/// <param name="explicitTermination">Optionally indicates that termination is not due to receiving an external signal.</param>
private void ExitInternal(int exitCode = 0, bool explicitTermination = false)
{
    // Nothing to do when the application has already reported that it terminated.

    if (readyToExit)
    {
        return;
    }

    // Remember the previous state before raising the flag: a caller that finds
    // the flag already raised returns without repeating the shutdown sequence.

    var wasAlreadyTerminating = terminating;

    terminating = true;

    if (wasAlreadyTerminating)
    {
        return;
    }

    log?.LogInfo(
        () => explicitTermination
            ? $"INTERNAL stop request: [timeout={Timeout}]"
            : $"SIGTERM received: Stopping process [timeout={Timeout}]");

    cts.Cancel();

    // Run every registered handler on its own thread.

    lock (handlers)
    {
        foreach (var callback in handlers)
        {
            var worker = new Thread(new ThreadStart(callback));

            worker.Start();
        }
    }

    // Wait for the application to indicate that it's ready to exit.  A timeout
    // is only logged here because the process exits below either way.

    try
    {
        NeonHelper.WaitFor(() => readyToExit, Timeout);
        log?.LogInfo(() => "Process stopped gracefully.");
    }
    catch (TimeoutException)
    {
        log?.LogWarn(() => $"Process did not stop within [{Timeout}].");
    }

    Environment.Exit(exitCode);
}
/// <summary>
/// Writes a warning for a transient exception that is about to be retried.
/// Nothing is logged when no logger is present.
/// </summary>
/// <param name="e">The exception.</param>
protected void LogTransient(Exception e) => log?.LogWarn("[transient-retry]", e);
/// <summary>
/// Constructs a query/signal method map for a workflow type by scanning its
/// public and non-public instance methods for signal and query method attributes.
/// Tagged methods with an unexpected signature are logged as warnings and skipped.
/// </summary>
/// <param name="workflowType">The workflow interface.</param>
/// <returns>The <see cref="WorkflowMethodMap"/>.</returns>
public static WorkflowMethodMap Create(Type workflowType)
{
    Covenant.Requires<ArgumentNullException>(workflowType != null);

    // $todo(jeff.lill):
    //
    // The code below doesn't verify that query/signal names are unique
    // but also doesn't barf.  It will send requests to the last method
    // encountered with the same name, which is pretty reasonable.
    //
    // In a perfect world, we'd detect this and throw an exception.

    var map = new WorkflowMethodMap();

    foreach (var method in workflowType.GetMethods(BindingFlags.Public | BindingFlags.NonPublic | BindingFlags.Instance))
    {
        // Signal methods are tagged by [SignalMethod], accept a single byte array
        // parameter, and return [Task].

        var signalMethodAttribute = method.GetCustomAttribute<SignalMethodAttribute>();

        if (signalMethodAttribute != null)
        {
            if (method.ReturnType != typeof(Task))
            {
                Log.LogWarn($"Workflow [{workflowType.FullName}.{method.Name}()] signal handler is invalid because it doesn't return [Task].  It will be ignored.");
                continue;
            }

            var parameters = method.GetParameters();

            if (parameters.Length != 1 || parameters[0].ParameterType != typeof(byte[]))
            {
                Log.LogWarn($"Workflow [{workflowType.FullName}.{method.Name}()] signal handler is invalid because it doesn't accept a single byte array parameter.  It will be ignored.");
                continue;
            }

            // The indexer is used so that a later method with the same signal
            // name replaces an earlier one (see the $todo above).

            map.nameToSignalMethod[signalMethodAttribute.Name] = method;
            continue;
        }

        // Query methods are tagged by [QueryMethod], accept a single byte array
        // parameter, and return [Task<byte[]>].

        var queryMethodAttribute = method.GetCustomAttribute<QueryMethodAttribute>();

        if (queryMethodAttribute != null)
        {
            if (method.ReturnType != typeof(Task<byte[]>))
            {
                Log.LogWarn($"Workflow [{workflowType.FullName}.{method.Name}()] query handler is invalid because it doesn't return [Task<byte[]>].  It will be ignored.");
                continue;
            }

            var parameters = method.GetParameters();

            if (parameters.Length != 1 || parameters[0].ParameterType != typeof(byte[]))
            {
                Log.LogWarn($"Workflow [{workflowType.FullName}.{method.Name}()] query handler is invalid because it doesn't accept a single byte array parameter.  It will be ignored.");
                continue;
            }

            map.nameToQueryMethod[queryMethodAttribute.Name] = method;
            continue;
        }
    }

    return map;
}
/// <summary>
/// Implements the service as a <see cref="Task"/>.  A periodic task polls Consul
/// for the MD5 of the DNS answers and, when that differs from the last value
/// synchronized (or when the verify timer has fired), downloads the answers and
/// rewrites the dynamic section of the local PowerDNS hosts file, creating the
/// reload signal file whenever the hosts file content actually changed.
/// </summary>
/// <returns>The <see cref="Task"/>.</returns>
private static async Task RunAsync()
{
    var localMD5    = string.Empty;
    var remoteMD5   = "[unknown]";
    var verifyTimer = new PolledTimer(verifyInterval, autoReset: true);

    var periodicTask =
        new AsyncPeriodicTask(
            pollInterval,
            onTaskAsync:
                async () =>
                {
                    log.LogDebug(() => "Starting poll");
                    log.LogDebug(() => "Fetching DNS answers MD5 from Consul.");

                    remoteMD5 = await consul.KV.GetStringOrDefault(HiveConst.ConsulDnsHostsMd5Key, terminator.CancellationToken);

                    if (remoteMD5 == null)
                    {
                        remoteMD5 = "[unknown]";
                    }

                    var verify = verifyTimer.HasFired;

                    if (verify)
                    {
                        // Under normal circumstances, we should never see the reload signal file
                        // here because the [neon-dns-loader] service should have deleted it after
                        // handling the last change signal.
                        //
                        // This probably means that [neon-dns-loader] is not running or that this
                        // service is configured with POLL_INTERVAL being so short that
                        // [neon-dns-loader] hasn't had a chance to handle the previous signal.

                        if (File.Exists(reloadSignalPath))
                        {
                            log.LogWarn("[neon-dns-loader] service doesn't appear to be running because the reload signal file is present.");
                        }
                    }

                    if (!verify && localMD5 == remoteMD5)
                    {
                        log.LogDebug(() => "DNS answers are unchanged.");
                    }
                    else
                    {
                        if (localMD5 == remoteMD5)
                        {
                            log.LogDebug(() => "DNS answers have not changed but we're going to verify that we have the correct hosts anyway.");
                        }
                        else
                        {
                            log.LogDebug(() => "DNS answers have changed.");
                        }

                        log.LogDebug(() => "Fetching DNS answers.");

                        var hostsTxt = await consul.KV.GetStringOrDefault(HiveConst.ConsulDnsHostsKey, terminator.CancellationToken);

                        if (hostsTxt == null)
                        {
                            log.LogWarn(() => "DNS answers do not exist on Consul.  Is [neon-dns-mon] functioning properly?");
                        }
                        else
                        {
                            var marker = "# -------- NEON-DNS --------";

                            // We have the host entries from Consul.  We need to add these onto the
                            // end of the PowerDNS hosts file, replacing any host entries written
                            // during a previous run.
                            //
                            // We're going to use the special [marker] line defined above to separate
                            // the built-in hosts (above the line) from the dynamic hosts we're
                            // generating here (which will be below the line).  Note that this
                            // line won't exist the first time this service runs, so we'll just add it.
                            //
                            // Note that it's possible that the PowerDNS Recursor might be reading this
                            // file while we're trying to write it.  We're going to treat these as
                            // transient errors and retry.

                            var retry = new LinearRetryPolicy(typeof(IOException), maxAttempts: 5, retryInterval: TimeSpan.FromSeconds(1));

                            await retry.InvokeAsync(
                                async () =>
                                {
                                    using (var stream = new FileStream(powerDnsHostsPath, FileMode.Open, FileAccess.ReadWrite))
                                    {
                                        // Read a copy of the hosts file as bytes so we can compare
                                        // the old version with the new one generated below for changes.

                                        var orgHostBytes = stream.ReadToEnd();

                                        stream.Position = 0;

                                        // Generate the new hosts file.

                                        var sbHosts = new StringBuilder();

                                        // Read the hosts file up to but not including the special marker
                                        // line (if it's present).  The marker is a fixed ASCII string so
                                        // we compare ordinally rather than using the current culture.

                                        using (var reader = new StreamReader(stream, Encoding.UTF8, true, 32 * 1024, leaveOpen: true))
                                        {
                                            foreach (var line in reader.Lines())
                                            {
                                                if (line.StartsWith(marker, StringComparison.Ordinal))
                                                {
                                                    break;
                                                }

                                                sbHosts.AppendLine(line);
                                            }
                                        }

                                        // Strip any trailing whitespace from the hosts file so we'll
                                        // be able to leave a nice blank line between the end of the
                                        // original file and the special marker line.

                                        var text = sbHosts.ToString().TrimEnd();

                                        sbHosts.Clear();
                                        sbHosts.AppendLine(text);

                                        // Append the marker line, followed by dynamic host
                                        // entries we downloaded from Consul.

                                        sbHosts.AppendLine();
                                        sbHosts.AppendLine(marker);
                                        sbHosts.AppendLine();
                                        sbHosts.Append(hostsTxt);

                                        // Generate the new host file bytes, taking care to ensure that
                                        // we're using Linux style line endings and then update the
                                        // hosts file if anything changed.

                                        var hostsText    = NeonHelper.ToLinuxLineEndings(sbHosts.ToString());
                                        var newHostBytes = Encoding.UTF8.GetBytes(hostsText);

                                        if (NeonHelper.ArrayEquals(orgHostBytes, newHostBytes))
                                        {
                                            log.LogDebug(() => $"[{powerDnsHostsPath}] file is up-to-date.");
                                        }
                                        else
                                        {
                                            log.LogDebug(() => $"[{powerDnsHostsPath}] is being updated.");

                                            stream.Position = 0;
                                            stream.SetLength(0);
                                            stream.Write(newHostBytes);

                                            // Signal to the local [neon-dns-loader] systemd service that it needs
                                            // to have PowerDNS Recursor reload the hosts file.

                                            File.WriteAllText(reloadSignalPath, "reload now");
                                        }
                                    }

                                    log.LogDebug(() => "Finished poll");
                                    await Task.CompletedTask;
                                });

                            // We've successfully synchronized the local hosts file with
                            // the Consul DNS settings.

                            localMD5 = remoteMD5;
                        }
                    }

                    return await Task.FromResult(false);
                },
            onExceptionAsync:
                async e =>
                {
                    log.LogError(e);
                    return await Task.FromResult(false);
                },
            onTerminateAsync:
                async () =>
                {
                    log.LogInfo(() => "Terminating");
                    await Task.CompletedTask;
                });

    terminator.AddDisposable(periodicTask);
    await periodicTask.Run();
}
/// <summary>
/// Resolves the <paramref name="targets"/> into healthy host addresses,
/// adding the results to <paramref name="hostAddresses"/>.  Targets that
/// fail validation are skipped and their warnings are collected and logged
/// when the warning timer has fired.
/// </summary>
/// <param name="hostAddresses">The host addresses.</param>
/// <param name="targets">The DNS targets.</param>
private static async Task ResolveTargetsAsync(HostAddresses hostAddresses, List<DnsEntry> targets)
{
    // $todo(jeff.lill):
    //
    // I'm keeping this implementation super simple for now, by performing
    // all of the health checks during the poll.  This probably won't scale
    // well when there are 100s of target endpoints.  This will also tend
    // to blast health check traffic to all of the endpoints at once.
    //
    // It would probably be better to do health checking continuously in
    // another task and have this method resolve the hosts from that data.
    // That would also allow health checks to use a target TTL as a hint
    // for how often endpoint health should be checked.

    // Implementation Note:
    // --------------------
    // We're going to create a task for each DNS host entry and then
    // each of those tasks will create a task for each endpoint that
    // requires a health check.

    var nodeGroups = hiveDefinition.GetHostGroups();
    var entryTasks = new List<Task>();
    var warnings   = new List<string>();

    foreach (var target in targets)
    {
        var targetWarnings = target.Validate(hiveDefinition, nodeGroups);

        if (targetWarnings.Count > 0)
        {
            // We skip generating DNS entries for targets with warnings, but we
            // record the target's warnings so they can be reported below.

            foreach (var warning in targetWarnings)
            {
                warnings.Add(warning);
            }

            continue;
        }

        // Clear the resolver at the beginning of each health check pass
        // to purge any cached state from the previous pass.
        //
        // NOTE(review): This executes once per valid target while the tasks
        // started for earlier targets may still be running.  Confirm whether
        // this was meant to happen once, before the loop.

        healthResolver.Clear();

        // Kick off the endpoint health checks.

        entryTasks.Add(Task.Run(
            async () =>
            {
                var healthTasks = new List<Task>();

                foreach (var endpoint in target.Endpoints)
                {
                    //-------------------------------------------------
                    // Handle node group endpoints.

                    var groupName = endpoint.GetGroupName();

                    if (groupName != null)
                    {
                        if (nodeGroups.TryGetValue(groupName, out var group))
                        {
                            foreach (var node in group)
                            {
                                healthTasks.Add(Task.Run(
                                    async () =>
                                    {
                                        var nodeAddresses = await CheckEndpointAsync(endpoint, node.PrivateAddress);

                                        // Guard against a NULL result, consistent with
                                        // the handling of normal endpoints below.

                                        if (nodeAddresses != null)
                                        {
                                            foreach (var nodeAddress in nodeAddresses)
                                            {
                                                hostAddresses.Add(target.Hostname, nodeAddress);
                                            }
                                        }
                                    }));
                            }
                        }

                        continue;
                    }

                    //-------------------------------------------------
                    // Handle normal endpoints.

                    var addresses = await CheckEndpointAsync(endpoint);

                    if (addresses != null)
                    {
                        foreach (var address in addresses)
                        {
                            hostAddresses.Add(target.Hostname, address);
                        }
                    }
                }

                await NeonHelper.WaitAllAsync(healthTasks);
            },
            cancellationToken: terminator.CancellationToken));
    }

    await NeonHelper.WaitAllAsync(entryTasks);

    // Log any detected configuration warnings.  Note that we're going to throttle
    // warning reports to once every 5 minutes, so we won't spam the logs.

    if (warnTimer.HasFired)
    {
        foreach (var warning in warnings)
        {
            log.LogWarn(warning);
        }
    }
}
/// <inheritdoc/>
public void LogWarn(object message, string activityId = null)
{
    // Forward to the wrapped logger first and then record the message
    // in the capture buffer.

    log.LogWarn(message, activityId);

    var capturedLine = $"[WARN]: {message}";

    capture.AppendLine(capturedLine);
}
/// <summary>
/// Rebuilds the host node's <b>/etc/containers/registries.conf.d/00-neon-cluster.conf</b> file
/// from the cluster's container registry resources, signals CRI-O to reload when the
/// file changed and also manages container registry logins: logging out of registries
/// that are no longer required, logging into new ones and periodically re-logging into
/// existing ones.
/// </summary>
private async Task UpdateContainerRegistriesAsync()
{
    var registries = (await k8s.ListClusterCustomObjectAsync<V1NeonContainerRegistry>()).Items;

    // NOTE: Here's the documentation for the config file we're generating:
    //
    //      https://github.com/containers/image/blob/main/docs/containers-registries.conf.5.md

    var sbRegistryConfig   = new StringBuilder();
    var sbSearchRegistries = new StringBuilder();

    // Configure any unqualified search registries.

    foreach (var registry in registries
        .Where(r => r.Spec.SearchOrder >= 0)
        .OrderBy(r => r.Spec.SearchOrder))
    {
        sbSearchRegistries.AppendWithSeparator($"\"{registry.Spec.Prefix}\"", ", ");
    }

    sbRegistryConfig.AppendLine($"unqualified-search-registries = [{sbSearchRegistries}]");

    // Configure any container registries including the local cluster.  Each
    // registry gets its own [[registry]] table with one setting per line.

    foreach (var registry in registries)
    {
        sbRegistryConfig.AppendLine();
        sbRegistryConfig.AppendLine("[[registry]]");
        sbRegistryConfig.AppendLine($"prefix = \"{registry.Spec.Prefix}\"");
        sbRegistryConfig.AppendLine($"insecure = {NeonHelper.ToBoolString(registry.Spec.Insecure)}");
        sbRegistryConfig.AppendLine($"blocked = {NeonHelper.ToBoolString(registry.Spec.Blocked)}");

        if (!string.IsNullOrEmpty(registry.Spec.Location))
        {
            sbRegistryConfig.AppendLine($"location = \"{registry.Spec.Location}\"");
        }
    }

    if (NeonHelper.IsLinux)
    {
        // Convert the generated config to Linux line endings and then compare the new
        // config against what's already configured on the host node.  We'll rewrite the
        // host file and then signal CRI-O to reload its config when the files differ.

        var currentConfigText = File.ReadAllText(configMountPath);
        var newConfigText     = NeonHelper.ToLinuxLineEndings(sbRegistryConfig.ToString());

        if (currentConfigText != newConfigText)
        {
            configUpdateCounter.Inc();

            File.WriteAllText(configMountPath, newConfigText);
            (await Node.ExecuteCaptureAsync("pkill", new object[] { "-HUP", "crio" })).EnsureSuccess();

            // Wait a few seconds to give CRI-O a chance to reload its config.  This will
            // help mitigate problems when managing logins below due to potential inconsistencies
            // between CRI-O's currently loaded config and the new config we just saved.

            await Task.Delay(TimeSpan.FromSeconds(15));
        }
    }

    //-----------------------------------------------------------------
    // We need to manage registry logins by logging into new registries,
    // logging out of deleted registries, relogging in with new credentials,
    // and periodically logging in with unchanged credentials to ensure that
    // we're actually logged in.  Here's how this works:
    //
    //      https://github.com/nforgeio/neonKUBE/issues/1591

    var retry = new LinearRetryPolicy(e => true, maxAttempts: 5, retryInterval: TimeSpan.FromSeconds(5));

    // Construct LoginFile instances for all specified upstream registries
    // that require credentials and add these to a dictionary keyed by SHA-256.
    //
    // The indexer is used rather than [Add()] so that two registry resources
    // that produce the same SHA-256 don't abort the entire update with an
    // [ArgumentException].

    var shaToRequiredLogins = new Dictionary<string, LoginFile>();

    foreach (var registry in registries.Where(r => !string.IsNullOrEmpty(r.Spec.Username)))
    {
        var loginFile = LoginFile.Create(hostContainerRegistriesFolder, registry.Spec.Location, registry.Spec.Username, registry.Spec.Password);

        shaToRequiredLogins[loginFile.Sha256] = loginFile;
    }

    // Read all existing login files on the node and add them to a dictionary
    // mapping their SHA-256s to the file.

    var shaToExistingLogins = new Dictionary<string, LoginFile>();

    foreach (var file in Directory.GetFiles(hostContainerRegistriesFolder, "*.login", SearchOption.TopDirectoryOnly))
    {
        var loginFile = LoginFile.Read(file);

        if (loginFile != null)
        {
            shaToExistingLogins[loginFile.Sha256] = loginFile;
        }
    }

    // Look for any existing login files that are not present in the collection of
    // new logins.  These correspond to registries that have been deleted or whose
    // credentials have changed.  We're going to go ahead and log out of the related
    // registries and then delete these login files (we'll re-login with new
    // credentials below for the registries that weren't targeted for removal).

    foreach (var loginFile in shaToExistingLogins.Values
        .Where(login => !shaToRequiredLogins.ContainsKey(login.Sha256)))
    {
        try
        {
            await retry.InvokeAsync(
                async () =>
                {
                    // Note that we're not ensuring success here because we may not be
                    // logged-in which is OK: we don't want to see that error.

                    log.LogInfo($"{podmanPath} logout {loginFile.Location}");

                    if (NeonHelper.IsLinux)
                    {
                        await Node.ExecuteCaptureAsync(podmanPath, new object[] { "logout", loginFile.Location });
                    }

                    loginFile.Delete();
                });
        }
        catch (Exception e)
        {
            loginErrorCounter.Inc();
            log.LogError(e);
        }
    }

    // Look for any required logins that don't have an existing login file,
    // and then login the registry and then create the login file on success.

    foreach (var loginFile in shaToRequiredLogins.Values
        .Where(login => !shaToExistingLogins.ContainsKey(login.Sha256)))
    {
        try
        {
            await retry.InvokeAsync(
                async () =>
                {
                    log.LogInfo($"{podmanPath} login {loginFile.Location} --username {loginFile.Username} --password REDACTED");

                    if (NeonHelper.IsLinux)
                    {
                        (await Node.ExecuteCaptureAsync(podmanPath, new object[] { "login", loginFile.Location, "--username", loginFile.Username, "--password", loginFile.Password })).EnsureSuccess();
                    }
                });

            loginFile.Write();
        }
        catch (Exception e)
        {
            loginErrorCounter.Inc();
            log.LogError(e);
        }
    }

    //-----------------------------------------------------------------
    // Finally, we need to force a re-login for any existing logins that haven't
    // been explicitly logged into for a while.  Note that we're always going to
    // log into the local Harbor registry.

    foreach (var file in Directory.GetFiles(hostContainerRegistriesFolder, "*.login", SearchOption.TopDirectoryOnly))
    {
        // Read the next existing login file.

        var loginFile = LoginFile.Read(file);

        if (loginFile == null)
        {
            continue;
        }

        // Update the login with the password from the corresponding container registry resource.

        var registry = registries.FirstOrDefault(r => r.Spec.Location == loginFile.Location);

        if (registry == null)
        {
            log.LogWarn($"Cannot locate [{nameof(V1NeonContainerRegistry)}] resource for [location={loginFile.Location}].");
            continue;
        }

        loginFile.Password = registry.Spec.Password;

        // Perform the login when the scheduled re-login time (last update plus the
        // re-login interval plus some random jitter) has been reached, or always
        // for the local cluster registry.

        var scheduledLoginUtc = loginFile.UpdatedUtc + reloginInterval + NeonHelper.PseudoRandomTimespan(reloginMaxRandomInterval);

        if (DateTime.UtcNow >= scheduledLoginUtc || loginFile.Location == KubeConst.LocalClusterRegistry)
        {
            try
            {
                await retry.InvokeAsync(
                    async () =>
                    {
                        log.LogInfo($"{podmanPath} login {loginFile.Location} --username {loginFile.Username} --password REDACTED");

                        if (NeonHelper.IsLinux)
                        {
                            (await Node.ExecuteCaptureAsync(podmanPath, new object[] { "login", loginFile.Location, "--username", loginFile.Username, "--password", loginFile.Password })).EnsureSuccess();
                        }
                    });

                loginFile.Write();
            }
            catch (Exception e)
            {
                loginErrorCounter.Inc();
                log.LogError(e);
            }
        }
    }
}