public async Task Timeout()
{
    // Verify that a policy configured with only a retry interval and a timeout
    // (no explicit max attempts) stops retrying when the timeout elapses.
    // With interval=0.5s and timeout=1.5s, attempts land at t=0, 0.5, 1.0
    // and 1.5 seconds: 4 attempts total.
    var policy       = new LinearRetryPolicy(TransientDetector, retryInterval: TimeSpan.FromSeconds(0.5), timeout: TimeSpan.FromSeconds(1.5));
    var attemptTimes = new List<DateTime>();

    Assert.Equal(int.MaxValue, policy.MaxAttempts);
    Assert.Equal(TimeSpan.FromSeconds(0.5), policy.RetryInterval);
    Assert.Equal(TimeSpan.FromSeconds(1.5), policy.Timeout);

    await Assert.ThrowsAsync<TransientException>(
        async () =>
        {
            await policy.InvokeAsync(
                async () =>
                {
                    attemptTimes.Add(DateTime.UtcNow);
                    await Task.CompletedTask;
                    throw new TransientException();
                });
        });

    Assert.Equal(4, attemptTimes.Count);

    // Additional test to verify this serious problem is fixed:
    //
    //      https://github.com/nforgeio/neonKUBE/issues/762
    //
    // We'll wait a bit longer to ensure that any (incorrect) deadline computed
    // by the policy when constructed above does not impact a subsequent run.

    await Task.Delay(TimeSpan.FromSeconds(4));
    attemptTimes.Clear();

    Assert.Equal(TimeSpan.FromSeconds(0.5), policy.RetryInterval);
    Assert.Equal(TimeSpan.FromSeconds(1.5), policy.Timeout);

    await Assert.ThrowsAsync<TransientException>(
        async () =>
        {
            await policy.InvokeAsync(
                async () =>
                {
                    attemptTimes.Add(DateTime.UtcNow);
                    await Task.CompletedTask;
                    throw new TransientException();
                });
        });

    Assert.Equal(4, attemptTimes.Count);
}
public async Task SuccessDelayedAggregateArray()
{
    // Verify that a policy constructed with an array of exception types
    // treats an AggregateException wrapping EITHER type as transient,
    // retrying until the operation finally succeeds.
    var policy  = new LinearRetryPolicy(new Type[] { typeof(NotReadyException), typeof(KeyNotFoundException) });
    var times   = new List<DateTime>();
    var success = false;

    await policy.InvokeAsync(
        async () =>
        {
            times.Add(DateTime.UtcNow);
            await Task.Delay(0);

            if (times.Count < policy.MaxAttempts)
            {
                // BUGFIX: This previously tested [times.Count % 1 == 0], which is
                // always true, so the [KeyNotFoundException] branch was dead code
                // and the second exception type in the array was never exercised.
                // Alternating on [% 2] throws both types across the retries.

                if (times.Count % 2 == 0)
                {
                    throw new AggregateException(new NotReadyException());
                }
                else
                {
                    throw new AggregateException(new KeyNotFoundException());
                }
            }

            success = true;
        });

    Assert.True(success);
    Assert.Equal(policy.MaxAttempts, times.Count);
    VerifyIntervals(times, policy);
}
public async Task SuccessCustom()
{
    // Verify that custom [maxAttempts] and [retryInterval] settings are honored:
    // the operation fails transiently until the final allowed attempt and the
    // policy retries with the configured spacing.
    var policy    = new LinearRetryPolicy(TransientDetector, maxAttempts: 4, retryInterval: TimeSpan.FromSeconds(2));
    var attempts  = new List<DateTime>();
    var succeeded = false;

    Assert.Equal(4, policy.MaxAttempts);
    Assert.Equal(TimeSpan.FromSeconds(2), policy.RetryInterval);

    await policy.InvokeAsync(
        async () =>
        {
            attempts.Add(DateTime.UtcNow);
            await Task.CompletedTask;

            if (attempts.Count < policy.MaxAttempts)
            {
                throw new TransientException();
            }

            succeeded = true;
        });

    Assert.True(succeeded);
    Assert.Equal(policy.MaxAttempts, attempts.Count);
    VerifyIntervals(attempts, policy);
}
public async Task SuccessCustom_Result()
{
    // Same as [SuccessCustom] but via the result-returning [InvokeAsync<T>]
    // overload: the value produced on the final successful attempt must be
    // returned to the caller.
    var policy   = new LinearRetryPolicy(TransientDetector, maxAttempts: 4, retryInterval: TimeSpan.FromSeconds(2));
    var attempts = new List<DateTime>();

    Assert.Equal(4, policy.MaxAttempts);
    Assert.Equal(TimeSpan.FromSeconds(2), policy.RetryInterval);

    var result = await policy.InvokeAsync(
        async () =>
        {
            attempts.Add(DateTime.UtcNow);
            await Task.Delay(0);

            if (attempts.Count < policy.MaxAttempts)
            {
                throw new TransientException();
            }

            return "WOOHOO!";
        });

    Assert.Equal("WOOHOO!", result);
    Assert.Equal(policy.MaxAttempts, attempts.Count);
    VerifyIntervals(attempts, policy);
}
/// <summary>
/// Used to start the fixture within a <see cref="ComposedFixture"/>.
/// </summary>
/// <param name="image">
/// Optionally specifies the NATS container image.  This defaults to
/// <b>nkubeio/nats:latest</b> or <b>nkubedev/nats:latest</b> depending
/// on whether the assembly was built from a git release branch or not.
/// </param>
/// <param name="name">Optionally specifies the container name (defaults to <c>nats-test</c>).</param>
/// <param name="args">Optional NATS server command line arguments.</param>
public void StartAsComposed(
    string   image = null,
    string   name  = "nats-test",
    string[] args  = null)
{
    image = image ?? $"{KubeConst.NeonBranchRegistry}/nats:latest";

    base.CheckWithinAction();

    // Expose the standard NATS ports: 4222 (client), 8222 (monitoring)
    // and 6222 (cluster routes).

    var dockerArgs =
        new string[]
        {
            "--detach",
            "-p", "4222:4222",
            "-p", "8222:8222",
            "-p", "6222:6222"
        };

    if (!IsRunning)
    {
        StartAsComposed(name, image, dockerArgs, args);
    }

    // The NATS server inside the container may not be accepting connections
    // yet, so retry connecting for up to ~10 seconds (20 x 0.5s).
    //
    // NOTE(review): [Wait()] blocks synchronously on the async retry; any
    // failure surfaces as an [AggregateException] wrapping the real cause.

    var factory = new ConnectionFactory();
    var retry   = new LinearRetryPolicy(exception => true, 20, TimeSpan.FromSeconds(0.5));

    retry.InvokeAsync(
        async () =>
        {
            Connection = factory.CreateConnection();
            await Task.CompletedTask;
        }).Wait();
}
/// <summary>
/// Signals the Docker orchestrator to begin scheduling service tasks on a node.
/// </summary>
/// <param name="nodeName">Identifies the target node.</param>
/// <exception cref="KeyNotFoundException">Thrown if the named node does not exist.</exception>
/// <exception cref="InvalidOperationException">Thrown if the node is not part of the swarm.</exception>
public void ActivateNode(string nodeName)
{
    Covenant.Requires<ArgumentNullException>(!string.IsNullOrEmpty(nodeName));

    var node = hive.GetNode(nodeName);

    if (!node.Metadata.InSwarm)
    {
        throw new InvalidOperationException($"Node [{nodeName}] is not part of the swarm.");
    }

    // I've seen transient errors, so we'll retry a few times.

    var manager = hive.GetReachableManager();
    var retry   = new LinearRetryPolicy(typeof(Exception), maxAttempts: 5, retryInterval: TimeSpan.FromSeconds(5));

    retry.InvokeAsync(
        async () =>
        {
            // A non-zero exit code indicates the update failed; throwing here
            // causes the policy to retry (any [Exception] is treated as transient).

            var response = manager.SudoCommand($"docker node update --availability active {nodeName}");

            if (response.ExitCode != 0)
            {
                throw new Exception(response.ErrorSummary);
            }

            await Task.CompletedTask;
        }).Wait();
}
public async Task FailDelayed_Result()
{
    // Verify that a non-transient exception thrown after an initial transient
    // failure aborts the retry loop immediately and surfaces to the caller.
    var policy   = new LinearRetryPolicy(TransientDetector);
    var attempts = new List<DateTime>();

    await Assert.ThrowsAsync<NotImplementedException>(
        async () =>
        {
            await policy.InvokeAsync<string>(
                async () =>
                {
                    attempts.Add(DateTime.UtcNow);
                    await Task.Delay(0);

                    if (attempts.Count >= 2)
                    {
                        // Second attempt: fail hard (non-transient).
                        throw new NotImplementedException();
                    }

                    // First attempt: fail transiently so the policy retries once.
                    throw new TransientException();
                });
        });

    Assert.Equal(2, attempts.Count);
    VerifyIntervals(attempts, policy);
}
/// <summary>
/// Establishes the server connection.
/// </summary>
private void Connect()
{
    var factory = new StanConnectionFactory();

    // The NATS Streaming server may still be starting, so retry the
    // connection for up to ~10 seconds (20 x 0.5s), treating any
    // exception as transient.

    var retry = new LinearRetryPolicy(exception => true, 20, TimeSpan.FromSeconds(0.5));

    retry.InvokeAsync(
        async () =>
        {
            Connection = factory.CreateConnection("test-cluster", nameof(NatsStreamingFixture));
            await Task.CompletedTask;
        }).Wait();
}
/// <summary>
/// Writes a file as text, retrying if the file is already open.
/// </summary>
/// <param name="path">The file path.</param>
/// <param name="text">The text to be written.</param>
/// <returns>The <paramref name="text"/> passed, for caller convenience.</returns>
/// <remarks>
/// It's possible for the configuration file to be temporarily opened
/// by another process (e.g. the neonKUBE Desktop application or a
/// command line tool).  Rather than throw an exception, we're going
/// to retry the operation a few times.
/// </remarks>
internal static string WriteFileTextWithRetry(string path, string text)
{
    // Retry [IOException] (e.g. sharing violations) up to 10 times over
    // roughly 2 seconds (10 x 200ms) before giving up.

    var retry = new LinearRetryPolicy(typeof(IOException), maxAttempts: 10, retryInterval: TimeSpan.FromMilliseconds(200));

    retry.InvokeAsync(
        async () =>
        {
            await Task.CompletedTask;
            File.WriteAllText(path, text);
        }).Wait();

    return(text);
}
public async Task SuccessImmediate_Result()
{
    // Verify that an operation succeeding on the first attempt runs exactly
    // once and its result is returned unchanged.
    var policy   = new LinearRetryPolicy(TransientDetector);
    var attempts = new List<DateTime>();

    var result = await policy.InvokeAsync(
        async () =>
        {
            attempts.Add(DateTime.UtcNow);
            await Task.Delay(0);

            return "WOOHOO!";
        });

    Assert.Single(attempts);
    Assert.Equal("WOOHOO!", result);
}
/// <summary>
/// Signals the Docker orchestrator to drain all service tasks from a node.
/// </summary>
/// <param name="nodeName">Identifies the target node.</param>
/// <exception cref="KeyNotFoundException">Thrown if the named node does not exist.</exception>
/// <exception cref="InvalidOperationException">Thrown if the node is not part of the swarm.</exception>
public void DrainNode(string nodeName)
{
    Covenant.Requires<ArgumentNullException>(!string.IsNullOrEmpty(nodeName));

    var node = hive.GetNode(nodeName);

    if (!node.Metadata.InSwarm)
    {
        throw new InvalidOperationException($"Node [{nodeName}] is not part of the swarm.");
    }

    // I've seen transient errors, so we'll retry a few times.

    var manager = hive.GetReachableManager();
    var retry   = new LinearRetryPolicy(typeof(Exception), maxAttempts: 5, retryInterval: TimeSpan.FromSeconds(5));

    retry.InvokeAsync(
        async () =>
        {
            // A non-zero exit code indicates the drain command failed; throw so
            // the policy retries (any [Exception] is treated as transient).

            var response = manager.SudoCommand($"docker node update --availability drain {nodeName}");

            if (response.ExitCode != 0)
            {
                throw new Exception(response.ErrorSummary);
            }

            await Task.CompletedTask;
        }).Wait();

    // $todo(jeff.lill):
    //
    // Ideally, we'd wait for all of the service tasks to stop but it
    // appears that there's no easy way to check for this other than
    // listing all of the hive services and then doing a
    //
    //      docker service ps SERVICE]
    //
    // for each until none report running on this node.
    //
    // A hacky alternative would be to list local containers and try
    // to determine which ones look like service tasks by examining
    // the container name.

    // Fixed 30 second grace period in lieu of actually confirming that the
    // drained tasks have stopped (see the $todo above).

    Thread.Sleep(TimeSpan.FromSeconds(30));
}
public async Task SuccessImmediate()
{
    // Verify that an operation which succeeds on the first attempt is
    // invoked exactly once.
    var policy    = new LinearRetryPolicy(TransientDetector);
    var attempts  = new List<DateTime>();
    var succeeded = false;

    await policy.InvokeAsync(
        async () =>
        {
            attempts.Add(DateTime.UtcNow);
            await Task.Delay(0);
            succeeded = true;
        });

    Assert.Single(attempts);
    Assert.True(succeeded);
}
public async Task FailImmediate_Result()
{
    // A non-transient exception thrown on the very first attempt should
    // propagate immediately with no retries.
    var policy   = new LinearRetryPolicy(TransientDetector);
    var attempts = new List<DateTime>();

    await Assert.ThrowsAsync<NotImplementedException>(
        async () =>
        {
            await policy.InvokeAsync<string>(
                async () =>
                {
                    attempts.Add(DateTime.UtcNow);
                    await Task.Delay(0);
                    throw new NotImplementedException();
                });
        });

    Assert.Single(attempts);
}
public async Task FailAll_Result()
{
    // When every attempt fails transiently, the policy should exhaust
    // [MaxAttempts] and then rethrow the transient exception.
    var policy   = new LinearRetryPolicy(TransientDetector);
    var attempts = new List<DateTime>();

    await Assert.ThrowsAsync<TransientException>(
        async () =>
        {
            await policy.InvokeAsync<string>(
                async () =>
                {
                    attempts.Add(DateTime.UtcNow);
                    await Task.Delay(0);
                    throw new TransientException();
                });
        });

    Assert.Equal(policy.MaxAttempts, attempts.Count);
    VerifyIntervals(attempts, policy);
}
/// <summary>
/// Restarts the NATS container to clear any previous state and returns the
/// new client connection.
/// </summary>
public new IConnection Restart()
{
    base.Restart();

    // Drop the stale connection to the old container instance (if any).

    if (Connection != null)
    {
        Connection.Dispose();
        Connection = null;
    }

    // The restarted server may not be accepting connections yet, so retry
    // for up to ~10 seconds (20 x 0.5s), treating any exception as transient.

    var factory = new ConnectionFactory();
    var retry   = new LinearRetryPolicy(exception => true, 20, TimeSpan.FromSeconds(0.5));

    retry.InvokeAsync(
        async () =>
        {
            Connection = factory.CreateConnection($"nats://{GetHostInterface(hostInterface, forConnection: true)}:4222");
            await Task.CompletedTask;
        }).Wait();

    return(Connection);
}
public async Task SuccessDelayed_Result()
{
    // Verify that transient failures are retried up to the default attempt
    // limit and that the value produced by the final successful attempt is
    // returned to the caller.
    var policy   = new LinearRetryPolicy(TransientDetector);
    var attempts = new List<DateTime>();

    var result = await policy.InvokeAsync(
        async () =>
        {
            attempts.Add(DateTime.UtcNow);
            await Task.CompletedTask;

            if (attempts.Count < policy.MaxAttempts)
            {
                throw new TransientException();
            }

            return "WOOHOO!";
        });

    Assert.Equal("WOOHOO!", result);
    Assert.Equal(policy.MaxAttempts, attempts.Count);
    VerifyIntervals(attempts, policy);
}
public async Task Timeout()
{
    // With [maxAttempts=6], [retryInterval=0.5s] and [timeout=1.5s], the
    // timeout should cut retries short before all 6 attempts are made.
    var policy   = new LinearRetryPolicy(TransientDetector, maxAttempts: 6, retryInterval: TimeSpan.FromSeconds(0.5), timeout: TimeSpan.FromSeconds(1.5));
    var attempts = new List<DateTime>();

    Assert.Equal(6, policy.MaxAttempts);
    Assert.Equal(TimeSpan.FromSeconds(0.5), policy.RetryInterval);
    Assert.Equal(TimeSpan.FromSeconds(1.5), policy.Timeout);

    await Assert.ThrowsAsync<TransientException>(
        async () =>
        {
            await policy.InvokeAsync(
                async () =>
                {
                    attempts.Add(DateTime.UtcNow);
                    await Task.CompletedTask;
                    throw new TransientException();
                });
        });

    Assert.True(attempts.Count < 6);
}
public async Task SuccessDelayedByType()
{
    // Verify that a policy constructed with a single exception [Type] treats
    // that type as transient, retrying until the operation succeeds.
    var policy    = new LinearRetryPolicy(typeof(NotReadyException));
    var attempts  = new List<DateTime>();
    var succeeded = false;

    await policy.InvokeAsync(
        async () =>
        {
            attempts.Add(DateTime.UtcNow);
            await Task.Delay(0);

            if (attempts.Count < policy.MaxAttempts)
            {
                throw new NotReadyException();
            }

            succeeded = true;
        });

    Assert.True(succeeded);
    Assert.Equal(policy.MaxAttempts, attempts.Count);
    VerifyIntervals(attempts, policy);
}
/// <summary>
/// <para>
/// Used to temporarily modify the <b>hosts</b> file used by the DNS resolver
/// for debugging or other purposes.
/// </para>
/// <note>
/// <b>WARNING:</b> Modifying the <b>hosts</b> file will impact all processes
/// on the system, not just the current one and this is designed to be used by
/// a single process at a time.
/// </note>
/// </summary>
/// <param name="hostEntries">A dictionary mapping the hostnames to an IP address or <c>null</c>.</param>
/// <param name="section">
/// <para>
/// Optionally specifies the string to use to mark the hostnames section.  This
/// defaults to <b>MODIFY</b> which will delimit the section with <b># NEON-BEGIN-MODIFY</b>
/// and <b># NEON-END-MODIFY</b>.  You may pass a different string to identify a custom section.
/// </para>
/// <note>
/// The string passed must be a valid DNS hostname label that must begin with a letter
/// followed by letters, digits or dashes.  The maximum length is 63 characters.
/// </note>
/// </param>
/// <remarks>
/// <note>
/// This method requires elevated administrative privileges.
/// </note>
/// <para>
/// This method adds or removes a temporary section of host entry definitions
/// delimited by special comment lines.  When <paramref name="hostEntries"/> is
/// non-null and non-empty, the section will be added or updated.  Otherwise, the
/// section will be removed.
/// </para>
/// <para>
/// You can remove all host sections by passing both <paramref name="hostEntries"/>
/// and <paramref name="section"/> as <c>null</c>.
/// </para>
/// </remarks>
public static void ModifyLocalHosts(Dictionary<string, IPAddress> hostEntries = null, string section = "MODIFY")
{
#if XAMARIN
    throw new NotSupportedException();
#else
    if (hostEntries != null && string.IsNullOrWhiteSpace(section))
    {
        throw new ArgumentNullException(nameof(section));
    }

    // Validate that [section] is a legal DNS label: starts with a letter,
    // contains only letters/digits/dashes and is at most 63 characters.

    if (section != null)
    {
        var sectionOK = char.IsLetter(section[0]) && section.Length <= 63;

        if (sectionOK)
        {
            foreach (var ch in section)
            {
                if (!char.IsLetterOrDigit(ch) && ch != '-')
                {
                    sectionOK = false;
                    break;
                }
            }
        }

        if (!sectionOK)
        {
            throw new ArgumentException("Suffix is not a valid DNS host name label.", nameof(section));
        }

        section = section.ToUpperInvariant();
    }

    // Locate the platform's [hosts] file.

    string hostsPath;

    if (NeonHelper.IsWindows)
    {
        hostsPath = Path.Combine(Environment.GetEnvironmentVariable("windir"), "System32", "drivers", "etc", "hosts");
    }
    else if (NeonHelper.IsLinux || NeonHelper.IsOSX)
    {
        hostsPath = "/etc/hosts";
    }
    else
    {
        throw new NotSupportedException();
    }

    // We're seeing transient file locked errors when trying to update the [hosts] file.
    // My guess is that this is caused by the Windows DNS resolver opening the file as
    // READ/WRITE to prevent it from being modified while the resolver is reading any
    // changes.
    //
    // We're going to mitigate this by retrying a few times.
    //
    // It can take a bit of time for the Windows DNS resolver to pick up the change.
    //
    //      https://github.com/nforgeio/neonKUBE/issues/244
    //
    // We're going to mitigate this by writing a [neon-modify-local-hosts.nhive.io] record with
    // a random IP address and then wait for the DNS resolver to report the correct address.
    //
    // Note that this only works on Windows and perhaps OSX.  This doesn't work on
    // Linux because there's no central DNS resolver there.  See the issue below for
    // more information:
    //
    //      https://github.com/nforgeio/neonKUBE/issues/271

    var updateHost    = section != null ? $"{section.ToLowerInvariant()}.neonforge-marker" : $"H-{Guid.NewGuid().ToString("d")}.neonforge-marker";
    var addressBytes  = NeonHelper.GetCryptoRandomBytes(4);   // NOTE(review): appears unused; [updateAddress] comes from [GetRandomAddress()] — confirm before removing.
    var updateAddress = GetRandomAddress();
    var lines         = new List<string>();
    var existingHosts = new Dictionary<string, string>(StringComparer.InvariantCultureIgnoreCase);
    var different     = false;

    retryFile.InvokeAsync(
        async () =>
        {
            var beginMarker = $"# NEON-BEGIN-";
            var endMarker   = $"# NEON-END-";

            if (section != null)
            {
                beginMarker += section;
                endMarker   += section;
            }

            var inputLines = File.ReadAllLines(hostsPath);
            var inSection  = false;

            // Load lines of text from the current [hosts] file, without
            // any lines for the named section.  We're going to parse those
            // lines instead, so we can compare them against the [hostEntries]
            // passed to determine whether we actually need to update the
            // [hosts] file.

            lines.Clear();
            existingHosts.Clear();

            foreach (var line in inputLines)
            {
                var trimmed = line.Trim();

                if (trimmed == beginMarker || (section == null && trimmed.StartsWith(beginMarker)))
                {
                    inSection = true;
                }
                else if (trimmed == endMarker || (section == null && trimmed.StartsWith(endMarker)))
                {
                    inSection = false;
                }
                else
                {
                    if (inSection)
                    {
                        // The line is within the named section, so we're going to parse
                        // the host entry (if any) and add it to [existingHosts].

                        if (trimmed.Length == 0 || trimmed.StartsWith("#"))
                        {
                            // Ignore empty or comment lines (just to be safe).

                            continue;
                        }

                        // We're going to simply assume that the address and hostname
                        // are separated by whitespace and that there's no other junk
                        // on the line (like comments added by the operator).  If there
                        // is any junk, we'll capture that too and then the entries
                        // won't match and we'll just end up rewriting the section
                        // (which is reasonable).
                        //
                        // Note that we're going to ignore the special marker entry.

                        var fields   = line.Split(new char[] { ' ', '\t' }, StringSplitOptions.RemoveEmptyEntries);
                        var address  = fields[0];
                        var hostname = fields.Length > 1 ? fields[1] : string.Empty;

                        if (!hostname.EndsWith(".neonforge-marker"))
                        {
                            existingHosts[hostname] = address;
                        }
                    }
                    else
                    {
                        // The line is not in the named section, so we'll
                        // include it as is.

                        lines.Add(line);
                    }
                }
            }

            // Compare the existing entries against the new ones and rewrite
            // the [hosts] file only if they are different.

            if (hostEntries != null && hostEntries.Count == existingHosts.Count)
            {
                foreach (var item in hostEntries)
                {
                    if (!existingHosts.TryGetValue(item.Key, out var existingAddress) || item.Value.ToString() != existingAddress)
                    {
                        different = true;
                        break;
                    }
                }

                if (!different)
                {
                    return;
                }
            }

            // Append the section if it has any host entries.

            if (hostEntries?.Count > 0)
            {
                lines.Add(beginMarker);

                // Append the special update host with a random IP address.

                var address = updateAddress.ToString();

                lines.Add($" {address}{new string(' ', 16 - address.Length)} {updateHost}");

                // Append the new entries.

                foreach (var item in hostEntries)
                {
                    address = item.Value.ToString();
                    lines.Add($" {address}{new string(' ', 16 - address.Length)} {item.Key}");
                }

                lines.Add(endMarker);
            }

            File.WriteAllLines(hostsPath, lines.ToArray());
            await Task.CompletedTask;
        }).Wait();

    if (!different)
    {
        // We didn't detect any changes to the section above so we're going to
        // exit without rewriting the [hosts] file.

        return;
    }

    if (NeonHelper.IsWindows)
    {
        // Flush the DNS cache (and I believe this reloads the [hosts] file too).

        var response = NeonHelper.ExecuteCapture("ipconfig", "/flushdns");

        if (response.ExitCode != 0)
        {
            throw new ToolException($"ipconfig [exitcode={response.ExitCode}]: {response.ErrorText}");
        }
    }
    else if (NeonHelper.IsOSX)
    {
        // $todo(jefflill):
        //
        // We may need to clear the OSX DNS cache here.  Here's some information on
        // how to do this:
        //
        //      https://help.dreamhost.com/hc/en-us/articles/214981288-Flushing-your-DNS-cache-in-Mac-OS-X-and-Linux

        throw new NotImplementedException("$todo(jefflill): Purge the OSX DNS cache.");
    }

    if (NeonHelper.IsWindows || NeonHelper.IsOSX)
    {
        // Poll the local DNS resolver until it reports the correct address for the
        // [neon-modify-local-hosts.nhive.io].
        //
        // If [hostEntries] is not null and contains at least one entry, we'll lookup
        // [neon-modify-local-hosts.neon] and compare the IP address to ensure that the
        // resolver has loaded the new entries.
        //
        // If [hostEntries] is null or empty, we'll wait until there are no records
        // for [neon-modify-local-hosts.neon] to ensure that the resolver has reloaded
        // the hosts file after we removed the entries.
        //
        // Note that we're going to count the retries and after the 20th (about 2 second's
        // worth of 100ms polling), we're going to rewrite the [hosts] file.  I've seen
        // situations where it appears that the DNS resolver isn't re-reading [hosts]
        // after it's been updated.  I believe this is due to the file being written
        // twice, once to remove the section and then shortly again there after to
        // write the section again.  I believe there's a chance that the resolver may
        // miss the second file change notification.  Writing the file again should
        // trigger a new notification.

        var retryCount = 0;

        retryReady.InvokeAsync(
            async () =>
            {
                var addresses = await GetHostAddressesAsync(updateHost);

                if (hostEntries?.Count > 0)
                {
                    // Ensure that the new records have been loaded by the resolver.

                    if (addresses.Length != 1)
                    {
                        RewriteOn20thRetry(hostsPath, lines, ref retryCount);
                        throw new NotReadyException($"[{updateHost}] lookup is returning [{addresses.Length}] results. There should be [1].");
                    }

                    if (addresses[0].ToString() != updateAddress.ToString())
                    {
                        RewriteOn20thRetry(hostsPath, lines, ref retryCount);
                        throw new NotReadyException($"DNS is [{updateHost}={addresses[0]}] rather than [{updateAddress}].");
                    }
                }
                else
                {
                    // Ensure that the resolver recognizes that we removed the records.

                    if (addresses.Length != 0)
                    {
                        RewriteOn20thRetry(hostsPath, lines, ref retryCount);
                        throw new NotReadyException($"[{updateHost}] lookup is returning [{addresses.Length}] results. There should be [0].");
                    }
                }
            }).Wait();
    }
#endif
}
/// <summary>
/// Configures the Kibana dashboard.
/// </summary>
/// <param name="firstManager">The first hive proxy manager.</param>
public void ConfigureKibana(SshProxy<NodeDefinition> firstManager)
{
    // Nothing to do when hive logging is disabled.

    if (!hive.Definition.Log.Enabled)
    {
        return;
    }

    firstManager.InvokeIdempotentAction("setup/log-kibana",
        () =>
        {
            using (var jsonClient = new JsonClient())
            {
                var baseLogEsDataUri = hive.Definition.LogEsDataUri;
                var baseKibanaUri    = $"http://{firstManager.PrivateAddress}:{HiveHostPorts.Kibana}";
                var timeout          = TimeSpan.FromMinutes(5);
                var retry            = new LinearRetryPolicy(TransientDetector.Http, maxAttempts: 30, retryInterval: TimeSpan.FromSeconds(2));

                // The Kibana API calls below require the [kbn-xsrf] header.

                jsonClient.DefaultRequestHeaders.Add("kbn-xsrf", "true");

                // Ensure that Kibana is ready before we submit any API requests.
                // Anything other than an overall [green] status is treated as
                // transient so the policy keeps polling.

                firstManager.Status = "wait for kibana";

                retry.InvokeAsync(
                    async () =>
                    {
                        var response = await jsonClient.GetAsync<dynamic>($"{baseKibanaUri}/api/status");

                        if (response.status.overall.state != "green")
                        {
                            throw new TransientException($"Kibana [state={response.status.overall.state}]");
                        }
                    }).Wait();

                // Add the index pattern to Kibana.

                firstManager.Status = "configure kibana index pattern";

                retry.InvokeAsync(
                    async () =>
                    {
                        dynamic indexPattern = new ExpandoObject();
                        dynamic attributes   = new ExpandoObject();

                        attributes.title         = "logstash-*";
                        attributes.timeFieldName = "@timestamp";

                        indexPattern.attributes = attributes;

                        await jsonClient.PostAsync($"{baseKibanaUri}/api/saved_objects/index-pattern/logstash-*?overwrite=true", indexPattern);
                    }).Wait();

                // Now we need to save a Kibana config document so that [logstash-*] will be
                // the default index and the timestamp will be displayed as UTC and have a
                // more useful terse format.

                firstManager.Status = "configure kibana defaults";

                retry.InvokeAsync(
                    async () =>
                    {
                        dynamic setting = new ExpandoObject();

                        setting.value = "logstash-*";
                        await jsonClient.PostAsync($"{baseKibanaUri}/api/kibana/settings/defaultIndex", setting);

                        setting.value = "HH:mm:ss.SSS MM-DD-YYYY";
                        await jsonClient.PostAsync($"{baseKibanaUri}/api/kibana/settings/dateFormat", setting);

                        setting.value = "UTC";
                        await jsonClient.PostAsync($"{baseKibanaUri}/api/kibana/settings/dateFormat:tz", setting);
                    }).Wait();

                // Set the Kibana traffic manager rule so the dashboard is reachable
                // via the private proxy.

                firstManager.Status = "kibana traffic manager rule";

                var rule = new TrafficHttpRule()
                {
                    Name     = "neon-log-kibana",
                    System   = true,
                    Log      = true,
                    Resolver = null
                };

                rule.Frontends.Add(
                    new TrafficHttpFrontend()
                    {
                        ProxyPort = HiveHostPorts.ProxyPrivateKibanaDashboard
                    });

                rule.Backends.Add(
                    new TrafficHttpBackend()
                    {
                        Server = "neon-log-kibana",
                        Port   = NetworkPorts.Kibana
                    });

                hive.PrivateTraffic.SetRule(rule);

                firstManager.Status = string.Empty;
            }
        });
}
/// <summary>
/// Implements the service as a <see cref="Task"/>.
/// </summary>
/// <returns>The <see cref="Task"/>.</returns>
private static async Task RunAsync()
{
    var localMD5    = string.Empty;
    var remoteMD5   = "[unknown]";
    var verifyTimer = new PolledTimer(verifyInterval, autoReset: true);

    var periodicTask =
        new AsyncPeriodicTask(
            pollInterval,
            onTaskAsync:
                async () =>
                {
                    log.LogDebug(() => "Starting poll");
                    log.LogDebug(() => "Fetching DNS answers MD5 from Consul.");

                    remoteMD5 = await consul.KV.GetStringOrDefault(HiveConst.ConsulDnsHostsMd5Key, terminator.CancellationToken);

                    if (remoteMD5 == null)
                    {
                        remoteMD5 = "[unknown]";
                    }

                    var verify = verifyTimer.HasFired;

                    if (verify)
                    {
                        // Under normal circumstances, we should never see the reload signal file
                        // here because the [neon-dns-loader] service should have deleted it after
                        // handling the last change signal.
                        //
                        // This probably means that [neon-dns-loader] is not running or if this service
                        // is configured with POLL_INTERVAL being so short that [neon-dns-loader]
                        // hasn't had a chance to handle the previous signal.

                        if (File.Exists(reloadSignalPath))
                        {
                            log.LogWarn("[neon-dns-loader] service doesn't appear to be running because the reload signal file is present.");
                        }
                    }

                    if (!verify && localMD5 == remoteMD5)
                    {
                        log.LogDebug(() => "DNS answers are unchanged.");
                    }
                    else
                    {
                        if (localMD5 == remoteMD5)
                        {
                            log.LogDebug(() => "DNS answers have not changed but we're going to verify that we have the correct hosts anyway.");
                        }
                        else
                        {
                            log.LogDebug(() => "DNS answers have changed.");
                        }

                        log.LogDebug(() => "Fetching DNS answers.");

                        var hostsTxt = await consul.KV.GetStringOrDefault(HiveConst.ConsulDnsHostsKey, terminator.CancellationToken);

                        if (hostsTxt == null)
                        {
                            log.LogWarn(() => "DNS answers do not exist on Consul. Is [neon-dns-mon] functioning properly?");
                        }
                        else
                        {
                            var marker = "# -------- NEON-DNS --------";

                            // We have the host entries from Consul.  We need to add these onto the
                            // end [/etc/powserdns/hosts], replacing any host entries written during
                            // a previous run.
                            //
                            // We're going to use the special marker line above to separate the
                            // built-in hosts (above the line) from the dynamic hosts we're
                            // generating here (which will be below the line).  Note that this
                            // line won't exist the first time this service runs, so we'll just
                            // add it.
                            //
                            // Note that it's possible that the PowerDNS Recursor might be reading this
                            // file while we're trying to write it.  We're going to treat these as
                            // transient errors and retry.

                            var retry = new LinearRetryPolicy(typeof(IOException), maxAttempts: 5, retryInterval: TimeSpan.FromSeconds(1));

                            await retry.InvokeAsync(
                                async () =>
                                {
                                    using (var stream = new FileStream(powerDnsHostsPath, FileMode.Open, FileAccess.ReadWrite))
                                    {
                                        // Read a copy of the hosts file as bytes so we can compare
                                        // the old version with the new one generated below for changes.

                                        var orgHostBytes = stream.ReadToEnd();

                                        stream.Position = 0;

                                        // Generate the new hosts file.

                                        var sbHosts = new StringBuilder();

                                        // Read the hosts file up to but not including the special marker
                                        // line (if it's present).

                                        using (var reader = new StreamReader(stream, Encoding.UTF8, true, 32 * 1024, leaveOpen: true))
                                        {
                                            foreach (var line in reader.Lines())
                                            {
                                                if (line.StartsWith(marker))
                                                {
                                                    break;
                                                }

                                                sbHosts.AppendLine(line);
                                            }
                                        }

                                        // Strip any trailing whitespace from the hosts file so we'll
                                        // be able to leave a nice blank line between the end of the
                                        // original file and the special marker line.

                                        var text = sbHosts.ToString().TrimEnd();

                                        sbHosts.Clear();
                                        sbHosts.AppendLine(text);

                                        // Append the marker line, followed by dynamic host
                                        // entries we downloaded from Consul.

                                        sbHosts.AppendLine();
                                        sbHosts.AppendLine(marker);
                                        sbHosts.AppendLine();
                                        sbHosts.Append(hostsTxt);

                                        // Generate the new host file bytes, taking care to ensure that
                                        // we're using Linux style line endings and then update the
                                        // hosts file if anything changed.

                                        var hostsText    = NeonHelper.ToLinuxLineEndings(sbHosts.ToString());
                                        var newHostBytes = Encoding.UTF8.GetBytes(hostsText);

                                        if (NeonHelper.ArrayEquals(orgHostBytes, newHostBytes))
                                        {
                                            log.LogDebug(() => $"[{powerDnsHostsPath}] file is up-to-date.");
                                        }
                                        else
                                        {
                                            log.LogDebug(() => $"[{powerDnsHostsPath}] is being updated.");

                                            stream.Position = 0;
                                            stream.SetLength(0);
                                            stream.Write(newHostBytes);

                                            // Signal to the local [neon-dns-loader] systemd service that it needs
                                            // to have PowerDNS Recursor reload the hosts file.

                                            File.WriteAllText(reloadSignalPath, "reload now");
                                        }
                                    }

                                    // NOTE(review): this "Finished poll" log is emitted inside the
                                    // retry operation, so it can appear once per retry attempt —
                                    // confirm whether it was meant to be outside the lambda.

                                    log.LogDebug(() => "Finished poll");
                                    await Task.CompletedTask;
                                });

                            // We've successfully synchronized the local hosts file with
                            // the Consul DNS settings.

                            localMD5 = remoteMD5;
                        }
                    }

                    // Returning [false] tells AsyncPeriodicTask to keep polling.

                    return(await Task.FromResult(false));
                },
            onExceptionAsync:
                async e =>
                {
                    log.LogError(e);
                    return(await Task.FromResult(false));
                },
            onTerminateAsync:
                async () =>
                {
                    log.LogInfo(() => "Terminating");
                    await Task.CompletedTask;
                });

    terminator.AddDisposable(periodicTask);
    await periodicTask.Run();
}
/// <inheritdoc/>
public void Run(ModuleContext context)
{
    var hive = HiveHelper.Hive;
    string hostname;

    if (!context.ValidateArguments(context.Arguments, validModuleArgs))
    {
        context.Failed = true;
        return;
    }

    // Obtain common arguments.

    context.WriteLine(AnsibleVerbosity.Trace, $"Parsing [state]");

    if (!context.Arguments.TryGetValue<string>("state", out var state))
    {
        state = "present";
    }

    state = state.ToLowerInvariant();

    if (context.HasErrors)
    {
        return;
    }

    var manager = hive.GetReachableManager();

    // Determine whether the registry service is already deployed and
    // also retrieve the registry credentials from Vault if present.
    // Note that the current registry hostname will be persisted to
    // Consul at [neon/service/neon-registry/hostname] when a registry
    // is deployed.

    context.WriteLine(AnsibleVerbosity.Trace, $"Inspecting the [neon-registry] service.");

    var currentService = hive.Docker.InspectService("neon-registry");

    context.WriteLine(AnsibleVerbosity.Trace, $"Getting current registry hostname from Consul.");

    var currentHostname = hive.Registry.GetLocalHostname();
    var currentSecret   = hive.Registry.GetLocalSecret();
    var currentImage    = currentService?.Spec.TaskTemplate.ContainerSpec.ImageWithoutSHA;

    var currentCredentials =    // Set blank properties for the change detection below.
        new RegistryCredentials()
        {
            Registry = string.Empty,
            Username = string.Empty,
            Password = string.Empty
        };

    if (!string.IsNullOrEmpty(currentHostname))
    {
        context.WriteLine(AnsibleVerbosity.Trace, $"Reading existing registry credentials for [{currentHostname}].");

        currentCredentials = hive.Registry.GetCredentials(currentHostname);

        if (currentCredentials != null)
        {
            context.WriteLine(AnsibleVerbosity.Info, $"Registry credentials for [{currentHostname}] exist.");
        }
        else
        {
            context.WriteLine(AnsibleVerbosity.Info, $"Registry credentials for [{currentHostname}] do not exist.");
        }
    }

    // Obtain the current registry TLS certificate (if any).

    var currentCertificate = hive.Certificate.Get("neon-registry");

    // Perform the operation.

    switch (state)
    {
        case "absent":

            context.WriteLine(AnsibleVerbosity.Trace, $"Parsing [hostname]");

            if (!context.Arguments.TryGetValue<string>("hostname", out hostname))
            {
                throw new ArgumentException($"[hostname] module argument is required.");
            }

            if (currentService == null)
            {
                context.WriteLine(AnsibleVerbosity.Important, "[neon-registry] is not currently deployed.");
            }

            if (context.CheckMode)
            {
                context.WriteLine(AnsibleVerbosity.Important, $"Local registry will be removed when CHECK-MODE is disabled.");
                return;
            }

            if (currentService == null)
            {
                return; // Nothing to do
            }

            context.Changed = true;

            // Logout of the registry.

            if (currentCredentials != null)
            {
                context.WriteLine(AnsibleVerbosity.Trace, $"Logging the hive out of the [{currentHostname}] registry.");
                hive.Registry.Logout(currentHostname);
            }

            // Delete the [neon-registry] service and volume.  Note that
            // the volume should exist on all of the manager nodes.

            context.WriteLine(AnsibleVerbosity.Trace, $"Removing the [neon-registry] service.");
            manager.DockerCommand(RunOptions.None, "docker", "service", "rm", "neon-registry");

            context.WriteLine(AnsibleVerbosity.Trace, $"Removing the [neon-registry] volumes.");

            var volumeRemoveActions = new List<Action>();
            var volumeRetryPolicy   = new LinearRetryPolicy(typeof(TransientException), maxAttempts: 10, retryInterval: TimeSpan.FromSeconds(2));

            foreach (var node in hive.Managers)
            {
                volumeRemoveActions.Add(
                    () =>
                    {
                        // $hack(jeff.lill):
                        //
                        // Docker service removal appears to be synchronous but the removal of the
                        // actual service task containers is not.  We're going to detect this and
                        // throw a [TransientException] and then retry.

                        using (var clonedNode = node.Clone())
                        {
                            // NOTE: [syncLock] is used consistently below to serialize writes
                            // to [context] from the parallel volume removal actions.  (The
                            // original code mixed [lock (context)] and [lock (syncLock)].)

                            lock (syncLock)
                            {
                                context.WriteLine(AnsibleVerbosity.Trace, $"Removing [neon-registry] volume on [{clonedNode.Name}].");
                            }

                            volumeRetryPolicy.InvokeAsync(
                                async () =>
                                {
                                    var response = clonedNode.DockerCommand(RunOptions.None, "docker", "volume", "rm", "neon-registry");

                                    if (response.ExitCode != 0)
                                    {
                                        // FIX: the original message was missing the closing
                                        // bracket after the node name.

                                        var message = $"Error removing [neon-registry] volume from [{clonedNode.Name}]: {response.ErrorText}";

                                        lock (syncLock)
                                        {
                                            context.WriteLine(AnsibleVerbosity.Info, message);
                                        }

                                        if (response.AllText.Contains("volume is in use"))
                                        {
                                            throw new TransientException(message);
                                        }
                                    }
                                    else
                                    {
                                        lock (syncLock)
                                        {
                                            context.WriteLine(AnsibleVerbosity.Trace, $"Removed [neon-registry] volume on [{clonedNode.Name}].");
                                        }
                                    }

                                    await Task.Delay(0);

                                }).Wait();
                        }
                    });
            }

            NeonHelper.WaitForParallel(volumeRemoveActions);

            // Remove the traffic manager rule and certificate.

            context.WriteLine(AnsibleVerbosity.Trace, $"Removing the [neon-registry] traffic manager rule.");
            hive.PublicTraffic.RemoveRule("neon-registry");
            context.WriteLine(AnsibleVerbosity.Trace, $"Removing the [neon-registry] traffic manager certificate.");
            hive.Certificate.Remove("neon-registry");

            // Remove any related Consul state.

            context.WriteLine(AnsibleVerbosity.Trace, $"Removing the [neon-registry] Consul [hostname] and [secret].");
            hive.Registry.SetLocalHostname(null);
            hive.Registry.SetLocalSecret(null);

            // Logout the hive from the registry.
            //
            // NOTE(review): this is redundant when credentials existed above (we already
            // logged out) but it is retained to preserve the original behavior.

            context.WriteLine(AnsibleVerbosity.Trace, $"Logging the hive out of the [{currentHostname}] registry.");
            hive.Registry.Logout(currentHostname);

            // Remove the hive DNS host entry.
            //
            // NOTE(review): the message references [currentHostname] but the removal targets
            // the [hostname] argument — confirm that these are expected to match.

            context.WriteLine(AnsibleVerbosity.Trace, $"Removing the [{currentHostname}] registry DNS hosts entry.");
            hive.Dns.Remove(hostname);
            break;

        case "present":

            if (!hive.Definition.HiveFS.Enabled)
            {
                context.WriteErrorLine("The local registry service requires hive CephFS.");
                return;
            }

            // Parse the [hostname], [certificate], [username] and [password] arguments.

            context.WriteLine(AnsibleVerbosity.Trace, $"Parsing [hostname]");

            if (!context.Arguments.TryGetValue<string>("hostname", out hostname))
            {
                throw new ArgumentException($"[hostname] module argument is required.");
            }

            context.WriteLine(AnsibleVerbosity.Trace, $"Parsing [certificate]");

            if (!context.Arguments.TryGetValue<string>("certificate", out var certificatePem))
            {
                throw new ArgumentException($"[certificate] module argument is required.");
            }

            if (!TlsCertificate.TryParse(certificatePem, out var certificate))
            {
                throw new ArgumentException($"[certificate] is not a valid certificate.");
            }

            context.WriteLine(AnsibleVerbosity.Trace, $"Parsing [username]");

            if (!context.Arguments.TryGetValue<string>("username", out var username))
            {
                throw new ArgumentException($"[username] module argument is required.");
            }

            context.WriteLine(AnsibleVerbosity.Trace, $"Parsing [password]");

            if (!context.Arguments.TryGetValue<string>("password", out var password))
            {
                throw new ArgumentException($"[password] module argument is required.");
            }

            context.WriteLine(AnsibleVerbosity.Trace, $"Parsing [secret]");

            if (!context.Arguments.TryGetValue<string>("secret", out var secret) || string.IsNullOrEmpty(secret))
            {
                throw new ArgumentException($"[secret] module argument is required.");
            }

            context.WriteLine(AnsibleVerbosity.Trace, $"Parsing [image]");

            if (!context.Arguments.TryGetValue<string>("image", out var image))
            {
                image = HiveConst.NeonProdRegistry + "/neon-registry:latest";
            }

            // Detect service changes.

            var hostnameChanged    = hostname != currentCredentials?.Registry;
            var usernameChanged    = username != currentCredentials?.Username;
            var passwordChanged    = password != currentCredentials?.Password;
            var secretChanged      = secret != currentSecret;
            var imageChanged       = image != currentImage;
            var certificateChanged = certificate?.CombinedPemNormalized != currentCertificate?.CombinedPemNormalized;
            var updateRequired     = hostnameChanged
                                     || usernameChanged
                                     || passwordChanged
                                     || secretChanged
                                     || imageChanged
                                     || certificateChanged;

            if (hostnameChanged)
            {
                context.WriteLine(AnsibleVerbosity.Info, $"[hostname] changed from [{currentCredentials?.Registry}] --> [{hostname}]");
            }

            if (usernameChanged)
            {
                context.WriteLine(AnsibleVerbosity.Info, $"[username] changed from [{currentCredentials?.Username}] --> [{username}]");
            }

            // FIX: the original code tested [usernameChanged] again here, so password
            // changes were only reported when the username also changed.
            //
            // NOTE(review): the old password/secret values are written to the log
            // in plain text below — confirm whether these should be redacted too.

            if (passwordChanged)
            {
                context.WriteLine(AnsibleVerbosity.Info, $"[password] changed from [{currentCredentials?.Password}] --> [**REDACTED**]");
            }

            if (secretChanged)
            {
                context.WriteLine(AnsibleVerbosity.Info, $"[secret] changed from [{currentSecret}] --> [**REDACTED**]");
            }

            if (imageChanged)
            {
                context.WriteLine(AnsibleVerbosity.Info, $"[image] changed from [{currentImage}] --> [{image}]");
            }

            if (certificateChanged)
            {
                var currentCertRedacted = currentCertificate != null ? "**REDACTED**" : "**NONE**";

                context.WriteLine(AnsibleVerbosity.Info, $"[certificate] changed from [{currentCertRedacted}] --> [**REDACTED**]");
            }

            // Handle CHECK-MODE.

            if (context.CheckMode)
            {
                if (currentService == null)
                {
                    context.WriteLine(AnsibleVerbosity.Important, $"Local registry will be deployed when CHECK-MODE is disabled.");
                    return;
                }

                if (updateRequired)
                {
                    context.WriteLine(AnsibleVerbosity.Important, $"One or more of the arguments have changed so the registry will be updated when CHECK-MODE is disabled.");
                    return;
                }

                return;
            }

            // Create the hive DNS host entry we'll use to redirect traffic targeting the registry
            // hostname to the hive managers.  We need to do this because registry IP addresses
            // are typically public, typically targeting the external firewall or load balancer
            // interface.
            //
            // The problem is that hive nodes will generally be unable to connect to the
            // local managers through the firewall/load balancer because most network routers
            // block network traffic that originates from inside the hive, then leaves
            // to hit the external router interface with the expectation of being routed
            // back inside.  I believe this is an anti-spoofing security measure.

            var dnsRedirect = GetRegistryDnsEntry(hostname);

            // Perform the operation.

            if (currentService == null)
            {
                context.WriteLine(AnsibleVerbosity.Important, $"[neon-registry] service needs to be created.");
                context.Changed = true;

                // The registry service isn't running, so we'll do a full deployment.

                context.WriteLine(AnsibleVerbosity.Trace, $"Setting certificate.");
                hive.Certificate.Set("neon-registry", certificate);

                context.WriteLine(AnsibleVerbosity.Trace, $"Updating Consul settings.");
                hive.Registry.SetLocalHostname(hostname);
                hive.Registry.SetLocalSecret(secret);

                context.WriteLine(AnsibleVerbosity.Trace, $"Adding hive DNS host entry for [{hostname}].");
                hive.Dns.Set(dnsRedirect, waitUntilPropagated: true);

                context.WriteLine(AnsibleVerbosity.Trace, $"Writing traffic manager rule.");
                hive.PublicTraffic.SetRule(GetRegistryTrafficManagerRule(hostname));

                context.WriteLine(AnsibleVerbosity.Trace, $"Creating the [neon-registry] service.");

                var createResponse = manager.DockerCommand(RunOptions.None,
                    "docker service create",
                    "--name", "neon-registry",
                    "--mode", "global",
                    "--constraint", "node.role==manager",
                    "--env", $"USERNAME={username}",
                    "--env", $"PASSWORD={password}",
                    "--env", $"SECRET={secret}",
                    "--env", $"LOG_LEVEL=info",
                    "--env", $"READ_ONLY=false",
                    "--mount", "type=volume,src=neon-registry,volume-driver=neon,dst=/var/lib/neon-registry",
                    "--network", "neon-public",
                    "--restart-delay", "10s",
                    image);

                if (createResponse.ExitCode != 0)
                {
                    context.WriteErrorLine($"[neon-registry] service create failed: {createResponse.ErrorText}");
                    return;
                }

                context.WriteLine(AnsibleVerbosity.Trace, $"Service created.");
                context.WriteLine(AnsibleVerbosity.Trace, $"Wait for [neon-registry] service to stabilize (30s).");
                Thread.Sleep(TimeSpan.FromSeconds(30));

                context.WriteLine(AnsibleVerbosity.Trace, $"Logging the hive into the [{hostname}] registry.");
                hive.Registry.Login(hostname, username, password);
            }
            else if (updateRequired)
            {
                context.WriteLine(AnsibleVerbosity.Important, $"[neon-registry] service update is required.");
                context.Changed = true;

                // Update the service and related settings as required.

                if (certificateChanged)
                {
                    context.WriteLine(AnsibleVerbosity.Trace, $"Updating certificate.");
                    hive.Certificate.Set("neon-registry", certificate);
                }

                if (hostnameChanged)
                {
                    context.WriteLine(AnsibleVerbosity.Trace, $"Updating traffic manager rule.");
                    hive.PublicTraffic.SetRule(GetRegistryTrafficManagerRule(hostname));

                    context.WriteLine(AnsibleVerbosity.Trace, $"Updating hive DNS host entry for [{hostname}] (60 seconds).");
                    hive.Dns.Set(dnsRedirect, waitUntilPropagated: true);

                    context.WriteLine(AnsibleVerbosity.Trace, $"Updating local hive hostname [{hostname}].");
                    hive.Registry.SetLocalHostname(hostname);

                    if (!string.IsNullOrEmpty(currentHostname))
                    {
                        context.WriteLine(AnsibleVerbosity.Trace, $"Logging the hive out of the [{currentHostname}] registry.");
                        hive.Registry.Logout(currentHostname);
                    }
                }

                if (secretChanged)
                {
                    context.WriteLine(AnsibleVerbosity.Trace, $"Updating local hive secret.");
                    hive.Registry.SetLocalSecret(secret);
                }

                context.WriteLine(AnsibleVerbosity.Trace, $"Updating service.");

                var updateResponse = manager.DockerCommand(RunOptions.None,
                    "docker service update",
                    "--env-add", $"USERNAME={username}",
                    "--env-add", $"PASSWORD={password}",
                    "--env-add", $"SECRET={secret}",
                    "--env-add", $"LOG_LEVEL=info",
                    "--env-add", $"READ_ONLY=false",
                    "--image", image,
                    "neon-registry");

                if (updateResponse.ExitCode != 0)
                {
                    context.WriteErrorLine($"[neon-registry] service update failed: {updateResponse.ErrorText}");
                    return;
                }

                context.WriteLine(AnsibleVerbosity.Trace, $"Service updated.");

                context.WriteLine(AnsibleVerbosity.Trace, $"Logging the hive into the [{hostname}] registry.");
                hive.Registry.Login(hostname, username, password);
            }
            else
            {
                context.WriteLine(AnsibleVerbosity.Important, $"[neon-registry] service update is not required but we're logging all nodes into [{hostname}] to ensure hive consistency.");

                hive.Registry.Login(hostname, username, password);
                context.Changed = false;
            }
            break;

        case "prune":

            if (currentService == null)
            {
                context.WriteLine(AnsibleVerbosity.Important, "Registry service is not running.");
                return;
            }

            if (context.CheckMode)
            {
                context.WriteLine(AnsibleVerbosity.Important, "Registry will be pruned when CHECK-MODE is disabled.");
                return;
            }

            context.Changed = true; // Always set this to TRUE for prune.

            // We're going to upload a script to one of the managers that handles
            // putting the [neon-registry] service into READ-ONLY mode, running
            // the garbage collection container and then restoring [neon-registry]
            // to READ/WRITE mode.
            //
            // The nice thing about this is that the operation will continue to
            // completion on the manager node even if we lose the SSH connection.

            var updateScript =
$@"#!/bin/bash
# Update [neon-registry] to READ-ONLY mode:

docker service update --env-rm READ_ONLY --env-add READ_ONLY=true neon-registry

# Prune the registry:

docker run \
    --name neon-registry-prune \
    --restart-condition=none \
    --mount type=volume,src=neon-registry,volume-driver=neon,dst=/var/lib/neon-registry \
    {HiveConst.NeonProdRegistry}/neon-registry garbage-collect

# Restore [neon-registry] to READ/WRITE mode:

docker service update --env-rm READ_ONLY --env-add READ_ONLY=false neon-registry
";
            var bundle = new CommandBundle("./collect.sh");

            bundle.AddFile("collect.sh", updateScript, isExecutable: true);

            context.WriteLine(AnsibleVerbosity.Info, "Registry prune started.");

            var pruneResponse = manager.SudoCommand(bundle, RunOptions.None);

            if (pruneResponse.ExitCode != 0)
            {
                context.WriteErrorLine($"The prune operation failed. The registry may be running in READ-ONLY mode: {pruneResponse.ErrorText}");
                return;
            }

            context.WriteLine(AnsibleVerbosity.Info, "Registry prune completed.");
            break;

        default:

            throw new ArgumentException($"[state={state}] is not one of the valid choices: [present], [absent], or [prune].");
    }
}
/// <summary>
/// Configures Kibana index patterns and default settings for the <b>logstash-*</b>
/// indexes.
/// </summary>
/// <remarks>
/// NOTE(review): the original doc comment claimed this method "handles purging of
/// old logstash and metricbeat Elasticsearch indexes" and documented a returned
/// <see cref="Task"/>, but the method is <c>void</c> and only posts Kibana saved
/// objects and settings documents.
/// </remarks>
public void KibanaSetup()
{
    Log.LogInfo("Setting up Kibana index patterns.");

    using (var jsonClient = new JsonClient())
    {
        // Target the in-cluster Kibana endpoint when running inside Kubernetes,
        // otherwise fall back to the local URI.
        //
        // NOTE(review): the source here appears to have been redacted/truncated
        // (the text between the URI credentials and ["@timestamp"] is missing,
        // including the end of this statement and the start of the first retry
        // operation), so the statements below are preserved exactly as found.

        jsonClient.BaseAddress = KubernetesClientConfiguration.IsInCluster() ? this.ServiceMap[NeonServices.Kibana].Endpoints.Default.Uri : new Uri($"http://*****:*****@timestamp"; indexPattern.attributes = attributes; await jsonClient.PostAsync($"api/saved_objects/index-pattern/logstash-*?overwrite=true", indexPattern); }).Wait();

        // Now we need to save a Kibana config document so that [logstash-*] will be
        // the default index and the timestamp will be displayed as UTC and have a
        // more useful terse format.

        retry.InvokeAsync(
            async() =>
            {
                dynamic setting = new ExpandoObject();

                // Make [logstash-*] the default index pattern.

                setting.value = "logstash-*";
                await jsonClient.PostAsync($"api/kibana/settings/defaultIndex", setting);

                // Use a terse timestamp format.

                setting.value = "HH:mm:ss.SSS MM-DD-YYYY";
                await jsonClient.PostAsync($"api/kibana/settings/dateFormat", setting);

                // Display timestamps as UTC.

                setting.value = "UTC";
                await jsonClient.PostAsync($"api/kibana/settings/dateFormat:tz", setting);
            }).Wait();
    }

    Log.LogInfo("Kibana index patterns configured.");
}
/// <summary>
/// Starts the container using the instance fields.
/// </summary>
private void StartContainer()
{
    // Handle the special case where an earlier run of this container was
    // not stopped because the developer was debugging and interrupted the
    // the unit tests before the fixture was disposed or a container with
    // the same name is already running for some other reason.
    //
    // We're going to look for an existing container with the same name
    // and remove it if its ID doesn't match the current container.

    var args   = new string[] { "ps", "-a", "--filter", $"name={name}", "--format", "{{.ID}}" };
    var result = NeonHelper.ExecuteCapture($"docker", args);

    if (result.ExitCode == 0)
    {
        var existingId = result.OutputText.Trim();

        if (!string.IsNullOrEmpty(existingId))
        {
            NeonHelper.Execute("docker", new object[] { "rm", "--force", existingId });
        }
    }

    // Pull and then start the container.  Note that we're going to
    // retry the pull a few times to handle transient issues.

    var argsString = NeonHelper.NormalizeExecArgs("pull", image);
    var pullRetry  = new LinearRetryPolicy(TransientDetector.Always, maxAttempts: 5, retryInterval: TimeSpan.FromSeconds(1));

    pullRetry.InvokeAsync(
        async () =>
        {
            result = NeonHelper.ExecuteCapture($"docker", argsString);

            if (result.ExitCode != 0)
            {
                throw new Exception($"Cannot pull container [{image}] - [exitcode={result.ExitCode}]: {result.ErrorText}");
            }

            await Task.CompletedTask;

        }).Wait();

    // Build the extra [docker run] options from the fixture settings.

    var extraArgs = new List<string>();

    if (!string.IsNullOrEmpty(name))
    {
        extraArgs.Add("--name");
        extraArgs.Add(name);
    }

    if (env != null)
    {
        foreach (var variable in env)
        {
            extraArgs.Add("--env");
            extraArgs.Add(variable);
        }
    }

    argsString = NeonHelper.NormalizeExecArgs("run", dockerArgs, extraArgs.ToArray(), image, containerArgs);
    result     = NeonHelper.ExecuteCapture($"docker", argsString);

    if (result.ExitCode != 0)
    {
        throw new Exception($"Cannot launch container [{image}] - [exitcode={result.ExitCode}]: {result.ErrorText}");
    }
    else
    {
        ContainerName = name;

        // FIX: guard against docker output shorter than 12 characters so we
        // don't throw [ArgumentOutOfRangeException] on unexpected output.

        var id = result.OutputText.Trim();

        ContainerId = id.Substring(0, Math.Min(12, id.Length));
    }
}
/// <summary>
/// Removes the local Docker registry from the hive.
/// </summary>
/// <param name="progress">Optional action that will be called with a progress message.</param>
/// <exception cref="HiveException">Thrown if no registry is deployed or there was an error removing it.</exception>
public void RemoveLocalRegistry(Action<string> progress = null)
{
    if (!HasLocalRegistry)
    {
        throw new HiveException("The [neon-registry] service is not deployed.");
    }

    var syncLock = new object();
    var manager  = hive.GetReachableManager();
    var hostname = hive.Registry.GetLocalHostname();

    // Logout of the registry.

    progress?.Invoke($"Logging the hive out of the [{hostname}] registry.");
    hive.Registry.Logout(hostname);

    // Delete the [neon-registry] service and volume.  Note that
    // the volume should exist on all of the manager nodes.

    progress?.Invoke($"Removing the [neon-registry] service.");
    manager.DockerCommand(RunOptions.None, "docker", "service", "rm", "neon-registry");

    progress?.Invoke($"Removing the [neon-registry] volumes.");

    var volumeRemoveActions = new List<Action>();
    var volumeRetryPolicy   = new LinearRetryPolicy(typeof(TransientException), maxAttempts: 10, retryInterval: TimeSpan.FromSeconds(2));

    foreach (var node in hive.Managers)
    {
        volumeRemoveActions.Add(
            () =>
            {
                // $hack(jeff.lill):
                //
                // Docker service removal appears to be synchronous but the removal of the
                // actual service task containers is not.  We're going to detect this and
                // throw a [TransientException] and then retry.

                using (var clonedNode = node.Clone())
                {
                    lock (syncLock)
                    {
                        progress?.Invoke($"Removing [neon-registry] volume on [{clonedNode.Name}].");
                    }

                    volumeRetryPolicy.InvokeAsync(
                        async () =>
                        {
                            var response = clonedNode.DockerCommand(RunOptions.None, "docker", "volume", "rm", "neon-registry");

                            if (response.ExitCode != 0)
                            {
                                if (response.AllText.Contains("volume is in use"))
                                {
                                    // FIX: the original message was missing the closing
                                    // bracket after the node name.

                                    throw new TransientException($"Error removing [neon-registry] volume from [{clonedNode.Name}]: {response.ErrorText}");
                                }
                            }
                            else
                            {
                                lock (syncLock)
                                {
                                    progress?.Invoke($"Removed [neon-registry] volume on [{clonedNode.Name}].");
                                }
                            }

                            await Task.Delay(0);

                        }).Wait();
                }
            });
    }

    NeonHelper.WaitForParallel(volumeRemoveActions);

    // Remove the traffic manager rule and certificate.

    progress?.Invoke($"Removing the [neon-registry] traffic manager rule.");
    hive.PublicTraffic.RemoveRule("neon-registry");
    progress?.Invoke($"Removing the [neon-registry] traffic manager certificate.");
    hive.Certificate.Remove("neon-registry");

    // Remove any related Consul state.

    progress?.Invoke($"Removing the [neon-registry] Consul [hostname] and [secret].");
    hive.Registry.SetLocalHostname(null);
    hive.Registry.SetLocalSecret(null);

    // Logout the hive from the registry.
    //
    // NOTE(review): we already logged out at the top of this method; this second
    // logout is redundant but retained to preserve the original behavior.

    progress?.Invoke($"Logging the hive out of the [{hostname}] registry.");
    hive.Registry.Logout(hostname);

    // Remove the hive DNS host entry.

    progress?.Invoke($"Removing the [{hostname}] registry DNS hosts entry.");
    hive.Dns.Remove(hostname);
}
/// <summary>
/// Removes a specific fixture section from the <b>hosts</b> file or all
/// fixture sections if <paramref name="fixtureId"/> is <c>null</c>.
/// </summary>
/// <param name="fixtureId">
/// Identifies the fixture section to be removed or <c>null</c> to
/// remove all fixture sections.
/// </param>
private static void RemoveSection(string fixtureId = null)
{
    var sb           = new StringBuilder();
    var changed      = false;
    var sectionGuids = new HashSet<string>();

    // Update the [hosts] file.

    retryFile.InvokeAsync(
        async () =>
        {
            if (File.Exists(HostsPath))
            {
                using (var reader = new StreamReader(new FileStream(HostsPath, FileMode.Open, FileAccess.ReadWrite)))
                {
                    // When [fixtureId] is NULL the GUID suffix is empty, so the
                    // markers below match every fixture section via [StartsWith()].

                    var guid        = fixtureId ?? string.Empty;
                    var startMarker = $"# START-NEON-HOSTS-FIXTURE-{guid}";
                    var endMarker   = $"# END-NEON-HOSTS-FIXTURE-{guid}";
                    var inSection   = false;

                    foreach (var line in reader.Lines())
                    {
                        if (inSection)
                        {
                            // Lines inside a fixture section (including the end
                            // marker itself) are dropped from the rewritten file.

                            if (line.StartsWith(endMarker))
                            {
                                inSection = false;
                                changed   = true;
                            }
                        }
                        else if (line.StartsWith(startMarker))
                        {
                            // Extract the section GUID from the marker because we'll need
                            // these below when we verify that the resolver has picked up
                            // the changes.
                            //
                            // NOTE(review): this assumes the GUID contains no dashes
                            // (e.g. "N" format); [LastIndexOf('-')] would otherwise
                            // split inside the GUID — confirm the marker format.

                            var posGuid     = line.LastIndexOf('-') + 1;
                            var sectionGuid = line.Substring(posGuid);

                            sectionGuids.Add(sectionGuid);  // [HashSet] ignores duplicates.

                            inSection = true;
                            changed   = true;
                        }
                        else
                        {
                            // Retain all lines outside of fixture sections.

                            sb.AppendLine(line);
                        }
                    }
                }
            }

            if (changed)
            {
                File.WriteAllText(HostsPath, sb.ToString());
            }

            await Task.CompletedTask;

        }).Wait();

    if (changed)
    {
        // We need to verify that the local DNS resolver has picked up the change
        // by verifying that none of the removed section hostnames resolve.

        retryReady.InvokeAsync(
            async () =>
            {
                foreach (var sectionGuid in sectionGuids)
                {
                    var hostname  = GetSectionHostname(sectionGuid);
                    var addresses = await GetHostAddressesAsync(hostname);

                    if (addresses.Length > 0)
                    {
                        throw new NotReadyException($"Waiting for [{hostname}] to be removed by the local DNS resolver.");
                    }
                }
            }).Wait();
    }
}
/// <summary>
/// Rebuilds the host node's <b>/etc/containers/registries.conf.d/00-neon-cluster.conf</b> file,
/// using the container registries passed, signals CRI-O to reload any changes and also manages
/// container registry logins.
/// </summary>
private async Task UpdateContainerRegistriesAsync()
{
    var registries = (await k8s.ListClusterCustomObjectAsync<V1NeonContainerRegistry>()).Items;

    // NOTE: Here's the documentation for the config file we're generating:
    //
    //      https://github.com/containers/image/blob/main/docs/containers-registries.conf.5.md
    //

    var sbRegistryConfig   = new StringBuilder();
    var sbSearchRegistries = new StringBuilder();

    // Configure any unqualified search registries.

    foreach (var registry in registries
        .Where(registry => registry.Spec.SearchOrder >= 0)
        .OrderBy(registry => registry.Spec.SearchOrder))
    {
        sbSearchRegistries.AppendWithSeparator($"\"{registry.Spec.Prefix}\"", ", ");
    }

    sbRegistryConfig.Append(
$@"unqualified-search-registries = [{sbSearchRegistries}]
");

    // Configure any container registries including the local cluster.

    foreach (var registry in registries)
    {
        sbRegistryConfig.Append(
$@"
[[registry]]
prefix = ""{registry.Spec.Prefix}""
insecure = {NeonHelper.ToBoolString(registry.Spec.Insecure)}
blocked = {NeonHelper.ToBoolString(registry.Spec.Blocked)}
");

        if (!string.IsNullOrEmpty(registry.Spec.Location))
        {
            sbRegistryConfig.AppendLine($"location = \"{registry.Spec.Location}\"");
        }
    }

    if (NeonHelper.IsLinux)
    {
        // Read and parse the current configuration file to create list of the existing
        // configured upstream registries.

        var currentConfigText = File.ReadAllText(configMountPath);
        var currentConfig     = Toml.Parse(currentConfigText);
        var existingLocations = new List<string>();

        // NOTE(review): [existingLocations] is computed but never used below.  It is
        // retained (rather than deleted) in case the TOML parse is intended to
        // validate the current config file — confirm and remove if truly dead.

        foreach (var registryTable in currentConfig.Tables.Where(table => table.Name.Key.GetName() == "registry"))
        {
            var location = registryTable.Items.SingleOrDefault(key => key.Key.GetName() == "location")?.Value.GetValue();

            if (!string.IsNullOrWhiteSpace(location))
            {
                existingLocations.Add(location);
            }
        }

        // Convert the generated config to Linux line endings and then compare the new
        // config against what's already configured on the host node.  We'll rewrite the
        // host file and then signal CRI-O to reload its config when the files differ.

        var newConfigText = NeonHelper.ToLinuxLineEndings(sbRegistryConfig.ToString());

        if (currentConfigText != newConfigText)
        {
            configUpdateCounter.Inc();

            File.WriteAllText(configMountPath, newConfigText);
            (await Node.ExecuteCaptureAsync("pkill", new object[] { "-HUP", "crio" })).EnsureSuccess();

            // Wait a few seconds to give CRI-O a chance to reload its config.  This will
            // help mitigate problems when managing logins below due to potential inconsistencies
            // between CRI-O's currently loaded config and the new config we just saved.

            await Task.Delay(TimeSpan.FromSeconds(15));
        }
    }

    //-----------------------------------------------------------------
    // We need to manage registry logins by logging into new registries,
    // logging out of deleted registries, relogging in with new credentials,
    // and periodically logging in with unchanged credentials to ensure that
    // we're actually logged in.  Here's how this works:
    //
    //      https://github.com/nforgeio/neonKUBE/issues/1591

    var retry = new LinearRetryPolicy(e => true, maxAttempts: 5, retryInterval: TimeSpan.FromSeconds(5));

    // Construct LoginFile instances for all specified upstream registries
    // that require credentials and add these to a dictionary keyed by SHA-256.

    var shaToRequiredLogins = new Dictionary<string, LoginFile>();

    foreach (var registry in registries.Where(registry => !string.IsNullOrEmpty(registry.Spec.Username)))
    {
        var loginFile = LoginFile.Create(hostContainerRegistriesFolder, registry.Spec.Location, registry.Spec.Username, registry.Spec.Password);

        // Use the indexer (rather than [Add()]) so duplicate registry definitions
        // with identical credentials can't throw on a duplicate key.

        shaToRequiredLogins[loginFile.Sha256] = loginFile;
    }

    // Read all existing login files on the node and add them to a dictionary
    // mapping their SHA-256s to the file.

    var shaToExistingLogins = new Dictionary<string, LoginFile>();

    foreach (var file in Directory.GetFiles(hostContainerRegistriesFolder, "*.login", SearchOption.TopDirectoryOnly))
    {
        var loginFile = LoginFile.Read(file);

        if (loginFile != null)
        {
            shaToExistingLogins[loginFile.Sha256] = loginFile;
        }
    }

    // Look for any existing login files that are not present in the collection of
    // new logins.  These correspond to registries that have been deleted or whose
    // credentials have changed.  We're going to go ahead and log out of the related
    // registries and then delete these login files (we'll re-login with new
    // credentials below for the registries that weren't targeted for removal).

    foreach (var loginFile in shaToExistingLogins.Values
        .Where(login => !shaToRequiredLogins.ContainsKey(login.Sha256)))
    {
        try
        {
            await retry.InvokeAsync(
                async () =>
                {
                    // Note that we're not ensuring success here because we may not be
                    // logged-in which is OK: we don't want to see that error.

                    log.LogInfo($"{podmanPath} logout {loginFile.Location}");

                    if (NeonHelper.IsLinux)
                    {
                        await Node.ExecuteCaptureAsync(podmanPath, new object[] { "logout", loginFile.Location });
                    }

                    loginFile.Delete();
                });
        }
        catch (Exception e)
        {
            loginErrorCounter.Inc();
            log.LogError(e);
        }
    }

    // Look for any required logins that don't have an existing login file,
    // and then login the registry and then create the login file on success.

    foreach (var loginFile in shaToRequiredLogins.Values
        .Where(login => !shaToExistingLogins.ContainsKey(login.Sha256)))
    {
        try
        {
            await retry.InvokeAsync(
                async () =>
                {
                    log.LogInfo($"{podmanPath} login {loginFile.Location} --username {loginFile.Username} --password REDACTED");

                    if (NeonHelper.IsLinux)
                    {
                        (await Node.ExecuteCaptureAsync(podmanPath, new object[] { "login", loginFile.Location, "--username", loginFile.Username, "--password", loginFile.Password })).EnsureSuccess();
                    }
                });

            loginFile.Write();
        }
        catch (Exception e)
        {
            loginErrorCounter.Inc();
            log.LogError(e);
        }
    }

    //-----------------------------------------------------------------
    // Finally, we need to force a re-login for any existing logins that haven't
    // been explicitly logged into for a while.  Note that we're always going to
    // log into the local Harbor registry.

    foreach (var file in Directory.GetFiles(hostContainerRegistriesFolder, "*.login", SearchOption.TopDirectoryOnly))
    {
        // Read the next existing login file.

        var loginFile = LoginFile.Read(file);

        if (loginFile == null)
        {
            continue;
        }

        // Update the login with the password from the corresponding container registry resource.

        var registry = registries.FirstOrDefault(registry => registry.Spec.Location == loginFile.Location);

        if (registry == null)
        {
            log.LogWarn($"Cannot locate [{nameof(V1NeonContainerRegistry)}] resource for [location={loginFile.Location}].");
            continue;
        }

        loginFile.Password = registry.Spec.Password;

        // Perform the login.

        var scheduledLoginUtc = loginFile.UpdatedUtc + reloginInterval + NeonHelper.PseudoRandomTimespan(reloginMaxRandomInterval);

        // FIX: the original comparison was inverted [DateTime.UtcNow <= scheduledLoginUtc],
        // which re-logged in on every pass BEFORE the scheduled time (and, since [Write()]
        // refreshes [UpdatedUtc], kept doing so forever) and never re-logged in after the
        // interval elapsed.  We re-login once the scheduled time has passed, or always for
        // the local cluster registry.

        if (DateTime.UtcNow >= scheduledLoginUtc || loginFile.Location == KubeConst.LocalClusterRegistry)
        {
            try
            {
                await retry.InvokeAsync(
                    async () =>
                    {
                        log.LogInfo($"{podmanPath} login {loginFile.Location} --username {loginFile.Username} --password REDACTED");

                        if (NeonHelper.IsLinux)
                        {
                            (await Node.ExecuteCaptureAsync(podmanPath, new object[] { "login", loginFile.Location, "--username", loginFile.Username, "--password", loginFile.Password })).EnsureSuccess();
                        }
                    });

                loginFile.Write();
            }
            catch (Exception e)
            {
                loginErrorCounter.Inc();
                log.LogError(e);
            }
        }
    }
}