Beispiel #1
0
        /// <inheritdoc/>
        public void Run(ModuleContext context)
        {
            var    hive = HiveHelper.Hive;
            string hostname;

            if (!context.ValidateArguments(context.Arguments, validModuleArgs))
            {
                context.Failed = true;
                return;
            }

            // Obtain common arguments.

            context.WriteLine(AnsibleVerbosity.Trace, $"Parsing [state]");

            if (!context.Arguments.TryGetValue <string>("state", out var state))
            {
                state = "present";
            }

            state = state.ToLowerInvariant();

            if (context.HasErrors)
            {
                return;
            }

            var manager      = hive.GetReachableManager();
            var sbErrorNodes = new StringBuilder();

            // Determine whether the registry service is already deployed and
            // also retrieve the registry credentials from Vault if present.
            // Note that the current registry hostname will be persisted to
            // Consul at [neon/service/neon-registry/hostname] when a registry
            // is deployed.

            context.WriteLine(AnsibleVerbosity.Trace, $"Inspecting the [neon-registry] service.");

            var currentService = hive.Docker.InspectService("neon-registry");

            context.WriteLine(AnsibleVerbosity.Trace, $"Getting current registry hostname from Consul.");

            var currentHostname = hive.Registry.GetLocalHostname();
            var currentSecret   = hive.Registry.GetLocalSecret();
            var currentImage    = currentService?.Spec.TaskTemplate.ContainerSpec.ImageWithoutSHA;

            var currentCredentials =        // Set blank properties for the change detection below.
                                     new RegistryCredentials()
            {
                Registry = string.Empty,
                Username = string.Empty,
                Password = string.Empty
            };

            if (!string.IsNullOrEmpty(currentHostname))
            {
                context.WriteLine(AnsibleVerbosity.Trace, $"Reading existing registry credentials for [{currentHostname}].");

                currentCredentials = hive.Registry.GetCredentials(currentHostname);

                if (currentCredentials != null)
                {
                    context.WriteLine(AnsibleVerbosity.Info, $"Registry credentials for [{currentHostname}] exist.");
                }
                else
                {
                    context.WriteLine(AnsibleVerbosity.Info, $"Registry credentials for [{currentHostname}] do not exist.");
                }
            }

            // Obtain the current registry TLS certificate (if any).

            var currentCertificate = hive.Certificate.Get("neon-registry");

            // Perform the operation.

            switch (state)
            {
            case "absent":

                context.WriteLine(AnsibleVerbosity.Trace, $"Parsing [hostname]");

                if (!context.Arguments.TryGetValue <string>("hostname", out hostname))
                {
                    throw new ArgumentException($"[hostname] module argument is required.");
                }

                if (currentService == null)
                {
                    context.WriteLine(AnsibleVerbosity.Important, "[neon-registry] is not currently deployed.");
                }

                if (context.CheckMode)
                {
                    context.WriteLine(AnsibleVerbosity.Important, $"Local registry will be removed when CHECK-MODE is disabled.");
                    return;
                }

                if (currentService == null)
                {
                    return;     // Nothing to do
                }

                context.Changed = true;

                // Logout of the registry.

                if (currentCredentials != null)
                {
                    context.WriteLine(AnsibleVerbosity.Trace, $"Logging the hive out of the [{currentHostname}] registry.");
                    hive.Registry.Logout(currentHostname);
                }

                // Delete the [neon-registry] service and volume.  Note that
                // the volume should exist on all of the manager nodes.

                context.WriteLine(AnsibleVerbosity.Trace, $"Removing the [neon-registry] service.");
                manager.DockerCommand(RunOptions.None, "docker", "service", "rm", "neon-registry");

                context.WriteLine(AnsibleVerbosity.Trace, $"Removing the [neon-registry] volumes.");

                var volumeRemoveActions = new List <Action>();
                var volumeRetryPolicy   = new LinearRetryPolicy(typeof(TransientException), maxAttempts: 10, retryInterval: TimeSpan.FromSeconds(2));

                foreach (var node in hive.Managers)
                {
                    volumeRemoveActions.Add(
                        () =>
                    {
                        // $hack(jeff.lill):
                        //
                        // Docker service removal appears to be synchronous but the removal of the
                        // actual service task containers is not.  We're going to detect this and
                        // throw a [TransientException] and then retry.

                        using (var clonedNode = node.Clone())
                        {
                            lock (context)
                            {
                                context.WriteLine(AnsibleVerbosity.Trace, $"Removing [neon-registry] volume on [{clonedNode.Name}].");
                            }

                            volumeRetryPolicy.InvokeAsync(
                                async() =>
                            {
                                var response = clonedNode.DockerCommand(RunOptions.None, "docker", "volume", "rm", "neon-registry");

                                if (response.ExitCode != 0)
                                {
                                    var message = $"Error removing [neon-registry] volume from [{clonedNode.Name}: {response.ErrorText}";

                                    lock (syncLock)
                                    {
                                        context.WriteLine(AnsibleVerbosity.Info, message);
                                    }

                                    if (response.AllText.Contains("volume is in use"))
                                    {
                                        throw new TransientException(message);
                                    }
                                }
                                else
                                {
                                    lock (context)
                                    {
                                        context.WriteLine(AnsibleVerbosity.Trace, $"Removed [neon-registry] volume on [{clonedNode.Name}].");
                                    }
                                }

                                await Task.Delay(0);
                            }).Wait();
                        }
                    });
                }

                NeonHelper.WaitForParallel(volumeRemoveActions);

                // Remove the traffic manager rule and certificate.

                context.WriteLine(AnsibleVerbosity.Trace, $"Removing the [neon-registry] traffic manager rule.");
                hive.PublicTraffic.RemoveRule("neon-registry");
                context.WriteLine(AnsibleVerbosity.Trace, $"Removing the [neon-registry] traffic manager certificate.");
                hive.Certificate.Remove("neon-registry");

                // Remove any related Consul state.

                context.WriteLine(AnsibleVerbosity.Trace, $"Removing the [neon-registry] Consul [hostname] and [secret].");
                hive.Registry.SetLocalHostname(null);
                hive.Registry.SetLocalSecret(null);

                // Logout the hive from the registry.

                context.WriteLine(AnsibleVerbosity.Trace, $"Logging the hive out of the [{currentHostname}] registry.");
                hive.Registry.Logout(currentHostname);

                // Remove the hive DNS host entry.

                context.WriteLine(AnsibleVerbosity.Trace, $"Removing the [{currentHostname}] registry DNS hosts entry.");
                hive.Dns.Remove(hostname);
                break;

            case "present":

                if (!hive.Definition.HiveFS.Enabled)
                {
                    context.WriteErrorLine("The local registry service requires hive CephFS.");
                    return;
                }

                // Parse the [hostname], [certificate], [username] and [password] arguments.

                context.WriteLine(AnsibleVerbosity.Trace, $"Parsing [hostname]");

                if (!context.Arguments.TryGetValue <string>("hostname", out hostname))
                {
                    throw new ArgumentException($"[hostname] module argument is required.");
                }

                context.WriteLine(AnsibleVerbosity.Trace, $"Parsing [certificate]");

                if (!context.Arguments.TryGetValue <string>("certificate", out var certificatePem))
                {
                    throw new ArgumentException($"[certificate] module argument is required.");
                }

                if (!TlsCertificate.TryParse(certificatePem, out var certificate))
                {
                    throw new ArgumentException($"[certificate] is not a valid certificate.");
                }

                context.WriteLine(AnsibleVerbosity.Trace, $"Parsing [username]");

                if (!context.Arguments.TryGetValue <string>("username", out var username))
                {
                    throw new ArgumentException($"[username] module argument is required.");
                }

                context.WriteLine(AnsibleVerbosity.Trace, $"Parsing [password]");

                if (!context.Arguments.TryGetValue <string>("password", out var password))
                {
                    throw new ArgumentException($"[password] module argument is required.");
                }

                context.WriteLine(AnsibleVerbosity.Trace, $"Parsing [secret]");

                if (!context.Arguments.TryGetValue <string>("secret", out var secret) || string.IsNullOrEmpty(secret))
                {
                    throw new ArgumentException($"[secret] module argument is required.");
                }

                context.WriteLine(AnsibleVerbosity.Trace, $"Parsing [image]");

                if (!context.Arguments.TryGetValue <string>("image", out var image))
                {
                    image = HiveConst.NeonProdRegistry + "/neon-registry:latest";
                }

                // Detect service changes.

                var hostnameChanged    = hostname != currentCredentials?.Registry;
                var usernameChanged    = username != currentCredentials?.Username;
                var passwordChanged    = password != currentCredentials?.Password;
                var secretChanged      = secret != currentSecret;
                var imageChanged       = image != currentImage;
                var certificateChanged = certificate?.CombinedPemNormalized != currentCertificate?.CombinedPemNormalized;
                var updateRequired     = hostnameChanged ||
                                         usernameChanged ||
                                         passwordChanged ||
                                         secretChanged ||
                                         imageChanged ||
                                         certificateChanged;

                if (hostnameChanged)
                {
                    context.WriteLine(AnsibleVerbosity.Info, $"[hostname] changed from [{currentCredentials?.Registry}] --> [{hostname}]");
                }

                if (usernameChanged)
                {
                    context.WriteLine(AnsibleVerbosity.Info, $"[username] changed from [{currentCredentials?.Username}] --> [{username}]");
                }

                if (usernameChanged)
                {
                    context.WriteLine(AnsibleVerbosity.Info, $"[password] changed from [{currentCredentials?.Password}] --> [**REDACTED**]");
                }

                if (secretChanged)
                {
                    context.WriteLine(AnsibleVerbosity.Info, $"[secret] changed from [{currentSecret}] --> [**REDACTED**]");
                }

                if (imageChanged)
                {
                    context.WriteLine(AnsibleVerbosity.Info, $"[image] changed from [{currentImage}] --> [{image}]");
                }

                if (certificateChanged)
                {
                    var currentCertRedacted = currentCertificate != null ? "**REDACTED**" : "**NONE**";

                    context.WriteLine(AnsibleVerbosity.Info, $"[certificate] changed from [{currentCertRedacted}] --> [**REDACTED**]");
                }

                // Handle CHECK-MODE.

                if (context.CheckMode)
                {
                    if (currentService == null)
                    {
                        context.WriteLine(AnsibleVerbosity.Important, $"Local registry will be deployed when CHECK-MODE is disabled.");
                        return;
                    }

                    if (updateRequired)
                    {
                        context.WriteLine(AnsibleVerbosity.Important, $"One or more of the arguments have changed so the registry will be updated when CHECK-MODE is disabled.");
                        return;
                    }

                    return;
                }

                // Create the hive DNS host entry we'll use to redirect traffic targeting the registry
                // hostname to the hive managers.  We need to do this because registry IP addresses
                // are typically public, typically targeting the external firewall or load balancer
                // interface.
                //
                // The problem is that hive nodes will generally be unable to connect to the
                // local managers through the firewall/load balancer because most network routers
                // block network traffic that originates from inside the hive, then leaves
                // to hit the external router interface with the expectation of being routed
                // back inside.  I believe this is an anti-spoofing security measure.

                var dnsRedirect = GetRegistryDnsEntry(hostname);

                // Perform the operation.

                if (currentService == null)
                {
                    context.WriteLine(AnsibleVerbosity.Important, $"[neon-registry] service needs to be created.");
                    context.Changed = true;

                    // The registry service isn't running, so we'll do a full deployment.

                    context.WriteLine(AnsibleVerbosity.Trace, $"Setting certificate.");
                    hive.Certificate.Set("neon-registry", certificate);

                    context.WriteLine(AnsibleVerbosity.Trace, $"Updating Consul settings.");
                    hive.Registry.SetLocalHostname(hostname);
                    hive.Registry.SetLocalSecret(secret);

                    context.WriteLine(AnsibleVerbosity.Trace, $"Adding hive DNS host entry for [{hostname}].");
                    hive.Dns.Set(dnsRedirect, waitUntilPropagated: true);

                    context.WriteLine(AnsibleVerbosity.Trace, $"Writing traffic manager rule.");
                    hive.PublicTraffic.SetRule(GetRegistryTrafficManagerRule(hostname));

                    context.WriteLine(AnsibleVerbosity.Trace, $"Creating the [neon-registry] service.");

                    var createResponse = manager.DockerCommand(RunOptions.None,
                                                               "docker service create",
                                                               "--name", "neon-registry",
                                                               "--mode", "global",
                                                               "--constraint", "node.role==manager",
                                                               "--env", $"USERNAME={username}",
                                                               "--env", $"PASSWORD={password}",
                                                               "--env", $"SECRET={secret}",
                                                               "--env", $"LOG_LEVEL=info",
                                                               "--env", $"READ_ONLY=false",
                                                               "--mount", "type=volume,src=neon-registry,volume-driver=neon,dst=/var/lib/neon-registry",
                                                               "--network", "neon-public",
                                                               "--restart-delay", "10s",
                                                               image);

                    if (createResponse.ExitCode != 0)
                    {
                        context.WriteErrorLine($"[neon-registry] service create failed: {createResponse.ErrorText}");
                        return;
                    }

                    context.WriteLine(AnsibleVerbosity.Trace, $"Service created.");
                    context.WriteLine(AnsibleVerbosity.Trace, $"Wait for [neon-registry] service to stabilize (30s).");
                    Thread.Sleep(TimeSpan.FromSeconds(30));
                    context.WriteLine(AnsibleVerbosity.Trace, $"Logging the hive into the [{hostname}] registry.");
                    hive.Registry.Login(hostname, username, password);
                }
                else if (updateRequired)
                {
                    context.WriteLine(AnsibleVerbosity.Important, $"[neon-registry] service update is required.");
                    context.Changed = true;

                    // Update the service and related settings as required.

                    if (certificateChanged)
                    {
                        context.WriteLine(AnsibleVerbosity.Trace, $"Updating certificate.");
                        hive.Certificate.Set("neon-registry", certificate);
                    }

                    if (hostnameChanged)
                    {
                        context.WriteLine(AnsibleVerbosity.Trace, $"Updating traffic manager rule.");
                        hive.PublicTraffic.SetRule(GetRegistryTrafficManagerRule(hostname));

                        context.WriteLine(AnsibleVerbosity.Trace, $"Updating hive DNS host entry for [{hostname}] (60 seconds).");
                        hive.Dns.Set(dnsRedirect, waitUntilPropagated: true);

                        context.WriteLine(AnsibleVerbosity.Trace, $"Updating local hive hostname [{hostname}].");
                        hive.Registry.SetLocalHostname(hostname);

                        if (!string.IsNullOrEmpty(currentHostname))
                        {
                            context.WriteLine(AnsibleVerbosity.Trace, $"Logging the hive out of the [{currentHostname}] registry.");
                            hive.Registry.Logout(currentHostname);
                        }
                    }

                    if (secretChanged)
                    {
                        context.WriteLine(AnsibleVerbosity.Trace, $"Updating local hive secret.");
                        hive.Registry.SetLocalSecret(secret);
                    }

                    context.WriteLine(AnsibleVerbosity.Trace, $"Updating service.");

                    var updateResponse = manager.DockerCommand(RunOptions.None,
                                                               "docker service update",
                                                               "--env-add", $"USERNAME={username}",
                                                               "--env-add", $"PASSWORD={password}",
                                                               "--env-add", $"SECRET={secret}",
                                                               "--env-add", $"LOG_LEVEL=info",
                                                               "--env-add", $"READ_ONLY=false",
                                                               "--image", image,
                                                               "neon-registry");

                    if (updateResponse.ExitCode != 0)
                    {
                        context.WriteErrorLine($"[neon-registry] service update failed: {updateResponse.ErrorText}");
                        return;
                    }

                    context.WriteLine(AnsibleVerbosity.Trace, $"Service updated.");

                    context.WriteLine(AnsibleVerbosity.Trace, $"Logging the hive into the [{hostname}] registry.");
                    hive.Registry.Login(hostname, username, password);
                }
                else
                {
                    context.WriteLine(AnsibleVerbosity.Important, $"[neon-registry] service update is not required but we're logging all nodes into [{hostname}] to ensure hive consistency.");
                    hive.Registry.Login(hostname, username, password);

                    context.Changed = false;
                }
                break;

            case "prune":

                if (currentService == null)
                {
                    context.WriteLine(AnsibleVerbosity.Important, "Registry service is not running.");
                    return;
                }

                if (context.CheckMode)
                {
                    context.WriteLine(AnsibleVerbosity.Important, "Registry will be pruned when CHECK-MODE is disabled.");
                    return;
                }

                context.Changed = true;     // Always set this to TRUE for prune.

                // We're going to upload a script to one of the managers that handles
                // putting the [neon-registry] service into READ-ONLY mode, running
                // the garbage collection container and then restoring [neon-registry]
                // to READ/WRITE mode.
                //
                // The nice thing about this is that the operation will continue to
                // completion on the manager node even if we lose the SSH connection.

                var updateScript =
                    $@"#!/bin/bash
# Update [neon-registry] to READ-ONLY mode:

docker service update --env-rm READ_ONLY --env-add READ_ONLY=true neon-registry

# Prune the registry:

docker run \
   --name neon-registry-prune \
   --restart-condition=none \
   --mount type=volume,src=neon-registry,volume-driver=neon,dst=/var/lib/neon-registry \
   {HiveConst.NeonProdRegistry}/neon-registry garbage-collect

# Restore [neon-registry] to READ/WRITE mode:

docker service update --env-rm READ_ONLY --env-add READ_ONLY=false neon-registry
";
                var bundle = new CommandBundle("./collect.sh");

                bundle.AddFile("collect.sh", updateScript, isExecutable: true);

                context.WriteLine(AnsibleVerbosity.Info, "Registry prune started.");

                var pruneResponse = manager.SudoCommand(bundle, RunOptions.None);

                if (pruneResponse.ExitCode != 0)
                {
                    context.WriteErrorLine($"The prune operation failed.  The registry may be running in READ-ONLY mode: {pruneResponse.ErrorText}");
                    return;
                }

                context.WriteLine(AnsibleVerbosity.Info, "Registry prune completed.");
                break;

            default:

                throw new ArgumentException($"[state={state}] is not one of the valid choices: [present], [absent], or [prune].");
            }
        }