/// <summary>
/// Uploads the configuration files for the target operating system to the server.
/// </summary>
/// <typeparam name="Metadata">The node metadata type.</typeparam>
/// <param name="node">The remote node.</param>
/// <param name="hiveDefinition">The hive definition or <c>null</c>.</param>
public static void UploadConfigFiles<Metadata>(this SshProxy<Metadata> node, HiveDefinition hiveDefinition = null)
    where Metadata : class
{
    Covenant.Requires<ArgumentNullException>(node != null);

    // Clear the contents of the configuration folder.

    node.Status = $"clear: {HiveHostFolders.Config}";
    node.SudoCommand($"rm -rf {HiveHostFolders.Config}/*.*");

    // Upload the files.

    node.Status = "upload: config files";

    foreach (var file in Program.LinuxFolder.GetFolder("conf").Files())
    {
        node.UploadFile(hiveDefinition, file, $"{HiveHostFolders.Config}/{file.Name}");
    }

    // Secure the files and make the scripts executable.

    node.SudoCommand($"chmod 644 {HiveHostFolders.Config}/*.*");
    node.SudoCommand($"chmod 744 {HiveHostFolders.Config}/*.sh");

    node.Status = "copied";
}
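// Usage sketch (assumes a provisioning step with a node proxy and a hive
// definition in scope): since this is an extension method, it's invoked
// directly on the node proxy, typically before any setup steps that depend
// on the uploaded files.
//
//      node.UploadConfigFiles(hiveDefinition);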
/// <summary>
/// Updates Docker on a specific node.
/// </summary>
/// <param name="node">The target node.</param>
/// <param name="stepDelay">The step delay.</param>
private void UpdateDocker(SshProxy<NodeDefinition> node, TimeSpan stepDelay)
{
    if (node.GetDockerVersion() >= (SemanticVersion)version)
    {
        return; // Already updated
    }

    if (node.Metadata.InSwarm)
    {
        node.Status = "swarm: drain services";
        hive.Docker.DrainNode(node.Name);
    }

    node.Status = "run: safe-apt-get update";
    node.SudoCommand("safe-apt-get update");

    node.Status = $"run: safe-apt-get install -yq {dockerPackageUri}";
    node.SudoCommand($"safe-apt-get install -yq {dockerPackageUri}");

    node.Status = "restart: docker";
    node.SudoCommand("systemctl restart docker");

    if (node.Metadata.InSwarm)
    {
        // Put the node back into ACTIVE mode (from DRAIN).

        node.Status = "swarm: activate";
        hive.Docker.ActivateNode(node.Name);
    }

    node.Status = $"stabilizing ({Program.WaitSeconds}s)";
    Thread.Sleep(TimeSpan.FromSeconds(Program.WaitSeconds));
}
/// <summary>
/// Updates Docker on a hive node.
/// </summary>
/// <param name="hive">The target hive.</param>
/// <param name="node">The target node.</param>
/// <param name="dockerPackageUri">The Docker Debian package URI.</param>
private static void UpdateDocker(HiveProxy hive, SshProxy<NodeDefinition> node, string dockerPackageUri)
{
    try
    {
        if (node.Metadata.InSwarm)
        {
            node.Status = "swarm: drain services";
            hive.Docker.DrainNode(node.Name);
        }

        node.Status = "stop: docker";
        node.SudoCommand("systemctl stop docker").EnsureSuccess();

        node.Status = "download: docker package";
        node.SudoCommand($"curl {Program.CurlOptions} {dockerPackageUri} -o /tmp/docker.deb").EnsureSuccess();

        node.Status = "update: docker";
        node.SudoCommand("gdebi /tmp/docker.deb").EnsureSuccess();
        node.SudoCommand("rm /tmp/docker.deb");

        node.Status = "restart: docker";
        node.SudoCommand("systemctl start docker").EnsureSuccess();

        if (node.Metadata.InSwarm)
        {
            node.Status = "swarm: activate";
            hive.Docker.ActivateNode(node.Name);
        }
    }
    catch (Exception e)
    {
        node.Fault($"[docker] update failed: {NeonHelper.ExceptionError(e)}");
    }
}
/// <summary>
/// Executes a Vault command on a specific node using the root Vault token.
/// </summary>
/// <param name="node">The target node.</param>
/// <param name="commandLine">The Vault command.</param>
private void ExecuteOnNode(SshProxy<NodeDefinition> node, CommandLine commandLine)
{
    var response = node.SudoCommand($"export VAULT_TOKEN={vaultCredentials.RootToken} && {remoteVaultPath} {commandLine}",
                                    RunOptions.IgnoreRemotePath | RunOptions.Redact);

    Console.WriteLine(response.AllText);
    Program.Exit(response.ExitCode);
}
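// Usage sketch: a [neon vault ...] command handler would pass the portion
// of the command line following [vault] straight through; [ExecuteOnNode()]
// prepends the remote Vault binary path and the root token itself, and
// [RunOptions.Redact] keeps the token out of the logs.
//
//      ExecuteOnNode(node, rightCommandLine);      // [rightCommandLine] is hypothetical here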
/// <summary>
/// Updates Linux on a specific node.
/// </summary>
/// <param name="node">The target node.</param>
/// <param name="stepDelay">The step delay.</param>
private void UpdateLinux(SshProxy<NodeDefinition> node, TimeSpan stepDelay)
{
    if (node.Metadata.InSwarm)
    {
        node.Status = "swarm: drain services";
        hive.Docker.DrainNode(node.Name);
    }

    node.Status = "run: safe-apt-get dist-upgrade -yq";
    node.SudoCommand("safe-apt-get dist-upgrade -yq");

    node.Reboot();

    if (node.Metadata.InSwarm)
    {
        // Put the node back into ACTIVE mode (from DRAIN).

        node.Status = "swarm: activate";
        hive.Docker.ActivateNode(node.Name);
    }

    // Give the node a chance to become active again in the swarm,
    // for containers to restart, and for service tasks to redeploy.

    node.Status = $"stabilizing ({Program.WaitSeconds}s)";
    Thread.Sleep(TimeSpan.FromSeconds(Program.WaitSeconds));
}
/// <summary>
/// <para>
/// Edits the [/etc/hosts] file on all hive nodes so that the line:
/// </para>
/// <code>
/// 127.0.1.1   {hostname}
/// </code>
/// <para>
/// is changed to:
/// </para>
/// <code>
/// {node.PrivateAddress}   {hostname}
/// </code>
/// <para>
/// HashiCorp Vault cannot restart with the old setting, complaining about a
/// <b>missing API address</b>.
/// </para>
/// </summary>
/// <param name="node">The target node.</param>
private void EditEtcHosts(SshProxy<NodeDefinition> node)
{
    node.InvokeIdempotentAction(GetIdempotentTag("edit-etc-hosts"),
        () =>
        {
            var etcHosts   = node.DownloadText("/etc/hosts");
            var sbEtcHosts = new StringBuilder();

            using (var reader = new StringReader(etcHosts))
            {
                foreach (var line in reader.Lines())
                {
                    if (line.StartsWith("127.0.1.1"))
                    {
                        var nodeAddress = node.PrivateAddress.ToString();
                        var separator   = new string(' ', Math.Max(16 - nodeAddress.Length, 1));

                        sbEtcHosts.AppendLine($"{nodeAddress}{separator}{node.Name}");
                    }
                    else
                    {
                        sbEtcHosts.AppendLine(line);
                    }
                }
            }

            node.UploadText("/etc/hosts", sbEtcHosts.ToString(), permissions: "644");
            node.SudoCommand("systemctl restart vault");
        });
}
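// Example of the edit above for a hypothetical node [worker-0] with
// private address [10.0.0.30]:
//
//      before:     127.0.1.1       worker-0
//      after:      10.0.0.30       worker-0
//
// The second form resolves the hostname to a routable address rather than
// the loopback, which appears to be why Vault stops complaining about a
// missing API address.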
/// <summary>
/// Verifies Consul health.
/// </summary>
/// <param name="node">The manager node.</param>
/// <param name="hiveDefinition">The hive definition.</param>
private static void CheckConsul(SshProxy<NodeDefinition> node, HiveDefinition hiveDefinition)
{
    node.Status = "checking: consul";

    // Verify that the daemon is running.

    switch (Program.ServiceManager)
    {
        case ServiceManager.Systemd:

            {
                var output = node.SudoCommand("systemctl status consul", RunOptions.LogOutput).OutputText;

                if (!output.Contains("Active: active (running)"))
                {
                    node.Fault("Consul daemon is not running.");
                    return;
                }
            }
            break;

        default:

            throw new NotImplementedException();
    }
}
/// <summary>
/// Starts a neonHIVE related Docker container on a node and also uploads a script
/// to make it easy to restart the container manually or for hive updates.
/// </summary>
/// <param name="node">The target hive node.</param>
/// <param name="containerName">Identifies the container.</param>
/// <param name="image">The Docker image to be used by the container.</param>
/// <param name="runOptions">Optional run options (defaults to <see cref="RunOptions.FaultOnError"/>).</param>
/// <param name="commands">The commands required to start the container.</param>
/// <remarks>
/// <para>
/// This method performs the following steps:
/// </para>
/// <list type="number">
/// <item>
/// Passes <paramref name="image"/> to <see cref="Program.ResolveDockerImage(string)"/> to
/// obtain the actual image to be started.
/// </item>
/// <item>
/// Generates the first few lines of the script file that sets the
/// default image as the <c>TARGET_IMAGE</c> macro and then overrides
/// this with the script parameter (if there is one).
/// </item>
/// <item>
/// Appends the commands to the script, replacing any text that matches
/// <see cref="ImagePlaceholderArg"/> with <c>${TARGET_IMAGE}</c> to make it easy
/// for services to be upgraded later.
/// </item>
/// <item>
/// Uploads the generated script to the node to [<see cref="HiveHostFolders.Scripts"/>/<paramref name="containerName"/>.sh].
/// </item>
/// <item>
/// Starts the container by running the script.
/// </item>
/// </list>
/// </remarks>
public static void StartContainer(SshProxy<NodeDefinition> node, string containerName, string image, RunOptions runOptions = RunOptions.FaultOnError, params IBashCommandFormatter[] commands)
{
    Covenant.Requires<ArgumentNullException>(node != null);
    Covenant.Requires<ArgumentNullException>(!string.IsNullOrWhiteSpace(containerName));
    Covenant.Requires<ArgumentNullException>(!string.IsNullOrWhiteSpace(image));
    Covenant.Requires<ArgumentNullException>(commands != null);
    Covenant.Requires<ArgumentNullException>(commands.Length > 0);

    node.Status = $"start: {containerName}";

    // Generate the container start script.

    var script = CreateStartScript(containerName, image, true, commands);

    // Upload the script to the target node and set permissions.

    var scriptPath = LinuxPath.Combine(HiveHostFolders.Scripts, $"{containerName}.sh");

    node.UploadText(scriptPath, script);
    node.SudoCommand($"chmod 740 {scriptPath}");

    // Run the script without a parameter to start the container.

    node.IdempotentDockerCommand($"setup/{containerName}", null, runOptions, scriptPath);

    node.Status = string.Empty;
}
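// Usage sketch: a minimal [StartContainer()] invocation.  The container name
// and image below are hypothetical; see [DeployHiveMQ()] further down for a
// production call with a full set of [docker run] options.
private static void StartContainerExample(SshProxy<NodeDefinition> node)
{
    StartContainer(node, "neon-example", "nhive/example", RunOptions.FaultOnError,
        new CommandBundle(
            "docker run",
            "--detach",
            "--name", "neon-example",
            "--restart", "always",
            ImagePlaceholderArg));
}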
/// <summary>
/// Removes the Docker python module from all nodes because it conflicts with
/// Docker related Ansible playbooks.
/// </summary>
/// <param name="node">The target node.</param>
private void RemoveDockerPython(SshProxy<NodeDefinition> node)
{
    node.InvokeIdempotentAction(GetIdempotentTag("remove-docker-py"),
        () =>
        {
            node.SudoCommand("su sysadmin -c 'pip uninstall -y docker'", RunOptions.LogOnErrorOnly);
        });
}
/// <summary>
/// Uploads the setup and other scripts and tools for the target operating system to the server.
/// </summary>
/// <typeparam name="TMetadata">The server's metadata type.</typeparam>
/// <param name="server">The remote server.</param>
/// <param name="clusterDefinition">The cluster definition.</param>
/// <param name="kubeSetupInfo">The Kubernetes setup details.</param>
public static void UploadResources<TMetadata>(this SshProxy<TMetadata> server, ClusterDefinition clusterDefinition, KubeSetupInfo kubeSetupInfo)
    where TMetadata : class
{
    Covenant.Requires<ArgumentNullException>(server != null, nameof(server));
    Covenant.Requires<ArgumentNullException>(clusterDefinition != null, nameof(clusterDefinition));
    Covenant.Requires<ArgumentNullException>(kubeSetupInfo != null, nameof(kubeSetupInfo));

    //-----------------------------------------------------------------
    // Upload resource files to the setup folder.

    server.Status = $"clear: {KubeHostFolders.Setup}";
    server.SudoCommand($"rm -rf {KubeHostFolders.Setup}/*.*");

    // Upload the setup files.

    server.Status = "upload: setup scripts";

    foreach (var file in Program.LinuxFolder.GetFolder("setup").Files())
    {
        server.UploadFile(clusterDefinition, kubeSetupInfo, file, $"{KubeHostFolders.Setup}/{file.Name}");
    }

    // Make the setup scripts executable.

    server.SudoCommand($"chmod 744 {KubeHostFolders.Setup}/*");

    //-----------------------------------------------------------------
    // Upload files to the bin folder.

    server.Status = $"clear: {KubeHostFolders.Bin}";
    server.SudoCommand($"rm -rf {KubeHostFolders.Bin}/*.*");

    // Upload the tool files.  Note that we're going to strip out the [.sh]
    // file type to make these easier to run.

    server.Status = "upload: binary files";

    foreach (var file in Program.LinuxFolder.GetFolder("binary").Files())
    {
        server.UploadFile(clusterDefinition, kubeSetupInfo, file, $"{KubeHostFolders.Bin}/{file.Name.Replace(".sh", string.Empty)}");
    }

    // Make the scripts executable.

    server.SudoCommand($"chmod 744 {KubeHostFolders.Bin}/*");
}
/// <summary>
/// Ensures that the Docker <b>config.json</b> file for the node's root
/// user matches that for the sysadmin user.
/// </summary>
/// <param name="node">The target node.</param>
private void SyncDockerConf(SshProxy<NodeDefinition> node)
{
    // We also need to manage the login for the [root] account due
    // to issue:
    //
    //      https://github.com/jefflill/NeonForge/issues/265

    // $hack(jeff.lill):
    //
    // We're simply going to ensure that the [/root/.docker/config.json]
    // file matches the equivalent file for the node sysadmin account,
    // removing the root file if this was deleted for sysadmin.
    //
    // This is a bit of a hack because it assumes that the Docker config
    // for the root and sysadmin account never diverge, which is probably
    // a reasonable assumption given that these are managed hosts.
    //
    // We're also going to ensure that these directories and files have the
    // correct owners and permissions.

    var bundle = new CommandBundle("./sync.sh");

    bundle.AddFile("sync.sh",
$@"#!/bin/bash

if [ ! -d /root/.docker ] ; then
    mkdir -p /root/.docker
fi

if [ -f /home/{node.Username}/.docker/config.json ] ; then
    cp /home/{node.Username}/.docker/config.json /root/.docker/config.json
else
    if [ -f /root/.docker/config.json ] ; then
        rm /root/.docker/config.json
    fi
fi

if [ -d /root/.docker ] ; then
    chown -R root:root /root/.docker
    chmod 660 /root/.docker/*
fi

if [ -d /home/{node.Username}/.docker ] ; then
    chown -R {node.Username}:{node.Username} /home/{node.Username}/.docker
    chmod 660 /home/{node.Username}/.docker/*
fi
",
        isExecutable: true);

    var response = node.SudoCommand(bundle);

    if (response.ExitCode != 0)
    {
        throw new HiveException(response.ErrorSummary);
    }
}
/// <summary>
/// Updates Consul on a specific node.
/// </summary>
/// <param name="node">The target node.</param>
/// <param name="stepDelay">The step delay.</param>
private void UpdateConsul(SshProxy<NodeDefinition> node, TimeSpan stepDelay)
{
    if (node.GetConsulVersion() >= (SemanticVersion)version)
    {
        return; // Already updated
    }

    node.Status = "stop: consul";
    node.SudoCommand("systemctl stop consul");

    node.Status = "update: consul";

    var bundle = new CommandBundle("./install.sh", version);

    bundle.AddFile("install.sh",
$@"#!/bin/bash

set -euo pipefail

curl {Program.CurlOptions} https://releases.hashicorp.com/consul/$1/consul_$1_linux_amd64.zip -o /tmp/consul.zip 1>&2
unzip -u /tmp/consul.zip -d /tmp
cp /tmp/consul /usr/local/bin
chmod 770 /usr/local/bin/consul

rm /tmp/consul.zip
rm /tmp/consul
",
        isExecutable: true);

    node.SudoCommand(bundle);

    node.Status = "restart: consul";
    node.SudoCommand("systemctl restart consul");

    if (node.Metadata.IsManager)
    {
        node.Status = $"stabilizing ({Program.WaitSeconds}s)";
        Thread.Sleep(TimeSpan.FromSeconds(Program.WaitSeconds));
    }
}
/// <summary>
/// Verifies Docker health.
/// </summary>
/// <param name="node">The target hive node.</param>
/// <param name="hiveDefinition">The hive definition.</param>
private static void CheckDocker(SshProxy<NodeDefinition> node, HiveDefinition hiveDefinition)
{
    node.Status = "checking: docker";

    // This is a super simple ping to verify that Docker appears to be running.

    var response = node.SudoCommand("docker info");

    if (response.ExitCode != 0)
    {
        node.Fault($"Docker: {response.AllText}");
    }
}
/// <summary>
/// Updates Vault on a specific node.
/// </summary>
/// <param name="node">The target node.</param>
/// <param name="stepDelay">The step delay.</param>
private void UpdateVault(SshProxy<NodeDefinition> node, TimeSpan stepDelay)
{
    if (node.GetVaultVersion() >= (SemanticVersion)version)
    {
        return; // Already updated
    }

    node.Status = "update: vault";

    var bundle = new CommandBundle("./install.sh", version);

    bundle.AddFile("install.sh",
$@"#!/bin/bash

set -euo pipefail

curl {Program.CurlOptions} https://releases.hashicorp.com/vault/$1/vault_$1_linux_amd64.zip -o /tmp/vault.zip 1>&2
unzip -o /tmp/vault.zip -d /tmp
rm /tmp/vault.zip

mv /tmp/vault /usr/local/bin/vault
chmod 700 /usr/local/bin/vault
",
        isExecutable: true);

    node.SudoCommand(bundle);

    if (node.Metadata.IsManager)
    {
        node.Status = "restart: vault";
        node.SudoCommand("systemctl restart vault");

        node.Status = "unseal: vault";
        hive.Vault.Unseal();

        node.Status = $"stabilizing ({Program.WaitSeconds}s)";
        Thread.Sleep(TimeSpan.FromSeconds(Program.WaitSeconds));
    }
}
/// <summary>
/// Executes a <b>docker config create</b> command.
/// </summary>
/// <param name="node">The target node.</param>
/// <param name="rightCommandLine">The right split of the command line.</param>
private void ConfigCreate(SshProxy<NodeDefinition> node, CommandLine rightCommandLine)
{
    // We're expecting a command like:
    //
    //      docker config create [OPTIONS] CONFIG file|-
    //
    // where CONFIG is the name of the configuration and [file]
    // is the path to the config file or [-] indicates that
    // the config is streaming in on stdin.
    //
    // We're going to run this as a command bundle that includes
    // the config file.

    if (rightCommandLine.Arguments.Length != 4)
    {
        Console.Error.WriteLine("*** ERROR: Expected: docker config create [OPTIONS] CONFIG file|-");
        Program.Exit(0);
    }

    string fileArg = rightCommandLine.Arguments[3];
    byte[] configData;

    if (fileArg == "-")
    {
        configData = NeonHelper.ReadStandardInputBytes();
    }
    else
    {
        configData = File.ReadAllBytes(fileArg);
    }

    // Create and execute a command bundle.  Note that we're going to hardcode
    // the config data path to [config.data].

    rightCommandLine.Items[rightCommandLine.Items.Length - 1] = "config.data";

    var bundle = new CommandBundle("docker", rightCommandLine.Items);

    bundle.AddFile("config.data", configData);

    var response = node.SudoCommand(bundle, RunOptions.None);

    Console.Write(response.AllText);
    Program.Exit(response.ExitCode);
}
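// Examples of the command lines handled above (the config name, file path,
// and CLI passthrough form are hypothetical):
//
//      neon docker -- config create my-config /tmp/my-config.json
//      cat my-config.json | neon docker -- config create my-config -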
/// <summary>
/// Updates the <b>/etc/systemd/system/ceph-fuse-hivefs.service</b> to adjust restart
/// behavior: https://github.com/jefflill/NeonForge/issues/364
/// </summary>
/// <param name="node">The target node.</param>
private void UpdateCephFuse(SshProxy<NodeDefinition> node)
{
    node.InvokeIdempotentAction(GetIdempotentTag("ceph-fuse"),
        () =>
        {
            node.UploadText("/etc/systemd/system/ceph-fuse-hivefs.service",
@"[Unit]
Description=Ceph FUSE client (for /mnt/hivefs)
After=network-online.target local-fs.target time-sync.target
Wants=network-online.target local-fs.target time-sync.target
Conflicts=umount.target
PartOf=ceph-fuse.target

[Service]
EnvironmentFile=-/etc/default/ceph
Environment=CLUSTER=ceph
ExecStart=/usr/bin/ceph-fuse -f -o nonempty --cluster ${CLUSTER} /mnt/hivefs
TasksMax=infinity

# These settings configure the service to restart always after
# waiting 5 seconds between attempts for up to 365 days (effectively
# forever).  [StartLimitIntervalSec] is set to the number of seconds
# in a year and [StartLimitBurst] is set to the number of 5 second
# intervals in [StartLimitIntervalSec].

Restart=always
RestartSec=5
StartLimitIntervalSec=31536000
StartLimitBurst=6307200

[Install]
WantedBy=ceph-fuse.target
WantedBy=docker.service
",
                permissions: "644");

            // Tell systemd to regenerate its configuration.

            node.SudoCommand("systemctl daemon-reload");
        });
}
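// Sanity check on the restart limits above: one year is
// 365 * 24 * 60 * 60 = 31,536,000 seconds ([StartLimitIntervalSec]), and at
// one attempt per [RestartSec=5] seconds that allows
// 31,536,000 / 5 = 6,307,200 attempts ([StartLimitBurst]).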
/// <summary>
/// Verifies that the node has the correct operating system installed.
/// </summary>
/// <param name="node">The target hive node.</param>
public static void VerifyOS(SshProxy<NodeDefinition> node)
{
    node.Status = "check: OS";

    var response = node.SudoCommand("lsb_release -a");

    switch (Program.OSProperties.TargetOS)
    {
        case TargetOS.Ubuntu_16_04:

            if (!response.OutputText.Contains("Ubuntu 16.04"))
            {
                node.Fault("Expected [Ubuntu 16.04].");
            }
            break;

        default:

            throw new NotImplementedException($"Support for [{nameof(TargetOS)}.{Program.OSProperties.TargetOS}] is not implemented.");
    }
}
/// <summary>
/// Executes a command on a specific hive manager node using the root Vault token.
/// </summary>
/// <param name="manager">The target manager.</param>
/// <param name="command">The command (including the <b>vault</b> command name).</param>
/// <param name="args">The optional arguments.</param>
/// <returns>The command response.</returns>
/// <remarks>
/// <note>
/// This method does not fault or throw an exception if the command returns
/// a non-zero exit code.
/// </note>
/// </remarks>
public CommandResponse CommandNoFault(SshProxy<NodeDefinition> manager, string command, params object[] args)
{
    Covenant.Requires<ArgumentNullException>(manager != null);
    Covenant.Requires<ArgumentNullException>(command != null);

    VerifyToken();

    var scriptBundle = new CommandBundle(command, args);
    var bundle       = new CommandBundle("./vault-command.sh");

    bundle.AddFile("vault-command.sh",
$@"#!/bin/bash
export VAULT_TOKEN={hive.HiveLogin.VaultCredentials.RootToken}
{scriptBundle}
",
        isExecutable: true);

    var response = manager.SudoCommand(bundle, hive.SecureRunOptions);

    response.BashCommand = bundle.ToBash();

    return response;
}
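// Usage sketch: probe Vault seal status on a manager without faulting the
// node when Vault is sealed (a sealed Vault returns a non-zero exit code):
//
//      var response = CommandNoFault(manager, "vault status");
//
//      if (response.ExitCode != 0)
//      {
//          // Vault is sealed or unreachable.
//      }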
/// <summary>
/// Inspects the node to determine physical machine capabilities like
/// processor count, RAM, and primary disk capacity and then sets the
/// corresponding node labels.
/// </summary>
/// <param name="node">The target node.</param>
private void SetLabels(SshProxy<NodeDefinition> node)
{
    CommandResponse result;

    // Download [/proc/meminfo] and extract the [MemTotal] value (in kB).

    result = node.SudoCommand("cat /proc/meminfo");

    if (result.ExitCode == 0)
    {
        var memInfo       = result.OutputText;
        var memTotalRegex = new Regex(@"^MemTotal:\s*(?<size>\d+)\s*kB", RegexOptions.Multiline);
        var memMatch      = memTotalRegex.Match(memInfo);

        if (memMatch.Success && long.TryParse(memMatch.Groups["size"].Value, out var memSizeKiB))
        {
            // Note that the RAM reported by Linux is somewhat less than the
            // physical RAM installed.

            node.Metadata.Labels.ComputeRam = (int)(memSizeKiB / 1024);    // Convert KiB --> MiB
        }
    }

    // Download [/proc/cpuinfo] and count the number of processors.

    result = node.SudoCommand("cat /proc/cpuinfo");

    if (result.ExitCode == 0)
    {
        var cpuInfo          = result.OutputText;
        var processorRegex   = new Regex(@"^processor\s*:\s*\d+", RegexOptions.Multiline);
        var processorMatches = processorRegex.Matches(cpuInfo);

        node.Metadata.Labels.ComputeCores = processorMatches.Count;
    }

    // Determine the primary disk size.

    // $hack(jeff.lill):
    //
    // I'm not entirely sure how to determine which block device is hosting
    // the primary file system for all systems.  For now, I'm just going to
    // assume that this can be one of:
    //
    //      /dev/sda1
    //      /dev/sda
    //      /dev/xvda1
    //      /dev/xvda
    //
    // I'll try each of these in order and set the label from the
    // first reasonable result we get back.

    var blockDevices = new string[]
        {
            "/dev/sda1",
            "/dev/sda",
            "/dev/xvda1",
            "/dev/xvda"
        };

    foreach (var blockDevice in blockDevices)
    {
        result = node.SudoCommand($"lsblk -b --output SIZE -n -d {blockDevice}", RunOptions.LogOutput);

        if (result.ExitCode == 0)
        {
            if (long.TryParse(result.OutputText.Trim(), out var deviceSize) && deviceSize > 0)
            {
                node.Metadata.Labels.StorageSize = ByteUnits.ToGiString(deviceSize);
                break;
            }
        }
    }
}
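// Illustrative inputs for the parsing above (all values are made up):
//
//      /proc/meminfo:  MemTotal:       16314576 kB     --> ComputeRam = 15932 (MiB)
//      /proc/cpuinfo:  processor       : 0             --> ComputeCores = 2
//                      processor       : 1
//      lsblk:          128034708480                    --> StorageSize ~ 119 GiB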
/// <summary>
/// Actually executes the command on the node.
/// </summary>
/// <param name="node">The target node.</param>
private void Execute(SshProxy<NodeDefinition> node)
{
    var status = this.ToString();

    // Limit the node status to a maximum of 80 characters.  For strings
    // longer than this, we're going to scan backwards from character 80
    // until we find a space and then truncate the string at the space
    // so the status will look nice.

    if (status.Length > 80)
    {
        var pos = 80 - "...".Length;    // Leave space for "..."

        for (; pos > 0; pos--)
        {
            if (status[pos] == ' ')
            {
                break;
            }
        }

        if (pos > 0)
        {
            status = status.Substring(0, pos) + "...";
        }
        else
        {
            // Fallback on the chance that a long status has no spaces
            // before the break.

            status = status.Substring(0, 77) + "...";
        }
    }

    node.Status = status;

    if (commandBundle.Count == 0)
    {
        // We can execute the command directly if we're
        // not uploading any files.

        if (isDocker)
        {
            node.DockerCommand(commandBundle.Command, commandBundle.Args);
        }
        else if (Sudo)
        {
            node.SudoCommand(commandBundle.Command, commandBundle.Args);
        }
        else
        {
            throw new NotImplementedException();
        }
    }
    else
    {
        if (isDocker)
        {
            throw new NotImplementedException();
        }
        else if (Sudo)
        {
            node.SudoCommand(commandBundle);
        }
        else
        {
            throw new NotImplementedException();
        }
    }

    StatusPause();

    node.Status = string.Empty;
}
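// Worked example for the truncation above: for a status of 100 characters,
// scanning starts at position 77 (80 minus the 3 character ellipsis) and
// moves left to the nearest space; cutting there and appending "..." keeps
// the rendered status at 80 characters or less.  The fallback cut at 77
// (for a status with no spaces) plus "..." is exactly 80 characters.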
/// <summary>
/// Executes a <b>docker deploy</b> or <b>docker stack deploy</b> command.
/// </summary>
/// <param name="node">The target node.</param>
/// <param name="rightCommandLine">The right split of the command line.</param>
private void Deploy(SshProxy<NodeDefinition> node, CommandLine rightCommandLine)
{
    string path = null;

    // We're going to upload the file specified by the first
    // [--bundle-file], [--compose-file], or [-c] option.

    for (int i = 0; i < rightCommandLine.Items.Length; i++)
    {
        switch (rightCommandLine.Items[i])
        {
            case "--bundle-file":
            case "--compose-file":
            case "-c":

                path = rightCommandLine.Items.Skip(i + 1).FirstOrDefault();
                break;
        }

        if (path != null)
        {
            // Convert the command line argument to a bundle relative path.

            rightCommandLine.Items[i + 1] = Path.GetFileName(rightCommandLine.Items[i + 1]);
            break;
        }
    }

    if (path == null)
    {
        // If that didn't work, try looking for arguments like:
        //
        //      --bundle-file=PATH

        var patterns = new string[]
            {
                "--bundle-file=",
                "--compose-file=",
                "-c="
            };

        for (int i = 0; i < rightCommandLine.Items.Length; i++)
        {
            var item = rightCommandLine.Items[i];

            foreach (var pattern in patterns)
            {
                if (item.StartsWith(pattern))
                {
                    path = item.Substring(pattern.Length);

                    // Convert the command line argument to a bundle relative path.

                    rightCommandLine.Items[i] = pattern + Path.GetFileName(path);
                    break;
                }
            }

            if (path != null)
            {
                break;
            }
        }
    }

    if (path == null)
    {
        Console.Error.WriteLine("*** ERROR: No DAB or compose file specified.");
        Program.Exit(0);
    }

    var bundle = new CommandBundle("docker", rightCommandLine.Items);

    bundle.AddFile(Path.GetFileName(path), File.ReadAllText(path));

    var response = node.SudoCommand(bundle);

    Console.Write(response.AllText);
    Program.Exit(response.ExitCode);
}
/// <summary>
/// Verifies a worker node's NTP health.
/// </summary>
/// <param name="node">The worker node.</param>
/// <param name="clusterDefinition">The cluster definition.</param>
private static void CheckWorkerNtp(SshProxy<NodeDefinition> node, ClusterDefinition clusterDefinition)
{
    // We're going to use [ntpq -pw] to query the configured time sources.
    // We should get something back that looks like:
    //
    //           remote           refid      st t when poll reach   delay   offset  jitter
    //      ==============================================================================
    //       LOCAL(0).LOCL.          10 l  45m   64    0    0.000    0.000   0.000
    //      *10.0.1.5        198.60.22.240    2 u  111  128  377    0.062    3.409   0.608
    //      +10.0.1.7        198.60.22.240    2 u  111  128  377    0.062    3.409   0.608
    //      +10.0.1.7        198.60.22.240    2 u  111  128  377    0.062    3.409   0.608
    //
    // For worker nodes, we need to verify that each of the masters is answering
    // by confirming that their IP addresses are present.

    node.Status = "check: NTP";

    var retryDelay = TimeSpan.FromSeconds(30);
    var fault      = (string)null;
    var firstTry   = true;

tryAgain:

    for (var tries = 0; tries < 6; tries++)
    {
        fault = null;

        var output = node.SudoCommand("/usr/bin/ntpq -pw", RunOptions.LogOutput).OutputText;

        foreach (var master in clusterDefinition.SortedMasters)
        {
            // We're going to check for the presence of the master's IP address
            // or its name, the latter because [ntpq] appears to attempt a reverse
            // IP address lookup which will resolve into one of the DNS names defined
            // in the local [/etc/hosts] file.

            if (!output.Contains(master.PrivateAddress.ToString()) && !output.Contains(master.Name.ToLower()))
            {
                fault = $"NTP: Master [{master.Name}/{master.PrivateAddress}] is not answering.";
                break;
            }
        }

        if (fault == null)
        {
            // Everything looks OK.

            break;
        }

        Thread.Sleep(retryDelay);
    }

    if (fault != null)
    {
        if (firstTry)
        {
            // $hack(jeff.lill):
            //
            // I've seen the NTP check fail on worker nodes, complaining
            // that the connection attempt was rejected.  I manually restarted
            // the node and then it worked.  I'm not sure if the rejected connection
            // was being made to the local NTP service or from the local service
            // to NTP running on the master.
            //
            // I'm going to assume that it was to the local NTP service and I'm
            // going to try mitigating this by restarting the local NTP service
            // and then re-running the tests.  I'm only going to do this once.

            node.SudoCommand("systemctl restart ntp", node.DefaultRunOptions & ~RunOptions.FaultOnError);

            firstTry = false;
            goto tryAgain;
        }

        node.Fault(fault);
    }
}
/// <summary>
/// Customizes the OpenSSH configuration on a node.
/// </summary>
/// <param name="node">The target node.</param>
/// <param name="stepDelayed">Ignored.</param>
public static void ConfigureOpenSSH(SshProxy<NodeDefinition> node, TimeSpan stepDelayed)
{
    // Upload the OpenSSH server configuration, restart OpenSSH and
    // then disconnect and wait for OpenSSH to restart.

    var openSshConfig =
@"# Package generated configuration file
# See the sshd_config(5) manpage for details

# What ports, IPs and protocols we listen for
Port 22

# Use these options to restrict which interfaces/protocols sshd will bind to
#ListenAddress ::
#ListenAddress 0.0.0.0
Protocol 2

# HostKeys for protocol version 2
HostKey /etc/ssh/ssh_host_rsa_key
#HostKey /etc/ssh/ssh_host_dsa_key
#HostKey /etc/ssh/ssh_host_ecdsa_key
#HostKey /etc/ssh/ssh_host_ed25519_key

#Privilege Separation is turned on for security
UsePrivilegeSeparation yes

# Lifetime and size of ephemeral version 1 server key
KeyRegenerationInterval 3600
ServerKeyBits 1024

# Logging
SyslogFacility AUTH
LogLevel INFO

# Authentication:
LoginGraceTime 120
PermitRootLogin prohibit-password
StrictModes yes

RSAAuthentication yes
PubkeyAuthentication yes
#AuthorizedKeysFile %h/.ssh/authorized_keys

# Don't read the user's ~/.rhosts and ~/.shosts files
IgnoreRhosts yes
# For this to work you will also need host keys in /etc/ssh_known_hosts
RhostsRSAAuthentication no
# similar for protocol version 2
HostbasedAuthentication no
# Uncomment if you don't trust ~/.ssh/known_hosts for RhostsRSAAuthentication
#IgnoreUserKnownHosts yes

# To enable empty passwords, change to yes (NOT RECOMMENDED)
PermitEmptyPasswords no

# Change to yes to enable challenge-response passwords (beware issues with
# some PAM modules and threads)
ChallengeResponseAuthentication no

# Change to no to disable tunnelled clear text passwords
#PasswordAuthentication yes

# Kerberos options
#KerberosAuthentication no
#KerberosGetAFSToken no
#KerberosOrLocalPasswd yes
#KerberosTicketCleanup yes

# GSSAPI options
#GSSAPIAuthentication no
#GSSAPICleanupCredentials yes

AllowTcpForwarding no
X11Forwarding no
X11DisplayOffset 10
PrintMotd no
PrintLastLog yes
TCPKeepAlive yes
#UseLogin no

#MaxStartups 10:30:60
#Banner /etc/issue.net

# Allow client to pass locale environment variables
AcceptEnv LANG LC_*

Subsystem sftp /usr/lib/openssh/sftp-server

# Set this to 'yes' to enable PAM authentication, account processing,
# and session processing.  If this is enabled, PAM authentication will
# be allowed through the ChallengeResponseAuthentication and
# PasswordAuthentication.  Depending on your PAM configuration,
# PAM authentication via ChallengeResponseAuthentication may bypass
# the setting of ""PermitRootLogin without-password"".
# If you just want the PAM account and session checks to run without
# PAM authentication, then enable this but set PasswordAuthentication
# and ChallengeResponseAuthentication to 'no'.
UsePAM yes

# Allow connections to be idle for up to 10 minutes (600 seconds)
# before terminating them.  This configuration pings the client every
# 30 seconds for up to 20 times without a response:
#
#   20*30 = 600 seconds

ClientAliveInterval 30
ClientAliveCountMax 20
TCPKeepAlive yes
";

    node.UploadText("/etc/ssh/sshd_config", openSshConfig);
    node.SudoCommand("systemctl restart sshd");
}
/// <summary>
/// Initializes a near virgin server with the basic capabilities required
/// for a cluster host node.
/// </summary>
/// <param name="node">The target cluster node.</param>
/// <param name="clusterDefinition">The cluster definition.</param>
/// <param name="kubeSetupInfo">Kubernetes setup details.</param>
/// <param name="shutdown">Optionally shuts down the node.</param>
public static void PrepareNode(SshProxy<NodeDefinition> node, ClusterDefinition clusterDefinition, KubeSetupInfo kubeSetupInfo, bool shutdown = false)
{
    Covenant.Requires<ArgumentNullException>(node != null);
    Covenant.Requires<ArgumentNullException>(clusterDefinition != null);
    Covenant.Requires<ArgumentNullException>(kubeSetupInfo != null);

    if (node.FileExists($"{KubeHostFolders.State}/setup/prepared"))
    {
        return; // Already prepared
    }

    //-----------------------------------------------------------------
    // Ensure that the cluster host folders exist.

    node.CreateHostFolders();

    //-----------------------------------------------------------------
    // Package manager configuration.

    if (!clusterDefinition.NodeOptions.AllowPackageManagerIPv6)
    {
        // Restrict the [apt] package manager to using IPv4 to communicate
        // with the package mirrors, since IPv6 often doesn't work.

        node.UploadText("/etc/apt/apt.conf.d/99-force-ipv4-transport", "Acquire::ForceIPv4 \"true\";");
        node.SudoCommand("chmod 644 /etc/apt/apt.conf.d/99-force-ipv4-transport");
    }

    // Configure [apt] to retry.

    node.UploadText("/etc/apt/apt.conf.d/99-retries", $"APT::Acquire::Retries \"{clusterDefinition.NodeOptions.PackageManagerRetries}\";");
    node.SudoCommand("chmod 644 /etc/apt/apt.conf.d/99-retries");

    //-----------------------------------------------------------------
    // Other configuration.

    ConfigureOpenSSH(node, TimeSpan.Zero);

    node.UploadConfigFiles(clusterDefinition, kubeSetupInfo);
    node.UploadResources(clusterDefinition, kubeSetupInfo);

    if (clusterDefinition != null)
    {
        ConfigureEnvironmentVariables(node, clusterDefinition);
    }

    node.SudoCommand("safe-apt-get update");

    node.InvokeIdempotentAction("setup/prep-node",
        () =>
        {
            node.Status = "preparing";
            node.SudoCommand("setup-prep.sh");
            node.Reboot(wait: true);
        });

    // We need to upload the cluster configuration and initialize drives attached
    // to the node.  We're going to assume that these are not already initialized.

    // $todo(jeff.lill):
    //
    // We may need an option that allows an operator to pre-build a hardware
    // based drive array or something.  I'm going to defer this to later and
    // concentrate on commodity hardware and cloud deployments for now.

    CommonSteps.ConfigureEnvironmentVariables(node, clusterDefinition);

    node.Status = "setup: disk";
    node.SudoCommand("setup-disk.sh");

    // Clear any DHCP leases to be super sure that cloned node
    // VMs will obtain fresh IP addresses.

    node.Status = "clear: DHCP leases";
    node.SudoCommand("rm -f /var/lib/dhcp/*");

    // Indicate that the node has been fully prepared.

    node.SudoCommand($"touch {KubeHostFolders.State}/setup/prepared");

    // Shutdown the node if requested.

    if (shutdown)
    {
        node.Status = "shutdown";
        node.SudoCommand("shutdown 0", RunOptions.Defaults | RunOptions.Shutdown);
    }
}
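// Usage sketch: this is typically invoked once per node from a setup
// controller step; the enumeration below is hypothetical.
//
//      foreach (var node in cluster.Nodes)
//      {
//          PrepareNode(node, clusterDefinition, kubeSetupInfo, shutdown: true);
//      }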
/// <summary>
/// Verifies a master node's NTP health.
/// </summary>
/// <param name="node">The master node.</param>
/// <param name="clusterDefinition">The cluster definition.</param>
private static void CheckMasterNtp(SshProxy<NodeDefinition> node, ClusterDefinition clusterDefinition)
{
    // We're going to use [ntpq -pw] to query the configured time sources.
    // We should get something back that looks like:
    //
    //           remote           refid      st t when poll reach   delay   offset  jitter
    //      ==============================================================================
    //       LOCAL(0).LOCL.          10 l  45m   64    0    0.000    0.000   0.000
    //      *clock.xmission. .GPS.            1 u  134  256  377   48.939   -0.549  18.357
    //      +173.44.32.10    18.26.4.105      2 u  200  256  377   96.981   -0.623   3.284
    //      +pacific.latt.ne 44.24.199.34     3 u  243  256  377   41.457   -8.929   8.497
    //
    // For master nodes, we're simply going to verify that we have at least one external
    // time source answering.

    node.Status = "check: NTP";

    var retryDelay = TimeSpan.FromSeconds(30);
    var fault      = (string)null;

    for (int tryCount = 0; tryCount < 6; tryCount++)
    {
        var response = node.SudoCommand("/usr/bin/ntpq -pw", RunOptions.LogOutput);

        if (response.ExitCode != 0)
        {
            Thread.Sleep(retryDelay);
            continue;
        }

        using (var reader = response.OpenOutputTextReader())
        {
            string line;

            // Column header and table bar lines.

            line = reader.ReadLine();

            if (string.IsNullOrWhiteSpace(line))
            {
                fault = "NTP: Invalid [ntpq -pw] response.";

                Thread.Sleep(retryDelay);
                continue;
            }

            line = reader.ReadLine();

            if (string.IsNullOrWhiteSpace(line) || line[0] != '=')
            {
                fault = "NTP: Invalid [ntpq -pw] response.";

                Thread.Sleep(retryDelay);
                continue;
            }

            // Count the source lines that don't include [*.LOCL.*],
            // the local clock.

            var sourceCount = 0;

            for (line = reader.ReadLine(); line != null; line = reader.ReadLine())
            {
                if (line.Length > 0 && !line.Contains(".LOCL."))
                {
                    sourceCount++;
                }
            }

            if (sourceCount == 0)
            {
                fault = "NTP: No external sources are answering.";

                Thread.Sleep(retryDelay);
                continue;
            }

            // Everything looks good.  Clear any fault recorded by an
            // earlier iteration before exiting the retry loop.

            fault = null;
            break;
        }
    }

    if (fault != null)
    {
        node.Fault(fault);
    }
}
/// <summary>
/// Deploys RabbitMQ to a cluster node as a container.
/// </summary>
/// <param name="node">The target hive node.</param>
private void DeployHiveMQ(SshProxy<NodeDefinition> node)
{
    // Deploy RabbitMQ only on the labeled nodes.

    if (node.Metadata.Labels.HiveMQ)
    {
        // Build a comma separated list of fully qualified RabbitMQ hostnames so we
        // can pass them as the CLUSTER environment variable.

        var rabbitNodes = hive.Definition.SortedNodes.Where(n => n.Labels.HiveMQ).ToList();
        var sbCluster   = new StringBuilder();

        foreach (var rabbitNode in rabbitNodes)
        {
            sbCluster.AppendWithSeparator($"{rabbitNode.Name}@{rabbitNode.Name}.{hive.Definition.Hostnames.HiveMQ}", ",");
        }

        var hipeCompileArgs = new List<string>();

        if (hive.Definition.HiveMQ.Precompile)
        {
            hipeCompileArgs.Add("--env");
            hipeCompileArgs.Add("RABBITMQ_HIPE_COMPILE=1");
        }

        var managementPluginArgs = new List<string>();

        if (node.Metadata.Labels.HiveMQManager)
        {
            managementPluginArgs.Add("--env");
            managementPluginArgs.Add("MANAGEMENT_PLUGIN=true");
        }

        // $todo(jeff.lill):
        //
        // I was unable to get TLS working correctly for RabbitMQ.  I'll come back
        // and revisit this later:
        //
        //      https://github.com/jefflill/NeonForge/issues/319

        ServiceHelper.StartContainer(node, "neon-hivemq", hive.Definition.Image.HiveMQ, RunOptions.FaultOnError,
            new CommandBundle(
                "docker run",
                "--detach",
                "--name", "neon-hivemq",
                "--env", $"CLUSTER_NAME={hive.Definition.Name}",
                "--env", $"CLUSTER_NODES={sbCluster}",
                "--env", $"CLUSTER_PARTITION_MODE=autoheal",
                "--env", $"NODENAME={node.Name}@{node.Name}.{hive.Definition.Hostnames.HiveMQ}",
                "--env", $"RABBITMQ_USE_LONGNAME=true",
                "--env", $"RABBITMQ_DEFAULT_USER=sysadmin",
                "--env", $"RABBITMQ_DEFAULT_PASS=password",
                "--env", $"RABBITMQ_NODE_PORT={HiveHostPorts.HiveMQAMQP}",
                "--env", $"RABBITMQ_DIST_PORT={HiveHostPorts.HiveMQDIST}",
                "--env", $"RABBITMQ_MANAGEMENT_PORT={HiveHostPorts.HiveMQManagement}",
                "--env", $"RABBITMQ_ERLANG_COOKIE={hive.Definition.HiveMQ.ErlangCookie}",
                "--env", $"RABBITMQ_VM_MEMORY_HIGH_WATERMARK={hive.Definition.HiveMQ.RamHighWatermark}",
                hipeCompileArgs,
                managementPluginArgs,
                "--env", $"RABBITMQ_DISK_FREE_LIMIT={HiveDefinition.ValidateSize(hive.Definition.HiveMQ.DiskFreeLimit, typeof(HiveMQOptions), nameof(hive.Definition.HiveMQ.DiskFreeLimit))}",
                //"--env", $"RABBITMQ_SSL_CERTFILE=/etc/neon/certs/hive.crt",
                //"--env", $"RABBITMQ_SSL_KEYFILE=/etc/neon/certs/hive.key",
                "--env", $"ERL_EPMD_PORT={HiveHostPorts.HiveMQEPMD}",
                "--mount", "type=volume,source=neon-hivemq,target=/var/lib/rabbitmq",
                "--mount", "type=bind,source=/etc/neon/certs,target=/etc/neon/certs,readonly",
                "--publish", $"{HiveHostPorts.HiveMQEPMD}:{HiveHostPorts.HiveMQEPMD}",
                "--publish", $"{HiveHostPorts.HiveMQAMQP}:{HiveHostPorts.HiveMQAMQP}",
                "--publish", $"{HiveHostPorts.HiveMQDIST}:{HiveHostPorts.HiveMQDIST}",
                "--publish", $"{HiveHostPorts.HiveMQManagement}:{HiveHostPorts.HiveMQManagement}",
                "--memory", HiveDefinition.ValidateSize(hive.Definition.HiveMQ.RamLimit, typeof(HiveMQOptions), nameof(hive.Definition.HiveMQ.RamLimit)),
                "--restart", "always",
                ServiceHelper.ImagePlaceholderArg));

        // Wait for the RabbitMQ node to report that it's ready.

        var timeout  = TimeSpan.FromMinutes(4);
        var pollTime = TimeSpan.FromSeconds(2);

        node.Status = "hivemq: waiting";

        try
        {
            NeonHelper.WaitFor(
                () =>
                {
                    var readyResponse = node.SudoCommand($"docker exec neon-hivemq rabbitmqctl node_health_check -n {node.Name}@{node.Name}.{hive.Definition.Hostnames.HiveMQ}", node.DefaultRunOptions & ~RunOptions.FaultOnError);

                    return readyResponse.ExitCode == 0;
                },
                timeout: timeout,
                pollTime: pollTime);
        }
        catch (TimeoutException)
        {
            node.Fault($"RabbitMQ not ready after waiting [{timeout}].");
            return;
        }

        node.Status = "hivemq: ready";
    }
}
/// <summary>
/// Performs the Docker registry cache related configuration of the node.
/// </summary>
/// <param name="node">The target node.</param>
public void Configure(SshProxy<NodeDefinition> node)
{
    // NOTE:
    //
    // We're going to configure the certificates even if the registry cache
    // isn't enabled so it'll be easier to upgrade the hive later.

    // For managers, upload the individual cache certificate and
    // private key files for managers [cache.crt] and [cache.key] at
    // [/etc/neon-registry-cache/].  This directory will be
    // mapped into the cache container.
    //
    // Then create the cache's data volume and start the manager's
    // registry cache container.

    if (node.Metadata.IsManager)
    {
        node.InvokeIdempotentAction("setup/registrycache",
            () =>
            {
                // Copy the registry cache certificate and private key to:
                //
                //      /etc/neon-registry-cache

                node.Status = "run: registry-cache-server-certs.sh";

                var copyCommand  = new CommandBundle("./registry-cache-server-certs.sh");
                var sbCopyScript = new StringBuilder();

                sbCopyScript.AppendLine("mkdir -p /etc/neon-registry-cache");
                sbCopyScript.AppendLine("chmod 750 /etc/neon-registry-cache");

                copyCommand.AddFile($"cache.crt", hive.HiveLogin.HiveCertificate.CertPem);
                copyCommand.AddFile($"cache.key", hive.HiveLogin.HiveCertificate.KeyPem);

                sbCopyScript.AppendLine($"cp cache.crt /etc/neon-registry-cache/cache.crt");
                sbCopyScript.AppendLine($"cp cache.key /etc/neon-registry-cache/cache.key");
                sbCopyScript.AppendLine($"chmod 640 /etc/neon-registry-cache/*");

                copyCommand.AddFile("registry-cache-server-certs.sh", sbCopyScript.ToString(), isExecutable: true);
                node.SudoCommand(copyCommand);

                // Upload the cache certificates to every hive node at:
                //
                //      /etc/docker/certs.d/<hostname>:{HiveHostPorts.RegistryCache}/ca.crt
                //
                // and then have Linux reload the trusted certificates.

                node.InvokeIdempotentAction("setup/registrycache-cert",
                    () =>
                    {
                        node.Status = "upload: registry cache certs";

                        var uploadCommand  = new CommandBundle("./registry-cache-client-certs.sh");
                        var sbUploadScript = new StringBuilder();

                        uploadCommand.AddFile($"hive-neon-registry-cache.crt", hive.HiveLogin.HiveCertificate.CertPem);

                        foreach (var manager in hive.Definition.SortedManagers)
                        {
                            var cacheHostName = hive.Definition.GetRegistryCacheHost(manager);

                            sbUploadScript.AppendLine($"mkdir -p /etc/docker/certs.d/{cacheHostName}:{HiveHostPorts.DockerRegistryCache}");
                            sbUploadScript.AppendLine($"cp hive-neon-registry-cache.crt /etc/docker/certs.d/{cacheHostName}:{HiveHostPorts.DockerRegistryCache}/ca.crt");
                        }

                        uploadCommand.AddFile("registry-cache-client-certs.sh", sbUploadScript.ToString(), isExecutable: true);
                        node.SudoCommand(uploadCommand);
                    });

                // Start the registry cache containers if enabled for the hive.

                if (hive.Definition.Docker.RegistryCache)
                {
                    // Create the registry data volume.

                    node.Status = "create: registry cache volume";
                    node.SudoCommand(new CommandBundle("docker-volume-create \"neon-registry-cache\""));

                    // Start the registry cache using the required Docker public registry
                    // credentials, if any.

                    var publicRegistryCredentials = hive.Definition.Docker.Registries.SingleOrDefault(r => HiveHelper.IsDockerPublicRegistry(r.Registry));

                    publicRegistryCredentials          = publicRegistryCredentials ?? new RegistryCredentials() { Registry = HiveConst.DockerPublicRegistry };
                    publicRegistryCredentials.Username = publicRegistryCredentials.Username ?? string.Empty;
                    publicRegistryCredentials.Password = publicRegistryCredentials.Password ?? string.Empty;

                    node.Status = "start: neon-registry-cache";

                    var registry = publicRegistryCredentials.Registry;

                    if (string.IsNullOrEmpty(registry) || registry.Equals("docker.io", StringComparison.InvariantCultureIgnoreCase))
                    {
                        registry = "registry-1.docker.io";
                    }

                    ServiceHelper.StartContainer(node, "neon-registry-cache", hive.Definition.Image.RegistryCache, RunOptions.FaultOnError | hive.SecureRunOptions,
                        new CommandBundle(
                            "docker run",
                            "--name", "neon-registry-cache",
                            "--detach",
                            "--restart", "always",
                            "--publish", $"{HiveHostPorts.DockerRegistryCache}:5000",
                            "--volume", "/etc/neon-registry-cache:/etc/neon-registry-cache:ro",     // Registry cache certificates folder
                            "--volume", "neon-registry-cache:/var/lib/neon-registry-cache",
                            "--env", $"HOSTNAME={node.Name}.{hive.Definition.Hostnames.RegistryCache}",
                            "--env", $"REGISTRY=https://{registry}",
                            "--env", $"USERNAME={publicRegistryCredentials.Username}",
                            "--env", $"PASSWORD={publicRegistryCredentials.Password}",
                            "--env", "LOG_LEVEL=info",
                            ServiceHelper.ImagePlaceholderArg));
                }
            });

        node.Status = string.Empty;
    }
}
/// <summary>
/// Uploads the setup and other scripts and tools for the target operating system to the server.
/// </summary>
/// <typeparam name="TMetadata">The server's metadata type.</typeparam>
/// <param name="server">The remote server.</param>
/// <param name="hiveDefinition">The hive definition or <c>null</c>.</param>
public static void UploadResources<TMetadata>(this SshProxy<TMetadata> server, HiveDefinition hiveDefinition = null)
    where TMetadata : class
{
    Covenant.Requires<ArgumentNullException>(server != null);

    //-----------------------------------------------------------------
    // Upload resource files to the setup folder.

    server.Status = $"clear: {HiveHostFolders.Setup}";
    server.SudoCommand($"rm -rf {HiveHostFolders.Setup}/*.*");

    // Upload the setup files.

    server.Status = "upload: setup files";

    foreach (var file in Program.LinuxFolder.GetFolder("setup").Files())
    {
        server.UploadFile(hiveDefinition, file, $"{HiveHostFolders.Setup}/{file.Name}");
    }

    // Make the setup scripts executable.

    server.SudoCommand($"chmod 744 {HiveHostFolders.Setup}/*");

    // Uncomment this if/when we have to upload source files.

#if FALSE
    //-----------------------------------------------------------------
    // Upload resource files to the source folder.  Note that we're going
    // to convert to Linux style line endings and we're going to convert
    // leading spaces into TABs (4 spaces == 1 TAB).

    // $hack(jeff.lill):
    //
    // This is hardcoded to assume that the source consists of a single level
    // folder with the source files.  If the folders nest any further, we'll
    // need to implement a recursive method to handle this properly.
    //
    // This code also assumes that the folder and file names do not include
    // any spaces.

    server.Status = $"clear: {HiveHostFolders.Source}";
    server.SudoCommand($"rm -rf {HiveHostFolders.Source}/*.*");

    // Upload the source files.

    server.Status = "upload: source files";

    foreach (var folder in Program.LinuxFolder.GetFolder("source").Folders())
    {
        foreach (var file in folder.Files())
        {
            var targetPath = $"{HiveHostFolders.Source}/{folder.Name}/{file.Name}";

            server.UploadText(targetPath, file.Contents, tabStop: -4);
            server.SudoCommand("chmod 664", targetPath);
        }
    }
#endif

    //-----------------------------------------------------------------
    // Upload files to the tools folder.

    server.Status = $"clear: {HiveHostFolders.Tools}";
    server.SudoCommand($"rm -rf {HiveHostFolders.Tools}/*.*");

    // Upload the tool files.  Note that we're going to strip out the [.sh]
    // file type to make these easier to run.

    server.Status = "upload: tool files";

    foreach (var file in Program.LinuxFolder.GetFolder("tools").Files())
    {
        server.UploadFile(hiveDefinition, file, $"{HiveHostFolders.Tools}/{file.Name.Replace(".sh", string.Empty)}");
    }

    // Make the scripts executable.

    server.SudoCommand($"chmod 744 {HiveHostFolders.Tools}/*");
}
/// <summary>
/// Configures OpenVPN on a manager node.
/// </summary>
/// <param name="manager">The manager.</param>
private void ConfigManagerVpn(SshProxy<NodeDefinition> manager)
{
    // Upload the setup and configuration files.
    //
    // NOTE:
    //
    // These steps are redundant and will be repeated during the
    // common node configuration, but we need some of the scripts
    // here, before that happens.

    manager.CreateHiveHostFolders();
    manager.UploadConfigFiles(hive.Definition);
    manager.UploadResources(hive.Definition);

    // Install OpenVPN.

    manager.Status = "vpn install";
    manager.SudoCommand("safe-apt-get update");
    manager.SudoCommand("safe-apt-get install -yq openvpn");

    // Configure OpenVPN.

    var nodesSubnet      = NetworkCidr.Parse(hive.Definition.Network.NodesSubnet);
    var vpnSubnet        = NetworkCidr.Parse(manager.Metadata.VpnPoolSubnet);
    var duplicateCN      = hive.Definition.Vpn.AllowSharedCredentials ? "duplicate-cn" : ";duplicate-cn";
    var vpnServerAddress = NetHelper.UintToAddress(NetHelper.AddressToUint(vpnSubnet.Address) + 1);

    var serverConf =
$@"#------------------------------------------------------------------------------
# OpenVPN config file customized for the [{manager.Name}] neonHIVE manager node.

# OpenVPN listening port.
port {NetworkPorts.OpenVPN}

# Enable TCP and/or UDP transports.
proto tcp
;proto udp

# Set packet tunneling mode.
dev tun

# SSL/TLS root certificate (ca), certificate
# (cert), and private key (key).  Each client
# and the server must have their own cert and
# key file.  The server and all clients will
# use the same ca file.
#
# See the [easy-rsa] directory for a series
# of scripts for generating RSA certificates
# and private keys.  Remember to use
# a unique Common Name for the server
# and each of the client certificates.
#
# Any X509 key management system can be used.
# OpenVPN can also use a PKCS #12 formatted key file
# (see [pkcs12] directive in man page).
ca ca.crt
cert server.crt
key server.key  # This file should be kept secret

# Diffie-Hellman parameters (2048-bit) generated via:
#
#   openssl dhparam -out dhparam.pem 2048
#
dh dhparam.pem

# The currently recommended topology.
topology subnet

# Configure server mode and supply a VPN subnet
# for OpenVPN to draw client addresses from.
# The server will take {vpnServerAddress} for itself,
# the rest will be made available to clients.
# Each client will be able to reach the server
# on {vpnServerAddress}.  Comment this line out if you are
# ethernet bridging.  See the man page for more info.
server {vpnSubnet.Address} {vpnSubnet.Mask}

# Maintain a record of client virtual IP address
# associations in this file.  If OpenVPN goes down or
# is restarted, reconnecting clients can be assigned
# the same virtual IP address from the pool that was
# previously assigned.
;ifconfig-pool-persist ipp.txt

# Push routes to the client to allow it
# to reach other private subnets behind
# the server.  Remember that these
# private subnets will also need
# to know to route the OpenVPN client
# address pool ({vpnSubnet.Address})
# back to this specific OpenVPN server.
push ""route {nodesSubnet.Address} {nodesSubnet.Mask}""

# Uncomment this directive if multiple clients
# might connect with the same certificate/key
# files or common names.  This is recommended
# only for testing purposes.  For production use,
# each client should have its own certificate/key
# pair.
{duplicateCN}

# The keepalive directive causes ping-like
# messages to be sent back and forth over
# the link so that each side knows when
# the other side has gone down.
# Ping every 10 seconds, assume that remote
# peer is down if no ping received during
# a 120 second time period.
keepalive 10 120

# For extra security beyond that provided
# by SSL/TLS, create an [HMAC firewall]
# to help block DoS attacks and UDP port flooding.
#
# Generate with:
#   openvpn --genkey --secret ta.key
#
# The server and each client must have
# a copy of this key.
# The second parameter should be '0'
# on the server and '1' on the clients.
tls-auth ta.key 0 # This file is secret

# Select a cryptographic cipher.
# This config item must be copied to
# the client config file as well.
cipher AES-256-CBC

# Enable compression on the VPN link.
# Don't enable this unless it is also
# enabled in the client config file.
#
# We're not enabling this due to the
# VORACLE security vulnerability:
#
#   https://community.openvpn.net/openvpn/wiki/VORACLE

# The maximum number of concurrently connected
# clients we want to allow.
max-clients {VpnOptions.ServerAddressCount - 2}

# This macro sets the TCP_NODELAY socket flag on
# the server as well as pushes it to connecting
# clients.  The TCP_NODELAY flag disables the Nagle
# algorithm on TCP sockets causing packets to be
# transmitted immediately with low latency, rather
# than waiting a short period of time in order to
# aggregate several packets into a larger containing
# packet.  In VPN applications over TCP, TCP_NODELAY
# is generally a good latency optimization.
tcp-nodelay

# It's a good idea to reduce the OpenVPN
# daemon's privileges after initialization.
#
# You can uncomment this out on
# non-Windows systems.
;user nobody
;group nobody

# The persist options will try to avoid
# accessing certain resources on restart
# that may no longer be accessible because
# of the privilege downgrade.
persist-key
persist-tun

# Output a short status file showing
# current connections, truncated
# and rewritten every minute.
status openvpn-status.log

# By default, log messages will go to the syslog (or
# on Windows, if running as a service, they will go to
# the [\Program Files\OpenVPN\log] directory).
# Use log or log-append to override this default.
# [log] will truncate the log file on OpenVPN startup,
# while [log-append] will append to it.  Use one
# or the other (but not both).
log /var/log/openvpn.log
;log-append openvpn.log

# Set the appropriate level of log
# file verbosity.
#
# 0 is silent, except for fatal errors
# 4 is reasonable for general usage
# 5 and 6 can help to debug connection problems
# 9 is extremely verbose
verb 4

# Silence repeating messages.  At most 20
# sequential messages of the same message
# category will be output to the log.
;mute 20
";

    manager.Status = "vpn config";
    manager.SudoCommand("mkdir -p /etc/openvpn");
    manager.UploadText("/etc/openvpn/server.conf", serverConf);

    manager.UploadText("/etc/openvpn/ca.crt", vpnCaFiles.GetCert("ca"));
    manager.UploadText("/etc/openvpn/server.crt", vpnCaFiles.GetCert("server"));
    manager.UploadText("/etc/openvpn/server.key", vpnCaFiles.GetKey("server"));
    manager.SudoCommand("chmod 600", "/etc/openvpn/server.key");    // This is a secret!

    manager.UploadText("/etc/openvpn/ta.key", vpnCaFiles.GetTaKey());
    manager.SudoCommand("chmod 600", "/etc/openvpn/ta.key");        // This is a secret too!

    manager.UploadText("/etc/openvpn/dhparam.pem", vpnCaFiles.GetDHParam());

    // Initialize the [root] user's credentials.

    vpnCredentials =
        new VpnCredentials()
        {
            CaCert   = vpnCaFiles.GetCert("ca"),
            UserCert = vpnCaFiles.GetCert(HiveConst.RootUser),
            UserKey  = vpnCaFiles.GetKey(HiveConst.RootUser),
            TaKey    = vpnCaFiles.GetTaKey(),
            CaZipKey = VpnCaFiles.GenerateKey(),
            CaZip    = vpnCaFiles.ToZipBytes()
        };

    // Upload the initial (empty) Certificate Revocation List (CRL) file and then
    // upload an OpenVPN systemd unit drop-in so that it will recognize revoked certificates.

    manager.UploadText("/etc/openvpn/crl.pem", vpnCaFiles.GetFile("crl.pem"));
    manager.SudoCommand("chmod 664", "/etc/openvpn/crl.pem");   // OpenVPN needs to be able to read this after having its privileges downgraded.

    var openVpnUnit =
@"[Unit]
Description=OpenVPN connection to %i
PartOf=openvpn.service
ReloadPropagatedFrom=openvpn.service
Before=systemd-user-sessions.service
Documentation=man:openvpn(8)
Documentation=https://community.openvpn.net/openvpn/wiki/Openvpn23ManPage
Documentation=https://community.openvpn.net/openvpn/wiki/HOWTO

[Service]
PrivateTmp=true
KillMode=mixed
Type=forking
ExecStart=/usr/sbin/openvpn --daemon ovpn-%i --status /run/openvpn/%i.status 10 --cd /etc/openvpn --script-security 2 --config /etc/openvpn/%i.conf --writepid /run/openvpn/%i.pid --crl-verify /etc/openvpn/crl.pem
PIDFile=/run/openvpn/%i.pid
ExecReload=/bin/kill -HUP $MAINPID
WorkingDirectory=/etc/openvpn
ProtectSystem=yes
CapabilityBoundingSet=CAP_IPC_LOCK CAP_NET_ADMIN CAP_NET_BIND_SERVICE CAP_NET_RAW CAP_SETGID CAP_SETUID CAP_SYS_CHROOT CAP_DAC_READ_SEARCH CAP_AUDIT_WRITE
LimitNPROC=10
DeviceAllow=/dev/null rw
DeviceAllow=/dev/net/tun rw

[Install]
WantedBy=multi-user.target
";

    manager.UploadText("/etc/systemd/system/[email protected]", openVpnUnit);
    manager.SudoCommand("chmod 644 /etc/systemd/system/[email protected]");

    // Do a daemon-reload so systemd will be aware of the new drop-in.

    manager.SudoCommand("systemctl disable openvpn");
    manager.SudoCommand("systemctl daemon-reload");

    // Enable and restart OpenVPN.

    manager.SudoCommand("systemctl enable openvpn");
    manager.SudoCommand("systemctl restart openvpn");

    //-----------------------------------------------------------------
    // SPECIAL NOTE:
    //
    // I figured out that I need this lovely bit of code after banging my head on the desk for
    // 12 freaking days.  The problem was getting OpenVPN to work in Windows Azure (this will
    // also probably impact other cloud environments).
    //
    // Azure implements VNETs as layer 3 overlays.  This means that the host network interfaces
    // are not actually on an ethernet segment and the VPN default gateway is actually handling
    // all of the ARP packets, routing between the VNET subnets, load balancers, and the Internet.
    // This is problematic for OpenVPN traffic because the VPN client IP address space is not
    // part of the VNET which means the VNET gateway is not able to route packets from hive
    // hosts back to the manager's OpenVPN client addresses by default.
    //
    // The solution is to configure the managers with secondary NIC cards in a different subnet
    // and provision special Azure user-defined routes that direct VPN return packets to the
    // correct manager.
    //
    // I figured this part out the second day.  The problem was though that it simply didn't work.
    // From an external VPN client, I would try to ping a worker node through OpenVPN running on
    // a manager.  I'd see the ping traffic:
    //
    //      1. manager/tun0: request
    //      2. manager/eth1: request
    //      3. worker/eth0:  request
    //      4. worker/eth0:  reply
    //      5. manager/eth0: reply
    //      6. NOTHING!  EXPECTED: manager/tun0: reply
    //
    // So the problem was that I could see the ICMP ping request hit the various interfaces
    // on the manager and be received by the worker.  I'd then see the worker send the reply,
    // and be routed via the user-defined Azure rule back to the manager.  The problem was
    // that the packet was simply dropped there.  It never made it back to tun0 so OpenVPN
    // could forward it back to the client.
    //
    // After days and days of trying to learn about Linux routing, iptables and policy rules,
    // I finally ran across this posting for the second time:
    //
    //      https://unix.stackexchange.com/questions/21093/output-traffic-on-different-interfaces-based-on-destination-port
    //
    // This was the key.  I ran across this a few days ago and didn't read it closely enough.
    // It made more sense after learning more about this stuff.
    //
    // Linux has a built-in IP address spoofing filter enabled by default.  This filter has the
    // kernel discard any packets whose source address doesn't match the IP address/route implied
    // by the remote interface that transmitted the packet.  This is exactly what's happening
    // when Azure forwards the VPN return packets via the user-defined route.  I'd see return
    // packets hit eth0 on the manager, be processed by the low-level RAW and MANGLE iptables
    // and then they'd disappear.
    //
    // The solution is simply to disable the spoofing filter.  I'm going to go ahead and do this
    // for all interfaces which should be fine for hives hosted in cloud environments, because the
    // VNET/Load Balancer/Security Groups will be used to lock things down.  Local hives will
    // need to be manually placed behind a suitable router/firewall as well.
    //
    // For robustness, I'm going to deploy this as a service daemon that polls the filter state
    // for each interface every 5 seconds, and disables any enabled filters.  This will ensure
    // that the filters will always be disabled, even as interfaces are brought up and down.

    var disableSpoofUnit =
$@"[Unit]
Description=Disable Network Anti-Spoofing Filters
Documentation=
After=
Requires=
Before=

[Service]
Type=simple
ExecStart={HiveHostFolders.Bin}/disable-spoof-filters.sh

[Install]
WantedBy=multi-user.target
";

    var disableSpoofScript =
@"#!/bin/bash
#------------------------------------------------------------------------------
# This script is deployed as a service to ensure that the Linux anti-spoofing
# filters are disabled for the network interfaces on manager nodes hosting
# OpenVPN.  This is required to allow VPN return traffic from other nodes to
# be routed back to tun0 and ultimately, connected VPN clients.
#
# Note that it appears that we need to disable the filter for all interfaces
# for this to actually work.

while :
do
    flush=false

    for f in /proc/sys/net/ipv4/conf/*/rp_filter
    do
        filter_enabled=$(cat $f)

        if [ ""$filter_enabled"" == ""1"" ] ; then
            echo 0 > $f
            flush=true
        fi
    done

    if [ ""$flush"" == ""true"" ] ; then
        echo 1 > /proc/sys/net/ipv4/route/flush
    fi

    sleep 5
done";

    manager.UploadText("/lib/systemd/system/disable-spoof-filters.service", disableSpoofUnit);
    manager.SudoCommand("chmod 644 /lib/systemd/system/disable-spoof-filters.service");

    manager.UploadText($"{HiveHostFolders.Bin}/disable-spoof-filters.sh", disableSpoofScript);
    manager.SudoCommand($"chmod 770 {HiveHostFolders.Bin}/disable-spoof-filters.sh");

    manager.SudoCommand("systemctl enable disable-spoof-filters");
    manager.SudoCommand("systemctl restart disable-spoof-filters");
}
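// Verification sketch for the anti-spoofing workaround above: once the
// [disable-spoof-filters] service is running, every reverse path filter
// should read zero, which can be spot checked on a manager via:
//
//      cat /proc/sys/net/ipv4/conf/eth0/rp_filter      # expect: 0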