/// <summary>
/// Updates docker on a hive node.
/// </summary>
/// <param name="hive">The target hive.</param>
/// <param name="node">The target node.</param>
/// <param name="dockerPackageUri">The Docker Debian package URI.</param>
/// <remarks>
/// The node is drained from the swarm first (when it's a member) so services
/// migrate away before Docker is stopped, and is reactivated afterwards.
/// Any failure faults the node rather than throwing.
/// </remarks>
private static void UpdateDocker(HiveProxy hive, SshProxy<NodeDefinition> node, string dockerPackageUri)
{
    try
    {
        if (node.Metadata.InSwarm)
        {
            // Drain swarm services off this node so the Docker restart
            // below won't interrupt running workloads.

            node.Status = "swarm: drain services";
            hive.Docker.DrainNode(node.Name);
        }

        node.Status = "stop: docker";
        node.SudoCommand("systemctl stop docker").EnsureSuccess();

        node.Status = "download: docker package";
        node.SudoCommand($"curl {Program.CurlOptions} {dockerPackageUri} -o /tmp/docker.deb").EnsureSuccess();

        // [gdebi] installs the package along with any dependencies.

        node.Status = "update: docker";
        node.SudoCommand("gdebi /tmp/docker.deb").EnsureSuccess();

        // Best-effort cleanup: no [EnsureSuccess()] because a leftover
        // temporary file isn't worth failing the update over.

        node.SudoCommand("rm /tmp/docker.deb");

        node.Status = "restart: docker";
        node.SudoCommand("systemctl start docker").EnsureSuccess();

        if (node.Metadata.InSwarm)
        {
            // Make the node eligible for swarm workloads again.

            node.Status = "swarm: activate";
            hive.Docker.ActivateNode(node.Name);
        }
    }
    catch (Exception e)
    {
        // Record the failure on the node so setup reporting can surface it.

        node.Fault($"[docker] update failed: {NeonHelper.ExceptionError(e)}");
    }
}
/// <summary>
/// Verifies Consul health.
/// </summary>
/// <param name="node">The manager node.</param>
/// <param name="hiveDefinition">The hive definition.</param>
/// <exception cref="NotImplementedException">
/// Thrown for service managers other than systemd.
/// </exception>
private static void CheckConsul(SshProxy<NodeDefinition> node, HiveDefinition hiveDefinition)
{
    node.Status = "checking: consul";

    // Verify that the daemon is running.

    switch (Program.ServiceManager)
    {
        case ServiceManager.Systemd:

            {
                var output = node.SudoCommand("systemctl status consul", RunOptions.LogOutput).OutputText;

                // systemd reports this exact phrase for a healthy active unit.

                if (!output.Contains("Active: active (running)"))
                {
                    // BUGFIX: message previously read "Consul deamon is not running."
                    // (misspelled) and used a needless [$] interpolation prefix.

                    node.Fault("Consul daemon is not running.");
                    return;
                }
            }
            break;

        default:

            throw new NotImplementedException();
    }
}
/// <summary>
/// Verify that the XenServer is ready to provision the hive virtual machines.
/// </summary>
/// <param name="xenSshProxy">The XenServer SSH proxy.</param>
private void VerifyReady(SshProxy<XenClient> xenSshProxy)
{
    // $todo(jeff.lill):
    //
    // It would be nice to verify that XenServer actually has enough
    // resources (RAM, DISK, and perhaps CPU) here as well.

    var host = xenSshProxy.Metadata;

    xenSshProxy.Status = "check virtual machines";

    // Collect the names of the virtual machines already present on the
    // host (case-insensitively) so we can detect collisions below.

    var existingVmNames = new HashSet<string>(StringComparer.InvariantCultureIgnoreCase);

    foreach (var machine in host.Machine.List())
    {
        existingVmNames.Add(machine.NameLabel);
    }

    // Fault if any node we'd provision would collide with an existing VM.

    foreach (var hostedNode in GetHostedNodes(host))
    {
        var name = GetVmName(hostedNode);

        if (existingVmNames.Contains(name))
        {
            xenSshProxy.Fault($"XenServer [{host.Name}] already hosts a virtual machine named [{name}].");
            return;
        }
    }
}
/// <summary>
/// Verifies Vault health for a node.
/// </summary>
/// <param name="node">The node.</param>
/// <param name="hiveDefinition">The hive definition.</param>
/// <remarks>
/// <note>
/// This check is currently stubbed out: the method returns immediately and
/// everything below the <c>return</c> is intentionally unreachable, kept as
/// the sketch for the eventual implementation (see the $todo).
/// </note>
/// </remarks>
private static void CheckVault(SshProxy<NodeDefinition> node, HiveDefinition hiveDefinition)
{
    // $todo(jeff.lill): Implement this.

    return;

    node.Status = "checking: vault";

    // This is a minimal health test that just verifies that Vault
    // is listening for requests.  We're going to ping the local
    // Vault instance at [/v1/sys/health].
    //
    // Note that this should return a 500 status code with some
    // JSON content.  The reason for this is because we have not
    // yet initialized and unsealed the vault.

    var targetUrl = $"https://{node.Metadata.PrivateAddress}:{hiveDefinition.Vault.Port}/v1/sys/health?standbycode=200";

    using (var client = new HttpClient())
    {
        try
        {
            // NOTE(review): this blocks on [.Result]; fine for a setup tool
            // but would deadlock under a synchronization context.

            var response = client.GetAsync(targetUrl).Result;

            // OK is accepted via [standbycode=200]; InternalServerError is
            // expected for a sealed/uninitialized vault (see note above).

            if (response.StatusCode != HttpStatusCode.OK &&
                response.StatusCode != HttpStatusCode.InternalServerError)
            {
                node.Fault($"Vault: Unexpected HTTP response status [{(int) response.StatusCode}={response.StatusCode}]");
                return;
            }

            if (!response.Content.Headers.ContentType.MediaType.Equals("application/json", StringComparison.OrdinalIgnoreCase))
            {
                node.Fault($"Vault: Unexpected content type [{response.Content.Headers.ContentType.MediaType}]");
                return;
            }
        }
        catch (Exception e)
        {
            node.Fault($"Vault: {NeonHelper.ExceptionError(e)}");
        }
    }
}
/// <summary>
/// Verifies that the node has the correct operating system installed.
/// </summary>
/// <param name="node">The target cluster node.</param>
/// <param name="stepDelay">Ignored.</param>
public static void VerifyOS(SshProxy<NodeDefinition> node, TimeSpan stepDelay)
{
    node.Status = "check: OS";

    // $todo(jeff.lill): We're currently hardcoded to Ubuntu 18.04.x

    var isUbuntu = node.OsName.Equals("Ubuntu", StringComparison.InvariantCultureIgnoreCase);

    if (isUbuntu && node.OsVersion >= Version.Parse("18.04"))
    {
        return;     // Distribution and version both check out.
    }

    node.Fault("Expected: Ubuntu 18.04.x");
}
/// <summary>
/// Verifies Docker health.
/// </summary>
/// <param name="node">The target hive node.</param>
/// <param name="hiveDefinition">The hive definition.</param>
private static void CheckDocker(SshProxy<NodeDefinition> node, HiveDefinition hiveDefinition)
{
    node.Status = "checking: docker";

    // This is a super simple ping to verify that Docker appears to be running.

    var pingResponse = node.SudoCommand("docker info");

    if (pingResponse.ExitCode == 0)
    {
        return;     // Docker answered; nothing more to check.
    }

    node.Fault($"Docker: {pingResponse.AllText}");
}
/// <summary>
/// Verifies that the node has the correct operating system installed.
/// </summary>
/// <param name="node">The target hive node.</param>
/// <exception cref="NotImplementedException">
/// Thrown for target operating systems other than Ubuntu 16.04.
/// </exception>
public static void VerifyOS(SshProxy<NodeDefinition> node)
{
    node.Status = "check: OS";

    var lsbResponse = node.SudoCommand("lsb_release -a");

    if (Program.OSProperties.TargetOS == TargetOS.Ubuntu_16_04)
    {
        // [lsb_release] output includes the distribution description line.

        if (!lsbResponse.OutputText.Contains("Ubuntu 16.04"))
        {
            node.Fault("Expected [Ubuntu 16.04].");
        }
    }
    else
    {
        throw new NotImplementedException($"Support for [{nameof(TargetOS)}.{Program.OSProperties.TargetOS}] is not implemented.");
    }
}
/// <summary>
/// Provision the virtual machines on the XenServer.
/// </summary>
/// <param name="xenSshProxy">The XenServer SSH proxy.</param>
/// <remarks>
/// For each node hosted on this XenServer: creates the VM from the template,
/// starts it, waits for DHCP to assign an address, then connects via SSH to
/// write a static network configuration, resize the root partition, and reboot.
/// </remarks>
private void ProvisionVirtualMachines(SshProxy<XenClient> xenSshProxy)
{
    var xenHost = xenSshProxy.Metadata;

    foreach (var node in GetHostedNodes(xenHost))
    {
        var vmName      = GetVmName(node);
        var processors  = node.Metadata.GetVmProcessors(hive.Definition);
        var memoryBytes = node.Metadata.GetVmMemory(hive.Definition);
        var diskBytes   = node.Metadata.GetVmDisk(hive.Definition);

        xenSshProxy.Status = FormatVmStatus(vmName, "create virtual machine");

        // We need to create a raw drive if the node hosts a Ceph OSD.

        var extraDrives = new List<XenVirtualDrive>();

        if (node.Metadata.Labels.CephOSD)
        {
            extraDrives.Add(
                new XenVirtualDrive()
                {
                    Size = node.Metadata.GetCephOSDDriveSize(hive.Definition)
                });
        }

        var vm = xenHost.Machine.Create(vmName, hive.Definition.Hosting.XenServer.TemplateName,
            processors: processors,
            memoryBytes: memoryBytes,
            diskBytes: diskBytes,
            snapshot: hive.Definition.Hosting.XenServer.Snapshot,
            extraDrives: extraDrives,
            primaryStorageRepository: hive.Definition.Hosting.XenServer.StorageRepository,
            extraStorageRespository: hive.Definition.Hosting.XenServer.OsdStorageRepository);

        xenSshProxy.Status = FormatVmStatus(vmName, "start virtual machine");

        xenHost.Machine.Start(vm);

        // We need to wait for the virtual machine to start and obtain
        // and IP address via DHCP.

        var address = string.Empty;

        xenSshProxy.Status = FormatVmStatus(vmName, "fetch ip address");

        try
        {
            // Poll XenServer until the VM reports an address (or time out
            // after two minutes).  The inner loop polls once per second.

            NeonHelper.WaitFor(
                () =>
                {
                    while (true)
                    {
                        vm = xenHost.Machine.Find(vmName);

                        if (!string.IsNullOrEmpty(vm.Address))
                        {
                            address = vm.Address;
                            return(true);
                        }

                        Thread.Sleep(1000);
                    }
                },
                TimeSpan.FromSeconds(120));
        }
        catch (TimeoutException)
        {
            xenSshProxy.Fault("Timeout waiting for virtual machine to start and set an IP address.");
        }

        // SSH into the VM using the DHCP address, configure the static IP
        // address and extend the primary partition and file system to fill
        // the drive and then reboot.

        var subnet    = NetworkCidr.Parse(hive.Definition.Network.PremiseSubnet);
        var gateway   = hive.Definition.Network.Gateway;
        var broadcast = hive.Definition.Network.Broadcast;

        // We're going to temporarily set the node to the current VM address
        // so we can connect via SSH.

        var savedNodeAddress = node.PrivateAddress;

        try
        {
            node.PrivateAddress = IPAddress.Parse(address);

            using (var nodeProxy = hive.GetNode(node.Name))
            {
                xenSshProxy.Status = FormatVmStatus(vmName, "connect");

                nodeProxy.WaitForBoot();

                // Replace the [/etc/network/interfaces] file to configure the static
                // IP and then reboot to reinitialize networking subsystem.

                var primaryInterface = node.GetNetworkInterface(node.PrivateAddress);

                xenSshProxy.Status = FormatVmStatus(vmName, $"set static ip [{node.PrivateAddress}]");

                // Note that the static address written here is the node's
                // definition address saved above, not the temporary DHCP one.

                var interfacesText =
$@"# This file describes the network interfaces available on your system
# and how to activate them. For more information, see interfaces(5).

source /etc/network/interfaces.d/*

# The loopback network interface
auto lo
iface lo inet loopback

# The primary network interface
auto {primaryInterface}
iface {primaryInterface} inet static
address {savedNodeAddress}
netmask {subnet.Mask}
gateway {gateway}
broadcast {broadcast}
";
                nodeProxy.UploadText("/etc/network/interfaces", interfacesText);

                // Temporarily configure the public Google DNS servers as
                // the name servers so DNS will work after we reboot with
                // the static IP.  Note that hive setup will eventually
                // configure the name servers specified in the hive
                // definition.

                // $todo(jeff.lill):
                //
                // Is there a good reason why we're not just configuring the
                // DNS servers from the hive definition here???
                //
                // Using the Google DNS seems like it could break some hive
                // network configurations (e.g. for hives that don't have
                // access to the public Internet).  Totally private hives
                // aren't really a supported scenario right now though because
                // we assume we can use [apt-get]... to pull down packages.

                var resolvBaseText =
$@"nameserver 8.8.8.8
nameserver 8.8.4.4
";
                nodeProxy.UploadText("/etc/resolvconf/resolv.conf.d/base", resolvBaseText);

                // Extend the primary partition and file system to fill
                // the virtual the drive.

                xenSshProxy.Status = FormatVmStatus(vmName, $"resize primary partition");

                // $hack(jeff.lill):
                //
                // I've seen a transient error here but can't reproduce it.  I'm going
                // to assume for now that the file system might not be quite ready for
                // this operation directly after the VM has been rebooted, so we're going
                // to delay for a few seconds before performing the operations.

                Thread.Sleep(TimeSpan.FromSeconds(5));

                nodeProxy.SudoCommand("growpart /dev/xvda 1");
                nodeProxy.SudoCommand("resize2fs /dev/xvda1");

                // Reboot to pick up the changes.

                xenSshProxy.Status = FormatVmStatus(vmName, "reboot");

                nodeProxy.Reboot(wait: false);
            }
        }
        finally
        {
            // Restore the node's IP address.

            node.PrivateAddress = savedNodeAddress;
        }
    }
}
/// <summary>
/// Deploys RabbitMQ to a cluster node as a container.
/// </summary>
/// <param name="node">The target hive node.</param>
/// <remarks>
/// Starts the [neon-hivemq] container on nodes labeled with [HiveMQ] and
/// then waits up to four minutes for RabbitMQ to report itself healthy.
/// </remarks>
private void DeployHiveMQ(SshProxy<NodeDefinition> node)
{
    // Deploy RabbitMQ only on the labeled nodes.

    if (!node.Metadata.Labels.HiveMQ)
    {
        return;
    }

    // Build a comma separated list of fully qualified RabbitMQ hostnames so we
    // can pass them as the CLUSTER environment variable.

    var rabbitNodes = hive.Definition.SortedNodes.Where(n => n.Labels.HiveMQ).ToList();
    var sbCluster   = new StringBuilder();

    foreach (var rabbitNode in rabbitNodes)
    {
        sbCluster.AppendWithSeparator($"{rabbitNode.Name}@{rabbitNode.Name}.{hive.Definition.Hostnames.HiveMQ}", ",");
    }

    var hipeCompileArgs = new List<string>();

    if (hive.Definition.HiveMQ.Precompile)
    {
        hipeCompileArgs.Add("--env");
        hipeCompileArgs.Add("RABBITMQ_HIPE_COMPILE=1");
    }

    var managementPluginArgs = new List<string>();

    if (node.Metadata.Labels.HiveMQManager)
    {
        // BUGFIX: these were previously added to [hipeCompileArgs], leaving
        // [managementPluginArgs] always empty.  The final command happened to
        // come out the same only because the two lists are passed adjacently.

        managementPluginArgs.Add("--env");
        managementPluginArgs.Add("MANAGEMENT_PLUGIN=true");
    }

    // $todo(jeff.lill):
    //
    // I was unable to get TLS working correctly for RabbitMQ.  I'll come back
    // and revisit this later:
    //
    //      https://github.com/jefflill/NeonForge/issues/319

    ServiceHelper.StartContainer(node, "neon-hivemq", hive.Definition.Image.HiveMQ, RunOptions.FaultOnError,
        new CommandBundle(
            "docker run",
            "--detach",
            "--name", "neon-hivemq",
            "--env", $"CLUSTER_NAME={hive.Definition.Name}",
            "--env", $"CLUSTER_NODES={sbCluster}",
            "--env", $"CLUSTER_PARTITION_MODE=autoheal",
            "--env", $"NODENAME={node.Name}@{node.Name}.{hive.Definition.Hostnames.HiveMQ}",
            "--env", $"RABBITMQ_USE_LONGNAME=true",
            "--env", $"RABBITMQ_DEFAULT_USER=sysadmin",
            "--env", $"RABBITMQ_DEFAULT_PASS=password",
            "--env", $"RABBITMQ_NODE_PORT={HiveHostPorts.HiveMQAMQP}",
            "--env", $"RABBITMQ_DIST_PORT={HiveHostPorts.HiveMQDIST}",
            "--env", $"RABBITMQ_MANAGEMENT_PORT={HiveHostPorts.HiveMQManagement}",
            "--env", $"RABBITMQ_ERLANG_COOKIE={hive.Definition.HiveMQ.ErlangCookie}",
            "--env", $"RABBITMQ_VM_MEMORY_HIGH_WATERMARK={hive.Definition.HiveMQ.RamHighWatermark}",
            hipeCompileArgs,
            managementPluginArgs,
            "--env", $"RABBITMQ_DISK_FREE_LIMIT={HiveDefinition.ValidateSize(hive.Definition.HiveMQ.DiskFreeLimit, typeof(HiveMQOptions), nameof(hive.Definition.HiveMQ.DiskFreeLimit))}",
            //"--env", $"RABBITMQ_SSL_CERTFILE=/etc/neon/certs/hive.crt",
            //"--env", $"RABBITMQ_SSL_KEYFILE=/etc/neon/certs/hive.key",
            "--env", $"ERL_EPMD_PORT={HiveHostPorts.HiveMQEPMD}",
            "--mount", "type=volume,source=neon-hivemq,target=/var/lib/rabbitmq",
            "--mount", "type=bind,source=/etc/neon/certs,target=/etc/neon/certs,readonly",
            "--publish", $"{HiveHostPorts.HiveMQEPMD}:{HiveHostPorts.HiveMQEPMD}",
            "--publish", $"{HiveHostPorts.HiveMQAMQP}:{HiveHostPorts.HiveMQAMQP}",
            "--publish", $"{HiveHostPorts.HiveMQDIST}:{HiveHostPorts.HiveMQDIST}",
            "--publish", $"{HiveHostPorts.HiveMQManagement}:{HiveHostPorts.HiveMQManagement}",
            "--memory", HiveDefinition.ValidateSize(hive.Definition.HiveMQ.RamLimit, typeof(HiveMQOptions), nameof(hive.Definition.HiveMQ.RamLimit)),
            "--restart", "always",
            ServiceHelper.ImagePlaceholderArg));

    // Wait for the RabbitMQ node to report that it's ready.

    var timeout  = TimeSpan.FromMinutes(4);
    var pollTime = TimeSpan.FromSeconds(2);

    node.Status = "hivemq: waiting";

    try
    {
        NeonHelper.WaitFor(
            () =>
            {
                // [node_health_check] returns a zero exit code once the broker is up;
                // we clear FaultOnError so transient failures just poll again.

                var readyResponse = node.SudoCommand($"docker exec neon-hivemq rabbitmqctl node_health_check -n {node.Name}@{node.Name}.{hive.Definition.Hostnames.HiveMQ}", node.DefaultRunOptions & ~RunOptions.FaultOnError);

                return readyResponse.ExitCode == 0;
            },
            timeout: timeout,
            pollTime: pollTime);
    }
    catch (TimeoutException)
    {
        node.Fault($"RabbitMQ not ready after waiting [{timeout}].");
        return;
    }

    node.Status = "hivemq: ready";
}
/// <summary>
/// Verifies a master node's NTP health.
/// </summary>
/// <param name="node">The master node.</param>
/// <param name="clusterDefinition">The cluster definition.</param>
private static void CheckMasterNtp(SshProxy<NodeDefinition> node, ClusterDefinition clusterDefinition)
{
    // We're going to use [ntpq -pw] to query the configured time sources.
    // We should get something back that looks like
    //
    //           remote           refid      st t when poll reach   delay   offset  jitter
    //           ==============================================================================
    //            LOCAL(0).LOCL.          10 l   45m   64    0    0.000    0.000   0.000
    //           * clock.xmission. .GPS.            1 u  134  256  377   48.939 - 0.549  18.357
    //           + 173.44.32.10    18.26.4.105      2 u  200  256  377   96.981 - 0.623   3.284
    //           + pacific.latt.ne 44.24.199.34     3 u  243  256  377   41.457 - 8.929   8.497
    //
    // For master nodes, we're simply going to verify that we have at least one external
    // time source answering.

    node.Status = "check: NTP";

    var retryDelay = TimeSpan.FromSeconds(30);
    var fault      = (string)null;

    for (int tryCount = 0; tryCount < 6; tryCount++)
    {
        var response = node.SudoCommand("/usr/bin/ntpq -pw", RunOptions.LogOutput);

        if (response.ExitCode != 0)
        {
            // BUGFIX: previously no fault was recorded on this path, so six
            // consecutive command failures made the check pass silently.

            fault = $"NTP: [ntpq -pw] failed with [exitcode={response.ExitCode}].";
            Thread.Sleep(retryDelay);
            continue;
        }

        using (var reader = response.OpenOutputTextReader())
        {
            string line;

            // Column header and table bar lines.

            line = reader.ReadLine();

            if (string.IsNullOrWhiteSpace(line))
            {
                fault = "NTP: Invalid [ntpq -pw] response.";
                Thread.Sleep(retryDelay);
                continue;
            }

            line = reader.ReadLine();

            if (string.IsNullOrWhiteSpace(line) || line[0] != '=')
            {
                fault = "NTP: Invalid [ntpq -pw] response.";
                Thread.Sleep(retryDelay);
                continue;
            }

            // Count the lines that don't include [*.LOCL.*], the local clock.

            var sourceCount = 0;

            for (line = reader.ReadLine(); line != null; line = reader.ReadLine())
            {
                if (line.Length > 0 && !line.Contains(".LOCL."))
                {
                    sourceCount++;
                }
            }

            if (sourceCount == 0)
            {
                fault = "NTP: No external sources are answering.";
                Thread.Sleep(retryDelay);
                continue;
            }

            // Everything looks good.

            // BUGFIX: clear any fault recorded by an earlier failed attempt so
            // a successful retry no longer reports a stale failure.

            fault = null;
            break;
        }
    }

    if (fault != null)
    {
        node.Fault(fault);
    }
}
/// <summary>
/// Verifies a worker node's NTP health.
/// </summary>
/// <param name="node">The worker node.</param>
/// <param name="clusterDefinition">The cluster definition.</param>
private static void CheckWorkerNtp(SshProxy<NodeDefinition> node, ClusterDefinition clusterDefinition)
{
    // We're going to use [ntpq -pw] to query the configured time sources.
    // We should get something back that looks like
    //
    //           remote           refid      st t when poll reach   delay   offset  jitter
    //           ==============================================================================
    //            LOCAL(0).LOCL.          10 l   45m   64    0    0.000    0.000   0.000
    //           * 10.0.1.5        198.60.22.240    2 u  111  128  377    0.062    3.409   0.608
    //           + 10.0.1.7        198.60.22.240    2 u  111  128  377    0.062    3.409   0.608
    //           + 10.0.1.7        198.60.22.240    2 u  111  128  377    0.062    3.409   0.608
    //
    // For worker nodes, we need to verify that each of the masters are answering
    // by confirming that their IP addresses are present.

    node.Status = "check: NTP";

    var retryDelay = TimeSpan.FromSeconds(30);
    var fault      = (string)null;
    var firstTry   = true;

tryAgain:

    for (var tries = 0; tries < 6; tries++)
    {
        // BUGFIX: the original [continue]/[break] statements targeted the
        // [foreach] over the masters rather than this retry loop, so only
        // the first master was ever effectively checked and the loop never
        // exited early on success.  Each attempt now checks ALL masters and
        // the retry loop exits as soon as every master is answering.

        var output = node.SudoCommand("/usr/bin/ntpq -pw", RunOptions.LogOutput).OutputText;

        fault = null;

        foreach (var master in clusterDefinition.SortedMasters)
        {
            // We're going to check the for presence of the master's IP address
            // or its name, the latter because [ntpq] appears to attempt a reverse
            // IP address lookup which will resolve into one of the DNS names defined
            // in the local [/etc/hosts] file.

            if (!output.Contains(master.PrivateAddress.ToString()) &&
                !output.Contains(master.Name.ToLower()))
            {
                fault = $"NTP: Manager [{master.Name}/{master.PrivateAddress}] is not answering.";
                break;      // No point checking the remaining masters this attempt.
            }
        }

        if (fault == null)
        {
            break;          // Every master answered; we're done.
        }

        Thread.Sleep(retryDelay);
    }

    if (fault != null)
    {
        if (firstTry)
        {
            // $hack(jeff.lill):
            //
            // I've seen the NTP check fail on worker nodes, complaining
            // that the connection attempt was rejected.  I manually restarted
            // the node and then it worked.  I'm not sure if the rejected connection
            // was being made to the local NTP service or from the local service
            // to NTP running on the master.
            //
            // I'm going to assume that it was to the local NTP service and I'm
            // going to try mitigating this by restarting the local NTP service
            // and then re-running the tests.  I'm only going to do this once.

            node.SudoCommand("systemctl restart ntp", node.DefaultRunOptions & ~RunOptions.FaultOnError);

            firstTry = false;
            goto tryAgain;
        }

        node.Fault(fault);
    }
}
/// <summary>
/// Provision the virtual machines on the XenServer.
/// </summary>
/// <param name="xenSshProxy">The XenServer SSH proxy.</param>
/// <remarks>
/// For each node hosted on this XenServer: creates the VM from the template,
/// starts it, waits for DHCP to assign an address, then connects via SSH to
/// configure the static network stack, resize the root partition, and reboot.
/// </remarks>
private void ProvisionVirtualMachines(SshProxy<XenClient> xenSshProxy)
{
    var xenHost = xenSshProxy.Metadata;

    foreach (var node in GetHostedNodes(xenHost))
    {
        var vmName      = GetVmName(node);
        var processors  = node.Metadata.GetVmProcessors(cluster.Definition);
        var memoryBytes = node.Metadata.GetVmMemory(cluster.Definition);
        var diskBytes   = node.Metadata.GetVmDisk(cluster.Definition);

        xenSshProxy.Status = FormatVmStatus(vmName, "create: virtual machine");

        // We need to create a raw drive if the node hosts a Ceph OSD.

        var extraDrives = new List<XenVirtualDrive>();

        if (node.Metadata.Labels.CephOSD)
        {
            extraDrives.Add(
                new XenVirtualDrive()
                {
                    Size = node.Metadata.GetCephOSDDriveSize(cluster.Definition)
                });
        }

        var vm = xenHost.Machine.Create(vmName, cluster.Definition.Hosting.XenServer.TemplateName,
            processors: processors,
            memoryBytes: memoryBytes,
            diskBytes: diskBytes,
            snapshot: cluster.Definition.Hosting.XenServer.Snapshot,
            extraDrives: extraDrives,
            primaryStorageRepository: cluster.Definition.Hosting.XenServer.StorageRepository,
            extraStorageRespository: cluster.Definition.Hosting.XenServer.OsdStorageRepository);

        xenSshProxy.Status = FormatVmStatus(vmName, "start: virtual machine");

        xenHost.Machine.Start(vm);

        // We need to wait for the virtual machine to start and obtain
        // and IP address via DHCP.

        var address = string.Empty;

        xenSshProxy.Status = FormatVmStatus(vmName, "discover: ip address");

        try
        {
            // Poll XenServer until the VM reports an address (or time out
            // after three minutes).  The inner loop polls once per second.

            NeonHelper.WaitFor(
                () =>
                {
                    while (true)
                    {
                        vm = xenHost.Machine.Find(vmName);

                        if (!string.IsNullOrEmpty(vm.Address))
                        {
                            address = vm.Address;
                            return(true);
                        }

                        Thread.Sleep(1000);
                    }
                },
                TimeSpan.FromMinutes(3));
        }
        catch (TimeoutException)
        {
            xenSshProxy.Fault("Timeout waiting for virtual machine to start and set an IP address.");
        }

        // SSH into the VM using the DHCP address, configure the static IP
        // address and extend the primary partition and file system to fill
        // the drive and then reboot.

        var subnet = NetworkCidr.Parse(cluster.Definition.Network.PremiseSubnet);

        // We're going to temporarily set the node to the current VM address
        // so we can connect via SSH.

        var nodePrivateAddress = node.PrivateAddress;

        try
        {
            node.PrivateAddress = IPAddress.Parse(address);

            using (var nodeProxy = cluster.GetNode(node.Name))
            {
                xenSshProxy.Status = FormatVmStatus(vmName, "connect");

                nodeProxy.WaitForBoot();

                // Configure the node's network stack to the static IP address
                // and upstream nameservers.  Note that the static address is the
                // node's definition address saved above, not the temporary DHCP one.

                node.Status = $"network config [IP={nodePrivateAddress}]";

                var primaryInterface = node.GetNetworkInterface(node.PrivateAddress);

                node.ConfigureNetwork(
                    networkInterface: primaryInterface,
                    address: nodePrivateAddress,
                    gateway: IPAddress.Parse(cluster.Definition.Network.Gateway),
                    subnet: NetworkCidr.Parse(cluster.Definition.Network.PremiseSubnet),
                    nameservers: cluster.Definition.Network.Nameservers.Select(ns => IPAddress.Parse(ns)));

                // Extend the primary partition and file system to fill
                // the virtual the drive.

                xenSshProxy.Status = FormatVmStatus(vmName, $"resize: primary drive");

                // $hack(jeff.lill):
                //
                // I've seen a transient error here but can't reproduce it.  I'm going
                // to assume for now that the file system might not be quite ready for
                // this operation directly after the VM has been rebooted, so we're going
                // to delay for a few seconds before performing the operations.

                Thread.Sleep(TimeSpan.FromSeconds(5));

                nodeProxy.SudoCommand("growpart /dev/xvda 2");
                nodeProxy.SudoCommand("resize2fs /dev/xvda2");

                // Reboot to pick up the changes.

                xenSshProxy.Status = FormatVmStatus(vmName, "restarting...");

                nodeProxy.Reboot(wait: false);
            }
        }
        finally
        {
            // Restore the node's IP address.

            node.PrivateAddress = nodePrivateAddress;
        }
    }
}