/// <summary>
/// Runs the health checks for a cluster worker node and then updates the
/// node's status.
/// </summary>
/// <param name="node">The worker node to check.</param>
/// <param name="clusterDefinition">The cluster definition.</param>
public static void CheckWorker(NodeSshProxy<NodeDefinition> node, ClusterDefinition clusterDefinition)
{
    // Argument validation: [node] must be checked for null before its
    // metadata is inspected, so the order of these contracts matters.

    Covenant.Requires<ArgumentNullException>(node != null, nameof(node));
    Covenant.Requires<ArgumentException>(node.Metadata.IsWorker, nameof(node));
    Covenant.Requires<ArgumentNullException>(clusterDefinition != null, nameof(clusterDefinition));

    // The NTP check is skipped for nodes that are already faulted.

    var runNtpCheck = !node.IsFaulted;

    if (runNtpCheck)
    {
        CheckWorkerNtp(node, clusterDefinition);
    }

    // NOTE(review): the status is set to "healthy" unconditionally, including
    // when the node was faulted before or during the NTP check.  Confirm that
    // this is intended (e.g. that a fault takes precedence when the status is
    // displayed).

    node.Status = "healthy";
}
/// <summary>
/// Verifies a control-plane node's NTP health by querying the node's
/// configured time sources and faulting the node when the check does
/// not pass after several attempts.
/// </summary>
/// <param name="node">The control-plane node.</param>
/// <param name="clusterDefinition">The cluster definition.</param>
private static void CheckControlNodeNtp(NodeSshProxy<NodeDefinition> node, ClusterDefinition clusterDefinition)
{
    // We're going to use [ntpq -pw] to query the configured time sources.
    // We should get something back that looks like
    //
    //      remote           refid      st t when poll reach   delay   offset  jitter
    // ==============================================================================
    //  LOCAL(0)        .LOCL.          10 l  45m   64    0    0.000    0.000   0.000
    // *clock.xmission. .GPS.            1 u  134  256  377   48.939   -0.549  18.357
    // +173.44.32.10    18.26.4.105      2 u  200  256  377   96.981   -0.623   3.284
    // +pacific.latt.ne 44.24.199.34     3 u  243  256  377   41.457   -8.929   8.497
    //
    // For control-plane nodes, we're simply going to verify that at least one
    // source other than the local clock is listed.

    const int    maxTries           = 6;
    const string invalidResponseMsg = "NTP: Invalid [ntpq -pw] response.";

    node.Status = "check: NTP";

    var retryDelay = TimeSpan.FromSeconds(30);
    var fault      = (string)null;

    for (int tryCount = 0; tryCount < maxTries; tryCount++)
    {
        // Wait between attempts, but not before the first one and not
        // after the last one.

        if (tryCount > 0)
        {
            Thread.Sleep(retryDelay);
        }

        // Each attempt starts clean so that a failure from an earlier attempt
        // doesn't fault the node after a later attempt succeeds.

        fault = null;

        var response = node.SudoCommand("/usr/bin/ntpq -pw", RunOptions.LogOutput);

        if (response.ExitCode != 0)
        {
            // Record the failure so the node is faulted if the command
            // never succeeds, rather than silently passing the check.

            fault = $"NTP: [ntpq -pw] failed with exit code [{response.ExitCode}].";
            continue;
        }

        using (var reader = response.OpenOutputTextReader())
        {
            // The first two lines are the column header and the table bar.

            var line = reader.ReadLine();

            if (string.IsNullOrWhiteSpace(line))
            {
                fault = invalidResponseMsg;
                continue;
            }

            line = reader.ReadLine();

            if (string.IsNullOrWhiteSpace(line) || line[0] != '=')
            {
                fault = invalidResponseMsg;
                continue;
            }

            // Count the remaining non-empty lines that don't include [.LOCL.],
            // the local clock.

            var sourceCount = 0;

            for (line = reader.ReadLine(); line != null; line = reader.ReadLine())
            {
                if (line.Length > 0 && !line.Contains(".LOCL."))
                {
                    sourceCount++;
                }
            }

            if (sourceCount == 0)
            {
                fault = "NTP: No external sources are answering.";
                continue;
            }
        }

        // Everything looks good.

        break;
    }

    if (fault != null)
    {
        node.Fault(fault);
    }
}
/// <summary>
/// Verifies a worker node's NTP health by confirming that every control-plane
/// node appears in the worker's list of time sources.  The node is faulted
/// when the check still fails after retrying and after restarting the local
/// NTP service once.
/// </summary>
/// <param name="node">The worker node.</param>
/// <param name="clusterDefinition">The cluster definition.</param>
private static void CheckWorkerNtp(NodeSshProxy<NodeDefinition> node, ClusterDefinition clusterDefinition)
{
    // We're going to use [ntpq -pw] to query the configured time sources.
    // We should get something back that looks like
    //
    //      remote           refid      st t when poll reach   delay   offset  jitter
    // ==============================================================================
    //  LOCAL(0)        .LOCL.          10 l  45m   64    0    0.000    0.000   0.000
    // *10.0.1.5        198.60.22.240    2 u  111  128  377    0.062    3.409   0.608
    // +10.0.1.7        198.60.22.240    2 u  111  128  377    0.062    3.409   0.608
    // +10.0.1.7        198.60.22.240    2 u  111  128  377    0.062    3.409   0.608
    //
    // For worker nodes, we need to verify that each of the control-plane nodes
    // is answering by confirming that their IP addresses (or names) are present.

    const int maxPasses = 2;    // the initial pass plus one pass after restarting NTP
    const int maxTries  = 6;    // attempts per pass

    node.Status = "check: NTP";

    var retryDelay = TimeSpan.FromSeconds(30);
    var fault      = (string)null;

    for (var pass = 0; pass < maxPasses; pass++)
    {
        if (pass > 0)
        {
            // $hack(jefflill):
            //
            // I've seen the NTP check fail on worker nodes, complaining
            // that the connection attempt was rejected.  I manually restarted
            // the node and then it worked.  I'm not sure if the rejected connection
            // was being made to the local NTP service or from the local service
            // to NTP running on the control-plane.
            //
            // I'm going to assume that it was to the local NTP service and I'm
            // going to try mitigating this by restarting the local NTP service
            // and then re-running the tests.  I'm only going to do this once.

            node.SudoCommand("systemctl restart ntp", node.DefaultRunOptions & ~RunOptions.FaultOnError);
        }

        for (var tries = 0; tries < maxTries; tries++)
        {
            // Wait between attempts within a pass, but not before the first one.

            if (tries > 0)
            {
                Thread.Sleep(retryDelay);
            }

            var output = node.SudoCommand("/usr/bin/ntpq -pw", RunOptions.LogOutput).OutputText ?? string.Empty;

            // The fault is recomputed from scratch on every attempt so that an
            // earlier failure can't fault the node after a later success.

            fault = GetWorkerNtpFault(output, clusterDefinition);

            if (fault == null)
            {
                // All control-plane nodes are present: everything looks OK.

                return;
            }
        }
    }

    node.Fault(fault);
}

/// <summary>
/// Scans [ntpq -pw] output for every control-plane node and returns the fault
/// message for the first one that is missing, or <c>null</c> when all are present.
/// </summary>
/// <param name="output">The text returned by [ntpq -pw].</param>
/// <param name="clusterDefinition">The cluster definition.</param>
/// <returns>The fault message or <c>null</c>.</returns>
private static string GetWorkerNtpFault(string output, ClusterDefinition clusterDefinition)
{
    foreach (var controlNode in clusterDefinition.SortedControlNodes)
    {
        // We're going to check for the presence of the control-plane node's IP
        // address or its name, the latter because [ntpq] appears to attempt a
        // reverse IP address lookup which will resolve into one of the DNS names
        // defined in the local [/etc/hosts] file.
        //
        // Note that these are plain substring matches.  The name is lowercased
        // with the invariant culture so the result doesn't depend on the
        // current culture of the machine running the check.

        var addressPresent = output.Contains(controlNode.Address.ToString());
        var namePresent    = output.Contains(controlNode.Name.ToLowerInvariant());

        if (!addressPresent && !namePresent)
        {
            return $"NTP: Manager [{controlNode.Name}/{controlNode.Address}] is not answering.";
        }
    }

    return null;
}
/// <summary>
/// Queries the node for its machine capabilities (RAM, processor count, and
/// primary disk capacity) and records them as the corresponding labels on
/// the node's metadata.
/// </summary>
/// <param name="node">The target node.</param>
private void DetectLabels(NodeSshProxy<NodeDefinition> node)
{
    // RAM: read [/proc/meminfo] and extract the [MemTotal] value (in kB).

    var memResponse = node.SudoCommand("cat /proc/meminfo", RunOptions.FaultOnError);

    if (memResponse.ExitCode == 0)
    {
        var memTotalMatch = Regex.Match(memResponse.OutputText, @"^MemTotal:\s*(?<size>\d+)\s*kB", RegexOptions.Multiline);

        if (memTotalMatch.Success && long.TryParse(memTotalMatch.Groups["size"].Value, out var totalKiB))
        {
            // Note that the RAM reported by Linux is somewhat less than the
            // physical RAM installed.

            var totalMiB = totalKiB / 1024;     // Convert KiB --> MiB

            node.Metadata.Labels.ComputeRam = (int)totalMiB;
        }
    }

    // Processors: read [/proc/cpuinfo] and count the [processor : N] entries.

    var cpuResponse = node.SudoCommand("cat /proc/cpuinfo", RunOptions.FaultOnError);

    if (cpuResponse.ExitCode == 0)
    {
        node.Metadata.Labels.ComputeCores = Regex.Matches(cpuResponse.OutputText, @"^processor\s*:\s*\d+", RegexOptions.Multiline).Count;
    }

    // Primary disk size.

    // $hack(jefflill):
    //
    // I'm not entirely sure how to determine which block device is hosting
    // the primary file system for all systems.  For now, I'm just going to
    // assume that this can be one of:
    //
    //      /dev/sda1
    //      /dev/sda
    //      /dev/xvda1
    //      /dev/xvda
    //
    // I'll try each of these in order, setting the label from the
    // first reasonable result we get back.
    //
    // NOTE(review): these probes pass [RunOptions.FaultOnError] even though
    // some of the candidate devices are expected to be absent -- confirm that
    // a failed probe doesn't fault the node.

    var candidateDevices = new string[]
    {
        "/dev/sda1",
        "/dev/sda",
        "/dev/xvda1",
        "/dev/xvda"
    };

    foreach (var candidate in candidateDevices)
    {
        var sizeResponse = node.SudoCommand($"lsblk -b --output SIZE -n -d {candidate}", RunOptions.LogOutput | RunOptions.FaultOnError);

        if (sizeResponse.ExitCode != 0)
        {
            continue;
        }

        if (!long.TryParse(sizeResponse.OutputText.Trim(), out var sizeBytes) || sizeBytes <= 0)
        {
            continue;
        }

        node.Metadata.Labels.StorageSize = ByteUnits.ToGiB(sizeBytes);
        break;
    }
}