Ejemplo n.º 1
0
        /// <summary>
        /// Runs three shell commands on the remote host (CPU usage, memory usage,
        /// disk space) and shows their raw output in the status labels.
        /// </summary>
        /// <remarks>
        /// Does nothing when <c>_client</c> is not connected. The commands are
        /// executed one after another and the labels are only updated after all
        /// three have completed.
        /// </remarks>
        /// <returns>A task that completes once the labels have been updated.</returns>
        private async Task LoadServerStatus()
        {
            // Sum of awk fields $2, $4 and $6 of the last "Cpu(s)" line printed by top
            // (presumably the user, system and nice percentages - verify against the
            // top output format on the target host).
            // The fields must be joined with '+': in awk, "$4$6" is string concatenation,
            // which binds looser than '+', so "($2+$4$6)" printed "<sum of $2,$4><$6>".
            const string cpu = @"top -bn 2 -d 0.5 | grep ""Cpu(s)"" | tail -n 1 | awk '{print ($2+$4+$6)"" %""}'";

            // Third column of the "Mem: " row printed by free -m.
            const string mem = @"free -m | grep ""Mem: "" | awk '{print $3 "" MB""}'";

            // Fourth column of every df -k row that mentions "/dev/" (one line per match).
            const string hd  = @"df -k | grep ""/dev/"" | awk '{print $4 "" KB""}'";

            if (_client.IsConnected == false)
            {
                return;
            }

            SshCommand cpuCommand = await _client.RunCommandAsync(cpu);
            string     cpuUsage   = cpuCommand.Result;

            SshCommand memCommand = await _client.RunCommandAsync(mem);
            string     memUsage   = memCommand.Result;

            SshCommand diskCommand   = await _client.RunCommandAsync(hd);
            string     diskAvailable = diskCommand.Result;

            lblCpuUsage.Text      = cpuUsage;
            lblMemUsage.Text      = memUsage;
            lblDiskAvailable.Text = diskAvailable;
        }
        /// <summary>
        /// Runs the health monitor for this node until <paramref name="ct"/> is cancelled.
        /// </summary>
        /// <remarks>
        /// Each pass of the outer loop performs these steps:
        /// <list type="number">
        /// <item>Ping the node until a run of consecutive successful replies is seen.</item>
        /// <item>Open an SSH connection (a <see cref="SocketException"/> is logged and the pass restarts).</item>
        /// <item>Once per connection: compare the remote hostname, read <c>lscpu</c> and the node's Unix timestamp.</item>
        /// <item>Poll <c>/proc/stat</c>, <c>/proc/loadavg</c>, <c>/proc/uptime</c> and <c>vmstat</c> about every 10 seconds.</item>
        /// </list>
        /// A failing command in step 3 or 4 is logged, the SSH client is disposed and the
        /// pass restarts from step 1. All results are written to <c>isd</c> while holding
        /// <c>lock (this.isd)</c>. On exit the state is set to <c>MonitorState.Terminated</c>.
        /// </remarks>
        /// <param name="ct">Token that requests the monitor to stop.</param>
        /// <returns>A task that completes when the monitor has stopped.</returns>
        public async Task RunMonitorAsync(CancellationToken ct)
        {
            const int pingCount      = 5;         //todo config
            const int timeout        = 2000;      // todo config; per-ping timeout in ms
            const int pingInterval   = 1000;      // pause between ping attempts in ms
            const int commandTimeout = 5000;      // value passed to every RunCommandAsync call
            const int pollPeriod     = 10 * 1000; // target duration of one polling cycle in ms

            // Built once per monitor run; compiling these on every polling cycle is wasted work.
            Regex lscpuRegex = new Regex("^(?<key>[^:]+?)[:](?<value>.+)$",
                                         RegexOptions.Compiled | RegexOptions.Singleline);
            Regex procStatRegex = new Regex("^(?<key>[a-zA-Z0-9]+)[ ](?<value>.+)$",
                                            RegexOptions.Compiled | RegexOptions.Singleline);
            Regex loadavgRegex = new Regex(
                "^(?<l1>[0-9]+[.][0-9]+)\\s*(?<l5>[0-9]+[.][0-9]+)\\s*(?<l15>[0-9]+[.][0-9]+)\\s*(?<rp>[0-9]+)[/]*(?<trp>[0-9]+)\\s*(?<lastpid>[0-9]+)$",
                RegexOptions.Compiled | RegexOptions.Singleline);

            log.Info($"Starting health monitor for node {this.descriptor.IP}");

            while (!ct.IsCancellationRequested)
            {
                int successCounter = 0;

                lock (this.isd)
                {
                    this.isd.state = MonitorState.WaitingForPingResponses;
                }

                //
                // wait for N consecutive ping replies
                //
                log.Info(
                    $"Waiting for {pingCount} consecutive successful pings with timeout {timeout} and interval {pingInterval}");
                while (successCounter < pingCount && !ct.IsCancellationRequested)
                {
                    try
                    {
                        PingReply pingReply;

                        // Ping is IDisposable; release it after every attempt.
                        using (Ping pingService = new Ping())
                        {
                            pingReply = await pingService.SendPingAsync(this.IP, timeout);
                        }

                        if (pingReply.Status == IPStatus.Success)
                        {
                            log.Info(
                                $"Got response form {this.IP}; result={pingReply.Status}, RTT={pingReply.RoundtripTime} ms");
                            successCounter++;
                        }
                        else
                        {
                            log.Info($"No response form {this.IP}; result={pingReply.Status}");
                            successCounter = 0;
                        }
                    }
                    catch (SystemException ex)
                    {
                        log.Error(ex, $"Error while pinging node {this.descriptor.Hostname}");
                        successCounter = 0;
                    }

                    // Pause after every attempt, failed ones included: a ping that fails
                    // immediately would otherwise spin this loop without any delay.
                    // The delay lives outside the try above so that cancellation is not
                    // reported as a ping error.
                    try
                    {
                        await Task.Delay(pingInterval, ct);
                    }
                    catch (OperationCanceledException)
                    {
                        break;
                    }
                }

                if (ct.IsCancellationRequested)
                {
                    break;
                }

                //
                // ok, the node is responding to pings, now try to connect via SSH
                //

                //TODO: replace login/password authorization method with public keys; to be done at the final stages of cluster configuration process
                // NOTE(review): credentials are hard-coded in source; move them to configuration together with the TODO above.
                AuthenticationMethod[] auth = new AuthenticationMethod[]
                {
                    new PasswordAuthenticationMethod("testlogin", "testpassword"),
                };

                lock (this.isd)
                {
                    this.isd.state = MonitorState.EstablishingSSHConnection;
                }

                IPAddress ip = this.IP;
                log.Info($"Connecting node via SSH {this.IP}:22...");
                ConnectionInfo ci = new ConnectionInfo(ip.ToString(), 22, auth[0].Username, auth);

                using (SshClient ssh = new SshClient(ci))
                {
                    try
                    {
                        await Task.Run(() => ssh.Connect(), ct);

                        log.Info(
                            $"Connected. Host: {ssh.ConnectionInfo.Host}, server: {ssh.ConnectionInfo.ServerVersion}, username: {ssh.ConnectionInfo.Username}.");
                    }
                    catch (OperationCanceledException) when (ct.IsCancellationRequested)
                    {
                        // Task.Run refuses to start once the token is cancelled; treat that
                        // as a normal stop so the Terminated state below is still recorded.
                        break;
                    }
                    catch (SocketException ex)
                    {
                        log.Error(ex, "Error while connecting node via SSH");
                        continue;
                    }

                    // ===========================
                    // Now we are connected and can start monitoring the node in two stages:
                    // 1. Get information once per connection
                    // 2. Get information in a loop

                    //
                    // Test host name
                    try
                    {
                        log.Info("Running 'hostname'...");
                        string commandResponse = (await ssh.RunCommandAsync("hostname", commandTimeout)).Trim();
                        bool   hostnameOk      = commandResponse == this.descriptor.Hostname;
                        string msg             =
                            $"HOSTNAME: expected={this.descriptor.Hostname}; recieved={commandResponse}; correct={(hostnameOk ? "YES" : "NO")}";
                        if (hostnameOk)
                        {
                            log.Info(msg);
                        }
                        else
                        {
                            // A mismatch is only reported; monitoring continues regardless.
                            log.Warn(msg);
                        }
                    }
                    catch (Exception ex)
                    {
                        log.Error(ex, "Error while running command");
                        continue;
                    }

                    //
                    // Get CPU information: every "key: value" line of lscpu becomes one entry
                    try
                    {
                        log.Info("Running 'lscpu'...");
                        string commandResponse = await ssh.RunCommandAsync("lscpu", commandTimeout);

                        Dictionary<string, string> cpuinfo = commandResponse
                                                             .Split("\n", StringSplitOptions.RemoveEmptyEntries)
                                                             .Select(row => lscpuRegex.Match(row.Trim()))
                                                             .Where(match => match.Success)
                                                             .ToDictionary(
                            match => match.Groups["key"].Value.Trim(),
                            match => match.Groups["value"].Value.Trim());

                        lock (this.isd)
                        {
                            this.isd.cpuInfo = cpuinfo;
                        }

                        log.Trace($"Got {cpuinfo.Count} entries");
                    }
                    catch (Exception ex)
                    {
                        log.Error(ex, "Error while running command");
                        continue;
                    }

                    //
                    // Unix timestamp
                    try
                    {
                        log.Info("Running 'date +%s'...");
                        string commandResponse = (await ssh.RunCommandAsync("date +%s", commandTimeout)).Trim();
                        UInt64 nodeTimestamp   = UInt64.Parse(commandResponse, CultureInfo.InvariantCulture);

                        lock (this.isd)
                        {
                            this.isd.statsNodeTimestamp = nodeTimestamp;
                        }
                    }
                    catch (Exception ex)
                    {
                        log.Error(ex, "Error while running command");
                        continue;
                    }

                    log.Info("Entering node health monitoringl loop...");
                    while (!ct.IsCancellationRequested)
                    {
                        // UTC is immune to DST / local time zone changes while measuring elapsed time.
                        DateTime begin = DateTime.UtcNow;

                        //
                        // /proc/stat: each line is "<key> <n1> <n2> ..."; only the aggregate "cpu" row is stored
                        try
                        {
                            string commandResponse = (await ssh.RunCommandAsync("cat /proc/stat", commandTimeout)).Trim();

                            Dictionary<string, UInt64[]> stats = commandResponse
                                                                 .Split("\n", StringSplitOptions.RemoveEmptyEntries)
                                                                 .Select(row => procStatRegex.Match(row.Trim()))
                                                                 .Where(match => match.Success)
                                                                 .ToDictionary(
                                match => match.Groups["key"].Value.Trim(),
                                match => match.Groups["value"].Value
                                         .Split(' ', StringSplitOptions.RemoveEmptyEntries)
                                         .Select(number => UInt64.Parse(number, CultureInfo.InvariantCulture))
                                         .ToArray());

                            lock (this.isd)
                            {
                                this.isd.cpuAllStatistics = stats["cpu"];
                            }
                        }
                        catch (Exception ex)
                        {
                            log.Error(ex, "Error while running command 'cat /proc/stat'");
                            break;
                        }

                        //
                        // /proc/loadavg
                        try
                        {
                            string commandResponse = (await ssh.RunCommandAsync("cat /proc/loadavg", commandTimeout)).Trim();

                            // On a failed match the groups are empty, so the parses below throw
                            // and the catch block ends this connection.
                            Match loadavg = loadavgRegex.Match(commandResponse);

                            lock (this.isd)
                            {
                                this.isd.load1min           = double.Parse(loadavg.Groups["l1"].Value, CultureInfo.InvariantCulture);
                                this.isd.load5min           = double.Parse(loadavg.Groups["l5"].Value, CultureInfo.InvariantCulture);
                                this.isd.load15min          = double.Parse(loadavg.Groups["l15"].Value, CultureInfo.InvariantCulture);
                                this.isd.runningProcesses   = int.Parse(loadavg.Groups["rp"].Value, CultureInfo.InvariantCulture);
                                this.isd.availableProcesses = int.Parse(loadavg.Groups["trp"].Value, CultureInfo.InvariantCulture);
                                this.isd.lastPid            = int.Parse(loadavg.Groups["lastpid"].Value, CultureInfo.InvariantCulture);
                            }
                        }
                        catch (Exception ex)
                        {
                            log.Error(ex, "Error while running command 'cat /proc/loadavg'");
                            break;
                        }

                        //
                        // /proc/uptime: two space-separated decimal numbers
                        try
                        {
                            string commandResponse = (await ssh.RunCommandAsync("cat /proc/uptime", commandTimeout)).Trim();

                            double[] values = commandResponse
                                              .Split(' ', StringSplitOptions.RemoveEmptyEntries)
                                              .Select(x => double.Parse(x, CultureInfo.InvariantCulture))
                                              .ToArray();

                            lock (this.isd)
                            {
                                this.isd.machineUptime    = values[0];
                                this.isd.cpuTotalIdleTime = values[1];
                            }
                        }
                        catch (Exception ex)
                        {
                            log.Error(ex, "Error while running command 'cat /proc/uptime'");
                            break;
                        }

                        //
                        // Get virtual memory statistics: each line is "<number> <label>"
                        try
                        {
                            string commandResponse = (await ssh.RunCommandAsync("vmstat -sSB", commandTimeout)).Trim();

                            Dictionary<string, UInt64> vmstat = commandResponse
                                                                .Split('\n', StringSplitOptions.RemoveEmptyEntries)
                                                                .Select(row => row.Trim())
                                                                .ToDictionary(
                                row =>
                                {
                                    // Key = text after the leading number, lower-cased, minus a leading "b " unit marker.
                                    string key = row.Substring(row.IndexOf(' ') + 1).Trim().ToLowerInvariant();
                                    if (key.StartsWith("b ", StringComparison.Ordinal))
                                    {
                                        key = key.Substring(2).Trim();
                                    }

                                    return key;
                                },
                                row => UInt64.Parse(row.Substring(0, row.IndexOf(' ')).Trim(), CultureInfo.InvariantCulture));

                            lock (this.isd)
                            {
                                this.isd.memoryTotal = vmstat["total memory"];
                                this.isd.memoryUsed  = vmstat["used memory"];
                                this.isd.memoryFree  = vmstat["free memory"];

                                this.isd.memorySwapTotal = vmstat["total swap"];
                                this.isd.memorySwapUsed  = vmstat["used swap"];
                                this.isd.memorySwapFree  = vmstat["free swap"];
                            }
                        }
                        catch (Exception ex)
                        {
                            log.Error(ex, "Error while running command 'vmstat -sSB'");
                            break;
                        }

                        // wait out the rest of the polling period (time spent in the SSH calls counts)
                        TimeSpan elapsed = DateTime.UtcNow - begin;
                        int      delta   = (int)Math.Max(0, pollPeriod - elapsed.TotalMilliseconds);
                        try
                        {
                            // Passing the token lets a stop request interrupt the wait
                            // instead of being noticed only after the full period.
                            await Task.Delay(delta, ct);
                        }
                        catch (OperationCanceledException)
                        {
                            break;
                        }
                    }

                    log.Info($"Exited node health monitoring loop (ct.IsCancellationRequested={ct.IsCancellationRequested}");
                }
            }

            log.Info($"Health monitor for node {this.descriptor.IP} stopped.");

            // Same lock object as every other access to isd (the original used lock (this)
            // here, which does not exclude code holding lock (this.isd)).
            lock (this.isd)
            {
                this.isd.state = MonitorState.Terminated;
            }
        }