/// <summary>
/// Adds the steps required to configure the hive log collector which aggregates log events received
/// from all hive nodes via their [neon-log-host] containers.
/// </summary>
/// <param name="steps">The configuration step list.</param>
private void AddCollectorSteps(ConfigStepList steps)
{
    // Deploy the [neon-log-collector] service globally on the manager nodes
    // so the per-node [neon-log-host] containers have a collector to forward to.

    ServiceHelper.AddServiceStartSteps(hive, steps, "neon-log-collector", hive.Definition.Image.LogCollector,
        new CommandBundle(
            "docker service create",
            "--name", "neon-log-collector",
            "--detach=false",
            "--mode", "global",
            "--restart-delay", hive.Definition.Docker.RestartDelay,
            "--endpoint-mode", "vip",
            "--network", $"{HiveConst.PrivateNetwork}",
            "--constraint", $"node.role==manager",
            "--mount", "type=bind,source=/etc/neon/host-env,destination=/etc/neon/host-env,readonly=true",
            "--log-driver", "json-file",    // Ensure that we don't log to the pipeline to avoid cascading events.
            ServiceHelper.ImagePlaceholderArg));

    // Deploy the [neon-log-collector] traffic manager rule.

    steps.Add(ActionStep.Create(hive.FirstManager.Name, "setup/neon-log-collection-lbrule",
        manager =>
        {
            manager.Status = "set neon-log-collector traffic manager rule";

            // Configure a private hive proxy TCP route so the [neon-log-host]
            // containers will be able to reach the collectors.

            var collectorRule = new TrafficTcpRule()
            {
                Name   = "neon-log-collector",
                System = true,
                Log    = false  // This is important: we don't want to SPAM the log database with its own traffic.
            };

            var frontend = new TrafficTcpFrontend()
            {
                ProxyPort = HiveHostPorts.ProxyPrivateTcpLogCollector
            };

            var backend = new TrafficTcpBackend()
            {
                Server = "neon-log-collector",
                Port   = NetworkPorts.TDAgentForward
            };

            collectorRule.Frontends.Add(frontend);
            collectorRule.Backends.Add(backend);

            hive.PrivateTraffic.SetRule(collectorRule);
        }));
}
/// <summary>
/// Adds the steps to configure the stateful Elasticsearch instances used to persist the log data.
/// </summary>
/// <param name="steps">The configuration step list.</param>
/// <remarks>
/// The method deploys one Elasticsearch container per node labeled with [LogEsData],
/// designates an odd number (up to 5) of those nodes as cluster masters, configures a
/// private traffic manager HTTP rule in front of the nodes, then waits (up to 5 minutes)
/// for the cluster to report GREEN/YELLOW before persisting the [logstash-*] index template.
/// </remarks>
private void AddElasticsearchSteps(ConfigStepList steps)
{
    var esNodes = new List<SshProxy<NodeDefinition>>();

    foreach (var nodeDefinition in hive.Definition.Nodes.Where(n => n.Labels.LogEsData))
    {
        esNodes.Add(hive.GetNode(nodeDefinition.Name));
    }

    // Determine number of manager nodes and the quorum size.
    // Note that we'll deploy an odd number of managers.

    var managerCount = Math.Min(esNodes.Count, 5);   // We shouldn't ever need more than 5 managers

    if (!NeonHelper.IsOdd(managerCount))
    {
        managerCount--;
    }

    var quorumCount = (managerCount / 2) + 1;

    // Sort the nodes by name and then separate the manager and
    // worker nodes (managers will be assigned to nodes that appear
    // first in the list).

    var managerEsNodes = new List<SshProxy<NodeDefinition>>();
    var normalEsNodes  = new List<SshProxy<NodeDefinition>>();

    esNodes = esNodes.OrderBy(n => n.Name).ToList();

    foreach (var esNode in esNodes)
    {
        if (managerEsNodes.Count < managerCount)
        {
            managerEsNodes.Add(esNode);
        }
        else
        {
            normalEsNodes.Add(esNode);
        }
    }

    // Figure out how much RAM to allocate to the Elasticsearch Docker containers.
    // The JVM heap inside the container is sized automatically from the cgroup
    // limit via [-XX:+UseCGroupMemoryLimitForHeap] below, so no explicit heap
    // size needs to be computed here.
    //
    // NOTE(review): The original code also computed an [esHeapBytes] local
    // (half the container RAM capped at 31GB) that was never referenced;
    // that dead computation has been removed.

    var esContainerRam = hive.Definition.Log.EsMemoryBytes;

    // We're going to use explicit docker commands to deploy the Elasticsearch cluster
    // log storage containers.
    //
    // We're mounting these volumes to the container:
    //
    //      /etc/neon/host-env      - Generic host specific environment variables
    //      neon-log-esdata         - Persistent Elasticsearch data volume (local to each host)

    var esBootstrapNodes = new StringBuilder();

    foreach (var esMasterNode in managerEsNodes)
    {
        esBootstrapNodes.AppendWithSeparator($"{esMasterNode.PrivateAddress}:{HiveHostPorts.LogEsDataTcp}", ",");
    }

    // Create a data volume for each Elasticsearch node and then start the node container.

    for (int i = 0; i < esNodes.Count; i++)
    {
        var esNode        = esNodes[i];
        var containerName = "neon-log-esdata";
        var isMaster      = managerEsNodes.Contains(esNode) ? "true" : "false";
        var volumeCommand = CommandStep.CreateSudo(esNode.Name, "docker-volume-create", containerName);

        steps.Add(volumeCommand);

        ServiceHelper.AddContainerStartSteps(hive, steps, esNode, containerName, hive.Definition.Image.Elasticsearch,
            new CommandBundle(
                "docker run",
                "--name", containerName,
                "--detach",
                "--restart", "always",
                "--volume", "/etc/neon/host-env:/etc/neon/host-env:ro",
                "--volume", $"{containerName}:/mnt/esdata",
                "--env", $"ELASTICSEARCH_CLUSTER={hive.Definition.Datacenter}.{hive.Definition.Name}.neon-log-esdata",
                "--env", $"ELASTICSEARCH_NODE_MASTER={isMaster}",
                "--env", $"ELASTICSEARCH_NODE_DATA=true",
                "--env", $"ELASTICSEARCH_NODE_COUNT={esNodes.Count}",
                "--env", $"ELASTICSEARCH_HTTP_PORT={HiveHostPorts.LogEsDataHttp}",
                "--env", $"ELASTICSEARCH_TCP_PORT={HiveHostPorts.LogEsDataTcp}",
                "--env", $"ELASTICSEARCH_QUORUM={quorumCount}",
                "--env", $"ELASTICSEARCH_BOOTSTRAP_NODES={esBootstrapNodes}",
                "--env", $"ES_JAVA_OPTS=-XX:+UnlockExperimentalVMOptions -XX:+UseCGroupMemoryLimitForHeap",
                "--memory", $"{esContainerRam / NeonHelper.Mega}M",
                "--memory-reservation", $"{esContainerRam / NeonHelper.Mega}M",
                "--memory-swappiness", "0",
                "--network", "host",
                "--log-driver", "json-file",    // Ensure that we don't log to the pipeline to avoid cascading events.
                ServiceHelper.ImagePlaceholderArg));
    }

    // Configure a private hive proxy route to the Elasticsearch nodes.

    steps.Add(ActionStep.Create(hive.FirstManager.Name, "setup/elasticsearch-lbrule",
        node =>
        {
            var rule = new TrafficHttpRule()
            {
                Name     = "neon-log-esdata",
                System   = true,
                Log      = false,   // This is important: we don't want to SPAM the log database with its own traffic.
                Resolver = null
            };

            rule.Frontends.Add(
                new TrafficHttpFrontend()
                {
                    ProxyPort = HiveHostPorts.ProxyPrivateHttpLogEsData
                });

            foreach (var esNode in esNodes)
            {
                rule.Backends.Add(
                    new TrafficHttpBackend()
                    {
                        Server = esNode.Metadata.PrivateAddress.ToString(),
                        Port   = HiveHostPorts.LogEsDataHttp
                    });
            }

            hive.PrivateTraffic.SetRule(rule);
        }));

    // Wait for the elasticsearch cluster to become ready and then save the
    // [logstash-*] template.  We need to do this before [neon-log-collector]
    // is started so we'll be sure that no indexes will be created before
    // we have a chance to persist the pattern.
    //
    // This works because [neon-log-collector] is the main service responsible
    // for persisting events to this index.

    steps.Add(ActionStep.Create(hive.FirstManager.Name, operationName: null,
        node =>
        {
            node.Status = "wait for elasticsearch cluster";

            using (var jsonClient = new JsonClient())
            {
                var baseLogEsDataUri = hive.Definition.LogEsDataUri;
                var timeout          = TimeSpan.FromMinutes(5);
                var timeoutTime      = DateTime.UtcNow + timeout;
                var esNodeCount      = hive.Definition.Nodes.Count(n => n.Labels.LogEsData);

                // Wait for the Elasticsearch cluster.

                jsonClient.UnsafeRetryPolicy = NoRetryPolicy.Instance;

                while (true)
                {
                    try
                    {
                        var response = jsonClient.GetUnsafeAsync($"{baseLogEsDataUri}/_cluster/health").Result;

                        if (response.IsSuccess)
                        {
                            var clusterStatus = response.AsDynamic();
                            var status        = (string)(clusterStatus.status);

                            status = status.ToUpperInvariant();

                            node.Status = $"wait for [neon-log-esdata] cluster: [status={status}] [{clusterStatus.number_of_nodes}/{esNodeCount} nodes ready])";

                            // $todo(jeff.lill):
                            //
                            // We're accepting YELLOW status here due to this issue:
                            //
                            //      https://github.com/jefflill/NeonForge/issues/257

                            if ((status == "GREEN" || status == "YELLOW") && clusterStatus.number_of_nodes == esNodeCount)
                            {
                                node.Status = "elasticsearch cluster is ready";
                                break;
                            }
                        }
                    }
                    catch
                    {
                        // Intentionally ignored: the health endpoint typically
                        // throws until enough Elasticsearch nodes have started.
                    }

                    // BUGFIX: The timeout was previously checked only inside the
                    // [catch] block, so a cluster that kept returning a non-success
                    // response (or never reached GREEN/YELLOW with all nodes) would
                    // cause this loop to spin forever.  Check the timeout on every
                    // iteration instead, regardless of how the attempt failed.

                    if (DateTime.UtcNow >= timeoutTime)
                    {
                        node.Fault($"[neon-log-esdata] cluster not ready after waiting [{timeout}].");
                        return;
                    }

                    Thread.Sleep(TimeSpan.FromSeconds(1));
                }

                // Save the [logstash-*] template pattern.

                var templatePattern = ResourceFiles.Root.GetFolder("Elasticsearch").GetFile("logstash-template.json").Contents;

                jsonClient.PutAsync($"{baseLogEsDataUri}/_template/logstash-*", templatePattern).Wait();
            }
        }));
}