/// <summary>
/// Looks up EC2 instances that carry the given tag, treating any failure as "no instances".
/// </summary>
/// <param name="aws">AWS client wrapper; its <c>EC2</c> client performs the query.</param>
/// <param name="tagName">Tag key; sent to EC2 as the filter name <c>tag:</c> followed by this key.</param>
/// <param name="tagValue">Value the tag must have.</param>
/// <returns>
/// The instances of every reservation in the single DescribeInstances response,
/// or an empty array when the call throws (the error is printed, not rethrown).
/// </returns>
static EC2Instance[] TryReadInstancesByTag(MyAWS aws, string tagName, string tagValue)
{
    Console.WriteLine($"Read instances by tag {tagName}={tagValue}");

    var tagFilter = new EC2Filter
    {
        Name = "tag:" + tagName,
        Values = new List<string> { tagValue }
    };
    var filters = new List<EC2Filter> { tagFilter };

    try
    {
        var request = new DescribeInstancesRequest { Filters = filters };
        var response = aws.EC2.DescribeInstancesAsync(request).GetAwaiter().GetResult();

        // Flatten reservation -> instances into one array.
        return response.Reservations
            .SelectMany(reservation => reservation.Instances)
            .ToArray();
    }
    catch (Exception error)
    {
        PrintError(error);
        Console.WriteLine(" Assuming no instances");
        return new EC2Instance[0];
    }
}
/// <summary>
/// Runs <c>CONTAINER_COUNT_SCRIPT</c> on the given instances and parses each output as an integer.
/// </summary>
/// <param name="aws">AWS client wrapper used to run the shell script.</param>
/// <param name="ids">Instance IDs to query; must not be empty.</param>
/// <returns>
/// Map from instance ID to the parsed count. Instances whose output could not be parsed are
/// omitted; the map is empty when the script run itself fails. Errors are printed, not rethrown.
/// </returns>
/// <exception cref="ArgumentException">Thrown when <paramref name="ids"/> is empty.</exception>
static IDictionary<string, int> ReadContainerCount(MyAWS aws, List<string> ids)
{
    if (!ids.Any())
    {
        throw new ArgumentException("Don't call me with empty list");
    }

    Console.WriteLine($"Read running container count from {ids.Count} instances");

    var countsById = new Dictionary<string, int>();

    try
    {
        var outputs = aws.RunShellScriptAsync(ids, CONTAINER_COUNT_SCRIPT, true).GetAwaiter().GetResult();

        foreach (var entry in outputs)
        {
            // A bad output from one instance must not discard the others.
            try
            {
                var trimmedOutput = entry.Value.Trim();
                countsById[entry.Key] = Convert.ToInt32(trimmedOutput);
            }
            catch (Exception parseError)
            {
                PrintError(parseError);
            }
        }
    }
    catch (Exception runError)
    {
        PrintError(runError);
    }

    return countsById;
}
/// <summary>
/// Raises the desired capacity of the auto scaling group <c>SCALING_GROUP_NAME</c>
/// when it is currently below <paramref name="desiredCapacity"/>.
/// </summary>
/// <param name="aws">AWS client wrapper; its <c>Scaling</c> client performs the calls.</param>
/// <param name="desiredCapacity">Capacity the group should have at least.</param>
/// <returns>
/// <c>true</c> when an update request was sent; <c>false</c> when the group already has
/// at least the requested capacity or when any step throws (the error is printed).
/// </returns>
static bool TryScaleOut(MyAWS aws, int desiredCapacity)
{
    try
    {
        Console.WriteLine($"Read auto scaling group '{SCALING_GROUP_NAME}'");

        var describeRequest = new DescribeAutoScalingGroupsRequest
        {
            AutoScalingGroupNames = new List<string> { SCALING_GROUP_NAME },
        };
        var describeResponse = aws.Scaling.DescribeAutoScalingGroupsAsync(describeRequest).GetAwaiter().GetResult();

        // Exactly one group is expected; Single() throws otherwise and lands in the catch below.
        var group = describeResponse.AutoScalingGroups.Single();

        Console.WriteLine($"Current capacity: {group.DesiredCapacity} of {Env.MaxAgents}");

        // This method only ever scales out, never in.
        if (group.DesiredCapacity >= desiredCapacity)
        {
            return false;
        }

        Console.WriteLine("New capacity: " + desiredCapacity);

        var updateRequest = new UpdateAutoScalingGroupRequest
        {
            AutoScalingGroupName = SCALING_GROUP_NAME,
            MinSize = 0,
            MaxSize = Env.MaxAgents,
            DesiredCapacity = desiredCapacity
        };
        var updateResponse = aws.Scaling.UpdateAutoScalingGroupAsync(updateRequest).GetAwaiter().GetResult();
        PrintStatus(updateResponse);

        return true;
    }
    catch (Exception error)
    {
        PrintError(error);
        return false;
    }
}
/// <summary>
/// Brings the agent fleet towards <paramref name="desiredCount"/>: scales out when needed,
/// otherwise cleans up stopped, excessive and long-running agents.
/// </summary>
/// <param name="desiredCount">Number of agents wanted.</param>
/// <param name="allAgentsAreTerminated">
/// Set to <c>true</c> when every instance found by the agent tag is in the terminated state
/// (which includes the case where no instances were found). Stays <c>false</c> when a
/// scale-out was triggered, because the instance list is not read in that case.
/// </param>
static void AdjustAgentCount(int desiredCount, out bool allAgentsAreTerminated)
{
    allAgentsAreTerminated = false;

    using (var aws = new MyAWS(Env.AwsRegion, Env.AwsAccessKey, Env.AwsSecretKey))
    {
        var scaledOut = TryScaleOut(aws, desiredCount);
        if (scaledOut)
        {
            return;
        }

        var agents = TryReadInstancesByTag(aws, AGENT_TAG_NAME, AGENT_TAG_VALUE);
        Console.WriteLine("Total agents: " + agents.Length);

        allAgentsAreTerminated = agents.All(agent => agent.State.Name == InstanceStateName.Terminated);
        if (allAgentsAreTerminated)
        {
            Console.WriteLine("All agents are terminated");
            return;
        }

        AgentDiskUsageMonitor.Measure(aws, agents);

        // Stopped agents are terminated outright.
        TryTerminate(
            aws,
            "stopped",
            agents.Where(agent => agent.State.Name == InstanceStateName.Stopped));

        TryShutdownExcessiveRunning(aws, agents, desiredCount);

        // "Lost" agents: still running more than 12 hours after launch.
        TryTerminate(
            aws,
            "lost",
            agents.Where(agent => agent.State.Name == InstanceStateName.Running
                && DateTime.Now - agent.LaunchTime > TimeSpan.FromHours(12)));
    }
}
/// <summary>
/// Runs <c>SHUTDOWN_SCRIPT</c> on the given instances; failures are printed and not rethrown.
/// </summary>
/// <param name="aws">AWS client wrapper used to run the shell script.</param>
/// <param name="ids">Instance IDs that should receive the script.</param>
static void TrySendShutdownScript(MyAWS aws, List<string> ids)
{
    Console.WriteLine($"Send shutdown script to {ids.Count} instances");

    try
    {
        var pendingRun = aws.RunShellScriptAsync(ids, SHUTDOWN_SCRIPT, false);

        // Block until the run completes; its result is not used.
        pendingRun.GetAwaiter().GetResult();
    }
    catch (Exception error)
    {
        PrintError(error);
    }
}
/// <summary>
/// Detaches the given instances from the auto scaling group <c>SCALING_GROUP_NAME</c>,
/// decrementing the group's desired capacity accordingly.
/// </summary>
/// <remarks>
/// The DetachInstances API accepts a limited number of instance IDs per request, so the
/// list is sent in batches. Each batch is attempted independently: a failure is printed
/// and the remaining batches are still sent. An empty list results in no request.
/// </remarks>
/// <param name="aws">AWS client wrapper; its <c>Scaling</c> client performs the calls.</param>
/// <param name="ids">Instance IDs to detach.</param>
static void TryDetachFromAutoScalingGroup(MyAWS aws, List<string> ids)
{
    // Documented upper bound on InstanceIds for a single DetachInstances request.
    const int maxIdsPerRequest = 20;

    Console.WriteLine($"Detach {ids.Count} instances from {SCALING_GROUP_NAME}");

    for (var offset = 0; offset < ids.Count; offset += maxIdsPerRequest)
    {
        var batchSize = Math.Min(maxIdsPerRequest, ids.Count - offset);
        var batch = ids.GetRange(offset, batchSize);

        try
        {
            var detachResponse = aws.Scaling.DetachInstancesAsync(new DetachInstancesRequest
            {
                AutoScalingGroupName = SCALING_GROUP_NAME,
                InstanceIds = batch,
                ShouldDecrementDesiredCapacity = true
            }).GetAwaiter().GetResult();

            PrintStatus(detachResponse);
        }
        catch (Exception x)
        {
            PrintError(x);
        }
    }
}
/// <summary>
/// Terminates the given EC2 instances; does nothing for an empty list.
/// Failures are printed and not rethrown.
/// </summary>
/// <param name="aws">AWS client wrapper; its <c>EC2</c> client performs the call.</param>
/// <param name="adjective">Word describing the agents, used only in the log line.</param>
/// <param name="ids">Instance IDs to terminate.</param>
static void TryTerminate(MyAWS aws, string adjective, List<string> ids)
{
    var nothingToTerminate = !ids.Any();
    if (nothingToTerminate)
    {
        return;
    }

    Console.WriteLine($"Terminate {ids.Count} {adjective} agents");

    try
    {
        var request = new TerminateInstancesRequest { InstanceIds = ids };
        var terminateResponse = aws.EC2.TerminateInstancesAsync(request).GetAwaiter().GetResult();
        PrintStatus(terminateResponse);
    }
    catch (Exception error)
    {
        PrintError(error);
    }
}
/// <summary>
/// Samples disk usage on running instances and records the largest value seen so far.
/// </summary>
/// <remarks>
/// Does nothing when <c>ENABLED</c> is false or when no instance qualifies. Only instances
/// in the running state that were launched more than one minute ago are queried. Any
/// exception while running the command or parsing its output is swallowed: the method
/// returns without writing the state file or printing the summary (although
/// <c>MaxUsed</c> may already have been raised by outputs parsed before the failure).
/// </remarks>
/// <param name="aws">AWS client wrapper used to run the shell command.</param>
/// <param name="instances">Candidate instances.</param>
public static void Measure(MyAWS aws, IEnumerable<Instance> instances)
{
    if (!ENABLED)
    {
        return;
    }

    var targetIds = instances
        .Where(instance => instance.State.Name == InstanceStateName.Running
            && (DateTime.Now - instance.LaunchTime).TotalMinutes > 1)
        .Select(instance => instance.InstanceId)
        .ToList();

    if (targetIds.Count == 0)
    {
        return;
    }

    try
    {
        var outputs = aws
            .RunShellScriptAsync(targetIds, "df -BM --output=used /dev/nvme0n1p1", true)
            .GetAwaiter()
            .GetResult();

        foreach (var output in outputs.Values)
        {
            // The second output line is parsed; presumably the first is df's column
            // header and the second looks like "1234M" - verify against real output.
            var usedLine = output.Split('\n')[1];
            var usedMegabytes = Convert.ToInt32(usedLine.TrimEnd('M'));

            // Scaled by 0.001 and reported below with the "G" suffix.
            var used = 0.001 * usedMegabytes;
            MaxUsed = Math.Max(MaxUsed, used);
        }
    }
    catch
    {
        return;
    }

    File.WriteAllText(STATE_FILE, MaxUsed.ToString());

    Console.BackgroundColor = ConsoleColor.Yellow;
    Console.ForegroundColor = ConsoleColor.Black;
    Console.WriteLine($"Max agent disk usage: {MaxUsed} G");
    Console.ResetColor();
}
/// <summary>
/// Finds instances whose reachability status check has been failing for longer than
/// <c>Env.AGENT_REACHABILITY_ALARM_THRESHOLD</c>.
/// </summary>
/// <param name="aws">AWS client wrapper; its <c>EC2</c> client performs the query.</param>
/// <param name="ids">Instance IDs whose status should be inspected.</param>
/// <returns>
/// IDs of the instances considered unreachable, or an empty list when the
/// lookup throws (the error is printed, not rethrown).
/// </returns>
static List<string> FindUnreachable(MyAWS aws, List<string> ids)
{
    try
    {
        Console.WriteLine("Read agent statuses");

        var request = new DescribeInstanceStatusRequest { InstanceIds = ids };
        var statusResponse = aws.EC2.DescribeInstanceStatusAsync(request).GetAwaiter().GetResult();
        PrintStatus(statusResponse);

        var unreachableIds = new List<string>();
        foreach (var instanceStatus in statusResponse.InstanceStatuses)
        {
            // Unreachable = a failed reachability detail that has been impaired past the threshold.
            var impairedTooLong = instanceStatus.Status.Details.Any(detail =>
                detail.Name == StatusName.Reachability
                && detail.Status == StatusType.Failed
                && DateTime.Now - detail.ImpairedSince > Env.AGENT_REACHABILITY_ALARM_THRESHOLD);

            if (impairedTooLong)
            {
                unreachableIds.Add(instanceStatus.InstanceId);
            }
        }

        return unreachableIds;
    }
    catch (Exception error)
    {
        PrintError(error);
        return new List<string>();
    }
}
/// <summary>
/// Convenience overload: collects the instance IDs of <paramref name="query"/> into a list
/// and forwards to the ID-based <c>TryTerminate</c> overload.
/// </summary>
/// <param name="aws">AWS client wrapper passed through to the ID-based overload.</param>
/// <param name="adjective">Word describing the agents, passed through for logging.</param>
/// <param name="query">Instances to terminate; enumerated once.</param>
static void TryTerminate(MyAWS aws, string adjective, IEnumerable<EC2Instance> query)
{
    var instanceIds = query
        .Select(instance => instance.InstanceId)
        .ToList();

    TryTerminate(aws, adjective, instanceIds);
}
/// <summary>
/// Reviews the running agents and, based on the flags each <c>AgentInfo</c> reports,
/// terminates unreachable ones, sends the shutdown script and detaches agents from
/// the auto scaling group.
/// </summary>
/// <remarks>
/// Running agents are ordered by launch time, then by instance ID. An agent is constructed
/// with its third argument set to <c>true</c> when it carries the tag
/// <c>SCALING_GROUP_TAG</c> = <c>SCALING_GROUP_NAME</c>. If every running agent reports
/// <c>Retain</c>, nothing else happens. Container counts are read only when more agents
/// are running than desired, and not more often than <c>CONTAINER_COUNT_CHECK_INTERVAL</c>.
/// </remarks>
/// <param name="aws">AWS client wrapper passed to the helper calls.</param>
/// <param name="allAgents">All agent instances, in any state.</param>
/// <param name="desiredCount">Number of agents wanted.</param>
static void TryShutdownExcessiveRunning(MyAWS aws, EC2Instance[] allAgents, int desiredCount)
{
    var runningAgents = (
        from instance in allAgents
        where instance.State.Name == InstanceStateName.Running
        orderby instance.LaunchTime, instance.InstanceId
        select new AgentInfo(
            instance.InstanceId,
            instance.LaunchTime,
            instance.Tags.Any(tag => tag.Key == SCALING_GROUP_TAG && tag.Value == SCALING_GROUP_NAME)
        )
    ).ToArray();

    Console.WriteLine("Running agents: " + runningAgents.Length);

    if (runningAgents.Length == 0)
    {
        return;
    }

    if (runningAgents.All(agent => agent.Retain))
    {
        Console.WriteLine("Retain all agents");
        return;
    }

    if (!(runningAgents.Length > desiredCount))
    {
        Console.WriteLine("Skip container count check");
    }
    else if (!(DateTime.Now - LastContainerCountCheck > CONTAINER_COUNT_CHECK_INTERVAL))
    {
        Console.WriteLine("Delay container count check");
    }
    else
    {
        // Stamp the check time before the (possibly slow) remote call.
        LastContainerCountCheck = DateTime.Now;

        // At least one agent is not retained here, so the ID list is never empty.
        var releasableIds = runningAgents
            .Where(agent => !agent.Retain)
            .Select(agent => agent.InstanceId)
            .ToList();
        var containerCounts = ReadContainerCount(aws, releasableIds);

        foreach (var agent in runningAgents)
        {
            int containerCount;
            if (containerCounts.TryGetValue(agent.InstanceId, out containerCount))
            {
                agent.SetContainerCount(containerCount);
            }
        }
    }

    foreach (var agent in runningAgents)
    {
        Console.Write(" ");
        Console.WriteLine(agent);
    }

    var idsToCheckStatus = (from agent in runningAgents where agent.WillCheckStatus select agent.InstanceId).ToList();
    var idsToSendShutdownScript = (from agent in runningAgents where agent.WillShutdown select agent.InstanceId).ToList();
    var idsToDetach = (from agent in runningAgents where agent.WillDetach select agent.InstanceId).ToList();

    var unreachableIds = idsToCheckStatus.Any()
        ? FindUnreachable(aws, idsToCheckStatus)
        : new List<string>();

    if (unreachableIds.Any())
    {
        Console.WriteLine("Unreachable agents detected!");

        // Unreachable agents are terminated directly, so they are dropped from the other lists.
        idsToSendShutdownScript = idsToSendShutdownScript.Except(unreachableIds).ToList();
        idsToDetach = idsToDetach.Except(unreachableIds).ToList();

        TryTerminate(aws, "unreachable", unreachableIds);
    }

    if (idsToSendShutdownScript.Any())
    {
        TrySendShutdownScript(aws, idsToSendShutdownScript);
    }

    if (idsToDetach.Any())
    {
        TryDetachFromAutoScalingGroup(aws, idsToDetach);
    }
}