/// <summary>
/// Performs the health checks and scale/delete operations.
/// </summary>
/// <param name="log">Log.</param>
public static void DoCheck(ILogger log)
{
    // On the first run, start a stopwatch to track how long we have been
    // running. This is only used by fake-load mode, which exists for testing.
    if (sw == null)
    {
        sw = Stopwatch.StartNew();
    }

    // Read all settings from the environment, with sensible defaults
    // where applicable.
    string clientID = Utils.GetEnvVariableOrDefault("clientID");
    string clientSecret = Utils.GetEnvVariableOrDefault("clientSecret");
    string tenantID = Utils.GetEnvVariableOrDefault("tenantID", "a8175357-a762-478b-b724-6c2bd3f3f45e");
    string location = Utils.GetEnvVariableOrDefault("location");
    string subscriptionID = Utils.GetEnvVariableOrDefault("subscriptionID");
    string resourceGroupName = Utils.GetEnvVariableOrDefault("resourceGroupName");
    string appGwName = Utils.GetEnvVariableOrDefault("appGwName", "appGw");
    string scaleSetName = Utils.GetEnvVariableOrDefault("_scaleSetName", "myScaleSet");
    int minHealthyServers = Utils.GetEnvVariableOrDefault("_minHealthyServers", 3);
    int healthBuffer = Utils.GetEnvVariableOrDefault("healthBuffer", 3);
    int maxConcurrentConnectionsPerNode = Utils.GetEnvVariableOrDefault("_maxConcurrentConnectionsPerNode", 3);
    int maxScaleUpUnit = Utils.GetEnvVariableOrDefault("_scaleByNodeCount", 10);
    int maxActiveServers = Utils.GetEnvVariableOrDefault("_maxActiveServers", 100);
    bool fakeMode = bool.Parse(Utils.GetEnvVariableOrDefault("fakeMode", "false"));
    int cleanUpEvery = Utils.GetEnvVariableOrDefault("cleanUpEvery", 4);
    int scaleUpEvery = Utils.GetEnvVariableOrDefault("scaleUpEvery", 1);
    int scheduleToRunFactor = Utils.GetEnvVariableOrDefault("scheduleToRunFactor", 3); // how often we actually run when scheduled
    bool scaleUpQuickly = bool.Parse(Utils.GetEnvVariableOrDefault("scaleUpQuickly", "true"));
    bool logCustomMetrics = bool.Parse(Utils.GetEnvVariableOrDefault("logCustomMetrics", "false"));
    bool logCustomMetricsVerboseLogging = bool.Parse(Utils.GetEnvVariableOrDefault("logCustomMetricsVerboseLogging", "false"));

    // To work around CRON syntax limitations we can't actually run every 45
    // seconds; instead we get scheduled every 15 seconds and only do real
    // work every {scheduleToRunFactor} invocations.
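    // Worked example of the gate below, using the defaults above: with a
    // 15-second timer trigger and scheduleToRunFactor = 3, the body runs
    // when the pre-increment scheduleCount is 0, 3, 6, ... i.e. roughly
    // once every 45 seconds.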
    if (scheduleCount++ % scheduleToRunFactor != 0)
    {
        log.LogInformation("skipping due to scheduleToRunFactor");
        return;
    }

    // Clean up every {cleanUpEvery} runs, scale every {scaleUpEvery} runs.
    bool cleanup = runCount % cleanUpEvery == 0;
    bool scaleup = runCount % scaleUpEvery == 0;
    runCount++;

    bool deletedNodes = false;
    try
    {
        log.LogInformation("Creating Azure Client for checkConcurrentCons Function");
        var azEnvironment = AzureEnvironment.AzureGlobalCloud;
        var azClient = AzureClient.CreateAzureClient(clientID, clientSecret, tenantID, azEnvironment, subscriptionID);

        var scaleSet = azClient.VirtualMachineScaleSets.GetByResourceGroup(resourceGroupName, scaleSetName);
        var appGw = azClient.ApplicationGateways.GetByResourceGroup(resourceGroupName, appGwName);

        // Make sure that overprovisioning is OFF on the scale set: since we
        // create and delete VMs very often, we need exact instance counts.
        // Avoid extra API calls by checking this only once, then setting a flag.
        if (!checkedOverProvisioningStatus)
        {
            VmScaleSetOperations.DisableOverProvisioning(scaleSet, log);
            checkedOverProvisioningStatus = true;
        }

        log.LogInformation($"Got AppGateway: {appGw.Id}");

        // Deliberately block on the async call; this function is synchronous.
        var appGwBEHealth = azClient.ApplicationGateways.Inner.BackendHealthAsync(resourceGroupName, appGwName).Result;

        // Only run deletes every now and again.
        if (cleanup)
        {
            log.LogInformation($"Scaleset size BEFORE checking for App Payload Failed nodes is {scaleSet.Capacity}");

            // Remove any bad nodes first.
            deletedNodes = ApplicationGatewayOperations.CheckApplicationGatewayBEHealthAndDeleteBadNodes(appGwBEHealth, scaleSet, minHealthyServers, log);

            log.LogInformation($"Scaleset size AFTER checking for App Payload Failed nodes is {scaleSet.Capacity}");
        }
        else
        {
            log.LogInformation("Not running cleanup this pass since cleanup == false");
        }

        var healthyUnhealthyCounts = ApplicationGatewayOperations.GetHealthyAndUnhealthyNodeCounts(appGwBEHealth, log);
        var healthyNodeCount = healthyUnhealthyCounts.Item1;
        var unhealthyNodeCount = healthyUnhealthyCounts.Item2;
        var totalNodeCount = healthyNodeCount + unhealthyNodeCount;
        log.LogInformation(
            "Detected {TotalNodes} nodes, {HealthyNodes} are healthy, {UnhealthyNodes} are unhealthy (creating & deleting)",
            totalNodeCount, healthyNodeCount, unhealthyNodeCount);

        ConnectionInfo connectionInfo;
        if (fakeMode)
        {
            connectionInfo = ApplicationGatewayOperations.GetFakeConnectionMetrics(appGw, azClient, (int)sw.Elapsed.TotalSeconds, log);
        }
        else
        {
            connectionInfo = ApplicationGatewayOperations.GetConnectionMetrics(appGw, azClient, log);
        }

        log.LogInformation(connectionInfo.ToString());
        log.LogInformation(connectionInfo.GetHistoryAsString());

        // Group the scale set VMs by provisioning state and count the ones
        // that are still deploying (they will soon be online).
        var vmsByState = scaleSet.VirtualMachines.List()
            .GroupBy(v => v.Inner.ProvisioningState)
            .ToDictionary(g => g.Key, g => g.ToList());

        StringBuilder sb = new StringBuilder();
        var deployingNodes = 0;
        foreach (var k in vmsByState.Keys)
        {
            sb.Append($" {k}:{vmsByState[k].Count}");
            if (k == "Creating" || k == "Updating")
            {
                deployingNodes += vmsByState[k].Count;
            }
        }

        log.LogInformation($"VM counts by state : {sb}");
        log.LogInformation("Considering {DeployingNodes} of these deploying", deployingNodes);

        // Consider nodes which are healthy NOW plus ones that will soon be
        // healthy in the math.
        var logicalHealthyNodeCount = healthyNodeCount + deployingNodes;
        log.LogInformation(
            "Node Health Summary from AGW : Healthy={Healthy},Deploying={Deploying},LogicalHealthNodeCount (logical = healthy + deploying)={LogicalHealthyNodeCount}",
            healthyNodeCount, deployingNodes, logicalHealthyNodeCount);

        if (!connectionInfo.ResponseStatus.HasValue)
        {
            log.LogError("ResponseStatus information missing; cannot make a decision on what to do");
            return;
        }

        // Derive requests-per-second from the ResponseStatus count (a
        // one-minute metric window is assumed, hence the division by 60).
        log.LogInformation("Computing rps using ResponseStatus only");
        double rps = connectionInfo.ResponseStatus.Value / 60.0;
        log.LogInformation($"ResponseStatus: {connectionInfo.ResponseStatus.Value} Total Requests: {connectionInfo.TotalRequests ?? 0} RPS: {rps}");
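        // Worked example of the sizing math below: with rps = 300 and the
        // default maxConcurrentConnectionsPerNode = 3, idealNumberOfNodes =
        // max(300 / 3, minHealthyServers) = 100; after the ceiling and the
        // default healthBuffer of 3 are applied, the target becomes 103 nodes.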
        double idealNumberOfNodes = Math.Max(rps / maxConcurrentConnectionsPerNode, minHealthyServers);
        log.LogInformation("Ideal Node Count based on ResponseStatus = {IdealNodeCount}", idealNumberOfNodes);

        double rpsPerNode = rps / Math.Max(healthyNodeCount, 1);

        // Sanity check that the number makes sense.
        if (idealNumberOfNodes > 160)
        {
            log.LogInformation("Ideal Node Count looks incorrect, possibly due to transient metrics ({IdealNodeCount}), ignoring scale event", idealNumberOfNodes);
        }
        else
        {
            if (logCustomMetrics)
            {
                // Note: when custom-metric logging is enabled, the function
                // returns here and performs no scaling on this pass (this mode
                // is not used in production, per the log message below).
                log.LogInformation("Logging Custom Metrics");
                metricJSONGenerator.populateCustomMetric("RPSPerInstance", rpsPerNode.ToString(), appGw.Name, scaleSet.RegionName, scaleSet.Id, log, clientID, clientSecret, tenantID, logCustomMetricsVerboseLogging);
                //metricJSONGenerator.populateCustomMetric("RPS", rps.ToString(), appGw.Name, scaleSet.RegionName, scaleSet.Id, log, clientID, clientSecret, tenantID, logCustomMetricsVerboseLogging);
                //metricJSONGenerator.populateCustomMetric("IdealNodeCount", idealNumberOfNodes.ToString(), appGw.Name, scaleSet.RegionName, scaleSet.Id, log, clientID, clientSecret, tenantID, logCustomMetricsVerboseLogging);
                //metricJSONGenerator.populateCustomMetric("MaxConcurrentConnectionsPerNode", maxConcurrentConnectionsPerNode.ToString(), appGw.Name, scaleSet.RegionName, scaleSet.Id, log, clientID, clientSecret, tenantID, logCustomMetricsVerboseLogging);
                return;
            }
            else
            {
                log.LogInformation("Skipping logging Custom Metrics per appSetting - not used in production");
            }

            // If we would scale down, hold off until we see a consistent run
            // of requests to do so.
            int idealNodes = (int)Math.Ceiling(idealNumberOfNodes);
            idealNodes += healthBuffer;

            if (idealNodes < scaleSet.Capacity)
            {
                scaleDownRequests.Add(idealNodes);
                if (scaleDownRequests.Count > 3)
                {
                    log.LogInformation($"Scaling down due to repeated requests, list = {string.Join(",", scaleDownRequests.Select(s => s.ToString()))}, avg = {scaleDownRequests.Average()}");
                    idealNodes = (int)scaleDownRequests.Average();
                    log.LogInformation($"Scale down : Attempting to change capacity from {scaleSet.Capacity} to {idealNodes}");
                    VmScaleSetOperations.ScaleToTargetSize(scaleSet, idealNodes, maxScaleUpUnit, maxActiveServers, false, deletedNodes, log);

                    // Wipe out the list so the next scale-down needs fresh evidence.
                    scaleDownRequests.Clear();
                }
                else
                {
                    log.LogInformation($"Scale down request received, total requests {scaleDownRequests.Count}, list = {string.Join(",", scaleDownRequests.Select(s => s.ToString()))} not scaling yet");
                }
            }
            else
            {
                scaleDownRequests.Clear();
                if (scaleup)
                {
                    log.LogInformation($"Scale up : Attempting to change capacity from {scaleSet.Capacity} to {idealNodes}");
                    VmScaleSetOperations.ScaleToTargetSize(scaleSet, idealNodes, maxScaleUpUnit, maxActiveServers, scaleUpQuickly, deletedNodes, log);
                }
                else
                {
                    log.LogInformation("** Not performing scale up operations as scaleup == false (45 second CRON adjustment)");
                }
            }
        }
    }
    catch (Exception e)
    {
        log.LogError(e, e.ToString());
    }
}
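// ---------------------------------------------------------------------------
// Hedged sketch (not from the original source): the static state DoCheck
// assumes on its enclosing class, inferred purely from usage above. Names
// match the references in DoCheck; types and initial values are assumptions.
// ---------------------------------------------------------------------------
private static Stopwatch sw;                               // started on first run; drives fake-load mode timing
private static int scheduleCount = 0;                      // bumped on every scheduled invocation
private static int runCount = 0;                           // bumped on every actual run
private static bool checkedOverProvisioningStatus = false; // so overprovisioning is disabled only once
private static readonly List<int> scaleDownRequests = new List<int>(); // recent scale-down targets, averaged before acting

// ---------------------------------------------------------------------------
// Hedged sketch of Utils.GetEnvVariableOrDefault, assuming string and int
// overloads that fall back to a default when the variable is unset or
// unparseable. The real helper may differ; this is a minimal stand-in.
// ---------------------------------------------------------------------------
public static class Utils
{
    public static string GetEnvVariableOrDefault(string name, string defaultValue = null)
    {
        var value = Environment.GetEnvironmentVariable(name);
        return string.IsNullOrEmpty(value) ? defaultValue : value;
    }

    public static int GetEnvVariableOrDefault(string name, int defaultValue)
    {
        var value = Environment.GetEnvironmentVariable(name);
        return int.TryParse(value, out var parsed) ? parsed : defaultValue;
    }
}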