private int UpdateWorkerList(SiteMetadata site, string[] currentWorkerSet)
{
    uint now = GetCurrentTickCount();

    // Union the cached list of known workers with the list provided by the FE. The list from the FE
    // may be stale, so we have to accept new workers generously and carefully age out stale workers.
    foreach (string ipAddress in currentWorkerSet)
    {
        InstanceEndpoint instance = site.Endpoints.GetOrAdd(ipAddress);
        instance.LastRefreshTimestamp = now;

        // Clear the busy status for workers whose busy status is ready to expire.
        if (instance.IsBusy && instance.IsBusyUntil < now)
        {
            site.Endpoints.ClearBusyStatus(instance);
        }

        // Periodically trace the health statistics of the workers.
        if (instance.NextMetricsTraceTime == 0)
        {
            instance.NextMetricsTraceTime = now + WorkerMetricsTraceInterval;
        }
        else if (now > instance.NextMetricsTraceTime && Monitor.TryEnter(instance))
        {
            try
            {
                Debug.WriteLine("UpdateWorkerList: Worker metrics for site {0}: {1}", site.Name, instance.ToString());
                AntaresEventProvider.EventWriteLBHttpDispatchEndpointMetrics(
                    site.Name,
                    instance.IPAddress,
                    instance.PendingRequestCount,
                    instance.IsBusy,
                    instance.Weight);
                instance.NextMetricsTraceTime = now + WorkerMetricsTraceInterval;
            }
            finally
            {
                Monitor.Exit(instance);
            }
        }
    }

    // This is a performance-sensitive code path, so we throttle the age-out logic to avoid using excess CPU.
    if (now >= site.NextWorkerExpirationTick && Monitor.TryEnter(site))
    {
        try
        {
            site.Endpoints.RemoveStaleEntries(site.NextWorkerExpirationTick);

            // Wait M seconds before doing another worker expiration check.
            site.NextWorkerExpirationTick = now + WorkerExpirationCheckInterval;
        }
        finally
        {
            Monitor.Exit(site);
        }
    }

    site.IsBurstMode = site.Endpoints.Count < this.burstLimit;
    return site.Endpoints.Count;
}
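// NOTE: GetCurrentTickCount is not shown in this excerpt. The sketch below is one plausible
// implementation (an assumption, not necessarily what this dispatcher actually uses): a
// millisecond tick counter that matches the uint timestamp math in UpdateWorkerList.
private static uint GetCurrentTickCount()
{
    // Environment.TickCount is a signed 32-bit millisecond counter; reinterpreting it as
    // unsigned keeps it compatible with the uint timestamps and interval additions above.
    return (uint)Environment.TickCount;
}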
// Caller must be holding an async lock.
private async Task<InstanceEndpoint> ScaleOutAsync(
    SiteMetadata site,
    int targetInstanceCount,
    InstanceEndpoint previousEndpoint)
{
    Debug.WriteLine("Attempting to scale out to " + targetInstanceCount);
    AntaresEventProvider.EventWriteLBHttpDispatchSiteInfoMessage(
        site.Name,
        "ScaleOut",
        string.Format("Attempting to scale out to {0} instances.", targetInstanceCount));

    string[] ipAddresses = await this.OnScaleOutAsync(site.Name, targetInstanceCount);
    if (ipAddresses != null)
    {
        this.UpdateWorkerList(site, ipAddresses);
    }

    // It is expected in most cases that this will return the newly added worker (if any).
    return site.Endpoints.ReserveBestInstance(previousEndpoint);
}
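// NOTE: OnScaleOutAsync is the hook that actually requests additional workers from the platform;
// its declaration is outside this excerpt. The sketch below is a hypothetical shape consistent
// with how it is awaited above (it may well be abstract or differently scoped in the real code):
// it returns the updated worker IP list, or null if no scale-out occurred.
protected virtual Task<string[]> OnScaleOutAsync(string siteName, int targetInstanceCount)
{
    // Default sketch: a no-op provider that reports no change in the worker set.
    return Task.FromResult<string[]>(null);
}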
public virtual async Task<string> DispatchRequestAsync(
    string requestId,
    string siteName,
    string defaultHostName,
    string[] knownWorkers)
{
    SiteMetadata site = this.knownSites.GetOrAdd(siteName, name => new SiteMetadata(siteName));
    this.UpdateWorkerList(site, knownWorkers);

    InstanceEndpoint bestInstance = site.Endpoints.ReserveBestInstance(null);
    if (site.IsBurstMode)
    {
        if (bestInstance.PendingRequestCount <= 1)
        {
            // Endpoint is idle - choose it.
            return bestInstance.IPAddress;
        }
        else
        {
            // All endpoints are occupied; try to scale out.
            using (var scaleLock = await site.ScaleLock.LockAsync(MaxLockTime))
            {
                if (scaleLock.TimedOut)
                {
                    // The caller should return 429 to avoid overloading the current role.
                    Debug.WriteLine("Timed-out waiting to start a scale operation in burst mode.");
                    AntaresEventProvider.EventWriteLBHttpDispatchSiteWarningMessage(
                        site.Name,
                        "DispatchRequest",
                        string.Format(
                            "Timed-out ({0}ms) waiting to start a scale operation in burst mode. Reverting to {1}.",
                            MaxLockTime,
                            bestInstance.IPAddress));
                    return bestInstance.IPAddress;
                }

                if (site.IsBurstMode)
                {
                    // No instances are idle; scale out and send the request to the new instance.
                    // Note that there will be a cold-start penalty for this request, but it's
                    // been decided that this is better than sending a request to an existing
                    // but potentially CPU-pegged instance.
                    int targetInstanceCount = Math.Min(this.burstLimit, site.Endpoints.Count + 1);
                    InstanceEndpoint newInstance = await this.ScaleOutAsync(
                        site,
                        targetInstanceCount,
                        bestInstance);
                    return newInstance.IPAddress;
                }
            }
        }
    }

    // Pick one request at a time to use for latency tracking.
    uint now = GetCurrentTickCount();
    if (Interlocked.CompareExchange(ref bestInstance.HealthTrackingRequestId, requestId, null) == null)
    {
        bestInstance.HealthTrackingRequestStartTime = now;
    }

    if (bestInstance.PendingRequestCount <= 1)
    {
        // This is an idle worker (the current request is the pending one).
        return bestInstance.IPAddress;
    }

    if (bestInstance.IsBusy)
    {
        using (var result = await site.ScaleLock.LockAsync(MaxLockTime))
        {
            if (result.TimedOut)
            {
                // Scale operations are unhealthy.
                Debug.WriteLine("Timed-out waiting to start a scale operation on busy instance.");
                AntaresEventProvider.EventWriteLBHttpDispatchSiteWarningMessage(
                    site.Name,
                    "DispatchRequest",
                    string.Format(
                        "Timed-out ({0}ms) waiting to start a scale operation on a busy instance {1}.",
                        MaxLockTime,
                        bestInstance.IPAddress));
                return bestInstance.IPAddress;
            }

            if (!bestInstance.IsBusy)
            {
                // The instance became healthy while we were waiting.
                return bestInstance.IPAddress;
            }

            // Serialize scale-out requests to avoid overloading our infrastructure. Each serialized scale
            // request asks for one more instance than the previous request, which can result in rapid scale-out.
            bestInstance = await this.ScaleOutAsync(site, site.Endpoints.Count + 1, bestInstance);
            if (bestInstance.IsBusy)
            {
                // Scale-out failed.
                Debug.WriteLine("Best instance is still busy after a scale operation.");
                AntaresEventProvider.EventWriteLBHttpDispatchSiteWarningMessage(
                    site.Name,
                    "DispatchRequest",
                    string.Format(
                        "Best instance {0} is still busy after a scale operation.",
                        bestInstance.IPAddress));
            }

            return bestInstance.IPAddress;
        }
    }

    bool isQuestionable =
        bestInstance.PendingRequestCount > QuestionablePendingRequestCount ||
        bestInstance.HealthTrackingRequestStartTime < now - QuestionableRequestLatency;
    if (isQuestionable &&
        bestInstance.NextAllowedPingTime <= now &&
        Interlocked.CompareExchange(ref bestInstance.PingLock, 1, 0) == 0)
    {
        bool isHealthy;
        try
        {
            isHealthy = await this.PingAsync(siteName, bestInstance, defaultHostName);
        }
        finally
        {
            bestInstance.NextAllowedPingTime = now + PingInterval;
            bestInstance.PingLock = 0;
        }

        if (!isHealthy)
        {
            site.Endpoints.SetIsBusy(bestInstance);

            // Serialize scale-out requests to avoid overloading our infrastructure. Each serialized scale
            // request asks for one more instance than the previous request, which can result in rapid scale-out.
            using (var result = await site.ScaleLock.LockAsync(MaxLockTime))
            {
                if (result.TimedOut)
                {
                    // Scale operations are unhealthy.
                    Debug.WriteLine("Timed-out waiting to start a scale operation after unhealthy ping.");
                    AntaresEventProvider.EventWriteLBHttpDispatchSiteWarningMessage(
                        site.Name,
                        "DispatchRequest",
                        string.Format(
                            "Timed-out ({0}ms) waiting to start a scale operation after unhealthy ping to {1}.",
                            MaxLockTime,
                            bestInstance.IPAddress));
                    return bestInstance.IPAddress;
                }

                bestInstance = await this.ScaleOutAsync(site, site.Endpoints.Count + 1, bestInstance);
                if (bestInstance.IsBusy)
                {
                    // Scale-out failed.
                    Debug.WriteLine("Best worker is still busy after a ping-initiated scale.");
                    AntaresEventProvider.EventWriteLBHttpDispatchSiteWarningMessage(
                        site.Name,
                        "DispatchRequest",
                        string.Format(
                            "Best worker {0} is still busy after a ping-initiated scale.",
                            bestInstance.IPAddress));
                }

                return bestInstance.IPAddress;
            }
        }
    }

    return bestInstance.IPAddress;
}
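// Usage sketch (hypothetical, not part of the dispatcher contract): a frontend component would
// call DispatchRequestAsync once per incoming request and then proxy the request to the returned
// worker IP. In the burst-mode timeout path above, the original comments note the caller should
// respond with 429 rather than add more load to the role. HandleIncomingRequestSketchAsync is an
// assumed name used only for illustration.
private async Task<string> HandleIncomingRequestSketchAsync(
    string siteName,
    string defaultHostName,
    string[] knownWorkers)
{
    string requestId = Guid.NewGuid().ToString();
    string workerIp = await this.DispatchRequestAsync(requestId, siteName, defaultHostName, knownWorkers);

    // The caller would now forward the request to workerIp; the proxying itself is outside this excerpt.
    Debug.WriteLine("Forwarding request {0} for site {1} to worker {2}", requestId, siteName, workerIp);
    return workerIp;
}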
public SortedEndpointCollection(SiteMetadata site)
{
    this.site = site;
    this.innerList = new LinkedList<InstanceEndpoint>();
}
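// NOTE: the rest of SortedEndpointCollection (GetOrAdd, ReserveBestInstance, SetIsBusy,
// ClearBusyStatus, RemoveStaleEntries, Count) is not part of this excerpt. The sketch below is a
// rough illustration of what GetOrAdd could look like, assuming the inner linked list is guarded
// by a lock and that InstanceEndpoint has an IP-address constructor; the real ordering and
// locking rules are not shown here.
public InstanceEndpoint GetOrAdd(string ipAddress)
{
    lock (this.innerList)
    {
        // Linear scan for illustration; the real lookup strategy is not shown in this excerpt.
        for (LinkedListNode<InstanceEndpoint> node = this.innerList.First; node != null; node = node.Next)
        {
            if (string.Equals(node.Value.IPAddress, ipAddress, StringComparison.OrdinalIgnoreCase))
            {
                return node.Value;
            }
        }

        // Not found: add a new entry. Where it belongs relative to existing entries depends on
        // the collection's sort order, which is defined elsewhere in the real implementation.
        var endpoint = new InstanceEndpoint(ipAddress);
        this.innerList.AddLast(endpoint);
        return endpoint;
    }
}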