/// <summary>
/// Reserves the instance at the head of the list by incrementing its pending request count and,
/// when <paramref name="previousEndpoint"/> is supplied, releases the reservation held on it.
/// </summary>
/// <param name="previousEndpoint">
/// An endpoint reserved by an earlier call whose pending request count should be decremented,
/// or null when there is nothing to release.
/// </param>
/// <returns>The endpoint that was reserved.</returns>
/// <exception cref="System.InvalidOperationException">The list contains no instances.</exception>
public InstanceEndpoint ReserveBestInstance(InstanceEndpoint previousEndpoint)
{
    lock (this.innerList)
    {
        // The first instance in the list is always the best.
        LinkedListNode<InstanceEndpoint> reserved = this.innerList.First;
        if (reserved == null)
        {
            // LinkedList<T>.First is null for an empty list; fail with a clear message
            // instead of a NullReferenceException on the next line.
            throw new System.InvalidOperationException(
                "Cannot reserve an instance because the endpoint list is empty.");
        }

        reserved.Value.PendingRequestCount++;

        // With a single instance there is nothing to reorder.
        bool canReorder = this.innerList.Count > 1;
        if (canReorder)
        {
            // The list is always sorted and we assume that in the common case incrementing
            // the pending request count will naturally move the item to the back of the
            // list. As a result, this operation is expected to be O(1) in most cases.
            this.UpdatePositionFromBack(reserved);
        }

        if (previousEndpoint != null)
        {
            // Always release the earlier reservation, even when the list holds a single
            // instance; otherwise that instance keeps an extra pending request forever.
            previousEndpoint.PendingRequestCount--;

            if (canReorder)
            {
                // This was previously the best, but was downgraded temporarily.
                // It will likely need to be moved toward the front.
                this.UpdatePositionFromFront(previousEndpoint.Node);
            }
        }

        return reserved.Value;
    }
}
/// <summary>
/// Looks up the endpoint with the given IP address while holding the list lock.
/// </summary>
/// <param name="ipAddress">The IP address to search for.</param>
/// <param name="result">Receives the matching endpoint, or null when none matches.</param>
/// <returns>true when a matching endpoint was found; otherwise false.</returns>
public bool TryGetValue(string ipAddress, out InstanceEndpoint result)
{
    bool found;

    lock (this.innerList)
    {
        found = this.TryGetValueInternal(ipAddress, out result);
    }

    return found;
}
/// <summary>
/// Removes every endpoint whose last refresh timestamp is earlier than
/// <paramref name="expirationTime"/>, tracing each removal.
/// </summary>
/// <param name="expirationTime">Entries refreshed before this tick value are removed.</param>
/// <returns>The number of entries removed.</returns>
public int RemoveStaleEntries(uint expirationTime)
{
    lock (this.innerList)
    {
        int staleCount = 0;
        LinkedListNode<InstanceEndpoint> successor;

        for (LinkedListNode<InstanceEndpoint> node = this.innerList.First; node != null; node = successor)
        {
            // Capture the successor first: removing a node detaches it from the list.
            successor = node.Next;

            InstanceEndpoint candidate = node.Value;
            if (candidate.LastRefreshTimestamp >= expirationTime)
            {
                continue;
            }

            this.innerList.Remove(node);
            staleCount++;

            Debug.WriteLine("UpdateWorkerList: Removed worker {0} from the list.", (object)candidate.IPAddress);
            AntaresEventProvider.EventWriteLBHttpDispatchEndpointInfoMessage(
                this.site.Name,
                candidate.IPAddress,
                "UpdateWorkerList",
                "Removing worker from routing list");
        }

        return staleCount;
    }
}
/// <summary>
/// Pings an instance and reports whether the response indicates a healthy endpoint.
/// </summary>
/// <param name="siteName">The site name forwarded to the ping.</param>
/// <param name="instance">The instance whose IP address is pinged.</param>
/// <param name="defaultHostName">The host name forwarded to the ping.</param>
/// <returns>
/// true when a status code was received and it is below <see cref="HttpStatusCode.BadRequest"/>;
/// otherwise false.
/// </returns>
private async Task<bool> PingAsync(string siteName, InstanceEndpoint instance, string defaultHostName)
{
    HttpStatusCode? pingStatus = await this.OnPingAsync(siteName, instance.IPAddress, defaultHostName);

    // A null status code is interpreted to be equivalent to a timeout or an error sending the ping.
    if (!pingStatus.HasValue)
    {
        return false;
    }

    // Status codes of 400 (BadRequest) and above mean the endpoint is unhealthy.
    return pingStatus.Value < HttpStatusCode.BadRequest;
}
/// <summary>
/// Merges the reported worker list into the site's cached endpoint list, refreshes per-worker
/// state (refresh timestamp, busy status, metrics tracing), and periodically ages out workers
/// that have not been refreshed.
/// </summary>
/// <param name="site">The site whose endpoint list is updated.</param>
/// <param name="currentWorkerSet">
/// IP addresses of the workers currently reported for the site. A null array is treated as
/// "no workers reported"; the age-out check and burst-mode update still run.
/// </param>
/// <returns>The number of endpoints tracked for the site after the update.</returns>
private int UpdateWorkerList(SiteMetadata site, string[] currentWorkerSet)
{
    uint now = GetCurrentTickCount();

    // Union the cached list of known workers with the list provided by the FE. The list from the FE
    // may be stale, so we have to accept new workers generously and carefully age out stale workers.
    if (currentWorkerSet != null)
    {
        foreach (string ipAddress in currentWorkerSet)
        {
            InstanceEndpoint instance = site.Endpoints.GetOrAdd(ipAddress);
            instance.LastRefreshTimestamp = now;

            // Clear the busy status for workers whose busy status is ready to expire.
            if (instance.IsBusy && instance.IsBusyUntil < now)
            {
                site.Endpoints.ClearBusyStatus(instance);
            }

            // Periodically trace the health statistics of the workers.
            if (instance.NextMetricsTraceTime == 0)
            {
                // First time this worker is seen: only schedule the first trace.
                instance.NextMetricsTraceTime = now + WorkerMetricsTraceInterval;
            }
            else if (now > instance.NextMetricsTraceTime && Monitor.TryEnter(instance))
            {
                // TryEnter never blocks: if another thread is already tracing this worker, skip it.
                try
                {
                    Debug.WriteLine("UpdateWorkerList: Worker metrics for site {0}: {1}", site.Name, instance.ToString());
                    AntaresEventProvider.EventWriteLBHttpDispatchEndpointMetrics(
                        site.Name,
                        instance.IPAddress,
                        instance.PendingRequestCount,
                        instance.IsBusy,
                        instance.Weight);
                    instance.NextMetricsTraceTime = now + WorkerMetricsTraceInterval;
                }
                finally
                {
                    Monitor.Exit(instance);
                }
            }
        }
    }

    // This is a performance-sensitive code path so we throttle the age-out logic to avoid using excess CPU.
    if (now >= site.NextWorkerExpirationTick && Monitor.TryEnter(site))
    {
        try
        {
            // Workers last refreshed before the previous deadline are considered stale.
            site.Endpoints.RemoveStaleEntries(site.NextWorkerExpirationTick);

            // Schedule the next worker expiration check.
            site.NextWorkerExpirationTick = now + WorkerExpirationCheckInterval;
        }
        finally
        {
            Monitor.Exit(site);
        }
    }

    site.IsBurstMode = site.Endpoints.Count < this.burstLimit;
    return site.Endpoints.Count;
}
/// <summary>
/// Scans the list for the endpoint with the given IP address.
/// Caller must be holding the lock.
/// </summary>
/// <param name="ipAddress">The IP address to search for.</param>
/// <param name="result">Receives the first matching endpoint, or null when none matches.</param>
/// <returns>true when a matching endpoint was found; otherwise false.</returns>
private bool TryGetValueInternal(string ipAddress, out InstanceEndpoint result)
{
    result = null;
    bool found = false;

    foreach (InstanceEndpoint candidate in this.innerList)
    {
        if (candidate.IPAddress == ipAddress)
        {
            result = candidate;
            found = true;
            break;
        }
    }

    return found;
}
/// <summary>
/// Records that a request dispatched to <paramref name="instance"/> has finished by
/// decrementing its pending request count and repositioning it in the list.
/// </summary>
/// <param name="instance">The instance that finished handling a request.</param>
public void OnRequestCompleted(InstanceEndpoint instance)
{
    lock (this.innerList)
    {
        instance.PendingRequestCount -= 1;

        // A list with at most one entry has nothing to reorder.
        if (this.innerList.Count <= 1)
        {
            return;
        }

        // Its availability has increased, so move the instance towards the front of the list.
        this.UpdatePositionFromFront(instance.Node);
    }
}
/// <summary>
/// Flags an instance as busy for <c>SiteRequestDispatcher.BusyStatusDuration</c> ticks,
/// traces the change, and repositions the instance in the list.
/// </summary>
/// <param name="instance">The instance to flag as busy.</param>
internal void SetIsBusy(InstanceEndpoint instance)
{
    lock (this.innerList)
    {
        uint busyDuration = SiteRequestDispatcher.BusyStatusDuration;

        instance.IsBusyUntil = HttpScaleEnvironment.TickCount + busyDuration;
        instance.IsBusy = true;

        // The weight is read only after the busy flag has been set, so the traces show the new weight.
        string eventText = string.Format("Set instance busy for {0}ms. New weight: {1}", busyDuration, instance.Weight);
        Debug.WriteLine(
            "Set instance {0} as busy for the next {1}ms. New weight: {2}",
            instance.IPAddress,
            busyDuration,
            instance.Weight);
        AntaresEventProvider.EventWriteLBHttpDispatchEndpointInfoMessage(
            site.Name,
            instance.IPAddress,
            "SetIsBusy",
            eventText);

        // Busy adds a large amount of weight to an instance, moving it towards the back.
        this.UpdatePositionFromBack(instance.Node);
    }
}
/// <summary>
/// Clears the busy flag on an instance, repositions it in the list, and traces the change.
/// </summary>
/// <param name="instance">The instance whose busy status is removed.</param>
internal void ClearBusyStatus(InstanceEndpoint instance)
{
    lock (this.innerList)
    {
        instance.IsBusy = false;

        // Dropping the busy flag gives the instance a big priority boost. Where exactly it
        // lands is not known, but since most instances have similar status it is assumed
        // to end up near the front of the priority list.
        this.UpdatePositionFromFront(instance.Node);

        string eventText = "Removing busy status. New weight: " + instance.Weight;
        Debug.WriteLine("Removed busy status from {0}. New weight: {1}", instance.IPAddress, instance.Weight);
        AntaresEventProvider.EventWriteLBHttpDispatchEndpointInfoMessage(
            site.Name,
            instance.IPAddress,
            "ClearBusyStatus",
            eventText);
    }
}
/// <summary>
/// Requests a scale-out to <paramref name="targetInstanceCount"/> instances, merges any returned
/// worker list into the site's endpoints, and reserves the best instance.
/// Caller must be holding an async lock.
/// </summary>
/// <param name="site">The site being scaled out.</param>
/// <param name="targetInstanceCount">The instance count requested from the scale-out operation.</param>
/// <param name="previousEndpoint">The endpoint reservation to release when the new one is taken.</param>
/// <returns>The endpoint reserved after the scale-out attempt.</returns>
private async Task<InstanceEndpoint> ScaleOutAsync(
    SiteMetadata site,
    int targetInstanceCount,
    InstanceEndpoint previousEndpoint)
{
    Debug.WriteLine("Attempting to scale out to " + targetInstanceCount);

    string eventText = string.Format("Attempting to scale out to {0} instances.", targetInstanceCount);
    AntaresEventProvider.EventWriteLBHttpDispatchSiteInfoMessage(site.Name, "ScaleOut", eventText);

    string[] scaledWorkers = await this.OnScaleOutAsync(site.Name, targetInstanceCount);
    bool receivedWorkerList = scaledWorkers != null;
    if (receivedWorkerList)
    {
        this.UpdateWorkerList(site, scaledWorkers);
    }

    // It is expected in most cases that this will return the newly added worker (if any).
    InstanceEndpoint reservation = site.Endpoints.ReserveBestInstance(previousEndpoint);
    return reservation;
}
/// <summary>
/// Returns the endpoint with the given IP address, creating it and inserting it at the
/// front of the list when it is not already present.
/// </summary>
/// <param name="ipAddress">The IP address of the worker.</param>
/// <returns>The existing or newly created endpoint.</returns>
public InstanceEndpoint GetOrAdd(string ipAddress)
{
    InstanceEndpoint added;

    lock (this.innerList)
    {
        InstanceEndpoint known;
        bool alreadyTracked = this.TryGetValueInternal(ipAddress, out known);
        if (alreadyTracked)
        {
            return known;
        }

        added = new InstanceEndpoint(ipAddress);
        added.Node = this.innerList.AddFirst(added);
    }

    // Tracing for a newly added worker happens after the lock has been released.
    Debug.WriteLine("UpdateWorkerList: Added worker {0}.", (object)ipAddress);
    AntaresEventProvider.EventWriteLBHttpDispatchEndpointInfoMessage(
        this.site.Name,
        ipAddress,
        "UpdateWorkerList",
        "Added worker");

    return added;
}
/// <summary>
/// Chooses the worker that should handle a request for a site and returns its IP address,
/// starting a scale-out when the best available worker is occupied, busy, or fails a health ping.
/// </summary>
/// <param name="requestId">Identifier of the request; may be picked as the instance's health-tracking request.</param>
/// <param name="siteName">Name of the site the request targets.</param>
/// <param name="defaultHostName">Host name forwarded to the health ping.</param>
/// <param name="knownWorkers">Worker IP addresses currently reported for the site.</param>
/// <returns>The IP address of the selected worker.</returns>
public virtual async Task<string> DispatchRequestAsync(
    string requestId,
    string siteName,
    string defaultHostName,
    string[] knownWorkers)
{
    // Use the factory's own parameter rather than capturing siteName, so that this
    // hot path does not allocate a closure on every call.
    SiteMetadata site = this.knownSites.GetOrAdd(siteName, name => new SiteMetadata(name));
    this.UpdateWorkerList(site, knownWorkers);

    InstanceEndpoint bestInstance = site.Endpoints.ReserveBestInstance(null);
    if (site.IsBurstMode)
    {
        if (bestInstance.PendingRequestCount <= 1)
        {
            // Endpoint is idle - choose it.
            return bestInstance.IPAddress;
        }
        else
        {
            // All endpoints are occupied; try to scale out.
            using (var scaleLock = await site.ScaleLock.LockAsync(MaxLockTime))
            {
                if (scaleLock.TimedOut)
                {
                    // NOTE(review): the original comment here said the caller should return 429
                    // to avoid overloading the current role, but this path falls back to the
                    // current best instance - confirm which behavior is intended.
                    Debug.WriteLine("Timed-out waiting to start a scale operation in burst mode.");
                    AntaresEventProvider.EventWriteLBHttpDispatchSiteWarningMessage(
                        site.Name,
                        "DispatchRequest",
                        string.Format("Timed-out ({0}ms) waiting to start a scale operation in burst mode. Reverting to {1}.", MaxLockTime, bestInstance.IPAddress));
                    return bestInstance.IPAddress;
                }

                // Re-check: burst mode may have been left while waiting for the lock, in which
                // case execution falls through to the regular selection logic below.
                if (site.IsBurstMode)
                {
                    // No instances are idle, scale-out and send the request to the new instance.
                    // Note that there will be a cold-start penalty for this request, but it's
                    // been decided that this is better than sending a request to an existing
                    // but potentially CPU-pegged instance.
                    int targetInstanceCount = Math.Min(this.burstLimit, site.Endpoints.Count + 1);
                    InstanceEndpoint newInstance = await this.ScaleOutAsync(
                        site,
                        targetInstanceCount,
                        bestInstance);

                    return newInstance.IPAddress;
                }
            }
        }
    }

    // Pick one request at a time to use for latency tracking.
    uint now = GetCurrentTickCount();
    if (Interlocked.CompareExchange(ref bestInstance.HealthTrackingRequestId, requestId, null) == null)
    {
        bestInstance.HealthTrackingRequestStartTime = now;
    }

    if (bestInstance.PendingRequestCount <= 1)
    {
        // This is an idle worker (the current request is the pending one).
        return bestInstance.IPAddress;
    }

    if (bestInstance.IsBusy)
    {
        using (var result = await site.ScaleLock.LockAsync(MaxLockTime))
        {
            if (result.TimedOut)
            {
                // Scale operations are unhealthy.
                Debug.WriteLine("Timed-out waiting to start a scale operation on busy instance.");
                AntaresEventProvider.EventWriteLBHttpDispatchSiteWarningMessage(
                    site.Name,
                    "DispatchRequest",
                    string.Format("Timed-out ({0}ms) waiting to start a scale operation on a busy instance {1}.", MaxLockTime, bestInstance.IPAddress));
                return bestInstance.IPAddress;
            }

            if (!bestInstance.IsBusy)
            {
                // The instance became healthy while we were waiting.
                return bestInstance.IPAddress;
            }

            // Serialize scale-out requests to avoid overloading our infrastructure. Each serialized scale
            // request will request one more instance from the previous request which can result in rapid-scale out.
            bestInstance = await this.ScaleOutAsync(site, site.Endpoints.Count + 1, bestInstance);
            if (bestInstance.IsBusy)
            {
                // Scale-out failed.
                Debug.WriteLine("Best instance is still busy after a scale operation.");
                AntaresEventProvider.EventWriteLBHttpDispatchSiteWarningMessage(
                    site.Name,
                    "DispatchRequest",
                    string.Format("Best instance {0} is still busy after a scale operation.", bestInstance.IPAddress));
            }

            return bestInstance.IPAddress;
        }
    }

    // An instance is questionable when it has many pending requests or when its tracked
    // request started longer ago than the questionable latency threshold.
    bool isQuestionable = bestInstance.PendingRequestCount > QuestionablePendingRequestCount ||
                          bestInstance.HealthTrackingRequestStartTime < now - QuestionableRequestLatency;

    // PingLock lets only one request at a time ping a given instance.
    if (isQuestionable &&
        bestInstance.NextAllowedPingTime <= now &&
        Interlocked.CompareExchange(ref bestInstance.PingLock, 1, 0) == 0)
    {
        bool isHealthy;
        try
        {
            isHealthy = await this.PingAsync(siteName, bestInstance, defaultHostName);
        }
        finally
        {
            // Always schedule the next allowed ping and release the ping lock, even if the ping throws.
            bestInstance.NextAllowedPingTime = now + PingInterval;
            bestInstance.PingLock = 0;
        }

        if (!isHealthy)
        {
            site.Endpoints.SetIsBusy(bestInstance);

            // Serialize scale-out requests to avoid overloading our infrastructure. Each serialized scale
            // request will request one more instance from the previous request which can result in rapid-scale out.
            using (var result = await site.ScaleLock.LockAsync(MaxLockTime))
            {
                if (result.TimedOut)
                {
                    // Scale operations are unhealthy.
                    Debug.WriteLine("Timed-out waiting to start a scale operation after unhealthy ping.");
                    AntaresEventProvider.EventWriteLBHttpDispatchSiteWarningMessage(
                        site.Name,
                        "DispatchRequest",
                        string.Format("Timed-out ({0}ms) waiting to start a scale operation after unhealthy ping to {1}.", MaxLockTime, bestInstance.IPAddress));
                    return bestInstance.IPAddress;
                }

                bestInstance = await this.ScaleOutAsync(site, site.Endpoints.Count + 1, bestInstance);
                if (bestInstance.IsBusy)
                {
                    // Scale-out failed.
                    Debug.WriteLine("Best worker is still busy after a ping-initiated scale.");
                    AntaresEventProvider.EventWriteLBHttpDispatchSiteWarningMessage(
                        site.Name,
                        "DispatchRequest",
                        string.Format("Best worker {0} is still busy after a ping-initiated scale.", bestInstance.IPAddress));
                }

                return bestInstance.IPAddress;
            }
        }
    }

    return bestInstance.IPAddress;
}