Example #1
0
        /// <summary>
        /// Reserves the endpoint at the head of the list for a new request and, when a previous
        /// reservation is supplied, releases that reservation in the same critical section.
        /// </summary>
        /// <param name="previousEndpoint">
        /// An endpoint whose earlier reservation is being handed back, or <c>null</c> when there
        /// is nothing to release.
        /// </param>
        /// <returns>The endpoint that was at the head of the list when the lock was taken.</returns>
        /// <remarks>
        /// The head node is dereferenced unconditionally, so the list must contain at least one
        /// endpoint when this method is called.
        /// </remarks>
        public InstanceEndpoint ReserveBestInstance(InstanceEndpoint previousEndpoint)
        {
            lock (this.innerList)
            {
                // The first instance in the list is always the best.
                LinkedListNode<InstanceEndpoint> reserved = this.innerList.First;
                reserved.Value.PendingRequestCount++;

                if (this.innerList.Count > 1)
                {
                    // The list is always sorted and we assume that in the common case incrementing
                    // the pending request count will naturally move the item to the back of the
                    // list. As a result, this operation is expected to be O(1) in most cases.
                    this.UpdatePositionFromBack(reserved);

                    if (previousEndpoint != null)
                    {
                        previousEndpoint.PendingRequestCount--;

                        // This was previously the best, but was downgraded temporarily.
                        // Now that its reservation has been released it will likely need
                        // to be moved toward the front.
                        this.UpdatePositionFromFront(previousEndpoint.Node);
                    }
                }
                else if (previousEndpoint != null)
                {
                    // With a single endpoint there is nothing to reorder, but the earlier
                    // reservation must still be released. Otherwise the pending request count
                    // is incremented twice for one request and never returns to zero.
                    previousEndpoint.PendingRequestCount--;
                }

                return reserved.Value;
            }
        }
Example #2
0
 /// <summary>
 /// Looks up the endpoint registered under <paramref name="ipAddress"/> while holding the list lock.
 /// </summary>
 /// <param name="ipAddress">Address of the endpoint to find.</param>
 /// <param name="result">Receives the matching endpoint, or <c>null</c> when none matches.</param>
 /// <returns><c>true</c> when a matching endpoint was found; otherwise <c>false</c>.</returns>
 public bool TryGetValue(string ipAddress, out InstanceEndpoint result)
 {
     lock (this.innerList)
     {
         // The internal lookup requires the caller to hold the lock, which we do here.
         bool found = this.TryGetValueInternal(ipAddress, out result);
         return found;
     }
 }
Example #3
0
        /// <summary>
        /// Removes every endpoint whose <c>LastRefreshTimestamp</c> is strictly earlier than
        /// <paramref name="expirationTime"/>, tracing each removal.
        /// </summary>
        /// <param name="expirationTime">Cut-off timestamp; endpoints refreshed before it are dropped.</param>
        /// <returns>The number of endpoints removed from the list.</returns>
        public int RemoveStaleEntries(uint expirationTime)
        {
            int evicted = 0;

            lock (this.innerList)
            {
                LinkedListNode<InstanceEndpoint> cursor = this.innerList.First;
                while (cursor != null)
                {
                    // Advance the cursor before touching the list so that removing the
                    // candidate node cannot break the traversal.
                    LinkedListNode<InstanceEndpoint> candidate = cursor;
                    cursor = cursor.Next;

                    InstanceEndpoint endpoint = candidate.Value;
                    if (endpoint.LastRefreshTimestamp >= expirationTime)
                    {
                        // Refreshed recently enough; keep it.
                        continue;
                    }

                    this.innerList.Remove(candidate);
                    evicted++;

                    Debug.WriteLine("UpdateWorkerList: Removed worker {0} from the list.", (object)endpoint.IPAddress);
                    AntaresEventProvider.EventWriteLBHttpDispatchEndpointInfoMessage(site.Name, endpoint.IPAddress, "UpdateWorkerList", "Removing worker from routing list");
                }
            }

            return evicted;
        }
        /// <summary>
        /// Pings an endpoint through <c>OnPingAsync</c> and classifies the response as healthy or not.
        /// </summary>
        /// <param name="siteName">Name of the site, forwarded to the ping.</param>
        /// <param name="instance">Endpoint whose IP address is pinged.</param>
        /// <param name="defaultHostName">Host name forwarded to the ping.</param>
        /// <returns>
        /// <c>true</c> when a status code was returned and it is below <c>HttpStatusCode.BadRequest</c>;
        /// <c>false</c> when no status code was returned or the status code is
        /// <c>HttpStatusCode.BadRequest</c> or above.
        /// </returns>
        private async Task<bool> PingAsync(string siteName, InstanceEndpoint instance, string defaultHostName)
        {
            HttpStatusCode? pingStatus = await this.OnPingAsync(siteName, instance.IPAddress, defaultHostName);

            // A null status code is interpreted to be equivalent to a timeout or an error sending the ping.
            if (pingStatus == null)
            {
                return false;
            }

            // Any status code below 400 counts as healthy; 400 and above counts as unhealthy.
            return pingStatus.Value < HttpStatusCode.BadRequest;
        }
        /// <summary>
        /// Merges <paramref name="currentWorkerSet"/> into the site's endpoint list, refreshes each
        /// listed worker's timestamp, clears expired busy flags, periodically traces worker metrics,
        /// and (throttled) ages out workers that have not been refreshed.
        /// </summary>
        /// <param name="site">Site whose endpoint list is updated; also receives the new burst-mode flag.</param>
        /// <param name="currentWorkerSet">IP addresses of the workers currently reported for the site.</param>
        /// <returns>The number of endpoints in the site's list after the update.</returns>
        private int UpdateWorkerList(SiteMetadata site, string[] currentWorkerSet)
        {
            uint now = GetCurrentTickCount();

            // Union the cached list of known workers with the list provided by the FE. The list from the FE
            // may be stale, so we have to accept new workers generously and carefully age out stale workers.
            foreach (string workerAddress in currentWorkerSet)
            {
                InstanceEndpoint endpoint = site.Endpoints.GetOrAdd(workerAddress);
                endpoint.LastRefreshTimestamp = now;

                // Clear the busy status for workers whose busy status is ready to expire.
                bool busyHasExpired = endpoint.IsBusy && endpoint.IsBusyUntil < now;
                if (busyHasExpired)
                {
                    site.Endpoints.ClearBusyStatus(endpoint);
                }

                // Periodically trace the health statistics of the workers.
                if (endpoint.NextMetricsTraceTime == 0)
                {
                    // First sighting: schedule the first trace instead of tracing right away.
                    endpoint.NextMetricsTraceTime = now + WorkerMetricsTraceInterval;
                    continue;
                }

                // Skip when the trace is not due yet, or when another thread currently
                // holds this endpoint's monitor.
                if (now <= endpoint.NextMetricsTraceTime || !Monitor.TryEnter(endpoint))
                {
                    continue;
                }

                try
                {
                    Debug.WriteLine("UpdateWorkerList: Worker metrics for site {0}: {1}", site.Name, endpoint.ToString());
                    AntaresEventProvider.EventWriteLBHttpDispatchEndpointMetrics(site.Name, endpoint.IPAddress, endpoint.PendingRequestCount, endpoint.IsBusy, endpoint.Weight);
                    endpoint.NextMetricsTraceTime = now + WorkerMetricsTraceInterval;
                }
                finally
                {
                    Monitor.Exit(endpoint);
                }
            }

            // This is a performance-sensitive code path so we throttle the age-out logic to avoid using excess CPU.
            bool expirationCheckDue = now >= site.NextWorkerExpirationTick;
            if (expirationCheckDue && Monitor.TryEnter(site))
            {
                try
                {
                    // Workers not refreshed since the previously scheduled check are considered stale.
                    site.Endpoints.RemoveStaleEntries(site.NextWorkerExpirationTick);

                    // Schedule the next worker expiration check.
                    site.NextWorkerExpirationTick = now + WorkerExpirationCheckInterval;
                }
                finally
                {
                    Monitor.Exit(site);
                }
            }

            // Burst mode applies while the site has fewer endpoints than the burst limit.
            site.IsBurstMode = site.Endpoints.Count < this.burstLimit;
            return site.Endpoints.Count;
        }
Example #6
0
        /// <summary>
        /// Scans the list for an endpoint whose IP address equals <paramref name="ipAddress"/>.
        /// </summary>
        /// <param name="ipAddress">Address to search for.</param>
        /// <param name="result">Receives the first matching endpoint, or <c>null</c> when none matches.</param>
        /// <returns><c>true</c> when a match was found; otherwise <c>false</c>.</returns>
        // Caller must be holding the lock
        private bool TryGetValueInternal(string ipAddress, out InstanceEndpoint result)
        {
            foreach (InstanceEndpoint candidate in this.innerList)
            {
                if (candidate.IPAddress != ipAddress)
                {
                    continue;
                }

                result = candidate;
                return true;
            }

            // Walked the whole list without finding the address.
            result = null;
            return false;
        }
Example #7
0
        /// <summary>
        /// Records that a request dispatched to <paramref name="instance"/> has finished by
        /// decrementing its pending request count and re-sorting it within the list.
        /// </summary>
        /// <param name="instance">Endpoint that finished handling a request.</param>
        public void OnRequestCompleted(InstanceEndpoint instance)
        {
            lock (this.innerList)
            {
                instance.PendingRequestCount--;

                // With zero or one entry there is nothing to reorder.
                if (this.innerList.Count <= 1)
                {
                    return;
                }

                // Availability has increased, so the instance may now belong
                // closer to the front of the list.
                this.UpdatePositionFromFront(instance.Node);
            }
        }
Example #8
0
        /// <summary>
        /// Flags <paramref name="instance"/> as busy for <c>SiteRequestDispatcher.BusyStatusDuration</c>
        /// ticks, traces the change, and re-sorts the instance within the list.
        /// </summary>
        /// <param name="instance">Endpoint to mark as busy.</param>
        internal void SetIsBusy(InstanceEndpoint instance)
        {
            lock (this.innerList)
            {
                uint busyWindow = SiteRequestDispatcher.BusyStatusDuration;

                // Record the expiry first, then raise the flag.
                instance.IsBusyUntil = HttpScaleEnvironment.TickCount + busyWindow;
                instance.IsBusy = true;

                Debug.WriteLine("Set instance {0} as busy for the next {1}ms. New weight: {2}", instance.IPAddress, busyWindow, instance.Weight);

                string detail = string.Format("Set instance busy for {0}ms. New weight: {1}", busyWindow, instance.Weight);
                AntaresEventProvider.EventWriteLBHttpDispatchEndpointInfoMessage(site.Name, instance.IPAddress, "SetIsBusy", detail);

                // Busy adds a large amount of weight to an instance, so it is
                // repositioned starting from the back of the list.
                this.UpdatePositionFromBack(instance.Node);
            }
        }
Example #9
0
        /// <summary>
        /// Clears the busy flag on <paramref name="instance"/>, re-sorts it within the list,
        /// and traces the change.
        /// </summary>
        /// <param name="instance">Endpoint whose busy flag is cleared.</param>
        internal void ClearBusyStatus(InstanceEndpoint instance)
        {
            lock (this.innerList)
            {
                instance.IsBusy = false;

                // Dropping the busy flag gives the instance a big priority boost. Where exactly
                // it lands is unknown, but since most instances have similar status it is
                // assumed to end up near the front, so the search starts there.
                this.UpdatePositionFromFront(instance.Node);

                Debug.WriteLine("Removed busy status from {0}. New weight: {1}", instance.IPAddress, instance.Weight);

                string detail = "Removing busy status. New weight: " + instance.Weight;
                AntaresEventProvider.EventWriteLBHttpDispatchEndpointInfoMessage(site.Name, instance.IPAddress, "ClearBusyStatus", detail);
            }
        }
        /// <summary>
        /// Requests a scale-out to <paramref name="targetInstanceCount"/> instances, merges any
        /// returned workers into the site's endpoint list, and reserves the best endpoint.
        /// </summary>
        /// <param name="site">Site being scaled out.</param>
        /// <param name="targetInstanceCount">Number of instances requested from the scale operation.</param>
        /// <param name="previousEndpoint">Endpoint whose earlier reservation is released when the new one is taken.</param>
        /// <returns>The endpoint reserved after the scale attempt.</returns>
        // Caller must be holding an async lock
        private async Task<InstanceEndpoint> ScaleOutAsync(
            SiteMetadata site,
            int targetInstanceCount,
            InstanceEndpoint previousEndpoint)
        {
            Debug.WriteLine("Attempting to scale out to " + targetInstanceCount);
            AntaresEventProvider.EventWriteLBHttpDispatchSiteInfoMessage(
                site.Name,
                "ScaleOut",
                string.Format("Attempting to scale out to {0} instances.", targetInstanceCount));

            string[] scaledWorkers = await this.OnScaleOutAsync(site.Name, targetInstanceCount);

            // A null result means there is no worker list to merge; the current list stays as is.
            if (scaledWorkers != null)
            {
                this.UpdateWorkerList(site, scaledWorkers);
            }

            // It is expected in most cases that this will return the newly added worker (if any).
            InstanceEndpoint replacement = site.Endpoints.ReserveBestInstance(previousEndpoint);
            return replacement;
        }
Example #11
0
        /// <summary>
        /// Returns the endpoint registered under <paramref name="ipAddress"/>, creating it and
        /// inserting it at the front of the list when it is not yet known.
        /// </summary>
        /// <param name="ipAddress">Address of the worker.</param>
        /// <returns>The existing endpoint, or the newly created one.</returns>
        public InstanceEndpoint GetOrAdd(string ipAddress)
        {
            InstanceEndpoint created;

            lock (this.innerList)
            {
                InstanceEndpoint known;
                bool alreadyTracked = this.TryGetValueInternal(ipAddress, out known);
                if (alreadyTracked)
                {
                    return known;
                }

                // New endpoints are placed at the head of the list, and each endpoint
                // keeps a reference to its own list node.
                created = new InstanceEndpoint(ipAddress);
                created.Node = this.innerList.AddFirst(created);
            }

            // Tracing happens after the lock has been released.
            Debug.WriteLine("UpdateWorkerList: Added worker {0}.", (object)ipAddress);
            AntaresEventProvider.EventWriteLBHttpDispatchEndpointInfoMessage(this.site.Name, ipAddress, "UpdateWorkerList", "Added worker");
            return created;
        }
        /// <summary>
        /// Chooses the worker that should handle a request for the given site and returns its IP address.
        /// The method refreshes the site's worker list, reserves the best endpoint, and may trigger a
        /// scale-out (serialized through the site's scale lock) when the chosen endpoint is occupied,
        /// busy, or fails a health ping.
        /// </summary>
        /// <param name="requestId">Identifier of the request; may be recorded as the endpoint's health-tracking request.</param>
        /// <param name="siteName">Name of the site the request targets.</param>
        /// <param name="defaultHostName">Host name forwarded to the health ping.</param>
        /// <param name="knownWorkers">IP addresses of the workers currently known for the site.</param>
        /// <returns>The IP address of the endpoint reserved for the request.</returns>
        public virtual async Task <string> DispatchRequestAsync(
            string requestId,
            string siteName,
            string defaultHostName,
            string[] knownWorkers)
        {
            // Look up (or lazily create) the metadata for this site. The factory ignores its
            // `name` argument and uses the captured siteName instead.
            SiteMetadata site = this.knownSites.GetOrAdd(siteName, name => new SiteMetadata(siteName));

            this.UpdateWorkerList(site, knownWorkers);

            // Passing null: there is no earlier reservation to release for this request.
            InstanceEndpoint bestInstance = site.Endpoints.ReserveBestInstance(null);

            if (site.IsBurstMode)
            {
                // The reservation above already counted this request, so a count of 1 means
                // no other request is pending on the endpoint.
                if (bestInstance.PendingRequestCount <= 1)
                {
                    // Endpoint is idle - choose it.
                    return(bestInstance.IPAddress);
                }
                else
                {
                    // All endpoints are occupied; try to scale out.
                    using (var scaleLock = await site.ScaleLock.LockAsync(MaxLockTime))
                    {
                        if (scaleLock.TimedOut)
                        {
                            // The caller should return 429 to avoid overloading the current role
                            // NOTE(review): this path returns the already-reserved endpoint's address
                            // rather than signalling an error - confirm how the caller detects this case.
                            Debug.WriteLine("Timed-out waiting to start a scale operation in burst mode.");
                            AntaresEventProvider.EventWriteLBHttpDispatchSiteWarningMessage(site.Name, "DispatchRequest", string.Format("Timed-out ({0}ms) waiting to start a scale operation in burst mode. Reverting to {1}.", MaxLockTime, bestInstance.IPAddress));
                            return(bestInstance.IPAddress);
                        }

                        // Re-check after acquiring the lock: the flag may have changed while waiting.
                        // When it is no longer set, execution falls through to the regular path below.
                        if (site.IsBurstMode)
                        {
                            // No instances are idle, scale-out and send the request to the new instance.
                            // Note that there will be a cold-start penalty for this request, but it's
                            // been decided that this is better than sending a request to an existing
                            // but potentially CPU-pegged instance.
                            int targetInstanceCount      = Math.Min(this.burstLimit, site.Endpoints.Count + 1);
                            InstanceEndpoint newInstance = await this.ScaleOutAsync(
                                site,
                                targetInstanceCount,
                                bestInstance);

                            return(newInstance.IPAddress);
                        }
                    }
                }
            }

            // Pick one request at a time to use for latency tracking
            uint now = GetCurrentTickCount();

            // Only claims the tracking slot when no request id is currently stored in it.
            if (Interlocked.CompareExchange(ref bestInstance.HealthTrackingRequestId, requestId, null) == null)
            {
                bestInstance.HealthTrackingRequestStartTime = now;
            }

            if (bestInstance.PendingRequestCount <= 1)
            {
                // This is an idle worker (the current request is the pending one).
                return(bestInstance.IPAddress);
            }

            if (bestInstance.IsBusy)
            {
                using (var result = await site.ScaleLock.LockAsync(MaxLockTime))
                {
                    if (result.TimedOut)
                    {
                        // Scale operations are unhealthy
                        Debug.WriteLine("Timed-out waiting to start a scale operation on busy instance.");
                        AntaresEventProvider.EventWriteLBHttpDispatchSiteWarningMessage(site.Name, "DispatchRequest", string.Format("Timed-out ({0}ms) waiting to start a scale operation on a busy instance {1}.", MaxLockTime, bestInstance.IPAddress));
                        return(bestInstance.IPAddress);
                    }

                    if (!bestInstance.IsBusy)
                    {
                        // The instance became healthy while we were waiting.
                        return(bestInstance.IPAddress);
                    }

                    // Serialize scale-out requests to avoid overloading our infrastructure. Each serialized scale
                    // request will request one more instance from the previous request which can result in rapid-scale out.
                    // The current reservation is passed along so it is released when the new one is taken.
                    bestInstance = await this.ScaleOutAsync(site, site.Endpoints.Count + 1, bestInstance);

                    if (bestInstance.IsBusy)
                    {
                        // Scale-out failed
                        Debug.WriteLine("Best instance is still busy after a scale operation.");
                        AntaresEventProvider.EventWriteLBHttpDispatchSiteWarningMessage(site.Name, "DispatchRequest", string.Format("Best instance {0} is still busy after a scale operation.", bestInstance.IPAddress));
                    }

                    // The endpoint is returned either way; a still-busy result is only traced.
                    return(bestInstance.IPAddress);
                }
            }

            // An endpoint is "questionable" when it has many pending requests or when its tracked
            // request started longer ago than the latency threshold.
            // NOTE(review): `now` is unsigned; if QuestionableRequestLatency is unsigned too, the
            // subtraction wraps around when now is smaller than the threshold - confirm.
            bool isQuestionable =
                bestInstance.PendingRequestCount > QuestionablePendingRequestCount ||
                bestInstance.HealthTrackingRequestStartTime < now - QuestionableRequestLatency;

            // Ping only when the endpoint is questionable, the ping interval has elapsed, and this
            // request wins the compare-exchange on PingLock (0 -> 1).
            if (isQuestionable &&
                bestInstance.NextAllowedPingTime <= now &&
                Interlocked.CompareExchange(ref bestInstance.PingLock, 1, 0) == 0)
            {
                bool isHealthy;
                try
                {
                    isHealthy = await this.PingAsync(siteName, bestInstance, defaultHostName);
                }
                finally
                {
                    // Always schedule the next allowed ping and release the ping lock,
                    // even when the ping throws.
                    bestInstance.NextAllowedPingTime = now + PingInterval;
                    bestInstance.PingLock            = 0;
                }

                if (!isHealthy)
                {
                    site.Endpoints.SetIsBusy(bestInstance);

                    // Serialize scale-out requests to avoid overloading our infrastructure. Each serialized scale
                    // request will request one more instance from the previous request which can result in rapid-scale out.
                    using (var result = await site.ScaleLock.LockAsync(MaxLockTime))
                    {
                        if (result.TimedOut)
                        {
                            // Scale operations are unhealthy
                            Debug.WriteLine("Timed-out waiting to start a scale operation after unhealthy ping.");
                            AntaresEventProvider.EventWriteLBHttpDispatchSiteWarningMessage(site.Name, "DispatchRequest", string.Format("Timed-out ({0}ms) waiting to start a scale operation after unhealthy ping to {1}.", MaxLockTime, bestInstance.IPAddress));
                            return(bestInstance.IPAddress);
                        }

                        bestInstance = await this.ScaleOutAsync(site, site.Endpoints.Count + 1, bestInstance);

                        if (bestInstance.IsBusy)
                        {
                            // Scale-out failed
                            Debug.WriteLine("Best worker is still busy after a ping-initiated scale.");
                            AntaresEventProvider.EventWriteLBHttpDispatchSiteWarningMessage(site.Name, "DispatchRequest", string.Format("Best worker {0} is still busy after a ping-initiated scale.", bestInstance.IPAddress));
                        }

                        return(bestInstance.IPAddress);
                    }
                }
            }

            // Default: the endpoint was not idle or busy, and no ping ran or the ping was healthy.
            return(bestInstance.IPAddress);
        }