/// <summary>
        /// Merges the worker list reported for a site into the site's cached endpoint collection,
        /// refreshes per-worker bookkeeping (refresh timestamp, busy expiry, periodic metrics trace),
        /// periodically ages out stale endpoints, and recomputes the site's burst-mode flag.
        /// </summary>
        /// <param name="site">Site whose endpoint collection is updated.</param>
        /// <param name="currentWorkerSet">
        /// Worker IP addresses reported for the site. A null array is treated as "no workers reported":
        /// nothing is added or refreshed, but the age-out and burst-mode steps still run.
        /// </param>
        /// <returns>The number of endpoints in the site's collection after the update.</returns>
        private int UpdateWorkerList(SiteMetadata site, string[] currentWorkerSet)
        {
            uint now = GetCurrentTickCount();

            // Union the cached list of known workers with the list provided by the FE. The list from the FE
            // may be stale, so we have to accept new workers generously and carefully age out stale workers.
            // The list is passed straight through from the public dispatch entry point, so tolerate null
            // here instead of failing the request with a NullReferenceException.
            if (currentWorkerSet != null)
            {
                foreach (string ipAddress in currentWorkerSet)
                {
                    InstanceEndpoint instance = site.Endpoints.GetOrAdd(ipAddress);
                    instance.LastRefreshTimestamp = now;

                    // Clear the busy status for workers whose busy status is ready to expire.
                    if (instance.IsBusy && instance.IsBusyUntil < now)
                    {
                        site.Endpoints.ClearBusyStatus(instance);
                    }

                    // Periodically trace the health statistics of the workers
                    if (instance.NextMetricsTraceTime == 0)
                    {
                        instance.NextMetricsTraceTime = now + WorkerMetricsTraceInterval;
                    }
                    else if (now > instance.NextMetricsTraceTime && Monitor.TryEnter(instance))
                    {
                        try
                        {
                            // Re-check under the lock: another thread may have traced and advanced the
                            // deadline between our unlocked check and the successful TryEnter.
                            if (now > instance.NextMetricsTraceTime)
                            {
                                Debug.WriteLine("UpdateWorkerList: Worker metrics for site {0}: {1}", site.Name, instance.ToString());
                                AntaresEventProvider.EventWriteLBHttpDispatchEndpointMetrics(site.Name, instance.IPAddress, instance.PendingRequestCount, instance.IsBusy, instance.Weight);
                                instance.NextMetricsTraceTime = now + WorkerMetricsTraceInterval;
                            }
                        }
                        finally
                        {
                            Monitor.Exit(instance);
                        }
                    }
                }
            }

            // This is a performance-sensitive code path so we throttle the age-out logic to avoid using excess CPU.
            if (now >= site.NextWorkerExpirationTick && Monitor.TryEnter(site))
            {
                try
                {
                    // Snapshot and re-check under the lock. Without this, a thread that passed the
                    // unlocked check just before another thread finished its own pass would call
                    // RemoveStaleEntries with the freshly advanced (future) tick value.
                    var expirationTick = site.NextWorkerExpirationTick;
                    if (now >= expirationTick)
                    {
                        site.Endpoints.RemoveStaleEntries(expirationTick);

                        // Wait M seconds before doing another worker expiration check.
                        site.NextWorkerExpirationTick = now + WorkerExpirationCheckInterval;
                    }
                }
                finally
                {
                    Monitor.Exit(site);
                }
            }

            // Burst mode is in effect while the site has fewer endpoints than the burst limit.
            site.IsBurstMode = site.Endpoints.Count < this.burstLimit;
            return(site.Endpoints.Count);
        }
        /// <summary>
        /// Requests a scale-out of the site to <paramref name="targetInstanceCount"/> instances, merges any
        /// returned worker addresses into the site's endpoint collection, and reserves the best instance.
        /// </summary>
        /// <remarks>Caller must be holding an async lock.</remarks>
        /// <param name="site">Site being scaled out.</param>
        /// <param name="targetInstanceCount">Instance count passed to the scale-out request.</param>
        /// <param name="previousEndpoint">Endpoint handed to <c>ReserveBestInstance</c> when picking the result.</param>
        /// <returns>The endpoint reserved after the scale-out attempt.</returns>
        private async Task <InstanceEndpoint> ScaleOutAsync(
            SiteMetadata site,
            int targetInstanceCount,
            InstanceEndpoint previousEndpoint)
        {
            Debug.WriteLine("Attempting to scale out to " + targetInstanceCount);

            string scaleOutMessage = string.Format("Attempting to scale out to {0} instances.", targetInstanceCount);
            AntaresEventProvider.EventWriteLBHttpDispatchSiteInfoMessage(site.Name, "ScaleOut", scaleOutMessage);

            string[] reportedWorkers = await this.OnScaleOutAsync(site.Name, targetInstanceCount);
            bool hasWorkerList = reportedWorkers != null;

            // A null result means there is nothing to merge; the cached endpoint list is used as-is.
            if (hasWorkerList)
            {
                this.UpdateWorkerList(site, reportedWorkers);
            }

            // In the common case the reserved endpoint is the worker that was just added (if any).
            InstanceEndpoint reserved = site.Endpoints.ReserveBestInstance(previousEndpoint);
            return reserved;
        }
        /// <summary>
        /// Chooses the worker endpoint that should serve a request for the given site and returns its
        /// IP address. Depending on the state of the chosen endpoint this may first ping it and/or
        /// trigger a serialized scale-out via <c>ScaleOutAsync</c>.
        /// </summary>
        /// <param name="requestId">
        /// Identifier of the request; stored as the endpoint's health-tracking request id when no
        /// other request is currently being tracked for that endpoint.
        /// </param>
        /// <param name="siteName">Site name; used as the key into the known-sites cache and passed to the ping.</param>
        /// <param name="defaultHostName">Host name forwarded to <c>PingAsync</c>.</param>
        /// <param name="knownWorkers">Worker addresses merged into the site's endpoint list before selection.</param>
        /// <returns>The IP address of the selected (possibly newly scaled-out) instance.</returns>
        public virtual async Task <string> DispatchRequestAsync(
            string requestId,
            string siteName,
            string defaultHostName,
            string[] knownWorkers)
        {
            // Look up the site's metadata, creating it on first use.
            SiteMetadata site = this.knownSites.GetOrAdd(siteName, name => new SiteMetadata(siteName));

            // Merge the caller-supplied worker list; this also refreshes site.IsBurstMode.
            this.UpdateWorkerList(site, knownWorkers);

            // NOTE(review): bestInstance is dereferenced below without a null check; presumably
            // ReserveBestInstance never returns null once at least one worker is known - confirm
            // the behavior for an empty knownWorkers array on a brand-new site.
            InstanceEndpoint bestInstance = site.Endpoints.ReserveBestInstance(null);

            if (site.IsBurstMode)
            {
                if (bestInstance.PendingRequestCount <= 1)
                {
                    // Endpoint is idle - choose it.
                    return(bestInstance.IPAddress);
                }
                else
                {
                    // All endpoints are occupied; try to scale out.
                    using (var scaleLock = await site.ScaleLock.LockAsync(MaxLockTime))
                    {
                        if (scaleLock.TimedOut)
                        {
                            // The caller should return 429 to avoid overloading the current role
                            // NOTE(review): this path only logs a warning and returns the current best
                            // instance's address; nothing here signals throttling to the caller - confirm
                            // how (or whether) the caller is expected to detect this case.
                            Debug.WriteLine("Timed-out waiting to start a scale operation in burst mode.");
                            AntaresEventProvider.EventWriteLBHttpDispatchSiteWarningMessage(site.Name, "DispatchRequest", string.Format("Timed-out ({0}ms) waiting to start a scale operation in burst mode. Reverting to {1}.", MaxLockTime, bestInstance.IPAddress));
                            return(bestInstance.IPAddress);
                        }

                        // Re-check burst mode now that the lock is held; if it is no longer set we fall
                        // through to the regular selection logic below.
                        if (site.IsBurstMode)
                        {
                            // No instances are idle, scale-out and send the request to the new instance.
                            // Note that there will be a cold-start penalty for this request, but it's
                            // been decided that this is better than sending a request to an existing
                            // but potentially CPU-pegged instance.
                            int targetInstanceCount      = Math.Min(this.burstLimit, site.Endpoints.Count + 1);
                            InstanceEndpoint newInstance = await this.ScaleOutAsync(
                                site,
                                targetInstanceCount,
                                bestInstance);

                            return(newInstance.IPAddress);
                        }
                    }
                }
            }

            // Pick one request at a time to use for latency tracking
            uint now = GetCurrentTickCount();

            // Only the request that swaps its id into the empty slot records the tracking start time.
            if (Interlocked.CompareExchange(ref bestInstance.HealthTrackingRequestId, requestId, null) == null)
            {
                bestInstance.HealthTrackingRequestStartTime = now;
            }

            if (bestInstance.PendingRequestCount <= 1)
            {
                // This is an idle worker (the current request is the pending one).
                return(bestInstance.IPAddress);
            }

            if (bestInstance.IsBusy)
            {
                using (var result = await site.ScaleLock.LockAsync(MaxLockTime))
                {
                    if (result.TimedOut)
                    {
                        // Scale operations are unhealthy
                        Debug.WriteLine("Timed-out waiting to start a scale operation on busy instance.");
                        AntaresEventProvider.EventWriteLBHttpDispatchSiteWarningMessage(site.Name, "DispatchRequest", string.Format("Timed-out ({0}ms) waiting to start a scale operation on a busy instance {1}.", MaxLockTime, bestInstance.IPAddress));
                        return(bestInstance.IPAddress);
                    }

                    if (!bestInstance.IsBusy)
                    {
                        // The instance became healthy while we were waiting.
                        return(bestInstance.IPAddress);
                    }

                    // Serialize scale-out requests to avoid overloading our infrastructure. Each serialized scale
                    // request will request one more instance from the previous request which can result in rapid-scale out.
                    bestInstance = await this.ScaleOutAsync(site, site.Endpoints.Count + 1, bestInstance);

                    if (bestInstance.IsBusy)
                    {
                        // Scale-out failed
                        Debug.WriteLine("Best instance is still busy after a scale operation.");
                        AntaresEventProvider.EventWriteLBHttpDispatchSiteWarningMessage(site.Name, "DispatchRequest", string.Format("Best instance {0} is still busy after a scale operation.", bestInstance.IPAddress));
                    }

                    // The reserved instance is used even when it is still busy; the warning above is the only signal.
                    return(bestInstance.IPAddress);
                }
            }

            // An instance is "questionable" when it has more pending requests than the threshold or
            // its tracked request started longer ago than the latency threshold.
            // NOTE(review): `now` is an unsigned tick count, so `now - QuestionableRequestLatency`
            // presumably wraps around when `now` is smaller than the latency constant - confirm the
            // operand types and whether that early-uptime window matters.
            bool isQuestionable =
                bestInstance.PendingRequestCount > QuestionablePendingRequestCount ||
                bestInstance.HealthTrackingRequestStartTime < now - QuestionableRequestLatency;

            // Ping at most once per PingInterval per instance, and let only one request ping at a
            // time (PingLock is taken with a 0 -> 1 compare-exchange).
            if (isQuestionable &&
                bestInstance.NextAllowedPingTime <= now &&
                Interlocked.CompareExchange(ref bestInstance.PingLock, 1, 0) == 0)
            {
                bool isHealthy;
                try
                {
                    isHealthy = await this.PingAsync(siteName, bestInstance, defaultHostName);
                }
                finally
                {
                    // Always schedule the next allowed ping and release the ping guard, even if the ping threw.
                    // The next ping time is based on `now` as captured before the ping was issued.
                    bestInstance.NextAllowedPingTime = now + PingInterval;
                    bestInstance.PingLock            = 0;
                }

                if (!isHealthy)
                {
                    // Mark the instance busy before attempting to scale out.
                    site.Endpoints.SetIsBusy(bestInstance);

                    // Serialize scale-out requests to avoid overloading our infrastructure. Each serialized scale
                    // request will request one more instance from the previous request which can result in rapid-scale out.
                    using (var result = await site.ScaleLock.LockAsync(MaxLockTime))
                    {
                        if (result.TimedOut)
                        {
                            // Scale operations are unhealthy
                            Debug.WriteLine("Timed-out waiting to start a scale operation after unhealthy ping.");
                            AntaresEventProvider.EventWriteLBHttpDispatchSiteWarningMessage(site.Name, "DispatchRequest", string.Format("Timed-out ({0}ms) waiting to start a scale operation after unhealthy ping to {1}.", MaxLockTime, bestInstance.IPAddress));
                            return(bestInstance.IPAddress);
                        }

                        bestInstance = await this.ScaleOutAsync(site, site.Endpoints.Count + 1, bestInstance);

                        if (bestInstance.IsBusy)
                        {
                            // Scale-out failed
                            Debug.WriteLine("Best worker is still busy after a ping-initiated scale.");
                            AntaresEventProvider.EventWriteLBHttpDispatchSiteWarningMessage(site.Name, "DispatchRequest", string.Format("Best worker {0} is still busy after a ping-initiated scale.", bestInstance.IPAddress));
                        }

                        return(bestInstance.IPAddress);
                    }
                }
            }

            // Not idle, not busy, and either not questionable or the ping succeeded / was skipped:
            // use the originally reserved instance.
            return(bestInstance.IPAddress);
        }
Example #4
0
 /// <summary>
 /// Creates an endpoint collection for the given site, backed by a new, empty linked list.
 /// </summary>
 /// <param name="site">Site metadata stored on the collection.</param>
 public SortedEndpointCollection(SiteMetadata site)
 {
     // Start with empty backing storage, then remember the owning site.
     this.innerList = new LinkedList <InstanceEndpoint>();
     this.site      = site;
 }