示例#1
0
        /// <summary>
        /// Indexes the site that <paramref name="request"/> points at: the request URL first, then the
        /// same-host links listed in the site's <c>/sitemap.xml</c>, then any URLs that appear in the shared
        /// work queue while pages are being indexed (the queue is handed to <c>IndexPage</c>, which
        /// presumably adds the links it discovers — confirm against <c>IndexPage</c>).
        /// </summary>
        /// <param name="request">
        /// The in-progress request. Its page counters are updated as indexing advances and it is turned
        /// into the returned value through <c>SetError</c> or <c>SetIndexed</c>.
        /// </param>
        /// <returns>
        /// The result of <c>request.SetIndexed</c> when every page was indexed; otherwise the result of
        /// <c>request.SetError</c> (page limit reached, or a page-indexing task reported a failure).
        /// </returns>
        private async Task<IndexRequest> Index(InProgressIndexRequest request)
        {
            var siteHost = GetHost(request.Url);
            var siteHostWithDotBefore = '.' + siteHost;

            var siteMapUrl = new Uri(request.Url, "/sitemap.xml");
            var siteMap = await siteMapGetter.GetSiteMap(siteMapUrl);

            // Refuse early when the sitemap alone already exceeds the per-site limit.
            if (pagesPerSiteLimiter.IsLimitReached(siteMap.Links.Length))
            {
                return request.SetError(GetTooManyPagesErrorMessage(
                    request.Url, siteMap.Links.Length, pagesPerSiteLimiter.PagesPerSiteLimit));
            }

            // Used as a concurrent set (values are ignored): URLs still waiting to be indexed.
            var urlsToIndex = new ConcurrentDictionary<Uri, byte>();

            urlsToIndex.TryAdd(request.Url, default);
            siteMap.Links
                .Where(uri =>
                    // Host names are identifiers, not linguistic text, so compare ordinally
                    // (consistent with the == check next to it).
                    (uri.Host == siteHost || uri.Host.EndsWith(siteHostWithDotBefore, StringComparison.Ordinal)) &&
                    uri != request.Url)
                .Distinct()
                .ForEach(uri => urlsToIndex.TryAdd(uri, default));

            request.UpdatePagesCounts(0, urlsToIndex.Count);
            indexRequestsQueue.Update(request);

            Result<string> indexingResult;
            // Used as a concurrent set: URLs that have been handed to IndexPage.
            var indexedUrls = new ConcurrentDictionary<Uri, byte>();
            var isUrlFromRequestIndexed = false;
            var indexedUrlsRoughCount = 0;

            // Caps the number of pages indexed at the same time. It is intentionally not disposed here:
            // the early returns below do not wait for in-flight pages, whose continuations still call Release().
            var semaphore = new SemaphoreSlim(32);
            // Continuation tasks of pages that have been started and not yet observed as finished.
            // Only this method adds to or removes from it.
            var indexingTasks = new ConcurrentDictionary<Task, byte>();
            // Finished page tasks whose results have not been inspected yet.
            var completedIndexingTasks = new ConcurrentStack<Task<Result<string>>>();

            while (!urlsToIndex.IsEmpty)
            {
                Uri currentUrl;
                if (!isUrlFromRequestIndexed)
                {
                    // The URL from the request is always processed first.
                    currentUrl = request.Url;
                    isUrlFromRequestIndexed = true;
                }
                else
                {
                    // Enumerate the dictionary itself instead of Keys: Keys copies every key on each call.
                    currentUrl = urlsToIndex.First().Key;
                }

                urlsToIndex.TryRemove(currentUrl, out _);
                indexedUrls.TryAdd(currentUrl, default);

                if (semaphore.CurrentCount == 0)
                {
                    // All slots are busy, so we are about to wait anyway: publish progress meanwhile.
                    var indexedPagesCount = indexedUrls.Count;
                    var foundPagesCount = indexedPagesCount + urlsToIndex.Count;
                    request.UpdatePagesCounts(indexedPagesCount, foundPagesCount);
                    // Deliberately not awaited (fire-and-forget progress update).
#pragma warning disable 4014
                    indexRequestsQueue.UpdateAsync(request);
#pragma warning restore 4014
                }

                // Wait for a free slot without blocking the calling thread.
                await semaphore.WaitAsync();

                var trackedTask = IndexPage(
                        currentUrl, request, siteHost, siteHostWithDotBefore, urlsToIndex, indexedUrls)
                    .ContinueWith(task =>
                    {
                        // 'task' is the page task (the antecedent), not the continuation stored in
                        // indexingTasks, so removal from indexingTasks is done by the loop instead.
                        semaphore.Release();
                        completedIndexingTasks.Push(task);
                    });
                indexingTasks.TryAdd(trackedTask, default);

                // A completed continuation has already pushed its page task onto completedIndexingTasks,
                // so dropping it here loses nothing and keeps indexingTasks from growing for the whole crawl.
                ForgetFinishedTasks();

                indexingResult = CheckResultOfCompletedTasks();
                if (indexingResult.IsFailure)
                {
                    return request.SetError(indexingResult.Error);
                }

                // Nothing is queued: wait for running pages one by one until the queue gets a new URL
                // or no page is left running.
                while (urlsToIndex.IsEmpty && !indexingTasks.IsEmpty)
                {
                    var finishedTask = await Task.WhenAny(indexingTasks.Keys);
                    indexingTasks.TryRemove(finishedTask, out _);
                }

                // NOTE(review): forces a collection roughly every 200 handled URLs; kept as in the
                // original — confirm it is still needed before removing.
                if (indexedUrls.Count / 200 > indexedUrlsRoughCount / 200)
                {
                    GC.Collect();
                    indexedUrlsRoughCount = indexedUrls.Count;
                }

                // ReSharper disable once InvertIf
                if (pagesPerSiteLimiter.IsLimitReached(indexedUrls.Count))
                {
                    // Let the running pages finish, then delete the documents for every handled URL
                    // except the one from the request.
                    await Task.WhenAll(indexingTasks.Keys);
                    _client.DeleteMany(
                        indexedUrls.Keys
                            .Where(uri => uri != request.Url)
                            .Select(uri => uri.ToString()),
                        _options.DocumentsIndexName);
                    return request.SetError(GetTooManyPagesErrorMessage(
                        request.Url,
                        indexedUrls.Count,
                        pagesPerSiteLimiter.PagesPerSiteLimit));
                }
            }

            await Task.WhenAll(indexingTasks.Keys);
            indexingResult = CheckResultOfCompletedTasks();
            if (indexingResult.IsFailure)
            {
                return request.SetError(indexingResult.Error);
            }

            Debug.Assert(urlsToIndex.Count == 0,
                         "После завершения индексации остались непроиндексированные страницы");
            request.UpdatePagesCounts(indexedUrls.Count, indexedUrls.Count);
            return request.SetIndexed(DateTime.UtcNow);

            // Pops finished page tasks and returns the first failed result, or success when none failed.
            Result<string> CheckResultOfCompletedTasks()
            {
                while (completedIndexingTasks.TryPop(out var completedTask))
                {
                    // The task has already completed, so reading Result does not block.
                    var result = completedTask.Result;
                    if (result.IsFailure)
                    {
                        return result;
                    }
                }

                return Result<string>.Success();
            }

            // Removes continuation tasks that have already completed from indexingTasks.
            void ForgetFinishedTasks()
            {
                foreach (var entry in indexingTasks)
                {
                    if (entry.Key.IsCompleted)
                    {
                        indexingTasks.TryRemove(entry.Key, out _);
                    }
                }
            }
        }