        private async Task<bool> ProcessIndexAsync(string catalogIndexUrl, DateTimeOffset minCommitTimestamp)
        {
            var index = await _client.GetIndexAsync(catalogIndexUrl);

            var pageItems = index.GetPagesInBounds(
                minCommitTimestamp,
                _settings.MaxCommitTimestamp);

            _logger.LogInformation(
                "{pages} pages were in the time bounds, out of {totalPages}.",
                pageItems.Count,
                index.Items.Count);

            var success = true;

            for (var i = 0; i < pageItems.Count; i++)
            {
                success = await ProcessPageAsync(minCommitTimestamp, pageItems[i]);

                if (!success)
                {
                    _logger.LogWarning(
                        "{unprocessedPages} out of {pages} pages were left incomplete due to a processing failure.",
                        pageItems.Count - i,
                        pageItems.Count);
                    break;
                }
            }

            return success;
        }
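
A hedged caller sketch for the method above; ReadCursorAsync, WriteCursorAsync, and _settings.CatalogIndexUrl are hypothetical names, since the example does not show how the minimum commit timestamp is persisted between runs.

        // Hedged caller sketch only. ReadCursorAsync, WriteCursorAsync, and
        // _settings.CatalogIndexUrl are hypothetical and not part of the example above.
        private async Task RunOnceAsync()
        {
            // Resume from the last persisted commit timestamp.
            var minCommitTimestamp = await ReadCursorAsync();

            var success = await ProcessIndexAsync(_settings.CatalogIndexUrl, minCommitTimestamp);

            if (success)
            {
                // Advance the cursor only when every in-bounds page was processed.
                await WriteCursorAsync(_settings.MaxCommitTimestamp);
            }
        }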
Example #2
        private async Task ExecuteAsync(CancellationToken token)
        {
            using (var cancelledCts = new CancellationTokenSource())
                using (var produceWorkCts = new CancellationTokenSource())
                {
                    // Initialize the indexes, container and excluded packages data.
                    await InitializeAsync();

                    // Here, we fetch the current catalog timestamp to use as the initial cursor value for
                    // catalog2azuresearch. The idea here is that the database is always more up-to-date than the catalog.
                    // We're about to read the database so if we capture a catalog timestamp now, we are guaranteed that
                    // any data we get from a database query will be more recent than the data represented by this catalog
                    // timestamp. When catalog2azuresearch starts up for the first time to update the index produced by this
                    // job, it will probably encounter some duplicate packages, but this is okay.
                    //
                    // Note that we could capture any dependency cursors here instead of the catalog cursor, but this is
                    // pointless because there is no reliable way to filter out data fetched from the database based on a
                    // catalog-based cursor value. Suppose the dependency cursor is catalog2registration. If
                    // catalog2registration is very behind, then the index produced by this job will include packages that
                    // are not yet restorable (since they are not in the registration hives). This could lead to a case
                    // where a user is able to search for a package that they cannot restore. We mitigate this risk by
                    // trusting that our end-to-end tests will fail when catalog2registration (or any other V3 component) is
                    // broken, thus blocking the deployment of new Azure Search indexes.
                    var catalogIndex = await _catalogClient.GetIndexAsync(_options.Value.CatalogIndexUrl);

                    var initialCursorValue = catalogIndex.CommitTimestamp;
                    _logger.LogInformation("The initial cursor value will be {CursorValue:O}.", initialCursorValue);

                    var initialAuxiliaryData = await PushAllPackageRegistrationsAsync(cancelledCts, produceWorkCts);

                    // Write the owner data file.
                    await WriteOwnerDataAsync(initialAuxiliaryData.Owners);

                    // Write the download data file.
                    await WriteDownloadDataAsync(initialAuxiliaryData.Downloads);

                    // Write the verified packages data file.
                    await WriteVerifiedPackagesDataAsync(initialAuxiliaryData.VerifiedPackages);

                    // Write popularity transfers data file.
                    await WritePopularityTransfersDataAsync(initialAuxiliaryData.PopularityTransfers);

                    // Write the cursor.
                    _logger.LogInformation("Writing the initial cursor value to be {CursorValue:O}.", initialCursorValue);
                    var frontCursorStorage = _storageFactory.Create();
                    var frontCursor = new DurableCursor(
                        frontCursorStorage.ResolveUri(Catalog2AzureSearchCommand.CursorRelativeUri),
                        frontCursorStorage,
                        DateTime.MinValue);
                    frontCursor.Value = initialCursorValue.UtcDateTime;
                    await frontCursor.SaveAsync(token);
                }
        }
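
For context, a hedged sketch of the read side of the cursor written above. DurableCursor.LoadAsync and the DateTime-typed Value are assumed as counterparts of the SaveAsync call shown in ExecuteAsync and may not match the real type exactly.

        // Hedged sketch: read the front cursor back as the lower bound for the next catalog scan.
        // DurableCursor.LoadAsync and the DateTime-typed Value are assumptions.
        private async Task<DateTime> ReadFrontCursorAsync(CancellationToken token)
        {
            var cursorStorage = _storageFactory.Create();
            var cursor = new DurableCursor(
                cursorStorage.ResolveUri(Catalog2AzureSearchCommand.CursorRelativeUri),
                cursorStorage,
                DateTime.MinValue);

            await cursor.LoadAsync(token);

            // Falls back to DateTime.MinValue if the cursor blob has never been written.
            return cursor.Value;
        }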
        private async Task ProcessIndexAsync(string catalogIndexUrl, DateTimeOffset minCommitTimestamp, CancellationToken token)
        {
            var index = await _client.GetIndexAsync(catalogIndexUrl, token);

            var pageItems = index.GetPagesInBounds(
                minCommitTimestamp,
                _settings.MaxCommitTimestamp);

            _logger.LogInformation(
                "{pages} pages were in the time bounds, out of {totalPages}.",
                pageItems.Count,
                index.Items.Count);

            foreach (var pageItem in pageItems)
            {
                using (_logger.BeginScope(("page", pageItem)))
                {
                    await ProcessPageAsync(minCommitTimestamp, pageItem, token);
                }
            }
        }
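
The per-page step called in the loop above is not shown; a hedged sketch of what it might look like, assuming _client exposes a GetPageAsync counterpart, the page item carries its address in a Url property, and a ProcessLeafAsync helper handles each leaf.

        // Illustrative sketch only: the original ProcessPageAsync body is not shown.
        // _client.GetPageAsync, pageItem.Url, and ProcessLeafAsync are assumptions.
        private async Task ProcessPageAsync(DateTimeOffset minCommitTimestamp, CatalogPageItem pageItem, CancellationToken token)
        {
            var page = await _client.GetPageAsync(pageItem.Url, token);

            // Stay within the same commit window the caller used to select pages.
            var leaves = page.GetLeavesInBounds(
                minCommitTimestamp,
                _settings.MaxCommitTimestamp,
                excludeRedundantLeaves: true);

            foreach (var leaf in leaves)
            {
                token.ThrowIfCancellationRequested();
                await ProcessLeafAsync(leaf, token);
            }
        }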
        public static async Task<(CatalogIndex, IEnumerable<CatalogLeafItem>)> LoadCatalogAsync(
            this ICatalogClient catalogClient,
            DateTimeOffset minCursor,
            DateTimeOffset maxCursor,
            ILogger logger,
            CancellationToken cancellationToken)
        {
            var catalogIndex = await catalogClient.GetIndexAsync(cancellationToken);

            var catalogLeafItems = new ConcurrentBag<CatalogLeafItem>();
            var catalogPageUrls = new ConcurrentBag<CatalogPageItem>(
                catalogIndex.GetPagesInBounds(minCursor, maxCursor));

            await ParallelAsync.RunAsync(
                catalogPageUrls,
                ProcessCatalogPageAsync,
                cancellationToken);

            return (catalogIndex, catalogLeafItems);

            async Task ProcessCatalogPageAsync(CatalogPageItem pageItem, CancellationToken token)
            {
                logger.LogInformation("Processing catalog page {CatalogPageUrl}...", pageItem.CatalogPageUrl);

                var page = await catalogClient.GetPageAsync(pageItem.CatalogPageUrl, token);

                var leaves = page.GetLeavesInBounds(minCursor, maxCursor, excludeRedundantLeaves: true);

                foreach (var catalogLeafItem in leaves)
                {
                    catalogLeafItems.Add(catalogLeafItem);
                }

                logger.LogInformation("Processed catalog page {CatalogPageUrl}", pageItem.CatalogPageUrl);
            }
        }
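
A hedged usage sketch for the extension method above; the catalogClient, logger, and cancellationToken instances are assumed to be supplied by the caller.

        // Hedged usage sketch: load everything committed in the last day and log each leaf.
        // catalogClient, logger, and cancellationToken are assumed to come from the caller.
        var (catalogIndex, catalogLeafItems) = await catalogClient.LoadCatalogAsync(
            DateTimeOffset.UtcNow.AddDays(-1),
            DateTimeOffset.UtcNow,
            logger,
            cancellationToken);

        foreach (var leafItem in catalogLeafItems)
        {
            logger.LogInformation(
                "{PackageId} {PackageVersion} committed at {CommitTimestamp:O}",
                leafItem.PackageId,
                leafItem.PackageVersion,
                leafItem.CommitTimestamp);
        }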
Example #5
        private async Task<bool> ProcessIndexAsync(string catalogIndexUrl, DateTimeOffset minCommitTimestamp, CancellationToken cancellationToken)
        {
            var index = await _client.GetIndexAsync(catalogIndexUrl);

            // Fetch pages for processing
            var pageItems = index.GetPagesInBounds(
                minCommitTimestamp,
                _settings.MaxCommitTimestamp)
                .Take(BatchSize)
                .ToList();

            _logger.LogInformation(
                "{pages} pages were in the time bounds, out of {totalPages}.",
                pageItems.Count,
                index.Items.Count);

            if (pageItems.Count == 0)
            {
                return true;
            }

            var success = true;
            var latestCommit = pageItems.Max(page => page.CommitTimestamp);

            // Fetch all catalog pages
            var pageItemTasks = new List<Task<CatalogPage>>();

            foreach (var pageItem in pageItems)
            {
                pageItemTasks.Add(GetPageAsync(pageItem.Url, cancellationToken));
            }

            var catalogPages = await Task.WhenAll(pageItemTasks);

            var leavesToProcess = catalogPages
                .Where(catalogPage => catalogPage != null)
                .SelectMany(catalogPage => catalogPage.GetLeavesInBounds(
                    minCommitTimestamp,
                    _settings.MaxCommitTimestamp,
                    _settings.ExcludeRedundantLeaves))
                .GroupBy(package => package.PackageId + "-" + package.PackageVersion)
                .Select(group => group.OrderByDescending(package => package.CommitTimestamp).First());

            // Process leaves
            var leafTasks = new List<Task<bool>>();

            foreach (var leafItem in leavesToProcess)
            {
                leafTasks.Add(ProcessLeafAsync(leafItem, cancellationToken));
            }

            if (leafTasks.Count == 0)
            {
                return true;
            }

            var leafResults = await Task.WhenAll(leafTasks);

            success = leafResults.All(result => result);

            if (cancellationToken.IsCancellationRequested)
            {
                _logger.LogWarning("Stop processing because of cancellation request.");
                success = false;
            }

            if (success)
            {
                await _cursor.SetAsync(latestCommit);
            }

            return success;
        }
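
To make the GroupBy/OrderByDescending de-duplication above concrete, a small self-contained sketch (requires System.Linq; the leaf objects are anonymous types used purely for illustration):

        // Illustrative only: two catalog entries for the same package id and version
        // collapse to the one with the newest commit timestamp, so each package is
        // processed exactly once, in its latest state.
        var leaves = new[]
        {
            new { PackageId = "Newtonsoft.Json", PackageVersion = "13.0.1", CommitTimestamp = DateTimeOffset.Parse("2024-01-01T00:00:00Z") },
            new { PackageId = "Newtonsoft.Json", PackageVersion = "13.0.1", CommitTimestamp = DateTimeOffset.Parse("2024-01-02T00:00:00Z") },
        };

        var latestPerPackage = leaves
            .GroupBy(leaf => leaf.PackageId + "-" + leaf.PackageVersion)
            .Select(group => group.OrderByDescending(leaf => leaf.CommitTimestamp).First())
            .ToList();

        // latestPerPackage now contains only the 2024-01-02 entry.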