private async Task<bool> ProcessIndexAsync(string catalogIndexUrl, DateTimeOffset minCommitTimestamp)
{
    var index = await _client.GetIndexAsync(catalogIndexUrl);

    var pageItems = index.GetPagesInBounds(
        minCommitTimestamp,
        _settings.MaxCommitTimestamp);
    _logger.LogInformation(
        "{pages} pages were in the time bounds, out of {totalPages}.",
        pageItems.Count,
        index.Items.Count);

    var success = true;
    for (var i = 0; i < pageItems.Count; i++)
    {
        success = await ProcessPageAsync(minCommitTimestamp, pageItems[i]);
        if (!success)
        {
            _logger.LogWarning(
                "{unprocessedPages} out of {pages} pages were left incomplete due to a processing failure.",
                pageItems.Count - i,
                pageItems.Count);
            break;
        }
    }

    return success;
}
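The method above delegates per-page work to a ProcessPageAsync helper that is not shown. A minimal sketch of what such a helper might look like, reusing the GetPageAsync, GetLeavesInBounds, and ProcessLeafAsync members that appear in the other examples in this section; the exact signatures and property names here are assumptions, not the original implementation.

// Hypothetical sketch: fetch one catalog page and process each leaf within the commit
// timestamp bounds. _client, _settings, and ProcessLeafAsync are assumed to match the
// surrounding examples; pageItem.Url is an assumed property name.
private async Task<bool> ProcessPageAsync(DateTimeOffset minCommitTimestamp, CatalogPageItem pageItem)
{
    var page = await _client.GetPageAsync(pageItem.Url);

    var leafItems = page.GetLeavesInBounds(
        minCommitTimestamp,
        _settings.MaxCommitTimestamp,
        excludeRedundantLeaves: true);

    foreach (var leafItem in leafItems)
    {
        // Stop at the first failed leaf so the caller can report how many pages remain.
        if (!await ProcessLeafAsync(leafItem))
        {
            return false;
        }
    }

    return true;
}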
private async Task ExecuteAsync(CancellationToken token)
{
    using (var cancelledCts = new CancellationTokenSource())
    using (var produceWorkCts = new CancellationTokenSource())
    {
        // Initialize the indexes, container, and excluded packages data.
        await InitializeAsync();

        // Here, we fetch the current catalog timestamp to use as the initial cursor value for
        // catalog2azuresearch. The idea here is that the database is always more up-to-date than the catalog.
        // We're about to read the database, so if we capture a catalog timestamp now, we are guaranteed that
        // any data we get from a database query will be more recent than the data represented by this catalog
        // timestamp. When catalog2azuresearch starts up for the first time to update the index produced by this
        // job, it will probably encounter some duplicate packages, but this is okay.
        //
        // Note that we could capture any of the dependency cursors here instead of the catalog cursor, but this
        // is pointless because there is no reliable way to filter out data fetched from the database based on a
        // catalog-based cursor value. Suppose the dependency cursor is catalog2registration. If
        // catalog2registration is very behind, then the index produced by this job will include packages that
        // are not yet restorable (since they are not in the registration hives). This could lead to a case
        // where a user is able to search for a package that they cannot restore. We mitigate this risk by
        // trusting that our end-to-end tests will fail when catalog2registration (or any other V3 component) is
        // broken, thus blocking the deployment of new Azure Search indexes.
        var catalogIndex = await _catalogClient.GetIndexAsync(_options.Value.CatalogIndexUrl);
        var initialCursorValue = catalogIndex.CommitTimestamp;
        _logger.LogInformation("The initial cursor value will be {CursorValue:O}.", initialCursorValue);

        var initialAuxiliaryData = await PushAllPackageRegistrationsAsync(cancelledCts, produceWorkCts);

        // Write the owner data file.
        await WriteOwnerDataAsync(initialAuxiliaryData.Owners);

        // Write the download data file.
        await WriteDownloadDataAsync(initialAuxiliaryData.Downloads);

        // Write the verified packages data file.
        await WriteVerifiedPackagesDataAsync(initialAuxiliaryData.VerifiedPackages);

        // Write the popularity transfers data file.
        await WritePopularityTransfersDataAsync(initialAuxiliaryData.PopularityTransfers);

        // Write the cursor.
        _logger.LogInformation("Writing the initial cursor value to be {CursorValue:O}.", initialCursorValue);
        var frontCursorStorage = _storageFactory.Create();
        var frontCursor = new DurableCursor(
            frontCursorStorage.ResolveUri(Catalog2AzureSearchCommand.CursorRelativeUri),
            frontCursorStorage,
            DateTime.MinValue);
        frontCursor.Value = initialCursorValue.UtcDateTime;
        await frontCursor.SaveAsync(token);
    }
}
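The comment in the method above relies on catalog2azuresearch later reading this front cursor and only processing catalog commits newer than it. A rough sketch of what that read-back side could look like, reusing the member names from the example above for illustration and assuming DurableCursor exposes a LoadAsync counterpart to the SaveAsync call shown; that method name and the ProcessPageAsync helper are assumptions.

// Hypothetical sketch of the consuming side: load the front cursor written above and only
// process catalog pages committed after it. DurableCursor.LoadAsync, ProcessPageAsync, and
// the reused fields are assumptions, not the original implementation.
private async Task CatchUpFromCursorAsync(CancellationToken token)
{
    var storage = _storageFactory.Create();
    var frontCursor = new DurableCursor(
        storage.ResolveUri(Catalog2AzureSearchCommand.CursorRelativeUri),
        storage,
        DateTime.MinValue);
    await frontCursor.LoadAsync(token);

    var catalogIndex = await _catalogClient.GetIndexAsync(_options.Value.CatalogIndexUrl);

    // Only pages committed after the stored cursor value, up to the latest commit in the index.
    var pageItems = catalogIndex.GetPagesInBounds(
        frontCursor.Value,
        catalogIndex.CommitTimestamp);

    foreach (var pageItem in pageItems)
    {
        await ProcessPageAsync(frontCursor.Value, pageItem, token);
    }

    // Advance the cursor once the whole range has been processed.
    frontCursor.Value = catalogIndex.CommitTimestamp.UtcDateTime;
    await frontCursor.SaveAsync(token);
}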
private async Task ProcessIndexAsync(string catalogIndexUrl, DateTimeOffset minCommitTimestamp, CancellationToken token)
{
    var index = await _client.GetIndexAsync(catalogIndexUrl, token);

    var pageItems = index.GetPagesInBounds(
        minCommitTimestamp,
        _settings.MaxCommitTimestamp);
    _logger.LogInformation(
        "{pages} pages were in the time bounds, out of {totalPages}.",
        pageItems.Count,
        index.Items.Count);

    foreach (var pageItem in pageItems)
    {
        using (_logger.BeginScope(("page", pageItem)))
        {
            await ProcessPageAsync(minCommitTimestamp, pageItem, token);
        }
    }
}
public static async Task<(CatalogIndex, IEnumerable<CatalogLeafItem>)> LoadCatalogAsync(
    this ICatalogClient catalogClient,
    DateTimeOffset minCursor,
    DateTimeOffset maxCursor,
    ILogger logger,
    CancellationToken cancellationToken)
{
    var catalogIndex = await catalogClient.GetIndexAsync(cancellationToken);

    var catalogLeafItems = new ConcurrentBag<CatalogLeafItem>();
    var catalogPageUrls = new ConcurrentBag<CatalogPageItem>(
        catalogIndex.GetPagesInBounds(minCursor, maxCursor));

    await ParallelAsync.RunAsync(
        catalogPageUrls,
        ProcessCatalogPageAsync,
        cancellationToken);

    return (catalogIndex, catalogLeafItems);

    async Task ProcessCatalogPageAsync(CatalogPageItem pageItem, CancellationToken token)
    {
        logger.LogInformation("Processing catalog page {CatalogPageUrl}...", pageItem.CatalogPageUrl);

        var page = await catalogClient.GetPageAsync(pageItem.CatalogPageUrl, token);
        var leafs = page.GetLeavesInBounds(minCursor, maxCursor, excludeRedundantLeaves: true);

        foreach (var catalogLeafItem in leafs)
        {
            catalogLeafItems.Add(catalogLeafItem);
        }

        logger.LogInformation("Processed catalog page {CatalogPageUrl}.", pageItem.CatalogPageUrl);
    }
}
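A possible call site for the extension method above, enumerating everything committed over the last day. The cursor values, client, and logger here are placeholders, not part of the original code.

// Hypothetical usage of LoadCatalogAsync: load the last day's worth of catalog commits and
// enumerate the resulting leaf items. catalogClient and logger are assumed to exist.
var maxCursor = DateTimeOffset.UtcNow;
var minCursor = maxCursor.AddDays(-1);

var (catalogIndex, catalogLeafItems) = await catalogClient.LoadCatalogAsync(
    minCursor,
    maxCursor,
    logger,
    CancellationToken.None);

logger.LogInformation(
    "Catalog commit {CommitTimestamp:O} yielded {LeafCount} leaf items.",
    catalogIndex.CommitTimestamp,
    catalogLeafItems.Count());

foreach (var leafItem in catalogLeafItems)
{
    logger.LogInformation("{PackageId} {PackageVersion}", leafItem.PackageId, leafItem.PackageVersion);
}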
private async Task<bool> ProcessIndexAsync(string catalogIndexUrl, DateTimeOffset minCommitTimestamp, CancellationToken cancellationToken)
{
    var index = await _client.GetIndexAsync(catalogIndexUrl);

    // Fetch the pages to process, limited to one batch.
    var pageItems = index.GetPagesInBounds(
            minCommitTimestamp,
            _settings.MaxCommitTimestamp)
        .Take(BatchSize)
        .ToList();
    _logger.LogInformation(
        "{pages} pages were in the time bounds, out of {totalPages}.",
        pageItems.Count,
        index.Items.Count);

    if (pageItems.Count == 0)
    {
        return true;
    }

    var success = true;
    var latestCommit = pageItems.Max(page => page.CommitTimestamp);

    // Fetch all catalog pages in the batch.
    var pageItemTasks = new List<Task<CatalogPage>>();
    foreach (var pageItem in pageItems)
    {
        pageItemTasks.Add(GetPageAsync(pageItem.Url, cancellationToken));
    }

    var catalogPages = await Task.WhenAll(pageItemTasks);

    // Collect the leaves in bounds, keeping only the latest leaf per package ID and version.
    var leavesToProcess = catalogPages
        .Where(catalogPage => catalogPage != null)
        .SelectMany(
            catalogPage => catalogPage.GetLeavesInBounds(
                minCommitTimestamp,
                _settings.MaxCommitTimestamp,
                _settings.ExcludeRedundantLeaves))
        .GroupBy(package => package.PackageId + "-" + package.PackageVersion)
        .Select(group => group.OrderByDescending(package => package.CommitTimestamp).First());

    // Process the leaves.
    var leafTasks = new List<Task<bool>>();
    foreach (var leafItem in leavesToProcess)
    {
        leafTasks.Add(ProcessLeafAsync(leafItem, cancellationToken));
    }

    if (leafTasks.Count == 0)
    {
        return true;
    }

    var leafResults = await Task.WhenAll(leafTasks);
    success = leafResults.All(result => result);

    if (cancellationToken.IsCancellationRequested)
    {
        _logger.LogWarning("Stopping processing because cancellation was requested.");
        success = false;
    }

    if (success)
    {
        // Advance the cursor only when every leaf in the batch was processed successfully.
        await _cursor.SetAsync(latestCommit);
    }

    return success;
}
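The method above fans out to ProcessLeafAsync and treats any false result as a failure that keeps the cursor from advancing. One way such a leaf processor could be shaped is sketched below; the _leafProcessor member is hypothetical and stands in for whatever handles an individual catalog leaf.

// Hypothetical sketch: a leaf processor that returns false instead of throwing, so the caller
// above can aggregate results with Task.WhenAll and decide whether to move the cursor.
// _leafProcessor is a placeholder, not part of the original code.
private async Task<bool> ProcessLeafAsync(CatalogLeafItem leafItem, CancellationToken cancellationToken)
{
    try
    {
        await _leafProcessor.ProcessAsync(leafItem, cancellationToken);
        return true;
    }
    catch (Exception exception)
    {
        _logger.LogError(
            exception,
            "Failed to process catalog leaf for {PackageId} {PackageVersion}.",
            leafItem.PackageId,
            leafItem.PackageVersion);
        return false;
    }
}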