/// <summary>
/// Queries packages with status <c>PackageStatus.Available</c> whose creation time lies strictly between
/// the cursor time and <paramref name="lastCreateTime"/>, downloads each package's nuspec (and, when
/// <c>SourceType</c> is <c>MetadataSourceType.Nupkg</c>, hands the nupkg URI to <c>FetchMetadataAsync</c>),
/// and writes the resulting metadata records to a CSV file.
/// </summary>
/// <param name="connection">SQL connection used to create the read-only entities context.</param>
/// <param name="serviceDiscoveryUri">URI passed to <c>GetFlatContainerUri</c> to resolve the flat container base address.</param>
/// <param name="lastCreateTime">Exclusive upper bound on the package creation time.</param>
/// <param name="fileName">Name passed to <c>CreateCsvWriter</c> for the output file.</param>
/// <returns>A task that completes once every queried package has been processed.</returns>
/// <remarks>
/// A failure while processing a single package is logged through the error logger and does not stop the run.
/// The cursor is written every <c>CollectBatchSize</c> packages and once more at the end for a partial batch.
/// </remarks>
public async Task Collect(SqlConnection connection, Uri serviceDiscoveryUri, DateTime? lastCreateTime, string fileName)
{
    using (var context = new EntitiesContext(connection, readOnly: true))
    using (var cursor = new FileCursor(CursorFileName))
    using (var logger = new Logger(ErrorsFileName))
    {
        // The package query below is large, so raise the command timeout before running it.
        context.SetCommandTimeout(300);

        var startTime = await cursor.Read();
        logger.Log($"Starting metadata collection - Cursor time: {startTime:u}");

        var query = new EntityRepository<Package>(context)
            .GetAll()
            .Include(p => p.PackageRegistration);

        if (QueryIncludes != null)
        {
            query = query.Include(QueryIncludes);
        }

        // Only available packages inside the (cursor, lastCreateTime) window, oldest first.
        query = query
            .Where(p => p.Created > startTime && p.Created < lastCreateTime)
            .Where(p => p.PackageStatusKey == PackageStatus.Available)
            .OrderBy(p => p.Created);

        if (LimitTo > 0)
        {
            query = query.Take(LimitTo);
        }

        var flatContainerUri = await GetFlatContainerUri(serviceDiscoveryUri);

        using (var csv = CreateCsvWriter(fileName))
        using (var http = new HttpClient())
        {
            // We want these downloads ignored by stats pipelines - this user agent is automatically skipped.
            // See https://github.com/NuGet/NuGet.Jobs/blob/262da48ed05d0366613bbf1c54f47879aad96dcd/src/Stats.ImportAzureCdnStatistics/StatisticsParser.cs#L41
            http.DefaultRequestHeaders.TryAddWithoutValidation("User-Agent", "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0; AppInsights) Backfill Job: NuGet.Gallery GalleryTools");

            var processedInBatch = 0;
            DateTime? newestCreated = null;

            foreach (var current in query)
            {
                var id = current.PackageRegistration.Id;
                var version = current.NormalizedVersion;
                var idLowered = id.ToLowerInvariant();
                var versionLowered = version.ToLowerInvariant();

                try
                {
                    // Both the nuspec and the nupkg live under the same per-version folder.
                    var versionFolderUri = $"{flatContainerUri}/{idLowered}/{versionLowered}";
                    var metadata = default(TMetadata);

                    using (var nuspecStream = await http.GetStreamAsync($"{versionFolderUri}/{idLowered}.nuspec"))
                    {
                        var nuspecReader = new NuspecReader(LoadDocument(nuspecStream));

                        if (SourceType == MetadataSourceType.NuspecOnly)
                        {
                            metadata = ReadMetadata(nuspecReader);
                        }
                        else if (SourceType == MetadataSourceType.Nupkg)
                        {
                            var nupkgUri = $"{versionFolderUri}/{idLowered}.{versionLowered}.nupkg";
                            metadata = await FetchMetadataAsync(http, nupkgUri, nuspecReader, id, version, logger);
                        }
                    }

                    if (ShouldWriteMetadata(metadata))
                    {
                        csv.WriteRecord(new PackageMetadata(id, version, metadata, current.Created));
                        await csv.NextRecordAsync();

                        logger.LogPackage(id, version, "Metadata saved");
                    }
                }
                catch (Exception e)
                {
                    // Best effort: record the failure and move on to the next package.
                    await logger.LogPackageError(id, version, e);
                }

                processedInBatch++;

                // Remember the newest creation time seen so the final cursor write can use it.
                if (!newestCreated.HasValue || current.Created > newestCreated)
                {
                    newestCreated = current.Created;
                }

                if (processedInBatch >= CollectBatchSize)
                {
                    logger.Log($"Writing {current.Created:u} to cursor...");
                    await cursor.Write(current.Created);
                    processedInBatch = 0;
                }
            }

            // Persist progress for a trailing, partially filled batch.
            if (processedInBatch > 0 && newestCreated.HasValue)
            {
                await cursor.Write(newestCreated.Value);
            }
        }
    }
}
/// <summary>
/// Reads metadata records from a CSV file and applies each record created at or after the cursor time
/// to the matching package in the database, committing in batches of <c>UpdateBatchSize</c>.
/// </summary>
/// <param name="connection">SQL connection used to create the writable entities context.</param>
/// <param name="fileName">Path of the CSV file to read; it must exist.</param>
/// <returns>A task that completes once every record in the file has been read.</returns>
/// <exception cref="ArgumentException">Thrown when <paramref name="fileName"/> does not exist.</exception>
/// <remarks>
/// Records whose package cannot be found by id and normalized version are reported through the error logger
/// and skipped. What a commit persists is decided by <c>CommitBatch</c>, which is not visible here;
/// presumably it saves the pending changes and writes the cursor - confirm against its implementation.
/// </remarks>
public async Task Update(SqlConnection connection, string fileName)
{
    if (!File.Exists(fileName))
    {
        throw new ArgumentException($"File '{fileName}' doesn't exist");
    }

    using (var context = new EntitiesContext(connection, readOnly: false))
    using (var cursor = new FileCursor(CursorFileName))
    using (var logger = new Logger(ErrorsFileName))
    {
        var startTime = await cursor.Read();
        logger.Log($"Starting database update - Cursor time: {startTime:u}");

        var repository = new EntityRepository<Package>(context);
        var packages = repository.GetAll().Include(p => p.PackageRegistration);

        using (var csv = CreateCsvReader(fileName))
        {
            var counter = 0;
            var lastCreatedDate = default(DateTime?);

            var result = await TryReadMetadata(csv);

            while (result.Success)
            {
                var metadata = result.Metadata;

                // Records older than the cursor are skipped.
                if (metadata.Created >= startTime)
                {
                    var package = packages.FirstOrDefault(p => p.PackageRegistration.Id == metadata.Id && p.NormalizedVersion == metadata.Version);

                    if (package != null)
                    {
                        UpdatePackage(package, metadata.Metadata, context);
                        logger.LogPackage(metadata.Id, metadata.Version, "Metadata updated.");

                        counter++;

                        // Track the newest creation time from the file. The comparison and the
                        // assignment use the same value (the record's timestamp), so the tracked
                        // maximum can never move backwards when the database value differs from it.
                        if (!lastCreatedDate.HasValue || lastCreatedDate < metadata.Created)
                        {
                            lastCreatedDate = metadata.Created;
                        }
                    }
                    else
                    {
                        await logger.LogPackageError(metadata.Id, metadata.Version, "Could not find package in the database.");
                    }
                }

                // counter only grows when a package was updated, so this fires on the iteration
                // that filled the batch.
                if (counter >= UpdateBatchSize)
                {
                    await CommitBatch(context, cursor, logger, metadata.Created);
                    counter = 0;
                }

                result = await TryReadMetadata(csv);
            }

            // Commit whatever is left in a partially filled batch.
            if (counter > 0)
            {
                await CommitBatch(context, cursor, logger, lastCreatedDate);
            }
        }
    }
}
/// <summary>
/// Queries packages with status <c>PackageStatus.Available</c> or <c>PackageStatus.Validating</c> whose
/// creation time lies strictly between the cursor time and <paramref name="lastCreateTime"/>, fetches
/// metadata for each from its nuspec URI, and writes the resulting records to a CSV file.
/// </summary>
/// <param name="connectionString">Connection string used to create the read-only entities context.</param>
/// <param name="serviceDiscoveryUri">URI passed to <c>GetFlatContainerUri</c> to resolve the flat container base address.</param>
/// <param name="lastCreateTime">Exclusive upper bound on the package creation time.</param>
/// <param name="fileName">Name passed to <c>CreateCsvWriter</c> for the output file.</param>
/// <returns>A task that completes once every queried package has been processed.</returns>
/// <remarks>
/// A failure while processing a single package is logged through the error logger and does not stop the run.
/// The cursor is written every <c>CollectBatchSize</c> packages and once more at the end for a partial batch.
/// </remarks>
public async Task Collect(string connectionString, Uri serviceDiscoveryUri, DateTime? lastCreateTime, string fileName)
{
    using (var context = new EntitiesContext(connectionString, readOnly: true))
    using (var cursor = new FileCursor(CursorFileName))
    using (var logger = new Logger(ErrorsFileName))
    {
        var startTime = await cursor.Read();
        logger.Log($"Starting metadata collection - Cursor time: {startTime:u}");

        // Packages inside the (cursor, lastCreateTime) window, oldest first.
        var query = new EntityRepository<Package>(context)
            .GetAll()
            .Include(p => p.PackageRegistration)
            .Where(p => p.Created > startTime && p.Created < lastCreateTime)
            .Where(p => p.PackageStatusKey == PackageStatus.Available || p.PackageStatusKey == PackageStatus.Validating)
            .OrderBy(p => p.Created);

        var flatContainerUri = await GetFlatContainerUri(serviceDiscoveryUri);

        using (var csv = CreateCsvWriter(fileName))
        using (var http = new HttpClient())
        {
            var processedInBatch = 0;
            DateTime? newestCreated = null;

            foreach (var current in query)
            {
                var id = current.PackageRegistration.Id;
                var version = current.NormalizedVersion;

                var idLowered = id.ToLowerInvariant();
                var versionLowered = version.ToLowerInvariant();
                var nuspecUri = $"{flatContainerUri}/{idLowered}/{versionLowered}/{idLowered}.nuspec";

                try
                {
                    var metadata = await FetchMetadata(http, nuspecUri);

                    if (ShouldWriteMetadata(metadata))
                    {
                        csv.WriteRecord(new PackageMetadata(id, version, metadata, current.Created));
                        await csv.NextRecordAsync();

                        logger.LogPackage(id, version, "Metadata saved.");
                    }
                }
                catch (Exception e)
                {
                    // Best effort: record the failure and move on to the next package.
                    await logger.LogPackageError(id, version, e);
                }

                processedInBatch++;

                // Remember the newest creation time seen so the final cursor write can use it.
                if (!newestCreated.HasValue || current.Created > newestCreated)
                {
                    newestCreated = current.Created;
                }

                if (processedInBatch >= CollectBatchSize)
                {
                    logger.Log($"Writing {current.Created:u} to cursor...");
                    await cursor.Write(current.Created);
                    processedInBatch = 0;
                }
            }

            // Persist progress for a trailing, partially filled batch.
            if (processedInBatch > 0 && newestCreated.HasValue)
            {
                await cursor.Write(newestCreated.Value);
            }
        }
    }
}