/// <summary>
/// Copies an Open Library dump into the versions bucket and records it as an archive entry.
/// </summary>
/// <param name="version">Download descriptor providing the source URL, datestamp, archive type and object name.</param>
/// <param name="ct">Token forwarded to the lookup, transfer and save calls.</param>
/// <returns>The archive entry that was passed to <c>SaveArchiveEntry</c>.</returns>
/// <exception cref="ArgumentException">
/// An entry with the same publish date was found among the entries returned for the
/// one-day-either-side window around the download's datestamp.
/// </exception>
public async Task<OpenLibraryVersion> SaveArchive(OpenLibraryDownload version, CancellationToken ct)
{
    var publishDate = version.Datestamp;

    // Look one day either side of the datestamp, then require an exact date match below.
    var nearbyEntries = await FindArchiveEntries(
        publishDate.AddDays(-1),
        publishDate.AddDays(1),
        version.ArchiveType,
        ct);

    var alreadyArchived = nearbyEntries?.Any(entry => entry.PublishDate == publishDate) ?? false;
    if (alreadyArchived)
    {
        throw new ArgumentException($"{version.ArchiveType.GetKey()} is already in the archives");
    }

    var upload = await _storageStreamer.StreamHttpToS3(
        version.Source,
        _openLibVersionsBucket,
        version.ObjectName,
        ct);

    // Populate the entry from the download descriptor plus what the transfer reported.
    var archived = new OpenLibraryVersion();
    archived.SourceUrl = version.Source;
    archived.Kind = version.ArchiveType.GetKey();
    archived.ObjectName = version.ObjectName;
    archived.Bytes = upload.Bytes;
    archived.Uri = upload.DestinationUrl;
    archived.PublishDate = publishDate;

    await SaveArchiveEntry(archived, ct);
    return archived;
}
/// <summary>
/// Probes archive.org for the Open Library dump of the given type and date.
/// </summary>
/// <param name="date">Dump date; formatted into the URL via <c>ToIsoDateString()</c>.</param>
/// <param name="archiveType">Dump type; its <c>GetKey()</c> value is inserted into the URL.</param>
/// <param name="ct">Token forwarded to the HTTP request.</param>
/// <returns>
/// A download descriptor (datestamp, source URL, archive type) when the request returns a
/// success status code; otherwise <c>null</c>.
/// </returns>
private async Task<OpenLibraryDownload> GetDownload(DateTime date, OpenLibraryArchiveType archiveType, CancellationToken ct)
{
    // Example: https://archive.org/download/ol_dump_2021-03-19/ol_dump_editions_2021-03-19.txt.gz
    var formattedDate = date.ToIsoDateString();
    var url = $"https://archive.org/download/ol_dump_{formattedDate}/ol_dump_{archiveType.GetKey()}_{formattedDate}.txt.gz";

    // Constant message templates (not interpolated strings) keep the log entries structured.
    _logger.LogInformation("Checking {Url}", url);

    var timer = Stopwatch.StartNew();
    OpenLibraryDownload dl = null;

    // ResponseHeadersRead: the call completes once headers arrive; the body is never read here.
    using (var response = await _client.GetAsync(url, HttpCompletionOption.ResponseHeadersRead, ct))
    {
        // Stop as soon as the headers are back so disposal time is not measured.
        timer.Stop();

        if (response.IsSuccessStatusCode)
        {
            dl = new OpenLibraryDownload
            {
                Datestamp = date,
                Source = url,
                ArchiveType = archiveType,
            };
        }
    }

    _logger.LogInformation(
        "Checked {Url} in {ElapsedMilliseconds}ms. Exists = {Exists}",
        url,
        timer.ElapsedMilliseconds,
        dl is not null);

    return dl;
}
/// <summary>
/// Returns the object name for a download by delegating to the <c>GetObjectName</c>
/// overload that takes a datestamp and an archive type.
/// </summary>
/// <param name="dl">Download whose <c>Datestamp</c> and <c>ArchiveType</c> are passed on.</param>
/// <returns>The value produced by the two-argument <c>GetObjectName</c> overload.</returns>
public static string GetObjectName(this OpenLibraryDownload dl)
{
    var datestamp = dl.Datestamp;
    var archiveType = dl.ArchiveType;
    return GetObjectName(datestamp, archiveType);
}