static async Task<StringPath> JoinFiles(ISimpleFileStore store, IReadOnlyCollection<StoreFileMd> toOptimise, StringPath destPath, int parallel, ILogger log) {
  var optimisedFile = FilePath(destPath, toOptimise.Last().Ts);
  using (var joinedStream = new MemoryStream()) {
    using (var zipWriter = new GZipStream(joinedStream, CompressionLevel.Optimal, true)) {
      var inStreams = await toOptimise.BlockFunc(async s => {
        var inStream = await store.Load(s.Path, log).WithDuration();
        log.Debug("Optimise {Path} - loaded file {SourceFile} to be optimised in {Duration}", destPath, s.Path, inStream.Duration.HumanizeShort());
        return inStream.Result;
      }, parallel);
      // decompress each source file and re-compress it into the single joined stream
      foreach (var s in inStreams) {
        using var zr = new GZipStream(s, CompressionMode.Decompress, false);
        await zr.CopyToAsync(zipWriter);
      }
    }
    joinedStream.Seek(0, SeekOrigin.Begin);
    await store.Save(optimisedFile, joinedStream);
  }

  // When optimising in-place, state is dirty if we fail from here on. Cloud storage has no transaction
  // capability, so the downstream process must handle duplicate records.
  // The files were staged successfully, so delete them from the landing area. Incremental loading via Ts
  // would work without the delete, but it's more efficient to remove already-processed landed files.
  await toOptimise.BlockAction(f => store.Delete(f.Path), parallel)
    .WithWrappedException(e => "Failed to delete optimised files. Duplicate records need to be handled downstream");
  log.Debug("Optimise {Path} - deleted {Files} that were optimised into {OptimisedFile}", destPath, toOptimise.Count, optimisedFile);

  return optimisedFile;
}
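// A minimal sketch of how JoinFiles might be driven (hypothetical caller: the batch size, destination
// path and parallelism are illustrative, and Batch is assumed to come from MoreLINQ or an equivalent
// extension; none of this is prescribed by JoinFiles itself).
static async Task<IReadOnlyCollection<StringPath>> JoinInBatches(ISimpleFileStore store, IReadOnlyCollection<StoreFileMd> landed, StringPath destPath, ILogger log) {
  var joined = new List<StringPath>();
  // keep files in Ts order so each joined file is named after the latest timestamp it contains
  foreach (var batch in landed.OrderBy(f => f.Ts).Batch(200))
    joined.Add(await JoinFiles(store, batch.ToList(), destPath, parallel: 4, log));
  return joined;
}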
async Task<IReadOnlyCollection<JObject>> Jsonl(StoreFileMd f) {
  await using var sr = await Store.Load(f.Path);
  var existingJs = sr.LoadJsonlGz<JObject>();
  return existingJs;
}
public static async Task<T> Get<T>(this ISimpleFileStore store, StringPath path, bool zip = true, ILogger log = null) {
  using var stream = await store.Load(path.AddJsonExtention(zip), log);
  if (!zip)
    return stream.ToObject<T>();
  await using var zr = new GZipStream(stream, CompressionMode.Decompress, true);
  return zr.ToObject<T>();
}
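// The write-side counterpart implied by Get - a sketch only, assuming the same conventions (gzip when
// zip = true, extension via AddJsonExtention) rather than the codebase's actual save helper. ToJson is
// a hypothetical serializer standing in for whatever pairs with ToObject here.
public static async Task Set<T>(this ISimpleFileStore store, StringPath path, T value, bool zip = true, ILogger log = null) {
  using var stream = new MemoryStream();
  if (zip) {
    await using var zw = new GZipStream(stream, CompressionLevel.Optimal, leaveOpen: true);
    await using var tw = new StreamWriter(zw);
    await tw.WriteAsync(value.ToJson()); // hypothetical helper
  }
  else {
    await using var tw = new StreamWriter(stream, leaveOpen: true);
    await tw.WriteAsync(value.ToJson()); // hypothetical helper
  }
  stream.Seek(0, SeekOrigin.Begin);
  await store.Save(path.AddJsonExtention(zip), stream, log);
}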
public async Task Convert(ILogger log) {
  var files = (await Store.List("import/watch_time").SelectManyList())
    .Where(f => f.Path.ExtensionsString == "csv");
  await files.BlockAction(async f => {
    using var stream = await Store.Load(f.Path);
    using var sr = new StreamReader(stream);
    using var csv = new CsvReader(sr, CultureInfo.InvariantCulture) {
      Configuration = {
        Encoding = Encoding.UTF8,
        HasHeaderRecord = true,
        MissingFieldFound = null,
        BadDataFound = r => log.Warning("Error reading csv data at {RowNumber}: {RowData}", r.Row, r.RawRecord)
      }
    };
    var rows = await csv.GetRecordsAsync<dynamic>().ToListAsync();
    await Store.Save(f.Path.Parent.Add($"{f.Path.NameSansExtension}.json.gz"), await rows.ToJsonlGzStream(), log);
  }, parallelism: 4);
}
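// For reference, the conversion performed above (file names illustrative, not from this file):
//   import/watch_time/2020-01.csv  ->  import/watch_time/2020-01.json.gz
// Each CSV record becomes one JSON line in the gzipped output.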
public static async Task Process(ISimpleFileStore store, YtWeb ytWeb, ILogger log) {
  var blobs = await store.List("rec_exports").SelectManyList();
  //blobs = blobs.Where(b => b.Path == "rec_exports/Traffic source 2019-07-01_2019-08-01 David Pakman Show.zip").ToList();
  var fileInfoRegex = new Regex("^Traffic source (?'from'\\d+-\\d+-\\d+)_(?'to'\\d+-\\d+-\\d+) (?'channel'[^.]+)", RegexOptions.Compiled);
  var appendStore = new JsonlStore<TrafficSourceRow>(store, "rec_exports_processed", r => r.FileUpdated.FileSafeTimestamp(), log);
  var md = await appendStore.LatestFile();
  var latestModified = md?.Ts.ParseFileSafeTimestamp();
  var newBlobs = latestModified != null
    ? blobs.Where(b => b.Modified > latestModified).ToList()
    : blobs;
  log.Information("Processing {NewExports}/{AllExports} exports", newBlobs.Count, blobs.Count);

  foreach (var b in newBlobs) {
    log.Information("Processing {Path}", b.Path);

    var m = fileInfoRegex.Match(b.Path.Name);
    if (!m.Success)
      throw new InvalidOperationException($"unable to parse export info from file name '{b.Path.Name}'");
    var exportInfo = new {
      Channel = m.Groups["channel"].Value,
      From = m.Groups["from"].Value.ParseDate(),
      To = m.Groups["to"].Value.ParseDate()
    };

    using var stream = await store.Load(b.Path);
    using var zip = new ZipArchive(stream);
    using var csvStream = new StreamReader(
      zip.GetEntry("Table data.csv")?.Open() ?? throw new InvalidOperationException("expected export to have 'Table data.csv'"),
      Encoding.UTF8);
    using var csvReader = new CsvReader(csvStream, CsvExtensions.DefaultConfig);
    var records = csvReader.GetRecords<TrafficSourceExportRow>().ToList();
    var rows = (await records.BlockFunc(ToTrafficSourceRow, 4,
        progressUpdate: p => log.Debug("Processing traffic sources for {Path}: {Rows}/{TotalRows}", b.Path, p.Completed, records.Count)))
      .NotNull().ToList();
    await appendStore.Append(rows, log);
    log.Information("Completed processing traffic source exports for {Path}", b.Path);

    async Task<TrafficSourceRow> ToTrafficSourceRow(TrafficSourceExportRow row) {
      var source = row.Source.Split(".");
      if (source.Length != 2 || source[0] != "YT_RELATED")
        return null; // a totals row or non-YT_RELATED source; not interested
      var videoId = source[1];
      var fromVideo = await ytWeb.GetVideo(videoId, log);
      return new TrafficSourceRow {
        ToChannelTitle = exportInfo.Channel,
        From = exportInfo.From,
        To = exportInfo.To,
        Impressions = row.Impressions,
        Source = row.Source,
        AvgViewDuration = row.AvgViewDuration,
        Views = row.Views,
        SourceType = row.SourceType,
        FromChannelId = fromVideo?.ChannelId,
        FromChannelTitle = fromVideo?.ChannelTitle,
        FromVideoId = fromVideo?.Id,
        FromVideoTitle = fromVideo?.Title,
        ImpressionClickThrough = row.ImpressionClickThrough,
        WatchTimeHrsTotal = row.WatchTimeHrsTotal,
        FileUpdated = b.Modified?.UtcDateTime ?? DateTime.MinValue
      };
    }
  }
  log.Information("Completed processing traffic source exports");
}
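// For reference, what fileInfoRegex extracts from an export blob name (sample taken from the
// commented-out filter in Process above):
//   "Traffic source 2019-07-01_2019-08-01 David Pakman Show.zip"
//     from    -> 2019-07-01
//     to      -> 2019-08-01
//     channel -> David Pakman Show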
async Task<IReadOnlyCollection<T>> LoadJsonl(StringPath path) {
  await using var stream = await Store.Load(path);
  return stream.LoadJsonlGz<T>();
}