/// <summary>
/// Returns the most recently appended file in <c>Path</c>, ordered by the timestamp encoded
/// in the file name. Hidden entries (names starting with "_") are ignored.
/// Returns null when no eligible file exists.
/// </summary>
async Task<FileListItem> LatestFile() {
  var allItems = await Store.List(Path).SelectManyList();
  var visible = allItems.Where(f => !f.Path.Name.StartsWith("_"));
  // newest first by the timestamp embedded in the file name
  return visible.OrderByDescending(f => StoreFileMd.GetTs(f.Path)).FirstOrDefault();
}
/// <summary>
/// Lists all files (recursively) under <paramref name="path"/> whose recorded version equals
/// <paramref name="fromVersion"/>. A file with no version is treated as version 0.
/// </summary>
async Task<List<StoreFileMd>> FilesToUpgrade(StringPath path, int fromVersion) {
  var items = await Store.List(path, true).SelectManyList();
  var upgradeList = new List<StoreFileMd>();
  foreach (var item in items) {
    var md = StoreFileMd.FromFileItem(item);
    if ((md.Version ?? "0").ParseInt() == fromVersion)
      upgradeList.Add(md);
  }
  return upgradeList;
}
/// <summary>
/// Streams file metadata page-by-page from <paramref name="store"/> under <paramref name="path"/>.
/// Each yielded collection is one listing page, filtered to visible files (names not starting
/// with "_") that carry the expected <c>Extension</c>.
/// </summary>
public static async IAsyncEnumerable<IReadOnlyCollection<StoreFileMd>> Files(this ISimpleFileStore store, StringPath path, bool allDirectories = false) {
  await foreach (var page in store.List(path, allDirectories)) {
    var pageMd = new List<StoreFileMd>();
    foreach (var item in page) {
      var name = item.Path.Name;
      if (name.StartsWith("_") || !name.EndsWith(Extension)) continue; // skip hidden & foreign files
      pageMd.Add(StoreFileMd.FromFileItem(item));
    }
    yield return pageMd;
  }
}
// NOTE(review): this declaration appears truncated in this chunk — the modifiers and return type
// (presumably `public static async IAsyncEnumerable<IReadOnlyCollection<StoreFileMd>>`, matching
// the sibling Files() iterator, given the `await`/`yield return` in the body) are not visible here;
// confirm against the full file before editing.
// Eagerly materializes the whole listing under `path`, then yields one filtered page at a time:
// hidden entries (names starting with "_") are dropped and only names ending in Extension are kept.
JsonStoreFiles(this ISimpleFileStore store, SPath path, bool allDirectories = false) { var allFiles = await store.List(path, allDirectories).ToArrayAsync(); foreach (var b in allFiles.Select(b => b .Where(p => !p.Path.Name.StartsWith("_") && p.Path.Name.EndsWith(Extension)) .Select(StoreFileMd.FromFileItem).ToArray()) .Select(dummy => (IReadOnlyCollection <StoreFileMd>)dummy)) { yield return(b); } }
/// <summary>
/// One-off migration: finds every *.csv under import/watch_time, parses each file's rows as
/// dynamic records (UTF-8, header row expected; missing fields tolerated, bad rows logged as
/// warnings), and saves the result next to the source as gzipped JSONL
/// (<c>{name}.json.gz</c>). Up to 4 files are converted in parallel.
/// NOTE(review): the `Configuration` object-initializer implies an older CsvHelper API —
/// newer versions take a CsvConfiguration in the constructor; confirm the pinned package version.
/// </summary>
public async Task Convert(ILogger log) { var files = (await Store.List("import/watch_time").SelectManyList()).Where(f => f.Path.ExtensionsString == "csv"); await files.BlockAction(async f => { using var stream = await Store.Load(f.Path); using var sr = new StreamReader(stream); using var csv = new CsvReader(sr, CultureInfo.InvariantCulture) { Configuration = { Encoding = Encoding.UTF8, HasHeaderRecord = true, MissingFieldFound = null, BadDataFound = r => log.Warning("Error reading csv data at {RowNumber}: {RowData}", r.Row, r.RawRecord) } }; var rows = await csv.GetRecordsAsync <dynamic>().ToListAsync(); await Store.Save(f.Path.Parent.Add($"{f.Path.NameSansExtension}.json.gz"), await rows.ToJsonlGzStream(), log); }, parallelism : 4); }
/// <summary>
/// Lists file metadata under <paramref name="path"/> in <paramref name="store"/>, skipping
/// hidden files (names prefixed with "_") and anything without the expected <c>Extension</c>.
/// </summary>
public static async Task<IReadOnlyCollection<StoreFileMd>> Files(this ISimpleFileStore store, StringPath path, bool allDirectories = false) {
  var items = await store.List(path, allDirectories).SelectManyList();
  var result = new List<StoreFileMd>();
  foreach (var item in items) {
    var name = item.Path.Name;
    if (name.StartsWith("_") || !name.EndsWith(Extension)) continue;
    result.Add(StoreFileMd.FromFileItem(item));
  }
  return result;
}
/// <summary>
/// Processes "Traffic source" zip exports found under rec_exports: parses the embedded
/// "Table data.csv", enriches YT_RELATED rows with the source video/channel (looked up via
/// <paramref name="ytWeb"/>) and appends the results to the rec_exports_processed JSONL store.
/// Incremental: only exports modified after the latest already-processed file are handled.
/// </summary>
/// <exception cref="InvalidOperationException">
/// When an export's file name doesn't match the expected pattern, or the zip lacks "Table data.csv".
/// </exception>
public static async Task Process(ISimpleFileStore store, YtWeb ytWeb, ILogger log) {
  var blobs = await store.List("rec_exports").SelectManyList();
  // file name convention: "Traffic source {from}_{to} {channel}.zip"
  var fileInfoRegex = new Regex("^Traffic source (?'from'\\d+-\\d+-\\d+)_(?'to'\\d+-\\d+-\\d+) (?'channel'[^.]+)", RegexOptions.Compiled);
  var appendStore = new JsonlStore<TrafficSourceRow>(store, "rec_exports_processed", r => r.FileUpdated.FileSafeTimestamp(), log);
  var md = await appendStore.LatestFile();
  var latestModified = md?.Ts.ParseFileSafeTimestamp();
  // incremental: skip blobs already covered by the newest processed file
  var newBlobs = latestModified != null ? blobs.Where(b => b.Modified > latestModified).ToList() : blobs;
  log.Information("Processing {NewExports}/{AllExports} exports", newBlobs.Count, blobs.Count);

  foreach (var b in newBlobs) {
    log.Information("Processing {Path}", b.Path);
    var m = fileInfoRegex.Match(b.Path.Name);
    // a failed Regex.Match returns Match.Empty — check Success directly rather than the
    // previous fragile `Groups.Count < 3` (group count depends on the pattern, not the match)
    if (!m.Success)
      throw new InvalidOperationException($"unable to parse export info from file name '{b.Path.Name}'");
    var exportInfo = new {
      Channel = m.Groups["channel"].Value,
      From = m.Groups["from"].Value.ParseDate(),
      To = m.Groups["to"].Value.ParseDate()
    };

    // dispose the blob stream and archive when this export is done (previously leaked)
    using var stream = await store.Load(b.Path);
    using var zip = new ZipArchive(stream);
    using var csvStream = new StreamReader(
      zip.GetEntry("Table data.csv")?.Open() ?? throw new InvalidOperationException("expected export to have 'Table data.csv'"),
      Encoding.UTF8);
    using var csvReader = new CsvReader(csvStream, CsvExtensions.DefaultConfig);
    var records = csvReader.GetRecords<TrafficSourceExportRow>().ToList();
    var rows = (await records.BlockFunc(ToTrafficSourceRow, 4,
        progressUpdate: p => log.Debug("Processing traffic sources for {Path}: {Rows}/{TotalRows}", b.Path, p.Completed, records.Count)))
      .NotNull().ToList();
    await appendStore.Append(rows, log);
    log.Information("Completed processing traffic source exports for {Path}", b.Path);

    // maps one export row to a store row; null for rows that aren't YT_RELATED sources
    async Task<TrafficSourceRow> ToTrafficSourceRow(TrafficSourceExportRow row) {
      var source = row.Source.Split(".");
      if (source.Length != 2 || source[0] != "YT_RELATED")
        return null; // total at the top or otherwise. not interested
      var videoId = source[1];
      var fromVideo = await ytWeb.GetVideo(videoId, log);
      return new TrafficSourceRow {
        ToChannelTitle = exportInfo.Channel,
        From = exportInfo.From,
        To = exportInfo.To,
        Impressions = row.Impressions,
        Source = row.Source,
        AvgViewDuration = row.AvgViewDuration,
        Views = row.Views,
        SourceType = row.SourceType,
        FromChannelId = fromVideo?.ChannelId,
        FromChannelTitle = fromVideo?.ChannelTitle,
        FromVideoId = fromVideo?.Id,
        FromVideoTitle = fromVideo?.Title,
        ImpressionClickThrough = row.ImpressionClickThrough,
        WatchTimeHrsTotal = row.WatchTimeHrsTotal,
        FileUpdated = b.Modified?.UtcDateTime ?? DateTime.MinValue
      };
    }
  }
  log.Information("Completed processing traffic source exports");
}