/// <summary>
///   Persists the given videos for channel <paramref name="c" /> into <paramref name="vidStore" />,
///   stamping every record with one shared Updated timestamp, then logs how many of them were
///   new versus updated. A video counts as "new" when it was uploaded after
///   <paramref name="uploadedFrom" />, or unconditionally when no lower bound is supplied.
/// </summary>
static async Task SaveVids(ChannelStored2 c, IReadOnlyCollection<VideoItem> vids, JsonlStore<VideoStored2> vidStore, DateTime? uploadedFrom, ILogger log) {
  // one timestamp shared by the whole batch so records from the same save are grouped
  var timestamp = DateTime.UtcNow;
  var toStore = vids.Select(v => new VideoStored2 {
    VideoId = v.Id,
    Title = v.Title,
    Description = v.Description,
    Duration = v.Duration,
    Keywords = v.Keywords.ToList(),
    Statistics = v.Statistics,
    ChannelId = c.ChannelId,
    ChannelTitle = c.ChannelTitle,
    UploadDate = v.UploadDate,
    AddedDate = v.AddedDate,
    Updated = timestamp
  }).ToList();

  if (toStore.Count > 0)
    await vidStore.Append(toStore, log);

  // without a lower bound every stored video is counted as new
  var newCount = toStore.Count(v => uploadedFrom == null || v.UploadDate > uploadedFrom);
  log.Information("Collect - {Channel} - Recorded {VideoCount} videos. {NewCount} new, {UpdatedCount} updated",
    c.ChannelTitle, vids.Count, newCount, vids.Count - newCount);
}
/// <summary>
///   Processes traffic-source export zips from the "rec_exports" store path that are newer than the
///   last processed file, parsing channel/date-range info from each file name, reading the contained
///   "Table data.csv", resolving YT_RELATED sources to their origin video/channel via
///   <paramref name="ytWeb" />, and appending the resulting rows to "rec_exports_processed".
/// </summary>
/// <exception cref="InvalidOperationException">
///   When an export file name doesn't match the expected pattern, or the zip lacks 'Table data.csv'.
/// </exception>
public static async Task Process(ISimpleFileStore store, YtWeb ytWeb, ILogger log) {
  var blobs = await store.List("rec_exports").SelectManyList();
  //blobs = blobs.Where(b => b.Path == "rec_exports/Traffic source 2019-07-01_2019-08-01 David Pakman Show.zip").ToList();
  var fileInfoRegex = new Regex("^Traffic source (?'from'\\d+-\\d+-\\d+)_(?'to'\\d+-\\d+-\\d+) (?'channel'[^.]+)", RegexOptions.Compiled);
  var appendStore = new JsonlStore<TrafficSourceRow>(store, "rec_exports_processed", r => r.FileUpdated.FileSafeTimestamp(), log);
  // only process exports modified after the latest already-processed file (incremental)
  var md = await appendStore.LatestFile();
  var latestModified = md?.Ts.ParseFileSafeTimestamp();
  var newBlobs = latestModified != null ? blobs.Where(b => b.Modified > latestModified).ToList() : blobs;
  log.Information("Processing {NewExports}/{AllExports} exports", newBlobs.Count, blobs.Count);

  foreach (var b in newBlobs) {
    log.Information("Processing {Path}", b.Path);
    var m = fileInfoRegex.Match(b.Path.Name);
    // FIX: the old check `m.Groups.Count < 3` could never fire — Groups.Count reflects the
    // regex's declared group count regardless of match success. Use Match.Success instead.
    if (!m.Success)
      throw new InvalidOperationException($"unable to parse export info from file name '{b.Path.Name}'");
    var exportInfo = new {
      Channel = m.Groups["channel"].Value,
      From = m.Groups["from"].Value.ParseDate(),
      To = m.Groups["to"].Value.ParseDate()
    };

    // FIX: dispose the blob stream and the ZipArchive (both were previously leaked;
    // only the StreamReader/CsvReader were disposed)
    using var stream = await store.Load(b.Path);
    using var zip = new ZipArchive(stream);
    using var csvStream = new StreamReader(
      zip.GetEntry("Table data.csv")?.Open() ??
      throw new InvalidOperationException("expected export to have 'Table data.csv'"), Encoding.UTF8);
    using var csvReader = new CsvReader(csvStream, CsvExtensions.DefaultConfig);
    var records = csvReader.GetRecords<TrafficSourceExportRow>().ToList();
    // resolve rows concurrently (4 at a time), dropping nulls (non-YT_RELATED sources)
    var rows = (await records.BlockFunc(ToTrafficSourceRow, 4,
        progressUpdate: p => log.Debug("Processing traffic sources for {Path}: {Rows}/{TotalRows}", b.Path, p.Completed, records.Count)))
      .NotNull().ToList();
    await appendStore.Append(rows, log);
    log.Information("Completed processing traffic source exports for {Path}", b.Path);

    // Maps a raw CSV row to a stored row, looking up the source video's channel.
    // Returns null for rows that are not YT_RELATED (e.g. the totals row).
    async Task<TrafficSourceRow> ToTrafficSourceRow(TrafficSourceExportRow row) {
      var source = row.Source.Split(".");
      if (source.Length != 2 || source[0] != "YT_RELATED")
        return null; // total at the top or otherwise. not interested
      var videoId = source[1];
      var fromVideo = await ytWeb.GetVideo(videoId, log);
      return new TrafficSourceRow {
        ToChannelTitle = exportInfo.Channel,
        From = exportInfo.From,
        To = exportInfo.To,
        Impressions = row.Impressions,
        Source = row.Source,
        AvgViewDuration = row.AvgViewDuration,
        Views = row.Views,
        SourceType = row.SourceType,
        FromChannelId = fromVideo?.ChannelId,
        FromChannelTitle = fromVideo?.ChannelTitle,
        FromVideoId = fromVideo?.Id,
        FromVideoTitle = fromVideo?.Title,
        ImpressionClickThrough = row.ImpressionClickThrough,
        WatchTimeHrsTotal = row.WatchTimeHrsTotal,
        FileUpdated = b.Modified?.UtcDateTime ?? DateTime.MinValue
      };
    }
  }
  log.Information("Completed processing traffic source exports");
}