Example #1
0
 /// <summary>
 ///   Creates the collector from its injected dependencies.
 /// </summary>
 /// <param name="stores">Blob stores; the <c>DataStoreType.DbStage</c> store is wrapped to create <c>DbStore</c>.</param>
 /// <param name="cfg">Application configuration, kept in <c>Cfg</c>.</param>
 /// <param name="sf">Snowflake connection provider, kept in <c>Sf</c>.</param>
 /// <param name="pipeCtx">Pipeline context, kept in <c>PipeCtx</c>.</param>
 /// <param name="ytWeb">Web scraper, kept in <c>Scraper</c>.</param>
 /// <param name="api">API client, kept in <c>Api</c>.</param>
 /// <param name="log">Logger; only handed to <c>DbStore</c> here, it is not stored on the collector itself.</param>
 public YtCollector(
     BlobStores stores,
     AppCfg cfg,
     SnowflakeConnectionProvider sf,
     IPipeCtx pipeCtx,
     YtWeb ytWeb,
     YtClient api,
     ILogger log)
 {
     // The stage store is built first, exactly as before, so a bad `stores` argument fails before anything else is set.
     DbStore = new(stores.Store(DataStoreType.DbStage), log);

     // Remaining dependencies are stored as-is (assigned left to right).
     (Cfg, Sf, PipeCtx, Scraper, Api) = (cfg, sf, pipeCtx, ytWeb, api);
 }
Example #2
0
 /// <summary>
 ///   Creates the command, keeping hold of the injected dependencies.
 /// </summary>
 /// <param name="stores">Blob stores, kept in <c>Stores</c>.</param>
 /// <param name="scraper">Web scraper, kept in <c>Scraper</c>.</param>
 /// <param name="log">Logger, kept in <c>Log</c>.</param>
 public TrafficCmd(BlobStores stores, YtWeb scraper, ILogger log)
 {
     // Single tuple assignment; members are assigned left to right, matching the previous statement order.
     (Stores, Scraper, Log) = (stores, scraper, log);
 }
Example #3
0
        /// <summary>
        ///   Reads the traffic source export zips listed under <c>rec_exports</c> in <paramref name="store"/>, converts
        ///   their <c>Table data.csv</c> rows into <see cref="TrafficSourceRow"/>s and appends them to the
        ///   <c>rec_exports_processed</c> JSONL store.
        /// </summary>
        /// <remarks>
        ///   Only exports modified after the timestamp of the latest processed file are handled; when no processed
        ///   file exists yet, every export is handled. Rows whose source is not of the form
        ///   <c>YT_RELATED.&lt;videoId&gt;</c> are skipped.
        /// </remarks>
        /// <param name="store">Store that holds the export zips and receives the processed rows.</param>
        /// <param name="ytWeb">Used to look up the video each traffic source row came from.</param>
        /// <param name="log">Logger for progress messages; also passed to the store and the video lookup.</param>
        /// <exception cref="InvalidOperationException">
        ///   An export's file name does not match the expected
        ///   <c>Traffic source {from}_{to} {channel}</c> format, or the archive has no <c>Table data.csv</c> entry.
        /// </exception>
        public static async Task Process(ISimpleFileStore store, YtWeb ytWeb, ILogger log)
        {
            var blobs = await store.List("rec_exports").SelectManyList();

            //blobs = blobs.Where(b => b.Path == "rec_exports/Traffic source 2019-07-01_2019-08-01 David Pakman Show.zip").ToList();

            var fileInfoRegex = new Regex("^Traffic source (?'from'\\d+-\\d+-\\d+)_(?'to'\\d+-\\d+-\\d+) (?'channel'[^.]+)", RegexOptions.Compiled);

            var appendStore = new JsonlStore <TrafficSourceRow>(store, "rec_exports_processed", r => r.FileUpdated.FileSafeTimestamp(), log);

            var md = await appendStore.LatestFile();

            var latestModified = md?.Ts.ParseFileSafeTimestamp();

            // With no previously processed file there is nothing to compare against, so take everything.
            var newBlobs = latestModified != null
                ? blobs.Where(b => b.Modified > latestModified).ToList()
                : blobs;

            log.Information("Processing {NewExports}/{AllExports} exports", newBlobs.Count, blobs.Count);

            foreach (var b in newBlobs)
            {
                log.Information("Processing {Path}", b.Path);

                var m = fileInfoRegex.Match(b.Path.Name);

                // Groups.Count reflects the groups defined in the pattern, not whether the match succeeded,
                // so Success is the only reliable signal that the file name could be parsed.
                if (!m.Success)
                {
                    throw new InvalidOperationException($"unable to parse export info from file name '{b.Path.Name}'");
                }
                var exportInfo = new {
                    Channel = m.Groups["channel"].Value,
                    From    = m.Groups["from"].Value.ParseDate(),
                    To      = m.Groups["to"].Value.ParseDate()
                };

                // Dispose the blob stream and the archive at the end of each iteration so handles are not
                // accumulated across exports (everything needed is materialised before the scope ends).
                using var stream = await store.Load(b.Path);
                using var zip    = new ZipArchive(stream);
                using var csvStream = new StreamReader(
                          zip.GetEntry("Table data.csv")?.Open() ?? throw new InvalidOperationException("expected export to have 'Table data.csv'"),
                          Encoding.UTF8);
                using var csvReader = new CsvReader(csvStream, CsvExtensions.DefaultConfig);

                var records = csvReader.GetRecords <TrafficSourceExportRow>().ToList();
                var rows    = (await records.BlockFunc(ToTrafficSourceRow, 4,
                                                       progressUpdate: p => log.Debug("Processing traffic sources for {Path}: {Rows}/{TotalRows}", b.Path, p.Completed, records.Count)))
                              .NotNull().ToList();

                await appendStore.Append(rows, log);

                log.Information("Completed processing traffic source exports for {Path}", b.Path);

                // Maps one CSV row to an output row, or null when the row is not a related-video source.
                async Task <TrafficSourceRow> ToTrafficSourceRow(TrafficSourceExportRow row)
                {
                    var source = row.Source.Split(".");

                    if (source.Length != 2 || source[0] != "YT_RELATED")
                    {
                        return(null); // total at the top or otherwise. not interested
                    }
                    var videoId   = source[1];
                    var fromVideo = await ytWeb.GetVideo(videoId, log);

                    // fromVideo may be null; the From* fields are then left null rather than dropping the row.
                    return(new TrafficSourceRow {
                        ToChannelTitle = exportInfo.Channel,
                        From = exportInfo.From,
                        To = exportInfo.To,
                        Impressions = row.Impressions,
                        Source = row.Source,
                        AvgViewDuration = row.AvgViewDuration,
                        Views = row.Views,
                        SourceType = row.SourceType,
                        FromChannelId = fromVideo?.ChannelId,
                        FromChannelTitle = fromVideo?.ChannelTitle,
                        FromVideoId = fromVideo?.Id,
                        FromVideoTitle = fromVideo?.Title,
                        ImpressionClickThrough = row.ImpressionClickThrough,
                        WatchTimeHrsTotal = row.WatchTimeHrsTotal,
                        FileUpdated = b.Modified?.UtcDateTime ?? DateTime.MinValue
                    });
                }
            }

            log.Information("Completed processing traffic source exports");
        }