/// <summary>
        ///   Returns the listed file with the greatest <c>StoreFileMd.GetTs</c> value (the most recently appended one),
        ///   ignoring files whose name starts with an underscore. Yields the default value when nothing is listed.
        /// </summary>
        async Task<FileListItem> LatestFile()
        {
            var listed = await Store.List(Path).SelectManyList();

            // File names are identifiers rather than linguistic text, so compare them ordinally.
            var candidates = listed.Where(p => !p.Path.Name.StartsWith("_", StringComparison.Ordinal));

            return candidates.OrderByDescending(f => StoreFileMd.GetTs(f.Path)).FirstOrDefault();
        }
        /// <summary>
        ///   Lists the files under <paramref name="path"/> (passing <c>true</c> as the second argument of
        ///   <c>Store.List</c>) and keeps those whose version, parsed as an integer, equals
        ///   <paramref name="fromVersion"/>. A missing version is treated as "0".
        /// </summary>
        async Task<List<StoreFileMd>> FilesToUpgrade(StringPath path, int fromVersion)
        {
            var listed   = await Store.List(path, true).SelectManyList();
            var allFiles = listed.Select(StoreFileMd.FromFileItem).ToList();

            var matching = new List<StoreFileMd>();
            foreach (var md in allFiles)
            {
                var version = (md.Version ?? "0").ParseInt();
                if (version == fromVersion)
                {
                    matching.Add(md);
                }
            }

            return matching;
        }
 /// <summary>
 ///   Streams the listing of <paramref name="path"/> one batch at a time, converting each batch into
 ///   <see cref="StoreFileMd"/> entries. Files whose name starts with an underscore, or does not end with
 ///   <c>Extension</c>, are left out.
 /// </summary>
 /// <param name="store">Store to list.</param>
 /// <param name="path">Path to list under.</param>
 /// <param name="allDirectories">Passed through to <c>store.List</c>.</param>
 /// <returns>One read-only collection for each batch produced by <c>store.List</c>.</returns>
 public static async IAsyncEnumerable<IReadOnlyCollection<StoreFileMd>> Files(this ISimpleFileStore store, StringPath path, bool allDirectories = false)
 {
     await foreach (var batch in store.List(path, allDirectories))
     {
         // File names are identifiers rather than linguistic text, so compare them ordinally.
         yield return batch
             .Where(f => !f.Path.Name.StartsWith("_", StringComparison.Ordinal)
                         && f.Path.Name.EndsWith(Extension, StringComparison.Ordinal))
             .Select(StoreFileMd.FromFileItem)
             .ToArray();
     }
 }
Example #4
0
    // NOTE(review): the modifiers and return type of this declaration are not visible in this excerpt.
    // Lists the store under `path` and yields one collection of StoreFileMd per listed batch. Within each
    // batch, only files whose name does not start with "_" and ends with Extension are kept.
    JsonStoreFiles(this ISimpleFileStore store, SPath path, bool allDirectories = false)
    {
        // The whole listing is buffered into an array before anything is yielded.
        var allFiles = await store.List(path, allDirectories).ToArrayAsync();

        // Filter and convert each batch, then cast the resulting array to the read-only collection interface.
        foreach (var b in allFiles.Select(b => b
                                          .Where(p => !p.Path.Name.StartsWith("_") && p.Path.Name.EndsWith(Extension))
                                          .Select(StoreFileMd.FromFileItem).ToArray())
                 .Select(dummy => (IReadOnlyCollection <StoreFileMd>)dummy))
        {
            yield return(b);
        }
    }
 /// <summary>
 ///   Reads every file listed under <c>import/watch_time</c> whose extension string is <c>csv</c> and saves its
 ///   records, via <c>ToJsonlGzStream</c>, to a <c>.json.gz</c> file with the same base name under the same parent path.
 /// </summary>
 /// <param name="log">Receives a warning for each piece of bad csv data and is passed on to <c>Store.Save</c>.</param>
 public async Task Convert(ILogger log)
 {
     var listed   = await Store.List("import/watch_time").SelectManyList();
     var csvFiles = listed.Where(item => item.Path.ExtensionsString == "csv");

     await csvFiles.BlockAction(async item => {
         using var source = await Store.Load(item.Path);
         using var reader = new StreamReader(source);
         using var parser = new CsvReader(reader, CultureInfo.InvariantCulture)
         {
             Configuration =
             {
                 Encoding = Encoding.UTF8,
                 HasHeaderRecord = true,
                 MissingFieldFound = null,
                 BadDataFound = ctx => log.Warning("Error reading csv data at {RowNumber}: {RowData}", ctx.Row, ctx.RawRecord)
             }
         };

         var records = await parser.GetRecordsAsync<dynamic>().ToListAsync();

         // Same evaluation order as a single call: target path first, then the payload, then the save.
         var target  = item.Path.Parent.Add($"{item.Path.NameSansExtension}.json.gz");
         var payload = await records.ToJsonlGzStream();
         await Store.Save(target, payload, log);
     }, parallelism: 4);
 }
        /// <summary>
        ///   Lists the files under <paramref name="path"/> and converts them to <see cref="StoreFileMd"/> entries.
        ///   Files whose name starts with an underscore, or does not end with <c>Extension</c>, are left out.
        /// </summary>
        /// <param name="store">Store to list.</param>
        /// <param name="path">Path to list under.</param>
        /// <param name="allDirectories">Passed through to <c>store.List</c>.</param>
        /// <returns>The matching files; empty when none match.</returns>
        public static async Task<IReadOnlyCollection<StoreFileMd>> Files(this ISimpleFileStore store, StringPath path, bool allDirectories = false)
        {
            var listed = await store.List(path, allDirectories).SelectManyList();

            // File names are identifiers rather than linguistic text, so compare them ordinally.
            return listed
                .Where(f => !f.Path.Name.StartsWith("_", StringComparison.Ordinal)
                            && f.Path.Name.EndsWith(Extension, StringComparison.Ordinal))
                .Select(StoreFileMd.FromFileItem)
                .ToList();
        }
Example #7
0
        /// <summary>
        ///   Processes the export archives listed under <c>rec_exports</c> and appends the resulting rows to the
        ///   <c>rec_exports_processed</c> store. When that store already has a latest file, only blobs modified
        ///   after that file's timestamp are processed.
        /// </summary>
        /// <param name="store">Store the exports are loaded from and the processed rows are appended to.</param>
        /// <param name="ytWeb">Used to look up the video referenced by each traffic source row.</param>
        /// <param name="log">Logger for progress messages; also passed to the store and video lookups.</param>
        /// <exception cref="InvalidOperationException">
        ///   An export's file name does not match the expected pattern, or its archive has no 'Table data.csv' entry.
        /// </exception>
        public static async Task Process(ISimpleFileStore store, YtWeb ytWeb, ILogger log)
        {
            var blobs = await store.List("rec_exports").SelectManyList();

            var fileInfoRegex = new Regex("^Traffic source (?'from'\\d+-\\d+-\\d+)_(?'to'\\d+-\\d+-\\d+) (?'channel'[^.]+)", RegexOptions.Compiled);

            var appendStore = new JsonlStore<TrafficSourceRow>(store, "rec_exports_processed", r => r.FileUpdated.FileSafeTimestamp(), log);

            var md = await appendStore.LatestFile();

            var latestModified = md?.Ts.ParseFileSafeTimestamp();

            var newBlobs = latestModified != null
                ? blobs.Where(b => b.Modified > latestModified).ToList()
                : blobs;

            log.Information("Processing {NewExports}/{AllExports} exports", newBlobs.Count, blobs.Count);

            foreach (var b in newBlobs)
            {
                log.Information("Processing {Path}", b.Path);

                var m = fileInfoRegex.Match(b.Path.Name);
                if (!m.Success)
                {
                    throw new InvalidOperationException($"unable to parse export info from file name '{b.Path.Name}'");
                }
                var exportInfo = new {
                    Channel = m.Groups["channel"].Value,
                    From    = m.Groups["from"].Value.ParseDate(),
                    To      = m.Groups["to"].Value.ParseDate()
                };

                // Dispose the blob stream and the archive at the end of every iteration (also when an
                // exception is thrown) so processing many exports does not accumulate open handles.
                using var stream = await store.Load(b.Path);
                using var zip    = new ZipArchive(stream);
                using var csvStream = new StreamReader(
                    zip.GetEntry("Table data.csv")?.Open() ?? throw new InvalidOperationException("expected export to have 'Table data.csv'"),
                    Encoding.UTF8);
                using var csvReader = new CsvReader(csvStream, CsvExtensions.DefaultConfig);

                // Materialize the records while the archive is open; nothing below reads from the streams.
                var records = csvReader.GetRecords<TrafficSourceExportRow>().ToList();
                var rows    = (await records.BlockFunc(ToTrafficSourceRow, 4,
                                                       progressUpdate: p => log.Debug("Processing traffic sources for {Path}: {Rows}/{TotalRows}", b.Path, p.Completed, records.Count)))
                              .NotNull().ToList();

                await appendStore.Append(rows, log);

                log.Information("Completed processing traffic source exports for {Path}", b.Path);

                // Maps one export row to a TrafficSourceRow, or null when the source is not of the form "YT_RELATED.<videoId>".
                async Task<TrafficSourceRow> ToTrafficSourceRow(TrafficSourceExportRow row)
                {
                    var source = row.Source.Split(".");

                    if (source.Length != 2 || source[0] != "YT_RELATED")
                    {
                        return null; // total at the top or otherwise. not interested
                    }
                    var videoId   = source[1];
                    var fromVideo = await ytWeb.GetVideo(videoId, log);

                    return new TrafficSourceRow {
                        ToChannelTitle = exportInfo.Channel,
                        From = exportInfo.From,
                        To = exportInfo.To,
                        Impressions = row.Impressions,
                        Source = row.Source,
                        AvgViewDuration = row.AvgViewDuration,
                        Views = row.Views,
                        SourceType = row.SourceType,
                        FromChannelId = fromVideo?.ChannelId,
                        FromChannelTitle = fromVideo?.ChannelTitle,
                        FromVideoId = fromVideo?.Id,
                        FromVideoTitle = fromVideo?.Title,
                        ImpressionClickThrough = row.ImpressionClickThrough,
                        WatchTimeHrsTotal = row.WatchTimeHrsTotal,
                        FileUpdated = b.Modified?.UtcDateTime ?? DateTime.MinValue
                    };
                }
            }

            log.Information("Completed processing traffic source exports");
        }