Code example #1
        static async Task<StringPath> JoinFiles(ISimpleFileStore store, IReadOnlyCollection<StoreFileMd> toOptimise, StringPath destPath, int parallel,
                                                 ILogger log)
        {
            var optimisedFile = FilePath(destPath, toOptimise.Last().Ts);

            using (var joinedStream = new MemoryStream()) {
                using (var zipWriter = new GZipStream(joinedStream, CompressionLevel.Optimal, true)) {
                    var inStreams = await toOptimise.BlockFunc(async s => {
                        var inStream = await store.Load(s.Path, log).WithDuration();
                        log.Debug("Optimise {Path} - loaded file {SourceFile} to be optimised in {Duration}",
                                  destPath, s.Path, inStream.Duration.HumanizeShort());
                        return inStream.Result;
                    }, parallel);

                    foreach (var s in inStreams)
                    {
                        using var zr = new GZipStream(s, CompressionMode.Decompress, false);
                        await zr.CopyToAsync(zipWriter);
                    }
                }
                joinedStream.Seek(0, SeekOrigin.Begin);
                await store.Save(optimisedFile, joinedStream);
            }

            // When optimising in-place, a failure from this point leaves the store dirty: cloud storage has no transaction capability, so downstream processing must handle duplicates.
            // The files are now staged successfully, so delete them from landing. Incremental loading using Ts would work without the delete, but removing already-processed landed files is more efficient.
            await toOptimise.BlockAction(f => store.Delete(f.Path), parallel)
                .WithWrappedException(e => "Failed to delete optimised files. Duplicate records need to be handled downstream");

            log.Debug("Optimise {Path} - deleted {Files} that were optimised into {OptimisedFile}",
                      destPath, toOptimise.Count, optimisedFile);

            return optimisedFile;
        }
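
A note on the helpers: BlockFunc and BlockAction above are not BCL methods but extensions from the project's own utility library. As a hedged illustration only, a minimal BlockFunc-style helper that caps the number of in-flight tasks with a SemaphoreSlim might look like this (the real implementation differs; for example, it also supports progress callbacks, as seen in code example #5):

        using System;
        using System.Collections.Generic;
        using System.Linq;
        using System.Threading;
        using System.Threading.Tasks;

        public static class ParallelExtensions
        {
            // Hypothetical sketch, not the project's actual implementation:
            // runs func over items with at most `parallel` tasks in flight,
            // returning the results in input order.
            public static async Task<IReadOnlyCollection<R>> BlockFunc<T, R>(
                this IEnumerable<T> items, Func<T, Task<R>> func, int parallel)
            {
                using var sem = new SemaphoreSlim(parallel);
                var tasks = items.Select(async item =>
                {
                    await sem.WaitAsync();
                    try { return await func(item); }
                    finally { sem.Release(); }
                }).ToList();
                return await Task.WhenAll(tasks);
            }
        }

A SemaphoreSlim is the simplest way to bound concurrency here: the eager ToList() starts every task immediately, but only `parallel` of them get past the semaphore at once.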
Code example #2
        async Task<IReadOnlyCollection<JObject>> Jsonl(StoreFileMd f)
        {
            await using var sr = await Store.Load(f.Path);

            var existingJs = sr.LoadJsonlGz<JObject>();

            return existingJs;
        }
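
LoadJsonlGz is likewise a project extension, not part of the BCL. Assuming the format is one JSON object per line under GZip compression and that Newtonsoft.Json handles deserialization, a plausible sketch:

        using System.Collections.Generic;
        using System.IO;
        using System.IO.Compression;
        using Newtonsoft.Json;

        public static class JsonlExtensions
        {
            // Hypothetical sketch: decompress a gzipped JSONL stream and
            // deserialize one object per line.
            public static IReadOnlyCollection<T> LoadJsonlGz<T>(this Stream stream)
            {
                using var zip = new GZipStream(stream, CompressionMode.Decompress);
                using var reader = new StreamReader(zip);
                var res = new List<T>();
                string line;
                while ((line = reader.ReadLine()) != null)
                    res.Add(JsonConvert.DeserializeObject<T>(line));
                return res;
            }
        }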
Code example #3
        public static async Task<T> Get<T>(this ISimpleFileStore store, StringPath path, bool zip = true, ILogger log = null)
        {
            using var stream = await store.Load(path.AddJsonExtention(zip), log);

            if (!zip)
            {
                return stream.ToObject<T>();
            }
            await using var zr = new GZipStream(stream, CompressionMode.Decompress, true);
            return zr.ToObject<T>();
        }
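
A hedged usage sketch (AppCfg and the "cfg/app" path are hypothetical, not from the source). With zip left at its default of true, AddJsonExtention presumably resolves the path to cfg/app.json.gz before the file is loaded and decompressed:

        // Illustrative only; AppCfg and the path are made up for this example.
        var cfg = await store.Get<AppCfg>("cfg/app", log: log);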
Code example #4
 public async Task Convert(ILogger log)
 {
     var files = (await Store.List("import/watch_time").SelectManyList()).Where(f => f.Path.ExtensionsString == "csv");
     await files.BlockAction(async f => {
         using var stream = await Store.Load(f.Path);
         using var sr     = new StreamReader(stream);
         using var csv    = new CsvReader(sr, CultureInfo.InvariantCulture)
               {
                   Configuration =
                   {
                       Encoding          = Encoding.UTF8,
                       HasHeaderRecord   = true,
                       MissingFieldFound = null,
                       BadDataFound      = r => log.Warning("Error reading csv data at {RowNumber}: {RowData}", r.Row, r.RawRecord)
                   }
               };
         var rows = await csv.GetRecordsAsync<dynamic>().ToListAsync();
         await Store.Save(f.Path.Parent.Add($"{f.Path.NameSansExtension}.json.gz"), await rows.ToJsonlGzStream(), log);
     }, parallelism : 4);
 }
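
ToJsonlGzStream is another project helper. A minimal sketch under the same assumptions as before (Newtonsoft.Json, one JSON object per line), writing the records into a gzipped MemoryStream rewound and ready for upload:

        using System.Collections.Generic;
        using System.IO;
        using System.IO.Compression;
        using System.Threading.Tasks;
        using Newtonsoft.Json;

        public static class JsonlGzExtensions
        {
            // Hypothetical sketch, not the project's actual implementation.
            public static async Task<Stream> ToJsonlGzStream<T>(this IEnumerable<T> items)
            {
                var ms = new MemoryStream();
                using (var zip = new GZipStream(ms, CompressionLevel.Optimal, leaveOpen: true))
                using (var writer = new StreamWriter(zip))
                    foreach (var item in items)
                        await writer.WriteLineAsync(JsonConvert.SerializeObject(item));
                ms.Seek(0, SeekOrigin.Begin); // rewind so the caller can read from the start
                return ms;
            }
        }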
Code example #5
        public static async Task Process(ISimpleFileStore store, YtWeb ytWeb, ILogger log)
        {
            var blobs = await store.List("rec_exports").SelectManyList();

            //blobs = blobs.Where(b => b.Path == "rec_exports/Traffic source 2019-07-01_2019-08-01 David Pakman Show.zip").ToList();

            var fileInfoRegex = new Regex("^Traffic source (?'from'\\d+-\\d+-\\d+)_(?'to'\\d+-\\d+-\\d+) (?'channel'[^.]+)", RegexOptions.Compiled);

            var appendStore = new JsonlStore <TrafficSourceRow>(store, "rec_exports_processed", r => r.FileUpdated.FileSafeTimestamp(), log);

            var md = await appendStore.LatestFile();

            var latestModified = md?.Ts.ParseFileSafeTimestamp();

            var newBlobs = latestModified != null
                ? blobs.Where(b => b.Modified > latestModified).ToList()
                : blobs;

            log.Information("Processing {NewExports}/{AllExports} exports", newBlobs.Count, blobs.Count);

            foreach (var b in newBlobs)
            {
                log.Information("Processing {Path}", b.Path);

                var m = fileInfoRegex.Match(b.Path.Name);
                if (!m.Success)
                {
                    throw new InvalidOperationException($"unable to parse export info from file name '{b.Path.Name}'");
                }
                var exportInfo = new {
                    Channel = m.Groups["channel"].Value,
                    From    = m.Groups["from"].Value.ParseDate(),
                    To      = m.Groups["to"].Value.ParseDate()
                };

                using var stream = await store.Load(b.Path);

                using var zip = new ZipArchive(stream);
                using var csvStream = new StreamReader(
                          zip.GetEntry("Table data.csv")?.Open() ?? throw new InvalidOperationException("expected export to have 'Table data.csv'"),
                          Encoding.UTF8);
                using var csvReader = new CsvReader(csvStream, CsvExtensions.DefaultConfig);

                var records = csvReader.GetRecords <TrafficSourceExportRow>().ToList();
                var rows    = (await records.BlockFunc(ToTrafficSourceRow, 4,
                                                       progressUpdate: p => log.Debug("Processing traffic sources for {Path}: {Rows}/{TotalRows}", b.Path, p.Completed, records.Count)))
                              .NotNull().ToList();

                await appendStore.Append(rows, log);

                log.Information("Completed processing traffic source exports for {Path}", b.Path);

                async Task<TrafficSourceRow> ToTrafficSourceRow(TrafficSourceExportRow row)
                {
                    var source = row.Source.Split(".");

                    if (source.Length != 2 || source[0] != "YT_RELATED")
                    {
                        return null; // total at the top or otherwise; not interested
                    }
                    var videoId   = source[1];
                    var fromVideo = await ytWeb.GetVideo(videoId, log);

                    return new TrafficSourceRow {
                        ToChannelTitle = exportInfo.Channel,
                        From = exportInfo.From,
                        To = exportInfo.To,
                        Impressions = row.Impressions,
                        Source = row.Source,
                        AvgViewDuration = row.AvgViewDuration,
                        Views = row.Views,
                        SourceType = row.SourceType,
                        FromChannelId = fromVideo?.ChannelId,
                        FromChannelTitle = fromVideo?.ChannelTitle,
                        FromVideoId = fromVideo?.Id,
                        FromVideoTitle = fromVideo?.Title,
                        ImpressionClickThrough = row.ImpressionClickThrough,
                        WatchTimeHrsTotal = row.WatchTimeHrsTotal,
                        FileUpdated = b.Modified?.UtcDateTime ?? DateTime.MinValue
                    };
                }
            }

            log.Information("Completed processing traffic source exports");
        }
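
For reference, an illustration of what the named groups in fileInfoRegex capture, using the sample file name from the commented-out filter above:

        using System.Text.RegularExpressions;

        // Illustration only, reusing the pattern from the example above.
        var rx = new Regex(@"^Traffic source (?'from'\d+-\d+-\d+)_(?'to'\d+-\d+-\d+) (?'channel'[^.]+)");
        var m  = rx.Match("Traffic source 2019-07-01_2019-08-01 David Pakman Show.zip");
        // m.Groups["from"].Value    == "2019-07-01"
        // m.Groups["to"].Value      == "2019-08-01"
        // m.Groups["channel"].Value == "David Pakman Show"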
Code example #6
        async Task<IReadOnlyCollection<T>> LoadJsonl(StringPath path)
        {
            await using var stream = await Store.Load(path);

            return stream.LoadJsonlGz<T>();
        }