// Ensures the Premium and Standard blob containers exist (creating them with
// public blob-container access when missing), then populates each with the
// branch content for the given paths.
Task<long> CreateContainers(BranchState state, string[] paths, ILogger log) =>
  new[] { Premium, Standard }.BlockAction(async tier => {
    var store = Stores.Store(tier: tier);
    var container = store.Container;
    // Provision on demand: only create + open public read access when absent.
    if (!await container.ExistsAsync()) {
      await container.CreateAsync();
      await container.SetAccessPolicyAsync(PublicAccessType.BlobContainer);
    }
    await PopulateContainer(tier, state, paths, log);
  });
// Wires up the Parler pipeline: logging, the staging blob store, and a local
// scratch directory (recfluence/parler under the system temp path).
public Parler(ILogger log, BlobStores stores, GoogleCfg Cfg) {
  Log = log;
  // Staged records are written through the DbStage store.
  Db = stores.Store(DataStoreType.DbStage);
  // Temp working directory for downloads before upload.
  Dir = Path.GetTempPath().AsPath().Combine("recfluence", "parler");
  this.Cfg = Cfg;
}
// Re-optimises the jsonl partitions for each selected top-level directory.
// Small plans run in-process; large plans fan out to pipe workers.
public async ValueTask ExecuteAsync(IConsole console) {
  // Optional '|'-separated filter of which directories to process.
  var selected = Dirs?.UnJoin('|');
  var store = Stores.Store(DataStoreType.DbStage);
  var targets = new[] { "videos", "recs", "captions" }
    .Where(d => selected == null || selected.Contains(d));
  foreach (var dir in targets) {
    Log.Information("upgrade-partitions - {Dir} started", dir);
    // Only files nested exactly inside a partition (3 path tokens) qualify.
    var files = await store.Files(dir, allDirectories: true)
      .SelectMany()
      .Where(f => f.Path.Tokens.Count == 3)
      .ToListAsync();
    var plan = JsonlStoreExtensions.OptimisePlan(dir, files, Cfg.Optimise, Log);
    if (plan.Count >= 10) {
      // Big enough to justify distributing across many machines.
      await plan.Process(Ctx,
        b => Stage.ProcessOptimisePlan(b, store, PipeArg.Inject<ILogger>()),
        new() { MaxParallel = 12, MinWorkItems = 1 },
        log: Log, cancel: console.GetCancellationToken());
    } else {
      // Small plan: cheaper to just run locally.
      await store.Optimise(Cfg.Optimise, plan, Log);
    }
  }
}
// NOTE(review): this overload appears truncated/garbled — `scripts` is assigned
// without a terminating semicolon and the method body is never closed. It looks
// like an abandoned duplicate of the other CreateOrReplace(BranchState, ILogger)
// in this file; confirm against source control whether it should be deleted.
public async Task CreateOrReplace(BranchState state, ILogger log) {
  var sw = Stopwatch.StartNew();
  var schema = Sf.Cfg.Schema;
  // DbStage store exposed to Snowflake via a very long-lived (100y) list/read SAS.
  var store = Stores.Store(DataStoreType.DbStage);
  var container = store.Container;
  var sasUri = container.GenerateSasUri(List | Read, DateTimeOffset.UtcNow.AddYears(100));
  var stageUrl = $"azure://{sasUri.Host}{sasUri.AbsolutePath}";
  var sasToken = sasUri.Query;
  using var conn = await Sf.Open(log, "", ""); // connection sans db & schema. If you specify ones that doesn't exist, all queries hang.
  var db = Sf.Cfg.DbName();
  IEnumerable <Script> scripts;
  // Branch DBs self-describe an expiry + owner in their comment so stale clones can be found.
  var dbComment = new DbComment { Expires = DateTime.UtcNow.AddDays(2), Email = EnvCfg.Email }.ToJson(JCfg);
  // Clone/CloneDb: copy the prod db wholesale into the branch db.
  if (state.In(Clone, CloneDb)) { scripts = new[] { new Script("db copy", @$ "create or replace database {db} clone {Sf.Cfg.Db} comment='{dbComment}'") } }
// DI constructor: captures collaborators and builds the staged-DB store.
public YtCollector(BlobStores stores, AppCfg cfg, SnowflakeConnectionProvider sf, IPipeCtx pipeCtx, YtWeb ytWeb, YtClient api, ILogger log) {
  Cfg = cfg;
  Sf = sf;
  PipeCtx = pipeCtx;
  Scraper = ytWeb;
  Api = api;
  // Collected records are appended through the DbStage blob store.
  DbStore = new(stores.Store(DataStoreType.DbStage), log);
}
// Runs the traffic-source export processing against the private blob store.
public async ValueTask ExecuteAsync(IConsole console) {
  var store = Stores.Store(DataStoreType.Private);
  await TrafficSourceExports.Process(store, Scraper, Log);
}
// Creates or replaces the warehouse database/schema and its Snowflake plumbing:
// external stages over the azure blob stores, file formats, and role grants.
// Mode semantics (from the visible branches): CloneProd clones the prod db into a
// suffixed branch db (DbSuffix required); CreateSchemaIfNotExists returns early
// when the schema already exists; otherwise db/schema are created only if missing.
public async Task CreateOrReplace(WarehouseCreateMode mode, [CanBeNull] string schema, ILogger log) {
  if (mode == CloneProd && Sf.Cfg.DbSuffix.NullOrEmpty()) {
    throw new(
      "DbSuffix needs to be set when cloning a db. Typically you should set warehouse.mode to \"Branch\" in local.appcfg.json to create a dev warehouse");
  }
  var sw = Stopwatch.StartNew();
  schema ??= Sf.Cfg.Schema;
  // One external stage per store — yt_data (DbStage) and yt_private (Private) —
  // each exposed to Snowflake via a very long-lived (100y) list/read SAS URI.
  var stages = new[] { DbStage, Private }.Select(type => {
    var store = Stores.Store(type);
    var container = ((AzureBlobFileStore)store).Container;
    var sasUri = container.GenerateSasUri(List | Read, DateTimeOffset.UtcNow.AddYears(100));
    var name = type switch { Private => "yt_private", DbStage => "yt_data", _ => throw new($"store type {type} has no stage") };
    return(new { Uri = $"azure://{sasUri.Host}{sasUri.AbsolutePath}", Sas = sasUri.Query, Name = name });
  }).ToArray();
  if (mode == CreateSchemaIfNotExists) {
    using var dbConn = await Sf.Open(log, schema : "");
    var schemaExists = await dbConn.ExecuteScalar <bool>($"{Scope} - schema exits", $"select exists(select * from information_schema.schemata where catalog_name = current_database() and schema_name ilike '{schema}')");
    if (schemaExists) { return; }
  }
  // use a non-contextual connection for creating databases/schema's
  using var conn = await Sf.Open(log, "", "");
  var db = Sf.Cfg.DbName();
  // Branch DBs self-describe an expiry + owner in their comment so stale clones can be found.
  var dbComment = new DbComment { Expires = DateTime.UtcNow.AddDays(2), Email = EnvCfg.Email }.ToJson(JCfg);
  var scripts = (mode.In(CloneProd) ?
    // clone the prod db wholesale into the branch db...
    new[] { new Script("db copy", @$ "create or replace database {db} clone {Sf.Cfg.Db} comment='{dbComment}'") } :
    // ...or create db + schema only if missing (never replace)
    new[] {
      new Script("db create", @$ "create database if not exists {db} comment='{dbComment}'"), // don't replace
      new Script("schema", $"create schema if not exists {db}.{schema}")
    })
    .Concat(new Script("file formats",
      $"create or replace file format {db}.{schema}.json type = 'json'",
      $"create or replace file format {db}.{schema}.json_zst type = 'json' compression = ZSTD",
      $"create or replace file format {db}.{schema}.tsv type = 'csv' field_delimiter = '\t' validate_UTF8 = false NULL_IF=('')",
      $"create or replace file format {db}.{schema}.tsv_header type = 'csv' field_delimiter = '\t' validate_UTF8 = false NULL_IF=('') skip_header=1 field_optionally_enclosed_by ='\"'",
      $"create or replace file format {db}.{schema}.tsv_header_no_enclose type = 'csv' field_delimiter = '\t' validate_UTF8 = false NULL_IF=('') skip_header=1"))
    .Concat(stages.Select(s => new Script($"stage {s.Name}",
      $"create or replace stage {db}.{schema}.{s.Name} url='{s.Uri}' credentials=(azure_sas_token='{s.Sas}') file_format=(type=json compression=gzip)"
    )))
    // admin roles: full control over the db, schema, and current & future objects
    .Concat(WhCfg.AdminRoles.Select(r => new Script($"init role {r}", ScriptMode.Parallel,
      $"grant all on database {db} to role {r}",
      $"grant all on schema {db}.{schema} to role {r}",
      $"grant all on all tables in schema {db}.{schema} to role {r}",
      $"grant all on future tables in schema {db}.{schema} to role {r}",
      $"grant all on all views in schema {db}.{schema} to role {r}",
      $"grant all on future views in schema {db}.{schema} to role {r}",
      $"grant all on all stages in database {db} to role {r}",
      $"grant all on all functions in database {db} to role {r}"
    )))
    // read roles: usage/monitor plus select over current & future tables/views
    .Concat(WhCfg.ReadRoles.Select(r => new Script($"init role {r}", ScriptMode.Parallel,
      $"grant usage,monitor on database {db} to role {r}",
      $"grant usage, monitor on all schemas in database {db} to role {r}",
      $"grant select on future tables in schema {db}.{schema} to role {r}",
      $"grant select on future views in schema {db}.{schema} to role {r}",
      $"grant select on all tables in schema {db}.{schema} to role {r}",
      $"grant select on all views in schema {db}.{schema} to role {r}",
      $"grant usage on all stages in schema {db}.{schema} to role {r}",
      $"grant usage on all functions in schema {db}.{schema} to role {r}"
    )));
  // Sequential scripts run one statement at a time; Parallel ones use the configured width.
  foreach (var s in scripts) {
    await s.Sqls.BlockDo <string>(q => conn.Execute(s.Name, q), s.Mode == Sequential? 1 : WhCfg.MetadataParallel);
  }
  log.Information("Create Warehouse - {Db} created/updated in {Duration}", db, sw.Elapsed.HumanizeShort());
}
// Creates or replaces the branch warehouse database for the given branch state:
// Clone/CloneDb clone the prod db wholesale; otherwise a fresh db is created
// (note: `create or replace database` — this drops any existing branch db).
public async Task CreateOrReplace(BranchState state, ILogger log) {
  var sw = Stopwatch.StartNew();
  var schema = Sf.Cfg.Schema;
  // DbStage store exposed to Snowflake as an azure stage via a very
  // long-lived (100y) list/read SAS.
  var store = Stores.Store(DataStoreType.DbStage);
  var container = store.Container;
  var sasUri = container.GenerateSasUri(List | Read, DateTimeOffset.UtcNow.AddYears(100));
  var stageUrl = $"azure://{sasUri.Host}{sasUri.AbsolutePath}";
  var sasToken = sasUri.Query;
  using var conn = await Sf.Open(log, "", ""); // connection sans db & schema. If you specify ones that doesn't exist, all queries hang.
  var db = Sf.Cfg.DbName();
  // Branch DBs self-describe an expiry + owner in their comment so stale clones can be found.
  var dbComment = new DbComment { Expires = DateTime.UtcNow.AddDays(2), Email = EnvCfg.Email }.ToJson(JCfg);
  var scripts = (state.In(Clone, CloneDb) ?
    new[] { new Script("db copy", @$ "create or replace database {db} clone {Sf.Cfg.Db} comment='{dbComment}'") } :
    new[] {
      new Script("db create", @$ "create or replace database {db} comment='{dbComment}'"),
      new Script("schema", $"create schema if not exists {db}.{schema}"),
    })
    .Concat(new Script("stage",
      $"create or replace stage {db}.{schema}.yt_data url='{stageUrl}' credentials=(azure_sas_token='{sasToken}') file_format=(type=json compression=gzip)",
      $"create or replace file format {db}.{schema}.json type = 'json'",
      $"create or replace file format {db}.{schema}.json_zst type = 'json' compression = ZSTD",
      $"create or replace file format {db}.{schema}.tsv type = 'csv' field_delimiter = '\t' validate_UTF8 = false NULL_IF=('')",
      $"create or replace file format {db}.{schema}.tsv_header type = 'csv' field_delimiter = '\t' validate_UTF8 = false NULL_IF=('') skip_header=1 field_optionally_enclosed_by ='\"'",
      $"create or replace file format {db}.{schema}.tsv_header_no_enclose type = 'csv' field_delimiter = '\t' validate_UTF8 = false NULL_IF=('') skip_header=1"
    ))
    // admin roles: full control over the db, schema, and current & future objects
    .Concat(
      WhCfg.AdminRoles.Select(r => new Script($"init role {r}", ScriptMode.Parallel,
      $"grant all on database {db} to role {r}",
      $"grant all on schema {db}.{schema} to role {r}",
      $"grant all on all tables in schema {db}.{schema} to role {r}",
      $"grant all on future tables in schema {db}.{schema} to role {r}",
      $"grant all on all views in schema {db}.{schema} to role {r}",
      $"grant all on future views in schema {db}.{schema} to role {r}",
      $"grant all on all stages in database {db} to role {r}",
      $"grant all on all functions in database {db} to role {r}"
    )))
    // read roles: usage/monitor plus select, granted database-wide here
    // (NOTE(review): the WarehouseCreateMode overload grants these per-schema — confirm
    // which scope is intended).
    .Concat(WhCfg.ReadRoles.Select(r => new Script($"init role {r}", ScriptMode.Parallel,
      $"grant usage,monitor on database {db} to role {r}",
      $"grant usage, monitor on all schemas in database {db} to role {r}",
      $"grant select on future tables in database {db} to role {r}",
      $"grant select on future views in database {db} to role {r}",
      $"grant select on all tables in database {db} to role {r}",
      $"grant select on all views in database {db} to role {r}",
      $"grant usage on all stages in database {db} to role {r}",
      $"grant usage on all functions in database {db} to role {r}"
    )));
  // Sequential scripts run one statement at a time; Parallel ones use the configured width.
  // NOTE(review): the sibling overload uses BlockDo<string> here — confirm BlockAction
  // has the same parallelism semantics.
  foreach (var s in scripts) {
    await s.Sqls.BlockAction(q => conn.Execute(s.Name, q), s.Mode == Sequential? 1 : WhCfg.MetadataParallel);
  }
  log.Information("Create Warehouse - {Db} created/updated in {Duration}", db, sw.Elapsed.HumanizeShort());
}
// DI constructor: keeps the Snowflake connection provider and builds the blob
// index over the results store.
public YtIndexResults(BlobStores stores, SnowflakeConnectionProvider sf) {
  // Index entries are written to the Results blob store.
  BlobIndex = new(stores.Store(DataStoreType.Results));
  Sf = sf;
}