async Task<(string[] files, long rows, ByteSize size)> CopyInto(ILoggedConnection<IDbConnection> db, string table, StageTableCfg t) {
  var startTime = await db.ExecuteScalar<string>("current time", "select current_timestamp()::string");
  var (stage, path) = t.StoreType switch {
    DataStoreType.Db => (Cfg.Stage, StorageCfg.DbPath),
    DataStoreType.Private => (Cfg.Private, null),
    _ => throw new InvalidOperationException($"No warehouse stage for store type {t.StoreType}")
  };
  var sql = $"copy into {table} from @{new[] {stage, path}.Concat(t.Dir.Tokens).NotNull().Join("/")}/ file_format=(type=json)";
  await db.Execute("copy into", sql);

  // sf should return this info from copy_into (it's in their UI, but not in the .net or jdbc drivers).
  // The int that is returned is the # of rows from the first file loaded, so we go get this ourselves.
  var copyResults = await db.Query<(string fileName, long rows, long size)>("copy results",
    "select file_name, row_count, file_size " +
    $"from table(information_schema.copy_history(table_name=>'{table}', start_time=>'{startTime}'::timestamp_ltz))");
  var res = (copyResults.Select(r => r.fileName).ToArray(), copyResults.Sum(r => r.rows), copyResults.Sum(r => r.size).Bytes());
  return res;
}
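// Worked example (hypothetical names, for illustration only): for a table "video_stage" with stage "yt_data",
// DbPath "db" and t.Dir "videos", the statement generated above would be:
//   copy into video_stage from @yt_data/db/videos/ file_format=(type=json)
// and the per-file stats are then read back with:
//   select file_name, row_count, file_size
//   from table(information_schema.copy_history(table_name=>'video_stage', start_time=>'<startTime>'::timestamp_ltz))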
async Task Incremental(ILoggedConnection<IDbConnection> db, string table, StageTableCfg t, DateTime latestTs) {
  var store = Store(t);
  await store.Optimise(Cfg.Optimise, t.Dir, latestTs.FileSafeTimestamp(), db.Log); // optimise files newer than the last load
  var ((_, rows, size), dur) = await CopyInto(db, table, t).WithDuration();
  db.Log.Information("StageUpdate - {Table} incremental load of {Rows} rows ({Size}) took {Duration}",
    table, rows, size.Humanize("#.#"), dur.HumanizeShort());
}
async Task FullLoad(ILoggedConnection<IDbConnection> db, string table, StageTableCfg t) {
  var store = Store(t);
  if (t.IsNativeStore)
    await store.Optimise(Cfg.Optimise, t.Dir, null, db.Log); // optimise all files when performing a full load
  await db.Execute("truncate table", $"truncate table {table}"); // no transaction: stage tables aren't reported on, so they don't need to stay available
  var ((_, rows, size), dur) = await CopyInto(db, table, t).WithDuration();
  db.Log.Information("StageUpdate - {Table} full load of {Rows} rows ({Size}) took {Duration}",
    table, rows, size.Humanize("#.#"), dur.HumanizeShort());
}
async Task<ChannelUpdatePlan[]> ChannelsToDiscover(ILoggedConnection<IDbConnection> db, ILogger log) {
  var toAdd = await db.Query<(string channel_id, string channel_title, string source)>("channels to classify",
    @"with review_channels as (
        select channel_id
             , channel_title -- probably missing values. reviews without channels don't have titles
        from channel_review r
        where not exists(select * from channel_stage c where c.v:ChannelId::string=r.channel_id)
      )
      , rec_channels as (
        select to_channel_id as channel_id, any_value(to_channel_title) as channel_title
        from rec r
        where to_channel_id is not null
          and not exists(select * from channel_stage c where c.v:ChannelId::string=r.to_channel_id)
        group by to_channel_id
      )
      , s as (
        select channel_id, channel_title, 'review' as source from review_channels sample (:remaining rows)
        union all
        select channel_id, channel_title, 'rec' as source from rec_channels sample (:remaining rows)
      )
      select * from s limit :remaining",
    param: new { remaining = RCfg.DiscoverChannels });

  log.Debug("Collect - found {Channels} new channels for discovery", toAdd.Count);

  var toDiscover = toAdd
    .Select(c => new ChannelUpdatePlan(new ChannelStored2 { ChannelId = c.channel_id, ChannelTitle = c.channel_title },
      c.source == "review" ? Standard : Discover))
    .ToArray();
  return toDiscover;
}
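// Example (hypothetical config value): with RCfg.DiscoverChannels = 100, :remaining binds to 100, so up to
// 100 rows are sampled from each of the review and rec sources, and the outer `limit :remaining` then caps
// the combined result at 100 channels per run.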
/// <summary>Copies the given files into staging tables. NOTE: filesToCopy is relative to the store path. The resulting
///   LoadHistoryRow.File paths are converted to be equivalent.</summary>
public static async Task<LoadHistoryRow[]> CopyInto(this StageTableCfg t, ISimpleFileStore store, string sfStage,
  ILoggedConnection<IDbConnection> db, [CanBeNull] SPath[] filesToCopy, ILogger log) {
  if (filesToCopy?.Length > 1000) throw new("copying 1k+ files not implemented");
  var stagePath = new string[] { sfStage, store.BasePathSansContainer().Dot(c => c.IsEmpty ? null : c) }
    .Concat(t.Dir.Tokens).NotNull().Join("/");
  var cols = await t.TableCols(db);
  // support subsets of columns (e.g. no loaded or updated columns)
  var selectCols = cols.Join(",", c => c.column_name.ToLowerInvariant() switch {
    "v" => "$1 v",
    "loaded" => "sysdate() loaded",
    "updated" => "v:Updated::timestamp_ntz updated",
    _ => throw new($"stage column {c.column_name} not supported")
  });
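  // Example (derived from the switch above): for a stage table with columns (v, loaded, updated),
  // the generated select list is:
  //   $1 v,sysdate() loaded,v:Updated::timestamp_ntz updated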
public static async Task<SfCol[]> TableCols(this StageTableCfg t, ILoggedConnection<IDbConnection> db) {
  var cols = await db.QueryAsync<SfCol>("show columns", $"show columns in table {t.Table}").ToArrayAsync();
  return cols;
}
public static async Task<DateTime?> LatestTimestamp(this StageTableCfg t, ILoggedConnection<IDbConnection> db) =>
  await db.ExecuteScalar<DateTime?>("latest timestamp",
    $"select max(v:{t.TsCol ?? "Updated"}::timestamp_ntz) from {t.Table}");
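// Usage sketch (hypothetical call site, not from this file): LatestTimestamp can decide between the two load
// paths defined earlier, e.g. something like:
//   var latest = await t.LatestTimestamp(db);
//   if (latest == null) await FullLoad(db, t.Table, t);
//   else await Incremental(db, t.Table, t, latest.Value);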
/// <summary>Find videos that we should update to collect comments (chrome update). We do this once, x days after a
///   video is uploaded.</summary>
async Task<IReadOnlyCollection<(string ChannelId, string VideoId)>> VideosForChromeUpdate(IReadOnlyCollection<ChannelStored2> channels,
  ILoggedConnection<IDbConnection> db, ILogger log) {
  var ids = await db.Query<(string ChannelId, string VideoId)>("videos sans-comments",
    $@"with chrome_extra_latest as (
        select video_id
             , updated
             , row_number() over (partition by video_id order by updated desc) as age_no
             , count(1) over (partition by video_id) as extras
        from video_extra v
        where source='Chrome'
        qualify age_no=1
      )
      , videos_to_update as (
        select *
             , row_number() over (partition by channel_id order by views desc) as channel_rank
        from (
          select v.video_id
               , v.channel_id
               , v.views
               , datediff(d, e.updated, convert_timezone('UTC', current_timestamp())) as extra_ago
               , datediff(d, v.upload_date, convert_timezone('UTC', current_timestamp())) as upload_ago
          from video_latest v
          left join chrome_extra_latest e on e.video_id=v.video_id
          where v.channel_id in ({channels.Join(",", c => $"'{c.ChannelId}'")})
            and upload_ago>7 and e.updated is null -- update only 7 days after being uploaded
        )
        qualify channel_rank<=:videos_per_channel
      )
      select channel_id, video_id from videos_to_update",
    param: new { videos_per_channel = RCfg.PopulateMissingCommentsLimit });
  return ids;
}
async Task<IReadOnlyCollection<(string ChannelId, string VideoId)>> DeadVideosForExtraUpdate(IReadOnlyCollection<ChannelStored2> channels,
  ILoggedConnection<IDbConnection> db, ILogger log) {
  var ids = await db.Query<(string ChannelId, string VideoId)>("missing videos",
    $@"with chans as (
        select channel_id
        from channel_accepted
        where status_msg<>'Dead'
          and channel_id in ({channels.Join(",", c => $"'{c.ChannelId}'")})
      )
      , missing as (
        select v.channel_id
             , v.channel_title
             , v.video_id
             , e.video_id is not null as has_extra
             , v.video_title
             , e.error
             , e.sub_error
             , v.updated
             , v.latest_update
             , v.video_no
             , datediff(d, e.updated, v.latest_update) days_since_extra_update -- check missing only if the video extra is older than the latest update
        from video_missing v
        left join video_extra e on e.video_id=v.video_id
        where (e.video_id is null or days_since_extra_update>30)
          and exists(select * from chans c where c.channel_id=v.channel_id)
      )
      , missing_and_fixes as (
        select channel_id, video_id, 'missing' from missing
        union all
        select channel_id, video_id, 'copyright'
        from video_extra e
        where (error like '%copyright%' or sub_error like '%copyright%')
          and copyright_holder is null and e.channel_id is not null
          and exists(select * from chans c where c.channel_id=e.channel_id)
      )
      select * from missing_and_fixes");
  return ids;
}