Example #1
        async Task<(string[] files, long rows, ByteSize size)> CopyInto(ILoggedConnection<IDbConnection> db, string table, StageTableCfg t)
        {
            var startTime = await db.ExecuteScalar<string>("current time", "select current_timestamp()::string");

            var (stage, path) = t.StoreType switch {
                DataStoreType.Db => (Cfg.Stage, StorageCfg.DbPath),
                DataStoreType.Private => (Cfg.Private, null),
                _ => throw new InvalidOperationException($"No warehouse stage for store type {t.StoreType}")
            };


            var sql = $"copy into {table} from @{new[] {stage, path}.Concat(t.Dir.Tokens).NotNull().Join(" / ")}/ file_format=(type=json)";
            await db.Execute("copy into", sql);

            // sf should return this info from copy_into (it's in their UI, but not in the .net or jdbc drivers).
            // the row count it does return is only for the first file loaded, so we query copy_history ourselves
            var copyResults = await db.Query<(string fileName, long rows, long size)>("copy results",
                                                                                       "select file_name, row_count, file_size " +
                                                                                       $"from table(information_schema.copy_history(table_name=>'{table}', start_time=>'{startTime}'::timestamp_ltz))");

            var res = (copyResults.Select(r => r.fileName).ToArray(), copyResults.Sum(r => r.rows), copyResults.Sum(r => r.size).Bytes());

            return res;
        }
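For reference, with a Db-store table the interpolated copy statement above resolves to something along these lines. This is a minimal sketch with made-up stage, path and directory values standing in for Cfg.Stage, StorageCfg.DbPath and t.Dir.Tokens; they are illustrative assumptions, not values from the real config.

        // illustrative values only -- stand-ins for Cfg.Stage, StorageCfg.DbPath and t.Dir.Tokens
        var stagePath = string.Join("/", "yt_data", "db", "video_stage");
        var sql       = $"copy into video_stage from @{stagePath}/ file_format=(type=json)";
        // -> copy into video_stage from @yt_data/db/video_stage/ file_format=(type=json)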
Example #2
        async Task Incremental(ILoggedConnection<IDbConnection> db, string table, StageTableCfg t, DateTime latestTs)
        {
            var store = Store(t);
            await store.Optimise(Cfg.Optimise, t.Dir, latestTs.FileSafeTimestamp(), db.Log); // optimise files newer than the last load

            var ((_, rows, size), dur) = await CopyInto(db, table, t).WithDuration();

            db.Log.Information("StageUpdate - {Table} incremental load of {Rows} rows ({Size}) took {Duration}",
                               table, rows, size.Humanize("#.#"), dur.HumanizeShort());
        }
Example #3
        async Task FullLoad(ILoggedConnection<IDbConnection> db, string table, StageTableCfg t)
        {
            var store = Store(t);

            if (t.IsNativeStore)
            {
                await store.Optimise(Cfg.Optimise, t.Dir, null, db.Log);   // optimise all files when performing a full load
            }
            await db.Execute("truncate table", $"truncate table {table}"); // no transaction, stage tables aren't reported on so don't need to be available

            var ((_, rows, size), dur) = await CopyInto(db, table, t).WithDuration();

            db.Log.Information("StageUpdate - {Table} full load of {Rows} rows ({Size}) took {Duration}",
                               table, rows, size.Humanize("#.#"), dur.HumanizeShort());
        }
Example #4
        async Task<ChannelUpdatePlan[]> ChannelsToDiscover(ILoggedConnection<IDbConnection> db, ILogger log)
        {
            var toAdd = await db.Query<(string channel_id, string channel_title, string source)>("channels to classify",
                                                                                                  @"with review_channels as (
  select channel_id
       , channel_title -- may have missing values; reviews without channels don't have titles
  from channel_review r
  where not exists(select * from channel_stage c where c.v:ChannelId::string=r.channel_id)
)
   , rec_channels as (
  select to_channel_id as channel_id, any_value(to_channel_title) as channel_title
  from rec r
  where to_channel_id is not null
    and not exists(select * from channel_stage c where c.v:ChannelId::string=r.to_channel_id)
  group by to_channel_id
)
   , s as (
  select channel_id, channel_title, 'review' as source
  from review_channels sample (:remaining rows)
  union all
  select channel_id, channel_title, 'rec' as source
  from rec_channels sample (:remaining rows)
)
select *
from s
limit :remaining", param: new { remaining = RCfg.DiscoverChannels });

            log.Debug("Collect - found {Channels} new channels for discovery", toAdd.Count);

            var toDiscover = toAdd
                .Select(c => new ChannelUpdatePlan(new ChannelStored2 {
                    ChannelId    = c.channel_id,
                    ChannelTitle = c.channel_title
                }, c.source == "review" ? Standard : Discover)).ToArray();

            return toDiscover;
        }
Example #5
    /// <summary>Copies the given files into staging tables. NOTE: filesToCopy is relative to the store path. The resulting
    ///   LoadHistoryRow.File paths are converted to be equivalent</summary>
    public static async Task<LoadHistoryRow[]> CopyInto(this StageTableCfg t, ISimpleFileStore store, string sfStage, ILoggedConnection<IDbConnection> db,
                                                         [CanBeNull] SPath[] filesToCopy, ILogger log)
    {
        if (filesToCopy?.Length > 1000)
        {
            throw new("copying 1k+ files not implemented");
        }

        var stagePath = new string[] { sfStage, store.BasePathSansContainer().Dot(c => c.IsEmpty ? null : c) }.Concat(t.Dir.Tokens).NotNull().Join("/");

        var cols = await t.TableCols(db); // support subsets of columns (e.g. no loaded or updated columns)

        var selectCols = cols.Join(",", c => c.column_name.ToLowerInvariant() switch {
            "v" => "$1 v",
            "loaded" => "sysdate() loaded",
            "updated" => "v:Updated::timestamp_ntz updated",
            _ => throw new($"stage column {c.column_name} not supported")
        });
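A minimal usage sketch for this extension method follows; the store, file list and logger are assumed to come from the surrounding pipeline, and the stage name and variable names here are illustrative rather than taken from the source.

    // illustrative call -- filesToCopy paths are relative to the store, as the summary above notes
    var loaded = await t.CopyInto(store, "yt_data", db, filesToCopy, log);
    log.Debug("copied {Files} files into {Table}", loaded.Length, t.Table);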
Example #6
    public static async Task<SfCol[]> TableCols(this StageTableCfg t, ILoggedConnection<IDbConnection> db)
    {
        var cols = await db.QueryAsync<SfCol>("show columns", $"show columns in table {t.Table}").ToArrayAsync();

        return cols;
    }
Example #7
    public static async Task<DateTime?> LatestTimestamp(this StageTableCfg t, ILoggedConnection<IDbConnection> db) =>
        await db.ExecuteScalar<DateTime?>("latest timestamp", $"select max(v:{t.TsCol ?? "Updated"}::timestamp_ntz) from {t.Table}");
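Read together with Examples #2 and #3, this latest timestamp is what decides between an incremental and a full load. The following is a minimal sketch of that decision, assuming a hypothetical caller with the same methods in scope; it is not the source's actual update routine.

        // hypothetical caller: full load when the stage table is empty, otherwise copy only files newer than latestTs
        async Task UpdateTable(ILoggedConnection<IDbConnection> db, string table, StageTableCfg t) {
            var latestTs = await t.LatestTimestamp(db);
            if (latestTs == null)
                await FullLoad(db, table, t);
            else
                await Incremental(db, table, t, latestTs.Value);
        }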
Example #8
        /// <summary>Find videos that we should update to collect comments (chrome update). We do this once x days after a video is
        ///   uploaded.</summary>
        async Task<IReadOnlyCollection<(string ChannelId, string VideoId)>> VideosForChromeUpdate(IReadOnlyCollection<ChannelStored2> channels,
                                                                                                  ILoggedConnection<IDbConnection> db,
                                                                                                  ILogger log)
        {
            var ids = await db.Query<(string ChannelId, string VideoId)>("videos sans-comments",
                                                                          $@"with chrome_extra_latest as (
  select video_id
       , updated
       , row_number() over (partition by video_id order by updated desc) as age_no
       , count(1) over (partition by video_id) as extras
  from video_extra v
  where source='Chrome'
    qualify age_no=1
)
   , videos_to_update as (
  select *
       , row_number() over (partition by channel_id order by views desc) as channel_rank
  from (
         select v.video_id
              , v.channel_id
              , v.views
              , datediff(d, e.updated, convert_timezone('UTC', current_timestamp())) as extra_ago
              , datediff(d, v.upload_date, convert_timezone('UTC', current_timestamp())) as upload_ago

         from video_latest v
                left join chrome_extra_latest e on e.video_id=v.video_id
         where v.channel_id in ({channels.Join(",", c => $"'{c.ChannelId}'")})
           and upload_ago>7 -- update only 7+ days after the video was uploaded
           and e.updated is null -- and only when there is no chrome extra yet
       )
    qualify channel_rank<=:videos_per_channel
)
select channel_id, video_id
from videos_to_update",
                                                                          param: new { videos_per_channel = RCfg.PopulateMissingCommentsLimit });

            return ids;
        }
Example #9
        async Task<IReadOnlyCollection<(string ChannelId, string VideoId)>> DeadVideosForExtraUpdate(IReadOnlyCollection<ChannelStored2> channels,
                                                                                                     ILoggedConnection<IDbConnection> db,
                                                                                                     ILogger log)
        {
            var ids = await db.Query<(string ChannelId, string VideoId)>("missing videos", $@"
with chans as (
  select channel_id
  from channel_accepted
  where status_msg<>'Dead'
   and channel_id in ({channels.Join(",", c => $"'{c.ChannelId}'")})
)
   , missing as (
  select v.channel_id
       , v.channel_title
       , v.video_id
       , e.video_id is not null as has_extra
       , v.video_title
       , e.error
       , e.sub_error
       , v.updated
       , v.latest_update
       , v.video_no
       , datediff(d, e.updated, v.latest_update) days_since_extra_update -- check missing only if the video extra is older than the latest update
  from video_missing v
         left join video_extra e on e.video_id=v.video_id

  where (
      e.video_id is null
      or days_since_extra_update>30)
    and exists(select * from chans c where c.channel_id=v.channel_id)
)
   , missing_and_fixes as (
  select channel_id, video_id, 'missing'
  from missing
  union all
  select channel_id, video_id, 'copyright'
  from video_extra e
  where (error like '%copyright%' or sub_error like '%copyright%')
    and copyright_holder is null
    and e.channel_id is not null
    and exists(select * from chans c where c.channel_id=e.channel_id)
)
select *
from missing_and_fixes
");

            return ids;
        }