/// <summary>
/// Picks the ids of the channel's videos whose recommendations should be refreshed in this run.
/// </summary>
/// <param name="c">Channel being collected; its id locates the latest recs file and its daily update slot.</param>
/// <param name="vids">Candidate videos for the channel (any order; they are sorted newest-first here).</param>
/// <returns>
/// Ids of the videos to refresh. Empty when the channel already has recs and it is not the channel's day in the 7-day cycle.
/// </returns>
async Task<IReadOnlyCollection<string>> VideoToUpdateRecs(ChannelStored2 c, IEnumerable<VideoItem> vids) {
    var prevUpdateMeta = await Store.Recs.LatestFile(c.ChannelId);
    var prevUpdate = prevUpdateMeta?.Ts.ParseFileSafeTimestamp();
    var vidsDesc = vids.OrderByDescending(v => v.UploadDate).ToList();
    var inThisWeeksRecUpdate = ChannelInTodaysUpdate(c, cycleDays: 7);

    var toUpdate = new List<VideoItem>();
    if (prevUpdate == null) {
        // no recs file yet: take the newest videos up to the configured maximum
        Log.Debug("Collect - {Channel} - first rec update, collecting max", c.ChannelTitle);
        toUpdate.AddRange(vidsDesc.Take(RCfg.RefreshRecsMax));
    }
    else if (inThisWeeksRecUpdate) {
        Log.Debug("Collect - {Channel} - performing weekly recs update", c.ChannelTitle);
        toUpdate.AddRange(vidsDesc
            .Where(v => v.UploadDate > prevUpdate || v.UploadDate.IsYoungerThan(RCfg.RefreshRecsWithin))
            .Take(RCfg.RefreshRecsMax));

        var deficit = RCfg.RefreshRecsMin - toUpdate.Count;
        if (deficit > 0) {
            // if we don't have new videos, refresh the min amount by adding videos.
            // Track chosen ids in a set (HashSet.Add returns false for ids already chosen) and materialize the
            // filler before appending, so we never query toUpdate while AddRange is mutating it.
            var chosenIds = new HashSet<string>(toUpdate.Select(u => u.Id));
            var filler = vidsDesc.Where(v => chosenIds.Add(v.Id)).Take(deficit).ToList();
            toUpdate.AddRange(filler);
        }
    }
    else {
        Log.Debug("Collect - {Channel} - skipping rec update because it's not this channels day", c.ChannelTitle);
    }

    return toUpdate.Select(v => v.Id).ToList();
}
/// <summary>
/// Fetches recs and extra details for the requested videos from both scrapers and appends the results to the stores.
/// </summary>
/// <param name="c">Channel being collected; its id and title are used to fill extras that are missing them.</param>
/// <param name="parts">Collect parts selected for this run; recs are only stored when <c>VidRecs</c> should run.</param>
/// <param name="forChromeUpdate">Videos handed to the chrome scraper.</param>
/// <param name="forWebUpdate">Videos handed to the web scraper.</param>
/// <param name="log">Logger used for store appends and the summary line.</param>
async Task SaveRecsAndExtra(ChannelStored2 c, CollectPart[] parts, HashSet<string> forChromeUpdate, string[] forWebUpdate, ILogger log) {
    var fromChrome = await ChromeScraper.GetRecsAndExtra(forChromeUpdate, log);
    var fromWeb = await Scraper.GetRecsAndExtra(forWebUpdate, log, c.ChannelId, c.ChannelTitle);
    var combined = fromChrome.Concat(fromWeb).ToArray();

    var extras = combined.Select(v => v.Extra).NotNull().ToArray();
    foreach (var item in extras) {
        // if the video has an error, it may not have picked up the channel
        if (item.ChannelId == null) {
            item.ChannelId = c.ChannelId;
        }
        if (item.ChannelTitle == null) {
            item.ChannelTitle = c.ChannelTitle;
        }
    }

    var updated = DateTime.UtcNow;
    var recsToStore = new List<RecStored2>();
    if (parts.ShouldRun(VidRecs)) {
        recsToStore.AddRange(ToRecStored(combined, updated));
        if (recsToStore.Count > 0) {
            await Store.Recs.Append(recsToStore, log);
        }
    }

    if (extras.Length > 0) {
        await Store.VideoExtra.Append(extras, log);
    }

    var commentCount = extras.Sum(e => e.Comments?.Length ?? 0);
    log.Information("Collect - {Channel} - Recorded {WebExtras} web-extras, {ChromeExtras} chrome-extras, {Recs} recs, {Comments} comments",
        c.ChannelTitle, fromWeb.Count, fromChrome.Count, recsToStore.Count, commentCount);
}
/// <summary>
/// Converts the given videos into stored records stamped with the current UTC time and appends them to the video store.
/// </summary>
/// <param name="c">Channel whose id and title are written onto every stored video.</param>
/// <param name="vids">Videos to store.</param>
/// <param name="vidStore">Store the records are appended to (skipped when there is nothing to write).</param>
/// <param name="uploadedFrom">Videos uploaded after this are reported as new; when null every video counts as new.</param>
/// <param name="log">Logger used for the append and the summary line.</param>
static async Task SaveVids(ChannelStored2 c, IReadOnlyCollection<VideoItem> vids, JsonlStore<VideoStored2> vidStore, DateTime? uploadedFrom, ILogger log) {
    var stamp = DateTime.UtcNow;

    var records = new List<VideoStored2>();
    foreach (var vid in vids) {
        records.Add(new VideoStored2 {
            VideoId = vid.Id,
            Title = vid.Title,
            Description = vid.Description,
            Duration = vid.Duration,
            Keywords = vid.Keywords.ToList(),
            Statistics = vid.Statistics,
            ChannelId = c.ChannelId,
            ChannelTitle = c.ChannelTitle,
            UploadDate = vid.UploadDate,
            AddedDate = vid.AddedDate,
            Updated = stamp
        });
    }

    if (records.Any()) {
        await vidStore.Append(records, log);
    }

    // everything else in the batch is reported as an update of a video we already knew about
    var newCount = records.Count(r => uploadedFrom == null || r.UploadDate > uploadedFrom);
    var updatedCount = vids.Count - newCount;
    log.Information("Collect - {Channel} - Recorded {VideoCount} videos. {NewCount} new, {UpdatedCount} updated",
        c.ChannelTitle, vids.Count, newCount, updatedCount);
}
/// <summary>
/// Saves English captions for the videos in <paramref name="vids"/> that were uploaded after the latest caption file
/// stored for the channel (or for all of them when no caption file exists).
/// </summary>
/// <param name="channel">Channel the videos belong to.</param>
/// <param name="vids">Candidate videos; only those newer than the latest stored caption file are fetched.</param>
/// <param name="log">Logger; a per-video context is derived from it for caption fetching.</param>
/// <remarks>
/// Once <c>MaxConsecutiveCaptionsMissing</c> videos in a row have no English track, the remaining videos are skipped.
/// Videos that fail with a recoverable error are logged and skipped.
/// </remarks>
async Task SaveNewCaptions(ChannelStored2 channel, IEnumerable<VideoItem> vids, ILogger log) {
    // last video upload in this channel partition we have captions for
    var lastUpload = (await Store.Captions.LatestFile(channel.ChannelId))?.Ts.ParseFileSafeTimestamp();
    var consecutiveCaptionMissing = 0;

    async Task<VideoCaptionStored2> GetCaption(VideoItem v) {
        // GetCaption runs through BlockFunc with RCfg.CaptionParallel, so the counter is only touched atomically
        if (Volatile.Read(ref consecutiveCaptionMissing) >= MaxConsecutiveCaptionsMissing) {
            return null;
        }

        var videoLog = log.ForContext("VideoId", v.Id);
        ClosedCaptionTrack track;
        try {
            var captions = await Scraper.GetCaptions(v.Id, videoLog);
            var enInfo = captions.FirstOrDefault(t => t.Language.Code == "en");
            if (enInfo == null) {
                if (Interlocked.Increment(ref consecutiveCaptionMissing) == MaxConsecutiveCaptionsMissing) {
                    log.Debug("SaveCaptions - too many consecutive videos are missing captions. Assuming it won't have any.");
                }
                return null;
            }
            track = await Scraper.GetClosedCaptionTrackAsync(enInfo, videoLog);
            Interlocked.Exchange(ref consecutiveCaptionMissing, 0); // a hit breaks the run of misses
        }
        catch (Exception ex) {
            ex.ThrowIfUnrecoverable();
            videoLog.Warning(ex, "Unable to get captions for {VideoID}: {Error}", v.Id, ex.Message);
            return null;
        }

        return new VideoCaptionStored2 {
            ChannelId = channel.ChannelId,
            VideoId = v.Id,
            UploadDate = v.UploadDate,
            Updated = DateTime.UtcNow, // UTC, matching the Updated stamp written for videos and recs
            Info = track.Info,
            Captions = track.Captions
        };
    }

    var captionsToStore = (await vids
            .Where(v => lastUpload == null || v.UploadDate > lastUpload)
            .BlockFunc(GetCaption, RCfg.CaptionParallel))
        .NotNull().ToList();

    if (captionsToStore.Any()) {
        await Store.Captions.Append(captionsToStore, log);
    }
    log.Information("Collect - {Channel} - Saved {Captions} captions", channel.ChannelTitle, captionsToStore.Count);
}
/// <summary>
/// Streams the channel's uploads page by page, stopping after the first page that contains a video added before
/// <paramref name="uploadFrom"/>.
/// </summary>
/// <param name="c">Channel whose uploads are read.</param>
/// <param name="uploadFrom">Cut-off compared against each video's <c>AddedDate</c>.</param>
/// <param name="log">Logger passed to the scraper.</param>
/// <returns>Every video of each page read, including the videos of the page that triggered the stop.</returns>
async IAsyncEnumerable<VideoItem> ChannelVidItems(ChannelStored2 c, DateTime uploadFrom, ILogger log) {
    await foreach (var page in Scraper.GetChannelUploadsAsync(c.ChannelId, log)) {
        // return all vids on a page because its free
        foreach (var item in page) {
            yield return item;
        }

        // but stop once we have a page with something older than uploadFrom
        var pageReachesCutOff = page.Any(p => p.AddedDate < uploadFrom);
        if (pageReachesCutOff) {
            yield break;
        }
    }
}
/// <summary>
/// Saves English captions for the videos in <paramref name="vids"/> that were uploaded after the latest caption file
/// stored for the channel (or for all of them when no caption file exists).
/// </summary>
/// <param name="channel">Channel the videos belong to; its id selects the caption store.</param>
/// <param name="vids">Candidate videos; only those newer than the latest stored caption file are fetched.</param>
/// <param name="log">Logger; a per-video context is derived from it for caption fetching.</param>
/// <remarks>Videos without an English track, or whose captions fail to download, are skipped.</remarks>
async Task SaveNewCaptions(ChannelStored2 channel, IEnumerable<VideoItem> vids, ILogger log) {
    var store = Store.CaptionStore(channel.ChannelId);
    // last video upload we have captions for
    var lastUpload = (await store.LatestFileMetadata())?.Ts.ParseFileSafeTimestamp();

    async Task<VideoCaptionStored2> GetCaption(VideoItem v) {
        var videoLog = log.ForContext("VideoId", v.Id);
        ClosedCaptionTrack track;
        try {
            var captions = await Scraper.GetCaptions(v.Id, videoLog);
            var enInfo = captions.FirstOrDefault(t => t.Language.Code == "en");
            if (enInfo == null) {
                return null;
            }
            track = await Scraper.GetClosedCaptionTrackAsync(enInfo, videoLog);
        }
        catch (Exception ex) {
            // best effort: one failing video must not stop captions for the rest of the channel
            videoLog.Warning(ex, "Unable to get captions for {VideoID}: {Error}", v.Id, ex.Message);
            return null;
        }

        return new VideoCaptionStored2 {
            VideoId = v.Id,
            UploadDate = v.UploadDate.UtcDateTime,
            Updated = DateTime.UtcNow, // UTC, consistent with the UTC UploadDate stored alongside it
            Info = track.Info,
            Captions = track.Captions
        };
    }

    var captionsToStore = (await vids
            .Where(v => lastUpload == null || v.UploadDate.UtcDateTime > lastUpload)
            .BlockTransform(GetCaption, Cfg.DefaultParallel))
        .NotNull().ToList();

    if (captionsToStore.Any()) {
        await store.Append(captionsToStore);
    }
    log.Information("{Channel} - Saved {Captions} captions", channel.ChannelTitle, captionsToStore.Count);
}
/// <summary>
/// Streams the channel's uploads in the order the scraper returns them, ending at the first video that was not
/// uploaded after <paramref name="uploadFrom"/>.
/// </summary>
/// <param name="c">Channel whose uploads are read.</param>
/// <param name="uploadFrom">Cut-off; only videos with an <c>UploadDate</c> later than this are returned.</param>
/// <param name="log">Logger passed to the scraper.</param>
async IAsyncEnumerable<VideoItem> ChannelVidItems(ChannelStored2 c, DateTime uploadFrom, ILogger log) {
    await foreach (var page in Scraper.GetChannelUploadsAsync(c.ChannelId, log)) {
        foreach (var item in page) {
            var isNewEnough = item.UploadDate > uploadFrom;
            if (!isNewEnough) {
                yield break; // break on the first video older than updateFrom.
            }
            yield return item;
        }
    }
}
/// <summary>
/// Runs the video, caption and rec updates for one channel.
/// </summary>
/// <param name="c">Channel to update; channels carrying a status message are skipped entirely.</param>
/// <param name="conn">Database connection handed on to the rec update.</param>
/// <param name="log">Logger for progress messages.</param>
/// <remarks>
/// Videos and captions are skipped when the latest video file was modified within <c>RCfg.RefreshAllAfter</c>.
/// Recs are still processed in that case when the update type is <c>AllWithMissingRecs</c>.
/// </remarks>
async Task UpdateAllInChannel(ChannelStored2 c, DbConnection conn, ILogger log) {
    if (c.StatusMessage.HasValue()) {
        log.Information("{Channel} - Not updating videos/recs/captions because it has a status msg: {StatusMessage} ",
            c.ChannelTitle, c.StatusMessage);
        return;
    }
    log.Information("{Channel} - Starting channel update of videos/recs/captions", c.ChannelTitle);

    var vidStore = Store.VideoStore(c.ChannelId);
    var latestMeta = await vidStore.LatestFileMetadata();
    var lastUpload = latestMeta?.Ts?.ParseFileSafeTimestamp();
    var lastModified = latestMeta?.Modified;
    var recentlyUpdated = lastModified != null && lastModified.Value.IsYoungerThan(RCfg.RefreshAllAfter);

    // get the oldest date for videos to store updated statistics for.
    // This overlaps so that we have a history of video stats.
    var uploadedFrom = latestMeta == null ? RCfg.From : DateTime.UtcNow - RCfg.RefreshVideosWithin;

    if (recentlyUpdated) {
        log.Information("{Channel} - skipping update, video stats have been updated recently {LastModified}",
            c.ChannelTitle, lastModified);
    }

    // null means "videos were not read this run"
    var vids = recentlyUpdated ? null : await ChannelVidItems(c, uploadedFrom, log).ToListAsync();
    var videosRead = vids != null;

    if (videosRead) {
        await SaveVids(c, vids, vidStore, lastUpload, log);
        await SaveNewCaptions(c, vids, log);
    }

    if (videosRead || UpdateType == UpdateType.AllWithMissingRecs) {
        await SaveRecs(c, vids, conn, log);
    }
}
/// <summary>
/// Returns an updated copy of <paramref name="channel"/> with details re-read from the API.
/// </summary>
/// <param name="channel">Stored channel to refresh; it is cloned, not modified.</param>
/// <param name="full">Whether to request the full channel data from the API.</param>
/// <param name="log">Logger; channel context is added to it for this call.</param>
/// <returns>
/// The refreshed clone. When the API returns nothing the clone is marked <c>ChannelStatus.Dead</c>. When the API call
/// throws, the error is logged and the clone is returned with whatever was set before the failure.
/// </returns>
async Task<ChannelStored2> UpdateChannel(ChannelStored2 channel, bool full, ILogger log) {
    // "Channel" carries the title (the message templates below also bind {Channel} to the title); the id goes in "ChannelId"
    var channelLog = log.ForContext("Channel", channel.ChannelTitle).ForContext("ChannelId", channel.ChannelId);
    var c = channel.JsonClone();
    try {
        c.Updated = DateTime.UtcNow; // UTC, so the stamp does not depend on the machine's time zone
        var d = await Api.ChannelData(c.ChannelId, full); // to save quota - full update only when missing features channels
        if (d != null) {
            c.ChannelTitle = d.Title;
            c.Description = d.Description;
            c.LogoUrl = d.Thumbnails?.Default__?.Url;
            c.Subs = d.Stats?.SubCount;
            c.ChannelViews = d.Stats?.ViewCount;
            c.Country = d.Country;
            // these are kept from the stored channel when the API response leaves them null
            c.FeaturedChannelIds = d.FeaturedChannelIds ?? c.FeaturedChannelIds;
            c.Keywords = d.Keywords ?? c.Keywords;
            c.Subscriptions = d.Subscriptions ?? c.Subscriptions;
            c.DefaultLanguage = d.DefaultLanguage ?? c.DefaultLanguage;
            c.Status = ChannelStatus.Alive;
            if (full) {
                c.LastFullUpdate = c.Updated;
            }
        }
        else {
            c.Status = ChannelStatus.Dead;
        }
        channelLog.Information("Collect - {Channel} - read {Full} channel details ({ReviewStatus})",
            c.ChannelTitle, full ? "full" : "simple", c.ReviewStatus.EnumString());
    }
    catch (Exception ex) {
        channelLog.Error(ex, "Collect - {Channel} - Error when updating details for channel : {Error}", c.ChannelTitle, ex.Message);
    }
    return c;
}
/// <summary>Creates an update plan for a channel.</summary>
/// <param name="channel">Channel the plan applies to.</param>
/// <param name="update">Kind of update to perform; defaults to <c>Standard</c>.</param>
/// <param name="videosFrom">Optional date stored on the plan as <c>VideosFrom</c>; null when not specified.</param>
public ChannelUpdatePlan(ChannelStored2 channel, UpdateChannelType update = Standard, DateTime? videosFrom = null) =>
    (Channel, Update, VideosFrom) = (channel, update, videosFrom);
/// <summary>
/// Decides whether a channel falls in today's slot of a repeating <paramref name="cycleDays"/>-day schedule.
/// </summary>
/// <param name="c">Channel to test; its <c>ChannelId</c> determines the slot.</param>
/// <param name="cycleDays">Length of the cycle in days (expected to be positive).</param>
/// <returns>True when the channel's slot equals today's slot (days since the Unix epoch modulo the cycle).</returns>
/// <remarks>
/// The slot is derived from a deterministic hash rather than <c>string.GetHashCode()</c>: the built-in hash can be
/// negative (giving a negative remainder that never equals the day slot) and is not stable between processes.
/// </remarks>
static bool ChannelInTodaysUpdate(ChannelStored2 c, int cycleDays) {
    var todaysSlot = (DateTime.Today - DateTime.UnixEpoch).TotalDays.RoundToInt() % cycleDays;
    return StableBucket(c.ChannelId, cycleDays) == todaysSlot;
}

/// <summary>Maps a string to a bucket in [0, buckets) with an FNV-1a-style hash over its UTF-16 code units, so the result is identical on every run.</summary>
static int StableBucket(string value, int buckets) {
    unchecked {
        var hash = 2166136261u;
        foreach (var ch in value) {
            hash = (hash ^ ch) * 16777619u;
        }
        // unsigned remainder is never negative
        return (int) (hash % (uint) buckets);
    }
}
/// <summary>
/// Queries the database for the channel's videos that have no recommendations recorded from them.
/// </summary>
/// <param name="c">Channel whose videos are checked.</param>
/// <param name="connection">Open database connection the query runs on; it is not closed here.</param>
/// <returns>Id and title of each video in <c>video_latest</c> for the channel with no matching row in <c>rec</c>.</returns>
async Task<IReadOnlyCollection<(string Id, string Title)>> VideosWithNoRecs(ChannelStored2 c, DbConnection connection) {
    var ids = new List<(string, string)>();

    // dispose the command and reader deterministically so the cursor is released as soon as we are done reading
    using (var cmd = connection.CreateCommand()) {
        // NOTE(review): the channel id is interpolated straight into the SQL text. Bind it as a command parameter
        // once the placeholder syntax of the provider behind this DbConnection is confirmed.
        cmd.CommandText = $@"select v.video_id, v.video_title from video_latest v where v.channel_id = '{c.ChannelId}' and not exists(select * from rec r where r.from_video_id = v.video_id) group by v.video_id, v.video_title";
        using (var reader = await cmd.ExecuteReaderAsync()) {
            while (await reader.ReadAsync()) {
                ids.Add((reader["VIDEO_ID"].ToString(), reader["VIDEO_TITLE"].ToString()));
            }
        }
    }

    Log.Information("{Channel} - found {Recommendations} video's missing recommendations", c.ChannelTitle, ids.Count);
    return ids;
}
/// <summary>
/// Collects recommendations for the channel's videos and appends them to the channel's rec store.
/// </summary>
/// <param name="c">Channel the source videos belong to.</param>
/// <param name="vids">Videos collected this run; not used when the update type is <c>AllWithMissingRecs</c>.</param>
/// <param name="conn">Database connection used to find videos without recs for <c>AllWithMissingRecs</c>.</param>
/// <param name="log">Logger for warnings and summaries.</param>
/// <remarks>
/// Recs are scraped first. Videos for which the scrape returned nothing are retried through the API, and the API
/// result replaces the empty scrape result. Stored ranks are the 1-based position of each rec in its list.
/// </remarks>
async Task SaveRecs(ChannelStored2 c, IReadOnlyCollection<VideoItem> vids, DbConnection conn, ILogger log) {
    var recStore = Store.RecStore(c.ChannelId);
    var toUpdate = UpdateType == UpdateType.AllWithMissingRecs
        ? await VideosWithNoRecs(c, conn)
        : await VideoToUpdateRecs(vids, recStore);

    var recs = await toUpdate.BlockTransform(
        async v => (fromId : v.Id, fromTitle : v.Title, recs : await Scraper.GetRecs(v.Id, log)),
        Cfg.DefaultParallel);

    // read failed recs from the API (either because of an error, or because the video is 18+ restricted)
    var failed = recs.Where(v => v.recs.None()).ToList();
    if (failed.Any()) {
        var apiRecs = await failed.BlockTransform(async f => {
            ICollection<RecommendedVideoListItem> related = new List<RecommendedVideoListItem>();
            try {
                related = await Api.GetRelatedVideos(f.fromId);
            }
            catch (Exception ex) {
                // best effort: the video simply ends up with no recs
                log.Warning(ex, "Unable to get related videos for {VideoId}: {Error}", f.fromId, ex.Message);
            }
            return (f.fromId, f.fromTitle, recs : related.NotNull().Select(r => new Rec {
                Source = RecSource.Api,
                ToChannelTitle = r.ChannelTitle,
                ToChannelId = r.ChannelId,
                ToVideoId = r.VideoId,
                ToVideoTitle = r.VideoTitle,
                Rank = r.Rank
            }).ToReadOnly());
        });

        // drop the empty scrape results before merging, otherwise every failed video is listed twice in the summary
        recs = recs.Where(v => !v.recs.None()).Concat(apiRecs).ToList();
        log.Information("{Channel} - {Videos} videos recommendations fell back to using the API: {VideoList}",
            c.ChannelTitle, failed.Count, apiRecs.Select(r => r.fromId));
    }

    var updated = DateTime.UtcNow;
    var recsStored = recs
        .SelectMany(v => v.recs.Select((r, i) => new RecStored2 {
            FromChannelId = c.ChannelId,
            FromVideoId = v.fromId,
            FromVideoTitle = v.fromTitle,
            ToChannelTitle = r.ToChannelTitle,
            ToChannelId = r.ToChannelId,
            ToVideoId = r.ToVideoId,
            ToVideoTitle = r.ToVideoTitle,
            Rank = i + 1,
            Source = r.Source,
            Updated = updated
        })).ToList();

    if (recsStored.Any()) {
        await recStore.Append(recsStored);
    }

    // use the caller's logger like the rest of the method, so the summary keeps its context
    log.Information("{Channel} - Recorded {RecCount} recs: {Recs}", c.ChannelTitle, recsStored.Count,
        recs.Select(v => new { Id = v.fromId, v.recs.Count }).ToList());
}
/// <summary>
/// Reads the seed channels from the channel sheets, refreshes each one's details through the API and appends the
/// results to the channel store.
/// </summary>
/// <returns>The refreshed channels (restricted to <c>Cfg.LimitedToSeedChannels</c> when that is not empty).</returns>
async Task<IReadOnlyCollection<ChannelStored2>> UpdateChannels() {
    var channelStore = Store.ChannelStore;

    var limitedTo = Cfg.LimitedToSeedChannels?.HasItems() == true ? Cfg.LimitedToSeedChannels.Join("|") : "All";
    Log.Information("Starting channels update. Limited to ({Included})", limitedTo);

    // Builds the stored record for one sheet row. A channel is marked dead when the API returns nothing or throws.
    async Task<ChannelStored2> ReadSheetChannel(ChannelSheet sheet) {
        var log = Log.ForContext("Channel", sheet.Title).ForContext("ChannelId", sheet.Id);

        // fallback used when the API call throws
        var data = new ChannelData { Id = sheet.Id, Title = sheet.Title };
        try {
            // Use API to get channel instead of scraper. We get better info faster
            var fromApi = await Api.ChannelData(sheet.Id);
            data = fromApi ?? new ChannelData { Id = sheet.Id, Title = sheet.Title, Status = ChannelStatus.Dead };
            log.Information("{Channel} - read channel details", data.Title);
        }
        catch (Exception ex) {
            data.Status = ChannelStatus.Dead;
            log.Error(ex, "{Channel} - Error when updating details for channel : {Error}", sheet.Title, ex.Message);
        }

        return new ChannelStored2 {
            ChannelId = sheet.Id,
            ChannelTitle = data.Title ?? sheet.Title,
            Status = data.Status,
            MainChannelId = sheet.MainChannelId,
            Description = data.Description,
            LogoUrl = data.Thumbnails?.Default__?.Url,
            Subs = data.Stats?.SubCount,
            ChannelViews = data.Stats?.ViewCount,
            Country = data.Country,
            Updated = DateTime.UtcNow,
            Relevance = sheet.Relevance,
            LR = sheet.LR,
            HardTags = sheet.HardTags,
            SoftTags = sheet.SoftTags,
            UserChannels = sheet.UserChannels
        };
    }

    var seeds = await ChannelSheets.Channels(Cfg.Sheets, Log);
    var selected = seeds.Where(s => Cfg.LimitedToSeedChannels.IsEmpty() || Cfg.LimitedToSeedChannels.Contains(s.Id));
    var channels = await selected.BlockTransform(ReadSheetChannel, Cfg.DefaultParallel,
        progressUpdate: p => Log.Debug("Reading channels {ChannelCount}/{ChannelTotal}", p.CompletedTotal, seeds.Count));

    if (channels.Any()) {
        await channelStore.Append(channels);
    }
    return channels;
}