Ejemplo n.º 1
0
        /// <summary>
        ///   Picks the videos of a channel whose recommendations should be refreshed in this run.
        ///   With no previous rec file, the newest <c>RCfg.RefreshRecsMax</c> videos are chosen. Otherwise, when
        ///   <c>ChannelInTodaysUpdate</c> says the channel is due in the 7-day cycle, videos uploaded after the previous
        ///   update or younger than <c>RCfg.RefreshRecsWithin</c> are chosen (capped at <c>RCfg.RefreshRecsMax</c>) and
        ///   topped up with the newest remaining videos until <c>RCfg.RefreshRecsMin</c> is reached.
        ///   On any other day nothing is chosen.
        /// </summary>
        /// <param name="c">Channel being collected; supplies the id for the rec-store lookup and the title for logging.</param>
        /// <param name="vids">Candidate videos; they are sorted newest-first by upload date before selection.</param>
        /// <returns>Ids of the videos to refresh (possibly empty).</returns>
        async Task <IReadOnlyCollection <string> > VideoToUpdateRecs(ChannelStored2 c, IEnumerable <VideoItem> vids)
        {
            var prevUpdateMeta = await Store.Recs.LatestFile(c.ChannelId);

            var prevUpdate           = prevUpdateMeta?.Ts.ParseFileSafeTimestamp();
            var vidsDesc             = vids.OrderByDescending(v => v.UploadDate).ToList();
            var inThisWeeksRecUpdate = ChannelInTodaysUpdate(c, cycleDays: 7);

            var toUpdate = new List <VideoItem>();

            if (prevUpdate == null)
            {
                Log.Debug("Collect - {Channel} - first rec update, collecting max", c.ChannelTitle);
                toUpdate.AddRange(vidsDesc.Take(RCfg.RefreshRecsMax));
            }
            else if (inThisWeeksRecUpdate)
            {
                Log.Debug("Collect - {Channel} - performing weekly recs update", c.ChannelTitle);
                toUpdate.AddRange(vidsDesc.Where(v => v.UploadDate > prevUpdate || v.UploadDate.IsYoungerThan(RCfg.RefreshRecsWithin))
                                  .Take(RCfg.RefreshRecsMax));
                var deficit = RCfg.RefreshRecsMin - toUpdate.Count;
                if (deficit > 0)
                {
                    // If we don't have enough new videos, refresh the min amount by adding the newest remaining ones.
                    // The top-up is materialised before it is appended so the query never reads the list it is being
                    // added to, and a set of ids replaces the per-candidate scan of the whole list.
                    // HashSet.Add returns false for an id already chosen, which also skips duplicate ids in vidsDesc.
                    var chosenIds = new HashSet <string>(toUpdate.Select(u => u.Id));
                    var topUp     = vidsDesc.Where(v => chosenIds.Add(v.Id)).Take(deficit).ToList();
                    toUpdate.AddRange(topUp);
                }
            }
            else
            {
                Log.Debug("Collect - {Channel} - skipping rec update because it's not this channels day", c.ChannelTitle);
            }

            return(toUpdate.Select(v => v.Id).ToList());
        }
Ejemplo n.º 2
0
        /// <summary>
        ///   Scrapes recs and extra video detail through both the chrome and web scrapers, then appends the results
        ///   to the rec store (only when the <c>VidRecs</c> part should run) and to the video-extra store.
        /// </summary>
        /// <param name="c">Channel the videos belong to; used to fill in missing channel id/title on extras.</param>
        /// <param name="parts">Collect parts selected for this run.</param>
        /// <param name="forChromeUpdate">Video ids passed to the chrome scraper.</param>
        /// <param name="forWebUpdate">Video ids passed to the web scraper.</param>
        /// <param name="log">Logger passed to the scrapers and stores.</param>
        async Task SaveRecsAndExtra(ChannelStored2 c, CollectPart[] parts, HashSet <string> forChromeUpdate, string[] forWebUpdate, ILogger log)
        {
            var fromChrome = await ChromeScraper.GetRecsAndExtra(forChromeUpdate, log);
            var fromWeb    = await Scraper.GetRecsAndExtra(forWebUpdate, log, c.ChannelId, c.ChannelTitle);

            var combined = fromChrome.Concat(fromWeb).ToArray();
            var extras   = combined.Select(v => v.Extra).NotNull().ToArray();

            // A video that errored may not have picked up its channel, so fall back to the channel being collected.
            foreach (var item in extras)
            {
                if (item.ChannelId == null)
                {
                    item.ChannelId = c.ChannelId;
                }
                if (item.ChannelTitle == null)
                {
                    item.ChannelTitle = c.ChannelTitle;
                }
            }

            var updated    = DateTime.UtcNow;
            var storedRecs = new List <RecStored2>();

            if (parts.ShouldRun(VidRecs))
            {
                storedRecs.AddRange(ToRecStored(combined, updated));
                if (storedRecs.Count > 0)
                {
                    await Store.Recs.Append(storedRecs, log);
                }
            }

            if (extras.Length > 0)
            {
                await Store.VideoExtra.Append(extras, log);
            }

            var commentCount = extras.Sum(e => e.Comments?.Length ?? 0);
            log.Information("Collect - {Channel} - Recorded {WebExtras} web-extras, {ChromeExtras} chrome-extras, {Recs} recs, {Comments} comments",
                            c.ChannelTitle, fromWeb.Count, fromChrome.Count, storedRecs.Count, commentCount);
        }
Ejemplo n.º 3
0
        /// <summary>
        ///   Converts the given videos into <c>VideoStored2</c> records stamped with the current UTC time, appends
        ///   them to <paramref name="vidStore"/> when there are any, and logs how many were new versus updated.
        /// </summary>
        /// <param name="c">Channel whose id and title are copied onto every stored record.</param>
        /// <param name="vids">Videos to store.</param>
        /// <param name="vidStore">Store the records are appended to.</param>
        /// <param name="uploadedFrom">Videos uploaded after this date count as new; when null, every video counts as new.</param>
        /// <param name="log">Logger for the store and the summary message.</param>
        static async Task SaveVids(ChannelStored2 c, IReadOnlyCollection <VideoItem> vids, JsonlStore <VideoStored2> vidStore, DateTime?uploadedFrom,
                                   ILogger log)
        {
            var stampedAt = DateTime.UtcNow;
            var records   = new List <VideoStored2>(vids.Count);

            foreach (var vid in vids)
            {
                records.Add(new VideoStored2 {
                    VideoId      = vid.Id,
                    Title        = vid.Title,
                    Description  = vid.Description,
                    Duration     = vid.Duration,
                    Keywords     = vid.Keywords.ToList(),
                    Statistics   = vid.Statistics,
                    ChannelId    = c.ChannelId,
                    ChannelTitle = c.ChannelTitle,
                    UploadDate   = vid.UploadDate,
                    AddedDate    = vid.AddedDate,
                    Updated      = stampedAt
                });
            }

            if (records.Any())
            {
                await vidStore.Append(records, log);
            }

            // Without a cut-off date everything is treated as new.
            var newCount = uploadedFrom == null
                ? records.Count
                : records.Count(r => r.UploadDate > uploadedFrom);

            log.Information("Collect - {Channel} - Recorded {VideoCount} videos. {NewCount} new, {UpdatedCount} updated",
                            c.ChannelTitle, vids.Count, newCount, vids.Count - newCount);
        }
Ejemplo n.º 4
0
        /// <summary>
        ///   Saves English captions for the videos in <paramref name="vids"/> that were uploaded after the latest
        ///   caption file already stored for the channel (or for all of them when no caption file exists).
        ///   After <c>MaxConsecutiveCaptionsMissing</c> videos in a row have no English track, the remaining
        ///   videos are skipped without calling the scraper.
        /// </summary>
        /// <param name="channel">Channel the videos belong to.</param>
        /// <param name="vids">Candidate videos.</param>
        /// <param name="log">Logger for progress and warnings.</param>
        async Task SaveNewCaptions(ChannelStored2 channel, IEnumerable <VideoItem> vids, ILogger log)
        {
            var lastUpload =
                (await Store.Captions.LatestFile(channel.ChannelId))?.Ts.ParseFileSafeTimestamp(); // last video upload in this channel partition we have captions for

            // Shared by the GetCaption calls that BlockFunc runs with RCfg.CaptionParallel parallelism,
            // so it is only modified through Interlocked.
            var consecutiveCaptionMissing = 0;

            // Returns the stored caption record for a video, or null when it is skipped, has no English track, or fails.
            async Task <VideoCaptionStored2> GetCaption(VideoItem v)
            {
                if (consecutiveCaptionMissing >= MaxConsecutiveCaptionsMissing)
                {
                    return(null);
                }
                var videoLog = log.ForContext("VideoId", v.Id);
                ClosedCaptionTrack track;

                try {
                    var captions = await Scraper.GetCaptions(v.Id, log);

                    var enInfo = captions.FirstOrDefault(t => t.Language.Code == "en");
                    if (enInfo == null)
                    {
                        // Log only once, at the moment the threshold is first reached.
                        if (Interlocked.Increment(ref consecutiveCaptionMissing) == MaxConsecutiveCaptionsMissing)
                        {
                            log.Debug("SaveCaptions - too many consecutive videos are missing captions. Assuming it won't have any.");
                        }
                        return(null);
                    }
                    track = await Scraper.GetClosedCaptionTrackAsync(enInfo, videoLog);

                    // A caption was found, so the run of misses is over. Reset with Interlocked to match the increment above.
                    Interlocked.Exchange(ref consecutiveCaptionMissing, 0);
                }
                catch (Exception ex) {
                    ex.ThrowIfUnrecoverable();
                    log.Warning(ex, "Unable to get captions for {VideoID}: {Error}", v.Id, ex.Message);
                    return(null);
                }

                return(new VideoCaptionStored2 {
                    ChannelId = channel.ChannelId,
                    VideoId = v.Id,
                    UploadDate = v.UploadDate,
                    Updated = DateTime.UtcNow, // UTC, so the stored timestamp does not depend on the machine's time zone
                    Info = track.Info,
                    Captions = track.Captions
                });
            }

            var captionsToStore =
                (await vids.Where(v => lastUpload == null || v.UploadDate > lastUpload)
                 .BlockFunc(GetCaption, RCfg.CaptionParallel)).NotNull().ToList();

            if (captionsToStore.Any())
            {
                await Store.Captions.Append(captionsToStore, log);
            }

            log.Information("Collect - {Channel} - Saved {Captions} captions", channel.ChannelTitle, captionsToStore.Count);
        }
Ejemplo n.º 5
0
 /// <summary>
 ///   Streams a channel's uploads page by page. Every video on a fetched page is yielded; enumeration ends
 ///   after the first page that contains a video added before <paramref name="uploadFrom"/>.
 /// </summary>
 /// <param name="c">Channel whose uploads are read.</param>
 /// <param name="uploadFrom">Cut-off compared against each video's <c>AddedDate</c>.</param>
 /// <param name="log">Logger passed to the scraper.</param>
 async IAsyncEnumerable <VideoItem> ChannelVidItems(ChannelStored2 c, DateTime uploadFrom, ILogger log)
 {
     await foreach (var page in Scraper.GetChannelUploadsAsync(c.ChannelId, log))
     {
         // The page is already fetched, so hand out everything on it.
         foreach (var vid in page)
         {
             yield return(vid);
         }

         // Stop requesting further pages once this one reached back past the cut-off.
         var reachedOlderVideos = page.Any(vid => vid.AddedDate < uploadFrom);
         if (reachedOlderVideos)
         {
             yield break;
         }
     }
 }
Ejemplo n.º 6
0
        /// <summary>
        ///   Saves English captions for the videos in <paramref name="vids"/> that were uploaded after the latest
        ///   caption file in the channel's caption store (or for all of them when the store has no file yet).
        /// </summary>
        /// <param name="channel">Channel the videos belong to; selects the caption store.</param>
        /// <param name="vids">Candidate videos.</param>
        /// <param name="log">Logger for warnings and the summary message.</param>
        async Task SaveNewCaptions(ChannelStored2 channel, IEnumerable <VideoItem> vids, ILogger log)
        {
            var store      = Store.CaptionStore(channel.ChannelId);
            var lastUpload = (await store.LatestFileMetadata())?.Ts.ParseFileSafeTimestamp(); // last video upload we have captions for

            // Returns the caption record for a video, or null when it has no English track or the scrape failed.
            async Task <VideoCaptionStored2> GetCaption(VideoItem v)
            {
                var videoLog = log.ForContext("VideoId", v.Id);

                ClosedCaptionTrack track;

                try {
                    var captions = await Scraper.GetCaptions(v.Id, log);

                    var enInfo = captions.FirstOrDefault(t => t.Language.Code == "en");
                    if (enInfo == null)
                    {
                        return(null);
                    }
                    track = await Scraper.GetClosedCaptionTrackAsync(enInfo, videoLog);
                }
                catch (Exception ex) {
                    // Best effort: a failure for one video is logged and that video is skipped.
                    log.Warning(ex, "Unable to get captions for {VideoID}: {Error}", v.Id, ex.Message);
                    return(null);
                }

                return(new VideoCaptionStored2 {
                    VideoId = v.Id,
                    UploadDate = v.UploadDate.UtcDateTime,
                    Updated = DateTime.UtcNow, // UTC, matching UploadDate above instead of machine-local time
                    Info = track.Info,
                    Captions = track.Captions
                });
            }

            var captionsToStore =
                (await vids.Where(v => lastUpload == null || v.UploadDate.UtcDateTime > lastUpload)
                 .BlockTransform(GetCaption, Cfg.DefaultParallel)).NotNull().ToList();

            if (captionsToStore.Any())
            {
                await store.Append(captionsToStore);
            }

            log.Information("{Channel} - Saved {Captions} captions", channel.ChannelTitle, captionsToStore.Count);
        }
Ejemplo n.º 7
0
 /// <summary>
 ///   Streams a channel's uploads, yielding videos while their upload date is later than
 ///   <paramref name="uploadFrom"/> and ending the whole enumeration at the first video that is not.
 /// </summary>
 /// <param name="c">Channel whose uploads are read.</param>
 /// <param name="uploadFrom">Cut-off compared against each video's <c>UploadDate</c>.</param>
 /// <param name="log">Logger passed to the scraper.</param>
 async IAsyncEnumerable <VideoItem> ChannelVidItems(ChannelStored2 c, DateTime uploadFrom, ILogger log)
 {
     await foreach (var page in Scraper.GetChannelUploadsAsync(c.ChannelId, log))
     {
         foreach (var vid in page)
         {
             var uploadedAfterCutOff = vid.UploadDate > uploadFrom;
             if (!uploadedAfterCutOff)
             {
                 yield break; // stop at the first video that is not newer than uploadFrom
             }

             yield return(vid);
         }
     }
 }
Ejemplo n.º 8
0
        /// <summary>
        ///   Runs the video, caption and rec update for one channel. Channels with a status message are skipped
        ///   entirely. When the latest video file was modified within <c>RCfg.RefreshAllAfter</c>, videos and captions
        ///   are skipped, and recs are saved only when <c>UpdateType</c> is <c>AllWithMissingRecs</c>.
        /// </summary>
        /// <param name="c">Channel to update.</param>
        /// <param name="conn">Database connection handed on to <c>SaveRecs</c>.</param>
        /// <param name="log">Logger for progress messages.</param>
        async Task UpdateAllInChannel(ChannelStored2 c, DbConnection conn, ILogger log)
        {
            if (c.StatusMessage.HasValue())
            {
                log.Information("{Channel} - Not updating videos/recs/captions because it has a status msg: {StatusMessage} ",
                                c.ChannelTitle, c.StatusMessage);
                return;
            }
            log.Information("{Channel} - Starting channel update of videos/recs/captions", c.ChannelTitle);

            var videoStore = Store.VideoStore(c.ChannelId);
            var latestMeta = await videoStore.LatestFileMetadata();

            var lastUpload   = latestMeta?.Ts?.ParseFileSafeTimestamp();
            var lastModified = latestMeta?.Modified;
            var isFresh      = lastModified != null && lastModified.Value.IsYoungerThan(RCfg.RefreshAllAfter);

            // Oldest upload date to re-read. With existing data the window overlaps earlier runs,
            // so a history of video stats builds up.
            var uploadedFrom = latestMeta == null ? RCfg.From : DateTime.UtcNow - RCfg.RefreshVideosWithin;

            if (isFresh)
            {
                log.Information("{Channel} - skipping update, video stats have been updated recently {LastModified}", c.ChannelTitle, lastModified);
            }

            var vids     = isFresh ? null : await ChannelVidItems(c, uploadedFrom, log).ToListAsync();
            var haveVids = vids != null;

            if (haveVids)
            {
                await SaveVids(c, vids, videoStore, lastUpload, log);
                await SaveNewCaptions(c, vids, log);
            }

            var recsWanted = haveVids || UpdateType == UpdateType.AllWithMissingRecs;
            if (!recsWanted)
            {
                return;
            }

            await SaveRecs(c, vids, conn, log);
        }
Ejemplo n.º 9
0
        /// <summary>
        ///   Returns an updated copy of <paramref name="channel"/> populated from the API. When the API returns
        ///   no data the copy is marked <c>ChannelStatus.Dead</c>. When the call throws, the error is logged and
        ///   the copy is returned with only <c>Updated</c> changed.
        /// </summary>
        /// <param name="channel">Stored channel to refresh; it is cloned, not modified.</param>
        /// <param name="full">Passed to the API call; when true and data is returned, <c>LastFullUpdate</c> is also stamped.</param>
        /// <param name="log">Base logger; channel context properties are added to it.</param>
        /// <returns>The refreshed clone.</returns>
        async Task <ChannelStored2> UpdateChannel(ChannelStored2 channel, bool full, ILogger log)
        {
            // NOTE(review): both "Channel" and "ChannelId" are set to the channel id here - confirm whether
            // "Channel" was meant to carry the title.
            var channelLog = log.ForContext("Channel", channel.ChannelId).ForContext("ChannelId", channel.ChannelId);
            var c          = channel.JsonClone();

            try {
                // UTC, so Updated and LastFullUpdate do not depend on the machine's time zone.
                c.Updated = DateTime.UtcNow;
                var d = await Api.ChannelData(c.ChannelId, full); // to save quota - full update only when missing features channels

                if (d != null)
                {
                    c.ChannelTitle       = d.Title;
                    c.Description        = d.Description;
                    c.LogoUrl            = d.Thumbnails?.Default__?.Url;
                    c.Subs               = d.Stats?.SubCount;
                    c.ChannelViews       = d.Stats?.ViewCount;
                    c.Country            = d.Country;
                    // The fields below keep their previous value when the API response leaves them null.
                    c.FeaturedChannelIds = d.FeaturedChannelIds ?? c.FeaturedChannelIds;
                    c.Keywords           = d.Keywords ?? c.Keywords;
                    c.Subscriptions      = d.Subscriptions ?? c.Subscriptions;
                    c.DefaultLanguage    = d.DefaultLanguage ?? c.DefaultLanguage;
                    c.Status             = ChannelStatus.Alive;
                    if (full)
                    {
                        c.LastFullUpdate = c.Updated;
                    }
                }
                else
                {
                    c.Status = ChannelStatus.Dead;
                }
                channelLog.Information("Collect - {Channel} - read {Full} channel details ({ReviewStatus})",
                                       c.ChannelTitle, full ? "full" : "simple", c.ReviewStatus.EnumString());
            }
            catch (Exception ex) {
                // Best effort: the failure is logged and the partially updated clone is still returned.
                channelLog.Error(ex, "Collect - {Channel} - Error when updating details for channel : {Error}", c.ChannelTitle, ex.Message);
            }
            return(c);
        }
Ejemplo n.º 10
0
 /// <summary>Creates an update plan for a single channel.</summary>
 /// <param name="channel">Value stored in <c>Channel</c>.</param>
 /// <param name="update">Value stored in <c>Update</c>; defaults to <c>Standard</c>.</param>
 /// <param name="videosFrom">Value stored in <c>VideosFrom</c>; null when not supplied.</param>
 public ChannelUpdatePlan(ChannelStored2 channel, UpdateChannelType update = Standard, DateTime?videosFrom = null) =>
     (Channel, Update, VideosFrom) = (channel, update, videosFrom);
Ejemplo n.º 11
0
 /// <summary>
 ///   Decides whether a channel falls on today's slot of a repeating cycle of <paramref name="cycleDays"/> days.
 ///   The channel's slot is derived from a stable hash of its id, so it is the same in every process and is
 ///   never negative. <c>string.GetHashCode()</c> guarantees neither, which could leave a channel unmatched
 ///   or move its slot between runs.
 /// </summary>
 /// <param name="c">Channel whose <c>ChannelId</c> determines the slot.</param>
 /// <param name="cycleDays">Length of the cycle in days; expected to be positive.</param>
 /// <returns>True when the channel's slot equals today's slot (whole days since the Unix epoch modulo the cycle).</returns>
 static bool ChannelInTodaysUpdate(ChannelStored2 c, int cycleDays) =>
 StableBucket(c.ChannelId, cycleDays) == (DateTime.Today - DateTime.UnixEpoch).TotalDays.RoundToInt() % cycleDays;

 /// <summary>
 ///   Maps <paramref name="key"/> to a bucket in [0, <paramref name="buckets"/>) with an FNV-1a style hash
 ///   over the string's UTF-16 code units. The result depends only on the characters of the key.
 /// </summary>
 static int StableBucket(string key, int buckets)
 {
     unchecked
     {
         var hash = 2166136261u; // FNV-1a 32-bit offset basis
         foreach (var ch in key)
         {
             hash = (hash ^ ch) * 16777619u; // FNV 32-bit prime
         }
         // Unsigned modulo keeps the bucket non-negative.
         return (int)(hash % (uint)buckets);
     }
 }
Ejemplo n.º 12
0
        /// <summary>
        ///   Queries <c>video_latest</c> for the channel's videos that have no row in <c>rec</c> with a matching
        ///   <c>from_video_id</c>.
        /// </summary>
        /// <param name="c">Channel whose videos are checked.</param>
        /// <param name="connection">Open database connection; it is used but not closed here.</param>
        /// <returns>Id and title of each video without recommendations.</returns>
        async Task <IReadOnlyCollection <(string Id, string Title)> > VideosWithNoRecs(ChannelStored2 c, DbConnection connection)
        {
            var ids = new List <(string, string)>();

            // The command and reader are disposed so they release their resources as soon as the query is read.
            using (var cmd = connection.CreateCommand())
            {
                // NOTE(review): the channel id is interpolated straight into the SQL text. If ids can ever come from
                // an untrusted source this should become a bound parameter (placeholder syntax is provider-specific).
                cmd.CommandText = $@"select v.video_id, v.video_title
      from video_latest v
        where
      v.channel_id = '{c.ChannelId}'
      and not exists(select * from rec r where r.from_video_id = v.video_id)
                     group by v.video_id, v.video_title";

                using (var reader = await cmd.ExecuteReaderAsync())
                {
                    while (await reader.ReadAsync())
                    {
                        ids.Add((reader["VIDEO_ID"].ToString(), reader["VIDEO_TITLE"].ToString()));
                    }
                }
            }

            Log.Information("{Channel} - found {Recommendations} video's missing recommendations", c.ChannelTitle, ids.Count);
            return(ids);
        }
Ejemplo n.º 13
0
        /// <summary>
        ///   Collects recommendations for a set of the channel's videos and appends them to the channel's rec store.
        ///   The set is the videos with no stored recs when <c>UpdateType</c> is <c>AllWithMissingRecs</c>, otherwise
        ///   whatever <c>VideoToUpdateRecs</c> selects from <paramref name="vids"/>. Recs are scraped first; videos
        ///   whose scrape produced nothing are retried through the API.
        /// </summary>
        /// <param name="c">Channel the source videos belong to.</param>
        /// <param name="vids">Candidate videos; only used when <c>UpdateType</c> is not <c>AllWithMissingRecs</c>.</param>
        /// <param name="conn">Database connection used to find videos without recs.</param>
        /// <param name="log">Logger passed to the scraper and used for warnings.</param>
        async Task SaveRecs(ChannelStored2 c, IReadOnlyCollection <VideoItem> vids, DbConnection conn, ILogger log)
        {
            var recStore = Store.RecStore(c.ChannelId);

            var toUpdate = UpdateType == UpdateType.AllWithMissingRecs
        ? await VideosWithNoRecs(c, conn)
        : await VideoToUpdateRecs(vids, recStore);

            // Scrape recs for every selected video, Cfg.DefaultParallel is passed as the parallelism setting.
            var recs = await toUpdate.BlockTransform(
                async v => (fromId : v.Id, fromTitle : v.Title, recs : await Scraper.GetRecs(v.Id, log)),
                Cfg.DefaultParallel);

            // Videos whose scrape gave no recs (either because of an error, or because the video is 18+ restricted)
            // are read from the API instead. NOTE(review): None() is presumably "has no items" - confirm.
            var failed = recs.Where(v => v.recs.None()).ToList();

            if (failed.Any())
            {
                var apiRecs = await failed.BlockTransform(async f => {
                    // Start from an empty list so an API failure still yields an (empty) result for the video.
                    ICollection <RecommendedVideoListItem> related = new List <RecommendedVideoListItem>();
                    try {
                        related = await Api.GetRelatedVideos(f.fromId);
                    }
                    catch (Exception ex) {
                        log.Warning(ex, "Unable to get related videos for {VideoId}: {Error}", f.fromId, ex.Message);
                    }
                    return(f.fromId, f.fromTitle, recs : related.NotNull().Select(r => new Rec {
                        Source = RecSource.Api,
                        ToChannelTitle = r.ChannelTitle,
                        ToChannelId = r.ChannelId,
                        ToVideoId = r.VideoId,
                        ToVideoTitle = r.VideoTitle,
                        Rank = r.Rank
                    }).ToReadOnly());
                });

                // The API results are appended; the original failed entries stay in the list as well.
                recs = recs.Concat(apiRecs).ToList();

                log.Information("{Channel} - {Videos} videos recommendations fell back to using the API: {VideoList}",
                                c.ChannelTitle, failed.Count, apiRecs.Select(r => r.fromId));
            }

            // One shared timestamp for every record written in this call.
            var updated    = DateTime.UtcNow;
            var recsStored = recs
                             .SelectMany(v => v.recs.Select((r, i) => new RecStored2 {
                FromChannelId  = c.ChannelId,
                FromVideoId    = v.fromId,
                FromVideoTitle = v.fromTitle,
                ToChannelTitle = r.ToChannelTitle,
                ToChannelId    = r.ToChannelId,
                ToVideoId      = r.ToVideoId,
                ToVideoTitle   = r.ToVideoTitle,
                Rank           = i + 1, // 1-based position within this video's rec list; the Rec's own Rank is not used
                Source         = r.Source,
                Updated        = updated
            })).ToList();

            if (recsStored.Any())
            {
                await recStore.Append(recsStored);
            }

            // NOTE(review): this summary goes to the class-level Log, not the log parameter used above - confirm intent.
            Log.Information("{Channel} - Recorded {RecCount} recs: {Recs}", c.ChannelTitle, recsStored.Count, recs.Select(v => new { Id = v.fromId, v.recs.Count }).ToList());
        }
Ejemplo n.º 14
0
        /// <summary>
        ///   Reads the seed channels from the configured sheets, refreshes each one's details through the API and
        ///   appends the resulting records to the channel store. When <c>Cfg.LimitedToSeedChannels</c> is not
        ///   empty, only the listed channel ids are processed.
        /// </summary>
        /// <returns>The channel records built in this run.</returns>
        async Task <IReadOnlyCollection <ChannelStored2> > UpdateChannels()
        {
            var channelStore = Store.ChannelStore;

            var limitDescription = Cfg.LimitedToSeedChannels?.HasItems() == true ? Cfg.LimitedToSeedChannels.Join("|") : "All";
            Log.Information("Starting channels update. Limited to ({Included})", limitDescription);

            // Builds the stored record for one seed channel. A channel is marked dead when the API
            // returns nothing for it or the API call throws.
            async Task <ChannelStored2> UpdateChannel(ChannelSheet channel)
            {
                var log = Log.ForContext("Channel", channel.Title).ForContext("ChannelId", channel.Id);

                // Fallback used when the API call throws before a result is assigned.
                var data = new ChannelData {
                    Id = channel.Id, Title = channel.Title
                };

                try {
                    // Use API to get channel instead of scraper. We get better info faster
                    var fetched = await Api.ChannelData(channel.Id);
                    data = fetched ?? new ChannelData {
                        Id = channel.Id, Title = channel.Title, Status = ChannelStatus.Dead
                    };
                    log.Information("{Channel} - read channel details", data.Title);
                }
                catch (Exception ex) {
                    data.Status = ChannelStatus.Dead;
                    log.Error(ex, "{Channel} - Error when updating details for channel : {Error}", channel.Title, ex.Message);
                }

                return(new ChannelStored2 {
                    ChannelId     = channel.Id,
                    ChannelTitle  = data.Title ?? channel.Title,
                    Status        = data.Status,
                    MainChannelId = channel.MainChannelId,
                    Description   = data.Description,
                    LogoUrl       = data.Thumbnails?.Default__?.Url,
                    Subs          = data.Stats?.SubCount,
                    ChannelViews  = data.Stats?.ViewCount,
                    Country       = data.Country,
                    Updated       = DateTime.UtcNow,
                    Relevance     = channel.Relevance,
                    LR            = channel.LR,
                    HardTags      = channel.HardTags,
                    SoftTags      = channel.SoftTags,
                    UserChannels  = channel.UserChannels
                });
            }

            var seeds    = await ChannelSheets.Channels(Cfg.Sheets, Log);
            var selected = seeds.Where(c => Cfg.LimitedToSeedChannels.IsEmpty() || Cfg.LimitedToSeedChannels.Contains(c.Id));

            var channels = await selected.BlockTransform(UpdateChannel, Cfg.DefaultParallel,
                                                         progressUpdate: p => Log.Debug("Reading channels {ChannelCount}/{ChannelTotal}", p.CompletedTotal, seeds.Count));

            if (!channels.Any())
            {
                return(channels);
            }

            await channelStore.Append(channels);
            return(channels);
        }