Пример #1
0
        /// <summary>
        /// Starts <c>Source.CopyTo</c> into the store path <c>sync/{table name}/{loadId}</c> and, while that copy
        /// runs, repeatedly lists the path and loads every not-yet-loaded file into <paramref name="loadTable"/>,
        /// deleting each file from the store once it has been loaded.
        /// </summary>
        /// <param name="tableCfg">Table being synced; its <c>Name</c> is part of the store path and log messages.</param>
        /// <param name="log">Logger for progress messages and passed to <c>Store.Delete</c>.</param>
        /// <param name="loadId">Identifier of this load; last segment of the store path.</param>
        /// <param name="sourceSql">Query handed to <c>Source.CopyTo</c>.</param>
        /// <param name="maxTs">Passed through to <c>Source.CopyTo</c> unchanged.</param>
        /// <param name="loadTable">Destination table given to <c>Dest.LoadFrom</c>.</param>
        /// <returns>Total size of all files that were loaded.</returns>
        /// <remarks>
        /// If the copy task faulted or was cancelled, its exception is rethrown after the loop instead of being
        /// silently dropped.
        /// </remarks>
        async Task <ByteSize> LoadBLobData(SyncTableCfg tableCfg, ILogger log, string loadId, string sourceSql, object maxTs, TableId loadTable)
        {
            var path        = StringPath.Relative("sync", tableCfg.Name, loadId);
            var copyTask    = Source.CopyTo(path, sourceSql, tableCfg, maxTs);
            var loadedFiles = new KeyedCollection <StringPath, FileListItem>(f => f.Path);

            while (true) // load as the files are created
            {
                if (copyTask.IsFaulted)
                {
                    break;
                }

                // Sample completion BEFORE listing. If the copy finishes between the listing and the check,
                // a file written in that window would otherwise never be seen: the listing was empty and the
                // completed task would end the loop.
                var copyFinished = copyTask.IsCompleted;

                var toLoad = (await Store.List(path).SelectManyList())
                             .Where(f => !loadedFiles.ContainsKey(f.Path)).ToArray();
                if (toLoad.None())
                {
                    if (copyFinished)
                    {
                        break;
                    }
                    await 5.Seconds().Delay();
                    continue;
                }
                log.Debug("Sync {Table} - loading: {Files}", tableCfg.Name, toLoad.Join("|", l => l.Path.ToString()));
                await Dest.LoadFrom(toLoad.Select(f => f.Path), loadTable);

                loadedFiles.AddRange(toLoad);
                await toLoad.BlockAction(f => Store.Delete(f.Path, log), parallel : 8);
            }

            // Observe the copy task so a failed copy surfaces to the caller rather than being reported as success.
            await copyTask;

            var loadedSize = loadedFiles.Sum(f => f.Bytes).Bytes();
            log.Information("Sync {Table} - copied {Files} files ({Size})", tableCfg.Name, loadedFiles.Count, loadedSize.Humanize("#,#"));
            return(loadedSize);
        }
Пример #2
0
        /// <summary>
        /// Saves <paramref name="rawHtml"/> to <c>LogStore</c> under <c>{yyyy-MM-dd}/{videoId}.html</c> (UTC date)
        /// and writes a warning that includes the URL of the saved file.
        /// </summary>
        /// <param name="msg">Short description of what could not be parsed; included in the warning.</param>
        /// <param name="ex">Exception raised by the parser; may be null.</param>
        /// <param name="videoId">Video the html belongs to; used as the file name.</param>
        /// <param name="rawHtml">Page content to store.</param>
        /// <param name="log">Logger used for the save and the warning.</param>
        async Task LogParseError(string msg, Exception ex, string videoId, string rawHtml, ILogger log)
        {
            // Invariant culture: the folder name must not depend on the calendar of the machine's current culture.
            var day    = DateTime.UtcNow.ToString("yyyy-MM-dd", System.Globalization.CultureInfo.InvariantCulture);
            var path   = StringPath.Relative(day, $"{videoId}.html");
            var logUrl = LogStore.Url(path);
            await LogStore.Save(path, rawHtml.AsStream(), log);

            log.Warning(ex, "WebScraper - {VideoId} - saved html that we could not parse '{msg}' ({Url}). error: {Error}",
                        videoId, msg, logUrl, ex?.ToString());
        }
Пример #3
0
        /// <summary>
        /// Writes <c>Cfg</c> as <c>cfg.json</c> into the local results folder for <paramref name="dir"/> and then
        /// saves that file to <c>Store</c> under the same relative folder.
        /// </summary>
        /// <param name="dir">Folder name, relative to both the local results directory and the store.</param>
        async Task SaveCfg(string dir)
        {
            const string cfgFileName = "cfg.json";

            var resultsDir = LocalResultsDir.Combine(dir);
            resultsDir.EnsureDirectoryExists();

            var remotePath = StringPath.Relative(dir).Add(cfgFileName);
            var cfgFile    = resultsDir.Combine(cfgFileName);

            Cfg.ToJsonFile(cfgFile);
            await Store.Save(remotePath, cfgFile);
        }
Пример #4
0
        /// <summary>
        /// Serializes <paramref name="rows"/> to <c>{name}.parquet</c> in the local results folder for
        /// <paramref name="dir"/>, then saves the file to <c>Store</c> under the same relative folder.
        /// </summary>
        /// <param name="rows">Rows to serialize.</param>
        /// <param name="name">File name without the <c>.parquet</c> extension.</param>
        /// <param name="dir">Folder name, relative to both the local results directory and the store.</param>
        async Task SaveParquet <T>(IEnumerable <T> rows, string name, string dir) where T : new()
        {
            var remoteDir   = StringPath.Relative(dir);
            var parquetFile = LocalResultsDir.Combine(dir).Combine($"{name}.parquet");
            var remotePath  = remoteDir.Add(parquetFile.FileName);

            ParquetConvert.Serialize(rows, parquetFile.FullPath);
            await Store.Save(remotePath, parquetFile);

            Log.Information("Saved {Path}", remotePath);
        }
Пример #5
0
 /// <summary>
 /// Splits <paramref name="rows"/> into chunks of 200000, serializes each chunk to
 /// <c>{name}.{chunk index}.parquet</c> in the local results folder for <paramref name="dir"/> and saves each
 /// file to <c>Store</c>; chunks are passed through <c>BlockTransform</c> with an argument of 4.
 /// </summary>
 /// <param name="rows">Rows to serialize.</param>
 /// <param name="name">File name prefix; the chunk index and <c>.parquet</c> are appended.</param>
 /// <param name="dir">Folder name, relative to both the local results directory and the store.</param>
 async Task SaveParquet <T>(IEnumerable <T> rows, string name, string dir) where T : new()
 {
     var numberedBatches = rows.Chunk(200000).Select((batchRows, batchNo) => (batchRows, batchNo));

     await numberedBatches.BlockTransform(async batch => {
         var remoteDir = StringPath.Relative(dir);
         var batchFile = LocalResultsDir.Combine(dir).Combine($"{name}.{batch.batchNo}.parquet");

         ParquetConvert.Serialize(batch.batchRows, batchFile.FullPath);

         var remotePath = remoteDir.Add(batchFile.FileName);
         await Store.Save(remotePath, batchFile);
         Log.Information("Saved {Path}", remotePath);

         return remoteDir;
     }, 4);
 }
Пример #6
0
        /// <summary>
        /// Runs <paramref name="sql"/> on a connection opened from <c>Sf</c>, converts every row to a camel-cased
        /// <c>JObject</c> and passes the row enumerator to <c>BlobIndex.SaveIndexedJsonl</c> for the path
        /// <c>index/{name}</c>.
        /// </summary>
        /// <param name="log">Logger used for the connection and the index save.</param>
        /// <param name="name">Index name; also passed to <c>QueryBlocking</c> and used as the last path segment.</param>
        /// <param name="indexCols">Index columns; converted with <c>CamelIndex</c> before use.</param>
        /// <param name="sql">Query producing the rows.</param>
        /// <param name="size">Passed through to <c>BlobIndex.SaveIndexedJsonl</c>.</param>
        /// <param name="onProcessed">Optional callback passed through to <c>BlobIndex.SaveIndexedJsonl</c>.</param>
        /// <returns>The value returned by <c>SaveIndexedJsonl</c> together with the index path.</returns>
        async Task <(BlobIndexMeta index, StringPath path)> UpdateBlobIndex(ILogger log, string name, string[] indexCols, string sql,
                                                                            ByteSize size, Action <JObject> onProcessed = null)
        {
            using var con = await Sf.OpenConnection(log);

            // the cast is needed because FromObject on a dynamic argument is itself dynamically typed
            var camelRows = con.QueryBlocking <dynamic>(name, sql)
                            .Select(row => ((JObject)JObject.FromObject(row)).ToCamelCase());
            var rowEnumerator = camelRows.GetEnumerator();

            var indexFields = CamelIndex(indexCols);
            var indexPath   = StringPath.Relative("index", name);
            var indexMeta   = await BlobIndex.SaveIndexedJsonl(indexPath, rowEnumerator, indexFields, size, log, onProcessed);

            return (indexMeta, indexPath);
        }
Пример #7
0
        /// <summary>
        /// Fetches the closed caption track whose language code is <c>"en"</c> for <paramref name="videoId"/>,
        /// joins the caption texts with newlines and saves the result to <c>Store</c> at
        /// <c>VideoCaptions/{videoId}.txt</c>.
        /// </summary>
        /// <param name="videoId">Video to fetch captions for.</param>
        /// <returns>The joined caption text, or null when the video has no <c>"en"</c> track.</returns>
        public async Task <string> GetAndUpdateVideoCaptions(string videoId)
        {
            var trackInfos  = await ytScaper.GetVideoClosedCaptionTrackInfosAsync(videoId);
            var englishInfo = trackInfos.FirstOrDefault(t => t.Language.Code == "en");
            if (englishInfo == null)
            {
                return null;
            }

            var englishTrack = await ytScaper.GetClosedCaptionTrackAsync(englishInfo);
            var captionText  = englishTrack.Captions.Select(c => c.Text).Join("\n");

            var savePath = StringPath.Relative("VideoCaptions", $"{videoId}.txt");
            await Store.Save(savePath, captionText.AsStream());

            return captionText;
        }
Пример #8
0
        /// <summary>
        /// Fetches the <c>"en"</c> closed caption track for <paramref name="videoId"/>, joins the caption texts
        /// with newlines and saves them to <c>Store</c> at <c>VideoCaptions/{channelId}/{videoId}.txt</c>.
        /// Failures while fetching are logged as warnings and yield null; a failure while saving is logged as a
        /// warning and the text is still returned.
        /// </summary>
        /// <param name="channelId">Channel the video belongs to; used as a folder in the store path.</param>
        /// <param name="videoId">Video to fetch captions for.</param>
        /// <param name="log">Logger that receives the warnings.</param>
        /// <returns>The joined caption text, or null when there is no <c>"en"</c> track or fetching failed.</returns>
        public async Task <string> GetAndUpdateVideoCaptions(string channelId, string videoId, ILogger log)
        {
            IReadOnlyList <ClosedCaptionTrackInfo> trackInfos;
            try {
                trackInfos = await ytScaper.GetVideoClosedCaptionTrackInfosAsync(videoId);
            }
            catch (Exception listEx) {
                log.Warning(listEx, "Unable to get captions for {VideoID}: {Error}", videoId, listEx.Message);
                return null;
            }

            var englishInfo = trackInfos.FirstOrDefault(t => t.Language.Code == "en");
            if (englishInfo == null)
            {
                return null;
            }

            ClosedCaptionTrack englishTrack;
            try {
                // only HttpRequestException is retried; anything else goes straight to the catch below
                var retryPolicy = Policy.Handle <HttpRequestException>().RetryWithBackoff();
                englishTrack = await retryPolicy.ExecuteAsync(() => ytScaper.GetClosedCaptionTrackAsync(englishInfo));
            }
            catch (Exception trackEx) {
                log.Warning(trackEx, "Unable to get captions for {VideoID}: {Error}", videoId, trackEx.Message);
                return null;
            }

            var captionText = englishTrack.Captions.Select(c => c.Text).Join("\n");
            if (captionText == null)
            {
                return null;
            }

            var savePath = StringPath.Relative("VideoCaptions", channelId, $"{videoId}.txt");
            try {
                await Store.Save(savePath, captionText.AsStream());
            }
            catch (Exception saveEx) {
                log.Warning(saveEx, "Error when saving captions {Path}", savePath);
            }
            return captionText;
        }
Пример #9
0
        /// <summary>
        /// For every seed channel (in random order) that has no files under <c>VideoCaptions/{channel id}</c>,
        /// takes the channel's 50 most recently published videos and calls
        /// <c>GetAndUpdateVideoCaptions</c> for each of them.
        /// </summary>
        public async Task SaveCaptions()
        {
            var cfg = await Setup.LoadCfg();
            var log = Setup.CreateTestLogger();

            var ytStore  = new YtStore(new YtClient(cfg.App, log), cfg.DataStore());
            var channels = await cfg.App.LoadChannelConfig();

            foreach (var seed in channels.Seeds.Randomize())
            {
                var captionFiles = await ytStore.Store.List(StringPath.Relative("VideoCaptions", seed.Id));
                var knownIds     = captionFiles.Select(b => b.NameSansExtension).ToHashSet();

                // channels that already have any caption file are skipped entirely
                if (knownIds.Any())
                {
                    continue;
                }

                var channelVideos = await ytStore.ChannelVideosCollection.Get(seed.Id);
                var newest        = channelVideos.Vids.OrderByDescending(v => v.PublishedAt).Take(50);
                var missing       = newest.Where(v => !knownIds.Contains(v.VideoId)).ToList();

                await missing.BlockAction(v => ytStore.GetAndUpdateVideoCaptions(seed.Id, v.VideoId, log), cfg.App.ParallelCollect);
            }
        }
Пример #10
0
        //ytInitialPlayerResponse.responseContext.serviceTrackingParams.filter(p => p.service == "CSI")[0].params
        /// <summary>
        /// Loads the watch page and info dictionary for <paramref name="videoId"/>, builds the video's extra
        /// details, detects playback errors and, when there is no error, parses the recommendations.
        /// </summary>
        /// <remarks>
        /// Errors are probed in a fixed order and the first probe that sets <c>Error</c> wins:
        /// the <c>ytInitialPlayerResponse</c> status, then the age-restriction meta tag, then the
        /// <c>#unavailable-submessage</c> element. When an error is found the result carries an empty rec array.
        /// When no recs are found (or parsing threw) the raw page is saved to <c>LogStore</c> and a warning is logged.
        /// </remarks>
        /// <param name="videoId">Video to scrape.</param>
        /// <param name="log">Logger used for the page requests, the html save and the warning.</param>
        /// <returns>The extra details together with the parsed recs (empty array when an error was detected).</returns>
        public async Task <RecsAndExtra> GetRecsAndExtra(string videoId, ILogger log)
        {
            var watchPage = await GetVideoWatchPageHtmlAsync(videoId, log);

            var(html, raw, url) = watchPage;
            var infoDic = await GetVideoInfoDicAsync(videoId, log);

            var videoItem = GetVideo(videoId, infoDic, watchPage);

            // videoItem is treated as optional: every field falls back to null when it is missing
            var extra = new VideoExtraStored2 {
                VideoId      = videoId,
                Updated      = DateTime.UtcNow,
                ChannelId    = videoItem?.ChannelId,
                ChannelTitle = videoItem?.ChannelTitle,
                Description  = videoItem?.Description,
                Duration     = videoItem?.Duration,
                Keywords     = videoItem?.Keywords,
                Title        = videoItem?.Title,
                UploadDate   = videoItem?.UploadDate,
                AddedDate    = videoItem?.AddedDate,
                Statistics   = videoItem?.Statistics,
                Source       = ScrapeSource.Web
            };

            // probe 1: player response embedded in the page reports a status other than "OK"
            var ytInitPr = GetClientObjectFromWatchPage(html, "ytInitialPlayerResponse");

            if (ytInitPr != null && ytInitPr.Value <string>("status") != "OK")
            {
                var playerError = ytInitPr.SelectToken("playabilityStatus.errorScreen.playerErrorMessageRenderer");
                extra.Error    = playerError?.SelectToken("reason.simpleText")?.Value <string>();
                extra.SubError = (playerError?.SelectToken("subreason.simpleText") ??
                                  playerError?.SelectToken("subreason.runs[0].text"))
                                 ?.Value <string>();
            }
            // probe 2: age restriction meta tag
            if (extra.Error == null)
            {
                var restrictedMode = html.QueryElements("head > meta[property=\"og:restrictions:age\"]").FirstOrDefault()?.GetAttribute("content")?.Value == "18+";
                if (restrictedMode)
                {
                    extra.Error    = RestrictedVideoError;
                    extra.SubError = "Unable to find recommended video because it is age restricted and requires to log in";
                }
            }
            // probe 3: "unavailable" elements; note this overwrites any SubError set by probe 1
            if (extra.Error == null)
            {
                extra.SubError = html.QueryElements("#unavailable-submessage").FirstOrDefault()?.GetInnerText();
                if (extra.SubError == "")
                {
                    extra.SubError = null;
                }
                if (extra.SubError.HasValue()) // all pages have the error, but not a sub-error
                {
                    extra.Error = html.QueryElements("#unavailable-message").FirstOrDefault()?.GetInnerText();
                }
            }
            if (extra.Error != null)
            {
                return(new RecsAndExtra(extra, new Rec[] { }));
            }

            var(recs, recEx) = Def.New(() => GetRecs2(html)).Try();
            if (recs?.Any() != true || recEx != null)
            {
                var uri    = new Uri(url);
                // Invariant culture: the folder name must not depend on the calendar of the machine's current culture.
                var day    = DateTime.UtcNow.ToString("yyyy-MM-dd", System.Globalization.CultureInfo.InvariantCulture);
                var path   = StringPath.Relative(day, $"{uri.PathAndQuery}.html");
                var logUrl = LogStore.Url(path);
                await LogStore.Save(path, raw.AsStream(), log);

                log.Warning("WebScraper - Unable to find recs at ({Url}). error: {Error}", logUrl, recEx?.ToString());
            }

            return(new RecsAndExtra(extra, recs));
        }
Пример #11
0
 /// <summary>
 /// Saves <paramref name="tempFile"/> to <c>Store</c> twice, concurrently: once under
 /// <c>{Version}/{yyyy-MM-dd}</c> (UTC date) and once under <c>{Version}/latest</c>.
 /// </summary>
 /// <param name="fileName">Name the file gets inside both folders.</param>
 /// <param name="tempFile">Local file to save.</param>
 async Task SaveToLatestAndDateDirs(string fileName, FPath tempFile)
 {
   // Invariant culture: the folder name must not depend on the calendar of the machine's current culture.
   var day = DateTime.UtcNow.ToString("yyyy-MM-dd", System.Globalization.CultureInfo.InvariantCulture);

   await Task.WhenAll(
     Store.Save(StringPath.Relative(Version, day).Add(fileName), tempFile),
     Store.Save(StringPath.Relative(Version, "latest").Add(fileName), tempFile)
   );
 }
Пример #12
0
 /// <summary>
 /// Creates an append collection store for the captions of <paramref name="channelId"/>, rooted at
 /// <c>captions/{channelId}</c> and keyed by the file-safe timestamp of each caption's <c>UploadDate</c>.
 /// </summary>
 /// <param name="channelId">Channel whose caption store is requested.</param>
 public AppendCollectionStore <VideoCaptionStored2> CaptionStore(string channelId)
 {
     var captionsPath = StringPath.Relative("captions", channelId);
     var version      = StoreVersion.ToString();

     return new AppendCollectionStore <VideoCaptionStored2>(Store, captionsPath, c => c.UploadDate.FileSafeTimestamp(), version, Log);
 }
Пример #13
0
 /// <summary>
 /// Creates an append collection store for the recs of <paramref name="channelId"/>, rooted at
 /// <c>recs/{channelId}</c> and keyed by the file-safe timestamp of each rec's <c>Updated</c> value.
 /// </summary>
 /// <param name="channelId">Channel whose rec store is requested.</param>
 public AppendCollectionStore <RecStored2> RecStore(string channelId)
 {
     var recsPath = StringPath.Relative("recs", channelId);
     var version  = StoreVersion.ToString();

     return new AppendCollectionStore <RecStored2>(Store, recsPath, r => r.Updated.FileSafeTimestamp(), version, Log);
 }
Пример #14
0
 /// <summary>
 /// Converts an <c>FPath</c> to a <c>StringPath</c> built from the same tokens: absolute when
 /// <paramref name="path"/> is rooted, relative otherwise.
 /// </summary>
 /// <param name="path">File path to convert.</param>
 public static StringPath ToStringPath(this FPath path)
 {
     if (path.IsRooted)
     {
         return StringPath.Absolute(path.Tokens);
     }

     return StringPath.Relative(path.Tokens);
 }
Пример #15
0
        //ytInitialPlayerResponse.responseContext.serviceTrackingParams.filter(p => p.service == "CSI")[0].params
        /// <summary>
        /// Loads the watch page and info dictionary for <paramref name="videoId"/>, builds the video's extra
        /// details, detects playback errors and, when there is no error, parses the recommendations and the
        /// ad flag.
        /// </summary>
        /// <remarks>
        /// An age-restricted page sets <c>Error</c> directly; otherwise <c>Error</c> is only taken from
        /// <c>#unavailable-message</c> when <c>#unavailable-submessage</c> has text. When an error is found the
        /// result carries an empty rec array. When no recs are found (or parsing threw) the raw page is saved to
        /// <c>LogStore</c> and a warning is logged.
        /// </remarks>
        /// <param name="videoId">Video to scrape.</param>
        /// <param name="log">Logger used for the page requests, the html save and the warning.</param>
        /// <returns>The extra details together with the parsed recs (empty array when an error was detected).</returns>
        public async Task <RecsAndExtra> GetRecsAndExtra(string videoId, ILogger log)
        {
            var watchPage = await GetVideoWatchPageHtmlAsync(videoId, log);

            var(html, raw, url) = watchPage;
            var infoDic = await GetVideoInfoDicAsync(videoId, log);

            var videoItem = GetVideo(videoId, infoDic, watchPage);

            var extra = new VideoExtraStored2 {
                VideoId      = videoId,
                Updated      = DateTime.UtcNow,
                ChannelId    = videoItem.ChannelId,
                ChannelTitle = videoItem.ChannelTitle,
                Description  = videoItem.Description,
                Duration     = videoItem.Duration,
                Keywords     = videoItem.Keywords,
                Title        = videoItem.Title,
                UploadDate   = videoItem.UploadDate.UtcDateTime,
                Statistics   = videoItem.Statistics,
                Source       = ScrapeSource.Web,
                Thumbnail    = VideoThumbnail.FromVideoId(videoId)
            };

            var restrictedMode = html.QueryElements("head > meta[property=\"og:restrictions:age\"]").FirstOrDefault()?.GetAttribute("content")?.Value == "18+";

            if (restrictedMode)
            {
                extra.Error    = RestrictedVideoError;
                extra.SubError = "Unable to find recommended video because it is age restricted and requires to log in";
            }
            else
            {
                extra.SubError = html.QueryElements("#unavailable-submessage").FirstOrDefault()?.GetInnerText();
                if (extra.SubError == "")
                {
                    extra.SubError = null;
                }
                if (extra.SubError.HasValue()) // all pages have the error, but not a sub-error
                {
                    extra.Error = html.QueryElements("#unavailable-message").FirstOrDefault()?.GetInnerText();
                }
            }
            if (extra.Error != null)
            {
                return(new RecsAndExtra(extra, new Rec[] { }));
            }


            var(recs, recEx) = Def.New(() => GetRecs2(html)).Try();
            if (recs?.Any() != true || recEx != null)
            {
                var uri    = new Uri(url);
                // Invariant culture: the folder name must not depend on the calendar of the machine's current culture.
                var day    = DateTime.UtcNow.ToString("yyyy-MM-dd", System.Globalization.CultureInfo.InvariantCulture);
                var path   = StringPath.Relative(day, $"{uri.PathAndQuery}.html");
                var logUrl = LogStore.Url(path);
                await LogStore.Save(path, raw.AsStream(), log);

                log.Warning("WebScraper - Unable to find recs at ({Url}). error: {Error}", logUrl, recEx?.ToString());
            }

            // HasAd is true only when the ad regex matches the raw page and its first group is "1"
            var match = _ytAdRegex.Match(raw);

            extra.HasAd = match.Success && match.Groups[1].Value == "1";

            return(new RecsAndExtra(extra, recs));
        }