/// <summary>
/// Starts copying source rows to blob files under <c>sync/{table name}/{loadId}</c> and, while that
/// copy is running, repeatedly lists the folder, loads any not-yet-loaded files into
/// <paramref name="loadTable"/> and deletes them from the store.
/// </summary>
/// <param name="tableCfg">Table being synced; its name is used in the blob path and in log messages.</param>
/// <param name="log">Logger for progress messages and for blob deletes.</param>
/// <param name="loadId">Identifier of this load; used as the last segment of the blob path.</param>
/// <param name="sourceSql">Query handed to the source copy.</param>
/// <param name="maxTs">Passed through to the source copy unchanged.</param>
/// <param name="loadTable">Destination table the files are loaded into.</param>
/// <returns>Total size of all files that were loaded.</returns>
/// <remarks>
/// If the copy task faulted or was cancelled, its exception is rethrown once the loop has ended
/// rather than being silently dropped.
/// </remarks>
async Task<ByteSize> LoadBLobData(SyncTableCfg tableCfg, ILogger log, string loadId, string sourceSql, object maxTs, TableId loadTable) {
  var path = StringPath.Relative("sync", tableCfg.Name, loadId);
  var copyTask = Source.CopyTo(path, sourceSql, tableCfg, maxTs);
  var loadedFiles = new KeyedCollection<StringPath, FileListItem>(f => f.Path);

  while (true) { // load as the files are created
    if (copyTask.IsFaulted) {
      break;
    }

    // Sample completion BEFORE listing. If the copy finishes between the listing and the check,
    // an empty listing would otherwise end the loop while freshly written files are still unloaded.
    var copyFinished = copyTask.IsCompleted;

    var toLoad = (await Store.List(path).SelectManyList())
      .Where(f => !loadedFiles.ContainsKey(f.Path))
      .ToArray();

    if (toLoad.None()) {
      if (copyFinished) {
        break;
      }
      await 5.Seconds().Delay();
      continue;
    }

    log.Debug("Sync {Table} - loading: {Files}", tableCfg.Name, toLoad.Join("|", l => l.Path.ToString()));
    await Dest.LoadFrom(toLoad.Select(f => f.Path), loadTable);
    loadedFiles.AddRange(toLoad);
    await toLoad.BlockAction(f => Store.Delete(f.Path, log), parallel: 8);
  }

  // The loop only exits once the copy task has completed (successfully or not), so this does not
  // wait; it surfaces a copy failure instead of reporting a partial load as success.
  await copyTask;

  var totalSize = loadedFiles.Sum(f => f.Bytes).Bytes();
  log.Information("Sync {Table} - copied {Files} files ({Size})", tableCfg.Name, loadedFiles.Count, totalSize.Humanize("#,#"));
  return totalSize;
}
/// <summary>
/// Saves the raw HTML of a page that could not be parsed to the log store, under
/// <c>{UTC date}/{videoId}.html</c>, and writes a warning that includes the saved file's URL.
/// </summary>
/// <param name="msg">Short description of what failed to parse; included in the warning.</param>
/// <param name="ex">Exception that was raised, if any (may be null).</param>
/// <param name="videoId">Video the HTML belongs to; used as the file name.</param>
/// <param name="rawHtml">HTML content to save.</param>
/// <param name="log">Logger used for the save and for the warning.</param>
async Task LogParseError(string msg, Exception ex, string videoId, string rawHtml, ILogger log) {
  // Format with the invariant culture so the date folder is always Gregorian yyyy-MM-dd,
  // whatever the culture of the host process.
  var day = DateTime.UtcNow.ToString("yyyy-MM-dd", System.Globalization.CultureInfo.InvariantCulture);
  var path = StringPath.Relative(day, $"{videoId}.html");
  var logUrl = LogStore.Url(path);
  await LogStore.Save(path, rawHtml.AsStream(), log);
  log.Warning(ex, "WebScraper - {VideoId} - saved html that we could not parse '{msg}' ({Url}). error: {Error}",
    videoId, msg, logUrl, ex?.ToString());
}
/// <summary>
/// Writes <c>Cfg</c> as <c>cfg.json</c> into the local results folder for <paramref name="dir"/>
/// (creating the folder when needed) and then saves that file to the store under the same relative directory.
/// </summary>
/// <param name="dir">Directory name, relative to both the local results folder and the store.</param>
async Task SaveCfg(string dir) {
  var localFolder = LocalResultsDir.Combine(dir);
  localFolder.EnsureDirectoryExists();

  var remoteFolder = StringPath.Relative(dir);
  var cfgFile = localFolder.Combine("cfg.json");
  Cfg.ToJsonFile(cfgFile);

  var remoteCfgPath = remoteFolder.Add("cfg.json");
  await Store.Save(remoteCfgPath, cfgFile);
}
/// <summary>
/// Serializes <paramref name="rows"/> to <c>{name}.parquet</c> in the local results folder for
/// <paramref name="dir"/>, then saves that file to the store under the same relative directory.
/// </summary>
/// <typeparam name="T">Row type to serialize.</typeparam>
/// <param name="rows">Rows to write.</param>
/// <param name="name">File name without extension.</param>
/// <param name="dir">Directory name, relative to both the local results folder and the store.</param>
async Task SaveParquet<T>(IEnumerable<T> rows, string name, string dir) where T : new() {
  var localDir = LocalResultsDir.Combine(dir);
  // Make sure the target folder exists before writing into it, so this method does not depend
  // on some other call having created it first.
  localDir.EnsureDirectoryExists();

  var storeDir = StringPath.Relative(dir);
  var localFile = localDir.Combine($"{name}.parquet");
  ParquetConvert.Serialize(rows, localFile.FullPath);

  var storePath = storeDir.Add(localFile.FileName);
  await Store.Save(storePath, localFile);
  Log.Information("Saved {Path}", storePath);
}
/// <summary>
/// Splits <paramref name="rows"/> into chunks of 200000, serializes each chunk to
/// <c>{name}.{chunk index}.parquet</c> in the local results folder for <paramref name="dir"/> and
/// saves each file to the store under the same relative directory. Chunks are processed via
/// <c>BlockTransform</c> with a parallelism argument of 4.
/// </summary>
/// <typeparam name="T">Row type to serialize.</typeparam>
/// <param name="rows">Rows to write.</param>
/// <param name="name">File name prefix (without chunk index and extension).</param>
/// <param name="dir">Directory name, relative to both the local results folder and the store.</param>
async Task SaveParquet<T>(IEnumerable<T> rows, string name, string dir) where T : new() {
  // Resolve the local folder once and make sure it exists before any chunk is written into it.
  var localDir = LocalResultsDir.Combine(dir);
  localDir.EnsureDirectoryExists();

  await rows.Chunk(200000)
    .Select((r, i) => (chunkRows: r, index: i))
    .BlockTransform(async chunk => {
      var storeDir = StringPath.Relative(dir);
      var localFile = localDir.Combine($"{name}.{chunk.index}.parquet");
      ParquetConvert.Serialize(chunk.chunkRows, localFile.FullPath);

      var storePath = storeDir.Add(localFile.FileName);
      await Store.Save(storePath, localFile);
      Log.Information("Saved {Path}", storePath);
      return storeDir; // result is not used by this method; BlockTransform is given a value-returning lambda
    }, 4);
}
/// <summary>
/// Runs <paramref name="sql"/> on a newly opened connection, converts each row to a camel-cased
/// <c>JObject</c> and saves the rows as indexed JSONL under <c>index/{name}</c>.
/// </summary>
/// <param name="log">Logger used for the connection and the index save.</param>
/// <param name="name">Index name; used as the query name and as the last segment of the blob path.</param>
/// <param name="indexCols">Columns to index by; passed through <c>CamelIndex</c> before use.</param>
/// <param name="sql">Query producing the rows to index.</param>
/// <param name="size">Passed through to <c>SaveIndexedJsonl</c>.</param>
/// <param name="onProcessed">Optional callback passed through to <c>SaveIndexedJsonl</c>.</param>
/// <returns>The index metadata returned by the save, and the path it was saved under.</returns>
async Task<(BlobIndexMeta index, StringPath path)> UpdateBlobIndex(ILogger log, string name, string[] indexCols, string sql,
  ByteSize size, Action<JObject> onProcessed = null) {
  using var con = await Sf.OpenConnection(log);

  // The enumerator is disposable; declaring it with `using` after the connection means it is
  // released before the connection it reads from is closed.
  using var rows = con.QueryBlocking<dynamic>(name, sql)
    .Select(d => (JObject)JObject.FromObject(d))
    .Select(j => j.ToCamelCase())
    .GetEnumerator();

  var camelIndex = CamelIndex(indexCols);
  var path = StringPath.Relative("index", name);
  var index = await BlobIndex.SaveIndexedJsonl(path, rows, camelIndex, size, log, onProcessed);
  return (index, path);
}
/// <summary>
/// Fetches the closed-caption track whose language code is "en" for <paramref name="videoId"/>,
/// joins the caption texts with new lines, saves the result to <c>VideoCaptions/{videoId}.txt</c>
/// and returns it.
/// </summary>
/// <param name="videoId">Video to fetch captions for.</param>
/// <returns>The caption text, or null when the video has no "en" track.</returns>
public async Task<string> GetAndUpdateVideoCaptions(string videoId) {
  var trackInfos = await ytScaper.GetVideoClosedCaptionTrackInfosAsync(videoId);
  var englishInfo = trackInfos.FirstOrDefault(t => t.Language.Code == "en");
  if (englishInfo == null) {
    return null;
  }

  var englishTrack = await ytScaper.GetClosedCaptionTrackAsync(englishInfo);
  var captionLines = englishTrack.Captions.Select(c => c.Text);
  var captionText = captionLines.Join("\n");

  var storePath = StringPath.Relative("VideoCaptions", $"{videoId}.txt");
  await Store.Save(storePath, captionText.AsStream());
  return captionText;
}
/// <summary>
/// Fetches the "en" closed-caption track for a video, saves its text to
/// <c>VideoCaptions/{channelId}/{videoId}.txt</c> and returns the text.
/// Failures are logged as warnings and not rethrown.
/// </summary>
/// <param name="channelId">Channel the video belongs to; used as a folder in the store path.</param>
/// <param name="videoId">Video to fetch captions for.</param>
/// <param name="log">Logger that receives the warnings.</param>
/// <returns>
/// The caption text; null when the track list or the track could not be fetched, or when there is
/// no "en" track. A failed save still returns the text.
/// </returns>
public async Task<string> GetAndUpdateVideoCaptions(string channelId, string videoId, ILogger log) {
  const string fetchFailedTemplate = "Unable to get captions for {VideoID}: {Error}";

  // Step 1: list the available caption tracks.
  IReadOnlyList<ClosedCaptionTrackInfo> trackInfos;
  try {
    trackInfos = await ytScaper.GetVideoClosedCaptionTrackInfosAsync(videoId);
  }
  catch (Exception ex) {
    log.Warning(ex, fetchFailedTemplate, videoId, ex.Message);
    return null;
  }

  var englishInfo = trackInfos.FirstOrDefault(t => t.Language.Code == "en");
  if (englishInfo == null) {
    return null;
  }

  // Step 2: download the track; HttpRequestException is retried with backoff by the policy.
  ClosedCaptionTrack englishTrack;
  try {
    var retryPolicy = Policy.Handle<HttpRequestException>().RetryWithBackoff();
    englishTrack = await retryPolicy.ExecuteAsync(() => ytScaper.GetClosedCaptionTrackAsync(englishInfo));
  }
  catch (Exception ex) {
    log.Warning(ex, fetchFailedTemplate, videoId, ex.Message);
    return null;
  }

  var captionText = englishTrack.Captions.Select(c => c.Text).Join("\n");
  if (captionText == null) {
    return captionText;
  }

  // Step 3: save the text; a failed save is only logged.
  var storePath = StringPath.Relative("VideoCaptions", channelId, $"{videoId}.txt");
  try {
    await Store.Save(storePath, captionText.AsStream());
  }
  catch (Exception ex) {
    log.Warning(ex, "Error when saving captions {Path}", storePath);
  }

  return captionText;
}
/// <summary>
/// Walks the seed channels in random order and, for each channel that has nothing stored under
/// <c>VideoCaptions/{channel id}</c>, fetches and saves captions for its 50 most recently
/// published videos.
/// </summary>
public async Task SaveCaptions() {
  var cfg = await Setup.LoadCfg();
  var log = Setup.CreateTestLogger();
  var store = new YtStore(new YtClient(cfg.App, log), cfg.DataStore());
  var channelCfg = await cfg.App.LoadChannelConfig();

  foreach (var seed in channelCfg.Seeds.Randomize()) {
    var captionBlobs = await store.Store.List(StringPath.Relative("VideoCaptions", seed.Id));
    var savedIds = captionBlobs.Select(b => b.NameSansExtension).ToHashSet();

    // Channels that already have any captions stored are skipped entirely.
    if (savedIds.Any()) {
      continue;
    }

    var channelVideos = await store.ChannelVideosCollection.Get(seed.Id);
    var pending = channelVideos.Vids
      .OrderByDescending(v => v.PublishedAt)
      .Take(50)
      .Where(v => !savedIds.Contains(v.VideoId))
      .ToList();

    await pending.BlockAction(v => store.GetAndUpdateVideoCaptions(seed.Id, v.VideoId, log), cfg.App.ParallelCollect);
  }
}
//ytInitialPlayerResponse.responseContext.serviceTrackingParams.filter(p => p.service == "CSI")[0].params
/// <summary>
/// Loads the watch page and info dictionary for a video and builds its extra metadata plus the
/// recommendations found on the page.
/// </summary>
/// <remarks>
/// Errors are looked for in this order, stopping at the first one that sets <c>extra.Error</c>:
/// a <c>ytInitialPlayerResponse</c> whose status is not "OK", the age-restriction meta tag, and
/// the page's unavailable message/sub-message. When an error is found the result has an empty
/// recommendation list. When no recommendations can be read, the raw page is saved to the log
/// store and a warning is written.
/// </remarks>
/// <param name="videoId">Video to scrape.</param>
/// <param name="log">Logger used for page requests, the log-store save and warnings.</param>
/// <returns>The extra metadata and the recommendations (which may be null when reading them failed).</returns>
public async Task<RecsAndExtra> GetRecsAndExtra(string videoId, ILogger log) {
  var watchPage = await GetVideoWatchPageHtmlAsync(videoId, log);
  var (html, raw, url) = watchPage;
  var infoDic = await GetVideoInfoDicAsync(videoId, log);
  var videoItem = GetVideo(videoId, infoDic, watchPage);

  var extra = new VideoExtraStored2 {
    VideoId = videoId,
    Updated = DateTime.UtcNow,
    ChannelId = videoItem?.ChannelId,
    ChannelTitle = videoItem?.ChannelTitle,
    Description = videoItem?.Description,
    Duration = videoItem?.Duration,
    Keywords = videoItem?.Keywords,
    Title = videoItem?.Title,
    UploadDate = videoItem?.UploadDate,
    AddedDate = videoItem?.AddedDate,
    Statistics = videoItem?.Statistics,
    Source = ScrapeSource.Web
  };

  // 1. Error reported by the player response embedded in the page.
  var ytInitPr = GetClientObjectFromWatchPage(html, "ytInitialPlayerResponse");
  if (ytInitPr != null && ytInitPr.Value<string>("status") != "OK") {
    var playerError = ytInitPr.SelectToken("playabilityStatus.errorScreen.playerErrorMessageRenderer");
    extra.Error = playerError?.SelectToken("reason.simpleText")?.Value<string>();
    var subReason = playerError?.SelectToken("subreason.simpleText") ??
                    playerError?.SelectToken("subreason.runs[0].text");
    extra.SubError = subReason?.Value<string>();
  }

  // 2. Age restriction flagged in the page head.
  if (extra.Error == null) {
    var restrictedMode = html.QueryElements("head > meta[property=\"og:restrictions:age\"]")
      .FirstOrDefault()?.GetAttribute("content")?.Value == "18+";
    if (restrictedMode) {
      extra.Error = RestrictedVideoError;
      extra.SubError = "Unable to find recommended video because it is age restricted and requires to log in";
    }
  }

  // 3. Unavailable message shown on the page.
  if (extra.Error == null) {
    extra.SubError = html.QueryElements("#unavailable-submessage").FirstOrDefault()?.GetInnerText();
    if (extra.SubError == "") {
      extra.SubError = null;
    }
    if (extra.SubError.HasValue()) { // all pages have the error, but not a sub-error
      extra.Error = html.QueryElements("#unavailable-message").FirstOrDefault()?.GetInnerText();
    }
  }

  if (extra.Error != null) {
    return new RecsAndExtra(extra, new Rec[] { });
  }

  var (recs, recEx) = Def.New(() => GetRecs2(html)).Try();
  if (recs?.Any() != true || recEx != null) {
    var uri = new Uri(url);
    // Invariant culture keeps the date folder as Gregorian yyyy-MM-dd regardless of host culture.
    var day = DateTime.UtcNow.ToString("yyyy-MM-dd", System.Globalization.CultureInfo.InvariantCulture);
    // NOTE(review): PathAndQuery is the URI's path plus query (e.g. it can contain '/' and '?'),
    // so the saved name is not a plain file name - confirm the log store is fine with that.
    var path = StringPath.Relative(day, $"{uri.PathAndQuery}.html");
    var logUrl = LogStore.Url(path);
    await LogStore.Save(path, raw.AsStream(), log);
    log.Warning("WebScraper - Unable to find recs at ({Url}). error: {Error}", logUrl, recEx?.ToString());
  }

  return new RecsAndExtra(extra, recs);
}
/// <summary>
/// Saves <paramref name="tempFile"/> to the store twice, concurrently: once under
/// <c>{Version}/{UTC date}/{fileName}</c> and once under <c>{Version}/latest/{fileName}</c>.
/// </summary>
/// <param name="fileName">Name to give the file in both store directories.</param>
/// <param name="tempFile">Local file to save.</param>
async Task SaveToLatestAndDateDirs(string fileName, FPath tempFile) {
  // Invariant culture keeps the date folder as Gregorian yyyy-MM-dd regardless of host culture.
  var day = DateTime.UtcNow.ToString("yyyy-MM-dd", System.Globalization.CultureInfo.InvariantCulture);
  var datedPath = StringPath.Relative(Version, day).Add(fileName);
  var latestPath = StringPath.Relative(Version, "latest").Add(fileName);

  await Task.WhenAll(
    Store.Save(datedPath, tempFile),
    Store.Save(latestPath, tempFile)
  );
}
/// <summary>
/// Creates an append collection store for the captions of one channel, rooted at
/// <c>captions/{channelId}</c>, with each item's timestamp taken from its upload date.
/// </summary>
/// <param name="channelId">Channel whose captions the store holds.</param>
/// <returns>A new store instance; nothing is cached between calls.</returns>
public AppendCollectionStore<VideoCaptionStored2> CaptionStore(string channelId) {
  var captionsPath = StringPath.Relative("captions", channelId);
  return new AppendCollectionStore<VideoCaptionStored2>(
    Store,
    captionsPath,
    caption => caption.UploadDate.FileSafeTimestamp(),
    StoreVersion.ToString(),
    Log);
}
/// <summary>
/// Creates an append collection store for the recommendations of one channel, rooted at
/// <c>recs/{channelId}</c>, with each item's timestamp taken from its <c>Updated</c> value.
/// </summary>
/// <param name="channelId">Channel whose recommendations the store holds.</param>
/// <returns>A new store instance; nothing is cached between calls.</returns>
public AppendCollectionStore<RecStored2> RecStore(string channelId) {
  var recsPath = StringPath.Relative("recs", channelId);
  return new AppendCollectionStore<RecStored2>(
    Store,
    recsPath,
    rec => rec.Updated.FileSafeTimestamp(),
    StoreVersion.ToString(),
    Log);
}
/// <summary>
/// Converts a file path to a <c>StringPath</c> built from the same tokens: absolute when the
/// file path is rooted, relative otherwise.
/// </summary>
/// <param name="path">File path to convert.</param>
/// <returns>The equivalent string path.</returns>
public static StringPath ToStringPath(this FPath path) {
  if (path.IsRooted) {
    return StringPath.Absolute(path.Tokens);
  }
  return StringPath.Relative(path.Tokens);
}
//ytInitialPlayerResponse.responseContext.serviceTrackingParams.filter(p => p.service == "CSI")[0].params
/// <summary>
/// Loads the watch page and info dictionary for a video and builds its extra metadata (including
/// whether the ad pattern matched the raw page) plus the recommendations found on the page.
/// </summary>
/// <remarks>
/// An age-restriction meta tag, or an unavailable sub-message on the page, is recorded as an
/// error and the result then has an empty recommendation list. When no recommendations can be
/// read, the raw page is saved to the log store and a warning is written.
/// </remarks>
/// <param name="videoId">Video to scrape.</param>
/// <param name="log">Logger used for page requests, the log-store save and warnings.</param>
/// <returns>The extra metadata and the recommendations (which may be null when reading them failed).</returns>
public async Task<RecsAndExtra> GetRecsAndExtra(string videoId, ILogger log) {
  var watchPage = await GetVideoWatchPageHtmlAsync(videoId, log);
  var (html, raw, url) = watchPage;
  var infoDic = await GetVideoInfoDicAsync(videoId, log);
  var videoItem = GetVideo(videoId, infoDic, watchPage);

  var extra = new VideoExtraStored2 {
    VideoId = videoId,
    Updated = DateTime.UtcNow,
    ChannelId = videoItem.ChannelId,
    ChannelTitle = videoItem.ChannelTitle,
    Description = videoItem.Description,
    Duration = videoItem.Duration,
    Keywords = videoItem.Keywords,
    Title = videoItem.Title,
    UploadDate = videoItem.UploadDate.UtcDateTime,
    Statistics = videoItem.Statistics,
    Source = ScrapeSource.Web,
    Thumbnail = VideoThumbnail.FromVideoId(videoId)
  };

  var restrictedMode = html.QueryElements("head > meta[property=\"og:restrictions:age\"]")
    .FirstOrDefault()?.GetAttribute("content")?.Value == "18+";
  if (restrictedMode) {
    extra.Error = RestrictedVideoError;
    extra.SubError = "Unable to find recommended video because it is age restricted and requires to log in";
  }
  else {
    extra.SubError = html.QueryElements("#unavailable-submessage").FirstOrDefault()?.GetInnerText();
    if (extra.SubError == "") {
      extra.SubError = null;
    }
    if (extra.SubError.HasValue()) { // all pages have the error, but not a sub-error
      extra.Error = html.QueryElements("#unavailable-message").FirstOrDefault()?.GetInnerText();
    }
  }

  if (extra.Error != null) {
    return new RecsAndExtra(extra, new Rec[] { });
  }

  var (recs, recEx) = Def.New(() => GetRecs2(html)).Try();
  if (recs?.Any() != true || recEx != null) {
    var uri = new Uri(url);
    // Invariant culture keeps the date folder as Gregorian yyyy-MM-dd regardless of host culture.
    var day = DateTime.UtcNow.ToString("yyyy-MM-dd", System.Globalization.CultureInfo.InvariantCulture);
    // NOTE(review): PathAndQuery is the URI's path plus query (e.g. it can contain '/' and '?'),
    // so the saved name is not a plain file name - confirm the log store is fine with that.
    var path = StringPath.Relative(day, $"{uri.PathAndQuery}.html");
    var logUrl = LogStore.Url(path);
    await LogStore.Save(path, raw.AsStream(), log);
    log.Warning("WebScraper - Unable to find recs at ({Url}). error: {Error}", logUrl, recEx?.ToString());
  }

  // HasAd is true only when the ad pattern matches and its first capture group is "1".
  var adMatch = _ytAdRegex.Match(raw);
  extra.HasAd = adMatch.Success && adMatch.Groups[1].Value == "1";

  return new RecsAndExtra(extra, recs);
}