/// <summary>
/// Saves html that could not be parsed to <c>LogStore</c> and writes a warning that links to the saved copy.
/// </summary>
/// <param name="msg">Message text included in the logged warning.</param>
/// <param name="ex">Exception attached to the warning; may be null.</param>
/// <param name="videoId">Video id, used for the saved file name and in the warning.</param>
/// <param name="rawHtml">Html content that is saved.</param>
/// <param name="log">Logger that receives the warning; also passed to <c>LogStore.Save</c>.</param>
async Task LogParseError(string msg, Exception ex, string videoId, string rawHtml, ILogger log)
{
    // Files are stored as <UTC date>/<videoId>.html
    var day = DateTime.UtcNow.ToString("yyyy-MM-dd");
    var htmlPath = StringPath.Relative(day, $"{videoId}.html");
    var savedUrl = LogStore.Url(htmlPath);

    await LogStore.Save(htmlPath, rawHtml.AsStream(), log);

    var errorText = ex?.ToString();
    log.Warning(ex, "WebScraper - {VideoId} - saved html that we could not parse '{msg}' ({Url}). error: {Error}", videoId, msg, savedUrl, errorText);
}
//ytInitialPlayerResponse.responseContext.serviceTrackingParams.filter(p => p.service == "CSI")[0].params
/// <summary>
/// Loads the watch page and info dictionary for a video, builds its <c>VideoExtraStored2</c> record,
/// and extracts the recommendations from the watch page.
/// </summary>
/// <remarks>
/// Errors are looked for in this order: the player error message in <c>ytInitialPlayerResponse</c>,
/// the age-restriction meta tag, then the <c>#unavailable-message</c> / <c>#unavailable-submessage</c> elements.
/// When an error is found the result carries an empty rec array.
/// When no recs are found, or extracting them threw, the raw html is saved to <c>LogStore</c> and a warning is logged.
/// </remarks>
/// <param name="videoId">Id of the video to scrape.</param>
/// <param name="log">Logger passed to the page/info loaders and used for warnings.</param>
/// <returns>The extra record together with the recs that were found.</returns>
public async Task<RecsAndExtra> GetRecsAndExtra(string videoId, ILogger log)
{
    var watchPage = await GetVideoWatchPageHtmlAsync(videoId, log);
    var (doc, rawHtml, pageUrl) = watchPage;
    var infoDic = await GetVideoInfoDicAsync(videoId, log);
    var video = GetVideo(videoId, infoDic, watchPage);

    var extra = new VideoExtraStored2
    {
        VideoId = videoId,
        Updated = DateTime.UtcNow,
        ChannelId = video?.ChannelId,
        ChannelTitle = video?.ChannelTitle,
        Description = video?.Description,
        Duration = video?.Duration,
        Keywords = video?.Keywords,
        Title = video?.Title,
        UploadDate = video?.UploadDate,
        AddedDate = video?.AddedDate,
        Statistics = video?.Statistics,
        Source = ScrapeSource.Web
    };

    // 1. Player error reported inside the ytInitialPlayerResponse client object.
    // NOTE(review): "status" is read at the top level of the object while the error details are read
    // from under playabilityStatus - confirm this is the intended field.
    var playerResponse = GetClientObjectFromWatchPage(doc, "ytInitialPlayerResponse");
    var playerNotOk = playerResponse != null && playerResponse.Value<string>("status") != "OK";
    if (playerNotOk)
    {
        var errorRenderer = playerResponse.SelectToken("playabilityStatus.errorScreen.playerErrorMessageRenderer");
        var reason = errorRenderer?.SelectToken("reason.simpleText");
        // the sub-reason is either a simple text or the first text run
        var subReason = errorRenderer?.SelectToken("subreason.simpleText")
                        ?? errorRenderer?.SelectToken("subreason.runs[0].text");
        extra.Error = reason?.Value<string>();
        extra.SubError = subReason?.Value<string>();
    }

    // 2. Age restriction declared in the page's meta tags.
    if (extra.Error == null)
    {
        var ageMeta = doc.QueryElements("head > meta[property=\"og:restrictions:age\"]").FirstOrDefault();
        var ageLimit = ageMeta?.GetAttribute("content")?.Value;
        if (ageLimit == "18+")
        {
            extra.Error = RestrictedVideoError;
            extra.SubError = "Unable to find recommended video because it is age restricted and requires to log in";
        }
    }

    // 3. "Unavailable" elements on the page.
    if (extra.Error == null)
    {
        var subMessage = doc.QueryElements("#unavailable-submessage").FirstOrDefault()?.GetInnerText();
        extra.SubError = subMessage == "" ? null : subMessage;

        // all pages have the error, but not a sub-error
        if (extra.SubError.HasValue())
        {
            extra.Error = doc.QueryElements("#unavailable-message").FirstOrDefault()?.GetInnerText();
        }
    }

    if (extra.Error != null)
    {
        return new RecsAndExtra(extra, new Rec[0]);
    }

    var (recs, recEx) = Def.New(() => GetRecs2(doc)).Try();
    var recsFound = recs?.Any() == true && recEx == null;
    if (!recsFound)
    {
        // Keep a copy of the page so the failure can be looked at later.
        var pageUri = new Uri(pageUrl);
        var day = DateTime.UtcNow.ToString("yyyy-MM-dd");
        var htmlPath = StringPath.Relative(day, $"{pageUri.PathAndQuery}.html");
        var savedUrl = LogStore.Url(htmlPath);
        await LogStore.Save(htmlPath, rawHtml.AsStream(), log);
        log.Warning("WebScraper - Unable to find recs at ({Url}). error: {Error}", savedUrl, recEx?.ToString());
    }

    // NOTE(review): recs is passed through as-is even when the warning branch ran - confirm callers handle that.
    return new RecsAndExtra(extra, recs);
}
//ytInitialPlayerResponse.responseContext.serviceTrackingParams.filter(p => p.service == "CSI")[0].params
/// <summary>
/// Loads the watch page and info dictionary for a video, builds its <c>VideoExtraStored2</c> record
/// (including thumbnail and ad flag), and extracts the recommendations from the watch page.
/// </summary>
/// <remarks>
/// An age-restriction meta tag, or a non-empty <c>#unavailable-submessage</c> element, marks the video as errored.
/// When an error is found the result carries an empty rec array.
/// When no recs are found, or extracting them threw, the raw html is saved to <c>LogStore</c> and a warning is logged.
/// </remarks>
/// <param name="videoId">Id of the video to scrape.</param>
/// <param name="log">Logger passed to the page/info loaders and used for warnings.</param>
/// <returns>The extra record together with the recs that were found.</returns>
public async Task<RecsAndExtra> GetRecsAndExtra(string videoId, ILogger log)
{
    var watchPage = await GetVideoWatchPageHtmlAsync(videoId, log);
    var (doc, rawHtml, pageUrl) = watchPage;
    var infoDic = await GetVideoInfoDicAsync(videoId, log);
    var video = GetVideo(videoId, infoDic, watchPage);

    // NOTE(review): video is dereferenced without a null check here - presumably GetVideo never returns null; confirm.
    var extra = new VideoExtraStored2
    {
        VideoId = videoId,
        Updated = DateTime.UtcNow,
        ChannelId = video.ChannelId,
        ChannelTitle = video.ChannelTitle,
        Description = video.Description,
        Duration = video.Duration,
        Keywords = video.Keywords,
        Title = video.Title,
        UploadDate = video.UploadDate.UtcDateTime,
        Statistics = video.Statistics,
        Source = ScrapeSource.Web,
        Thumbnail = VideoThumbnail.FromVideoId(videoId)
    };

    var ageMeta = doc.QueryElements("head > meta[property=\"og:restrictions:age\"]").FirstOrDefault();
    var ageLimit = ageMeta?.GetAttribute("content")?.Value;
    var isRestricted = ageLimit == "18+";

    if (!isRestricted)
    {
        var subMessage = doc.QueryElements("#unavailable-submessage").FirstOrDefault()?.GetInnerText();
        extra.SubError = subMessage == "" ? null : subMessage;

        // all pages have the error, but not a sub-error
        if (extra.SubError.HasValue())
        {
            extra.Error = doc.QueryElements("#unavailable-message").FirstOrDefault()?.GetInnerText();
        }
    }
    else
    {
        extra.Error = RestrictedVideoError;
        extra.SubError = "Unable to find recommended video because it is age restricted and requires to log in";
    }

    if (extra.Error != null)
    {
        return new RecsAndExtra(extra, new Rec[0]);
    }

    var (recs, recEx) = Def.New(() => GetRecs2(doc)).Try();
    var recsFound = recs?.Any() == true && recEx == null;
    if (!recsFound)
    {
        // Keep a copy of the page so the failure can be looked at later.
        var pageUri = new Uri(pageUrl);
        var day = DateTime.UtcNow.ToString("yyyy-MM-dd");
        var htmlPath = StringPath.Relative(day, $"{pageUri.PathAndQuery}.html");
        var savedUrl = LogStore.Url(htmlPath);
        await LogStore.Save(htmlPath, rawHtml.AsStream(), log);
        log.Warning("WebScraper - Unable to find recs at ({Url}). error: {Error}", savedUrl, recEx?.ToString());
    }

    // The ad flag is set when the regex matches the raw html and its first group is "1".
    var adMatch = _ytAdRegex.Match(rawHtml);
    var adFlag = adMatch.Success ? adMatch.Groups[1].Value : null;
    extra.HasAd = adFlag == "1";

    return new RecsAndExtra(extra, recs);
}