/// <summary>
///   Loads every file in <paramref name="toOptimise"/>, decompresses each one and re-compresses the content into a
///   single gzip file that is saved to the store. The source files are deleted once the joined file has been saved.
/// </summary>
/// <param name="store">The store the files are loaded from, saved to and deleted from</param>
/// <param name="toOptimise">The files to join. The Ts of the last one is used to name the joined file</param>
/// <param name="destPath">Passed to FilePath to build the joined file's path</param>
/// <param name="parallel">Passed to BlockFunc/BlockAction when loading and deleting the source files</param>
/// <param name="log">Receives debug messages about the files loaded and deleted</param>
/// <returns>The path of the joined file</returns>
static async Task<StringPath> JoinFiles(ISimpleFileStore store, IReadOnlyCollection<StoreFileMd> toOptimise, StringPath destPath, int parallel, ILogger log)
{
    var optimisedFile = FilePath(destPath, toOptimise.Last().Ts);

    using (var joined = new MemoryStream())
    {
        // leaveOpen: the buffer must outlive the zip writer, which is closed before the buffer is saved
        using (var zipWriter = new GZipStream(joined, CompressionLevel.Optimal, leaveOpen: true))
        {
            var sources = await toOptimise.BlockFunc(async file =>
            {
                var loaded = await store.Load(file.Path, log).WithDuration();
                log.Debug("Optimise {Path} - loaded file {SourceFile} to be optimised in {Duration}",
                    destPath, file.Path, loaded.Duration.HumanizeShort());
                return loaded.Result;
            }, parallel);

            foreach (var source in sources)
            {
                // not leaving the source open, so disposing the decompressor closes the loaded stream too
                using (var unzipper = new GZipStream(source, CompressionMode.Decompress, leaveOpen: false))
                {
                    await unzipper.CopyToAsync(zipWriter);
                }
            }
        }

        joined.Seek(0, SeekOrigin.Begin);
        await store.Save(optimisedFile, joined);
    }

    // when in-place, this is dirty if we fail now. There is no transaction capability in cloud storage, so downstream process must handle duplicates
    // successfully staged files, delete from land. Incremental using TS will work without delete, but it's more efficient to delete process landed files.
    await toOptimise
        .BlockAction(f => store.Delete(f.Path), parallel)
        .WithWrappedException(e => "Failed to delete optimised files. Duplicate records need to be handled downstream");

    log.Debug("Optimise {Path} - deleted {Files} that were optimised into {OptimisedFile}", destPath, toOptimise.Count, optimisedFile);
    return optimisedFile;
}
/// <summary>Writes <paramref name="item"/> as json (gzip-compressed when <paramref name="zip"/> is true) and saves it to the store</summary>
/// <param name="store">The store to save to</param>
/// <param name="path">The path to the object (no extensions); the extension is added by AddJsonExtention</param>
/// <param name="item">The object to serialize</param>
/// <param name="zip">When true the json is written through a gzip stream</param>
/// <param name="log">Passed through to the store's Save</param>
/// <param name="jCfg">Serializer settings; when null JsonExtensions.DefaultSerializer is used</param>
public static async Task Set<T>(this ISimpleFileStore store, StringPath path, T item, bool zip = true, ILogger log = default, JsonSerializerSettings jCfg = default)
{
    await using var buffer = new MemoryStream();
    var serializer = jCfg == null ? JsonExtensions.DefaultSerializer : JsonSerializer.Create(jCfg);

    if (!zip)
    {
        // leaveOpen: the buffer is read again by Save after the writer has been flushed and closed
        await using (var textWriter = new StreamWriter(buffer, Encoding.UTF8, leaveOpen: true))
        {
            serializer.Serialize(new JsonTextWriter(textWriter), item);
        }
    }
    else
    {
        // the text writer is disposed first, then the zip writer, so everything is flushed into the buffer
        await using (var zipWriter = new GZipStream(buffer, CompressionLevel.Optimal, true))
        await using (var textWriter = new StreamWriter(zipWriter, Encoding.UTF8))
        {
            serializer.Serialize(new JsonTextWriter(textWriter), item);
        }
    }

    var fullPath = path.AddJsonExtention(zip);
    buffer.Seek(0, SeekOrigin.Begin);
    await store.Save(fullPath, buffer, log);
}
/// <summary>
///   Saves html that could not be parsed to the log store under a folder named after today's UTC date,
///   then logs a warning that includes the url of the saved file.
/// </summary>
/// <param name="msg">Description of the parse failure, included in the warning</param>
/// <param name="ex">The exception to attach to the warning (may be null)</param>
/// <param name="videoId">Used for the saved file's name and included in the warning</param>
/// <param name="rawHtml">The html to save</param>
/// <param name="log">Receives the warning and is passed to the store's Save</param>
async Task LogParseError(string msg, Exception ex, string videoId, string rawHtml, ILogger log)
{
    // invariant culture: the folder name must not change with the host's culture/calendar
    var day = DateTime.UtcNow.ToString("yyyy-MM-dd", System.Globalization.CultureInfo.InvariantCulture);
    var path = StringPath.Relative(day, $"{videoId}.html");
    var logUrl = LogStore.Url(path);

    // dispose the stream once the save has completed
    using (var htmlStream = rawHtml.AsStream())
    {
        await LogStore.Save(path, htmlStream, log);
    }

    log.Warning(ex, "WebScraper - {VideoId} - saved html that we could not parse '{msg}' ({Url}). error: {Error}",
        videoId, msg, logUrl, ex?.ToString());
}
/// <summary>
///   Saves <paramref name="items"/> to the store as a new file. The file's path is built by StoreFileMd.FilePath
///   from Path, the maximum GetTs value of the items and Version. The content is produced by ToJsonlGzStream.
/// </summary>
/// <param name="items">The items to save. Nothing is written when the collection is empty</param>
public async Task Append(IReadOnlyCollection<T> items)
{
    // an empty collection has no max timestamp to name the file with, and there is nothing to store
    if (items.Count == 0)
    {
        return;
    }

    var ts = items.Max(GetTs);
    var path = StoreFileMd.FilePath(Path, ts, Version);
    await using var memStream = items.ToJsonlGzStream();
    var res = await Store.Save(path, memStream).WithDuration();
    Log.Debug("Store - Saved '{Path}' in {Duration}", path, res);
}
/// <summary>
///   Saves the upgraded json tokens to <paramref name="newPath"/> and then deletes the file they replace.
/// </summary>
/// <param name="f">The file being replaced; its Path is deleted after the new file is saved</param>
/// <param name="newPath">Where the upgraded content is saved</param>
/// <param name="upgradedJs">The upgraded content, written with ToJsonlGzStream</param>
/// <exception cref="InvalidOperationException">The store reported that the old file was not deleted</exception>
async Task ReplaceJsonLFile(StoreFileMd f, StringPath newPath, IEnumerable<JToken> upgradedJs)
{
    await using var upgradedStream = upgradedJs.ToJsonlGzStream();
    await Store.Save(newPath, upgradedStream);

    // the new file is saved before the old one is removed
    var oldPath = f.Path;
    if (await Store.Delete(oldPath))
    {
        Log.Information("Upgraded {OldFile} to {File}", oldPath, newPath);
        return;
    }

    throw new InvalidOperationException($"Didn't delete old file {oldPath}");
}
/// <summary>
///   Reads each file listed under "import/watch_time" whose extensions string is "csv" and saves its rows next to it as
///   "{name}.json.gz" (content produced by ToJsonlGzStream). Up to 4 files are processed at a time.
/// </summary>
/// <param name="log">Receives a warning for each bad csv record and is passed to the store's Save</param>
public async Task Convert(ILogger log)
{
    var files = (await Store.List("import/watch_time").SelectManyList())
        .Where(f => f.Path.ExtensionsString == "csv");

    await files.BlockAction(async f =>
    {
        using var stream = await Store.Load(f.Path);
        using var sr = new StreamReader(stream);
        using var csv = new CsvReader(sr, CultureInfo.InvariantCulture)
        {
            Configuration =
            {
                Encoding = Encoding.UTF8,
                HasHeaderRecord = true,
                MissingFieldFound = null,
                BadDataFound = r => log.Warning("Error reading csv data at {RowNumber}: {RowData}", r.Row, r.RawRecord)
            }
        };

        var rows = await csv.GetRecordsAsync<dynamic>().ToListAsync();

        // dispose the converted stream once it has been saved, like the input stream and readers above
        using var converted = await rows.ToJsonlGzStream();
        await Store.Save(f.Path.Parent.Add($"{f.Path.NameSansExtension}.json.gz"), converted, log);
    }, parallelism: 4);
}
//ytInitialPlayerResponse.responseContext.serviceTrackingParams.filter(p => p.service == "CSI")[0].params
/// <summary>
///   Gets the watch page and info dictionary for a video and builds a <see cref="VideoExtraStored2"/> from them together
///   with the recs found in the page html.
///   Errors are looked for in this order, stopping at the first that sets Error: the "ytInitialPlayerResponse" client
///   object (when its status isn't "OK"), the "og:restrictions:age" meta tag, then the page's unavailable message elements.
///   When an error is found the result has no recs. When no recs are found (or finding them threw), the raw html is saved
///   to the log store and a warning is logged; the result is still returned.
/// </summary>
/// <param name="videoId">The video to get recs and extra data for</param>
/// <param name="log">Receives warnings and is passed to the page/info requests and the log store</param>
public async Task<RecsAndExtra> GetRecsAndExtra(string videoId, ILogger log)
{
    var watchPage = await GetVideoWatchPageHtmlAsync(videoId, log);
    var (html, raw, url) = watchPage;
    var infoDic = await GetVideoInfoDicAsync(videoId, log);
    var videoItem = GetVideo(videoId, infoDic, watchPage);

    // videoItem may be null; the extra then only has the id, updated time and source
    var extra = new VideoExtraStored2
    {
        VideoId = videoId,
        Updated = DateTime.UtcNow,
        ChannelId = videoItem?.ChannelId,
        ChannelTitle = videoItem?.ChannelTitle,
        Description = videoItem?.Description,
        Duration = videoItem?.Duration,
        Keywords = videoItem?.Keywords,
        Title = videoItem?.Title,
        UploadDate = videoItem?.UploadDate,
        AddedDate = videoItem?.AddedDate,
        Statistics = videoItem?.Statistics,
        Source = ScrapeSource.Web
    };

    var ytInitPr = GetClientObjectFromWatchPage(html, "ytInitialPlayerResponse");
    if (ytInitPr != null && ytInitPr.Value<string>("status") != "OK")
    {
        var playerError = ytInitPr.SelectToken("playabilityStatus.errorScreen.playerErrorMessageRenderer");
        extra.Error = playerError?.SelectToken("reason.simpleText")?.Value<string>();
        // the sub-reason is either a simple text or the first of a list of runs
        extra.SubError = (playerError?.SelectToken("subreason.simpleText") ??
                          playerError?.SelectToken("subreason.runs[0].text"))
            ?.Value<string>();
    }

    if (extra.Error == null)
    {
        var restrictedMode = html.QueryElements("head > meta[property=\"og:restrictions:age\"]").FirstOrDefault()?.GetAttribute("content")?.Value == "18+";
        if (restrictedMode)
        {
            extra.Error = RestrictedVideoError;
            extra.SubError = "Unable to find recommended video because it is age restricted and requires to log in";
        }
    }

    if (extra.Error == null)
    {
        extra.SubError = html.QueryElements("#unavailable-submessage").FirstOrDefault()?.GetInnerText();
        if (extra.SubError == "")
        {
            extra.SubError = null;
        }

        if (extra.SubError.HasValue()) // all pages have the error, but not a sub-error
        {
            extra.Error = html.QueryElements("#unavailable-message").FirstOrDefault()?.GetInnerText();
        }
    }

    if (extra.Error != null)
    {
        return new RecsAndExtra(extra, new Rec[] { });
    }

    var (recs, recEx) = Def.New(() => GetRecs2(html)).Try();
    if (recs?.Any() != true || recEx != null)
    {
        var uri = new Uri(url);
        // invariant culture: the folder name must not change with the host's culture/calendar
        var day = DateTime.UtcNow.ToString("yyyy-MM-dd", System.Globalization.CultureInfo.InvariantCulture);
        var path = StringPath.Relative(day, $"{uri.PathAndQuery}.html");
        var logUrl = LogStore.Url(path);

        // dispose the stream once the save has completed
        using (var rawStream = raw.AsStream())
        {
            await LogStore.Save(path, rawStream, log);
        }

        log.Warning("WebScraper - Unable to find recs at ({Url}). error: {Error}", logUrl, recEx?.ToString());
    }

    return new RecsAndExtra(extra, recs);
}
/// <summary>
///   Saves <paramref name="tempFile"/> to the store twice, concurrently: under "{Version}/{today's UTC date}" and under
///   "{Version}/latest", both using <paramref name="fileName"/> as the file name.
/// </summary>
/// <param name="fileName">Name of the file inside both directories</param>
/// <param name="tempFile">The local file to save</param>
async Task SaveToLatestAndDateDirs(string fileName, FPath tempFile)
{
    // invariant culture: the directory name must not change with the host's culture/calendar
    var day = DateTime.UtcNow.ToString("yyyy-MM-dd", System.Globalization.CultureInfo.InvariantCulture);

    await Task.WhenAll(
        Store.Save(StringPath.Relative(Version, day).Add(fileName), tempFile),
        Store.Save(StringPath.Relative(Version, "latest").Add(fileName), tempFile)
    );
}
/// <summary>Saves the values of <paramref name="uniqPeriods"/> to "periods.jsonl.gz" under <paramref name="path"/></summary>
/// <param name="path">The directory path the file is added to</param>
/// <param name="uniqPeriods">Only the values are written (via ToJsonlGzStream); the keys are not used here</param>
/// <param name="log">Passed through to the store's Save</param>
async Task SavePeriods(StringPath path, IDictionary<string, JObject> uniqPeriods, ILogger log)
{
    // dispose the stream once the save has completed
    await using var stream = await uniqPeriods.Values.ToJsonlGzStream();
    await Store.Save(path.Add("periods.jsonl.gz"), stream, log);
}
//ytInitialPlayerResponse.responseContext.serviceTrackingParams.filter(p => p.service == "CSI")[0].params
/// <summary>
///   Gets the watch page and info dictionary for a video and builds a <see cref="VideoExtraStored2"/> from them together
///   with the recs found in the page html.
///   An error is recorded when the "og:restrictions:age" meta tag is "18+", or otherwise when the page has a
///   non-empty unavailable sub-message. When an error is found the result has no recs.
///   When no recs are found (or finding them threw), the raw html is saved to the log store and a warning is logged;
///   the result is still returned. HasAd is set from matching _ytAdRegex against the raw html.
/// </summary>
/// <param name="videoId">The video to get recs and extra data for</param>
/// <param name="log">Receives warnings and is passed to the page/info requests and the log store</param>
public async Task<RecsAndExtra> GetRecsAndExtra(string videoId, ILogger log)
{
    var watchPage = await GetVideoWatchPageHtmlAsync(videoId, log);
    var (html, raw, url) = watchPage;
    var infoDic = await GetVideoInfoDicAsync(videoId, log);
    var videoItem = GetVideo(videoId, infoDic, watchPage);

    var extra = new VideoExtraStored2
    {
        VideoId = videoId,
        Updated = DateTime.UtcNow,
        ChannelId = videoItem.ChannelId,
        ChannelTitle = videoItem.ChannelTitle,
        Description = videoItem.Description,
        Duration = videoItem.Duration,
        Keywords = videoItem.Keywords,
        Title = videoItem.Title,
        UploadDate = videoItem.UploadDate.UtcDateTime,
        Statistics = videoItem.Statistics,
        Source = ScrapeSource.Web,
        Thumbnail = VideoThumbnail.FromVideoId(videoId)
    };

    var restrictedMode = html.QueryElements("head > meta[property=\"og:restrictions:age\"]").FirstOrDefault()?.GetAttribute("content")?.Value == "18+";
    if (restrictedMode)
    {
        extra.Error = RestrictedVideoError;
        extra.SubError = "Unable to find recommended video because it is age restricted and requires to log in";
    }
    else
    {
        extra.SubError = html.QueryElements("#unavailable-submessage").FirstOrDefault()?.GetInnerText();
        if (extra.SubError == "")
        {
            extra.SubError = null;
        }

        if (extra.SubError.HasValue()) // all pages have the error, but not a sub-error
        {
            extra.Error = html.QueryElements("#unavailable-message").FirstOrDefault()?.GetInnerText();
        }
    }

    if (extra.Error != null)
    {
        return new RecsAndExtra(extra, new Rec[] { });
    }

    var (recs, recEx) = Def.New(() => GetRecs2(html)).Try();
    if (recs?.Any() != true || recEx != null)
    {
        var uri = new Uri(url);
        // invariant culture: the folder name must not change with the host's culture/calendar
        var day = DateTime.UtcNow.ToString("yyyy-MM-dd", System.Globalization.CultureInfo.InvariantCulture);
        var path = StringPath.Relative(day, $"{uri.PathAndQuery}.html");
        var logUrl = LogStore.Url(path);

        // dispose the stream once the save has completed
        using (var rawStream = raw.AsStream())
        {
            await LogStore.Save(path, rawStream, log);
        }

        // the message template is kept on a single line: a regular string literal can't contain a line break
        log.Warning("WebScraper - Unable to find recs at ({Url}). error: {Error}", logUrl, recEx?.ToString());
    }

    var match = _ytAdRegex.Match(raw);
    extra.HasAd = match.Success && match.Groups[1].Value == "1";
    return new RecsAndExtra(extra, recs);
}