/// <summary>
///   Loads every file in <paramref name="toOptimise" />, decompresses each one and re-compresses their concatenated content
///   into a single gzip file saved in <paramref name="store" />, then deletes the source files.
/// </summary>
/// <param name="store">Store the source files are loaded from, the joined file is saved to, and the sources are deleted from</param>
/// <param name="toOptimise">Files to join. The Ts of the last file is used to build the joined file's path</param>
/// <param name="destPath">Path passed to FilePath to build the joined file's path</param>
/// <param name="parallel">Passed to BlockFunc/BlockAction for the loads and deletes (presumably the degree of parallelism - confirm)</param>
/// <param name="log">Logger for progress messages</param>
/// <returns>The path of the joined file</returns>
static async Task <StringPath> JoinFiles(ISimpleFileStore store, IReadOnlyCollection <StoreFileMd> toOptimise, StringPath destPath, int parallel,
                                                 ILogger log)
        {
            var targetFile = FilePath(destPath, toOptimise.Last().Ts);

            using (var buffer = new MemoryStream())
            {
                // leaveOpen: true - the compressor must be disposed (to finish the gzip data) before the buffer is read back
                using (var compressor = new GZipStream(buffer, CompressionLevel.Optimal, leaveOpen: true))
                {
                    var sources = await toOptimise.BlockFunc(async file => {
                        var loaded = await store.Load(file.Path, log).WithDuration();
                        log.Debug("Optimise {Path} - loaded file {SourceFile} to be optimised in {Duration}",
                                  destPath, file.Path, loaded.Duration.HumanizeShort());
                        return loaded.Result;
                    }, parallel);

                    foreach (var source in sources)
                    {
                        // leaveOpen: false - disposing the decompressor also disposes the loaded source stream
                        using (var decompressor = new GZipStream(source, CompressionMode.Decompress, leaveOpen: false))
                        {
                            await decompressor.CopyToAsync(compressor);
                        }
                    }
                }

                buffer.Position = 0;
                await store.Save(targetFile, buffer);
            }

            // when in-place, this is dirty if we fail now. There is no transaction capability in cloud storage, so downstream process must handle duplicates
            // successfully staged files, delete from land. Incremental using TS will work without delete, but it's more efficient to delete process landed files.
            await toOptimise
                  .BlockAction(file => store.Delete(file.Path), parallel)
                  .WithWrappedException(e => "Failed to delete optimised files. Duplicate records need to be handled downstream");

            log.Debug("Optimise {Path} - deleted {Files} that were optimised into {OptimisedFile}",
                      destPath, toOptimise.Count, targetFile);

            return targetFile;
        }
Exemple #2
0
        /// <summary>Serializes item to JSON in memory (gzip-compressed when <paramref name="zip" /> is true) and saves it into the object store</summary>
        /// <param name="store">The store to save into</param>
        /// <param name="path">The path to the object (no extensions). The extension is added by AddJsonExtention</param>
        /// <param name="item">The object to serialize</param>
        /// <param name="zip">When true the JSON is written through a gzip stream</param>
        /// <param name="log">Logger passed through to the store's Save</param>
        /// <param name="jCfg">Serializer settings. When null, JsonExtensions.DefaultSerializer is used</param>
        public static async Task Set <T>(this ISimpleFileStore store, StringPath path, T item, bool zip = true, ILogger log = default,
                                         JsonSerializerSettings jCfg = default)
        {
            await using var buffer = new MemoryStream();

            var serializer = jCfg == null ? JsonExtensions.DefaultSerializer : JsonSerializer.Create(jCfg);

            if (!zip)
            {
                // leaveOpen so the buffer can be rewound and saved once the writer has flushed
                await using (var plainWriter = new StreamWriter(buffer, Encoding.UTF8, leaveOpen: true))
                {
                    serializer.Serialize(new JsonTextWriter(plainWriter), item);
                }
            }
            else
            {
                // disposed in reverse order: the text writer flushes first, then the gzip stream finishes (leaving the buffer open)
                await using (var gzip = new GZipStream(buffer, CompressionLevel.Optimal, true))
                await using (var zippedWriter = new StreamWriter(gzip, Encoding.UTF8))
                {
                    serializer.Serialize(new JsonTextWriter(zippedWriter), item);
                }
            }

            var fullPath = path.AddJsonExtention(zip);

            buffer.Position = 0;

            await store.Save(fullPath, buffer, log);
        }
        /// <summary>Saves the raw html of a page that could not be parsed to the log store, then logs a warning that includes the saved file's url</summary>
        /// <param name="msg">Description of the parse problem, included in the warning</param>
        /// <param name="ex">The parse exception (may be null)</param>
        /// <param name="videoId">Video the html belongs to. Used as the saved file's name</param>
        /// <param name="rawHtml">The html to save</param>
        /// <param name="log">Logger used for the save and the warning</param>
        async Task LogParseError(string msg, Exception ex, string videoId, string rawHtml, ILogger log)
        {
            // files are grouped into a folder per UTC day
            var dayFolder = DateTime.UtcNow.ToString("yyyy-MM-dd");
            var htmlPath  = StringPath.Relative(dayFolder, $"{videoId}.html");
            var savedUrl  = LogStore.Url(htmlPath);

            await LogStore.Save(htmlPath, rawHtml.AsStream(), log);

            var errorText = ex?.ToString();
            log.Warning(ex, "WebScraper - {VideoId} - saved html that we could not parse '{msg}' ({Url}). error: {Error}",
                        videoId, msg, savedUrl, errorText);
        }
        /// <summary>
        ///   Saves <paramref name="items" /> as one gzipped jsonl file in the store. The file path is built from Path, the maximum
        ///   GetTs value across the items, and Version.
        /// </summary>
        /// <param name="items">The items to save. An empty collection is a no-op</param>
        public async Task Append(IReadOnlyCollection <T> items)
        {
            // Nothing to append. Max over an empty collection would either throw or produce a null timestamp,
            // and a file should not be written in either case. A null collection still fails in Max below, as before.
            if (items?.Count == 0)
            {
                return;
            }

            var ts   = items.Max(GetTs);
            var path = StoreFileMd.FilePath(Path, ts, Version);

            await using var memStream = items.ToJsonlGzStream();
            var res = await Store.Save(path, memStream).WithDuration();

            Log.Debug("Store - Saved '{Path}' in {Duration}", path, res);
        }
        /// <summary>Saves the upgraded json rows as a gzipped jsonl file at <paramref name="newPath" />, then deletes the old file</summary>
        /// <param name="f">The old file. Deleted after the new one has been saved</param>
        /// <param name="newPath">Where the upgraded file is saved</param>
        /// <param name="upgradedJs">The upgraded rows to write</param>
        /// <exception cref="InvalidOperationException">The store reported that the old file was not deleted</exception>
        async Task ReplaceJsonLFile(StoreFileMd f, StringPath newPath, IEnumerable <JToken> upgradedJs)
        {
            await using var upgradedStream = upgradedJs.ToJsonlGzStream();
            await Store.Save(newPath, upgradedStream);

            // the new file is written before the old one is removed, so a failure here leaves both in the store
            var oldFileRemoved = await Store.Delete(f.Path);
            if (oldFileRemoved)
            {
                Log.Information("Upgraded {OldFile} to {File}", f.Path, newPath);
                return;
            }

            throw new InvalidOperationException($"Didn't delete old file {f.Path}");
        }
 /// <summary>
 ///   Converts each csv file listed under "import/watch_time" into a gzipped jsonl file saved next to it
 ///   (same parent and name, with a ".json.gz" extension).
 /// </summary>
 /// <param name="log">Logger used for bad csv data warnings and passed to the store's Save</param>
 public async Task Convert(ILogger log)
 {
     var files = (await Store.List("import/watch_time").SelectManyList()).Where(f => f.Path.ExtensionsString == "csv");
     await files.BlockAction(async f => {
         using var stream = await Store.Load(f.Path);
         using var sr     = new StreamReader(stream);
         using var csv    = new CsvReader(sr, CultureInfo.InvariantCulture)
               {
                   Configuration =
                   {
                       Encoding          = Encoding.UTF8,
                       HasHeaderRecord   = true,
                       MissingFieldFound = null,
                       // bad rows are logged rather than failing the whole file
                       BadDataFound      = r => log.Warning("Error reading csv data at {RowNumber}: {RowData}", r.Row, r.RawRecord)
                   }
               };
         var rows = await csv.GetRecordsAsync <dynamic>().ToListAsync();

         // hold the output stream in a using so it is disposed once saved (it was previously passed inline and never disposed)
         using var jsonlStream = await rows.ToJsonlGzStream();
         await Store.Save(f.Path.Parent.Add($"{f.Path.NameSansExtension}.json.gz"), jsonlStream, log);
     }, parallelism : 4);
 }
Exemple #7
0
        //ytInitialPlayerResponse.responseContext.serviceTrackingParams.filter(p => p.service == "CSI")[0].params
        /// <summary>
        ///   Loads the watch page and video info for <paramref name="videoId" /> and returns the video's extra details together
        ///   with the recommendations found on the page. When the page reports an error (player status, age restriction or an
        ///   "unavailable" message) the extra is returned with Error/SubError set and an empty set of recs.
        /// </summary>
        /// <param name="videoId">The video to scrape</param>
        /// <param name="log">Logger used for the page requests and warnings</param>
        /// <returns>The extra details and recs. The recs are never null - an empty array is returned when none could be read</returns>
        public async Task <RecsAndExtra> GetRecsAndExtra(string videoId, ILogger log)
        {
            var watchPage = await GetVideoWatchPageHtmlAsync(videoId, log);

            var(html, raw, url) = watchPage;
            var infoDic = await GetVideoInfoDicAsync(videoId, log);

            var videoItem = GetVideo(videoId, infoDic, watchPage);

            // videoItem may be null, in which case only the id, timestamp and source are populated
            var extra = new VideoExtraStored2 {
                VideoId      = videoId,
                Updated      = DateTime.UtcNow,
                ChannelId    = videoItem?.ChannelId,
                ChannelTitle = videoItem?.ChannelTitle,
                Description  = videoItem?.Description,
                Duration     = videoItem?.Duration,
                Keywords     = videoItem?.Keywords,
                Title        = videoItem?.Title,
                UploadDate   = videoItem?.UploadDate,
                AddedDate    = videoItem?.AddedDate,
                Statistics   = videoItem?.Statistics,
                Source       = ScrapeSource.Web
            };

            // 1st error source: the player response embedded in the page
            var ytInitPr = GetClientObjectFromWatchPage(html, "ytInitialPlayerResponse");

            if (ytInitPr != null && ytInitPr.Value <string>("status") != "OK")
            {
                var playerError = ytInitPr.SelectToken("playabilityStatus.errorScreen.playerErrorMessageRenderer");
                extra.Error    = playerError?.SelectToken("reason.simpleText")?.Value <string>();
                extra.SubError = (playerError?.SelectToken("subreason.simpleText") ??
                                  playerError?.SelectToken("subreason.runs[0].text"))
                                 ?.Value <string>();
            }

            // 2nd error source: the age restriction meta tag
            if (extra.Error == null)
            {
                var restrictedMode = html.QueryElements("head > meta[property=\"og:restrictions:age\"]").FirstOrDefault()?.GetAttribute("content")?.Value == "18+";
                if (restrictedMode)
                {
                    extra.Error    = RestrictedVideoError;
                    extra.SubError = "Unable to find recommended video because it is age restricted and requires to log in";
                }
            }

            // 3rd error source: the "unavailable" message elements
            if (extra.Error == null)
            {
                extra.SubError = html.QueryElements("#unavailable-submessage").FirstOrDefault()?.GetInnerText();
                if (extra.SubError == "")
                {
                    extra.SubError = null;
                }
                if (extra.SubError.HasValue()) // all pages have the error, but not a sub-error
                {
                    extra.Error = html.QueryElements("#unavailable-message").FirstOrDefault()?.GetInnerText();
                }
            }
            if (extra.Error != null)
            {
                return(new RecsAndExtra(extra, new Rec[] { }));
            }

            // recEx is checked alongside recs below, so Try() appears to capture a GetRecs2 failure rather than throw - recs is then null
            var(recs, recEx) = Def.New(() => GetRecs2(html)).Try();
            if (recs?.Any() != true || recEx != null)
            {
                // keep the raw html so the failure can be diagnosed later
                var uri    = new Uri(url);
                var path   = StringPath.Relative(DateTime.UtcNow.ToString("yyyy-MM-dd"), $"{uri.PathAndQuery}.html");
                var logUrl = LogStore.Url(path);
                await LogStore.Save(path, raw.AsStream(), log);

                log.Warning("WebScraper - Unable to find recs at ({Url}). error: {Error}", logUrl, recEx?.ToString());
            }

            // never hand callers a null rec collection: use the same empty array the error path returns
            if (recs == null)
            {
                return(new RecsAndExtra(extra, new Rec[] { }));
            }

            return(new RecsAndExtra(extra, recs));
        }
 /// <summary>
 ///   Saves <paramref name="tempFile" /> to the store twice, concurrently: once under Version/{UTC date as yyyy-MM-dd}
 ///   and once under Version/latest, both with the name <paramref name="fileName" />.
 /// </summary>
 async Task SaveToLatestAndDateDirs(string fileName, FPath tempFile)
 {
   var datedPath = StringPath.Relative(Version, DateTime.UtcNow.ToString("yyyy-MM-dd")).Add(fileName);
   var datedSave = Store.Save(datedPath, tempFile);

   var latestPath = StringPath.Relative(Version, "latest").Add(fileName);
   var latestSave = Store.Save(latestPath, tempFile);

   // both saves have already been started. Wait for the pair to finish
   await Task.WhenAll(datedSave, latestSave);
 }
Exemple #9
0
        /// <summary>Saves the values of <paramref name="uniqPeriods" /> as "periods.jsonl.gz" under <paramref name="path" /></summary>
        /// <param name="path">Directory path the file is added to</param>
        /// <param name="uniqPeriods">The periods to save. Only the dictionary's values are written</param>
        /// <param name="log">Logger passed to the store's Save</param>
        async Task SavePeriods(StringPath path, IDictionary <string, JObject> uniqPeriods, ILogger log)
        {
            // dispose the serialized stream once it has been saved (it was previously never disposed)
            using (var stream = await uniqPeriods.Values.ToJsonlGzStream())
            {
                await Store.Save(path.Add("periods.jsonl.gz"), stream, log);
            }
        }
Exemple #10
0
        //ytInitialPlayerResponse.responseContext.serviceTrackingParams.filter(p => p.service == "CSI")[0].params
        /// <summary>
        ///   Loads the watch page and video info for <paramref name="videoId" /> and returns the video's extra details (including
        ///   whether the page has an ad) together with the recommendations found on the page. When the page is age restricted or
        ///   shows an "unavailable" message the extra is returned with Error/SubError set and an empty set of recs.
        /// </summary>
        /// <param name="videoId">The video to scrape</param>
        /// <param name="log">Logger used for the page requests and warnings</param>
        /// <returns>The extra details and recs. The recs are never null - an empty array is returned when none could be read</returns>
        public async Task <RecsAndExtra> GetRecsAndExtra(string videoId, ILogger log)
        {
            var watchPage = await GetVideoWatchPageHtmlAsync(videoId, log);

            var(html, raw, url) = watchPage;
            var infoDic = await GetVideoInfoDicAsync(videoId, log);

            var videoItem = GetVideo(videoId, infoDic, watchPage);

            var extra = new VideoExtraStored2 {
                VideoId      = videoId,
                Updated      = DateTime.UtcNow,
                ChannelId    = videoItem.ChannelId,
                ChannelTitle = videoItem.ChannelTitle,
                Description  = videoItem.Description,
                Duration     = videoItem.Duration,
                Keywords     = videoItem.Keywords,
                Title        = videoItem.Title,
                UploadDate   = videoItem.UploadDate.UtcDateTime,
                Statistics   = videoItem.Statistics,
                Source       = ScrapeSource.Web,
                Thumbnail    = VideoThumbnail.FromVideoId(videoId)
            };

            var restrictedMode = html.QueryElements("head > meta[property=\"og:restrictions:age\"]").FirstOrDefault()?.GetAttribute("content")?.Value == "18+";

            if (restrictedMode)
            {
                extra.Error    = RestrictedVideoError;
                extra.SubError = "Unable to find recommended video because it is age restricted and requires to log in";
            }
            else
            {
                extra.SubError = html.QueryElements("#unavailable-submessage").FirstOrDefault()?.GetInnerText();
                if (extra.SubError == "")
                {
                    extra.SubError = null;
                }
                if (extra.SubError.HasValue()) // all pages have the error, but not a sub-error
                {
                    extra.Error = html.QueryElements("#unavailable-message").FirstOrDefault()?.GetInnerText();
                }
            }
            if (extra.Error != null)
            {
                return(new RecsAndExtra(extra, new Rec[] { }));
            }


            // recEx is checked alongside recs below, so Try() appears to capture a GetRecs2 failure rather than throw - recs is then null
            var(recs, recEx) = Def.New(() => GetRecs2(html)).Try();
            if (recs?.Any() != true || recEx != null)
            {
                // keep the raw html so the failure can be diagnosed later
                var uri    = new Uri(url);
                var path   = StringPath.Relative(DateTime.UtcNow.ToString("yyyy-MM-dd"), $"{uri.PathAndQuery}.html");
                var logUrl = LogStore.Url(path);
                await LogStore.Save(path, raw.AsStream(), log);

                log.Warning("WebScraper - Unable to find recs at ({Url}). error: {Error}", logUrl, recEx?.ToString());
            }

            // the ad flag is read from the raw page: the regex's first group must be "1"
            var match = _ytAdRegex.Match(raw);

            extra.HasAd = match.Success && match.Groups[1].Value == "1";

            // never hand callers a null rec collection: use the same empty array the error path returns
            if (recs == null)
            {
                return(new RecsAndExtra(extra, new Rec[] { }));
            }

            return(new RecsAndExtra(extra, recs));
        }