internal static async Task <(string ImdbUrl, string ImdbHtml, string ParentUrl, string ParentHtml, string ReleaseUrl, string ReleaseHtml, ImdbMetadata ImdbMetadata)> DownloadAsync( string imdbId, bool useCache, string imdbFile, string parentFile, string releaseFile, IWebDriver?webDriver = null) { using WebClient? webClient = webDriver is null ? new() { Encoding = Encoding.UTF8 } : null; webClient?.AddChromeHeaders(); string imdbUrl = $"https://www.imdb.com/title/{imdbId}/"; string imdbHtml = useCache && File.Exists(imdbFile) ? await File.ReadAllTextAsync(imdbFile) : await Retry.FixedIntervalAsync(async() => webDriver is not null?await webDriver.DownloadStringAsync(imdbUrl) : await webClient !.DownloadCompressedStringAsync(imdbUrl), retryCount : 10); CQ imdbCQ = imdbHtml; string json = imdbCQ.Find(@"script[type=""application/ld+json""]").Text(); ImdbMetadata imdbMetadata = JsonSerializer.Deserialize <ImdbMetadata>( json, new JsonSerializerOptions() { PropertyNameCaseInsensitive = true }) ?? throw new InvalidOperationException(json); string parentUrl = string.Empty; string parentHtml = string.Empty; string parentHref = imdbCQ.Find(@"div.titleParent a").FirstOrDefault()?.GetAttribute("href") ?? imdbCQ.Find(@"div").FirstOrDefault(div => div.Classes.Any(@class => @class.StartsWith("TitleBlock__SeriesParentLinkWrapper", StringComparison.Ordinal)))?.Cq().Find("a").Attr("href") ?? string.Empty; if (!string.IsNullOrWhiteSpace(parentHref)) { string parentImdbId = Regex.Match(parentHref, "tt[0-9]+").Value; (parentUrl, parentHtml, _, _, _, _, imdbMetadata.Parent) = await DownloadAsync(parentImdbId, useCache, parentFile, string.Empty, releaseFile); } string htmlTitle = imdbCQ.Find(@"title").Text(); string htmlTitleYear = htmlTitle.Contains("(", StringComparison.Ordinal) ? htmlTitle[(htmlTitle.LastIndexOf("(", StringComparison.Ordinal) + 1)..htmlTitle.LastIndexOf(")", StringComparison.Ordinal)]
/// <summary>
/// For each directory enumerated under <paramref name="directory"/>, reads the IMDB id from the
/// directory's XML metadata file, downloads the IMDB JSON for that title, and saves it next to the
/// XML metadata as <c>{imdbId}.{year}.{regions}{JsonMetadataExtension}</c>.
/// </summary>
/// <param name="directory">Root directory handed to <c>EnumerateDirectories</c>.</param>
/// <param name="level">Level argument handed to <c>EnumerateDirectories</c>.</param>
/// <param name="overwrite">When false, a directory that already has a file matching <c>JsonMetadataSearchPattern</c> is skipped.</param>
/// <param name="isTV">Chooses the XML element that holds the IMDB id: <c>imdb_id</c> when true, <c>imdbid</c> otherwise.</param>
/// <param name="log">Receives progress and warning messages; <c>TraceLog</c> is used when null.</param>
/// <exception cref="InvalidOperationException">The downloaded JSON deserializes to null while resolving a missing year.</exception>
internal static async Task DownloadImdbMetadataAsync(string directory, int level = 2, bool overwrite = false, bool isTV = false, Action<string>? log = null)
{
    log ??= TraceLog;

    await EnumerateDirectories(directory, level)
        .ParallelForEachAsync(
            async movie =>
            {
                // Existing JSON metadata is only looked up when overwriting is disabled.
                bool shouldDownload = overwrite
                    || !Directory.EnumerateFiles(movie, JsonMetadataSearchPattern, SearchOption.TopDirectoryOnly).Any();
                if (!shouldDownload)
                {
                    log($"Skip {movie}.");
                    return;
                }

                string? nfo = Directory
                    .EnumerateFiles(movie, XmlMetadataSearchPattern, SearchOption.TopDirectoryOnly)
                    .FirstOrDefault();
                if (string.IsNullOrWhiteSpace(nfo))
                {
                    log($"!Missing metadata {movie}.");
                    return;
                }

                string idElementName = isTV ? "imdb_id" : "imdbid";
                string? imdbId = XDocument.Load(nfo).Root?.Element(idElementName!)?.Value;
                if (string.IsNullOrWhiteSpace(imdbId))
                {
                    // No IMDB id in the XML metadata: leave an empty JSON placeholder instead of downloading.
                    string placeholder = Path.Combine(movie, $"{NotExistingFlag}{JsonMetadataExtension}");
                    await File.WriteAllTextAsync(placeholder, "{}");
                    return;
                }

                string imdbUrl = $"https://www.imdb.com/title/{imdbId}";
                (string metadataJson, string year, string[] regions) = await Retry.FixedIntervalAsync(
                    async () => await Imdb.DownloadJsonAsync(imdbUrl),
                    retryCount: 10);
                Debug.Assert(!string.IsNullOrWhiteSpace(metadataJson));

                if (string.IsNullOrWhiteSpace(year))
                {
                    // Fall back to the year exposed by the deserialized metadata.
                    JsonSerializerOptions options = new JsonSerializerOptions
                    {
                        PropertyNameCaseInsensitive = true,
                        IgnoreReadOnlyProperties = true
                    };
                    ImdbMetadata? parsed = JsonSerializer.Deserialize<ImdbMetadata>(metadataJson, options);
                    if (parsed is null)
                    {
                        throw new InvalidOperationException(metadataJson);
                    }

                    year = parsed.YearOfCurrentRegion;
                }

                if (string.IsNullOrWhiteSpace(year))
                {
                    log($"!Year is missing for {imdbId}: {movie}");
                }

                if (!regions.Any())
                {
                    log($"!Location is missing for {imdbId}: {movie}");
                }

                // At most the first five regions go into the file name.
                string regionList = string.Join(",", regions.Take(5));
                string json = Path.Combine(movie, $"{imdbId}.{year}.{regionList}{JsonMetadataExtension}");
                log($"Downloaded {imdbUrl} to {json}.");
                await File.WriteAllTextAsync(json, metadataJson);
                log($"Saved to {json}.");
            },
            IOMaxDegreeOfParallelism);
}
/// <summary>
/// Downloads the page at <paramref name="url"/> and extracts the text of its
/// <c>application/ld+json</c> script, a year, and the listed regions.
/// </summary>
/// <param name="url">Address of the page to download.</param>
/// <returns>
/// <c>Json</c>: text of the <c>script[type="application/ld+json"]</c> element(s).
/// <c>Year</c>: the <c>#titleYear</c> text without surrounding parentheses; if blank, the first four-digit
/// run in the text of <c>a[title="See more release dates"]</c>; if still blank,
/// <c>ImdbMetadata.YearOfCurrentRegion</c> of the deserialized JSON.
/// <c>Regions</c>: the '|'-separated, trimmed entries of the first <c>#titleDetails .txt-block</c> whose
/// text starts with "Country:", or an empty array when there is none.
/// </returns>
/// <exception cref="InvalidOperationException">The JSON deserializes to null while resolving a missing year.</exception>
internal static async Task<(string Json, string Year, string[] Regions)> DownloadJsonAsync(string url)
{
    using WebClient webClient = new();
    string html = await webClient.DownloadStringTaskAsync(url);
    CQ page = new CQ(html);

    string titleYearText = page.Find(@"#titleYear").Text();
    string year = titleYearText.Trim().TrimStart('(').TrimEnd(')').Trim();
    string json = page.Find(@"script[type=""application/ld+json""]").Text();

    if (string.IsNullOrWhiteSpace(year))
    {
        // No #titleYear text: look for a four-digit year in the release-dates link text.
        string releaseText = page.Find(@"a[title=""See more release dates""]").Text();
        Match yearMatch = Regex.Match(releaseText, "[0-9]{4}");
        if (yearMatch.Success)
        {
            year = yearMatch.Value;
            Debug.Assert(year.Length == 4);
        }
    }

    if (string.IsNullOrWhiteSpace(year))
    {
        // Last resort: take the year from the deserialized metadata.
        JsonSerializerOptions options = new JsonSerializerOptions
        {
            PropertyNameCaseInsensitive = true,
            IgnoreReadOnlyProperties = true
        };
        ImdbMetadata? metadata = JsonSerializer.Deserialize<ImdbMetadata>(json, options);
        if (metadata is null)
        {
            throw new InvalidOperationException(url);
        }

        year = metadata.YearOfCurrentRegion;
    }

    string? countryBlock = page
        .Find(@"#titleDetails .txt-block")
        .Elements
        .Select(element => new CQ(element).Text().Trim())
        .FirstOrDefault(text => text.StartsWith("Country:", StringComparison.InvariantCultureIgnoreCase));

    string[] regions = Array.Empty<string>();
    if (countryBlock is not null)
    {
        regions = countryBlock
            .Replace("Country:", string.Empty, StringComparison.InvariantCultureIgnoreCase)
            .Split('|')
            .Select(region => region.Trim())
            .ToArray();
    }

    return (json, year, regions);
}