/// <summary>
/// Stores the supplied configuration and log store, then initialises <c>Clients</c> from the
/// connections returned by <c>proxy.DirectAndProxies()</c>.
/// </summary>
/// <param name="proxy">Proxy configuration; stored in <c>Proxy</c> and used to enumerate the connections.</param>
/// <param name="collectCfg">Collection configuration; stored in <c>CollectCfg</c>.</param>
/// <param name="logStore">File store; stored in <c>LogStore</c>.</param>
public YtWeb(ProxyCfg proxy, YtCollectCfg collectCfg, ISimpleFileStore logStore)
{
    (Proxy, CollectCfg, LogStore) = (proxy, collectCfg, logStore);

    // The factory is synchronous, so its result is wrapped in an already-completed task.
    Clients = new(
        proxy.DirectAndProxies(),
        connection => Task.FromResult(connection.CreateHttpClient()));
}
/// <summary>
/// Stores the supplied configuration and log store, then builds the <c>Clients</c> resource cycle
/// with one entry per connection returned by <c>proxy.DirectAndProxies()</c>.
/// </summary>
/// <param name="proxy">Proxy configuration; stored in <c>Proxy</c> and used to enumerate the connections.</param>
/// <param name="collectCfg">Collection configuration; stored in <c>CollectCfg</c>.</param>
/// <param name="logStore">File store; stored in <c>LogStore</c>.</param>
public WebScraper(ProxyCfg proxy, YtCollectCfg collectCfg, ISimpleFileStore logStore)
{
    (Proxy, CollectCfg, LogStore) = (proxy, collectCfg, logStore);

    var connections = proxy.DirectAndProxies();

    // CreateHttpClient is synchronous, so each client is handed back as an already-completed task.
    Clients = new ResourceCycle<HttpClient, ProxyConnectionCfg>(
        connections,
        connection => Task.FromResult(CreateHttpClient(connection)));
}
int _lastUsedBrowserIdx; // re-use the same browser proxies across different calls to recs and extra

/// <summary>
/// Loads every video in <paramref name="videos"/> through a browser page and returns the results of
/// all batches flattened into one collection.
/// </summary>
/// <remarks>
/// The videos are split into batches that run <c>CollectCfg.ChromeParallel</c> at a time. Each batch
/// gets its own <c>ResourceCycle</c> of browsers (starting at <c>_lastUsedBrowserIdx</c>) and loads its
/// videos one at a time. A video is retried until it loads or <c>CollectCfg.ChromeAttempts</c> attempts
/// have failed. A <c>TooManyRequests</c> response moves the cycle to the next browser and resets the
/// attempt counter.
/// </remarks>
/// <param name="videos">Video identifiers to load. An empty collection yields an empty result.</param>
/// <param name="log">Logger; enriched here with a <c>Module</c> property and, per video, a <c>Video</c> property.</param>
/// <returns>The loaded <c>RecsAndExtra</c> items from all batches.</returns>
/// <exception cref="Exception">
/// Rethrows whatever failure occurred on a video's last allowed attempt. This includes the
/// <see cref="InvalidOperationException"/> raised for a non-OK response other than <c>TooManyRequests</c>.
/// </exception>
public async Task<IReadOnlyCollection<RecsAndExtra>> GetRecsAndExtra(IReadOnlyCollection<string> videos, ILogger log)
{
    var sw = Stopwatch.StartNew();
    log = log.ForContext("Module", nameof(ChromeScraper));
    var proxies = ProxyCfg.DirectAndProxies().Take(2)
        .ToArray(); // only use the first fallback (datacenter proxies). residential is too expensive for the volume of data we are loading.
    var parallel = CollectCfg.ChromeParallel;

    // Integer division gives 0 when there are fewer videos than parallel pools (or none at all),
    // and a batch size of 0 is never valid, so always keep at least one video per batch.
    var videosPerBrowserPool = Math.Max(1, videos.Count / parallel);

    var res = await videos.Batch(videosPerBrowserPool).WithIndex().BlockFunc(async batch =>
    {
        var (b, i) = batch;
        var requests = new ConcurrentBag<(Request req, bool aborted)>();
        await using var browsers = new ResourceCycle<Browser, ProxyConnectionCfg>(proxies, p => CreateBrowser(log, p), _lastUsedBrowserIdx);

        // Opens a fresh, configured page on the given browser.
        async Task<Page> Page(Browser browser, ProxyConnectionCfg proxy)
        {
            var page = await browser.NewPageAsync(); // create page inside retry loop. some transient errors seems to be caused by state in the page

            // The log line in the retry loop treats proxy as possibly null, so the credential check must be null-safe too.
            if (proxy?.Creds != null)
            {
                await page.AuthenticateAsync(proxy.Creds.AsCreds()); // workaround for chrome not supporting password proxies
            }

            await page.SetCookieAsync(); // clears cookies
            await ConfigureRequests(page, (req, aborted) => requests.Add((req, aborted)));
            return page;
        }

        return await b.BlockFunc(async v =>
        {
            //var context = await browser.CreateIncognitoBrowserContextAsync(); // opening windows is more reliable than tabs in practice
            var videoAttempt = 0;
            var videoLog = log.ForContext("Video", v);
            while (true)
            {
                var (browser, proxy) = await browsers.Get();
                videoAttempt++;
                var lastAttempt = videoAttempt >= CollectCfg.ChromeAttempts;
                videoLog.Debug("loading video {Video}. Proxy={Proxy}", v, proxy?.Url == null ? "Direct" : proxy.Url);

                // Declared outside the try so the catch block can still report page.Url.
                using var page = await Page(browser, proxy);
                try
                {
                    var (video, notOkResponse) = await GetVideo(page, v, videoLog);
                    if (notOkResponse != null)
                    {
                        if (notOkResponse.Status == HttpStatusCode.TooManyRequests)
                        {
                            // Throttled: switch to the next browser, remember it for later calls and start the attempts over.
                            await browsers.NextResource(browser);
                            _lastUsedBrowserIdx = browsers.Idx;
                            videoAttempt = 0;
                            videoLog.Information("ChromeScraper - error response ({Status}) loading {Video}: using next proxy", notOkResponse.Status, v);
                        }
                        else
                        {
                            throw new InvalidOperationException($"Not OK response ({notOkResponse.Status})");
                        }
                    }

                    if (video == null)
                    {
                        continue;
                    }

                    videoLog.Debug("ChromeScraper - finished loading video {Video} with {Comments} comments and {Recommendations} recs in {Duration}",
                        v, video.Extra.Comments?.Length ?? 0, video.Recs?.Length ?? 0, sw.Elapsed.HumanizeShort());
                    return video;
                }
                catch (Exception ex)
                {
                    videoLog.Warning(ex, "ChromeScraper - failed to load video {Video} from {Url} (attempt {Attempt}): {Error}", v, page.Url, videoAttempt, ex.Message);
                    if (lastAttempt)
                    {
                        throw;
                    }
                }
            }
        }, parallel: 1, progressUpdate: p => log.Debug("ChromeScraper - browser pool {Pool} progress {Complete}/{Total}", i, p.CompletedTotal, b.Count));
    }, parallel);

    log.Information("ChromeScraper - finished loading all {Videos} videos in {Duration}", videos.Count, sw.HumanizeShort());
    return res.SelectMany().ToReadOnly();
}