Пример #1
0
 /// <summary>
 /// Stores the supplied configuration and log store, and builds the cycle of HTTP clients
 /// over the connections returned by <c>proxy.DirectAndProxies()</c>.
 /// </summary>
 /// <param name="proxy">Proxy configuration; kept in <c>Proxy</c> and used to enumerate the connections.</param>
 /// <param name="collectCfg">Collection settings; kept in <c>CollectCfg</c>.</param>
 /// <param name="logStore">File store; kept in <c>LogStore</c>.</param>
 public YtWeb(ProxyCfg proxy, YtCollectCfg collectCfg, ISimpleFileStore logStore)
 {
     (Proxy, CollectCfg, LogStore) = (proxy, collectCfg, logStore);

     // The client factory is synchronous, so its result is wrapped in an already-completed task.
     var connections = proxy.DirectAndProxies();
     Clients = new(connections, connection => Task.FromResult(connection.CreateHttpClient()));
 }
Пример #2
0
 /// <summary>
 /// Stores the supplied configuration and log store, and builds a <c>ResourceCycle</c> of
 /// <c>HttpClient</c> instances over the connections returned by <c>proxy.DirectAndProxies()</c>.
 /// </summary>
 /// <param name="proxy">Proxy configuration; kept in <c>Proxy</c> and used to enumerate the connections.</param>
 /// <param name="collectCfg">Collection settings; kept in <c>CollectCfg</c>.</param>
 /// <param name="logStore">File store; kept in <c>LogStore</c>.</param>
 public WebScraper(ProxyCfg proxy, YtCollectCfg collectCfg, ISimpleFileStore logStore)
 {
     (Proxy, CollectCfg, LogStore) = (proxy, collectCfg, logStore);

     // CreateHttpClient is synchronous, so each client is handed to the cycle as a completed task.
     var connections = proxy.DirectAndProxies();
     Clients = new ResourceCycle<HttpClient, ProxyConnectionCfg>(
         connections,
         connection => Task.FromResult(CreateHttpClient(connection)));
 }
Пример #3
0
        // Passed to every new ResourceCycle created by GetRecsAndExtra (presumably as the starting index - confirm
        // against ResourceCycle) and overwritten with browsers.Idx whenever a 429 response forces a switch to the next proxy.
        int _lastUsedBrowserIdx; // re-use the same browser proxies across different calls to recs and extra

        /// <summary>
        /// Loads each of the given videos in a Chrome page and returns the result produced by <c>GetVideo</c> for every one of them.
        /// </summary>
        /// <remarks>
        /// The videos are split into at most <c>CollectCfg.ChromeParallel</c> batches. Each batch gets its own
        /// <c>ResourceCycle</c> of browsers and processes its videos one at a time. A 429 (Too Many Requests)
        /// response moves the batch on to the next proxy; any other failure is retried until
        /// <c>CollectCfg.ChromeAttempts</c> is reached, after which the exception propagates.
        /// </remarks>
        /// <param name="videos">Ids of the videos to load. An empty collection yields an empty result.</param>
        /// <param name="log">Logger; enriched here with the module name and, per video, the video id.</param>
        /// <returns>The loaded results of all batches, flattened into a single collection.</returns>
        /// <exception cref="InvalidOperationException">
        /// A video's final attempt ended with a not-OK response other than 429.
        /// </exception>
        public async Task<IReadOnlyCollection<RecsAndExtra>> GetRecsAndExtra(IReadOnlyCollection<string> videos, ILogger log)
        {
            // Nothing to load. This also keeps the batch size below from ever being computed for an empty input.
            if (videos.Count == 0)
            {
                return Array.Empty<RecsAndExtra>();
            }

            var sw = Stopwatch.StartNew();

            log = log.ForContext("Module", nameof(ChromeScraper));
            var proxies = ProxyCfg.DirectAndProxies().Take(2)
                          .ToArray(); // only use the first fallback (datacenter proxies). residential is too expensive for the volume of data we are loading.
            var parallel = CollectCfg.ChromeParallel;

            // Ceiling division: truncating division gave a batch size of 0 when there were fewer videos than pools,
            // and produced one batch more than there are pools whenever the count was not evenly divisible.
            var videosPerBrowserPool = (videos.Count + parallel - 1) / parallel;

            var res = await videos.Batch(videosPerBrowserPool).WithIndex().BlockFunc(async batch => {
                var (b, i) = batch;
                var requests = new ConcurrentBag<(Request req, bool aborted)>();
                await using var browsers = new ResourceCycle<Browser, ProxyConnectionCfg>(proxies, p => CreateBrowser(log, p), _lastUsedBrowserIdx);

                // Opens and configures a fresh page on the given browser.
                async Task<Page> Page(Browser browser, ProxyConnectionCfg proxy)
                {
                    var page = await browser.NewPageAsync(); // create page inside retry loop. some transient errors seems to be caused by state in the page
                    if (proxy.Creds != null)
                    {
                        await page.AuthenticateAsync(proxy.Creds.AsCreds()); // workaround for chrome not supporting password proxies
                    }
                    await page.SetCookieAsync(); // clears cookies
                    await ConfigureRequests(page, (req, aborted) => requests.Add((req, aborted)));
                    return page;
                }

                return await b.BlockFunc(async v => {
                    var videoAttempt = 0;
                    var videoLog = log.ForContext("Video", v);

                    while (true)
                    {
                        var (browser, proxy) = await browsers.Get();
                        videoAttempt++;
                        var lastAttempt = videoAttempt >= CollectCfg.ChromeAttempts;
                        videoLog.Debug("loading video {Video}. Proxy={Proxy}", v, proxy?.Url == null ? "Direct" : proxy.Url);
                        using var page = await Page(browser, proxy);
                        try
                        {
                            var (video, notOkResponse) = await GetVideo(page, v, videoLog);
                            if (notOkResponse != null)
                            {
                                if (notOkResponse.Status == HttpStatusCode.TooManyRequests)
                                {
                                    // Throttled: switch proxy, remember the choice for later calls and start counting attempts afresh.
                                    // NOTE(review): because the counter is reset, repeated 429s are never bounded by ChromeAttempts here
                                    // unless browsers.NextResource eventually throws - confirm against ResourceCycle.
                                    await browsers.NextResource(browser);
                                    _lastUsedBrowserIdx = browsers.Idx;
                                    videoAttempt = 0;
                                    videoLog.Information("ChromeScraper - error response ({Status}) loading {Video}: using next proxy", notOkResponse.Status, v);
                                }
                                else
                                {
                                    // Caught by the handler below, so it is retried until the last attempt.
                                    throw new InvalidOperationException($"Not OK response ({notOkResponse.Status})");
                                }
                            }

                            if (video == null)
                            {
                                // NOTE(review): this retry path does not consult lastAttempt - confirm GetVideo cannot
                                // keep returning a null video without a response indefinitely.
                                continue;
                            }

                            videoLog.Debug("ChromeScraper - finished loading video {Video} with {Comments} comments and {Recommendations} recs in {Duration}",
                                           v, video.Extra.Comments?.Length ?? 0, video.Recs?.Length ?? 0, sw.Elapsed.HumanizeShort());
                            return video;
                        }
                        catch (Exception ex)
                        {
                            videoLog.Warning(ex, "ChromeScraper - failed to load video {Video} from {Url} (attempt {Attempt}): {Error}", v, page.Url, videoAttempt,
                                             ex.Message);
                            if (lastAttempt)
                            {
                                throw;
                            }
                        }
                    }
                }, parallel: 1, progressUpdate: p => log.Debug("ChromeScraper - browser pool {Pool} progress {Complete}/{Total}",
                                                               i, p.CompletedTotal, b.Count));
            }, parallel);

            log.Information("ChromeScraper - finished loading all {Videos} videos in {Duration}", videos.Count, sw.HumanizeShort());
            return res.SelectMany().ToReadOnly();
        }