/// <summary>
/// Retrieves a proxy matching the given filter, preferring the in-memory pool of
/// available proxies and falling back to a database query when none match there.
/// </summary>
/// <param name="pf">Optional filter restricting which proxies qualify; null means any proxy.</param>
/// <returns>A matching <see cref="Proxy"/>; never null.</returns>
/// <exception cref="ProxyRepositoryFailureException">
/// Thrown when the database query fails, or when no matching proxy exists anywhere.
/// </exception>
public Proxy GetProxy(ProxyFilter pf = null)
{
    // NOTE(review): the lock is held for the entire method, including the database
    // round-trip below — presumably to serialize proxy checkout; confirm this is intended.
    lock (_queueLock)
    {
        if (pf != null)
        {
            ProxyLog.Info("Getting Proxy. Filter: " + pf.GetFilterLogInfo());
        }
        else
        {
            ProxyLog.Info("Getting Proxy. No Filter.");
        }

        // Refresh the pool before searching — presumably returns proxies whose
        // cooldown ("ice") has expired back into circulation; confirm against impl.
        CheckIcedProxies();

        // First try the in-memory pool of available proxies.
        Proxy proxy = _availableProxies.FirstOrDefault(p => p.MatchesFilter(pf));
        if (proxy != null)
        {
            ProxyLog.Info("Proxy Retrieved From Available Proxies: " + proxy.GetLogInfo(pf?.Site));
        }
        else
        {
            // Fall back to the database: query for candidate proxies, claim the first
            // result by stamping it with this repository's session id, and persist it.
            using (ISession session = _sessionFactory.OpenSession())
            {
                using (var trans = session.BeginTransaction())
                {
                    IList <Proxy> results = null;
                    try
                    {
                        // Rank by per-site score when the filter targets a specific site,
                        // otherwise by the proxy's overall score.
                        var sortMode = pf?.Site != null
                            ? ProxyFilter.ProxyFilterQuery.SortModes.SiteScore
                            : ProxyFilter.ProxyFilterQuery.SortModes.Score;
                        results = QueryDatabaseForProxies(pf, session, sortMode);
                    }
                    catch (Exception ex)
                    {
                        // Wrap any persistence failure in the repository's exception type.
                        Log.Error("Failed to query database for proxies.", ex);
                        throw new ProxyRepositoryFailureException("A failure occurred while querying proxies", ex);
                    }

                    Proxy newProxy = results.FirstOrDefault();
                    if (newProxy == null)
                    {
                        // Nothing matched anywhere; the uncommitted transaction is rolled
                        // back implicitly when the using block disposes it.
                        ProxyLog.Info("No Matching Proxy Available");
                        throw new ProxyRepositoryFailureException("No matching proxies available");
                    }

                    ProxyLog.Info("Proxy Retrieved From Database: " + newProxy.GetLogInfo(pf?.Site));

                    // Claim the proxy for this session and persist the claim so other
                    // sessions don't hand out the same proxy.
                    newProxy.LastSession = this.SessionID;
                    newProxy.Repository = this;
                    session.Save(newProxy);
                    trans.Commit();
                    return(newProxy);
                }
            }
        }

        // Found in the in-memory pool: remove it so it can't be handed out twice.
        _availableProxies.Remove(proxy);
        return(proxy);
    }
}
/// <summary>
/// Executes the proxy-selection SQL built from the given filter against the
/// database and maps the rows onto <see cref="Proxy"/> entities.
/// </summary>
/// <param name="pf">Filter describing which proxies qualify; may be null.</param>
/// <param name="session">Open NHibernate session to run the query on.</param>
/// <param name="sortMode">Ranking mode used when ordering candidate proxies.</param>
/// <returns>The matching proxies, with duplicate root rows collapsed.</returns>
public IList <Proxy> QueryDatabaseForProxies(ProxyFilter pf, ISession session, ProxyFilter.ProxyFilterQuery.SortModes sortMode)
{
    // Build the raw SQL for this filter/session/sort combination.
    var query = new ProxyFilter.ProxyFilterQuery(pf, this.SessionID, sortMode);
    var sql = query.GetSql();

    // Map result rows to Proxy entities; DistinctRootEntity drops the duplicate
    // roots that joins in the generated SQL can produce.
    return session.CreateSQLQuery(sql)
        .AddEntity("proxy", typeof(Proxy))
        .SetResultTransformer(Transformers.DistinctRootEntity)
        .List <Proxy>();
}
/// <summary>
/// Create the filter from the factory.
/// </summary>
/// <returns>The configured filter, or null when the factory produces none.</returns>
public ProxyFilter CreateFilter()
{
    ProxyFilter filter = OnCreateFilter();
    if (filter == null)
    {
        return null;
    }

    // Copy the factory's configuration onto the newly created filter.
    filter.Graph = Graph;
    if (Client != null)
    {
        filter.Client = Client.Create(null);
    }
    if (Layers != null)
    {
        filter.Layers = Layers;
    }
    filter.Block = Block;
    return filter;
}
/// <summary>
/// Create "requests" which are descriptions of what pages to scrape, and how.
/// Yields 16 equivalent GitHub requests to demonstrate the way requests are
/// distributed amongst proxies.
/// </summary>
/// <returns>A lazily-evaluated sequence of requests.</returns>
protected IEnumerable <Request> GetPagesToScrape()
{
    // XPath for the marketing sign-up button. When using Chrome, you can specify a
    // list of elements whose presence will indicate that the page has fully loaded.
    // Without them, the request will just be given a long timeout to ensure that
    // everything has loaded, so waiting on this element maximizes processing rate
    // by eliminating the need to wait for the timeout.
    // (Previously this literal was duplicated in two places; hoisted to a single
    // constant so the two copies cannot drift apart.)
    const string signUpButtonXPath =
        @"//*/button[contains(text(), 'Sign up for GitHub') and contains(@class, 'btn-primary-mktg')]";

    // Initialize the filter for the proxy. You can use this to micromanage the
    // quality of the proxy you are getting. In this case it's just making sure that
    // it doesn't get a proxy that has been overused recently in the current proxy
    // session. The single instance is intentionally shared by every request.
    ProxyFilter pf = new ProxyFilter();
    pf.NotCurrentSession = true;

    // Yield 16 identical requests (the original code built one request by hand and
    // then 15 more in a loop; the resulting objects were the same, so the two code
    // paths are merged into one loop).
    for (var i = 0; i < 16; i++)
    {
        yield return(new Request("http://www.github.com")
        {
            // You can select a raw HTTP request, which will just return the result
            // of the request, or you can choose to load the page in Chrome.
            DriverType = Request.DriverTypes.HeadlessChrome,
            ElementsToWaitFor = new List <string>() { signUpButtonXPath },
            ProxyFilter = pf
        });
    }
}