/// <summary>
/// Downloads a page through a proxy supplied by <see cref="ProxyService"/>.
/// </summary>
/// <remarks>
/// Makes up to three attempts. Each attempt asks the service for a proxy; when the
/// service returns none, <c>null</c> is passed to <c>HttpHelper.DownloadHtml</c> as the proxy.
/// An attempt that throws or yields a null/empty result is followed by the next attempt.
/// </remarks>
/// <param name="url">Address of the page to download.</param>
/// <returns>
/// The first non-empty HTML that was downloaded, or an empty string when every attempt failed.
/// </returns>
private string DownloadWithProxy(string url)
{
    const int maxAttempts = 3;

    for (var attempt = 1; attempt <= maxAttempts; attempt++)
    {
        // The try/catch lives inside the loop so that one failing attempt
        // does not cancel the remaining ones.
        try
        {
            var service = new ProxyService();
            Proxy proxy = service.GetProxy();

            WebProxy webProxy = null;
            if (proxy != null)
            {
                webProxy = new WebProxy(proxy.Adress + ":" + proxy.Port);
            }

            var html = HttpHelper.DownloadHtml(url, webProxy, TimeOut);
            if (!string.IsNullOrEmpty(html))
            {
                return html;
            }
        }
        catch (Exception ex)
        {
            // Record the failure instead of swallowing it, then fall through to the next attempt.
            LogHelper.LogError("使用代理抓取页面时出错(第" + attempt + "次):" + url + "," + ex);
        }
    }

    return "";
}
/// <summary>
/// Crawls the xicidaili.com proxy listings and passes the proxies found on each
/// page to <c>VerifyAndSave</c>.
/// </summary>
/// <remarks>
/// For every listing URL the first page is downloaded, the page count is read from it via
/// <c>GetTotalPage</c>, and then each page is parsed in turn. A listing whose first page
/// fails <c>CheckHtml</c> is skipped; a later page that fails <c>CheckHtml</c> ends that
/// listing only, so the remaining listings are still crawled. Any exception is logged and
/// ends the crawl; the completion message is logged in every case.
/// </remarks>
public override void Start()
{
    LogHelper.LogMsg("》》》开始抓取西刺代理");
    try
    {
        var listingUrls = new List<string>
        {
            "http://www.xicidaili.com/nt/",
            "http://www.xicidaili.com/nn/",
            "http://www.xicidaili.com/wn/",
            "http://www.xicidaili.com/wt/"
        };

        foreach (var url in listingUrls)
        {
            string html = DownloadProxyPage(url);
            if (!CheckHtml(html, url, "西刺代理"))
            {
                continue;
            }

            var doc = new HtmlAgilityPack.HtmlDocument();
            doc.LoadHtml(html);
            HtmlNode node = doc.DocumentNode;

            // Read the total number of pages from the first page.
            var totalPage = GetTotalPage(node);
            for (var i = 1; i <= totalPage; i++)
            {
                var pageUrl = url + i;

                // The first page has already been downloaded above; only fetch pages 2..N.
                if (i != 1)
                {
                    html = DownloadProxyPage(pageUrl);
                    if (!CheckHtml(html, pageUrl, "西刺代理"))
                    {
                        // Give up on this listing only; the other listings are still crawled
                        // and the completion message below is still logged.
                        break;
                    }

                    doc.LoadHtml(html);
                    node = doc.DocumentNode;
                }

                HtmlNodeCollection collection = node.SelectNodes("//table[@id='ip_list']/tr[@class]");
                if (collection == null)
                {
                    LogHelper.LogError("无获取西刺代理ip,请检查网站结构是否有改动");
                    continue;
                }

                // Extract the proxies. This runs sequentially because List<T> is not safe
                // for concurrent Add calls and the per-row work is trivial.
                var proxyList = new List<Proxy>();
                foreach (HtmlNode item in collection)
                {
                    HtmlNode addressNode = item.SelectSingleNode("td[2]");
                    HtmlNode portNode = item.SelectSingleNode("td[3]");
                    if (addressNode == null || portNode == null)
                    {
                        // A row without the expected cells is skipped rather than
                        // aborting the whole crawl with a NullReferenceException.
                        LogHelper.LogError("西刺代理,解析代理行时出错:" + item.InnerHtml);
                        continue;
                    }

                    int port;
                    if (!int.TryParse(portNode.InnerHtml.Trim(), out port))
                    {
                        LogHelper.LogError("西刺代理,取端口号时出错:" + item.InnerHtml);
                        continue;
                    }

                    proxyList.Add(new Proxy
                    {
                        Adress = addressNode.InnerHtml.Trim(),
                        Port = port,
                        Source = pageUrl
                    });
                }

                VerifyAndSave(proxyList);
            }
        }
    }
    catch (Exception ex)
    {
        LogHelper.LogError("抓取西刺代理时出错:" + ex);
    }

    LogHelper.LogMsg("《《《西刺代理抓取完成");
}