/// <summary>
/// Worker loop run by a crawl thread. Repeatedly takes a URL from
/// <c>UrlQueue.Instance</c>, optionally sleeps for a random throttle interval,
/// downloads the page using the configured crawl mode, parses its links and
/// raises <c>DataReceivedEvent</c>. Failures are reported through
/// <c>CrawlErrorEvent</c> with <c>needTryAgain</c> set.
/// The loop ends when the queue is empty and every entry of
/// <c>threadStatus</c> is <c>true</c>.
/// </summary>
/// <param name="threadIndex">
/// The index of this worker's slot in <c>threadStatus</c>; must be a boxed <see cref="int"/>.
/// </param>
private void CrawlProcess(object threadIndex)
{
    var currentThreadIndex = (int)threadIndex;

    while (true)
    {
        // Decide from the queue length and the other workers' flags whether
        // this thread should sleep or exit.
        if (UrlQueue.Instance.Count == 0)
        {
            this.threadStatus[currentThreadIndex] = true;

            // Every worker has flagged itself idle: nothing can enqueue more work from here.
            if (!this.threadStatus.Any(t => t == false))
            {
                break;
            }

            Thread.Sleep(2000);
            continue;
        }

        this.threadStatus[currentThreadIndex] = false;

        // Re-check: another worker may have drained the queue in the meantime.
        if (UrlQueue.Instance.Count == 0)
        {
            continue;
        }

        UrlInfo urlInfo = UrlQueue.Instance.DeQueue();
        var curIPProxy = Settings.GetIPProxy();

        // Checked before the try block so the catch handlers below can safely
        // dereference urlInfo.
        if (urlInfo == null)
        {
            continue;
        }

        try
        {
            // Automatic throttling: random delay, 1-5 seconds by default.
            if (this.Settings.AutoSpeedLimit)
            {
                try
                {
                    var minMilliseconds = 1000;
                    var maxMilliseconds = 5000;

                    // Settings may only raise the defaults, never lower them.
                    if (this.Settings.AutoSpeedLimitMinMSecond >= minMilliseconds)
                    {
                        minMilliseconds = this.Settings.AutoSpeedLimitMinMSecond;
                    }

                    if (this.Settings.AutoSpeedLimitMaxMSecond >= maxMilliseconds)
                    {
                        maxMilliseconds = this.Settings.AutoSpeedLimitMaxMSecond;
                    }

                    // Random.Next(min, max) throws when min > max; a configured
                    // minimum above the effective maximum would otherwise fail
                    // every single URL.
                    if (maxMilliseconds < minMilliseconds)
                    {
                        maxMilliseconds = minMilliseconds;
                    }

                    // NOTE(review): this.random appears to be shared between worker
                    // threads; System.Random is not thread-safe - confirm.
                    int span = this.random.Next(minMilliseconds, maxMilliseconds);
                    Thread.Sleep(span);
                }
                catch (Exception ex)
                {
                    // Keep the original exception as InnerException so the
                    // type and stack trace are not lost.
                    throw new Exception("AutoSpeedLimit" + ex.Message, ex);
                }
            }

            string html = string.Empty;
            switch (Settings.CrawlMode)
            {
                case EnumCrawlMode.PhantomJsViaSelenium:
                    html = GetPhantomJsResult(urlInfo);
                    break;

                case EnumCrawlMode.HttpHelper:
                case EnumCrawlMode.SuperWebClient:
                    if (Settings.UseSuperWebClient)
                    {
                        html = GetSupperHttpResult(urlInfo);
                    }
                    else
                    {
                        html = GetHttpResult(urlInfo);
                    }

                    break;
            }

            if (!string.IsNullOrEmpty(html))
            {
                this.ParseLinks(urlInfo, html);
            }

            // Copy the delegate first so an unsubscribe on another thread
            // cannot null it between the check and the call.
            var dataReceived = this.DataReceivedEvent;
            if (dataReceived != null)
            {
                dataReceived(
                    new DataReceivedEventArgs
                    {
                        Url = urlInfo.UrlString,
                        Depth = urlInfo.Depth,
                        Html = html,
                        IpProx = curIPProxy,
                        urlInfo = urlInfo
                    });
            }
        }
        catch (WebException webEx)
        {
            var ev = new CrawlErrorEventArgs
            {
                Url = urlInfo.UrlString,
                Depth = urlInfo.Depth,
                Exception = webEx,
                IpProx = curIPProxy,
                urlInfo = urlInfo
            };

            // Timeouts, protocol errors and gateway-style messages request a proxy change.
            // (Marking the proxy as unusable via Settings.SetUnviableIP is currently disabled.)
            if (webEx.Status == WebExceptionStatus.Timeout
                || webEx.Status == WebExceptionStatus.ProtocolError
                || webEx.Message.Contains("远程服务器返回错误")
                || webEx.Message.Contains("网关"))
            {
                ev.needChangeIp = true;
            }

            ev.needTryAgain = true;

            var crawlError = this.CrawlErrorEvent;
            if (crawlError != null)
            {
                crawlError(ev);
            }
        }
        catch (Exception exception)
        {
            var errorEV = new CrawlErrorEventArgs
            {
                Url = urlInfo.UrlString,
                Depth = urlInfo.Depth,
                Exception = exception,
                IpProx = curIPProxy,
                urlInfo = urlInfo
            };

            // Timeout / remote-server-error messages request a proxy change.
            // (Marking the proxy as unusable via Settings.SetUnviableIP is currently disabled.)
            if (exception.Message.Contains("超时") || exception.Message.Contains("远程服务器返回错误"))
            {
                errorEV.needChangeIp = true;
            }

            errorEV.needTryAgain = true;

            var crawlError = this.CrawlErrorEvent;
            if (crawlError != null)
            {
                crawlError(errorEV);
            }
        }
    }
}
/// <summary>
/// Callback taking a <c>CrawlErrorEventArgs</c>; it currently does nothing.
/// NOTE(review): presumably subscribed to a crawl-error event elsewhere in the
/// file - the subscription is not visible here, confirm before removing.
/// </summary>
/// <param name="args">
/// The crawl error details; unused.
/// </param>
private void Master_CrawlErrorEvent(CrawlErrorEventArgs args)
{
    // Intentionally left empty.
}
/// <summary>
/// Worker loop that downloads pages directly through <see cref="HttpWebRequest"/>.
/// Repeatedly takes a URL from <c>UrlQueue.Instance</c>, optionally sleeps for a
/// random 1-5 second throttle interval, issues the request, persists cookies,
/// decompresses gzip responses, parses the content and links, and raises
/// <c>DataReceivedEvent</c>. Failures are reported through <c>CrawlErrorEvent</c>
/// with <c>needTryAgain</c> set.
/// The loop ends when the queue is empty and every entry of
/// <c>threadStatus</c> is <c>true</c>.
/// NOTE(review): the name suggests this variant was superseded by
/// <c>CrawlProcess</c> - confirm whether it is still referenced.
/// </summary>
/// <param name="threadIndex">
/// The index of this worker's slot in <c>threadStatus</c>; must be a boxed <see cref="int"/>.
/// </param>
private void CrawlProcess_Abort(object threadIndex)
{
    var currentThreadIndex = (int)threadIndex;

    while (true)
    {
        // Decide from the queue length and the other workers' flags whether
        // this thread should sleep or exit.
        if (UrlQueue.Instance.Count == 0)
        {
            this.threadStatus[currentThreadIndex] = true;

            // Every worker has flagged itself idle: stop.
            if (!this.threadStatus.Any(t => t == false))
            {
                break;
            }

            Thread.Sleep(2000);
            continue;
        }

        this.threadStatus[currentThreadIndex] = false;

        // Re-check: another worker may have drained the queue in the meantime.
        if (UrlQueue.Instance.Count == 0)
        {
            continue;
        }

        UrlInfo urlInfo = UrlQueue.Instance.DeQueue();

        // Checked before the try block so the catch handlers below can safely
        // dereference urlInfo (nothing needs cleanup at this point).
        if (urlInfo == null)
        {
            continue;
        }

        HttpWebRequest request = null;
        HttpWebResponse response = null;
        IPProxy curIPProxy = null;

        try
        {
            // Automatic throttling with a random 1-5 second delay.
            if (this.Settings.AutoSpeedLimit)
            {
                int span = this.random.Next(1000, 5000);
                Thread.Sleep(span);
            }

            // Create and configure the web request. The "as" cast yields null for
            // non-HTTP schemes, so the request is only configured and sent when
            // the cast succeeded.
            request = WebRequest.Create(urlInfo.UrlString) as HttpWebRequest;
            if (request != null)
            {
                // ConfigRequest returns the proxy used for this request.
                curIPProxy = this.ConfigRequest(request);
                response = request.GetResponse() as HttpWebResponse;
            }

            if (response != null)
            {
                this.PersistenceCookie(response);

                Stream stream = null;

                // Decompress the body when the server sent it gzip-compressed.
                // Content-coding tokens are case-insensitive, so "GZIP" must match too.
                if (string.Equals(response.ContentEncoding, "gzip", StringComparison.OrdinalIgnoreCase))
                {
                    Stream responseStream = response.GetResponseStream();
                    if (responseStream != null)
                    {
                        stream = new GZipStream(responseStream, CompressionMode.Decompress);
                    }
                }
                else
                {
                    stream = response.GetResponseStream();
                }

                // The using statement disposes the stream (a null stream is allowed),
                // so no explicit Close call is needed.
                using (stream)
                {
                    string html = this.ParseContent(stream, response.CharacterSet);
                    this.ParseLinks(urlInfo, html);

                    // Copy the delegate first so an unsubscribe on another thread
                    // cannot null it between the check and the call.
                    var dataReceived = this.DataReceivedEvent;
                    if (dataReceived != null)
                    {
                        dataReceived(
                            new DataReceivedEventArgs
                            {
                                Url = urlInfo.UrlString,
                                Depth = urlInfo.Depth,
                                Html = html,
                                IpProx = curIPProxy,
                                urlInfo = urlInfo
                            });
                    }
                }
            }
        }
        catch (WebException webEx)
        {
            var ev = new CrawlErrorEventArgs
            {
                Url = urlInfo.UrlString,
                Depth = urlInfo.Depth,
                Exception = webEx,
                IpProx = curIPProxy,
                urlInfo = urlInfo
            };

            // Timeouts, protocol errors and gateway-style messages request a proxy change.
            // (Marking the proxy as unusable via Settings.SetUnviableIP is currently disabled.)
            if (webEx.Status == WebExceptionStatus.Timeout
                || webEx.Status == WebExceptionStatus.ProtocolError
                || webEx.Message.Contains("远程服务器返回错误")
                || webEx.Message.Contains("网关"))
            {
                ev.needChangeIp = true;
            }

            ev.needTryAgain = true;

            var crawlError = this.CrawlErrorEvent;
            if (crawlError != null)
            {
                crawlError(ev);
            }
        }
        catch (Exception exception)
        {
            var errorEV = new CrawlErrorEventArgs
            {
                Url = urlInfo.UrlString,
                Depth = urlInfo.Depth,
                Exception = exception,
                IpProx = curIPProxy,
                urlInfo = urlInfo
            };

            // Timeout / remote-server-error messages request a proxy change.
            // (Marking the proxy as unusable via Settings.SetUnviableIP is currently disabled.)
            if (exception.Message.Contains("超时") || exception.Message.Contains("远程服务器返回错误"))
            {
                errorEV.needChangeIp = true;
            }

            errorEV.needTryAgain = true;

            var crawlError = this.CrawlErrorEvent;
            if (crawlError != null)
            {
                crawlError(errorEV);
            }
        }
        finally
        {
            // Always release the connection, whether the request succeeded or failed.
            if (request != null)
            {
                request.Abort();
            }

            if (response != null)
            {
                response.Close();
            }
        }
    }
}