Ejemplo n.º 1
0
        /// <summary>
        /// Worker loop executed by one crawl thread: dequeues URLs from <c>UrlQueue.Instance</c>,
        /// fetches each page with the fetcher selected by <c>Settings.CrawlMode</c>, hands the
        /// HTML to <c>ParseLinks</c> and raises <c>DataReceivedEvent</c>; failures are reported
        /// through <c>CrawlErrorEvent</c> with <c>needTryAgain</c> set.
        /// </summary>
        /// <remarks>
        /// The loop exits only when the queue is empty and no slot of <c>threadStatus</c> is
        /// <c>false</c>; otherwise an idle worker sleeps for two seconds and polls again.
        /// </remarks>
        /// <param name="threadIndex">
        /// Boxed <see cref="int"/> index of this worker's slot in <c>threadStatus</c>.
        /// </param>
        private void CrawlProcess(object threadIndex)
        {
            var currentThreadIndex = (int)threadIndex;

            while (true)
            {
                // Use the queue length and the per-thread idle flags to decide
                // whether this worker should sleep or terminate.
                if (UrlQueue.Instance.Count == 0)
                {
                    this.threadStatus[currentThreadIndex] = true;
                    if (!this.threadStatus.Any(t => t == false))
                    {
                        break;
                    }

                    Thread.Sleep(2000);
                    continue;
                }

                this.threadStatus[currentThreadIndex] = false;

                // Second look at the queue right before dequeuing; presumably narrows the
                // window in which another worker empties it first - confirm against UrlQueue.
                if (UrlQueue.Instance.Count == 0)
                {
                    continue;
                }

                UrlInfo urlInfo = UrlQueue.Instance.DeQueue();

                var curIPProxy = Settings.GetIPProxy();

                // Nothing was dequeued: go back to polling. Checking here (rather than inside
                // the try) guarantees urlInfo is non-null in both catch blocks below.
                if (urlInfo == null)
                {
                    continue;
                }

                try
                {
                    // Automatic rate limiting: random pause, 1-5 seconds by default.
                    if (this.Settings.AutoSpeedLimit)
                    {
                        try
                        {
                            // Configured bounds can only raise the defaults, never lower them.
                            int minDelay = Math.Max(1000, this.Settings.AutoSpeedLimitMinMSecond);
                            int maxDelay = Math.Max(5000, this.Settings.AutoSpeedLimitMaxMSecond);

                            // Random.Next(min, max) throws when min > max, which happens when
                            // the configured minimum exceeds the effective maximum. Fall back
                            // to a fixed pause of minDelay in that case.
                            if (maxDelay < minDelay)
                            {
                                maxDelay = minDelay;
                            }

                            Thread.Sleep(this.random.Next(minDelay, maxDelay));
                        }
                        catch (Exception ex)
                        {
                            // Same type and message as before; the original exception is now
                            // kept as InnerException so its stack trace is not lost.
                            throw new Exception("AutoSpeedLimit" + ex.Message, ex);
                        }
                    }

                    string html = string.Empty;
                    switch (Settings.CrawlMode)
                    {
                    case EnumCrawlMode.PhantomJsViaSelenium:
                        html = GetPhantomJsResult(urlInfo);
                        break;

                    case EnumCrawlMode.HttpHelper:
                    case EnumCrawlMode.SuperWebClient:
                        // Both modes share this branch; the UseSuperWebClient flag, not the
                        // mode value, picks the fetcher.
                        html = Settings.UseSuperWebClient
                            ? GetSupperHttpResult(urlInfo)
                            : GetHttpResult(urlInfo);
                        break;
                    }

                    if (!string.IsNullOrEmpty(html))
                    {
                        this.ParseLinks(urlInfo, html);
                    }

                    // Snapshot the delegate so an unsubscribe between the null test and the
                    // call cannot cause a NullReferenceException. The event is raised even
                    // when html is empty.
                    var onDataReceived = this.DataReceivedEvent;
                    if (onDataReceived != null)
                    {
                        onDataReceived(
                            new DataReceivedEventArgs
                        {
                            Url     = urlInfo.UrlString,
                            Depth   = urlInfo.Depth,
                            Html    = html,
                            IpProx  = curIPProxy,
                            urlInfo = urlInfo
                        });
                    }
                }
                catch (WebException webEx)
                {
                    var ev = new CrawlErrorEventArgs
                    {
                        Url       = urlInfo.UrlString,
                        Depth     = urlInfo.Depth,
                        Exception = webEx,
                        IpProx    = curIPProxy,
                        urlInfo   = urlInfo
                    };

                    // Timeouts, protocol errors and the matched server/gateway messages flag
                    // the event so the handler can switch to another proxy.
                    if (webEx.Status == WebExceptionStatus.Timeout
                        || webEx.Status == WebExceptionStatus.ProtocolError
                        || webEx.Message.Contains("远程服务器返回错误")
                        || webEx.Message.Contains("网关"))
                    {
                        ev.needChangeIp = true;
                    }

                    ev.needTryAgain = true;

                    var onCrawlError = this.CrawlErrorEvent;
                    if (onCrawlError != null)
                    {
                        onCrawlError(ev);
                    }
                }
                catch (Exception exception)
                {
                    var errorEV = new CrawlErrorEventArgs
                    {
                        Url       = urlInfo.UrlString,
                        Depth     = urlInfo.Depth,
                        Exception = exception,
                        IpProx    = curIPProxy,
                        urlInfo   = urlInfo
                    };

                    // Non-WebException failures are classified by message text only.
                    if (exception.Message.Contains("超时") || exception.Message.Contains("远程服务器返回错误"))
                    {
                        errorEV.needChangeIp = true;
                    }

                    errorEV.needTryAgain = true;

                    var onCrawlError = this.CrawlErrorEvent;
                    if (onCrawlError != null)
                    {
                        onCrawlError(errorEV);
                    }
                }
            }
        }
Ejemplo n.º 2
0
 /// <summary>
 /// Crawl-error callback that currently does nothing with the error it receives.
 /// </summary>
 /// <param name="args">
 /// Details of the failed crawl; not read by this handler.
 /// </param>
 /// <remarks>
 /// NOTE(review): presumably subscribed to a crawler's <c>CrawlErrorEvent</c> - confirm at
 /// the subscription site, which is not visible here.
 /// </remarks>
 private void Master_CrawlErrorEvent(CrawlErrorEventArgs args)
 {
     // Empty body: errors passed to this handler are dropped.
 }
Ejemplo n.º 3
0
        /// <summary>
        /// Variant of the crawl worker loop that performs the download itself with
        /// <see cref="HttpWebRequest"/>: dequeues URLs from <c>UrlQueue.Instance</c>, fetches
        /// and (if gzip-encoded) decompresses the response, passes the HTML to
        /// <c>ParseLinks</c> and raises <c>DataReceivedEvent</c>; failures are reported through
        /// <c>CrawlErrorEvent</c> with <c>needTryAgain</c> set.
        /// </summary>
        /// <remarks>
        /// After every URL the request is aborted and the response closed in <c>finally</c>.
        /// The loop exits only when the queue is empty and no slot of <c>threadStatus</c> is
        /// <c>false</c>; otherwise an idle worker sleeps for two seconds and polls again.
        /// </remarks>
        /// <param name="threadIndex">
        /// Boxed <see cref="int"/> index of this worker's slot in <c>threadStatus</c>.
        /// </param>
        private void CrawlProcess_Abort(object threadIndex)
        {
            var currentThreadIndex = (int)threadIndex;

            while (true)
            {
                // Use the queue length and the per-thread idle flags to decide
                // whether this worker should sleep or terminate.
                if (UrlQueue.Instance.Count == 0)
                {
                    this.threadStatus[currentThreadIndex] = true;
                    if (!this.threadStatus.Any(t => t == false))
                    {
                        break;
                    }

                    Thread.Sleep(2000);
                    continue;
                }

                this.threadStatus[currentThreadIndex] = false;

                // Second look at the queue right before dequeuing; presumably narrows the
                // window in which another worker empties it first - confirm against UrlQueue.
                if (UrlQueue.Instance.Count == 0)
                {
                    continue;
                }

                UrlInfo urlInfo = UrlQueue.Instance.DeQueue();

                // Nothing was dequeued: go back to polling. No request or response exists yet,
                // so skipping the try/finally changes nothing, and it guarantees urlInfo is
                // non-null in both catch blocks below.
                if (urlInfo == null)
                {
                    continue;
                }

                HttpWebRequest  request    = null;
                HttpWebResponse response   = null;
                IPProxy         curIPProxy = null;
                try
                {
                    // Automatic rate limiting: random pause of 1-5 seconds.
                    if (this.Settings.AutoSpeedLimit)
                    {
                        Thread.Sleep(this.random.Next(1000, 5000));
                    }

                    // Create and configure the web request.
                    // NOTE(review): the cast yields null for non-HTTP(S) URLs and ConfigRequest
                    // is still called with it - confirm that ConfigRequest tolerates null.
                    request    = WebRequest.Create(urlInfo.UrlString) as HttpWebRequest;
                    curIPProxy = this.ConfigRequest(request); // returns the proxy in use

                    if (request != null)
                    {
                        response = request.GetResponse() as HttpWebResponse;
                    }

                    if (response != null)
                    {
                        this.PersistenceCookie(response);

                        // HTTP content-coding tokens are case-insensitive, so "GZIP" or "Gzip"
                        // must be decompressed as well.
                        bool isGzip = string.Equals(
                            response.ContentEncoding, "gzip", StringComparison.OrdinalIgnoreCase);

                        Stream stream = response.GetResponseStream();
                        if (isGzip && stream != null)
                        {
                            stream = new GZipStream(stream, CompressionMode.Decompress);
                        }

                        // Disposing the (possibly wrapping) stream also closes it; no explicit
                        // Close() call is needed. A null stream is accepted by using.
                        using (stream)
                        {
                            string html = this.ParseContent(stream, response.CharacterSet);

                            this.ParseLinks(urlInfo, html);

                            // Snapshot the delegate so an unsubscribe between the null test
                            // and the call cannot cause a NullReferenceException.
                            var onDataReceived = this.DataReceivedEvent;
                            if (onDataReceived != null)
                            {
                                onDataReceived(
                                    new DataReceivedEventArgs
                                {
                                    Url     = urlInfo.UrlString,
                                    Depth   = urlInfo.Depth,
                                    Html    = html,
                                    IpProx  = curIPProxy,
                                    // Populated for parity with CrawlProcess, so handlers that
                                    // read args.urlInfo work with either worker.
                                    urlInfo = urlInfo
                                });
                            }
                        }
                    }
                }
                catch (WebException webEx)
                {
                    var ev = new CrawlErrorEventArgs
                    {
                        Url       = urlInfo.UrlString,
                        Depth     = urlInfo.Depth,
                        Exception = webEx,
                        IpProx    = curIPProxy,
                        urlInfo   = urlInfo
                    };

                    // Timeouts, protocol errors and the matched server/gateway messages flag
                    // the event so the handler can switch to another proxy.
                    if (webEx.Status == WebExceptionStatus.Timeout
                        || webEx.Status == WebExceptionStatus.ProtocolError
                        || webEx.Message.Contains("远程服务器返回错误")
                        || webEx.Message.Contains("网关"))
                    {
                        ev.needChangeIp = true;
                    }

                    ev.needTryAgain = true;

                    var onCrawlError = this.CrawlErrorEvent;
                    if (onCrawlError != null)
                    {
                        onCrawlError(ev);
                    }
                }
                catch (Exception exception)
                {
                    var errorEV = new CrawlErrorEventArgs
                    {
                        Url       = urlInfo.UrlString,
                        Depth     = urlInfo.Depth,
                        Exception = exception,
                        IpProx    = curIPProxy,
                        urlInfo   = urlInfo
                    };

                    // Non-WebException failures are classified by message text only.
                    if (exception.Message.Contains("超时") || exception.Message.Contains("远程服务器返回错误"))
                    {
                        errorEV.needChangeIp = true;
                    }

                    errorEV.needTryAgain = true;

                    var onCrawlError = this.CrawlErrorEvent;
                    if (onCrawlError != null)
                    {
                        onCrawlError(errorEV);
                    }
                }
                finally
                {
                    // Always release the connection, whether the fetch succeeded or failed.
                    if (request != null)
                    {
                        request.Abort();
                    }

                    if (response != null)
                    {
                        response.Close();
                    }
                }
            }
        }