示例#1
0
 /// <summary>
 /// Decides whether <paramref name="url"/> should be accepted. When the downloader
 /// reports the URL as a local file, the file name taken from the URL is searched for
 /// (recursively) under <paramref name="searchForlder"/>; an existing match means the
 /// file is a duplicate.
 /// </summary>
 /// <param name="url">Address that is handed to the downloader and whose file name is looked up.</param>
 /// <param name="searchForlder">Root directory that is searched, including all sub-directories.</param>
 /// <returns>
 /// <c>false</c> when the URL is a local file and a file with the same name already
 /// exists under <paramref name="searchForlder"/>; otherwise <c>true</c>.
 /// </returns>
 private static bool AuthurFile(string url, string searchForlder)
 {
     using (HtmlDownloader downloader = new HtmlDownloader(url, Properties.Resources.FileFilter))
     {
         // Only local files take part in the duplicate check.
         if (!downloader.IsLocalFile)
         {
             return true;
         }

         // De-duplicate: look for a file with the same name anywhere below the search folder.
         string fileName = Path.GetFileName(url);
         string[] matches = Directory.GetFiles(searchForlder, fileName, SearchOption.AllDirectories);

         return matches == null || matches.Length == 0;
     }
 }
示例#2
0
        /// <summary>
        /// Click handler of the "spider" button. Validates the user input, hands the base
        /// folder, the crawl limit and the root URLs to <c>UrlManager</c> and then starts one
        /// background worker thread per configured thread count. Each worker repeatedly pops
        /// a URL from <c>UrlManager</c> and starts an asynchronous download for it.
        /// </summary>
        /// <param name="sender">Event source (not used).</param>
        /// <param name="e">Event data (not used).</param>
        private void btnSpider_Click(object sender, EventArgs e)
        {
            // Pause taken by a worker that currently has nothing to do, so that idle
            // workers do not spin a CPU core at 100%.
            const int idleDelayMilliseconds = 100;

            _baseForlder = txtBasePath.Text;

            // AttachControl may not implement IHtmlSpider; treat that like "no root URLs"
            // instead of failing with a NullReferenceException.
            IHtmlSpider spider = AttachControl as IHtmlSpider;
            string[] rootUrl = spider == null ? null : spider.RootUrls;
            if (string.IsNullOrEmpty(_baseForlder) || rootUrl == null || rootUrl.Length == 0)
            {
                return;
            }

            // Validate the numeric input BEFORE the UI is locked: a parse failure after
            // SetControlState(false) would leave the controls disabled.
            int limitedCount;
            if (!int.TryParse(txtNum.Text, out limitedCount))
            {
                MyConsole.AppendLine("爬取数量必须是有效的整数。");
                return;
            }

            SetControlState(false);

            MyConsole.AppendLine("爬取程序正在启动...");
            // Collect the parameters.
            MyConsole.AppendLine("开始收集爬虫需要的参数>>>");

            _limitedCount = limitedCount;
            pnlRight.Controls.Add(_chart); // show the report/chart control
            MyConsole.AppendSign();
            MyConsole.AppendLine("开始爬取..");

            _startTime              = DateTime.Now; // remember when crawling started
            UrlManager.BaseForlder  = _baseForlder;
            UrlManager.LimitedCount = _limitedCount;

            UrlManager.AddNewUrls(rootUrl[0], rootUrl);  // register the root addresses

            // Start the configured number of crawler threads.
            for (int i = 0; i < txtThreadNum.Value; i++)
            {
                Thread spiderThead = new Thread(new ThreadStart(delegate
                {
                    while (true)
                    {
                        // A limit of 0 means "unlimited". Once the limit is exceeded the
                        // worker stays alive but idles instead of busy-waiting.
                        if (_limitedCount != 0 && _totalCount > _limitedCount)
                        {
                            Thread.Sleep(idleDelayMilliseconds);
                            continue;
                        }

                        // Nothing queued right now: wait a little before polling again.
                        if (!UrlManager.HasUrl)
                        {
                            Thread.Sleep(idleDelayMilliseconds);
                            continue;
                        }

                        string url = UrlManager.PopOneUrl();
                        try
                        {
                            // Set up the downloader for this URL and hook its events.
                            HtmlDownloader download       = new HtmlDownloader(url, Properties.Resources.FileFilter, Encoding.UTF8);
                            download.OnDownloadChanged   += download_OnDownloadChanged;
                            download.OnDownloadCompleted += download_OnDownloadCompleted;
                            download.OnDownloadErrored   += download_OnDownloadErrored;
                            if (download.HasResponse)
                            {
                                MyConsole.AppendLine(string.Format("开始爬取Url:{0},时间:{1}", url, DateTime.Now));
                                download.DownloadAsync();
                                UpdateGridView(url, SpiderState.爬取中);
                            }
                            else
                            {
                                MyConsole.AppendLine(string.Format("爬取Url:{0}失败,异常原因:远程链接失败,时间:{1}", url, DateTime.Now));
                                UpdateGridView(url, SpiderState.失败);
                            }
                        }
                        catch (Exception ex)
                        {
                            // Report the failure for this URL and keep the worker running.
                            MyConsole.AppendLine(string.Format("爬取Url:{0}失败,异常原因:{1},时间:{2}", url, ex.Message, DateTime.Now));
                            UpdateGridView(url, SpiderState.失败);
                        }
                    }
                }));
                spiderThead.Name         = "SpiderThead" + i;
                spiderThead.IsBackground = true;
                spiderThead.Start();
            }
        }