/// <summary>
/// Decides whether <paramref name="url"/> should be accepted, rejecting local files
/// whose file name is already present somewhere under <paramref name="searchForlder"/>.
/// </summary>
/// <param name="url">Address handed to the <c>HtmlDownloader</c>; its file-name part is used as the search pattern.</param>
/// <param name="searchForlder">Root directory that is searched recursively for an existing file of the same name.</param>
/// <returns>
/// <c>false</c> when the downloader reports <c>IsLocalFile</c> and at least one matching
/// file is found under <paramref name="searchForlder"/>; otherwise <c>true</c>.
/// </returns>
private static bool AuthurFile(string url, string searchForlder)
{
    bool accepted = true;

    using (HtmlDownloader probe = new HtmlDownloader(url, Properties.Resources.FileFilter))
    {
        if (probe.IsLocalFile)
        {
            // De-duplicate: look for a file with the same name anywhere below the search folder.
            string fileName = Path.GetFileName(url);
            string[] duplicates = Directory.GetFiles(searchForlder, fileName, SearchOption.AllDirectories);
            accepted = duplicates == null || duplicates.Length == 0;
        }
    }

    return accepted;
}
/// <summary>
/// Click handler that starts the crawl: reads the settings from the form, seeds
/// <c>UrlManager</c> with the root URLs and launches the background spider threads.
/// </summary>
/// <param name="sender">Event source (unused).</param>
/// <param name="e">Event data (unused).</param>
/// <remarks>
/// All user input is validated before the UI is disabled, so an invalid value can no
/// longer leave the form in a disabled state or surface as an unhandled exception.
/// </remarks>
private void btnSpider_Click(object sender, EventArgs e)
{
    _baseForlder = txtBasePath.Text;

    // "as" yields null when the attached control is not a spider; bail out instead of dereferencing null.
    IHtmlSpider spider = AttachControl as IHtmlSpider;
    if (spider == null)
    {
        return;
    }

    string[] rootUrl = spider.RootUrls;
    if (string.IsNullOrEmpty(_baseForlder) || rootUrl == null || rootUrl.Length == 0)
    {
        return;
    }

    // Validate the crawl limit up front; int.Parse used to throw after the controls were already disabled.
    int limit;
    if (!int.TryParse(txtNum.Text, out limit))
    {
        MyConsole.AppendLine("爬取数量无效,请输入整数");
        return;
    }

    SetControlState(false);
    MyConsole.AppendLine("爬取程序正在启动...");

    // Collect parameters.
    MyConsole.AppendLine("开始收集爬虫需要的参数>>>");
    _limitedCount = limit;
    pnlRight.Controls.Add(_chart); // show the report/chart control
    MyConsole.AppendSign();
    MyConsole.AppendLine("开始爬取..");
    _startTime = DateTime.Now; // remember when the crawl started

    UrlManager.BaseForlder = _baseForlder;
    UrlManager.LimitedCount = _limitedCount;
    UrlManager.AddNewUrls(rootUrl[0], rootUrl); // seed with the root addresses

    // One background thread per requested spider worker.
    for (int i = 0; i < txtThreadNum.Value; i++)
    {
        Thread spiderThead = new Thread(new ThreadStart(SpiderWorkerLoop));
        spiderThead.Name = "SpiderThead" + i;
        spiderThead.IsBackground = true;
        spiderThead.Start();
    }
}

/// <summary>
/// Body of a spider thread: repeatedly takes a URL from <c>UrlManager</c> and starts a download for it.
/// </summary>
/// <remarks>
/// The loop never exits on its own; the threads are created as background threads by the caller.
/// When the limit is exceeded or no URL is queued, the thread sleeps briefly instead of
/// spinning, which previously kept a CPU core fully busy per worker.
/// </remarks>
private void SpiderWorkerLoop()
{
    const int IdleDelayMs = 100;

    while (true)
    {
        bool limitReached = _limitedCount != 0 && _totalCount > _limitedCount;
        if (limitReached || !UrlManager.HasUrl)
        {
            Thread.Sleep(IdleDelayMs);
            continue;
        }

        // NOTE(review): HasUrl and PopOneUrl are separate calls; whether UrlManager makes
        // this check-then-pop safe across threads cannot be seen from here - confirm.
        string url = UrlManager.PopOneUrl();
        StartSpiderDownload(url);
    }
}

/// <summary>
/// Creates a downloader for <paramref name="url"/>, wires up its events and starts the
/// download, reporting the outcome to the console and the grid view.
/// </summary>
/// <param name="url">Address popped from <c>UrlManager</c>.</param>
private void StartSpiderDownload(string url)
{
    try
    {
        // Initialise the downloader for this URL.
        HtmlDownloader download = new HtmlDownloader(url, Properties.Resources.FileFilter, Encoding.UTF8);
        download.OnDownloadChanged += download_OnDownloadChanged;
        download.OnDownloadCompleted += download_OnDownloadCompleted;
        download.OnDownloadErrored += download_OnDownloadErrored;

        if (download.HasResponse)
        {
            MyConsole.AppendLine(string.Format("开始爬取Url:{0},时间:{1}", url, DateTime.Now));
            download.DownloadAsync();
            UpdateGridView(url, SpiderState.爬取中);
        }
        else
        {
            MyConsole.AppendLine(string.Format("爬取Url:{0}失败,异常原因:远程链接失败,时间:{1}", url, DateTime.Now));
            UpdateGridView(url, SpiderState.失败);
        }
    }
    catch (Exception ex)
    {
        // Best effort per URL: log the failure and let the worker move on to the next one.
        MyConsole.AppendLine(string.Format("爬取Url:{0}失败,异常原因:{1},时间:{2}", url, ex.Message, DateTime.Now));
        UpdateGridView(url, SpiderState.失败);
    }
}