/// <summary>
/// Downloads the page at <paramref name="motherUrl"/>, walks every hyperlink found on it,
/// fetches each qualifying link, runs the keyword parser over the fetched page and
/// persists whatever records the parser returns.
/// </summary>
/// <param name="motherUrl">Address of the page whose hyperlinks are examined.</param>
/// <param name="similar">
/// Reference URL used as a filter. When null or empty every link is followed; otherwise
/// only links whose similarity degree to this value is at least 0.70 are followed.
/// </param>
/// <param name="keys">Keyword table handed to the parser unchanged.</param>
/// <param name="sheng">Handed to the parser unchanged (presumably the province; confirm against the parser).</param>
/// <param name="shi">Handed to the parser unchanged (presumably the city; confirm against the parser).</param>
/// <param name="xian">Handed to the parser unchanged (presumably the county; confirm against the parser).</param>
/// <param name="name">Handed to the parser and to the recursive call.</param>
/// <param name="catchSub">
/// When true, every followed link is itself crawled one level deeper. The recursive call
/// always passes false, so the crawl never goes beyond that one extra level.
/// </param>
private void BasicWebSpider(string motherUrl, string similar, DataTable keys, string sheng, string shi, string xian, string name, bool catchSub)
{
    // Minimum similarity degree a link must reach to be followed when a reference URL is given.
    const double SimilarityThreshold = 0.70;

    // Pause between page requests; some sites block clients that request pages too quickly.
    // The price is a long overall crawl time.
    const int RequestDelayMilliseconds = 2000;

    HtmlParse.Parse parse = new HtmlParse.Parse();

    // Fetch the start page; without any markup there are no links to follow.
    string webInfo = HtmlUtil.getHtml(motherUrl, "");
    if (string.IsNullOrEmpty(webInfo))
    {
        return;
    }

    // Extract every anchor element from the start page.
    string[] strA = HtmlUtil.GetElementsByTagName(webInfo, "a");
    if (strA == null)
    {
        return;
    }

    // Examine the links one by one.
    for (int i = 0; i < strA.Length; i++)
    {
        // Stop as soon as the application signals that it is closing.
        if (Program.ProClose == true)
        {
            break;
        }

        // Keep the UI message loop pumping while this long-running loop executes.
        Application.DoEvents();

        string url = CrawlHtml.processUrl(motherUrl, strA[i]);
        if (string.IsNullOrEmpty(url))
        {
            // Nothing to fetch; skip without paying the request delay.
            continue;
        }

        bool shouldFollow = string.IsNullOrEmpty(similar)
            || HtmlUtil.getSimilarDegree(similar, url) >= SimilarityThreshold;
        if (!shouldFollow)
        {
            continue;
        }

        Thread.Sleep(RequestDelayMilliseconds);

        // Download the linked page.
        webInfo = HtmlUtil.getHtml(url, "");
        if (string.IsNullOrEmpty(webInfo))
        {
            continue;
        }

        // Crawl the links of the linked page as well (one level only).
        if (catchSub)
        {
            BasicWebSpider(url, similar, keys, sheng, shi, xian, name, false);
        }

        // Let the parser decide whether the linked page matches the keywords.
        // The `name` parameter is passed here; the original passed the ambient `Name`
        // member, which left the parameter unused outside the recursive call.
        // NOTE(review): the trailing 7 is an opaque code understood by the parser - confirm its meaning.
        List<ModelReleaseInfo> newsInfos = parse.ParseGeneralWeb(strA[i], url, keys, sheng, shi, xian, name, webInfo, 7);
        if (newsInfos != null && newsInfos.Count > 0)
        {
            // Store the matches in the database.
            DataPersistenceControl.GetInstance().Add(newsInfos);
        }
    }
}