/// <summary>
/// Downloads <paramref name="motherUrl"/>, enumerates its anchor elements, fetches every
/// link that passes the similarity filter and stores whatever
/// <c>HtmlParse.Parse.ParseGeneralWeb</c> reports for the fetched page.
/// </summary>
/// <param name="motherUrl">Page whose anchors are enumerated.</param>
/// <param name="similar">
/// Reference URL used for filtering. When null or empty every link is followed; otherwise
/// only links whose similarity degree against it is at least 0.70.
/// </param>
/// <param name="keys">Keyword table, forwarded unchanged to the parser.</param>
/// <param name="sheng">Forwarded unchanged to the parser (presumably the province; confirm).</param>
/// <param name="shi">Forwarded unchanged to the parser (presumably the city; confirm).</param>
/// <param name="xian">Forwarded unchanged to the parser (presumably the county; confirm).</param>
/// <param name="name">Forwarded unchanged to the parser (presumably the site name; confirm).</param>
/// <param name="catchSub">
/// When true, every followed link is crawled one level deeper; the nested call always
/// passes false, so the recursion depth is at most one.
/// </param>
private void BasicWebSpider(string motherUrl, string similar, DataTable keys, string sheng, string shi, string xian, string name, bool catchSub)
{
    HtmlParse.Parse parse = new HtmlParse.Parse();

    // Source of the page that lists the candidate links.
    string motherHtml = HtmlUtil.getHtml(motherUrl, "");
    if (string.IsNullOrEmpty(motherHtml))
    {
        // Nothing was downloaded, so there are no anchors to walk.
        return;
    }

    // Every hyperlink element found on the mother page.
    string[] anchors = HtmlUtil.GetElementsByTagName(motherHtml, "a");
    if (anchors == null)
    {
        return;
    }

    foreach (string anchor in anchors)
    {
        // Stop as soon as the application signals that it is closing.
        if (Program.ProClose == true)
        {
            break;
        }

        Application.DoEvents();

        string url = CrawlHtml.processUrl(motherUrl, anchor);

        bool follow = string.IsNullOrEmpty(similar)
            || HtmlUtil.getSimilarDegree(similar, url) >= 0.70;
        if (!follow)
        {
            continue;
        }

        // Some sites block visitors that request pages too quickly, so wait two
        // seconds before every fetch (at the cost of a long total crawl time).
        Thread.Sleep(2000);

        string webInfo = HtmlUtil.getHtml(url, "");
        if (string.IsNullOrEmpty(webInfo))
        {
            continue;
        }

        // Crawl the links of the sub page too, but never deeper than one level.
        if (catchSub)
        {
            BasicWebSpider(url, similar, keys, sheng, shi, xian, name, false);
        }

        // Pass the caller-supplied name parameter, not the instance-level Name member.
        // NOTE(review): the trailing 7 is presumably the source/category id (the disabled
        // UI refresh used the same value); confirm against ParseGeneralWeb.
        List<ModelReleaseInfo> newsInfos = parse.ParseGeneralWeb(anchor, url, keys, sheng, shi, xian, name, webInfo, 7);
        if (newsInfos != null && newsInfos.Count > 0)
        {
            // Persist the matches.
            DataPersistenceControl.GetInstance().Add(newsInfos);
        }
    }
}
/// <summary>
/// Runs one Sogou search per configured keyword, hands the result page to
/// <c>HtmlParse.Parse.ParseSogouBlog</c> and persists the records it returns.
/// </summary>
/// <remarks>
/// Keywords come from the <c>Keywords</c> table: all rows when <c>selectKID</c> is -1,
/// otherwise only the rows with that <c>kid</c>. The thread sleeps for
/// <c>Interval2m</c> after every keyword to avoid being blocked by the search engine.
/// </remarks>
private void BlogWebSpider()
{
    MySqlCmd cmd = new MySqlCmd();

    // -1 means "all keyword groups".
    DataTable dtkey = selectKID == -1
        ? cmd.GetTabel("select * from Keywords")
        : cmd.GetTabel("select * from Keywords where kid=" + selectKID);

    HtmlParse.Parse parse = new HtmlParse.Parse();

    foreach (DataRow row in dtkey.Rows)
    {
        // When a single keyword name is selected, skip every other row.
        if (selectKID != -1 && selectKwName != "全部"
            && row["name"].ToString().Trim() != selectKwName)
        {
            continue;
        }

        string keyword = row["KeyWord"].ToString().Trim();
        int kid = 0;
        int.TryParse(row["kid"].ToString().Trim(), out kid);

        // Escape the keyword so reserved characters (&, #, +, %) cannot corrupt the query string.
        string url = "http://www.sogou.com/web?interation=196647&query=" + Uri.EscapeDataString(keyword) + "&ie=utf8";
        string html = HtmlUtil.HttpGet(url, Encoding.UTF8);

        List<ModelReleaseInfo> mris = parse.ParseSogouBlog(html, keyword, kid);
        if (mris != null && mris.Count > 0)
        {
            DataPersistenceControl.GetInstance().Add(mris);
        }

        // Throttle to avoid being blacklisted.
        Thread.Sleep(Interval2m);
    }
}
/// <summary>
/// For every configured keyword, pages through the web search results of Baidu, Bing,
/// Sogou, Zhongsou and Haosou (in that order) and persists the records returned by the
/// matching <c>HtmlParse.Parse</c> method.
/// </summary>
/// <remarks>
/// Keywords come from the <c>Keywords</c> table: all rows when <c>selectKID</c> is -1,
/// otherwise only the rows with that <c>kid</c>. Paging of one engine stops at the first
/// page that yields no records. After every stored page the thread sleeps for the
/// engine-specific interval to avoid being blocked.
/// </remarks>
private void BaiduWebWebSpider()
{
    MySqlCmd cmd = new MySqlCmd();

    // -1 means "all keyword groups".
    DataTable dtkey = selectKID == -1
        ? cmd.GetTabel("select * from Keywords")
        : cmd.GetTabel("select * from Keywords where kid=" + selectKID);

    HtmlParse.Parse parse = new HtmlParse.Parse();

    foreach (DataRow row in dtkey.Rows)
    {
        // When a single keyword name is selected, skip every other row.
        if (selectKID != -1 && selectKwName != "全部"
            && row["name"].ToString().Trim() != selectKwName)
        {
            continue;
        }

        string keyword = row["KeyWord"].ToString().Trim();
        int kid = 0;
        int.TryParse(row["kid"].ToString().Trim(), out kid);

        // Encode once per keyword and use the encoded form in every query string, so
        // reserved characters in the keyword cannot break any of the search URLs.
        string encodeKey = CrawlHtml.UrlEncode(keyword);

        // Baidu: up to 5 pages, pn is the zero-based result offset.
        for (int page = 0; page < 5; page++)
        {
            string url = string.Format(@"http://www.baidu.com/s?wd={0}&pn={1}&ie=utf-8", encodeKey, page * 10);
            string html = HtmlUtil.HttpGet(url, Encoding.UTF8);
            List<ModelReleaseInfo> mris = parse.ParseBaiduWeb(html, keyword, kid);
            if (mris == null || mris.Count == 0)
            {
                break;
            }

            DataPersistenceControl.GetInstance().Add(mris);
            // Throttle to avoid being blacklisted.
            Thread.Sleep(Interval50s);
        }

        // Bing: up to 10 pages, first is the one-based index of the first result.
        for (int page = 0; page < 10; page++)
        {
            string url = string.Format("http://cn.bing.com/search?q={0}&first={1}&FORM=PERE", encodeKey, page * 10 + 1);
            string html = HtmlUtil.HttpGet(url, Encoding.UTF8);
            List<ModelReleaseInfo> mris = parse.ParseBingWeb(html, keyword, kid);
            if (mris == null || mris.Count == 0)
            {
                break;
            }

            DataPersistenceControl.GetInstance().Add(mris);
            // Throttle to avoid being blacklisted.
            Thread.Sleep(Interval30s);
        }

        // Sogou: up to 10 pages, one-based page number.
        for (int page = 0; page < 10; page++)
        {
            string url = string.Format("http://www.sogou.com/web?query={0}&page={1}&ie=utf8", encodeKey, page + 1);
            string html = HtmlUtil.HttpGet(url, Encoding.UTF8);
            List<ModelReleaseInfo> mris = parse.ParseSogouWeb(html, keyword, kid);
            if (mris == null || mris.Count == 0)
            {
                break;
            }

            DataPersistenceControl.GetInstance().Add(mris);
            // Throttle to avoid being blacklisted.
            Thread.Sleep(Interval2m);
        }

        // Zhongsou: up to 10 pages; the response is decoded with the system default encoding.
        for (int page = 0; page < 10; page++)
        {
            string url = string.Format("http://www.zhongsou.com/third?w={0}&b={1}", encodeKey, page + 1);
            string html = HtmlUtil.HttpGet(url, Encoding.Default);
            List<ModelReleaseInfo> mris = parse.ParseZhongsouWeb(html, keyword, kid);
            if (mris == null || mris.Count == 0)
            {
                break;
            }

            DataPersistenceControl.GetInstance().Add(mris);
            // Throttle to avoid being blacklisted.
            Thread.Sleep(Interval2m);
        }

        // Haosou: up to 10 pages, one-based page number.
        for (int page = 0; page < 10; page++)
        {
            string url = string.Format("http://www.haosou.com/s?q={0}&pn={1}", encodeKey, page + 1);
            string html = HtmlUtil.HttpGet(url, Encoding.UTF8);
            List<ModelReleaseInfo> mris = parse.ParseHaosouWeb(html, keyword, kid);
            if (mris == null || mris.Count == 0)
            {
                break;
            }

            DataPersistenceControl.GetInstance().Add(mris);
            // Throttle to avoid being blacklisted.
            Thread.Sleep(Interval50s);
        }
    }
}
/// <summary>
/// Runs one Sogou WeChat article search per configured keyword, hands the result page
/// (together with the cookies collected by the request) to
/// <c>HtmlParse.Parse.ParseSogouWeixin</c> and persists the records it returns.
/// </summary>
/// <remarks>
/// Keywords come from the <c>Keywords</c> table: all rows when <c>selectKID</c> is -1,
/// otherwise only the rows with that <c>kid</c>. The thread sleeps for
/// <c>Interval2m</c> after every keyword to avoid being blocked.
/// </remarks>
private void WeixinWebSpider()
{
    MySqlCmd cmd = new MySqlCmd();

    // -1 means "all keyword groups".
    DataTable dtkey = selectKID == -1
        ? cmd.GetTabel("select * from Keywords")
        : cmd.GetTabel("select * from Keywords where kid=" + selectKID);

    HtmlParse.Parse parse = new HtmlParse.Parse();

    foreach (DataRow row in dtkey.Rows)
    {
        // When a single keyword name is selected, skip every other row.
        if (selectKID != -1 && selectKwName != "全部"
            && row["name"].ToString().Trim() != selectKwName)
        {
            continue;
        }

        string keyword = row["KeyWord"].ToString().Trim();
        int kid = 0;
        int.TryParse(row["kid"].ToString().Trim(), out kid);

        // Escape the keyword so reserved characters (&, #, +, %) cannot corrupt the query string.
        string url = "http://weixin.sogou.com/weixin?type=2&query=" + Uri.EscapeDataString(keyword) + "&ie=utf8";

        // The container and cookie string filled by this request are handed on to the parser.
        CookieContainer cookies = new CookieContainer();
        string strCookie = "";
        string html = HtmlUtil.HttpGet(url, Encoding.UTF8, cookies, "weixin.sogou.com", ref strCookie);

        List<ModelReleaseInfo> mris = parse.ParseSogouWeixin(html, keyword, kid, cookies, strCookie);
        if (mris != null && mris.Count > 0)
        {
            DataPersistenceControl.GetInstance().Add(mris);
        }

        // Throttle to avoid being blacklisted (2 minutes).
        Thread.Sleep(Interval2m);
    }
}
/// <summary>
/// For every configured keyword, fetches the first Sina Weibo search page and then pages
/// through the Zhongsou micro-blog search, persisting the records returned by
/// <c>ParseSinaWeibo</c> and <c>ParseZhongsouWeibo</c>.
/// </summary>
/// <remarks>
/// Keywords come from the <c>Keywords</c> table: all rows when <c>selectKID</c> is -1,
/// otherwise only the rows with that <c>kid</c>. Zhongsou paging stops at the first page
/// that yields no records.
/// </remarks>
private void WeiboWebSpider()
{
    MySqlCmd cmd = new MySqlCmd();

    // -1 means "all keyword groups".
    DataTable dtkey = selectKID == -1
        ? cmd.GetTabel("select * from Keywords")
        : cmd.GetTabel("select * from Keywords where kid=" + selectKID);

    HtmlParse.Parse parse = new HtmlParse.Parse();

    foreach (DataRow row in dtkey.Rows)
    {
        // When a single keyword name is selected, skip every other row.
        if (selectKID != -1 && selectKwName != "全部"
            && row["name"].ToString().Trim() != selectKwName)
        {
            continue;
        }

        string keyword = row["KeyWord"].ToString().Trim();
        int kid = 0;
        int.TryParse(row["kid"].ToString().Trim(), out kid);

        // Encoded once and used for both sites so reserved characters cannot break either URL.
        String encodeKey = CrawlHtml.UrlEncode(keyword);

        // Sina Weibo: first result page only.
        string sinaUrl = "http://s.weibo.com/weibo/" + encodeKey + "?topnav=1&wvr=6&b=1&page=1";
        string sinaHtml = HtmlUtil.HttpGet(sinaUrl, Encoding.UTF8);
        List<ModelReleaseInfo> sinaInfos = parse.ParseSinaWeibo(sinaHtml, keyword, kid);
        if (sinaInfos != null && sinaInfos.Count > 0)
        {
            DataPersistenceControl.GetInstance().Add(sinaInfos);
        }

        // Throttle to avoid being blacklisted by Weibo; applied whether or not anything was found.
        Thread.Sleep(Interval30s);

        // Zhongsou micro-blog search: up to 10 pages; the response is decoded with the
        // system default encoding.
        for (int page = 0; page < 10; page++)
        {
            string url = string.Format("http://t.zhongsou.com/wb?w={0}&b={1}", encodeKey, page + 1);
            string html = HtmlUtil.HttpGet(url, Encoding.Default);
            List<ModelReleaseInfo> mris = parse.ParseZhongsouWeibo(html, keyword, kid);
            if (mris == null || mris.Count == 0)
            {
                break;
            }

            DataPersistenceControl.GetInstance().Add(mris);
            // Throttle to avoid being blacklisted.
            Thread.Sleep(Interval2m);
        }
    }
}
/// <summary>
/// Runs one Baidu Tieba search per configured keyword, hands the result page to
/// <c>HtmlParse.Parse.ParseBaiduTieba</c> and persists the records it returns.
/// </summary>
/// <remarks>
/// Keywords come from the <c>Keywords</c> table: all rows when <c>selectKID</c> is -1,
/// otherwise only the rows with that <c>kid</c>. Only the first 60 results are requested;
/// paging is not handled. The thread sleeps for <c>Interval50s</c> after every keyword
/// to avoid being blocked.
/// </remarks>
private void TiebaWebSpider()
{
    MySqlCmd cmd = new MySqlCmd();

    // -1 means "all keyword groups".
    DataTable dtkey = selectKID == -1
        ? cmd.GetTabel("select * from Keywords")
        : cmd.GetTabel("select * from Keywords where kid=" + selectKID);

    HtmlParse.Parse parse = new HtmlParse.Parse();

    foreach (DataRow row in dtkey.Rows)
    {
        // When a single keyword name is selected, skip every other row.
        if (selectKID != -1 && selectKwName != "全部"
            && row["name"].ToString().Trim() != selectKwName)
        {
            continue;
        }

        string keyword = row["KeyWord"].ToString().Trim();
        int kid = 0;
        int.TryParse(row["kid"].ToString().Trim(), out kid);

        // rn=60 requests the first 60 hits. The keyword is escaped so reserved
        // characters (&, #, +, %) cannot corrupt the query string.
        string url = "http://tieba.baidu.com/f/search/res?ie=utf-8&rn=60&qw=" + Uri.EscapeDataString(keyword);
        string html = HtmlUtil.HttpGet(url, Encoding.Default);

        List<ModelReleaseInfo> mris = parse.ParseBaiduTieba(html, keyword, kid);
        if (mris != null && mris.Count > 0)
        {
            DataPersistenceControl.GetInstance().Add(mris);
        }

        // Throttle to avoid being blacklisted.
        Thread.Sleep(Interval50s);
    }
}
/// <summary>
/// For every configured keyword, queries the news search of Baidu (single request) and
/// then pages through Bing, Sogou, Zhongsou and Haosou news, persisting the records
/// returned by the matching <c>HtmlParse.Parse</c> method.
/// </summary>
/// <remarks>
/// Keywords come from the <c>Keywords</c> table: all rows when <c>selectKID</c> is -1,
/// otherwise only the rows with that <c>kid</c>. Paging of one engine stops at the first
/// page that yields no records. After every stored page the thread sleeps for the
/// engine-specific interval to avoid being blocked.
/// </remarks>
private void MediaWebSpider()
{
    MySqlCmd cmd = new MySqlCmd();

    // -1 means "all keyword groups".
    DataTable dtkey = selectKID == -1
        ? cmd.GetTabel("select * from Keywords")
        : cmd.GetTabel("select * from Keywords where kid=" + selectKID);

    HtmlParse.Parse parse = new HtmlParse.Parse();

    foreach (DataRow row in dtkey.Rows)
    {
        // When a single keyword name is selected, skip every other row.
        if (selectKID != -1 && selectKwName != "全部"
            && row["name"].ToString().Trim() != selectKwName)
        {
            continue;
        }

        string keyword = row["KeyWord"].ToString().Trim();
        int kid = 0;
        int.TryParse(row["kid"].ToString().Trim(), out kid);

        // Escape once per keyword so reserved characters (&, #, +, %) cannot corrupt
        // any of the query strings below.
        string encodedKeyword = Uri.EscapeDataString(keyword);

        // Baidu news: one request asking for 100 results; no delay follows because the
        // next request goes to a different site.
        string baiduUrl = "http://news.baidu.com/ns?rn=100&word=" + encodedKeyword;
        string baiduHtml = HtmlUtil.HttpGet(baiduUrl, Encoding.UTF8);
        List<ModelReleaseInfo> baiduInfos = parse.ParseBaiduNews(baiduHtml, keyword, kid);
        if (baiduInfos != null && baiduInfos.Count > 0)
        {
            DataPersistenceControl.GetInstance().Add(baiduInfos);
        }

        // Bing news: up to 10 pages, first is the one-based index of the first result.
        for (int page = 0; page < 10; page++)
        {
            string url = string.Format("http://cn.bing.com/news/search?q={0}&first={1}&FORM=PENR", encodedKeyword, page * 10 + 1);
            string html = HtmlUtil.HttpGet(url, Encoding.UTF8);
            List<ModelReleaseInfo> mris = parse.ParseBingNews(html, keyword, kid);
            if (mris == null || mris.Count == 0)
            {
                break;
            }

            DataPersistenceControl.GetInstance().Add(mris);
            // Throttle to avoid being blacklisted.
            Thread.Sleep(Interval50s);
        }

        // Sogou news: up to 10 pages; the response is decoded with the system default encoding.
        for (int page = 0; page < 10; page++)
        {
            string url = string.Format("http://news.sogou.com/news?query={0}&page={1}", encodedKeyword, page + 1);
            string html = HtmlUtil.HttpGet(url, Encoding.Default);
            List<ModelReleaseInfo> mris = parse.ParseSogouNews(html, keyword, kid);
            if (mris == null || mris.Count == 0)
            {
                break;
            }

            DataPersistenceControl.GetInstance().Add(mris);
            // Throttle to avoid being blacklisted (2 minutes).
            Thread.Sleep(Interval2m);
        }

        // Zhongsou news: up to 10 pages; the response is decoded with the system default encoding.
        for (int page = 0; page < 10; page++)
        {
            string url = string.Format("http://zixun.zhongsou.com/n?w={0}&b={1}", encodedKeyword, page + 1);
            string html = HtmlUtil.HttpGet(url, Encoding.Default);
            List<ModelReleaseInfo> mris = parse.ParseZhongsouNews(html, keyword, kid);
            if (mris == null || mris.Count == 0)
            {
                break;
            }

            DataPersistenceControl.GetInstance().Add(mris);
            // Throttle to avoid being blacklisted (2 minutes).
            Thread.Sleep(Interval2m);
        }

        // Haosou news: up to 10 pages, one-based page number.
        for (int page = 0; page < 10; page++)
        {
            string url = string.Format("http://news.haosou.com/ns?q={0}&pn={1}&tn=news&rank=rank&j=0", encodedKeyword, page + 1);
            string html = HtmlUtil.HttpGet(url, Encoding.UTF8);
            List<ModelReleaseInfo> mris = parse.ParseHaosouNews(html, keyword, kid);
            if (mris == null || mris.Count == 0)
            {
                break;
            }

            DataPersistenceControl.GetInstance().Add(mris);
            // Throttle to avoid being blacklisted.
            Thread.Sleep(Interval50s);
        }
    }
}