/// <summary>
/// Form load handler. Reads every row of getNetworkData..ArticleParam with isqy=1,
/// turns each row into a <c>WebInfo</c> (with its <c>RequestHeaders</c>) appended to
/// <c>wInfoList</c>, then starts the <c>getTask</c> thread.
/// If anything throws while loading, the message is shown in a message box and the
/// thread is not started.
/// </summary>
/// <param name="sender">Event source (unused).</param>
/// <param name="e">Event data (unused).</param>
private void CrawlerForm_Load(object sender, EventArgs e)
{
    try
    {
        DataTable paramTable = db.selectDatas("SELECT * FROM getNetworkData..ArticleParam WHERE isqy=1");

        foreach (DataRow row in paramTable.Rows)
        {
            WebInfo site = new WebInfo();
            site.WebName = row["WebName"].ToString();
            site.RkUrl = row["RkUrl"].ToString();

            RequestHeaders headers = new RequestHeaders();
            headers.UserAgent = row["UserAgent"].ToString();
            headers.Accept = row["Accept"].ToString();
            headers.Referer = row["Referer"].ToString();
            // Throws for an unknown charset name; handled by the catch below.
            headers.Charset = Encoding.GetEncoding(row["Charset"].ToString());
            site.RequestInfo = headers;

            site.Pattern = row["Pattern"].ToString();
            site.PageNumPattern = row["PageNumPattern"].ToString();

            // PageSize is only assigned when the column holds a valid integer;
            // otherwise the WebInfo keeps whatever default it was constructed with.
            int parsedPageSize;
            if (int.TryParse(row["PageSize"].ToString(), out parsedPageSize))
            {
                site.PageSize = parsedPageSize;
            }

            wInfoList.Add(site);
        }
    }
    catch (Exception loadError)
    {
        MessageBox.Show("异常:" + loadError.Message);
        return;
    }

    // Disabled: duibiTime = DateTime.Now.AddDays(-1).ToString("yyyy.MM.dd");
    getTaskThread = new Thread(getTask);
    getTaskThread.Start();
}
/// <summary>
/// Collection method: walks the search-result pages of one site for one keyword and
/// queues an INSERT statement for every matched article that contains the keyword
/// and is not older than <c>duibiTime</c>.
/// </summary>
/// <remarks>
/// Side effects visible in this method: appends to <c>SQLStringList</c> and increments
/// <c>cjCount</c> under <c>objLockSql</c>, flushes the list through
/// <c>db.ExecuteSqlTran</c> once it holds 200 or more statements, refreshes
/// <c>label2</c> via <c>Invoke</c>, and writes caught exceptions to the log file.
/// Does nothing when <paramref name="webName"/> is not found in <c>wInfoList</c>.
/// </remarks>
/// <param name="keyWords">Keyword to search for.</param>
/// <param name="webName">Site name; matched against <c>WebInfo.WebName</c>.</param>
private void getArticleList(string keyWords, string webName)
{
    const string logPath = @"C:\Log_ArticleCrawler.log";

    WebInfo wInfo = wInfoList.Find(w => w.WebName == webName);
    if (wInfo == null)
    {
        return;
    }

    // For these two sites the keyword is URL-encoded as GB2312 before substitution.
    string keyWordsTemp = keyWords;
    if (webName == "中国创新网" || webName == "万维家电")
    {
        keyWordsTemp = HttpUtility.UrlEncode(keyWords, Encoding.GetEncoding("GB2312"));
    }
    string rkUrl = wInfo.RkUrl.Replace("[keyWords]", keyWordsTemp);

    int pageTotal = 2;
    bool keepPaging = true;
    MatchCollection mcLast = null;

    // Kept across iterations on purpose: the "中国创新网" branch reads the NEXT-page
    // link out of the PREVIOUS page's HTML. (It used to be reset to "" on every
    // iteration, so that branch could never get past page 1.)
    string html = "";

    for (int i = 1; keepPaging && i < pageTotal; i++)
    {
        string url = "";
        try
        {
            // ---- Build the URL of page i ----
            if (webName == "百度学术" || webName == "techweb")
            {
                // {p} is a zero-based offset in steps of 10.
                url = rkUrl.Replace("{p}", ((i - 1) * 10).ToString());
            }
            else if (webName == "光明家电")
            {
                // {p} is a zero-based page index.
                url = rkUrl.Replace("{p}", (i - 1).ToString());
            }
            else if (webName == "中国创新网" && i > 1)
            {
                Regex nextLink = new Regex("(?<=href=\").*?(?=\" class=\"f1\">下一页)");
                url = nextLink.Match(html).Value.Trim();
                if (url == "")
                {
                    break; // no "next page" link on the previous page
                }
                url = "http://www.chinahightech.com/search/" + url;
            }
            else
            {
                url = rkUrl.Replace("{p}", i.ToString());
            }

            html = cClass.getHtmlCode(url, wInfo.RequestInfo);

            // ---- Work out how many pages to visit ----
            if (!string.IsNullOrEmpty(wInfo.PageNumPattern))
            {
                string pageNumText = Regex.Match(html, wInfo.PageNumPattern).Value;
                // Strip ASCII / full-width comma separators and spaces before parsing.
                pageNumText = pageNumText.Replace(",", "").Replace("，", "").Replace(" ", "").Trim();

                int totalCount;
                // PageSize <= 0 is treated like an unparsable count; dividing by it
                // would throw and skip the whole page.
                if (wInfo.PageSize > 0 && int.TryParse(pageNumText, out totalCount))
                {
                    pageTotal = totalCount / wInfo.PageSize;
                    if (pageTotal >= 10 && pageTotal < 50)
                    {
                        pageTotal = pageTotal / 2 + 1;
                    }
                    else if (pageTotal >= 50)
                    {
                        pageTotal = 11;
                    }
                }
                else
                {
                    pageTotal = 2;
                }
            }
            else
            {
                // No page-count pattern: keep extending the loop bound by one; paging
                // then ends through one of the break / keepPaging paths below.
                pageTotal += 1;
            }

            // ---- Extract the result entries ----
            MatchCollection mList = Regex.Matches(html, wInfo.Pattern);
            if (mList == null || mList.Count == 0)
            {
                break;
            }

            // NOTE(review): cfBool presumably compares this page's matches with the
            // previous page's to detect a repeated page — confirm in cClass. A null
            // result ends paging.
            mcLast = cClass.cfBool(mList, mcLast);
            if (mcLast == null)
            {
                break;
            }

            foreach (Match match in mList)
            {
                try
                {
                    ArticleInfo aInfo = new ArticleInfo();
                    aInfo.Url = url;
                    aInfo.KeyWords = keyWords;
                    aInfo.WebName = wInfo.WebName;

                    // Example raw value: "<source name> 2015-05-19 01:55:52"
                    string sourceBy = match.Groups["sourceBy"].Value.Replace("\r", "").Replace("\n", "").Trim();
                    sourceBy = r.Replace(sourceBy, "").Trim();

                    // ---- Date handling ----
                    if (wInfo.WebName == "新浪科技" || wInfo.WebName == "techweb")
                    {
                        // The timestamp is embedded in the source text for these sites.
                        string riqi = Regex.Match(sourceBy, @"\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}").Value;
                        DateTime published;
                        if (riqi.Length >= 10 && DateTime.TryParse(riqi.Substring(0, 10), out published))
                        {
                            aInfo.Riqi = published.ToString("yyyy.MM.dd");
                        }
                        // When no timestamp was parsed Riqi is left untouched. Assuming it
                        // defaults to null (TODO confirm in ArticleInfo), Convert.ToDateTime
                        // yields DateTime.MinValue and the crawl stops here, as before.
                        if (Convert.ToDateTime(aInfo.Riqi) < Convert.ToDateTime(duibiTime))
                        {
                            keepPaging = false;
                            break;
                        }
                        aInfo.SourceBy = sourceBy.Replace(riqi, "").Trim();
                    }
                    else
                    {
                        DateTime published = Convert.ToDateTime(match.Groups["riqi"].Value.Trim());
                        aInfo.Riqi = published.ToString("yyyy.MM.dd");
                        aInfo.SourceBy = sourceBy;

                        if (wInfo.WebName == "百度学术")
                        {
                            try
                            {
                                // Compare by year only. (Previously int.Parse was applied to
                                // the "yyyy.MM.dd" text, which always threw and stopped the
                                // crawl at the first article.)
                                if (published.Year < int.Parse(duibiTime.Substring(0, 4)))
                                {
                                    keepPaging = false;
                                    break;
                                }
                            }
                            catch
                            {
                                // Cut-off year unreadable: stop rather than crawl unbounded.
                                keepPaging = false;
                                break;
                            }
                        }
                        else if (string.IsNullOrEmpty(wInfo.PageNumPattern))
                        {
                            // Without a page count, the first too-old article ends paging.
                            if (Convert.ToDateTime(aInfo.Riqi) < Convert.ToDateTime(duibiTime))
                            {
                                keepPaging = false;
                                break;
                            }
                        }
                    }

                    aInfo.Title = r.Replace(match.Groups["title"].Value, "").Trim();
                    aInfo.Content = r.Replace(match.Groups["content"].Value, "").Trim();

                    // Keep only entries that mention the keyword and do not contain "车".
                    string text = aInfo.Title + aInfo.Content;
                    if (!containStr(text, aInfo.KeyWords) || text.Contains("车"))
                    {
                        continue;
                    }
                    if (Convert.ToDateTime(aInfo.Riqi) < Convert.ToDateTime(duibiTime))
                    {
                        continue;
                    }

                    aInfo.Click = match.Groups["click"].Value.Trim();
                    aInfo.SourceUrl = r.Replace(match.Groups["sourceUrl"].Value.Trim(), "").Trim();
                    if (webName == "千龙网")
                    {
                        aInfo.SourceUrl = "http://www.chinaso.com" + aInfo.SourceUrl;
                    }
                    else if (webName == "中国创新网")
                    {
                        aInfo.SourceUrl = "http://www.chinahightech.com/" + aInfo.SourceUrl;
                    }

                    // Every value is scraped or caller-supplied text, so each one is quoted
                    // with embedded apostrophes doubled before being placed in the statement.
                    string insertSql =
                        "INSERT INTO getNetworkData..ArticleTable_刘大任(Kid,Did,抓取网页,来源,访问数量,日期,标题,内容摘要,关键词,新闻链接,渠道) VALUES("
                        + string.Join(",", new string[]
                        {
                            quoteSqlLiteral(Kid),
                            quoteSqlLiteral(Did),
                            quoteSqlLiteral(aInfo.Url),
                            quoteSqlLiteral(aInfo.SourceBy),
                            quoteSqlLiteral(aInfo.Click),
                            quoteSqlLiteral(aInfo.Riqi),
                            quoteSqlLiteral(aInfo.Title),
                            quoteSqlLiteral(aInfo.Content),
                            quoteSqlLiteral(keyWords),
                            quoteSqlLiteral(aInfo.SourceUrl),
                            quoteSqlLiteral(aInfo.WebName)
                        })
                        + ")";

                    lock (objLockSql)
                    {
                        SQLStringList.Add(insertSql);
                        cjCount += 1;

                        // Flush in batches of 200; the list is only reset when the
                        // transaction reports success.
                        if (SQLStringList.Count >= 200 && db.ExecuteSqlTran(SQLStringList))
                        {
                            SQLStringList = new ArrayList();
                        }

                        this.Invoke(new ThreadStart(delegate()
                        {
                            this.label2.Text = "当前已处理关键词:[" + keyWordsCountDone + " / " + keyWordsCount + "] 共采集数据:" + cjCount + " 条";
                        }));
                    }
                }
                catch (Exception ecp)
                {
                    // A failure on one entry is logged and the next entry is processed.
                    writeLog(logPath, DateTime.Now.ToString("yyyy-MM-dd HH:mm:ss") + " 采集方法异常:" + ecp.Message + " url=[" + url + "]\r\n");
                }
            }
        }
        catch (Exception ecp)
        {
            // A failure on one page is logged and the loop moves on to the next page.
            writeLog(logPath, DateTime.Now.ToString("yyyy-MM-dd HH:mm:ss") + " 采集方法异常:" + ecp.Message + "\r\n");
        }
    }
}

/// <summary>
/// Formats a value as a single-quoted SQL string literal, doubling embedded single
/// quotes so the value cannot terminate the literal early. Null becomes ''.
/// </summary>
private static string quoteSqlLiteral(object value)
{
    string text = value == null ? "" : value.ToString();
    return "'" + text.Replace("'", "''") + "'";
}