/// <summary>
/// Reloads the Weibo grid (<c>dvWBlog</c>) with the 300 rows of ReleaseInfowb
/// that have the highest uid. While the query runs, the status label tells the
/// user that a Weibo search is in progress; afterwards it reports completion.
/// </summary>
private void reFreshWeibo()
{
    // Skip the refresh entirely when Program.ProClose is set
    // (presumably the application is shutting down - confirm).
    if (Program.ProClose)
    {
        return;
    }

    // Status label: "searching" state, applied via BeginInvoke.
    MethodInvoker showSearching = delegate()
    {
        lbweibo.Visible = true;
        lbweibo.Text = "正在搜索微博数据...";
        lbweibo.ForeColor = Color.DarkBlue;
    };
    this.BeginInvoke(showSearching);

    // Newest 300 records first (ordered by uid, descending).
    string latestWeiboSql = "select uid,releasename AS title,contexts,releasedate,infosource,keywords,releasename,collectdate,snapshot,webname,pid,part,reposts,comments from ReleaseInfowb order by uid desc limit 0,300";
    DataBaseServer.SQLitecommand database = new SQLitecommand();
    DataTable latestWeibo = database.GetTabel(latestWeiboSql);

    // Bind the fresh table and switch the label to the "finished" state.
    MethodInvoker showResults = delegate()
    {
        dvWBlog.DataSource = latestWeibo;
        dvWBlog.Refresh();
        lbweibo.Text = "一轮搜索完毕!";
        lbweibo.ForeColor = Color.Red;
    };
    this.BeginInvoke(showResults);
}
/// <summary>
/// Runs one Baidu News search per event defined in the Keywords table, adds
/// every result that is not already known to <c>dtqueryinfo</c> and queues an
/// INSERT statement for it in <c>sb</c>.
/// </summary>
/// <remarks>
/// <para>Keywords rows are grouped by "&lt;Name&gt;-&lt;kid&gt;"; the keywords of one
/// group are joined with commas and sent as a single query.</para>
/// <para>Results older than 30 days are skipped. As soon as one result of a
/// query is reported as already present by <c>UrlThereare</c>, the remaining
/// results of that query are skipped as well.</para>
/// <para>Label updates and grid refreshes go through <c>BeginInvoke</c>.
/// Errors are appended to log.txt and processing continues.</para>
/// </remarks>
private void GetBaiduInfo()
{
    this.BeginInvoke(new MethodInvoker(delegate()
    {
        lbAll.Text = "";
        lbAll.Visible = true;
    }));

    SQLitecommand cmd = new SQLitecommand();

    // Load the keyword list.
    DataTable dtkey = cmd.GetTabel("select * from Keywords");
    // NOTE(review): the partword table is loaded but never read in this
    // method; the query is kept only so the database access pattern is unchanged.
    DataTable dtParts = cmd.GetTabel("SELECT * FROM partword");

    // Group keywords by event. Key: "<Name>-<kid>", value: comma-separated keywords.
    Dictionary<string, string> keywords = new Dictionary<string, string>();
    foreach (DataRow keyRow in dtkey.Rows)
    {
        string eventKey = keyRow["Name"].ToString().Trim() + "-" + keyRow["kid"].ToString().Trim();
        string word = keyRow["KeyWord"].ToString().Trim();

        string joined;
        if (keywords.TryGetValue(eventKey, out joined))
        {
            keywords[eventKey] = joined + "," + word;
        }
        else
        {
            keywords.Add(eventKey, word);
        }
    }

    // Regular expression used to pull the target URL out of a result's first <a> element.
    string aa = "https?://.[^\"]+";

    // Pending INSERT statements, flushed in batches.
    sb = new StringBuilder();

    // One search per event.
    foreach (KeyValuePair<string, string> kv in keywords)
    {
        // Do not start further searches once shutdown has been requested.
        if (Program.ProClose)
        {
            break;
        }

        string k = kv.Key;
        string keys = kv.Value;
        // The event name is the part of the key in front of the first '-'.
        string eventName = k.Substring(0, k.IndexOf("-"));

        this.BeginInvoke(new MethodInvoker(delegate()
        {
            lbAll.Text = "正在搜索事件为<" + eventName + ">的数据.";
            lbAll.ForeColor = Color.DarkBlue;
        }));

        // Build the query URL and collect the elements with class "result".
        string url = "http://news.baidu.com/ns?rn=20&word=" + keys;
        List<string> lis = HtmlUtil.GetElementsByClassList(HtmlUtil.getHtml(url, "utf-8"), "result");

        // Nothing retrieved for this event: go on with the next one. This used
        // to be "return" for a null list, which abandoned the remaining events,
        // the final flush of pending inserts and the "finished" status update.
        if (lis == null || lis.Count <= 0)
        {
            continue;
        }

        for (int i = 0; i < lis.Count; i++)
        {
            if (Program.ProClose)
            {
                break;
            }

            ModelReleaseInfo mri = new ModelReleaseInfo();

            // Release date: text of the first <span class="g">, starting at the
            // first '2' and 17 characters long.
            string[] sDate = HtmlUtil.GetElementsByTagAndClass(lis[i], "span", "g");
            if (sDate.Length <= 0)
            {
                continue;
            }

            string rawDate = HtmlUtil.NoHTML(sDate[0]);
            int dateStart = rawDate == null ? -1 : rawDate.IndexOf('2');
            // Without this guard an unexpected date string made Substring throw
            // outside of any try block and terminated the whole run.
            if (dateStart < 0 || rawDate.Length - dateStart < 17)
            {
                continue;
            }
            mri.ReleaseDate = rawDate.Substring(dateStart, 17);

            // Baidu's snapshot date is sometimes only 9 or 8 characters long;
            // in that case trim the candidate step by step.
            DateTime ddt;
            if (!DateTime.TryParse(mri.ReleaseDate, out ddt))
            {
                mri.ReleaseDate = mri.ReleaseDate.Substring(1, 9);
                if (!DateTime.TryParse(mri.ReleaseDate, out ddt))
                {
                    mri.ReleaseDate = mri.ReleaseDate.Substring(1, 8);
                }
            }

            // Normalise the date; a result whose date cannot be parsed is logged and skipped.
            DateTime released;
            try
            {
                released = DateTime.Parse(mri.ReleaseDate);
            }
            catch (Exception ex)
            {
                LogBaiduError(ex);
                continue;
            }
            mri.ReleaseDate = released.ToString("yyyy-MM-dd HH:mm:ss");

            // Only keep results from the last 30 days.
            if (released < DateTime.Now.AddDays(-30))
            {
                continue;
            }

            try
            {
                // Title, summary and source link of the result.
                mri.Title = HtmlUtil.NoHTML(HtmlUtil.GetElementsByTagName(lis[i], "h3")[0]);
                string[] temp = HtmlUtil.GetElementsByClass(lis[i], "c-summary");
                // No summary element: skip this result.
                if (temp.Length == 0)
                {
                    continue;
                }
                mri.Contexts = HtmlUtil.NoHTML(temp[0]);
                mri.InfoSource = HtmlUtil.GetListByHtml("", HtmlUtil.GetElementsByTagName(lis[i], "a")[0], aa)[0];

                // Duplicate check. Once one result is already known, the rest of
                // this query's results are skipped too (the original did this
                // with a flag that turned every later iteration into a no-op).
                if (UrlThereare(mri.Title, this.dtqueryinfo, dtWebQueryInfo, false) != 0)
                {
                    break;
                }

                mri.KeyWords = k;
                mri.CollectDate = DateTime.Now.ToString("yyyy-MM-dd HH:mm:ss");
                mri.Snapshot = "";
                mri.ReleaseName = "";
                mri.Sheng = "";
                mri.Shi = "";
                mri.Xian = "";
                mri.WebName = "百度";
                mri.Pid = 0;
                mri.Part = GetParts(mri.Contexts);
                mri.Comments = 0;
                mri.Reposts = 0;

                // New grid row; its number is one more than the number in the grid's last row.
                DataRow dr = dtqueryinfo.NewRow();
                if (dvAll.RowCount == 0)
                {
                    dr[0] = 1;
                }
                else
                {
                    dr[0] = int.Parse(dvAll.Rows[dvAll.RowCount - 1].Cells[0].Value.ToString()) + 1;
                }
                dr[1] = mri.Title;
                dr[2] = mri.Contexts;
                dr[3] = mri.ReleaseDate;
                dr[4] = mri.InfoSource;
                dr[5] = mri.KeyWords.Substring(0, k.IndexOf("-"));
                dr[6] = mri.ReleaseName;
                dr[7] = mri.CollectDate;
                dr[8] = mri.Snapshot;
                dr[9] = mri.WebName;
                dr[10] = mri.Pid;
                dr[11] = mri.Part;
                dr[12] = mri.Reposts;
                dr[13] = mri.Comments;

                // Newest row on top; keep at most 500 rows. The old test
                // (Count >= 500) called RemoveAt(500) when only indices 0..499
                // existed, which threw on reaching exactly 500 rows.
                dtqueryinfo.Rows.InsertAt(dr, 0);
                if (dtqueryinfo.Rows.Count > 500)
                {
                    dtqueryinfo.Rows.RemoveAt(500);
                }

                this.BeginInvoke(new MethodInvoker(delegate()
                {
                    dvAll.Refresh();
                }));
            }
            catch (Exception ex)
            {
                // NOTE(review): as before, a failure here is only logged and the
                // record is still queued for insertion below - confirm this is intended.
                LogBaiduError(ex);
            }

            try
            {
                // Queue the INSERT statement for this result.
                sb.Append(tri.GetInsString(mri) + ";");

                // Flush the queue whenever the result index is a multiple of 10.
                if (i % 10 == 0)
                {
                    cmd.ExecuteNonQuery(sb.ToString());
                    sb.Clear();
                }
            }
            catch (Exception ex)
            {
                LogBaiduError(ex);
            }
        }
    }

    // Flush whatever is still queued.
    try
    {
        if (sb.Length != 0)
        {
            cmd.ExecuteNonQuery(sb.ToString());
            sb.Clear();
        }
    }
    catch (Exception ex)
    {
        LogBaiduError(ex);
    }

    // Reload the stored Baidu records once the run is complete.
    dtWebQueryInfo = tri.SelReleaseInfo(DateTime.Now.ToString("yyyy-MM-dd HH:mm:ss"), DateTime.Now.AddDays(-15).ToString("yyyy-MM-dd HH:mm:ss"), "0 AND webName='百度'");

    this.BeginInvoke(new MethodInvoker(delegate()
    {
        lbAll.Text = "一轮搜索完毕!";
        lbAll.ForeColor = Color.Red;
    }));
}

/// <summary>
/// Appends one error entry to log.txt: the current date, the exception
/// message and the SQL text currently queued in <c>sb</c>.
/// </summary>
/// <param name="ex">The exception to record.</param>
private void LogBaiduError(Exception ex)
{
    // "using" closes the file even when one of the writes fails.
    using (StreamWriter sw = File.AppendText("log.txt"))
    {
        sw.WriteLine(DateTime.Now.ToLongDateString());
        sw.WriteLine("begin");
        sw.WriteLine(ex.Message);
        sw.WriteLine(sb.ToString());
        sw.WriteLine("end");
        sw.WriteLine("");
    }
}
/// <summary>
/// Collects forum (tieba) posts from the sites listed in WebAddress with pid=5.
/// </summary>
/// <remarks>
/// <para>For every site the page at "url" is downloaded and its anchors are
/// examined. An anchor is followed when its link is at least 0.70 similar to
/// the site's "likeurl" value, does not match the front-page pattern and is
/// not reported as already present by <c>UrlThereare</c>.</para>
/// <para>Events are matched against the anchor markup first and, when that
/// yields nothing, against the text of the downloaded post. Posts without any
/// matching event are discarded.</para>
/// <para>Accepted posts are added to <c>dttiebainfo</c> and an INSERT statement
/// is queued in <c>sb</c>. Label updates and grid refreshes go through
/// <c>BeginInvoke</c>; errors are written with <c>Comm.WriteErrorLog</c>.</para>
/// </remarks>
private void GetWebTieBaInfo()
{
    this.BeginInvoke(new MethodInvoker(delegate()
    {
        lbtieba.Text = "";
        lbtieba.Visible = true;
    }));

    DataBaseServer.SQLitecommand cmd = new SQLitecommand();

    // Keyword list.
    DataTable dtkey = cmd.GetTabel("select * from Keywords");
    // NOTE(review): the partword table is loaded but never read in this
    // method; the query is kept only so the database access pattern is unchanged.
    DataTable dtParts = cmd.GetTabel("SELECT * FROM partword");
    // Sites to crawl, including their reference ("likeurl") link.
    DataTable dtXs = cmd.GetTabel("Select * from WebAddress WHERE pid=5");

    // Pattern for links that are treated as front pages and skipped.
    string strTopFormat = "https?://.+/";
    // Pattern used to extract the URL from an anchor.
    string strURLformat = "https?://.[^\"]+";

    // Pending INSERT statements, flushed in batches.
    sb = new StringBuilder();

    // Crawl every configured site.
    for (int xs = 0; xs < dtXs.Rows.Count; xs++)
    {
        // Do not start on further sites once shutdown has been requested.
        if (Program.ProClose)
        {
            break;
        }

        DataRow site = dtXs.Rows[xs];

        // Read the name now: the delegate below runs asynchronously, and
        // capturing the loop counter "xs" let it observe a later (or
        // out-of-range) index by the time it executed.
        string siteName = site["name"].ToString();
        this.BeginInvoke(new MethodInvoker(delegate()
        {
            lbtieba.ForeColor = Color.DarkBlue;
            lbtieba.Text = "正在搜索:" + siteName;
        }));

        // Download the site's page and read its reference link.
        string siteUrl = site["url"].ToString();
        string webInfo = HtmlUtil.getHtml(siteUrl, "");
        string Similar = site["likeurl"].ToString();

        // All distinct anchors of the page.
        List<string> strList = HtmlUtil.GetElementsByTagNameList(webInfo, "a");
        if (strList == null)
        {
            continue;
        }
        TbReleaseInfo ri = new TbReleaseInfo();
        string[] strA = GetLIstDate(strList.Distinct());

        // Examine the anchors one by one.
        for (int i = 0; i < strA.Length; i++)
        {
            if (Program.ProClose)
            {
                break;
            }
            Application.DoEvents();

            // Events matched for this anchor ("<col 1>-<col 6>" of Keywords).
            Dictionary<string, int> events = new Dictionary<string, int>();
            ModelReleaseInfo newsInfo = new ModelReleaseInfo();

            try
            {
                // The anchor text becomes the title.
                newsInfo.Title = HtmlUtil.NoHTML(strA[i]);

                // First matching pass: against the anchor markup.
                MatchTieBaEvents(dtkey, strA[i], events, false);
                AppendTieBaEvents(newsInfo, events);

                // Replace the anchor markup by its URL ("" when none is found).
                string[] _a = HtmlUtil.GetListByHtml(siteUrl, strA[i], strURLformat);
                strA[i] = _a.Length > 0 ? _a[0] : "";
                strA[i] = HtmlUtil.UrlCl(strA[i]);

                // Links containing a single quote are re-extracted up to the quote.
                if (strA[i].IndexOf("'") != -1)
                {
                    strA[i] = HtmlUtil.GetstringByHtmlArray(strA[i], "https?://.[^\']+");
                }
            }
            catch (Exception)
            {
                // Anchor could not be processed; go on with the next one.
                continue;
            }

            // Only links at least 0.70 similar to the reference link are followed.
            if (!(HtmlUtil.getSimilarDegree(Similar, strA[i]) >= 0.70))
            {
                continue;
            }

            // Skip links that match the front-page pattern.
            List<string> strTop = HtmlUtil.GetListByHtmlArray(strA[i], strTopFormat);
            if (strTop.Count != 0)
            {
                continue;
            }

            // Skip links UrlThereare reports as already present.
            if (UrlThereare(strA[i], this.dttiebainfo, dtTieBaInfo, true) != 0)
            {
                continue;
            }

            // Pause between page downloads.
            Thread.Sleep(2000);

            // Download the post; nothing to do when that yields no content.
            string pageHtml = HtmlUtil.getHtml(strA[i], "");
            if (string.IsNullOrEmpty(pageHtml))
            {
                continue;
            }

            try
            {
                // NOTE(review): Uid is derived from dvAll although the row below
                // goes to the tieba grid - confirm which grid is intended.
                newsInfo.Uid = this.dvAll.Rows.Count + 1;

                // Body text: concatenated text of all elements with class "tieba".
                string[] strContext = HtmlUtil.GetElementsByClass(pageHtml, "tieba");
                StringBuilder bodyText = new StringBuilder();
                for (int j = 0; j < strContext.Length; j++)
                {
                    bodyText.Append(HtmlUtil.NoHTML(strContext[j]));
                }
                newsInfo.Contexts = bodyText.ToString();

                // Without body text no keyword comparison is possible; discard.
                if (newsInfo.Contexts.Length == 0)
                {
                    continue;
                }

                newsInfo.InfoSource = strA[i].Trim();

                // Second matching pass: against the body text, only when the
                // anchor itself matched nothing.
                if (string.IsNullOrEmpty(newsInfo.KeyWords))
                {
                    MatchTieBaEvents(dtkey, newsInfo.Contexts, events, true);
                    AppendTieBaEvents(newsInfo, events);
                }

                // No event matched: discard. The former ".Length == 0" test threw
                // a NullReferenceException when KeyWords was still null; the
                // exception was logged and the record was inserted anyway.
                if (string.IsNullOrEmpty(newsInfo.KeyWords))
                {
                    continue;
                }
                // Drop the leading comma.
                newsInfo.KeyWords = newsInfo.KeyWords.Substring(1);

                newsInfo.CollectDate = DateTime.Now.ToString("yyyy-MM-dd HH:mm:ss");
                // Author and release date cannot be determined here; left empty.
                newsInfo.ReleaseDate = "";
                newsInfo.ReleaseName = "";
                // No snapshot is generated in this method.
                newsInfo.Snapshot = "";
                newsInfo.Sheng = site["sheng"] == null ? "" : site["sheng"].ToString();
                newsInfo.Shi = site["shi"] == null ? "" : site["shi"].ToString();
                newsInfo.Xian = site["xian"] == null ? "" : site["xian"].ToString();
                newsInfo.WebName = site["Name"].ToString();
                newsInfo.Pid = 5;
                newsInfo.Part = GetParts(newsInfo.Contexts);
                newsInfo.Reposts = 0;
                newsInfo.Comments = 0;

                // New grid row; its number is one more than the number in the grid's last row.
                DataRow dr = dttiebainfo.NewRow();
                if (dvtieba.RowCount == 0)
                {
                    dr[0] = 1;
                }
                else
                {
                    dr[0] = int.Parse(dvtieba.Rows[dvtieba.RowCount - 1].Cells[0].Value.ToString()) + 1;
                }
                dr[1] = newsInfo.Title;
                dr[2] = newsInfo.Contexts;
                dr[3] = newsInfo.ReleaseDate;
                dr[4] = newsInfo.InfoSource;
                dr[5] = newsInfo.KeyWords.Substring(0, newsInfo.KeyWords.IndexOf("-"));
                dr[6] = newsInfo.ReleaseName;
                dr[7] = newsInfo.CollectDate;
                dr[8] = newsInfo.Snapshot;
                dr[9] = newsInfo.WebName;
                dr[10] = newsInfo.Pid;
                dr[11] = newsInfo.Part;
                dr[12] = newsInfo.Reposts;
                dr[13] = newsInfo.Comments;

                // Newest row on top; keep at most 500 rows. The old test
                // (Count >= 500) called RemoveAt(500) when only indices 0..499
                // existed, which threw on reaching exactly 500 rows.
                dttiebainfo.Rows.InsertAt(dr, 0);
                if (dttiebainfo.Rows.Count > 500)
                {
                    dttiebainfo.Rows.RemoveAt(500);
                }

                this.BeginInvoke(new MethodInvoker(delegate()
                {
                    dvtieba.Refresh();
                }));
            }
            catch (Exception ex)
            {
                // NOTE(review): as before, a failure here is only logged and the
                // record is still queued for insertion below - confirm this is intended.
                Comm.WriteErrorLog(ex.StackTrace);
            }

            try
            {
                // Queue the INSERT statement for this post.
                sb.Append(ri.GetInsString(newsInfo) + ";");

                // Flush the queue whenever the anchor index is a multiple of 10.
                if (i % 10 == 0)
                {
                    FlushTieBaInserts(cmd);
                }
            }
            catch (Exception ex)
            {
                Comm.WriteErrorLog(ex.StackTrace);
            }
        }
    }

    // Flush whatever is still queued.
    try
    {
        FlushTieBaInserts(cmd);
    }
    catch (Exception ex)
    {
        Comm.WriteErrorLog(ex.StackTrace);
    }

    // Reload the stored tieba records once the run is complete.
    dtTieBaInfo = tri.SelReleaseInfo(DateTime.Now.ToString("yyyy-MM-dd HH:mm:ss"), DateTime.Now.AddDays(-15).ToString("yyyy-MM-dd HH:mm:ss"), "5");

    this.BeginInvoke(new MethodInvoker(delegate()
    {
        lbtieba.Text = "一轮搜索完毕!";
        lbtieba.ForeColor = Color.Red;
    }));
}

/// <summary>
/// Adds to <paramref name="events"/> every event whose keyword row matches
/// <paramref name="text"/>. A row matches when each of its space-separated
/// terms (column 4) occurs in the text, ignoring case. The event key is
/// column 1 + "-" + column 6; rows of an event that is already present are
/// not evaluated again.
/// </summary>
/// <param name="dtkey">Rows of the Keywords table.</param>
/// <param name="text">Text to search in.</param>
/// <param name="events">Receives the matched event keys (value 1).</param>
/// <param name="pumpMessages">When true, Application.DoEvents() is called once per row.</param>
private static void MatchTieBaEvents(DataTable dtkey, string text, Dictionary<string, int> events, bool pumpMessages)
{
    string loweredText = text.ToLower();
    for (int j = 0; j < dtkey.Rows.Count; j++)
    {
        if (pumpMessages)
        {
            Application.DoEvents();
        }

        string eventKey = dtkey.Rows[j][1].ToString() + "-" + dtkey.Rows[j][6].ToString();
        if (events.ContainsKey(eventKey))
        {
            continue;
        }

        bool allTermsFound = true;
        string[] terms = dtkey.Rows[j][4].ToString().Split(new char[] { ' ' });
        foreach (string term in terms)
        {
            if (!loweredText.Contains(term.ToLower()))
            {
                allTermsFound = false;
                break;
            }
        }

        if (allTermsFound)
        {
            events.Add(eventKey, 1);
        }
    }
}

/// <summary>
/// Appends ",&lt;name&gt;-&lt;id&gt;" to <c>newsInfo.KeyWords</c> for every key in
/// <paramref name="events"/>; the id part is passed through int.Parse.
/// </summary>
/// <param name="newsInfo">Record whose KeyWords value is extended.</param>
/// <param name="events">Matched event keys in the form "name-id".</param>
private static void AppendTieBaEvents(ModelReleaseInfo newsInfo, Dictionary<string, int> events)
{
    foreach (string eventKey in events.Keys)
    {
        string[] parts = eventKey.Split(new char[] { '-' });
        newsInfo.KeyWords += "," + parts[0] + "-" + int.Parse(parts[1]);
    }
}

/// <summary>
/// Executes the INSERT statements queued in <c>sb</c>, after removing the
/// blank that follows "[" or precedes "]", and clears the queue. Does nothing
/// when the queue is empty; the queue is left untouched when execution throws.
/// </summary>
/// <param name="cmd">Database command object used to execute the statements.</param>
private void FlushTieBaInserts(SQLitecommand cmd)
{
    if (sb.Length == 0)
    {
        return;
    }

    string filterStr = sb.ToString().Replace("[ ", "[").Replace(" ]", "]");
    cmd.ExecuteNonQuery(filterStr);
    sb.Clear();
}