/*-----------------------------------------------------------------------------------*/
/// <summary>
/// Reads the current page number from a Baidu result page.
/// </summary>
/// <param name="nodes">Node list to search (callers in this file pass the page body).</param>
/// <returns>
/// The text of the matched pager node(s) converted with DataConverter.CLng.
/// </returns>
public int GetCurPage(Winista.Text.HtmlParser.Util.NodeList nodes)
{
    // Narrow the search to the element(s) carrying id="page".
    Winista.Text.HtmlParser.Util.NodeList pager =
        nodes.ExtractAllNodesThatMatch(new HasAttributeFilter("id", "page"), true);

    // The wanted node has class "pc" and a sibling with class "fk fk_cur".
    NodeFilter hasPcClass = new HasAttributeFilter("class", "pc");
    NodeFilter besideCurrentMarker = new HasSiblingFilter(new HasAttributeFilter("class", "fk fk_cur"));
    NodeFilter currentPageFilter = new AndFilter(hasPcClass, besideCurrentMarker);

    Winista.Text.HtmlParser.Util.NodeList current = pager.ExtractAllNodesThatMatch(currentPageFilter, true);
    string pageText = current.AsString();
    return DataConverter.CLng(pageText);
}
/// <summary>
/// Collects WeChat article search results for a keyword from Sogou's WeChat search pages.
/// </summary>
/// <param name="key">Keyword to search for. A null or empty keyword yields a null result.</param>
/// <param name="count">Paging bound: page p (0-based) is requested while p * 10 &lt; count.</param>
/// <param name="time">
/// Cut-off time: paging stops after the first page whose last article is older than this value.
/// Rows of that page are still included in the result.
/// </param>
/// <returns>
/// The table produced by GetStruct(key) with one row per article found,
/// or null when <paramref name="key"/> is null or empty.
/// </returns>
public DataTable GetWXBySogou(string key, int count, DateTime time)
{
    string baseurl = "http://weixin.sogou.com/weixin?type=2&query={0}&fr=sgsearch&ie=utf8&_ast=1433216256&_asf=null&w=01059900&cid=null&page={1}";
    if (string.IsNullOrEmpty(key))
    {
        return null;
    }

    // Same paging step as the other scrapers in this file.
    const int pageSize = 10;
    DataTable dt = GetStruct(key);
    DateTime cdate = DateTime.Now;

    for (int p = 0; p * pageSize < count; p++)
    {
        // The "page" query parameter is 1-based.
        string url = string.Format(baseurl, HttpUtility.UrlEncode(key), p + 1);
        string html = ieHelp.GetHtmlFromSite(url);
        HtmlPage page = htmlHelp.GetPage(html);

        // NOTE: the GetCurPage check used by GetBaiduNews is not applied to this page layout.
        Winista.Text.HtmlParser.Util.NodeList nodes =
            page.Body.ExtractAllNodesThatMatch(new HasAttributeFilter("class", "wx-rb wx-rb3"), true);
        if (nodes.Count <= 0)
        {
            break;
        }

        // Turn every result element into a row of the table.
        for (int i = 0; i < nodes.Count; i++)
        {
            Winista.Text.HtmlParser.Util.NodeList cnodes = nodes[i].Children;
            DataRow dr = dt.NewRow();

            // Title and link: the <a> whose parent is an <h4>.
            NodeFilter titleFilter = new AndFilter(new HasParentFilter(new TagNameFilter("h4")), new TagNameFilter("a"));
            ATag a = (ATag)cnodes.ExtractAllNodesThatMatch(titleFilter, true)[0];
            dr["Title"] = a.StringText;
            dr["Link"] = a.Link;

            // Author: "title" attribute of the <a id="weixin_account">.
            NodeFilter authorFilter = new AndFilter(new HasAttributeFilter("id", "weixin_account"), new TagNameFilter("a"));
            ATag author_a = (ATag)cnodes.ExtractAllNodesThatMatch(authorFilter, true)[0];
            dr["Author"] = author_a.GetAttribute("title");

            // Publication time: "t" attribute of the element with class "s-p"
            // (presumably a Unix timestamp - confirm against GetDateTime).
            NodeFilter timeFilter = new HasAttributeFilter("class", "s-p");
            Div div = (Div)cnodes.ExtractAllNodesThatMatch(timeFilter, true)[0];
            string unixtime = div.GetAttribute("t");

            // Keep the article date in cdate so the cut-off test below compares real data.
            // Previously cdate stayed at DateTime.Now and the test could never stop the paging
            // for a cut-off in the past.
            cdate = GetDateTime(unixtime);
            dr["Cdate"] = cdate;
            dr["Day"] = cdate.Day;
            dr["Source"] = "微信";
            dt.Rows.Add(dr);
        }

        // Stop once the last article of this page is older than the requested cut-off.
        if (cdate < time)
        {
            break;
        }
    }

    return dt;
}
/*
 * Baidu query-string parameters:
 *   pn: offset, an arithmetic sequence in steps of 10 (how many leading results to skip)
 *   lm: restrict to results indexed by Baidu within a period; 1 = last 24 hours, 7 = last 7 days
 *   rn: number of results per page, 10-100, defaults to 10 when missing (has no effect)
 */
/// <summary>
/// Scrapes Baidu News results for a keyword.
/// </summary>
/// <param name="key">Keyword to scrape. A null or empty keyword yields a null result.</param>
/// <param name="count">Maximum number of results to fetch (pages of 10 are requested while the offset is below it).</param>
/// <param name="time">Only data after this time is wanted; paging stops once a page ends with an older entry.</param>
/// <returns>
/// The table produced by GetStruct(key) filled with one row per news entry,
/// or null when <paramref name="key"/> is null or empty.
/// </returns>
public DataTable GetBaiduNews(string key, int count, DateTime time)
{
    if (string.IsNullOrEmpty(key))
    {
        return null;
    }

    int pageSize = 10;
    const string urlTemplate = "http://news.baidu.com/ns?bs=%B7%F6%C0%CF%C8%CB&sr=0&cl=2&tn=news&ct=0&clk=sortbytime&rn=10&pn={0}&word={1}";
    DataTable table = GetStruct(key);
    DateTime cdate = DateTime.Now;

    // 1. Fetch the information related to the keyword from Baidu, page by page.
    for (int pageIndex = 0; pageIndex * pageSize < count; pageIndex++)
    {
        // Shape of one result entry (sample taken from a live page, abbreviated):
        //   <div class="result" id="1">
        //     <h3 class="c-title"><a href="ARTICLE-URL" target="_blank">TITLE</a></h3>
        //     <div class="c-summary c-row c-gap-top-small">
        //       <div class="c-span6"><a class="c_photo" href="ARTICLE-URL"><img class="c-img c-img6"></a></div>
        //       <div class="c-span18 c-span-last">
        //         <p class="c-author">SOURCE-NAME RELATIVE-TIME</p>SUMMARY-TEXT
        //         <span class="c-info"><a class="c-cache" href="CACHE-URL">...</a></span>
        //       </div>
        //     </div>
        //   </div>
        int offset = pageIndex * pageSize;
        string url = string.Format(urlTemplate, offset, HttpUtility.UrlEncode(key));
        string html = htmlHelp.GetHtmlFromSite(url);
        HtmlPage page = htmlHelp.GetPage(html);

        // Stop when the page reported by the pager does not lie beyond the 0-based index requested.
        int reportedPage = GetCurPage(page.Body);
        if (reportedPage <= pageIndex)
        {
            break;
        }

        // Result entries are <div class="result"> elements (earlier page versions used <li>).
        Winista.Text.HtmlParser.Util.NodeList results = htmlHelp.GetTagList(html, "div");
        results = results.ExtractAllNodesThatMatch(new HasAttributeFilter("class", "result"));

        for (int i = 0; i < results.Count; i++)
        {
            DataRow row = table.NewRow();

            // Second child: summary container; first child: heading that wraps the title link.
            Div summary = (Div)results[i].Children[1];
            Winista.Text.HtmlParser.Util.NodeList titleLinks =
                results[i].Children[0].Children.ExtractAllNodesThatMatch(new TagNameFilter("a"));
            ATag titleLink = (ATag)titleLinks[0];
            row["Title"] = titleLink.StringText;
            row["Link"] = titleLink.Link;

            // Author and date are both parsed out of the "c-author" text.
            string author = "";
            cdate = DateTime.Now;
            string authorText = summary.Children
                .ExtractAllNodesThatMatch(new HasAttributeFilter("class", "c-author"), true)
                .AsString();
            GetAuthorAndDate(ref author, ref cdate, authorText);

            row["Author"] = author;
            row["CDate"] = DataConverter.CDate(cdate);
            row["Day"] = DataConverter.CDate(cdate).Day;
            row["Source"] = "新闻";
            table.Rows.Add(row);
        }

        // cdate now holds the date of the last entry handled; stop once it is older than the cut-off.
        if (cdate < time)
        {
            break;
        }
    }

    return table;
}
/// <summary>
/// Uses Baidu search to scrape blog data from Baidu, Sina and NetEase.
/// Rows are stored with the source label "微博".
/// </summary>
/// <param name="key">Keyword to search for. A null or empty keyword yields a null result.</param>
/// <param name="count">Paging bound: offsets 0, 10, 20, ... are requested while the offset is below it.</param>
/// <param name="time">Cut-off time: paging stops once a page ends with an entry older than this value.</param>
/// <returns>
/// The table produced by GetStruct(key) filled with one row per entry,
/// or null when <paramref name="key"/> is null or empty.
/// </returns>
public DataTable GetBlogByBaidu(string key, int count, DateTime time)
{
    const string urlTemplate = "http://www.baidu.com/s?rtt=2&tn=baiduwb&pn={0}&wd={1}";
    if (string.IsNullOrEmpty(key))
    {
        return null;
    }

    int pageSize = 10;
    DataTable table = GetStruct(key);
    DateTime cdate = DateTime.Now;

    // Original note: at most 100 pages (1000 entries) are collected.
    // The loop itself is bounded only by count.
    for (int pageIndex = 0; pageIndex * pageSize < count; pageIndex++)
    {
        int offset = pageIndex * pageSize;
        string url = string.Format(urlTemplate, offset, HttpUtility.UrlEncode(key));
        string html = htmlHelp.GetHtmlFromSite(url);
        HtmlPage page = htmlHelp.GetPage(html);

        // NOTE: unlike GetBaiduNews, no GetCurPage check is made against the parsed page here.
        Winista.Text.HtmlParser.Util.NodeList entries = htmlHelp.GetTagList(html, "li");
        entries = entries.ExtractAllNodesThatMatch(new HasAttributeFilter("id"));

        for (int i = 0; i < entries.Count; i++)
        {
            Winista.Text.HtmlParser.Util.NodeList parts = entries[i].Children;
            DataRow row = table.NewRow();

            // Link: the <a class="weibo_all"> element.
            NodeFilter isAnchor = new TagNameFilter("a");
            NodeFilter isWeiboAll = new HasAttributeFilter("class", "weibo_all");
            Winista.Text.HtmlParser.Util.NodeList linkNodes =
                parts.ExtractAllNodesThatMatch(new AndFilter(isAnchor, isWeiboAll), true);
            ATag link = (ATag)linkNodes[0];

            // Entries have no title of their own, so the keyword is recorded instead.
            row["Title"] = "关键词:" + key;
            row["Link"] = link.Link;

            // Author: plain text of the element with name="weibo_rootnick".
            Winista.Text.HtmlParser.Util.NodeList nickNodes =
                parts.ExtractAllNodesThatMatch(new HasAttributeFilter("name", "weibo_rootnick"), true);
            row["Author"] = nickNodes[0].ToPlainTextString();

            // Date: text of the first node whose parent has class "m".
            Winista.Text.HtmlParser.Util.NodeList dateNodes =
                parts.ExtractAllNodesThatMatch(new HasParentFilter(new HasAttributeFilter("class", "m")), true);
            ATag dateLink = (ATag)dateNodes[0];
            cdate = ConverBDDate(dateLink.StringText);

            row["CDate"] = cdate;
            row["Day"] = cdate.Day;
            row["Source"] = "微博";
            table.Rows.Add(row);
        }

        // cdate holds the date of the last entry handled; stop once it is older than the cut-off.
        if (cdate < time)
        {
            break;
        }
    }

    return table;
}
/// <summary>
/// Extracts user names from the first table found directly under the first node of
/// <paramref name="nlist"/>. For every table row the text of the first link inside the
/// first DIV of the first TD is taken.
/// </summary>
/// <param name="nlist">Node list whose first node contains the TABLE element as a direct child.</param>
/// <returns>
/// The trimmed link texts, one per matching row. The list is empty (never null) when the input is
/// null or empty, or when no table or rows are found. Rows lacking the TD / DIV / A chain are skipped.
/// </returns>
public List<string> GetUsersFromTb(Winista.Text.HtmlParser.Util.NodeList nlist)
{
    List<string> users = new List<string>();
    if (nlist == null || nlist.Count == 0 || nlist[0] == null)
    {
        return users;
    }

    Winista.Text.HtmlParser.ITag table = FindFirstChildTag(nlist[0].Children, "TABLE");
    if (table == null)
    {
        return users;
    }

    // Rows are either wrapped in a TBODY or are direct children of the table.
    // Whichever of TBODY / TR appears first among the table's tag children decides.
    Winista.Text.HtmlParser.Util.NodeList tableChildren = table.Children;
    Winista.Text.HtmlParser.Util.NodeList rows = null;
    for (int i = 0; tableChildren != null && i < tableChildren.Count; i++)
    {
        var tag = tableChildren[i] as Winista.Text.HtmlParser.ITag;
        if (tag == null)
        {
            continue;
        }
        if (tag.TagName == "TBODY")
        {
            rows = tag.Children;
            break;
        }
        if (tag.TagName == "TR")
        {
            rows = tableChildren;
            break;
        }
    }
    if (rows == null || rows.Count == 0)
    {
        return users;
    }

    for (int i = 0; i < rows.Count; i++)
    {
        var row = rows[i] as Winista.Text.HtmlParser.ITag;
        if (row == null || row.TagName != "TR")
        {
            continue;
        }

        // Walk TR -> first TD -> first DIV -> first A; a missing step skips the row.
        Winista.Text.HtmlParser.ITag firstCell = FindFirstChildTag(row.Children, "TD");
        if (firstCell == null)
        {
            continue;
        }
        Winista.Text.HtmlParser.ITag cellDiv = FindFirstChildTag(firstCell.Children, "DIV");
        if (cellDiv == null)
        {
            continue;
        }
        Winista.Text.HtmlParser.ITag userLink = FindFirstChildTag(cellDiv.Children, "A");
        if (userLink == null)
        {
            continue;
        }

        users.Add(userLink.ToPlainTextString().Trim());
    }

    return users;
}

/// <summary>
/// Returns the first direct child that is a tag named <paramref name="tagName"/>, or null when
/// there is none. A null child list (an element without children) is treated as empty, so one
/// childless element cannot abort the whole extraction with a NullReferenceException.
/// </summary>
private static Winista.Text.HtmlParser.ITag FindFirstChildTag(Winista.Text.HtmlParser.Util.NodeList children, string tagName)
{
    if (children == null)
    {
        return null;
    }
    for (int i = 0; i < children.Count; i++)
    {
        var tag = children[i] as Winista.Text.HtmlParser.ITag;
        if (tag != null && tag.TagName == tagName)
        {
            return tag;
        }
    }
    return null;
}