예제 #1
0
        /*-----------------------------------------------------------------------------------*/

        /// <summary>
        /// Reads the current page number from a Baidu result page.
        /// </summary>
        /// <param name="nodes">Node list to search; it is searched recursively.</param>
        /// <returns>
        /// The text of the matched pager node(s), converted with <c>DataConverter.CLng</c>.
        /// </returns>
        public int GetCurPage(Winista.Text.HtmlParser.Util.NodeList nodes)
        {
            // Step 1: narrow the search down to whatever carries id="page".
            NodeFilter pagerFilter = new HasAttributeFilter("id", "page");
            Winista.Text.HtmlParser.Util.NodeList pager = nodes.ExtractAllNodesThatMatch(pagerFilter, true);

            // Step 2: inside the pager, pick class="pc" nodes that have a sibling
            // with class="fk fk_cur".
            NodeFilter pageNumberFilter = new AndFilter(
                new HasAttributeFilter("class", "pc"),
                new HasSiblingFilter(new HasAttributeFilter("class", "fk fk_cur")));
            Winista.Text.HtmlParser.Util.NodeList current = pager.ExtractAllNodesThatMatch(pageNumberFilter, true);

            return DataConverter.CLng(current.AsString());
        }
예제 #2
0
        /// <summary>
        /// Collects WeChat article search results for a keyword from Sogou's WeChat search.
        /// </summary>
        /// <param name="key">Keyword to search for.</param>
        /// <param name="count">
        /// Paging bound: pages are requested while <c>p * 10 &lt; count</c> (p is the zero-based page index).
        /// </param>
        /// <param name="time">
        /// Cutoff: paging stops after a page whose last parsed article is dated before this value.
        /// </param>
        /// <returns>
        /// The table created by <c>GetStruct(key)</c> with one row per parsed result,
        /// or <c>null</c> when <paramref name="key"/> is null or empty.
        /// </returns>
        public DataTable GetWXBySogou(string key, int count, DateTime time)
        {
            if (string.IsNullOrEmpty(key))
            {
                return null;
            }

            const int pageSize = 10;
            string baseurl = "http://weixin.sogou.com/weixin?type=2&query={0}&fr=sgsearch&ie=utf8&_ast=1433216256&_asf=null&w=01059900&cid=null&page={1}";
            DataTable dt = GetStruct(key);
            DateTime cdate = DateTime.Now;

            // The filters do not depend on the page or row, so build them once.
            NodeFilter resultFilter = new HasAttributeFilter("class", "wx-rb wx-rb3");
            NodeFilter titleFilter = new AndFilter(new HasParentFilter(new TagNameFilter("h4")), new TagNameFilter("a"));
            NodeFilter authorFilter = new AndFilter(new HasAttributeFilter("id", "weixin_account"), new TagNameFilter("a"));
            NodeFilter dateFilter = new HasAttributeFilter("class", "s-p");

            for (int p = 0; p * pageSize < count; p++)
            {
                // Sogou's "page" query parameter is sent 1-based.
                string url = string.Format(baseurl, HttpUtility.UrlEncode(key), p + 1);
                string html = ieHelp.GetHtmlFromSite(url);
                HtmlPage page = htmlHelp.GetPage(html);

                Winista.Text.HtmlParser.Util.NodeList results = page.Body.ExtractAllNodesThatMatch(resultFilter, true);
                if (results.Count <= 0)
                {
                    break;
                }

                // Convert each result block into a table row.
                for (int i = 0; i < results.Count; i++)
                {
                    Winista.Text.HtmlParser.Util.NodeList cnodes = results[i].Children;
                    if (cnodes == null)
                    {
                        continue;
                    }

                    Winista.Text.HtmlParser.Util.NodeList titleHits = cnodes.ExtractAllNodesThatMatch(titleFilter, true);
                    Winista.Text.HtmlParser.Util.NodeList authorHits = cnodes.ExtractAllNodesThatMatch(authorFilter, true);
                    Winista.Text.HtmlParser.Util.NodeList dateHits = cnodes.ExtractAllNodesThatMatch(dateFilter, true);

                    ATag a = titleHits.Count > 0 ? titleHits[0] as ATag : null;
                    ATag author_a = authorHits.Count > 0 ? authorHits[0] as ATag : null;
                    Div div = dateHits.Count > 0 ? dateHits[0] as Div : null;
                    if (a == null || author_a == null || div == null)
                    {
                        // Markup did not match the expected layout: skip this result
                        // rather than aborting the whole fetch.
                        continue;
                    }

                    // The "t" attribute is handed to GetDateTime as-is
                    // (presumably a Unix timestamp, going by the original variable name - confirm).
                    string unixtime = div.GetAttribute("t");
                    DateTime published = GetDateTime(unixtime);

                    // Track the article date so the cutoff check below sees real data.
                    cdate = published;

                    DataRow dr = dt.NewRow();
                    dr["Title"] = a.StringText;
                    dr["Link"] = a.Link;
                    dr["Author"] = author_a.GetAttribute("title");
                    dr["Cdate"] = published;
                    dr["Day"] = published.Day;
                    dr["Source"] = "微信";
                    dt.Rows.Add(dr);
                }

                // Stop paging once the last article parsed is older than the cutoff.
                if (cdate < time)
                {
                    break;
                }
            }
            return dt;
        }
예제 #3
0
        /*
         * Baidu query parameters (notes from the original author):
         * pn: paging offset, an arithmetic sequence in steps of 10; the number of leading items to skip
         * lm: restrict to items indexed by Baidu within a period; 1 = last 24 hours, 7 = last 7 days
         * rn: number of results per page, range 10-100, defaults to 10 (has no effect)
         */
        /// <summary>
        /// Fetches Baidu News search results for a keyword.
        /// </summary>
        /// <param name="key">Keyword to search for.</param>
        /// <param name="count">
        /// Paging bound: pages of 10 are requested while <c>p * 10 &lt; count</c> (p is the zero-based page index).
        /// </param>
        /// <param name="time">
        /// Cutoff: paging stops after a page whose last parsed item is dated before this value.
        /// </param>
        /// <returns>
        /// The table created by <c>GetStruct(key)</c> with one row per parsed result,
        /// or <c>null</c> when <paramref name="key"/> is null or empty.
        /// </returns>
        public DataTable GetBaiduNews(string key, int count, DateTime time)
        {
            if (string.IsNullOrEmpty(key))
            {
                return null;
            }

            const int pageSize = 10;
            string    baseurl  = "http://news.baidu.com/ns?bs=%B7%F6%C0%CF%C8%CB&sr=0&cl=2&tn=news&ct=0&clk=sortbytime&rn=10&pn={0}&word={1}";
            DataTable dt       = GetStruct(key);
            DateTime  cdate    = DateTime.Now;

            // Query Baidu page by page for the keyword.
            for (int p = 0; p * pageSize < count; p++)
            {
                // Sample result markup (abridged):
                // <div class="result" id="1">
                //   <h3 class="c-title"><a href="http://..." target="_blank">title</a></h3>
                //   <div class="c-summary c-row c-gap-top-small">
                //     <div class="c-span18 c-span-last"><p class="c-author">source&nbsp;&nbsp;1小时前</p>summary text ...</div>
                //   </div>
                // </div>
                string   url  = string.Format(baseurl, p * pageSize, HttpUtility.UrlEncode(key));
                string   html = htmlHelp.GetHtmlFromSite(url);
                HtmlPage page = htmlHelp.GetPage(html);

                // Stop when the pager's page number is not greater than the zero-based index
                // (presumably Baidu re-serves an earlier page past the end - confirm).
                int cpage = GetCurPage(page.Body);
                if (cpage <= p)
                {
                    break;
                }

                // Result containers are <div> tags; an earlier version of the markup used <li>.
                Winista.Text.HtmlParser.Util.NodeList results = htmlHelp.GetTagList(html, "div");
                results = results.ExtractAllNodesThatMatch(new HasAttributeFilter("class", "result"));

                for (int i = 0; i < results.Count; i++)
                {
                    // Positional layout: child 0 holds the title link, child 1 is the summary <div>.
                    Winista.Text.HtmlParser.Util.NodeList parts = results[i].Children;
                    if (parts == null || parts.Count < 2)
                    {
                        continue;
                    }

                    Div summary = parts[1] as Div;
                    Winista.Text.HtmlParser.Util.NodeList headingChildren = parts[0].Children;
                    if (summary == null || summary.Children == null || headingChildren == null)
                    {
                        continue;
                    }

                    Winista.Text.HtmlParser.Util.NodeList links = headingChildren.ExtractAllNodesThatMatch(new TagNameFilter("a"));
                    ATag a = links.Count > 0 ? links[0] as ATag : null;
                    if (a == null)
                    {
                        // Markup did not match the expected layout: skip this result
                        // rather than aborting the whole fetch.
                        continue;
                    }

                    // GetAuthorAndDate receives the "c-author" text and fills both refs;
                    // cdate is reset to "now" first, exactly as before.
                    string author = "";
                    cdate = DateTime.Now;
                    string authorText = summary.Children.ExtractAllNodesThatMatch(new HasAttributeFilter("class", "c-author"), true).AsString();
                    GetAuthorAndDate(ref author, ref cdate, authorText);
                    var published = DataConverter.CDate(cdate);

                    DataRow dr = dt.NewRow();
                    dr["Title"]  = a.StringText;
                    dr["Link"]   = a.Link;
                    dr["Author"] = author;
                    dr["CDate"]  = published;
                    dr["Day"]    = published.Day;
                    dr["Source"] = "新闻";
                    // Article body fetch is disabled:
                    //dr["Content"] = getacBll.GetArticleFromWeb(htmlHelp.GetHtmlFromSite(a.Link),a.Link);
                    dt.Rows.Add(dr);
                }

                // Stop paging once the last item parsed is older than the cutoff.
                if (cdate < time)
                {
                    break;
                }
            }
            return dt;
        }
예제 #4
0
        /// <summary>
        /// Uses Baidu search to collect microblog posts (Baidu, Sina, NetEase) for a keyword.
        /// </summary>
        /// <param name="key">Keyword to search for.</param>
        /// <param name="count">
        /// Paging bound: pages of 10 are requested while <c>p * 10 &lt; count</c> (p is the zero-based page index).
        /// </param>
        /// <param name="time">
        /// Cutoff: paging stops after a page whose last parsed post is dated before this value.
        /// </param>
        /// <returns>
        /// The table created by <c>GetStruct(key)</c> with one row per parsed post,
        /// or <c>null</c> when <paramref name="key"/> is null or empty.
        /// </returns>
        public DataTable GetBlogByBaidu(string key, int count, DateTime time)
        {
            if (string.IsNullOrEmpty(key))
            {
                return null;
            }

            const int pageSize = 10;
            string    baseurl  = "http://www.baidu.com/s?rtt=2&tn=baiduwb&pn={0}&wd={1}";
            DataTable dt       = GetStruct(key);
            DateTime  cdate    = DateTime.Now;

            // The filters do not depend on the page or row, so build them once.
            NodeFilter linkFilter = new AndFilter(new TagNameFilter("a"), new HasAttributeFilter("class", "weibo_all"));
            NodeFilter nickFilter = new HasAttributeFilter("name", "weibo_rootnick");
            NodeFilter dateFilter = new HasParentFilter(new HasAttributeFilter("class", "m"));

            // The number of pages fetched is bounded only by count.
            for (int p = 0; p * pageSize < count; p++)
            {
                string url  = string.Format(baseurl, p * pageSize, HttpUtility.UrlEncode(key));
                string html = htmlHelp.GetHtmlFromSite(url);

                // Each post is an <li> that carries an id attribute.
                Winista.Text.HtmlParser.Util.NodeList items = htmlHelp.GetTagList(html, "li");
                items = items.ExtractAllNodesThatMatch(new HasAttributeFilter("id"));

                for (int i = 0; i < items.Count; i++)
                {
                    Winista.Text.HtmlParser.Util.NodeList cnodes = items[i].Children;
                    if (cnodes == null)
                    {
                        continue;
                    }

                    Winista.Text.HtmlParser.Util.NodeList linkHits = cnodes.ExtractAllNodesThatMatch(linkFilter, true);
                    Winista.Text.HtmlParser.Util.NodeList nickHits = cnodes.ExtractAllNodesThatMatch(nickFilter, true);
                    Winista.Text.HtmlParser.Util.NodeList dateHits = cnodes.ExtractAllNodesThatMatch(dateFilter, true);

                    ATag a = linkHits.Count > 0 ? linkHits[0] as ATag : null;
                    Winista.Text.HtmlParser.INode nick = nickHits.Count > 0 ? nickHits[0] : null;
                    ATag datea = dateHits.Count > 0 ? dateHits[0] as ATag : null;
                    if (a == null || nick == null || datea == null)
                    {
                        // Markup did not match the expected layout: skip this post
                        // rather than aborting the whole fetch.
                        continue;
                    }

                    cdate = ConverBDDate(datea.StringText);

                    DataRow dr = dt.NewRow();
                    dr["Title"]  = "关键词:" + key;
                    dr["Link"]   = a.Link;
                    dr["Author"] = nick.ToPlainTextString();
                    dr["CDate"]  = cdate;
                    dr["Day"]    = cdate.Day;
                    dr["Source"] = "微博";
                    // Article body fetch is disabled:
                    //dr["Content"] = getacBll.GetArticleFromWeb(htmlHelp.GetHtmlFromSite(a.Link),a.Link);
                    dt.Rows.Add(dr);
                }

                // Stop paging once the last post parsed is older than the cutoff.
                if (cdate < time)
                {
                    break;
                }
            }
            return dt;
        }
예제 #5
0
        /// <summary>
        /// Extracts the link text found in the first cell of each row of the first table
        /// that is a direct child of the first node in <paramref name="nlist"/>.
        /// </summary>
        /// <remarks>
        /// Expected shape per row: TR &gt; first TD &gt; first DIV &gt; first A, looking at direct
        /// children only at every level. Rows that do not match are skipped. Rows are taken
        /// from the table's TBODY when that is the first TBODY/TR tag found among the table's
        /// children, otherwise from the table itself.
        /// </remarks>
        /// <param name="nlist">Node list whose first node contains the table; may be null or empty.</param>
        /// <returns>The trimmed plain text of each matched link; empty when nothing matches.</returns>
        public List<string> GetUsersFromTb(Winista.Text.HtmlParser.Util.NodeList nlist)
        {
            List<string> us = new List<string>();

            if (nlist == null || nlist.Count == 0)
            {
                return us;
            }

            Winista.Text.HtmlParser.ITag table = FirstChildTagNamed(nlist[0].Children, "TABLE");
            if (table == null)
            {
                return us;
            }

            // Locate the list that holds the rows: TBODY's children, or the table's own.
            Winista.Text.HtmlParser.Util.NodeList tableChildren = table.Children;
            Winista.Text.HtmlParser.Util.NodeList trs = null;
            for (int i = 0; tableChildren != null && i < tableChildren.Count; i++)
            {
                Winista.Text.HtmlParser.ITag tg = tableChildren[i] as Winista.Text.HtmlParser.ITag;
                if (tg == null)
                {
                    continue;
                }
                if (tg.TagName == "TBODY")
                {
                    trs = tg.Children;
                    break;
                }
                if (tg.TagName == "TR")
                {
                    trs = tableChildren;
                    break;
                }
            }

            if (trs == null || trs.Count == 0)
            {
                return us;
            }

            for (int i = 0; i < trs.Count; i++)
            {
                Winista.Text.HtmlParser.ITag row = trs[i] as Winista.Text.HtmlParser.ITag;
                if (row == null || row.TagName != "TR")
                {
                    continue;
                }

                Winista.Text.HtmlParser.ITag firstTd = FirstChildTagNamed(row.Children, "TD");
                if (firstTd == null)
                {
                    continue;
                }

                Winista.Text.HtmlParser.ITag div = FirstChildTagNamed(firstTd.Children, "DIV");
                if (div == null)
                {
                    continue;
                }

                Winista.Text.HtmlParser.ITag anchor = FirstChildTagNamed(div.Children, "A");
                if (anchor == null)
                {
                    continue;
                }

                us.Add(anchor.ToPlainTextString().Trim());
            }
            return us;
        }

        /// <summary>
        /// Returns the first entry of <paramref name="children"/> that is a tag whose
        /// <c>TagName</c> equals <paramref name="tagName"/>, or <c>null</c> when there is none.
        /// </summary>
        /// <remarks>
        /// A null list is treated as empty (defensive: the parser library may expose a null
        /// child list for childless nodes - confirm).
        /// </remarks>
        private static Winista.Text.HtmlParser.ITag FirstChildTagNamed(Winista.Text.HtmlParser.Util.NodeList children, string tagName)
        {
            if (children == null)
            {
                return null;
            }
            for (int k = 0; k < children.Count; k++)
            {
                Winista.Text.HtmlParser.ITag tag = children[k] as Winista.Text.HtmlParser.ITag;
                if (tag != null && tag.TagName == tagName)
                {
                    return tag;
                }
            }
            return null;
        }