Example #1
0
        /// <summary>
        /// Downloads <paramref name="motherUrl"/>, visits the links found in its anchor elements, parses every
        /// visited page for keyword hits and persists whatever the parser returns.
        /// </summary>
        /// <param name="motherUrl">Address of the page whose anchor elements are enumerated.</param>
        /// <param name="similar">
        /// Reference URL pattern. When null or empty every link is visited; otherwise only links whose
        /// similarity degree to it is at least 0.70.
        /// </param>
        /// <param name="keys">Keyword table, forwarded unchanged to the parser.</param>
        /// <param name="sheng">Forwarded unchanged to the parser (presumably the province — TODO confirm).</param>
        /// <param name="shi">Forwarded unchanged to the parser (presumably the city — TODO confirm).</param>
        /// <param name="xian">Forwarded unchanged to the parser (presumably the county — TODO confirm).</param>
        /// <param name="name">Forwarded unchanged to the parser and to the recursive call.</param>
        /// <param name="catchSub">
        /// When true, every visited link is crawled one level deeper; the nested call always passes false,
        /// so the recursion depth is at most one.
        /// </param>
        private void BasicWebSpider(string motherUrl, string similar, DataTable keys, string sheng, string shi, string xian, string name, bool catchSub)
        {
            HtmlParse.Parse parse = new HtmlParse.Parse();

            // Download the mother page and extract all of its hyperlinks.
            string motherHtml = HtmlUtil.getHtml(motherUrl, "");
            string[] anchors = HtmlUtil.GetElementsByTagName(motherHtml, "a");
            if (anchors == null)
            {
                // Nothing to crawl (e.g. the page could not be downloaded).
                return;
            }

            #region Examine each link
            foreach (string anchor in anchors)
            {
                // Stop as soon as the application is shutting down; keep the UI message loop alive meanwhile.
                if (Program.ProClose == true)
                {
                    break;
                }
                Application.DoEvents();

                string url = CrawlHtml.processUrl(motherUrl, anchor);

                // With a similarity pattern, skip links that do not reach the 0.70 threshold.
                // Written as !(x >= 0.70) so that a non-comparable result is skipped exactly as before.
                if (!string.IsNullOrEmpty(similar) && !(HtmlUtil.getSimilarDegree(similar, url) >= 0.70))
                {
                    continue;
                }

                // Some sites block clients that request pages too quickly, so wait two seconds before
                // every download. The price is a long overall crawl time.
                Thread.Sleep(2000);

                string pageHtml = HtmlUtil.getHtml(url, "");
                if (string.IsNullOrEmpty(pageHtml))
                {
                    continue;
                }

                // Follow the links of the sub page (one level only). Note that the nested call downloads url again.
                if (catchSub)
                {
                    BasicWebSpider(url, similar, keys, sheng, shi, xian, name, false);
                }

                // Check whether the linked page contains any keyword.
                // Fix: pass the name parameter; the original passed the enclosing class's Name member instead,
                // leaving the parameter unused.
                List<ModelReleaseInfo> newsInfos = parse.ParseGeneralWeb(anchor, url, keys, sheng, shi, xian, name, pageHtml, 7);
                if (newsInfos != null && newsInfos.Count > 0)
                {
                    // Persist the hits.
                    DataPersistenceControl.GetInstance().Add(newsInfos);
                }
            }
            #endregion
        }
Example #2
0
        /// <summary>
        /// Queries Sogou once per selected keyword, parses the result page with the blog parser
        /// and persists the parsed records.
        /// </summary>
        /// <remarks>
        /// Keywords come from the Keywords table: all rows when <c>selectKID</c> is -1, otherwise the rows
        /// with that kid, optionally narrowed to the row whose name equals <c>selectKwName</c>.
        /// After every request the thread sleeps for <c>Interval2m</c> to avoid being blacklisted.
        /// </remarks>
        private void BlogWebSpider()
        {
            #region Load keywords
            MySqlCmd cmd = new MySqlCmd();
            DataTable dtkey;
            if (selectKID == -1)
            {
                // All keywords.
                dtkey = cmd.GetTabel("select * from Keywords");
            }
            else
            {
                dtkey = cmd.GetTabel("select * from Keywords where kid=" + selectKID);
            }
            #endregion

            HtmlParse.Parse parse = new HtmlParse.Parse();

            // One search per keyword.
            foreach (DataRow row in dtkey.Rows)
            {
                // When a single named keyword is selected, skip every other row.
                if (selectKID != -1 && selectKwName != "全部"
                    && row["name"].ToString().Trim() != selectKwName)
                {
                    continue;
                }

                string keyword = row["KeyWord"].ToString().Trim();
                int kid = 0;
                int.TryParse(row["kid"].ToString().Trim(), out kid);

                #region Search by keyword
                // The keyword is percent-encoded (UTF-8, matching ie=utf8) so that characters such as
                // '&', '#', '+' or spaces cannot break the query string.
                string url = "http://www.sogou.com/web?interation=196647&query="
                    + System.Uri.EscapeDataString(keyword) + "&ie=utf8";
                string html = HtmlUtil.HttpGet(url, Encoding.UTF8);
                #endregion

                List<ModelReleaseInfo> mris = parse.ParseSogouBlog(html, keyword, kid);
                if (mris != null && mris.Count > 0)
                {
                    DataPersistenceControl.GetInstance().Add(mris);
                }

                // Throttle to avoid being blacklisted.
                Thread.Sleep(Interval2m);
            }
        }
Example #3
0
        /// <summary>
        /// Runs a general web search for every selected keyword on Baidu, Bing, Sogou, Zhongsou and Haosou,
        /// parses each result page and persists the parsed records.
        /// </summary>
        /// <remarks>
        /// Keywords come from the Keywords table: all rows when <c>selectKID</c> is -1, otherwise the rows
        /// with that kid, optionally narrowed to the row whose name equals <c>selectKwName</c>.
        /// For each engine, paging stops at the first page that yields no parsed records; after every
        /// page that did yield records the thread sleeps to avoid being blacklisted.
        /// </remarks>
        private void BaiduWebWebSpider()
        {
            #region Load keywords
            MySqlCmd cmd = new MySqlCmd();
            DataTable dtkey;
            if (selectKID == -1)
            {
                // All keywords.
                dtkey = cmd.GetTabel("select * from Keywords");
            }
            else
            {
                dtkey = cmd.GetTabel("select * from Keywords where kid=" + selectKID);
            }
            #endregion

            HtmlParse.Parse parse = new HtmlParse.Parse();

            // One pass over all engines per keyword.
            foreach (DataRow row in dtkey.Rows)
            {
                // When a single named keyword is selected, skip every other row.
                if (selectKID != -1 && selectKwName != "全部"
                    && row["name"].ToString().Trim() != selectKwName)
                {
                    continue;
                }

                string keyword = row["KeyWord"].ToString().Trim();
                int kid = 0;
                int.TryParse(row["kid"].ToString().Trim(), out kid);

                // Encode once per keyword; reused by every engine below so that reserved characters
                // ('&', '#', '+', spaces, ...) cannot corrupt the query string.
                String encodeKey = CrawlHtml.UrlEncode(keyword);

                #region Baidu search
                for (int page = 0; page < 5; page++)
                {
                    string url = string.Format(@"http://www.baidu.com/s?wd={0}&pn={1}&ie=utf-8", encodeKey, page * 10);
                    string html = HtmlUtil.HttpGet(url, Encoding.UTF8);
                    List<ModelReleaseInfo> mris = parse.ParseBaiduWeb(html, keyword, kid);
                    if (mris == null || mris.Count == 0)
                    {
                        break;
                    }
                    DataPersistenceControl.GetInstance().Add(mris);
                    // Throttle to avoid being blacklisted.
                    Thread.Sleep(Interval50s);
                }
                #endregion

                #region Bing search
                for (int page = 0; page < 10; page++)
                {
                    string url = string.Format("http://cn.bing.com/search?q={0}&first={1}&FORM=PERE", encodeKey, page * 10 + 1);
                    string html = HtmlUtil.HttpGet(url, Encoding.UTF8);
                    List<ModelReleaseInfo> mris = parse.ParseBingWeb(html, keyword, kid);
                    if (mris == null || mris.Count == 0)
                    {
                        break;
                    }
                    DataPersistenceControl.GetInstance().Add(mris);
                    // Throttle to avoid being blacklisted.
                    Thread.Sleep(Interval30s);
                }
                #endregion

                #region Sogou search
                for (int page = 0; page < 10; page++)
                {
                    string url = string.Format("http://www.sogou.com/web?query={0}&page={1}&ie=utf8", encodeKey, page + 1);
                    string html = HtmlUtil.HttpGet(url, Encoding.UTF8);
                    List<ModelReleaseInfo> mris = parse.ParseSogouWeb(html, keyword, kid);
                    if (mris == null || mris.Count == 0)
                    {
                        break;
                    }
                    DataPersistenceControl.GetInstance().Add(mris);
                    // Throttle to avoid being blacklisted.
                    Thread.Sleep(Interval2m);
                }
                #endregion

                #region Zhongsou search
                for (int page = 0; page < 10; page++)
                {
                    // NOTE(review): the keyword is still sent unencoded here because the charset this
                    // endpoint expects for the query is not evident (its response is decoded with
                    // Encoding.Default) — confirm and encode accordingly.
                    string url = string.Format("http://www.zhongsou.com/third?w={0}&b={1}", keyword, page + 1);
                    string html = HtmlUtil.HttpGet(url, Encoding.Default);
                    List<ModelReleaseInfo> mris = parse.ParseZhongsouWeb(html, keyword, kid);
                    if (mris == null || mris.Count == 0)
                    {
                        break;
                    }
                    DataPersistenceControl.GetInstance().Add(mris);
                    // Throttle to avoid being blacklisted.
                    Thread.Sleep(Interval2m);
                }
                #endregion

                #region Haosou search
                for (int page = 0; page < 10; page++)
                {
                    string url = string.Format("http://www.haosou.com/s?q={0}&pn={1}", encodeKey, page + 1);
                    string html = HtmlUtil.HttpGet(url, Encoding.UTF8);
                    List<ModelReleaseInfo> mris = parse.ParseHaosouWeb(html, keyword, kid);
                    if (mris == null || mris.Count == 0)
                    {
                        break;
                    }
                    DataPersistenceControl.GetInstance().Add(mris);
                    // Throttle to avoid being blacklisted.
                    Thread.Sleep(Interval50s);
                }
                #endregion
            }
        }
Example #4
0
        /// <summary>
        /// Queries the Sogou WeChat search once per selected keyword, parses the result page
        /// and persists the parsed records.
        /// </summary>
        /// <remarks>
        /// Keywords come from the Keywords table: all rows when <c>selectKID</c> is -1, otherwise the rows
        /// with that kid, optionally narrowed to the row whose name equals <c>selectKwName</c>.
        /// A fresh cookie container is used for every keyword; it and the cookie string filled in by the
        /// request are handed on to the parser. After every request the thread sleeps for
        /// <c>Interval2m</c> to avoid being blacklisted.
        /// </remarks>
        private void WeixinWebSpider()
        {
            #region Load keywords
            MySqlCmd cmd = new MySqlCmd();
            DataTable dtkey;
            if (selectKID == -1)
            {
                // All keywords.
                dtkey = cmd.GetTabel("select * from Keywords");
            }
            else
            {
                dtkey = cmd.GetTabel("select * from Keywords where kid=" + selectKID);
            }
            #endregion

            HtmlParse.Parse parse = new HtmlParse.Parse();

            // One search per keyword.
            foreach (DataRow row in dtkey.Rows)
            {
                // When a single named keyword is selected, skip every other row.
                if (selectKID != -1 && selectKwName != "全部"
                    && row["name"].ToString().Trim() != selectKwName)
                {
                    continue;
                }

                string keyword = row["KeyWord"].ToString().Trim();
                int kid = 0;
                int.TryParse(row["kid"].ToString().Trim(), out kid);

                #region Search by keyword
                // The keyword is percent-encoded (UTF-8, matching ie=utf8) so that characters such as
                // '&', '#', '+' or spaces cannot break the query string.
                string url = "http://weixin.sogou.com/weixin?type=2&query="
                    + System.Uri.EscapeDataString(keyword) + "&ie=utf8";

                CookieContainer cookies = new CookieContainer();
                string strCookie = "";
                string html = HtmlUtil.HttpGet(url, Encoding.UTF8, cookies, "weixin.sogou.com", ref strCookie);
                #endregion

                List<ModelReleaseInfo> mris = parse.ParseSogouWeixin(html, keyword, kid, cookies, strCookie);
                if (mris != null && mris.Count > 0)
                {
                    DataPersistenceControl.GetInstance().Add(mris);
                }

                // Throttle (2 minutes) to avoid being blacklisted.
                Thread.Sleep(Interval2m);
            }
        }
Example #5
0
        /// <summary>
        /// Searches Sina Weibo (first result page) and the Zhongsou microblog search (up to ten pages) for
        /// every selected keyword, parses each result page and persists the parsed records.
        /// </summary>
        /// <remarks>
        /// Keywords come from the Keywords table: all rows when <c>selectKID</c> is -1, otherwise the rows
        /// with that kid, optionally narrowed to the row whose name equals <c>selectKwName</c>.
        /// Zhongsou paging stops at the first page that yields no parsed records. The thread sleeps
        /// between requests to avoid being blacklisted.
        /// </remarks>
        private void WeiboWebSpider()
        {
            #region Load keywords
            MySqlCmd cmd = new MySqlCmd();
            DataTable dtkey;
            if (selectKID == -1)
            {
                // All keywords.
                dtkey = cmd.GetTabel("select * from Keywords");
            }
            else
            {
                dtkey = cmd.GetTabel("select * from Keywords where kid=" + selectKID);
            }
            #endregion

            HtmlParse.Parse parse = new HtmlParse.Parse();

            // One pass over both sources per keyword.
            foreach (DataRow row in dtkey.Rows)
            {
                // When a single named keyword is selected, skip every other row.
                if (selectKID != -1 && selectKwName != "全部"
                    && row["name"].ToString().Trim() != selectKwName)
                {
                    continue;
                }

                string keyword = row["KeyWord"].ToString().Trim();
                int kid = 0;
                int.TryParse(row["kid"].ToString().Trim(), out kid);

                #region Sina Weibo search
                String encodeKey = CrawlHtml.UrlEncode(keyword);
                string weiboUrl = "http://s.weibo.com/weibo/" + encodeKey + "?topnav=1&wvr=6&b=1&page=1";
                string weiboHtml = HtmlUtil.HttpGet(weiboUrl, Encoding.UTF8);
                List<ModelReleaseInfo> weiboInfos = parse.ParseSinaWeibo(weiboHtml, keyword, kid);
                if (weiboInfos != null && weiboInfos.Count > 0)
                {
                    DataPersistenceControl.GetInstance().Add(weiboInfos);
                }
                // Throttle to avoid being blacklisted by Weibo.
                Thread.Sleep(Interval30s);
                #endregion

                #region Zhongsou search
                for (int page = 0; page < 10; page++)
                {
                    // NOTE(review): the keyword is sent unencoded here because the charset this endpoint
                    // expects for the query is not evident (its response is decoded with
                    // Encoding.Default) — confirm and encode accordingly.
                    string url = string.Format("http://t.zhongsou.com/wb?w={0}&b={1}", keyword, page + 1);
                    string html = HtmlUtil.HttpGet(url, Encoding.Default);
                    List<ModelReleaseInfo> mris = parse.ParseZhongsouWeibo(html, keyword, kid);
                    if (mris == null || mris.Count == 0)
                    {
                        break;
                    }
                    DataPersistenceControl.GetInstance().Add(mris);
                    // Throttle to avoid being blacklisted.
                    Thread.Sleep(Interval2m);
                }
                #endregion
            }
        }
Example #6
0
        /// <summary>
        /// Queries the Baidu Tieba search once per selected keyword, parses the result page
        /// and persists the parsed records.
        /// </summary>
        /// <remarks>
        /// Keywords come from the Keywords table: all rows when <c>selectKID</c> is -1, otherwise the rows
        /// with that kid, optionally narrowed to the row whose name equals <c>selectKwName</c>.
        /// Only the first 60 results are requested (rn=60); paging is not handled.
        /// After every request the thread sleeps for <c>Interval50s</c> to avoid being blacklisted.
        /// </remarks>
        private void TiebaWebSpider()
        {
            #region Load keywords
            MySqlCmd cmd = new MySqlCmd();
            DataTable dtkey;
            if (selectKID == -1)
            {
                // All keywords.
                dtkey = cmd.GetTabel("select * from Keywords");
            }
            else
            {
                dtkey = cmd.GetTabel("select * from Keywords where kid=" + selectKID);
            }
            #endregion

            HtmlParse.Parse parse = new HtmlParse.Parse();

            // One search per keyword.
            foreach (DataRow row in dtkey.Rows)
            {
                // When a single named keyword is selected, skip every other row.
                if (selectKID != -1 && selectKwName != "全部"
                    && row["name"].ToString().Trim() != selectKwName)
                {
                    continue;
                }

                string keyword = row["KeyWord"].ToString().Trim();
                int kid = 0;
                int.TryParse(row["kid"].ToString().Trim(), out kid);

                #region Search by keyword
                // Returns the first 60 hits; paging is not handled for now.
                // The keyword is percent-encoded (UTF-8, matching ie=utf-8) so that characters such as
                // '&', '#', '+' or spaces cannot break the query string.
                string url = "http://tieba.baidu.com/f/search/res?ie=utf-8&rn=60&qw="
                    + System.Uri.EscapeDataString(keyword);
                string html = HtmlUtil.HttpGet(url, Encoding.Default);
                #endregion

                List<ModelReleaseInfo> mris = parse.ParseBaiduTieba(html, keyword, kid);
                if (mris != null && mris.Count > 0)
                {
                    DataPersistenceControl.GetInstance().Add(mris);
                }

                // Throttle to avoid being blacklisted.
                Thread.Sleep(Interval50s);
            }
        }
Example #7
0
        /// <summary>
        /// Runs a news search for every selected keyword on Baidu News, Bing News, Sogou News, Zhongsou News
        /// and Haosou News, parses each result page and persists the parsed records.
        /// </summary>
        /// <remarks>
        /// Keywords come from the Keywords table: all rows when <c>selectKID</c> is -1, otherwise the rows
        /// with that kid, optionally narrowed to the row whose name equals <c>selectKwName</c>.
        /// Baidu News is queried once (rn=100). For the paged engines, paging stops at the first page that
        /// yields no parsed records; after every page that did yield records the thread sleeps to avoid
        /// being blacklisted.
        /// </remarks>
        private void MediaWebSpider()
        {
            #region Load keywords
            MySqlCmd cmd = new MySqlCmd();
            DataTable dtkey;
            if (selectKID == -1)
            {
                // All keywords.
                dtkey = cmd.GetTabel("select * from Keywords");
            }
            else
            {
                dtkey = cmd.GetTabel("select * from Keywords where kid=" + selectKID);
            }
            #endregion

            HtmlParse.Parse parse = new HtmlParse.Parse();

            // One pass over all engines per keyword.
            foreach (DataRow row in dtkey.Rows)
            {
                // When a single named keyword is selected, skip every other row.
                if (selectKID != -1 && selectKwName != "全部"
                    && row["name"].ToString().Trim() != selectKwName)
                {
                    continue;
                }

                string keyword = row["KeyWord"].ToString().Trim();
                int kid = 0;
                int.TryParse(row["kid"].ToString().Trim(), out kid);

                // UTF-8 percent-encoded form, used for the engines whose responses are read as UTF-8 below,
                // so that reserved characters ('&', '#', '+', spaces, ...) cannot corrupt the query string.
                // NOTE(review): Baidu News, Sogou News and Zhongsou still receive the raw keyword because the
                // charset they expect for the query is not evident from this code — confirm and encode accordingly.
                string encodedKeyword = System.Uri.EscapeDataString(keyword);

                #region Baidu news search
                string baiduUrl = "http://news.baidu.com/ns?rn=100&word=" + keyword;
                string baiduHtml = HtmlUtil.HttpGet(baiduUrl, Encoding.UTF8);
                List<ModelReleaseInfo> baiduInfos = parse.ParseBaiduNews(baiduHtml, keyword, kid);
                if (baiduInfos != null && baiduInfos.Count > 0)
                {
                    // Persist the hits.
                    DataPersistenceControl.GetInstance().Add(baiduInfos);
                }
                #endregion

                #region Bing news search
                for (int page = 0; page < 10; page++)
                {
                    string url = string.Format("http://cn.bing.com/news/search?q={0}&first={1}&FORM=PENR", encodedKeyword, page * 10 + 1);
                    string html = HtmlUtil.HttpGet(url, Encoding.UTF8);
                    List<ModelReleaseInfo> mris = parse.ParseBingNews(html, keyword, kid);
                    if (mris == null || mris.Count == 0)
                    {
                        break;
                    }
                    DataPersistenceControl.GetInstance().Add(mris);
                    // Throttle to avoid being blacklisted.
                    Thread.Sleep(Interval50s);
                }
                #endregion

                #region Sogou news search
                for (int page = 0; page < 10; page++)
                {
                    string url = string.Format("http://news.sogou.com/news?query={0}&page={1}", keyword, page + 1);
                    string html = HtmlUtil.HttpGet(url, Encoding.Default);
                    List<ModelReleaseInfo> mris = parse.ParseSogouNews(html, keyword, kid);
                    if (mris == null || mris.Count == 0)
                    {
                        break;
                    }
                    DataPersistenceControl.GetInstance().Add(mris);
                    // Throttle (2 minutes) to avoid being blacklisted.
                    Thread.Sleep(Interval2m);
                }
                #endregion

                #region Zhongsou news search
                for (int page = 0; page < 10; page++)
                {
                    string url = string.Format("http://zixun.zhongsou.com/n?w={0}&b={1}", keyword, page + 1);
                    string html = HtmlUtil.HttpGet(url, Encoding.Default);
                    List<ModelReleaseInfo> mris = parse.ParseZhongsouNews(html, keyword, kid);
                    if (mris == null || mris.Count == 0)
                    {
                        break;
                    }
                    DataPersistenceControl.GetInstance().Add(mris);
                    // Throttle (2 minutes) to avoid being blacklisted.
                    Thread.Sleep(Interval2m);
                }
                #endregion

                #region Haosou news search
                for (int page = 0; page < 10; page++)
                {
                    string url = string.Format("http://news.haosou.com/ns?q={0}&pn={1}&tn=news&rank=rank&j=0", encodedKeyword, page + 1);
                    string html = HtmlUtil.HttpGet(url, Encoding.UTF8);
                    List<ModelReleaseInfo> mris = parse.ParseHaosouNews(html, keyword, kid);
                    if (mris == null || mris.Count == 0)
                    {
                        break;
                    }
                    DataPersistenceControl.GetInstance().Add(mris);
                    // Throttle to avoid being blacklisted.
                    Thread.Sleep(Interval50s);
                }
                #endregion
            }
        }