Esempio n. 1
0
        /// <summary>
        /// Crawls the page at <paramref name="motherUrl"/>: downloads it, walks every
        /// anchor element found in it, fetches each linked page that passes the
        /// similarity filter, runs the general-web parser on that page, and persists
        /// any release records the parser returns.
        /// </summary>
        /// <param name="motherUrl">URL of the page whose hyperlinks are crawled.</param>
        /// <param name="similar">
        /// Reference URL used as a filter. When null or empty every link is followed;
        /// otherwise only links whose similarity degree to it is at least 0.70.
        /// </param>
        /// <param name="keys">Keyword table handed to the parser.</param>
        /// <param name="sheng">Forwarded unchanged to the parser (presumably the province; confirm against callers).</param>
        /// <param name="shi">Forwarded unchanged to the parser (presumably the city; confirm against callers).</param>
        /// <param name="xian">Forwarded unchanged to the parser (presumably the county; confirm against callers).</param>
        /// <param name="name">Forwarded unchanged to the parser and to the nested crawl.</param>
        /// <param name="catchSub">
        /// When true, every followed link is itself crawled one level deeper. The
        /// nested crawl is started with <c>false</c>, so recursion stops after one level.
        /// </param>
        private void BasicWebSpider(string motherUrl, string similar, DataTable keys, string sheng, string shi, string xian, string name, bool catchSub)
        {
            // Some sites block clients that request pages too quickly, so every page
            // fetch is preceded by this pause. It makes the overall crawl slow.
            const int politeDelayMilliseconds = 2000;

            // Minimum similarity a link must reach against `similar` to be followed.
            const double minimumSimilarity = 0.70;

            HtmlParse.Parse parse = new HtmlParse.Parse();

            // Download the starting page and extract all of its hyperlinks.
            string motherHtml = HtmlUtil.getHtml(motherUrl, "");
            string[] anchors = HtmlUtil.GetElementsByTagName(motherHtml, "a");

            // Defensive: nothing to crawl if the helper returned no anchor array.
            if (anchors == null)
            {
                return;
            }

            #region Examine each link in turn
            foreach (string anchor in anchors)
            {
                // Stop as soon as the application signals that it is closing.
                if (Program.ProClose)
                {
                    break;
                }

                // Let pending UI messages be processed between links.
                Application.DoEvents();

                string url = CrawlHtml.processUrl(motherUrl, anchor);

                // With no reference URL every link qualifies; otherwise the link must
                // be similar enough to the reference.
                bool shouldFollow = string.IsNullOrEmpty(similar)
                    || HtmlUtil.getSimilarDegree(similar, url) >= minimumSimilarity;
                if (!shouldFollow)
                {
                    continue;
                }

                Thread.Sleep(politeDelayMilliseconds);

                // Fetch the source of the linked page; skip links that yield nothing.
                string pageHtml = HtmlUtil.getHtml(url, "");
                if (string.IsNullOrEmpty(pageHtml))
                {
                    continue;
                }

                // Crawl the hyperlinks of the linked page as well (one level only).
                if (catchSub)
                {
                    BasicWebSpider(url, similar, keys, sheng, shi, xian, name, false);
                }

                // Check whether the linked page matches the keywords. The `name`
                // parameter is passed here (not the enclosing class's `Name` member),
                // matching what the nested crawl above receives.
                List<ModelReleaseInfo> newsInfos = parse.ParseGeneralWeb(anchor, url, keys, sheng, shi, xian, name, pageHtml, 7);
                if (newsInfos != null && newsInfos.Count > 0)
                {
                    // Write the extracted records to the database.
                    DataPersistenceControl.GetInstance().Add(newsInfos);
                }
            }
            #endregion
        }