예제 #1
0
		/// <summary>
		/// Seeds the type list with the VeryCD e-book category archive pages.
		/// </summary>
		/// <remarks>
		/// Each category URL is paired with its sub-class name. For every pair a
		/// console line is printed and then a freshly populated <c>tb_typelist</c>
		/// row is saved. This method performs no check for URLs stored earlier.
		/// </remarks>
		public static void StartCollectResouceList()
		{
			string topLevelName = "图书";
			ResouceType kind = ResouceType.EBook;

			// Category archive URL -> sub-class name.
			Hashtable categories = new Hashtable
			{
				{ "http://www.verycd.com/archives/book/novels/", "小说" },
				{ "http://www.verycd.com/archives/book/literature/", "文学" },
				{ "http://www.verycd.com/archives/book/social/", "人文社科" },
				{ "http://www.verycd.com/archives/book/eco/", "经济管理" },
				{ "http://www.verycd.com/archives/book/computer/", "计算机与网络" },
				{ "http://www.verycd.com/archives/book/life/", "生活" },
				{ "http://www.verycd.com/archives/book/edu/", "教育科技" },
				{ "http://www.verycd.com/archives/book/children/", "少儿" },
				{ "http://www.verycd.com/archives/book/others/", "其它图书" }
			};

			foreach (DictionaryEntry pair in categories)
			{
				string subClass = pair.Value.ToString();

				// The progress line is written before the row is persisted.
				Console.WriteLine("完成:" + subClass);

				tb_typelist row = new tb_typelist
				{
					URL = pair.Key.ToString().Trim(),
					Remark = string.Empty,
					SubClassName = subClass.Trim(),
					TypeName = topLevelName,
					UpdateTime = DateTime.Now,
					ResType = kind.ToString()
				};
				row.Save();
			}
		}
예제 #2
0
 /// <summary>
 /// Scans one listing page for resource links and records them, returning the
 /// share of the page's href nodes that led to an insert or a refresh.
 /// </summary>
 /// <param name="item">Type-list row whose TypeName, SubClassName and ResType are copied onto every stored page record.</param>
 /// <param name="curUrl">Address of the listing page to download and scan.</param>
 /// <returns>
 /// Number of inserted or refreshed records divided by the total number of
 /// href nodes on the page; 0 when the page yields no href nodes.
 /// </returns>
 private static double GetPageContentHerf(tb_typelist item, string curUrl)
 {
     HtmlDocument page = CaptureWebSite.GetHtmlDocument(curUrl, VerycdEncoding);
     HtmlNodeCollection links = page.DocumentNode.SelectNodes("//@href");
     if (links == null || links.Count == 0)
     {
         return 0;
     }

     int touched = 0;
     try
     {
         foreach (var link in links)
         {
             string href = link.Attributes["href"].Value.ToString();
             string pageUrl = verycdWebSite + href;
             if (!Regex.IsMatch(href, pagePatten))
             {
                 continue;
             }

             // Ignore links with an empty caption or a caption containing "全文".
             string title = link.InnerText.Replace("\r\n", "").Trim();
             if (title == "" || title.Contains("全文"))
             {
                 continue;
             }

             bool isNew = tb_resoucepageslist.FindCount(tb_resoucepageslist._.PageURL, pageUrl) < 1;
             tb_resoucepageslist record;
             if (isNew)
             {
                 record = new tb_resoucepageslist();
                 record.PageURL = pageUrl;
             }
             else
             {
                 record = tb_resoucepageslist.FindByPageURL(pageUrl);

                 // A known page is only refreshed once its stored timestamp is more than 5 days old.
                 if ((DateTime.Now - record.UpdateTime).TotalDays <= 5)
                 {
                     continue;
                 }
             }

             record.ClassName = item.TypeName;
             record.CollectionMark = 0;
             record.InfoOrigin = "VeryCd";
             record.PageTitle = title;
             record.ResouceType = item.ResType;
             record.SubClassName = item.SubClassName;
             record.UpdateTime = DateTime.Now;

             if (isNew)
             {
                 record.Insert();
             }
             else
             {
                 record.Update();
             }
             touched++;
         }
     }
     catch (Exception err)
     {
         // Any failure ends the scan; the rate reached so far is still returned.
         XTrace.WriteException(err);
     }
     finally
     {
         XTrace.WriteLine("通过网页:{0},获取到更新记录页面{1}条", curUrl, touched);
     }

     return ((double)touched) / ((double)links.Count);
 }
예제 #3
0
 /// <summary>
 /// Registers the VeryCD e-book category pages in <c>tb_typelist</c>,
 /// inserting a row only for URLs that <c>FindCount</c> reports as absent.
 /// </summary>
 protected static void StartCollectResouceList()
 {
     string topLevelName = "图书";
     ResouceType kind = ResouceType.EBook;

     // Category listing URL -> sub-class name.
     Hashtable categories = new Hashtable
     {
         { "http://www.verycd.com/sto/book/novels/", "小说" },
         { "http://www.verycd.com/sto/book/literature/", "文学" },
         { "http://www.verycd.com/sto/book/social/", "人文社科" },
         { "http://www.verycd.com/sto/book/eco/", "经济管理" },
         { "http://www.verycd.com/sto/book/computer/", "计算机与网络" },
         { "http://www.verycd.com/sto/book/life/", "生活" },
         { "http://www.verycd.com/sto/book/edu/", "教育科技" },
         { "http://www.verycd.com/sto/book/children/", "少儿" },
         { "http://www.verycd.com/sto/book/others/", "其它图书" }
     };

     foreach (DictionaryEntry pair in categories)
     {
         // The existence lookup uses the key as-is; the stored URL is trimmed.
         string rawUrl = pair.Key.ToString();
         bool alreadyStored = tb_typelist.FindCount(tb_typelist._.URL, rawUrl) >= 1;
         if (alreadyStored)
         {
             continue;
         }

         tb_typelist row = new tb_typelist
         {
             URL = pair.Key.ToString().Trim(),
             Remark = string.Empty,
             SubClassName = pair.Value.ToString().Trim(),
             TypeName = topLevelName,
             UpdateTime = DateTime.Now,
             ResType = kind.ToString()
         };
         row.Insert();
     }
 }
예제 #4
0
 /// <summary>
 /// Walks the numbered listing pages of a category, from page 1 up to page 10,
 /// and stops as soon as one page's rate from <c>GetPageContentHerf</c> exceeds 0.85.
 /// </summary>
 /// <param name="item">Category row; its URL is the base to which "page" and the page number are appended.</param>
 private static void AnalyNormalPage(tb_typelist item)
 {
     int pageNo = 1;
     while (pageNo <= 10)
     {
         string pageUrl = item.URL + "page" + pageNo.ToString();
         double rate = GetPageContentHerf(item, pageUrl);
         if (rate > 0.85)
         {
             break;
         }
         pageNo++;
     }
 }
예제 #5
0
        //以下为基本采集功能方法

        #region 根据大类资源网址获取资源集合列表网址
        /// <summary>
        /// Extracts the resource-collection list URLs from a top-level category page
        /// and stores every URL that is not yet present in <c>tb_fistclasslist</c>.
        /// </summary>
        /// <param name="typelist">
        /// Category row supplying the page URL, plus the TypeName, SubClassName and
        /// ResType values copied onto each newly inserted record.
        /// </param>
        public static void GetTypePageList(tb_typelist typelist)
        {
            HtmlDocument doc = CaptureWebSite.GetHtmlDocument(typelist.URL, VerycdEncoding);
            HtmlNodeCollection hc = doc.DocumentNode.SelectNodes(xPath_TypePageList);
            int count = 0; // number of records inserted for this page

            // SelectNodes yields null rather than an empty collection when the
            // XPath matches nothing, so guard before enumerating.
            if (hc != null)
            {
                foreach (var node in hc)
                {
                    try
                    {
                        string url = (verycdWebSite + node.SelectSingleNode(@"a[1]").Attributes["href"].Value.Trim());
                        if (tb_fistclasslist.FindCount(tb_fistclasslist._.WebURL, url) < 1)
                        {
                            tb_fistclasslist model = new tb_fistclasslist();
                            model.WebURL = url;
                            model.ClassName = typelist.TypeName;
                            model.SubClassName = typelist.SubClassName;
                            model.CollectionMark = 0;
                            model.InfoOrigin = "VeryCd";
                            model.Remark = string.Empty;
                            model.ResouceType = typelist.ResType.ToString();
                            model.UpdateTime = DateTime.Now;
                            model.Insert();
                            count++;
                        }
                    }
                    catch (Exception err)
                    {
                        // A node without a usable link, or a failed insert, is logged
                        // and skipped so the remaining nodes are still processed.
                        XTrace.WriteException(err);
                    }
                }
            }

            // The summary is written once per page, after all nodes were handled.
            XTrace.WriteLine("通过大类资源列表{0},获取到更新记录{1}条", typelist.URL, count);
        }