// Seeds tb_typelist with the VeryCD book ("图书") category listing pages.
// Each entry maps a category URL to its sub-category name; one tb_typelist
// record is saved per entry and a progress line is written to the console.
public static void StartCollectResouceList()
{
    const string topClassName = "图书";
    string resTypeName = ResouceType.EBook.ToString();

    // Category listing URL -> sub-category display name.
    Hashtable categories = new Hashtable
    {
        { "http://www.verycd.com/archives/book/novels/", "小说" },
        { "http://www.verycd.com/archives/book/literature/", "文学" },
        { "http://www.verycd.com/archives/book/social/", "人文社科" },
        { "http://www.verycd.com/archives/book/eco/", "经济管理" },
        { "http://www.verycd.com/archives/book/computer/", "计算机与网络" },
        { "http://www.verycd.com/archives/book/life/", "生活" },
        { "http://www.verycd.com/archives/book/edu/", "教育科技" },
        { "http://www.verycd.com/archives/book/children/", "少儿" },
        { "http://www.verycd.com/archives/book/others/", "其它图书" }
    };

    foreach (DictionaryEntry entry in categories)
    {
        string subClass = entry.Value.ToString();
        Console.WriteLine("完成:" + subClass);

        tb_typelist record = new tb_typelist
        {
            URL = entry.Key.ToString().Trim(),
            Remark = string.Empty,
            SubClassName = subClass.Trim(),
            TypeName = topClassName,
            UpdateTime = DateTime.Now,
            ResType = resTypeName
        };
        record.Save();
    }
}
// Scans every href on the page at curUrl and records matching resource pages
// in tb_resoucepageslist.
//   item   - category record supplying TypeName, SubClassName and ResType.
//   curUrl - address of the listing page to download and scan.
// A link is considered when its href matches pagePatten and its trimmed text is
// non-empty and does not contain "全文". Unknown page URLs are inserted; known
// ones are refreshed only when their last update is more than 5 days old.
// Returns (inserted + refreshed) / (number of href nodes), or 0 when the page
// has no href nodes. Exceptions raised while processing links are logged and the
// ratio reached so far is returned.
private static double GetPageContentHerf(tb_typelist item, string curUrl)
{
    HtmlDocument doc = CaptureWebSite.GetHtmlDocument(curUrl, VerycdEncoding);
    HtmlNodeCollection links = doc.DocumentNode.SelectNodes("//@href");
    if (links == null || links.Count == 0)
    {
        return 0;
    }

    int touched = 0;
    try
    {
        foreach (var link in links)
        {
            string href = link.Attributes["href"].Value.ToString();
            if (!Regex.IsMatch(href, pagePatten))
            {
                continue;
            }

            // Skip links without text and the "全文" (full text) links.
            string title = link.InnerText.Replace("\r\n", "").Trim();
            if (title == "" || title.Contains("全文"))
            {
                continue;
            }

            string pageUrl = verycdWebSite + href;
            if (tb_resoucepageslist.FindCount(tb_resoucepageslist._.PageURL, pageUrl) < 1)
            {
                // Not stored yet: insert a new record.
                tb_resoucepageslist created = new tb_resoucepageslist();
                created.PageURL = pageUrl;
                FillPageRecord(created, item, title);
                created.Insert();
                touched++;
                continue;
            }

            // Already stored: refresh it only if it has gone stale.
            tb_resoucepageslist existing = tb_resoucepageslist.FindByPageURL(pageUrl);
            if ((DateTime.Now - existing.UpdateTime).TotalDays > 5)
            {
                FillPageRecord(existing, item, title);
                existing.Update();
                touched++;
            }
        }
    }
    catch (Exception err)
    {
        XTrace.WriteException(err);
    }
    finally
    {
        XTrace.WriteLine("通过网页:{0},获取到更新记录页面{1}条", curUrl, touched);
    }

    return ((double)touched) / ((double)links.Count);
}

// Copies the category fields of item and the link title onto record, resets its
// collection mark to 0 and stamps it with the current time.
private static void FillPageRecord(tb_resoucepageslist record, tb_typelist item, string title)
{
    record.ClassName = item.TypeName;
    record.CollectionMark = 0;
    record.InfoOrigin = "VeryCd";
    record.PageTitle = title;
    record.ResouceType = item.ResType;
    record.SubClassName = item.SubClassName;
    record.UpdateTime = DateTime.Now;
}
// Seeds tb_typelist with the VeryCD book ("图书") category listing pages.
// A record is inserted only for URLs that tb_typelist does not contain yet.
// Note: the same categories were previously listed under /archives/book/...;
// those entries were commented out in favour of the /sto/book/... URLs below.
protected static void StartCollectResouceList()
{
    const string topClassName = "图书";
    string resTypeName = ResouceType.EBook.ToString();

    // Category listing URL -> sub-category display name.
    Hashtable categories = new Hashtable
    {
        { "http://www.verycd.com/sto/book/novels/", "小说" },
        { "http://www.verycd.com/sto/book/literature/", "文学" },
        { "http://www.verycd.com/sto/book/social/", "人文社科" },
        { "http://www.verycd.com/sto/book/eco/", "经济管理" },
        { "http://www.verycd.com/sto/book/computer/", "计算机与网络" },
        { "http://www.verycd.com/sto/book/life/", "生活" },
        { "http://www.verycd.com/sto/book/edu/", "教育科技" },
        { "http://www.verycd.com/sto/book/children/", "少儿" },
        { "http://www.verycd.com/sto/book/others/", "其它图书" }
    };

    foreach (DictionaryEntry entry in categories)
    {
        string categoryUrl = entry.Key.ToString();

        // Already registered: leave the existing record untouched.
        if (tb_typelist.FindCount(tb_typelist._.URL, categoryUrl) >= 1)
        {
            continue;
        }

        tb_typelist record = new tb_typelist
        {
            URL = categoryUrl.Trim(),
            Remark = string.Empty,
            SubClassName = entry.Value.ToString().Trim(),
            TypeName = topClassName,
            UpdateTime = DateTime.Now,
            ResType = resTypeName
        };
        record.Insert();
    }
}
// Processes the regular paginated listing of a category: visits
// item.URL + "page1" through "page10" in order and stops as soon as
// GetPageContentHerf reports a ratio above 0.85 for a page.
private static void AnalyNormalPage(tb_typelist item)
{
    int page = 1;
    while (page <= 10)
    {
        string pageUrl = item.URL + "page" + page.ToString();
        double ratio = GetPageContentHerf(item, pageUrl);
        if (ratio > 0.85)
        {
            break;
        }

        page++;
    }
}
// The basic collection (scraping) methods follow.
#region 根据大类资源网址获取资源集合列表网址
// Downloads the category page at typelist.URL, selects nodes with
// xPath_TypePageList and, for each node, takes the href of its first <a> child.
// Every resulting URL (prefixed with verycdWebSite) that is not yet present in
// tb_fistclasslist is inserted as a new record carrying the category's
// TypeName, SubClassName and ResType.
//   typelist - category record whose URL is scanned.
// A failure on a single node is logged and does not stop the remaining nodes.
// One summary line with the number of inserted records is logged per call.
public static void GetTypePageList(tb_typelist typelist)
{
    HtmlDocument doc = CaptureWebSite.GetHtmlDocument(typelist.URL, VerycdEncoding);
    HtmlNodeCollection hc = doc.DocumentNode.SelectNodes(xPath_TypePageList);
    int count = 0; // number of records inserted

    // SelectNodes can yield null when nothing matches (GetPageContentHerf guards
    // against the same case), so only iterate when there is a result.
    if (hc != null)
    {
        for (int i = 0; i < hc.Count; i++)
        {
            try
            {
                string url = verycdWebSite + hc[i].SelectSingleNode(@"a[1]").Attributes["href"].Value.Trim();
                if (tb_fistclasslist.FindCount(tb_fistclasslist._.WebURL, url) < 1)
                {
                    tb_fistclasslist model = new tb_fistclasslist();
                    model.WebURL = url;
                    model.ClassName = typelist.TypeName;
                    model.SubClassName = typelist.SubClassName;
                    model.CollectionMark = 0;
                    model.InfoOrigin = "VeryCd";
                    model.Remark = string.Empty;
                    model.ResouceType = typelist.ResType.ToString();
                    model.UpdateTime = DateTime.Now;
                    model.Insert();
                    count++;
                }
            }
            catch (Exception err)
            {
                // Log and move on to the next node.
                XTrace.WriteException(err);
            }
        }
    }

    // Log the summary once per call rather than once per node.
    XTrace.WriteLine("通过大类资源列表{0},获取到更新记录{1}条", typelist.URL, count);
}