/// <summary>
/// Scans every href-bearing node of the page at <paramref name="curUrl"/> and stores the links
/// whose href matches <c>pagePatten</c> in the resource-page table.
/// </summary>
/// <param name="item">Category entry whose TypeName, ResType and SubClassName are copied onto each stored record.</param>
/// <param name="curUrl">Address of the page to scan.</param>
/// <returns>
/// The number of records inserted or refreshed divided by the number of href nodes found;
/// 0 when the page yields no href nodes.
/// </returns>
private static double GetPageContentHerf(tb_typelist item, string curUrl)
{
    HtmlDocument page = CaptureWebSite.GetHtmlDocument(curUrl, VerycdEncoding);
    HtmlNodeCollection hc = page.DocumentNode.SelectNodes("//@href");
    if (hc == null || hc.Count == 0)
    {
        return 0;
    }

    int count = 0;
    try
    {
        foreach (var node in hc)
        {
            string href = node.Attributes["href"].Value.ToString();
            if (!Regex.IsMatch(href, pagePatten))
            {
                continue;
            }

            // Links with no text, or whose text contains "全文", are not recorded.
            string title = node.InnerText.Replace("\r\n", "").Trim();
            if (title == "" || title.Contains("全文"))
            {
                continue;
            }

            string pageUrl = verycdWebSite + href;
            bool isNew = tb_resoucepageslist.FindCount(tb_resoucepageslist._.PageURL, pageUrl) < 1;

            tb_resoucepageslist record;
            if (isNew)
            {
                // Unknown page: create a fresh record for it.
                record = new tb_resoucepageslist();
                record.PageURL = pageUrl;
            }
            else
            {
                // Known page: refresh it only when its last update is more than 5 days old.
                record = tb_resoucepageslist.FindByPageURL(pageUrl);
                if ((DateTime.Now - record.UpdateTime).TotalDays <= 5)
                {
                    continue;
                }
            }

            record.ClassName = item.TypeName;
            record.CollectionMark = 0;
            record.InfoOrigin = "VeryCd";
            record.PageTitle = title;
            record.ResouceType = item.ResType;
            record.SubClassName = item.SubClassName;
            record.UpdateTime = DateTime.Now;

            if (isNew)
            {
                record.Insert();
            }
            else
            {
                record.Update();
            }

            count++;
        }
    }
    catch (Exception err)
    {
        // A failure stops the scan; the ratio for the work done so far is still returned.
        XTrace.WriteException(err);
    }
    finally
    {
        XTrace.WriteLine("通过网页:{0},获取到更新记录页面{1}条", curUrl, count);
    }

    return (double)count / hc.Count;
}
/// <summary>
/// Reads the resource-list page referenced by <paramref name="firClassListModel"/> and inserts a
/// resource-page record for every linked page that is not stored yet.
/// </summary>
/// <param name="firClassListModel">
/// List-page entry to process. Its CollectionMark is set to 1 before processing and to 2 afterwards,
/// whether or not processing succeeded; entries already marked 2 are skipped.
/// </param>
public static void GetPageResouceList(tb_fistclasslist firClassListModel)
{
    // Entries that were already collected are not processed again.
    if (firClassListModel.CollectionMark == 2)
    {
        return;
    }

    HtmlDocument doc = CaptureWebSite.GetHtmlDocument(firClassListModel.WebURL, VerycdEncoding);
    HtmlNodeCollection hc = doc.DocumentNode.SelectNodes(xPath_ResouceList);

    // NOTE(review): 1 presumably means "collection in progress" — confirm against the schema.
    firClassListModel.CollectionMark = 1;
    firClassListModel.Update();

    int count = 0;
    try
    {
        // SelectNodes yields null when nothing matches; treat that as an empty list
        // instead of relying on the catch block to absorb a NullReferenceException.
        if (hc != null)
        {
            for (int i = 0; i < hc.Count; i++)
            {
                // Skip entries without a usable link so that one malformed entry
                // does not abort the remaining entries of the page.
                var anchor = hc[i].SelectSingleNode(@"a[1]");
                if (anchor == null)
                {
                    continue;
                }

                var href = anchor.Attributes["href"];
                if (href == null)
                {
                    continue;
                }

                string url = verycdWebSite + href.Value.Trim();
                if (tb_resoucepageslist.FindCount(tb_resoucepageslist._.PageURL, url) < 1)
                {
                    tb_resoucepageslist model = new tb_resoucepageslist();
                    model.PageURL = url;
                    model.PageTitle = hc[i].InnerText.Trim();
                    model.ClassName = firClassListModel.ClassName;
                    model.SubClassName = firClassListModel.SubClassName;
                    model.CollectionMark = 0;
                    model.InfoOrigin = "VeryCd";
                    model.Remark = string.Empty;
                    model.ResouceType = firClassListModel.ResouceType;
                    model.UpdateTime = DateTime.Now;
                    model.Insert();
                    count++;
                }
            }
        }
    }
    catch (Exception err)
    {
        XTrace.WriteException(err);
    }
    finally
    {
        // The entry is marked as collected even after a failure, as before.
        firClassListModel.CollectionMark = 2;
        firClassListModel.Update();
        XTrace.WriteLine("通过大类资源列表页面:{0},获取到更新记录{1}条", firClassListModel.WebURL, count);
    }
}