/// <summary>
/// Downloads the resource-collection list page at <c>firClassListModel.WebURL</c> and saves one
/// <c>tb_resoucepageslist</c> row for every node matched by <c>xPath_ResouceList</c>.
/// </summary>
/// <remarks>
/// <c>CollectionMark</c> is persisted as 1 before the nodes are processed and as 2 once the loop
/// has finished (presumably "in progress" / "collected" - confirm against the callers).
/// A page that yields no matching nodes is treated as an empty list and still ends up marked 2.
/// </remarks>
/// <param name="firClassListModel">List-page record to crawl; its <c>CollectionMark</c> is updated and persisted.</param>
public static void GetPageResouceList(tb_fistclasslist firClassListModel)
{
    HtmlDocument doc = CaptureWebSite.GetHtmlDocument(firClassListModel.WebURL, VerycdEncoding);
    HtmlNodeCollection hc = doc.DocumentNode.SelectNodes(xPath_ResouceList);

    firClassListModel.CollectionMark = 1;
    firClassListModel.Update();

    try
    {
        // SelectNodes returns null (not an empty collection) when nothing matches.
        int nodeCount = hc == null ? 0 : hc.Count;
        for (int i = 0; i < nodeCount; i++)
        {
            try
            {
                tb_resoucepageslist model = new tb_resoucepageslist();
                model.PageURL = verycdWebSite + hc[i].SelectSingleNode(@"a[1]").Attributes["href"].Value.Trim();
                model.PageTitle = hc[i].InnerText.Trim();
                model.ClassName = firClassListModel.ClassName;
                model.SubClassName = firClassListModel.SubClassName;
                model.CollectionMark = 0;
                model.InfoOrigin = "VeryCd";
                model.Remark = string.Empty;
                model.ResouceType = firClassListModel.ResouceType;
                model.UpdateTime = DateTime.Now;
                model.Save();
            }
            catch (Exception err)
            {
                // Best effort: one bad entry must not stop the rest of the list,
                // but the failure is reported instead of being dropped silently.
                Console.Error.WriteLine("GetPageResouceList: skipped entry {0} of {1}: {2}", i, firClassListModel.WebURL, err.Message);
            }
        }
    }
    finally
    {
        // Persist the final state exactly once, after all entries were attempted
        // (also when the page contained no entries at all).
        firClassListModel.CollectionMark = 2;
        firClassListModel.Update();
    }
}
/// <summary>
/// Downloads the resource page at <c>respageListModel.PageURL</c> and saves one
/// <c>tb_resoucelink</c> row for every node selected by the <c>//@ed2k</c> XPath.
/// </summary>
/// <remarks>
/// Each <c>ed2k</c> attribute value is URL-decoded and split on '|'; field 2 is stored as the
/// resource name and field 4 as <c>ResouceMD5</c>. Links with fewer than five fields are skipped.
/// <c>CollectionMark</c> is persisted as 1 before processing and as 2 afterwards
/// (presumably "in progress" / "collected" - confirm against the callers).
/// </remarks>
/// <param name="respageListModel">Resource-page record to crawl; its <c>CollectionMark</c> is updated and persisted.</param>
public static void GetResoucePageInfo(tb_resoucepageslist respageListModel)
{
    HtmlDocument doc = CaptureWebSite.GetHtmlDocument(respageListModel.PageURL, VerycdEncoding);
    HtmlNodeCollection hc = doc.DocumentNode.SelectNodes("//@ed2k");

    respageListModel.CollectionMark = 1;
    respageListModel.Update();

    // SelectNodes returns null (not an empty collection) when the page has no ed2k links.
    int nodeCount = hc == null ? 0 : hc.Count;
    for (int i = 0; i < nodeCount; i++)
    {
        try
        {
            string name = hc[i].Attributes["ed2k"].Value.Trim();
            string urlLink = HttpUtility.UrlDecode(name, Encoding.UTF8);

            // Split once and validate, instead of relying on an index exception for malformed links.
            string[] linkFields = urlLink.Split('|');
            if (linkFields.Length < 5)
            {
                continue;
            }

            tb_resoucelink model = new tb_resoucelink();
            model.ResouceMD5 = linkFields[4];
            model.ResouceName = linkFields[2];
            model.ResouceLink = urlLink;
            model.FromURL = respageListModel.PageURL;
            model.ClassName = respageListModel.ClassName;
            model.SubClassName = respageListModel.SubClassName;
            model.InfoOrigin = "VeryCd";
            model.Remark = string.Empty;
            model.ResouceType = respageListModel.ResouceType;
            model.UpdateTime = DateTime.Now;
            model.Save();
        }
        catch (Exception err)
        {
            // Best effort: one bad link must not stop the rest of the page,
            // but the failure is reported instead of being dropped silently.
            Console.Error.WriteLine("GetResoucePageInfo: skipped link {0} of {1}: {2}", i, respageListModel.PageURL, err.Message);
        }
    }

    respageListModel.CollectionMark = 2;
    respageListModel.Update();
    Console.WriteLine(respageListModel.PageURL);
}
/// <summary>
/// Downloads the resource page at <c>respageListModel.PageURL</c> and inserts a
/// <c>tb_resoucelink</c> row for every ed2k link whose hash field is not stored yet.
/// </summary>
/// <remarks>
/// Each <c>ed2k</c> attribute value is URL-decoded and split on '|'; field 2 is stored as the
/// resource name, field 3 as the size and field 4 as <c>ResouceMD5</c> (also used as the
/// duplicate check). Links with fewer than five fields are logged and skipped.
/// Records whose <c>CollectionMark</c> is already 2 are not processed again; otherwise the mark is
/// persisted as 1 before processing and as 2 afterwards, even when an exception was logged.
/// </remarks>
/// <param name="respageListModel">Resource-page record to crawl; its <c>CollectionMark</c> is updated and persisted.</param>
public static void GetResoucePageInfo(tb_resoucepageslist respageListModel)
{
    if (respageListModel.CollectionMark == 2)
    {
        return;
    }

    HtmlDocument doc = CaptureWebSite.GetHtmlDocument(respageListModel.PageURL, VerycdEncoding);
    HtmlNodeCollection hc = doc.DocumentNode.SelectNodes("//@ed2k");

    respageListModel.CollectionMark = 1;
    respageListModel.Update();

    int count = 0;
    try
    {
        // SelectNodes returns null (not an empty collection) when the page has no ed2k links;
        // that is a normal outcome and should not be reported as an exception.
        int nodeCount = hc == null ? 0 : hc.Count;
        for (int i = 0; i < nodeCount; i++)
        {
            string name = hc[i].Attributes["ed2k"].Value.Trim();
            string urlLink = HttpUtility.UrlDecode(name, Encoding.UTF8);
            string[] linkFields = urlLink.Split('|');

            // A single malformed link must not abort the remaining links of the page.
            if (linkFields.Length < 5)
            {
                XTrace.WriteLine("Skipping malformed ed2k link on {0}: {1}", respageListModel.PageURL, urlLink);
                continue;
            }

            if (tb_resoucelink.FindCount(tb_resoucelink._.ResouceMD5, linkFields[4]) < 1)
            {
                tb_resoucelink model = new tb_resoucelink();
                model.ResouceMD5 = linkFields[4];
                model.ResouceName = linkFields[2];

                // Parse directly as unsigned so a negative or unparsable size becomes 0
                // instead of wrapping around to a huge value.
                ulong size;
                if (!ulong.TryParse(linkFields[3], out size))
                {
                    size = 0;
                }
                model.Size = size;

                model.ResouceLink = urlLink;
                model.FromURL = respageListModel.PageURL;
                model.ClassName = respageListModel.ClassName;
                model.SubClassName = respageListModel.SubClassName;
                model.InfoOrigin = "VeryCd";
                model.Remark = string.Empty;
                model.ResouceType = respageListModel.ResouceType;
                model.UpdateTime = DateTime.Now;
                model.IsDownload = 0;
                model.Insert();
                count++;
            }
        }
    }
    catch (Exception err)
    {
        XTrace.WriteException(err);
    }
    finally
    {
        respageListModel.CollectionMark = 2;
        respageListModel.Update();
        XTrace.WriteLine("从资源页面{0}获取到{1}条新的资源链接", respageListModel.PageURL, count);
    }
}
/// <summary>
/// Scans every node selected by <c>//@href</c> on <paramref name="curUrl"/> and records the links
/// whose href matches <c>pagePatten</c> in <c>tb_resoucepageslist</c>.
/// </summary>
/// <remarks>
/// A link whose URL is not stored yet is inserted. A link that is already stored is refreshed
/// (and its <c>CollectionMark</c> reset to 0) only when its <c>UpdateTime</c> is more than 5 days old.
/// Links with an empty title or a title containing "全文" are ignored.
/// </remarks>
/// <param name="item">Type record supplying the class name, sub class name and resource type for the rows.</param>
/// <param name="curUrl">URL of the page to scan.</param>
/// <returns>
/// The number of inserted or refreshed rows divided by the number of selected nodes;
/// 0 when no node was selected.
/// </returns>
private static double GetPageContentHerf(tb_typelist item, string curUrl)
{
    HtmlDocument doc = CaptureWebSite.GetHtmlDocument(curUrl, VerycdEncoding);
    HtmlNodeCollection hc = doc.DocumentNode.SelectNodes("//@href");
    if (hc == null || hc.Count == 0)
    {
        return 0;
    }

    int changed = 0;
    try
    {
        foreach (var linkNode in hc)
        {
            string href = linkNode.Attributes["href"].Value.ToString();
            if (!Regex.IsMatch(href, pagePatten))
            {
                continue;
            }

            // Titles that are empty or contain the "全文" marker are not recorded.
            string title = linkNode.InnerText.Replace("\r\n", "").Trim();
            if (title == "" || title.Contains("全文"))
            {
                continue;
            }

            string pageUrl = verycdWebSite + href;
            bool isNew = tb_resoucepageslist.FindCount(tb_resoucepageslist._.PageURL, pageUrl) < 1;

            tb_resoucepageslist page;
            if (isNew)
            {
                page = new tb_resoucepageslist();
                page.PageURL = pageUrl;
            }
            else
            {
                page = tb_resoucepageslist.FindByPageURL(pageUrl);

                // Known pages are only refreshed once their last update is more than 5 days old.
                if (!((DateTime.Now - page.UpdateTime).TotalDays > 5))
                {
                    continue;
                }
            }

            page.ClassName = item.TypeName;
            page.CollectionMark = 0;
            page.InfoOrigin = "VeryCd";
            page.PageTitle = title;
            page.ResouceType = item.ResType;
            page.SubClassName = item.SubClassName;
            page.UpdateTime = DateTime.Now;

            if (isNew)
            {
                page.Insert();
            }
            else
            {
                page.Update();
            }
            changed++;
        }
    }
    catch (Exception err)
    {
        // A failure ends the scan; the ratio below reflects what was processed up to that point.
        XTrace.WriteException(err);
    }
    finally
    {
        XTrace.WriteLine("通过网页:{0},获取到更新记录页面{1}条", curUrl, changed);
    }

    return ((double)changed) / ((double)hc.Count);
}
/// <summary>
/// Downloads the resource-collection list page at <c>firClassListModel.WebURL</c> and inserts a
/// <c>tb_resoucepageslist</c> row for every listed resource page whose URL is not stored yet.
/// </summary>
/// <remarks>
/// List pages whose <c>CollectionMark</c> is already 2 have been collected and are not crawled
/// again. Otherwise the mark is persisted as 1 before processing and as 2 afterwards, even when an
/// exception was logged. Entries without an <c>a[1]</c> child or without an <c>href</c> attribute
/// are skipped.
/// </remarks>
/// <param name="firClassListModel">List-page record to crawl; its <c>CollectionMark</c> is updated and persisted.</param>
public static void GetPageResouceList(tb_fistclasslist firClassListModel)
{
    // Already collected: do not repeat the work.
    if (firClassListModel.CollectionMark == 2)
    {
        return;
    }

    HtmlDocument doc = CaptureWebSite.GetHtmlDocument(firClassListModel.WebURL, VerycdEncoding);
    HtmlNodeCollection hc = doc.DocumentNode.SelectNodes(xPath_ResouceList);

    firClassListModel.CollectionMark = 1;
    firClassListModel.Update();

    int count = 0;
    try
    {
        // SelectNodes returns null (not an empty collection) when nothing matches;
        // that is a normal outcome and should not be reported as an exception.
        int nodeCount = hc == null ? 0 : hc.Count;
        for (int i = 0; i < nodeCount; i++)
        {
            // An entry without a usable link must not abort the remaining entries.
            var anchor = hc[i].SelectSingleNode(@"a[1]");
            if (anchor == null)
            {
                continue;
            }

            var href = anchor.Attributes["href"];
            if (href == null)
            {
                continue;
            }

            string url = verycdWebSite + href.Value.Trim();
            if (tb_resoucepageslist.FindCount(tb_resoucepageslist._.PageURL, url) < 1)
            {
                tb_resoucepageslist model = new tb_resoucepageslist();
                model.PageURL = url;
                model.PageTitle = hc[i].InnerText.Trim();
                model.ClassName = firClassListModel.ClassName;
                model.SubClassName = firClassListModel.SubClassName;
                model.CollectionMark = 0;
                model.InfoOrigin = "VeryCd";
                model.Remark = string.Empty;
                model.ResouceType = firClassListModel.ResouceType;
                model.UpdateTime = DateTime.Now;
                model.Insert();
                count++;
            }
        }
    }
    catch (Exception err)
    {
        XTrace.WriteException(err);
    }
    finally
    {
        firClassListModel.CollectionMark = 2;
        firClassListModel.Update();
        XTrace.WriteLine("通过大类资源列表页面:{0},获取到更新记录{1}条", firClassListModel.WebURL, count);
    }
}