예제 #1
0
 /// <summary>
 /// Downloads the page at <paramref name="curUrl"/>, walks the nodes selected by the XPath
 /// <c>//@href</c>, and stores every link whose raw href matches <c>pagePatten</c> in the
 /// resource-page table: unknown URLs are inserted, known URLs are refreshed once their
 /// UpdateTime is more than five days old.
 /// </summary>
 /// <param name="item">Category entry whose TypeName, ResType and SubClassName are copied onto each stored record.</param>
 /// <param name="curUrl">Address of the page to download and scan.</param>
 /// <returns>
 /// The number of records inserted or refreshed divided by the total number of href nodes found
 /// on the page; 0 when the page yields no href nodes.
 /// </returns>
 private static double GetPageContentHerf(tb_typelist item, string curUrl)
 {
     // An already stored record is refreshed only when it is older than this many days.
     const int refreshAfterDays = 5;

     HtmlDocument doc = CaptureWebSite.GetHtmlDocument(curUrl, VerycdEncoding);
     HtmlNodeCollection hc = doc.DocumentNode.SelectNodes("//@href");
     if (hc == null || hc.Count == 0) return 0;

     int count = 0;
     try
     {
         foreach (var s in hc)
         {
             // Skip nodes without a usable href instead of letting a null dereference
             // abort the scan of every remaining link.
             var hrefAttribute = s.Attributes["href"];
             if (hrefAttribute == null || hrefAttribute.Value == null) continue;

             string urls = hrefAttribute.Value;
             if (!Regex.IsMatch(urls, pagePatten)) continue;

             // Links without text, and links whose text contains "全文" ("full text"), are ignored.
             string name = s.InnerText.Replace("\r\n", "").Trim();
             if (name == "" || name.Contains("全文")) continue;

             // The stored URL is the site prefix followed by the raw href.
             string url = verycdWebSite + urls;

             if (tb_resoucepageslist.FindCount(tb_resoucepageslist._.PageURL, url) < 1)
             {
                 // Unknown URL: insert a fresh record.
                 tb_resoucepageslist model = new tb_resoucepageslist();
                 model.PageURL = url;
                 model.ClassName = item.TypeName;
                 model.CollectionMark = 0;
                 model.InfoOrigin = "VeryCd";
                 model.PageTitle = name;
                 model.ResouceType = item.ResType;
                 model.SubClassName = item.SubClassName;
                 model.UpdateTime = DateTime.Now;
                 model.Insert();
                 count++;
             }
             else
             {
                 tb_resoucepageslist model = tb_resoucepageslist.FindByPageURL(url);

                 // The row may have disappeared between the count and the lookup; nothing to refresh then.
                 if (model == null) continue;

                 if ((DateTime.Now - model.UpdateTime).TotalDays > refreshAfterDays)
                 {
                     // Stale record: overwrite its fields and reset CollectionMark to 0
                     // (presumably "not collected yet" - confirm against the collector).
                     model.ClassName = item.TypeName;
                     model.CollectionMark = 0;
                     model.InfoOrigin = "VeryCd";
                     model.PageTitle = name;
                     model.ResouceType = item.ResType;
                     model.SubClassName = item.SubClassName;
                     model.UpdateTime = DateTime.Now;
                     model.Update();
                     count++;
                 }
             }
         }
     }
     catch (Exception err)
     {
         // Best effort: log the failure and report the rate reached so far.
         XTrace.WriteException(err);
     }
     finally
     {
         XTrace.WriteLine("通过网页:{0},获取到更新记录页面{1}条", curUrl, count);
     }

     return ((double)count) / ((double)hc.Count);
 }
예제 #2
0
 /// <summary>
 /// Downloads the listing page at <c>firClassListModel.WebURL</c>, selects its entries with
 /// <c>xPath_ResouceList</c>, and inserts a resource-page record for every entry whose URL
 /// is not stored yet.
 /// </summary>
 /// <remarks>
 /// A model whose CollectionMark is already 2 is skipped. Otherwise the mark is set to 1 and
 /// saved before the entries are processed, and set to 2 and saved afterwards, even when
 /// processing throws (the exception is logged, not rethrown).
 /// </remarks>
 /// <param name="firClassListModel">Listing-page record supplying the URL and the class, sub-class and resource type copied onto each new record.</param>
 public static void GetPageResouceList(tb_fistclasslist firClassListModel)
 {
     // Already collected: do not process the same listing twice.
     if (firClassListModel.CollectionMark == 2) return;

     HtmlDocument doc = CaptureWebSite.GetHtmlDocument(firClassListModel.WebURL, VerycdEncoding);
     HtmlNodeCollection hc = doc.DocumentNode.SelectNodes(xPath_ResouceList);

     firClassListModel.CollectionMark = 1;
     firClassListModel.Update();

     int count = 0;
     try
     {
         // A page with no matching entries yields null; treat it as an empty list
         // rather than logging a null-reference exception.
         int total = hc == null ? 0 : hc.Count;
         for (int i = 0; i < total; i++)
         {
             // Skip entries without a first <a> child or without an href so that one
             // malformed entry does not abort the remaining ones.
             var anchor = hc[i].SelectSingleNode(@"a[1]");
             if (anchor == null) continue;
             var hrefAttribute = anchor.Attributes["href"];
             if (hrefAttribute == null || hrefAttribute.Value == null) continue;

             string url = verycdWebSite + hrefAttribute.Value.Trim();
             if (tb_resoucepageslist.FindCount(tb_resoucepageslist._.PageURL, url) < 1)
             {
                 tb_resoucepageslist model = new tb_resoucepageslist();
                 model.PageURL = url;
                 model.PageTitle = hc[i].InnerText.Trim();
                 model.ClassName = firClassListModel.ClassName;
                 model.SubClassName = firClassListModel.SubClassName;
                 model.CollectionMark = 0;
                 model.InfoOrigin = "VeryCd";
                 model.Remark = string.Empty;
                 model.ResouceType = firClassListModel.ResouceType;
                 model.UpdateTime = DateTime.Now;
                 model.Insert();
                 count++;
             }
         }
     }
     catch (Exception err)
     {
         XTrace.WriteException(err);
     }
     finally
     {
         // Mark the listing as collected whether or not every entry was stored.
         firClassListModel.CollectionMark = 2;
         firClassListModel.Update();
         XTrace.WriteLine("通过大类资源列表页面:{0},获取到更新记录{1}条", firClassListModel.WebURL, count);
     }
 }