/// <summary>
/// Finds every region of <paramref name="html"/> that starts with the model's ListStart text and ends
/// with its ListEnd text, strips those two delimiters from each region, and passes the collected
/// regions (wrapped in a minimal html document) to htmlHelper.GetAllLink.
/// </summary>
/// <param name="html">Page source to search. Null or empty yields an empty result.</param>
/// <param name="model">Filter settings. ListStart, ListEnd, FillStart and FillEnd are read here;
/// the whole model is then forwarded to htmlHelper.GetAllLink.</param>
/// <returns>
/// Whatever htmlHelper.GetAllLink returns for the collected regions, or an empty string when
/// no region was found or no delimiter was configured.
/// </returns>
private string FilterList(string html, M_Collect_ListFilter model)
        {
            // Null delimiters behave like empty ones in the pattern concatenation below,
            // so normalise them once up front.
            string listStart = model.ListStart ?? "";
            string listEnd   = model.ListEnd ?? "";

            // With no delimiter at all the pattern would match the empty string at every position,
            // and string.Replace rejects an empty search value; there is no region to extract.
            if (string.IsNullOrEmpty(html) || (listStart.Length == 0 && listEnd.Length == 0))
            {
                return "";
            }

            // NOTE(review): the delimiters are inserted into the pattern unescaped but removed below
            // as literal text. If they are meant to be literals, Regex.Escape would be safer - confirm.
            Regex         regionPattern = new Regex(listStart + "[\\s\\S]*?" + listEnd);
            StringBuilder sb            = new StringBuilder();

            foreach (Match m in regionPattern.Matches(html))
            {
                string url = m.Value;

                // string.Replace throws when the search value is empty, so only strip delimiters that exist.
                if (listStart.Length > 0)
                {
                    url = url.Replace(listStart, "");
                }
                if (listEnd.Length > 0)
                {
                    url = url.Replace(listEnd, "");
                }

                // Regions that mention no "http" (which also covers "https") are wrapped with the
                // root path or the user-configured fill values.
                if (url.IndexOf("http", StringComparison.OrdinalIgnoreCase) < 0)
                {
                    url = model.FillStart + url + model.FillEnd;
                }
                sb.AppendLine(url);
            }

            if (sb.Length == 0)
            {
                return "";
            }
            return htmlHelper.GetAllLink("<html><head></head><body>" + sb.ToString() + "</body></html>", model);
        }
 /// <summary>
 /// On the initial (non-postback) request, loads the collection item for ItemID and fills the
 /// page controls from it: the source link, the hidden source HTML, the list-filter fields,
 /// the HTML text box and the breadcrumb.
 /// </summary>
 /// <param name="sender">Event source (unused).</param>
 /// <param name="e">Event data (unused).</param>
 protected void Page_Load(object sender, EventArgs e)
 {
     // Postbacks keep whatever the controls already hold.
     if (IsPostBack)
     {
         return;
     }

     M_CollectionItem item = bc.GetSelect(ItemID);

     Url          = item.CollUrl;
     lblLink.Text = "<a href='" + item.CollUrl + "'  target=\"_blank\" class='btn btn-info' style='color:white;'>查看原页面</a>";

     // Keep the result of GetHtmlFromSite in the hidden field so later handlers can read it back.
     SourceHtml_Hid.Value = htmlHelper.GetHtmlFromSite(Url);

     // Restore previously saved list-filter settings into the form, when there are any.
     bool hasListSettings = !string.IsNullOrEmpty(item.ListSettings);
     if (hasListSettings)
     {
         lfMod = JsonConvert.DeserializeObject <M_Collect_ListFilter>(item.ListSettings);

         ListStart_T.Text   = lfMod.ListStart;
         ListEnd_T.Text     = lfMod.ListEnd;
         CharContain_T.Text = lfMod.CharContain;
         CharRegex_T.Text   = lfMod.CharRegex;
         FillStart_T.Text   = lfMod.FillStart;
         FillEnd_T.Text     = lfMod.FillEnd;
     }

     // Show the stored link list if present, otherwise the source HTML held in the hidden field.
     txtHtml.Text = string.IsNullOrEmpty(item.LinkList) ? SourceHtml_Hid.Value : item.LinkList;

     Call.SetBreadCrumb(Master, "<li><a href='ContentManage.aspx'>内容管理</a></li><li><a href='CollectionManage.aspx'>信息采集</a></li><li class='active'>当前:" + item.ItemName + "</li>");
 }
// Example no. 3
// 0
        /// <summary>
        /// Collects the href of every anchor (A tag) returned by GetTagList for <paramref name="html"/>,
        /// one link per line. Per the original note the input must be wrapped in an html element.
        /// Links with an empty href, or an href containing "javascript" (any casing), are skipped.
        /// </summary>
        /// <param name="html">HTML to scan for anchors.</param>
        /// <param name="model">
        /// Filter settings read here:
        /// CharContain - when set, the href must contain the whole value or at least one of its
        /// '|'-separated parts;
        /// CharRegex - when set, the href must match this pattern;
        /// FillStart / FillEnd - text placed before / after every accepted link.
        /// </param>
        /// <returns>The accepted links, each followed by "\n"; an empty string when none qualify.</returns>
        public string GetAllLink(string html, M_Collect_ListFilter model)
        {
            // Fully qualified so the block does not depend on a using directive for System.Text.
            System.Text.StringBuilder list     = new System.Text.StringBuilder();
            NodeList                  nodeList = GetTagList(html, "A");

            // The keyword list does not change per link, so split it once.
            string   charContain = model.CharContain;
            bool     mustContain = !string.IsNullOrEmpty(charContain);
            string[] keywords    = mustContain
                                   ? charContain.Split(new char[] { '|' }, StringSplitOptions.RemoveEmptyEntries)
                                   : new string[0];

            for (int i = 0; i < nodeList.Size(); i++)
            {
                ATag   link = (ATag)nodeList.ElementAt(i);
                string href = link.GetAttribute("href");

                // Ordinal, case-insensitive search: a culture-sensitive ToLower() can miss
                // "JAVASCRIPT:" under cultures such as tr-TR, where 'I' lowercases to a dotless 'i'.
                // NOTE(review): this also drops legitimate URLs that merely contain "javascript"
                // in their path - confirm whether a "javascript:" prefix test is what is wanted.
                if (string.IsNullOrEmpty(href) || href.IndexOf("javascript", StringComparison.OrdinalIgnoreCase) > -1)
                {
                    continue;
                }

                // The href must contain the configured text, either as a whole or as one of its parts.
                if (mustContain && !href.Contains(charContain))
                {
                    bool matched = false;
                    foreach (string keyword in keywords)
                    {
                        if (href.Contains(keyword))
                        {
                            matched = true;
                            break;
                        }
                    }
                    if (!matched)
                    {
                        continue;
                    }
                }

                if (!string.IsNullOrEmpty(model.CharRegex) && !Regex.IsMatch(href, model.CharRegex))
                {
                    continue;
                }

                list.Append(model.FillStart).Append(href).Append(model.FillEnd).Append("\n");
            }
            return list.ToString();
        }
 /// <summary>
 /// Click handler for the list-filter button: rebuilds the filter model via FillListFilter,
 /// runs FilterList over the HTML held in the hidden field and shows the result in the text box.
 /// </summary>
 /// <param name="sender">Event source (unused).</param>
 /// <param name="e">Event data (unused).</param>
 protected void FilterA_Btn_Click(object sender, EventArgs e)
 {
     // Refresh the shared filter model first, then filter with it.
     lfMod = FillListFilter();

     string sourceHtml = SourceHtml_Hid.Value;
     string filtered   = FilterList(sourceHtml, lfMod);

     txtHtml.Text = filtered;
 }
// Example no. 5
// 0
 /// <summary>
 /// Stub of the hyperlink extractor: both arguments are ignored and an empty string is always returned.
 /// </summary>
 /// <param name="html">HTML that would be scanned for links; not used by this implementation.</param>
 /// <param name="model">List-filter settings; not used by this implementation.</param>
 /// <returns>Always an empty string.</returns>
 public string GetAllLink(string html, M_Collect_ListFilter model)
 {
     return string.Empty;
 }