/// <summary>
/// Cuts out every fragment of <paramref name="html"/> delimited by the configured list markers
/// and returns the hyperlinks that <c>htmlHelper.GetAllLink</c> finds inside those fragments.
/// </summary>
/// <param name="html">Source HTML to scan.</param>
/// <param name="model">
/// Filter settings. <c>ListStart</c> and <c>ListEnd</c> are inserted verbatim into a regular
/// expression to locate fragments, then removed from each match as literal text.
/// </param>
/// <returns>
/// The value returned by <c>htmlHelper.GetAllLink</c> for the collected fragments,
/// or an empty string when no fragment matched.
/// </returns>
private string FilterList(string html, M_Collect_ListFilter model)
{
    // Non-greedy "anything" between the two markers; [\s\S] also spans line breaks.
    string listFilter = model.ListStart + "[\\s\\S]*?" + model.ListEnd;
    StringBuilder sb = new StringBuilder();

    foreach (Match m in new Regex(listFilter).Matches(html))
    {
        string url = m.Value;

        // String.Replace throws on a null/empty search value, so only strip markers that are set.
        if (!string.IsNullOrEmpty(model.ListStart))
        {
            url = url.Replace(model.ListStart, "");
        }
        if (!string.IsNullOrEmpty(model.ListEnd))
        {
            url = url.Replace(model.ListEnd, "");
        }

        // Fragments without any "http" text get the root path (or the user's preset
        // values) wrapped around them. "https" needs no separate test: it contains "http".
        if (url.IndexOf("http", StringComparison.OrdinalIgnoreCase) < 0)
        {
            url = model.FillStart + url + model.FillEnd;
        }

        sb.AppendLine(url);
    }

    if (sb.Length == 0)
    {
        return "";
    }

    // GetAllLink is handed a complete document, so wrap the fragments in a minimal page.
    return htmlHelper.GetAllLink("<html><head></head><body>" + sb.ToString() + "</body></html>", model);
}
/// <summary>
/// On the first (non-postback) request, loads the collection item identified by <c>ItemID</c>,
/// fetches the HTML of its source URL into the hidden field, restores any saved list-filter
/// settings into the form, and shows either the saved link list or the fetched HTML.
/// </summary>
/// <param name="sender">Event source (unused).</param>
/// <param name="e">Event data (unused).</param>
protected void Page_Load(object sender, EventArgs e)
{
    // Postbacks keep the state that was set up on the initial request.
    if (IsPostBack)
    {
        return;
    }

    M_CollectionItem item = bc.GetSelect(ItemID);
    Url = item.CollUrl;
    lblLink.Text = "<a href='" + item.CollUrl + "' target=\"_blank\" class='btn btn-info' style='color:white;'>查看原页面</a>";
    SourceHtml_Hid.Value = htmlHelper.GetHtmlFromSite(Url);

    // Restore previously saved filter settings (stored as JSON) into the input boxes.
    bool hasSavedFilter = !string.IsNullOrEmpty(item.ListSettings);
    if (hasSavedFilter)
    {
        lfMod = JsonConvert.DeserializeObject<M_Collect_ListFilter>(item.ListSettings);
        ListStart_T.Text = lfMod.ListStart;
        ListEnd_T.Text = lfMod.ListEnd;
        CharContain_T.Text = lfMod.CharContain;
        CharRegex_T.Text = lfMod.CharRegex;
        FillStart_T.Text = lfMod.FillStart;
        FillEnd_T.Text = lfMod.FillEnd;
    }

    // Prefer the saved link list; otherwise show the freshly fetched page source.
    txtHtml.Text = string.IsNullOrEmpty(item.LinkList)
        ? SourceHtml_Hid.Value
        : item.LinkList;

    Call.SetBreadCrumb(Master, "<li><a href='ContentManage.aspx'>内容管理</a></li><li><a href='CollectionManage.aspx'>信息采集</a></li><li class='active'>当前:" + item.ItemName + "</li>");
}
/// <summary>
/// Collects the <c>href</c> of every anchor tag in <paramref name="html"/>, one per line.
/// Per the original author's note, the input is expected to be wrapped in an &lt;html&gt; element.
/// Anchors with an empty href, or an href containing "javascript" (any casing), are skipped.
/// </summary>
/// <param name="html">HTML to scan for anchor tags.</param>
/// <param name="model">
/// Filter settings:
/// <c>CharContain</c> - when set, the href must contain the whole value or at least one of its
/// '|'-separated parts;
/// <c>CharRegex</c> - when set, the href must match this regular expression;
/// <c>FillStart</c>/<c>FillEnd</c> - text placed before/after every accepted href.
/// </param>
/// <returns>The accepted links, each terminated by "\n"; an empty string when none qualify.</returns>
public string GetAllLink(string html, M_Collect_ListFilter model)
{
    // A builder avoids re-copying the whole result for every appended link.
    System.Text.StringBuilder list = new System.Text.StringBuilder();
    NodeList nodeList = GetTagList(html, "A");

    for (int i = 0; i < nodeList.Size(); i++)
    {
        ATag link = (ATag)nodeList.ElementAt(i);
        string href = link.GetAttribute("href");

        // Ordinal, case-insensitive search: lowercasing with the current culture can miss
        // "JAVASCRIPT" on cultures whose casing rules change the letter 'I'.
        if (string.IsNullOrEmpty(href) || href.IndexOf("javascript", StringComparison.OrdinalIgnoreCase) > -1)
        {
            continue;
        }

        // Required-text filter: accept on the full value, otherwise on any '|'-separated part.
        if (!string.IsNullOrEmpty(model.CharContain) && !href.Contains(model.CharContain))
        {
            bool containsAnyPart = false;
            foreach (string part in model.CharContain.Split("|".ToCharArray(), StringSplitOptions.RemoveEmptyEntries))
            {
                if (href.Contains(part))
                {
                    containsAnyPart = true;
                    break;
                }
            }
            if (!containsAnyPart)
            {
                continue;
            }
        }

        // Optional regular-expression filter.
        if (!string.IsNullOrEmpty(model.CharRegex) && !Regex.IsMatch(href, model.CharRegex))
        {
            continue;
        }

        list.Append(model.FillStart).Append(href).Append(model.FillEnd).Append("\n");
    }

    return list.ToString();
}
/// <summary>
/// Filter-list button handler: reads the filter settings via <c>FillListFilter</c>, applies them
/// to the page source held in the hidden field, and shows the resulting text.
/// </summary>
/// <param name="sender">Event source (unused).</param>
/// <param name="e">Event data (unused).</param>
protected void FilterA_Btn_Click(object sender, EventArgs e)
{
    lfMod = FillListFilter();

    string sourceHtml = SourceHtml_Hid.Value;
    string filtered = FilterList(sourceHtml, lfMod);
    txtHtml.Text = filtered;
}
/// <summary>
/// Placeholder for link extraction: this implementation ignores both arguments and
/// always returns an empty string.
/// </summary>
/// <param name="html">HTML that would be scanned for links (unused here).</param>
/// <param name="model">Filter settings (unused here).</param>
/// <returns>Always an empty string.</returns>
public string GetAllLink(string html, M_Collect_ListFilter model)
{
    return string.Empty;
}