/// <summary>
/// Extracts the author name from the input.
/// </summary>
/// <param name="sInput">Input content to search.</param>
/// <param name="sRegex">Expression string, forwarded to GetText with "Author" as the third argument.</param>
/// <returns>The GetText result after CString.ClearTag, limited to the first 99 characters.</returns>
public static string GetAuthor(string sInput, string sRegex)
{
    const int maxLength = 99;

    string cleaned = CString.ClearTag(GetText(sInput, sRegex, "Author"));

    // Anything longer than the cap is cut off; shorter values pass through untouched.
    return cleaned.Length > maxLength
        ? cleaned.Substring(0, maxLength)
        : cleaned;
}
/// <summary>
/// Extracts the article source from the input.
/// </summary>
/// <param name="sInput">Input content to search.</param>
/// <param name="sRegex">Expression string, forwarded to GetText with "Source" as the third argument.</param>
/// <returns>The GetText result after CString.ClearTag, limited to the first 99 characters.</returns>
public static string GetSource(string sInput, string sRegex)
{
    const int maxLength = 99;

    string cleaned = CString.ClearTag(GetText(sInput, sRegex, "Source"));

    // Anything longer than the cap is cut off; shorter values pass through untouched.
    return cleaned.Length > maxLength
        ? cleaned.Substring(0, maxLength)
        : cleaned;
}
/// <summary>
/// Extracts the article title from the input.
/// </summary>
/// <param name="sInput">Input content to search.</param>
/// <param name="sRegex">Expression string, forwarded to GetText with "Title" as the third argument.</param>
/// <returns>The GetText result after CString.ClearTag, limited to the first 99 characters.</returns>
public static string GetTitle(string sInput, string sRegex)
{
    const int maxLength = 99;

    string cleaned = CString.ClearTag(GetText(sInput, sRegex, "Title"));

    // Anything longer than the cap is cut off; shorter values pass through untouched.
    return cleaned.Length > maxLength
        ? cleaned.Substring(0, maxLength)
        : cleaned;
}
/// <summary>
/// Returns the first match of a regular expression in the given text, with any
/// trailing underscores removed.
/// </summary>
/// <param name="sContent">Text to search.</param>
/// <param name="sRegex">Pattern; it is compiled with IgnoreCase, IgnorePatternWhitespace and Multiline.</param>
/// <returns>
/// The whole matched text (group 0) without trailing '_' characters, or an empty
/// string when the pattern does not match.
/// </returns>
public static string GetTextByReg(string sContent, string sRegex)
{
    Regex re = new Regex(sRegex, RegexOptions.IgnoreCase | RegexOptions.IgnorePatternWhitespace | RegexOptions.Multiline);
    Match mc = re.Match(sContent);
    if (!mc.Success)
    {
        return "";
    }

    // TrimEnd strips every trailing '_' in a single ordinal pass. The previous
    // loop paired a culture-sensitive EndsWith with an external helper and could
    // spin forever if the helper did not shorten a string that EndsWith accepted.
    return mc.Value.TrimEnd('_');
}
/// <summary>
/// Builds an absolute URL from a link found on a page and the address of that page.
/// </summary>
/// <param name="sInput">Address of the original page (used as the base).</param>
/// <param name="sRelativeUrl">
/// Link as written in the page: absolute ("http..."), root-relative ("/x"),
/// parent-relative ("../x"), current-directory ("./x") or a bare relative path.
/// </param>
/// <returns>
/// The absolute URL. A link that is empty or whitespace-only resolves to the
/// base directory returned by _GetStandardUrlDepth.
/// </returns>
public static string GetUrl(string sInput, string sRelativeUrl)
{
    // Per the original note, this comes back in the form http://www.163.com/news/
    string sUrl = _GetStandardUrlDepth(sInput);

    // Classify the link only after trimming, so leading whitespace cannot push
    // an absolute or root-relative link into the bare-relative branch.
    string sLink = sRelativeUrl.Trim();

    if (sLink.Length == 0)
    {
        // Previously this branch assigned to the parameter instead of the
        // result, so an empty link always produced "".
        return sUrl;
    }

    // "http" as a prefix also covers "https".
    // NOTE(review): a relative file name that merely begins with "http" is
    // still treated as absolute, as before - confirm whether "://" should be required.
    if (sLink.StartsWith("http", StringComparison.OrdinalIgnoreCase))
    {
        return sLink;
    }

    if (sLink.StartsWith("/", StringComparison.Ordinal))
    {
        return GetDomain(sInput) + sLink;
    }

    if (sLink.StartsWith("../", StringComparison.Ordinal))
    {
        // Drop the last character of the base (the trailing "/" in the form noted above).
        sUrl = sUrl.Substring(0, sUrl.Length - 1);

        // Only consume "../" segments that are actually at the front; testing
        // with IndexOf also chopped three characters off links such as "../a/../b".
        while (sLink.StartsWith("../", StringComparison.Ordinal))
        {
            string temp = CString.GetPreStrByLast(sUrl, "/");

            // Stop climbing once only the scheme is left ("http:/" is 6 chars;
            // "https:/" is 7, hence the explicit ":/" check). When the link has
            // more "../" than the base has levels the page link is wrong, but
            // browsers still resolve it, so the extra segments are just discarded.
            if (temp.Length > 6 && !temp.EndsWith(":/", StringComparison.Ordinal))
            {
                sUrl = temp;
            }
            sLink = sLink.Substring(3);
        }
        return sUrl + "/" + sLink.Trim();
    }

    if (sLink.StartsWith("./", StringComparison.Ordinal))
    {
        return sUrl + sLink.Substring(2);
    }

    // Bare relative path, e.g. 2007images/modecss.css
    return sUrl + sLink;
}
/// <summary>
/// Collects the anchor elements in the content whose cleaned link text matches
/// one of the given keywords, falling back to GetLinksByKeyFromRss when no
/// anchor qualifies.
/// </summary>
/// <param name="sContent">Markup to scan for &lt;a ... href=...&gt; elements.</param>
/// <param name="listKey">
/// Keywords; each one is inserted into a regular expression as-is. An empty
/// list accepts any non-empty text.
/// </param>
/// <returns>
/// The full markup of every accepted anchor, or the result of
/// GetLinksByKeyFromRss(sContent, listKey) when none was accepted.
/// </returns>
public static List<string> GetLinksByKey(string sContent, /*string sUrl,*/ List<string> listKey)
{
    const RegexOptions options = RegexOptions.IgnoreCase | RegexOptions.IgnorePatternWhitespace | RegexOptions.Multiline;

    // Step 1: pull out the anchors. Per the original note, hrefs that start
    // with "http" and end with "/" are screened out here.
    List<string> candidates = new List<string>();
    Regex anchorPattern = new Regex(@"<a\s+[^>]*href\s*=\s*[^>]+>[\s\S]*?</a>", options);
    foreach (Match anchor in anchorPattern.Matches(sContent))
    {
        string remainder = RemoveByReg(GetLink(anchor.Value), @"^http.*/$");
        if (remainder.Length > 0)
        {
            candidates.Add(anchor.Value);
        }
    }

    // Step 2: prepare the keyword expression - one alternative per keyword,
    // or "any text" when no keywords were supplied.
    List<string> alternatives = new List<string>();
    foreach (string keyword in listKey)
    {
        alternatives.Add("([\\s\\S]*" + keyword + "[\\s\\S]*)");
    }
    string keyPattern = alternatives.Count > 0
        ? string.Join("|", alternatives.ToArray())
        : "[\\s\\S]+";
    Regex keyMatcher = new Regex(keyPattern, options);

    // Step 3: keep the candidates whose text survives the noise filter.
    // Original note: must the link text be at least 5 characters to count as valid?
    Regex textPattern = new Regex(@"<a\s+[^>]+>([\s\S]{5,})?</a>", options);
    List<string> accepted = new List<string>();
    foreach (string anchorHtml in candidates)
    {
        Match textMatch = textPattern.Match(anchorHtml);
        if (!textMatch.Success)
        {
            continue;
        }

        string text = CString.ClearTag(textMatch.Groups[1].Value.Trim());
        text = RemoveByReg(text, @"更多|登录|添加|推荐|收藏夹|加盟|关于|订阅|阅读器|我的|有限|免费|公司|more|RSS|about|\.");

        // The length threshold is there to screen out junk entries
        // (the original note speaks of "at least 5"; the check is CString.GetLength > 8).
        if (CString.GetLength(text) > 8 && keyMatcher.IsMatch(text))
        {
            accepted.Add(anchorHtml);
        }
    }

    // RSS support: nothing usable among the anchors, so try the RSS extractor.
    if (accepted.Count == 0)
    {
        return GetLinksByKeyFromRss(sContent, listKey);
    }

    return accepted;
}
/// <summary>
/// Scans the content for anchor elements and records those that pass the
/// text and address filters, keyed by resolved address with the link text as value.
/// </summary>
/// <param name="sContent">Markup to scan for &lt;a ... href=...&gt; elements.</param>
/// <param name="sUrl">Page address, handed to CText.GetUrlByRelative to resolve each href.</param>
/// <param name="lisA">
/// Receives address/text pairs. A pair is added only when neither the address
/// is already a key nor the text already a value. Anchors are visited from
/// last to first.
/// </param>
private static void _GetLinks(string sContent, string sUrl, ref Dictionary<string, string> lisA)
{
    const RegexOptions options = RegexOptions.IgnoreCase | RegexOptions.IgnorePatternWhitespace | RegexOptions.Multiline;

    // Words and symbols stripped from link text before its length is judged.
    // NOTE(review): if RemoveByReg treats this as a regular expression, the
    // unescaped "(|)" and "[|]" alternatives are regex syntax (an empty group
    // and a character class for '|') rather than literal brackets - confirm intent.
    const string noiseFilter = @"首页|下载|中文|English|反馈|讨论区|投诉|建议|联系|关于|about|诚邀|工作|简介|新闻|掠影|风采 |登录|注销|注册|使用|体验|立即|收藏夹|收藏|添加|加入 |更多|more|专题|精选|热卖|热销|推荐|精彩 |加盟|联盟|友情|链接|相关 |订阅|阅读器|RSS |免责|条款|声明|我的|我们|组织|概况|有限|免费|公司|法律|导航|广告|地图|隐私 |〖|〗|【|】|(|)|[|]|『|』|\.";

    Regex anchorPattern = new Regex(@"<a\s+[^>]*href\s*=\s*[^>]+>[\s\S]*?</a>", options);
    Regex quotePattern = new Regex(@"""|'", options);

    MatchCollection anchors = anchorPattern.Matches(sContent);
    int index = anchors.Count;
    while (--index >= 0)
    {
        string anchorHtml = anchors[index].Value;

        // Escaped quotes are removed for links that were written out by JS.
        string href = GetLink(anchorHtml).Trim();
        href = href.Replace("\\\"", "");
        href = href.Replace("\\\'", "");

        // Per the original note: screen out hrefs that start with "http" and end with "/".
        if (RemoveByReg(href, @"^http.*/$").Length < 2)
        {
            continue;
        }

        // Filter out advertising or meaningless links by their visible text.
        string text = CString.ClearTag(GetTextByLink(anchorHtml)).Trim();
        if (CString.GetLength(RemoveByReg(text, noiseFilter)) < 9)
        {
            continue;
        }
        if (quotePattern.IsMatch(text))
        {
            continue;
        }

        // Switch to the absolute address.
        href = CText.GetUrlByRelative(sUrl, href);
        if (href.Length <= 18) // e.g. http://www.163.com is 18 characters
        {
            continue;
        }

        // Cut the address at the first '#', dropping it and everything after it.
        int hashAt = href.IndexOf('#');
        if (hashAt >= 0)
        {
            href = href.Substring(0, hashAt);
        }
        href = href.Trim('/', '\\');

        // An address that is nothing more than its own domain is skipped.
        if (href.Equals(CRegex.GetDomain(href), StringComparison.OrdinalIgnoreCase))
        {
            continue;
        }

        if (lisA.ContainsKey(href) || lisA.ContainsValue(text))
        {
            continue;
        }
        lisA.Add(href, text);
    }
}