/// <summary>
/// Classifies a downloaded page as plain HTML or as an RSS feed.
/// </summary>
/// <param name="sUrl">Address the page came from; used to resolve a relative feed link.</param>
/// <param name="sHtml">
/// Page source. When the page advertises a feed through a &lt;link&gt; tag that carries an href,
/// this is overwritten with the content returned by GetHtmlByUrl for that feed address.
/// </param>
/// <returns>
/// PageType.RSS when an advertised feed was followed or when the page itself contains an
/// &lt;rss&gt; tag; PageType.HTML otherwise.
/// </returns>
public static PageType GetPageType(string sUrl, ref string sHtml)
{
    // Step 1: does the page advertise an RSS feed through a <link> tag?
    const string feedLinkPattern = @"<link\s+[^>]*((type=""application/rss\+xml"")|(type=application/rss\+xml))[^>]*>";
    Match feedLink = Regex.Match(sHtml, feedLinkPattern, RegexOptions.IgnoreCase);

    if (!feedLink.Success)
    {
        // No advertised feed: the address itself may already be an RSS document.
        bool isFeed = Regex.IsMatch(sHtml, @"<rss\s+[^>]*>", RegexOptions.IgnoreCase);
        return isFeed ? PageType.RSS : PageType.HTML;
    }

    // Step 2: pull the href out of the <link> tag (single-quoted, double-quoted or bare).
    const string hrefPattern = @"href=\s*(?:'(?<href>[^']+)'|""(?<href>[^""]+)""|(?<href>[^>\s]+))";
    Match hrefMatch = Regex.Match(feedLink.Value, hrefPattern, RegexOptions.IgnoreCase);
    if (!hrefMatch.Success)
    {
        // A feed <link> without a usable href leaves the page classified as HTML.
        return PageType.HTML;
    }

    // The href may be relative, so resolve it against the page address before fetching.
    string feedUrl = CRegex.GetUrl(sUrl, hrefMatch.Groups["href"].Value);
    sHtml = GetHtmlByUrl(feedUrl);
    return PageType.RSS;
}
/// <summary>
/// Converts text containing \uXXXX escape sequences back into the characters they denote.
/// </summary>
/// <param name="str">
/// Text that may contain \uXXXX escapes. It is first passed through CRegex.Replace with the
/// pattern [\r\n] and an empty replacement (presumably stripping line breaks).
/// </param>
/// <returns>
/// The decoded text, or an empty string when the input is null or empty after that call.
/// If the four characters following a \u are not valid hexadecimal, the literal "Erorr" is
/// appended and decoding stops there (legacy behaviour, spelling kept for compatibility).
/// A \u followed by fewer than four characters is dropped together with those characters.
/// </returns>
public static string UnicodeToString(string str)
{
    str = CRegex.Replace(str, "[\r\n]", "", 0);
    if (string.IsNullOrEmpty(str))
    {
        return "";
    }

    // Split on the escape marker itself. The previous implementation first swapped the marker
    // for the placeholder character U+32A3 and split on that, which corrupted any input that
    // already contained the placeholder.
    string[] segments = str.Split(new string[] { "\\u" }, StringSplitOptions.None);

    System.Text.StringBuilder decoded = new System.Text.StringBuilder(str.Length);
    decoded.Append(segments[0]);

    for (int i = 1; i < segments.Length; i++)
    {
        string segment = segments[i];
        if (segment.Length < 4)
        {
            // Incomplete escape: skipped, as before.
            continue;
        }

        int codeUnit;
        bool isHex = int.TryParse(
            segment.Substring(0, 4),
            System.Globalization.NumberStyles.HexNumber,
            System.Globalization.CultureInfo.InvariantCulture,
            out codeUnit);
        if (!isHex)
        {
            // Same outcome as the former FormatException handler, without using an
            // exception for control flow.
            decoded.Append("Erorr");
            break;
        }

        // Four hex digits always fit in a single UTF-16 code unit.
        decoded.Append((char)codeUnit);
        decoded.Append(segment.Substring(4));
    }

    return decoded.ToString();
}
/// <summary>
/// Collects the URLs captured by the "src" group of every match of a pattern.
/// </summary>
/// <param name="strReg">Regular expression (matched case-insensitively) exposing a named group "src".</param>
/// <param name="url">Address passed to CRegex.GetUrl together with each captured value.</param>
/// <param name="content">Text to scan.</param>
/// <returns>One entry per match, in match order: the result of CRegex.GetUrl(url, src).</returns>
private static string[] DealWithFrame(string strReg, string url, string content)
{
    List<string> frameUrls = new List<string>();

    foreach (Match hit in Regex.Matches(content, strReg, RegexOptions.IgnoreCase))
    {
        string source = hit.Groups["src"].Value;
        frameUrls.Add(CRegex.GetUrl(url, source));
    }

    return frameUrls.ToArray();
}
/// <summary>
/// Rewrites the URL captured by a named group in every match of a pattern, replacing it with
/// the value returned by CRegex.GetUrl for the page address and the captured URL.
/// </summary>
/// <param name="strRe">
/// Pattern locating the markup; compiled with Multiline, IgnorePatternWhitespace and IgnoreCase.
/// </param>
/// <param name="subMatch">Name of the group inside <paramref name="strRe"/> that captures the URL.</param>
/// <param name="sFormartted">Markup to process.</param>
/// <param name="sPageUrl">Address of the page the markup came from.</param>
/// <returns>The markup with each captured URL replaced; null or empty input is returned unchanged.</returns>
private static string _ReplaceUrl(string strRe, string subMatch, string sFormartted, string sPageUrl)
{
    if (string.IsNullOrEmpty(sFormartted))
    {
        return sFormartted;
    }

    Regex re = new Regex(strRe, RegexOptions.Multiline | RegexOptions.IgnorePatternWhitespace | RegexOptions.IgnoreCase);

    // Splice the new URL in at the exact position of the captured group. The previous
    // implementation used string.Replace on the whole matched tag, which also rewrote any other
    // occurrence of the same text inside the tag (for <a href="a"> even the tag name) and threw
    // ArgumentException when the capture was empty.
    return re.Replace(sFormartted, delegate(Match mc)
    {
        Group target = mc.Groups[subMatch];
        string whole = mc.Value;
        int offset = target.Index - mc.Index;

        // Leave the match untouched when there is nothing to rewrite or when the capture lies
        // outside the matched text (possible with look-around patterns).
        if (!target.Success || target.Length == 0 || offset < 0 || offset + target.Length > whole.Length)
        {
            return whole;
        }

        return whole.Substring(0, offset)
            + CRegex.GetUrl(sPageUrl, target.Value)
            + whole.Substring(offset + target.Length);
    });
}
/// <summary>
/// Collects the links found in a page and in the same-domain scripts it references.
/// </summary>
/// <param name="sContent">Page markup.</param>
/// <param name="sUrl">Address of the page; used to resolve relative addresses and to determine the page's domain.</param>
/// <param name="lisDes">Only forwarded to GetLinksFromRss when no link was found in the markup.</param>
/// <returns>
/// The dictionary filled by _GetLinks (link address mapped to link text), or the result of
/// GetLinksFromRss when that dictionary is empty.
/// </returns>
public static Dictionary<string, string> GetLinks(string sContent, string sUrl, ref Dictionary<string, string> lisDes)
{
    Dictionary<string, string> lisA = new Dictionary<string, string>();
    _GetLinks(sContent, sUrl, ref lisA);

    string domain = CRegex.GetDomain(sUrl);

    // Links may also be emitted by external scripts, so scan every <script src=...>.
    Regex re = new Regex(@"<script[^>]+src\s*=\s*(?:'(?<src>[^']+)'|""(?<src>[^""]+)""|(?<src>[^>\s]+))\s*[^>]*>", RegexOptions.Multiline | RegexOptions.IgnorePatternWhitespace | RegexOptions.IgnoreCase);
    MatchCollection mcs = re.Matches(sContent);

    // Walk the scripts from last to first (original order kept: it decides which duplicate
    // entry reaches the dictionary first).
    for (int i = mcs.Count - 1; i >= 0; i--)
    {
        string subUrl = CRegex.GetUrl(sUrl, mcs[i].Groups["src"].Value);

        // Only scripts hosted on the page's own domain are fetched. Host names are identifiers,
        // so compare them ordinally: the former culture-sensitive CompareTo on lower-cased
        // strings could treat distinct hosts as equal and threw on a null domain.
        if (!string.Equals(domain, CRegex.GetDomain(subUrl), StringComparison.OrdinalIgnoreCase))
        {
            continue;
        }

        string subContent = CSocket.GetHtmlByUrl(subUrl);
        if (string.IsNullOrEmpty(subContent))
        {
            // Nothing downloaded (null is tolerated as well as empty).
            continue;
        }

        _GetLinks(subContent, subUrl, ref lisA);
    }

    if (lisA.Count == 0)
    {
        // No usable anchor found: fall back to treating the content as an RSS feed.
        return GetLinksFromRss(sContent, sUrl, ref lisDes);
    }

    return lisA;
}
/// <summary>
/// Scans markup for anchor tags and adds those that survive the filters below to
/// <paramref name="lisA"/>, keyed by link address with the link text as value.
/// </summary>
/// <param name="sContent">Markup to scan.</param>
/// <param name="sUrl">Address used to turn relative hrefs into absolute ones.</param>
/// <param name="lisA">Receives the accepted links; an address or a text already present is not added again.</param>
private static void _GetLinks(string sContent, string sUrl, ref Dictionary<string, string> lisA)
{
    // Words and symbols removed from the link text before its length is measured
    // (navigation, advertising and boilerplate terms).
    const string sFilter = @"首页|下载|中文|English|反馈|讨论区|投诉|建议|联系|关于|about|诚邀|工作|简介|新闻|掠影|风采 |登录|注销|注册|使用|体验|立即|收藏夹|收藏|添加|加入 |更多|more|专题|精选|热卖|热销|推荐|精彩 |加盟|联盟|友情|链接|相关 |订阅|阅读器|RSS |免责|条款|声明|我的|我们|组织|概况|有限|免费|公司|法律|导航|广告|地图|隐私 |〖|〗|【|】|(|)|[|]|『|』|\.";
    const RegexOptions scanOptions = RegexOptions.IgnoreCase | RegexOptions.IgnorePatternWhitespace | RegexOptions.Multiline;

    Regex anchorPattern = new Regex(@"<a\s+[^>]*href\s*=\s*[^>]+>[\s\S]*?</a>", scanOptions);
    Regex quotePattern = new Regex(@"""|'", scanOptions);
    MatchCollection anchors = anchorPattern.Matches(sContent);

    // Anchors are visited from the last one to the first one.
    int index = anchors.Count;
    while (index-- > 0)
    {
        string anchorHtml = anchors[index].Value;

        // Strip the escaped quotes left behind by links written out from JavaScript.
        string href = GetLink(anchorHtml).Trim()
            .Replace("\\\"", "")
            .Replace("\\\'", "");

        // Drop addresses that start with "http" and end with "/".
        string hrefRemainder = RemoveByReg(href, @"^http.*/$");
        if (hrefRemainder.Length < 2)
        {
            continue;
        }

        // Drop advertising / meaningless links: too little text is left once the filter
        // words are removed, or the text contains a quote character.
        string linkText = CString.ClearTag(GetTextByLink(anchorHtml)).Trim();
        string meaningfulText = RemoveByReg(linkText, sFilter);
        if (CString.GetLength(meaningfulText) < 9 || quotePattern.IsMatch(linkText))
        {
            continue;
        }

        // Switch to the absolute address; 18 is the length of e.g. http://www.163.com
        href = CText.GetUrlByRelative(sUrl, href);
        if (href.Length <= 18)
        {
            continue;
        }

        // Cut off the fragment ('#' and everything after it), then trim slashes.
        int fragmentStart = href.IndexOf('#');
        if (fragmentStart > -1)
        {
            href = href.Substring(0, fragmentStart);
        }
        href = href.Trim(new char[] { '/', '\\' });

        // Skip addresses that are nothing more than the domain itself.
        string domainOnly = CRegex.GetDomain(href);
        if (href.Equals(domainOnly, StringComparison.OrdinalIgnoreCase))
        {
            continue;
        }

        bool alreadyKnown = lisA.ContainsKey(href) || lisA.ContainsValue(linkText);
        if (!alreadyKnown)
        {
            lisA.Add(href, linkText);
        }
    }
}
/// <summary>
/// Cleans fetched markup for redisplay: strips scripts, iframes and form controls, applies
/// caller-supplied removal patterns, rewrites resource URLs through _ReplaceUrl and applies the
/// anti-hotlinking substitutions configured for the page's domain.
/// </summary>
/// <param name="sOriContent">Original markup.</param>
/// <param name="sOtherRemoveReg">
/// Additional removal patterns separated by "\r\n"; each one is applied through CRegex.Replace
/// with an empty replacement. Null or empty means no additional patterns.
/// </param>
/// <param name="sPageUrl">Address of the page the markup came from.</param>
/// <param name="dtAntiLink">
/// Table with the columns Domain, Type and imgUrl. For rows whose Domain equals the page's
/// domain, every occurrence of imgUrl is replaced by the fixed proxy prefix alone (Type 1) or
/// by the proxy prefix followed by imgUrl (any other Type). Null skips this step.
/// </param>
/// <returns>The processed markup.</returns>
public static string GetContent(string sOriContent, string sOtherRemoveReg, string sPageUrl, DataTable dtAntiLink)
{
    const RegexOptions stripOptions = RegexOptions.Multiline | RegexOptions.IgnorePatternWhitespace | RegexOptions.IgnoreCase;
    const string proxyPrefix = "http://stat.580k.com/t.asp?url=";

    string sFormartted = sOriContent;

    // Remove potentially dangerous markup.
    sFormartted = Regex.Replace(sFormartted, @"<script[\s\S]*?</script>", "", stripOptions);
    sFormartted = Regex.Replace(sFormartted, @"<iframe[^>]*>[\s\S]*?</iframe>", "", stripOptions);
    Regex r = new Regex(@"<input[\s\S]+?>|<form[\s\S]+?>|</form[\s\S]*?>|<select[\s\S]+?>?</select>|<textarea[\s\S]*?>?</textarea>|<file[\s\S]*?>|<noscript>|</noscript>", RegexOptions.IgnoreCase);
    sFormartted = r.Replace(sFormartted, "");

    // Caller-supplied removal patterns, one per line (a null list is treated as empty).
    if (!string.IsNullOrEmpty(sOtherRemoveReg))
    {
        string[] sOtherReg = sOtherRemoveReg.Split(new string[] { "\r\n" }, StringSplitOptions.RemoveEmptyEntries);
        foreach (string sRemoveReg in sOtherReg)
        {
            sFormartted = CRegex.Replace(sFormartted, sRemoveReg, "", 0);
        }
    }

    // Image paths.
    sFormartted = _ReplaceUrl("<img[\\s\\S]+?src\\s*=\\s*(?:'(?<src>[^']+)'|\"(?<src>[^\"]+)\"|(?<src>[^>\\s]+))\\s*[^>]*>", "src", sFormartted, sPageUrl);

    // Anti-hotlinking substitutions for this page's domain.
    if (dtAntiLink != null)
    {
        // Single quotes must be doubled inside a filter-expression string literal; without
        // this a quote in the domain breaks (or alters) the filter.
        string domain = (GetDomain(sPageUrl) ?? "").Replace("'", "''");
        DataRow[] drs = dtAntiLink.Select("Domain='" + domain + "'");
        foreach (DataRow dr in drs)
        {
            string imgUrl = dr["imgUrl"].ToString();
            if (imgUrl.Length == 0)
            {
                // string.Replace rejects an empty search value, so skip rows without a URL.
                continue;
            }

            if (Convert.ToInt32(dr["Type"]) == 1)
            {
                // Substitute: the image URL is replaced by the proxy prefix.
                sFormartted = sFormartted.Replace(imgUrl, proxyPrefix);
            }
            else
            {
                // Append: the image URL is kept, prefixed with the proxy address.
                sFormartted = sFormartted.Replace(imgUrl, proxyPrefix + imgUrl);
            }
        }
    }

    // Anchor links.
    sFormartted = _ReplaceUrl(@"<a[^>]+href\s*=\s*(?:'(?<href>[^']+)'|""(?<href>[^""]+)""|(?<href>[^>\s]+))\s*[^>]*>", "href", sFormartted, sPageUrl);

    // Stylesheets.
    sFormartted = _ReplaceUrl(@"<link[^>]+href\s*=\s*(?:'(?<href>[^']+)'|""(?<href>[^""]+)""|(?<href>[^>\s]+))\s*[^>]*>", "href", sFormartted, sPageUrl);

    // background="..." attributes.
    sFormartted = _ReplaceUrl(@"background\s*=\s*(?:'(?<img>[^']+)'|""(?<img>[^""]+)""|(?<img>[^>\s]+))", "img", sFormartted, sPageUrl);

    // Inline-style backgrounds: background-image:url(...)
    sFormartted = _ReplaceUrl(@"background-image\s*:\s*url\s*\x28(?<img>[^\x29]+)\x29", "img", sFormartted, sPageUrl);

    // Flash movies.
    sFormartted = _ReplaceUrl(@"<param\s[^>]+""movie""[^>]+value\s*=\s*""(?<flash>[^"">]+\x2eswf)""[^>]*>", "flash", sFormartted, sPageUrl);

    // XSL stylesheet references (XML documents only).
    if (IsXml(sFormartted))
    {
        sFormartted = _ReplaceUrl(@"<\x3fxml-stylesheet\s+[^\x3f>]+href=\s*(?:'(?<href>[^']+)'|""(?<href>[^""]+)"")\s*[^\x3f>]*\x3f>", "href", sFormartted, sPageUrl);
    }

    // Script src attributes are not rewritten here.
    return sFormartted;
}
/// <summary>
/// Page content: passes the input to CRegex.Replace with a pattern whose "Head" group captures
/// a run of characters other than '&lt;' that is immediately followed by '&lt;', an empty
/// replacement, and the group name "Head".
/// </summary>
/// <param name="sInput">Input content.</param>
/// <returns>
/// Whatever CRegex.Replace returns for those arguments.
/// NOTE(review): the meaning of the group-name argument is defined in CRegex, not here - confirm.
/// </returns>
public static string GetHtml(string sInput)
{
    const string headPattern = @"(?<Head>[^<]+)<";
    const string headGroup = "Head";

    string processed = CRegex.Replace(sInput, headPattern, "", headGroup);
    return processed;
}