/// <summary>
/// Scans <paramref name="string_0"/> for anchor elements and records every link that
/// survives the filters below in <paramref name="dictionary_0"/> (key: URL, value: link text).
/// </summary>
/// <param name="string_0">Markup to scan for anchor elements.</param>
/// <param name="string_1">URL passed to <c>GetUrlByRelative</c> as the base for resolving each href.</param>
/// <param name="dictionary_0">Accumulator; an entry is added only when neither its URL nor its text is already present.</param>
/// <remarks>
/// Anchors are visited from the last match to the first, so when two anchors share a URL
/// or a text, the one appearing later in the markup is the one that is kept.
/// </remarks>
private static void smethod_0(string string_0, string string_1, ref Dictionary<string, string> dictionary_0)
{
    // Navigation/boilerplate words and bracket punctuation that are stripped from the link
    // text before its length is measured. The parentheses and square brackets are escaped:
    // written bare, "(|)" is an empty group that matches at every position, which made the
    // alternatives after it ("[", "]", "『", "』", ".") unreachable.
    // NOTE(review): the embedded "\r\n" sequences are part of the pattern; they are only
    // harmless if RemoveByReg compiles it with IgnorePatternWhitespace — confirm.
    const string noisePattern = "首页|下载|中文|English|反馈|讨论区|投诉|建议|联系|关于|about|诚邀|工作|简介|新闻|掠影|风采\r\n|登录|注销|注册|使用|体验|立即|收藏夹|收藏|添加|加入\r\n|更多|more|专题|精选|热卖|热销|推荐|精彩\r\n|加盟|联盟|友情|链接|相关\r\n|订阅|阅读器|RSS\r\n|免责|条款|声明|我的|我们|组织|概况|有限|免费|公司|法律|导航|广告|地图|隐私\r\n|〖|〗|【|】|\\(|\\)|\\[|\\]|『|』|\\.";

    // URLs of this length or shorter are rejected (the decompiled source spelled it 0x12).
    const int maxRejectedUrlLength = 18;

    Regex anchorRegex = new Regex(
        @"<a\s+[^>]*href\s*=\s*[^>]+>[\s\S]*?</a>",
        RegexOptions.IgnorePatternWhitespace | RegexOptions.Multiline | RegexOptions.IgnoreCase);
    char[] quoteChars = { '"', '\'' };
    char[] slashChars = { '/', '\\' };

    MatchCollection anchors = anchorRegex.Matches(string_0);
    for (int i = anchors.Count - 1; i >= 0; i--)
    {
        string anchorHtml = anchors[i].Value;

        // Drop escaped quotes (\" and \') left in the extracted href.
        string href = GetLink(anchorHtml).Trim().Replace("\\\"", "").Replace(@"\'", "");

        // Skip hrefs with fewer than two characters left once a "http.../" match is removed.
        if (RemoveByReg(href, "^http.*/$").Length < 2)
        {
            continue;
        }

        // Skip links whose text, with the noise words removed, measures less than 9.
        string text = CString.ClearTag(GetTextByLink(anchorHtml)).Trim();
        if (CString.GetLength(RemoveByReg(text, noisePattern)) < 9)
        {
            continue;
        }

        // Skip link texts that contain a single or double quote.
        if (text.IndexOfAny(quoteChars) >= 0)
        {
            continue;
        }

        href = GetUrlByRelative(string_1, href);
        if (href.Length <= maxRejectedUrlLength)
        {
            continue;
        }

        // Cut off the fragment, then trim slashes from both ends.
        int hashIndex = href.IndexOf('#');
        if (hashIndex > -1)
        {
            href = href.Substring(0, hashIndex);
        }

        href = href.Trim(slashChars);

        // Skip URLs that are equal to the value CRegex.GetDomain returns for them.
        string domain = CRegex.GetDomain(href);
        if (href.Equals(domain, StringComparison.OrdinalIgnoreCase))
        {
            continue;
        }

        if (!dictionary_0.ContainsKey(href) && !dictionary_0.ContainsValue(text))
        {
            dictionary_0.Add(href, text);
        }
    }
}
/// <summary>
/// Collects links from <paramref name="sContent"/> and from every script file it references
/// whose domain equals the domain of <paramref name="sUrl"/>.
/// </summary>
/// <param name="sContent">Page markup to scan.</param>
/// <param name="sUrl">URL of the page; used to resolve script sources and as the domain to compare against.</param>
/// <param name="lisDes">Only forwarded to <c>GetLinksFromRss</c> when no link was found.</param>
/// <returns>
/// The collected URL-to-text dictionary, or the result of <c>GetLinksFromRss</c> when that
/// dictionary is empty.
/// </returns>
public static Dictionary<string, string> GetLinks(string sContent, string sUrl, ref Dictionary<string, string> lisDes)
{
    Dictionary<string, string> links = new Dictionary<string, string>();
    smethod_0(sContent, sUrl, ref links);

    string pageDomain = CRegex.GetDomain(sUrl);

    Regex scriptRegex = new Regex(
        "<script[^>]+src\\s*=\\s*(?:'(?<src>[^']+)'|\"(?<src>[^\"]+)\"|(?<src>[^>\\s]+))\\s*[^>]*>",
        RegexOptions.IgnorePatternWhitespace | RegexOptions.Multiline | RegexOptions.IgnoreCase);
    MatchCollection scripts = scriptRegex.Matches(sContent);

    // Walk the script tags from last to first and harvest links from their content.
    for (int i = scripts.Count - 1; i >= 0; i--)
    {
        string scriptUrl = CRegex.GetUrl(sUrl, scripts[i].Groups["src"].Value);

        // Domains are identifiers, not linguistic text: compare them ordinally and
        // case-insensitively instead of lower-casing and using culture-sensitive CompareTo.
        if (!string.Equals(pageDomain, CRegex.GetDomain(scriptUrl), StringComparison.OrdinalIgnoreCase))
        {
            continue;
        }

        // A null result is treated like an empty one rather than throwing.
        string scriptContent = CSocket.GetHtmlByUrl(scriptUrl);
        if (string.IsNullOrEmpty(scriptContent))
        {
            continue;
        }

        smethod_0(scriptContent, scriptUrl, ref links);
    }

    if (links.Count == 0)
    {
        return GetLinksFromRss(sContent, sUrl, ref lisDes);
    }

    return links;
}
/// <summary>
/// Collects links from <paramref name="sContent"/> and from every script file it references
/// whose domain equals the domain of <paramref name="sUrl"/>.
/// </summary>
/// <param name="sContent">Page markup to scan.</param>
/// <param name="sUrl">URL of the page; used to resolve script sources and as the domain to compare against.</param>
/// <param name="lisDes">Only forwarded to <c>GetLinksFromRss</c> when no link was found.</param>
/// <returns>
/// The collected URL-to-text dictionary, or the result of <c>GetLinksFromRss</c> when that
/// dictionary is empty.
/// </returns>
public static Dictionary<string, string> GetLinks(string sContent, string sUrl, ref Dictionary<string, string> lisDes)
{
    Dictionary<string, string> lisA = new Dictionary<string, string>();
    _GetLinks(sContent, sUrl, ref lisA);

    string pageDomain = CRegex.GetDomain(sUrl);

    // Harvest links that are emitted by referenced scripts.
    Regex scriptRegex = new Regex(
        @"<script[^>]+src\s*=\s*(?:'(?<src>[^']+)'|""(?<src>[^""]+)""|(?<src>[^>\s]+))\s*[^>]*>",
        RegexOptions.Multiline | RegexOptions.IgnorePatternWhitespace | RegexOptions.IgnoreCase);
    MatchCollection scripts = scriptRegex.Matches(sContent);

    // Walk the script tags from last to first.
    for (int i = scripts.Count - 1; i >= 0; i--)
    {
        string subUrl = CRegex.GetUrl(sUrl, scripts[i].Groups["src"].Value);

        // Only scripts on the same domain are mined. Domains are identifiers, not linguistic
        // text: compare them ordinally and case-insensitively instead of lower-casing and
        // using culture-sensitive CompareTo.
        if (!string.Equals(pageDomain, CRegex.GetDomain(subUrl), StringComparison.OrdinalIgnoreCase))
        {
            continue;
        }

        // A null result is treated like an empty one rather than throwing.
        string subContent = CSocket.GetHtmlByUrl(subUrl);
        if (string.IsNullOrEmpty(subContent))
        {
            continue;
        }

        _GetLinks(subContent, subUrl, ref lisA);
    }

    if (lisA.Count == 0)
    {
        return GetLinksFromRss(sContent, sUrl, ref lisDes);
    }

    return lisA;
}
/// <summary>
/// Scans <paramref name="sContent"/> for anchor elements and records every link that
/// survives the filters below in <paramref name="lisA"/> (key: URL, value: link text).
/// </summary>
/// <param name="sContent">Markup to scan for anchor elements.</param>
/// <param name="sUrl">URL passed to <c>CText.GetUrlByRelative</c> as the base for resolving each href.</param>
/// <param name="lisA">Accumulator; an entry is added only when neither its URL nor its text is already present.</param>
/// <remarks>
/// Anchors are visited from the last match to the first, so when two anchors share a URL
/// or a text, the one appearing later in the markup is the one that is kept.
/// </remarks>
private static void _GetLinks(string sContent, string sUrl, ref Dictionary<string, string> lisA)
{
    // Navigation/boilerplate words and bracket punctuation that are stripped from the link
    // text before its length is measured. The parentheses and square brackets are escaped:
    // written bare, "(|)" is an empty group that matches at every position, which made the
    // alternatives after it ("[", "]", "『", "』", ".") unreachable.
    // NOTE(review): the blanks before some "|" are part of the pattern; they are only
    // harmless if RemoveByReg compiles it with IgnorePatternWhitespace — confirm.
    const string sFilter = @"首页|下载|中文|English|反馈|讨论区|投诉|建议|联系|关于|about|诚邀|工作|简介|新闻|掠影|风采 |登录|注销|注册|使用|体验|立即|收藏夹|收藏|添加|加入 |更多|more|专题|精选|热卖|热销|推荐|精彩 |加盟|联盟|友情|链接|相关 |订阅|阅读器|RSS |免责|条款|声明|我的|我们|组织|概况|有限|免费|公司|法律|导航|广告|地图|隐私 |〖|〗|【|】|\(|\)|\[|\]|『|』|\.";

    // URLs of this length or shorter are rejected, e.g. "http://www.163.com" is 18 characters.
    const int maxRejectedUrlLength = 18;

    Regex anchorRegex = new Regex(
        @"<a\s+[^>]*href\s*=\s*[^>]+>[\s\S]*?</a>",
        RegexOptions.IgnoreCase | RegexOptions.IgnorePatternWhitespace | RegexOptions.Multiline);
    char[] quoteChars = { '"', '\'' };
    char[] slashChars = { '/', '\\' };

    MatchCollection anchors = anchorRegex.Matches(sContent);
    for (int i = anchors.Count - 1; i >= 0; i--)
    {
        string anchorHtml = anchors[i].Value;

        // Drop escaped quotes (\" and \'), which show up in links written out by JavaScript.
        string strHref = GetLink(anchorHtml).Trim();
        strHref = strHref.Replace("\\\"", "");
        strHref = strHref.Replace("\\\'", "");

        // Screen out addresses that start with "http" and end with "/", and hrefs that
        // have fewer than two characters.
        if (RemoveByReg(strHref, @"^http.*/$").Length < 2)
        {
            continue;
        }

        // Filter advertising or meaningless links: the text left after removing the noise
        // words must measure at least 9.
        string strText = CString.ClearTag(GetTextByLink(anchorHtml)).Trim();
        if (CString.GetLength(RemoveByReg(strText, sFilter)) < 9)
        {
            continue;
        }

        // Skip link texts that contain a single or double quote.
        if (strText.IndexOfAny(quoteChars) >= 0)
        {
            continue;
        }

        // Switch to the absolute address.
        strHref = CText.GetUrlByRelative(sUrl, strHref);
        if (strHref.Length <= maxRejectedUrlLength)
        {
            continue;
        }

        // Locate '#' and remove everything from it onwards, then trim slashes from both ends.
        int charIndex = strHref.IndexOf('#');
        if (charIndex > -1)
        {
            strHref = strHref.Substring(0, charIndex);
        }

        strHref = strHref.Trim(slashChars);

        // Skip the address when it is just the domain itself.
        string tmpDomainURL = CRegex.GetDomain(strHref);
        if (strHref.Equals(tmpDomainURL, StringComparison.OrdinalIgnoreCase))
        {
            continue;
        }

        if (!lisA.ContainsKey(strHref) && !lisA.ContainsValue(strText))
        {
            lisA.Add(strHref, strText);
        }
    }
}