/// <summary>
/// Collects links from a page and from the scripts it references on the same domain.
/// </summary>
/// <param name="sContent">Markup of the page to scan.</param>
/// <param name="sUrl">Address of the page; script addresses are passed through <c>CRegex.GetUrl</c> together with it, and its domain decides which scripts are fetched.</param>
/// <param name="lisDes">Forwarded untouched to <c>GetLinksFromRss</c> when no links were collected.</param>
/// <returns>
/// The dictionary populated by <c>smethod_0</c>, or the result of <c>GetLinksFromRss</c>
/// when that dictionary ended up empty.
/// </returns>
public static Dictionary<string, string> GetLinks(string sContent, string sUrl, ref Dictionary<string, string> lisDes)
{
    Dictionary<string, string> links = new Dictionary<string, string>();
    smethod_0(sContent, sUrl, ref links);

    string pageDomain = CRegex.GetDomain(sUrl);
    Regex scriptTag = new Regex("<script[^>]+src\\s*=\\s*(?:'(?<src>[^']+)'|\"(?<src>[^\"]+)\"|(?<src>[^>\\s]+))\\s*[^>]*>", RegexOptions.IgnorePatternWhitespace | RegexOptions.Multiline | RegexOptions.IgnoreCase);
    MatchCollection scripts = scriptTag.Matches(sContent);

    // Scripts are visited from the last match to the first, as the original loop did.
    for (int i = scripts.Count - 1; i >= 0; i--)
    {
        string scriptUrl = CRegex.GetUrl(sUrl, scripts[i].Groups["src"].Value);

        // Domains are identifiers, not linguistic text: compare them ordinally and
        // case-insensitively instead of lower-casing with the current culture.
        if (!string.Equals(pageDomain, CRegex.GetDomain(scriptUrl), System.StringComparison.OrdinalIgnoreCase))
        {
            continue;
        }

        string scriptBody = CSocket.GetHtmlByUrl(scriptUrl);

        // Skip scripts whose download produced nothing (null is tolerated as well as "").
        if (string.IsNullOrEmpty(scriptBody))
        {
            continue;
        }

        smethod_0(scriptBody, scriptUrl, ref links);
    }

    if (links.Count == 0)
    {
        return GetLinksFromRss(sContent, sUrl, ref lisDes);
    }

    return links;
}
/// <summary>
/// Decides whether a page should be treated as HTML or as an RSS feed.
/// </summary>
/// <param name="sUrl">Address of the page; used to resolve the feed address found in the markup.</param>
/// <param name="sHtml">
/// Page content. When the page advertises an RSS feed through a <c>&lt;link&gt;</c> element
/// with an <c>href</c>, this is replaced by the result of <c>GetHtmlByUrl</c> for that feed.
/// </param>
/// <returns>
/// <c>PageType.RSS</c> when a feed link with an <c>href</c> is found, or when no feed link
/// exists but the content contains an <c>&lt;rss ...&gt;</c> tag; otherwise <c>PageType.HTML</c>.
/// </returns>
public static PageType GetPageType(string sUrl, ref string sHtml)
{
    // First look for an RSS feed advertised by the page.
    Match feedLink = Regex.Match(sHtml, @"<link\s+[^>]*((type=""application/rss\+xml"")|(type=application/rss\+xml))[^>]*>", RegexOptions.IgnoreCase);

    if (!feedLink.Success)
    {
        // No feed link: check whether this address is itself an RSS feed.
        bool hasRssTag = Regex.IsMatch(sHtml, @"<rss\s+[^>]*>", RegexOptions.IgnoreCase);
        return hasRssTag ? PageType.RSS : PageType.HTML;
    }

    // A feed link exists, so switch to fetching from the feed it points at.
    Match href = Regex.Match(feedLink.Value, @"href=\s*(?:'(?<href>[^']+)'|""(?<href>[^""]+)""|(?<href>[^>\s]+))", RegexOptions.IgnoreCase);
    if (!href.Success)
    {
        return PageType.HTML;
    }

    // The href may be a relative path; combine it with the page address.
    string feedUrl = CRegex.GetUrl(sUrl, href.Groups["href"].Value);
    sHtml = GetHtmlByUrl(feedUrl);
    return PageType.RSS;
}
/// <summary>
/// Classifies the supplied content as an HTML page or an RSS feed.
/// </summary>
/// <param name="sUrl">Address the content came from; passed to <c>CRegex.GetUrl</c> together with a discovered feed <c>href</c>.</param>
/// <param name="sHtml">
/// Content to inspect. Overwritten with the result of <c>GetHtmlByUrl</c> when a feed
/// <c>&lt;link&gt;</c> carrying an <c>href</c> is present.
/// </param>
/// <returns><c>PageType.RSS</c> or <c>PageType.HTML</c>, as described in the body comments.</returns>
public static PageType GetPageType(string sUrl, ref string sHtml)
{
    PageType result = PageType.HTML;

    Regex feedLinkRegex = new Regex("<link\\s+[^>]*((type=\"application/rss\\+xml\")|(type=application/rss\\+xml))[^>]*>", RegexOptions.IgnoreCase);
    Match feedLink = feedLinkRegex.Match(sHtml);

    if (feedLink.Captures.Count == 0)
    {
        // No feed <link>: the content counts as RSS only if it contains an <rss ...> tag.
        Regex rssTagRegex = new Regex(@"<rss\s+[^>]*>", RegexOptions.IgnoreCase);
        Match rssTag = rssTagRegex.Match(sHtml);
        if (rssTag.Captures.Count > 0)
        {
            result = PageType.RSS;
        }
    }
    else
    {
        // A feed <link> was found: pull its href out of the matched tag text.
        Regex hrefRegex = new Regex("href=\\s*(?:'(?<href>[^']+)'|\"(?<href>[^\"]+)\"|(?<href>[^>\\s]+))", RegexOptions.IgnoreCase);
        Match href = hrefRegex.Match(feedLink.Captures[0].Value);
        if (href.Captures.Count > 0)
        {
            string feedUrl = CRegex.GetUrl(sUrl, href.Groups["href"].Value);
            sHtml = GetHtmlByUrl(feedUrl);
            result = PageType.RSS;
        }
    }

    return result;
}
/// <summary>
/// Collects one address per match of a pattern, taken from the match's <c>src</c> group.
/// </summary>
/// <param name="string_0">Regular-expression pattern; matched case-insensitively.</param>
/// <param name="string_1">Passed as the first argument to <c>CRegex.GetUrl</c> for every match (presumably the base address; confirm against <c>CRegex</c>).</param>
/// <param name="string_2">Text that is searched.</param>
/// <returns>The <c>CRegex.GetUrl</c> results in match order; empty when nothing matches.</returns>
private static string[] smethod_2(string string_0, string string_1, string string_2)
{
    // A typed list replaces the ArrayList + Type.GetType("System.String") round trip.
    List<string> found = new List<string>();
    Regex regex = new Regex(string_0, RegexOptions.IgnoreCase);

    foreach (Match match in regex.Matches(string_2))
    {
        found.Add(CRegex.GetUrl(string_1, match.Groups["src"].Value));
    }

    return found.ToArray();
}
/// <summary>
/// Extracts the <c>src</c> group of every match of a pattern in the content and
/// passes each value through <c>CRegex.GetUrl</c> together with the page address.
/// </summary>
/// <param name="strReg">Regular-expression pattern; matched case-insensitively.</param>
/// <param name="url">Address handed to <c>CRegex.GetUrl</c> alongside each captured value.</param>
/// <param name="content">Text that is searched.</param>
/// <returns>The resulting addresses in match order; an empty array when there is no match.</returns>
private static string[] DealWithFrame(string strReg, string url, string content)
{
    // Strongly typed list: no cast and no reflection-based type lookup needed at the end.
    List<string> frames = new List<string>();
    Regex r = new Regex(strReg, RegexOptions.IgnoreCase);

    for (Match m = r.Match(content); m.Success; m = m.NextMatch())
    {
        frames.Add(CRegex.GetUrl(url, m.Groups["src"].Value));
    }

    return frames.ToArray();
}
/// <summary>
/// Replaces links: for every match of a pattern, the text captured by the named group
/// is swapped for the result of <c>CRegex.GetUrl</c> applied to the page address and that text.
/// </summary>
/// <param name="strRe">Regular-expression pattern locating the constructs to rewrite.</param>
/// <param name="subMatch">Name of the group inside <paramref name="strRe"/> that captures the address.</param>
/// <param name="sFormartted">Text to rewrite.</param>
/// <param name="sPageUrl">Page address passed to <c>CRegex.GetUrl</c> with each captured address.</param>
/// <returns>The text with each captured address replaced; matches without a usable capture are left as they are.</returns>
private static string _ReplaceUrl(string strRe, string subMatch, string sFormartted, string sPageUrl)
{
    Regex re = new Regex(strRe, RegexOptions.Multiline | RegexOptions.IgnorePatternWhitespace | RegexOptions.IgnoreCase);

    return re.Replace(sFormartted, delegate(Match mc)
    {
        Group target = mc.Groups[subMatch];
        int start = target.Index - mc.Index;

        // Leave the match alone when the group did not participate, captured nothing
        // (String.Replace with an empty search string used to throw here), or lies
        // outside the matched span (possible when it is captured inside a lookaround).
        if (!target.Success || target.Length == 0 || start < 0 || start + target.Length > mc.Length)
        {
            return mc.Value;
        }

        // Splice by position so that only the captured span changes. A text-based
        // replace would also rewrite other occurrences of the same characters inside
        // the match (e.g. a captured "a" within <a href="a">).
        string whole = mc.Value;
        return whole.Substring(0, start)
            + CRegex.GetUrl(sPageUrl, target.Value)
            + whole.Substring(start + target.Length);
    });
}
/// <summary>
/// Gathers links from a page and additionally from scripts it includes from its own domain.
/// </summary>
/// <param name="sContent">Markup of the page to scan.</param>
/// <param name="sUrl">Address of the page; combined with each script <c>src</c> via <c>CRegex.GetUrl</c> and used for the same-domain check.</param>
/// <param name="lisDes">Handed to <c>GetLinksFromRss</c> when no links were found.</param>
/// <returns>
/// The dictionary filled by <c>_GetLinks</c>; when it is empty, whatever
/// <c>GetLinksFromRss</c> returns for the same content and address.
/// </returns>
public static Dictionary<string, string> GetLinks(string sContent, string sUrl, ref Dictionary<string, string> lisDes)
{
    Dictionary<string, string> lisA = new Dictionary<string, string>();
    _GetLinks(sContent, sUrl, ref lisA);

    string domain = CRegex.GetDomain(sUrl);

    // Harvest the links emitted by referenced scripts.
    Regex re = new Regex(@"<script[^>]+src\s*=\s*(?:'(?<src>[^']+)'|""(?<src>[^""]+)""|(?<src>[^>\s]+))\s*[^>]*>", RegexOptions.Multiline | RegexOptions.IgnorePatternWhitespace | RegexOptions.IgnoreCase);
    MatchCollection mcs = re.Matches(sContent);

    // Iterate from the last script tag to the first (order kept from the original).
    for (int i = mcs.Count - 1; i >= 0; i--)
    {
        string subUrl = CRegex.GetUrl(sUrl, mcs[i].Groups["src"].Value);

        // Only scripts from the same domain are mined. The comparison is ordinal and
        // case-insensitive rather than a culture-sensitive ToLower()/CompareTo pair.
        bool sameDomain = string.Equals(domain, CRegex.GetDomain(subUrl), System.StringComparison.OrdinalIgnoreCase);
        if (sameDomain)
        {
            string subContent = CSocket.GetHtmlByUrl(subUrl);

            // Nothing to scan when the fetch yielded null or an empty string.
            if (!string.IsNullOrEmpty(subContent))
            {
                _GetLinks(subContent, subUrl, ref lisA);
            }
        }
    }

    if (lisA.Count == 0)
    {
        return GetLinksFromRss(sContent, sUrl, ref lisDes);
    }

    return lisA;
}