/// <summary>
/// Classifies page content as <c>PageType.HTML</c> or <c>PageType.RSS</c>.
/// </summary>
/// <remarks>
/// Detection runs in two stages:
/// 1. If the content contains a link tag whose type is application/rss+xml and that tag
///    carries an href, the href is resolved against <paramref name="sUrl"/> through
///    <c>CRegex.GetUrl</c>, the result is fetched with <c>GetHtmlByUrl</c>,
///    <paramref name="sHtml"/> is overwritten with whatever that call returns, and the
///    page is reported as RSS. If the tag has no href the page is reported as HTML.
/// 2. If there is no such link tag, the content is reported as RSS when it contains an
///    opening rss element with at least one attribute, and as HTML otherwise.
/// </remarks>
/// <param name="sUrl">Address the content was loaded from; used as the base when resolving the feed href.</param>
/// <param name="sHtml">Content to inspect. Replaced with the fetched feed content when a feed link with an href is found.</param>
/// <returns><c>PageType.RSS</c> when a feed was found (linked or the content itself), otherwise <c>PageType.HTML</c>.</returns>
public static PageType GetPageType(string sUrl, ref string sHtml)
{
    // Check whether the page advertises an RSS feed through a link tag.
    Regex feedLinkPattern = new Regex(
        @"<link\s+[^>]*((type=""application/rss\+xml"")|(type=application/rss\+xml))[^>]*>",
        RegexOptions.IgnoreCase);
    Match feedLink = feedLinkPattern.Match(sHtml);

    if (!feedLink.Success)
    {
        // No feed link: check whether this address is itself an RSS feed.
        Regex rssRootPattern = new Regex(@"<rss\s+[^>]*>", RegexOptions.IgnoreCase);
        return rssRootPattern.IsMatch(sHtml) ? PageType.RSS : PageType.HTML;
    }

    // A feed link exists: switch to fetching from the RSS feed it points at.
    Regex hrefPattern = new Regex(
        @"href=\s*(?:'(?<href>[^']+)'|""(?<href>[^""]+)""|(?<href>[^>\s]+))",
        RegexOptions.IgnoreCase);
    Match href = hrefPattern.Match(feedLink.Value);

    if (!href.Success)
    {
        // The link tag has no usable href, so the page is treated as plain HTML.
        return PageType.HTML;
    }

    // The href may be a relative path, so resolve it against the page address.
    string rssFile = CRegex.GetUrl(sUrl, href.Groups["href"].Value);
    sHtml = GetHtmlByUrl(rssFile);
    return PageType.RSS;
}
/// <summary>
/// Collects the frame addresses found in <paramref name="content"/>.
/// </summary>
/// <remarks>
/// Every match of <paramref name="strReg"/> (case-insensitive) contributes one entry:
/// the value of its named group "src", passed through <c>CRegex.GetUrl</c> together
/// with <paramref name="url"/>. Entries are returned in match order.
/// </remarks>
/// <param name="strReg">Regular expression used to locate frames; its "src" group supplies the address.</param>
/// <param name="url">Address passed to <c>CRegex.GetUrl</c> as the first argument for each match.</param>
/// <param name="content">Text to search.</param>
/// <returns>One string per match; an empty array when nothing matches.</returns>
private static string[] DealWithFrame(string strReg, string url, string content)
{
    ArrayList frameUrls = new ArrayList();
    Regex framePattern = new Regex(strReg, RegexOptions.IgnoreCase);

    for (Match frame = framePattern.Match(content); frame.Success; frame = frame.NextMatch())
    {
        frameUrls.Add(CRegex.GetUrl(url, frame.Groups["src"].Value));
    }

    // typeof(string) is resolved at compile time, unlike a Type.GetType name lookup.
    return (string[])frameUrls.ToArray(typeof(string));
}