Esempio n. 1
0
        /// <summary>
        /// Classifies fetched content as a plain HTML page or as an RSS feed.
        /// </summary>
        /// <param name="sUrl">
        /// Address the content was fetched from; passed to <c>CRegex.GetUrl</c> as the base
        /// for the feed link, which may be a relative path.
        /// </param>
        /// <param name="sHtml">
        /// Content to inspect. When the page advertises a feed through a
        /// <c>&lt;link ... type="application/rss+xml" ...&gt;</c> tag that carries an href,
        /// this argument is overwritten with the result of <c>GetHtmlByUrl</c> for that feed address.
        /// </param>
        /// <returns>
        /// <c>PageType.RSS</c> when a feed link with an href was followed, or when no feed link
        /// exists and the content itself contains an <c>&lt;rss ...&gt;</c> tag;
        /// <c>PageType.HTML</c> otherwise.
        /// </returns>
        public static PageType GetPageType(string sUrl, ref string sHtml)
        {
            // The type attribute may be double-quoted, single-quoted or unquoted,
            // mirroring the three quoting styles the href pattern accepts.
            const string feedLinkPattern = @"<link\s+[^>]*type=(?:""application/rss\+xml""|'application/rss\+xml'|application/rss\+xml)[^>]*>";
            const string hrefPattern     = @"href=\s*(?:'(?<href>[^']+)'|""(?<href>[^""]+)""|(?<href>[^>\s]+))";
            const string rssRootPattern  = @"<rss\s+[^>]*>";

            // Does the page advertise an RSS feed?
            Match feedLink = Regex.Match(sHtml, feedLinkPattern, RegexOptions.IgnoreCase);
            if (!feedLink.Success)
            {
                // No feed link: check whether this address is itself an RSS feed.
                return Regex.IsMatch(sHtml, rssRootPattern, RegexOptions.IgnoreCase)
                    ? PageType.RSS
                    : PageType.HTML;
            }

            // A feed is advertised: pull the href out of the matched <link> tag only.
            Match href = Regex.Match(feedLink.Value, hrefPattern, RegexOptions.IgnoreCase);
            if (!href.Success)
            {
                // A feed link without an href cannot be followed; the page stays HTML.
                return PageType.HTML;
            }

            // The href may be a relative path, so resolve it against the page address,
            // then switch over to the feed content.
            string rssFile = CRegex.GetUrl(sUrl, href.Groups["href"].Value);
            sHtml = GetHtmlByUrl(rssFile);
            return PageType.RSS;
        }
Esempio n. 2
0
        /// <summary>
        /// Collects one address for every match of a frame pattern in a page.
        /// </summary>
        /// <param name="strReg">
        /// Regular expression, applied case-insensitively, whose named group <c>src</c>
        /// captures the frame address.
        /// </param>
        /// <param name="url">Passed to <c>CRegex.GetUrl</c> together with each captured <c>src</c> value.</param>
        /// <param name="content">Text to search.</param>
        /// <returns>
        /// The <c>CRegex.GetUrl</c> result for each match, in match order; an empty array
        /// when the pattern does not match.
        /// </returns>
        private static string[] DealWithFrame(string strReg, string url, string content)
        {
            ArrayList frames       = new ArrayList();
            Regex     framePattern = new Regex(strReg, RegexOptions.IgnoreCase);

            for (Match m = framePattern.Match(content); m.Success; m = m.NextMatch())
            {
                // NOTE(review): presumably resolves a relative src against url - confirm in CRegex.GetUrl.
                frames.Add(CRegex.GetUrl(url, m.Groups["src"].Value));
            }

            // typeof(string) is bound at compile time; no runtime type-name lookup is needed.
            return (string[])frames.ToArray(typeof(string));
        }