Ejemplo n.º 1
0
        /// <summary>
        /// Determines whether a fetched page is plain HTML or an RSS feed.
        /// </summary>
        /// <remarks>
        /// If the page advertises a feed through a
        /// <c>&lt;link type="application/rss+xml" href="..."&gt;</c> tag, the feed is fetched with
        /// <c>GetHtmlByUrl</c> and <paramref name="sHtml"/> is replaced by the fetched content.
        /// Otherwise the page itself is checked for an <c>&lt;rss ...&gt;</c> element.
        /// </remarks>
        /// <param name="sUrl">URL of the page; handed to <c>CRegex.GetUrl</c> together with the feed href, which may be relative.</param>
        /// <param name="sHtml">Page content. Overwritten with the feed content when a linked RSS feed is found.</param>
        /// <returns>
        /// <c>PageType.RSS</c> when a linked feed with an href was found or the page itself contains
        /// an <c>&lt;rss&gt;</c> element; otherwise <c>PageType.HTML</c>.
        /// </returns>
        public static PageType GetPageType(string sUrl, ref string sHtml)
        {
            PageType pt = PageType.HTML;

            // Nothing to inspect; Regex.Match would throw on a null input.
            if (string.IsNullOrEmpty(sHtml))
            {
                return pt;
            }

            // Look for an RSS feed advertised by the page. The type attribute may be
            // double-quoted, single-quoted or unquoted, with optional whitespace around '='.
            string regRss = @"<link\s+[^>]*type\s*=\s*(?:""application/rss\+xml""|'application/rss\+xml'|application/rss\+xml)[^>]*>";
            Match  linkTag = Regex.Match(sHtml, regRss, RegexOptions.IgnoreCase);

            if (linkTag.Success)
            {
                // A feed is advertised: switch to fetching from the RSS feed.
                string regHref = @"href=\s*(?:'(?<href>[^']+)'|""(?<href>[^""]+)""|(?<href>[^>\s]+))";
                Match  href    = Regex.Match(linkTag.Value, regHref, RegexOptions.IgnoreCase);
                if (href.Success)
                {
                    // The href may be a relative path; combine it with the page URL.
                    string rssFile = CRegex.GetUrl(sUrl, href.Groups["href"].Value);
                    sHtml = GetHtmlByUrl(rssFile);
                    pt    = PageType.RSS;
                }
            }
            else if (Regex.IsMatch(sHtml, @"<rss\s+[^>]*>", RegexOptions.IgnoreCase))
            {
                // No advertised feed, but the address itself is an RSS feed.
                pt = PageType.RSS;
            }

            return pt;
        }
Ejemplo n.º 2
0
        /// <summary>
        /// Collects the URL of every frame matched by <paramref name="strReg"/> in <paramref name="content"/>.
        /// </summary>
        /// <param name="strReg">Regular expression (matched case-insensitively) whose named group <c>src</c> captures the frame address.</param>
        /// <param name="url">URL of the containing page; passed to <c>CRegex.GetUrl</c> together with each captured address.</param>
        /// <param name="content">Text to search.</param>
        /// <returns>One entry per match, in document order; an empty array when nothing matches.</returns>
        private static string[] DealWithFrame(string strReg, string url, string content)
        {
            ArrayList alFrame = new ArrayList();
            Regex     r       = new Regex(strReg, RegexOptions.IgnoreCase);

            for (Match m = r.Match(content); m.Success; m = m.NextMatch())
            {
                alFrame.Add(CRegex.GetUrl(url, m.Groups["src"].Value));
            }

            // typeof(string) is resolved at compile time; no reflection lookup by type name needed.
            return (string[])alFrame.ToArray(typeof(string));
        }
Ejemplo n.º 3
0
        /// <summary>
        /// Rewrites the links found in a piece of text: every URL captured by
        /// <paramref name="strRe"/> is replaced with the result of <c>CRegex.GetUrl</c>.
        /// </summary>
        /// <param name="strRe">Regular expression locating the fragments that contain a URL.</param>
        /// <param name="subMatch">Name of the group in <paramref name="strRe"/> that captures the URL itself.</param>
        /// <param name="sFormartted">Text to process.</param>
        /// <param name="sPageUrl">URL of the page; passed to <c>CRegex.GetUrl</c> together with each captured URL.</param>
        /// <returns>
        /// The text with each captured URL replaced. Matches whose group is empty or did not
        /// participate are left untouched.
        /// </returns>
        private static string _ReplaceUrl(string strRe, string subMatch, string sFormartted, string sPageUrl)
        {
            Regex re = new Regex(strRe, RegexOptions.Multiline | RegexOptions.IgnorePatternWhitespace | RegexOptions.IgnoreCase);

            // Each match is rewritten exactly once and only the captured span is replaced.
            // Plain string.Replace would throw on an empty capture, would touch every
            // occurrence of the URL text inside the match, and could rewrite text that an
            // earlier iteration had already replaced.
            MatchEvaluator rewrite = delegate(Match mc)
            {
                Group url = mc.Groups[subMatch];
                if (!url.Success || url.Length == 0)
                {
                    return mc.Value;
                }

                // A group captured inside a lookaround can lie outside the match; leave such matches alone.
                int offset = url.Index - mc.Index;
                if (offset < 0 || offset + url.Length > mc.Length)
                {
                    return mc.Value;
                }

                return mc.Value.Substring(0, offset)
                       + CRegex.GetUrl(sPageUrl, url.Value)
                       + mc.Value.Substring(offset + url.Length);
            };

            return re.Replace(sFormartted, rewrite);
        }
Ejemplo n.º 4
0
        /// <summary>
        /// Collects the links of a page: first from the page content itself, then from every
        /// external script referenced by the page that is on the same domain as the page.
        /// If no link is found at all, falls back to <c>GetLinksFromRss</c>.
        /// </summary>
        /// <param name="sContent">Page content.</param>
        /// <param name="sUrl">Page URL.</param>
        /// <param name="lisDes">Link list; in this method it is only forwarded to <c>GetLinksFromRss</c> in the fallback case.</param>
        /// <returns>
        /// The links gathered by <c>_GetLinks</c>, or the result of <c>GetLinksFromRss</c> when none were found.
        /// </returns>
        public static Dictionary <string, string> GetLinks(string sContent, string sUrl, ref Dictionary <string, string> lisDes)
        {
            Dictionary <string, string> lisA = new Dictionary <string, string>();

            _GetLinks(sContent, sUrl, ref lisA);

            string domain = CRegex.GetDomain(sUrl);

            // Pick up links emitted by external scripts.
            Regex           re  = new Regex(@"<script[^>]+src\s*=\s*(?:'(?<src>[^']+)'|""(?<src>[^""]+)""|(?<src>[^>\s]+))\s*[^>]*>", RegexOptions.Multiline | RegexOptions.IgnorePatternWhitespace | RegexOptions.IgnoreCase);
            MatchCollection mcs = re.Matches(sContent);

            // Scripts are visited from the last one to the first, as before.
            for (int i = mcs.Count - 1; i >= 0; i--)
            {
                string subUrl = CRegex.GetUrl(sUrl, mcs[i].Groups["src"].Value);

                // Only scripts from the same domain are mined. Domain names are compared
                // ordinally and case-insensitively, so the result does not depend on the
                // current culture and a null domain cannot cause a NullReferenceException.
                if (!string.Equals(domain, CRegex.GetDomain(subUrl), System.StringComparison.OrdinalIgnoreCase))
                {
                    continue;
                }

                string subContent = CSocket.GetHtmlByUrl(subUrl);
                if (string.IsNullOrEmpty(subContent))
                {
                    continue;
                }

                _GetLinks(subContent, subUrl, ref lisA);
            }

            if (lisA.Count == 0)
            {
                return GetLinksFromRss(sContent, sUrl, ref lisDes);
            }

            return lisA;
        }