Пример #1
0
        /// <summary>
        /// Collects links from a page and from the external scripts it references
        /// that live on the same domain as the page.
        /// </summary>
        /// <param name="sContent">Markup of the page to scan.</param>
        /// <param name="sUrl">Address of the page; passed to <c>CRegex.GetUrl</c> together with each script's <c>src</c> value and used for the same-domain check.</param>
        /// <param name="lisDes">Forwarded to <c>GetLinksFromRss</c> when no links were collected.</param>
        /// <returns>
        /// The dictionary filled by <c>smethod_0</c>; when it is still empty after all
        /// scripts were processed, the result of <c>GetLinksFromRss</c> instead.
        /// </returns>
        public static Dictionary <string, string> GetLinks(string sContent, string sUrl, ref Dictionary <string, string> lisDes)
        {
            Dictionary <string, string> dictionary = new Dictionary <string, string>();

            smethod_0(sContent, sUrl, ref dictionary);

            string          pageDomain = CRegex.GetDomain(sUrl);
            MatchCollection matchs     = new Regex("<script[^>]+src\\s*=\\s*(?:'(?<src>[^']+)'|\"(?<src>[^\"]+)\"|(?<src>[^>\\s]+))\\s*[^>]*>", RegexOptions.IgnorePatternWhitespace | RegexOptions.Multiline | RegexOptions.IgnoreCase).Matches(sContent);

            // Scripts are visited from the last match to the first, as before.
            for (int i = matchs.Count - 1; i >= 0; i--)
            {
                string url = CRegex.GetUrl(sUrl, matchs[i].Groups["src"].Value);

                // Host names are identifiers, not linguistic text: compare them ordinally and
                // case-insensitively. A culture-sensitive compare of lower-cased strings can
                // report different hosts as equal and depends on the current culture.
                if (!string.Equals(pageDomain, CRegex.GetDomain(url), System.StringComparison.OrdinalIgnoreCase))
                {
                    continue;
                }

                string htmlByUrl = CSocket.GetHtmlByUrl(url);
                if (string.IsNullOrEmpty(htmlByUrl))
                {
                    // Nothing was retrieved for this script; skip it.
                    continue;
                }
                smethod_0(htmlByUrl, url, ref dictionary);
            }

            if (dictionary.Count == 0)
            {
                return(GetLinksFromRss(sContent, sUrl, ref lisDes));
            }
            return(dictionary);
        }
Пример #2
0
        /// <summary>
        /// Classifies a page as HTML or RSS. When the page advertises an RSS feed through a
        /// <c>&lt;link type="application/rss+xml"&gt;</c> tag, the feed is fetched and returned
        /// through <paramref name="sHtml"/>.
        /// </summary>
        /// <param name="sUrl">Address of the page; passed to <c>CRegex.GetUrl</c> together with the feed's <c>href</c> value.</param>
        /// <param name="sHtml">Page content. Overwritten with the result of <c>GetHtmlByUrl</c> when a feed link with an <c>href</c> is found.</param>
        /// <returns>
        /// <c>PageType.RSS</c> when a feed link with an <c>href</c> was followed, or when no feed
        /// link exists and the content contains an <c>&lt;rss ...&gt;</c> tag; otherwise <c>PageType.HTML</c>.
        /// </returns>
        public static PageType GetPageType(string sUrl, ref string sHtml)
        {
            // Look for an advertised RSS feed. The type value may be double-quoted,
            // single-quoted or unquoted, mirroring what the href pattern below accepts.
            string regRss   = @"<link\s+[^>]*((type=""application/rss\+xml"")|(type='application/rss\+xml')|(type=application/rss\+xml))[^>]*>";
            Match  feedLink = new Regex(regRss, RegexOptions.IgnoreCase).Match(sHtml);

            if (feedLink.Success)
            {
                // A feed is advertised: switch to fetching from the feed itself.
                string regHref = @"href=\s*(?:'(?<href>[^']+)'|""(?<href>[^""]+)""|(?<href>[^>\s]+))";
                Match  href    = new Regex(regHref, RegexOptions.IgnoreCase).Match(feedLink.Value);

                if (!href.Success)
                {
                    // Feed link without an href: the page is still treated as HTML.
                    return(PageType.HTML);
                }

                // The href may be a relative path; turn it into an absolute one first.
                string rssFile = CRegex.GetUrl(sUrl, href.Groups["href"].Value);
                sHtml = GetHtmlByUrl(rssFile);
                return(PageType.RSS);
            }

            // No feed link: check whether this address is itself an RSS feed.
            if (new Regex(@"<rss\s+[^>]*>", RegexOptions.IgnoreCase).IsMatch(sHtml))
            {
                return(PageType.RSS);
            }

            return(PageType.HTML);
        }
Пример #3
0
        /// <summary>
        /// Determines whether the given content is an HTML page or an RSS feed. If the page
        /// carries a <c>&lt;link&gt;</c> tag of type <c>application/rss+xml</c> with an
        /// <c>href</c>, that feed is downloaded into <paramref name="sHtml"/>.
        /// </summary>
        /// <param name="sUrl">Address of the page; handed to <c>CRegex.GetUrl</c> along with the feed's <c>href</c> value.</param>
        /// <param name="sHtml">Content to inspect. Replaced by the result of <c>GetHtmlByUrl</c> when a feed link is followed.</param>
        /// <returns>
        /// <c>PageType.RSS</c> when a feed link was followed, or when there is no feed link and
        /// the content has an <c>&lt;rss ...&gt;</c> tag; <c>PageType.HTML</c> in every other case.
        /// </returns>
        public static PageType GetPageType(string sUrl, ref string sHtml)
        {
            PageType pageType = PageType.HTML;

            // Accept the type attribute double-quoted, single-quoted or bare. Single-quoted
            // attributes are valid HTML and the href pattern below already allows them.
            string pattern = "<link\\s+[^>]*((type=\"application/rss\\+xml\")|(type='application/rss\\+xml')|(type=application/rss\\+xml))[^>]*>";
            Match  linkTag = new Regex(pattern, RegexOptions.IgnoreCase).Match(sHtml);

            if (linkTag.Success)
            {
                string hrefPattern = "href=\\s*(?:'(?<href>[^']+)'|\"(?<href>[^\"]+)\"|(?<href>[^>\\s]+))";
                Match  hrefMatch   = new Regex(hrefPattern, RegexOptions.IgnoreCase).Match(linkTag.Value);

                if (hrefMatch.Success)
                {
                    string url = CRegex.GetUrl(sUrl, hrefMatch.Groups["href"].Value);
                    sHtml    = GetHtmlByUrl(url);
                    pageType = PageType.RSS;
                }

                // A feed link was present: the <rss> check below is intentionally skipped,
                // even when the link had no usable href.
                return(pageType);
            }

            // No feed link: the content itself may be an RSS document.
            if (new Regex(@"<rss\s+[^>]*>", RegexOptions.IgnoreCase).IsMatch(sHtml))
            {
                pageType = PageType.RSS;
            }
            return(pageType);
        }
Пример #4
0
        /// <summary>
        /// Applies the pattern <paramref name="string_0"/> (case-insensitively) to
        /// <paramref name="string_2"/> and, for every match, passes the value of its
        /// <c>src</c> group through <c>CRegex.GetUrl</c> with <paramref name="string_1"/>.
        /// </summary>
        /// <param name="string_0">Regular expression; expected to define a group named <c>src</c>.</param>
        /// <param name="string_1">First argument given to <c>CRegex.GetUrl</c> (presumably the base URL; confirm against that helper).</param>
        /// <param name="string_2">Text to search.</param>
        /// <returns>One entry per match, in match order.</returns>
        private static string[] smethod_2(string string_0, string string_1, string string_2)
        {
            ArrayList collected = new ArrayList();
            Match     current   = new Regex(string_0, RegexOptions.IgnoreCase).Match(string_2);

            while (current.Success)
            {
                string src = current.Groups["src"].Value;
                collected.Add(CRegex.GetUrl(string_1, src));
                current = current.NextMatch();
            }

            return((string[])collected.ToArray(typeof(string)));
        }
Пример #5
0
        /// <summary>
        /// Finds every match of <paramref name="strReg"/> (case-insensitive) in
        /// <paramref name="content"/> and returns the <c>src</c> group of each one after
        /// passing it through <c>CRegex.GetUrl</c> with <paramref name="url"/>.
        /// </summary>
        /// <param name="strReg">Regular expression; expected to define a group named <c>src</c>.</param>
        /// <param name="url">First argument given to <c>CRegex.GetUrl</c> (presumably the page address used as base; confirm against that helper).</param>
        /// <param name="content">Markup to search.</param>
        /// <returns>One entry per match, in match order.</returns>
        private static string[] DealWithFrame(string strReg, string url, string content)
        {
            Regex     pattern = new Regex(strReg, RegexOptions.IgnoreCase);
            ArrayList sources = new ArrayList();

            for (Match hit = pattern.Match(content); hit.Success; hit = hit.NextMatch())
            {
                sources.Add(CRegex.GetUrl(url, hit.Groups["src"].Value));
            }

            return((string[])sources.ToArray(typeof(string)));
        }
Пример #6
0
        /// <summary>
        /// Rewrites links: for every match of <paramref name="strRe"/> in
        /// <paramref name="sFormartted"/>, the text captured by the group
        /// <paramref name="subMatch"/> is replaced with the result of
        /// <c>CRegex.GetUrl(sPageUrl, capturedText)</c>.
        /// </summary>
        /// <param name="strRe">Regular expression locating the fragments to rewrite.</param>
        /// <param name="subMatch">Name of the group inside <paramref name="strRe"/> that captures the URL.</param>
        /// <param name="sFormartted">Text to process.</param>
        /// <param name="sPageUrl">First argument given to <c>CRegex.GetUrl</c> for every captured URL.</param>
        /// <returns>The text with each captured URL replaced; matches whose group captured nothing are left as they are.</returns>
        /// <remarks>
        /// The replacement is positional and done in a single pass. Replacing by value would
        /// also rewrite other occurrences of the captured text inside the same match (for
        /// example <c>href="a"</c> would corrupt the <c>&lt;a</c> tag name), would throw when the
        /// group is empty, and could touch text that an earlier iteration had already rewritten.
        /// </remarks>
        private static string _ReplaceUrl(string strRe, string subMatch, string sFormartted, string sPageUrl)
        {
            Regex re = new Regex(strRe, RegexOptions.Multiline | RegexOptions.IgnorePatternWhitespace | RegexOptions.IgnoreCase);

            return(re.Replace(sFormartted, delegate(Match mc)
            {
                Group target = mc.Groups[subMatch];

                // Group did not participate or captured an empty string: nothing to rewrite.
                if (!target.Success || target.Length == 0)
                {
                    return mc.Value;
                }

                // Position of the capture relative to the start of the match.
                int offset = target.Index - mc.Index;

                // A capture made inside a lookaround can lie outside the matched text;
                // leave such matches untouched rather than slicing out of range.
                if (offset < 0 || offset + target.Length > mc.Length)
                {
                    return mc.Value;
                }

                string original = mc.Value;
                return original.Substring(0, offset)
                       + CRegex.GetUrl(sPageUrl, target.Value)
                       + original.Substring(offset + target.Length);
            }));
        }
Пример #7
0
        /// <summary>
        /// Gathers links from a page and from the same-domain scripts it includes.
        /// </summary>
        /// <param name="sContent">Markup of the page to scan.</param>
        /// <param name="sUrl">Address of the page; passed to <c>CRegex.GetUrl</c> with each script's <c>src</c> value and used for the same-domain check.</param>
        /// <param name="lisDes">Forwarded to <c>GetLinksFromRss</c> when no links were found.</param>
        /// <returns>
        /// The dictionary filled by <c>_GetLinks</c>; when it is empty after all scripts were
        /// processed, the result of <c>GetLinksFromRss</c> instead.
        /// </returns>
        public static Dictionary <string, string> GetLinks(string sContent, string sUrl, ref Dictionary <string, string> lisDes)
        {
            Dictionary <string, string> lisA = new Dictionary <string, string>();

            _GetLinks(sContent, sUrl, ref lisA);

            string domain = CRegex.GetDomain(sUrl);

            // Also pick up links that are emitted by external scripts.
            Regex           re  = new Regex(@"<script[^>]+src\s*=\s*(?:'(?<src>[^']+)'|""(?<src>[^""]+)""|(?<src>[^>\s]+))\s*[^>]*>", RegexOptions.Multiline | RegexOptions.IgnorePatternWhitespace | RegexOptions.IgnoreCase);
            MatchCollection mcs = re.Matches(sContent);

            // Scripts are visited from the last match to the first.
            for (int i = mcs.Count - 1; i >= 0; i--)
            {
                string subUrl = CRegex.GetUrl(sUrl, mcs[i].Groups["src"].Value);

                // Only scripts on the page's own domain are mined. Host names are compared
                // ordinally and case-insensitively: a culture-sensitive compare of lower-cased
                // strings can equate different hosts and varies with the current culture.
                if (!string.Equals(domain, CRegex.GetDomain(subUrl), System.StringComparison.OrdinalIgnoreCase))
                {
                    continue;
                }

                string subContent = CSocket.GetHtmlByUrl(subUrl);
                if (string.IsNullOrEmpty(subContent))
                {
                    // Nothing was retrieved for this script; skip it.
                    continue;
                }

                _GetLinks(subContent, subUrl, ref lisA);
            }

            if (lisA.Count == 0)
            {
                return(GetLinksFromRss(sContent, sUrl, ref lisDes));
            }

            return(lisA);
        }