Beispiel #1
0
        /// <summary>
        /// Determines whether a page is plain HTML or an RSS feed.
        /// </summary>
        /// <remarks>
        /// If the page advertises a feed through a <c>&lt;link ... type="application/rss+xml" ...&gt;</c>
        /// tag that carries an <c>href</c>, the feed is fetched with <c>GetHtmlByUrl</c> and
        /// <paramref name="sHtml"/> is overwritten with the fetched content. If no such tag exists,
        /// the content itself is checked for an <c>&lt;rss ...&gt;</c> tag.
        /// </remarks>
        /// <param name="sUrl">URL of the page; passed to <c>CRegex.GetUrl</c> together with the feed href, which may be relative.</param>
        /// <param name="sHtml">Page content. Replaced by the feed content when a feed link is followed.</param>
        /// <returns><c>PageType.RSS</c> when a feed link with an href or an <c>&lt;rss&gt;</c> tag is found; otherwise <c>PageType.HTML</c>.</returns>
        public static PageType GetPageType(string sUrl, ref string sHtml)
        {
            PageType pt = PageType.HTML;

            // Nothing to inspect; Regex.Match would throw on a null input.
            if (string.IsNullOrEmpty(sHtml))
            {
                return(pt);
            }

            // Look for an advertised RSS feed. The type attribute may be double-quoted,
            // single-quoted or unquoted, with optional whitespace around '='.
            string regRss = @"<link\s+[^>]*type\s*=\s*(?:""application/rss\+xml""|'application/rss\+xml'|application/rss\+xml)[^>]*>";
            Regex  feedLinkRegex = new Regex(regRss, RegexOptions.IgnoreCase);
            Match  feedLink      = feedLinkRegex.Match(sHtml);

            if (feedLink.Success)
            {
                // A feed is advertised: extract its href and fetch the feed instead of the page.
                string regHref   = @"href=\s*(?:'(?<href>[^']+)'|""(?<href>[^""]+)""|(?<href>[^>\s]+))";
                Regex  hrefRegex = new Regex(regHref, RegexOptions.IgnoreCase);
                Match  href      = hrefRegex.Match(feedLink.Value);
                if (href.Success)
                {
                    // The href may be relative, so it is combined with the page URL first.
                    string rssFile = CRegex.GetUrl(sUrl, href.Groups["href"].Value);
                    sHtml = GetHtmlByUrl(rssFile);
                    pt    = PageType.RSS;
                }
            }
            else
            {
                // No feed link: check whether the content itself is an RSS document.
                Regex rssTagRegex = new Regex(@"<rss\s+[^>]*>", RegexOptions.IgnoreCase);
                if (rssTagRegex.IsMatch(sHtml))
                {
                    pt = PageType.RSS;
                }
            }

            return(pt);
        }
Beispiel #2
0
        /// <summary>
        /// Decodes <c>\uXXXX</c> escape sequences in a string into the characters they represent.
        /// </summary>
        /// <remarks>
        /// The input is first passed through <c>CRegex.Replace(str, "[\r\n]", "", 0)</c>
        /// (presumably removing CR/LF characters; confirm against CRegex).
        /// Text after a <c>\u</c> marker that is shorter than four characters is dropped together
        /// with the marker. If the four characters after a marker are not valid hexadecimal, the
        /// <see cref="FormatException"/> message is appended to the output, the error is logged,
        /// and decoding stops at that point.
        /// </remarks>
        /// <param name="str">String to decode.</param>
        /// <returns>The decoded string; an empty string when the input is null or empty.</returns>
        public static string UnicodeToString(string str)
        {
            if (string.IsNullOrEmpty(str))
            {
                return("");
            }

            str = CRegex.Replace(str, "[\r\n]", "", 0);
            if (string.IsNullOrEmpty(str))
            {
                return("");
            }

            System.Text.StringBuilder outStr = new System.Text.StringBuilder(str.Length);

            // Split directly on the escape marker. Substituting a sentinel character first
            // would misread any occurrence of that character already present in the input
            // as an escape boundary.
            string[] strlist = str.Split(new string[] { "\\u" }, StringSplitOptions.None);

            try
            {
                // Everything before the first marker is copied through unchanged.
                outStr.Append(strlist[0]);
                for (int i = 1; i < strlist.Length; i++)
                {
                    string segment = strlist[i];
                    if (segment.Length >= 4)
                    {
                        // The first four characters are the hexadecimal code unit;
                        // the remainder of the segment is literal text.
                        outStr.Append((char)int.Parse(segment.Substring(0, 4), System.Globalization.NumberStyles.HexNumber));
                        outStr.Append(segment, 4, segment.Length - 4);
                    }
                }
            }
            catch (FormatException ex)
            {
                outStr.Append(ex.Message);
                LogHelper.WriteLog(LogLevel.LOG_LEVEL_CRIT, ex, typeof(UnicodeHelper));
            }

            return(outStr.ToString());
        }
Beispiel #3
0
        /// <summary>
        /// Scans markup for anchor elements and adds the ones that pass the filters below
        /// to <paramref name="lisA"/>.
        /// </summary>
        /// <param name="sContent">Markup searched for <c>&lt;a ... href=...&gt;...&lt;/a&gt;</c> elements.</param>
        /// <param name="sUrl">Base URL handed to <c>CText.GetUrlByRelative</c> for every href.</param>
        /// <param name="lisA">Receives accepted links: key is the URL, value is the link text.</param>
        private static void _GetLinks(string sContent, string sUrl, ref Dictionary <string, string> lisA)
        {
            const RegexOptions options = RegexOptions.IgnoreCase | RegexOptions.IgnorePatternWhitespace | RegexOptions.Multiline;

            Regex           anchorRegex = new Regex(@"<a\s+[^>]*href\s*=\s*[^>]+>[\s\S]*?</a>", options);
            Regex           quoteRegex  = new Regex(@"""|'", options);
            MatchCollection anchors     = anchorRegex.Matches(sContent);

            // Anchors are processed from the last match to the first.
            int index = anchors.Count;
            while (--index >= 0)
            {
                string anchorHtml = anchors[index].Value;

                // Strip escaped quotes, which appear when the link is emitted by JavaScript.
                string href = GetLink(anchorHtml).Trim().Replace("\\\"", "").Replace("\\\'", "");

                // Skip hrefs that start with "http" and end with "/"
                // (presumably RemoveByReg strips the matching text; confirm against its definition).
                if (RemoveByReg(href, @"^http.*/$").Length < 2)
                {
                    continue;
                }

                // Filter out advertisements and meaningless links by their text.
                string linkText = CString.ClearTag(GetTextByLink(anchorHtml)).Trim();
                if (CString.GetLength(RemoveByReg(linkText, Const.sFilter)) < 9 || quoteRegex.IsMatch(linkText))
                {
                    continue;
                }

                // Switch to the absolute address.
                href = CText.GetUrlByRelative(sUrl, href);
                if (href.Length <= 18)
                {
                    // e.g. "http://www.163.com" is 18 characters long.
                    continue;
                }

                // Drop the fragment ('#' and everything after it).
                int hashPosition = href.IndexOf('#');
                if (hashPosition > -1)
                {
                    href = href.Substring(0, hashPosition);
                }
                href = href.Trim(new char[] { '/', '\\' });

                // Skip the link when it is nothing more than the domain address.
                if (href.Equals(CRegex.GetDomain(href), StringComparison.OrdinalIgnoreCase))
                {
                    continue;
                }

                // Neither the URL nor the link text may already be present.
                if (lisA.ContainsKey(href) || lisA.ContainsValue(linkText))
                {
                    continue;
                }

                lisA.Add(href, linkText);
            }
        }
Beispiel #4
0
        /// <summary>
        /// Collects the frame URLs that a pattern finds in page content.
        /// </summary>
        /// <param name="strReg">Regular expression, matched case-insensitively, that is expected to define a named group <c>src</c> holding the frame URL.</param>
        /// <param name="url">Page URL passed to <c>CRegex.GetUrl</c> together with each captured <c>src</c> value.</param>
        /// <param name="content">Content to search.</param>
        /// <returns>One entry per match, in match order; an empty array when nothing matches.</returns>
        private static string[] DealWithFrame(string strReg, string url, string content)
        {
            // A typed list avoids the untyped ArrayList, the runtime type lookup by name
            // and the cast of the resulting array.
            System.Collections.Generic.List<string> frames = new System.Collections.Generic.List<string>();
            Regex r = new Regex(strReg, RegexOptions.IgnoreCase);

            for (Match m = r.Match(content); m.Success; m = m.NextMatch())
            {
                frames.Add(CRegex.GetUrl(url, m.Groups["src"].Value));
            }

            return(frames.ToArray());
        }
Beispiel #5
0
        /// <summary>
        /// Replaces the URL captured by a named group, in every match of a pattern, with the
        /// result of <c>CRegex.GetUrl(sPageUrl, url)</c>.
        /// </summary>
        /// <param name="strRe">Pattern that locates the URL-bearing text; applied with Multiline, IgnorePatternWhitespace and IgnoreCase.</param>
        /// <param name="subMatch">Name of the capture group in <paramref name="strRe"/> that holds the URL.</param>
        /// <param name="sFormartted">Content to process.</param>
        /// <param name="sPageUrl">Page URL passed as the first argument to <c>CRegex.GetUrl</c>.</param>
        /// <returns>The content with each captured URL replaced; the input unchanged when it is null or empty.</returns>
        private static string _ReplaceUrl(string strRe, string subMatch, string sFormartted, string sPageUrl)
        {
            if (string.IsNullOrEmpty(sFormartted))
            {
                return(sFormartted);
            }

            Regex re = new Regex(strRe, RegexOptions.Multiline | RegexOptions.IgnorePatternWhitespace | RegexOptions.IgnoreCase);

            // Rewrite each match in place, substituting only the exact span of the captured
            // group. A textual String.Replace of the captured value would also rewrite other
            // occurrences of the same text inside the match (e.g. the tag name in
            // <a href="a">) and throws when the captured value is empty.
            return(re.Replace(sFormartted, delegate(Match mc)
            {
                Group url = mc.Groups[subMatch];
                if (!url.Success || url.Length == 0)
                {
                    // Group missing or empty: leave the match untouched.
                    return mc.Value;
                }

                int offset = url.Index - mc.Index;
                return mc.Value.Substring(0, offset)
                       + CRegex.GetUrl(sPageUrl, url.Value)
                       + mc.Value.Substring(offset + url.Length);
            }));
        }
Beispiel #6
0
        /// <summary>
        /// Collects links from page content and from same-domain external scripts it references.
        /// </summary>
        /// <remarks>
        /// Links are gathered with <c>_GetLinks</c> from <paramref name="sContent"/> and from the
        /// content of every <c>&lt;script src=...&gt;</c> whose domain equals the page's domain.
        /// When nothing is found, the result of <c>GetLinksFromRss</c> is returned instead.
        /// </remarks>
        /// <param name="sContent">Page content.</param>
        /// <param name="sUrl">Page URL.</param>
        /// <param name="lisDes">Only forwarded to <c>GetLinksFromRss</c>; not modified directly by this method.</param>
        /// <returns>Dictionary of collected links (filled by <c>_GetLinks</c>), or the <c>GetLinksFromRss</c> result when none were found.</returns>
        public static Dictionary <string, string> GetLinks(string sContent, string sUrl, ref Dictionary <string, string> lisDes)
        {
            Dictionary <string, string> lisA = new Dictionary <string, string>();

            _GetLinks(sContent, sUrl, ref lisA);

            string domain = CRegex.GetDomain(sUrl);

            // Pick up links emitted by external scripts.
            Regex           re  = new Regex(@"<script[^>]+src\s*=\s*(?:'(?<src>[^']+)'|""(?<src>[^""]+)""|(?<src>[^>\s]+))\s*[^>]*>", RegexOptions.Multiline | RegexOptions.IgnorePatternWhitespace | RegexOptions.IgnoreCase);
            MatchCollection mcs = re.Matches(sContent);

            // Scripts are processed from the last match to the first.
            for (int i = mcs.Count - 1; i >= 0; i--)
            {
                string subUrl = CRegex.GetUrl(sUrl, mcs[i].Groups["src"].Value);

                // Only scripts from the page's own domain are examined. The comparison is
                // ordinal and case-insensitive (culture-independent and null-safe).
                if (!string.Equals(domain, CRegex.GetDomain(subUrl), StringComparison.OrdinalIgnoreCase))
                {
                    continue;
                }

                string subContent = CSocket.GetHtmlByUrl(subUrl);
                if (string.IsNullOrEmpty(subContent))
                {
                    continue;
                }

                _GetLinks(subContent, subUrl, ref lisA);
            }

            if (lisA.Count == 0)
            {
                return(GetLinksFromRss(sContent, sUrl, ref lisDes));
            }

            return(lisA);
        }
Beispiel #7
0
        /// <summary>
        /// Cleans page content and passes the URLs it contains through <c>_ReplaceUrl</c>.
        /// </summary>
        /// <remarks>
        /// Steps, in order: remove script blocks, iframes and form-related tags; apply the
        /// caller-supplied removal patterns; process image sources; apply the anti-hotlink
        /// rules configured for the page's domain; process anchor, stylesheet-link, background,
        /// flash-movie and (for XML content) xml-stylesheet URLs.
        /// </remarks>
        /// <param name="sOriContent">Original page content.</param>
        /// <param name="sOtherRemoveReg">Additional removal patterns, one per line, separated by <c>\r\n</c>; may be null or empty.</param>
        /// <param name="sPageUrl">URL of the page; passed to <c>_ReplaceUrl</c> and <c>GetDomain</c>.</param>
        /// <param name="dtAntiLink">Anti-hotlink rules with columns <c>Domain</c>, <c>Type</c> and <c>imgUrl</c>; may be null.</param>
        /// <returns>The processed content.</returns>
        public static string GetContent(string sOriContent, string sOtherRemoveReg, string sPageUrl, DataTable dtAntiLink)
        {
            const string antiLinkPrefix = "http://stat.580k.com/t.asp?url=";

            string sFormartted = sOriContent;

            // Remove dangerous tags.
            sFormartted = Regex.Replace(sFormartted, @"<script[\s\S]*?</script>", "", RegexOptions.Multiline | RegexOptions.IgnorePatternWhitespace | RegexOptions.IgnoreCase);
            sFormartted = Regex.Replace(sFormartted, @"<iframe[^>]*>[\s\S]*?</iframe>", "", RegexOptions.Multiline | RegexOptions.IgnorePatternWhitespace | RegexOptions.IgnoreCase);
            Regex r = new Regex(@"<input[\s\S]+?>|<form[\s\S]+?>|</form[\s\S]*?>|<select[\s\S]+?>?</select>|<textarea[\s\S]*?>?</textarea>|<file[\s\S]*?>|<noscript>|</noscript>", RegexOptions.IgnoreCase);

            sFormartted = r.Replace(sFormartted, "");

            // Caller-supplied removal patterns; a missing list means there is nothing extra to remove.
            if (!string.IsNullOrEmpty(sOtherRemoveReg))
            {
                string[] sOtherReg = sOtherRemoveReg.Split(new string[] { "\r\n" }, StringSplitOptions.RemoveEmptyEntries);
                foreach (string sRemoveReg in sOtherReg)
                {
                    sFormartted = CRegex.Replace(sFormartted, sRemoveReg, "", 0);
                }
            }

            // Image paths.
            sFormartted = _ReplaceUrl("<img[\\s\\S]+?src\\s*=\\s*(?:'(?<src>[^']+)'|\"(?<src>[^\"]+)\"|(?<src>[^>\\s]+))\\s*[^>]*>", "src", sFormartted, sPageUrl);

            // Work around hotlink protection for the page's domain.
            if (dtAntiLink != null)
            {
                string domain = GetDomain(sPageUrl);

                // Single quotes are doubled so the value cannot terminate the string
                // literal inside the filter expression.
                string filter = "Domain='" + (domain ?? "").Replace("'", "''") + "'";

                foreach (DataRow dr in dtAntiLink.Select(filter))
                {
                    string imgUrl = dr["imgUrl"].ToString();
                    if (imgUrl.Length == 0)
                    {
                        // String.Replace throws when the text to replace is empty.
                        continue;
                    }

                    switch (Convert.ToInt32(dr["Type"]))
                    {
                    case 1:    // substitute: the configured URL is replaced by the prefix
                        sFormartted = sFormartted.Replace(imgUrl, antiLinkPrefix);
                        break;

                    default:    // prepend: the prefix is put in front of the configured URL
                        sFormartted = sFormartted.Replace(imgUrl, antiLinkPrefix + imgUrl);
                        break;
                    }
                }
            }

            // Anchor links.
            sFormartted = _ReplaceUrl(@"<a[^>]+href\s*=\s*(?:'(?<href>[^']+)'|""(?<href>[^""]+)""|(?<href>[^>\s]+))\s*[^>]*>", "href", sFormartted, sPageUrl);

            // CSS.
            sFormartted = _ReplaceUrl(@"<link[^>]+href\s*=\s*(?:'(?<href>[^']+)'|""(?<href>[^""]+)""|(?<href>[^>\s]+))\s*[^>]*>", "href", sFormartted, sPageUrl);

            // Background attribute.
            sFormartted = _ReplaceUrl(@"background\s*=\s*(?:'(?<img>[^']+)'|""(?<img>[^""]+)""|(?<img>[^>\s]+))", "img", sFormartted, sPageUrl);
            // Background set through a style: background-image:url(...)
            sFormartted = _ReplaceUrl(@"background-image\s*:\s*url\s*\x28(?<img>[^\x29]+)\x29", "img", sFormartted, sPageUrl);

            // Flash.
            sFormartted = _ReplaceUrl(@"<param\s[^>]+""movie""[^>]+value\s*=\s*""(?<flash>[^"">]+\x2eswf)""[^>]*>", "flash", sFormartted, sPageUrl);

            // XSL.
            if (IsXml(sFormartted))
            {
                sFormartted = _ReplaceUrl(@"<\x3fxml-stylesheet\s+[^\x3f>]+href=\s*(?:'(?<href>[^']+)'|""(?<href>[^""]+)"")\s*[^\x3f>]*\x3f>", "href", sFormartted, sPageUrl);
            }

            // Script src URLs are not processed; the corresponding call is disabled:
            //sFormartted = _ReplaceUrl(@"<script[^>]+src\s*=\s*(?:'(?<src>[^']+)'|""(?<src>[^""]+)""|(?<src>[^>\s]+))\s*[^>]*>", "src", sFormartted,sPageUrl);

            return(sFormartted);
        }
Beispiel #8
0
 /// <summary>
 /// Page content: returns the result of <c>CRegex.Replace</c> applied to the input with a
 /// pattern whose <c>Head</c> group captures a run of non-'&lt;' characters followed by '&lt;'.
 /// </summary>
 /// <remarks>
 /// NOTE(review): the exact effect depends on how <c>CRegex.Replace</c> treats the group
 /// name argument; confirm against its definition.
 /// </remarks>
 /// <param name="sInput">Input content.</param>
 /// <returns>The value returned by <c>CRegex.Replace</c>.</returns>
 public static string GetHtml(string sInput)
 {
     const string headPattern = @"(?<Head>[^<]+)<";
     const string headGroup   = "Head";

     string result = CRegex.Replace(sInput, headPattern, "", headGroup);
     return(result);
 }