Example #1
0
        /// <summary>
        /// Collects hyperlinks from an HTML fragment into <paramref name="lisA"/>,
        /// keyed by absolute URL with the cleaned link text as the value.
        /// </summary>
        /// <param name="sContent">HTML to scan for anchor elements that carry an href attribute.</param>
        /// <param name="sUrl">URL of the page the HTML came from; passed to CText.GetUrlByRelative to make each link absolute.</param>
        /// <param name="lisA">Receives the links (URL -> text). A link is added only when neither its URL nor its text is already present.</param>
        private static void _GetLinks(string sContent, string sUrl, ref Dictionary <string, string> lisA)
        {
            const RegexOptions options = RegexOptions.IgnoreCase | RegexOptions.IgnorePatternWhitespace | RegexOptions.Multiline;

            Regex           reAnchor = new Regex(@"<a\s+[^>]*href\s*=\s*[^>]+>[\s\S]*?</a>", options);
            Regex           reQuote  = new Regex(@"""|'", options);
            MatchCollection mcs      = reAnchor.Matches(sContent);

            // Index of link texts already stored in lisA. Built lazily on first use so that
            // each duplicate-text check is a hash lookup instead of a linear ContainsValue scan.
            HashSet<string> seenTexts = null;

            // Matches are walked last-to-first, so when two links share a text
            // the one appearing later in the document is the one that is kept.
            for (int i = mcs.Count - 1; i >= 0; i--)
            {
                Match  mc      = mcs[i];
                string strHref = GetLink(mc.Value).Trim();

                // Links emitted from JavaScript may carry escaped quotes; drop them.
                strHref = strHref.Replace("\\\"", "");
                strHref = strHref.Replace("\\\'", "");

                // Skip addresses that start with "http" and end with "/"
                // (presumably RemoveByReg strips the matched text, leaving almost nothing behind).
                string strTemp = RemoveByReg(strHref, @"^http.*/$");
                if (strTemp.Length < 2)
                {
                    continue;
                }

                // Filter out advertising or otherwise meaningless links by their text.
                string strText = CString.ClearTag(GetTextByLink(mc.Value)).Trim();
                strTemp = RemoveByReg(strText, Const.sFilter);
                if (CString.GetLength(strTemp) < 9)
                {
                    continue;
                }
                // Link texts containing a quote character are rejected.
                if (reQuote.IsMatch(strText))
                {
                    continue;
                }

                // Switch to the absolute address.
                strHref = CText.GetUrlByRelative(sUrl, strHref);
                if (strHref.Length <= 18) // e.g. http://www.163.com is 18 characters
                {
                    continue;
                }

                // Cut everything from the first '#' onwards, then trim slashes;
                // if what remains is just the domain address, skip the link.
                int charIndex = strHref.IndexOf('#');
                if (charIndex > -1)
                {
                    strHref = strHref.Substring(0, charIndex);
                }
                strHref = strHref.Trim(new char[] { '/', '\\' });
                string tmpDomainURL = CRegex.GetDomain(strHref);
                if (strHref.Equals(tmpDomainURL, StringComparison.OrdinalIgnoreCase))
                {
                    continue;
                }

                if (lisA.ContainsKey(strHref))
                {
                    continue;
                }
                if (seenTexts == null)
                {
                    seenTexts = new HashSet<string>(lisA.Values);
                }
                // Add returns false when the text is already known, which keeps the
                // "unique URL and unique text" rule.
                if (seenTexts.Add(strText))
                {
                    lisA.Add(strHref, strText);
                }
            }
        }
Example #2
0
        /// <summary>
        /// Gets the author name from the input, cleaned with CString.ClearTag and capped at 99 characters.
        /// </summary>
        /// <param name="sInput">Content to search.</param>
        /// <param name="sRegex">Regular-expression string handed to GetText.</param>
        /// <returns>The GetText result for "Author" after CString.ClearTag, cut to its first 99 characters when longer.</returns>
        public static string GetAuthor(string sInput, string sRegex)
        {
            const int maxLength = 99;
            string    cleaned   = CString.ClearTag(GetText(sInput, sRegex, "Author"));

            return cleaned.Length > maxLength ? cleaned.Substring(0, maxLength) : cleaned;
        }
Example #3
0
        /// <summary>
        /// Gets the article source from the input, cleaned with CString.ClearTag and capped at 99 characters.
        /// </summary>
        /// <param name="sInput">Content to search.</param>
        /// <param name="sRegex">Regular-expression string handed to GetText.</param>
        /// <returns>The GetText result for "Source" after CString.ClearTag, cut to its first 99 characters when longer.</returns>
        public static string GetSource(string sInput, string sRegex)
        {
            const int maxLength = 99;
            string    cleaned   = CString.ClearTag(GetText(sInput, sRegex, "Source"));

            return cleaned.Length > maxLength ? cleaned.Substring(0, maxLength) : cleaned;
        }
Example #4
0
        /// <summary>
        /// Gets the article title from the input, cleaned with CString.ClearTag and capped at 99 characters.
        /// </summary>
        /// <param name="sInput">Content to search.</param>
        /// <param name="sRegex">Regular-expression string handed to GetText.</param>
        /// <returns>The GetText result for "Title" after CString.ClearTag, cut to its first 99 characters when longer.</returns>
        public static string GetTitle(string sInput, string sRegex)
        {
            const int maxLength = 99;
            string    cleaned   = CString.ClearTag(GetText(sInput, sRegex, "Title"));

            return cleaned.Length > maxLength ? cleaned.Substring(0, maxLength) : cleaned;
        }
Example #5
0
        /// <summary>
        /// Returns the first match of a regular expression in the content, with trailing underscores removed.
        /// </summary>
        /// <param name="sContent">Content to search.</param>
        /// <param name="sRegex">Pattern; compiled with IgnoreCase, IgnorePatternWhitespace and Multiline.</param>
        /// <returns>The whole first match without trailing '_' characters, or an empty string when nothing matches.</returns>
        public static string GetTextByReg(string sContent, string sRegex)
        {
            Regex re = new Regex(sRegex, RegexOptions.IgnoreCase | RegexOptions.IgnorePatternWhitespace | RegexOptions.Multiline);
            Match mc = re.Match(sContent);

            if (!mc.Success)
            {
                return "";
            }

            // TrimEnd strips every trailing underscore in one ordinal pass. This avoids a
            // culture-sensitive EndsWith test and cannot spin forever the way a
            // "while it ends with '_', call a helper" loop can if the helper leaves the string unchanged.
            return mc.Value.TrimEnd('_');
        }
Example #6
0
        /// <summary>
        /// Builds an absolute URL from a link found on a page and that page's own URL.
        /// </summary>
        /// <param name="sInput">URL of the page; _GetStandardUrlDepth and GetDomain derive the base from it.</param>
        /// <param name="sRelativeUrl">
        /// Link to resolve: absolute (starts with "http", any casing), root-relative ("/x"),
        /// parent-relative ("../x"), current-directory ("./x") or a bare relative path.
        /// Surrounding whitespace is ignored.
        /// </param>
        /// <returns>
        /// The resolved URL. A null, empty or whitespace-only link resolves to the base
        /// returned by _GetStandardUrlDepth.
        /// </returns>
        public static string GetUrl(string sInput, string sRelativeUrl)
        {
            // Directory-style base of the page, of the form http://www.163.com/news/
            string sUrl = _GetStandardUrlDepth(sInput);

            // Trim once up front so the prefix tests below see the same text that is
            // appended later; a leading space must not hide an "http" or "/" prefix.
            string sRelative = (sRelativeUrl ?? "").Trim();

            if (sRelative.Length == 0)
            {
                // Nothing to resolve: the link points at the base itself.
                // (Previously the base was assigned to the parameter and "" was returned.)
                return(sUrl);
            }

            // "https" also starts with "http", so a single ordinal, case-insensitive test covers both.
            if (sRelative.StartsWith("http", System.StringComparison.OrdinalIgnoreCase))
            {
                return(sRelative);
            }

            if (sRelative.StartsWith("/", System.StringComparison.Ordinal))
            {
                return(GetDomain(sInput) + sRelative);
            }

            if (sRelative.StartsWith("../", System.StringComparison.Ordinal))
            {
                // Drop the last character of the base (its trailing "/") before climbing.
                sUrl = sUrl.Substring(0, sUrl.Length - 1);

                // Only LEADING "../" segments climb a level. A "../" further inside the
                // path must not cause three arbitrary characters to be cut off the front.
                while (sRelative.StartsWith("../", System.StringComparison.Ordinal))
                {
                    // Presumably the part of sUrl before its last "/" - confirm against CString.GetPreStrByLast.
                    string sParent = CString.GetPreStrByLast(sUrl, "/");

                    // Stop climbing once the cut would reach the scheme separator ("http:/" or "https:/").
                    // Pages sometimes contain more "../" than the URL has levels; browsers still
                    // resolve those, so the extra segments are simply ignored.
                    if (sParent.Length > 6 && !sParent.EndsWith(":/", System.StringComparison.Ordinal))
                    {
                        sUrl = sParent;
                    }
                    sRelative = sRelative.Substring(3);
                }
                return(sUrl + "/" + sRelative);
            }

            if (sRelative.StartsWith("./", System.StringComparison.Ordinal))
            {
                return(sUrl + sRelative.Substring(2));
            }

            // Bare relative path, e.g. 2007images/modecss.css
            return(sUrl + sRelative);
        }
Example #7
0
        /// <summary>
        /// Returns the anchor elements in the content whose link text is long enough
        /// and contains at least one of the given keywords.
        /// </summary>
        /// <param name="sContent">HTML to scan for anchor elements that carry an href attribute.</param>
        /// <param name="listKey">
        /// Keywords, matched case-insensitively as literal text (spaces, '#' and regex
        /// metacharacters are taken verbatim). Null or an empty list means no keyword restriction.
        /// </param>
        /// <returns>
        /// The raw markup of every accepted anchor, in document order. When no anchor
        /// qualifies, the result of GetLinksByKeyFromRss for the same content and keywords.
        /// </returns>
        public static List <string> GetLinksByKey(string sContent, /*string sUrl,*/ List <string> listKey)
        {
            const RegexOptions options = RegexOptions.IgnoreCase | RegexOptions.IgnorePatternWhitespace | RegexOptions.Multiline;

            List <string> listResult = new List <string>();
            List <string> keys       = listKey ?? new List <string>();

            // Prepare the keyword pattern. Each keyword is escaped because the pattern is
            // compiled with IgnorePatternWhitespace: unescaped, a space inside a keyword is
            // dropped, '#' comments out the rest of the pattern, and metacharacters such as
            // '+' or '(' can make the constructor throw.
            List <string> escapedKeys = new List <string>(keys.Count);
            foreach (string s in keys)
            {
                escapedKeys.Add(Regex.Escape(s ?? ""));
            }
            // With no keywords at all, any non-empty text is accepted.
            string sKey  = escapedKeys.Count > 0 ? string.Join("|", escapedKeys.ToArray()) : "[\\s\\S]+";
            Regex  reKey = new Regex(sKey, options);

            Regex reAnchor = new Regex(@"<a\s+[^>]*href\s*=\s*[^>]+>[\s\S]*?</a>", options);
            // The anchor only matches when its inner content is at least five characters long.
            Regex reText   = new Regex(@"<a\s+[^>]+>([\s\S]{5,})?</a>", options);

            foreach (Match mc in reAnchor.Matches(sContent))
            {
                string sAnchor = mc.Value;

                // Skip addresses that start with "http" and end with "/"
                // (presumably RemoveByReg strips the matched text, leaving nothing behind).
                if (RemoveByReg(GetLink(sAnchor), @"^http.*/$").Length == 0)
                {
                    continue;
                }

                Match mText = reText.Match(sAnchor);
                if (!mText.Success)
                {
                    continue;
                }

                // Clean the link text and strip boilerplate words before measuring it.
                string sText = CString.ClearTag(mText.Groups[1].Value.Trim());
                sText = RemoveByReg(sText, @"更多|登录|添加|推荐|收藏夹|加盟|关于|订阅|阅读器|我的|有限|免费|公司|more|RSS|about|\.");

                // A minimum length screens out junk links.
                // NOTE(review): CString.GetLength presumably counts wide characters double - confirm.
                if (CString.GetLength(sText) > 8 && reKey.IsMatch(sText))
                {
                    listResult.Add(sAnchor);
                }
            }

            // RSS support: when the HTML scan finds nothing, fall back to the RSS extractor.
            if (listResult.Count == 0)
            {
                return(GetLinksByKeyFromRss(sContent, keys));
            }

            return(listResult);
        }