Exemple #1
0
        /// <summary>
        /// Scans <paramref name="string_0"/> for <c>&lt;a ... href=...&gt;...&lt;/a&gt;</c> elements and adds
        /// each accepted link to <paramref name="dictionary_0"/> as (absolute URL -> link text).
        /// </summary>
        /// <param name="string_0">Markup to scan for anchor elements.</param>
        /// <param name="string_1">Base URL passed to <c>GetUrlByRelative</c> to resolve each href.</param>
        /// <param name="dictionary_0">
        /// Receives the accepted links. An entry is added only when neither its URL (key) nor its
        /// text (value) is already present.
        /// </param>
        /// <remarks>
        /// Matches are visited from last to first, so when two anchors share a URL or a text the one
        /// that appears later in the markup is the one that is kept.
        /// </remarks>
        private static void smethod_0(string string_0, string string_1, ref Dictionary <string, string> dictionary_0)
        {
            const RegexOptions options = RegexOptions.IgnorePatternWhitespace | RegexOptions.Multiline | RegexOptions.IgnoreCase;

            // Words/characters removed from the link text before its length is measured.
            // The ASCII brackets are escaped on purpose: an unescaped "(|)" is a group that matches
            // the empty string, and since alternation is ordered it also prevented every alternative
            // after it (the corner brackets and "\.") from ever matching; "[|]" was a character
            // class matching '|' rather than the brackets themselves.
            // NOTE(review): the embedded "\r\n" sequences are only harmless if RemoveByReg compiles
            // the pattern with IgnorePatternWhitespace - confirm against RemoveByReg.
            const string textFilter =
                "首页|下载|中文|English|反馈|讨论区|投诉|建议|联系|关于|about|诚邀|工作|简介|新闻|掠影|风采\r\n" +
                "|登录|注销|注册|使用|体验|立即|收藏夹|收藏|添加|加入\r\n" +
                "|更多|more|专题|精选|热卖|热销|推荐|精彩\r\n" +
                "|加盟|联盟|友情|链接|相关\r\n" +
                "|订阅|阅读器|RSS\r\n" +
                "|免责|条款|声明|我的|我们|组织|概况|有限|免费|公司|法律|导航|广告|地图|隐私\r\n" +
                "|〖|〗|【|】|\\(|\\)|\\[|\\]|『|』|\\.";

            Regex           anchorRegex = new Regex(@"<a\s+[^>]*href\s*=\s*[^>]+>[\s\S]*?</a>", options);
            Regex           quoteRegex  = new Regex("\"|'", options);
            MatchCollection anchors     = anchorRegex.Matches(string_0);

            for (int i = anchors.Count - 1; i >= 0; i--)
            {
                string anchorHtml = anchors[i].Value;

                // Drop escaped quotes (\" and \') left in the extracted href.
                string href = GetLink(anchorHtml).Trim().Replace("\\\"", "").Replace(@"\'", "");

                // Presumably RemoveByReg deletes pattern matches, so an href of the form
                // "http.../" collapses to an empty string and is rejected here - verify.
                if (RemoveByReg(href, "^http.*/$").Length < 2)
                {
                    continue;
                }

                string text = CString.ClearTag(GetTextByLink(anchorHtml)).Trim();

                // Reject texts that are too short once the filter words are removed,
                // and texts that contain a quote character.
                if (CString.GetLength(RemoveByReg(text, textFilter)) < 9 || quoteRegex.IsMatch(text))
                {
                    continue;
                }

                href = GetUrlByRelative(string_1, href);
                if (href.Length <= 18)
                {
                    continue;
                }

                // Cut off the fragment part ('#' and everything after it).
                int fragmentStart = href.IndexOf('#');
                if (fragmentStart > -1)
                {
                    href = href.Substring(0, fragmentStart);
                }
                href = href.Trim(new char[] { '/', '\\' });

                // Skip links whose URL equals the value returned by CRegex.GetDomain.
                string domain = CRegex.GetDomain(href);
                if (href.Equals(domain, StringComparison.OrdinalIgnoreCase))
                {
                    continue;
                }

                if (!dictionary_0.ContainsKey(href) && !dictionary_0.ContainsValue(text))
                {
                    dictionary_0.Add(href, text);
                }
            }
        }
Exemple #2
0
        /// <summary>
        /// Returns the <c>&lt;a&gt;</c> elements found in <paramref name="sContent"/> whose cleaned link
        /// text contains at least one of the keywords in <paramref name="listKey"/>.
        /// </summary>
        /// <param name="sContent">Markup to scan for anchor elements.</param>
        /// <param name="listKey">
        /// Keywords to look for, matched case-insensitively as literal text. When the list is empty
        /// every sufficiently long link text is accepted.
        /// </param>
        /// <returns>
        /// The matching anchor elements (full markup of each). When none match, the result of
        /// <c>GetLinksByKeyFromRss(sContent, listKey)</c> is returned instead.
        /// </returns>
        public static List <string> GetLinksByKey(string sContent, List <string> listKey)
        {
            const RegexOptions options = RegexOptions.IgnorePatternWhitespace | RegexOptions.Multiline | RegexOptions.IgnoreCase;

            // Step 1: collect candidate anchors.
            // Presumably RemoveByReg deletes pattern matches, so hrefs of the form "http.../"
            // become empty and are dropped - verify against RemoveByReg.
            List <string> candidates  = new List <string>();
            Regex         anchorRegex = new Regex(@"<a\s+[^>]*href\s*=\s*[^>]+>[\s\S]*?</a>", options);

            foreach (Match match in anchorRegex.Matches(sContent))
            {
                if (RemoveByReg(GetLink(match.Value), "^http.*/$").Length > 0)
                {
                    candidates.Add(match.Value);
                }
            }

            // Step 2: build the keyword pattern.
            // Each keyword is escaped so regex metacharacters in it are matched literally and cannot
            // break the pattern; Regex.Escape also escapes whitespace and '#', which would otherwise
            // be discarded or treated as a comment under IgnorePatternWhitespace.
            // Wrapping each keyword in [\s\S]* is unnecessary: an unanchored Match already succeeds
            // when the keyword occurs anywhere in the text.
            List <string> escapedKeys = new List <string>();

            foreach (string key in listKey)
            {
                escapedKeys.Add(Regex.Escape(key ?? ""));
            }
            string pattern = string.Join("|", escapedKeys.ToArray());
            if (pattern.Length == 0)
            {
                pattern = @"[\s\S]+";
            }
            Regex keyRegex = new Regex(pattern, options);

            // Step 3: keep anchors whose inner content is at least 5 characters, and whose cleaned
            // text is longer than 8 (as measured by CString.GetLength) and contains a keyword.
            List <string> result    = new List <string>();
            Regex         textRegex = new Regex(@"<a\s+[^>]+>([\s\S]{5,})?</a>", options);

            foreach (string anchor in candidates)
            {
                Match textMatch = textRegex.Match(anchor);
                if (!textMatch.Success)
                {
                    continue;
                }

                string text = RemoveByReg(CString.ClearTag(textMatch.Groups[1].Value.Trim()), @"更多|登录|添加|推荐|收藏夹|加盟|关于|订阅|阅读器|我的|有限|免费|公司|more|RSS|about|\.");
                if ((CString.GetLength(text) > 8) && keyRegex.IsMatch(text))
                {
                    result.Add(anchor);
                }
            }

            // Fall back to the RSS-based lookup when no anchor matched.
            if (result.Count == 0)
            {
                return(GetLinksByKeyFromRss(sContent, listKey));
            }
            return(result);
        }
        /// <summary>
        /// Extracts the <c>&lt;a&gt;</c> elements of <paramref name="sContent"/> whose cleaned link text
        /// contains at least one keyword from <paramref name="listKey"/>.
        /// </summary>
        /// <param name="sContent">Markup to scan for anchor elements.</param>
        /// <param name="listKey">
        /// Keywords, matched case-insensitively as literal text. An empty list accepts every
        /// sufficiently long link text.
        /// </param>
        /// <returns>
        /// The full markup of each matching anchor; when nothing matches, whatever
        /// <c>GetLinksByKeyFromRss(sContent, listKey)</c> returns.
        /// </returns>
        public static List <string> GetLinksByKey(string sContent, /*string sUrl,*/ List <string> listKey)
        {
            const RegexOptions reOptions = RegexOptions.IgnoreCase | RegexOptions.IgnorePatternWhitespace | RegexOptions.Multiline;

            List <string> listResult = new List <string>();
            List <string> list       = new List <string>();
            string        strKey;

            // Extract the links.
            Regex re = new Regex(@"<a\s+[^>]*href\s*=\s*[^>]+>[\s\S]*?</a>", reOptions);

            foreach (Match mc in re.Matches(sContent))
            {
                // Filter out link addresses that start with "http" and end with "/".
                // (Presumably RemoveByReg deletes the matched text - verify.)
                strKey = RemoveByReg(GetLink(mc.Value), @"^http.*/$");
                if (strKey.Length > 0)
                {
                    list.Add(mc.Value);
                }
            }

            // Prepare the keywords.
            // Keywords are escaped so that regex metacharacters inside them are matched literally;
            // Regex.Escape also protects spaces and '#', which IgnorePatternWhitespace would
            // otherwise drop or read as a comment. No [\s\S]* padding is needed because an
            // unanchored Match finds the keyword anywhere in the text.
            List <string> listEscaped = new List <string>(listKey.Count);

            foreach (string s in listKey)
            {
                listEscaped.Add(Regex.Escape(s ?? ""));
            }
            string sKey = string.Join("|", listEscaped.ToArray());
            if (sKey == "")
            {
                sKey = "[\\s\\S]+";
            }
            Regex reKey = new Regex(sKey, reOptions);

            // The anchor's inner content must be at least 5 characters long to count as valid.
            re = new Regex(@"<a\s+[^>]+>([\s\S]{5,})?</a>", reOptions);
            foreach (string s in list)
            {
                Match tmpmc = re.Match(s);
                if (!tmpmc.Success)
                {
                    continue;
                }

                strKey = CString.ClearTag(tmpmc.Groups[1].Value.Trim());
                strKey = RemoveByReg(strKey, @"更多|登录|添加|推荐|收藏夹|加盟|关于|订阅|阅读器|我的|有限|免费|公司|more|RSS|about|\.");

                // The minimum length is there to filter out junk entries.
                if (CString.GetLength(strKey) > 8 && reKey.IsMatch(strKey))
                {
                    listResult.Add(s);
                }
            }

            // RSS support: fall back to the RSS-based lookup when no anchor matched.
            if (listResult.Count == 0)
            {
                return(GetLinksByKeyFromRss(sContent, listKey));
            }

            return(listResult);
        }
        /// <summary>
        /// Scans <paramref name="sContent"/> for <c>&lt;a ... href=...&gt;...&lt;/a&gt;</c> elements and adds
        /// each accepted link to <paramref name="lisA"/> as (absolute URL -> link text).
        /// </summary>
        /// <param name="sContent">Markup to scan for anchor elements.</param>
        /// <param name="sUrl">Base URL passed to <c>CText.GetUrlByRelative</c> to resolve each href.</param>
        /// <param name="lisA">
        /// Receives the accepted links. An entry is added only when neither its URL (key) nor its
        /// text (value) is already present.
        /// </param>
        /// <remarks>
        /// Matches are visited from last to first, so when two anchors share a URL or a text the one
        /// that appears later in the markup is the one that is kept.
        /// </remarks>
        private static void _GetLinks(string sContent, string sUrl, ref Dictionary <string, string> lisA)
        {
            // Words/characters removed from the link text before its length is measured.
            // The ASCII brackets are escaped on purpose: an unescaped "(|)" is a group that matches
            // the empty string, and since alternation is ordered it also prevented every alternative
            // after it (the corner brackets and "\.") from ever matching; "[|]" was a character
            // class matching '|' rather than the brackets themselves.
            // NOTE(review): the line breaks inside this literal are only harmless if RemoveByReg
            // compiles the pattern with IgnorePatternWhitespace - confirm against RemoveByReg.
            const string sFilter =
                @"首页|下载|中文|English|反馈|讨论区|投诉|建议|联系|关于|about|诚邀|工作|简介|新闻|掠影|风采
|登录|注销|注册|使用|体验|立即|收藏夹|收藏|添加|加入
|更多|more|专题|精选|热卖|热销|推荐|精彩
|加盟|联盟|友情|链接|相关
|订阅|阅读器|RSS
|免责|条款|声明|我的|我们|组织|概况|有限|免费|公司|法律|导航|广告|地图|隐私
|〖|〗|【|】|\(|\)|\[|\]|『|』|\.";

            const RegexOptions reOptions = RegexOptions.IgnoreCase | RegexOptions.IgnorePatternWhitespace | RegexOptions.Multiline;

            Regex           re  = new Regex(@"<a\s+[^>]*href\s*=\s*[^>]+>[\s\S]*?</a>", reOptions);
            Regex           re2 = new Regex(@"""|'", reOptions);
            MatchCollection mcs = re.Matches(sContent);

            for (int i = mcs.Count - 1; i >= 0; i--)
            {
                Match  mc      = mcs[i];
                string strHref = GetLink(mc.Value).Trim();

                // Handle links emitted by JavaScript: drop escaped quotes (\" and \').
                strHref = strHref.Replace("\\\"", "");
                strHref = strHref.Replace("\\\'", "");

                // Filter out link addresses that start with "http" and end with "/".
                // (Presumably RemoveByReg deletes the matched text - verify.)
                string strTemp = RemoveByReg(strHref, @"^http.*/$");
                if (strTemp.Length < 2)
                {
                    continue;
                }

                // Filter out advertisements and meaningless links: the text must still be long
                // enough after the filter words are removed, and must not contain a quote.
                string strText = CString.ClearTag(GetTextByLink(mc.Value)).Trim();
                strTemp = RemoveByReg(strText, sFilter);
                if (CString.GetLength(strTemp) < 9)
                {
                    continue;
                }
                if (re2.IsMatch(strText))
                {
                    continue;
                }

                // Switch to the absolute address.
                strHref = CText.GetUrlByRelative(sUrl, strHref);
                if (strHref.Length <= 18)// e.g. http://www.163.com = 18
                {
                    continue;
                }

                // Find the '#' character and remove everything from it onwards.
                int charIndex = strHref.IndexOf('#');
                if (charIndex > -1)
                {
                    strHref = strHref.Substring(0, charIndex);
                }
                strHref = strHref.Trim(new char[] { '/', '\\' });

                // Skip the link when it is just the domain address.
                string tmpDomainURL = CRegex.GetDomain(strHref);
                if (strHref.Equals(tmpDomainURL, StringComparison.OrdinalIgnoreCase))
                {
                    continue;
                }

                if (!lisA.ContainsKey(strHref) && !lisA.ContainsValue(strText))
                {
                    lisA.Add(strHref, strText);
                }
            }
        }