C# (CSharp) LSLibrary CRegex.GetDomain Examples

Programming Language: C# (CSharp)

Namespace/Package Name: LSLibrary

Class/Type: CRegex

Method/Function: GetDomain

Examples at hotexamples.com: 2

C# (CSharp) LSLibrary CRegex.GetDomain - 2 examples found. These are the top rated real world C# (CSharp) examples of LSLibrary.CRegex.GetDomain extracted from open source projects. You can rate examples to help us improve the quality of examples.

Frequently Used Methods

Show Hide

GetUrl(4)

Replace(3)

GetDomain(2)

Frequently Used Methods

GetUrl (4)

Replace (3)

GetDomain (2)

Example #1

Show file

        public static Dictionary <string, string> GetLinks(string sContent, string sUrl, ref Dictionary <string, string> lisDes)
        {
            Dictionary <string, string> lisA = new Dictionary <string, string>();

            _GetLinks(sContent, sUrl, ref lisA);

            string domain = CRegex.GetDomain(sUrl).ToLower();

            //抓取脚本输出的链接
            Regex           re  = new Regex(@"<script[^>]+src\s*=\s*(?:'(?<src>[^']+)'|""(?<src>[^""]+)""|(?<src>[^>\s]+))\s*[^>]*>", RegexOptions.Multiline | RegexOptions.IgnorePatternWhitespace | RegexOptions.IgnoreCase);
            MatchCollection mcs = re.Matches(sContent);

            //foreach (Match mc in mcs)
            for (int i = mcs.Count - 1; i >= 0; i--)
            {
                Match  mc     = mcs[i];
                string subUrl = CRegex.GetUrl(sUrl, mc.Groups["src"].Value);
                if (domain.CompareTo(CRegex.GetDomain(subUrl).ToLower()) != 0)
                {
                    //同一域的才提炼
                    continue;
                }
                string subContent = CSocket.GetHtmlByUrl(subUrl);
                if (subContent.Length == 0)
                {
                    continue;
                }
                _GetLinks(subContent, subUrl, ref lisA);
            }

            if (lisA.Count == 0)
            {
                return(GetLinksFromRss(sContent, sUrl, ref lisDes));
            }

            return(lisA);
        }

Example #2

Show file

        private static void _GetLinks(string sContent, string sUrl, ref Dictionary <string, string> lisA)
        {
            const string sFilter =
                @"首页|下载|中文|English|反馈|讨论区|投诉|建议|联系|关于|about|诚邀|工作|简介|新闻|掠影|风采
|登录|注销|注册|使用|体验|立即|收藏夹|收藏|添加|加入
|更多|more|专题|精选|热卖|热销|推荐|精彩
|加盟|联盟|友情|链接|相关
|订阅|阅读器|RSS
|免责|条款|声明|我的|我们|组织|概况|有限|免费|公司|法律|导航|广告|地图|隐私
|〖|〗|【|】|（|）|［|］|『|』|\.";

            Regex           re  = new Regex(@"<a\s+[^>]*href\s*=\s*[^>]+>[\s\S]*?</a>", RegexOptions.IgnoreCase | RegexOptions.IgnorePatternWhitespace | RegexOptions.Multiline);
            Regex           re2 = new Regex(@"""|'", RegexOptions.IgnoreCase | RegexOptions.IgnorePatternWhitespace | RegexOptions.Multiline);
            MatchCollection mcs = re.Matches(sContent);

            //foreach (Match mc in mcs)
            for (int i = mcs.Count - 1; i >= 0; i--)
            {
                Match  mc      = mcs[i];
                string strHref = GetLink(mc.Value).Trim();

                strHref = strHref.Replace("\\\"", "");//针对JS输出链接
                strHref = strHref.Replace("\\\'", "");

                string strTemp = RemoveByReg(strHref, @"^http.*/$");//屏蔽以“http”开头“/”结尾的链接地址
                if (strTemp.Length < 2)
                {
                    continue;
                }

                //过滤广告或无意义的链接
                string strText = CString.ClearTag(GetTextByLink(mc.Value)).Trim();
                strTemp = RemoveByReg(strText, sFilter);
                if (CString.GetLength(strTemp) < 9)
                {
                    continue;
                }
                if (re2.IsMatch(strText))
                {
                    continue;
                }

                //换上绝对地址
                strHref = CText.GetUrlByRelative(sUrl, strHref);
                if (strHref.Length <= 18)//例如，http://www.163.com = 18
                {
                    continue;
                }

                //计算#字符出现的位置，移除它后面的内容
                //如果是域名地址，就跳过
                int charIndex = strHref.IndexOf('#');
                if (charIndex > -1)
                {
                    strHref = strHref.Substring(0, charIndex);
                }
                strHref = strHref.Trim(new char[] { '/', '\\' });
                string tmpDomainURL = CRegex.GetDomain(strHref);
                if (strHref.Equals(tmpDomainURL, StringComparison.OrdinalIgnoreCase))
                {
                    continue;
                }

                if (!lisA.ContainsKey(strHref) && !lisA.ContainsValue(strText))
                {
                    lisA.Add(strHref, strText);
                }
            }
        }