コード例 #1
0
        /// <summary>
        /// Builds a URL-matching regular expression from <paramref name="urlRule"/>, runs it over
        /// <paramref name="html"/>, and returns each distinct match after it has been passed
        /// through <c>GetNextUrl</c> together with <paramref name="parentUrl"/>.
        /// </summary>
        /// <param name="parentUrl">Forwarded unchanged to <c>GetNextUrl</c> for every match.</param>
        /// <param name="html">Text to scan. A null or empty value yields an empty list.</param>
        /// <param name="urlRule">
        /// Rule describing the wanted URLs. When it starts with <c>&lt;Regex:</c> it is read as
        /// <c>&lt;Regex:prefix&gt;</c>, plus every <c>&lt;Common:text&gt;</c> part found in the rule,
        /// plus an optional trailing <c>&lt;End:suffix&gt;</c>. Any other rule is run through
        /// <c>RegexString.RegexReplaceTrans</c> and used as the prefix of the pattern.
        /// </param>
        /// <returns>The values returned by <c>GetNextUrl</c>, in first-seen order, without duplicates.</returns>
        public List <string> GetNextLevelUrl(string parentUrl, string html, string urlRule)
        {
            // Rule example left by the original author: /category-<Regex:\S+>/goods-<Regex:\d+>.html

            // Lookbehind shared by both rule forms.
            // NOTE(review): "[href=|src=|open(]" is a character class, so it accepts any ONE of those
            // characters followed by a non-word character rather than the literal words "href=",
            // "src=" or "open(". Left untouched because tightening it would change which URLs match.
            const string linkLookbehind = @"(?<=[href=|src=|open(][\W])";

            // Default tail: consume up to the next quote or '>' and stop before whitespace or a quote.
            const string defaultTail = @"(\S[^'"">]*)(?=[\s'""])";

            const string regexTag = "<Regex:";
            const string endTag   = "<End:";

            string urlTempRule;

            // The markers are fixed ASCII text, so every search for them is ordinal; the
            // culture-sensitive defaults of StartsWith/IndexOf(string) are not appropriate here.
            if (urlRule.StartsWith(regexTag, System.StringComparison.Ordinal))
            {
                urlTempRule = linkLookbehind;

                // Prefix: the text between "<Regex:" and the first '>'.
                urlTempRule += urlRule.Substring(regexTag.Length, urlRule.IndexOf('>') - regexTag.Length);

                // Middle: every "<Common:...>" payload, each preceded by a free-form gap.
                Regex commonParts = new Regex(@"(?<=<Common:)\S+?(?=>)", RegexOptions.IgnoreCase | RegexOptions.Multiline);
                foreach (Match part in commonParts.Matches(urlRule))
                {
                    urlTempRule += @"(\S*)" + part.Value;
                }

                // Suffix: an explicit "<End:...>" wins over the default tail.
                int endTagIndex = urlRule.IndexOf(endTag, System.StringComparison.Ordinal);
                if (endTagIndex >= 0)
                {
                    // Everything after "<End:" except the rule's final character, which is
                    // expected to be the closing '>' of the tag.
                    int suffixStart = endTagIndex + endTag.Length;
                    urlTempRule += @"(\S*)" + urlRule.Substring(suffixStart, urlRule.Length - suffixStart - 1);
                }
                else
                {
                    urlTempRule += defaultTail;
                }
            }
            else
            {
                urlTempRule = linkLookbehind + RegexString.RegexReplaceTrans(urlRule) + defaultTail;
            }

            List<string> resultUrls = new List<string>();

            // Nothing to scan; checked after rule parsing so malformed rules still fail as before.
            if (string.IsNullOrEmpty(html))
            {
                return resultUrls;
            }

            // The set gives O(1) duplicate checks while the list keeps first-seen order.
            HashSet<string> seenUrls = new HashSet<string>();
            foreach (Match item in Regex.Matches(html, urlTempRule, RegexOptions.IgnoreCase | RegexOptions.Multiline))
            {
                string url = GetNextUrl(item.Value, parentUrl);
                if (seenUrls.Add(url))
                {
                    resultUrls.Add(url);
                }
            }

            return resultUrls;
        }
コード例 #2
0
        /// <summary>
        /// Assembles the extraction pattern for one column definition, shaped as
        /// <c>(?&lt;name&gt;start)filler(?=end)</c>.
        /// </summary>
        /// <param name="temp">
        /// Column definition supplying the group name (<c>DataTextType.Value</c>), the start and end
        /// markers (<c>StartPos.Value</c>, <c>EndPos.Value</c>, both passed through
        /// <c>RegexString.RegexReplaceTrans</c>) and the filler selector (<c>LimitSign</c>).
        /// </param>
        /// <returns>The concatenated regular-expression text.</returns>
        private string GetRegString(TaskColumnItem temp)
        {
            // Named group wrapping the translated start marker.
            string opening = "(?<" + temp.DataTextType.Value + ">" + RegexString.RegexReplaceTrans(temp.StartPos.Value) + ")";

            // Lazy filler between the start and end markers, chosen by the limit sign.
            string filler;
            switch (temp.LimitSign)
            {
            case EnumGloabParas.EnumLimitSign.LimitSign1:
                // Any characters matched by '.'.
                filler = ".*?";
                break;

            case EnumGloabParas.EnumLimitSign.LimitSign2:
                // Anything except angle brackets.
                filler = "[^<>]*?";
                break;

            case EnumGloabParas.EnumLimitSign.LimitSign3:
                // Characters in the U+4E00..U+9FA5 range only.
                filler = @"[\u4e00-\u9fa5]*?";
                break;

            case EnumGloabParas.EnumLimitSign.LimitSign4:
                // Characters outside U+0000..U+00FF only.
                filler = @"[^\x00-\xff]*?";
                break;

            case EnumGloabParas.EnumLimitSign.LimitSign5:
                // Digits only.
                filler = @"[\d]*?";
                break;

            case EnumGloabParas.EnumLimitSign.LimitSign6:
                // Characters inside U+0000..U+00FF only.
                filler = @"[\x00-\xff]*?";
                break;

            case EnumGloabParas.EnumLimitSign.LimitSign7:
                // Custom filler taken from the column definition, inserted without escaping.
                filler = temp.LimitSignText.ToString();
                break;

            default:
                // Any character, line breaks included.
                filler = @"[\S\s]*?";
                break;
            }

            // Lookahead for the translated end marker, so the marker itself is not consumed.
            string closing = "(?=" + RegexString.RegexReplaceTrans(temp.EndPos.Value) + ")";

            return opening + filler + closing;
        }