Beispiel #1
0
        /// <summary>
        /// Guesses candidate "next page" URLs for <paramref name="baseUrl"/>.
        /// </summary>
        /// <remarks>
        /// Two strategies are applied to the last path segment (the text after the
        /// final '/'), leaving the rest of the URL and the segment's extension intact:
        /// 1. If a number can be extracted from the segment name, the same URL with
        ///    that number incremented by one.
        /// 2. The same URL with "_2" appended to the segment name.
        /// Candidates are returned in that order; strategy 1 is omitted when no
        /// number can be parsed.
        /// </remarks>
        /// <param name="baseUrl">URL of the current (first) page.</param>
        /// <returns>
        /// The candidate URLs. Empty when <paramref name="baseUrl"/> is null or empty,
        /// or when its last path segment has no name (for example, it ends with '/').
        /// </returns>
        public static List <string> GetPageDownUrlAuto(string baseUrl)
        {
            List <string> list = new List <string>();

            if (string.IsNullOrEmpty(baseUrl))
            {
                return(list);
            }

            // Split the URL into "prefix/" + name + ".extension". Working on the parts
            // and re-joining them guarantees that only the last segment is rewritten,
            // even when the same text also occurs in the host or an earlier segment.
            var segmentStart = baseUrl.LastIndexOf('/') + 1;
            var prefix       = baseUrl.Substring(0, segmentStart);
            var segment      = baseUrl.Substring(segmentStart);
            var dotIndex     = segment.LastIndexOf('.');
            var name         = dotIndex >= 0 ? segment.Substring(0, dotIndex) : segment;
            var suffix       = dotIndex >= 0 ? segment.Substring(dotIndex) : "";

            if (name.Length == 0)
            {
                // Nothing to increment or to append "_2" to.
                return(list);
            }

            // NOTE(review): RegexUtil.ExtractDigit is defined elsewhere; presumably it
            // returns the digit run of the segment that represents the page number - confirm.
            var pageStr = RegexUtil.ExtractDigit(name);
            int page;

            // Strategy 1: increment the number found in the segment name.
            // Skipped when nothing parses, and at int.MaxValue to avoid wrapping around.
            if (int.TryParse(pageStr, out page) && page < int.MaxValue)
            {
                list.Add(prefix + name.Replace(pageStr, (page + 1).ToString()) + suffix);
            }

            // Strategy 2: append "_2" to the segment name.
            list.Add(prefix + name + "_2" + suffix);
            return(list);
        }
Beispiel #2
0
        /// <summary>
        /// Derives the URL of the page that follows <paramref name="pageDownUrl"/> by
        /// comparing it with <paramref name="baseUrl"/> and incrementing the number
        /// found in the part where the two URLs differ.
        /// </summary>
        /// <remarks>
        /// Only simple pagination schemes are handled: the differing region (what is
        /// left of <paramref name="pageDownUrl"/> after removing the prefix and suffix
        /// it shares with <paramref name="baseUrl"/>) must contain an ASCII digit.
        /// The whole digit run around that digit is incremented in place, so digits
        /// elsewhere in the URL are never touched. Leading zeros are preserved
        /// ("p_02" becomes "p_03").
        /// </remarks>
        /// <param name="baseUrl">URL of the current page; null is treated as empty.</param>
        /// <param name="pageDownUrl">URL of the known next page.</param>
        /// <returns>
        /// <paramref name="pageDownUrl"/> with its page number incremented by one, or
        /// <paramref name="pageDownUrl"/> unchanged when no page number can be
        /// identified (identical URLs, no digit in the differing region, or a number
        /// that does not fit in an <c>int</c>).
        /// </returns>
        public static string GetPageDownUrlManual(string baseUrl, string pageDownUrl)
        {
            if (string.IsNullOrEmpty(pageDownUrl))
            {
                return(pageDownUrl);
            }

            if (baseUrl == null)
            {
                baseUrl = "";
            }

            // Length of the prefix shared by both URLs (bounded by the shorter one).
            var prefixLength = 0;

            while (prefixLength < baseUrl.Length &&
                   prefixLength < pageDownUrl.Length &&
                   baseUrl[prefixLength] == pageDownUrl[prefixLength])
            {
                prefixLength++;
            }

            // Length of the shared suffix, never overlapping the shared prefix.
            var suffixLength = 0;

            while (suffixLength < baseUrl.Length - prefixLength &&
                   suffixLength < pageDownUrl.Length - prefixLength &&
                   baseUrl[baseUrl.Length - 1 - suffixLength] == pageDownUrl[pageDownUrl.Length - 1 - suffixLength])
            {
                suffixLength++;
            }

            // Exclusive end of the region of pageDownUrl that differs from baseUrl.
            var diffEnd = pageDownUrl.Length - suffixLength;

            // Find the first ASCII digit inside the differing region.
            var digitIndex = prefixLength;

            while (digitIndex < diffEnd && (pageDownUrl[digitIndex] < '0' || pageDownUrl[digitIndex] > '9'))
            {
                digitIndex++;
            }

            if (digitIndex >= diffEnd)
            {
                return(pageDownUrl);
            }

            // Widen to the complete digit run: with "list_10" -> "list_11" only the last
            // digit differs, but the page number is the whole "11".
            var runStart = digitIndex;

            while (runStart > 0 && pageDownUrl[runStart - 1] >= '0' && pageDownUrl[runStart - 1] <= '9')
            {
                runStart--;
            }

            var runEnd = digitIndex + 1;

            while (runEnd < pageDownUrl.Length && pageDownUrl[runEnd] >= '0' && pageDownUrl[runEnd] <= '9')
            {
                runEnd++;
            }

            var pageStr = pageDownUrl.Substring(runStart, runEnd - runStart);
            int page;

            // Give up rather than emit a wrong URL when the run is too long for an int
            // or incrementing it would wrap around.
            if (!int.TryParse(pageStr, out page) || page == int.MaxValue)
            {
                return(pageDownUrl);
            }

            var nextPage = (page + 1).ToString().PadLeft(pageStr.Length, '0');

            // Splice the incremented number back at the exact position of the old one.
            return(pageDownUrl.Substring(0, runStart) + nextPage + pageDownUrl.Substring(runEnd));
        }