예제 #1
0
        /// <summary>
        /// Initializes the extractor from a sample pagination fragment and builds the
        /// regular expression later used to recognise that fragment.
        /// </summary>
        /// <param name="paginationStr">
        /// Pagination fragment of a URL. Example: for http://a.com?p=40 the fragment is p=40.
        /// </param>
        /// <param name="termination">Terminator for the extraction; may be null.</param>
        /// <exception cref="SpiderException">
        /// Thrown when <paramref name="paginationStr"/> is null, empty or whitespace only.
        /// </exception>
        protected PaginationTargetUrlsExtractor(string paginationStr, ITargetUrlsExtractorTermination termination = null)
        {
            // Guard: a blank fragment cannot be turned into a usable pattern.
            if (string.IsNullOrWhiteSpace(paginationStr))
            {
                throw new SpiderException("paginationStr should not be null or empty");
            }

            PaginationStr = paginationStr;

            // Substitute whatever RegexUtil.Number matches (presumably digit runs - confirm)
            // with the literal text \d+, so the resulting pattern matches any page number.
            // NOTE(review): the rest of the fragment is handed to Regex unescaped, so
            // characters such as '?' or '.' are treated as regex syntax - confirm this is intended.
            var pagePattern = RegexUtil.Number.Replace(PaginationStr, @"\d+");
            PaginationPattern = new Regex(pagePattern);

            TargetUrlsExtractorTermination = termination;
        }
예제 #2
0
 /// <summary>
 /// Creates an extractor that stores the step to apply on each increment;
 /// the pagination fragment and the terminator are forwarded to the base constructor.
 /// </summary>
 /// <param name="paginationStr">
 /// The pagination part of the URL. For www.a.com/content_1.html this can be
 /// content_1.html, tent_1.html, and so on; the framework replaces the numeric part
 /// with \d+ and uses the result for regex matching and extraction.
 /// </param>
 /// <param name="interval">Step added on each increment.</param>
 /// <param name="termination">
 /// Terminator used to decide whether the last link that needs to be collected has been reached.
 /// </param>
 public AutoIncrementTargetUrlsExtractor(string paginationStr, int interval = 1, ITargetUrlsExtractorTermination termination = null) : base(paginationStr, termination)
 {
     // Only the step is kept here; everything else is handled by the base constructor.
     this._interval = interval;
 }