/// <summary>
/// Initializes the extractor from a sample pagination fragment of the target URL
/// and builds the regular expression used to locate that fragment.
/// </summary>
/// <param name="paginationStr">
/// Pagination fragment of the URL. For example, for http://a.com?p=40 the fragment is p=40.
/// It is treated as literal text: only its numeric part is generalized to \d+.
/// </param>
/// <param name="termination">Terminator for the extractor; optional, defaults to null.</param>
/// <exception cref="SpiderException">
/// Thrown when <paramref name="paginationStr"/> is null, empty, or consists only of white space.
/// </exception>
protected PaginationTargetUrlsExtractor(string paginationStr, ITargetUrlsExtractorTermination termination = null)
{
    if (string.IsNullOrWhiteSpace(paginationStr))
    {
        throw new SpiderException("paginationStr should not be null or empty");
    }

    PaginationStr = paginationStr;

    // Escape the fragment before generalizing its numbers: URL fragments routinely
    // contain regex metacharacters ('.', '?', '+', '(' ...). Unescaped, they either
    // change what the pattern matches or make the Regex constructor throw.
    // Regex.Escape never adds or alters digits, so the number replacement below still
    // sees the same digit runs (assumes RegexUtil.Number matches runs of digits - TODO confirm).
    var escapedFragment = Regex.Escape(PaginationStr);
    PaginationPattern = new Regex(RegexUtil.Number.Replace(escapedFragment, @"\d+"));

    TargetUrlsExtractorTermination = termination;
}
/// <summary>
/// Creates an extractor whose pagination value advances by a fixed interval.
/// </summary>
/// <param name="paginationStr">
/// The paginated part of the URL. For www.a.com/content_1.html this may be
/// content_1.html, tent_1.html, and so on; the numeric part is turned into \d+
/// so the fragment can be located with a regular expression.
/// </param>
/// <param name="interval">Step added on each increment; defaults to 1.</param>
/// <param name="termination">
/// Terminator used to decide whether the last link to be collected has been reached;
/// optional, defaults to null.
/// </param>
public AutoIncrementTargetUrlsExtractor(
    string paginationStr,
    int interval = 1,
    ITargetUrlsExtractorTermination termination = null)
    : base(paginationStr, termination)
{
    // The base constructor handles the fragment and terminator; only the step is kept here.
    _interval = interval;
}