/// <summary>
/// Reports whether a target-URL extractor entry exists for the given region.
/// </summary>
/// <param name="region">XPath of the region; null or whitespace resolves to the default selector.</param>
/// <returns>True when <c>_targetUrlExtractors</c> has an entry keyed by the resolved selector.</returns>
public virtual bool ContainsTargetUrlRegion(string region)
{
    ISelector key;
    if (string.IsNullOrWhiteSpace(region))
    {
        key = Selectors.Default();
    }
    else
    {
        key = Selectors.XPath(region);
    }

    return _targetUrlExtractors.ContainsKey(key);
}
/// <summary>
/// Test-only helper: checks whether patterns are registered for a region.
/// </summary>
/// <param name="region">XPath of the region; null or whitespace resolves to the default selector.</param>
/// <returns>True when <c>_regionSelectorMapPatterns</c> contains the resolved selector as a key.</returns>
internal bool ContainsTargetUrlRegion(string region)
{
    if (string.IsNullOrWhiteSpace(region))
    {
        return _regionSelectorMapPatterns.ContainsKey(Selectors.Default());
    }

    return _regionSelectorMapPatterns.ContainsKey(Selectors.XPath(region));
}
/// <summary>
/// Test-only helper: returns the patterns registered for a region.
/// </summary>
/// <param name="regionXpath">XPath of the region; null or whitespace resolves to the default selector.</param>
/// <returns>The pattern list stored for the resolved selector, or null when none is stored.</returns>
internal List<Regex> GetTargetUrlPatterns(string regionXpath)
{
    ISelector key;
    if (string.IsNullOrWhiteSpace(regionXpath))
    {
        key = Selectors.Default();
    }
    else
    {
        key = Selectors.XPath(regionXpath);
    }

    // TryGetValue leaves the out value at its default (null) when the key is absent.
    List<Regex> found;
    _regionSelectorMapPatterns.TryGetValue(key, out found);
    return found;
}
/// <summary>
/// Test-only helper: returns the patterns registered for a region.
/// </summary>
/// <param name="regionXpath">XPath of the region; null or whitespace resolves to the default selector.</param>
/// <returns>The pattern list stored in <c>_targetUrlExtractors</c> for the resolved selector, or null when none is stored.</returns>
public virtual List<Regex> GetTargetUrlPatterns(string regionXpath)
{
    ISelector key = Selectors.Default();
    bool regionGiven = !string.IsNullOrWhiteSpace(regionXpath);
    if (regionGiven)
    {
        key = Selectors.XPath(regionXpath);
    }

    if (!_targetUrlExtractors.ContainsKey(key))
    {
        return null;
    }

    return _targetUrlExtractors[key];
}
/// <summary>
/// Adds a target-link extraction rule.
/// </summary>
/// <param name="regionXpath">
/// XPath of the region that contains the target links; null or whitespace selects the default selector.
/// </param>
/// <param name="patterns">
/// Regular expressions that match target links. At least one is required and none may be null or blank.
/// Each pattern is trimmed before use; a pattern already registered for the region is skipped.
/// </param>
/// <exception cref="ArgumentException">
/// <paramref name="patterns"/> is null or empty, contains a null or blank entry,
/// or contains text that the <see cref="Regex"/> constructor rejects.
/// </exception>
public void AddTargetUrlExtractor(string regionXpath, params string[] patterns)
{
    if (patterns == null || patterns.Length == 0)
    {
        throw new ArgumentException("Patterns should not be null or empty.");
    }

    if (patterns.Any(p => string.IsNullOrWhiteSpace(p)))
    {
        throw new ArgumentException("Pattern value should not be null or empty.");
    }

    ISelector selector;
    if (string.IsNullOrWhiteSpace(regionXpath))
    {
        selector = Selectors.Default();
    }
    else
    {
        selector = Selectors.XPath(regionXpath.Trim());
    }

    bool regionKnown = _regionSelectorMapPatterns.ContainsKey(selector);
    var registered = regionKnown ? _regionSelectorMapPatterns[selector] : new List<Regex>();

    // A null entry means every URL in the region is already a target link,
    // so there is no point in registering further patterns.
    if (registered.Contains(null))
    {
        return;
    }

    // Compile everything BEFORE touching the map: if a pattern is invalid the
    // Regex constructor throws, and the region must not be left registered
    // with an empty list (an empty list is treated as "accept every link"
    // by the Extract overloads).
    var additions = new List<Regex>();
    foreach (var text in patterns.Select(p => p.Trim()))
    {
        bool alreadyKnown = registered.Any(r => r.ToString() == text)
                            || additions.Any(r => r.ToString() == text);
        if (!alreadyKnown)
        {
            additions.Add(new Regex(text));
        }
    }

    if (!regionKnown)
    {
        _regionSelectorMapPatterns.Add(selector, registered);
    }

    foreach (var regex in additions)
    {
        registered.Add(regex);
        // Regex.ToString() yields the pattern text the regex was built from.
        AddTargetUrlPatterns(regex.ToString());
    }
}
/// <summary>
/// Registers target-URL patterns for a region.
/// </summary>
/// <param name="regionXpath">
/// XPath of the region. Null selects the default selector; a blank value is replaced by ".".
/// </param>
/// <param name="patterns">
/// Regular expressions matching target URLs. When null or empty, a null entry is stored instead,
/// marking every URL in the region as a target link.
/// </param>
/// <exception cref="ArgumentNullException">An entry of <paramref name="patterns"/> is null or empty.</exception>
protected virtual void AddTargetUrlExtractor(string regionXpath, params string[] patterns)
{
    ISelector selector = Selectors.Default();
    if (regionXpath != null)
    {
        string trimmed = regionXpath.Trim();
        selector = Selectors.XPath(trimmed.Length == 0 ? "." : trimmed);
    }

    if (!_targetUrlExtractors.ContainsKey(selector))
    {
        _targetUrlExtractors.Add(selector, new List<Regex>());
    }

    var realPatterns = _targetUrlExtractors[selector];

    // A null entry already marks every URL in the region as a target link,
    // so no further patterns need to be registered.
    if (realPatterns.Contains(null))
    {
        return;
    }

    if (patterns == null || patterns.Length == 0)
    {
        realPatterns.Add(null);
        return;
    }

    foreach (var pattern in patterns)
    {
        if (string.IsNullOrEmpty(pattern))
        {
            // The single-string ArgumentNullException constructor takes the parameter
            // name, not the message; use the (paramName, message) overload so the
            // text ends up in Message.
            throw new ArgumentNullException("patterns", "Pattern should not be null.");
        }

        var realPattern = pattern.Trim();
        if (realPatterns.All(p => p.ToString() != realPattern))
        {
            realPatterns.Add(new Regex(realPattern));
        }
    }
}
/// <summary>
/// Extracts target links from a page.
/// </summary>
/// <param name="page">Page data.</param>
/// <param name="site">Site information.</param>
/// <returns>Requests for the target links.</returns>
protected override IEnumerable<Request> Extract(Page page, Site site)
{
    bool nothingRegistered = _regionSelectorMapPatterns == null || _regionSelectorMapPatterns.Count == 0;
    if (nothingRegistered)
    {
        return new Request[0];
    }

    var collected = new List<string>();

    foreach (var entry in _regionSelectorMapPatterns)
    {
        // Entries keyed by the default selector are skipped.
        if (Equals(entry.Key, Selectors.Default()))
        {
            continue;
        }

        IEnumerable<string> rawLinks;
        if (page.ContentType == ContentType.Html)
        {
            rawLinks = page.Selectable.SelectList(entry.Key).Links().GetValues();
        }
        else if (page.ContentType == ContentType.Json)
        {
            rawLinks = page.Selectable.SelectList(Selectors.Regex(RegexUtil.Url)).Links().GetValues();
        }
        else
        {
            // Other content types yield no links.
            continue;
        }

        if (rawLinks == null)
        {
            continue;
        }

        // check: consider carefully whether formatting should happen before or after
        // matching; the author leans toward before. Formatting target URLs implies
        // that start URLs should conform to the same rule.
        var decoded = new List<string>();
        foreach (string raw in rawLinks)
        {
            var formatted = FormateUrl(raw);
#if !NETSTANDARD
            decoded.Add(System.Web.HttpUtility.HtmlDecode(System.Web.HttpUtility.UrlDecode(formatted)));
#else
            decoded.Add(WebUtility.HtmlDecode(WebUtility.UrlDecode(formatted)));
#endif
        }

        // No patterns for this region: every link in it is taken as-is.
        if (entry.Value == null || entry.Value.Count == 0)
        {
            collected.AddRange(decoded);
            continue;
        }

        foreach (var matcher in entry.Value)
        {
            foreach (string candidate in decoded)
            {
                if (!matcher.IsMatch(candidate))
                {
                    continue;
                }

                bool excluded = ExcludeTargetUrlPatterns != null
                                && ExcludeTargetUrlPatterns.Any(rx => rx.IsMatch(candidate));
                if (!excluded)
                {
                    collected.Add(candidate);
                }
            }
        }
    }

    if (site.DownloadFiles)
    {
        var imageLinks = page.Selectable.SelectList(ImageSelector).GetValues();
        if (imageLinks != null && imageLinks.Count() > 0)
        {
            foreach (string imageLink in imageLinks)
            {
                bool excluded = ExcludeTargetUrlPatterns != null
                                && ExcludeTargetUrlPatterns.Any(rx => rx.IsMatch(imageLink));
                if (!excluded)
                {
                    collected.Add(imageLink);
                }
            }
        }
    }

    return collected.Select(url => new Request(url, page.Request.Extras) { Site = site });
}
/// <summary>
/// Extracts target links from a response.
/// </summary>
/// <param name="response">Result of the link request.</param>
/// <returns>
/// Requests for the target links. Each carries a copy of the originating request's properties
/// without the <c>Env.UrlPropertyKey</c> and <c>Env.TargetUrlPropertyKey</c> entries.
/// </returns>
protected override IEnumerable<Request> Extract(Response response)
{
    if (_regionSelectorMapPatterns == null || _regionSelectorMapPatterns.Count == 0)
    {
        return new Request[0];
    }

    var site = response.Request.Site;
    var resultUrls = new List<string>();

    foreach (var targetUrlExtractor in _regionSelectorMapPatterns)
    {
        // Entries keyed by the default selector are skipped.
        if (Equals(targetUrlExtractor.Key, Selectors.Default()))
        {
            continue;
        }

        List<string> requests;
        if (response.ContentType == ContentType.Json)
        {
            // JSON content is scanned with the URL regex rather than the region selector.
            requests = new List<string>(response.Selectable().SelectList(Selectors.Regex(RegexUtil.Url)).Links().GetValues());
        }
        else
        {
            requests = new List<string>(response.Selectable().SelectList(targetUrlExtractor.Key).Links().GetValues());
        }

        if (requests.Count == 0)
        {
            continue;
        }

        var decoded = new List<string>();
        foreach (string request in requests)
        {
#if !NETSTANDARD
            decoded.Add(System.Web.HttpUtility.HtmlDecode(System.Web.HttpUtility.UrlDecode(request)));
#else
            decoded.Add(System.Net.WebUtility.HtmlDecode(System.Net.WebUtility.UrlDecode(request)));
#endif
        }
        requests = decoded;

        // No patterns for this region: every link in it is taken as-is.
        if (targetUrlExtractor.Value == null || targetUrlExtractor.Value.Count == 0)
        {
            resultUrls.AddRange(requests);
            continue;
        }

        foreach (var regex in targetUrlExtractor.Value)
        {
            foreach (string link in requests)
            {
                if (!regex.IsMatch(link))
                {
                    continue;
                }

                bool isRequired = true;
                if (ExcludeTargetUrlPatterns != null)
                {
                    foreach (var excludeRegex in ExcludeTargetUrlPatterns)
                    {
                        if (excludeRegex.IsMatch(link))
                        {
                            isRequired = false;
                            break;
                        }
                    }
                }

                if (isRequired)
                {
                    resultUrls.Add(link);
                }
            }
        }
    }

    // Copy the originating request's properties, leaving out the URL keys.
    // NOTE(review): presumably each new Request supplies its own URL values for
    // these keys - confirm against the Request constructor.
    var properties = new Dictionary<string, dynamic>();
    foreach (var kv in response.Request.Properties)
    {
        if (kv.Key != Env.UrlPropertyKey && kv.Key != Env.TargetUrlPropertyKey)
        {
            properties.Add(kv.Key, kv.Value);
        }
    }

    // Hand the filtered copy to the new requests. Previously the copy was built
    // and then discarded, and the unfiltered properties were passed instead.
    return resultUrls.Select(url => new Request(url, properties) { Site = site });
}
/// <summary>
/// Adds the target requests found on a page. When nothing is found, no URL is added;
/// otherwise too many URLs would be returned.
/// </summary>
/// <param name="page">Page whose links are inspected and which receives the target requests.</param>
protected virtual void ExtractUrls(Page page)
{
    if (_targetUrlExtractors == null || _targetUrlExtractors.Count == 0)
    {
        return;
    }

    foreach (var targetUrlExtractor in _targetUrlExtractors)
    {
        // Entries keyed by the default selector are skipped.
        if (Equals(targetUrlExtractor.Key, Selectors.Default()))
        {
            continue;
        }

        var links = page.Selectable.SelectList(targetUrlExtractor.Key).Links().GetValues();
        if (links == null)
        {
            continue;
        }

        // check: consider carefully whether formatting should happen before or after
        // matching; the author leans toward before. Formatting target URLs implies
        // that start URLs should conform to the same rule.
        List<string> tmp = new List<string>();
        foreach (string link in links)
        {
            var url = FormateUrl(link);
#if !NET_CORE
            tmp.Add(HttpUtility.HtmlDecode(HttpUtility.UrlDecode(url)));
#else
            tmp.Add(WebUtility.HtmlDecode(WebUtility.UrlDecode(url)));
#endif
        }
        links = tmp;

        // A missing or empty pattern list means every link in the region is a target.
        // A null entry in the list is the "match everything" marker stored by
        // AddTargetUrlExtractor when it is called without patterns; it must be
        // handled here, because calling IsMatch on it would throw NullReferenceException.
        if (targetUrlExtractor.Value == null
            || targetUrlExtractor.Value.Count == 0
            || targetUrlExtractor.Value.Contains(null))
        {
            page.AddTargetRequests(links);
            continue;
        }

        foreach (var regex in targetUrlExtractor.Value)
        {
            foreach (string link in links)
            {
                if (!regex.IsMatch(link))
                {
                    continue;
                }

                bool isRequired = true;
                if (_excludeTargetUrlPatterns != null)
                {
                    foreach (var excludeRegex in _excludeTargetUrlPatterns)
                    {
                        if (excludeRegex.IsMatch(link))
                        {
                            isRequired = false;
                            break;
                        }
                    }
                }

                if (isRequired)
                {
                    page.AddTargetRequest(new Request(link, page.Request.Extras));
                }
            }
        }
    }

    if (Site.DownloadFiles)
    {
        var links = (page.Selectable.SelectList(_imageSelector)).GetValues();
        if (links == null || links.Count == 0)
        {
            return;
        }

        foreach (string link in links)
        {
            bool isRequired = true;
            if (_excludeTargetUrlPatterns != null)
            {
                foreach (var excludeRegex in _excludeTargetUrlPatterns)
                {
                    if (excludeRegex.IsMatch(link))
                    {
                        isRequired = false;
                        break;
                    }
                }
            }

            if (isRequired)
            {
                page.AddTargetRequest(new Request(link, page.Request.Extras));
            }
        }
    }
}