Example #1
0
        /// <summary>
        /// Reports whether the target-URL extractor map has an entry for the given region.
        /// </summary>
        /// <param name="region">
        /// XPath of the region to look up. A null, empty or whitespace-only value is
        /// looked up under the default selector.
        /// </param>
        /// <returns>True when an entry keyed by the resolved selector exists; otherwise false.</returns>
        public virtual bool ContainsTargetUrlRegion(string region)
        {
            // A blank region is represented by the default selector.
            if (string.IsNullOrWhiteSpace(region))
            {
                return _targetUrlExtractors.ContainsKey(Selectors.Default());
            }

            return _targetUrlExtractors.ContainsKey(Selectors.XPath(region));
        }
        /// <summary>
        /// Test-only helper: reports whether the region/pattern map has an entry for the given region.
        /// </summary>
        /// <param name="region">
        /// XPath of the region to look up. A null, empty or whitespace-only value is
        /// looked up under the default selector.
        /// </param>
        /// <returns>True when an entry keyed by the resolved selector exists; otherwise false.</returns>
        internal bool ContainsTargetUrlRegion(string region)
        {
            ISelector lookupKey;

            if (string.IsNullOrWhiteSpace(region))
            {
                // No region given: fall back to the default selector.
                lookupKey = Selectors.Default();
            }
            else
            {
                lookupKey = Selectors.XPath(region);
            }

            bool isRegistered = _regionSelectorMapPatterns.ContainsKey(lookupKey);
            return isRegistered;
        }
Example #3
0
        /// <summary>
        /// Test-only helper: returns the patterns registered for a region.
        /// </summary>
        /// <param name="regionXpath">
        /// XPath of the region to look up. A null, empty or whitespace-only value is
        /// looked up under the default selector.
        /// </param>
        /// <returns>
        /// The list registered for the resolved selector, or null when the region has no entry.
        /// </returns>
        internal List<Regex> GetTargetUrlPatterns(string regionXpath)
        {
            ISelector selector = Selectors.Default();

            if (!string.IsNullOrWhiteSpace(regionXpath))
            {
                selector = Selectors.XPath(regionXpath);
            }

            // Single lookup instead of ContainsKey followed by the indexer.
            List<Regex> patterns;
            return _regionSelectorMapPatterns.TryGetValue(selector, out patterns) ? patterns : null;
        }
Example #4
0
        /// <summary>
        /// Returns the patterns registered for a region. Originally documented as test-only.
        /// </summary>
        /// <param name="regionXpath">
        /// XPath of the region to look up. A null, empty or whitespace-only value is
        /// looked up under the default selector.
        /// </param>
        /// <returns>
        /// The list registered for the resolved selector, or null when the region has no entry.
        /// </returns>
        public virtual List<Regex> GetTargetUrlPatterns(string regionXpath)
        {
            ISelector selector = Selectors.Default();

            if (!string.IsNullOrWhiteSpace(regionXpath))
            {
                selector = Selectors.XPath(regionXpath);
            }

            // Single lookup instead of ContainsKey followed by the indexer.
            List<Regex> patterns;
            return _targetUrlExtractors.TryGetValue(selector, out patterns) ? patterns : null;
        }
Example #5
0
        /// <summary>
        /// Adds target-link parsing rules for a page region.
        /// </summary>
        /// <param name="regionXpath">
        /// XPath of the region that contains the target links. A null, empty or
        /// whitespace-only value registers the patterns under the default selector;
        /// otherwise the value is trimmed before use.
        /// </param>
        /// <param name="patterns">
        /// Regular expressions that target links must match. At least one is required,
        /// and each one is trimmed before use.
        /// </param>
        /// <exception cref="ArgumentException">
        /// <paramref name="patterns"/> is null or empty, or one of its items is null,
        /// empty or whitespace-only.
        /// </exception>
        public void AddTargetUrlExtractor(string regionXpath, params string[] patterns)
        {
            if (patterns == null || patterns.Length == 0)
            {
                throw new ArgumentException("Patterns should not be null or empty.");
            }

            // Reject the whole call if any single pattern is blank; nothing is registered in that case.
            if (patterns.Any(string.IsNullOrWhiteSpace))
            {
                throw new ArgumentException("Pattern value should not be null or empty.");
            }

            ISelector selector = Selectors.Default();

            if (!string.IsNullOrWhiteSpace(regionXpath))
            {
                selector = Selectors.XPath(regionXpath.Trim());
            }

            // Fetch the region's pattern list, creating it on first use (single lookup).
            List<Regex> registered;
            if (!_regionSelectorMapPatterns.TryGetValue(selector, out registered))
            {
                registered = new List<Regex>();
                _regionSelectorMapPatterns.Add(selector, registered);
            }

            // A null entry means every URL in this region is already a target link,
            // so there is no point in checking further patterns.
            if (registered.Contains(null))
            {
                return;
            }

            // At this point every pattern is guaranteed to be non-blank.
            foreach (var pattern in patterns.Select(p => p.Trim()))
            {
                // Skip patterns whose text is already registered for this region.
                if (registered.All(p => p.ToString() != pattern))
                {
                    registered.Add(new Regex(pattern));
                    AddTargetUrlPatterns(pattern);
                }
            }
        }
Example #6
0
        /// <summary>
        /// Registers the regular expressions that identify target URLs inside a page region.
        /// </summary>
        /// <param name="regionXpath">
        /// XPath of the region. Null selects the default selector; a non-null value is
        /// trimmed, and a value that is blank after trimming is treated as ".".
        /// </param>
        /// <param name="patterns">
        /// Regular expressions to register (each is trimmed). A null or empty array stores
        /// a null entry, which marks every URL in the region as a target link.
        /// </param>
        /// <exception cref="ArgumentNullException">
        /// One of the patterns is null, empty or whitespace-only. Nothing is registered in that case.
        /// </exception>
        protected virtual void AddTargetUrlExtractor(string regionXpath, params string[] patterns)
        {
            ISelector selector = Selectors.Default();

            if (regionXpath != null)
            {
                string trimmedXpath = regionXpath.Trim();
                selector = Selectors.XPath(trimmedXpath.Length == 0 ? "." : trimmedXpath);
            }

            // realPatterns stays null when the region has not been registered yet.
            List<Regex> realPatterns;
            _targetUrlExtractors.TryGetValue(selector, out realPatterns);

            // A null entry means every URL in this region is already a target link,
            // so there is no point in checking further patterns.
            if (realPatterns != null && realPatterns.Contains(null))
            {
                return;
            }

            bool matchEverything = patterns == null || patterns.Length == 0;

            // Validate and compile everything before touching the map, so a bad pattern
            // cannot leave a half-populated (or empty) list registered for the region.
            var additions = new List<Regex>();
            if (!matchEverything)
            {
                foreach (var pattern in patterns)
                {
                    // A whitespace-only pattern would trim down to an empty regex that matches every link.
                    if (string.IsNullOrWhiteSpace(pattern))
                    {
                        throw new ArgumentNullException(nameof(patterns), "Pattern should not be null.");
                    }

                    var realPattern = pattern.Trim();
                    bool alreadyKnown =
                        (realPatterns != null && realPatterns.Any(p => p.ToString() == realPattern)) ||
                        additions.Any(p => p.ToString() == realPattern);

                    if (!alreadyKnown)
                    {
                        additions.Add(new Regex(realPattern));
                    }
                }
            }

            if (realPatterns == null)
            {
                realPatterns = new List<Regex>();
                _targetUrlExtractors[selector] = realPatterns;
            }

            if (matchEverything)
            {
                realPatterns.Add(null);
                return;
            }

            realPatterns.AddRange(additions);
        }
Example #7
0
        /// <summary>
        /// Extracts the target links from a page.
        /// </summary>
        /// <param name="page">Page data.</param>
        /// <param name="site">Site information; assigned to every returned request.</param>
        /// <returns>The target links, each wrapped in a request that carries the page request's extras.</returns>
        protected override IEnumerable <Request> Extract(Page page, Site site)
        {
            bool nothingRegistered = _regionSelectorMapPatterns == null || _regionSelectorMapPatterns.Count == 0;
            if (nothingRegistered)
            {
                return(new Request[0]);
            }

            var collected = new List<string>();

            foreach (var entry in _regionSelectorMapPatterns)
            {
                // Entries keyed by the default selector are skipped here.
                if (Equals(entry.Key, Selectors.Default()))
                {
                    continue;
                }

                IEnumerable<string> rawLinks;
                if (page.ContentType == ContentType.Html)
                {
                    rawLinks = page.Selectable.SelectList(entry.Key).Links().GetValues();
                }
                else if (page.ContentType == ContentType.Json)
                {
                    // JSON content: links are found with the URL regex rather than the region selector.
                    rawLinks = page.Selectable.SelectList(Selectors.Regex(RegexUtil.Url)).Links().GetValues();
                }
                else
                {
                    // Any other content type yields no links for this entry.
                    continue;
                }

                if (rawLinks == null)
                {
                    continue;
                }

                // check: consider carefully whether formatting should happen before or after matching;
                // I lean towards before. Formatting target URLs implies that start URLs should follow
                // the same rule as well.
                var decodedLinks = new List<string>();
                foreach (string rawLink in rawLinks)
                {
                    var formatted = FormateUrl(rawLink);
#if !NETSTANDARD
                    decodedLinks.Add(System.Web.HttpUtility.HtmlDecode(System.Web.HttpUtility.UrlDecode(formatted)));
#else
                    decodedLinks.Add(WebUtility.HtmlDecode(WebUtility.UrlDecode(formatted)));
#endif
                }

                var patterns = entry.Value;
                if (patterns == null || patterns.Count == 0)
                {
                    // No patterns for this region: every link found in it is taken as-is.
                    collected.AddRange(decodedLinks);
                    continue;
                }

                // Pattern-major order: for each pattern, every link that matches it and
                // matches none of the exclude patterns.
                var accepted =
                    from regex in patterns
                    from link in decodedLinks
                    where regex.IsMatch(link)
                    where ExcludeTargetUrlPatterns == null || !ExcludeTargetUrlPatterns.Any(excluded => excluded.IsMatch(link))
                    select link;

                collected.AddRange(accepted);
            }

            if (site.DownloadFiles)
            {
                var imageLinks = (page.Selectable.SelectList(ImageSelector)).GetValues();

                if (imageLinks != null && imageLinks.Any())
                {
                    // Links selected by ImageSelector are only filtered by the exclude patterns.
                    collected.AddRange(imageLinks.Where(
                        link => ExcludeTargetUrlPatterns == null || !ExcludeTargetUrlPatterns.Any(excluded => excluded.IsMatch(link))));
                }
            }

            return(collected.Select(url => new Request(url, page.Request.Extras)
            {
                Site = site
            }));
        }
Example #8
0
        /// <summary>
        /// Extracts the target links from a response.
        /// </summary>
        /// <param name="response">Result of the link request.</param>
        /// <returns>
        /// The target links, each wrapped in a request that carries the originating request's
        /// site and its properties except the URL and target-URL entries.
        /// </returns>
        protected override IEnumerable <Request> Extract(Response response)
        {
            if (_regionSelectorMapPatterns == null || _regionSelectorMapPatterns.Count == 0)
            {
                return(new Request[0]);
            }
            var site = response.Request.Site;

            List<string> resultUrls = new List<string>();

            foreach (var targetUrlExtractor in _regionSelectorMapPatterns)
            {
                // Entries keyed by the default selector are skipped here.
                if (Equals(targetUrlExtractor.Key, Selectors.Default()))
                {
                    continue;
                }

                List<string> requests;

                if (response.ContentType == ContentType.Json)
                {
                    // JSON content: links are found with the URL regex rather than the region selector.
                    requests = new List<string>(response.Selectable().SelectList(Selectors.Regex(RegexUtil.Url)).Links().GetValues());
                }
                else
                {
                    requests = new List<string>(response.Selectable().SelectList(targetUrlExtractor.Key).Links().GetValues());
                }

                if (requests.Count == 0)
                {
                    continue;
                }

                // URL-decode, then HTML-decode every link before matching.
                List<string> tmpRequests = new List<string>();
                foreach (string request in requests)
                {
#if !NETSTANDARD
                    tmpRequests.Add(System.Web.HttpUtility.HtmlDecode(System.Web.HttpUtility.UrlDecode(request)));
#else
                    tmpRequests.Add(System.Net.WebUtility.HtmlDecode(System.Net.WebUtility.UrlDecode(request)));
#endif
                }

                requests = tmpRequests;

                if (targetUrlExtractor.Value == null || targetUrlExtractor.Value.Count == 0)
                {
                    // No patterns for this region: every link found in it is taken as-is.
                    resultUrls.AddRange(requests);
                    continue;
                }

                foreach (var regex in targetUrlExtractor.Value)
                {
                    foreach (string link in requests)
                    {
                        if (!regex.IsMatch(link))
                        {
                            continue;
                        }

                        // A matching link is dropped when any exclude pattern matches it too.
                        bool isExcluded = ExcludeTargetUrlPatterns != null &&
                                          ExcludeTargetUrlPatterns.Any(excludeRegex => excludeRegex.IsMatch(link));

                        if (!isExcluded)
                        {
                            resultUrls.Add(link);
                        }
                    }
                }
            }

            // Copy the originating request's properties, leaving out its own URL entries.
            // NOTE(review): assumes Request's constructor treats this dictionary the same way it
            // treated response.Request.Properties - confirm.
            var properties = new Dictionary<string, dynamic>();
            foreach (var kv in response.Request.Properties)
            {
                if (kv.Key != Env.UrlPropertyKey && kv.Key != Env.TargetUrlPropertyKey)
                {
                    properties.Add(kv.Key, kv.Value);
                }
            }

            // Pass the filtered copy; the unfiltered properties used to be passed here
            // while the filtered copy above was never used.
            return(resultUrls.Select(url => new Request(url, properties)
            {
                Site = site
            }));
        }
Example #9
0
        /// <summary>
        /// Extracts target URLs from the page and adds them to it as target requests.
        /// If nothing is found, no URL is added; otherwise too many URLs would be returned.
        /// </summary>
        /// <param name="page">Page whose links are extracted and which receives the target requests.</param>
        protected virtual void ExtractUrls(Page page)
        {
            if (_targetUrlExtractors == null || _targetUrlExtractors.Count == 0)
            {
                return;
            }

            foreach (var targetUrlExtractor in _targetUrlExtractors)
            {
                // Entries keyed by the default selector are skipped here.
                if (Equals(targetUrlExtractor.Key, Selectors.Default()))
                {
                    continue;
                }

                var links = page.Selectable.SelectList(targetUrlExtractor.Key).Links().GetValues();

                if (links == null)
                {
                    continue;
                }

                // check: consider carefully whether formatting should happen before or after matching;
                // I lean towards before. Formatting target URLs implies that start URLs should follow
                // the same rule as well.
                List<string> tmp = new List<string>();
                foreach (string link in links)
                {
                    var url = FormateUrl(link);
#if !NET_CORE
                    tmp.Add(HttpUtility.HtmlDecode(HttpUtility.UrlDecode(url)));
#else
                    tmp.Add(WebUtility.HtmlDecode(WebUtility.UrlDecode(url)));
#endif
                }
                links = tmp;

                // A missing or empty pattern list, or a list holding the null marker stored by
                // AddTargetUrlExtractor when it is called without patterns, means every link in
                // the region is a target. Checking for the marker here keeps the loop below from
                // dereferencing a null regex.
                if (targetUrlExtractor.Value == null ||
                    targetUrlExtractor.Value.Count == 0 ||
                    targetUrlExtractor.Value.Contains(null))
                {
                    page.AddTargetRequests(links);
                    continue;
                }

                foreach (var regex in targetUrlExtractor.Value)
                {
                    foreach (string link in links)
                    {
                        if (regex.IsMatch(link))
                        {
                            // A matching link is dropped when any exclude pattern matches it too.
                            bool isRequired = true;
                            if (_excludeTargetUrlPatterns != null)
                            {
                                foreach (var excludeRegex in _excludeTargetUrlPatterns)
                                {
                                    if (excludeRegex.IsMatch(link))
                                    {
                                        isRequired = false;
                                        break;
                                    }
                                }
                            }
                            if (isRequired)
                            {
                                page.AddTargetRequest(new Request(link, page.Request.Extras));
                            }
                        }
                    }
                }
            }

            if (Site.DownloadFiles)
            {
                // Links selected by _imageSelector are only filtered by the exclude patterns.
                var links = (page.Selectable.SelectList(_imageSelector)).GetValues();

                if (links == null || links.Count == 0)
                {
                    return;
                }
                foreach (string link in links)
                {
                    bool isRequired = true;
                    if (_excludeTargetUrlPatterns != null)
                    {
                        foreach (var excludeRegex in _excludeTargetUrlPatterns)
                        {
                            if (excludeRegex.IsMatch(link))
                            {
                                isRequired = false;
                                break;
                            }
                        }
                    }
                    if (isRequired)
                    {
                        page.AddTargetRequest(new Request(link, page.Request.Extras));
                    }
                }
            }
        }