Esempio n. 1
0
        /// <summary>
        /// Builds the selector that matches the configured expression type.
        /// CSS and regex expressions consume the first expression parameter when one
        /// is present; any type without its own case is treated as XPath.
        /// </summary>
        /// <returns>The selector created for the current expression value.</returns>
        private ISelector CompileSelector()
        {
            switch (expressionType)
            {
            case ExpressionType.Css:
                if (expressionParams.Length < 1)
                {
                    return Selectors.Css(expressionValue);
                }

                // The first parameter is forwarded as the second argument of Selectors.Css.
                return Selectors.Css(expressionValue, expressionParams[0]);

            case ExpressionType.Regex:
                if (expressionParams.Length < 1)
                {
                    return Selectors.Regex(expressionValue);
                }

                // The first parameter must be an integer; int.Parse throws otherwise.
                return Selectors.Regex(expressionValue, int.Parse(expressionParams[0]));

            case ExpressionType.JsonPath:
                return new JsonPathSelector(expressionValue);

            case ExpressionType.XPath:
            default:
                return Selectors.XPath(expressionValue);
            }
        }
        /// <summary>
        /// Finds the "defjson" fragment in the page, cuts out the JSON array it contains,
        /// replaces the "-" placeholder values and stores the deserialized table on the page.
        /// </summary>
        /// <param name="page">Page whose content is searched; receives the "Result" item.</param>
        protected override void Handle(Page page)
        {
            // Use Selectable to query the page and build the data object we want.
            var json = page.Selectable.SelectList(Selectors.Regex(@"defjson: \{.*?\]\}")).GetValues().FirstOrDefault();
            if (string.IsNullOrEmpty(json))
            {
                return;
            }

            int arrayStart = json.IndexOf("[{");
            if (arrayStart < 0)
            {
                return;
            }

            // Keep everything from "[{" onwards except the final character of the match.
            json = json.Substring(arrayStart, json.Length - arrayStart - 1);

            // A "-" notice date becomes an empty string; every other "-" value becomes "0".
            json = json.Replace("\"NOTICEDATE\":\"-\"", "\"NOTICEDATE\":\"\"")
                       .Replace("\"-\"", "\"0\"");

            var serializerSettings = new JsonSerializerSettings
            {
                NullValueHandling     = NullValueHandling.Ignore,
                MissingMemberHandling = MissingMemberHandling.Ignore,
                DateParseHandling     = DateParseHandling.None,
            };
            var table = JsonConvert.DeserializeObject<DataTable>(json, serializerSettings);

            // Save the data object under a custom key so the pipeline can read it from the page.
            page.AddResultItem("Result", table);
        }
Esempio n. 3
0
        /// <summary>
        /// Converts a selector definition into the concrete selector that performs the query.
        /// </summary>
        /// <param name="selector">Selector definition; may be null.</param>
        /// <returns>The matching selector, or null when <paramref name="selector"/> is null.</returns>
        /// <exception cref="SpiderException">
        /// The regex argument is not an integer, or the selector type is not handled.
        /// </exception>
        public static ISelector ToSelector(this Selector selector)
        {
            if (selector == null)
            {
                return null;
            }

            string expression = selector.Expression;

            switch (selector.Type)
            {
            case SelectorType.Enviroment:
                // The only type that is built without the NotNullExpression check.
                return Selectors.Enviroment(expression);

            case SelectorType.Css:
                NotNullExpression(selector);
                return Selectors.Css(expression);

            case SelectorType.JsonPath:
                NotNullExpression(selector);
                return Selectors.JsonPath(expression);

            case SelectorType.XPath:
                NotNullExpression(selector);
                return Selectors.XPath(expression);

            case SelectorType.Regex:
                NotNullExpression(selector);
                if (string.IsNullOrEmpty(selector.Arguments))
                {
                    return Selectors.Regex(expression);
                }

                // A non-empty argument must be an integer; it is passed on as the second argument.
                int groupIndex;
                if (!int.TryParse(selector.Arguments, out groupIndex))
                {
                    throw new SpiderException("Regex argument should be a number set to group: " + selector);
                }

                return Selectors.Regex(expression, groupIndex);

            default:
                throw new SpiderException($"Selector {selector} unsupoort");
            }
        }
Esempio n. 4
0
 public void TestEach()
 {
     // CSS selector: the whole element, one attribute, then the inner markup.
     var outerHtml = Selectors.Css("div h1 a").Select(_html).OuterHtml;
     Assert.AreEqual(outerHtml, "<a href=\"xxx\">aabbcc</a>");

     var hrefViaCss = Selectors.Css("div h1 a", "href").Select(_html);
     Assert.AreEqual(hrefViaCss, "xxx");

     var innerHtml = Selectors.Css("div h1 a").Select(_html).InnerHtml;
     Assert.AreEqual(innerHtml, "aabbcc");

     // XPath selector addressing the attribute directly.
     var hrefViaXPath = Selectors.XPath("//a/@href").Select(_html);
     Assert.AreEqual(hrefViaXPath, "xxx");

     // Regex selector: without and with an explicit second argument.
     var hrefViaRegex = Selectors.Regex("a href=\"(.*)\"").Select(_html);
     Assert.AreEqual(hrefViaRegex, "xxx");

     var hrefViaSecondGroup = Selectors.Regex("(a href)=\"(.*)\"", 2).Select(_html);
     Assert.AreEqual(hrefViaSecondGroup, "xxx");
 }
Esempio n. 5
0
        public void TestCombo()
        {
            // And: the CSS selector and the regex selector are combined on the second fixture.
            var titleThenRegex = Selectors.And(Selectors.Css("title"), Selectors.Regex("aa(bb)cc"));
            var value1 = titleThenRegex.Select(_html2);
            Assert.AreEqual(value1, "bb");

            // Or: both fixtures are expected to yield the same text through one of the two selectors.
            var anchorOrTitle = Selectors.Or(Selectors.Css("div h1 a", "innerHtml"), Selectors.XPath("//title"));

            var fromFirstFixture = anchorOrTitle.Select(_html);
            Assert.AreEqual(fromFirstFixture, "aabbcc");

            var fromSecondFixture = anchorOrTitle.Select(_html2);
            Assert.AreEqual(fromSecondFixture, "aabbcc");
        }
        /// <summary>
        /// Looks up the URL patterns registered for a region. Only used for test.
        /// </summary>
        /// <param name="regionXpath">
        /// XPath of the region. When null or whitespace, the lookup key is the regex
        /// selector built from RegexUtil.Url instead.
        /// </param>
        /// <returns>The patterns stored under that selector, or null when none are registered.</returns>
        internal List<Regex> GetTargetUrlPatterns(string regionXpath)
        {
            ISelector key;
            if (string.IsNullOrWhiteSpace(regionXpath))
            {
                key = Selectors.Regex(RegexUtil.Url);
            }
            else
            {
                key = Selectors.XPath(regionXpath);
            }

            if (!_regionSelectorMapPatterns.ContainsKey(key))
            {
                return null;
            }

            return _regionSelectorMapPatterns[key];
        }
        /// <summary>
        /// Looks up the URL patterns registered for a region. Only used for test.
        /// </summary>
        /// <param name="regionXpath">
        /// XPath of the region. When null or whitespace, the lookup key is the regex
        /// selector built from RegexUtil.UrlRegex instead.
        /// </param>
        /// <returns>The patterns stored under that selector, or null when none are registered.</returns>
        internal virtual List<Regex> GetTargetUrlPatterns(string regionXpath)
        {
            ISelector key;
            if (string.IsNullOrWhiteSpace(regionXpath))
            {
                key = Selectors.Regex(RegexUtil.UrlRegex);
            }
            else
            {
                key = Selectors.XPath(regionXpath);
            }

            if (!_targetUrlExtractors.ContainsKey(key))
            {
                return null;
            }

            return _targetUrlExtractors[key];
        }
        /// <summary>
        /// Registers URL patterns for the links found in a page region.
        /// </summary>
        /// <param name="regionXpath">
        /// XPath of the region to take links from. When null or empty, the regex selector
        /// built from RegexUtil.UrlRegex is used; a whitespace-only value becomes ".".
        /// </param>
        /// <param name="patterns">Regular expressions; each one is trimmed before it is compiled.</param>
        /// <exception cref="ArgumentNullException">
        /// <paramref name="patterns"/> is null or empty, or contains a null or blank entry.
        /// </exception>
        public void AddTargetUrlExtractor(string regionXpath, params string[] patterns)
        {
            // ArgumentNullException(string) treats its argument as the parameter name,
            // so the parameter name and the message are passed separately.
            if (patterns == null || patterns.Length == 0)
            {
                throw new ArgumentNullException(nameof(patterns), "Patterns should not be null or empty.");
            }

            var validPatterns = patterns.Where(p => !string.IsNullOrWhiteSpace(p)).Select(p => p.Trim()).ToList();

            if (validPatterns.Count != patterns.Length)
            {
                throw new ArgumentNullException(nameof(patterns), "Pattern value should not be null or empty.");
            }

            ISelector selector = Selectors.Regex(RegexUtil.UrlRegex);

            if (!string.IsNullOrEmpty(regionXpath))
            {
                string xpath = regionXpath.Trim();
                selector = Selectors.XPath(xpath.Length == 0 ? "." : xpath);
            }

            if (!_targetUrlExtractors.ContainsKey(selector))
            {
                _targetUrlExtractors.Add(selector, new List<Regex>());
            }
            var realPatterns = _targetUrlExtractors[selector];

            // A null entry means every URL in this region is already a target link,
            // so there is no point in registering further patterns.
            if (realPatterns.Contains(null))
            {
                return;
            }

            // validPatterns cannot be empty here: patterns had at least one entry and
            // none of them was rejected above.
            foreach (var pattern in validPatterns)
            {
                // Skip patterns whose text is already registered for this selector.
                if (realPatterns.All(p => p.ToString() != pattern))
                {
                    var regex = new Regex(pattern);
                    realPatterns.Add(regex);
                    _targetUrlPatterns.Add(regex);
                }
            }
        }
Esempio n. 9
0
        /// <summary>
        /// Converts a selector attribute into the concrete selector that performs the query.
        /// </summary>
        /// <param name="selector">Selector attribute; may be null.</param>
        /// <returns>The matching selector, or null when <paramref name="selector"/> is null.</returns>
        /// <exception cref="ArgumentException">The regex argument is not an integer.</exception>
        /// <exception cref="NotSupportedException">The selector type is not handled.</exception>
        public static ISelector ToSelector(this Attribute.Selector selector)
        {
            if (selector == null)
            {
                return null;
            }

            string expression = selector.Expression;

            switch (selector.Type)
            {
            case SelectorType.XPath:
                NotNullExpression(selector);
                return Selectors.XPath(expression);

            case SelectorType.Css:
                NotNullExpression(selector);
                return Selectors.Css(expression);

            case SelectorType.JsonPath:
                NotNullExpression(selector);
                return Selectors.JsonPath(expression);

            case SelectorType.Regex:
                NotNullExpression(selector);
                if (string.IsNullOrEmpty(selector.Arguments))
                {
                    return Selectors.Regex(expression);
                }

                // A non-empty argument must be an integer; it is passed on as the second argument.
                if (!int.TryParse(selector.Arguments, out var groupIndex))
                {
                    throw new ArgumentException($"Regex argument should be a number set to group: {selector}");
                }

                return Selectors.Regex(expression, groupIndex);

            default:
                throw new NotSupportedException($"{selector} unsupported");
            }
        }
Esempio n. 10
0
        /// <summary>
        /// Builds the concrete selector described by a <c>BaseSelector</c>.
        /// </summary>
        /// <param name="selector">Selector description; may be null.</param>
        /// <returns>
        /// The selector, or null when <paramref name="selector"/> or its expression is null or empty.
        /// </returns>
        /// <exception cref="SpiderException">
        /// The regex argument is not an integer, or the selector type is not handled.
        /// </exception>
        public static ISelector Parse(BaseSelector selector)
        {
            string expression = selector?.Expression;
            if (string.IsNullOrEmpty(expression))
            {
                return null;
            }

            switch (selector.Type)
            {
            case SelectorType.XPath:
                return Selectors.XPath(expression);

            case SelectorType.Css:
                return Selectors.Css(expression);

            case SelectorType.JsonPath:
                return Selectors.JsonPath(expression);

            case SelectorType.Enviroment:
                return Selectors.Enviroment(expression);

            case SelectorType.Regex:
                if (string.IsNullOrEmpty(selector.Argument))
                {
                    return Selectors.Regex(expression);
                }

                // A non-empty argument must be an integer; it is passed on as the second argument.
                int groupIndex;
                if (int.TryParse(selector.Argument, out groupIndex))
                {
                    return Selectors.Regex(expression, groupIndex);
                }

                throw new SpiderException("Regex argument should be a number set to group: " + selector);

            default:
                throw new SpiderException("Not support selector: " + selector);
            }
        }
Esempio n. 11
0
        /// <summary>
        /// Converts a selector definition into the concrete selector that performs the query.
        /// </summary>
        /// <param name="selector">
        /// Selector definition; may be null. For regex selectors a non-empty Arguments value
        /// is read as two comma-separated entries: a RegexOptions name followed by the replacement.
        /// </param>
        /// <returns>The matching selector, or null when <paramref name="selector"/> is null.</returns>
        /// <exception cref="ArgumentException">
        /// The regex arguments do not contain both entries, or the first entry is not a RegexOptions name.
        /// </exception>
        /// <exception cref="NotSupportedException">The selector type is not handled.</exception>
        public static ISelector ToSelector(this Selector selector)
        {
            if (selector == null)
            {
                return null;
            }

            var expression = selector.Expression;

            switch (selector.Type)
            {
            case SelectorType.Css:
            {
                NotNullExpression(selector);
                return Selectors.Css(expression);
            }

            case SelectorType.JsonPath:
            {
                NotNullExpression(selector);
                return Selectors.JsonPath(expression);
            }

            case SelectorType.Regex:
            {
                NotNullExpression(selector);
                if (string.IsNullOrEmpty(selector.Arguments))
                {
                    return Selectors.Regex(expression);
                }

                var arguments = selector.Arguments.Split(new[] { ',' }, StringSplitOptions.RemoveEmptyEntries);

                // Both entries are indexed below; fail with a descriptive error instead of an
                // IndexOutOfRangeException when the argument string is incomplete.
                if (arguments.Length < 2)
                {
                    throw new ArgumentException($"Regex arguments should be a RegexOptions value and a replacement separated by ',': {selector}");
                }

                // Enum.Parse throws ArgumentException when the first entry is not a RegexOptions name.
                var options     = (RegexOptions)Enum.Parse(typeof(RegexOptions), arguments[0]);
                var replacement = arguments[1];
                return Selectors.Regex(expression, options, replacement);
            }

            case SelectorType.XPath:
            {
                NotNullExpression(selector);
                return Selectors.XPath(expression);
            }

            default:
            {
                throw new NotSupportedException($"{selector} unsupported");
            }
            }
        }
        /// <summary>
        /// Collects every list-item link fragment that points at quote.eastmoney.com and
        /// stores the fragments on the page.
        /// </summary>
        /// <param name="page">Page whose content is searched; receives the "Result" item.</param>
        protected override void Handle(Page page)
        {
            // Use Selectable to query the page for the fragments we are interested in.
            var fragmentSelector = Selectors.Regex("<li><a target=\"_blank\" href=\"http://quote.eastmoney.com/.*?.html\">.*?</a></li>");
            var fragments = page.Selectable.SelectList(fragmentSelector).GetValues();

            var results = new List<string>();
            foreach (var fragment in fragments)
            {
                // The raw fragment is stored as-is.
                results.Add(fragment);
            }

            // Save the data object under a custom key so the pipeline can read it from the page.
            page.AddResultItem("Result", results);
        }
Esempio n. 13
0
        /// <summary>
        /// Parses a Tieba user home page into a <c>TiebaUser</c> and stores it on the page.
        /// Only requests whose URL contains "https://tieba.baidu.com/home/main" and that carry
        /// a "页面类型" (page type) property are handled; the property value selects the
        /// "手机" (mobile) or "电脑" (desktop) branch.
        /// </summary>
        /// <param name="page">
        /// Downloaded page; receives the "TiebaUser" result item when a user was parsed, and,
        /// in the mobile branch, a follow-up request for the desktop page of the same user.
        /// </param>
        protected override void Handle(Page page)
        {
            var results = new List<TiebaUser>();

            if (page.Request.Url.Contains("https://tieba.baidu.com/home/main"))
            {
                if (page.Request.Properties.ContainsKey("页面类型"))
                {
                    string htmlType = page.Request.Properties["页面类型"];
                    if (htmlType.Equals("手机"))
                    {
                        TiebaUser tiebaUser = new TiebaUser();

                        // NOTE(review): this XPath is malformed (".div", unbalanced "']") and is kept
                        // unchanged because the intended expression cannot be recovered from this code;
                        // it presumably targets the element with class "tab tab_holo home_tab j_home_tab" - TODO confirm and fix.
                        var totalUserInfoElements = page.Selectable().SelectList(Selectors.XPath(".div//tab tab_holo home_tab j_home_tab']")).Nodes();
                        List<int> info = totalUserInfoElements.Select(p => p.Select(Selectors.XPath(".//span[@class='home_tab_item_num']")).GetValue()).Select(p => Convert.ToInt32(p)).ToList();

                        // The four counters are read by position.
                        tiebaUser.Post_Num    = info[0];
                        tiebaUser.PostBar_Num = info[1];
                        tiebaUser.Follow_Num  = info[2];
                        tiebaUser.Fans_Num    = info[3];
                        tiebaUser.Key         = page.Selectable().Select(Selectors.Regex(@"(?<=(/i/\?portrait=))[0-9a-zA-Z]+")).GetValue();

                        // "@class" tests the attribute; a bare "class" would test a child element named class.
                        tiebaUser.U_Nick      = page.Selectable().Select(Selectors.XPath(".//a[@class='home_card_uname_link']")).GetValue(ValueOption.InnerText);
                        results.Add(tiebaUser);

                        // Queue the desktop page of the same user.
                        page.AddTargetRequest(new Request($"http://tieba.baidu.com/home/main/?un={tiebaUser.U_Nick}", new Dictionary<string, object>()
                        {
                            { "页面类型", "电脑" }
                        }));
                    }
                    else if (htmlType.Equals("电脑"))
                    {
                        TiebaUser tiebaUser = new TiebaUser();
                        tiebaUser.U_Nick = page.Selectable().Select(Selectors.XPath(".//span[@class='userinfo_username']")).GetValue(ValueOption.InnerText);
                        string userTitle = page.Selectable().Select(Selectors.XPath(".//span[@class='user_name']")).GetValue(ValueOption.InnerText);
                        tiebaUser.U_Name  = Regex.Match(userTitle, @"(?<=用户名:)[^"" <]+").Value;
                        tiebaUser.U_BaAge = Regex.Match(userTitle, @"(?<=吧龄:)\d+\.\d+").Value;

                        // A failed match yields an empty string, which Convert.ToInt32 rejects with a
                        // FormatException; only assign the posting count when it actually parses.
                        int postingNum;
                        if (int.TryParse(Regex.Match(userTitle, @"(?<=发帖:)\d+").Value, out postingNum))
                        {
                            tiebaUser.Posting_Num = postingNum;
                        }

                        results.Add(tiebaUser);
                    }
                }
            }

            // Add the data to the page so the storage pipeline can pick it up.
            if (results.Count > 0)
            {
                page.AddResultItem("TiebaUser", results);
            }
        }
Esempio n. 14
0
        /// <summary>
        /// Builds the concrete selector described by a <c>Selector</c>.
        /// </summary>
        /// <param name="selector">Selector description; may be null.</param>
        /// <returns>
        /// The selector, or null when <paramref name="selector"/> or its expression is null or empty.
        /// </returns>
        /// <exception cref="SpiderException">The selector type is not handled.</exception>
        public static ISelector Parse(Selector selector)
        {
            string expression = selector?.Expression;
            if (string.IsNullOrEmpty(expression))
            {
                return null;
            }

            switch (selector.Type)
            {
            case SelectorType.XPath:
                return Selectors.XPath(expression);

            case SelectorType.Regex:
                return Selectors.Regex(expression);

            case SelectorType.Css:
                return Selectors.Css(expression);

            case SelectorType.JsonPath:
                return Selectors.JsonPath(expression);

            case SelectorType.Enviroment:
                return Selectors.Enviroment(expression);

            default:
                throw new SpiderException("Not support selector: " + selector);
            }
        }
Esempio n. 15
0
        /// <summary>
        /// Constructor. Creates the model for <typeparamref name="T"/>, registers a follow-request
        /// querier for every expression of every follow selector, and adds one required validator
        /// per distinct follow pattern.
        /// </summary>
        public DataParser()
        {
            Model = new Model<T>();

            // A set, so that a pattern shared by several follow selectors yields a single validator.
            var requiredPatterns = new HashSet<string>();
            var followSelectors  = Model.FollowRequestSelectors;

            if (followSelectors != null)
            {
                foreach (var follow in followSelectors)
                {
                    switch (follow.SelectorType)
                    {
                    case SelectorType.Environment:
                        // Environment selectors register no querier; only a warning is logged.
                        Logger.LogWarning("SelectorType of follow selector is not supported");
                        break;

                    case SelectorType.XPath:
                        foreach (var xpath in follow.Expressions)
                        {
                            AddFollowRequestQuerier(Selectors.XPath(xpath));
                        }
                        break;

                    case SelectorType.Css:
                        foreach (var css in follow.Expressions)
                        {
                            AddFollowRequestQuerier(Selectors.Css(css));
                        }
                        break;

                    case SelectorType.Regex:
                        foreach (var regex in follow.Expressions)
                        {
                            AddFollowRequestQuerier(Selectors.Regex(regex));
                        }
                        break;

                    case SelectorType.JsonPath:
                        foreach (var jsonPath in follow.Expressions)
                        {
                            AddFollowRequestQuerier(Selectors.JsonPath(jsonPath));
                        }
                        break;
                    }

                    // Patterns are collected for every selector type, including unsupported ones.
                    foreach (var pattern in follow.Patterns)
                    {
                        requiredPatterns.Add(pattern);
                    }
                }
            }

            foreach (var pattern in requiredPatterns)
            {
                // Each validator tests the request URI against one collected pattern.
                AddRequiredValidator(request => Regex.IsMatch(request.RequestUri.ToString(), pattern));
            }
        }
Esempio n. 16
0
 public void Regex()
 {
     // With the pattern alone, the expected value is the whole matched text.
     var wholeMatch = Selectors.Regex("a href=\"(.*)\"").Select(html).Value;
     Assert.Equal("a href=\"xxx\"", wholeMatch);

     // With the "$2" replacement, the expected value is the second capture group.
     var secondGroup = Selectors.Regex("(a href)=\"(.*)\"", RegexOptions.None, "$2").Select(html).Value;
     Assert.Equal("xxx", secondGroup);
 }
Esempio n. 17
0
        /// <summary>
        /// Extracts the target links from a response.
        /// </summary>
        /// <param name="response">Response of the downloaded request.</param>
        /// <returns>
        /// One request per extracted link. Each carries the site of the originating request and a
        /// copy of its properties without the Env.UrlPropertyKey and Env.TargetUrlPropertyKey entries.
        /// Empty when no region selectors are registered.
        /// </returns>
        protected override IEnumerable<Request> Extract(Response response)
        {
            if (_regionSelectorMapPatterns == null || _regionSelectorMapPatterns.Count == 0)
            {
                return new Request[0];
            }

            var site = response.Request.Site;
            var resultUrls = new List<string>();

            foreach (var targetUrlExtractor in _regionSelectorMapPatterns)
            {
                if (Equals(targetUrlExtractor.Key, Selectors.Default()))
                {
                    continue;
                }

                // JSON responses are scanned with the URL regex; everything else uses the region selector.
                List<string> requests;
                if (response.ContentType == ContentType.Json)
                {
                    requests = new List<string>(response.Selectable().SelectList(Selectors.Regex(RegexUtil.Url)).Links().GetValues());
                }
                else
                {
                    requests = new List<string>(response.Selectable().SelectList(targetUrlExtractor.Key).Links().GetValues());
                }

                if (requests.Count == 0)
                {
                    continue;
                }

                // URL-decode, then HTML-decode every link before it is matched.
                var decodedLinks = new List<string>(requests.Count);
                foreach (string request in requests)
                {
#if !NETSTANDARD
                    decodedLinks.Add(System.Web.HttpUtility.HtmlDecode(System.Web.HttpUtility.UrlDecode(request)));
#else
                    decodedLinks.Add(System.Net.WebUtility.HtmlDecode(System.Net.WebUtility.UrlDecode(request)));
#endif
                }

                // No patterns registered for this region: every link is accepted as-is.
                if (targetUrlExtractor.Value == null || targetUrlExtractor.Value.Count == 0)
                {
                    resultUrls.AddRange(decodedLinks);
                    continue;
                }

                foreach (var regex in targetUrlExtractor.Value)
                {
                    foreach (string link in decodedLinks)
                    {
                        if (!regex.IsMatch(link))
                        {
                            continue;
                        }

                        // A link matching any exclude pattern is dropped.
                        bool isExcluded = false;
                        if (ExcludeTargetUrlPatterns != null)
                        {
                            foreach (var excludeRegex in ExcludeTargetUrlPatterns)
                            {
                                if (excludeRegex.IsMatch(link))
                                {
                                    isExcluded = true;
                                    break;
                                }
                            }
                        }

                        if (!isExcluded)
                        {
                            resultUrls.Add(link);
                        }
                    }
                }
            }

            // Carry the originating request's properties over, minus the URL bookkeeping keys.
            var properties = new Dictionary<string, dynamic>();
            foreach (var kv in response.Request.Properties)
            {
                if (kv.Key != Env.UrlPropertyKey && kv.Key != Env.TargetUrlPropertyKey)
                {
                    properties.Add(kv.Key, kv.Value);
                }
            }

            // Previously the unfiltered response.Request.Properties was handed to each new request,
            // leaving the filtered dictionary unused. Each request gets its own copy so that later
            // changes to one request do not affect the others.
            return resultUrls.Select(url => new Request(url, new Dictionary<string, dynamic>(properties))
            {
                Site = site
            });
        }
        /// <summary>
        /// Adds the target requests found in a page. When nothing matches, no URL is added;
        /// otherwise far too many URLs would be returned.
        /// </summary>
        /// <param name="page">Page to extract links from; receives the target requests.</param>
        /// <exception cref="Exception">The page content type is neither Html nor Json.</exception>
        protected virtual void ExtractUrls(Page page)
        {
            // Without any extractor nothing is added, not even the image links further down.
            if (_targetUrlExtractors == null || _targetUrlExtractors.Count == 0)
            {
                return;
            }

            foreach (var extractor in _targetUrlExtractors)
            {
                if (Equals(extractor.Key, Selectors.Default()))
                {
                    continue;
                }

                // Html pages use the region selector; Json pages are scanned with the URL regex.
                List<string> rawLinks;
                if (page.ContentType == ContentType.Html)
                {
                    rawLinks = page.Selectable.SelectList(extractor.Key).Links().GetValues();
                }
                else if (page.ContentType == ContentType.Json)
                {
                    rawLinks = page.Selectable.SelectList(Selectors.Regex(RegexUtil.UrlRegex)).Links().GetValues();
                }
                else
                {
                    throw new Exception("page.ContentType is not match!");
                }

                if (rawLinks == null)
                {
                    continue;
                }

                // check: think carefully about whether the formatter should run before or after;
                // the preference is before. Formatting target URLs implies that start URLs
                // should follow the same rule.
                var decodedLinks = new List<string>();
                foreach (string rawLink in rawLinks)
                {
                    var formatted = FormateUrl(rawLink);
#if !NET_CORE
                    decodedLinks.Add(HttpUtility.HtmlDecode(HttpUtility.UrlDecode(formatted)));
#else
                    decodedLinks.Add(WebUtility.HtmlDecode(WebUtility.UrlDecode(formatted)));
#endif
                }

                // No patterns registered for this selector: every decoded link is added as-is.
                var patterns = extractor.Value;
                if (patterns == null || patterns.Count == 0)
                {
                    page.AddTargetRequests(decodedLinks);
                    continue;
                }

                foreach (var pattern in patterns)
                {
                    foreach (string link in decodedLinks)
                    {
                        if (!pattern.IsMatch(link))
                        {
                            continue;
                        }

                        // A link matching any exclude pattern is dropped.
                        bool excluded = false;
                        if (_excludeTargetUrlPatterns != null)
                        {
                            foreach (var excludePattern in _excludeTargetUrlPatterns)
                            {
                                if (excludePattern.IsMatch(link))
                                {
                                    excluded = true;
                                    break;
                                }
                            }
                        }

                        if (!excluded)
                        {
                            page.AddTargetRequest(new Request(link, page.Request.Extras));
                        }
                    }
                }
            }

            if (!Site.DownloadFiles)
            {
                return;
            }

            // File download is enabled: also queue whatever the image selector finds.
            var imageLinks = (page.Selectable.SelectList(ImageSelector)).GetValues();
            if (imageLinks == null || imageLinks.Count == 0)
            {
                return;
            }

            foreach (string imageLink in imageLinks)
            {
                bool excluded = false;
                if (_excludeTargetUrlPatterns != null)
                {
                    foreach (var excludePattern in _excludeTargetUrlPatterns)
                    {
                        if (excludePattern.IsMatch(imageLink))
                        {
                            excluded = true;
                            break;
                        }
                    }
                }

                if (!excluded)
                {
                    page.AddTargetRequest(new Request(imageLink, page.Request.Extras));
                }
            }
        }
Esempio n. 19
0
        /// <summary>
        /// Extracts the target links from a page using the configured region selectors and URL patterns.
        /// </summary>
        /// <remarks>
        /// For HTML pages the links are taken from the region matched by each selector; for JSON pages
        /// every URL matched by <c>RegexUtil.Url</c> is used and the region selector itself is not applied.
        /// Pages of any other content type contribute no links. When <c>site.DownloadFiles</c> is set, the
        /// values matched by <c>ImageSelector</c> are appended as well.
        /// </remarks>
        /// <param name="page">Page data.</param>
        /// <param name="site">Site information; it is assigned to every returned request.</param>
        /// <returns>The target requests; empty when no region selector is configured.</returns>
        protected override IEnumerable <Request> Extract(Page page, Site site)
        {
            if (_regionSelectorMapPatterns == null || _regionSelectorMapPatterns.Count == 0)
            {
                return(new Request[0]);
            }

            List <string> resultUrls = new List <string>();

            foreach (var targetUrlExtractor in _regionSelectorMapPatterns)
            {
                if (Equals(targetUrlExtractor.Key, Selectors.Default()))
                {
                    continue;
                }

                IEnumerable <string> links = null;
                if (page.ContentType == ContentType.Html)
                {
                    links = page.Selectable.SelectList(targetUrlExtractor.Key).Links().GetValues();
                }
                else if (page.ContentType == ContentType.Json)
                {
                    links = page.Selectable.SelectList(Selectors.Regex(RegexUtil.Url)).Links().GetValues();
                }

                // Unsupported content types (and selectors that yield nothing) leave links unset.
                if (links == null)
                {
                    continue;
                }

                // check: consider carefully whether the formatter should run before or after matching;
                // running it first is preferred. Formatting target URLs implies that the start URLs
                // should conform to the same rule.
                List <string> formattedLinks = new List <string>();
                foreach (string link in links)
                {
                    var newUrl = FormateUrl(link);
#if !NETSTANDARD
                    formattedLinks.Add(System.Web.HttpUtility.HtmlDecode(System.Web.HttpUtility.UrlDecode(newUrl)));
#else
                    formattedLinks.Add(WebUtility.HtmlDecode(WebUtility.UrlDecode(newUrl)));
#endif
                }

                // No patterns configured for this region: every link is accepted as-is.
                // Note that the exclusion patterns are not consulted on this path.
                if (targetUrlExtractor.Value == null || targetUrlExtractor.Value.Count == 0)
                {
                    resultUrls.AddRange(formattedLinks);
                    continue;
                }

                // A link is added once for every pattern it matches, unless it is excluded.
                foreach (var regex in targetUrlExtractor.Value)
                {
                    foreach (string link in formattedLinks)
                    {
                        if (regex.IsMatch(link) && !IsExcludedTargetUrl(link))
                        {
                            resultUrls.Add(link);
                        }
                    }
                }
            }

            if (site.DownloadFiles)
            {
                var imageLinks = (page.Selectable.SelectList(ImageSelector)).GetValues();

                // Enumerate once only: the sequence may be lazily evaluated, so a separate
                // Count() call would run the underlying query a second time.
                if (imageLinks != null)
                {
                    foreach (string link in imageLinks)
                    {
                        if (!IsExcludedTargetUrl(link))
                        {
                            resultUrls.Add(link);
                        }
                    }
                }
            }

            return(resultUrls.Select(t => new Request(t, page.Request.Extras)
            {
                Site = site
            }));
        }

        /// <summary>
        /// Tells whether a link matches at least one of the configured exclusion patterns.
        /// </summary>
        /// <param name="link">The link to test.</param>
        /// <returns>
        /// <c>true</c> when an exclusion pattern matches; <c>false</c> when none matches or
        /// no exclusion patterns are configured.
        /// </returns>
        private bool IsExcludedTargetUrl(string link)
        {
            if (ExcludeTargetUrlPatterns == null)
            {
                return(false);
            }

            foreach (var excludeRegex in ExcludeTargetUrlPatterns)
            {
                if (excludeRegex.IsMatch(link))
                {
                    return(true);
                }
            }

            return(false);
        }
Esempio n. 20
0
 /// <summary>
 /// Checks the regex selector against the shared HTML fixture: without a group argument the
 /// expected value is the whole match, and with group 2 it is the second capture group.
 /// </summary>
 public void Regex()
 {
     // xUnit's Assert.Equal takes (expected, actual); keeping that order makes failure messages
     // label the two values correctly.
     Assert.Equal("a href=\"xxx\"", Selectors.Regex("a href=\"(.*)\"").Select(_html));
     Assert.Equal("xxx", Selectors.Regex("(a href)=\"(.*)\"", 2).Select(_html));
 }
Esempio n. 21
0
        /// <summary>
        /// Processes a downloaded Baidu Tieba page according to its URL.
        /// </summary>
        /// <remarks>
        /// <list type="bullet">
        /// <item>Category index pages ("forumclass"): queues one request per category link.</item>
        /// <item>Category listing pages ("forumpark"): queues one request per listed forum and, when
        /// the request carries a "pageSize" property, the remaining pages of the listing.</item>
        /// <item>Forum home pages ("f?" with a "kw" parameter): builds a <c>TiebaGroup</c> and stores
        /// it on the page under the key "TiebaGroup".</item>
        /// </list>
        /// </remarks>
        /// <param name="page">The downloaded page; target requests and result items are added to it.</param>
        protected override void Handle(Page page)
        {
            // Category index page: collect every category link.
            if (page.Request.RequestUri.AbsoluteUri.Contains("tieba.baidu.com/f/index/forumclass"))
            {
                var listRequest        = new List <Request>();
                var totalClassElements = page.Selectable().SelectList(Selectors.XPath(".//ul[@class='item-list-ul clearfix']")).Nodes();
                foreach (var classElement in totalClassElements)
                {
                    IEnumerable <string> hrefs = classElement.SelectList(Selectors.XPath(".//li")).Nodes().Select(p => p.XPath(".//a/@href").GetValue());
                    foreach (var href in hrefs)
                    {
                        // Skip entries for which no href value was extracted; a request cannot be built from them.
                        if (string.IsNullOrWhiteSpace(href))
                        {
                            continue;
                        }

                        listRequest.Add(new Request(href, new Dictionary <string, dynamic>()
                        {
                            { "pageSize", 30 }
                        })
                        {
                            UserAgent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36"
                        });
                    }
                }
                // Queue all category links that were found.
                page.AddTargetRequests(listRequest);
            }
            List <TiebaGroup> results = new List <TiebaGroup>();

            // Category listing page: queue a request for every forum it lists.
            if (page.Request.RequestUri.AbsoluteUri.Contains("tieba.baidu.com/f/index/forumpark"))
            {
                var listRequest        = new List <Request>();
                var totalTiebaElements = page.Selectable().SelectList(Selectors.XPath(".//div[@id='ba_list']/div")).Nodes();
                foreach (var tiebaElement in totalTiebaElements)
                {
                    string baName = tiebaElement.Select(Selectors.XPath(".//p[@class='ba_name']")).GetValue();

                    // The keyword is the name without its last character (presumably a trailing
                    // suffix character - TODO confirm). A missing or one-character name would throw
                    // in Substring or yield an empty keyword, so such entries are skipped.
                    if (baName == null || baName.Length < 2)
                    {
                        continue;
                    }

                    listRequest.Add(new Request($"http://tieba.baidu.com/f?kw={baName.Substring(0, baName.Length - 1)}"));
                }
                page.AddTargetRequests(listRequest);

                // Listing pages are paginated: queue the pages after the current one, up to "pageSize".
                int pageIndex = Convert.ToInt32(page.Request.RequestUri.GetParameter("pn") ?? "1");
                if (page.Request.Properties.ContainsKey("pageSize"))
                {
                    int pageSize = page.Request.Properties["pageSize"];
                    for (int i = pageIndex + 1; i <= pageSize; i++)
                    {
                        // Move on to the following page.
                        page.AddTargetRequest(new Request(page.Request.RequestUri.SetParameter("pn", i.ToString()).AbsoluteUri));
                    }
                }
            }
            // Forum home page: extract the forum's details.
            else if (page.Request.RequestUri.AbsoluteUri.Contains("tieba.baidu.com/f?") && page.Request.RequestUri.IsGetParameter("kw"))
            {
                var select = page.Selectable();
                var tieba  = new TiebaGroup();
                tieba.Key = Regex.Match(select.GetValue(), @"(?<=(PageData.forum = {\s*'id': ))\d+").Value;
                string title = select.Select(Selectors.XPath("//title")).GetValue();
                tieba.Ba_Name  = Regex.Match(title, "[^>]*(?=(-百度贴吧))").Value;
                tieba.Ba_Desc  = Regex.Match(title, "(?<=(-百度贴吧--))[^<]*").Value;
                // The counters contain thousands separators, which are stripped before parsing.
                tieba.Ba_M_Num = Convert.ToInt32(select.Select(Selectors.Regex("(?<=card_menNum\">)[\\d,]+")).GetValue().Replace(",", ""));
                tieba.Ba_P_Num = Convert.ToInt32(select.Select(Selectors.Regex("(?<=card_infoNum\">)[\\d,]+")).GetValue().Replace(",", ""));
                tieba.Ba_Pic   = HttpUtility.UrlDecode(select.Select(Selectors.Regex("(?<=(wh_rate=null&amp;src=))[^\"]*")).GetValue());
                // The directory link carries the first- and second-level classification as query parameters.
                Uri uri = new Uri(HttpUtility.UrlDecode("http://tieba.baidu.com" + select.Select(Selectors.Regex(@"(?<=(<span>目录:</span>\s*<a rel=""noreferrer""\s*href=""))[^""]*")).GetValue()));
                tieba.FirstClassIfication = uri.GetParameter("fd");
                tieba.TwoClassIfication   = uri.GetParameter("sd");
                tieba.UpdateTime          = DateTime.Now;
                results.Add(tieba);
            }
            // Save the data objects under a custom key so that the pipeline can pick them up.
            if (results.Count > 0)
            {
                page.AddResultItem("TiebaGroup", results);
            }
        }