/// <summary>
/// Builds the selector matching the configured expression type.
/// </summary>
/// <returns>
/// A CSS, regex, JSONPath or XPath selector for <c>expressionValue</c>. Any expression type
/// that is not CSS, regex or JSONPath is treated as XPath.
/// </returns>
/// <exception cref="System.FormatException">
/// The first regex parameter is present but is not a valid integer.
/// </exception>
private ISelector CompileSelector()
{
    if (expressionType == ExpressionType.Css)
    {
        // The optional first parameter is passed through as the second argument of Selectors.Css.
        if (expressionParams.Length >= 1)
        {
            return Selectors.Css(expressionValue, expressionParams[0]);
        }

        return Selectors.Css(expressionValue);
    }

    if (expressionType == ExpressionType.Regex)
    {
        if (expressionParams.Length < 1)
        {
            return Selectors.Regex(expressionValue);
        }

        // The optional first parameter is parsed as an integer and passed to Selectors.Regex.
        return Selectors.Regex(expressionValue, int.Parse(expressionParams[0]));
    }

    if (expressionType == ExpressionType.JsonPath)
    {
        return new JsonPathSelector(expressionValue);
    }

    // XPath is both the explicit XPath case and the fallback for everything else.
    return Selectors.XPath(expressionValue);
}
/// <summary>
/// Extracts the JSON array embedded in the page's "defjson" fragment, normalises placeholder
/// values and stores the deserialized table on the page under the "Result" key.
/// </summary>
/// <param name="page">Page whose content is searched and which receives the result item.</param>
protected override void Handle(Page page)
{
    // Use Selectable to query the page and build the data object we need.
    var element = page.Selectable.SelectList(Selectors.Regex(@"defjson: \{.*?\]\}")).GetValues().FirstOrDefault();
    if (string.IsNullOrEmpty(element))
    {
        return;
    }

    // Locate the start of the array once, with an ordinal (culture-independent) search.
    int arrayStart = element.IndexOf("[{", System.StringComparison.Ordinal);
    if (arrayStart < 0)
    {
        return;
    }

    // The regex match ends with "]}"; dropping the last character removes the closing brace
    // of the wrapping object so that only the array remains.
    element = element.Substring(arrayStart, element.Length - arrayStart - 1);

    // "-" is used as a placeholder: it becomes an empty NOTICEDATE and "0" everywhere else.
    // The NOTICEDATE replacement must run first, otherwise the generic one would turn it into "0".
    element = element.Replace("\"NOTICEDATE\":\"-\"", "\"NOTICEDATE\":\"\"");
    element = element.Replace("\"-\"", "\"0\"");

    var settings = new JsonSerializerSettings
    {
        NullValueHandling = NullValueHandling.Ignore,
        MissingMemberHandling = MissingMemberHandling.Ignore,
        DateParseHandling = DateParseHandling.None,
    };
    var results = JsonConvert.DeserializeObject<DataTable>(element, settings);

    // Save the data object under a custom key so that the pipeline can pick it up.
    page.AddResultItem("Result", results);
}
/// <summary>
/// Converts a selector description into the concrete selector that performs the query.
/// </summary>
/// <param name="selector">Selector description; may be null.</param>
/// <returns>The concrete selector, or null when <paramref name="selector"/> is null.</returns>
/// <exception cref="SpiderException">
/// The regex argument is not an integer, or the selector type is not supported.
/// </exception>
public static ISelector ToSelector(this Selector selector)
{
    if (selector == null)
    {
        return null;
    }

    string expression = selector.Expression;
    switch (selector.Type)
    {
        case SelectorType.Css:
        {
            NotNullExpression(selector);
            return Selectors.Css(expression);
        }
        case SelectorType.Enviroment:
        {
            // Unlike the other cases, no NotNullExpression check is applied here.
            return Selectors.Enviroment(expression);
        }
        case SelectorType.JsonPath:
        {
            NotNullExpression(selector);
            return Selectors.JsonPath(expression);
        }
        case SelectorType.Regex:
        {
            NotNullExpression(selector);
            if (string.IsNullOrEmpty(selector.Arguments))
            {
                return Selectors.Regex(expression);
            }

            // When an argument is supplied it must be an integer; it is passed on as the group.
            int groupIndex;
            if (int.TryParse(selector.Arguments, out groupIndex))
            {
                return Selectors.Regex(expression, groupIndex);
            }

            throw new SpiderException("Regex argument should be a number set to group: " + selector);
        }
        case SelectorType.XPath:
        {
            NotNullExpression(selector);
            return Selectors.XPath(expression);
        }
        default:
        {
            throw new SpiderException($"Selector {selector} unsupported");
        }
    }
}
/// <summary>
/// Checks each basic selector (CSS, CSS with attribute, XPath, regex, regex with group)
/// against the shared HTML fixture.
/// </summary>
public void TestEach()
{
    // Assert.AreEqual takes (expected, actual); the expected literal goes first so that
    // failure messages report the two values the right way round.
    Assert.AreEqual("<a href=\"xxx\">aabbcc</a>", Selectors.Css("div h1 a").Select(_html).OuterHtml);
    Assert.AreEqual("xxx", Selectors.Css("div h1 a", "href").Select(_html));
    Assert.AreEqual("aabbcc", Selectors.Css("div h1 a").Select(_html).InnerHtml);
    Assert.AreEqual("xxx", Selectors.XPath("//a/@href").Select(_html));
    Assert.AreEqual("xxx", Selectors.Regex("a href=\"(.*)\"").Select(_html));
    Assert.AreEqual("xxx", Selectors.Regex("(a href)=\"(.*)\"", 2).Select(_html));
}
/// <summary>
/// Checks the combined selectors: <c>And</c> chains a CSS selector into a regex selector,
/// <c>Or</c> combines a CSS selector with an XPath selector.
/// </summary>
public void TestCombo()
{
    // Assert.AreEqual takes (expected, actual); the expected literal goes first so that
    // failure messages report the two values the right way round.
    var value1 = Selectors.And(Selectors.Css("title"), Selectors.Regex("aa(bb)cc")).Select(_html2);
    Assert.AreEqual("bb", value1);

    var or = Selectors.Or(Selectors.Css("div h1 a", "innerHtml"), Selectors.XPath("//title"));
    Assert.AreEqual("aabbcc", or.Select(_html));
    Assert.AreEqual("aabbcc", or.Select(_html2));
}
/// <summary>
/// Only used for test. Looks up the URL patterns registered for a region.
/// </summary>
/// <param name="regionXpath">
/// XPath of the region; when null or whitespace, the URL-regex selector is used as the key.
/// </param>
/// <returns>The patterns registered for that selector, or null when none are registered.</returns>
internal List<Regex> GetTargetUrlPatterns(string regionXpath)
{
    ISelector selector = Selectors.Regex(RegexUtil.Url);
    if (!string.IsNullOrWhiteSpace(regionXpath))
    {
        selector = Selectors.XPath(regionXpath);
    }

    // Single lookup instead of ContainsKey followed by the indexer.
    List<Regex> patterns;
    return _regionSelectorMapPatterns.TryGetValue(selector, out patterns) ? patterns : null;
}
/// <summary>
/// Only used for test. Looks up the URL patterns registered for a region.
/// </summary>
/// <param name="regionXpath">
/// XPath of the region; when null or whitespace, the URL-regex selector is used as the key.
/// </param>
/// <returns>The patterns registered for that selector, or null when none are registered.</returns>
internal virtual List<Regex> GetTargetUrlPatterns(string regionXpath)
{
    ISelector selector = Selectors.Regex(RegexUtil.UrlRegex);
    if (!string.IsNullOrWhiteSpace(regionXpath))
    {
        selector = Selectors.XPath(regionXpath);
    }

    // Single lookup instead of ContainsKey followed by the indexer.
    List<Regex> patterns;
    return _targetUrlExtractors.TryGetValue(selector, out patterns) ? patterns : null;
}
/// <summary>
/// Registers URL patterns for a page region. Links found in that region count as target URLs
/// when they match one of the registered patterns.
/// </summary>
/// <param name="regionXpath">
/// XPath of the region to search. Null or empty selects the URL-regex selector instead;
/// a whitespace-only value is treated as ".".
/// </param>
/// <param name="patterns">Regular expressions; every entry must be non-null and non-blank.</param>
/// <exception cref="ArgumentNullException">
/// <paramref name="patterns"/> is null or empty, or contains a null or blank entry.
/// </exception>
public void AddTargetUrlExtractor(string regionXpath, params string[] patterns)
{
    // The single-string ArgumentNullException constructor takes the parameter NAME, so the
    // two-argument overload is used to keep the text as the actual message.
    if (patterns == null || patterns.Length == 0)
    {
        throw new ArgumentNullException(nameof(patterns), "Patterns should not be null or empty.");
    }

    if (patterns.Any(string.IsNullOrWhiteSpace))
    {
        throw new ArgumentNullException(nameof(patterns), "Pattern value should not be null or empty.");
    }

    var validPatterns = patterns.Select(p => p.Trim()).ToList();

    ISelector selector = Selectors.Regex(RegexUtil.UrlRegex);
    if (!string.IsNullOrEmpty(regionXpath))
    {
        string xpath = string.IsNullOrWhiteSpace(regionXpath.Trim()) ? "." : regionXpath.Trim();
        selector = Selectors.XPath(xpath);
    }

    List<Regex> realPatterns;
    if (!_targetUrlExtractors.TryGetValue(selector, out realPatterns))
    {
        realPatterns = new List<Regex>();
        _targetUrlExtractors.Add(selector, realPatterns);
    }

    // A null entry means every URL in the region is already a target, so there is nothing
    // more to register. This method never adds one itself, because the guards above ensure
    // at least one valid pattern; the former "no valid patterns" branch was unreachable.
    if (realPatterns.Contains(null))
    {
        return;
    }

    foreach (var pattern in validPatterns)
    {
        // Skip patterns already registered for this region (compared by pattern text).
        if (realPatterns.All(p => p.ToString() != pattern))
        {
            var regex = new Regex(pattern);
            realPatterns.Add(regex);
            _targetUrlPatterns.Add(regex);
        }
    }
}
/// <summary>
/// Converts a selector attribute into the concrete selector that performs the query.
/// </summary>
/// <param name="selector">Selector attribute; may be null.</param>
/// <returns>The concrete selector, or null when <paramref name="selector"/> is null.</returns>
/// <exception cref="ArgumentException">The regex argument is present but is not an integer.</exception>
/// <exception cref="NotSupportedException">The selector type is not one of CSS, JSONPath, regex or XPath.</exception>
public static ISelector ToSelector(this Attribute.Selector selector)
{
    if (selector == null)
    {
        return null;
    }

    string expression = selector.Expression;
    var selectorType = selector.Type;

    if (selectorType == SelectorType.Css)
    {
        NotNullExpression(selector);
        return Selectors.Css(expression);
    }

    if (selectorType == SelectorType.JsonPath)
    {
        NotNullExpression(selector);
        return Selectors.JsonPath(expression);
    }

    if (selectorType == SelectorType.Regex)
    {
        NotNullExpression(selector);
        if (string.IsNullOrEmpty(selector.Arguments))
        {
            return Selectors.Regex(expression);
        }

        // When an argument is supplied it must be an integer; it is passed on as the group.
        if (!int.TryParse(selector.Arguments, out var groupIndex))
        {
            throw new ArgumentException($"Regex argument should be a number set to group: {selector}");
        }

        return Selectors.Regex(expression, groupIndex);
    }

    if (selectorType == SelectorType.XPath)
    {
        NotNullExpression(selector);
        return Selectors.XPath(expression);
    }

    throw new NotSupportedException($"{selector} unsupported");
}
/// <summary>
/// Creates the concrete selector described by <paramref name="selector"/>.
/// </summary>
/// <param name="selector">Selector description; may be null.</param>
/// <returns>
/// The concrete selector, or null when the description is null or has no expression.
/// </returns>
/// <exception cref="SpiderException">
/// The regex argument is not an integer, or the selector type is not supported.
/// </exception>
public static ISelector Parse(BaseSelector selector)
{
    if (selector == null || string.IsNullOrEmpty(selector.Expression))
    {
        return null;
    }

    var expression = selector.Expression;
    switch (selector.Type)
    {
        case SelectorType.Css:
            return Selectors.Css(expression);

        case SelectorType.Enviroment:
            return Selectors.Enviroment(expression);

        case SelectorType.JsonPath:
            return Selectors.JsonPath(expression);

        case SelectorType.Regex:
            if (string.IsNullOrEmpty(selector.Argument))
            {
                return Selectors.Regex(expression);
            }

            // When an argument is supplied it must be an integer; it is passed on as the group.
            int groupIndex;
            if (int.TryParse(selector.Argument, out groupIndex))
            {
                return Selectors.Regex(expression, groupIndex);
            }

            throw new SpiderException("Regex argument should be a number set to group: " + selector);

        case SelectorType.XPath:
            return Selectors.XPath(expression);

        default:
            throw new SpiderException("Not support selector: " + selector);
    }
}
/// <summary>
/// Converts a selector description into the concrete selector that performs the query.
/// </summary>
/// <param name="selector">Selector description; may be null.</param>
/// <returns>The concrete selector, or null when <paramref name="selector"/> is null.</returns>
/// <exception cref="ArgumentException">
/// Regex arguments are present but are not of the form "options,replacement", or the
/// options part is not a valid <see cref="RegexOptions"/> value.
/// </exception>
/// <exception cref="NotSupportedException">The selector type is not one of CSS, JSONPath, regex or XPath.</exception>
public static ISelector ToSelector(this Selector selector)
{
    if (selector == null)
    {
        return null;
    }

    var expression = selector.Expression;
    switch (selector.Type)
    {
        case SelectorType.Css:
        {
            NotNullExpression(selector);
            return Selectors.Css(expression);
        }
        case SelectorType.JsonPath:
        {
            NotNullExpression(selector);
            return Selectors.JsonPath(expression);
        }
        case SelectorType.Regex:
        {
            NotNullExpression(selector);
            if (string.IsNullOrEmpty(selector.Arguments))
            {
                return Selectors.Regex(expression);
            }

            // Arguments are "<RegexOptions>,<replacement>". Both parts are required; validating
            // the count here replaces the IndexOutOfRangeException previously raised when
            // only one part (or only separators) was supplied.
            var arguments = selector.Arguments.Split(new[] { ',' }, StringSplitOptions.RemoveEmptyEntries);
            if (arguments.Length < 2)
            {
                throw new ArgumentException($"Regex arguments should be \"<RegexOptions>,<replacement>\": {selector}");
            }

            RegexOptions options;
            if (!Enum.TryParse(arguments[0], out options))
            {
                throw new ArgumentException($"Regex options '{arguments[0]}' is not a valid RegexOptions value: {selector}");
            }

            var replacement = arguments[1];
            return Selectors.Regex(expression, options, replacement);
        }
        case SelectorType.XPath:
        {
            NotNullExpression(selector);
            return Selectors.XPath(expression);
        }
        default:
        {
            throw new NotSupportedException($"{selector} unsupported");
        }
    }
}
/// <summary>
/// Collects every list item on the page that links to a quote.eastmoney.com HTML page and
/// stores the raw matches on the page under the "Result" key.
/// </summary>
/// <param name="page">Page whose content is searched and which receives the result item.</param>
protected override void Handle(Page page)
{
    // Use Selectable to query the page; each match is a complete <li> element.
    var quoteItemSelector = Selectors.Regex("<li><a target=\"_blank\" href=\"http://quote.eastmoney.com/.*?.html\">.*?</a></li>");
    var matches = page.Selectable.SelectList(quoteItemSelector).GetValues();

    var results = new List<string>();
    foreach (var match in matches)
    {
        results.Add(match);
    }

    // Save the data object under a custom key so that the pipeline can pick it up.
    page.AddResultItem("Result", results);
}
/// <summary>
/// Parses a Tieba user home page. The "页面类型" request property selects the layout:
/// "手机" (mobile) yields the counters, key and nickname and queues the desktop page;
/// "电脑" (desktop) yields the user name, account age and posting count.
/// Parsed users are stored on the page under the "TiebaUser" key.
/// </summary>
/// <param name="page">Downloaded page together with the request that produced it.</param>
protected override void Handle(Page page)
{
    var results = new List<TiebaUser>();

    if (page.Request.Url.Contains("https://tieba.baidu.com/home/main")
        && page.Request.Properties.ContainsKey("页面类型"))
    {
        string htmlType = page.Request.Properties["页面类型"];
        if (htmlType == "手机")
        {
            TiebaUser tiebaUser = new TiebaUser();

            // NOTE(review): the original XPath (".div//tab tab_holo home_tab j_home_tab']") was not
            // valid XPath; this is the most likely intended form - confirm against the live markup.
            var totalUserInfoElements = page.Selectable().SelectList(Selectors.XPath(".//div[@class='tab tab_holo home_tab j_home_tab']")).Nodes();
            List<int> info = totalUserInfoElements
                .Select(p => p.Select(Selectors.XPath(".//span[@class='home_tab_item_num']")).GetValue())
                .Select(p => Convert.ToInt32(p))
                .ToList();

            // Only map the counters when all four are present, instead of failing with an
            // ArgumentOutOfRangeException when the layout differs.
            if (info.Count >= 4)
            {
                tiebaUser.Post_Num = info[0];
                tiebaUser.PostBar_Num = info[1];
                tiebaUser.Follow_Num = info[2];
                tiebaUser.Fans_Num = info[3];
            }

            tiebaUser.Key = page.Selectable().Select(Selectors.Regex(@"(?<=(/i/\?portrait=))[0-9a-zA-Z]+")).GetValue();
            // Attribute tests need the '@' axis; "[class='…']" would look for a child element named "class".
            tiebaUser.U_Nick = page.Selectable().Select(Selectors.XPath(".//a[@class='home_card_uname_link']")).GetValue(ValueOption.InnerText);
            results.Add(tiebaUser);

            // Queue the desktop version of the same profile for the remaining fields.
            page.AddTargetRequest(new Request($"http://tieba.baidu.com/home/main/?un={tiebaUser.U_Nick}", new Dictionary<string, object>() { { "页面类型", "电脑" } }));
        }
        else if (htmlType == "电脑")
        {
            TiebaUser tiebaUser = new TiebaUser();
            tiebaUser.U_Nick = page.Selectable().Select(Selectors.XPath(".//span[@class='userinfo_username']")).GetValue(ValueOption.InnerText);

            // Regex.Match rejects null input, so fall back to an empty title.
            string userTitle = page.Selectable().Select(Selectors.XPath(".//span[@class='user_name']")).GetValue(ValueOption.InnerText) ?? string.Empty;
            tiebaUser.U_Name = Regex.Match(userTitle, @"(?<=用户名:)[^"" <]+").Value;
            tiebaUser.U_BaAge = Regex.Match(userTitle, @"(?<=吧龄:)\d+\.\d+").Value;

            // An unmatched regex yields "", which Convert.ToInt32 rejects; TryParse leaves 0 instead.
            int postingNum;
            int.TryParse(Regex.Match(userTitle, @"(?<=发帖:)\d+").Value, out postingNum);
            tiebaUser.Posting_Num = postingNum;

            results.Add(tiebaUser);
        }
    }

    // Publish the parsed users so that the storage pipeline can pick them up.
    if (results.Count > 0)
    {
        page.AddResultItem("TiebaUser", results);
    }
}
/// <summary>
/// Creates the concrete selector described by <paramref name="selector"/>.
/// </summary>
/// <param name="selector">Selector description; may be null.</param>
/// <returns>
/// The concrete selector, or null when the description is null or has no expression.
/// </returns>
/// <exception cref="SpiderException">The selector type is not supported.</exception>
public static ISelector Parse(Selector selector)
{
    if (selector == null || string.IsNullOrEmpty(selector.Expression))
    {
        return null;
    }

    string expression = selector.Expression;
    var selectorType = selector.Type;

    if (selectorType == SelectorType.Css)
    {
        return Selectors.Css(expression);
    }

    if (selectorType == SelectorType.Enviroment)
    {
        return Selectors.Enviroment(expression);
    }

    if (selectorType == SelectorType.JsonPath)
    {
        return Selectors.JsonPath(expression);
    }

    if (selectorType == SelectorType.Regex)
    {
        return Selectors.Regex(expression);
    }

    if (selectorType == SelectorType.XPath)
    {
        return Selectors.XPath(expression);
    }

    throw new SpiderException("Not support selector: " + selector);
}
/// <summary>
/// Creates the parser: builds the model for <typeparamref name="T"/>, registers one follow-request
/// querier per follow-selector expression, and adds one required validator per distinct URL pattern.
/// </summary>
public DataParser()
{
    Model = new Model<T>();
    var patterns = new HashSet<string>();

    if (Model.FollowRequestSelectors != null)
    {
        foreach (var followSelector in Model.FollowRequestSelectors)
        {
            var selectorType = followSelector.SelectorType;

            if (selectorType == SelectorType.Css)
            {
                foreach (var expression in followSelector.Expressions)
                {
                    AddFollowRequestQuerier(Selectors.Css(expression));
                }
            }
            else if (selectorType == SelectorType.Regex)
            {
                foreach (var expression in followSelector.Expressions)
                {
                    AddFollowRequestQuerier(Selectors.Regex(expression));
                }
            }
            else if (selectorType == SelectorType.XPath)
            {
                foreach (var expression in followSelector.Expressions)
                {
                    AddFollowRequestQuerier(Selectors.XPath(expression));
                }
            }
            else if (selectorType == SelectorType.JsonPath)
            {
                foreach (var expression in followSelector.Expressions)
                {
                    AddFollowRequestQuerier(Selectors.JsonPath(expression));
                }
            }
            else if (selectorType == SelectorType.Environment)
            {
                // Environment selectors cannot produce follow requests; only a warning is logged.
                Logger.LogWarning("SelectorType of follow selector is not supported");
            }

            // Patterns are collected for every selector type; the set removes duplicates.
            foreach (var urlPattern in followSelector.Patterns)
            {
                patterns.Add(urlPattern);
            }
        }
    }

    foreach (var pattern in patterns)
    {
        AddRequiredValidator(request => Regex.IsMatch(request.RequestUri.ToString(), pattern));
    }
}
/// <summary>
/// Checks the regex selector: without a replacement the whole match is returned; with the
/// "$2" replacement the second capture group is returned.
/// </summary>
public void Regex()
{
    var wholeMatch = Selectors.Regex("a href=\"(.*)\"").Select(html).Value;
    Assert.Equal("a href=\"xxx\"", wholeMatch);

    var secondGroup = Selectors.Regex("(a href)=\"(.*)\"", RegexOptions.None, "$2").Select(html).Value;
    Assert.Equal("xxx", secondGroup);
}
/// <summary>
/// Extracts the target links from a response.
/// </summary>
/// <param name="response">Response of the request whose content is searched for links.</param>
/// <returns>
/// One request per extracted link. Each new request carries the site of the originating request
/// and a copy of its properties, minus the entries keyed by <c>Env.UrlPropertyKey</c> and
/// <c>Env.TargetUrlPropertyKey</c>. An empty array is returned when no region selectors are registered.
/// </returns>
protected override IEnumerable<Request> Extract(Response response)
{
    if (_regionSelectorMapPatterns == null || _regionSelectorMapPatterns.Count == 0)
    {
        return new Request[0];
    }

    var site = response.Request.Site;
    List<string> resultUrls = new List<string>();

    foreach (var targetUrlExtractor in _regionSelectorMapPatterns)
    {
        if (Equals(targetUrlExtractor.Key, Selectors.Default()))
        {
            continue;
        }

        // JSON content has no regions to query, so every URL-looking string is taken instead.
        var linkSelector = response.ContentType == ContentType.Json
            ? Selectors.Regex(RegexUtil.Url)
            : targetUrlExtractor.Key;
        var rawLinks = new List<string>(response.Selectable().SelectList(linkSelector).Links().GetValues());
        if (rawLinks.Count == 0)
        {
            continue;
        }

        List<string> requests = new List<string>();
        foreach (string rawLink in rawLinks)
        {
#if !NETSTANDARD
            requests.Add(System.Web.HttpUtility.HtmlDecode(System.Web.HttpUtility.UrlDecode(rawLink)));
#else
            requests.Add(System.Net.WebUtility.HtmlDecode(System.Net.WebUtility.UrlDecode(rawLink)));
#endif
        }

        // No patterns registered for this region: every link in it is a target.
        if (targetUrlExtractor.Value == null || targetUrlExtractor.Value.Count == 0)
        {
            resultUrls.AddRange(requests);
            continue;
        }

        foreach (var regex in targetUrlExtractor.Value)
        {
            foreach (string link in requests)
            {
                if (!regex.IsMatch(link))
                {
                    continue;
                }

                bool isExcluded = ExcludeTargetUrlPatterns != null
                                  && ExcludeTargetUrlPatterns.Any(excludeRegex => excludeRegex.IsMatch(link));
                if (!isExcluded)
                {
                    resultUrls.Add(link);
                }
            }
        }
    }

    // Properties inherited by the new requests: everything from the originating request except
    // its URL-related entries.
    var properties = new Dictionary<string, dynamic>();
    foreach (var kv in response.Request.Properties)
    {
        if (kv.Key != Env.UrlPropertyKey && kv.Key != Env.TargetUrlPropertyKey)
        {
            properties.Add(kv.Key, kv.Value);
        }
    }

    // The filtered dictionary was previously built but never used (the unfiltered properties
    // were passed instead). Each request gets its own copy so they do not share one instance.
    return resultUrls.Select(url => new Request(url, new Dictionary<string, dynamic>(properties)) { Site = site });
}
/// <summary>
/// Extracts target requests from the page. When no extractors are registered nothing is added;
/// otherwise too many URLs would be returned.
/// </summary>
/// <param name="page">Page to search; matching links are added to it as target requests.</param>
/// <exception cref="Exception">The page content type is neither HTML nor JSON.</exception>
protected virtual void ExtractUrls(Page page)
{
    if (_targetUrlExtractors == null || _targetUrlExtractors.Count == 0)
    {
        return;
    }

    foreach (var extractor in _targetUrlExtractors)
    {
        var regionSelector = extractor.Key;
        var urlPatterns = extractor.Value;

        if (Equals(regionSelector, Selectors.Default()))
        {
            continue;
        }

        List<string> rawLinks;
        if (page.ContentType == ContentType.Html)
        {
            rawLinks = page.Selectable.SelectList(regionSelector).Links().GetValues();
        }
        else if (page.ContentType == ContentType.Json)
        {
            // JSON content is searched with the URL regex instead of the region selector.
            rawLinks = page.Selectable.SelectList(Selectors.Regex(RegexUtil.UrlRegex)).Links().GetValues();
        }
        else
        {
            throw new Exception("page.ContentType is not match!");
        }

        if (rawLinks == null)
        {
            continue;
        }

        // Links are formatted before pattern matching, so the patterns are applied to the
        // formatted and decoded form of each URL.
        var links = new List<string>();
        foreach (string rawLink in rawLinks)
        {
            var url = FormateUrl(rawLink);
#if !NET_CORE
            links.Add(HttpUtility.HtmlDecode(HttpUtility.UrlDecode(url)));
#else
            links.Add(WebUtility.HtmlDecode(WebUtility.UrlDecode(url)));
#endif
        }

        // No patterns registered for this region: every link in it is a target.
        if (urlPatterns == null || urlPatterns.Count == 0)
        {
            page.AddTargetRequests(links);
            continue;
        }

        foreach (var regex in urlPatterns)
        {
            foreach (string link in links)
            {
                if (regex.IsMatch(link) && !IsExcludedTargetUrl(link))
                {
                    page.AddTargetRequest(new Request(link, page.Request.Extras));
                }
            }
        }
    }

    if (!Site.DownloadFiles)
    {
        return;
    }

    var imageLinks = page.Selectable.SelectList(ImageSelector).GetValues();
    if (imageLinks == null || imageLinks.Count == 0)
    {
        return;
    }

    foreach (string imageLink in imageLinks)
    {
        if (!IsExcludedTargetUrl(imageLink))
        {
            page.AddTargetRequest(new Request(imageLink, page.Request.Extras));
        }
    }
}

/// <summary>
/// Tells whether a link matches any of the exclusion patterns.
/// </summary>
/// <param name="link">Link to test.</param>
/// <returns>True when at least one exclusion pattern matches; false when none match or none are configured.</returns>
private bool IsExcludedTargetUrl(string link)
{
    if (_excludeTargetUrlPatterns == null)
    {
        return false;
    }

    foreach (var excludeRegex in _excludeTargetUrlPatterns)
    {
        if (excludeRegex.IsMatch(link))
        {
            return true;
        }
    }

    return false;
}
/// <summary>
/// Extracts the target links from a page.
/// </summary>
/// <param name="page">Page data to search for links.</param>
/// <param name="site">Site information; assigned to every returned request.</param>
/// <returns>
/// One request per extracted link, carrying the extras of the page's request. An empty array
/// is returned when no region selectors are registered.
/// </returns>
protected override IEnumerable<Request> Extract(Page page, Site site)
{
    if (_regionSelectorMapPatterns == null || _regionSelectorMapPatterns.Count == 0)
    {
        return new Request[0];
    }

    List<string> resultUrls = new List<string>();
    foreach (var targetUrlExtractor in _regionSelectorMapPatterns)
    {
        if (Equals(targetUrlExtractor.Key, Selectors.Default()))
        {
            continue;
        }

        // Other content types leave rawLinks null and are skipped below.
        IEnumerable<string> rawLinks = null;
        if (page.ContentType == ContentType.Html)
        {
            rawLinks = page.Selectable.SelectList(targetUrlExtractor.Key).Links().GetValues();
        }
        else if (page.ContentType == ContentType.Json)
        {
            // JSON content is searched with the URL regex instead of the region selector.
            rawLinks = page.Selectable.SelectList(Selectors.Regex(RegexUtil.Url)).Links().GetValues();
        }

        if (rawLinks == null)
        {
            continue;
        }

        // Links are formatted before pattern matching, so the patterns are applied to the
        // formatted and decoded form of each URL.
        List<string> links = new List<string>();
        foreach (string rawLink in rawLinks)
        {
            var newUrl = FormateUrl(rawLink);
#if !NETSTANDARD
            links.Add(System.Web.HttpUtility.HtmlDecode(System.Web.HttpUtility.UrlDecode(newUrl)));
#else
            links.Add(WebUtility.HtmlDecode(WebUtility.UrlDecode(newUrl)));
#endif
        }

        // No patterns registered for this region: every link in it is a target.
        if (targetUrlExtractor.Value == null || targetUrlExtractor.Value.Count == 0)
        {
            resultUrls.AddRange(links);
            continue;
        }

        foreach (var regex in targetUrlExtractor.Value)
        {
            foreach (string link in links)
            {
                if (regex.IsMatch(link) && !IsExcludedUrl(link))
                {
                    resultUrls.Add(link);
                }
            }
        }
    }

    if (site.DownloadFiles)
    {
        // Iterating directly replaces the former "Count() > 0" check, which walked the
        // sequence once just to count it before the loop walked it again.
        var imageLinks = page.Selectable.SelectList(ImageSelector).GetValues();
        if (imageLinks != null)
        {
            foreach (string imageLink in imageLinks)
            {
                if (!IsExcludedUrl(imageLink))
                {
                    resultUrls.Add(imageLink);
                }
            }
        }
    }

    return resultUrls.Select(t => new Request(t, page.Request.Extras) { Site = site });
}

/// <summary>
/// Tells whether a link matches any of the exclusion patterns.
/// </summary>
/// <param name="link">Link to test.</param>
/// <returns>True when at least one exclusion pattern matches; false when none match or none are configured.</returns>
private bool IsExcludedUrl(string link)
{
    if (ExcludeTargetUrlPatterns == null)
    {
        return false;
    }

    foreach (var excludeRegex in ExcludeTargetUrlPatterns)
    {
        if (excludeRegex.IsMatch(link))
        {
            return true;
        }
    }

    return false;
}
/// <summary>
/// Checks the regex selector: without a group index the whole match is returned; with group
/// index 2 the second capture group is returned.
/// </summary>
public void Regex()
{
    // Assert.Equal takes (expected, actual); the expected literal goes first so that failure
    // messages report the two values the right way round.
    Assert.Equal("a href=\"xxx\"", Selectors.Regex("a href=\"(.*)\"").Select(_html));
    Assert.Equal("xxx", Selectors.Regex("(a href)=\"(.*)\"", 2).Select(_html));
}
/// <summary>
/// Handles three kinds of Tieba pages, selected by the request URL:
/// the category index (queues every category link), a category listing (queues each bar's home
/// page plus the following listing pages), and a bar home page (parses a <c>TiebaGroup</c>).
/// Parsed groups are stored on the page under the "TiebaGroup" key.
/// </summary>
/// <param name="page">Downloaded page together with the request that produced it.</param>
protected override void Handle(Page page)
{
    // Category index page: collect every category link.
    if (page.Request.RequestUri.AbsoluteUri.Contains("tieba.baidu.com/f/index/forumclass"))
    {
        var listRequest = new List<Request>();
        var totalClassElements = page.Selectable().SelectList(Selectors.XPath(".//ul[@class='item-list-ul clearfix']")).Nodes();
        foreach (var classElement in totalClassElements)
        {
            IEnumerable<string> hrefs = classElement.SelectList(Selectors.XPath(".//li")).Nodes().Select(p => p.XPath(".//a/@href").GetValue());
            foreach (var href in hrefs)
            {
                // "pageSize" is read back on the listing page as the last page number to queue.
                listRequest.Add(new Request(href, new Dictionary<string, dynamic>() { { "pageSize", 30 } }) { UserAgent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36" });
            }
        }

        // Queue all category links that were found.
        page.AddTargetRequests(listRequest);
    }

    List<TiebaGroup> results = new List<TiebaGroup>();

    // Category listing page: queue the home page of every bar listed.
    if (page.Request.RequestUri.AbsoluteUri.Contains("tieba.baidu.com/f/index/forumpark"))
    {
        var listRequest = new List<Request>();
        var totalTiebaElements = page.Selectable().SelectList(Selectors.XPath(".//div[@id='ba_list']/div")).Nodes();
        foreach (var tiebaElement in totalTiebaElements)
        {
            string baName = tiebaElement.Select(Selectors.XPath(".//p[@class='ba_name']")).GetValue();

            // Skip entries without a name; Substring below would throw on a null or empty string.
            if (string.IsNullOrEmpty(baName))
            {
                continue;
            }

            // The last character of the displayed name is dropped to build the keyword
            // (presumably the trailing "吧" suffix - confirm against the live markup).
            listRequest.Add(new Request($"http://tieba.baidu.com/f?kw={baName.Substring(0, baName.Length - 1)}"));
        }

        page.AddTargetRequests(listRequest);

        // Listing pages are paginated: queue the pages after the current one. The queued
        // requests carry no "pageSize" property, so they do not queue further pages themselves.
        int pageIndex = Convert.ToInt32(page.Request.RequestUri.GetParameter("pn") ?? "1");
        if (page.Request.Properties.ContainsKey("pageSize"))
        {
            int pageSize = page.Request.Properties["pageSize"];
            for (int i = pageIndex + 1; i <= pageSize; i++)
            {
                page.AddTargetRequest(new Request(page.Request.RequestUri.SetParameter("pn", i.ToString()).AbsoluteUri));
            }
        }
    }
    // Bar home page: parse the bar's details.
    else if (page.Request.RequestUri.AbsoluteUri.Contains("tieba.baidu.com/f?") && page.Request.RequestUri.IsGetParameter("kw"))
    {
        var select = page.Selectable();
        var tieba = new TiebaGroup();
        tieba.Key = Regex.Match(select.GetValue(), @"(?<=(PageData.forum = {\s*'id': ))\d+").Value;

        // Name and description are both taken from the <title> text, split at "-百度贴吧".
        string title = select.Select(Selectors.XPath("//title")).GetValue();
        tieba.Ba_Name = Regex.Match(title, "[^>]*(?=(-百度贴吧))").Value;
        tieba.Ba_Desc = Regex.Match(title, "(?<=(-百度贴吧--))[^<]*").Value;

        // Counters are rendered with thousands separators, which are removed before conversion.
        tieba.Ba_M_Num = Convert.ToInt32(select.Select(Selectors.Regex("(?<=card_menNum\">)[\\d,]+")).GetValue().Replace(",", ""));
        tieba.Ba_P_Num = Convert.ToInt32(select.Select(Selectors.Regex("(?<=card_infoNum\">)[\\d,]+")).GetValue().Replace(",", ""));
        tieba.Ba_Pic = HttpUtility.UrlDecode(select.Select(Selectors.Regex("(?<=(wh_rate=null&src=))[^\"]*")).GetValue());

        // The directory link carries the first- and second-level classification as "fd" and "sd".
        Uri uri = new Uri(HttpUtility.UrlDecode("http://tieba.baidu.com" + select.Select(Selectors.Regex(@"(?<=(<span>目录:</span>\s*<a rel=""noreferrer""\s*href=""))[^""]*")).GetValue()));
        tieba.FirstClassIfication = uri.GetParameter("fd");
        tieba.TwoClassIfication = uri.GetParameter("sd");
        tieba.UpdateTime = DateTime.Now;
        results.Add(tieba);
    }

    // Save the data object under a custom key so that the pipeline can pick it up.
    if (results.Count > 0)
    {
        page.AddResultItem("TiebaGroup", results);
    }
}