/// <summary>
/// Processes one downloaded blog page: queues every link that looks like a blog
/// post as a follow-up request, then stores the post's title, content, tags and
/// article link as fields on the page.
/// </summary>
/// <param name="page">The downloaded page to extract links and fields from.</param>
public void Process(Page page)
{
    const string blogPostUrlPattern = "http://my\\.oschina\\.net/flashsword/blog/\\d+";
    const string titleXPath = "//div[@class='BlogEntity']/div[@class='BlogTitle']/h1/text()";
    const string contentXPath = "//div[@class='BlogContent']/tidyText()";
    const string tagsXPath = "//div[@class='BlogTags']/a/text()";
    // NOTE(review): "@Class" is capitalised here while the other expressions use "@class" — confirm this is intended.
    const string articleXPath = "//*[@Class='Blog']/div[1]/div/h2/a";

    // Follow-up requests: every link on the page that matches the blog-post URL pattern.
    IList<String> blogLinks = page.GetHtml().Links().Regex(blogPostUrlPattern).GetAll();
    page.AddTargetRequests(blogLinks);

    string title = page.GetHtml().XPath(titleXPath).ToString();
    page.PutField("title", title);

    string content = page.GetHtml().XPath(contentXPath).ToString();
    page.PutField("content", content);

    // Tags are stored as the full list of matches rather than a single string.
    var tags = page.GetHtml().XPath(tagsXPath).GetAll();
    page.PutField("tags", tags);

    // NOTE(review): the field key is spelled "artical"; it is kept as-is because consumers may rely on it.
    string article = page.GetHtml().XPath(articleXPath).ToString();
    page.PutField("artical", article);
}
/// <summary>
/// Collects links from the page for every configured target-URL extractor and queues
/// the qualifying ones as follow-up requests.
/// </summary>
/// <remarks>
/// For each extractor the links are read from its region (or from the whole page when
/// the extractor has no region), passed through the extractor's formatters, URL-decoded
/// and then HTML-decoded. When the extractor has no patterns every link is queued;
/// otherwise only links matching at least one pattern are queued (a link is queued once
/// per matching pattern). An extractor that produces no link collection, or that has no
/// patterns, does not prevent the remaining extractors from being processed.
/// </remarks>
/// <param name="page">The page whose links are extracted and which receives the target requests.</param>
/// <param name="targetUrlExtractInfos">The extractor definitions to apply; when null nothing is done.</param>
private void ExtractLinks(Page page, List<Model.TargetUrlExtractor> targetUrlExtractInfos)
{
    if (targetUrlExtractInfos == null)
    {
        return;
    }

    foreach (var targetUrlExtractInfo in targetUrlExtractInfos)
    {
        var urlRegionSelector = targetUrlExtractInfo.Region;
        var formatters = targetUrlExtractInfo.Formatters;
        var urlPatterns = targetUrlExtractInfo.Patterns;

        var links = urlRegionSelector == null
            ? page.Selectable.Links().GetValues()
            : (page.Selectable.SelectList(urlRegionSelector)).Links().GetValues();

        if (links == null)
        {
            // Nothing found for this extractor; move on to the next one instead of
            // abandoning every extractor that follows.
            continue;
        }

        // check: formatters are applied before pattern matching (the original author's
        // preference). Formatting target URLs implies that start URLs are expected to
        // follow the same rule.
        bool hasFormatters = formatters != null && formatters.Count > 0;

        List<string> preparedLinks = new List<string>();
        foreach (string link in links)
        {
            var url = link;
            if (hasFormatters)
            {
                foreach (Formatter f in formatters)
                {
                    url = f.Formate(url);
                }
            }

            // URL-decode first, then HTML-decode, so both escaped forms are normalised.
#if !NET_CORE
            preparedLinks.Add(HttpUtility.HtmlDecode(HttpUtility.UrlDecode(url)));
#else
            preparedLinks.Add(WebUtility.HtmlDecode(WebUtility.UrlDecode(url)));
#endif
        }
        links = preparedLinks;

        if (urlPatterns == null || urlPatterns.Count == 0)
        {
            // No patterns configured for this extractor: every link is a target.
            page.AddTargetRequests(links);
            continue;
        }

        foreach (Regex targetUrlPattern in urlPatterns)
        {
            foreach (string link in links)
            {
                if (targetUrlPattern.IsMatch(link))
                {
                    page.AddTargetRequest(new Request(link, page.Request.NextDepth, page.Request.Extras));
                }
            }
        }
    }
}
/// <summary>
/// Runs every entity extractor against the page, stores each non-empty result under
/// the extractor's entity name, queues follow-up requests unless the page opts out,
/// and flags the page's result items as skipped when nothing was extracted.
/// </summary>
/// <param name="page">The page being processed.</param>
public void Process(Page page)
{
    foreach (IEntityExtractor extractor in EntityExtractorList)
    {
        // Kept dynamic so the AddResultItem call below is bound at run time, as before.
        dynamic extracted = extractor.Process(page);

        if (extracted == null)
        {
            continue;
        }

        // An enumerable result with no first element counts as "nothing extracted".
        if (extracted is IEnumerable && !((IEnumerable)extracted).GetEnumerator().MoveNext())
        {
            continue;
        }

        page.AddResultItem(extractor.EntityName, extracted);
    }

    if (!page.MissTargetUrls)
    {
        if (GetCustomizeTargetUrls != null)
        {
            // A custom link provider replaces the configured extractors entirely.
            page.AddTargetRequests(GetCustomizeTargetUrls(page));
        }
        else
        {
            ExtractLinks(page, TargetUrlExtractInfos);
        }
    }

    if (page.ResultItems.Results.Count == 0)
    {
        page.ResultItems.IsSkip = true;
    }
}
/// <summary>
/// Queues follow-up requests for the links found inside the given region of the page.
/// When no region selector is supplied no links are collected at all, so nothing is
/// queued (presumably to avoid flooding the crawler with every link on the page).
/// </summary>
/// <param name="page">The page whose links are extracted and which receives the target requests.</param>
/// <param name="urlRegionSelector">Selects the region to read links from; null yields an empty link list.</param>
/// <param name="urlPatterns">Patterns a link must match to be queued; when null or empty every collected link is queued.</param>
private void ExtractLinks(Page page, ISelector urlRegionSelector, IList<Regex> urlPatterns)
{
    var candidateLinks = urlRegionSelector != null
        ? page.GetHtml().SelectList(urlRegionSelector).Links().GetAll()
        : new List<string>();

    bool hasPatterns = urlPatterns != null && urlPatterns.Count != 0;
    if (!hasPatterns)
    {
        // No filtering requested: hand over everything that was collected.
        page.AddTargetRequests(candidateLinks);
        return;
    }

    // Pattern-major order: a link is queued once for each pattern it matches.
    foreach (Regex pattern in urlPatterns)
    {
        foreach (string candidate in candidateLinks)
        {
            if (!pattern.IsMatch(candidate))
            {
                continue;
            }

            page.AddTargetRequest(new Request(candidate, page.GetRequest().Extras));
        }
    }
}