/// <summary>
/// Processes a downloaded page: queues the blog-post links found on it and
/// stores the extracted fields ("title", "content", "tags", "artical") on the page.
/// </summary>
/// <param name="page">The page to extract links and fields from.</param>
public void Process(Page page)
{
    // Collect every link on the page that matches the blog-post URL pattern
    // and hand the matches back to the page as follow-up requests.
    IList<String> blogPostLinks = page.GetHtml()
        .Links()
        .Regex("http://my\\.oschina\\.net/flashsword/blog/\\d+")
        .GetAll();
    page.AddTargetRequests(blogPostLinks);

    // Text of the <h1> inside the BlogTitle div of the BlogEntity div.
    var title = page.GetHtml()
        .XPath("//div[@class='BlogEntity']/div[@class='BlogTitle']/h1/text()")
        .ToString();
    page.PutField("title", title);

    // Result of the tidyText() XPath function applied to the BlogContent div.
    var content = page.GetHtml()
        .XPath("//div[@class='BlogContent']/tidyText()")
        .ToString();
    page.PutField("content", content);

    // All anchor texts under the BlogTags div (stored as a collection).
    var tags = page.GetHtml()
        .XPath("//div[@class='BlogTags']/a/text()")
        .GetAll();
    page.PutField("tags", tags);

    // NOTE(review): the key "artical" and the attribute name "@Class" are kept
    // exactly as written; confirm with downstream consumers before renaming.
    var article = page.GetHtml()
        .XPath("//*[@Class='Blog']/div[1]/div/h2/a")
        .ToString();
    page.PutField("artical", article);
}
/// <summary>
/// Runs every registered page-model extractor against the page. For each
/// extractor, links are extracted for its help-URL and target-URL settings,
/// then the extracted model (if any) is post-processed and stored on the page
/// under the model type's full name. If nothing was stored at all, the page's
/// result items are flagged as skipped.
/// </summary>
/// <param name="page">The page to run the extractors against.</param>
public void Process(Page page)
{
    foreach (PageModelExtractor extractor in _pageModelExtractorList)
    {
        // Help URLs first, then target URLs (same order as the extractor exposes them).
        var helpRegion = extractor.GetHelpUrlRegionSelector();
        var helpPatterns = extractor.GetHelpUrlPatterns();
        ExtractLinks(page, helpRegion, helpPatterns);

        var targetRegion = extractor.GetTargetUrlRegionSelector();
        var targetPatterns = extractor.GetTargetUrlPatterns();
        ExtractLinks(page, targetRegion, targetPatterns);

        object model = extractor.Process(page);

        // Nothing extracted by this extractor: move on to the next one.
        if (model == null)
        {
            continue;
        }

        // An extractor may return a list of models; an empty list counts as "nothing".
        IList modelList = model as IList;
        if (modelList != null && modelList.Count == 0)
        {
            continue;
        }

        PostProcessPageModel(model);
        page.PutField(extractor.GetModelType().FullName, model);
    }

    // No extractor produced a result, so mark the result items as skipped.
    if (page.GetResultItems().GetAll().Count == 0)
    {
        page.GetResultItems().IsSkip = true;
    }
}