/// <summary>
/// Runs every rule in <c>_extractRules</c> against the page's HTML and stores
/// each extracted value in the page's result items under the rule's field name.
/// A rule flagged <c>IsNotNull</c> that yields nothing marks the page as skipped
/// instead of storing a value; remaining rules are still evaluated.
/// </summary>
/// <param name="page">The page whose HTML is queried and whose result items are filled.</param>
public void Process(Page page)
{
    foreach (ExtractRule rule in _extractRules)
    {
        if (!rule.IsMulti)
        {
            // Single-value rule: a null selection counts as "nothing found".
            string single = page.GetHtml().SelectDocument(rule.Selector);
            if (!rule.IsNotNull || single != null)
            {
                page.GetResultItems().Put(rule.FieldName, single);
                continue;
            }

            page.SetSkip(true);
            continue;
        }

        // Multi-value rule: an empty list counts as "nothing found".
        IList<string> many = page.GetHtml().SelectDocumentForList(rule.Selector);
        if (!rule.IsNotNull || many.Count != 0)
        {
            page.GetResultItems().Put(rule.FieldName, many);
            continue;
        }

        page.SetSkip(true);
    }
}
		/// <summary>
		/// Queues every link on the page that matches the blog-post URL pattern as a
		/// target request, then stores the values selected by four XPath queries
		/// under the fields "title", "content", "tags" and "artical".
		/// </summary>
		/// <param name="page">The page to read links and field values from.</param>
		public void Process(Page page)
		{
			var html = page.GetHtml();

			// Follow-up requests: only links matching the blog-post URL pattern.
			IList<String> blogLinks = html.Links().Regex("http://my\\.oschina\\.net/flashsword/blog/\\d+").GetAll();
			page.AddTargetRequests(blogLinks);

			var title = html.XPath("//div[@class='BlogEntity']/div[@class='BlogTitle']/h1/text()").ToString();
			page.PutField("title", title);

			var content = html.XPath("//div[@class='BlogContent']/tidyText()").ToString();
			page.PutField("content", content);

			// "tags" stores every match, not just the first one.
			var tags = html.XPath("//div[@class='BlogTags']/a/text()").GetAll();
			page.PutField("tags", tags);

			var artical = html.XPath("//*[@Class='Blog']/div[1]/div/h2/a").ToString();
			page.PutField("artical", artical);
		}
Esempio n. 3
0
        /// <summary>
        /// Collects the links inside the region chosen by <paramref name="urlRegionSelector"/>
        /// and queues them on the page as target requests.
        /// When no region selector is supplied, no links are collected at all.
        /// NOTE(review): the original (mis-encoded) comment appears to say this is done
        /// to avoid returning too many URLs — confirm with the author.
        /// </summary>
        /// <param name="page">The page whose HTML is searched and which receives the target requests.</param>
        /// <param name="urlRegionSelector">Selector for the region to take links from; <c>null</c> yields no links.</param>
        /// <param name="urlPatterns">
        /// Patterns a link must match to be queued. When <c>null</c> or empty, every collected link is queued.
        /// </param>
        private void ExtractLinks(Page page, ISelector urlRegionSelector, IList<Regex> urlPatterns)
        {
            var links = urlRegionSelector == null ? new List<string>() : page.GetHtml().SelectList(urlRegionSelector).Links().GetAll();

            if (urlPatterns == null || urlPatterns.Count == 0)
            {
                page.AddTargetRequests(links);
                return;
            }

            // Links already queued: without this, a link matching several patterns
            // would be queued once per matching pattern.
            var queuedLinks = new HashSet<string>();

            foreach (Regex targetUrlPattern in urlPatterns)
            {
                foreach (string link in links)
                {
                    // HashSet.Add returns false for a link that is already queued.
                    if (targetUrlPattern.IsMatch(link) && queuedLinks.Add(link))
                    {
                        // Carry the current request's extras over to the follow-up request.
                        page.AddTargetRequest(new Request(link, page.GetRequest().Extras));
                    }
                }
            }
        }