/// <summary>
/// Queues every link on the page that matches the blog-post URL pattern as a target request,
/// then stores the "title", "content", "tags" and "artical" XPath selections as page fields.
/// </summary>
/// <param name="page">The page to extract links and fields from.</param>
public void Process(Page page)
		{
			const string blogUrlPattern = "http://my\\.oschina\\.net/flashsword/blog/\\d+";
			const string titleXPath = "//div[@class='BlogEntity']/div[@class='BlogTitle']/h1/text()";
			const string contentXPath = "//div[@class='BlogContent']/tidyText()";
			const string tagsXPath = "//div[@class='BlogTags']/a/text()";
			const string articleXPath = "//*[@Class='Blog']/div[1]/div/h2/a";

			// Follow-up requests: only links that look like blog posts.
			var pageLinks = page.GetHtml().Links();
			IList<String> blogLinks = pageLinks.Regex(blogUrlPattern).GetAll();
			page.AddTargetRequests(blogLinks);

			// Single-valued fields are stored as the string form of the selection.
			var titleSelection = page.GetHtml().XPath(titleXPath);
			page.PutField("title", titleSelection.ToString());

			var contentSelection = page.GetHtml().XPath(contentXPath);
			page.PutField("content", contentSelection.ToString());

			// Tags are stored as the full list of matches.
			var tagValues = page.GetHtml().XPath(tagsXPath).GetAll();
			page.PutField("tags", tagValues);

			var articleSelection = page.GetHtml().XPath(articleXPath);
			page.PutField("artical", articleSelection.ToString());
		}
Пример #2
0
        /// <summary>
        /// For each target URL extractor, collects the links of its region (or of the whole page when
        /// the extractor has no region), applies the extractor's formatters, URL- and HTML-decodes
        /// the result, and queues the links as target requests.
        /// Restricting links to the region keeps the number of queued URLs down: an extractor whose
        /// selection yields no link collection contributes nothing.
        /// </summary>
        /// <param name="page">The page whose links are extracted and which receives the target requests.</param>
        /// <param name="targetUrlExtractInfos">The extractors to apply; nothing is done when this is null.</param>
        private void ExtractLinks(Page page, List<Model.TargetUrlExtractor> targetUrlExtractInfos)
        {
            if (targetUrlExtractInfos == null)
            {
                return;
            }

            foreach (var targetUrlExtractInfo in targetUrlExtractInfos)
            {
                var urlRegionSelector = targetUrlExtractInfo.Region;
                var formatters = targetUrlExtractInfo.Formatters;
                var urlPatterns = targetUrlExtractInfo.Patterns;

                var links = urlRegionSelector == null ? page.Selectable.Links().GetValues() : (page.Selectable.SelectList(urlRegionSelector)).Links().GetValues();
                if (links == null)
                {
                    // Nothing found for this extractor. Move on to the next one instead of
                    // returning, otherwise every remaining extractor would be silently skipped.
                    continue;
                }

                // NOTE(review): formatters are applied before decoding and pattern matching.
                // Formatting target URLs this way implies that start URLs should follow the same rule.
                var preparedLinks = new List<string>();
                foreach (string link in links)
                {
                    // Strings are immutable, so the link can be used directly without copying it.
                    var url = link;
                    if (formatters != null)
                    {
                        foreach (Formatter f in formatters)
                        {
                            url = f.Formate(url);
                        }
                    }

            #if !NET_CORE
                    preparedLinks.Add(HttpUtility.HtmlDecode(HttpUtility.UrlDecode(url)));
            #else
                    preparedLinks.Add(WebUtility.HtmlDecode(WebUtility.UrlDecode(url)));
            #endif
                }

                if (urlPatterns == null || urlPatterns.Count == 0)
                {
                    // No patterns configured: every link of this extractor is a target.
                    // Continue (not return) so the remaining extractors are still processed.
                    page.AddTargetRequests(preparedLinks);
                    continue;
                }

                foreach (Regex targetUrlPattern in urlPatterns)
                {
                    foreach (string link in preparedLinks)
                    {
                        if (targetUrlPattern.IsMatch(link))
                        {
                            page.AddTargetRequest(new Request(link, page.Request.NextDepth, page.Request.Extras));
                        }
                    }
                }
            }
        }
Пример #3
0
        /// <summary>
        /// Runs every entity extractor against the page and records each non-empty result under the
        /// extractor's entity name. Unless the page is flagged to miss target URLs, target requests
        /// are then added, either from the customized provider when one is set or via ExtractLinks.
        /// When the page ends up with no results at all, its result items are marked as skipped.
        /// </summary>
        /// <param name="page">The page to process.</param>
        public void Process(Page page)
        {
            foreach (IEntityExtractor extractor in EntityExtractorList)
            {
                // Kept as dynamic so the AddResultItem call below is still bound at runtime.
                dynamic extracted = extractor.Process(page);

                if (extracted == null)
                {
                    continue;
                }

                // An enumerable result with no first element counts as "nothing extracted".
                if (extracted is IEnumerable && !((IEnumerable)extracted).GetEnumerator().MoveNext())
                {
                    continue;
                }

                page.AddResultItem(extractor.EntityName, extracted);
            }

            if (!page.MissTargetUrls)
            {
                if (GetCustomizeTargetUrls != null)
                {
                    // A customized provider takes precedence over the configured extractors.
                    page.AddTargetRequests(GetCustomizeTargetUrls(page));
                }
                else
                {
                    ExtractLinks(page, TargetUrlExtractInfos);
                }
            }

            var resultCount = page.ResultItems.Results.Count;
            if (resultCount == 0)
            {
                page.ResultItems.IsSkip = true;
            }
        }
Пример #4
0
        /// <summary>
        /// Queues links found inside the given region of the page as target requests.
        /// When no region selector is supplied, no links are considered at all, which avoids
        /// queuing too many URLs. Without patterns every region link is queued; otherwise a
        /// request is added for each pattern/link pair that matches.
        /// </summary>
        /// <param name="page">The page whose links are extracted and which receives the target requests.</param>
        /// <param name="urlRegionSelector">Selector of the region to take links from; may be null.</param>
        /// <param name="urlPatterns">Patterns a link must match to be queued; may be null or empty.</param>
        private void ExtractLinks(Page page, ISelector urlRegionSelector, IList<Regex> urlPatterns)
        {
            var regionLinks = urlRegionSelector != null
                ? page.GetHtml().SelectList(urlRegionSelector).Links().GetAll()
                : new List<string>();

            bool hasPatterns = urlPatterns != null && urlPatterns.Count > 0;
            if (!hasPatterns)
            {
                // No filtering requested: queue everything found in the region.
                page.AddTargetRequests(regionLinks);
                return;
            }

            foreach (Regex pattern in urlPatterns)
            {
                foreach (string candidate in regionLinks)
                {
                    if (!pattern.IsMatch(candidate))
                    {
                        continue;
                    }

                    var extras = page.GetRequest().Extras;
                    page.AddTargetRequest(new Request(candidate, extras));
                }
            }
        }