/// <summary>
/// Downloads the cnblogs home page, runs an <c>XPathRequestExtractor</c> restricted to the
/// pager region over it, and checks the extracted requests.
/// </summary>
/// <remarks>
/// This test performs a live HTTP request, so its outcome depends on the remote page.
/// </remarks>
public void RegionAndPatternTargetUrlsExtractor()
        {
            string html;

            // Dispose the client deterministically so the underlying handler/socket is released.
            using (var client = new HttpClient())
            {
                // GetAwaiter().GetResult() surfaces the original exception on failure
                // instead of wrapping it in an AggregateException like .Result does.
                html = client.GetStringAsync("http://www.cnblogs.com").GetAwaiter().GetResult();
            }

            var extractor = new XPathRequestExtractor(".//div[@class='pager']");
            //, "/sitehome/p/\\d+", "^http://www\\.cnblogs\\.com/$"
            var page = new Page(new Request("http://cnblogs.com"))
            {
                Content     = html,
                ContentType = ContentType.Html
            };

            var requests = Enumerable.ToList(extractor.Extract(page));

            Assert.Equal(12, requests.Count);
            Assert.Contains(requests, r => r.Url == "http://cnblogs.com/sitehome/p/2");
        }
 /// <summary>
 /// Sets up the processor with an XPath request extractor rooted at "." and a
 /// pattern filter built from the cnblogs home-page and "sitehome/p/N" URL patterns.
 /// </summary>
 public BlogSumaryProcessor()
 {
     RequestExtractor = new XPathRequestExtractor(".");

     // Regular expressions handed to the pattern filter.
     var urlPatterns = new[]
     {
         "^http://www\\.cnblogs\\.com/$",
         "http://www\\.cnblogs\\.com/sitehome/p/\\d+"
     };
     Filter = new PatternFilter(urlPatterns);
 }
예제 #3
0
        /// <summary>
        /// Constructor. Stores the model and extractor, then derives the URL pattern filter and
        /// the request extractor from the model's targets, and registers the data handlers.
        /// </summary>
        /// <param name="model">Data model; must not be null.</param>
        /// <param name="extractor">Model extractor; a new <c>ModelExtractor</c> is used when null.</param>
        /// <param name="dataHandlers">Data handlers; null entries (and a null array) are ignored.</param>
        /// <exception cref="ArgumentNullException"><paramref name="model"/> is null.</exception>
        public ModelProcessor(IModel model, IModelExtractor extractor = null, params IDataHandler[] dataHandlers)
        {
            Model = model ?? throw new ArgumentNullException(nameof(model));

            Extractor = extractor ?? new ModelExtractor();

            // Collect the distinct include patterns, exclude patterns and XPaths of every
            // target in a single pass; a target may leave any of the three collections null.
            var patterns        = new HashSet<string>();
            var excludePatterns = new HashSet<string>();
            var xpaths          = new HashSet<string>();

            foreach (var target in model.Targets)
            {
                var targetPatterns = target.Patterns;
                if (targetPatterns != null)
                {
                    foreach (var p in targetPatterns)
                    {
                        patterns.Add(p);
                    }
                }

                var targetExcludePatterns = target.ExcludePatterns;
                if (targetExcludePatterns != null)
                {
                    foreach (var p in targetExcludePatterns)
                    {
                        excludePatterns.Add(p);
                    }
                }

                var targetXPaths = target.XPaths;
                if (targetXPaths != null)
                {
                    foreach (var x in targetXPaths)
                    {
                        xpaths.Add(x);
                    }
                }
            }

            Filter = new PatternFilter(patterns, excludePatterns);

            if (xpaths.Contains(null) || xpaths.Contains("."))
            {
                // A null or "." entry collapses everything to a single "." extractor.
                RequestExtractor = new XPathRequestExtractor(".");
            }
            else if (xpaths.Count > 0)
            {
                // Built once from the whole set. When no XPaths were declared at all,
                // RequestExtractor is deliberately left untouched.
                RequestExtractor = new XPathRequestExtractor(xpaths);
            }

            if (dataHandlers != null)
            {
                foreach (var datahandler in dataHandlers)
                {
                    if (datahandler != null)
                    {
                        _dataHandlers.Add(datahandler);
                    }
                }
            }
        }