/// <summary>
/// Downloads the cnblogs.com home page, runs an <see cref="XPathRequestExtractor"/> restricted to
/// the <c>div</c> with class <c>pager</c> over it, and checks the extracted requests.
/// </summary>
/// <remarks>
/// Needs network access to www.cnblogs.com; the asserted count of 12 and the page-2 URL depend on
/// the markup the live site returns at the time the test runs.
/// </remarks>
public void RegionAndPatternTargetUrlsExtractor()
{
    string html;

    // HttpClient is IDisposable: release it as soon as the download has finished.
    using (var client = new HttpClient())
    {
        // The method is synchronous, so the download task is blocked on here.
        html = client.GetStringAsync("http://www.cnblogs.com").Result;
    }

    var extractor = new XPathRequestExtractor(".//div[@class='pager']");
    //, "/sitehome/p/\\d+", "^http://www\\.cnblogs\\.com/$"

    // The page is built for the host without "www", so the expected URL below uses that host too.
    var page = new Page(new Request("http://cnblogs.com"));
    page.Content = html;
    page.ContentType = ContentType.Html;

    var requests = Enumerable.ToList(extractor.Extract(page));

    Assert.Equal(12, requests.Count);
    Assert.Contains(requests, r => r.Url == "http://cnblogs.com/sitehome/p/2");
}
/// <summary>
/// Creates the processor with an <see cref="XPathRequestExtractor"/> over the XPath <c>"."</c>
/// and a <see cref="PatternFilter"/> built from two cnblogs.com URL patterns.
/// </summary>
public BlogSumaryProcessor()
{
    // URL patterns handed to the filter: the site root and the numbered /sitehome/p/ pages.
    string[] urlPatterns =
    {
        "^http://www\\.cnblogs\\.com/$",
        "http://www\\.cnblogs\\.com/sitehome/p/\\d+"
    };

    RequestExtractor = new XPathRequestExtractor(".");
    Filter = new PatternFilter(urlPatterns);
}
/// <summary>
/// Constructor. Builds the request filter and the request extractor from the targets
/// declared on the model and registers the supplied data handlers.
/// </summary>
/// <param name="model">Data model.</param>
/// <param name="extractor">Model extractor; a new <see cref="ModelExtractor"/> is used when null.</param>
/// <param name="dataHandlers">Data handlers; null entries are skipped.</param>
/// <exception cref="ArgumentNullException"><paramref name="model"/> is null.</exception>
public ModelProcessor(IModel model, IModelExtractor extractor = null, params IDataHandler[] dataHandlers)
{
    Model = model ?? throw new ArgumentNullException(nameof(model));
    Extractor = extractor ?? new ModelExtractor();

    // Merge the include / exclude patterns of every target into one filter.
    var patterns = CollectDistinct(model.Targets.Select(t => t.Patterns));
    var excludePatterns = CollectDistinct(model.Targets.Select(t => t.ExcludePatterns));
    Filter = new PatternFilter(patterns, excludePatterns);

    var xpaths = CollectDistinct(model.Targets.Select(t => t.XPaths));
    if (xpaths.Any(x => x == null || x == "."))
    {
        // A null or "." entry selects the whole document, so the other XPaths are not needed.
        RequestExtractor = new XPathRequestExtractor(".");
    }
    else if (xpaths.Count > 0)
    {
        // One extractor over all XPaths; when there are none, RequestExtractor is left unassigned.
        RequestExtractor = new XPathRequestExtractor(xpaths);
    }

    if (dataHandlers != null)
    {
        foreach (var dataHandler in dataHandlers)
        {
            if (dataHandler != null)
            {
                _dataHandlers.Add(dataHandler);
            }
        }
    }
}

/// <summary>
/// Flattens the given groups of strings into a single set, skipping groups that are null.
/// Null strings inside a group are kept.
/// </summary>
/// <param name="groups">Groups of strings; individual groups may be null.</param>
/// <returns>The distinct strings found across all non-null groups.</returns>
private static HashSet<string> CollectDistinct(IEnumerable<IEnumerable<string>> groups)
{
    var result = new HashSet<string>();
    foreach (var group in groups)
    {
        if (group != null)
        {
            result.UnionWith(group);
        }
    }

    return result;
}