/// <summary>
/// Constructor. Selects the entity extractor and the target-URL extractor, then registers
/// every target-URL selector declared on the extractor's entity definition.
/// </summary>
/// <param name="extractor">Extractor for the crawler entity; when null an <see cref="EntityExtractor{T}"/> is created from <paramref name="dataHandler"/> and <paramref name="tableName"/>.</param>
/// <param name="targetUrlsExtractor">Extractor/filter for target links; when null the current TargetUrlsExtractor is kept, or a new <see cref="RegionAndPatternTargetUrlsExtractor"/> is created if none is set.</param>
/// <param name="dataHandler">Further processing applied to the extracted results.</param>
/// <param name="tableName">Table name of the entity in the database; takes priority over the definition in EntitySelector.</param>
public EntityProcessor(IEntityExtractor<T> extractor, ITargetUrlsExtractor targetUrlsExtractor, IDataHandler<T> dataHandler, string tableName)
{
    if (extractor != null)
    {
        Extractor = extractor;
    }
    else
    {
        Extractor = new EntityExtractor<T>(dataHandler, tableName);
    }

    if (targetUrlsExtractor != null)
    {
        TargetUrlsExtractor = targetUrlsExtractor;
    }

    RegionAndPatternTargetUrlsExtractor regionExtractor;
    if (TargetUrlsExtractor != null)
    {
        // Selectors can only be registered on the region/pattern implementation;
        // any other extractor is left untouched.
        regionExtractor = TargetUrlsExtractor as RegionAndPatternTargetUrlsExtractor;
        if (regionExtractor == null)
        {
            return;
        }
    }
    else
    {
        regionExtractor = new RegionAndPatternTargetUrlsExtractor();
        TargetUrlsExtractor = regionExtractor;
    }

    var selectors = Extractor.EntityDefine.TargetUrlsSelectors;
    if (selectors == null || selectors.Count <= 0)
    {
        return;
    }

    foreach (var selector in selectors)
    {
        // Trim and de-duplicate; a null source collection stays null.
        var patterns = selector.Patterns?.Select(p => p?.Trim()).Distinct().ToArray();
        var xpaths = selector.XPaths?.Select(x => x?.Trim()).Distinct().ToList();

        if (patterns == null && xpaths == null)
        {
            throw new SpiderException("Region xpath and patterns should not be null both");
        }

        if (xpaths?.Count > 0)
        {
            // One registration per region xpath, all sharing the same patterns.
            foreach (var xpath in xpaths)
            {
                regionExtractor.AddTargetUrlExtractor(xpath, patterns);
            }
        }
        else if (patterns?.Length > 0)
        {
            // No region given: register the patterns with a null region.
            regionExtractor.AddTargetUrlExtractor(null, patterns);
        }
    }
}
/// <summary>
/// Constructor. Builds a <see cref="RegionAndPatternTargetUrlsExtractor"/> from the supplied
/// patterns and assigns it to TargetUrlsExtractor.
/// </summary>
/// <param name="partterns">Regular expressions that match target links; registered under the region "." when non-empty.</param>
/// <param name="excludeParterns">Regular expressions that exclude target links; registered when non-empty.</param>
public DefaultPageProcessor(string[] partterns = null, string[] excludeParterns = null)
{
    var extractor = new RegionAndPatternTargetUrlsExtractor();

    if (partterns?.Length > 0)
    {
        extractor.AddTargetUrlExtractor(".", partterns);
    }

    if (excludeParterns?.Length > 0)
    {
        extractor.AddExcludeTargetUrlPatterns(excludeParterns);
    }

    TargetUrlsExtractor = extractor;
}
/// <summary>
/// Downloads the cnblogs.com home page and verifies that a
/// <see cref="RegionAndPatternTargetUrlsExtractor"/> restricted to the pager region
/// extracts the expected requests.
/// </summary>
/// <remarks>
/// The HTML is fetched over the network, so the asserted request count is tied to
/// whatever the site serves when the test runs.
/// </remarks>
public void RegionAndPatternTargetUrlsExtractor()
{
    string html;

    // HttpClient is IDisposable: release it as soon as the download completes.
    using (var client = new HttpClient())
    {
        // The method is synchronous, so the download is awaited by blocking.
        html = client.GetStringAsync("http://www.cnblogs.com").Result;
    }

    var extractor = new RegionAndPatternTargetUrlsExtractor(".//div[@class='pager']", "/sitehome/p/\\d+", "^http://www\\.cnblogs\\.com/$");
    var site = new Site();
    var page = new Page(new Request("http://cnblogs.com") { Site = site });
    page.Content = html;

    var requests = extractor.ExtractRequests(page, site).ToList();

    Assert.Equal(12, requests.Count);
    Assert.Contains(requests, r => r.Url == "http://cnblogs.com/sitehome/p/2");
}
/// <summary>
/// Constructor. Installs a <see cref="RegionAndPatternTargetUrlsExtractor"/> as the
/// target-URL extractor.
/// </summary>
public CnblogsProcessor3()
{
    // NOTE(review): presumably (region xpath, url pattern) — confirm against the extractor's constructor.
    const string region = ".";
    const string pagerPattern = "/sitehome/p/\\d+";

    TargetUrlsExtractor = new RegionAndPatternTargetUrlsExtractor(region, pagerPattern);
}
/// <summary>
/// Constructor. Installs a <see cref="RegionAndPatternTargetUrlsExtractor"/> as the
/// target-URL extractor.
/// </summary>
public CnblogsProcessor2()
{
    // NOTE(review): presumably (region xpath, url patterns...) — confirm against the extractor's constructor.
    const string pagerRegion = ".//div[@class='pager']";
    const string pagerPattern = "/sitehome/p/\\d+";
    const string homePattern = "^http://www\\.cnblogs\\.com/$";

    TargetUrlsExtractor = new RegionAndPatternTargetUrlsExtractor(pagerRegion, pagerPattern, homePattern);
}
/// <summary>
/// Constructor. Installs a <see cref="RegionAndPatternTargetUrlsExtractor"/> as the
/// target-URL extractor.
/// </summary>
public BlogSumaryProcessor()
{
    // Define the filter for target pages.
    // NOTE(review): presumably (region xpath, url patterns...) — confirm against the extractor's constructor.
    const string region = ".";
    const string homePattern = "^http://www\\.cnblogs\\.com/$";
    const string pagerPattern = "http://www\\.cnblogs\\.com/sitehome/p/\\d+";

    TargetUrlsExtractor = new RegionAndPatternTargetUrlsExtractor(region, homePattern, pagerPattern);
}
/// <summary>
/// Constructor. Installs a <see cref="RegionAndPatternTargetUrlsExtractor"/> as the
/// target-URL extractor, passing null as its first argument.
/// </summary>
public NewsProcessor()
{
    // NOTE(review): the first argument is presumably the region xpath (null here) — confirm against the extractor's constructor.
    const string homePattern = "^http://www\\.cnblogs\\.com/$";
    const string newsIndexPattern = "^http://www\\.cnblogs\\.com/news/$";
    const string newsItemPattern = "www\\.cnblogs\\.com/news/\\d+";

    TargetUrlsExtractor = new RegionAndPatternTargetUrlsExtractor(null, homePattern, newsIndexPattern, newsItemPattern);
}
/// <summary>
/// Constructor. Stores the model, selects the model extractor and the target-URL extractor,
/// registers the model's target-URL selectors, and collects the supplied data handlers.
/// </summary>
/// <param name="model">The model to process; must not be null.</param>
/// <param name="extractor">Model extractor; when null a <see cref="ModelExtractor"/> is created.</param>
/// <param name="targetUrlsExtractor">Extractor/filter for target links; when null the current TargetUrlsExtractor is kept, or a new <see cref="RegionAndPatternTargetUrlsExtractor"/> is created if none is set.</param>
/// <param name="dataHandlers">Handlers that further process the extracted results; null entries are ignored.</param>
/// <exception cref="ArgumentNullException"><paramref name="model"/> is null.</exception>
/// <exception cref="SpiderException">A target-URL selector has neither xpaths nor patterns.</exception>
public ModelProcessor(IModel model, IModelExtractor extractor = null, ITargetUrlsExtractor targetUrlsExtractor = null, params IDataHandler[] dataHandlers)
{
    // The single-string ArgumentNullException constructor takes the parameter name,
    // so pass the name and the message separately.
    Model = model ?? throw new ArgumentNullException(nameof(model), $"{nameof(model)} should not be null.");

    Extractor = extractor ?? new ModelExtractor();

    if (targetUrlsExtractor != null)
    {
        TargetUrlsExtractor = targetUrlsExtractor;
    }

    RegionAndPatternTargetUrlsExtractor regionExtractor;
    if (TargetUrlsExtractor == null)
    {
        regionExtractor = new RegionAndPatternTargetUrlsExtractor();
        TargetUrlsExtractor = regionExtractor;
    }
    else
    {
        regionExtractor = TargetUrlsExtractor as RegionAndPatternTargetUrlsExtractor;
    }

    // Selectors can only be registered on the region/pattern implementation. Any other
    // extractor is left untouched, but the data handlers below must still be registered,
    // so this is a conditional block rather than an early return.
    var selectors = Model.TargetUrlsSelectors;
    if (regionExtractor != null && selectors != null)
    {
        foreach (var selector in selectors)
        {
            // Trim and de-duplicate; a null source collection stays null.
            var patterns = selector.Patterns?.Select(p => p?.Trim()).Distinct().ToArray();
            var xpaths = selector.XPaths?.Select(x => x?.Trim()).Distinct().ToList();

            if (xpaths == null && patterns == null)
            {
                throw new SpiderException("Region xpath and patterns should not be null both");
            }

            if (xpaths?.Count > 0)
            {
                // One registration per region xpath, all sharing the same patterns.
                foreach (var xpath in xpaths)
                {
                    regionExtractor.AddTargetUrlExtractor(xpath, patterns);
                }
            }
            else if (patterns?.Length > 0)
            {
                // No region given: register the patterns with a null region.
                regionExtractor.AddTargetUrlExtractor(null, patterns);
            }
        }
    }

    if (dataHandlers != null)
    {
        foreach (var handler in dataHandlers)
        {
            if (handler != null)
            {
                _dataHandlers.Add(handler);
            }
        }
    }
}
/// <summary>
/// Constructor. Installs a <see cref="RegionAndPatternTargetUrlsExtractor"/> as the
/// target-URL extractor.
/// </summary>
public GetExtInfoProcessor()
{
    // NOTE(review): presumably (region xpath, url pattern) — confirm against the extractor's constructor.
    const string region = ".";
    const string extInfoPattern = @"^https://mall\.autohome\.com\.cn\/http\/data\.html\?data\[_host\]=//car\.api\.autohome\.com\.cn/v2/carprice/Config_GetListBySpecId\.ashx*";

    TargetUrlsExtractor = new RegionAndPatternTargetUrlsExtractor(region, extInfoPattern);
}
/// <summary>
/// Constructor. Installs a <see cref="RegionAndPatternTargetUrlsExtractor"/> as the
/// target-URL extractor.
/// </summary>
public GetBasicInfoProcessor()
{
    // NOTE(review): presumably (region xpath, url pattern) — confirm against the extractor's constructor.
    const string region = ".";
    const string basicInfoPattern = @"^https://mall\.autohome\.com\.cn/http/data\.html\?data\[_host\]=//car\.api\.autohome\.com\.cn/v1/carprice/spec_paramsinglebyspecid\.ashx*";

    TargetUrlsExtractor = new RegionAndPatternTargetUrlsExtractor(region, basicInfoPattern);
}
/// <summary>
/// Constructor. Installs a <see cref="RegionAndPatternTargetUrlsExtractor"/> as the
/// target-URL extractor.
/// </summary>
public GetSkuProcessor()
{
    // NOTE(review): presumably (region xpath, url pattern) — confirm against the extractor's constructor.
    const string region = ".";
    const string detailPattern = @"^https://mall\.autohome\.com\.cn/detail/*";

    TargetUrlsExtractor = new RegionAndPatternTargetUrlsExtractor(region, detailPattern);
}
/// <summary>
/// Constructor. Installs a <see cref="RegionAndPatternTargetUrlsExtractor"/> as the
/// target-URL extractor, with a pattern for SKU detail pages of the form
/// <c>https://mall.autohome.com.cn/detail/N-N-N.html</c>.
/// </summary>
public GetSkuProcessor()
{
    // NOTE(review): presumably (region xpath, url pattern) — confirm against the extractor's constructor.
    const string region = ".";

    // The host dots are escaped so they match a literal '.', not any character.
    // NOTE(review): "\/*" accepts zero or more slashes after "detail" — confirm that is intended.
    const string skuDetailPattern = @"^https:\/\/mall\.autohome\.com\.cn\/detail\/*[0-9]+-[0-9]+-[0-9]+\.html$";

    TargetUrlsExtractor = new RegionAndPatternTargetUrlsExtractor(region, skuDetailPattern);
}