/// <summary>
/// Creates an entity processor for the entity type T.
/// </summary>
/// <param name="extractor">
/// Extractor for the spider entity. When null, a new <c>ModelExtractor&lt;T&gt;</c> is used instead.
/// </param>
/// <param name="targetRequestExtractor">
/// Extractor and filter for target links; forwarded to the base constructor unchanged.
/// </param>
/// <param name="dataHandlers">
/// Handlers that further process the extracted results; forwarded to the base constructor unchanged.
/// </param>
public EntityProcessor(
    IModelExtractor extractor = null,
    ITargetRequestExtractor targetRequestExtractor = null,
    params IDataHandler[] dataHandlers)
    : base(
        new ModelDefinition<T>(),
        extractor ?? new ModelExtractor<T>(),
        targetRequestExtractor,
        dataHandlers)
{
}
/// <summary>
/// Creates a processor for the given data model.
/// </summary>
/// <param name="model">Data model; must not be null.</param>
/// <param name="extractor">Model extractor; a new <c>ModelExtractor</c> is used when null.</param>
/// <param name="dataHandlers">
/// Data handlers. A null array is ignored and null entries inside the array are skipped.
/// </param>
/// <exception cref="ArgumentNullException">Thrown when <paramref name="model"/> is null.</exception>
public ModelProcessor(IModel model, IModelExtractor extractor = null, params IDataHandler[] dataHandlers)
{
    Model = model ?? throw new ArgumentNullException(nameof(model));
    Extractor = extractor ?? new ModelExtractor();

    // Union of the include patterns of all targets; targets whose pattern list is null contribute nothing.
    var includes = new HashSet<string>(
        model.Targets
            .Select(target => target.Patterns)
            .Where(group => group != null)
            .SelectMany(group => group));

    // Same union for the exclude patterns.
    var excludes = new HashSet<string>(
        model.Targets
            .Select(target => target.ExcludePatterns)
            .Where(group => group != null)
            .SelectMany(group => group));

    Filter = new PatternFilter(includes, excludes);

    // Distinct XPath regions declared by all targets.
    var regions = new HashSet<string>(
        model.Targets
            .Select(target => target.XPaths)
            .Where(group => group != null)
            .SelectMany(group => group));

    if (regions.Any(xpath => xpath == null || xpath == "."))
    {
        // A null or "." region collapses everything into a single "." extractor.
        RequestExtractor = new XPathRequestExtractor(".");
    }
    else if (regions.Count == 0)
    {
        // No regions declared: no request extractor.
        RequestExtractor = null;
    }
    else
    {
        RequestExtractor = new XPathRequestExtractor(regions);
    }

    if (dataHandlers != null)
    {
        foreach (var handler in dataHandlers.Where(h => h != null))
        {
            _dataHandlers.Add(handler);
        }
    }
}
/// <summary>
/// Creates a processor for the given data model.
/// </summary>
/// <param name="model">Data model; must not be null.</param>
/// <param name="extractor">Model extractor; a new <c>ModelExtractor</c> is used when null.</param>
/// <param name="targetRequestExtractor">
/// Extractor for target links. When this is null and <c>TargetUrlsExtractor</c> is not already set,
/// a new <c>RegionAndPatternTargetRequestExtractor</c> is created.
/// </param>
/// <param name="dataHandlers">
/// Data handlers. A null array is ignored and null entries inside the array are skipped.
/// </param>
/// <exception cref="ArgumentNullException">Thrown when <paramref name="model"/> is null.</exception>
/// <exception cref="SpiderException">
/// Thrown when a target request selector of the model has neither XPaths nor patterns.
/// </exception>
public ModelProcessor(IModel model, IModelExtractor extractor = null, ITargetRequestExtractor targetRequestExtractor = null, params IDataHandler[] dataHandlers)
{
    Model = model ?? throw new ArgumentNullException(nameof(model));
    Extractor = extractor ?? new ModelExtractor();

    if (targetRequestExtractor != null)
    {
        TargetUrlsExtractor = targetRequestExtractor;
    }

    RegionAndPatternTargetRequestExtractor regionExtractor;
    if (TargetUrlsExtractor == null)
    {
        regionExtractor = new RegionAndPatternTargetRequestExtractor();
        TargetUrlsExtractor = regionExtractor;
    }
    else
    {
        // Stays null when the effective extractor is of some other type.
        regionExtractor = TargetUrlsExtractor as RegionAndPatternTargetRequestExtractor;
    }

    // Only a region/pattern extractor is configured from the model's selectors;
    // any other extractor type is left untouched.
    if (regionExtractor != null && Model.TargetRequestSelectors != null)
    {
        foreach (var selector in Model.TargetRequestSelectors)
        {
            var patterns = selector.Patterns?.Select(x => x?.Trim()).Distinct().ToArray();
            var xpaths = selector.XPaths?.Select(x => x?.Trim()).Distinct().ToList();

            if (xpaths == null && patterns == null)
            {
                throw new SpiderException("Region xpath and patterns should not be null both");
            }

            if (xpaths != null && xpaths.Count > 0)
            {
                // Each region gets the selector's patterns.
                foreach (var xpath in xpaths)
                {
                    regionExtractor.AddTargetUrlExtractor(xpath, patterns);
                }
            }
            else if (patterns != null && patterns.Length > 0)
            {
                // No region given: register the patterns with a null region.
                regionExtractor.AddTargetUrlExtractor(null, patterns);
            }

            if (selector.ExcludePatterns != null)
            {
                foreach (var p in selector.ExcludePatterns)
                {
                    regionExtractor.ExcludeTargetUrlPatterns.Add(new Regex(p));
                }
            }
        }
    }

    // Handlers are registered whatever the extractor type is. Previously an early
    // return for non-region extractors skipped this step and dropped the handlers.
    if (dataHandlers != null)
    {
        foreach (var handler in dataHandlers)
        {
            if (handler != null)
            {
                _dataHandlers.Add(handler);
            }
        }
    }
}
/// <summary>
/// Creates an entity processor for the entity type T.
/// </summary>
/// <param name="extractor">
/// Extractor for the spider entity. When null, a new <c>ModelExtractor&lt;T&gt;</c> is used instead.
/// </param>
/// <param name="targetUrlsExtractor">
/// Extractor and filter for target links; forwarded to the base constructor unchanged.
/// </param>
/// <param name="dataHandlers">
/// Handlers that further process the extracted results; forwarded to the base constructor unchanged.
/// </param>
public EntityProcessor(IModelExtractor extractor = null, ITargetUrlsExtractor targetUrlsExtractor = null, params IDataHandler[] dataHandlers)
    : base(new ModelDefine<T>(), extractor ?? new ModelExtractor<T>(), targetUrlsExtractor, dataHandlers)
{
}