Beispiel #1
0
        /// <summary>
        /// Registers a crawler entity type by creating an <c>EntityProcessor</c> for it
        /// and adding that processor to the page processors.
        /// </summary>
        /// <typeparam name="T">
        /// Crawler entity type; must expose a public parameterless constructor.
        /// NOTE(review): the previous comment said it must derive from ISpiderEntity,
        /// but the generic constraint visible here does not enforce that - confirm.
        /// </typeparam>
        /// <param name="targetUrlsExtractor">Extractor/filter for target links.</param>
        /// <param name="dataHandler">Handler that post-processes the parsed results.</param>
        public void AddEntityType <T>(ITargetRequestExtractor targetUrlsExtractor, IDataHandler dataHandler) where T : new()
        {
            CheckIfRunning();

            // When UseEntityModelExtractor is set, null is passed so that the processor
            // picks its own extractor; otherwise a plain ModelExtractor is supplied.
            ModelExtractor modelExtractor = null;
            if (!UseEntityModelExtractor)
            {
                modelExtractor = new ModelExtractor();
            }

            AddPageProcessors(new EntityProcessor <T>(modelExtractor, targetUrlsExtractor, dataHandler));
        }
Beispiel #2
0
 /// <summary>
 /// Constructor. Forwards a new <c>ModelDefinition&lt;T&gt;</c>, the extractor,
 /// the target request extractor and the data handlers to the base constructor.
 /// </summary>
 /// <param name="extractor">Extractor for the crawler entity; when null, a new <c>ModelExtractor&lt;T&gt;</c> is used instead.</param>
 /// <param name="targetRequestExtractor">Extractor/filter for target links; passed to the base constructor unchanged.</param>
 /// <param name="dataHandlers">Handlers that post-process the parsed results; passed to the base constructor unchanged.</param>
 public EntityProcessor(IModelExtractor extractor = null, ITargetRequestExtractor targetRequestExtractor = null, params IDataHandler[] dataHandlers)
     : base(new ModelDefinition <T>(), extractor ?? new ModelExtractor <T>(), targetRequestExtractor, dataHandlers)
 {
 }
Beispiel #3
0
        /// <summary>
        /// Constructor. Stores the model and extractor, resolves the target request
        /// extractor, applies the model's target request selectors to it when it is a
        /// <see cref="RegionAndPatternTargetRequestExtractor"/>, and registers the data handlers.
        /// </summary>
        /// <param name="model">Data model; must not be null.</param>
        /// <param name="extractor">Model extractor; a new <see cref="ModelExtractor"/> is used when null.</param>
        /// <param name="targetRequestExtractor">
        /// Target request extractor. When null and no extractor is already set, a new
        /// <see cref="RegionAndPatternTargetRequestExtractor"/> is created.
        /// </param>
        /// <param name="dataHandlers">Data handlers to register; null entries are skipped.</param>
        /// <exception cref="ArgumentNullException"><paramref name="model"/> is null.</exception>
        /// <exception cref="SpiderException">
        /// A target request selector of the model defines neither XPaths nor patterns
        /// (only checked when the selectors are applied to a region/pattern extractor).
        /// </exception>
        public ModelProcessor(IModel model, IModelExtractor extractor = null, ITargetRequestExtractor targetRequestExtractor = null,
                              params IDataHandler[] dataHandlers)
        {
            Model = model ?? throw new ArgumentNullException(nameof(model));

            Extractor = extractor ?? new ModelExtractor();

            if (targetRequestExtractor != null)
            {
                TargetUrlsExtractor = targetRequestExtractor;
            }

            RegionAndPatternTargetRequestExtractor regionExtractor;

            if (TargetUrlsExtractor == null)
            {
                regionExtractor = new RegionAndPatternTargetRequestExtractor();
                TargetUrlsExtractor = regionExtractor;
            }
            else
            {
                // A custom extractor of another type yields null here; the model's
                // selectors are then simply not applied.
                regionExtractor = TargetUrlsExtractor as RegionAndPatternTargetRequestExtractor;
            }

            // The selector wiring is conditional instead of returning early, so that the
            // data handlers below are registered regardless of the extractor type.
            if (regionExtractor != null && Model.TargetRequestSelectors != null)
            {
                foreach (var selector in Model.TargetRequestSelectors)
                {
                    var patterns = selector.Patterns?.Select(x => x?.Trim()).Distinct().ToArray();
                    var xpaths   = selector.XPaths?.Select(x => x?.Trim()).Distinct().ToList();
                    if (xpaths == null && patterns == null)
                    {
                        throw new SpiderException("Region xpath and patterns should not be null both");
                    }

                    if (xpaths != null && xpaths.Count > 0)
                    {
                        // One extractor entry per region XPath, each sharing the same patterns.
                        foreach (var xpath in xpaths)
                        {
                            regionExtractor.AddTargetUrlExtractor(xpath, patterns);
                        }
                    }
                    else if (patterns != null && patterns.Length > 0)
                    {
                        // No region given: register the patterns with a null XPath.
                        regionExtractor.AddTargetUrlExtractor(null, patterns);
                    }

                    if (selector.ExcludePatterns != null)
                    {
                        foreach (var excludePattern in selector.ExcludePatterns)
                        {
                            regionExtractor.ExcludeTargetUrlPatterns.Add(new Regex(excludePattern));
                        }
                    }
                }
            }

            if (dataHandlers != null)
            {
                foreach (var dataHandler in dataHandlers)
                {
                    if (dataHandler != null)
                    {
                        _dataHandlers.Add(dataHandler);
                    }
                }
            }
        }
 /// <summary>
 /// Constructs a <see cref="TargetRequestHandler"/> instance.
 /// </summary>
 /// <param name="targetRequestExtractor">Target link extractor (<see cref="ITargetRequestExtractor"/>); must not be null.</param>
 /// <param name="extractByProcessor">Whether the <see cref="IPageProcessor"/> should still execute its own <see cref="ITargetRequestExtractor"/>.</param>
 /// <exception cref="ArgumentNullException"><paramref name="targetRequestExtractor"/> is null.</exception>
 public TargetRequestHandler(ITargetRequestExtractor targetRequestExtractor, bool extractByProcessor = false)
 {
     if (targetRequestExtractor == null)
     {
         throw new ArgumentNullException(nameof(targetRequestExtractor));
     }

     _targetUrlsExtractor = targetRequestExtractor;
     _extractByProcessor = extractByProcessor;
 }
Beispiel #5
0
 /// <summary>
 /// Registers a crawler entity type without a data handler by forwarding to the
 /// two-argument overload with a null handler.
 /// </summary>
 /// <typeparam name="T">
 /// Crawler entity type; must expose a public parameterless constructor.
 /// NOTE(review): the previous comment said it must derive from ISpiderEntity,
 /// but the generic constraint visible here does not enforce that - confirm.
 /// </typeparam>
 /// <param name="targetUrlsExtractor">Extractor/filter for target links.</param>
 public void AddEntityType <T>(ITargetRequestExtractor targetUrlsExtractor) where T : new()
     => AddEntityType <T>(targetUrlsExtractor, null);