/// <summary>
 /// Creates an entity processor, forwarding a fresh <c>ModelDefinition&lt;T&gt;</c>
 /// together with the supplied collaborators to the base constructor.
 /// </summary>
 /// <param name="extractor">
 /// Extractor for the crawler entity; when null, a new <c>ModelExtractor&lt;T&gt;</c> is used instead.
 /// </param>
 /// <param name="targetRequestExtractor">
 /// Extractor/filter for target links; passed to the base constructor unchanged (may be null).
 /// </param>
 /// <param name="dataHandlers">
 /// Handlers that further process the extracted results; passed to the base constructor unchanged.
 /// </param>
 public EntityProcessor(
     IModelExtractor extractor = null,
     ITargetRequestExtractor targetRequestExtractor = null,
     params IDataHandler[] dataHandlers)
     : base(
         new ModelDefinition<T>(),
         extractor ?? new ModelExtractor<T>(),
         targetRequestExtractor,
         dataHandlers)
 {
 }
Exemple #2
0
        /// <summary>
        /// Creates a processor for the given data model. The URL filter and the request
        /// extractor are built from the patterns, exclude patterns and XPaths declared on
        /// the model's targets.
        /// </summary>
        /// <param name="model">Data model; must not be null.</param>
        /// <param name="extractor">Model extractor; a new <see cref="ModelExtractor"/> is used when null.</param>
        /// <param name="dataHandlers">Data handlers; a null array and null entries are ignored.</param>
        /// <exception cref="ArgumentNullException"><paramref name="model"/> is null.</exception>
        public ModelProcessor(IModel model, IModelExtractor extractor = null, params IDataHandler[] dataHandlers)
        {
            Model = model ?? throw new ArgumentNullException(nameof(model));
            Extractor = extractor ?? new ModelExtractor();

            // Include and exclude patterns of every target feed a single filter.
            var includeSet = Merge(model.Targets.Select(target => target.Patterns));
            var excludeSet = Merge(model.Targets.Select(target => target.ExcludePatterns));
            Filter = new PatternFilter(includeSet, excludeSet);

            var regionSet = Merge(model.Targets.Select(target => target.XPaths));

            if (regionSet.Any(region => region == null || region == "."))
            {
                // A null or "." XPath on any target makes the extractor use "." alone.
                RequestExtractor = new XPathRequestExtractor(".");
            }
            else if (regionSet.Count == 0)
            {
                RequestExtractor = null;
            }
            else
            {
                RequestExtractor = new XPathRequestExtractor(regionSet);
            }

            if (dataHandlers != null)
            {
                foreach (var handler in dataHandlers.Where(h => h != null))
                {
                    _dataHandlers.Add(handler);
                }
            }

            // Flattens the per-target collections into one distinct set; targets whose
            // collection is null contribute nothing.
            HashSet<string> Merge(IEnumerable<IEnumerable<string>> groups)
            {
                return new HashSet<string>(groups.Where(group => group != null).SelectMany(group => group));
            }
        }
Exemple #3
0
        /// <summary>
        /// Creates a processor for the given data model and, when the target request
        /// extractor is a <see cref="RegionAndPatternTargetRequestExtractor"/>, configures
        /// it from the model's target request selectors.
        /// </summary>
        /// <param name="model">Data model; must not be null.</param>
        /// <param name="extractor">Model extractor; a new <see cref="ModelExtractor"/> is used when null.</param>
        /// <param name="targetRequestExtractor">
        /// Target request extractor. When null and no extractor is already set, a new
        /// <see cref="RegionAndPatternTargetRequestExtractor"/> is created.
        /// </param>
        /// <param name="dataHandlers">Data handlers; a null array and null entries are ignored.</param>
        /// <exception cref="ArgumentNullException"><paramref name="model"/> is null.</exception>
        /// <exception cref="SpiderException">
        /// A target request selector has neither XPaths nor patterns (both null).
        /// </exception>
        public ModelProcessor(IModel model, IModelExtractor extractor = null, ITargetRequestExtractor targetRequestExtractor = null,
                              params IDataHandler[] dataHandlers)
        {
            Model = model ?? throw new ArgumentNullException(nameof(model));

            Extractor = extractor ?? new ModelExtractor();

            if (targetRequestExtractor != null)
            {
                TargetUrlsExtractor = targetRequestExtractor;
            }

            RegionAndPatternTargetRequestExtractor regionExtractor;

            if (TargetUrlsExtractor == null)
            {
                regionExtractor = new RegionAndPatternTargetRequestExtractor();
                TargetUrlsExtractor = regionExtractor;
            }
            else
            {
                // A custom extractor of another type yields null here and is left unconfigured.
                regionExtractor = TargetUrlsExtractor as RegionAndPatternTargetRequestExtractor;
            }

            var selectors = Model.TargetRequestSelectors;

            // Only a region-and-pattern extractor can be configured from the model's selectors.
            // This is a guard rather than an early return so that the data handlers below
            // are registered regardless of which extractor type is in use.
            if (regionExtractor != null && selectors != null)
            {
                foreach (var selector in selectors)
                {
                    var patterns = selector.Patterns?.Select(x => x?.Trim()).Distinct().ToArray();
                    var xpaths = selector.XPaths?.Select(x => x?.Trim()).Distinct().ToList();
                    if (xpaths == null && patterns == null)
                    {
                        throw new SpiderException("Region xpath and patterns should not be null both");
                    }

                    if (xpaths != null && xpaths.Count > 0)
                    {
                        // Every region XPath is registered with the same pattern set.
                        foreach (var xpath in xpaths)
                        {
                            regionExtractor.AddTargetUrlExtractor(xpath, patterns);
                        }
                    }
                    else if (patterns != null && patterns.Length > 0)
                    {
                        // No region given: register the patterns with a null XPath.
                        regionExtractor.AddTargetUrlExtractor(null, patterns);
                    }

                    if (selector.ExcludePatterns != null)
                    {
                        foreach (var p in selector.ExcludePatterns)
                        {
                            regionExtractor.ExcludeTargetUrlPatterns.Add(new Regex(p));
                        }
                    }
                }
            }

            if (dataHandlers != null)
            {
                foreach (var dataHandler in dataHandlers)
                {
                    if (dataHandler != null)
                    {
                        _dataHandlers.Add(dataHandler);
                    }
                }
            }
        }
Exemple #4
0
 /// <summary>
 /// Creates an entity processor, forwarding a fresh <c>ModelDefine&lt;T&gt;</c>
 /// together with the supplied collaborators to the base constructor.
 /// </summary>
 /// <param name="extractor">Extractor for the crawler entity; when null, a new <c>ModelExtractor&lt;T&gt;</c> is used instead.</param>
 /// <param name="targetUrlsExtractor">Extractor/filter for target links; passed to the base constructor unchanged (may be null).</param>
 /// <param name="dataHandlers">Handlers that further process the extracted results; passed to the base constructor unchanged.</param>
 public EntityProcessor(IModelExtractor extractor = null, ITargetUrlsExtractor targetUrlsExtractor = null, params IDataHandler[] dataHandlers)
     : base(new ModelDefine<T>(), extractor ?? new ModelExtractor<T>(), targetUrlsExtractor, dataHandlers)
 {
 }