Exemple #1
0
        /// <summary>
        /// Wraps the entity type <typeparamref name="T"/> in a new EntityProcessor and registers
        /// that processor through AddPageProcessor.
        /// </summary>
        /// <typeparam name="T">Entity type; constrained to ISpiderEntity.</typeparam>
        /// <param name="targetUrlsExtractor">Target URL extractor passed to the processor.</param>
        /// <param name="dataHandler">Data handler passed to the processor.</param>
        /// <param name="tableName">Table name passed to the processor.</param>
        public void AddEntityType<T>(ITargetUrlsExtractor targetUrlsExtractor, IDataHandler<T> dataHandler, string tableName) where T : ISpiderEntity
        {
            // Runs before anything is created; presumably rejects changes once the spider is running (confirm in CheckIfRunning).
            CheckIfRunning();
            AddPageProcessor(new EntityProcessor<T>(targetUrlsExtractor, dataHandler, tableName));
        }
Exemple #2
0
        /// <summary>
        /// Adds a spider entity class.
        /// </summary>
        /// <typeparam name="T">Type of the spider entity class; the constraint requires a public parameterless constructor.</typeparam>
        /// <param name="targetUrlsExtractor">Parser and filter for target links.</param>
        /// <param name="dataHandler">Further processing applied to the parsed results.</param>
        public void AddEntityType<T>(ITargetUrlsExtractor targetUrlsExtractor, IDataHandler dataHandler) where T : new()
        {
            CheckIfRunning();

            // A null extractor is handed over when UseEntityModelExtrator is set;
            // presumably EntityProcessor then falls back to its own default (confirm in EntityProcessor).
            ModelExtractor modelExtractor = null;
            if (!UseEntityModelExtrator)
            {
                modelExtractor = new ModelExtractor();
            }

            AddPageProcessors(new EntityProcessor<T>(modelExtractor, targetUrlsExtractor, dataHandler));
        }
        /// <summary>
        /// Constructor.
        /// </summary>
        /// <param name="extractor">Extractor for the spider entity; when null, an EntityExtractor is created from <paramref name="dataHandler"/> and <paramref name="tableName"/>.</param>
        /// <param name="targetUrlsExtractor">Parser and filter for target links.</param>
        /// <param name="dataHandler">Further processing applied to the parsed results.</param>
        /// <param name="tableName">Table name of the entity in the database; takes priority over the definition in EntitySelector.</param>
        /// <exception cref="SpiderException">A target-URL selector has neither XPaths nor patterns.</exception>
        public EntityProcessor(IEntityExtractor<T> extractor, ITargetUrlsExtractor targetUrlsExtractor, IDataHandler<T> dataHandler, string tableName)
        {
            if (extractor != null)
            {
                Extractor = extractor;
            }
            else
            {
                Extractor = new EntityExtractor<T>(dataHandler, tableName);
            }

            if (targetUrlsExtractor != null)
            {
                TargetUrlsExtractor = targetUrlsExtractor;
            }

            // Either reuse the configured extractor (when it is region/pattern based) or install a fresh one.
            var regionExtractor = TargetUrlsExtractor as RegionAndPatternTargetUrlsExtractor;
            if (TargetUrlsExtractor == null)
            {
                regionExtractor = new RegionAndPatternTargetUrlsExtractor();
                TargetUrlsExtractor = regionExtractor;
            }

            // Selectors from the entity definition can only be applied to a region/pattern extractor.
            if (regionExtractor == null)
            {
                return;
            }

            var selectors = Extractor.EntityDefine.TargetUrlsSelectors;
            if (selectors == null || selectors.Count == 0)
            {
                return;
            }

            foreach (var selector in selectors)
            {
                // Trim and de-duplicate; both stay null when the selector does not define them.
                var patterns = selector.Patterns?.Select(p => p?.Trim()).Distinct().ToArray();
                var xpaths = selector.XPaths?.Select(p => p?.Trim()).Distinct().ToList();

                if (patterns == null && xpaths == null)
                {
                    throw new SpiderException("Region xpath and patterns should not be null both");
                }

                if (xpaths != null && xpaths.Count > 0)
                {
                    // One registration per region, each sharing the same pattern set.
                    foreach (var xpath in xpaths)
                    {
                        regionExtractor.AddTargetUrlExtractor(xpath, patterns);
                    }

                    continue;
                }

                // No region given: register the patterns without a region.
                if (patterns != null && patterns.Length > 0)
                {
                    regionExtractor.AddTargetUrlExtractor(null, patterns);
                }
            }
        }
Exemple #4
0
 /// <summary>
 /// Constructs a <see cref="TargetUrlsHandler"/> instance.
 /// </summary>
 /// <param name="targetUrlsExtractor">Target link extractor (<see cref="ITargetUrlsExtractor"/>); must not be null.</param>
 /// <param name="extractByProcessor">Whether the <see cref="IPageProcessor"/> should still execute its own <see cref="ITargetUrlsExtractor"/>.</param>
 /// <exception cref="ArgumentNullException"><paramref name="targetUrlsExtractor"/> is null.</exception>
 public TargetUrlsHandler(ITargetUrlsExtractor targetUrlsExtractor, bool extractByProcessor = false)
 {
     if (targetUrlsExtractor == null)
     {
         throw new ArgumentNullException(nameof(targetUrlsExtractor));
     }

     _extractByProcessor = extractByProcessor;
     _targetUrlsExtractor = targetUrlsExtractor;
 }
Exemple #5
0
        /// <summary>
        /// Compares the value selected by CurrenctPageSelector with the value selected by
        /// TotalPageSelector on the given page and reports whether they are equal.
        /// </summary>
        /// <param name="page">Page to inspect.</param>
        /// <param name="creator">Not used by this implementation.</param>
        /// <returns>
        /// false when <paramref name="page"/> is null or its content is null or empty;
        /// otherwise true when the two selected values are equal.
        /// </returns>
        /// <exception cref="SpiderException">TotalPageSelector or CurrenctPageSelector is null.</exception>
        public bool IsTermination(Page page, ITargetUrlsExtractor creator)
        {
            var selectorMissing = TotalPageSelector == null || CurrenctPageSelector == null;
            if (selectorMissing)
            {
                throw new SpiderException("Total page selector or current page selector should not be null.");
            }

            // Nothing to compare on a missing or empty page.
            var content = page?.Content;
            if (string.IsNullOrEmpty(content))
            {
                return false;
            }

            var total = GetSelectorValue(page, TotalPageSelector);
            var current = GetSelectorValue(page, CurrenctPageSelector);
            var reachedLastPage = current == total;

            return reachedLastPage;
        }
Exemple #6
0
 /// <summary>
 /// Forwards to the three-argument AddEntityType overload, passing null as the third argument.
 /// </summary>
 /// <typeparam name="T">Entity type; constrained to ISpiderEntity.</typeparam>
 /// <param name="targetUrlsExtractor">Target URL extractor to forward.</param>
 /// <param name="dataHandler">Data handler to forward.</param>
 public void AddEntityType<T>(ITargetUrlsExtractor targetUrlsExtractor, IDataHandler<T> dataHandler) where T : ISpiderEntity
     => AddEntityType(targetUrlsExtractor, dataHandler, null);
Exemple #7
0
 /// <summary>
 /// Forwards to the three-argument AddEntityType overload, passing null for the second and third arguments.
 /// </summary>
 /// <typeparam name="T">Entity type; constrained to ISpiderEntity.</typeparam>
 /// <param name="targetUrlsExtractor">Target URL extractor to forward.</param>
 public void AddEntityType<T>(ITargetUrlsExtractor targetUrlsExtractor) where T : ISpiderEntity
     => AddEntityType<T>(targetUrlsExtractor, null, null);
Exemple #8
0
 /// <summary>
 /// Adds a spider entity class without a data handler (null is forwarded as the second argument).
 /// </summary>
 /// <typeparam name="T">Type of the spider entity class; the constraint requires a public parameterless constructor.</typeparam>
 /// <param name="targetUrlsExtractor">Parser and filter for target links.</param>
 public void AddEntityType<T>(ITargetUrlsExtractor targetUrlsExtractor) where T : new()
     => AddEntityType<T>(targetUrlsExtractor, null);
Exemple #9
0
 /// <summary>
 /// Constructor. Passes a new ModelDefine for <c>T</c> plus the given arguments to the base constructor.
 /// </summary>
 /// <param name="extractor">Extractor for the spider entity; a new ModelExtractor for <c>T</c> is used when null.</param>
 /// <param name="targetUrlsExtractor">Parser and filter for target links.</param>
 /// <param name="dataHandlers">Further processing applied to the parsed results.</param>
 public EntityProcessor(
     IModelExtractor extractor = null,
     ITargetUrlsExtractor targetUrlsExtractor = null,
     params IDataHandler[] dataHandlers)
     : base(
         new ModelDefine<T>(),
         extractor ?? new ModelExtractor<T>(),
         targetUrlsExtractor,
         dataHandlers)
 {
 }
 /// <summary>
 /// Constructor. Chains to the four-argument constructor with a null extractor.
 /// </summary>
 /// <param name="targetUrlsExtractor">Parser and filter for target links.</param>
 /// <param name="dataHandler">Further processing applied to the parsed results.</param>
 /// <param name="tableName">Table name of the entity in the database; takes priority over the definition in EntitySelector.</param>
 public EntityProcessor(
     ITargetUrlsExtractor targetUrlsExtractor,
     IDataHandler<T> dataHandler,
     string tableName)
     : this(null, targetUrlsExtractor, dataHandler, tableName)
 {
 }
 /// <summary>
 /// Constructor. Chains to the four-argument constructor, passing null for every argument
 /// except <paramref name="targetUrlsExtractor"/>.
 /// </summary>
 /// <param name="targetUrlsExtractor">Parser and filter for target links.</param>
 public EntityProcessor(ITargetUrlsExtractor targetUrlsExtractor)
     : this(null, targetUrlsExtractor, null, null)
 {
 }
        /// <summary>
        /// Constructor.
        /// </summary>
        /// <param name="model">Model definition; must not be null.</param>
        /// <param name="extractor">Extractor for the model; a new ModelExtractor is used when null.</param>
        /// <param name="targetUrlsExtractor">
        /// Parser and filter for target links. When no extractor is configured a new
        /// RegionAndPatternTargetUrlsExtractor is installed. The model's target-URL selectors are
        /// only applied when the effective extractor is a RegionAndPatternTargetUrlsExtractor.
        /// </param>
        /// <param name="dataHandlers">Further processing applied to the parsed results; null entries are ignored.</param>
        /// <exception cref="ArgumentNullException"><paramref name="model"/> is null.</exception>
        /// <exception cref="SpiderException">A target-URL selector has neither XPaths nor patterns.</exception>
        public ModelProcessor(IModel model, IModelExtractor extractor = null, ITargetUrlsExtractor targetUrlsExtractor = null, params IDataHandler[] dataHandlers)
        {
            // The single-string ArgumentNullException constructor treats its argument as the parameter
            // name, so pass the name and the message separately.
            Model = model ?? throw new ArgumentNullException(nameof(model), $"{nameof(model)} should not be null.");

            Extractor = extractor ?? new ModelExtractor();

            if (targetUrlsExtractor != null)
            {
                TargetUrlsExtractor = targetUrlsExtractor;
            }

            RegionAndPatternTargetUrlsExtractor regionAndPatternTargetUrlsExtractor;

            if (TargetUrlsExtractor == null)
            {
                regionAndPatternTargetUrlsExtractor = new RegionAndPatternTargetUrlsExtractor();
                TargetUrlsExtractor = regionAndPatternTargetUrlsExtractor;
            }
            else
            {
                regionAndPatternTargetUrlsExtractor = TargetUrlsExtractor as RegionAndPatternTargetUrlsExtractor;
            }

            // Selectors can only be applied to a region/pattern extractor. Do not return early for
            // other extractor types: the data handlers below must be registered in every case.
            if (regionAndPatternTargetUrlsExtractor != null && Model.TargetUrlsSelectors != null)
            {
                foreach (var targetUrlsSelector in Model.TargetUrlsSelectors)
                {
                    // Trim and de-duplicate; both stay null when the selector does not define them.
                    var patterns = targetUrlsSelector.Patterns?.Select(x => x?.Trim()).Distinct().ToArray();
                    var xpaths = targetUrlsSelector.XPaths?.Select(x => x?.Trim()).Distinct().ToList();

                    if (xpaths == null && patterns == null)
                    {
                        throw new SpiderException("Region xpath and patterns should not be null both");
                    }

                    if (xpaths != null && xpaths.Count > 0)
                    {
                        // One registration per region, each sharing the same pattern set.
                        foreach (var xpath in xpaths)
                        {
                            regionAndPatternTargetUrlsExtractor.AddTargetUrlExtractor(xpath, patterns);
                        }
                    }
                    else if (patterns != null && patterns.Length > 0)
                    {
                        // No region given: register the patterns without a region.
                        regionAndPatternTargetUrlsExtractor.AddTargetUrlExtractor(null, patterns);
                    }
                }
            }

            if (dataHandlers != null)
            {
                foreach (var dataHandler in dataHandlers)
                {
                    if (dataHandler != null)
                    {
                        _dataHandlers.Add(dataHandler);
                    }
                }
            }
        }