/// <summary>
        /// Creates the processor and wires up the entity extractor and the target URL extractor.
        /// </summary>
        /// <param name="extractor">Entity extractor; when null, an EntityExtractor built from <paramref name="dataHandler"/> and <paramref name="tableName"/> is used instead.</param>
        /// <param name="targetUrlsExtractor">Extractor/filter for target links; when null, the already assigned TargetUrlsExtractor is kept, or a new RegionAndPatternTargetUrlsExtractor is created if none is set.</param>
        /// <param name="dataHandler">Handler that further processes the extracted results; only handed to the default extractor.</param>
        /// <param name="tableName">Database table name of the entity; takes priority over the definition in EntitySelector.</param>
        public EntityProcessor(IEntityExtractor <T> extractor, ITargetUrlsExtractor targetUrlsExtractor, IDataHandler <T> dataHandler, string tableName)
        {
            if (extractor != null)
            {
                Extractor = extractor;
            }
            else
            {
                Extractor = new EntityExtractor <T>(dataHandler, tableName);
            }

            if (targetUrlsExtractor != null)
            {
                TargetUrlsExtractor = targetUrlsExtractor;
            }

            RegionAndPatternTargetUrlsExtractor regionExtractor;
            if (TargetUrlsExtractor != null)
            {
                // A different ITargetUrlsExtractor implementation yields null here.
                regionExtractor = TargetUrlsExtractor as RegionAndPatternTargetUrlsExtractor;
            }
            else
            {
                regionExtractor     = new RegionAndPatternTargetUrlsExtractor();
                TargetUrlsExtractor = regionExtractor;
            }

            // The entity's selectors can only be registered on a RegionAndPatternTargetUrlsExtractor.
            if (regionExtractor == null)
            {
                return;
            }

            var selectors = Extractor.EntityDefine.TargetUrlsSelectors;
            if (selectors == null || selectors.Count <= 0)
            {
                return;
            }

            foreach (var selector in selectors)
            {
                // Trim every entry and drop duplicates; a missing list stays null.
                var urlPatterns  = selector.Patterns?.Select(p => p?.Trim()).Distinct().ToArray();
                var regionXPaths = selector.XPaths?.Select(p => p?.Trim()).Distinct().ToList();

                if (urlPatterns == null && regionXPaths == null)
                {
                    throw new SpiderException("Region xpath and patterns should not be null both");
                }

                if (regionXPaths == null || regionXPaths.Count == 0)
                {
                    // No region given: register the patterns (if any) with a null region.
                    if (urlPatterns != null && urlPatterns.Length > 0)
                    {
                        regionExtractor.AddTargetUrlExtractor(null, urlPatterns);
                    }
                    continue;
                }

                // Every region gets the same set of patterns.
                foreach (var regionXPath in regionXPaths)
                {
                    regionExtractor.AddTargetUrlExtractor(regionXPath, urlPatterns);
                }
            }
        }
        /// <summary>
        /// Creates a processor whose target links are selected by regular expressions.
        /// </summary>
        /// <param name="partterns">Regular expressions matching target links; registered with "." as the region when non-empty.</param>
        /// <param name="excludeParterns">Regular expressions for links to exclude; registered when non-empty.</param>
        public DefaultPageProcessor(string[] partterns = null, string[] excludeParterns = null)
        {
            var hasIncludes = partterns != null && partterns.Length > 0;
            var hasExcludes = excludeParterns != null && excludeParterns.Length > 0;

            var urlExtractor = new RegionAndPatternTargetUrlsExtractor();

            if (hasIncludes)
            {
                urlExtractor.AddTargetUrlExtractor(".", partterns);
            }

            if (hasExcludes)
            {
                urlExtractor.AddExcludeTargetUrlPatterns(excludeParterns);
            }

            TargetUrlsExtractor = urlExtractor;
        }
Beispiel #3
0
        /// <summary>
        /// Downloads the cnblogs home page and checks that the extractor, restricted to the
        /// pager div, produces 12 requests including the link to page 2.
        /// Requires network access to www.cnblogs.com.
        /// </summary>
        public void RegionAndPatternTargetUrlsExtractor()
        {
            string html;

            // HttpClient is IDisposable: release it as soon as the page is downloaded.
            using (var client = new HttpClient())
            {
                // GetAwaiter().GetResult() surfaces the original exception instead of
                // the AggregateException that Task.Result would wrap it in.
                html = client.GetStringAsync("http://www.cnblogs.com").GetAwaiter().GetResult();
            }

            var extractor = new RegionAndPatternTargetUrlsExtractor(".//div[@class='pager']", "/sitehome/p/\\d+", "^http://www\\.cnblogs\\.com/$");

            var site = new Site();
            var page = new Page(new Request("http://cnblogs.com")
            {
                Site = site
            });

            page.Content = html;
            var requests = extractor.ExtractRequests(page, site).ToList();

            Assert.Equal(12, requests.Count);
            Assert.Contains(requests, r => r.Url == "http://cnblogs.com/sitehome/p/2");
        }
Beispiel #4
0
 /// <summary>
 /// Installs a RegionAndPatternTargetUrlsExtractor built from "." and the
 /// pattern "/sitehome/p/\d+" as the target URL extractor.
 /// </summary>
 public CnblogsProcessor3()
 {
     var pageLinkExtractor = new RegionAndPatternTargetUrlsExtractor(".", "/sitehome/p/\\d+");

     TargetUrlsExtractor = pageLinkExtractor;
 }
Beispiel #5
0
 /// <summary>
 /// Installs a RegionAndPatternTargetUrlsExtractor built from the pager div XPath
 /// and two patterns (paged site-home links and the site root) as the target URL extractor.
 /// </summary>
 public CnblogsProcessor2()
 {
     var pagerLinkExtractor = new RegionAndPatternTargetUrlsExtractor(
         ".//div[@class='pager']",
         "/sitehome/p/\\d+",
         "^http://www\\.cnblogs\\.com/$");

     TargetUrlsExtractor = pagerLinkExtractor;
 }
Beispiel #6
0
 /// <summary>
 /// Installs the target URL extractor used by this processor.
 /// </summary>
 public BlogSumaryProcessor()
 {
     // Define the filter for target pages: the site root and the paged site-home links.
     var targetPageFilter = new RegionAndPatternTargetUrlsExtractor(
         ".",
         "^http://www\\.cnblogs\\.com/$",
         "http://www\\.cnblogs\\.com/sitehome/p/\\d+");

     TargetUrlsExtractor = targetPageFilter;
 }
Beispiel #7
0
 /// <summary>
 /// Installs a RegionAndPatternTargetUrlsExtractor built with a null first argument
 /// and three patterns (site root, news index, numbered news pages) as the target URL extractor.
 /// </summary>
 public NewsProcessor()
 {
     var newsLinkExtractor = new RegionAndPatternTargetUrlsExtractor(
         null,
         "^http://www\\.cnblogs\\.com/$",
         "^http://www\\.cnblogs\\.com/news/$",
         "www\\.cnblogs\\.com/news/\\d+");

     TargetUrlsExtractor = newsLinkExtractor;
 }
        /// <summary>
        /// Creates a processor for the given model, wiring up the model extractor,
        /// the target URL extractor and the data handlers.
        /// </summary>
        /// <param name="model">Model to process; must not be null.</param>
        /// <param name="extractor">Model extractor; a new ModelExtractor is used when null.</param>
        /// <param name="targetUrlsExtractor">Extractor/filter for target links; when null, the already assigned TargetUrlsExtractor is kept, or a new RegionAndPatternTargetUrlsExtractor is created if none is set.</param>
        /// <param name="dataHandlers">Handlers that further process the extracted results; null entries are skipped.</param>
        /// <exception cref="ArgumentNullException"><paramref name="model"/> is null.</exception>
        /// <exception cref="SpiderException">A target URL selector of the model has neither XPaths nor patterns.</exception>
        public ModelProcessor(IModel model, IModelExtractor extractor = null, ITargetUrlsExtractor targetUrlsExtractor = null, params IDataHandler[] dataHandlers)
        {
            // The single-string ArgumentNullException constructor takes the parameter name,
            // not a message, so pass the name and the message separately.
            Model = model ?? throw new ArgumentNullException(nameof(model), $"{nameof(model)} should not be null.");

            Extractor = extractor ?? new ModelExtractor();

            if (targetUrlsExtractor != null)
            {
                TargetUrlsExtractor = targetUrlsExtractor;
            }

            RegionAndPatternTargetUrlsExtractor regionAndPatternTargetUrlsExtractor;

            if (TargetUrlsExtractor == null)
            {
                regionAndPatternTargetUrlsExtractor = new RegionAndPatternTargetUrlsExtractor();
                TargetUrlsExtractor = regionAndPatternTargetUrlsExtractor;
            }
            else
            {
                // A different ITargetUrlsExtractor implementation yields null here.
                regionAndPatternTargetUrlsExtractor = TargetUrlsExtractor as RegionAndPatternTargetUrlsExtractor;
            }

            // The model's selectors can only be registered on a RegionAndPatternTargetUrlsExtractor.
            // Do not return early for other extractors: the data handlers below must still be registered.
            var targetUrlsSelectors = Model.TargetUrlsSelectors;
            if (regionAndPatternTargetUrlsExtractor != null && targetUrlsSelectors != null)
            {
                // An empty sequence simply skips the loop, so no separate count check is
                // needed and the sequence is enumerated only once.
                foreach (var targetUrlsSelector in targetUrlsSelectors)
                {
                    // Trim every entry and drop duplicates; a missing list stays null.
                    var patterns = targetUrlsSelector.Patterns?.Select(x => x?.Trim()).Distinct().ToArray();
                    var xpaths   = targetUrlsSelector.XPaths?.Select(x => x?.Trim()).Distinct().ToList();
                    if (xpaths == null && patterns == null)
                    {
                        throw new SpiderException("Region xpath and patterns should not be null both");
                    }
                    if (xpaths != null && xpaths.Count > 0)
                    {
                        // Every region gets the same set of patterns.
                        foreach (var xpath in xpaths)
                        {
                            regionAndPatternTargetUrlsExtractor.AddTargetUrlExtractor(xpath, patterns);
                        }
                    }
                    else if (patterns != null && patterns.Length > 0)
                    {
                        // No region given: register the patterns with a null region.
                        regionAndPatternTargetUrlsExtractor.AddTargetUrlExtractor(null, patterns);
                    }
                }
            }

            if (dataHandlers != null)
            {
                foreach (var datahandler in dataHandlers)
                {
                    if (datahandler != null)
                    {
                        _dataHandlers.Add(datahandler);
                    }
                }
            }
        }
Beispiel #9
0
 /// <summary>
 /// Installs a RegionAndPatternTargetUrlsExtractor built from "." and the
 /// Config_GetListBySpecId URL pattern as the target URL extractor.
 /// </summary>
 public GetExtInfoProcessor()
 {
     // NOTE(review): the trailing "x*" means "zero or more 'x'" in a regex, not a wildcard - confirm intent.
     var extInfoLinkExtractor = new RegionAndPatternTargetUrlsExtractor(
         ".",
         @"^https://mall\.autohome\.com\.cn\/http\/data\.html\?data\[_host\]=//car\.api\.autohome\.com\.cn/v2/carprice/Config_GetListBySpecId\.ashx*");

     TargetUrlsExtractor = extInfoLinkExtractor;
 }
Beispiel #10
0
 /// <summary>
 /// Installs a RegionAndPatternTargetUrlsExtractor built from "." and the
 /// spec_paramsinglebyspecid URL pattern as the target URL extractor.
 /// </summary>
 public GetBasicInfoProcessor()
 {
     // NOTE(review): the trailing "x*" means "zero or more 'x'" in a regex, not a wildcard - confirm intent.
     var basicInfoLinkExtractor = new RegionAndPatternTargetUrlsExtractor(
         ".",
         @"^https://mall\.autohome\.com\.cn/http/data\.html\?data\[_host\]=//car\.api\.autohome\.com\.cn/v1/carprice/spec_paramsinglebyspecid\.ashx*");

     TargetUrlsExtractor = basicInfoLinkExtractor;
 }
Beispiel #11
0
 /// <summary>
 /// Installs a RegionAndPatternTargetUrlsExtractor built from "." and the
 /// mall detail URL pattern as the target URL extractor.
 /// </summary>
 public GetSkuProcessor()
 {
     // NOTE(review): the trailing "/*" means "zero or more '/'" in a regex, not a wildcard - confirm intent.
     var skuLinkExtractor = new RegionAndPatternTargetUrlsExtractor(
         ".",
         @"^https://mall\.autohome\.com\.cn/detail/*");

     TargetUrlsExtractor = skuLinkExtractor;
 }
Beispiel #12
0
 /// <summary>
 /// Installs a RegionAndPatternTargetUrlsExtractor built from "." and a pattern for
 /// mall detail pages of the form "digits-digits-digits.html" as the target URL extractor.
 /// </summary>
 public GetSkuProcessor()
 {
     // The dots of the host name are escaped so that they match a literal '.' only;
     // an unescaped '.' would match any character (e.g. "mallXautohome").
     TargetUrlsExtractor = new RegionAndPatternTargetUrlsExtractor(".", @"^https:\/\/mall\.autohome\.com\.cn\/detail\/*[0-9]+-[0-9]+-[0-9]+\.html$");
 }