/// <summary>
/// Creates a processor for the given site and registers one target-URL extractor per
/// region XPath (or a single region-less one) for every selector on the entity definition.
/// </summary>
/// <param name="site">Value assigned to <c>Site</c>.</param>
/// <param name="dataHandler">Optional handler forwarded to the <c>EntityExtractor</c> constructor.</param>
/// <param name="tableName">Optional table name forwarded to the <c>EntityExtractor</c> constructor.</param>
/// <exception cref="SpiderException">A selector has both XPaths and Patterns set to null.</exception>
public EntityProcessor(Site site, DataHandler<T> dataHandler = null, string tableName = null)
{
    Site = site;
    Extractor = new EntityExtractor<T>(dataHandler, tableName);

    var selectors = Extractor.EntityDefine.TargetUrlsSelectors;
    if (selectors == null || selectors.Count == 0)
    {
        // Nothing declared on the entity definition: no extractors to register.
        return;
    }

    foreach (var selector in selectors)
    {
        // Entries are trimmed (null entries stay null) and de-duplicated before use.
        var urlPatterns = selector.Patterns?.Select(p => p?.Trim()).Distinct().ToArray();
        var regionXPaths = selector.XPaths?.Select(p => p?.Trim()).Distinct().ToList();

        if (regionXPaths == null && urlPatterns == null)
        {
            throw new SpiderException("Region xpath and patterns should not be null both.");
        }

        if (regionXPaths != null && regionXPaths.Count > 0)
        {
            // One extractor per region, each sharing the selector's patterns.
            foreach (var regionXPath in regionXPaths)
            {
                AddTargetUrlExtractor(regionXPath, urlPatterns);
            }

            continue;
        }

        // No regions: register the patterns without a region, if there are any.
        if (urlPatterns != null && urlPatterns.Length > 0)
        {
            AddTargetUrlExtractor(null, urlPatterns);
        }
    }
}
/// <summary>
/// Builds an <c>EntityLoader</c> for MSSQL tables and columns.
/// </summary>
/// <param name="entityExtractor">Optional entity extractor, passed to the loader unchanged.</param>
/// <param name="propertyExtractor">Optional property extractor, passed to the loader unchanged.</param>
/// <param name="relationshipExtractor">Optional relationship extractor, passed to the loader unchanged.</param>
/// <param name="tableContentHandlers">
/// Optional table-content handlers; when non-empty they are wrapped in a single
/// <c>ContentEntityHandler</c> and used as the only table handler.
/// </param>
/// <param name="propertyContentHandlers">
/// Optional column-content handlers; when non-empty they are wrapped in a single
/// <c>ContentEntityHandler</c> appended after the two built-in column handlers.
/// </param>
/// <returns>The newly created loader.</returns>
public IEntityLoader<MSSQLTable<TTableContent, TColumnContent>> GetEntityLoader(
    IEntityExtractor<IBaseObject<TTableContent>> entityExtractor = null,
    IEntityPropertyExtractor<ITable<TTableContent, TColumnContent>, TTableContent, IColumn<TColumnContent>, TColumnContent> propertyExtractor = null,
    IEntityRelationshipExtractor<ITable<TTableContent, TColumnContent>, TTableContent, IColumn<TColumnContent>, TColumnContent> relationshipExtractor = null,
    IEntityHandler<TTableContent, EntityInfo>[] tableContentHandlers = null,
    IEntityHandler<TColumnContent, SimplePropertyEntityInfo>[] propertyContentHandlers = null)
{
    var tableHandlers = new List<IEntityHandler<MSSQLTable<TTableContent, TColumnContent>, EntityInfo>>();

    // The two built-in column handlers always come first, in this order.
    var columnHandlers = new List<IEntityHandler<MSSQLColumn<TTableContent, TColumnContent>, SimplePropertyEntityInfo>>();
    columnHandlers.Add(new MSSQLColumnHandler<TTableContent, TColumnContent>());
    columnHandlers.Add(new MSSQLColumnDefaultValuetHandler<TTableContent, TColumnContent>());

    if (tableContentHandlers != null && tableContentHandlers.Length > 0)
    {
        tableHandlers.Add(
            new ContentEntityHandler<MSSQLTable<TTableContent, TColumnContent>, TTableContent, EntityInfo>(tableContentHandlers));
    }

    if (propertyContentHandlers != null && propertyContentHandlers.Length > 0)
    {
        columnHandlers.Add(
            new ContentEntityHandler<MSSQLColumn<TTableContent, TColumnContent>, TColumnContent, SimplePropertyEntityInfo>(propertyContentHandlers));
    }

    return new EntityLoader<MSSQLTable<TTableContent, TColumnContent>, TTableContent, MSSQLColumn<TTableContent, TColumnContent>, TColumnContent>(
        entityExtractor,
        propertyExtractor,
        relationshipExtractor,
        tableHandlers,
        columnHandlers);
}
/// <summary>
/// Creates a processor for the given site and entity metadata and registers target-URL
/// extractors for every selector declared on the metadata.
/// </summary>
/// <param name="site">Value assigned to <c>Site</c>.</param>
/// <param name="entity">
/// Entity metadata; its name, shared values and the metadata itself are passed to the
/// <c>EntityExtractor</c>, and its <c>TargetUrlsSelectors</c> drive extractor registration.
/// </param>
/// <exception cref="SpiderException">A selector has both XPaths and Patterns set to null.</exception>
public EntityProcessor(Site site, EntityMetadata entity)
{
    Site = site;
    _entity = entity;
    _extractor = new EntityExtractor(entity.Entity.Name, entity.SharedValues, entity);

    if (entity.TargetUrlsSelectors == null || entity.TargetUrlsSelectors.Count == 0)
    {
        return;
    }

    foreach (var targetUrlsSelector in entity.TargetUrlsSelectors)
    {
        if (targetUrlsSelector.XPaths == null && targetUrlsSelector.Patterns == null)
        {
            throw new SpiderException("Region xpath and patterns should not be null both.");
        }

        if (targetUrlsSelector.XPaths == null)
        {
            // Null XPaths are normalized to an empty array on the selector itself, as before.
            // NOTE(review): other code may rely on XPaths being non-null afterwards - confirm
            // before removing this write to the caller's metadata.
            targetUrlsSelector.XPaths = new string[] { };
        }

        // Trim each XPath (null entries stay null) and drop duplicates.
        var xpaths = targetUrlsSelector.XPaths.Select(x => x?.Trim()).Distinct().ToList();

        if (xpaths.Count > 0)
        {
            foreach (var xpath in xpaths)
            {
                AddTargetUrlExtractor(xpath, targetUrlsSelector.Patterns);
            }
        }
        else if (targetUrlsSelector.Patterns != null && targetUrlsSelector.Patterns.Any())
        {
            // Pattern-only selector: previously the loop over the empty XPath array ran
            // zero times, so these patterns were silently never registered.
            AddTargetUrlExtractor(null, targetUrlsSelector.Patterns);
        }
    }
}
/// <summary>
/// Constructor.
/// </summary>
/// <param name="extractor">Parser for the spider entity; when null an <c>EntityExtractor</c> is created from <paramref name="dataHandler"/> and <paramref name="tableName"/>.</param>
/// <param name="targetUrlsExtractor">Extractor and filter for target links; when null the current <c>TargetUrlsExtractor</c> value is kept, or a <c>RegionAndPatternTargetUrlsExtractor</c> is created if there is none.</param>
/// <param name="dataHandler">Performs further processing on the parsed results.</param>
/// <param name="tableName">Table name of the entity in the database; this takes precedence over the definition in EntitySelector.</param>
/// <exception cref="SpiderException">A selector has both XPaths and Patterns set to null.</exception>
public EntityProcessor(IEntityExtractor<T> extractor, ITargetUrlsExtractor targetUrlsExtractor, IDataHandler<T> dataHandler, string tableName)
{
    if (extractor != null)
    {
        Extractor = extractor;
    }
    else
    {
        Extractor = new EntityExtractor<T>(dataHandler, tableName);
    }

    if (targetUrlsExtractor != null)
    {
        TargetUrlsExtractor = targetUrlsExtractor;
    }

    RegionAndPatternTargetUrlsExtractor regionExtractor;
    if (TargetUrlsExtractor != null)
    {
        // An extractor is already in place; selectors can only be registered on it
        // when it is a RegionAndPatternTargetUrlsExtractor.
        regionExtractor = TargetUrlsExtractor as RegionAndPatternTargetUrlsExtractor;
    }
    else
    {
        regionExtractor = new RegionAndPatternTargetUrlsExtractor();
        TargetUrlsExtractor = regionExtractor;
    }

    if (regionExtractor == null)
    {
        return;
    }

    var selectors = Extractor.EntityDefine.TargetUrlsSelectors;
    if (selectors == null || selectors.Count == 0)
    {
        return;
    }

    foreach (var selector in selectors)
    {
        // Entries are trimmed (null entries stay null) and de-duplicated before use.
        var urlPatterns = selector.Patterns?.Select(p => p?.Trim()).Distinct().ToArray();
        var regionXPaths = selector.XPaths?.Select(p => p?.Trim()).Distinct().ToList();

        if (regionXPaths == null && urlPatterns == null)
        {
            throw new SpiderException("Region xpath and patterns should not be null both");
        }

        if (regionXPaths != null && regionXPaths.Count > 0)
        {
            foreach (var regionXPath in regionXPaths)
            {
                regionExtractor.AddTargetUrlExtractor(regionXPath, urlPatterns);
            }

            continue;
        }

        // No regions: register the patterns without a region, if there are any.
        if (urlPatterns != null && urlPatterns.Length > 0)
        {
            regionExtractor.AddTargetUrlExtractor(null, urlPatterns);
        }
    }
}
/// <summary>
/// Initializes a new loader, substituting a default for most arguments that are null.
/// </summary>
/// <param name="entityExtractor">Entity extractor; a new <c>EntityExtractor</c> is used when null.</param>
/// <param name="propertyExtractor">Property extractor; a new <c>EntityPropertyExtractor</c> is used when null.</param>
/// <param name="relationshipExtractor">Relationship extractor; stored as given, so it may remain null.</param>
/// <param name="tableHandlers">Table handlers; an empty array is used when null.</param>
/// <param name="simplePropertiesHandlers">Simple-property (column) handlers; an empty array is used when null.</param>
public EntityLoader(
    IEntityExtractor<TTable> entityExtractor = null,
    IEntityPropertyExtractor<TTable, TTableContent, TColumn, TColumnContent> propertyExtractor = null,
    IEntityRelationshipExtractor<TTable, TTableContent, TColumn, TColumnContent> relationshipExtractor = null,
    IReadOnlyCollection<IEntityHandler<TTable, EntityInfo>> tableHandlers = null,
    IReadOnlyCollection<IEntityHandler<TColumn, SimplePropertyEntityInfo>> simplePropertiesHandlers = null)
{
    if (entityExtractor != null)
    {
        _entityExtractor = entityExtractor;
    }
    else
    {
        _entityExtractor = new EntityExtractor<TTable, TTableContent>();
    }

    if (propertyExtractor != null)
    {
        _propertyExtractor = propertyExtractor;
    }
    else
    {
        _propertyExtractor = new EntityPropertyExtractor<ITable<TTableContent, TColumnContent>, TTableContent, IColumn<TColumnContent>, TColumnContent>();
    }

    // No default is created for the relationship extractor.
    _relationshipExtractor = relationshipExtractor;

    if (tableHandlers != null)
    {
        _tableHandlers = tableHandlers;
    }
    else
    {
        _tableHandlers = new IEntityHandler<TTable, EntityInfo>[] { };
    }

    if (simplePropertiesHandlers != null)
    {
        _simplePropertiesHandlers = simplePropertiesHandlers;
    }
    else
    {
        _simplePropertiesHandlers = new IEntityHandler<TColumn, SimplePropertyEntityInfo>[] { };
    }
}
/// <summary>
/// Creates a processor for the given site and entity metadata, and builds the target-URL
/// pattern and region sets from the metadata's <c>TargetUrlExtractor</c> when it is present.
/// </summary>
/// <param name="site">Value assigned to <c>Site</c>.</param>
/// <param name="entity">
/// Entity metadata; its name, shared values and the metadata itself are passed to the
/// <c>EntityExtractor</c>.
/// </param>
public EntityProcessor(Site site, EntityMetadata entity)
{
    Site = site;
    _entity = entity;
    _extractor = new EntityExtractor(entity.Entity.Name, entity.SharedValues, entity);

    var urlExtractor = entity.TargetUrlExtractor;
    if (urlExtractor == null)
    {
        return;
    }

    // Each non-empty list is converted once; an absent or empty list leaves the
    // corresponding property untouched.
    var patterns = urlExtractor.Patterns;
    if (patterns != null && patterns.Length > 0)
    {
        var regexes = patterns.Select(p => new Regex(p));
        TargetUrlPatterns = new HashSet<Regex>(regexes);
    }

    var xpaths = urlExtractor.XPaths;
    if (xpaths != null && xpaths.Length > 0)
    {
        var regions = xpaths.Select(x => Selectors.XPath(x));
        TargetUrlRegions = new HashSet<ISelector>(regions);
    }
}
/// <summary>
/// Creates a processor for the given site and registers target-URL extractors for every
/// selector declared on the entity definition.
/// </summary>
/// <param name="site">Value assigned to <c>Site</c>.</param>
/// <param name="dataHandler">Optional handler forwarded to the <c>EntityExtractor</c> constructor.</param>
/// <param name="tableName">
/// Not used by this constructor.
/// NOTE(review): the value is accepted but never forwarded to the extractor - confirm whether
/// <c>EntityExtractor</c> should receive it.
/// </param>
/// <exception cref="SpiderException">A selector has both XPaths and Patterns set to null.</exception>
public EntityProcessor(Site site, DataHandler<T> dataHandler = null, string tableName = null)
{
    Site = site;
    Extractor = new EntityExtractor<T>(dataHandler);

    var selectors = Extractor.EntityDefine.TargetUrlsSelectors;
    if (selectors == null || selectors.Count == 0)
    {
        return;
    }

    foreach (var targetUrlsSelector in selectors)
    {
        if (targetUrlsSelector.XPaths == null && targetUrlsSelector.Patterns == null)
        {
            throw new SpiderException("Region xpath and patterns should not be null both.");
        }

        if (targetUrlsSelector.XPaths == null)
        {
            // Null XPaths are normalized to an empty array on the selector itself, as before.
            // NOTE(review): other code may rely on XPaths being non-null afterwards - confirm
            // before removing this write to the shared definition.
            targetUrlsSelector.XPaths = new string[] { };
        }

        // Trim each XPath (null entries stay null) and drop duplicates.
        var xpaths = targetUrlsSelector.XPaths.Select(x => x?.Trim()).Distinct().ToList();

        if (xpaths.Count > 0)
        {
            foreach (var xpath in xpaths)
            {
                AddTargetUrlExtractor(xpath, targetUrlsSelector.Patterns);
            }
        }
        else if (targetUrlsSelector.Patterns != null && targetUrlsSelector.Patterns.Any())
        {
            // Pattern-only selector: previously the loop over the empty XPath array ran
            // zero times, so these patterns were silently never registered.
            AddTargetUrlExtractor(null, targetUrlsSelector.Patterns);
        }
    }
}
/// <summary>
/// Constructor. Delegates to the four-argument constructor, passing null for everything
/// except the extractor.
/// </summary>
/// <param name="extractor">Parser for the spider entity.</param>
public EntityProcessor(IEntityExtractor<T> extractor)
    : this(extractor, targetUrlsExtractor: null, dataHandler: null, tableName: null)
{
}
/// <summary>
/// Creates the extractor from an inner weather-info extractor and a logger.
/// </summary>
/// <param name="weatherEntityExtractor">Inner extractor stored in <c>_infoExtractor</c>; must not be null.</param>
/// <param name="logger">Logger stored in <c>_logger</c>; must not be null.</param>
/// <exception cref="ArgumentNullException">
/// <paramref name="weatherEntityExtractor"/> or <paramref name="logger"/> is null.
/// </exception>
public EntityExtractor(IEntityExtractor<WeatherInfo> weatherEntityExtractor, ILog logger)
{
    // Validation and assignment are interleaved so the arguments are checked in
    // declaration order, exactly one at a time.
    if (weatherEntityExtractor == null)
    {
        throw new ArgumentNullException(nameof(weatherEntityExtractor));
    }

    _infoExtractor = weatherEntityExtractor;

    if (logger == null)
    {
        throw new ArgumentNullException(nameof(logger));
    }

    _logger = logger;
}