Beispiel #1
0
        /// <summary>
        /// Creates the processor, builds the entity extractor and registers target URL
        /// extractors for every target URL selector exposed by the extractor's entity definition.
        /// </summary>
        /// <param name="site">Value stored in <c>Site</c>.</param>
        /// <param name="dataHandler">Forwarded to the <c>EntityExtractor</c> constructor.</param>
        /// <param name="tableName">Forwarded to the <c>EntityExtractor</c> constructor.</param>
        /// <remarks>
        /// Thrown as <c>SpiderException</c> when a selector has neither XPaths nor Patterns.
        /// </remarks>
        public EntityProcessor(Site site, DataHandler<T> dataHandler = null, string tableName = null)
        {
            Site = site;
            Extractor = new EntityExtractor<T>(dataHandler, tableName);

            var selectors = Extractor.EntityDefine.TargetUrlsSelectors;
            if (selectors == null || selectors.Count <= 0)
            {
                // Nothing declared on the entity: no target URL extractor is registered.
                return;
            }

            foreach (var selector in selectors)
            {
                // Entries are trimmed and de-duplicated; a missing collection stays null.
                var urlPatterns = selector.Patterns?.Select(x => x?.Trim()).Distinct().ToArray();
                var regionXPaths = selector.XPaths?.Select(x => x?.Trim()).Distinct().ToList();

                if (regionXPaths == null && urlPatterns == null)
                {
                    throw new SpiderException("Region xpath and patterns should not be null both.");
                }

                if (regionXPaths != null && regionXPaths.Count > 0)
                {
                    // One registration per distinct region, all sharing the same patterns.
                    foreach (var regionXPath in regionXPaths)
                    {
                        AddTargetUrlExtractor(regionXPath, urlPatterns);
                    }

                    continue;
                }

                // No region given: register the patterns with a null region.
                if (urlPatterns != null && urlPatterns.Length > 0)
                {
                    AddTargetUrlExtractor(null, urlPatterns);
                }
            }
        }
Beispiel #2
0
        /// <summary>
        /// Builds an <c>EntityLoader</c> for MSSQL tables and columns.
        /// </summary>
        /// <param name="entityExtractor">Passed through to the loader unchanged; may be null.</param>
        /// <param name="propertyExtractor">Passed through to the loader unchanged; may be null.</param>
        /// <param name="relationshipExtractor">Passed through to the loader unchanged; may be null.</param>
        /// <param name="tableContentHandlers">
        /// Optional handlers for table content; when non-empty they are wrapped in a single
        /// <c>ContentEntityHandler</c> that becomes the only table handler.
        /// </param>
        /// <param name="propertyContentHandlers">
        /// Optional handlers for column content; when non-empty they are wrapped in a single
        /// <c>ContentEntityHandler</c> appended after the two built-in column handlers.
        /// </param>
        /// <returns>The newly created loader.</returns>
        public IEntityLoader<MSSQLTable<TTableContent, TColumnContent>> GetEntityLoader(
            IEntityExtractor<IBaseObject<TTableContent>> entityExtractor = null,
            IEntityPropertyExtractor<ITable<TTableContent, TColumnContent>, TTableContent, IColumn<TColumnContent>, TColumnContent> propertyExtractor = null,
            IEntityRelationshipExtractor<ITable<TTableContent, TColumnContent>, TTableContent, IColumn<TColumnContent>, TColumnContent> relationshipExtractor = null,
            IEntityHandler<TTableContent, EntityInfo>[] tableContentHandlers = null,
            IEntityHandler<TColumnContent, SimplePropertyEntityInfo>[] propertyContentHandlers = null)
        {
            var tableLevelHandlers = new List<IEntityHandler<MSSQLTable<TTableContent, TColumnContent>, EntityInfo>>();

            // The two MSSQL column handlers are always present, in this order.
            var columnLevelHandlers = new List<IEntityHandler<MSSQLColumn<TTableContent, TColumnContent>, SimplePropertyEntityInfo>>();
            columnLevelHandlers.Add(new MSSQLColumnHandler<TTableContent, TColumnContent>());
            columnLevelHandlers.Add(new MSSQLColumnDefaultValuetHandler<TTableContent, TColumnContent>());

            var hasTableContentHandlers = tableContentHandlers != null && tableContentHandlers.Length != 0;
            if (hasTableContentHandlers)
            {
                tableLevelHandlers.Add(
                    new ContentEntityHandler<MSSQLTable<TTableContent, TColumnContent>, TTableContent, EntityInfo>(tableContentHandlers));
            }

            var hasPropertyContentHandlers = propertyContentHandlers != null && propertyContentHandlers.Length != 0;
            if (hasPropertyContentHandlers)
            {
                columnLevelHandlers.Add(
                    new ContentEntityHandler<MSSQLColumn<TTableContent, TColumnContent>, TColumnContent, SimplePropertyEntityInfo>(propertyContentHandlers));
            }

            return new EntityLoader<MSSQLTable<TTableContent, TColumnContent>, TTableContent, MSSQLColumn<TTableContent, TColumnContent>, TColumnContent>(
                entityExtractor,
                propertyExtractor,
                relationshipExtractor,
                tableLevelHandlers,
                columnLevelHandlers);
        }
 /// <summary>
 /// Creates the processor for <paramref name="entity"/> and registers one target URL
 /// extractor per distinct (trimmed) region XPath of each target URL selector.
 /// </summary>
 /// <param name="site">Value stored in <c>Site</c>.</param>
 /// <param name="entity">
 /// Entity metadata; its name, shared values and the metadata itself are handed to the
 /// <c>EntityExtractor</c> constructor, and its <c>TargetUrlsSelectors</c> are read here.
 /// </param>
 /// <remarks>
 /// Thrown as <c>SpiderException</c> when a selector has neither XPaths nor Patterns.
 /// A selector whose XPaths is null gets an empty array written back to it.
 /// NOTE(review): a selector that declares only Patterns therefore registers nothing —
 /// confirm this is intended.
 /// </remarks>
 public EntityProcessor(Site site, EntityMetadata entity)
 {
     Site       = site;
     _entity    = entity;
     _extractor = new EntityExtractor(entity.Entity.Name, entity.SharedValues, entity);

     if (entity.TargetUrlsSelectors == null || entity.TargetUrlsSelectors.Count <= 0)
     {
         return;
     }

     foreach (var targetUrlsSelector in entity.TargetUrlsSelectors)
     {
         if (targetUrlsSelector.XPaths == null && targetUrlsSelector.Patterns == null)
         {
             throw new SpiderException("Region xpath and patterns should not be null both.");
         }

         // Normalize a missing XPath list on the selector itself so the loop below
         // (and any later reader of the selector) never sees null.
         if (targetUrlsSelector.XPaths == null)
         {
             targetUrlsSelector.XPaths = new string[] { };
         }

         // XPaths is non-null at this point, so no null-conditional access is needed.
         foreach (var xpath in targetUrlsSelector.XPaths.Select(x => x?.Trim()).Distinct())
         {
             AddTargetUrlExtractor(xpath, targetUrlsSelector.Patterns);
         }
     }
 }
        /// <summary>
        /// Constructor.
        /// </summary>
        /// <param name="extractor">Extractor (parser) for the spider entity.</param>
        /// <param name="targetUrlsExtractor">Extractor and filter for target links.</param>
        /// <param name="dataHandler">Further processing applied to the parsed results.</param>
        /// <param name="tableName">Table name of the entity in the database; takes precedence over the definition in EntitySelector.</param>
        /// <remarks>
        /// <paramref name="dataHandler"/> and <paramref name="tableName"/> are only used when
        /// <paramref name="extractor"/> is null, to build a new <c>EntityExtractor</c>.
        /// Target URL selectors from the entity definition are registered only when the effective
        /// target URL extractor is a <c>RegionAndPatternTargetUrlsExtractor</c>.
        /// Thrown as <c>SpiderException</c> when a selector has neither XPaths nor Patterns.
        /// </remarks>
        public EntityProcessor(IEntityExtractor<T> extractor, ITargetUrlsExtractor targetUrlsExtractor, IDataHandler<T> dataHandler, string tableName)
        {
            if (extractor != null)
            {
                Extractor = extractor;
            }
            else
            {
                Extractor = new EntityExtractor<T>(dataHandler, tableName);
            }

            // A null argument leaves whatever TargetUrlsExtractor currently holds untouched.
            if (targetUrlsExtractor != null)
            {
                TargetUrlsExtractor = targetUrlsExtractor;
            }

            RegionAndPatternTargetUrlsExtractor regionExtractor;
            if (TargetUrlsExtractor != null)
            {
                regionExtractor = TargetUrlsExtractor as RegionAndPatternTargetUrlsExtractor;
            }
            else
            {
                regionExtractor = new RegionAndPatternTargetUrlsExtractor();
                TargetUrlsExtractor = regionExtractor;
            }

            // Any other extractor implementation cannot take region/pattern registrations.
            if (regionExtractor == null)
            {
                return;
            }

            var selectors = Extractor.EntityDefine.TargetUrlsSelectors;
            if (selectors == null || selectors.Count <= 0)
            {
                return;
            }

            foreach (var selector in selectors)
            {
                // Entries are trimmed and de-duplicated; a missing collection stays null.
                var urlPatterns = selector.Patterns?.Select(x => x?.Trim()).Distinct().ToArray();
                var regionXPaths = selector.XPaths?.Select(x => x?.Trim()).Distinct().ToList();

                if (regionXPaths == null && urlPatterns == null)
                {
                    throw new SpiderException("Region xpath and patterns should not be null both");
                }

                if (regionXPaths != null && regionXPaths.Count > 0)
                {
                    foreach (var regionXPath in regionXPaths)
                    {
                        regionExtractor.AddTargetUrlExtractor(regionXPath, urlPatterns);
                    }

                    continue;
                }

                // No region given: register the patterns with a null region.
                if (urlPatterns != null && urlPatterns.Length > 0)
                {
                    regionExtractor.AddTargetUrlExtractor(null, urlPatterns);
                }
            }
        }
Beispiel #5
0
        /// <summary>
        ///     Creates an entity loader, substituting defaults for the collaborators that are omitted.
        /// </summary>
        /// <param name="entityExtractor">Entity extractor; a new <c>EntityExtractor</c> is used when null.</param>
        /// <param name="propertyExtractor">Property extractor; a new <c>EntityPropertyExtractor</c> is used when null.</param>
        /// <param name="relationshipExtractor">Relationship extractor; stored as given, so it may remain null.</param>
        /// <param name="tableHandlers">Table handlers; an empty array is used when null.</param>
        /// <param name="simplePropertiesHandlers">Simple property handlers; an empty array is used when null.</param>
        public EntityLoader(
            IEntityExtractor<TTable> entityExtractor = null,
            IEntityPropertyExtractor<TTable, TTableContent, TColumn, TColumnContent> propertyExtractor = null,
            IEntityRelationshipExtractor<TTable, TTableContent, TColumn, TColumnContent> relationshipExtractor = null,
            IReadOnlyCollection<IEntityHandler<TTable, EntityInfo>> tableHandlers = null,
            IReadOnlyCollection<IEntityHandler<TColumn, SimplePropertyEntityInfo>> simplePropertiesHandlers = null)
        {
            // Step 1: replace missing arguments with their defaults.
            if (entityExtractor == null)
            {
                entityExtractor = new EntityExtractor<TTable, TTableContent>();
            }

            if (propertyExtractor == null)
            {
                propertyExtractor = new EntityPropertyExtractor<ITable<TTableContent, TColumnContent>, TTableContent, IColumn<TColumnContent>, TColumnContent>();
            }

            if (tableHandlers == null)
            {
                tableHandlers = new IEntityHandler<TTable, EntityInfo>[0];
            }

            if (simplePropertiesHandlers == null)
            {
                simplePropertiesHandlers = new IEntityHandler<TColumn, SimplePropertyEntityInfo>[0];
            }

            // Step 2: store the collaborators. The relationship extractor has no default.
            _entityExtractor          = entityExtractor;
            _propertyExtractor        = propertyExtractor;
            _relationshipExtractor    = relationshipExtractor;
            _tableHandlers            = tableHandlers;
            _simplePropertiesHandlers = simplePropertiesHandlers;
        }
Beispiel #6
0
 /// <summary>
 /// Creates the processor for <paramref name="entity"/> and, when the entity declares a
 /// target URL extractor, converts its patterns and XPaths into the sets stored in
 /// <c>TargetUrlPatterns</c> and <c>TargetUrlRegions</c>.
 /// </summary>
 /// <param name="site">Value stored in <c>Site</c>.</param>
 /// <param name="entity">
 /// Entity metadata; its name, shared values and the metadata itself are handed to the
 /// <c>EntityExtractor</c> constructor.
 /// </param>
 public EntityProcessor(Site site, EntityMetadata entity)
 {
     Site       = site;
     _entity    = entity;
     _extractor = new EntityExtractor(entity.Entity.Name, entity.SharedValues, entity);

     var urlExtractor = entity.TargetUrlExtractor;
     if (urlExtractor == null)
     {
         return;
     }

     // Each pattern string becomes its own Regex instance; the set is left unassigned
     // when no pattern is declared.
     var patterns = urlExtractor.Patterns;
     if (patterns != null && patterns.Length > 0)
     {
         var regexes = patterns.Select(p => new Regex(p));
         TargetUrlPatterns = new HashSet<Regex>(regexes);
     }

     // Each XPath string becomes an XPath selector; the set is left unassigned when no
     // XPath is declared.
     var xpaths = urlExtractor.XPaths;
     if (xpaths != null && xpaths.Length > 0)
     {
         var regions = xpaths.Select(x => Selectors.XPath(x));
         TargetUrlRegions = new HashSet<ISelector>(regions);
     }
 }
Beispiel #7
0
        /// <summary>
        /// Creates the processor, builds the entity extractor and registers one target URL
        /// extractor per distinct (trimmed) region XPath of each target URL selector.
        /// </summary>
        /// <param name="site">Value stored in <c>Site</c>.</param>
        /// <param name="dataHandler">Forwarded to the <c>EntityExtractor</c> constructor.</param>
        /// <param name="tableName">Not read anywhere in this constructor.</param>
        /// <remarks>
        /// Thrown as <c>SpiderException</c> when a selector has neither XPaths nor Patterns.
        /// A selector whose XPaths is null gets an empty array written back to it, so a selector
        /// that declares only Patterns registers nothing.
        /// </remarks>
        public EntityProcessor(Site site, DataHandler<T> dataHandler = null, string tableName = null)
        {
            Site = site;
            Extractor = new EntityExtractor<T>(dataHandler);

            var selectors = Extractor.EntityDefine.TargetUrlsSelectors;
            if (selectors == null || selectors.Count <= 0)
            {
                return;
            }

            foreach (var selector in selectors)
            {
                var xpathsMissing = selector.XPaths == null;

                if (xpathsMissing && selector.Patterns == null)
                {
                    throw new SpiderException("Region xpath and patterns should not be null both.");
                }

                if (xpathsMissing)
                {
                    // Written back to the selector itself, not just used locally.
                    selector.XPaths = new string[] { };
                }

                var distinctXPaths = selector.XPaths?.Select(x => x?.Trim()).Distinct();
                foreach (var regionXPath in distinctXPaths)
                {
                    AddTargetUrlExtractor(regionXPath, selector.Patterns);
                }
            }
        }
 /// <summary>
 /// Constructor. Delegates to the four-argument overload, passing null for every
 /// argument other than <paramref name="extractor"/>.
 /// </summary>
 /// <param name="extractor">Extractor (parser) for the spider entity.</param>
 public EntityProcessor(IEntityExtractor <T> extractor) : this(extractor, null, null, null)
 {
 }
Beispiel #9
0
 /// <summary>
 /// Creates the extractor from its two required collaborators.
 /// </summary>
 /// <param name="weatherEntityExtractor">Extractor stored in <c>_infoExtractor</c>; must not be null.</param>
 /// <param name="logger">Logger stored in <c>_logger</c>; must not be null.</param>
 /// <exception cref="ArgumentNullException">
 /// <paramref name="weatherEntityExtractor"/> or <paramref name="logger"/> is null
 /// (checked in that order).
 /// </exception>
 public EntityExtractor(IEntityExtractor<WeatherInfo> weatherEntityExtractor, ILog logger)
 {
     if (weatherEntityExtractor == null)
     {
         throw new ArgumentNullException(nameof(weatherEntityExtractor));
     }

     if (logger == null)
     {
         throw new ArgumentNullException(nameof(logger));
     }

     _infoExtractor = weatherEntityExtractor;
     _logger        = logger;
 }