示例#1
0
        /// <summary>
        /// Queues the liwushuo channel-items API URLs for channels 1 through 49
        /// (100 pages per channel, 50 items per page, offsets 0 to 4950) and then
        /// runs a single-threaded spider over them.
        /// </summary>
        public void SatrtSpider()
        {
            // No extra start requests are supplied; the list stays empty.
            var extraRequests = new List<Request>();

            var site = new DotnetSpider.Core.Site();
            site.EncodingName    = "UTF-8";
            site.CycleRetryTimes = 1;

            // Enumerate every channel, and within each channel every page offset.
            for (int channel = 1; channel <= 49; channel++)
            {
                for (int offset = 0; offset < 100 * 50; offset += 50)
                {
                    site.AddStartUrl($"http://www.liwushuo.com/api/channels/{channel}/items?limit=50&offset={offset}");
                }
            }

            // Each fluent step is kept in its own local so the call order is explicit.
            var created = Spider.Create(site, new GiftDetailsPageProcessor());
            var seeded  = created.AddStartRequests(extraRequests.ToArray());
            var spider  = seeded.AddPipeline(new GiftDetailsPipeliner());

            spider.ThreadNum = 1;
            spider.Run();
        }
示例#2
0
        /// <summary>
        /// Creates the processor, builds the entity extractor, and registers one
        /// target-URL extractor per distinct (trimmed) region XPath declared on the
        /// entity definition. A selector without any XPath is registered once with
        /// a null region when it has at least one pattern.
        /// </summary>
        /// <param name="site">Site assigned to <c>Site</c>.</param>
        /// <param name="dataHandler">Handler forwarded to the entity extractor.</param>
        /// <param name="tableName">Table name forwarded to the entity extractor.</param>
        /// <exception cref="SpiderException">
        /// A selector has both <c>XPaths</c> and <c>Patterns</c> set to null.
        /// </exception>
        public EntityProcessor(Site site, DataHandler <T> dataHandler = null, string tableName = null)
        {
            Site      = site;
            Extractor = new EntityExtractor <T>(dataHandler, tableName);

            var selectors = Extractor.EntityDefine.TargetUrlsSelectors;
            if (selectors == null || selectors.Count < 1)
            {
                // Nothing declared on the entity: no target-URL extractors to add.
                return;
            }

            foreach (var selector in selectors)
            {
                // Trim and de-duplicate both lists; either may legitimately be null.
                var urlPatterns  = selector.Patterns?.Select(p => p?.Trim()).Distinct().ToArray();
                var regionXPaths = selector.XPaths?.Select(x => x?.Trim()).Distinct().ToList();

                if (regionXPaths == null && urlPatterns == null)
                {
                    throw new SpiderException("Region xpath and patterns should not be null both.");
                }

                if (regionXPaths != null && regionXPaths.Count > 0)
                {
                    for (int index = 0; index < regionXPaths.Count; index++)
                    {
                        AddTargetUrlExtractor(regionXPaths[index], urlPatterns);
                    }
                }
                else if (urlPatterns != null && urlPatterns.Length > 0)
                {
                    // No region given: register the patterns with a null region XPath.
                    AddTargetUrlExtractor(null, urlPatterns);
                }
            }
        }
示例#3
0
        /// <summary>
        /// Crawls four Youku category list pages using a custom page processor and
        /// a custom pipeline, with an in-memory duplicate-removing queue scheduler.
        /// </summary>
        public static void CustmizeProcessorAndPipeline()
        {
            // The Site object carries crawl-wide settings (encoding, headers, cookies, proxy, ...).
            var site = new DotnetSpider.Core.Site();
            site.EncodingName        = "UTF-8";
            site.RemoveOutboundLinks = true;

            // Seed the crawl with list pages 1 through 4.
            for (int page = 1; page <= 4; page++)
            {
                site.AddStartUrl($"http://list.youku.com/category/show/c_96_s_1_d_1_p_{page}.html");
            }

            // In-memory queue scheduler.
            var scheduler = new QueueDuplicateRemovedScheduler();
            // Page processor written specifically for Youku.
            var processor = new YoukuPageProcessor();

            // Attach the Youku-specific pipeline to the freshly created spider.
            Spider spider = Spider.Create(site, scheduler, processor)
                                  .AddPipeline(new YoukuPipeline());

            spider.Downloader     = new HttpClientDownloader();
            spider.ThreadNum      = 1;
            spider.EmptySleepTime = 3000;

            // Start the crawler.
            spider.Run();
        }
 /// <summary>
 /// Creates the processor for the given entity metadata, builds its extractor,
 /// and registers one target-URL extractor per distinct (trimmed) region XPath
 /// of every target-URL selector declared on the entity.
 /// </summary>
 /// <param name="site">Site assigned to <c>Site</c>.</param>
 /// <param name="entity">Entity metadata; its name, shared values and selectors are read here.</param>
 /// <exception cref="SpiderException">
 /// A selector has both <c>XPaths</c> and <c>Patterns</c> set to null.
 /// </exception>
 public EntityProcessor(Site site, EntityMetadata entity)
 {
     Site       = site;
     _entity    = entity;
     _extractor = new EntityExtractor(entity.Entity.Name, entity.SharedValues, entity);
     if (entity.TargetUrlsSelectors != null && entity.TargetUrlsSelectors.Count > 0)
     {
         foreach (var targetUrlsSelector in entity.TargetUrlsSelectors)
         {
             if (targetUrlsSelector.XPaths == null && targetUrlsSelector.Patterns == null)
             {
                 throw new SpiderException("Region xpath and patterns should not be null both.");
             }
             if (targetUrlsSelector.XPaths == null)
             {
                 // Normalised in place: the selector object itself ends up with an
                 // empty array instead of null.
                 targetUrlsSelector.XPaths = new string[] { };
             }
             // NOTE(review): a selector that only carries Patterns has an empty XPaths
             // array here, so the loop below registers nothing for it - confirm this
             // is intended.
             foreach (var xpath in targetUrlsSelector.XPaths.Select(x => x?.Trim()).Distinct())
             {
                 AddTargetUrlExtractor(xpath, targetUrlsSelector.Patterns);
             }
         }
     }
 }
示例#5
0
        /// <summary>
        /// Reads every row of <c>t_gift_ArticleDetails</c>, queues the liwushuo post
        /// page for each row's <c>PID</c>, and runs a single-threaded spider over
        /// those pages.
        /// </summary>
        public void SatrtSpider()
        {
            // No extra start requests are supplied; the list stays empty.
            var extraRequests = new List<Request>();

            var site = new DotnetSpider.Core.Site();
            site.EncodingName    = "UTF-8";
            site.CycleRetryTimes = 1;

            // One start URL per stored article.
            var articles = GiftDB.GetInstance().Query <ArticleDetail>("SELECT * FROM t_gift_ArticleDetails");
            foreach (var article in articles)
            {
                site.AddStartUrl($"http://www.liwushuo.com/posts/{article.PID}");
            }

            // Each fluent step is kept in its own local so the call order is explicit.
            var created = Spider.Create(site, new GiftPageProcessor());
            var seeded  = created.AddStartRequests(extraRequests.ToArray());
            var spider  = seeded.AddPipeline(new GiftPipeliner());

            spider.ThreadNum = 1;
            spider.Run();
        }
示例#6
0
 /// <summary>
 /// Creates the processor for the given entity metadata, builds its extractor,
 /// and, when the entity declares a target-URL extractor, compiles its patterns
 /// into <c>TargetUrlPatterns</c> and its XPaths into <c>TargetUrlRegions</c>.
 /// </summary>
 /// <param name="site">Site assigned to <c>Site</c>.</param>
 /// <param name="entity">Entity metadata; its name, shared values and target-URL extractor are read here.</param>
 public EntityProcessor(Site site, EntityMetadata entity)
 {
     Site       = site;
     _entity    = entity;
     _extractor = new EntityExtractor(entity.Entity.Name, entity.SharedValues, entity);

     var urlExtractor = entity.TargetUrlExtractor;
     if (urlExtractor == null)
     {
         // No target-URL configuration on this entity.
         return;
     }

     // Each non-empty pattern list becomes a set of Regex instances.
     var urlPatterns = urlExtractor.Patterns;
     if (urlPatterns != null && urlPatterns.Length > 0)
     {
         var compiled = urlPatterns.Select(pattern => new Regex(pattern));
         TargetUrlPatterns = new HashSet <Regex>(compiled);
     }

     // Each non-empty XPath list becomes a set of XPath selectors.
     var regionXPaths = urlExtractor.XPaths;
     if (regionXPaths != null && regionXPaths.Length > 0)
     {
         var regions = regionXPaths.Select(path => Selectors.XPath(path));
         TargetUrlRegions = new HashSet <ISelector>(regions);
     }
 }
示例#7
0
        /// <summary>
        /// Crawls the Douban "top250" movie page with a custom page processor and
        /// pipeline, using an in-memory duplicate-removing queue scheduler and two
        /// threads.
        /// </summary>
        public static void CrawlerPagesWithoutTraverse()
        {
            var site = new DotnetSpider.Core.Site {
                EncodingName = "UTF-8", RemoveOutboundLinks = true
            };

            site.AddStartUrl("https://movie.douban.com/top250");
            Spider spider = Spider.Create(site,
                                          // "HH" (24-hour clock) is required here: with "hh" and no
                                          // AM/PM designator, runs twelve hours apart would get the
                                          // same identity.
                                          "DOUBAN_" + DateTime.Now.ToString("yyyyMMddHHmmss"),
                                          new QueueDuplicateRemovedScheduler(),
                                          new DouBanPageProcessor())
                            .AddPipeline(new DouBanPipeline());

            spider.ThreadNum      = 2;
            spider.EmptySleepTime = 3000;

            // Start the crawler.
            spider.Run();
        }
示例#8
0
        /// <summary>
        /// Creates the processor, builds the entity extractor, and registers one
        /// target-URL extractor per distinct (trimmed) region XPath of every
        /// target-URL selector declared on the entity definition.
        /// </summary>
        /// <param name="site">Site assigned to <c>Site</c>.</param>
        /// <param name="dataHandler">Handler forwarded to the entity extractor.</param>
        /// <param name="tableName">Not referenced anywhere in this constructor body.</param>
        /// <exception cref="SpiderException">
        /// A selector has both <c>XPaths</c> and <c>Patterns</c> set to null.
        /// </exception>
        public EntityProcessor(Site site, DataHandler <T> dataHandler = null, string tableName = null)
        {
            Site      = site;
            Extractor = new EntityExtractor <T>(dataHandler);

            var selectors = Extractor.EntityDefine.TargetUrlsSelectors;
            if (selectors == null || selectors.Count < 1)
            {
                // Nothing declared on the entity: no target-URL extractors to add.
                return;
            }

            foreach (var selector in selectors)
            {
                bool hasNoXPaths = selector.XPaths == null;
                if (hasNoXPaths && selector.Patterns == null)
                {
                    throw new SpiderException("Region xpath and patterns should not be null both.");
                }

                if (hasNoXPaths)
                {
                    // Normalised in place: the selector keeps an empty array instead of null.
                    selector.XPaths = new string[] { };
                }

                // Lazily trimmed and de-duplicated; an empty array registers nothing.
                var regionXPaths = selector.XPaths.Select(x => x?.Trim()).Distinct();
                foreach (var region in regionXPaths)
                {
                    AddTargetUrlExtractor(region, selector.Patterns);
                }
            }
        }