Exemplo n.º 1
0
        /// <summary>
        /// Seeds a site with liwushuo channel-item API URLs and runs a spider over them
        /// using <c>GiftDetailsPageProcessor</c> and <c>GiftDetailsPipeliner</c>.
        /// </summary>
        /// <remarks>
        /// Channels 1 through 49 are enumerated; for each channel, 100 URLs are added with
        /// offsets 0, 50, ..., 4950 (each URL asks for 50 items).
        /// </remarks>
        public void SatrtSpider()
        {
            const int offsetStep  = 50;
            const int offsetLimit = 100 * offsetStep;

            // Never populated in this method, so the spider receives an empty request array below.
            var extraRequests = new List<Request>();

            var targetSite = new DotnetSpider.Core.Site
            {
                EncodingName    = "UTF-8",
                CycleRetryTimes = 1
            };

            for (int channel = 1; channel <= 49; channel++)
            {
                for (int offset = 0; offset < offsetLimit; offset += offsetStep)
                {
                    targetSite.AddStartUrl(
                        $"http://www.liwushuo.com/api/channels/{channel}/items?limit=50&offset={offset}");
                }
            }

            // Kept as a chain of calls on the returned objects, in the original order.
            var created      = Spider.Create(targetSite, new GiftDetailsPageProcessor());
            var withRequests = created.AddStartRequests(extraRequests.ToArray());
            var spider       = withRequests.AddPipeline(new GiftDetailsPipeliner());

            spider.ThreadNum = 1;
            spider.Run();
        }
Exemplo n.º 2
0
        /// <summary>
        /// Crawls four Youku category listing pages (p_1 through p_4) with a spider built from
        /// <c>QueueDuplicateRemovedScheduler</c>, <c>YoukuPageProcessor</c> and <c>YoukuPipeline</c>.
        /// </summary>
        public static void CustmizeProcessorAndPipeline()
        {
            // Site-level configuration: encoding plus the RemoveOutboundLinks flag.
            var site = new DotnetSpider.Core.Site
            {
                EncodingName        = "UTF-8",
                RemoveOutboundLinks = true
            };

            // Register the start URLs, one per listing page.
            for (int page = 1; page <= 4; page++)
            {
                var listingUrl = $"http://list.youku.com/category/show/c_96_s_1_d_1_p_{page}.html";
                site.AddStartUrl(listingUrl);
            }

            // Scheduler first, then the Youku-specific processor; the Youku pipeline is attached afterwards.
            Spider spider = Spider
                .Create(site, new QueueDuplicateRemovedScheduler(), new YoukuPageProcessor())
                .AddPipeline(new YoukuPipeline());

            spider.Downloader     = new HttpClientDownloader();
            spider.ThreadNum      = 1;
            spider.EmptySleepTime = 3000;

            // Start the crawl.
            spider.Run();
        }
Exemplo n.º 3
0
        /// <summary>
        /// Loads every row of <c>t_gift_ArticleDetails</c>, adds one liwushuo post URL per row
        /// (built from the row's <c>PID</c>), and runs a spider over them using
        /// <c>GiftPageProcessor</c> and <c>GiftPipeliner</c>.
        /// </summary>
        public void SatrtSpider()
        {
            // Never populated in this method, so the spider receives an empty request array below.
            var extraRequests = new List<Request>();

            var targetSite = new DotnetSpider.Core.Site
            {
                EncodingName    = "UTF-8",
                CycleRetryTimes = 1
            };

            var giftDb   = GiftDB.GetInstance();
            var articles = giftDb.Query<ArticleDetail>("SELECT * FROM t_gift_ArticleDetails");

            foreach (var article in articles)
            {
                // The post page itself is crawled; the original author left a ".../content"
                // variant of this URL commented out.
                targetSite.AddStartUrl($"http://www.liwushuo.com/posts/{article.PID}");
            }

            // Kept as a chain of calls on the returned objects, in the original order.
            var created      = Spider.Create(targetSite, new GiftPageProcessor());
            var withRequests = created.AddStartRequests(extraRequests.ToArray());
            var spider       = withRequests.AddPipeline(new GiftPipeliner());

            spider.ThreadNum = 1;
            spider.Run();
        }
Exemplo n.º 4
0
        /// <summary>
        /// Crawls the Douban movie top-250 start page with a spider built from
        /// <c>QueueDuplicateRemovedScheduler</c>, <c>DouBanPageProcessor</c> and <c>DouBanPipeline</c>.
        /// </summary>
        /// <remarks>
        /// The spider is created with a "DOUBAN_" + timestamp string as its second argument
        /// (presumably the spider's identity/name; confirm against the <c>Spider.Create</c> overload).
        /// </remarks>
        public static void CrawlerPagesWithoutTraverse()
        {
            var site = new DotnetSpider.Core.Site {
                EncodingName = "UTF-8", RemoveOutboundLinks = true
            };

            site.AddStartUrl("https://movie.douban.com/top250");

            // "HH" is the 24-hour clock. The previous "hh" (12-hour, no AM/PM designator) produced
            // the same string for runs 12 hours apart on the same day. The invariant culture keeps
            // the output independent of the machine's regional settings.
            string runTag = "DOUBAN_" + DateTime.Now.ToString(
                "yyyyMMddHHmmss", System.Globalization.CultureInfo.InvariantCulture);

            Spider spider = Spider.Create(site,
                                          runTag,
                                          new QueueDuplicateRemovedScheduler(),
                                          new DouBanPageProcessor())
                            .AddPipeline(new DouBanPipeline());

            spider.ThreadNum      = 2;
            spider.EmptySleepTime = 3000;

            // Start the crawl.
            spider.Run();
        }