/// <summary>
/// Seeds a spider with the liwushuo channel-item API URLs and runs it.
/// </summary>
/// <remarks>
/// Channels 1 through 49 are enumerated; for each channel 100 URLs are queued,
/// each asking for 50 items at offsets 0, 50, ..., 4950 (4,900 start URLs in total).
/// Pages are handled by <c>GiftDetailsPageProcessor</c> and results by
/// <c>GiftDetailsPipeliner</c>.
/// </remarks>
public void SatrtSpider()
{
    // No explicit start requests are built here; the list stays empty and is
    // handed to the spider as an empty array further down.
    var startRequests = new List<Request>();

    var site = new DotnetSpider.Core.Site
    {
        EncodingName = "UTF-8",
        CycleRetryTimes = 1
    };

    for (int channelId = 1; channelId < 50; channelId++)
    {
        for (int page = 0; page < 100; page++)
        {
            // Each request asks for 50 items, so the offset advances by 50 per page.
            int offset = page * 50;
            site.AddStartUrl($"http://www.liwushuo.com/api/channels/{channelId}/items?limit=50&offset={offset}");
        }
    }

    var spider = Spider
        .Create(site, new GiftDetailsPageProcessor())
        .AddStartRequests(startRequests.ToArray())
        .AddPipeline(new GiftDetailsPipeliner());

    spider.ThreadNum = 1;
    spider.Run();
}
/// <summary>
/// Crawls Youku category listing pages 1 through 4 using a custom page
/// processor (<c>YoukuPageProcessor</c>) and a custom pipeline (<c>YoukuPipeline</c>).
/// </summary>
public static void CustmizeProcessorAndPipeline()
{
    // The Site object carries the crawl configuration (encoding, headers, cookies, proxy, ...).
    var site = new DotnetSpider.Core.Site();
    site.EncodingName = "UTF-8";
    site.RemoveOutboundLinks = true;

    // Register the start (feed) URLs: one per listing page.
    for (int page = 1; page < 5; page++)
    {
        string listingUrl = $"http://list.youku.com/category/show/c_96_s_1_d_1_p_{page}.html";
        site.AddStartUrl(listingUrl);
    }

    Spider spider = Spider
        .Create(
            site,
            new QueueDuplicateRemovedScheduler(), // in-memory queue scheduler
            new YoukuPageProcessor())             // processor customized for Youku
        .AddPipeline(new YoukuPipeline());        // pipeline customized for Youku

    spider.Downloader = new HttpClientDownloader();
    spider.ThreadNum = 1;
    spider.EmptySleepTime = 3000;

    // Start the crawler.
    spider.Run();
}
/// <summary>
/// Seeds a spider with one liwushuo post URL per row of
/// <c>t_gift_ArticleDetails</c> and runs it.
/// </summary>
/// <remarks>
/// Rows are read through <c>GiftDB</c> and mapped to <c>ArticleDetail</c>; each row's
/// <c>PID</c> is substituted into the post URL. Pages are handled by
/// <c>GiftPageProcessor</c> and results by <c>GiftPipeliner</c>.
/// </remarks>
public void SatrtSpider()
{
    // No explicit start requests are built here; the list stays empty and is
    // handed to the spider as an empty array further down.
    var startRequests = new List<Request>();

    var site = new DotnetSpider.Core.Site
    {
        EncodingName = "UTF-8",
        CycleRetryTimes = 1
    };

    var database = GiftDB.GetInstance();
    var articles = database.Query<ArticleDetail>("SELECT * FROM t_gift_ArticleDetails");

    foreach (var article in articles)
    {
        // Note: a "/posts/{PID}/content" variant of this URL was left disabled here;
        // the plain post URL is the one in use.
        site.AddStartUrl($"http://www.liwushuo.com/posts/{article.PID}");
    }

    var spider = Spider
        .Create(site, new GiftPageProcessor())
        .AddStartRequests(startRequests.ToArray())
        .AddPipeline(new GiftPipeliner());

    spider.ThreadNum = 1;
    spider.Run();
}
/// <summary>
/// Crawls the Douban "top 250" movie page with <c>DouBanPageProcessor</c> and
/// <c>DouBanPipeline</c>, starting from a single URL.
/// </summary>
/// <remarks>
/// The string passed as the second argument to <c>Spider.Create</c> is
/// <c>"DOUBAN_"</c> followed by the current local time (presumably the spider's
/// identity/name - confirm against the DotnetSpider API).
/// </remarks>
public static void CrawlerPagesWithoutTraverse()
{
    var site = new DotnetSpider.Core.Site
    {
        EncodingName = "UTF-8",
        RemoveOutboundLinks = true
    };
    site.AddStartUrl("https://movie.douban.com/top250");

    // "HH" (24-hour clock) is required here: the previous "hh" (12-hour clock)
    // produced the same string for e.g. 03:00 and 15:00 on the same day.
    // The invariant culture keeps the suffix Gregorian/ASCII regardless of the
    // machine's regional settings.
    string timestamp = DateTime.Now.ToString(
        "yyyyMMddHHmmss",
        System.Globalization.CultureInfo.InvariantCulture);

    Spider spider = Spider
        .Create(
            site,
            "DOUBAN_" + timestamp,
            new QueueDuplicateRemovedScheduler(),
            new DouBanPageProcessor())
        .AddPipeline(new DouBanPipeline());

    spider.ThreadNum = 2;
    spider.EmptySleepTime = 3000;

    // Start the crawler.
    spider.Run();
}