/// <summary>
/// Seeds the liwushuo channel-items API URLs and runs a single-threaded spider over them.
/// </summary>
public void SatrtSpider()
{
    var extraRequests = new List<Request>();
    var site = new DotnetSpider.Core.Site
    {
        EncodingName = "UTF-8",
        CycleRetryTimes = 1
    };

    // Channels 1..49, and for each channel 100 pages of 50 items (offsets 0, 50, ..., 4950).
    for (var channelId = 1; channelId <= 49; channelId++)
    {
        for (var pageIndex = 0; pageIndex <= 99; pageIndex++)
        {
            site.AddStartUrl($"http://www.liwushuo.com/api/channels/{channelId}/items?limit=50&offset={pageIndex * 50}");
        }
    }

    // extraRequests is never populated, so this passes an empty request array.
    var spider = Spider.Create(site, new GiftDetailsPageProcessor())
        .AddStartRequests(extraRequests.ToArray())
        .AddPipeline(new GiftDetailsPipeliner());

    spider.ThreadNum = 1;
    spider.Run();
}
/// <summary>
/// Creates the processor, builds the entity extractor and registers one target-url
/// extractor per (trimmed, de-duplicated) region xpath declared on the entity definition.
/// </summary>
/// <param name="site">Site assigned to <c>Site</c>.</param>
/// <param name="dataHandler">Handler forwarded to the <c>EntityExtractor</c>.</param>
/// <param name="tableName">Table name forwarded to the <c>EntityExtractor</c>.</param>
/// <exception cref="SpiderException">A selector has neither xpaths nor patterns.</exception>
public EntityProcessor(Site site, DataHandler<T> dataHandler = null, string tableName = null)
{
    Site = site;
    Extractor = new EntityExtractor<T>(dataHandler, tableName);

    var selectors = Extractor.EntityDefine.TargetUrlsSelectors;
    if (selectors == null || selectors.Count == 0)
    {
        return;
    }

    foreach (var selector in selectors)
    {
        var urlPatterns = selector.Patterns?.Select(p => p?.Trim()).Distinct().ToArray();
        var regionXPaths = selector.XPaths?.Select(r => r?.Trim()).Distinct().ToList();

        if (regionXPaths == null && urlPatterns == null)
        {
            throw new SpiderException("Region xpath and patterns should not be null both.");
        }

        var hasRegions = regionXPaths != null && regionXPaths.Count > 0;
        if (!hasRegions)
        {
            // No region xpath: register the patterns alone, if there are any.
            if (urlPatterns != null && urlPatterns.Length > 0)
            {
                AddTargetUrlExtractor(null, urlPatterns);
            }

            continue;
        }

        foreach (var region in regionXPaths)
        {
            AddTargetUrlExtractor(region, urlPatterns);
        }
    }
}
/// <summary>
/// Crawls four Youku category list pages with a custom page processor and a custom pipeline.
/// </summary>
public static void CustmizeProcessorAndPipeline()
{
    // Site configuration: encoding here; headers, cookies, proxy etc. would also go on this object.
    var site = new DotnetSpider.Core.Site
    {
        EncodingName = "UTF-8",
        RemoveOutboundLinks = true
    };

    // Add the start/feed urls: list pages 1 through 4.
    for (var page = 1; page <= 4; page++)
    {
        site.AddStartUrl($"http://list.youku.com/category/show/c_96_s_1_d_1_p_{page}.html");
    }

    // In-memory queue scheduler.
    var scheduler = new QueueDuplicateRemovedScheduler();
    // Processor written specifically for Youku.
    var processor = new YoukuPageProcessor();

    // Pipeline written specifically for Youku.
    Spider spider = Spider.Create(site, scheduler, processor)
        .AddPipeline(new YoukuPipeline());

    spider.Downloader = new HttpClientDownloader();
    spider.ThreadNum = 1;
    spider.EmptySleepTime = 3000;

    // Start the crawler.
    spider.Run();
}
/// <summary>
/// Creates the processor for the given entity metadata and registers a target-url
/// extractor for every region xpath declared by the entity's target-url selectors.
/// A selector that declares patterns but no region xpath is registered with a
/// <c>null</c> region instead of being silently skipped.
/// </summary>
/// <param name="site">Site assigned to <c>Site</c>.</param>
/// <param name="entity">Entity metadata; its name, shared values and selectors are read here.</param>
/// <exception cref="SpiderException">A selector has neither xpaths nor patterns.</exception>
public EntityProcessor(Site site, EntityMetadata entity)
{
    Site = site;
    _entity = entity;
    _extractor = new EntityExtractor(entity.Entity.Name, entity.SharedValues, entity);

    if (entity.TargetUrlsSelectors == null || entity.TargetUrlsSelectors.Count == 0)
    {
        return;
    }

    foreach (var targetUrlsSelector in entity.TargetUrlsSelectors)
    {
        if (targetUrlsSelector.XPaths == null && targetUrlsSelector.Patterns == null)
        {
            throw new SpiderException("Region xpath and patterns should not be null both.");
        }

        // Kept from the original: a null XPaths is normalized to an empty array on the selector itself.
        if (targetUrlsSelector.XPaths == null)
        {
            targetUrlsSelector.XPaths = new string[] { };
        }

        var xpaths = targetUrlsSelector.XPaths.Select(x => x?.Trim()).Distinct().ToList();
        if (xpaths.Count > 0)
        {
            foreach (var xpath in xpaths)
            {
                AddTargetUrlExtractor(xpath, targetUrlsSelector.Patterns);
            }
        }
        else if (targetUrlsSelector.Patterns != null && targetUrlsSelector.Patterns.Any())
        {
            // Patterns-only selector: previously nothing was registered for it.
            // NOTE(review): assumes AddTargetUrlExtractor accepts a null region xpath - confirm.
            AddTargetUrlExtractor(null, targetUrlsSelector.Patterns);
        }
    }
}
/// <summary>
/// Loads every row of <c>t_gift_ArticleDetails</c>, seeds one liwushuo post URL per row
/// and runs a single-threaded spider over them.
/// </summary>
public void SatrtSpider()
{
    var extraRequests = new List<Request>();
    var site = new DotnetSpider.Core.Site
    {
        EncodingName = "UTF-8",
        CycleRetryTimes = 1
    };

    var articles = GiftDB.GetInstance().Query<ArticleDetail>("SELECT * FROM t_gift_ArticleDetails");
    foreach (var article in articles)
    {
        // One start url per article, keyed by its PID.
        site.AddStartUrl($"http://www.liwushuo.com/posts/{article.PID}");
    }

    // extraRequests is never populated, so this passes an empty request array.
    var spider = Spider.Create(site, new GiftPageProcessor())
        .AddStartRequests(extraRequests.ToArray())
        .AddPipeline(new GiftPipeliner());

    spider.ThreadNum = 1;
    spider.Run();
}
/// <summary>
/// Creates the processor for the given entity metadata and, when the entity declares a
/// target-url extractor, builds the pattern set and the region-selector set from it.
/// </summary>
/// <param name="site">Site assigned to <c>Site</c>.</param>
/// <param name="entity">Entity metadata; its name, shared values and target-url extractor are read here.</param>
public EntityProcessor(Site site, EntityMetadata entity)
{
    Site = site;
    _entity = entity;
    _extractor = new EntityExtractor(entity.Entity.Name, entity.SharedValues, entity);

    var urlExtractor = entity.TargetUrlExtractor;
    if (urlExtractor == null)
    {
        return;
    }

    // Each non-empty list is turned into its own set; an absent or empty list leaves the property untouched.
    var patterns = urlExtractor.Patterns;
    if (patterns != null && patterns.Length > 0)
    {
        var regexes = patterns.Select(pattern => new Regex(pattern));
        TargetUrlPatterns = new HashSet<Regex>(regexes);
    }

    var xpaths = urlExtractor.XPaths;
    if (xpaths != null && xpaths.Length > 0)
    {
        var regions = xpaths.Select(xpath => Selectors.XPath(xpath));
        TargetUrlRegions = new HashSet<ISelector>(regions);
    }
}
/// <summary>
/// Crawls the Douban top-250 start page with a custom processor and pipeline,
/// using a timestamp-based spider identity.
/// </summary>
public static void CrawlerPagesWithoutTraverse()
{
    var site = new DotnetSpider.Core.Site { EncodingName = "UTF-8", RemoveOutboundLinks = true };
    site.AddStartUrl("https://movie.douban.com/top250");

    // "HH" is the 24-hour clock. The previous "hh" (12-hour, no AM/PM designator) produced the
    // same identity for runs exactly twelve hours apart. The invariant culture keeps the value
    // plain Gregorian ASCII digits regardless of the machine's regional settings.
    var identity = "DOUBAN_" + DateTime.Now.ToString("yyyyMMddHHmmss", System.Globalization.CultureInfo.InvariantCulture);

    Spider spider = Spider.Create(site, identity, new QueueDuplicateRemovedScheduler(), new DouBanPageProcessor())
        .AddPipeline(new DouBanPipeline());
    spider.ThreadNum = 2;
    spider.EmptySleepTime = 3000;

    // Start the crawler.
    spider.Run();
}
/// <summary>
/// Creates the processor, builds the entity extractor and registers a target-url
/// extractor for every region xpath declared on the entity definition.
/// A selector that declares patterns but no region xpath is registered with a
/// <c>null</c> region instead of being silently skipped.
/// </summary>
/// <param name="site">Site assigned to <c>Site</c>.</param>
/// <param name="dataHandler">Handler forwarded to the <c>EntityExtractor</c>.</param>
/// <param name="tableName">Table name forwarded to the <c>EntityExtractor</c>.</param>
/// <exception cref="SpiderException">A selector has neither xpaths nor patterns.</exception>
public EntityProcessor(Site site, DataHandler<T> dataHandler = null, string tableName = null)
{
    Site = site;
    // tableName used to be accepted but dropped; it is now forwarded to the extractor.
    // NOTE(review): assumes EntityExtractor<T> has a (dataHandler, tableName) constructor - confirm.
    Extractor = new EntityExtractor<T>(dataHandler, tableName);

    var selectors = Extractor.EntityDefine.TargetUrlsSelectors;
    if (selectors == null || selectors.Count == 0)
    {
        return;
    }

    foreach (var targetUrlsSelector in selectors)
    {
        if (targetUrlsSelector.XPaths == null && targetUrlsSelector.Patterns == null)
        {
            throw new SpiderException("Region xpath and patterns should not be null both.");
        }

        // Kept from the original: a null XPaths is normalized to an empty array on the selector itself.
        if (targetUrlsSelector.XPaths == null)
        {
            targetUrlsSelector.XPaths = new string[] { };
        }

        var xpaths = targetUrlsSelector.XPaths.Select(x => x?.Trim()).Distinct().ToList();
        if (xpaths.Count > 0)
        {
            foreach (var xpath in xpaths)
            {
                AddTargetUrlExtractor(xpath, targetUrlsSelector.Patterns);
            }
        }
        else if (targetUrlsSelector.Patterns != null && targetUrlsSelector.Patterns.Any())
        {
            // Patterns-only selector: previously nothing was registered for it.
            // NOTE(review): assumes AddTargetUrlExtractor accepts a null region xpath - confirm.
            AddTargetUrlExtractor(null, targetUrlsSelector.Patterns);
        }
    }
}