/// <summary>
/// Queues six Youku video pages plus the 163.com home page, crawls them with one thread
/// through <c>HttpWebRequestDownloader</c>, and verifies the retry, left, success and
/// error counters exposed by the spider and its scheduler.
/// </summary>
public void ProcesserException()
{
    var site = new Site
    {
        Headers = new System.Collections.Generic.Dictionary<string, string>
        {
            { "Upgrade-Insecure-Requests", "1" }
        }
    };
    var scheduler = new QueueDuplicateRemovedScheduler();

    string[] startUrls =
    {
        "http://v.youku.com/v_show/id_XMTMyMTkzNTY1Mg==.html?spm=a2h1n.8251845.0.0",
        "http://v.youku.com/v_show/id_XMjkzNzMwMDMyOA==.html?spm=a2h1n.8251845.0.0",
        "http://v.youku.com/v_show/id_XMjcwNDg0NDI3Mg==.html?spm=a2h1n.8251845.0.0",
        "http://v.youku.com/v_show/id_XMTMwNzQwMTcwMA==.html?spm=a2h1n.8251845.0.0",
        "http://v.youku.com/v_show/id_XMjk1MzI0Mzk4NA==.html?spm=a2h1n.8251845.0.0",
        "http://v.youku.com/v_show/id_XMjkzNzY0NzkyOA==.html?spm=a2h1n.8251845.0.0",
        "http://www.163.com/"
    };
    foreach (string url in startUrls)
    {
        site.AddRequests(url);
    }

    // Crawler identity. "HH" (24-hour clock) is used instead of "hh" so that a run at
    // 01:00 and a run at 13:00 on the same day do not produce the same identity.
    string identity = "cnblogs_" + DateTime.Now.ToString("yyyyMMddHHmmss");

    // In-memory queue scheduler, test page processor, results handed to a FilePipeline.
    Spider spider = Spider.Create(site, identity, scheduler, new TestPageProcessor())
        .AddPipeline(new FilePipeline());

    spider.ClearSchedulerAfterCompleted = false;
    // Download HTML with the HttpWebRequest based downloader.
    spider.Downloader = new HttpWebRequestDownloader();
    spider.ThreadNum = 1;
    // Traversal depth.
    spider.Scheduler.Depth = 3;
    spider.EmptySleepTime = 6000;

    // Start the crawler.
    spider.Run();

    Assert.Equal(5, spider.RetriedTimes.Value);
    Assert.Equal(0, scheduler.LeftRequestsCount);
    Assert.Equal(6, scheduler.SuccessRequestsCount);
    Assert.Equal(6, scheduler.ErrorRequestsCount);
}
/// <summary>
/// Runs a spider that has no pipeline and expects a <c>SpiderException</c> whose message
/// is "Pipelines should not be null.". The test fails with a plain exception when
/// <c>Run()</c> completes without throwing a <c>SpiderException</c>.
/// </summary>
public void ThrowExceptionWhenNoPipeline()
{
    SpiderException caught = null;

    try
    {
        var site = new Site { EncodingName = "UTF-8", SleepTime = 1000 };
        Spider pipelineLessSpider = Spider.Create(site, new TestPageProcessor());
        pipelineLessSpider.Run();
    }
    catch (SpiderException exception)
    {
        caught = exception;
    }

    // No SpiderException at all means the guard under test did not fire.
    if (caught == null)
    {
        throw new Exception("TEST FAILED.");
    }

    Assert.AreEqual("Pipelines should not be null.", caught.Message);
}
/// <summary>
/// Queues 10000 start URLs, starts the spider with <c>RunAsync()</c>, stops it after five
/// seconds, and starts it asynchronously once more five seconds later. The test makes no
/// assertions; it only exercises the start/stop/restart sequence.
/// </summary>
public void RunAsyncAndStop()
{
    var site = new Site { EncodingName = "UTF-8", MinSleepTime = 1000 };
    Spider spider = Spider.Create(site, new TestPageProcessor())
        .AddPipeline(new TestPipeline())
        .SetThreadNum(1);

    int index = 0;
    while (index < 10000)
    {
        spider.AddStartUrl("http://www.baidu.com/" + index);
        index++;
    }

    // First asynchronous run, interrupted after five seconds.
    spider.RunAsync();
    Thread.Sleep(5000);
    spider.Stop();

    // Give the spider five seconds before starting it again.
    Thread.Sleep(5000);
    spider.RunAsync();
    Thread.Sleep(5000);
}
/// <summary>
/// Builds and runs a model-driven spider from the settings stored in "sohu.xml".
/// The extracted fields are mapped to the "websites"/"html" table and written through a
/// <c>MySqlEntityPipeline</c>. When the configured downloader is "chrome" (any casing) a
/// headless Chrome <c>WebDriverDownloader</c> is used; otherwise the spider keeps its
/// default downloader.
/// </summary>
public static void Run()
{
    Instance instance = Instance.LoadFrom("sohu.xml");

    var table = new TableInfo("websites", "html");
    var fields = new[]
    {
        new FieldSelector(".//title", "title"),
        new FieldSelector(Env.UrlPropertyKey, "url", SelectorType.Enviroment),
        new FieldSelector(".//body", "content", SelectorType.XPath, DataType.String, int.MaxValue),
        new FieldSelector("is_match", "is_match", SelectorType.XPath, DataType.Bool),
        new FieldSelector("matchs", "matchs", SelectorType.XPath, DataType.String, int.MaxValue),
        new FieldSelector("id", "id", SelectorType.Enviroment, DataType.Int) { IsPrimary = true },
    };

    // Which links to follow is taken entirely from the loaded instance configuration.
    var targetRequestSelector = new TargetRequestSelector
    {
        XPaths = instance.TargetXpaths,
        Patterns = instance.TargetPatterns,
        ExcludePatterns = instance.ExcludePatterns
    };

    var model = new ModelDefinition(null, fields, table, targetRequestSelector);
    var modeProcessor = new ModelProcessor(model);
    modeProcessor.CleanPound = true;
    modeProcessor.AddDataHanlder(new MyDataHandler());

    Spider spider = Spider.Create(new QueueDuplicateRemovedScheduler(), modeProcessor)
        .AddPipeline(new MySqlEntityPipeline());
    spider.EncodingName = instance.Encording;
    spider.AddRequests(instance.Url);

    // Null-safe, culture-independent comparison: a configuration without a downloader
    // entry falls through to the default downloader instead of throwing
    // NullReferenceException from ToLower().
    if (string.Equals(instance.Downloader, "chrome", System.StringComparison.OrdinalIgnoreCase))
    {
        spider.Downloader = new WebDriverDownloader(Browser.Chrome, new Option { Headless = true });
    }

    spider.Run();
}
/// <summary>
/// Crawls the Meituan "latest movies" page with a single thread, using
/// <c>MeituanPageProcessor</c>, <c>MeituanPipeline</c> and an <c>HttpClientDownloader</c>.
/// </summary>
public static void Run()
{
    var meituanSite = new Site { EncodingName = "UTF-8" };
    meituanSite.AddStartUrl("http://www.meituan.com/dianying/zuixindianying");

    var scheduler = new QueueDuplicateRemovedScheduler();
    var processor = new MeituanPageProcessor();

    Spider spider = Spider.Create(meituanSite, scheduler, processor)
        .AddPipeline(new MeituanPipeline())
        .SetDownloader(new HttpClientDownloader())
        .SetThreadNum(1);

    spider.EmptySleepTime = 60000;
    spider.Run();
}
/// <summary>
/// Console entry point: crawls http://www.17500.cn/ssq (gb2312 encoded) with a single
/// thread and then waits for a key press before returning.
/// </summary>
/// <param name="args">Command-line arguments; not used.</param>
static void Main(string[] args)
{
    var ssqSite = new Site
    {
        EncodingName = "gb2312",
        RemoveOutboundLinks = false
    };
    ssqSite.AddStartUrl("http://www.17500.cn/ssq");

    var scheduler = new QueueDuplicateRemovedScheduler();
    Spider spider = Spider.Create(ssqSite, scheduler, new SSQProcessor())
        .AddPipeline(new SSQPipeline());

    spider.Downloader = new DotnetSpider.Core.Downloader.HttpClientDownloader();
    spider.ThreadNum = 1;
    spider.EmptySleepTime = 3000;
    spider.Run();

    // Keep the console window open until the user acknowledges.
    Console.WriteLine("Press any key to continue...");
    Console.ReadKey();
}
/// <summary>
/// Crawls http://www.cnblogs.com with two page processors
/// (<c>BlogSumaryProcessor</c> and <c>NewsProcessor</c>) and a custom pipeline.
/// </summary>
public static void Run()
{
    // Create the spider with an in-memory scheduler, custom page processors
    // and a custom pipeline.
    var scheduler = new QueueDuplicateRemovedScheduler();
    Spider spider = Spider.Create(scheduler, new BlogSumaryProcessor(), new NewsProcessor())
        .AddPipeline(new MyPipeline());
    spider.EncodingName = "UTF-8";

    // Add the start URL; the same address is submitted four times, exactly as before.
    int remaining = 4;
    while (remaining > 0)
    {
        spider.AddRequests("http://www.cnblogs.com");
        remaining--;
    }

    // Start the crawler.
    spider.Run();
}
/// <summary>
/// Crawls pages 1 to 4 of a Youku category listing using the Youku-specific page
/// processor and pipeline.
/// </summary>
public static void Run()
{
    // In-memory scheduler.
    var scheduler = new QueueDuplicateRemovedScheduler();
    // Processor customised for Youku.
    var processor = new YoukuPageProcessor();

    // Pipeline customised for Youku.
    Spider spider = Spider.Create(scheduler, processor)
        .AddPipeline(new YoukuPipeline());
    spider.EncodingName = "UTF-8";

    // Add the start/feed URLs: listing pages 1 through 4.
    for (int page = 1; page <= 4; page++)
    {
        spider.AddRequests($"http://list.youku.com/category/show/c_96_s_1_d_1_p_{page}.html");
    }

    // Start the crawler.
    spider.Run();
}
/// <summary>
/// Creates a spider with an over-long identity and expects <c>Spider.Create</c> to throw
/// an exception whose message is "Length of Identity should less than 100.". The test
/// fails with a plain exception when creation succeeds.
/// </summary>
public void IdentityLengthLimit()
{
    const string oversizedIdentity = "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa";

    Exception failure = null;
    try
    {
        Spider.Create(
            new Site { EncodingName = "UTF-8", SleepTime = 1000 },
            oversizedIdentity,
            new QueueDuplicateRemovedScheduler(),
            new TestPageProcessor());
    }
    catch (Exception exception)
    {
        failure = exception;
    }

    // Creation must not succeed with such an identity.
    if (failure == null)
    {
        throw new Exception("TEST FAILED.");
    }

    Assert.Equal("Length of Identity should less than 100.", failure.Message);
}
/// <summary>
/// Queues 10000 start URLs, starts the spider asynchronously, and after five seconds
/// pauses it with a callback that calls <c>Exit()</c>. The test makes no assertions.
/// </summary>
public void RunAsyncAndStopThenExit()
{
    var site = new Site { EncodingName = "UTF-8", SleepTime = 1000 };
    Spider spider = Spider.Create(site, new TestPageProcessor())
        .AddPipeline(new TestPipeline());
    spider.ThreadNum = 1;

    int index = 0;
    while (index < 10000)
    {
        spider.AddStartUrl("http://www.baidu.com/" + index);
        index++;
    }

    spider.RunAsync();
    Thread.Sleep(5000);

    // The callback passed to Pause requests the exit.
    spider.Pause(() =>
    {
        spider.Exit();
    });
    Thread.Sleep(5000);
}
/// <summary>
/// Crawls the Douban "top 250 movies" page with two threads, using
/// <c>DouBanPageProcessor</c> and <c>DouBanPipeline</c>. The spider identity is
/// "DOUBAN_" followed by the current local timestamp.
/// </summary>
public static void CrawlerPagesWithoutTraverse()
{
    var site = new DotnetSpider.Core.Site
    {
        EncodingName = "UTF-8",
        RemoveOutboundLinks = true
    };
    site.AddStartUrl("https://movie.douban.com/top250");

    // "HH" (24-hour clock) instead of "hh": with the 12-hour form a morning run and an
    // afternoon run on the same day could end up with the same identity.
    string identity = "DOUBAN_" + DateTime.Now.ToString("yyyyMMddHHmmss");

    Spider spider = Spider.Create(
            site,
            identity,
            new QueueDuplicateRemovedScheduler(),
            new DouBanPageProcessor())
        .AddPipeline(new DouBanPipeline());

    spider.ThreadNum = 2;
    spider.EmptySleepTime = 3000;

    // Start the crawler.
    spider.Run();
}
/// <summary>
/// Queues six Youku video pages plus the cnblogs home page, crawls them with one thread
/// through <c>HttpClientDownloader</c>, and verifies the retry, left, success and error
/// counters exposed by the spider and its scheduler.
/// </summary>
public void RetryRequest()
{
    var site = new Site { EncodingName = "UTF-8", RemoveOutboundLinks = true };
    var scheduler = new QueueDuplicateRemovedScheduler();

    string[] startUrls =
    {
        "http://v.youku.com/v_show/id_XMTMyMTkzNTY1Mg==.html?spm=a2h1n.8251845.0.0",
        "http://v.youku.com/v_show/id_XMjkzNzMwMDMyOA==.html?spm=a2h1n.8251845.0.0",
        "http://v.youku.com/v_show/id_XMjcwNDg0NDI3Mg==.html?spm=a2h1n.8251845.0.0",
        "http://v.youku.com/v_show/id_XMTMwNzQwMTcwMA==.html?spm=a2h1n.8251845.0.0",
        "http://v.youku.com/v_show/id_XMjk1MzI0Mzk4NA==.html?spm=a2h1n.8251845.0.0",
        "http://v.youku.com/v_show/id_XMjkzNzY0NzkyOA==.html?spm=a2h1n.8251845.0.0",
        "http://www.cnblogs.com/"
    };
    foreach (string url in startUrls)
    {
        site.AddStartUrl(url);
    }

    // Crawler identity. "HH" (24-hour clock) keeps identities created in the morning and
    // in the afternoon of the same day distinct; "hh" did not.
    string identity = "cnblogs_" + DateTime.Now.ToString("yyyyMMddHHmmss");

    // In-memory queue scheduler, test page processor, results handed to a FilePipeline.
    Spider spider = Spider.Create(site, identity, scheduler, new TestPageProcessor())
        .AddPipeline(new FilePipeline());

    // Download HTML with the HttpClient based downloader.
    spider.Downloader = new HttpClientDownloader();
    spider.ThreadNum = 1;
    // Traversal depth.
    spider.Deep = 3;

    // Start the crawler.
    spider.Run();

    Assert.Equal(5, spider.RetriedTimes.Value);
    Assert.Equal(0, scheduler.LeftRequestsCount);
    Assert.Equal(6, scheduler.SuccessRequestsCount);
    // Original author's note: the retries should be included in this count.
    Assert.Equal(5, scheduler.ErrorRequestsCount);
}
/// <summary>
/// Crawls five JD item URLs with zero sleep times and checks that the whole run takes
/// less than three seconds and that every URL shows up in "FastExit_Result.txt".
/// The test is skipped when the TRAVIS environment variable equals "1".
/// </summary>
public void FastExit()
{
    string travisFlag = Environment.GetEnvironmentVariable("TRAVIS");
    if (travisFlag == "1")
    {
        return;
    }

    const string resultFile = "FastExit_Result.txt";
    string[] itemUrls =
    {
        "http://item.jd.com/1013286.html?_t=1",
        "http://item.jd.com/1013286.html?_t=2",
        "http://item.jd.com/1013286.html?_t=3",
        "http://item.jd.com/1013286.html?_t=4",
        "http://item.jd.com/1013286.html?_t=5"
    };

    // Start from a clean slate so stale results cannot satisfy the assertions.
    if (File.Exists(resultFile))
    {
        File.Delete(resultFile);
    }

    Stopwatch timer = Stopwatch.StartNew();

    var site = new Site { CycleRetryTimes = 5, EncodingName = "UTF-8", SleepTime = 0 };
    Spider spider = Spider.Create(site, new FastExitPageProcessor())
        .AddPipeline(new FastExitPipeline());
    spider.ThreadNum = 1;
    spider.EmptySleepTime = 0;
    foreach (string url in itemUrls)
    {
        spider.AddStartUrl(url);
    }

    spider.Run();
    timer.Stop();

    long elapsedMs = timer.ElapsedMilliseconds;
    Assert.True(elapsedMs < 3000);

    var results = File.ReadAllLines(resultFile);
    foreach (string url in itemUrls)
    {
        Assert.Contains(url, results);
    }
}
/// <summary>
/// Crawls http://58921.com/alltime/wangpiao with a single thread using
/// <c>BoxOfficePageProcessor</c> and <c>BoxOfficePipeline</c>, then calls
/// <c>DeleteOldData()</c>.
/// </summary>
public static void Run()
{
    var boxOfficeSite = new Site { EncodingName = "UTF-8" };
    boxOfficeSite.AddStartUrl("http://58921.com/alltime/wangpiao");

    var scheduler = new QueueDuplicateRemovedScheduler();
    var processor = new BoxOfficePageProcessor();

    Spider spider = Spider.Create(boxOfficeSite, scheduler, processor)
        .AddPipeline(new BoxOfficePipeline())
        .SetDownloader(new HttpClientDownloader())
        .SetThreadNum(1);

    spider.EmptySleepTime = 60000;
    spider.Run();

    // Clean-up happens only after Run() has returned.
    DeleteOldData();
}
/// <summary>
/// Registers <c>NLogMonitor</c> as the <c>IMonitorService</c> singleton, crawls pages 1 to 4
/// of a Youku listing with one thread, registers the spider with <c>SpiderMonitor</c>, and
/// waits for console input once <c>Run()</c> has returned.
/// </summary>
public static void Run()
{
    IocExtension.ServiceCollection.AddSingleton<IMonitorService, NLogMonitor>();

    var site = new Site { EncodingName = "UTF-8" };
    for (int page = 1; page <= 4; page++)
    {
        string listingPath = $"www.youku.com/v_olist/c_97_g__a__sg__mt__lg__q__s_1_r_0_u_0_pt_0_av_0_ag_0_sg__pr__h__d_1_p_{page}.html";
        site.AddStartUrl("http://" + listingPath);
    }

    // Argument order (site, processor, scheduler) is kept exactly as the original call.
    Spider spider = Spider.Create(site, new MyPageProcessor(), new QueueDuplicateRemovedScheduler())
        .AddPipeline(new MyPipeline())
        .SetThreadNum(1);

    SpiderMonitor.Register(spider);
    spider.Run();
    Console.Read();
}
/// <summary>
/// Runs a spider whose site has <c>RemoveOutboundLinks</c> enabled but no start URLs or
/// domains configured, and expects an exception with the message
/// "When you want remove outbound links, the domains should not be null or empty.".
/// The test fails with a plain exception when nothing is thrown.
/// </summary>
public void RemoveOutboundLinksSetting()
{
    Exception failure = null;

    try
    {
        var site = new Site { RemoveOutboundLinks = true };
        var spider = Spider.Create(site, "1111", new QueueDuplicateRemovedScheduler(), new TestPageProcessor());
        spider.Run();
    }
    catch (Exception exception)
    {
        failure = exception;
    }

    if (failure == null)
    {
        throw new Exception("TEST FAILED.");
    }

    Assert.Equal("When you want remove outbound links, the domains should not be null or empty.", failure.Message);
}
/// <summary>
/// Console entry point: registers <c>NLogMonitor</c> as the <c>IMonitorService</c>
/// singleton, queues 10000 start URLs on a single-threaded spider backed by
/// <c>TestDownloader</c>, then calls <c>Run()</c>, <c>Stop()</c> and <c>RunAsync()</c>
/// with five-second pauses in between.
/// </summary>
/// <param name="args">Command-line arguments; not used.</param>
public static void Main(string[] args)
{
    IocExtension.ServiceCollection.AddSingleton<IMonitorService, NLogMonitor>();

    var site = new Site { EncodingName = "UTF-8", MinSleepTime = 1000 };
    Spider spider = Spider.Create(site, new SpiderTest.TestPageProcessor())
        .AddPipeline(new SpiderTest.TestPipeline())
        .SetThreadNum(1);
    spider.SetDownloader(new TestDownloader());

    int index = 0;
    while (index < 10000)
    {
        spider.AddStartUrl("http://www.baidu.com/" + index);
        index++;
    }

    spider.Run();
    Thread.Sleep(5000);

    spider.Stop();
    Thread.Sleep(5000);

    spider.RunAsync();
    Thread.Sleep(5000);
}
/// <summary>
/// Logs the start of the job and crawls http://www.wandafilm.com/ with a single thread
/// using <c>WandaPageProcessor</c>, <c>WandaPipeline</c> and an <c>HttpClientDownloader</c>.
/// </summary>
public static void WandaSpiderRun()
{
    LogHelper.WriteLog(typeof(WandaSpider), "Start to get data from wanda website");

    var wandaSite = new Site
    {
        EncodingName = "UTF-8",
        RemoveOutboundLinks = true
    };
    wandaSite.AddStartUrl("http://www.wandafilm.com/");

    var scheduler = new QueueDuplicateRemovedScheduler();
    var processor = new WandaPageProcessor();

    Spider spider = Spider.Create(wandaSite, scheduler, processor)
        .AddPipeline(new WandaPipeline())
        .SetDownloader(new HttpClientDownloader())
        .SetThreadNum(1);

    spider.EmptySleepTime = 3000;
    spider.Run();
}
/// <summary>
/// Crawls a single mlr.gov.cn article URL and expects the spider to report five retries.
/// The test only runs when <c>Env.IsWindows</c> is true.
/// </summary>
public void _404Url()
{
    if (!Env.IsWindows)
    {
        return;
    }

    var site = new Site { EncodingName = "UTF-8", SleepTime = 1000 };
    var scheduler = new QueueDuplicateRemovedScheduler();
    var spider = Spider.Create(site, "abcd", scheduler, new TestPageProcessor());

    spider.AddPipeline(new ConsolePipeline());
    spider.SkipTargetUrlsWhenResultIsEmpty = false;
    spider.EmptySleepTime = 6000;
    spider.AddStartUrl("http://www.mlr.gov.cn/xwdt/jrxw/201707/t20170710_15242382.htm");

    spider.Run();

    Assert.Equal(5, spider.RetriedTimes.Value);
}
/// <summary>
/// Runs a five-URL spider with a <c>DbMonitor</c> attached and then checks the
/// DotnetSpider.Log and DotnetSpider.Status tables: the last log message must start with
/// "Crawl complete, cost", exactly one status row must exist for the identity, and its
/// status must be "Finished". The whole test executes under <c>TestBase.Locker</c>.
/// </summary>
public void DatebaseLogAndStatus()
{
    lock (TestBase.Locker)
    {
        // Reset configuration: remove the global config file, clear the overrides, reload.
        if (File.Exists(Env.GlobalAppConfigPath))
        {
            File.Delete(Env.GlobalAppConfigPath);
        }

        AppDomain.CurrentDomain.SetData("CONFIG", "");
        AppDomain.CurrentDomain.SetData("DBCONFIG", "");
        Env.Reload();

        string id = Guid.NewGuid().ToString("N");
        string taskGroup = Guid.NewGuid().ToString("N");
        string userId = Guid.NewGuid().ToString("N");

        var site = new Site { EncodingName = "UTF-8", SleepTime = 1000 };
        using (Spider spider = Spider.Create(site, id, new QueueDuplicateRemovedScheduler(), new TestPageProcessor()))
        {
            spider.AddPipeline(new TestPipeline());
            spider.ThreadNum = 1;

            int index = 0;
            while (index < 5)
            {
                spider.AddStartUrl("http://www.baidu.com/" + index);
                index++;
            }

            spider.Monitor = new DbMonitor(id);
            spider.Run();
        }

        // Wait three seconds before querying the log and status tables.
        Thread.Sleep(3000);

        using (var conn = Env.SystemConnectionStringSettings.GetDbConnection())
        {
            // The identity is a freshly generated GUID, so it selects only this run's rows.
            string lastMessage = conn.Query<Log>($"SELECT * FROM DotnetSpider.Log where Identity='{id}'").Last().message;
            Assert.StartsWith("Crawl complete, cost", lastMessage);

            var statusRowCount = conn.Query<CountResult>($"SELECT COUNT(*) as Count FROM DotnetSpider.Status where Identity='{id}'").First().Count;
            Assert.Equal(1, statusRowCount);

            var finalStatus = conn.Query<statusObj>($"SELECT * FROM DotnetSpider.Status where Identity='{id}'").First().status;
            Assert.Equal("Finished", finalStatus);
        }
    }
}
/// <summary>
/// Crawls a single Baidu Tieba URL through <c>HttpClientDownloader</c> and expects the
/// spider to report zero retries. The test only runs when <c>Env.IsWindows</c> is true.
/// </summary>
public void _301Url()
{
    if (!Env.IsWindows)
    {
        return;
    }

    var site = new Site { EncodingName = "UTF-8", SleepTime = 1000 };
    var scheduler = new QueueDuplicateRemovedScheduler();
    var spider = Spider.Create(site, "abcd", scheduler, new TestPageProcessor());

    spider.AddPipeline(new ConsolePipeline());
    spider.SkipTargetUrlsWhenResultIsEmpty = true;
    spider.Downloader = new HttpClientDownloader();
    spider.EmptySleepTime = 6000;
    spider.AddStartUrl("https://tieba.baidu.com/f?kw=%E7%AE%80%E9%98%B3&ie=utf-8&pn=50");

    spider.Run();

    Assert.Equal(0, spider.RetriedTimes.Value);
}
/// <summary>
/// Runs a spider that has no pipeline attached. The test makes no assertions; it passes
/// as long as <c>Run()</c> returns without throwing.
/// </summary>
public void NoPipeline()
{
    // An earlier, disabled version of this test expected Run() to throw a
    // SpiderException with the message "Pipelines should not be null.".
    var site = new Site { EncodingName = "UTF-8", SleepTime = 1000 };
    Spider pipelineLessSpider = Spider.Create(site, new TestPageProcessor());

    pipelineLessSpider.EmptySleepTime = 1000;
    pipelineLessSpider.Run();
}
/// <summary>
/// Re-crawls the detail page of every stored <c>HouseInfo</c> whose
/// <c>HouseInsideArea</c> is null. One GET request is created per house URL and the
/// requests are processed with ten threads by <c>HouseDetailsProcessor</c> and
/// <c>HouseDetailsPipe</c>.
/// </summary>
private static void GetHouseNull()
{
    var requestHeaders = new Dictionary<string, string>
    {
        ["Accept"] = "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
        ["Referer"] = "https://zz.lianjia.com/ershoufang/ng1nb1mw1f2/",
        ["Cache-Control"] = "max-age=0",
        ["Connection"] = "keep-alive",
        ["Content-Type"] = "application/x-www-form-urlencoded; charset=UTF-8",
        ["User-Agent"] = "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36"
    };
    var site = new Site
    {
        CycleRetryTimes = 1,
        SleepTime = 200,
        Headers = requestHeaders,
    };

    List<Request> pendingRequests = new List<Request>();
    List<HouseInfo> incompleteHouses = Repository<HouseInfo>.Query(n => n.HouseInsideArea == null);
    foreach (HouseInfo house in incompleteHouses)
    {
        // One plain GET per house detail page.
        pendingRequests.Add(new Request
        {
            Url = house.Url,
            Method = System.Net.Http.HttpMethod.Get
        });
    }

    var spider = Spider.Create(site, new QueueDuplicateRemovedScheduler(), new HouseDetailsProcessor())
        .AddStartRequests(pendingRequests.ToArray())
        .AddPipeline(new HouseDetailsPipe());
    spider.ThreadNum = 10;
    spider.Run();
}
/// <summary>
/// Console entry point: posts the shop-module AJAX request for pages 1 to 33 of an
/// autohome.com.cn store, processes the responses with <c>AutoHomeProcessor</c> and
/// <c>AutoHomePipe</c> on a single thread, and then waits for console input.
/// </summary>
/// <param name="args">Command-line arguments; not used.</param>
static void Main(string[] args)
{
    // NOTE: a commented-out "Cookie" header holding captured session and login tokens
    // used to live in this initializer. It was never sent (it was commented out) and has
    // been removed so that credentials are not kept in source. If the endpoint ever
    // needs cookies, load them from configuration instead of hard-coding them.
    var site = new Site
    {
        CycleRetryTimes = 1,
        SleepTime = 200,
        Headers = new Dictionary<string, string>()
        {
            { "Accept", "text/html, */*; q=0.01" },
            { "Referer", "https://store.mall.autohome.com.cn/83106681.html" },
            { "Cache-Control", "no-cache" },
            { "Connection", "keep-alive" },
            { "Content-Type", "application/x-www-form-urlencoded; charset=UTF-8" },
            { "User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.167 Safari/537.36" }
        }
    };

    // One POST per result page; only the "page" form field differs between requests.
    List<Request> resList = new List<Request>();
    for (int i = 1; i <= 33; i++)
    {
        Request res = new Request();
        res.PostBody = $"id=7&j=%7B%22createMan%22%3A%2218273159100%22%2C%22createTime%22%3A1518433690000%2C%22row%22%3A5%2C%22siteUserActivityListId%22%3A8553%2C%22siteUserPageRowModuleId%22%3A84959%2C%22topids%22%3A%22%22%2C%22wherePhase%22%3A%221%22%2C%22wherePreferential%22%3A%220%22%2C%22whereUsertype%22%3A%220%22%7D&page={i}&shopid=83106681";
        res.Url = "https://store.mall.autohome.com.cn/shop/ajaxsitemodlecontext.jtml";
        res.Method = System.Net.Http.HttpMethod.Post;
        resList.Add(res);
    }

    var spider = Spider.Create(site, new QueueDuplicateRemovedScheduler(), new AutoHomeProcessor())
        .AddStartRequests(resList.ToArray())
        .AddPipeline(new AutoHomePipe());
    spider.ThreadNum = 1;
    spider.Run();

    Console.Read();
}
/// <summary>
/// Crawls three 163.com channel pages through <c>TestDownloader</c> with zero sleep times
/// and checks that the whole run takes less than three seconds and that every URL shows
/// up in "FastExit_Result.txt". The test is skipped when the TRAVIS environment variable
/// equals "1".
/// </summary>
public void FastExit()
{
    string travisFlag = Environment.GetEnvironmentVariable("TRAVIS");
    if (travisFlag == "1")
    {
        return;
    }

    const string resultFile = "FastExit_Result.txt";
    string[] channelUrls =
    {
        "http://war.163.com/",
        "http://sports.163.com/",
        "http://ent.163.com/"
    };

    // Start from a clean slate so stale results cannot satisfy the assertions.
    if (File.Exists(resultFile))
    {
        File.Delete(resultFile);
    }

    Stopwatch timer = Stopwatch.StartNew();

    Spider spider = Spider.Create(new FastExitPageProcessor())
        .AddPipeline(new FastExitPipeline());
    spider.ThreadNum = 1;
    spider.EmptySleepTime = 0;
    spider.EncodingName = "UTF-8";
    spider.CycleRetryTimes = 5;
    spider.SleepTime = 0;
    foreach (string url in channelUrls)
    {
        spider.AddRequests(url);
    }
    spider.Downloader = new TestDownloader();

    spider.Run();
    timer.Stop();

    long elapsedMs = timer.ElapsedMilliseconds;
    Assert.True(elapsedMs < 3000);

    var results = File.ReadAllLines(resultFile);
    foreach (string url in channelUrls)
    {
        Assert.Contains(url, results);
    }
}
/// <summary>
/// Queues 10000 requests, starts the spider asynchronously, and after five seconds
/// pauses it with a callback that calls <c>Contiune()</c>. The test makes no assertions
/// and is skipped when the TRAVIS environment variable equals "1".
/// </summary>
public void RunAsyncAndContiune()
{
    string travisFlag = Environment.GetEnvironmentVariable("TRAVIS");
    if (travisFlag == "1")
    {
        return;
    }

    Spider spider = Spider.Create(new TestPageProcessor())
        .AddPipeline(new TestPipeline());
    spider.ThreadNum = 1;
    spider.EncodingName = "UTF-8";

    int index = 0;
    while (index < 10000)
    {
        spider.AddRequests("http://www.baidu.com/" + index);
        index++;
    }

    spider.RunAsync();
    Thread.Sleep(5000);

    // The callback passed to Pause resumes the spider.
    spider.Pause(() =>
    {
        spider.Contiune();
    });
    Thread.Sleep(5000);
}
/// <summary>
/// Crawls pages 1 to 4 of a Youku listing with one thread and a depth setting of 2,
/// using <c>MyPageProcessor</c> and <c>MyPipeline</c>.
/// </summary>
public static void Run()
{
    // Define the Site to crawl; headers, cookies, proxies and so on can be set on it.
    var site = new Site
    {
        EncodingName = "UTF-8",
        RemoveOutboundLinks = true
    };

    // Add the start URLs: listing pages 1 through 4.
    for (int page = 1; page <= 4; page++)
    {
        string listingPath = $"www.youku.com/v_olist/c_97_g__a__sg__mt__lg__q__s_1_r_0_u_0_pt_0_av_0_ag_0_sg__pr__h__d_1_p_{page}.html";
        site.AddStartUrl("http://" + listingPath);
    }

    // Create the spider with an in-memory scheduler, a custom page processor
    // and a custom pipeline.
    Spider spider = Spider.Create(site, new QueueDuplicateRemovedScheduler(), new MyPageProcessor())
        .AddPipeline(new MyPipeline())
        .SetThreadNum(1);
    spider.EmptySleepTime = 3000;
    spider.Deep = 2;

    // Start the crawler.
    spider.Run();
}
/// <summary>
/// Builds a spider for http://www.cnblogs.com/, assigns a
/// <c>FileLockerRedialExecutor</c> to <c>RedialExecutor</c>, and verifies the property is
/// not null afterwards. The spider is never run.
/// </summary>
public void Setting()
{
    var site = new Site { EncodingName = "UTF-8", RemoveOutboundLinks = true };
    // Set the start/seed URL.
    site.AddStartUrl("http://www.cnblogs.com/");

    // Crawler identity. "HH" (24-hour clock) keeps identities created in the morning and
    // in the afternoon of the same day distinct; "hh" did not.
    string identity = "cnblogs_" + DateTime.Now.ToString("yyyyMMddHHmmss");

    // In-memory queue scheduler; DefaultPageProcessor is given the pattern
    // "cnblogs\.com"; results are handed to a FilePipeline.
    Spider spider = Spider.Create(
            site,
            identity,
            new QueueDuplicateRemovedScheduler(),
            new DefaultPageProcessor(new[] { "cnblogs\\.com" }))
        .AddPipeline(new FilePipeline());

    spider.RedialExecutor = new FileLockerRedialExecutor(new AdslRedialer("", "", ""), new VpsInternetDetector());

    Assert.IsNotNull(spider.RedialExecutor);
}
/// <summary>
/// Crawls http://www.baidu.com and http://www.163.com/ with one thread through
/// <c>HttpClientDownloader</c>, with an <c>NLogMonitor</c> attached, and verifies the
/// retry, left, success and error counters exposed by the spider and its scheduler.
/// </summary>
public void RetryRequest()
{
    var site = new Site { EncodingName = "UTF-8" };
    var scheduler = new QueueDuplicateRemovedScheduler();
    site.AddStartUrl("http://www.baidu.com");
    site.AddStartUrl("http://www.163.com/");

    // Crawler identity. "HH" (24-hour clock) keeps identities created in the morning and
    // in the afternoon of the same day distinct; "hh" did not.
    string identity = "cnblogs_" + DateTime.Now.ToString("yyyyMMddHHmmss");

    // In-memory queue scheduler, test page processor, results handed to a FilePipeline.
    Spider spider = Spider.Create(site, identity, scheduler, new TestPageProcessor())
        .AddPipeline(new FilePipeline());

    spider.Monitor = new NLogMonitor();
    // Download HTML with the HttpClient based downloader.
    spider.Downloader = new HttpClientDownloader();
    spider.ThreadNum = 1;
    // Traversal depth.
    spider.Scheduler.Depth = 3;
    spider.ClearSchedulerAfterCompleted = false;
    spider.EmptySleepTime = 6000;

    // Start the crawler.
    spider.Run();

    Assert.Equal(5, spider.RetriedTimes.Value);
    Assert.Equal(0, scheduler.LeftRequestsCount);
    Assert.Equal(1, scheduler.SuccessRequestsCount);
    // Original author's note: the retries should be included in this count.
    Assert.Equal(5, scheduler.ErrorRequestsCount);
}
/// <summary>
/// Console entry point: on NET_CORE builds registers the code-pages encoding provider,
/// registers <c>NLogMonitor</c> as the <c>IMonitor</c> singleton, queues ten start URLs on
/// a single-threaded spider backed by <c>TestDownloader</c>, then calls <c>Run()</c>,
/// <c>Stop()</c> and <c>RunAsync()</c> with five-second pauses in between.
/// </summary>
/// <param name="args">Command-line arguments; not used.</param>
public static void Main(string[] args)
{
#if NET_CORE
    Encoding.RegisterProvider(CodePagesEncodingProvider.Instance);
#endif
    IocContainer.Default.AddSingleton<IMonitor, NLogMonitor>();

    var site = new Site { EncodingName = "UTF-8", MinSleepTime = 1000 };
    Spider spider = Spider.Create(site, new SpiderTest.TestPageProcessor())
        .AddPipeline(new SpiderTest.TestPipeline())
        .SetThreadNum(1);
    spider.SetDownloader(new TestDownloader());

    int index = 0;
    while (index < 10)
    {
        spider.AddStartUrl("http://www.baidu.com/" + index);
        index++;
    }

    spider.Run();
    Thread.Sleep(5000);

    spider.Stop();
    Thread.Sleep(5000);

    spider.RunAsync();
    Thread.Sleep(5000);
}