Example #1
0
        /// <summary>
        /// Verifies retry/error accounting when the processor throws: the failing
        /// request is retried 5 times, all six youku requests succeed, and the
        /// scheduler is left empty but not cleared (ClearSchedulerAfterCompleted = false).
        /// </summary>
        public void ProcesserException()
        {
            var site = new Site
            {
                Headers = new System.Collections.Generic.Dictionary <string, string>
                {
                    { "Upgrade-Insecure-Requests", "1" }
                }
            };

            var scheduler = new QueueDuplicateRemovedScheduler();

            site.AddRequests("http://v.youku.com/v_show/id_XMTMyMTkzNTY1Mg==.html?spm=a2h1n.8251845.0.0");
            site.AddRequests("http://v.youku.com/v_show/id_XMjkzNzMwMDMyOA==.html?spm=a2h1n.8251845.0.0");
            site.AddRequests("http://v.youku.com/v_show/id_XMjcwNDg0NDI3Mg==.html?spm=a2h1n.8251845.0.0");
            site.AddRequests("http://v.youku.com/v_show/id_XMTMwNzQwMTcwMA==.html?spm=a2h1n.8251845.0.0");
            site.AddRequests("http://v.youku.com/v_show/id_XMjk1MzI0Mzk4NA==.html?spm=a2h1n.8251845.0.0");
            site.AddRequests("http://v.youku.com/v_show/id_XMjkzNzY0NzkyOA==.html?spm=a2h1n.8251845.0.0");
            site.AddRequests("http://www.163.com/");

            // "HH" (24-hour) instead of the original "hh": the 12-hour pattern carries
            // no AM/PM marker, so a morning and an afternoon run could produce the
            // same crawler identity.
            Spider spider = Spider.Create(site,
                            // crawler identity
                                          "cnblogs_" + DateTime.Now.ToString("yyyyMMddHHmmss"),
                            // use in-memory queue scheduler
                                          scheduler,
                            // default page processor saves the whole html and extracts target urls via regex
                                          new TestPageProcessor())
                            // save crawler result to file in the folder: \{running directory}\data\{crawler identity}\{guid}.dsd
                            .AddPipeline(new FilePipeline());

            spider.ClearSchedulerAfterCompleted = false;
            // download html by http client
            spider.Downloader = new HttpWebRequestDownloader();

            spider.ThreadNum = 1;
            // traversal depth
            spider.Scheduler.Depth = 3;
            spider.EmptySleepTime = 6000;
            // start the crawl (blocks until finished)
            spider.Run();

            Assert.Equal(5, spider.RetriedTimes.Value);
            Assert.Equal(0, scheduler.LeftRequestsCount);
            Assert.Equal(6, scheduler.SuccessRequestsCount);
            Assert.Equal(6, scheduler.ErrorRequestsCount);
        }
Example #2
0
        /// <summary>
        /// Running a spider without any pipeline configured must throw a
        /// SpiderException with an explanatory message.
        /// </summary>
        public void ThrowExceptionWhenNoPipeline()
        {
            try
            {
                Spider spider = Spider.Create(new Site {
                    EncodingName = "UTF-8", SleepTime = 1000
                }, new TestPageProcessor());
                spider.Run();
            }
            catch (SpiderException exception)
            {
                // xUnit's Assert.Equal, consistent with the rest of this file
                // (Assert.AreEqual is the NUnit/MSTest API and does not exist in xUnit).
                Assert.Equal("Pipelines should not be null.", exception.Message);
                return;
            }

            throw new Exception("TEST FAILED.");
        }
Example #3
0
        /// <summary>
        /// Starts an async crawl over 10000 seeded urls, stops it after five
        /// seconds, then restarts it, sleeping between each phase.
        /// </summary>
        public void RunAsyncAndStop()
        {
            var crawler = Spider.Create(new Site {
                EncodingName = "UTF-8", MinSleepTime = 1000
            }, new TestPageProcessor()).AddPipeline(new TestPipeline()).SetThreadNum(1);

            for (var index = 0; index < 10000; ++index)
            {
                crawler.AddStartUrl("http://www.baidu.com/" + index);
            }

            crawler.RunAsync();
            Thread.Sleep(5000);
            crawler.Stop();
            Thread.Sleep(5000);
            crawler.RunAsync();
            Thread.Sleep(5000);
        }
        /// <summary>
        /// Builds a model-based crawler from the "sohu.xml" instance definition:
        /// extracts title/url/body plus match flags into the "websites" table and
        /// persists them through MySqlEntityPipeline. Switches to a headless
        /// Chrome downloader when the instance asks for it.
        /// </summary>
        public static void Run()
        {
            Instance instance = Instance.LoadFrom("sohu.xml");

            var table  = new TableInfo("websites", "html");
            var fields = new[]
            {
                new FieldSelector(".//title", "title"),
                new FieldSelector(Env.UrlPropertyKey, "url", SelectorType.Enviroment),
                new FieldSelector(".//body", "content", SelectorType.XPath, DataType.String, int.MaxValue),
                new FieldSelector("is_match", "is_match", SelectorType.XPath, DataType.Bool),
                new FieldSelector("matchs", "matchs", SelectorType.XPath, DataType.String, int.MaxValue),
                new FieldSelector("id", "id", SelectorType.Enviroment, DataType.Int)
                {
                    IsPrimary = true
                },
            };
            var targetRequestSelector = new TargetRequestSelector
            {
                XPaths          = instance.TargetXpaths,
                Patterns        = instance.TargetPatterns,
                ExcludePatterns = instance.ExcludePatterns
            };
            var model         = new ModelDefinition(null, fields, table, targetRequestSelector);
            var modeProcessor = new ModelProcessor(model);

            modeProcessor.CleanPound = true;
            modeProcessor.AddDataHanlder(new MyDataHandler());

            Spider spider = Spider.Create(
                new QueueDuplicateRemovedScheduler(),
                modeProcessor)
                            .AddPipeline(new MySqlEntityPipeline());

            spider.EncodingName = instance.Encording;
            spider.AddRequests(instance.Url);
            // Ordinal case-insensitive comparison instead of ToLower()+"==": avoids
            // culture-sensitive lower-casing (e.g. the Turkish-I problem), skips an
            // intermediate string allocation, and is null-safe if the instance has
            // no downloader configured.
            if (string.Equals(instance.Downloader, "chrome", StringComparison.OrdinalIgnoreCase))
            {
                spider.Downloader = new WebDriverDownloader(Browser.Chrome, new Option {
                    Headless = true
                });
            }

            spider.Run();
        }
Example #5
0
        /// <summary>
        /// Crawls the latest-movies listing on meituan.com with a single-threaded
        /// HttpClient spider and pushes the results through MeituanPipeline.
        /// </summary>
        public static void Run()
        {
            var site = new Site {
                EncodingName = "UTF-8"
            };
            site.AddStartUrl("http://www.meituan.com/dianying/zuixindianying");

            var spider = Spider
                         .Create(site, new QueueDuplicateRemovedScheduler(), new MeituanPageProcessor())
                         .AddPipeline(new MeituanPipeline())
                         .SetDownloader(new HttpClientDownloader())
                         .SetThreadNum(1);

            spider.EmptySleepTime = 60000;
            spider.Run();
        }
Example #6
0
        /// <summary>
        /// Entry point: crawls lottery data from 17500.cn (gb2312-encoded pages)
        /// with a single-threaded HttpClient spider, then waits for a key press.
        /// </summary>
        static void Main(string[] args)
        {
            var site = new Site {
                EncodingName = "gb2312", RemoveOutboundLinks = false
            };
            site.AddStartUrl("http://www.17500.cn/ssq");

            var spider = Spider
                         .Create(site, new QueueDuplicateRemovedScheduler(), new SSQProcessor())
                         .AddPipeline(new SSQPipeline());

            spider.Downloader     = new DotnetSpider.Core.Downloader.HttpClientDownloader();
            spider.ThreadNum      = 1;
            spider.EmptySleepTime = 3000;
            spider.Run();

            Console.WriteLine("Press any key to continue...");
            Console.ReadKey();
        }
        /// <summary>
        /// Creates a crawler with an in-memory scheduler, two custom page
        /// processors and a custom pipeline, seeds it and runs it.
        /// </summary>
        public static void Run()
        {
            var spider = Spider
                         .Create(new QueueDuplicateRemovedScheduler(),
                                 new BlogSumaryProcessor(),
                                 new NewsProcessor())
                         .AddPipeline(new MyPipeline());

            spider.EncodingName = "UTF-8";
            // Seed urls; the duplicate-removing scheduler collapses repeats to one request.
            for (var n = 1; n < 5; ++n)
            {
                spider.AddRequests("http://www.cnblogs.com");
            }

            spider.Run();
        }
Example #8
0
        /// <summary>
        /// Crawls the first four pages of the Youku "show" category listing with
        /// a custom processor and pipeline over an in-memory scheduler.
        /// </summary>
        public static void Run()
        {
            var spider = Spider
                         .Create(new QueueDuplicateRemovedScheduler(), new YoukuPageProcessor())
                         .AddPipeline(new YoukuPipeline());

            spider.EncodingName = "UTF-8";
            for (var page = 1; page < 5; ++page)
            {
                // Seed one listing page per iteration.
                spider.AddRequests($"http://list.youku.com/category/show/c_96_s_1_d_1_p_{page}.html");
            }

            spider.Run();
        }
        /// <summary>
        /// Spider.Create must reject an identity longer than 100 characters
        /// by throwing with an explanatory message.
        /// </summary>
        public void IdentityLengthLimit()
        {
            try
            {
                Spider.Create(new Site {
                    EncodingName = "UTF-8", SleepTime = 1000
                },
                              // deliberately longer than the 100-character identity limit
                              "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
                              new QueueDuplicateRemovedScheduler(),
                              new TestPageProcessor());
            }
            catch (Exception exception)
            {
                Assert.Equal("Length of Identity should less than 100.", exception.Message);
                return;
            }

            // Reaching this line means no exception was thrown — fail the test.
            throw new Exception("TEST FAILED.");
        }
        /// <summary>
        /// Starts an async crawl, then pauses it and exits from the pause
        /// callback; the trailing sleep gives the shutdown time to complete.
        /// </summary>
        public void RunAsyncAndStopThenExit()
        {
            var crawler = Spider.Create(new Site {
                EncodingName = "UTF-8", SleepTime = 1000
            }, new TestPageProcessor()).AddPipeline(new TestPipeline());

            crawler.ThreadNum = 1;
            for (var index = 0; index < 10000; index++)
            {
                crawler.AddStartUrl("http://www.baidu.com/" + index);
            }

            crawler.RunAsync();
            Thread.Sleep(5000);
            crawler.Pause(() => crawler.Exit());
            Thread.Sleep(5000);
        }
        /// <summary>
        /// Crawls the douban top-250 start page with outbound links removed,
        /// without traversing beyond the seeded page set.
        /// </summary>
        public static void CrawlerPagesWithoutTraverse()
        {
            var site = new DotnetSpider.Core.Site {
                EncodingName = "UTF-8", RemoveOutboundLinks = true
            };

            site.AddStartUrl("https://movie.douban.com/top250");
            // "HH" (24-hour) instead of the original "hh": the 12-hour pattern has no
            // AM/PM marker, so morning and afternoon runs could collide on identity.
            Spider spider = Spider.Create(site,
                                          "DOUBAN_" + DateTime.Now.ToString("yyyyMMddHHmmss"),
                                          new QueueDuplicateRemovedScheduler(),
                                          new DouBanPageProcessor())
                            .AddPipeline(new DouBanPipeline());

            spider.ThreadNum      = 2;
            spider.EmptySleepTime = 3000;

            // Start the crawl (blocks until finished).
            spider.Run();
        }
        /// <summary>
        /// Verifies retry accounting: the six youku requests succeed, the failing
        /// request is retried 5 times, and those retries are what the scheduler
        /// reports as its error count.
        /// </summary>
        public void RetryRequest()
        {
            var site = new Site {
                EncodingName = "UTF-8", RemoveOutboundLinks = true
            };

            var scheduler = new QueueDuplicateRemovedScheduler();

            site.AddStartUrl("http://v.youku.com/v_show/id_XMTMyMTkzNTY1Mg==.html?spm=a2h1n.8251845.0.0");
            site.AddStartUrl("http://v.youku.com/v_show/id_XMjkzNzMwMDMyOA==.html?spm=a2h1n.8251845.0.0");
            site.AddStartUrl("http://v.youku.com/v_show/id_XMjcwNDg0NDI3Mg==.html?spm=a2h1n.8251845.0.0");
            site.AddStartUrl("http://v.youku.com/v_show/id_XMTMwNzQwMTcwMA==.html?spm=a2h1n.8251845.0.0");
            site.AddStartUrl("http://v.youku.com/v_show/id_XMjk1MzI0Mzk4NA==.html?spm=a2h1n.8251845.0.0");
            site.AddStartUrl("http://v.youku.com/v_show/id_XMjkzNzY0NzkyOA==.html?spm=a2h1n.8251845.0.0");
            site.AddStartUrl("http://www.cnblogs.com/");

            // "HH" (24-hour) instead of the original "hh": the 12-hour pattern has no
            // AM/PM marker, so identities from morning and afternoon runs could collide.
            Spider spider = Spider.Create(site,
                            // crawler identity
                                          "cnblogs_" + DateTime.Now.ToString("yyyyMMddHHmmss"),
                            // use in-memory queue scheduler
                                          scheduler,
                            // default page processor saves the whole html and extracts target urls via regex
                                          new TestPageProcessor())
                            // save crawler result to file in the folder: \{running directory}\data\{crawler identity}\{guid}.dsd
                            .AddPipeline(new FilePipeline());

            // download html by http client
            spider.Downloader = new HttpClientDownloader();

            spider.ThreadNum = 1;
            // traversal depth
            spider.Deep = 3;

            // start the crawl (blocks until finished)
            spider.Run();

            Assert.Equal(5, spider.RetriedTimes.Value);
            Assert.Equal(0, scheduler.LeftRequestsCount);
            Assert.Equal(6, scheduler.SuccessRequestsCount);
            // the error count covers the retried request's failed attempts
            Assert.Equal(5, scheduler.ErrorRequestsCount);
        }
Example #13
0
        /// <summary>
        /// With SleepTime and EmptySleepTime at zero the spider must finish in
        /// under 3 seconds, and every seeded url must reach the pipeline's
        /// output file. Skipped on Travis CI.
        /// </summary>
        public void FastExit()
        {
            // Travis environments are too slow/flaky for the timing assertion below.
            if (Environment.GetEnvironmentVariable("TRAVIS") == "1")
            {
                return;
            }
            var path = "FastExit_Result.txt";

            // Remove output from a previous run so the final read sees only this run.
            if (File.Exists(path))
            {
                File.Delete(path);
            }
            Stopwatch stopwatch = new Stopwatch();

            stopwatch.Start();

            Spider spider = Spider.Create(new Site {
                CycleRetryTimes = 5, EncodingName = "UTF-8", SleepTime = 0
            },
                                          new FastExitPageProcessor())
                            .AddPipeline(new FastExitPipeline());

            spider.ThreadNum      = 1;
            spider.EmptySleepTime = 0;
            // Same product page seeded five times with distinct query strings,
            // so the duplicate-removing scheduler keeps all five.
            spider.AddStartUrl("http://item.jd.com/1013286.html?_t=1");
            spider.AddStartUrl("http://item.jd.com/1013286.html?_t=2");
            spider.AddStartUrl("http://item.jd.com/1013286.html?_t=3");
            spider.AddStartUrl("http://item.jd.com/1013286.html?_t=4");
            spider.AddStartUrl("http://item.jd.com/1013286.html?_t=5");
            spider.Run();
            stopwatch.Stop();
            var costTime = stopwatch.ElapsedMilliseconds;

            Assert.True(costTime < 3000);
            var results = File.ReadAllLines("FastExit_Result.txt");

            Assert.Contains("http://item.jd.com/1013286.html?_t=1", results);
            Assert.Contains("http://item.jd.com/1013286.html?_t=2", results);
            Assert.Contains("http://item.jd.com/1013286.html?_t=3", results);
            Assert.Contains("http://item.jd.com/1013286.html?_t=4", results);
            Assert.Contains("http://item.jd.com/1013286.html?_t=5", results);
        }
Example #14
0
        /// <summary>
        /// Crawls the all-time box-office listing on 58921.com with a
        /// single-threaded HttpClient spider, then purges stale data.
        /// </summary>
        public static void Run()
        {
            var site = new Site {
                EncodingName = "UTF-8"
            };
            site.AddStartUrl("http://58921.com/alltime/wangpiao");

            var spider = Spider
                         .Create(site, new QueueDuplicateRemovedScheduler(), new BoxOfficePageProcessor())
                         .AddPipeline(new BoxOfficePipeline())
                         .SetDownloader(new HttpClientDownloader())
                         .SetThreadNum(1);

            spider.EmptySleepTime = 60000;
            spider.Run();

            DeleteOldData();
        }
Example #15
0
        /// <summary>
        /// Registers the NLog monitor service, seeds four Youku list pages, and
        /// runs a single-threaded crawl tracked by SpiderMonitor.
        /// </summary>
        public static void Run()
        {
            IocExtension.ServiceCollection.AddSingleton <IMonitorService, NLogMonitor>();

            var site = new Site {
                EncodingName = "UTF-8"
            };

            for (var page = 1; page < 5; ++page)
            {
                site.AddStartUrl("http://" + $"www.youku.com/v_olist/c_97_g__a__sg__mt__lg__q__s_1_r_0_u_0_pt_0_av_0_ag_0_sg__pr__h__d_1_p_{page}.html");
            }

            var spider = Spider
                         .Create(site, new MyPageProcessor(), new QueueDuplicateRemovedScheduler())
                         .AddPipeline(new MyPipeline())
                         .SetThreadNum(1);

            SpiderMonitor.Register(spider);

            spider.Run();
            Console.Read();
        }
Example #16
0
        /// <summary>
        /// Enabling RemoveOutboundLinks without configuring any domains must make
        /// the spider throw with an explanatory message when it runs.
        /// </summary>
        public void RemoveOutboundLinksSetting()
        {
            try
            {
                var spider = Spider.Create(new Site {
                    RemoveOutboundLinks = true
                },
                                           "1111",
                                           new QueueDuplicateRemovedScheduler(),
                                           new TestPageProcessor());
                spider.Run();
            }
            catch (Exception exception)
            {
                // Plain literal: the original used "$" interpolation with no placeholders.
                Assert.Equal("When you want remove outbound links, the domains should not be null or empty.", exception.Message);
                return;
            }

            // Reaching this line means no exception was thrown — fail the test.
            throw new Exception("TEST FAILED.");
        }
Example #17
0
        /// <summary>
        /// Demo entry point: registers the NLog monitor, seeds 10000 urls and
        /// runs the spider with a test downloader.
        /// NOTE(review): Run() blocks until the crawl completes, so the Stop()
        /// and RunAsync() calls after it only take effect once Run() returns —
        /// confirm whether RunAsync() was intended first (cf. RunAsyncAndStop).
        /// </summary>
        public static void Main(string[] args)
        {
            IocExtension.ServiceCollection.AddSingleton <IMonitorService, NLogMonitor>();

            var crawler = Spider.Create(new Site {
                EncodingName = "UTF-8", MinSleepTime = 1000
            }, new SpiderTest.TestPageProcessor()).AddPipeline(new SpiderTest.TestPipeline()).SetThreadNum(1);

            crawler.SetDownloader(new TestDownloader());
            for (var index = 0; index < 10000; index++)
            {
                crawler.AddStartUrl("http://www.baidu.com/" + index);
            }

            crawler.Run();
            Thread.Sleep(5000);
            crawler.Stop();
            Thread.Sleep(5000);
            crawler.RunAsync();
            Thread.Sleep(5000);
        }
Example #18
0
        /// <summary>
        /// Crawls wandafilm.com (outbound links removed) with a single-threaded
        /// HttpClient spider, logging the start of the run.
        /// </summary>
        public static void WandaSpiderRun()
        {
            LogHelper.WriteLog(typeof(WandaSpider), "Start to get data from wanda website");

            var site = new Site {
                EncodingName = "UTF-8", RemoveOutboundLinks = true
            };
            site.AddStartUrl("http://www.wandafilm.com/");

            var spider = Spider
                         .Create(site, new QueueDuplicateRemovedScheduler(), new WandaPageProcessor())
                         .AddPipeline(new WandaPipeline())
                         .SetDownloader(new HttpClientDownloader())
                         .SetThreadNum(1);

            spider.EmptySleepTime = 3000;

            spider.Run();
        }
        /// <summary>
        /// Crawling a url that cannot be fetched successfully must exhaust the
        /// retry budget: RetriedTimes ends at 5. Windows-only.
        /// </summary>
        public void _404Url()
        {
            if (!Env.IsWindows)
            {
                return;
            }

            var crawler = Spider.Create(new Site {
                EncodingName = "UTF-8", SleepTime = 1000
            },
                                        "abcd",
                                        new QueueDuplicateRemovedScheduler(),
                                        new TestPageProcessor());

            crawler.AddPipeline(new ConsolePipeline());
            crawler.SkipTargetUrlsWhenResultIsEmpty = false;
            crawler.EmptySleepTime = 6000;
            crawler.AddStartUrl("http://www.mlr.gov.cn/xwdt/jrxw/201707/t20170710_15242382.htm");
            crawler.Run();

            Assert.Equal(5, crawler.RetriedTimes.Value);
        }
Example #20
0
        /// <summary>
        /// Runs a crawl with a DbMonitor attached, then checks that the final
        /// log line and a single "Finished" status row were written to the
        /// DotnetSpider system database.
        /// </summary>
        public void DatebaseLogAndStatus()
        {
            // Serialize against other tests that mutate the global config/environment.
            lock (TestBase.Locker)
            {
                if (File.Exists(Env.GlobalAppConfigPath))
                {
                    File.Delete(Env.GlobalAppConfigPath);
                }

                // Clear config overrides and reload so Env picks up its defaults.
                AppDomain.CurrentDomain.SetData("CONFIG", "");
                AppDomain.CurrentDomain.SetData("DBCONFIG", "");
                Env.Reload();
                string id        = Guid.NewGuid().ToString("N");
                string taskGroup = Guid.NewGuid().ToString("N");
                string userId    = Guid.NewGuid().ToString("N");

                using (Spider spider = Spider.Create(new Site {
                    EncodingName = "UTF-8", SleepTime = 1000
                },
                                                     id,
                                                     new QueueDuplicateRemovedScheduler(),
                                                     new TestPageProcessor()))
                {
                    spider.AddPipeline(new TestPipeline());
                    spider.ThreadNum = 1;
                    for (int i = 0; i < 5; i++)
                    {
                        spider.AddStartUrl("http://www.baidu.com/" + i);
                    }
                    // Log/status rows for this identity are persisted via the DB monitor.
                    spider.Monitor = new DbMonitor(id);
                    spider.Run();
                }
                // Give the monitor time to flush its final rows before querying.
                Thread.Sleep(3000);
                using (var conn = (Env.SystemConnectionStringSettings.GetDbConnection()))
                {
                    Assert.StartsWith("Crawl complete, cost", conn.Query <Log>($"SELECT * FROM DotnetSpider.Log where Identity='{id}'").Last().message);
                    Assert.Equal(1, conn.Query <CountResult>($"SELECT COUNT(*) as Count FROM DotnetSpider.Status where Identity='{id}'").First().Count);
                    Assert.Equal("Finished", conn.Query <statusObj>($"SELECT * FROM DotnetSpider.Status where Identity='{id}'").First().status);
                }
            }
        }
        /// <summary>
        /// Crawling a url that answers with a redirect must complete without any
        /// retries being recorded. Windows-only.
        /// </summary>
        public void _301Url()
        {
            if (!Env.IsWindows)
            {
                return;
            }

            var crawler = Spider.Create(new Site {
                EncodingName = "UTF-8", SleepTime = 1000
            },
                                        "abcd",
                                        new QueueDuplicateRemovedScheduler(),
                                        new TestPageProcessor());

            crawler.AddPipeline(new ConsolePipeline());
            crawler.SkipTargetUrlsWhenResultIsEmpty = true;
            crawler.Downloader     = new HttpClientDownloader();
            crawler.EmptySleepTime = 6000;
            crawler.AddStartUrl("https://tieba.baidu.com/f?kw=%E7%AE%80%E9%98%B3&ie=utf-8&pn=50");
            crawler.Run();

            Assert.Equal(0, crawler.RetriedTimes.Value);
        }
        /// <summary>
        /// A spider created without any pipeline can still run to completion.
        /// (An earlier, commented-out version of this test expected a
        /// SpiderException here; that dead code has been removed.)
        /// </summary>
        public void NoPipeline()
        {
            Spider spider = Spider.Create(new Site {
                EncodingName = "UTF-8", SleepTime = 1000
            }, new TestPageProcessor());

            spider.EmptySleepTime = 1000;
            spider.Run();
        }
Example #23
0
        /// <summary>
        /// Builds one GET request per house record whose HouseInsideArea is still
        /// null and re-crawls those detail pages with 10 threads, piping results
        /// through HouseDetailsPipe.
        /// </summary>
        private static void GetHouseNull()
        {
            var site = new Site
            {
                CycleRetryTimes = 1,
                SleepTime       = 200,
                // Browser-like headers; the Referer points at the lianjia listing
                // the detail urls were presumably harvested from — TODO confirm.
                Headers         = new Dictionary <string, string>()
                {
                    { "Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8" },
                    { "Referer", "https://zz.lianjia.com/ershoufang/ng1nb1mw1f2/" },
                    { "Cache-Control", "max-age=0" },
                    { "Connection", "keep-alive" },
                    { "Content-Type", "application/x-www-form-urlencoded; charset=UTF-8" },
                    { "User-Agent", "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36" }
                },
            };

            // Select the records that are missing their inside-area value.
            List <Request>   resList   = new List <Request>();
            List <HouseInfo> houseList = Repository <HouseInfo> .Query(n => n.HouseInsideArea == null);

            foreach (HouseInfo item in houseList)
            {
                Request res = new Request();
                //res.PostBody = string.Format("id=7&j=%7B%22createMan%22%3A%2218273159100%22%2C%22createTime%22%3A1518433690000%2C%22row%22%3A5%2C%22siteUserActivityListId%22%3A8553%2C%22siteUserPageRowModuleId%22%3A84959%2C%22topids%22%3A%22%22%2C%22wherePhase%22%3A%221%22%2C%22wherePreferential%22%3A%220%22%2C%22whereUsertype%22%3A%220%22%7D&page={0}&shopid=83106681", i);
                res.Url    = item.Url;
                res.Method = System.Net.Http.HttpMethod.Get;

                resList.Add(res);
            }


            var spider = Spider.Create(site, new QueueDuplicateRemovedScheduler(), new HouseDetailsProcessor())
                         .AddStartRequests(resList.ToArray())

                         .AddPipeline(new HouseDetailsPipe());

            spider.ThreadNum = 10;

            spider.Run();
        }
Example #24
0
        /// <summary>
        /// Entry point: POSTs 33 paged requests to the autohome store module
        /// endpoint (shop 83106681) and pipes the parsed results through
        /// AutoHomePipe on a single thread.
        /// </summary>
        static void Main(string[] args)
        {
            var site = new Site
            {
                CycleRetryTimes = 1,
                SleepTime       = 200,
                // Browser-like headers matching the store page the data comes from.
                Headers         = new Dictionary <string, string>()
                {
                    { "Accept", "text/html, */*; q=0.01" },
                    { "Referer", "https://store.mall.autohome.com.cn/83106681.html" },
                    { "Cache-Control", "no-cache" },
                    { "Connection", "keep-alive" },
                    { "Content-Type", "application/x-www-form-urlencoded; charset=UTF-8" },
                    { "User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.167 Safari/537.36" }
                    //{ "Cookie","fvlid=1496991192019sTVlIox4; sessionid=792A6590-B1C1-4292-902E-BAB469F6B66A%7C%7C2017-06-09+14%3A52%3A46.308%7C%7Cwww.baidu.com; mallsfvi=1501164251817G4Yjfly2%7Cstore.mall.autohome.com.cn%7C0; ag_fid=P3aAofwpa4LibiRF; Hm_lvt_765ecde8c11b85f1ac5f168fa6e6821f=1507955352; cookieCityId=110100; __utma=1.675003556.1498310668.1509877694.1512100363.4; __utmz=1.1512100363.4.4.utmcsr=autohome.com.cn|utmccn=(referral)|utmcmd=referral|utmcct=/beijing/; sessionuid=792A6590-B1C1-4292-902E-BAB469F6B66A%7C%7C2017-06-09+14%3A52%3A46.308%7C%7Cwww.baidu.com; UM_distinctid=15d58caae69692-00e158964a2cc1-8383667-100200-15d58caae6a8f3; cn_1262640694_dplus=%7B%22distinct_id%22%3A%20%2215d58caae69692-00e158964a2cc1-8383667-100200-15d58caae6a8f3%22%2C%22sp%22%3A%20%7B%22%24_sessionid%22%3A%200%2C%22%24_sessionTime%22%3A%201513783651%2C%22%24dp%22%3A%200%2C%22%24_sessionPVTime%22%3A%201513783651%7D%7D; _ga=GA1.3.675003556.1498310668; ahsids=588; Hm_lvt_9924a05a5a75caf05dbbfb51af638b07=1513782253,1514425536,1516346152; ahpau=1; o2oPlatform_user_info_new=\"2PEXHhEpMSe8Ab9wePPfBM/cPTSA+LEhouYSvKqf21ifTs8v3Wa7+5ML9XHTa4PIxdohDy/pfgZn+7jU57K8QJk6GqJpuAFmGdo4AAkQj1s=\"; providerLogin=\"2PEXHhEpMSe8Ab9wePPfBM/cPTSA+LEhouYSvKqf21ifTs8v3Wa7+5ML9XHTa4PIxdohDy/pfgZn+7jU57K8QJk6GqJpuAFmGdo4AAkQj1s=\"; o2oPlatform_company_user_info=2PEXHhEpMSco9C5zzw6CuOMB4aAqz2tF4FWkU/d1pCIKwHFVei9AI0tef+vYWhLhKK6S2blgHM0hg0WFK8FWIX+0p68SYs23; area=431099; mallslvi=0%7C%7C15187909317891KBbjcg5; sessionip=223.152.110.108; mallCityId=999999; ahpvno=21; sessionvid=554FA305-FACF-4617-B002-CEBC69D55AE3; ref=www.mangoauto.com.cn%7C0%7C0%7Cwww.baidu.com%7C2018-02-17+09%3A45%3A31.833%7C2018-02-11+21%3A19%3A03.217; ahrlid=1518831923227kTwIQZVJ-1518832004398" },
                }
            };


            // One POST request per page (1..33) of the shop's activity listing;
            // the url-encoded body carries the paging and shop parameters.
            List <Request> resList = new List <Request>();

            for (int i = 1; i <= 33; i++)
            {
                Request res = new Request();
                res.PostBody = $"id=7&j=%7B%22createMan%22%3A%2218273159100%22%2C%22createTime%22%3A1518433690000%2C%22row%22%3A5%2C%22siteUserActivityListId%22%3A8553%2C%22siteUserPageRowModuleId%22%3A84959%2C%22topids%22%3A%22%22%2C%22wherePhase%22%3A%221%22%2C%22wherePreferential%22%3A%220%22%2C%22whereUsertype%22%3A%220%22%7D&page={i}&shopid=83106681";
                res.Url      = "https://store.mall.autohome.com.cn/shop/ajaxsitemodlecontext.jtml";
                res.Method   = System.Net.Http.HttpMethod.Post;

                resList.Add(res);
            }


            var spider = Spider.Create(site, new QueueDuplicateRemovedScheduler(), new AutoHomeProcessor())
                         .AddStartRequests(resList.ToArray())
                         .AddPipeline(new AutoHomePipe());

            spider.ThreadNum = 1;
            spider.Run();
            Console.Read();
        }
        /// <summary>
        /// With all sleep settings at zero the spider must finish in under
        /// 3 seconds, and every seeded url must reach the pipeline's output
        /// file. Skipped on Travis CI.
        /// </summary>
        public void FastExit()
        {
            // Travis environments are too slow/flaky for the timing assertion below.
            if (Environment.GetEnvironmentVariable("TRAVIS") == "1")
            {
                return;
            }
            var path = "FastExit_Result.txt";

            // Remove output from a previous run so the final read sees only this run.
            if (File.Exists(path))
            {
                File.Delete(path);
            }
            Stopwatch stopwatch = new Stopwatch();

            stopwatch.Start();

            Spider spider = Spider.Create(
                new FastExitPageProcessor())
                            .AddPipeline(new FastExitPipeline());

            spider.ThreadNum       = 1;
            spider.EmptySleepTime  = 0;
            spider.EncodingName    = "UTF-8";
            spider.CycleRetryTimes = 5;
            spider.SleepTime       = 0;
            spider.AddRequests("http://war.163.com/");
            spider.AddRequests("http://sports.163.com/");
            spider.AddRequests("http://ent.163.com/");
            // The test downloader avoids real network I/O, keeping the run fast.
            spider.Downloader = new TestDownloader();
            spider.Run();
            stopwatch.Stop();
            var costTime = stopwatch.ElapsedMilliseconds;

            Assert.True(costTime < 3000);
            var results = File.ReadAllLines("FastExit_Result.txt");

            Assert.Contains("http://war.163.com/", results);
            Assert.Contains("http://sports.163.com/", results);
            Assert.Contains("http://ent.163.com/", results);
        }
        /// <summary>
        /// Starts an async crawl, pauses it after five seconds and resumes it
        /// from the pause callback. Skipped on Travis CI.
        /// </summary>
        public void RunAsyncAndContiune()
        {
            if (Environment.GetEnvironmentVariable("TRAVIS") == "1")
            {
                return;
            }

            var crawler = Spider.Create(new TestPageProcessor()).AddPipeline(new TestPipeline());

            crawler.ThreadNum    = 1;
            crawler.EncodingName = "UTF-8";
            for (var index = 0; index < 10000; index++)
            {
                crawler.AddRequests("http://www.baidu.com/" + index);
            }

            crawler.RunAsync();
            Thread.Sleep(5000);
            // "Contiune" is the (misspelled) resume method on this Spider version.
            crawler.Pause(() => crawler.Contiune());
            Thread.Sleep(5000);
        }
Example #27
0
        /// <summary>
        /// Crawls four Youku list pages to a traversal depth of 2 with an
        /// in-memory scheduler, a custom processor and a custom pipeline.
        /// </summary>
        public static void Run()
        {
            // Site definition; headers, cookies, proxies etc. could also be set here.
            var site = new Site {
                EncodingName = "UTF-8", RemoveOutboundLinks = true
            };

            for (var page = 1; page < 5; ++page)
            {
                site.AddStartUrl("http://" + $"www.youku.com/v_olist/c_97_g__a__sg__mt__lg__q__s_1_r_0_u_0_pt_0_av_0_ag_0_sg__pr__h__d_1_p_{page}.html");
            }

            var spider = Spider
                         .Create(site, new QueueDuplicateRemovedScheduler(), new MyPageProcessor())
                         .AddPipeline(new MyPipeline())
                         .SetThreadNum(1);

            spider.EmptySleepTime = 3000;
            spider.Deep           = 2;

            spider.Run();
        }
Example #28
0
        /// <summary>
        /// Verifies that a RedialExecutor can be assigned to a configured spider
        /// and is retained (non-null afterwards).
        /// </summary>
        public void Setting()
        {
            var site = new Site {
                EncodingName = "UTF-8", RemoveOutboundLinks = true
            };

            // Set start/seed url
            site.AddStartUrl("http://www.cnblogs.com/");

            // "HH" (24-hour) instead of the original "hh": the 12-hour pattern has no
            // AM/PM marker, so morning and afternoon runs could collide on identity.
            Spider spider = Spider.Create(site,
                            // crawler identity
                                          "cnblogs_" + DateTime.Now.ToString("yyyyMMddHHmmss"),
                            // use in-memory queue scheduler
                                          new QueueDuplicateRemovedScheduler(),
                            // default page processor saves the whole html and extracts target urls via regex
                                          new DefaultPageProcessor(new[] { "cnblogs\\.com" }))
                            // save crawler result to file in the folder: \{running directory}\data\{crawler identity}\{guid}.dsd
                            .AddPipeline(new FilePipeline());

            spider.RedialExecutor = new FileLockerRedialExecutor(new AdslRedialer("", "", ""), new VpsInternetDetector());
            // xUnit's Assert.NotNull, consistent with the Assert.Equal/Assert.True
            // used elsewhere in this file (Assert.IsNotNull is the NUnit/MSTest API).
            Assert.NotNull(spider.RedialExecutor);
        }
        /// <summary>
        /// Verifies retry accounting with an NLog monitor attached: one request
        /// succeeds, the failing one is retried 5 times, and those retries make
        /// up the scheduler's error count.
        /// </summary>
        public void RetryRequest()
        {
            var site = new Site {
                EncodingName = "UTF-8"
            };

            var scheduler = new QueueDuplicateRemovedScheduler();

            site.AddStartUrl("http://www.baidu.com");
            site.AddStartUrl("http://www.163.com/");

            // "HH" (24-hour) instead of the original "hh": the 12-hour pattern has no
            // AM/PM marker, so identities from morning and afternoon runs could collide.
            Spider spider = Spider.Create(site,
                            // crawler identity
                                          "cnblogs_" + DateTime.Now.ToString("yyyyMMddHHmmss"),
                            // use in-memory queue scheduler
                                          scheduler,
                            // default page processor will save whole html, and extract urls to target urls via regex
                                          new TestPageProcessor())
                            // save crawler result to file in the folder: \{running directory}\data\{crawler identity}\{guid}.dsd
                            .AddPipeline(new FilePipeline());

            spider.Monitor = new NLogMonitor();
            // download html by http client
            spider.Downloader = new HttpClientDownloader();

            spider.ThreadNum = 1;
            // traversal depth
            spider.Scheduler.Depth = 3;
            spider.ClearSchedulerAfterCompleted = false;
            spider.EmptySleepTime = 6000;
            // start the crawl (blocks until finished)
            spider.Run();

            Assert.Equal(5, spider.RetriedTimes.Value);
            Assert.Equal(0, scheduler.LeftRequestsCount);
            Assert.Equal(1, scheduler.SuccessRequestsCount);
            // the error count covers the retried request's failed attempts
            Assert.Equal(5, scheduler.ErrorRequestsCount);
        }
Example #30
0
        /// <summary>
        /// Demo entry point: registers the NLog monitor, seeds ten urls and runs
        /// the spider with a test downloader.
        /// NOTE(review): Run() blocks until completion, so the Stop() after it is
        /// likely a no-op — confirm whether RunAsync() was intended first.
        /// </summary>
        public static void Main(string[] args)
        {
#if NET_CORE
            Encoding.RegisterProvider(CodePagesEncodingProvider.Instance);
#endif
            IocContainer.Default.AddSingleton <IMonitor, NLogMonitor>();

            var crawler = Spider.Create(new Site {
                EncodingName = "UTF-8", MinSleepTime = 1000
            }, new SpiderTest.TestPageProcessor()).AddPipeline(new SpiderTest.TestPipeline()).SetThreadNum(1);
            crawler.SetDownloader(new TestDownloader());
            for (var index = 0; index < 10; index++)
            {
                crawler.AddStartUrl("http://www.baidu.com/" + index);
            }

            crawler.Run();
            Thread.Sleep(5000);
            crawler.Stop();
            Thread.Sleep(5000);
            crawler.RunAsync();
            Thread.Sleep(5000);
        }