Example #1
        public void CloseSignal()
        {
            Spider spider = Spider.Create(
                new TestPageProcessor()).AddPipeline(new TestPipeline());

            spider.EncodingName    = "UTF-8";
            spider.CycleRetryTimes = 5;
            spider.ClearSchedulerAfterCompleted = false;
            for (int i = 0; i < 20; ++i)
            {
                spider.AddRequests($"http://www.baidu.com/t={i}");
            }
            var task = spider.RunAsync();

            Thread.Sleep(500);
            spider.SendExitSignal();
            task.Wait();
            Assert.Equal(10, spider.Scheduler.SuccessRequestsCount);

            Spider spider2 = Spider.Create(
                new TestPageProcessor()).AddPipeline(new TestPipeline());

            spider2.ClearSchedulerAfterCompleted = false;
            spider2.EncodingName    = "UTF-8";
            spider2.CycleRetryTimes = 5;
            for (int i = 0; i < 25; ++i)
            {
                spider2.AddRequests($"http://www.baidu.com/t={i}");
            }
            spider2.Run();
            Assert.Equal(25, spider2.Scheduler.SuccessRequestsCount);
        }
Example #2
        public void DatebaseLogAndStatus()
        {
            string id = Guid.NewGuid().ToString("N");

            Env.NodeId = "DEFAULT";
            using (Spider spider = Spider.Create(
                       id,
                       new QueueDuplicateRemovedScheduler(),
                       new TestPageProcessor()))
            {
                spider.EncodingName = "UTF-8";
                spider.Downloader   = new TestDownloader();
                spider.TaskId       = "1";
                spider.Monitor      = new MySqlMonitor(spider.TaskId, spider.Identity, false, "Database='mysql';Data Source=localhost;User ID=root;Port=3306;SslMode=None;");
                spider.AddPipeline(new TestPipeline());
                for (int i = 0; i < 5; i++)
                {
                    Serilog.Log.Logger.Information("add start url" + i, id);
                    spider.AddRequests("http://www.baidu.com/" + i);
                }
                spider.EmptySleepTime = 1000;
                spider.Run();
            }
            using (var conn = new MySqlConnection("Database='mysql';Data Source=localhost;User ID=root;Port=3306;SslMode=None;"))
            {
                var logs = conn.Query<Log>($"SELECT * FROM dotnetspider.log where identity='{id}'").ToList();
                Assert.StartsWith("Crawl complete, cost", logs[logs.Count - 1].message);
                Assert.Equal(1, conn.Query<CountResult>($"SELECT COUNT(*) as Count FROM dotnetspider.status where identity='{id}'").First().Count);
                Assert.Equal("Finished", conn.Query<statusObj>($"SELECT * FROM dotnetspider.status where identity='{id}'").First().status);
            }
        }
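The Dapper queries above map onto small DTO classes that are not part of this listing. A minimal sketch of the shapes they appear to expect, with property names inferred from the queries rather than copied from the original test code:

        // Hypothetical DTOs inferred from the Dapper queries above; the real classes in the test project may differ.
        class Log         { public string identity { get; set; } public string message { get; set; } }
        class CountResult { public int Count { get; set; } }
        class statusObj   { public string status { get; set; } }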
Example #3
        public static void Run()
        {
            Spider spider = Spider.Create(
                // use memory queue scheduler
                new QueueDuplicateRemovedScheduler(),
                // the default page processor saves the whole html and extracts target urls via regex
                new DefaultPageProcessor
            {
                Filter           = new PatternFilter(new[] { "cnblogs\\.com" }),
                RequestExtractor = new XPathRequestExtractor(".")
            })
                            // save crawler result to file in the folder: \{running directory}\data\{crawler identity}\{guid}.dsd
                            .AddPipeline(new FilePipeline());

            // download html with the http client
            spider.Downloader = new HttpClientDownloader();
            spider.Name       = "CNBLOGS";
            // 4 threads
            spider.ThreadNum = 4;
            spider.TaskId    = "cnblogs";
            // traversal depth
            spider.Depth        = 3;
            spider.EncodingName = "UTF-8";
            // stop the crawler when it cannot get a url from the scheduler for 30000 ms (30 seconds)
            spider.EmptySleepTime = 30000;
            // Set start/seed url
            spider.AddRequests("http://www.cnblogs.com");
            // start the crawler
            spider.Run();
        }
Example #4
        public static void Run()
        {
            var table    = new TableInfo("youku", "show", TableNamePostfix.Today);
            var selector = new Selector("//div[@class='yk-pack pack-film']");
            var fields   = new[]
            {
                new FieldSelector(".//img[@class='quic']/@alt", "name"),
                new FieldSelector("index", "index", SelectorType.Enviroment, DataType.Int),
                new FieldSelector("", "id", SelectorType.Enviroment, DataType.Int)
                {
                    IsPrimary = true
                },
            };
            var targetRequestSelector = new TargetRequestSelector("//ul[@class='yk-pages']");
            var model = new ModelDefinition(selector, fields, table, targetRequestSelector);
            var json  = JsonConvert.SerializeObject(model);

            Spider spider = Spider.Create(
                new QueueDuplicateRemovedScheduler(),
                new ModelProcessor(model))
                            .AddPipeline(new ConsoleEntityPipeline());

            spider.Name         = "Youku";
            spider.EncodingName = "UTF-8";
            spider.TaskId       = "1";
            for (int i = 1; i < 5; ++i)
            {
                // Add start/seed urls.
                spider.AddRequests($"http://list.youku.com/category/show/c_96_s_1_d_1_p_{i}.html");
            }

            spider.Run();
        }
Example #5
        public void FastExit()
        {
            if (Environment.GetEnvironmentVariable("TRAVIS") == "1")
            {
                return;
            }
            var path = "FastExit_Result.txt";

            if (File.Exists(path))
            {
                File.Delete(path);
            }
            Stopwatch stopwatch = new Stopwatch();

            stopwatch.Start();

            Spider spider = Spider.Create(
                new FastExitPageProcessor())
                            .AddPipeline(new FastExitPipeline());

            spider.ThreadNum       = 1;
            spider.EmptySleepTime  = 0;
            spider.EncodingName    = "UTF-8";
            spider.CycleRetryTimes = 5;
            spider.SleepTime       = 0;
            spider.AddRequests("http://war.163.com/");
            spider.AddRequests("http://sports.163.com/");
            spider.AddRequests("http://ent.163.com/");
            spider.Downloader = new TestDownloader();
            spider.Run();
            stopwatch.Stop();
            var costTime = stopwatch.ElapsedMilliseconds;

            Assert.True(costTime < 3000);
            var results = File.ReadAllLines("FastExit_Result.txt");

            Assert.Contains("http://war.163.com/", results);
            Assert.Contains("http://sports.163.com/", results);
            Assert.Contains("http://ent.163.com/", results);
        }
Example #6
        public void RetryWhenResultIsEmpty()
        {
            Spider spider = Spider.Create(new TestPageProcessor()).AddPipeline(new TestPipeline());

            spider.ThreadNum       = 1;
            spider.EncodingName    = "UTF-8";
            spider.CycleRetryTimes = 5;
            spider.SleepTime       = 1000;
            spider.AddRequests("http://taobao.com");
            spider.Run();

            Assert.Equal(Status.Finished, spider.Status);
        }
Example #7
        public void RetryRequest()
        {
            var scheduler = new QueueDuplicateRemovedScheduler();

            Spider spider = Spider.Create(
                // crawler identity
                "cnblogs_" + DateTime.Now.ToString("yyyyMMddhhmmss"),
                // use memory queue scheduler
                scheduler,
                // the test page processor used by this test
                new TestPageProcessor())
                            // save crawler result to file in the folder: \{running directory}\data\{crawler identity}\{guid}.dsd
                            .AddPipeline(new FilePipeline());

            spider.Monitor = new LogMonitor();
            // download html with the http client
            spider.Downloader   = new HttpClientDownloader();
            spider.EncodingName = "UTF-8";
            spider.ThreadNum    = 1;
            // traversal depth
            spider.Depth = 3;
            spider.ClearSchedulerAfterCompleted = false;
            spider.EmptySleepTime = 6000;
            spider.AddRequests("http://www.baidu.com");
            spider.AddRequests("http://www.163.com/");

            // start the crawler
            spider.Run();

            Assert.Equal(5, spider.RetriedTimes.Value);
            Assert.Equal(0, scheduler.LeftRequestsCount);
            Assert.Equal(1, scheduler.SuccessRequestsCount);
            // the error count should include the retried attempts
            Assert.Equal(6, scheduler.ErrorRequestsCount);
        }
Example #8
        /// <summary>
        /// Fang.com (房天下): new houses in Shijiazhuang
        /// </summary>
        /// <param name="args"></param>
        static void Main(string[] args)
        {
            Spider spider = Spider.Create(
                new QueueDuplicateRemovedScheduler(),
                new Housing.NewHouse.Fang.sjzHousingPageProcessor())
                            .AddPipeline(new HousingPipeline());

            // Start the crawler
            spider.EncodingName = "GBK";
            for (int i = 1; i < 16; ++i)
            {
                // Add start/seed urls.
                spider.AddRequests($"https://sjz.newhouse.fang.com/house/s/b9{i}/");
            }
            spider.Run();
            Console.Read();
        }
Example #9
        public static void Run()
        {
            Instance instance = Instance.LoadFrom("sohu.xml");

            var table  = new TableInfo("websites", "html");
            var fields = new[]
            {
                new FieldSelector(".//title", "title"),
                new FieldSelector(Env.UrlPropertyKey, "url", SelectorType.Enviroment),
                new FieldSelector(".//body", "content", SelectorType.XPath, DataType.String, int.MaxValue),
                new FieldSelector("is_match", "is_match", SelectorType.XPath, DataType.Bool),
                new FieldSelector("matchs", "matchs", SelectorType.XPath, DataType.String, int.MaxValue),
                new FieldSelector("id", "id", SelectorType.Enviroment, DataType.Int)
                {
                    IsPrimary = true
                },
            };
            var targetRequestSelector = new TargetRequestSelector
            {
                XPaths          = instance.TargetXpaths,
                Patterns        = instance.TargetPatterns,
                ExcludePatterns = instance.ExcludePatterns
            };
            var model         = new ModelDefinition(null, fields, table, targetRequestSelector);
            var modeProcessor = new ModelProcessor(model);

            modeProcessor.CleanPound = true;
            modeProcessor.AddDataHanlder(new MyDataHandler());

            Spider spider = Spider.Create(
                new QueueDuplicateRemovedScheduler(),
                modeProcessor)
                            .AddPipeline(new MySqlEntityPipeline());

            spider.EncodingName = instance.Encording;
            spider.AddRequests(instance.Url);
            if (instance.Downloader.ToLower() == "chrome")
            {
                spider.Downloader = new WebDriverDownloader(Browser.Chrome, new Option {
                    Headless = true
                });
            }

            spider.Run();
        }
Example #10
        public static void Run()
        {
            // create the crawler with an in-memory Scheduler, custom PageProcessors and a custom Pipeline
            Spider spider = Spider.Create(
                new QueueDuplicateRemovedScheduler(),
                new BlogSumaryProcessor(),
                new NewsProcessor()).
                            AddPipeline(new MyPipeline());

            spider.EncodingName = "UTF-8";
            for (int i = 1; i < 5; ++i)
            {
                // Add the start/seed url
                spider.AddRequests("http://www.cnblogs.com");
            }
            // Start the crawler
            spider.Run();
        }
Example #11
        public static void Run()
        {
            Spider spider = Spider.Create(
                // use memory queue scheduler
                new QueueDuplicateRemovedScheduler(),
                // use a customized processor for Youku
                new YoukuPageProcessor())
                            // use a customized pipeline for Youku
                            .AddPipeline(new YoukuPipeline());

            // Start the crawler
            spider.EncodingName = "UTF-8";
            for (int i = 1; i < 5; ++i)
            {
                // Add start/seed urls.
                spider.AddRequests($"http://list.youku.com/category/show/c_96_s_1_d_1_p_{i}.html");
            }
            spider.Run();
        }
Example #12
        public void RunAsyncAndContiune()
        {
            if (Environment.GetEnvironmentVariable("TRAVIS") == "1")
            {
                return;
            }
            Spider spider = Spider.Create(new TestPageProcessor()).AddPipeline(new TestPipeline());

            spider.ThreadNum    = 1;
            spider.EncodingName = "UTF-8";
            for (int i = 0; i < 10000; i++)
            {
                spider.AddRequests("http://www.baidu.com/" + i);
            }
            spider.RunAsync();
            Thread.Sleep(5000);
            spider.Pause(() =>
            {
                spider.Contiune();
            });
            Thread.Sleep(5000);
        }
Example #13
        public void ProcesserException()
        {
            var    scheduler = new QueueDuplicateRemovedScheduler();
            Spider spider    = Spider.Create(
                // crawler identity
                "youku",
                // use memory queue scheduler
                scheduler,
                // the test page processor used by this test
                new TestPageProcessor())
                               // save crawler result to file in the folder: \{running directory}\data\{crawler identity}\{guid}.dsd
                               .AddPipeline(new FilePipeline());

            spider.ClearSchedulerAfterCompleted = false;
            // download html with the http client
            spider.Downloader = new HttpClientDownloader();
            spider.AddHeaders("v.youku.com", new Dictionary<string, object> {
                { "Upgrade-Insecure-Requests", "1" }
            });

            spider.ThreadNum = 1;
            // traversal depth
            spider.Depth          = 3;
            spider.EmptySleepTime = 6000;

            spider.AddRequests("http://v.youku.com/v_show/id_XMTMyMTkzNTY1Mg==.html?spm=a2h1n.8251845.0.0");
            spider.AddRequests("http://v.youku.com/v_show/id_XMjkzNzMwMDMyOA==.html?spm=a2h1n.8251845.0.0");
            spider.AddRequests("http://v.youku.com/v_show/id_XMjcwNDg0NDI3Mg==.html?spm=a2h1n.8251845.0.0");
            spider.AddRequests("http://v.youku.com/v_show/id_XMTMwNzQwMTcwMA==.html?spm=a2h1n.8251845.0.0");
            spider.AddRequests("http://v.youku.com/v_show/id_XMjk1MzI0Mzk4NA==.html?spm=a2h1n.8251845.0.0");
            spider.AddRequests("http://v.youku.com/v_show/id_XMjkzNzY0NzkyOA==.html?spm=a2h1n.8251845.0.0");
            spider.AddRequests("http://www.163.com/");

            // start the crawler
            spider.Run();

            Assert.Equal(5, spider.RetriedTimes.Value);
            Assert.Equal(0, scheduler.LeftRequestsCount);
            Assert.Equal(6, scheduler.SuccessRequestsCount);
            Assert.Equal(6, scheduler.ErrorRequestsCount);
        }
Example #14
        /// <summary>
        /// Start the crawler
        /// </summary>
        /// <param name="request">Requests to crawl</param>
        /// <param name="parser">Page parsers</param>
        /// <param name="storage">Data storage; defaults to printing results to the console</param>
        public void Start(List<Request> request, List<IDataFlow> parser, IDataFlow storage = null)
        {
            Spider _spider = _provider.Create<Spider>();

            if (null == storage)
            {
                storage = new ConsoleStorage();
            }
            _spider.NewGuidId();           // set the task identity
            _spider.Name  = "测试采集";        // set the task name ("test crawl")
            _spider.Speed = 10;            // crawl speed: requests downloaded per second; values above 1 are faster, below 1 slower, and it must not be 0
            _spider.Depth = 3;             // crawl depth
            if (parser != null)
            {
                foreach (IDataFlow item in parser)
                {
                    _spider.AddDataFlow(item);
                }
            }
            _spider.AddDataFlow(storage);
            _spider.AddRequests(request.ToArray()); // set the start requests
            _spider.RunAsync();                     // start the crawler
        }
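A minimal sketch of how this Start helper might be invoked, assuming DotnetSpider's Request type can be constructed from a url string and that a parser implementing IDataFlow is available; MyBlogParser below is a hypothetical name, not part of the original sample:

        // Hypothetical caller of the Start method above.
        // Assumptions: a Request(url) constructor exists and MyBlogParser implements IDataFlow.
        var requests = new List<Request> { new Request("http://www.cnblogs.com") };
        var parsers  = new List<IDataFlow> { new MyBlogParser() };
        Start(requests, parsers);   // storage is omitted, so results go to ConsoleStorage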