Exemplo n.º 1
0
        /// <summary>
        /// Stores a MySQL connection string under the <c>logAndStatusConnectString</c> configuration key,
        /// runs a single-threaded spider over five start URLs, and then asserts that the
        /// <c>dotnetspider.status</c> table holds 1 row and the <c>dotnetspider.log</c> table holds 7 rows
        /// for the identity / user / task-group generated for this run.
        /// </summary>
        public void DatebaseLogAndStatus()
        {
            // Fresh identifiers per run so the row counts below only match this run's records.
            string identity    = Guid.NewGuid().ToString("N");
            string group       = Guid.NewGuid().ToString("N");
            string owner       = Guid.NewGuid().ToString("N");
            string mysqlTarget = "Database='test';Data Source=localhost;User ID=root;Password=1qazZAQ!;Port=3306";

            // Round-trip the connection string through Configuration before the spider is created.
            Configuration.SetValue("logAndStatusConnectString", mysqlTarget);
            string storedTarget = Configuration.GetValue("logAndStatusConnectString");
            Assert.Equal("Database='test';Data Source=localhost;User ID=root;Password=1qazZAQ!;Port=3306", storedTarget);

            var crawlSite = new Site
            {
                EncodingName = "UTF-8",
                MinSleepTime = 1000
            };

            using (Spider crawler = Spider.Create(crawlSite, identity, owner, group, new TestPageProcessor(), new QueueDuplicateRemovedScheduler()))
            {
                crawler.AddPipeline(new TestPipeline()).SetThreadNum(1);

                // Start URLs end in 0 through 4.
                for (int suffix = 0; suffix <= 4; ++suffix)
                {
                    crawler.AddStartUrl("http://www.baidu.com/" + suffix);
                }

                MonitorCenter.Register(crawler);
                crawler.Run();
            }

            // Count the rows recorded for exactly this run.
            string statusSql = $"SELECT COUNT(*) as Count FROM dotnetspider.status where userid='{owner}' and taskgroup='{group}' and identity='{identity}'";
            string logSql    = $"SELECT COUNT(*) as Count FROM dotnetspider.log where userid='{owner}' and taskgroup='{group}' and identity='{identity}'";

            using (MySqlConnection connection = new MySqlConnection(mysqlTarget))
            {
                var statusRow = connection.Query<CountResult>(statusSql).First();
                Assert.Equal(1, statusRow.Count);

                var logRow = connection.Query<CountResult>(logSql).First();
                Assert.Equal(7, logRow.Count);
            }
        }
Exemplo n.º 2
0
        /// <summary>
        /// Sample entry point: registers <c>NLogMonitor</c> as the <c>IMonitor</c> singleton, queues
        /// list pages 1 through 4 on a <c>Site</c>, runs a single-threaded spider over them, and then
        /// blocks on <c>Console.Read()</c>.
        /// </summary>
        public static void Run()
        {
            // Inject the monitoring service.
            IocContainer.Default.AddSingleton<IMonitor, NLogMonitor>();

            // The Site object describes what to crawl; only the encoding is set here.
            var targetSite = new Site();
            targetSite.EncodingName = "UTF-8";

            // Queue the initial links: one list page per page number.
            for (int page = 1; page <= 4; page++)
            {
                string listPageUrl = "http://" + $"www.youku.com/v_olist/c_97_g__a__sg__mt__lg__q__s_1_r_0_u_0_pt_0_av_0_ag_0_sg__pr__h__d_1_p_{page}.html";
                targetSite.AddStartUrl(listPageUrl);
            }

            // Build the spider from the site, a custom page processor, a QueueDuplicateRemovedScheduler
            // and a custom pipeline.
            Spider crawler = Spider
                .Create(targetSite, new MyPageProcessor(), new QueueDuplicateRemovedScheduler())
                .AddPipeline(new MyPipeline())
                .SetThreadNum(1);
            crawler.EmptySleepTime = 3000;

            // Register the spider with the monitoring service, then start it.
            MonitorCenter.Register(crawler);
            crawler.Run();

            // Wait for console input before returning.
            Console.Read();
        }
Exemplo n.º 3
0
        /// <summary>
        /// Sample entry point: registers <c>NLogMonitor</c> as the <c>IMonitor</c> singleton, queues
        /// the cnblogs home page as a start URL, runs a single-threaded spider that uses two page
        /// processors, and then blocks on <c>Console.Read()</c>.
        /// </summary>
        public static void Run()
        {
            // Inject the monitoring service.
            IocManager.AddSingleton<IMonitor, NLogMonitor>();

            // The Site object describes what to crawl; only the encoding is set here.
            var targetSite = new Site();
            targetSite.EncodingName = "UTF-8";

            // Add the initial link.
            // NOTE(review): the same URL is added on each of the four iterations and the counter is
            // otherwise unused - confirm whether a single AddStartUrl call was intended.
            int remaining = 4;
            while (remaining > 0)
            {
                targetSite.AddStartUrl("http://www.cnblogs.com");
                remaining--;
            }

            // Build the spider from the site, a QueueDuplicateRemovedScheduler, two custom page
            // processors and a custom pipeline.
            Spider crawler = Spider
                .Create(targetSite, new QueueDuplicateRemovedScheduler(), new BlogSumaryProcessor(), new NewsProcessor())
                .AddPipeline(new MyPipeline())
                .SetThreadNum(1);

            // Register the spider with the monitoring service, then start it.
            MonitorCenter.Register(crawler);
            crawler.Run();

            // Wait for console input before returning.
            Console.Read();
        }
Exemplo n.º 4
0
        /// <summary>
        /// Prepares this entity spider (cookies, entity processor, per-entity pipelines, start
        /// requests), registers it with <c>MonitorCenter</c>, and runs it via <c>base.Run()</c>.
        /// <c>Dispose()</c> and <c>MonitorCenter.Dispose()</c> are always called on exit.
        /// </summary>
        /// <param name="arguments">
        /// Optional switches. <c>"rerun"</c> re-initialises and clears the scheduler, deletes the
        /// verify record for this identity and forces start requests to be rebuilt.
        /// <c>"running-test"</c> skips <c>base.Run()</c> and only sets <c>IsExited</c>.
        /// </param>
        public override void Run(params string[] arguments)
        {
            InitEnvorimentAndVerify();

            try
            {
#if !NET_CORE
                // Cookie interception is compiled only when NET_CORE is not defined.
                if (CookieInterceptor != null)
                {
                    this.Log("尝试获取 Cookie...", LogLevel.Info);
                    var cookie = CookieInterceptor.GetCookie();
                    if (cookie == null)
                    {
                        // No cookie: log a warning and stop. The finally block below still runs.
                        this.Log("获取 Cookie 失败, 爬虫无法继续.", LogLevel.Warn);
                        return;
                    }
                    else
                    {
                        // Copy both cookie representations onto the Site.
                        Site.CookiesStringPart = cookie.CookiesStringPart;
                        Site.Cookies           = cookie.CookiesDictionary;
                    }
                }
#endif

                this.Log("创建爬虫...", LogLevel.Info);
                EntityProcessor processor = new EntityProcessor(this);

                // A single EntityProcessor handles every configured entity.
                foreach (var entity in Entities)
                {
                    processor.AddEntity(entity);
                }
                PageProcessor = processor;
                // For each entity, clone every configured pipeline, initialise the clone for that
                // entity, and keep only the clones that report themselves as enabled.
                foreach (var entity in Entities)
                {
                    string entiyName = entity.Entity.Name;
                    var    pipelines = new List <BaseEntityPipeline>();
                    foreach (var pipeline in EntityPipelines)
                    {
                        var newPipeline = pipeline.Clone();
                        newPipeline.InitiEntity(entity);
                        if (newPipeline.IsEnabled)
                        {
                            pipelines.Add(newPipeline);
                        }
                    }
                    // An entity with no enabled pipelines gets no EntityPipeline at all.
                    if (pipelines.Count > 0)
                    {
                        Pipelines.Add(new EntityPipeline(entiyName, pipelines));
                    }
                }

                CheckIfSettingsCorrect();

                // Without a Db, start requests are always initialised by this instance.
                bool   needInitStartRequest = true;
                string key = "locker-" + Identity;
                if (Db != null)
                {
                    // Retry once per second until the lock for this identity is taken.
                    // NOTE(review): the 10-minute TimeSpan is presumably the lock expiry - confirm
                    // against the Db API.
                    while (!Db.LockTake(key, "0", TimeSpan.FromMinutes(10)))
                    {
                        Thread.Sleep(1000);
                    }
                    // Skip start-request initialisation when the stored status for this identity
                    // is already "init finished".
                    var lockerValue = Db.HashGet(InitStatusSetName, Identity);
                    needInitStartRequest = lockerValue != "init finished";
                }

                if (arguments.Contains("rerun"))
                {
                    Scheduler.Init(this);
                    Scheduler.Clear();
                    // Delete the verify record for this identity.
                    Db?.HashDelete(ValidateStatusName, Identity);
                    // A rerun always rebuilds the start requests, whatever the stored status says.
                    needInitStartRequest = true;
                }

                this.Log("构建内部模块、准备爬虫数据...", LogLevel.Info);
                InitComponent();

                if (needInitStartRequest)
                {
                    if (PrepareStartUrls != null)
                    {
                        for (int i = 0; i < PrepareStartUrls.Length; ++i)
                        {
                            var prepareStartUrl = PrepareStartUrls[i];
                            // The logged step number is i + 2, so the first builder logs step 2.
                            this.Log($"[步骤 {i + 2}] 添加链接到调度中心.", LogLevel.Info);
                            prepareStartUrl.Build(this, null);
                        }
                    }
                }

                MonitorCenter.Register(this);

                // Release the lock taken above now that preparation is complete.
                // NOTE(review): this is the only release; if anything between LockTake and this
                // line throws, the lock is not released here - confirm that is intended.
                Db?.LockRelease(key, 0);

                RegisterControl(this);

                if (!arguments.Contains("running-test"))
                {
                    base.Run();
                }
                else
                {
                    // "running-test": do not run the spider, only mark it as exited.
                    IsExited = true;
                }

                TaskFinished();

                HandleVerifyCollectData();
            }
            finally
            {
                // Runs on normal completion, on the early return above, and on exceptions.
                Dispose();
                MonitorCenter.Dispose();
            }
        }