Example #1
0
        /// <summary>
        /// Sample entry point: crawls four youku.com listing pages with a custom
        /// page processor and pipeline, then waits for console input.
        /// </summary>
        public static void Run()
        {
            // Register NLogMonitor as the IMonitorService implementation.
            IocExtension.ServiceCollection.AddSingleton<IMonitorService, NLogMonitor>();

            // The Site object describes the crawl target; headers, cookies,
            // proxies, etc. can also be configured on it.
            var site = new Site();
            site.EncodingName = "UTF-8";

            // Seed the crawl with listing pages 1 through 4.
            for (int page = 1; page <= 4; page++)
            {
                site.AddStartUrl("http://" + $"www.youku.com/v_olist/c_97_g__a__sg__mt__lg__q__s_1_r_0_u_0_pt_0_av_0_ag_0_sg__pr__h__d_1_p_{page}.html");
            }

            // Build the spider from a QueueDuplicateRemovedScheduler, the custom
            // page processor and the custom pipeline, using a single thread.
            Spider spider = Spider
                .Create(site, new MyPageProcessor(), new QueueDuplicateRemovedScheduler())
                .AddPipeline(new MyPipeline())
                .SetThreadNum(1);

            // NOTE(review): presumably milliseconds to wait when the queue is empty - confirm.
            spider.EmptySleepTime = 3000;

            // Register the spider with the monitor service.
            SpiderMonitor.Register(spider);

            // Run the crawl, then block on console input before returning.
            spider.Run();
            Console.Read();
        }
Example #2
0
        /// <summary>
        /// Runs a small spider with database logging configured and asserts that
        /// one status row and seven log rows were written for this run's
        /// identity / user / task-group triple.
        /// </summary>
        public void DatebaseLogAndStatus()
        {
            IocExtension.ServiceCollection.AddSingleton<IMonitorService, NLogMonitor>();

            // Fresh identifiers so the row counts below only see this run.
            string identity = Guid.NewGuid().ToString("N");
            string group    = Guid.NewGuid().ToString("N");
            string user     = Guid.NewGuid().ToString("N");
            string connectString = "Database='test';Data Source=localhost;User ID=root;Password=1qazZAQ!;Port=3306";

            // Store the connection string in configuration and verify the round trip.
            Configuration.SetValue("logAndStatusConnectString", connectString);
            Assert.Equal("Database='test';Data Source=localhost;User ID=root;Password=1qazZAQ!;Port=3306", Configuration.GetValue("logAndStatusConnectString"));
            LogManagerHelper.InitLogManager(true);

            var site = new Site
            {
                EncodingName = "UTF-8",
                MinSleepTime = 1000
            };

            using (Spider spider = Spider.Create(site, identity, user, group, new TestPageProcessor(), new QueueDuplicateRemovedScheduler()))
            {
                spider.AddPipeline(new TestPipeline()).SetThreadNum(1);

                // Five start URLs: http://www.baidu.com/0 .. http://www.baidu.com/4
                int index = 0;
                while (index < 5)
                {
                    spider.AddStartUrl("http://www.baidu.com/" + index);
                    index++;
                }

                SpiderMonitor.Register(spider);
                spider.Run();
            }

            // Both count queries filter on the same identifier triple.
            string filter = $"where userid='{user}' and taskgroup='{group}' and identity='{identity}'";
            string statusSql = "SELECT COUNT(*) as Count FROM dotnetspider.status " + filter;
            string logSql = "SELECT COUNT(*) as Count FROM dotnetspider.log " + filter;

            using (MySqlConnection conn = new MySqlConnection(connectString))
            {
                var statusRow = conn.Query<CountResult>(statusSql).First();
                Assert.Equal(1, statusRow.Count);

                var logRow = conn.Query<CountResult>(logSql).First();
                Assert.Equal(7, logRow.Count);
            }
        }
Example #3
0
        /// <summary>
        /// Sample entry point: starts a 20-thread spider for 36kr.com built from
        /// the Kr36NewsModel page model, then registers it with the spider monitor.
        /// </summary>
        public static void Run()
        {
            var site = new Site();
            site.AddStartUrl("http://www.36kr.com/");

            // Page-model based spider whose results go through CollectorPageModelToDbPipeline.
            Core.Spider spider = OoSpider
                .Create(site, new CollectorPageModelToDbPipeline(), typeof(Kr36NewsModel))
                .SetThreadNum(20);

            // The spider is started first and registered with the monitor afterwards.
            spider.Start();
            SpiderMonitor.Instance.Register(spider);
        }
Example #4
0
        /// <summary>
        /// Sample entry point: crawls four youku.com listing pages on a single
        /// thread with a custom page processor and pipeline, then waits for
        /// console input.
        /// </summary>
        public static void Run()
        {
            // Register NLogMonitor as the IMonitorService implementation.
            IocExtension.ServiceCollection.AddSingleton<IMonitorService, NLogMonitor>();

            var site = new Site();
            site.EncodingName = "UTF-8";

            // Seed the crawl with listing pages 1 through 4.
            int page = 1;
            while (page < 5)
            {
                site.AddStartUrl("http://" + $"www.youku.com/v_olist/c_97_g__a__sg__mt__lg__q__s_1_r_0_u_0_pt_0_av_0_ag_0_sg__pr__h__d_1_p_{page}.html");
                ++page;
            }

            Spider spider = Spider
                .Create(site, new MyPageProcessor(), new QueueDuplicateRemovedScheduler())
                .AddPipeline(new MyPipeline())
                .SetThreadNum(1);

            // Register with the monitor before running.
            SpiderMonitor.Register(spider);

            // Run the crawl, then block on console input before returning.
            spider.Run();
            Console.Read();
        }
Example #5
0
        /// <summary>
        /// Prepares and runs this entity spider: verifies the environment, optionally
        /// fetches cookies, builds the entity processor and per-entity pipelines,
        /// initializes start requests while holding the <c>Db</c> lock (when
        /// <c>Db</c> is set), registers with the monitor and then runs the base spider.
        /// The spider and the monitor are always disposed on exit.
        /// </summary>
        /// <param name="arguments">
        /// Optional switches. <c>"rerun"</c> re-initializes and clears the scheduler and
        /// forces start requests to be rebuilt; <c>"running-test"</c> skips the actual
        /// crawl and only marks the spider as exited. A null array is treated as empty.
        /// </param>
        public override void Run(params string[] arguments)
        {
            // Run(null) binds a null array; treat it as "no arguments" so the
            // Contains checks below cannot throw.
            string[] args = arguments ?? new string[0];

            InitEnvorimentAndVerify();

            try
            {
#if !NET_CORE
                if (CookieInterceptor != null)
                {
                    Logger.SaveLog(LogInfo.Create("尝试获取 Cookie...", Logger.Name, this, LogLevel.Info));
                    var cookie = CookieInterceptor.GetCookie();
                    if (cookie == null)
                    {
                        // Without a cookie the crawl cannot proceed; the finally block still disposes.
                        Logger.SaveLog(LogInfo.Create("获取 Cookie 失败, 爬虫无法继续.", Logger.Name, this, LogLevel.Error));
                        return;
                    }
                    else
                    {
                        Site.CookiesStringPart = cookie.CookiesStringPart;
                        Site.Cookies           = cookie.CookiesDictionary;
                    }
                }
#endif

                Logger.SaveLog(LogInfo.Create("创建爬虫...", Logger.Name, this, LogLevel.Info));

                // One processor handles every configured entity.
                EntityProcessor processor = new EntityProcessor(this);

                foreach (var entity in Entities)
                {
                    processor.AddEntity(entity);
                }
                PageProcessor = processor;

                // For each entity, clone every configured pipeline, bind the clone to the
                // entity and keep only the clones that report themselves as enabled.
                foreach (var entity in Entities)
                {
                    string entiyName = entity.Entity.Name;
                    var    pipelines = new List<BaseEntityPipeline>();
                    foreach (var pipeline in EntityPipelines)
                    {
                        var newPipeline = pipeline.Clone();
                        newPipeline.InitiEntity(entity);
                        if (newPipeline.IsEnabled)
                        {
                            pipelines.Add(newPipeline);
                        }
                    }
                    if (pipelines.Count > 0)
                    {
                        Pipelines.Add(new EntityPipeline(entiyName, pipelines));
                    }
                }

                CheckIfSettingsCorrect();

                bool   needInitStartRequest = true;
                string key = "locker-" + Identity;

                // Tracks whether THIS call acquired the lock, so the finally block never
                // releases a lock it does not hold (every holder uses the same value).
                bool lockTaken = false;
                try
                {
                    if (Db != null)
                    {
                        // Poll once per second until the lock (10 minute expiry) is acquired.
                        while (!Db.LockTake(key, "0", TimeSpan.FromMinutes(10)))
                        {
                            Thread.Sleep(1000);
                        }
                        lockTaken = true;

                        // Start requests are skipped when the stored status is "init finished".
                        var lockerValue = Db.HashGet(InitStatusSetName, Identity);
                        needInitStartRequest = lockerValue != "init finished";
                    }

                    if (args.Contains("rerun"))
                    {
                        Scheduler.Init(this);
                        Scheduler.Clear();
                        // Delete the verify record.
                        Db?.HashDelete(ValidateStatusName, Identity);
                        needInitStartRequest = true;
                    }

                    Logger.SaveLog(LogInfo.Create("构建内部模块、准备爬虫数据...", Logger.Name, this, LogLevel.Info));
                    InitComponent();

                    if (needInitStartRequest)
                    {
                        if (PrepareStartUrls != null)
                        {
                            for (int i = 0; i < PrepareStartUrls.Length; ++i)
                            {
                                var prepareStartUrl = PrepareStartUrls[i];
                                // Step numbers in this log message start at 2.
                                Logger.SaveLog(LogInfo.Create($"[步骤 {i + 2}] 添加链接到调度中心.", Logger.Name, this, LogLevel.Info));
                                prepareStartUrl.Build(this, null);
                            }
                        }
                    }

                    SpiderMonitor.Register(this);
                }
                finally
                {
                    // Release the lock even when initialization throws; otherwise other
                    // instances keep spinning on LockTake until the lock expires.
                    if (lockTaken)
                    {
                        Db?.LockRelease(key, 0);
                    }
                }

                RegisterControl(this);

                if (!args.Contains("running-test"))
                {
                    base.Run();
                }
                else
                {
                    IsExited = true;
                }

                TaskFinished();

                HandleVerifyCollectData();
            }
            finally
            {
                Dispose();
                SpiderMonitor.Dispose();
            }
        }