/// <summary>
        /// Registers Kafka as the message queue implementation.
        /// All spiders built from this <see cref="SpiderBuilder"/> scope share the
        /// same <see cref="KafkaMessageQueue"/> singleton.
        /// (The original doc claimed standalone/in-memory mode — that was wrong:
        /// this method wires up Kafka, not an in-memory queue.)
        /// NOTE(review): the name looks like a typo for "UseKafka"; kept unchanged
        /// because existing callers invoke UserKafka().
        /// </summary>
        /// <param name="builder">The spider builder.</param>
        /// <returns>The same spider builder, for chaining.</returns>
        public static SpiderBuilder UserKafka(this SpiderBuilder builder)
        {
            Check.NotNull(builder, nameof(builder));

            builder.Services.AddSingleton<IMessageQueue, KafkaMessageQueue>();
            return builder;
        }
Пример #2
0
        /// <summary>
        /// Runs the Vnexpress spider against the business-news section in
        /// standalone mode with a dynamic message queue.
        /// </summary>
        /// <returns>The task returned by the spider run.</returns>
        public static Task Run()
        {
            // Compose a standalone spider host with Serilog logging.
            var spiderBuilder = new SpiderBuilder();
            spiderBuilder.AddSerilog();
            spiderBuilder.ConfigureAppConfiguration();
            spiderBuilder.UseStandalone();

            // Project settings for the Vnexpress "Kinh Doanh" crawl.
            var project = new ProjectDefinition()
            {
                ProjectName      = "Vnexpress Spider",
                Site             = "Vnexpress/Kinh Doanh",
                ItemUrlsSelector = "",
                Urls             = "https://vnexpress.net/kinh-doanh",
                FileFormat       = "*.html",
                FileStorage      = @"P:\Neil.Test\Spider Storage\Vnexpress",
                PageLimit        = 4,
            };
            spiderBuilder.Services.AddSingleton<ProjectDefinition>(project);

            spiderBuilder.AddSpider<VnexpressSpider>();
            spiderBuilder.UseDynamicMessageQueue();

            var spider = spiderBuilder.Build().Create<VnexpressSpider>();
            return spider.RunAsync();
        }
Пример #3
0
        /// <summary>
        /// Verifies that a spider with RetryWhenResultIsEmpty disabled records a
        /// single successful download and no failures/retries.
        /// </summary>
        public void DoNotRetryWhenResultIsEmpty()
        {
            // Arrange: standalone host, empty downloader, retry-on-empty disabled.
            var builder = new SpiderBuilder();

            builder.AddSerilog();
            builder.ConfigureAppConfiguration(null, null, false);
            builder.UseStandalone();
            var provider = builder.Build();
            var spider   = provider.Create <Spider>();

            spider.NewGuidId();
            spider.Name                    = "RetryWhenResultIsEmpty";
            spider.EmptySleepTime          = 15;
            spider.RetryDownloadTimes      = 5;
            spider.RetryWhenResultIsEmpty  = false;
            spider.DownloaderSettings.Type = DownloaderType.Empty;
            spider.Scheduler               = new QueueDistinctBfsScheduler();
            spider.AddRequests("http://www.DoNotRetryWhenResultIsEmpty.com");

            // Act. GetAwaiter().GetResult() instead of Wait()/.Result so a failure
            // surfaces as the original exception, not an AggregateException.
            spider.RunAsync().GetAwaiter().GetResult();

            var statisticsStore = provider.GetRequiredService <IStatisticsStore>();
            var s = statisticsStore.GetSpiderStatisticsAsync(spider.Id).GetAwaiter().GetResult();

            var ds = statisticsStore.GetDownloadStatisticsListAsync(1, 10).GetAwaiter().GetResult()[0];

            // Assert: exactly one request, downloaded once, no failures recorded.
            Assert.Equal(1, s.Total);
            Assert.Equal(0, s.Failed);
            Assert.Equal(1, s.Success);

            Assert.Equal(0, ds.Failed);
            Assert.Equal(1, ds.Success);
        }
        /// <summary>
        /// Crawls cnblogs.com site-wide with a regex-gated parser and prints
        /// parsed items to the console. Blocks until the crawl completes.
        /// </summary>
        public static void Run1()
        {
            var builder = new SpiderBuilder();

            builder.AddSerilog();
            builder.ConfigureAppConfiguration();
            builder.UseStandalone();
            builder.AddSpider <EntitySpider>();
            var provider = builder.Build();
            var spider   = provider.Create <Spider>();

            spider.Id    = Guid.NewGuid().ToString("N");                // Task identifier.
            spider.Name  = "博客园全站采集";                                   // Task name.
            spider.Speed = 1;                                           // Requests per second (> 1 faster, < 1 slower, never 0).
            spider.Depth = 3;                                           // Crawl depth limit.
            spider.DownloaderSettings.Type = DownloaderType.HttpClient; // Plain HttpClient downloader, no cookies.
            spider.AddDataFlow(new DataParser
            {
                SelectableFactory   = context => context.GetSelectable(ContentType.Html),
                CanParse            = DataParserHelper.CanParseByRegex("cnblogs\\.com"),
                QueryFollowRequests = DataParserHelper.QueryFollowRequestsByXPath(".")
            }).AddDataFlow(new ConsoleStorage());          // Print parsed items to the console.
            spider.AddRequests("http://www.cnblogs.com/"); // Seed URL.

            // Wait for completion: the original discarded the Task returned by
            // RunAsync(), so failures were unobserved and the method returned
            // before the crawl had run.
            spider.RunAsync().GetAwaiter().GetResult();
        }
        /// <summary>
        /// Crawls the nvshens.com gallery with a chain of tag/detail page parsers,
        /// feeding image URLs to the background <see cref="ImageDownloader"/>.
        /// Blocks until the crawl completes.
        /// </summary>
        public static void Run()
        {
            // Start the background image downloader before the spider produces work.
            ImageDownloader.GetInstance().Start();

            var builder = new SpiderBuilder();

            builder.AddSerilog();
            builder.ConfigureAppConfiguration();
            builder.UseStandalone();
            builder.AddSpider <EntitySpider>();
            var provider = builder.Build();
            var spider   = provider.Create <Spider>();

            spider.Id    = Guid.NewGuid().ToString("N");                // Task identifier.
            spider.Name  = "宅男女神图片采集";                                  // Task name.
            spider.Speed = 2;                                           // Requests per second (> 1 faster, < 1 slower, never 0).
            spider.Depth = 5;                                           // Crawl depth limit.
            spider.DownloaderSettings.Type = DownloaderType.HttpClient; // Plain HttpClient downloader, no cookies.
            spider.AddDataFlow(new NvshensFirstPageTagDataParser());
            spider.AddDataFlow(new NvshensPageTagDataParser());
            spider.AddDataFlow(new NvshensFirstPageDetailDataParser());
            spider.AddDataFlow(new NvshensPageDetailDataParser());
            spider.AddRequests("https://www.nvshens.com/gallery/luoli/"); // Seed URL.

            // Wait for completion instead of discarding the Task: fire-and-forget
            // hid failures and let the method return before the crawl finished.
            spider.RunAsync().GetAwaiter().GetResult();
        }
Пример #6
0
        /// <summary>
        /// Runs the cnblogs news spider with Kafka as the message queue, parsing
        /// entries into <see cref="EntitySpider.CnblogsEntry"/> and persisting them
        /// through the default storage.
        /// </summary>
        /// <returns>The task returned by the spider run.</returns>
        public static Task Run()
        {
            // Host wired to Kafka instead of the in-memory queue.
            var spiderBuilder = new SpiderBuilder();
            spiderBuilder.AddSerilog();
            spiderBuilder.ConfigureAppConfiguration();
            spiderBuilder.UserKafka();

            var spider = spiderBuilder.Build().Create<Spider>();

            spider.Id    = Guid.NewGuid().ToString("N");                // Task identifier.
            spider.Name  = "博客园全站采集";                                   // Task name.
            spider.Speed = 2;                                           // Requests per second (> 1 faster, < 1 slower, never 0).
            spider.Depth = 3;                                           // Crawl depth limit.
            spider.DownloaderSettings.Type = DownloaderType.HttpClient; // Plain HttpClient downloader, no cookies.

            spider.AddDataFlow(new DataParser <EntitySpider.CnblogsEntry>())
                  .AddDataFlow(spider.GetDefaultStorage());

            // Seed the first two news listing pages, each tagged with its site name.
            spider.AddRequests(
                new Request("https://news.cnblogs.com/n/page/1/",
                            new Dictionary <string, string> { { "网站", "博客园" } }),
                new Request("https://news.cnblogs.com/n/page/2/",
                            new Dictionary <string, string> { { "网站", "博客园" } }));

            return spider.RunAsync();
        }
Пример #7
0
        /// <summary>
        /// Crawls Sogou WeChat search result pages 1..10 for the query, using a
        /// Redis-backed scheduler and five worker threads.
        /// </summary>
        public void Run()
        {
            Spider spider = SpiderBuilder.CreateBuilder()
                            .AddRequest("https://weixin.sogou.com/").Buid();

            spider.AddPageProcessor(new Processor2());

            // Queue result pages 1..10 for the search query.
            for (int i = 1; i <= 10; i++)
            {
                spider.AddRequest($"https://weixin.sogou.com/weixin?type=2&ie=utf8&page={i}&query=马云");
            }

            spider.NewRequestSleepInterval = 2000; // 2s between new requests.

            spider.ThreadNumber = 5;

            spider.UseRedisScheduler("localhost");

            // Subscribe BEFORE starting: the original attached this handler after
            // Run(), so status changes raised during the run were missed entirely.
            spider.OnStatusChanged += Spider_OnStatusChanged;

            spider.Run();
        }
Пример #8
0
        /// <summary>
        /// Resolves a spider type by name from command-line configuration and runs
        /// it to completion (standalone unless "distribute" is set).
        /// </summary>
        /// <param name="args">Command-line arguments; must contain a "spider" key.</param>
        /// <exception cref="SpiderException">No spider name was supplied.</exception>
        public static void Run(params string[] args)
        {
            Framework.SetEncoding();

            var    configurationBuilder = Framework.CreateConfigurationBuilder(null, args);
            var    configuration        = configurationBuilder.Build();
            string spider = configuration["spider"];

            if (string.IsNullOrWhiteSpace(spider))
            {
                throw new SpiderException("未指定需要执行的爬虫");
            }

            var name       = configuration["name"];
            var id         = configuration["id"] ?? Guid.NewGuid().ToString("N");
            var config     = configuration["config"];
            var arguments  = configuration["args"]?.Split(' ');
            var distribute = configuration["distribute"] == "true";

            PrintEnvironment(args);

            var spiderTypes = DetectSpiders();

            if (spiderTypes == null || spiderTypes.Count == 0)
            {
                return;
            }

            // Ordinal case-insensitive match: the original ToLower()==ToLower()
            // comparison is culture-sensitive (e.g. the Turkish-I problem), which
            // is wrong for matching .NET type names.
            var spiderType = spiderTypes.FirstOrDefault(
                x => string.Equals(x.Name, spider, StringComparison.OrdinalIgnoreCase));

            if (spiderType == null)
            {
                ConsoleHelper.WriteLine($"未找到爬虫: {spider}", 0, ConsoleColor.DarkYellow);
                return;
            }

            var builder = new SpiderBuilder();

            builder.AddSerilog();
            builder.ConfigureAppConfiguration(config);
            if (!distribute)
            {
                builder.UseStandalone();
            }

            builder.AddSpider(spiderType);
            var provider = builder.Build();
            var instance = provider.Create(spiderType);

            if (instance != null)
            {
                instance.Name = name;
                instance.Id   = id;
                // Wait for completion: the original discarded the returned Task, so
                // this method could return while the spider was still running and
                // any failure went unobserved.
                instance.RunAsync(arguments).GetAwaiter().GetResult();
            }
            else
            {
                ConsoleHelper.WriteLine("创建爬虫对象失败", 0, ConsoleColor.DarkYellow);
            }
        }
Пример #9
0
        /// <summary>
        /// Builds a standalone spider factory for the test fixtures and logs the
        /// "Development" configuration value.
        /// </summary>
        protected TestBase()
        {
            var spiderBuilder = new SpiderBuilder();
            spiderBuilder.AddSerilog();
            spiderBuilder.ConfigureAppConfiguration(null, null, false);
            spiderBuilder.UseStandalone();

            SpiderFactory = spiderBuilder.Build();

            var logger = SpiderFactory.GetRequiredService<ILogger<TestBase>>();
            logger.LogInformation($"Development {SpiderFactory.GetRequiredService<IConfiguration>()["Development"]}");
        }
Пример #10
0
        /// <summary>
        /// Crawls cnblogs.com with a single page processor, logging through NLog,
        /// using a randomized 100–1000ms delay between new requests.
        /// </summary>
        public void Run()
        {
            Spider spider = SpiderBuilder.CreateBuilder()
                            .AddRequest("https://www.cnblogs.com/")
                            .AddPageProcessor(new CNBlogProcessor())
                            .Buid();

            spider.UseNLog();

            // Randomize the pause between requests instead of a fixed interval.
            Random random = new Random();

            spider.UseStaticSleepInterval = false;
            spider.NewRequestDynamicSleepInterval = () => random.Next(100, 1000);

            // Echo each chosen sleep interval for visibility.
            spider.OnNewRequesting += (_, interval) => Console.WriteLine("sleep:" + interval);

            spider.Run();
        }
Пример #11
0
        /// <summary>
        /// Runs the Vnexpress spider in standalone mode with a null dynamic
        /// message queue registration.
        /// </summary>
        /// <returns>The task returned by the spider run.</returns>
        public static Task Run()
        {
            var spiderBuilder = new SpiderBuilder();
            spiderBuilder.AddSerilog();
            spiderBuilder.ConfigureAppConfiguration();
            spiderBuilder.UseStandalone();
            spiderBuilder.AddSpider <VnexpressSpider>();

            // NOTE(review): this factory deliberately resolves IDynamicMessageQueue
            // to null (apparently an experiment — see the commented alternative in
            // the original). Confirm the container and consumers tolerate a null
            // service before relying on this.
            spiderBuilder.Services.AddSingleton <IDynamicMessageQueue, InMemoryMessageQueue>((s) => null);

            var spider = spiderBuilder.Build().Create <VnexpressSpider>();
            return spider.RunAsync();
        }
Пример #12
0
        /// <summary>
        /// Runs the HttpClient spider against the Vnexpress business section with
        /// a fake proxy validator, pagination following, and title/description
        /// field mapping; results are written as JSON files.
        /// </summary>
        /// <returns>The task returned by the spider run.</returns>
        public static Task Run()
        {
            var spiderBuilder = new SpiderBuilder();
            spiderBuilder.AddSerilog();

            // Fake validator: accepts every proxy supplied by the proxy service.
            spiderBuilder.Services.AddSingleton <IProxyValidator, FakeProxyValidator>();
            spiderBuilder.ConfigureAppConfiguration(null, args: new string[] { "/ProxySupplyUrl=http://localhost:52445/api/proxies" }, true);
            spiderBuilder.UseStandalone();

            // Project definition: follow pagination up to the page limit and map
            // article title/description from each detail page.
            var project = new ProjectDefinition()
            {
                ProjectName                = "Vnexpress Spider",
                Site                       = "Vnexpress/Kinh Doanh",
                ItemUrlsSelector           = "//article/h1[@class='title_news']/a[1];//article[@class='list_news']/h4[@class='title_news']/a[1]",
                Urls                       = "https://vnexpress.net/kinh-doanh",
                FileStorage                = @"P:\Neil.Test\Spider Storage\Vnexpress",
                FileFormat                 = "*.json",
                PageLimit                  = 4,
                Deepth                     = 2,
                NextPageSelector           = "//p[@id='pagination']/a[@class='next']",
                NumberOfConcurrentRequests = 5,
                Mapping                    = new ItemMapping
                {
                    ItemCssSelector = "//section[@id='left_calculator']",
                    Mapping         = new FieldMapping[]
                    {
                        new FieldMapping { Field = "Title", CssSelector = "//h1[@class='title_news_detail mb10']" },
                        new FieldMapping { Field = "Description", CssSelector = "//p[@class='description']" },
                    }
                }
            };
            spiderBuilder.Services.AddSingleton <ProjectDefinition>(project);

            spiderBuilder.AddSpider <HttpClientSpider>();
            spiderBuilder.UseDynamicMessageQueue();

            var spider = spiderBuilder.Build().Create <HttpClientSpider>();
            return spider.RunAsync();
        }
Пример #13
0
        /// <summary>
        /// Crawls cnblogs.com site-wide with the Cnblogs parser and prints parsed
        /// items to the console.
        /// </summary>
        /// <returns>The task returned by the spider run.</returns>
        public static Task Run()
        {
            var spiderBuilder = new SpiderBuilder();
            spiderBuilder.AddSerilog();
            spiderBuilder.ConfigureAppConfiguration();
            spiderBuilder.UseStandalone();
            spiderBuilder.AddSpider <EntitySpider>();

            var spider = spiderBuilder.Build().Create <Spider>();

            spider.NewGuidId();                                         // Task identifier.
            spider.Name  = "博客园全站采集";                                   // Task name.
            spider.Speed = 1;                                           // Requests per second (> 1 faster, < 1 slower, never 0).
            spider.Depth = 3;                                           // Crawl depth limit.
            spider.DownloaderSettings.Type = DownloaderType.HttpClient; // Plain HttpClient downloader, no cookies.
            spider.AddDataFlow(new CnblogsDataParser()).AddDataFlow(new ConsoleStorage());
            spider.AddRequests("http://www.cnblogs.com/");              // Seed URL.

            return spider.RunAsync();
        }
Пример #14
0
        /// <summary>
        /// Builds a standalone host and runs the cnblogs spider to completion.
        /// Changed from async void to async Task: async void made exceptions
        /// unobservable (crashing the process) and gave callers nothing to await;
        /// existing statement-style callers still compile.
        /// </summary>
        /// <returns>A task that completes when the crawl finishes.</returns>
        public static async Task Run()
        {
            var builder = new SpiderBuilder();

            builder.AddSerilog();
            builder.ConfigureAppConfiguration(loadCommandLine: false);
            builder.UseStandalone();

            builder.AddSpider <CnblogsSpider>();
            var provider = builder.Build();

            var spider = provider.Create <CnblogsSpider>();
            await spider.RunAsync();
        }
Пример #15
0
        // Entry point: resolves a spider type from configuration, wires Serilog
        // (console + per-spider rolling file) and Kafka messaging, then runs the
        // spider synchronously. All failures are caught and logged.
        static void Main(string[] args)
        {
            try
            {
                var builder = new SpiderBuilder();

                var configurationBuilder = Framework.CreateConfigurationBuilder(null, args);
                var configuration        = configurationBuilder.Build();
                var @class   = configuration["dotnetspider.spider.class"];
                var spiderId = configuration["dotnetspider.spider.id"];

                // NOTE(review): these hard-coded assignments clobber the values just
                // read from configuration, making the lookups above (and parts of the
                // validation below) dead code — presumably leftover debug/demo wiring.
                // Confirm before shipping.
                @class   = "DotnetSpider.Spiders.CnblogsSpider";
                spiderId = "xxxxxxxx";


                // Log under /logs/ when that directory exists (e.g. in a container),
                // otherwise in the working directory.
                var folder = Directory.Exists("/logs/") ? "/logs/" : "";

                // One log file per spider id when available; otherwise timestamped.
                var logPath = string.IsNullOrWhiteSpace(spiderId)
                                        ? $"{folder}{DateTime.Now:yyyy-MM-dd HH:mm:ss}.log"
                                        : $"{folder}{spiderId}.log";

                var loggerConfiguration = new LoggerConfiguration()
                                          .MinimumLevel.Information()
                                          .MinimumLevel.Override("Microsoft", LogEventLevel.Warning)
                                          .Enrich.FromLogContext()
                                          .WriteTo.Console().WriteTo
                                          .RollingFile(logPath);
                builder.AddSerilog(loggerConfiguration);

                var spiderName = configuration["dotnetspider.spider.name"];
                // NOTE(review): same debug override pattern as above — the configured
                // name is discarded.
                spiderName = "博客园";
                if (string.IsNullOrWhiteSpace(@class) ||
                    string.IsNullOrWhiteSpace(spiderId) ||
                    string.IsNullOrWhiteSpace(spiderName)
                    )
                {
                    Log.Logger.Error($"执行爬虫的参数不正确: class {@class}, id {spiderId}, name {spiderName}");
                    return;
                }

                // Resolve the spider type by its assembly-qualified/full name.
                var type = Type.GetType(@class);
                if (type == null)
                {
                    Log.Logger.Error($"未找到爬虫类型: {@class}");
                    return;
                }

                Log.Logger.Information($"获取爬虫类型 {type.FullName} 成功");
                builder.ConfigureAppConfiguration(configuration);
                builder.UserKafka();
                builder.AddSpider(type);
                var provider = builder.Build();

                var spider = provider.Create(type);
                Log.Logger.Information($"创建爬虫实例成功");
                spider.Id   = spiderId;
                spider.Name = spiderName;

                Log.Logger.Information($"尝试启动爬虫实例");
                // Blocking run: Main returns only after the spider finishes.
                spider.Run();

                Log.Logger.Information($"爬虫实例退出");
            }
            catch (Exception e)
            {
                Log.Logger.Error($"执行失败: {e}");
            }
        }