/// <summary>
/// Registers <c>KafkaMessageQueue</c> as the singleton <c>IMessageQueue</c>
/// implementation in the builder's service collection.
/// </summary>
/// <remarks>
/// NOTE(review): the method name looks like a typo of "UseKafka"; it is kept as-is
/// because callers reference it by this name.
/// </remarks>
/// <param name="builder">Spider builder whose services are configured; passed to <c>Check.NotNull</c> before use.</param>
/// <returns>The same <paramref name="builder"/> instance.</returns>
public static SpiderBuilder UserKafka(this SpiderBuilder builder)
{
    Check.NotNull(builder, nameof(builder));

    builder.Services.AddSingleton<IMessageQueue, KafkaMessageQueue>();
    return builder;
}
/// <summary>
/// Configures a standalone builder with a <c>ProjectDefinition</c> for the
/// Vnexpress "Kinh Doanh" section, registers <c>VnexpressSpider</c>, enables the
/// dynamic message queue and starts the spider.
/// </summary>
/// <returns>The task returned by the spider's <c>RunAsync</c> call.</returns>
public static Task Run()
{
    var spiderBuilder = new SpiderBuilder();
    spiderBuilder.AddSerilog();
    spiderBuilder.ConfigureAppConfiguration();
    spiderBuilder.UseStandalone();

    // Crawl definition handed to the spider through dependency injection.
    var definition = new ProjectDefinition
    {
        ProjectName = "Vnexpress Spider",
        Site = "Vnexpress/Kinh Doanh",
        ItemUrlsSelector = "",
        Urls = "https://vnexpress.net/kinh-doanh",
        FileFormat = "*.html",
        FileStorage = @"P:\Neil.Test\Spider Storage\Vnexpress",
        PageLimit = 4,
    };
    spiderBuilder.Services.AddSingleton<ProjectDefinition>(definition);

    spiderBuilder.AddSpider<VnexpressSpider>();
    spiderBuilder.UseDynamicMessageQueue();

    var spiderFactory = spiderBuilder.Build();
    var vnexpressSpider = spiderFactory.Create<VnexpressSpider>();

    return vnexpressSpider.RunAsync();
}
/// <summary>
/// Runs a spider against a single request with <c>RetryWhenResultIsEmpty</c>
/// switched off and the <c>DownloaderType.Empty</c> downloader, then asserts that
/// the statistics store reports one total, one successful and zero failed requests.
/// </summary>
public void DoNotRetryWhenResultIsEmpty()
{
    var spiderBuilder = new SpiderBuilder();
    spiderBuilder.AddSerilog();
    spiderBuilder.ConfigureAppConfiguration(null, null, false);
    spiderBuilder.UseStandalone();
    var factory = spiderBuilder.Build();

    var target = factory.Create<Spider>();
    target.NewGuidId();
    target.Name = "RetryWhenResultIsEmpty";
    target.EmptySleepTime = 15;
    target.RetryDownloadTimes = 5;
    target.RetryWhenResultIsEmpty = false;
    target.DownloaderSettings.Type = DownloaderType.Empty;
    target.Scheduler = new QueueDistinctBfsScheduler();
    target.AddRequests("http://www.DoNotRetryWhenResultIsEmpty.com");

    // Block until the spider run has finished before reading statistics.
    target.RunAsync().Wait();

    var store = factory.GetRequiredService<IStatisticsStore>();
    var spiderStatistics = store.GetSpiderStatisticsAsync(target.Id).Result;
    var downloadStatisticsPage = store.GetDownloadStatisticsListAsync(1, 10).Result;
    var downloadStatistics = downloadStatisticsPage[0];

    Assert.Equal(1, spiderStatistics.Total);
    Assert.Equal(0, spiderStatistics.Failed);
    Assert.Equal(1, spiderStatistics.Success);

    Assert.Equal(0, downloadStatistics.Failed);
    Assert.Equal(1, downloadStatistics.Success);
}
/// <summary>
/// Builds a standalone spider that crawls cnblogs.com, following links found by
/// XPath and printing results to the console, then starts it.
/// </summary>
/// <remarks>
/// NOTE(review): the value returned by <c>RunAsync</c> is discarded, so this method
/// does not wait on it — confirm callers keep the process alive.
/// </remarks>
public static void Run1()
{
    var spiderBuilder = new SpiderBuilder();
    spiderBuilder.AddSerilog();
    spiderBuilder.ConfigureAppConfiguration();
    spiderBuilder.UseStandalone();
    spiderBuilder.AddSpider<EntitySpider>();
    var factory = spiderBuilder.Build();

    var crawler = factory.Create<Spider>();

    // Task identifier.
    crawler.Id = Guid.NewGuid().ToString("N");
    // Task name.
    crawler.Name = "博客园全站采集";
    // Crawl speed: number of requests downloaded per second. Above 1, larger is faster;
    // below 1, smaller is slower; must not be 0.
    crawler.Speed = 1;
    // Crawl depth.
    crawler.Depth = 3;
    // Plain downloader: a clean HttpClient with no cookie handling.
    crawler.DownloaderSettings.Type = DownloaderType.HttpClient;

    var parser = new DataParser
    {
        SelectableFactory = context => context.GetSelectable(ContentType.Html),
        CanParse = DataParserHelper.CanParseByRegex("cnblogs\\.com"),
        QueryFollowRequests = DataParserHelper.QueryFollowRequestsByXPath(".")
    };
    // Parse first, then print the crawl results to the console.
    crawler.AddDataFlow(parser).AddDataFlow(new ConsoleStorage());

    // Start URL.
    crawler.AddRequests("http://www.cnblogs.com/");

    // Start the spider.
    crawler.RunAsync();
}
/// <summary>
/// Starts the shared <c>ImageDownloader</c>, then builds a standalone spider with the
/// Nvshens tag/detail page parsers and starts it on the "luoli" gallery listing.
/// </summary>
/// <remarks>
/// NOTE(review): the value returned by <c>RunAsync</c> is discarded, so this method
/// does not wait on it — confirm callers keep the process alive.
/// </remarks>
public static void Run()
{
    ImageDownloader.GetInstance().Start();

    var spiderBuilder = new SpiderBuilder();
    spiderBuilder.AddSerilog();
    spiderBuilder.ConfigureAppConfiguration();
    spiderBuilder.UseStandalone();
    spiderBuilder.AddSpider<EntitySpider>();
    var factory = spiderBuilder.Build();

    var crawler = factory.Create<Spider>();

    // Task identifier.
    crawler.Id = Guid.NewGuid().ToString("N");
    // Task name.
    crawler.Name = "宅男女神图片采集";
    // Crawl speed: number of requests downloaded per second. Above 1, larger is faster;
    // below 1, smaller is slower; must not be 0.
    crawler.Speed = 2;
    // Crawl depth.
    crawler.Depth = 5;
    // Plain downloader: a clean HttpClient with no cookie handling.
    crawler.DownloaderSettings.Type = DownloaderType.HttpClient;

    // NvshensTagIndexDataParser is currently disabled.
    crawler.AddDataFlow(new NvshensFirstPageTagDataParser());
    crawler.AddDataFlow(new NvshensPageTagDataParser());
    crawler.AddDataFlow(new NvshensFirstPageDetailDataParser());
    crawler.AddDataFlow(new NvshensPageDetailDataParser());

    // Start URL. The gallery root (https://www.nvshens.com/gallery/) is currently disabled.
    crawler.AddRequests("https://www.nvshens.com/gallery/luoli/");

    // Start the spider.
    crawler.RunAsync();
}
/// <summary>
/// Builds a spider that uses the Kafka message queue, parses
/// <c>EntitySpider.CnblogsEntry</c> entities into the spider's default storage,
/// queues two cnblogs news pages and starts the crawl.
/// </summary>
/// <returns>The task returned by the spider's <c>RunAsync</c> call.</returns>
public static Task Run()
{
    var spiderBuilder = new SpiderBuilder();
    spiderBuilder.AddSerilog();
    spiderBuilder.ConfigureAppConfiguration();
    spiderBuilder.UserKafka();
    var factory = spiderBuilder.Build();

    var crawler = factory.Create<Spider>();

    // Task identifier.
    crawler.Id = Guid.NewGuid().ToString("N");
    // Task name.
    crawler.Name = "博客园全站采集";
    // Crawl speed: number of requests downloaded per second. Above 1, larger is faster;
    // below 1, smaller is slower; must not be 0.
    crawler.Speed = 2;
    // Crawl depth.
    crawler.Depth = 3;
    // Plain downloader: a clean HttpClient with no cookie handling.
    crawler.DownloaderSettings.Type = DownloaderType.HttpClient;

    crawler.AddDataFlow(new DataParser<EntitySpider.CnblogsEntry>())
        .AddDataFlow(crawler.GetDefaultStorage());

    // Each request carries its own property dictionary.
    var firstPage = new Request(
        "https://news.cnblogs.com/n/page/1/",
        new Dictionary<string, string> { { "网站", "博客园" } });
    var secondPage = new Request(
        "https://news.cnblogs.com/n/page/2/",
        new Dictionary<string, string> { { "网站", "博客园" } });
    crawler.AddRequests(firstPage, secondPage);

    // Start the spider.
    return crawler.RunAsync();
}
/// <summary>
/// Builds a spider for weixin.sogou.com, queues result pages 1 through 10 of a
/// fixed search query, switches to the Redis scheduler on localhost and runs it.
/// </summary>
public void Run()
{
    Spider spider = SpiderBuilder.CreateBuilder()
        .AddRequest("https://weixin.sogou.com/")
        .Buid();

    spider.AddPageProcessor(new Processor2());

    for (int i = 1; i <= 10; i++)
    {
        spider.AddRequest($"https://weixin.sogou.com/weixin?type=2&ie=utf8&page={i}&query=马云");
    }

    spider.NewRequestSleepInterval = 2000; // 2s
    spider.ThreadNumber = 5;
    spider.UseRedisScheduler("localhost");

    // Subscribe before starting: the handler was previously attached after Run(),
    // so it could not observe status changes raised during the run.
    spider.OnStatusChanged += Spider_OnStatusChanged;

    spider.Run();
}
/// <summary>
/// Entry point that selects a spider type by name from configuration, builds it
/// and starts it.
/// </summary>
/// <remarks>
/// Reads the configuration keys "spider", "name", "id", "config", "args" and
/// "distribute". When "id" is missing a new GUID is generated. Standalone mode is
/// used unless "distribute" equals "true".
/// NOTE(review): the value returned by <c>RunAsync</c> is discarded, so this method
/// does not wait on it — confirm the caller keeps the process alive.
/// </remarks>
/// <param name="args">Run arguments, passed to the configuration builder.</param>
/// <exception cref="SpiderException">The "spider" configuration value is missing or blank.</exception>
public static void Run(params string[] args)
{
    Framework.SetEncoding();

    var configurationBuilder = Framework.CreateConfigurationBuilder(null, args);
    var configuration = configurationBuilder.Build();

    string spider = configuration["spider"];
    if (string.IsNullOrWhiteSpace(spider))
    {
        throw new SpiderException("未指定需要执行的爬虫");
    }

    var name = configuration["name"];
    var id = configuration["id"] ?? Guid.NewGuid().ToString("N");
    var config = configuration["config"];
    var arguments = configuration["args"]?.Split(' ');
    var distribute = configuration["distribute"] == "true";

    PrintEnvironment(args);

    var spiderTypes = DetectSpiders();
    if (spiderTypes == null || spiderTypes.Count == 0)
    {
        return;
    }

    // Ordinal, case-insensitive match: type names are identifiers, so the lookup
    // must not depend on the current culture's casing rules (as ToLower() does).
    var spiderType = spiderTypes.FirstOrDefault(
        x => string.Equals(x.Name, spider, StringComparison.OrdinalIgnoreCase));
    if (spiderType == null)
    {
        ConsoleHelper.WriteLine($"未找到爬虫: {spider}", 0, ConsoleColor.DarkYellow);
        return;
    }

    var builder = new SpiderBuilder();
    builder.AddSerilog();
    builder.ConfigureAppConfiguration(config);
    if (!distribute)
    {
        builder.UseStandalone();
    }

    builder.AddSpider(spiderType);
    var provider = builder.Build();
    var instance = provider.Create(spiderType);
    if (instance != null)
    {
        instance.Name = name;
        instance.Id = id;
        instance.RunAsync(arguments);
    }
    else
    {
        ConsoleHelper.WriteLine("创建爬虫对象失败", 0, ConsoleColor.DarkYellow);
    }
}
/// <summary>
/// Builds a standalone spider factory for tests, stores it in <c>SpiderFactory</c>
/// and logs the value of the "Development" configuration key.
/// </summary>
protected TestBase()
{
    var spiderBuilder = new SpiderBuilder();
    spiderBuilder.AddSerilog();
    spiderBuilder.ConfigureAppConfiguration(null, null, false);
    spiderBuilder.UseStandalone();
    SpiderFactory = spiderBuilder.Build();

    var logger = SpiderFactory.GetRequiredService<ILogger<TestBase>>();
    var configuration = SpiderFactory.GetRequiredService<IConfiguration>();
    logger.LogInformation($"Development {configuration["Development"]}");
}
/// <summary>
/// Builds a spider for cnblogs.com with <c>CNBlogProcessor</c>, enables NLog,
/// uses a random sleep interval between new requests and runs the spider.
/// </summary>
/// <remarks>
/// Optional configuration that is currently disabled in this sample: Redis
/// scheduler, Chrome WebDriver downloader, Dapper database pipeline, downloader
/// proxy, simple proxy pool and HTTP proxy pools.
/// </remarks>
public void Run()
{
    var spiderBuilder = SpiderBuilder.CreateBuilder()
        .AddRequest("https://www.cnblogs.com/")
        .AddPageProcessor(new CNBlogProcessor());
    Spider crawler = spiderBuilder.Buid();

    crawler.UseNLog();

    // Replace the static sleep interval with a random value from Next(100, 1000).
    var jitter = new Random();
    crawler.UseStaticSleepInterval = false;
    crawler.NewRequestDynamicSleepInterval = () => jitter.Next(100, 1000);

    // Print the interval reported for every new request.
    crawler.OnNewRequesting += (_, interval) => Console.WriteLine("sleep:" + interval);

    crawler.Run();
}
/// <summary>
/// Builds a standalone <c>VnexpressSpider</c> with an <c>IDynamicMessageQueue</c>
/// registration whose factory yields null, then starts the spider.
/// </summary>
/// <returns>The task returned by the spider's <c>RunAsync</c> call.</returns>
public static Task Run()
{
    var spiderBuilder = new SpiderBuilder();
    spiderBuilder.AddSerilog();
    spiderBuilder.ConfigureAppConfiguration();
    spiderBuilder.UseStandalone();
    spiderBuilder.AddSpider<VnexpressSpider>();

    // NOTE(review): this factory always returns null for IDynamicMessageQueue —
    // confirm that resolving a null queue is intended.
    spiderBuilder.Services.AddSingleton<IDynamicMessageQueue, InMemoryMessageQueue>(
        serviceProvider => null);

    var spiderFactory = spiderBuilder.Build();
    var vnexpressSpider = spiderFactory.Create<VnexpressSpider>();

    return vnexpressSpider.RunAsync();
}
/// <summary>
/// Builds a standalone <c>HttpClientSpider</c> for the Vnexpress "Kinh Doanh"
/// section: registers a fake proxy validator, supplies the proxy URL as a
/// command-line style argument, registers the crawl <c>ProjectDefinition</c>,
/// enables the dynamic message queue and starts the spider.
/// </summary>
/// <returns>The task returned by the spider's <c>RunAsync</c> call.</returns>
public static Task Run()
{
    var spiderBuilder = new SpiderBuilder();
    spiderBuilder.AddSerilog();

    // FakeProxyValidator is used here; DefaultProxyValidator is currently disabled.
    spiderBuilder.Services.AddSingleton<IProxyValidator, FakeProxyValidator>();

    var commandLine = new[] { "/ProxySupplyUrl=http://localhost:52445/api/proxies" };
    spiderBuilder.ConfigureAppConfiguration(null, args: commandLine, true);
    spiderBuilder.UseStandalone();

    // Crawl definition handed to the spider through dependency injection.
    var definition = new ProjectDefinition
    {
        ProjectName = "Vnexpress Spider",
        Site = "Vnexpress/Kinh Doanh",
        ItemUrlsSelector = "//article/h1[@class='title_news']/a[1];//article[@class='list_news']/h4[@class='title_news']/a[1]",
        Urls = "https://vnexpress.net/kinh-doanh",
        FileStorage = @"P:\Neil.Test\Spider Storage\Vnexpress",
        FileFormat = "*.json",
        PageLimit = 4,
        Deepth = 2,
        NextPageSelector = "//p[@id='pagination']/a[@class='next']",
        NumberOfConcurrentRequests = 5,
        Mapping = new ItemMapping
        {
            ItemCssSelector = "//section[@id='left_calculator']",
            Mapping = new[]
            {
                new FieldMapping
                {
                    Field = "Title",
                    CssSelector = "//h1[@class='title_news_detail mb10']"
                },
                new FieldMapping
                {
                    Field = "Description",
                    CssSelector = "//p[@class='description']"
                },
            }
        }
    };
    spiderBuilder.Services.AddSingleton<ProjectDefinition>(definition);

    spiderBuilder.AddSpider<HttpClientSpider>();
    spiderBuilder.UseDynamicMessageQueue();

    var spiderFactory = spiderBuilder.Build();
    var httpSpider = spiderFactory.Create<HttpClientSpider>();

    return httpSpider.RunAsync();
}
/// <summary>
/// Builds a standalone spider that crawls cnblogs.com with <c>CnblogsDataParser</c>,
/// prints results to the console and starts the crawl.
/// </summary>
/// <returns>The task returned by the spider's <c>RunAsync</c> call.</returns>
public static Task Run()
{
    var spiderBuilder = new SpiderBuilder();
    spiderBuilder.AddSerilog();
    spiderBuilder.ConfigureAppConfiguration();
    spiderBuilder.UseStandalone();
    spiderBuilder.AddSpider<EntitySpider>();
    var factory = spiderBuilder.Build();

    var crawler = factory.Create<Spider>();

    // Task identifier.
    crawler.NewGuidId();
    // Task name.
    crawler.Name = "博客园全站采集";
    // Crawl speed: number of requests downloaded per second. Above 1, larger is faster;
    // below 1, smaller is slower; must not be 0.
    crawler.Speed = 1;
    // Crawl depth.
    crawler.Depth = 3;
    // Plain downloader: a clean HttpClient with no cookie handling.
    crawler.DownloaderSettings.Type = DownloaderType.HttpClient;

    crawler.AddDataFlow(new CnblogsDataParser())
        .AddDataFlow(new ConsoleStorage());

    // Start URL.
    crawler.AddRequests("http://www.cnblogs.com/");

    // Start the spider.
    return crawler.RunAsync();
}
/// <summary>
/// Builds a standalone <c>CnblogsSpider</c> (command-line configuration loading
/// disabled) and awaits its run.
/// </summary>
/// <remarks>
/// The spider is started with whatever configuration <c>CnblogsSpider</c> provides;
/// the manual setup (scheduler, id, name, speed, depth, downloader, data flows,
/// start request) is currently disabled here.
/// NOTE(review): this method is declared <c>async void</c>, so callers cannot await
/// it — changing that would alter the signature, so it is left as-is.
/// </remarks>
public static async void Run()
{
    var spiderBuilder = new SpiderBuilder();
    spiderBuilder.AddSerilog();
    spiderBuilder.ConfigureAppConfiguration(loadCommandLine: false);
    spiderBuilder.UseStandalone();
    spiderBuilder.AddSpider<CnblogsSpider>();

    var factory = spiderBuilder.Build();
    var cnblogsSpider = factory.Create<CnblogsSpider>();

    // Start the spider.
    await cnblogsSpider.RunAsync();
}
/// <summary>
/// Program entry point: resolves the spider class, id and name from configuration,
/// configures Serilog (console plus rolling file), builds the spider with the
/// Kafka message queue and runs it.
/// </summary>
/// <remarks>
/// The configuration keys "dotnetspider.spider.class", "dotnetspider.spider.id" and
/// "dotnetspider.spider.name" take precedence; the built-in sample values are used
/// only when a key is missing or blank. Previously the configured values were
/// always overwritten by the sample values.
/// Any exception is caught and written to the Serilog logger.
/// </remarks>
/// <param name="args">Command-line arguments, passed to the configuration builder.</param>
static void Main(string[] args)
{
    try
    {
        var builder = new SpiderBuilder();

        var configurationBuilder = Framework.CreateConfigurationBuilder(null, args);
        var configuration = configurationBuilder.Build();

        // Configured values win; the sample values below are fallbacks only.
        var @class = configuration["dotnetspider.spider.class"];
        if (string.IsNullOrWhiteSpace(@class))
        {
            @class = "DotnetSpider.Spiders.CnblogsSpider";
        }

        var spiderId = configuration["dotnetspider.spider.id"];
        if (string.IsNullOrWhiteSpace(spiderId))
        {
            spiderId = "xxxxxxxx";
        }

        // Log into /logs/ when that directory exists, otherwise into the working directory.
        var folder = Directory.Exists("/logs/") ? "/logs/" : "";

        var logPath = string.IsNullOrWhiteSpace(spiderId)
            ? $"{folder}{DateTime.Now:yyyy-MM-dd HH:mm:ss}.log"
            : $"{folder}{spiderId}.log";

        var loggerConfiguration = new LoggerConfiguration()
            .MinimumLevel.Information()
            .MinimumLevel.Override("Microsoft", LogEventLevel.Warning)
            .Enrich.FromLogContext()
            .WriteTo.Console().WriteTo
            .RollingFile(logPath);
        builder.AddSerilog(loggerConfiguration);

        var spiderName = configuration["dotnetspider.spider.name"];
        if (string.IsNullOrWhiteSpace(spiderName))
        {
            spiderName = "博客园";
        }

        // Defensive validation; kept so that removing a fallback above fails loudly.
        if (string.IsNullOrWhiteSpace(@class) ||
            string.IsNullOrWhiteSpace(spiderId) ||
            string.IsNullOrWhiteSpace(spiderName))
        {
            Log.Logger.Error($"执行爬虫的参数不正确: class {@class}, id {spiderId}, name {spiderName}");
            return;
        }

        var type = Type.GetType(@class);
        if (type == null)
        {
            Log.Logger.Error($"未找到爬虫类型: {@class}");
            return;
        }

        Log.Logger.Information($"获取爬虫类型 {type.FullName} 成功");

        builder.ConfigureAppConfiguration(configuration);
        builder.UserKafka();
        builder.AddSpider(type);
        var provider = builder.Build();
        var spider = provider.Create(type);
        Log.Logger.Information($"创建爬虫实例成功");

        spider.Id = spiderId;
        spider.Name = spiderName;

        Log.Logger.Information($"尝试启动爬虫实例");
        spider.Run();
        Log.Logger.Information($"爬虫实例退出");
    }
    catch (Exception e)
    {
        Log.Logger.Error($"执行失败: {e}");
    }
}