/// <summary>
/// Builds a spider for cnblogs.com that follows links found on each page and
/// writes the collected results to the console, then starts it.
/// </summary>
/// <remarks>
/// The value returned by <c>RunAsync</c> is neither awaited nor returned, so this
/// method does not wait for the crawl to finish.
/// </remarks>
public static void Run1()
{
    var spiderBuilder = new SpiderBuilder();
    spiderBuilder.AddSerilog();
    spiderBuilder.ConfigureAppConfiguration();
    spiderBuilder.UseStandalone();
    spiderBuilder.AddSpider<EntitySpider>();

    var factory = spiderBuilder.Build();
    var crawler = factory.Create<Spider>();

    // Task identifier.
    crawler.Id = Guid.NewGuid().ToString("N");
    // Task name.
    crawler.Name = "博客园全站采集";
    // Crawl speed: number of requests downloaded per second. Above 1, larger is faster;
    // below 1, smaller is slower. Must not be 0.
    crawler.Speed = 1;
    // Crawl depth.
    crawler.Depth = 3;
    // Use the plain downloader: a clean HttpClient with no cookie handling.
    crawler.DownloaderSettings.Type = DownloaderType.HttpClient;

    var linkParser = new DataParser
    {
        SelectableFactory = ctx => ctx.GetSelectable(ContentType.Html),
        CanParse = DataParserHelper.CanParseByRegex("cnblogs\\.com"),
        QueryFollowRequests = DataParserHelper.QueryFollowRequestsByXPath(".")
    };

    // The second data flow prints the collected results to the console.
    crawler.AddDataFlow(linkParser).AddDataFlow(new ConsoleStorage());

    // Start URL.
    crawler.AddRequests("http://www.cnblogs.com/");

    // Start the spider.
    crawler.RunAsync();
}
/// <summary>
/// Registers a <c>ProjectDefinition</c> for the Vnexpress "Kinh Doanh" section,
/// creates a <c>VnexpressSpider</c> and starts it.
/// </summary>
/// <returns>The task returned by the spider's <c>RunAsync</c>.</returns>
public static Task Run()
{
    var spiderBuilder = new SpiderBuilder();
    spiderBuilder.AddSerilog();
    spiderBuilder.ConfigureAppConfiguration();
    spiderBuilder.UseStandalone();

    var definition = new ProjectDefinition
    {
        ProjectName = "Vnexpress Spider",
        Site = "Vnexpress/Kinh Doanh",
        ItemUrlsSelector = "",
        Urls = "https://vnexpress.net/kinh-doanh",
        FileFormat = "*.html",
        FileStorage = @"P:\Neil.Test\Spider Storage\Vnexpress",
        PageLimit = 4,
    };

    // The definition is registered as a singleton instance on the builder's service collection.
    spiderBuilder.Services.AddSingleton<ProjectDefinition>(definition);
    spiderBuilder.AddSpider<VnexpressSpider>();

    // Earlier experiments with explicit queue registration, kept for reference:
    // spiderBuilder.Services.AddSingleton<IDynamicMessageQueue, InMemoryMessageQueue>((s) => null);
    // spiderBuilder.Services.AddSingleton<IDynamicMessageQueue, InMemoryMessageQueue>();
    spiderBuilder.UseDynamicMessageQueue();

    var spiderFactory = spiderBuilder.Build();
    var vnexpressSpider = spiderFactory.Create<VnexpressSpider>();

    return vnexpressSpider.RunAsync();
}
/// <summary>
/// Starts the shared <c>ImageDownloader</c>, then builds a spider wired with the
/// Nvshens tag and detail parsers and starts it on the gallery start URL.
/// </summary>
/// <remarks>
/// The value returned by <c>RunAsync</c> is neither awaited nor returned, so this
/// method does not wait for the crawl to finish.
/// </remarks>
public static void Run()
{
    ImageDownloader.GetInstance().Start();

    var spiderBuilder = new SpiderBuilder();
    spiderBuilder.AddSerilog();
    spiderBuilder.ConfigureAppConfiguration();
    spiderBuilder.UseStandalone();
    spiderBuilder.AddSpider<EntitySpider>();

    var factory = spiderBuilder.Build();
    var crawler = factory.Create<Spider>();

    // Task identifier.
    crawler.Id = Guid.NewGuid().ToString("N");
    // Task name.
    crawler.Name = "宅男女神图片采集";
    // Crawl speed: number of requests downloaded per second. Above 1, larger is faster;
    // below 1, smaller is slower. Must not be 0.
    crawler.Speed = 2;
    // Crawl depth.
    crawler.Depth = 5;
    // Use the plain downloader: a clean HttpClient with no cookie handling.
    crawler.DownloaderSettings.Type = DownloaderType.HttpClient;

    // Disabled: crawler.AddDataFlow(new NvshensTagIndexDataParser());
    crawler.AddDataFlow(new NvshensFirstPageTagDataParser());
    crawler.AddDataFlow(new NvshensPageTagDataParser());
    crawler.AddDataFlow(new NvshensFirstPageDetailDataParser());
    crawler.AddDataFlow(new NvshensPageDetailDataParser());

    // Start URL. Disabled alternative: crawler.AddRequests("https://www.nvshens.com/gallery/");
    crawler.AddRequests("https://www.nvshens.com/gallery/luoli/");

    // Start the spider.
    crawler.RunAsync();
}
/// <summary>
/// Reads the run options from configuration, finds the spider type whose name matches
/// the <c>spider</c> option, creates an instance of it and starts it.
/// </summary>
/// <param name="args">
/// Run arguments. They are passed to the configuration builder; the options read here are
/// <c>spider</c>, <c>name</c>, <c>id</c>, <c>config</c>, <c>args</c> and <c>distribute</c>.
/// </param>
/// <exception cref="SpiderException">Thrown when the <c>spider</c> option is missing or blank.</exception>
/// <remarks>
/// The value returned by <c>RunAsync</c> is discarded; this method does not wait on it.
/// </remarks>
public static void Run(params string[] args)
{
    Framework.SetEncoding();

    var configurationBuilder = Framework.CreateConfigurationBuilder(null, args);
    var configuration = configurationBuilder.Build();

    string spider = configuration["spider"];
    if (string.IsNullOrWhiteSpace(spider))
    {
        throw new SpiderException("未指定需要执行的爬虫");
    }

    var name = configuration["name"];
    // Fall back to a fresh GUID when no id is configured.
    var id = configuration["id"] ?? Guid.NewGuid().ToString("N");
    var config = configuration["config"];
    // Arguments forwarded to the spider, split on single spaces; null when not configured.
    var arguments = configuration["args"]?.Split(' ');
    var distribute = configuration["distribute"] == "true";

    PrintEnvironment(args);

    var spiderTypes = DetectSpiders();
    if (spiderTypes == null || spiderTypes.Count == 0)
    {
        return;
    }

    // Ordinal, case-insensitive match: unlike ToLower() on both sides, this does not depend
    // on the current culture (e.g. Turkish 'I' casing) and allocates no temporary strings.
    var spiderType = spiderTypes.FirstOrDefault(
        x => string.Equals(x.Name, spider, StringComparison.OrdinalIgnoreCase));
    if (spiderType == null)
    {
        ConsoleHelper.WriteLine($"未找到爬虫: {spider}", 0, ConsoleColor.DarkYellow);
        return;
    }

    var builder = new SpiderBuilder();
    builder.AddSerilog();
    builder.ConfigureAppConfiguration(config);
    // Standalone mode is used unless the "distribute" option is exactly "true".
    if (!distribute)
    {
        builder.UseStandalone();
    }

    builder.AddSpider(spiderType);

    var provider = builder.Build();
    var instance = provider.Create(spiderType);
    if (instance == null)
    {
        ConsoleHelper.WriteLine("创建爬虫对象失败", 0, ConsoleColor.DarkYellow);
        return;
    }

    instance.Name = name;
    instance.Id = id;
    instance.RunAsync(arguments);
}
/// <summary>
/// Builds a <c>VnexpressSpider</c> with a custom <c>IDynamicMessageQueue</c>
/// registration and starts it.
/// </summary>
/// <returns>The task returned by the spider's <c>RunAsync</c>.</returns>
public static Task Run()
{
    var spiderBuilder = new SpiderBuilder();
    spiderBuilder.AddSerilog();
    spiderBuilder.ConfigureAppConfiguration();
    spiderBuilder.UseStandalone();
    spiderBuilder.AddSpider<VnexpressSpider>();

    // NOTE(review): the factory below always returns null for IDynamicMessageQueue.
    // This looks like an experiment; confirm that consumers tolerate a null queue.
    spiderBuilder.Services.AddSingleton<IDynamicMessageQueue, InMemoryMessageQueue>(_ => null);
    // Disabled alternative:
    // spiderBuilder.Services.AddSingleton<IDynamicMessageQueue>(null as IDynamicMessageQueue);

    var spiderFactory = spiderBuilder.Build();
    var vnexpressSpider = spiderFactory.Create<VnexpressSpider>();

    return vnexpressSpider.RunAsync();
}
/// <summary>
/// Builds an <c>HttpClientSpider</c> for the Vnexpress "Kinh Doanh" section using a
/// fake proxy validator, a local proxy supply URL and a <c>ProjectDefinition</c>
/// describing selectors, paging and field mappings, then starts it.
/// </summary>
/// <returns>The task returned by the spider's <c>RunAsync</c>.</returns>
public static Task Run()
{
    var spiderBuilder = new SpiderBuilder();
    spiderBuilder.AddSerilog();

    spiderBuilder.Services.AddSingleton<IProxyValidator, FakeProxyValidator>();
    // Disabled alternative:
    // spiderBuilder.Services.AddSingleton<IProxyValidator, DefaultProxyValidator>();

    // The proxy supply URL is injected as if it had been passed as an argument.
    var configurationArgs = new string[] { "/ProxySupplyUrl=http://localhost:52445/api/proxies" };
    spiderBuilder.ConfigureAppConfiguration(null, args: configurationArgs, true);
    spiderBuilder.UseStandalone();

    var definition = new ProjectDefinition
    {
        ProjectName = "Vnexpress Spider",
        Site = "Vnexpress/Kinh Doanh",
        ItemUrlsSelector = "//article/h1[@class='title_news']/a[1];//article[@class='list_news']/h4[@class='title_news']/a[1]",
        Urls = "https://vnexpress.net/kinh-doanh",
        FileStorage = @"P:\Neil.Test\Spider Storage\Vnexpress",
        FileFormat = "*.json",
        PageLimit = 4,
        Deepth = 2,
        NextPageSelector = "//p[@id='pagination']/a[@class='next']",
        NumberOfConcurrentRequests = 5,
        Mapping = new ItemMapping
        {
            ItemCssSelector = "//section[@id='left_calculator']",
            Mapping = new[]
            {
                new FieldMapping
                {
                    Field = "Title",
                    CssSelector = "//h1[@class='title_news_detail mb10']"
                },
                new FieldMapping
                {
                    Field = "Description",
                    CssSelector = "//p[@class='description']"
                },
            }
        }
    };

    // The definition is registered as a singleton instance on the builder's service collection.
    spiderBuilder.Services.AddSingleton<ProjectDefinition>(definition);
    spiderBuilder.AddSpider<HttpClientSpider>();

    // Earlier experiments with explicit queue registration, kept for reference:
    // spiderBuilder.Services.AddSingleton<IDynamicMessageQueue, InMemoryMessageQueue>((s) => null);
    // spiderBuilder.Services.AddSingleton<IDynamicMessageQueue, InMemoryMessageQueue>();
    spiderBuilder.UseDynamicMessageQueue();

    var spiderFactory = spiderBuilder.Build();
    var httpSpider = spiderFactory.Create<HttpClientSpider>();

    return httpSpider.RunAsync();
}
/// <summary>
/// Builds a spider for cnblogs.com that parses pages with <c>CnblogsDataParser</c>
/// and passes the results on to <c>ConsoleStorage</c>, then starts it.
/// </summary>
/// <returns>The task returned by the spider's <c>RunAsync</c>.</returns>
public static Task Run()
{
    var spiderBuilder = new SpiderBuilder();
    spiderBuilder.AddSerilog();
    spiderBuilder.ConfigureAppConfiguration();
    spiderBuilder.UseStandalone();
    spiderBuilder.AddSpider<EntitySpider>();

    var factory = spiderBuilder.Build();
    var crawler = factory.Create<Spider>();

    // Task identifier.
    crawler.NewGuidId();
    // Task name.
    crawler.Name = "博客园全站采集";
    // Crawl speed: number of requests downloaded per second. Above 1, larger is faster;
    // below 1, smaller is slower. Must not be 0.
    crawler.Speed = 1;
    // Crawl depth.
    crawler.Depth = 3;
    // Use the plain downloader: a clean HttpClient with no cookie handling.
    crawler.DownloaderSettings.Type = DownloaderType.HttpClient;

    crawler.AddDataFlow(new CnblogsDataParser()).AddDataFlow(new ConsoleStorage());

    // Start URL.
    crawler.AddRequests("http://www.cnblogs.com/");

    // Start the spider and hand the task back to the caller.
    return crawler.RunAsync();
}
/// <summary>
/// Builds a <c>CnblogsSpider</c> (configuration loaded with <c>loadCommandLine: false</c>)
/// and awaits its run.
/// </summary>
/// <remarks>
/// NOTE(review): this method is <c>async void</c>, so callers cannot await it or observe
/// exceptions thrown by the awaited run. Returning <c>Task</c> would fix that but changes
/// the signature; confirm with callers before changing.
/// </remarks>
public static async void Run()
{
    var spiderBuilder = new SpiderBuilder();
    spiderBuilder.AddSerilog();
    spiderBuilder.ConfigureAppConfiguration(loadCommandLine: false);
    spiderBuilder.UseStandalone();
    spiderBuilder.AddSpider<CnblogsSpider>();

    var factory = spiderBuilder.Build();
    var cnblogsSpider = factory.Create<CnblogsSpider>();

    // Disabled manual configuration, kept for reference:
    // cnblogsSpider.Scheduler = new QueueBfsScheduler();
    // cnblogsSpider.NewGuidId();                      // task identifier
    // cnblogsSpider.Name = "博客园全站采集";            // task name
    // cnblogsSpider.Speed = 1;                        // requests per second; must not be 0
    // cnblogsSpider.Depth = 3;                        // crawl depth
    // cnblogsSpider.DownloaderSettings.Type = DownloaderType.HttpClient; // plain HttpClient, no cookies
    // cnblogsSpider.AddDataFlow(new CnblogsDataParser()).AddDataFlow(new ConsoleStorage());
    // cnblogsSpider.AddRequests("http://www.cnblogs.com/"); // start URL

    // Start the spider.
    await cnblogsSpider.RunAsync();
}
/// <summary>
/// Entry point: reads the spider class, id and name from configuration, sets up
/// console and rolling-file logging, then creates the spider by type name and runs it.
/// </summary>
/// <param name="args">Command-line arguments passed to the configuration builder.</param>
/// <remarks>
/// Any exception is caught and logged; it is not rethrown.
/// </remarks>
static void Main(string[] args)
{
    // Fallbacks used only when configuration does not supply a value. These were
    // previously assigned unconditionally, which silently ignored the configured values.
    const string defaultSpiderClass = "DotnetSpider.Spiders.CnblogsSpider";
    const string defaultSpiderId = "xxxxxxxx";
    const string defaultSpiderName = "博客园";

    try
    {
        var builder = new SpiderBuilder();

        var configurationBuilder = Framework.CreateConfigurationBuilder(null, args);
        var configuration = configurationBuilder.Build();

        var @class = configuration["dotnetspider.spider.class"];
        if (string.IsNullOrWhiteSpace(@class))
        {
            @class = defaultSpiderClass;
        }

        var spiderId = configuration["dotnetspider.spider.id"];
        if (string.IsNullOrWhiteSpace(spiderId))
        {
            spiderId = defaultSpiderId;
        }

        // Log into /logs/ when that directory exists, otherwise into the working directory.
        var folder = Directory.Exists("/logs/") ? "/logs/" : "";
        // The timestamp fallback avoids ':' because it is not a valid file-name character on Windows.
        var logPath = string.IsNullOrWhiteSpace(spiderId)
            ? $"{folder}{DateTime.Now:yyyy-MM-dd_HH-mm-ss}.log"
            : $"{folder}{spiderId}.log";

        var loggerConfiguration = new LoggerConfiguration()
            .MinimumLevel.Information()
            .MinimumLevel.Override("Microsoft", LogEventLevel.Warning)
            .Enrich.FromLogContext()
            .WriteTo.Console()
            .WriteTo.RollingFile(logPath);
        builder.AddSerilog(loggerConfiguration);

        var spiderName = configuration["dotnetspider.spider.name"];
        if (string.IsNullOrWhiteSpace(spiderName))
        {
            spiderName = defaultSpiderName;
        }

        // Defensive check: only reachable if one of the defaults above is ever blanked.
        if (string.IsNullOrWhiteSpace(@class) ||
            string.IsNullOrWhiteSpace(spiderId) ||
            string.IsNullOrWhiteSpace(spiderName))
        {
            Log.Logger.Error($"执行爬虫的参数不正确: class {@class}, id {spiderId}, name {spiderName}");
            return;
        }

        var type = Type.GetType(@class);
        if (type == null)
        {
            Log.Logger.Error($"未找到爬虫类型: {@class}");
            return;
        }

        Log.Logger.Information($"获取爬虫类型 {type.FullName} 成功");

        builder.ConfigureAppConfiguration(configuration);
        builder.UserKafka();
        builder.AddSpider(type);

        var provider = builder.Build();
        var spider = provider.Create(type);
        Log.Logger.Information($"创建爬虫实例成功");

        spider.Id = spiderId;
        spider.Name = spiderName;

        Log.Logger.Information($"尝试启动爬虫实例");
        spider.Run();
        Log.Logger.Information($"爬虫实例退出");
    }
    catch (Exception e)
    {
        Log.Logger.Error($"执行失败: {e}");
    }
}